Repository: hankcs/HanLP
Branch: doc-zh
Commit: ddb1299bddff
Files: 697
Total size: 3.2 MB
Directory structure:
gitextract_p7um9exn/
├── .github/
│ ├── ISSUE_TEMPLATE/
│ │ ├── bug_report.md
│ │ ├── config.yml
│ │ └── feature_request.md
│ ├── pull_request_template.md
│ └── workflows/
│ └── unit-tests.yml
├── .gitignore
├── CITATION.cff
├── LICENSE
├── README.md
├── docs/
│ ├── Makefile
│ ├── annotations/
│ │ ├── constituency/
│ │ │ ├── ctb.md
│ │ │ ├── index.md
│ │ │ ├── npcmj.md
│ │ │ └── ptb.md
│ │ ├── dep/
│ │ │ ├── index.md
│ │ │ ├── pmt.md
│ │ │ ├── sd_en.md
│ │ │ ├── sd_zh.md
│ │ │ └── ud.md
│ │ ├── index.md
│ │ ├── ner/
│ │ │ ├── index.md
│ │ │ ├── msra.md
│ │ │ ├── ontonotes.md
│ │ │ └── pku.md
│ │ ├── pos/
│ │ │ ├── 863.md
│ │ │ ├── ctb.md
│ │ │ ├── index.md
│ │ │ ├── npcmj.md
│ │ │ ├── pku.md
│ │ │ └── ud.md
│ │ ├── sdp/
│ │ │ ├── dm.md
│ │ │ ├── index.md
│ │ │ ├── pas.md
│ │ │ ├── psd.md
│ │ │ └── semeval16.md
│ │ ├── srl/
│ │ │ ├── cpb.md
│ │ │ ├── index.md
│ │ │ └── propbank.md
│ │ └── tok/
│ │ ├── ctb.md
│ │ ├── index.md
│ │ └── msr.md
│ ├── api/
│ │ ├── common/
│ │ │ ├── configurable.rst
│ │ │ ├── conll.rst
│ │ │ ├── constant.rst
│ │ │ ├── document.rst
│ │ │ └── index.md
│ │ ├── hanlp/
│ │ │ ├── common/
│ │ │ │ ├── component.rst
│ │ │ │ ├── dataset.md
│ │ │ │ ├── index.md
│ │ │ │ ├── structure.md
│ │ │ │ ├── torch_component.md
│ │ │ │ ├── transform.md
│ │ │ │ └── vocab.md
│ │ │ ├── components/
│ │ │ │ ├── classifiers.md
│ │ │ │ ├── eos.md
│ │ │ │ ├── index.md
│ │ │ │ ├── lemmatizer.md
│ │ │ │ ├── mtl/
│ │ │ │ │ ├── index.md
│ │ │ │ │ ├── mtl.md
│ │ │ │ │ └── tasks/
│ │ │ │ │ ├── constituency.md
│ │ │ │ │ ├── dep.md
│ │ │ │ │ ├── index.md
│ │ │ │ │ ├── lem.md
│ │ │ │ │ ├── ner/
│ │ │ │ │ │ ├── biaffine_ner.md
│ │ │ │ │ │ ├── index.md
│ │ │ │ │ │ └── tag_ner.md
│ │ │ │ │ ├── pos.md
│ │ │ │ │ ├── sdp.md
│ │ │ │ │ ├── srl/
│ │ │ │ │ │ ├── bio_srl.md
│ │ │ │ │ │ ├── index.md
│ │ │ │ │ │ └── rank_srl.md
│ │ │ │ │ ├── task.md
│ │ │ │ │ ├── tok.md
│ │ │ │ │ └── ud.md
│ │ │ │ ├── ner/
│ │ │ │ │ ├── biaffine_ner.md
│ │ │ │ │ ├── index.md
│ │ │ │ │ ├── rnn_ner.md
│ │ │ │ │ └── transformer_ner.md
│ │ │ │ ├── parsers/
│ │ │ │ │ ├── biaffine_dep.md
│ │ │ │ │ ├── biaffine_sdp.md
│ │ │ │ │ ├── crf_constituency_parser.md
│ │ │ │ │ ├── index.md
│ │ │ │ │ └── ud_parser.md
│ │ │ │ ├── pipeline.md
│ │ │ │ ├── srl/
│ │ │ │ │ ├── index.md
│ │ │ │ │ ├── span_bio.md
│ │ │ │ │ └── span_rank.md
│ │ │ │ ├── sts.md
│ │ │ │ ├── taggers/
│ │ │ │ │ ├── index.md
│ │ │ │ │ ├── rnn_tagger.md
│ │ │ │ │ └── transformer_tagger.md
│ │ │ │ └── tokenizers/
│ │ │ │ ├── index.md
│ │ │ │ ├── multi_criteria.md
│ │ │ │ └── transformer.md
│ │ │ ├── datasets/
│ │ │ │ ├── constituency/
│ │ │ │ │ ├── constituency_dataset.md
│ │ │ │ │ ├── index.md
│ │ │ │ │ └── resources.md
│ │ │ │ ├── dep/
│ │ │ │ │ ├── conll_dataset.md
│ │ │ │ │ ├── index.md
│ │ │ │ │ └── resources.md
│ │ │ │ ├── eos/
│ │ │ │ │ ├── eos.md
│ │ │ │ │ ├── index.md
│ │ │ │ │ └── resources.md
│ │ │ │ ├── index.md
│ │ │ │ ├── ner/
│ │ │ │ │ ├── index.md
│ │ │ │ │ ├── json.md
│ │ │ │ │ ├── resources.md
│ │ │ │ │ └── tsv.md
│ │ │ │ ├── pos/
│ │ │ │ │ ├── index.md
│ │ │ │ │ └── resources.md
│ │ │ │ ├── srl/
│ │ │ │ │ ├── conll2012_dataset.md
│ │ │ │ │ ├── index.md
│ │ │ │ │ └── resources.md
│ │ │ │ └── tok/
│ │ │ │ ├── index.md
│ │ │ │ ├── mcws_dataset.md
│ │ │ │ ├── resources.md
│ │ │ │ └── txt.md
│ │ │ ├── hanlp.rst
│ │ │ ├── index.md
│ │ │ ├── layers/
│ │ │ │ ├── decoders/
│ │ │ │ │ ├── biaffine_ner.md
│ │ │ │ │ ├── index.md
│ │ │ │ │ └── linear_crf.md
│ │ │ │ ├── embeddings/
│ │ │ │ │ ├── char_cnn.md
│ │ │ │ │ ├── char_rnn.md
│ │ │ │ │ ├── embedding.md
│ │ │ │ │ ├── fasttext.md
│ │ │ │ │ ├── index.md
│ │ │ │ │ ├── transformer.md
│ │ │ │ │ └── word2vec.md
│ │ │ │ ├── index.md
│ │ │ │ └── transformers/
│ │ │ │ ├── encoder.md
│ │ │ │ ├── index.md
│ │ │ │ └── tokenizer.md
│ │ │ ├── pretrained/
│ │ │ │ ├── amr.md
│ │ │ │ ├── amr2text.md
│ │ │ │ ├── constituency.md
│ │ │ │ ├── dep.md
│ │ │ │ ├── eos.md
│ │ │ │ ├── fasttext.md
│ │ │ │ ├── glove.md
│ │ │ │ ├── index.md
│ │ │ │ ├── mlm.md
│ │ │ │ ├── mtl.md
│ │ │ │ ├── ner.md
│ │ │ │ ├── pos.md
│ │ │ │ ├── sdp.md
│ │ │ │ ├── srl.md
│ │ │ │ ├── sts.md
│ │ │ │ ├── tok.md
│ │ │ │ └── word2vec.md
│ │ │ └── utils/
│ │ │ ├── index.md
│ │ │ └── io_util.md
│ │ ├── restful.rst
│ │ ├── restful_golang.md
│ │ ├── restful_java.md
│ │ └── trie/
│ │ ├── dictionary.md
│ │ ├── index.md
│ │ └── trie.md
│ ├── conf.py
│ ├── configure.md
│ ├── contributing.md
│ ├── data_format.md
│ ├── index.md
│ ├── install.md
│ ├── references.bib
│ ├── references.rst
│ └── tutorial.md
├── hanlp/
│ ├── __init__.py
│ ├── callbacks/
│ │ ├── __init__.py
│ │ └── fine_csv_logger.py
│ ├── common/
│ │ ├── __init__.py
│ │ ├── component.py
│ │ ├── dataset.py
│ │ ├── keras_component.py
│ │ ├── structure.py
│ │ ├── torch_component.py
│ │ ├── transform.py
│ │ ├── transform_tf.py
│ │ ├── vocab.py
│ │ └── vocab_tf.py
│ ├── components/
│ │ ├── __init__.py
│ │ ├── amr/
│ │ │ ├── __init__.py
│ │ │ ├── amrbart/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── bart_amr_generation.py
│ │ │ │ ├── bart_amr_parser.py
│ │ │ │ ├── common/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── constant.py
│ │ │ │ │ ├── penman_interface.py
│ │ │ │ │ └── postprocessing.py
│ │ │ │ ├── data_interface/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── dataset.py
│ │ │ │ ├── model_interface/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── modeling_bart.py
│ │ │ │ │ └── tokenization_bart.py
│ │ │ │ └── preprocess/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── amr_io.py
│ │ │ │ ├── penman_interface.py
│ │ │ │ └── read_and_process.py
│ │ │ └── seq2seq/
│ │ │ ├── __init__.py
│ │ │ ├── dataset/
│ │ │ │ ├── IO.py
│ │ │ │ ├── __init__.py
│ │ │ │ ├── dataset.py
│ │ │ │ ├── linearization.py
│ │ │ │ ├── penman.py
│ │ │ │ ├── postprocessing.py
│ │ │ │ ├── tokenization_bart.py
│ │ │ │ └── tokenization_t5.py
│ │ │ ├── evaluation.py
│ │ │ ├── optim.py
│ │ │ └── seq2seq_amr_parser.py
│ │ ├── classifiers/
│ │ │ ├── __init__.py
│ │ │ ├── fasttext_classifier.py
│ │ │ ├── transformer_classifier.py
│ │ │ ├── transformer_classifier_hf.py
│ │ │ ├── transformer_classifier_tf.py
│ │ │ └── transformer_regression_hf.py
│ │ ├── distillation/
│ │ │ ├── __init__.py
│ │ │ ├── distillable_component.py
│ │ │ ├── losses.py
│ │ │ └── schedulers.py
│ │ ├── eos/
│ │ │ ├── __init__.py
│ │ │ └── ngram.py
│ │ ├── lambda_wrapper.py
│ │ ├── lemmatizer.py
│ │ ├── lm/
│ │ │ ├── __init__.py
│ │ │ └── mlm.py
│ │ ├── mtl/
│ │ │ ├── __init__.py
│ │ │ ├── multi_task_learning.py
│ │ │ └── tasks/
│ │ │ ├── __init__.py
│ │ │ ├── amr.py
│ │ │ ├── constituency.py
│ │ │ ├── dep.py
│ │ │ ├── dep_2nd.py
│ │ │ ├── lem.py
│ │ │ ├── ner/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── biaffine_ner.py
│ │ │ │ └── tag_ner.py
│ │ │ ├── pos.py
│ │ │ ├── sdp.py
│ │ │ ├── srl/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── bio_srl.py
│ │ │ │ └── rank_srl.py
│ │ │ ├── tok/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── reg_tok.py
│ │ │ │ └── tag_tok.py
│ │ │ └── ud.py
│ │ ├── ner/
│ │ │ ├── __init__.py
│ │ │ ├── biaffine_ner/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── biaffine_ner.py
│ │ │ │ └── biaffine_ner_model.py
│ │ │ ├── ner_tf.py
│ │ │ ├── rnn_ner.py
│ │ │ └── transformer_ner.py
│ │ ├── parsers/
│ │ │ ├── __init__.py
│ │ │ ├── alg.py
│ │ │ ├── alg_tf.py
│ │ │ ├── biaffine/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── biaffine.py
│ │ │ │ ├── biaffine_2nd_dep.py
│ │ │ │ ├── biaffine_dep.py
│ │ │ │ ├── biaffine_model.py
│ │ │ │ ├── biaffine_sdp.py
│ │ │ │ ├── mlp.py
│ │ │ │ ├── structual_attention.py
│ │ │ │ └── variationalbilstm.py
│ │ │ ├── biaffine_parser_tf.py
│ │ │ ├── biaffine_tf/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── alg.py
│ │ │ │ ├── layers.py
│ │ │ │ └── model.py
│ │ │ ├── chu_liu_edmonds.py
│ │ │ ├── conll.py
│ │ │ ├── constituency/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── crf_constituency_model.py
│ │ │ │ ├── crf_constituency_parser.py
│ │ │ │ └── treecrf.py
│ │ │ ├── parse_alg.py
│ │ │ └── ud/
│ │ │ ├── __init__.py
│ │ │ ├── lemma_edit.py
│ │ │ ├── tag_decoder.py
│ │ │ ├── ud_model.py
│ │ │ ├── ud_parser.py
│ │ │ ├── udify_util.py
│ │ │ └── util.py
│ │ ├── pipeline.py
│ │ ├── rnn_language_model_tf.py
│ │ ├── srl/
│ │ │ ├── __init__.py
│ │ │ ├── span_bio/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── baffine_tagging.py
│ │ │ │ └── span_bio.py
│ │ │ └── span_rank/
│ │ │ ├── __init__.py
│ │ │ ├── highway_variational_lstm.py
│ │ │ ├── inference_utils.py
│ │ │ ├── layer.py
│ │ │ ├── span_rank.py
│ │ │ ├── span_ranking_srl_model.py
│ │ │ ├── srl_eval_utils.py
│ │ │ └── util.py
│ │ ├── sts/
│ │ │ ├── __init__.py
│ │ │ └── transformer_sts.py
│ │ ├── taggers/
│ │ │ ├── __init__.py
│ │ │ ├── cnn_tagger_tf.py
│ │ │ ├── ngram_conv/
│ │ │ │ ├── __init__.py
│ │ │ │ └── ngram_conv_tagger.py
│ │ │ ├── pos_tf.py
│ │ │ ├── rnn/
│ │ │ │ ├── __init__.py
│ │ │ │ └── rnntaggingmodel.py
│ │ │ ├── rnn_tagger.py
│ │ │ ├── rnn_tagger_tf.py
│ │ │ ├── tagger.py
│ │ │ ├── tagger_tf.py
│ │ │ ├── transformers/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── metrics_tf.py
│ │ │ │ ├── transformer_tagger.py
│ │ │ │ ├── transformer_tagger_tf.py
│ │ │ │ └── transformer_transform_tf.py
│ │ │ └── util.py
│ │ └── tokenizers/
│ │ ├── __init__.py
│ │ ├── multi_criteria_cws_transformer.py
│ │ ├── tok.py
│ │ ├── tok_tf.py
│ │ └── transformer.py
│ ├── datasets/
│ │ ├── __init__.py
│ │ ├── classification/
│ │ │ ├── __init__.py
│ │ │ └── sentiment.py
│ │ ├── coref/
│ │ │ ├── __init__.py
│ │ │ └── loaders/
│ │ │ ├── __init__.py
│ │ │ └── conll12coref.py
│ │ ├── eos/
│ │ │ ├── __init__.py
│ │ │ ├── eos.py
│ │ │ └── loaders/
│ │ │ ├── __init__.py
│ │ │ └── nn_eos.py
│ │ ├── lm/
│ │ │ ├── __init__.py
│ │ │ └── loaders/
│ │ │ ├── __init__.py
│ │ │ └── lm_dataset.py
│ │ ├── lu/
│ │ │ ├── __init__.py
│ │ │ └── glue.py
│ │ ├── ner/
│ │ │ ├── __init__.py
│ │ │ ├── conll03.py
│ │ │ ├── loaders/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── json_ner.py
│ │ │ │ └── tsv.py
│ │ │ ├── msra.py
│ │ │ ├── resume.py
│ │ │ └── weibo.py
│ │ ├── parsing/
│ │ │ ├── __init__.py
│ │ │ ├── amr.py
│ │ │ ├── ctb5.py
│ │ │ ├── ctb7.py
│ │ │ ├── ctb8.py
│ │ │ ├── ctb9.py
│ │ │ ├── loaders/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── _ctb_utils.py
│ │ │ │ ├── conll_dataset.py
│ │ │ │ └── constituency_dataset.py
│ │ │ ├── pmt1.py
│ │ │ ├── ptb.py
│ │ │ ├── semeval15.py
│ │ │ ├── semeval16.py
│ │ │ └── ud/
│ │ │ ├── __init__.py
│ │ │ ├── ud210.py
│ │ │ ├── ud210m.py
│ │ │ ├── ud23.py
│ │ │ ├── ud23m.py
│ │ │ ├── ud27.py
│ │ │ └── ud27m.py
│ │ ├── pos/
│ │ │ ├── __init__.py
│ │ │ └── ctb5.py
│ │ ├── qa/
│ │ │ ├── __init__.py
│ │ │ └── hotpotqa.py
│ │ ├── srl/
│ │ │ ├── __init__.py
│ │ │ ├── loaders/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── conll2012.py
│ │ │ │ └── ontonotes_loader.py
│ │ │ └── ontonotes5/
│ │ │ ├── __init__.py
│ │ │ ├── _utils.py
│ │ │ ├── chinese.py
│ │ │ └── english.py
│ │ ├── sts/
│ │ │ ├── __init__.py
│ │ │ └── stsb.py
│ │ └── tokenization/
│ │ ├── __init__.py
│ │ ├── ctb6.py
│ │ ├── loaders/
│ │ │ ├── __init__.py
│ │ │ ├── chunking_dataset.py
│ │ │ ├── multi_criteria_cws/
│ │ │ │ ├── __init__.py
│ │ │ │ └── mcws_dataset.py
│ │ │ └── txt.py
│ │ └── sighan2005/
│ │ ├── __init__.py
│ │ ├── as_.py
│ │ ├── cityu.py
│ │ ├── msr.py
│ │ └── pku.py
│ ├── layers/
│ │ ├── __init__.py
│ │ ├── cnn_encoder.py
│ │ ├── crf/
│ │ │ ├── __init__.py
│ │ │ ├── crf.py
│ │ │ ├── crf_layer_tf.py
│ │ │ └── crf_tf.py
│ │ ├── dropout.py
│ │ ├── embeddings/
│ │ │ ├── __init__.py
│ │ │ ├── char_cnn.py
│ │ │ ├── char_cnn_tf.py
│ │ │ ├── char_rnn.py
│ │ │ ├── char_rnn_tf.py
│ │ │ ├── concat_embedding.py
│ │ │ ├── contextual_string_embedding.py
│ │ │ ├── contextual_string_embedding_tf.py
│ │ │ ├── contextual_word_embedding.py
│ │ │ ├── embedding.py
│ │ │ ├── fast_text.py
│ │ │ ├── fast_text_tf.py
│ │ │ ├── util.py
│ │ │ ├── util_tf.py
│ │ │ ├── word2vec.py
│ │ │ └── word2vec_tf.py
│ │ ├── feed_forward.py
│ │ ├── feedforward.py
│ │ ├── scalar_mix.py
│ │ ├── time_distributed.py
│ │ ├── transformers/
│ │ │ ├── __init__.py
│ │ │ ├── encoder.py
│ │ │ ├── loader_tf.py
│ │ │ ├── pt_imports.py
│ │ │ ├── relative_transformer.py
│ │ │ ├── resource.py
│ │ │ ├── tf_imports.py
│ │ │ ├── utils.py
│ │ │ └── utils_tf.py
│ │ └── weight_normalization.py
│ ├── losses/
│ │ ├── __init__.py
│ │ └── sparse_categorical_crossentropy.py
│ ├── metrics/
│ │ ├── __init__.py
│ │ ├── accuracy.py
│ │ ├── amr/
│ │ │ ├── __init__.py
│ │ │ └── smatch_eval.py
│ │ ├── chunking/
│ │ │ ├── __init__.py
│ │ │ ├── binary_chunking_f1.py
│ │ │ ├── bmes_tf.py
│ │ │ ├── chunking_f1.py
│ │ │ ├── chunking_f1_tf.py
│ │ │ ├── conlleval.py
│ │ │ ├── iobes_tf.py
│ │ │ └── sequence_labeling.py
│ │ ├── f1.py
│ │ ├── metric.py
│ │ ├── mtl.py
│ │ ├── parsing/
│ │ │ ├── __init__.py
│ │ │ ├── attachmentscore.py
│ │ │ ├── conllx_eval.py
│ │ │ ├── labeled_f1.py
│ │ │ ├── labeled_f1_tf.py
│ │ │ ├── labeled_score.py
│ │ │ ├── semdep_eval.py
│ │ │ └── span.py
│ │ ├── spearman_correlation.py
│ │ └── srl/
│ │ ├── __init__.py
│ │ └── srlconll.py
│ ├── optimizers/
│ │ ├── __init__.py
│ │ └── adamw/
│ │ ├── __init__.py
│ │ └── optimization.py
│ ├── pretrained/
│ │ ├── __init__.py
│ │ ├── amr.py
│ │ ├── amr2text.py
│ │ ├── classifiers.py
│ │ ├── constituency.py
│ │ ├── dep.py
│ │ ├── eos.py
│ │ ├── fasttext.py
│ │ ├── glove.py
│ │ ├── mtl.py
│ │ ├── ner.py
│ │ ├── pos.py
│ │ ├── rnnlm.py
│ │ ├── sdp.py
│ │ ├── srl.py
│ │ ├── sts.py
│ │ ├── tok.py
│ │ └── word2vec.py
│ ├── transform/
│ │ ├── __init__.py
│ │ ├── conll_tf.py
│ │ ├── glue_tf.py
│ │ ├── table_tf.py
│ │ ├── tacred_tf.py
│ │ ├── text_tf.py
│ │ ├── transformer_tokenizer.py
│ │ ├── tsv_tf.py
│ │ └── txt_tf.py
│ ├── utils/
│ │ ├── __init__.py
│ │ ├── component_util.py
│ │ ├── file_read_backwards/
│ │ │ ├── __init__.py
│ │ │ ├── buffer_work_space.py
│ │ │ └── file_read_backwards.py
│ │ ├── init_util.py
│ │ ├── io_util.py
│ │ ├── lang/
│ │ │ ├── __init__.py
│ │ │ ├── en/
│ │ │ │ ├── __init__.py
│ │ │ │ └── english_tokenizer.py
│ │ │ ├── ja/
│ │ │ │ ├── __init__.py
│ │ │ │ └── bert_tok.py
│ │ │ └── zh/
│ │ │ ├── __init__.py
│ │ │ ├── char_table.py
│ │ │ └── localization.py
│ │ ├── log_util.py
│ │ ├── rules.py
│ │ ├── span_util.py
│ │ ├── string_util.py
│ │ ├── tf_util.py
│ │ ├── time_util.py
│ │ └── torch_util.py
│ └── version.py
├── plugins/
│ ├── README.md
│ ├── hanlp_common/
│ │ ├── README.md
│ │ ├── __init__.py
│ │ ├── hanlp_common/
│ │ │ ├── __init__.py
│ │ │ ├── amr.py
│ │ │ ├── configurable.py
│ │ │ ├── conll.py
│ │ │ ├── constant.py
│ │ │ ├── document.py
│ │ │ ├── io.py
│ │ │ ├── reflection.py
│ │ │ ├── structure.py
│ │ │ ├── util.py
│ │ │ └── visualization.py
│ │ └── setup.py
│ ├── hanlp_demo/
│ │ ├── README.md
│ │ ├── hanlp_demo/
│ │ │ ├── __init__.py
│ │ │ ├── block_windows.py
│ │ │ ├── en/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── demo_amr.py
│ │ │ │ ├── demo_dep.py
│ │ │ │ ├── demo_lm.py
│ │ │ │ ├── demo_ner.py
│ │ │ │ ├── demo_pipeline.py
│ │ │ │ ├── demo_pos.py
│ │ │ │ ├── demo_sdp.py
│ │ │ │ ├── demo_sentiment_analysis.py
│ │ │ │ ├── demo_tok.py
│ │ │ │ └── train_sst2_albert_base.py
│ │ │ ├── ja/
│ │ │ │ ├── __init__.py
│ │ │ │ └── demo_mtl.py
│ │ │ ├── mul/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── demo_lid.py
│ │ │ │ ├── demo_lid_restful.py
│ │ │ │ ├── demo_mtl.py
│ │ │ │ └── train/
│ │ │ │ ├── __init__.py
│ │ │ │ └── mul_base.py
│ │ │ ├── sent_split.py
│ │ │ └── zh/
│ │ │ ├── __init__.py
│ │ │ ├── abstractive_summarization_restful.ipynb
│ │ │ ├── amr_restful.ipynb
│ │ │ ├── amr_stl.ipynb
│ │ │ ├── classification_restful.ipynb
│ │ │ ├── con_mtl.ipynb
│ │ │ ├── con_restful.ipynb
│ │ │ ├── con_stl.ipynb
│ │ │ ├── cor_restful.ipynb
│ │ │ ├── demo_amr.py
│ │ │ ├── demo_custom_dict.py
│ │ │ ├── demo_custom_dict_stl.py
│ │ │ ├── demo_del_tasks.py
│ │ │ ├── demo_document.py
│ │ │ ├── demo_mlm.py
│ │ │ ├── demo_mtl.py
│ │ │ ├── demo_ner_dict.py
│ │ │ ├── demo_parse_constituency.py
│ │ │ ├── demo_pipeline.py
│ │ │ ├── demo_pos_dict.py
│ │ │ ├── demo_sts.py
│ │ │ ├── demo_word2vec.py
│ │ │ ├── dep_mtl.ipynb
│ │ │ ├── dep_restful.ipynb
│ │ │ ├── dep_stl.ipynb
│ │ │ ├── extractive_summarization_restful.ipynb
│ │ │ ├── gec_restful.ipynb
│ │ │ ├── keyphrase_restful.ipynb
│ │ │ ├── lid_restful.ipynb
│ │ │ ├── lid_stl.ipynb
│ │ │ ├── ner_mtl.ipynb
│ │ │ ├── ner_restful.ipynb
│ │ │ ├── ner_stl.ipynb
│ │ │ ├── pos_mtl.ipynb
│ │ │ ├── pos_restful.ipynb
│ │ │ ├── pos_stl.ipynb
│ │ │ ├── sdp_mtl.ipynb
│ │ │ ├── sdp_restful.ipynb
│ │ │ ├── sdp_stl.ipynb
│ │ │ ├── sentiment_restful.ipynb
│ │ │ ├── srl_mtl.ipynb
│ │ │ ├── srl_restful.ipynb
│ │ │ ├── srl_stl.ipynb
│ │ │ ├── sts_restful.ipynb
│ │ │ ├── sts_stl.ipynb
│ │ │ ├── tf/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── demo_classifier.py
│ │ │ │ ├── demo_client.py
│ │ │ │ ├── demo_cws.py
│ │ │ │ ├── demo_cws_trie.py
│ │ │ │ ├── demo_dep.py
│ │ │ │ ├── demo_fasttext.py
│ │ │ │ ├── demo_multiprocess.py
│ │ │ │ ├── demo_ner.py
│ │ │ │ ├── demo_pipeline.py
│ │ │ │ ├── demo_pos.py
│ │ │ │ ├── demo_sdp.py
│ │ │ │ ├── demo_serving.py
│ │ │ │ └── train/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── cws/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── train_ctb6_cws_albert.py
│ │ │ │ │ ├── train_ctb6_cws_bert.py
│ │ │ │ │ ├── train_ctb6_cws_convseg.py
│ │ │ │ │ ├── train_large_bert_cws.py
│ │ │ │ │ ├── train_large_conv_cws.py
│ │ │ │ │ ├── train_large_cws_albert.py
│ │ │ │ │ ├── train_large_cws_electra.py
│ │ │ │ │ ├── train_large_rnn_cws.py
│ │ │ │ │ ├── train_msr_cws_albert.py
│ │ │ │ │ ├── train_msr_cws_bert.py
│ │ │ │ │ ├── train_msr_cws_ngram_conv.py
│ │ │ │ │ ├── train_msr_cws_ngram_conv_embed.py
│ │ │ │ │ ├── train_pku980106_conv_cws.py
│ │ │ │ │ ├── train_pku980106_rnn_cws.py
│ │ │ │ │ └── train_pku_conv_cws.py
│ │ │ │ ├── finetune_msra_ner_albert.py
│ │ │ │ ├── train_chnsenticorp_bert.py
│ │ │ │ ├── train_conll03_ner_bert.py
│ │ │ │ ├── train_conll03_ner_flair.py
│ │ │ │ ├── train_ctb5_dep.py
│ │ │ │ ├── train_ctb5_pos_rnn.py
│ │ │ │ ├── train_ctb7_dep.py
│ │ │ │ ├── train_ctb9_pos_albert.py
│ │ │ │ ├── train_ctb9_pos_electra.py
│ │ │ │ ├── train_msra_ner_albert.py
│ │ │ │ ├── train_msra_ner_bert.py
│ │ │ │ ├── train_msra_ner_electra.py
│ │ │ │ ├── train_msra_ner_ngram_conv.py
│ │ │ │ ├── train_msra_ner_rnn.py
│ │ │ │ ├── train_ptb_dep_biaffine_albert.py
│ │ │ │ ├── train_ptb_dep_biaffine_bert.py
│ │ │ │ ├── train_ptb_dep_biaffine_bert_96.6.py
│ │ │ │ ├── train_ptb_dep_biaffine_bert_positional.py
│ │ │ │ ├── train_ptb_dep_sa_albert.py
│ │ │ │ ├── train_ptb_dep_sa_albert_topk.py
│ │ │ │ ├── train_ptb_dep_sa_bert.py
│ │ │ │ ├── train_ptb_dep_sa_pos_bert.py
│ │ │ │ ├── train_ptb_pos_rnn_fasttext.py
│ │ │ │ ├── train_semeval15_dm.py
│ │ │ │ ├── train_semeval15_pas.py
│ │ │ │ ├── train_semeval15_psd.py
│ │ │ │ ├── train_semeval16_news.py
│ │ │ │ └── train_semeval16_text.py
│ │ │ ├── tok_mtl.ipynb
│ │ │ ├── tok_restful.ipynb
│ │ │ ├── tok_stl.ipynb
│ │ │ ├── train/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── finetune_ner.py
│ │ │ │ ├── open_base.py
│ │ │ │ └── open_small.py
│ │ │ ├── train_sota_bert_pku.py
│ │ │ ├── tst_restful.ipynb
│ │ │ └── tutorial.ipynb
│ │ └── setup.py
│ ├── hanlp_restful/
│ │ ├── README.md
│ │ ├── hanlp_restful/
│ │ │ └── __init__.py
│ │ ├── setup.py
│ │ └── tests/
│ │ ├── __init__.py
│ │ └── test_client.py
│ ├── hanlp_restful_golang/
│ │ └── README.md
│ ├── hanlp_restful_java/
│ │ ├── pom.xml
│ │ └── src/
│ │ ├── main/
│ │ │ └── java/
│ │ │ └── com/
│ │ │ └── hankcs/
│ │ │ └── hanlp/
│ │ │ └── restful/
│ │ │ ├── BaseInput.java
│ │ │ ├── CoreferenceResolutionOutput.java
│ │ │ ├── DocumentInput.java
│ │ │ ├── HanLPClient.java
│ │ │ ├── SentenceInput.java
│ │ │ ├── Span.java
│ │ │ ├── TokenInput.java
│ │ │ └── mrp/
│ │ │ ├── Anchor.java
│ │ │ ├── Edge.java
│ │ │ ├── MeaningRepresentation.java
│ │ │ └── Node.java
│ │ └── test/
│ │ └── java/
│ │ └── com/
│ │ └── hankcs/
│ │ └── hanlp/
│ │ └── restful/
│ │ ├── HanLPClientTest.java
│ │ └── MeaningRepresentationTest.java
│ └── hanlp_trie/
│ ├── README.md
│ ├── hanlp_trie/
│ │ ├── __init__.py
│ │ ├── dictionary.py
│ │ └── trie.py
│ ├── setup.py
│ └── tests/
│ ├── __init__.py
│ ├── test_trie.py
│ └── test_trie_dict.py
├── setup.py
└── tests/
├── __init__.py
├── test_config_tracker.py
├── test_mtl.py
├── test_pipeline.py
├── test_rules.py
└── test_string_util.py
================================================
FILE CONTENTS
================================================
================================================
FILE: .github/ISSUE_TEMPLATE/bug_report.md
================================================
---
name: 🐛 Found a bug
about: Please provide your version number, the code that triggers the bug, and error logs
title: ''
labels: bug
assignees: hankcs
---
**Describe the bug**
A clear and concise description of what the bug is.
**Code to reproduce the issue**
Provide a reproducible test case that is the bare minimum necessary to generate the problem.
```python
```
**Describe the current behavior**
A clear and concise description of what happened.
**Expected behavior**
A clear and concise description of what you expected to happen.
**System information**
- OS Platform and Distribution (e.g., Linux Ubuntu 16.04):
- Python version:
- HanLP version:
**Other info / logs**
Include any logs or source code that would be helpful to diagnose the problem. If including tracebacks, please include the full traceback. Large logs and files should be attached.
* [ ] I've completed this form and searched the web for solutions.
================================================
FILE: .github/ISSUE_TEMPLATE/config.yml
================================================
blank_issues_enabled: false
contact_links:
- name: ⁉️ For questions and help, please use the forum
  url: https://bbs.hankcs.com/
  about: You are welcome to seek help on the Butterfly Effect forum
================================================
FILE: .github/ISSUE_TEMPLATE/feature_request.md
================================================
---
name: 🚀 Feature request
about: Suggest a new feature
title: ''
labels: feature request
assignees: hankcs
---
**Describe the feature and the current behavior/state.**
**Will this change the current API? How?**
**Who will benefit from this feature?**
**Are you willing to contribute it (Yes/No):**
**System information**
- OS Platform and Distribution (e.g., Linux Ubuntu 16.04):
- Python version:
- HanLP version:
**Any other info**
* [ ] I've carefully completed this form.
================================================
FILE: .github/pull_request_template.md
================================================
# Title of Your Pull Request
## Description
Please include a summary of the change and which issue is fixed. Please also include relevant motivation and context. List any dependencies that are required for this change.
Fixes # (issue)
## Type of Change
Please check any relevant options and delete the rest.
- [ ] Bug fix (non-breaking change which fixes an issue)
- [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected)
- [ ] New feature (non-breaking change which adds functionality)
- [ ] This change requires a documentation update
## How Has This Been Tested?
Please describe the tests that you ran to verify your changes. Provide instructions so we can reproduce. Please also list any relevant details of your test configuration.
## Checklist
Check all items that apply.
- [ ] ⚠️Changes **must** be made on `dev` branch instead of `master`
- [ ] I have added tests that prove my fix is effective or that my feature works
- [ ] New and existing unit tests pass locally with my changes
- [ ] My code follows the style guidelines of this project
- [ ] I have commented my code, particularly in hard-to-understand areas
- [ ] I have made corresponding changes to the documentation
- [ ] My changes generate no new warnings
- [ ] I have checked my code and corrected any misspellings
================================================
FILE: .github/workflows/unit-tests.yml
================================================
name: Unit Tests
on:
push:
branches: [ "**" ]
pull_request:
branches: [ "**" ]
jobs:
build:
runs-on: ${{ matrix.os }}
env:
HANLP_HOME: ${{ github.workspace }}/data
strategy:
fail-fast: false
matrix:
os: [ ubuntu-latest, macos-latest, windows-latest ]
python-version: [ 3.6, 3.7, 3.8, 3.9, '3.10' ]
exclude:
# GHA doesn't list 3.6 for ubuntu-22.04
- os: ubuntu-latest
python-version: "3.6"
# MacOS 14.4.1 for arm64 doesn't support Python < 3.8
- os: macos-latest
python-version: "3.6"
- os: macos-latest
python-version: "3.7"
include:
# GHA doesn't list 3.6 for ubuntu-22
- os: ubuntu-20.04
python-version: "3.6"
# MacOS 13 required for Python < 3.8
- os: macos-13
python-version: "3.6"
- os: macos-13
python-version: "3.7"
steps:
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v3
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
shell: bash
run: |
python -m pip install -e plugins/hanlp_trie
python -m pip install -e plugins/hanlp_common
python -m pip install -e .
python -m pip install pytest
- name: Cache data
uses: actions/cache@v3
with:
path: ${{ env.HANLP_HOME }}
key: hanlp-data
- name: Test with pytest
shell: bash
run: |
pytest tests
pytest plugins/hanlp_trie/tests
deploy:
needs: build
if: github.event_name == 'push' && github.ref == 'refs/heads/master'
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: Install dependencies
run: |
python -m pip install setuptools wheel twine
- name: Deploy to PyPI
run: |
python setup.py sdist bdist_wheel
python -m twine upload dist/*
env:
TWINE_USERNAME: __token__
TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
TWINE_REPOSITORY: pypi
================================================
FILE: .gitignore
================================================
# Created by .ignore support plugin (hsz.mobi)
### Python template
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
### Java template
# Compiled class file
*.class
# Log file
# BlueJ files
*.ctxt
# Mobile Tools for Java (J2ME)
.mtj.tmp/
# Package Files #
*.jar
*.war
*.nar
*.ear
*.zip
*.tar.gz
*.rar
# virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
hs_err_pid*
### Eclipse template
.metadata
bin/
tmp/
*.tmp
*.bak
*.swp
*~.nib
local.properties
.settings/
.loadpath
.recommenders
# External tool builders
.externalToolBuilders/
# Locally stored "Eclipse launch configurations"
*.launch
# PyDev specific (Python IDE for Eclipse)
*.pydevproject
# CDT-specific (C/C++ Development Tooling)
.cproject
# CDT- autotools
.autotools
# Java annotation processor (APT)
.factorypath
# PDT-specific (PHP Development Tools)
.buildpath
# sbteclipse plugin
.target
# Tern plugin
.tern-project
# TeXlipse plugin
.texlipse
# STS (Spring Tool Suite)
.springBeans
# Code Recommenders
.recommenders/
# Annotation Processing
.apt_generated/
# Scala IDE specific (Scala & Java development for Eclipse)
.cache-main
.scala_dependencies
.worksheet
### VisualStudioCode template
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
### JetBrains template
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
# User-specific stuff
.idea/**/workspace.xml
.idea/**/tasks.xml
.idea/**/usage.statistics.xml
.idea/**/dictionaries
.idea/**/shelf
# Generated files
.idea/**/contentModel.xml
# Sensitive or high-churn files
.idea/**/dataSources/
.idea/**/dataSources.ids
.idea/**/dataSources.local.xml
.idea/**/sqlDataSources.xml
.idea/**/dynamic.xml
.idea/**/uiDesigner.xml
.idea/**/dbnavigator.xml
# Gradle
.idea/**/gradle.xml
.idea/**/libraries
# Gradle and Maven with auto-import
# When using Gradle or Maven with auto-import, you should exclude module files,
# since they will be recreated, and may cause churn. Uncomment if using
# auto-import.
# .idea/modules.xml
# .idea/*.iml
# .idea/modules
# *.iml
# *.ipr
# CMake
cmake-build-*/
# Mongo Explorer plugin
.idea/**/mongoSettings.xml
# File-based project format
*.iws
# IntelliJ
out/
# mpeltonen/sbt-idea plugin
.idea_modules/
# JIRA plugin
atlassian-ide-plugin.xml
# Cursive Clojure plugin
.idea/replstate.xml
# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties
# Editor-based Rest HanLPClient
.idea/httpRequests
# Android studio 3.1+ serialized cache file
.idea/caches/build_file_checksums.ser
.idea
*.iml
data
.vscode
*.pkl
*.pdf
_static/
_build/
_templates/
================================================
FILE: CITATION.cff
================================================
cff-version: 1.2.0
message: "If you use this software, please cite it as below."
authors:
- family-names: He
given-names: Han
orcid: "https://orcid.org/0009-0005-1778-917X"
title: "HanLP: Han Language Processing"
version: 2.1
date-released: 2015-05-27
url: "https://github.com/hankcs/HanLP"
preferred-citation:
type: conference-paper
authors:
- family-names: He
given-names: Han
- family-names: Choi
given-names: Jinho D.
title: "The Stem Cell Hypothesis: Dilemma behind Multi-Task Learning with Transformer Encoders"
editors:
- family-names: Moens
given-names: Marie-Francine
- family-names: Huang
given-names: Xuanjing
- family-names: Specia
given-names: Lucia
- family-names: Yih
given-names: Scott Wen-tau
year: 2021
month: 11
date-released: 2021-11
conference:
name: "2021 Conference on Empirical Methods in Natural Language Processing"
place: "Online and Punta Cana, Dominican Republic"
url: "https://aclanthology.org/2021.emnlp-main.451"
doi: "10.18653/v1/2021.emnlp-main.451"
url: "https://aclanthology.org/2021.emnlp-main.451"
publisher: "Association for Computational Linguistics"
booktitle: "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing"
location: "Online and Punta Cana, Dominican Republic"
pages: "5555-5577"
================================================
FILE: LICENSE
================================================
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
================================================
FILE: README.md
================================================
# HanLP: Han Language Processing
A production-ready multilingual NLP toolkit built on the dual engines of PyTorch and TensorFlow 2.x, with the goal of bringing state-of-the-art NLP techniques into real-world use. HanLP features comprehensive functionality, high accuracy, efficient performance, up-to-date corpora, a clean architecture, and easy customization.
[Run the tutorial on Binder](https://mybinder.org/v2/gh/hankcs/HanLP/doc-zh?filepath=plugins%2Fhanlp_demo%2Fhanlp_demo%2Fzh%2Ftutorial.ipynb)
Leveraging the world's largest multilingual corpus, HanLP 2.1 supports 10 joint tasks and a variety of single tasks across [130 languages](https://hanlp.hankcs.com/docs/api/hanlp/pretrained/mtl.html#hanlp.pretrained.mtl.UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_MMINILMV2L6), including Simplified and Traditional Chinese, English, Japanese, Russian, French, and German. HanLP has pretrained dozens of models for more than a dozen tasks and keeps iterating on its corpora and models:
| Feature | RESTful | Multi-task | Single-task | Model | Annotation Standard |
| ------- | ------- | ---------- | ----------- | ----- | ------------------- |
| [Tokenization](https://hanlp.hankcs.com/demos/tok.html) | [tutorial](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/tok_restful.ipynb) | [tutorial](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/tok_mtl.ipynb) | [tutorial](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/tok_stl.ipynb) | [tok](https://hanlp.hankcs.com/docs/api/hanlp/pretrained/tok.html) | [Coarse](https://hanlp.hankcs.com/docs/annotations/tok/msr.html), [Fine](https://hanlp.hankcs.com/docs/annotations/tok/ctb.html) |
| [Part-of-Speech Tagging](https://hanlp.hankcs.com/demos/pos.html) | [tutorial](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/pos_restful.ipynb) | [tutorial](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/pos_mtl.ipynb) | [tutorial](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/pos_stl.ipynb) | [pos](https://hanlp.hankcs.com/docs/api/hanlp/pretrained/pos.html) | [CTB](https://hanlp.hankcs.com/docs/annotations/pos/ctb.html), [PKU](https://hanlp.hankcs.com/docs/annotations/pos/pku.html), [863](https://hanlp.hankcs.com/docs/annotations/pos/863.html) |
| [Named Entity Recognition](https://hanlp.hankcs.com/demos/ner.html) | [tutorial](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/ner_restful.ipynb) | [tutorial](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/ner_mtl.ipynb) | [tutorial](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/ner_stl.ipynb) | [ner](https://hanlp.hankcs.com/docs/api/hanlp/pretrained/ner.html) | [PKU](https://hanlp.hankcs.com/docs/annotations/ner/pku.html), [MSRA](https://hanlp.hankcs.com/docs/annotations/ner/msra.html), [OntoNotes](https://hanlp.hankcs.com/docs/annotations/ner/ontonotes.html) |
| [Dependency Parsing](https://hanlp.hankcs.com/demos/dep.html) | [tutorial](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/dep_restful.ipynb) | [tutorial](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/dep_mtl.ipynb) | [tutorial](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/dep_stl.ipynb) | [dep](https://hanlp.hankcs.com/docs/api/hanlp/pretrained/dep.html) | [SD](https://hanlp.hankcs.com/docs/annotations/dep/sd_zh.html), [UD](https://hanlp.hankcs.com/docs/annotations/dep/ud.html#chinese), [PMT](https://hanlp.hankcs.com/docs/annotations/dep/pmt.html) |
| [Constituency Parsing](https://hanlp.hankcs.com/demos/con.html) | [tutorial](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/con_restful.ipynb) | [tutorial](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/con_mtl.ipynb) | [tutorial](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/con_stl.ipynb) | [con](https://hanlp.hankcs.com/docs/api/hanlp/pretrained/constituency.html) | [Chinese Tree Bank](https://hanlp.hankcs.com/docs/annotations/constituency/ctb.html) |
| [Semantic Dependency Parsing](https://hanlp.hankcs.com/demos/sdp.html) | [tutorial](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/sdp_restful.ipynb) | [tutorial](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/sdp_mtl.ipynb) | [tutorial](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/sdp_stl.ipynb) | [sdp](https://hanlp.hankcs.com/docs/api/hanlp/pretrained/sdp.html) | [CSDP](https://hanlp.hankcs.com/docs/annotations/sdp/semeval16.html#) |
| [Semantic Role Labeling](https://hanlp.hankcs.com/demos/srl.html) | [tutorial](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/srl_restful.ipynb) | [tutorial](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/srl_mtl.ipynb) | [tutorial](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/srl_stl.ipynb) | [srl](https://hanlp.hankcs.com/docs/api/hanlp/pretrained/srl.html) | [Chinese Proposition Bank](https://hanlp.hankcs.com/docs/annotations/srl/cpb.html) |
| [Abstract Meaning Representation](https://hanlp.hankcs.com/demos/amr.html) | [tutorial](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/amr_restful.ipynb) | N/A | [tutorial](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/amr_stl.ipynb) | [amr](https://hanlp.hankcs.com/docs/api/hanlp/pretrained/amr.html) | [CAMR](https://www.hankcs.com/nlp/corpus/introduction-to-chinese-abstract-meaning-representation.html) |
| [Coreference Resolution](https://hanlp.hankcs.com/demos/cor.html) | [tutorial](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/cor_restful.ipynb) | N/A | N/A | N/A | OntoNotes |
| [Semantic Textual Similarity](https://hanlp.hankcs.com/demos/sts.html) | [tutorial](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/sts_restful.ipynb) | N/A | [tutorial](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/sts_stl.ipynb) | [sts](https://hanlp.hankcs.com/docs/api/hanlp/pretrained/sts.html) | N/A |
| [Text Style Transfer](https://hanlp.hankcs.com/demos/tst.html) | [tutorial](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/tst_restful.ipynb) | N/A | N/A | N/A | N/A |
| [Keyphrase Extraction](https://hanlp.hankcs.com/demos/keyphrase.html) | [tutorial](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/keyphrase_restful.ipynb) | N/A | N/A | N/A | N/A |
| [Extractive Summarization](https://hanlp.hankcs.com/demos/exsum.html) | [tutorial](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/extractive_summarization_restful.ipynb) | N/A | N/A | N/A | N/A |
| [Abstractive Summarization](https://hanlp.hankcs.com/demos/absum.html) | [tutorial](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/abstractive_summarization_restful.ipynb) | N/A | N/A | N/A | N/A |
| [Grammatical Error Correction](https://hanlp.hankcs.com/demos/gec.html) | [tutorial](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/gec_restful.ipynb) | N/A | N/A | N/A | N/A |
| [Text Classification](https://hanlp.hankcs.com/demos/classification.html) | [tutorial](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/classification_restful.ipynb) | N/A | N/A | N/A | N/A |
| [Sentiment Analysis](https://hanlp.hankcs.com/demos/sentiment.html) | [tutorial](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/sentiment_restful.ipynb) | N/A | N/A | N/A | `[-1,+1]` |
| [Language Identification](https://hanlp.hankcs.com/demos/classification.html) | [tutorial](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/lid_restful.ipynb) | N/A | [tutorial](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/lid_stl.ipynb) | N/A | [ISO 639-1 codes](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) |
- For lemmatization and morphological feature extraction, see the [English tutorial](https://hanlp.hankcs.com/docs/tutorial.html); for [word embeddings](https://hanlp.hankcs.com/docs/api/hanlp/pretrained/word2vec.html) and [masked language modeling](https://hanlp.hankcs.com/docs/api/hanlp/pretrained/mlm.html), see the corresponding documentation.
- For Simplified-Traditional Chinese conversion, pinyin, new word discovery, and text clustering, see the [1.x tutorial](https://github.com/hankcs/HanLP/tree/1.x).
To fit different needs, HanLP provides two APIs, **RESTful** and **native**, targeting lightweight and massive-scale scenarios respectively. Across APIs and programming languages, HanLP interfaces stay semantically identical, and the code stays open source. If you use HanLP in your research, please cite our [EMNLP paper](https://aclanthology.org/2021.emnlp-main.451/).
### Lightweight RESTful API
Only a few KB in size, suitable for agile development, mobile apps, and similar scenarios. Simple to use: no GPU environment setup needed, installed in seconds. Backed by more corpora, larger models, and higher accuracy; **strongly recommended**. Server GPU capacity is limited and anonymous users get a small quota, so [applying for a **free** public-service API key `auth` is recommended](https://bbs.hanlp.com/t/hanlp2-1-restful-api/53).
#### Python
```shell
pip install hanlp_restful
```
Create a client with the server address and your API key:
```python
from hanlp_restful import HanLPClient
HanLP = HanLPClient('https://www.hanlp.com/api', auth=None, language='zh') # Leave auth=None for anonymous access; language='zh' for Chinese, 'mul' for multilingual
```
#### Golang
Install with `go get -u github.com/hankcs/gohanlp@main`, then create a client with the server address and your API key:
```go
HanLP := hanlp.HanLPClient(hanlp.WithAuth(""),hanlp.WithLanguage("zh")) // Leave auth empty for anonymous access; "zh" for Chinese, "mul" for multilingual
```
#### Java
Add the dependency to your `pom.xml`:
```xml
<dependency>
  <groupId>com.hankcs.hanlp.restful</groupId>
  <artifactId>hanlp-restful</artifactId>
  <version>0.0.12</version>
</dependency>
```
Create a client with the server address and your API key:
```java
HanLPClient HanLP = new HanLPClient("https://www.hanlp.com/api", null, "zh"); // Pass null for auth to stay anonymous; "zh" for Chinese, "mul" for multilingual
```
#### Quick Start
Regardless of the development language, call the `parse` interface with a document and get HanLP's accurate analysis results back.
```java
HanLP.parse("2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。阿婆主来到北京立方庭参观自然语义科技公司。")
```
For more features such as semantic similarity, style transfer, and coreference resolution, see the [documentation](https://hanlp.hankcs.com/docs/api/restful.html) and [test cases](https://github.com/hankcs/HanLP/blob/master/plugins/hanlp_restful/tests/test_client.py).
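The `parse` interface also accepts a `tasks` argument so that only the tasks you need are executed. A minimal Python sketch, assuming the task names listed in the output format section below:
```python
from hanlp_restful import HanLPClient

HanLP = HanLPClient('https://www.hanlp.com/api', auth=None, language='zh')
# Request only fine-grained tokenization and CTB part-of-speech tagging;
# fewer tasks means a faster response and a smaller quota cost.
doc = HanLP.parse('阿婆主来到北京立方庭参观自然语义科技公司。', tasks=['tok/fine', 'pos/ctb'])
print(doc['tok/fine'])  # one token list per sentence
```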
### Native API for Massive Data
Built on deep learning frameworks such as PyTorch and TensorFlow, suited to **professional** NLP engineers, researchers, and local massive-data scenarios. Requires Python 3.6 to 3.10; Windows is supported, though *nix is recommended. Runs on CPU, but GPU/TPU is recommended. To install the PyTorch flavor:
```bash
pip install hanlp
```
- Every HanLP release passes [unit tests](https://github.com/hankcs/HanLP/actions?query=branch%3Amaster) on Linux, macOS, and Windows with Python 3.6 to 3.10, so installation is trouble-free.
HanLP ships two kinds of models: multi-task models, which are faster and use less GPU memory, and single-task models, which are more accurate and more flexible.
#### Multi-task Models
The HanLP workflow is to load a model and then call it like a function, e.g., the following joint multi-task model:
```python
import hanlp
HanLP = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH) # trained on the world's largest Chinese corpus
HanLP(['2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', '阿婆主来到北京立方庭参观自然语义科技公司。'])
```
The native API takes sentences as its input unit, so split documents into sentences first with the [multilingual sentence segmentation model](https://github.com/hankcs/HanLP/blob/master/plugins/hanlp_demo/hanlp_demo/sent_split.py) or the [rule-based sentence splitting function](https://github.com/hankcs/HanLP/blob/master/hanlp/utils/rules.py#L19). The RESTful and native APIs share an identical semantic design, so users can swap between them seamlessly. The concise interface also supports flexible arguments; common tricks include (see the sketch after this list):
- Flexible `tasks` scheduling: the fewer the tasks, the faster the run; see the [tutorial](https://mybinder.org/v2/gh/hankcs/HanLP/doc-zh?filepath=plugins%2Fhanlp_demo%2Fhanlp_demo%2Fzh%2Ftutorial.ipynb) for details. When memory is limited, users can also [remove unneeded tasks](https://github.com/hankcs/HanLP/blob/master/plugins/hanlp_demo/hanlp_demo/zh/demo_del_tasks.py) to slim down the model.
- An efficient trie-based custom dictionary with three kinds of rules: forcing, combining, and correcting; see the [demo](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/tok_mtl.ipynb) and [documentation](https://hanlp.hankcs.com/docs/api/hanlp/components/tokenizers/transformer.html). The effects of the rule system carry over seamlessly to the downstream statistical models, enabling rapid adaptation to new domains.
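A minimal sketch of both tricks, reusing the multi-task model loaded above; the `tasks` argument and the tokenizer's `dict_force` attribute follow the tutorial and tokenizer documentation linked in the list:
```python
import hanlp

HanLP = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH)
# Trick 1: schedule only the tasks you need; prerequisites such as tok run automatically.
print(HanLP('阿婆主来到北京立方庭参观自然语义科技公司。', tasks='ner/msra'))
# Trick 2: attach a custom dictionary to the fine-grained tokenizer;
# dict_force makes the listed spans come out as single words.
HanLP['tok/fine'].dict_force = {'立方庭'}
print(HanLP('阿婆主来到北京立方庭参观自然语义科技公司。', tasks='tok/fine'))
```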
#### Single-task Models
According to our [latest research](https://aclanthology.org/2021.emnlp-main.451), the strength of multi-task learning lies in speed and GPU memory, while its accuracy often falls behind single-task models. HanLP therefore pretrained many single-task models and designed an elegant [pipeline pattern](https://hanlp.hankcs.com/docs/api/hanlp/components/pipeline.html#hanlp.components.pipeline.Pipeline) to assemble them.
```python
import hanlp
HanLP = hanlp.pipeline() \
.append(hanlp.utils.rules.split_sentence, output_key='sentences') \
.append(hanlp.load('FINE_ELECTRA_SMALL_ZH'), output_key='tok') \
.append(hanlp.load('CTB9_POS_ELECTRA_SMALL'), output_key='pos') \
.append(hanlp.load('MSRA_NER_ELECTRA_SMALL_ZH'), output_key='ner', input_key='tok') \
.append(hanlp.load('CTB9_DEP_ELECTRA_SMALL', conll=0), output_key='dep', input_key='tok')\
.append(hanlp.load('CTB9_CON_ELECTRA_SMALL'), output_key='con', input_key='tok')
HanLP('2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。阿婆主来到北京立方庭参观自然语义科技公司。')
```
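Each `output_key` above becomes a field of the resulting `Document`, and `input_key` selects which field a component consumes, so every intermediate result stays inspectable. A minimal sketch, assuming the pipeline `HanLP` built above:
```python
doc = HanLP('2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。')
print(doc['sentences'])  # output of the rule-based sentence splitter
print(doc['tok'])        # tokenizer output, consumed by the pos/ner/dep/con stages
```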
See the [demos](https://github.com/hankcs/HanLP/tree/doc-zh/plugins/hanlp_demo/hanlp_demo/zh) and [documentation](https://hanlp.hankcs.com/docs/api/hanlp/pretrained/index.html) for more models and usage.
### Output Format
Across all APIs, development languages, and natural languages, HanLP's output is uniformly a [`Document`](https://hanlp.hankcs.com/docs/api/common/document.html) in `json` format, compatible with `dict`:
```json
{
"tok/fine": [
["2021年", "HanLPv2.1", "为", "生产", "环境", "带来", "次", "世代", "最", "先进", "的", "多", "语种", "NLP", "技术", "。"],
["阿婆主", "来到", "北京", "立方庭", "参观", "自然", "语义", "科技", "公司", "。"]
],
"tok/coarse": [
["2021年", "HanLPv2.1", "为", "生产", "环境", "带来", "次世代", "最", "先进", "的", "多语种", "NLP", "技术", "。"],
["阿婆主", "来到", "北京立方庭", "参观", "自然语义科技公司", "。"]
],
"pos/ctb": [
["NT", "NR", "P", "NN", "NN", "VV", "JJ", "NN", "AD", "JJ", "DEG", "CD", "NN", "NR", "NN", "PU"],
["NN", "VV", "NR", "NR", "VV", "NN", "NN", "NN", "NN", "PU"]
],
"pos/pku": [
["t", "nx", "p", "vn", "n", "v", "b", "n", "d", "a", "u", "a", "n", "nx", "n", "w"],
["n", "v", "ns", "ns", "v", "n", "n", "n", "n", "w"]
],
"pos/863": [
["nt", "w", "p", "v", "n", "v", "a", "nt", "d", "a", "u", "a", "n", "ws", "n", "w"],
["n", "v", "ns", "n", "v", "n", "n", "n", "n", "w"]
],
"ner/pku": [
[],
[["北京立方庭", "ns", 2, 4], ["自然语义科技公司", "nt", 5, 9]]
],
"ner/msra": [
[["2021年", "DATE", 0, 1], ["HanLPv2.1", "ORGANIZATION", 1, 2]],
[["北京", "LOCATION", 2, 3], ["立方庭", "LOCATION", 3, 4], ["自然语义科技公司", "ORGANIZATION", 5, 9]]
],
"ner/ontonotes": [
[["2021年", "DATE", 0, 1], ["HanLPv2.1", "ORG", 1, 2]],
[["北京立方庭", "FAC", 2, 4], ["自然语义科技公司", "ORG", 5, 9]]
],
"srl": [
[[["2021年", "ARGM-TMP", 0, 1], ["HanLPv2.1", "ARG0", 1, 2], ["为生产环境", "ARG2", 2, 5], ["带来", "PRED", 5, 6], ["次世代最先进的多语种NLP技术", "ARG1", 6, 15]], [["最", "ARGM-ADV", 8, 9], ["先进", "PRED", 9, 10], ["技术", "ARG0", 14, 15]]],
[[["阿婆主", "ARG0", 0, 1], ["来到", "PRED", 1, 2], ["北京立方庭", "ARG1", 2, 4]], [["阿婆主", "ARG0", 0, 1], ["参观", "PRED", 4, 5], ["自然语义科技公司", "ARG1", 5, 9]]]
],
"dep": [
[[6, "tmod"], [6, "nsubj"], [6, "prep"], [5, "nn"], [3, "pobj"], [0, "root"], [8, "amod"], [15, "nn"], [10, "advmod"], [15, "rcmod"], [10, "assm"], [13, "nummod"], [15, "nn"], [15, "nn"], [6, "dobj"], [6, "punct"]],
[[2, "nsubj"], [0, "root"], [4, "nn"], [2, "dobj"], [2, "conj"], [9, "nn"], [9, "nn"], [9, "nn"], [5, "dobj"], [2, "punct"]]
],
"sdp": [
[[[6, "Time"]], [[6, "Exp"]], [[5, "mPrep"]], [[5, "Desc"]], [[6, "Datv"]], [[13, "dDesc"]], [[0, "Root"], [8, "Desc"], [13, "Desc"]], [[15, "Time"]], [[10, "mDegr"]], [[15, "Desc"]], [[10, "mAux"]], [[8, "Quan"], [13, "Quan"]], [[15, "Desc"]], [[15, "Nmod"]], [[6, "Pat"]], [[6, "mPunc"]]],
[[[2, "Agt"], [5, "Agt"]], [[0, "Root"]], [[4, "Loc"]], [[2, "Lfin"]], [[2, "ePurp"]], [[8, "Nmod"]], [[9, "Nmod"]], [[9, "Nmod"]], [[5, "Datv"]], [[5, "mPunc"]]]
],
"con": [
["TOP", [["IP", [["NP", [["NT", ["2021年"]]]], ["NP", [["NR", ["HanLPv2.1"]]]], ["VP", [["PP", [["P", ["为"]], ["NP", [["NN", ["生产"]], ["NN", ["环境"]]]]]], ["VP", [["VV", ["带来"]], ["NP", [["ADJP", [["NP", [["ADJP", [["JJ", ["次"]]]], ["NP", [["NN", ["世代"]]]]]], ["ADVP", [["AD", ["最"]]]], ["VP", [["JJ", ["先进"]]]]]], ["DEG", ["的"]], ["NP", [["QP", [["CD", ["多"]]]], ["NP", [["NN", ["语种"]]]]]], ["NP", [["NR", ["NLP"]], ["NN", ["技术"]]]]]]]]]], ["PU", ["。"]]]]]],
["TOP", [["IP", [["NP", [["NN", ["阿婆主"]]]], ["VP", [["VP", [["VV", ["来到"]], ["NP", [["NR", ["北京"]], ["NR", ["立方庭"]]]]]], ["VP", [["VV", ["参观"]], ["NP", [["NN", ["自然"]], ["NN", ["语义"]], ["NN", ["科技"]], ["NN", ["公司"]]]]]]]], ["PU", ["。"]]]]]]
]
}
```
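Since a `Document` is compatible with `dict`, the fields above can be traversed with plain Python. A minimal sketch, assuming `doc` holds the output shown above:
```python
# Pair each fine-grained token with its CTB part-of-speech tag, sentence by sentence.
for tokens, tags in zip(doc['tok/fine'], doc['pos/ctb']):
    print(list(zip(tokens, tags)))

# Each NER entry is (entity, label, begin, end), with token offsets into tok/fine.
for sentence in doc['ner/msra']:
    for entity, label, begin, end in sentence:
        print(entity, label, begin, end)
```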
In particular, the Python RESTful and native APIs support fixed-width-font [visualization](https://hanlp.hankcs.com/docs/tutorial.html#visualization), rendering linguistic structures directly in the console:
```python
HanLP(['2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', '阿婆主来到北京立方庭参观自然语义科技公司。']).pretty_print()
Dep Tree Token Relati PoS Tok NER Type Tok SRL PA1 Tok SRL PA2 Tok PoS 3 4 5 6 7 8 9
──────────── ───────── ────── ─── ───────── ──────────────── ───────── ──────────── ───────── ──────────── ───────── ─────────────────────────────────────────────────────────
┌─────────► 2021年 tmod NT 2021年 ───►DATE 2021年 ───►ARGM-TMP 2021年 2021年 NT ───────────────────────────────────────────►NP ───┐
│┌────────► HanLPv2.1 nsubj NR HanLPv2.1 ───►ORGANIZATION HanLPv2.1 ───►ARG0 HanLPv2.1 HanLPv2.1 NR ───────────────────────────────────────────►NP────┤
││┌─►┌───── 为 prep P 为 为 ◄─┐ 为 为 P ───────────┐ │
│││ │ ┌─► 生产 nn NN 生产 生产 ├►ARG2 生产 生产 NN ──┐ ├────────────────────────►PP ───┐ │
│││ └─►└── 环境 pobj NN 环境 环境 ◄─┘ 环境 环境 NN ──┴►NP ───┘ │ │
┌┼┴┴──────── 带来 root VV 带来 带来 ╟──►PRED 带来 带来 VV ──────────────────────────────────┐ │ │
││ ┌─► 次 amod JJ 次 次 ◄─┐ 次 次 JJ ───►ADJP──┐ │ ├►VP────┤
││ ┌───►└── 世代 nn NN 世代 世代 │ 世代 世代 NN ───►NP ───┴►NP ───┐ │ │ │
││ │ ┌─► 最 advmod AD 最 最 │ 最 ───►ARGM-ADV 最 AD ───────────►ADVP──┼►ADJP──┐ ├►VP ───┘ ├►IP
││ │┌──►├── 先进 rcmod JJ 先进 先进 │ 先进 ╟──►PRED 先进 JJ ───────────►VP ───┘ │ │ │
││ ││ └─► 的 assm DEG 的 的 ├►ARG1 的 的 DEG──────────────────────────┤ │ │
││ ││ ┌─► 多 nummod CD 多 多 │ 多 多 CD ───►QP ───┐ ├►NP ───┘ │
││ ││┌─►└── 语种 nn NN 语种 语种 │ 语种 语种 NN ───►NP ───┴────────►NP────┤ │
││ │││ ┌─► NLP nn NR NLP NLP │ NLP NLP NR ──┐ │ │
│└─►└┴┴──┴── 技术 dobj NN 技术 技术 ◄─┘ 技术 ───►ARG0 技术 NN ──┴────────────────►NP ───┘ │
└──────────► 。 punct PU 。 。 。 。 PU ──────────────────────────────────────────────────┘
Dep Tree Tok Relat Po Tok NER Type Tok SRL PA1 Tok SRL PA2 Tok Po 3 4 5 6
──────────── ─── ───── ── ─── ──────────────── ─── ──────── ─── ──────── ─── ────────────────────────────────
┌─► 阿婆主 nsubj NN 阿婆主 阿婆主 ───►ARG0 阿婆主 ───►ARG0 阿婆主 NN───────────────────►NP ───┐
┌┬────┬──┴── 来到 root VV 来到 来到 ╟──►PRED 来到 来到 VV──────────┐ │
││ │ ┌─► 北京 nn NR 北京 ───►LOCATION 北京 ◄─┐ 北京 北京 NR──┐ ├►VP ───┐ │
││ └─►└── 立方庭 dobj NR 立方庭 ───►LOCATION 立方庭 ◄─┴►ARG1 立方庭 立方庭 NR──┴►NP ───┘ │ │
│└─►┌─────── 参观 conj VV 参观 参观 参观 ╟──►PRED 参观 VV──────────┐ ├►VP────┤
│ │ ┌───► 自然 nn NN 自然 ◄─┐ 自然 自然 ◄─┐ 自然 NN──┐ │ │ ├►IP
│ │ │┌──► 语义 nn NN 语义 │ 语义 语义 │ 语义 NN │ ├►VP ───┘ │
│ │ ││┌─► 科技 nn NN 科技 ├►ORGANIZATION 科技 科技 ├►ARG1 科技 NN ├►NP ───┘ │
│ └─►└┴┴── 公司 dobj NN 公司 ◄─┘ 公司 公司 ◄─┘ 公司 NN──┘ │
└──────────► 。 punct PU 。 。 。 。 PU──────────────────────────┘
```
For the meaning of each tag set, please refer to the [Annotation Guidelines](https://hanlp.hankcs.com/docs/annotations/index.html) and the [Data Format](https://hanlp.hankcs.com/docs/data_format.html). We purchased, annotated, or adopted the largest and most diverse corpora in the world for joint multilingual multi-task learning, so HanLP's tag sets also offer the broadest coverage.
## Train Your Own Domain Models
Writing a deep learning model is not hard at all; what is hard is reproducing high accuracy. The following [code](https://github.com/hankcs/HanLP/blob/master/plugins/hanlp_demo/hanlp_demo/zh/train_sota_bert_pku.py) shows how to train a Chinese word segmentation model on the SIGHAN 2005 PKU corpus in 6 minutes that outperforms the academic state of the art.
```python
# Imports follow the linked demo script and HanLP 2.1's module layout.
from hanlp.common.dataset import SortingSamplerBuilder
from hanlp.components.tokenizers.transformer import TransformerTaggingTokenizer
from hanlp.datasets.tokenization.sighan2005.pku import SIGHAN2005_PKU_TRAIN_ALL, SIGHAN2005_PKU_TEST

tokenizer = TransformerTaggingTokenizer()
save_dir = 'data/model/cws/sighan2005_pku_bert_base_96.73'
tokenizer.fit(
    SIGHAN2005_PKU_TRAIN_ALL,
    SIGHAN2005_PKU_TEST,  # Conventionally, no devset is used. See Tian et al. (2020).
    save_dir,
    'bert-base-chinese',
    max_seq_len=300,
    char_level=True,
    hard_constraint=True,
    sampler_builder=SortingSamplerBuilder(batch_size=32),
    epochs=3,
    adam_epsilon=1e-6,
    warmup_steps=0.1,
    weight_decay=0.01,
    word_dropout=0.1,
    seed=1660853059,
)
tokenizer.evaluate(SIGHAN2005_PKU_TEST, save_dir)
```
Because the random seed is fixed, the result is guaranteed to be `96.73`. Unlike academic papers or commercial projects with inflated claims, HanLP guarantees that every score is reproducible. If you have any doubt, we will treat it as a critical bug of the highest priority and investigate it immediately.
Please refer to the [demos](https://github.com/hankcs/HanLP/tree/master/plugins/hanlp_demo/hanlp_demo/zh/train) for more training scripts.
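Once training finishes, the fitted component can be reloaded from `save_dir` and used for prediction. A minimal sketch, assuming the `TransformerTaggingTokenizer` API from the snippet above (the sample sentence and expected output are illustrative):

```python
tokenizer.load(save_dir)
print(tokenizer('商品和服务'))  # a list of tokens, e.g. ['商品', '和', '服务']
```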
## Performance
| lang | corpora | model | tok (fine) | tok (coarse) | pos (ctb) | pos (pku) | pos (863) | pos (ud) | ner (pku) | ner (msra) | ner (ontonotes) | dep | con | srl | sdp (SemEval16) | sdp (DM) | sdp (PAS) | sdp (PSD) | lem | fea | amr |
|------|---------|-------|------------|--------------|-----------|-----------|-----------|----------|-----------|------------|-----------------|-----|-----|-----|-----------------|----------|-----------|-----------|-----|-----|-----|
| mul | UD2.7, OntoNotes5 | small | 98.62 | - | - | - | - | 93.23 | - | - | 74.42 | 79.10 | 76.85 | 70.63 | - | 91.19 | 93.67 | 85.34 | 87.71 | 84.51 | - |
| mul | UD2.7, OntoNotes5 | base | 98.97 | - | - | - | - | 90.32 | - | - | 80.32 | 78.74 | 71.23 | 73.63 | - | 92.60 | 96.04 | 81.19 | 85.08 | 82.13 | - |
| zh | open | small | 97.25 | - | 96.66 | - | - | - | - | - | 95.00 | 84.57 | 87.62 | 73.40 | 84.57 | - | - | - | - | - | - |
| zh | open | base | 97.50 | - | 97.07 | - | - | - | - | - | 96.04 | 87.11 | 89.84 | 77.78 | 87.11 | - | - | - | - | - | - |
| zh | close | small | 96.70 | 95.93 | 96.87 | 97.56 | 95.05 | - | 96.22 | 95.74 | 76.79 | 84.44 | 88.13 | 75.81 | 74.28 | - | - | - | - | - | - |
| zh | close | base | 97.52 | 96.44 | 96.99 | 97.59 | 95.29 | - | 96.48 | 95.72 | 77.77 | 85.29 | 88.57 | 76.52 | 73.76 | - | - | - | - | - | - |
| zh | close | ernie | 96.95 | 97.29 | 96.76 | 97.64 | 95.22 | - | 97.31 | 96.47 | 77.95 | 85.67 | 89.17 | 78.51 | 74.10 | - | - | - | - | - | - |
- According to our [latest research](https://aclanthology.org/2021.emnlp-main.451), single-task learning often outperforms multi-task learning. If you care more about accuracy than speed, we recommend the [single-task models](https://hanlp.hankcs.com/docs/api/hanlp/pretrained/index.html); see the sketch below.
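For example, loading a single-task tokenizer takes one identifier from the pretrained model zoo. A minimal sketch, assuming `COARSE_ELECTRA_SMALL_ZH` is still published under `hanlp.pretrained.tok` in your HanLP version:

```python
import hanlp

# A single-task tokenizer: typically higher accuracy than its MTL counterpart.
tok = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH)
print(tok('阿婆主来到北京立方庭参观自然语义科技公司。'))
```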
HanLP's data preprocessing and splits do not necessarily follow the popular conventions. For example, HanLP uses the [full version of the MSRA NER corpus](https://bbs.hankcs.com/t/topic/3033) rather than the commonly used truncated one; HanLP adopts the [Stanford Dependencies standard](https://hanlp.hankcs.com/docs/annotations/dep/sd_zh.html), which covers more grammar, instead of the Zhang and Clark (2008) standard conventionally used in academia; and HanLP proposes an [even split of CTB](https://bbs.hankcs.com/t/topic/3024) instead of the academic split, which is uneven and leaves out 51 gold files. HanLP open-sources [a full suite of corpus preprocessing scripts together with the corresponding corpora](https://github.com/hankcs/HanLP/blob/master/plugins/hanlp_demo/hanlp_demo/zh/train/open_small.py), striving to make Chinese NLP transparent.
In short, HanLP does only what we believe to be correct and state of the art, not necessarily what is popular or authoritative.
## Citation
If you use HanLP in your research, please cite it as follows:
```bibtex
@inproceedings{he-choi-2021-stem,
title = "The Stem Cell Hypothesis: Dilemma behind Multi-Task Learning with Transformer Encoders",
author = "He, Han and Choi, Jinho D.",
booktitle = "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2021",
address = "Online and Punta Cana, Dominican Republic",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.emnlp-main.451",
pages = "5555--5577",
abstract = "Multi-task learning with transformer encoders (MTL) has emerged as a powerful technique to improve performance on closely-related tasks for both accuracy and efficiency while a question still remains whether or not it would perform as well on tasks that are distinct in nature. We first present MTL results on five NLP tasks, POS, NER, DEP, CON, and SRL, and depict its deficiency over single-task learning. We then conduct an extensive pruning analysis to show that a certain set of attention heads get claimed by most tasks during MTL, who interfere with one another to fine-tune those heads for their own objectives. Based on this finding, we propose the Stem Cell Hypothesis to reveal the existence of attention heads naturally talented for many tasks that cannot be jointly trained to create adequate embeddings for all of those tasks. Finally, we design novel parameter-free probes to justify our hypothesis and demonstrate how attention heads are transformed across the five tasks during MTL through label analysis.",
}
```
## License
### Source Code
The source code of HanLP is licensed under the **Apache License 2.0**, which is free for commercial use. Please include a link to HanLP and this license in your product description. HanLP is protected by copyright law; infringement will be prosecuted.
##### 自然语义(青岛)科技有限公司
Since v1.7, HanLP has operated independently, with 自然语义(青岛)科技有限公司 (Natural Semantics (Qingdao) Technology Co., Ltd.) serving as the project owner, leading the development of subsequent versions and holding their copyright.
##### 上海林原公司
In its early days, HanLP received strong support from 上海林原公司, which holds the copyright of v1.28 and earlier versions; those versions were also released on the company's website.
### Pre-trained Models
The licensing of machine learning models is not yet legally settled, but in the spirit of respecting the original licenses of the open-source corpora, unless otherwise stated, HanLP's multilingual models are licensed under [CC BY-NC-SA 4.0](https://creativecommons.org/licenses/by-nc-sa/4.0/), and its Chinese models are licensed for research and educational use only.
## References
https://hanlp.hankcs.com/docs/references.html
================================================
FILE: docs/Makefile
================================================
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SOURCEDIR = .
BUILDDIR = _build
# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
================================================
FILE: docs/annotations/constituency/ctb.md
================================================
# Chinese Tree Bank
See also [The Bracketing Guidelines for the Penn Chinese Treebank (3.0)](https://repository.upenn.edu/cgi/viewcontent.cgi?article=1040&context=ircs_reports).
| Tag | Definition | 定义 | 例子 |
|------|----------------------------------------------|----------------------------------------------------|-------------------|
| ADJP | adjective phrase | 形容词短语,以形容词为中心词 | 不完全、大型 |
| ADVP | adverbial phrase headed by AD (adverb) | 副词短语,以副词为中心词 | 非常、很 |
| CLP | classifier phrase | 由量词构成的短语 | 系列、大批 |
| CP | clause headed by C (complementizer) | 从句,通常带补语标记(如“的”、“吗”等) | 张三喜欢李四吗? |
| DNP | phrase formed by ‘‘XP + DEG’’ | 结构为XP + DEG(的)的短语,其中XP可以是ADJP、DP、QP、PP等等,用于修饰名词短语。 | 大型的、前几年的、五年的、在上海的 |
| DP | determiner phrase | 限定词短语,通常由限定词和数量词构成 | 这三个、任何 |
| DVP | phrase formed by ‘‘XP + DEV’’ | 结构为XP+地的短语,用于修饰动词短语VP | 心情失落地、大批地 |
| FRAG | fragment | 片段 | (完) |
| INTJ | interjection | 插话,感叹语 | 哈哈、切 |
| IP | simple clause headed by I (INFL) | 简单子句或句子,通常不带补语(如“的”、“吗”等) | 张三喜欢李四。 |
| LCP | phrase formed by ‘‘XP + LC’’ | 用于表示地点+方位词(LC)的短语 | 生活中、田野上 |
| LST | list marker | 列表短语,包括标点符号 | 一. |
| MSP | some particles | 其他小品词 | 所、而、来、去 |
| NN | common noun | 名词 | HanLP、技术 |
| NP | noun phrase | 名词短语,中心词通常为名词 | 美好生活、经济水平 |
| PP | preposition phrase | 介词短语,中心词通常为介词 | 在北京、据报道 |
| PRN | parenthetical | 插入语 | ,(张三说), |
| QP | quantifier phrase | 量词短语 | 三个、五百辆 |
| TOP | root node | 根节点 | 根节点 |
| UCP | unidentical coordination phrase | 不对称的并列短语,指并列词两侧的短语类型不一致 | (养老、医疗)保险 |
| VCD | coordinated verb compound | 复合动词 | 出版发行 |
| VCP | verb compounds formed by VV + VC | VV + VC形式的动词短语 | 看作是 |
| VNV | verb compounds formed by A-not-A or A-one-A | V不V形式的动词短语 | 能不能、信不信 |
| VP | verb phrase | 动词短语,中心词通常为动词 | 完成任务、努力工作 |
| VPT | potential form V-de-R or V-bu-R | V不R、V得R形式的动词短语 | 打不赢、打得过 |
| VRD | verb resultative compound | 动补结构短语 | 研制成功、降下来 |
| VSB | verb compounds formed by a modifier + a head | 修饰语+中心词构成的动词短语 | 拿来支付、仰头望去 |
================================================
FILE: docs/annotations/constituency/index.md
================================================
# Constituency Parsing
## Chinese
```{toctree}
ctb
```
## English
```{toctree}
ptb
```
## Japanese
```{toctree}
npcmj
```
================================================
FILE: docs/annotations/constituency/npcmj.md
================================================
# NPCMJ
| Tag | Description |
|-----------------|-----------------------------------------|
| ADVP | adverb phrase |
| ADVP-CMPL | complement adverb phrase |
| ADVP-MSR | measurement adverb phrase |
| ADVP-PRD | predicate adverb phrase |
| ADVP-TMP | temporal adverb phrase |
| CONJP | conjunction phrase |
| CP-EXL | exclamative |
| CP-IMP | imperative |
| CP-FINAL | projection for sentence final particle |
| CP-QUE | question (direct or indirect) |
| CP-QUE-ADV | question used adverbially |
| CP-QUE-OB1 | question used as object |
| CP-QUE-PRD | question used as a nominal predicate |
| CP-QUE-SBJ | question used as subject |
| CP-THT | complementizer clause |
| CP-THT-ADV | complementizer clause used adverbially |
| CP-THT-OB1 | complementizer clause used as object |
| CP-THT-PRD | complementizer clause used as predicate |
| CP-THT-PRP | purposive complementizer clause |
| CP-THT-SBJ | complementizer clause used as subject |
| FRAG | fragment |
| FS | false start |
| INTJP | interjection phrase |
| IP-ADV | adverbial clause |
| IP-ADV-CONJ | coordinated clause |
| IP-ADV-PRD | adverbial clause used as predicate |
| IP-ADV-SCON | subordinate clause |
| IP-ADV-SCON-CND | conditional clause |
| IP-EMB | gapless noun-modifying clause |
| IP-IMP | imperative clause |
| IP-MAT | matrix clause |
| IP-NMZ | nominalized clause |
| IP-NMZ-PRD | nominalized clause used as predicate |
| IP-REL | relative clause |
| IP-SMC | small clause |
| IP-SMC-CNT | small clause in continuative form |
| IP-SMC-OB1 | small clause used as object |
| IP-SMC-SBJ | small clause used as subject |
| IP-SUB | clause under CP* layer |
| multi-sentence | multiple sentence |
| NML | intermediate nominal layer |
| NP | noun phrase |
| NP-ADV | adverbial noun phrase |
| NP-CZZ | causee noun phrase |
| NP-DOB1 | derived primary object noun phrase |
| NP-DSBJ | derived subject noun phrase |
| NP-LGS | logical subject noun phrase |
| NP-LOC | locational noun phrase |
| NP-MSR | measure noun phrase |
| NP-OB1 | primary object noun phrase |
| NP-OB2 | secondary object noun phrase |
| NP-POS | possessive noun phrase |
| NP-PRD | predicate noun phrase |
| NP-SBJ | subject noun phrase |
| NP-SBJ2 | secondary subject noun phrase |
| NP-TMP | temporal noun phrase |
| NP-TPC | topic noun phrase |
| NP-VOC | vocative noun phrase |
| NUMCLP | numeral-classifier phrase |
| PNLP | prenominal phrase |
| PP | particle phrase |
| PP-ADV | adverbial particle phrase |
| PP-CMPL | complement particle phrase |
| PP-CONJ | coordination particle phrase |
| PP-CZZ | causee particle phrase |
| PP-DOB1 | derived primary object particle phrase |
| PP-DSBJ | derived subject particle phrase |
| PP-LGS | logical subject particle phrase |
| PP-LOC | locational particle phrase |
| PP-MSR | measure particle phrase |
| PP-OB1 | primary object particle phrase |
| PP-OB2 | secondary object particle phrase |
| PP-PRD | predicate particle phrase |
| PP-PRP | purpositive particle phrase |
| PP-SBJ | subject particle phrase |
| PP-SBJ2 | secondary subject particle phrase |
| PP-SCON | subordination particle phrase |
| PP-SCON-CND | conditional particle phrase |
| PP-TMP | temporal particle phrase |
| PP-TPC | topic particle phrase |
| PP-VOC | vocative particle phrase |
| PRN | parenthetical |
================================================
FILE: docs/annotations/constituency/ptb.md
================================================
# Penn Treebank
| Tag | Description |
|--------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| ADJP | Adjective Phrase. |
| ADVP | Adverb Phrase. |
| CONJP | Conjunction Phrase. |
| FRAG | Fragment. |
| INTJ | Interjection. Corresponds approximately to the part-of-speech tag UH. |
| LST | List marker. Includes surrounding punctuation. |
| NAC | Not a Constituent; used to show the scope of certain prenominal modifiers within an NP. |
| NP | Noun Phrase. |
| NX | Used within certain complex NPs to mark the head of the NP. Corresponds very roughly to N-bar level but used quite differently. |
| PP | Prepositional Phrase. |
| PRN | Parenthetical |
| PRT | Particle. Category for words that should be tagged RP. |
| QP | Quantifier Phrase (i.e. complex measure/amount phrase); used within NP. |
| ROOT | No description |
| RRC | Reduced Relative Clause. |
| S | Simple declarative clause, i.e. one that is not introduced by a (possibly empty) subordinating conjunction or a wh-word and that does not exhibit subject-verb inversion. |
| SBAR | Clause introduced by a (possibly empty) subordinating conjunction. |
| SBARQ | Direct question introduced by a wh-word or a wh-phrase. Indirect questions and relative clauses should be bracketed as SBAR, not SBARQ. |
| SINV | Inverted declarative sentence, i.e. one in which the subject follows the tensed verb or modal. |
| SQ | Inverted yes/no question, or main clause of a wh-question, following the wh-phrase in SBARQ. |
| UCP | Unlike Coordinated Phrase. |
| VP | Verb Phrase. |
| WHADJP | Wh-adjective Phrase. Adjectival phrase containing a wh-adverb, as in how hot. |
| WHADVP | Wh-adverb Phrase. Introduces a clause with an NP gap. May be null (containing the 0 complementizer) or lexical, containing a wh-adverb such as how or why. |
| WHNP | Wh-noun Phrase. Introduces a clause with an NP gap. May be null (containing the 0 complementizer) or lexical, containing some wh-word, e.g. who, which book, whose daughter, none of which, or how many leopards. |
| WHPP | Wh-prepositional Phrase. Prepositional phrase containing a wh-noun phrase (such as of which or by whose authority) that either introduces a PP gap or is contained by a WHNP. |
| X | Unknown, uncertain, or unbracketable. X is often used for bracketing typos and in bracketing the…the-constructions. |
================================================
FILE: docs/annotations/dep/index.md
================================================
# Dependency Parsing
## Chinese
```{toctree}
sd_zh
pmt
```
## English
```{toctree}
sd_en
```
## Multilingual
```{toctree}
ud
```
================================================
FILE: docs/annotations/dep/pmt.md
================================================
# PKU Multi-view Chinese Treebank
```{eval-rst}
See also :cite:`qiu-etal-2014-multi`.
```
| Tag | Description | 依存关系 |
| ---- | ------------------------------------------- | -------------- |
| ACT | action object | 行为宾语 |
| ADV | adverbial | 状语 |
| APP | appositive element | 同位 |
| ATT | attribute | 定语 |
| CMP | complement | 补语 |
| COO | other coordination element | 一般并列 |
| COS | share-right-child coordination element | 共享并列 |
| DE | de (modifier of 的(special function word)) | 的字 |
| DEI | dei (modifier of 得(special function word)) | 得字 |
| DI | di (modifier of 地(special function word)) | 地字 |
| FOC | focus | 强调 |
| HED | root of a sentence | 核心 |
| IC | independent clause | 小句 |
| IOB | indirect object | 间接宾语 |
| IS | independent structure | 独立结构 |
| ISC | non-shared independent structure | 并列式独立结构 |
| LAD | left additive | 前附加 |
| MT | modality and time | 时体 |
| NUM | number | 数字 |
| POB | prepositional object | 介宾 |
| PUN | punctuation | 标点 |
| PUS | cross-clause punctuation | 跨句标点 |
| QUC | post-positional quantity | 数量补语 |
| QUCC | non-shared post-positional quantity | 非共享数量补语 |
| QUN | quantity | 数量 |
| RAD | right additive | 后附加 |
| RADC | non-shared right additive | 非共享后附加 |
| RED | reduplicate element | 重叠 |
| SBV | subject | 主语 |
| TPC | topic | 话题 |
| VOB | direct object | 宾语 |
| VV | serial verb construction | 连动 |
================================================
FILE: docs/annotations/dep/sd_en.md
================================================
# Stanford Dependencies English
See also [Stanford typed dependencies manual](https://nlp.stanford.edu/software/dependencies_manual.pdf).
| Tag | Description |
|------------|-----------------------------------|
| abbrev | abbreviation modifier |
| acomp | adjectival complement |
| advcl | adverbial clause modifier |
| advmod | adverbial modifier |
| agent | agent |
| amod | adjectival modifier |
| appos | appositional modifier |
| arg | argument |
| attr | attributive |
| aux | auxiliary |
| auxpass | passive auxiliary |
| cc | coordination |
| ccomp | clausal complement |
| comp | complement |
| complm | complementizer |
| conj | conjunct |
| cop | copula |
| csubj | clausal subject |
| csubjpass | clausal passive subject |
| dep | dependent |
| det | determiner |
| discourse | discourse element |
| dobj | direct object |
| expl | expletive |
| goeswith | goes with |
| iobj | indirect object |
| mark | marker |
| mod | modifier |
| mwe | multi-word expression |
| neg | negation modifier |
| nn | noun compound modifier |
| npadvmod | noun phrase as adverbial modifier |
| nsubj | nominal subject |
| nsubjpass | passive nominal subject |
| num | numeric modifier |
| number | element of compound number |
| obj | object |
| parataxis | parataxis |
| pcomp | prepositional complement |
| pobj | object of a preposition |
| poss | possession modifier |
| possessive | possessive modifier |
| preconj | preconjunct |
| pred | predicate |
| predet | predeterminer |
| prep | prepositional modifier |
| prepc | prepositional clausal modifier |
| prt | phrasal verb particle |
| punct | punctuation |
| purpcl | purpose clause modifier |
| quantmod | quantifier phrase modifier |
| rcmod | relative clause modifier |
| ref | referent |
| rel | relative |
| root | root |
| sdep | semantic dependent |
| subj | subject |
| tmod | temporal modifier |
| vmod | verb modifier |
| xcomp | open clausal complement |
| xsubj | controlling subject |
================================================
FILE: docs/annotations/dep/sd_zh.md
================================================
# Stanford Dependencies Chinese
```{eval-rst}
See also :cite:`chang-etal-2009-discriminative`.
```
|Tag|Description|中文简称|例句|依存弧|
| ---- | ---- | ---- | ---- | ---- |
|nn|noun compound modifier|复合名词修饰|服务中心|nn(中心,服务)|
|punct|punctuation|标点符号|海关统计表明,|punct(表明,,)|
|nsubj|nominal subject|名词性主语|梅花盛开|nsubj (盛开,梅花)|
|conj|conjunct (links two conjuncts)|连接性状语|设备和原材料|conj(原材料,设备)|
|dobj|direct object|直接宾语|浦东颁布了七十一件文件|dobj(颁布,文件)|
|advmod|adverbial modifier|副词性状语|部门先送上文件|advmod(送上,先)|
|prep|prepositional modifier|介词性修饰语|在实践中逐步完善|prep(完善,在)|
|nummod|number modifier|数词修饰语|七十一件文件|nummod(件,七十一)|
|amod|adjectival modifier|形容词修饰语|跨世纪工程|amod(工程,跨世纪)|
|pobj|prepositional object|介词性宾语|根据有关规定|pobj (根据,规定)|
|rcmod|relative clause modifier|关系从句修饰语|不曾遇到过的情况|rcmod(情况,遇到)|
|cpm|complementizer|补语|开发浦东的经济活动|cpm(开发,的)|
|assm|associative marker|关联标记|企业的商品|assm(企业,的)|
|assmod|associative modifier|关联修饰|企业的商品|assmod(商品,企业)|
|cc|coordinating conjunction|并列关系|设备和原材料|cc(原材料,和)|
|clf|classifier modifier|类别修饰|七十一件文件|clf(文件,件)|
|ccomp|clausal complement|从句补充|银行决定先取得信用评级|ccomp(决定,取得)|
|det|determiner|限定语|这些经济活动|det(活动,这些)|
|lobj|localizer object|范围宾语|近年来|lobj(来,近年)|
|range|dative object that is a quantifier phrase|数量词间接宾语|成交药品一亿多元|range(成交,元)|
|asp|aspect marker|时态标记|发挥了作用|asp(发挥,了)|
|tmod|temporal modifier|时间修饰语|以前不曾遇到过|tmod(遇到,以前)|
|plmod|localizer modifier of a preposition|介词性地点修饰|在这片热土上|plmod(在,上)|
|attr|attributive|属性|贸易额为二百亿美元|attr(为,美元)|
|mmod|modal verb modifier|情态动词|利益能得到保障|mmod(得到,能)|
|loc|localizer|位置补语|占九成以上|loc(占,以上)|
|top|topic|主题|建筑是主要活动|top(是,建筑)|
|pccomp|clausal complement of a preposition|介词补语|据有关部门介绍|pccomp(据,介绍)|
|etc|etc modifier|省略关系|科技、文教等领域|etc(文教,等)|
|lccomp|clausal complement of a localizer|位置补语|中国对外开放中升起的明星|lccomp(中,开放)|
|ordmod|ordinal number modifier|量词修饰|第七个机构|ordmod(个,第七)|
|xsubj|controlling subject|控制主语|银行决定先取得信用评级|xsubj (取得,银行)|
|neg|negative modifier|否定修饰|以前不曾遇到过|neg(遇到,不)|
|rcomp|resultative complement|结果补语|研究成功|rcomp(研究,成功)|
|comod|coordinated verb compound modifier|并列联合动词|颁布实行|comod(颁布,实行)|
|vmod|verb modifier|动词修饰|其在支持外商企业方面的作用|vmod(方面,支持)|
|prtmod|particles such as 所,以,来,而|小品词|在产业化所取得的成就|prtmod(取得,所)|
|ba|“ba” construction|把字关系|把注意力转向市场|ba(转向,把)|
|dvpm|manner DE(地)modifier|地字修饰|有效地防止流失|dvpm(有效,地)|
|dvpmod|a "XP+DEV", phrase that modifies VP|地字动词短语|有效地防止流失|dvpmod(防止,有效)|
|prnmod|parenthetical modifier|插入词修饰|八五期间(1990-1995 )|prnmod(期间,1995)|
|cop|copula|系动词|原是自给自足的经济|cop(自给自足,是)|
|pass|passive marker|被动标记|被认定为高技术产业|pass(认定,被)|
|nsubjpass|nominal passive subject|被动名词主语|镍被称作现代工业的维生素|nsubjpass(称作,镍)|
|dep|dependent|其他依赖关系|新华社北京二月十二日电|dep(电,新华社)|
================================================
FILE: docs/annotations/dep/ud.md
================================================
# Universal Dependencies
## Cross-Linguistic
See also [Universal Dependencies](https://universaldependencies.org/docs/u/dep/index.html).
| Tag | Description |
|------------|----------------------------------------------|
| acl | clausal modifier of noun (adjectival clause) |
| advcl | adverbial clause modifier |
| advmod | adverbial modifier |
| amod | adjectival modifier |
| appos | appositional modifier |
| aux | auxiliary |
| auxpass | passive auxiliary |
| case | case marking |
| cc | coordinating conjunction |
| ccomp | clausal complement |
| compound | compound |
| conj | conjunct |
| cop | copula |
| csubj | clausal subject |
| csubjpass | clausal passive subject |
| dep | unspecified dependency |
| det | determiner |
| discourse | discourse element |
| dislocated | dislocated elements |
| dobj | direct object |
| expl | expletive |
| foreign | foreign words |
| goeswith | goes with |
| iobj | indirect object |
| list | list |
| mark | marker |
| mwe | multi-word expression |
| name | name |
| neg | negation modifier |
| nmod | nominal modifier |
| nsubj | nominal subject |
| nsubjpass | passive nominal subject |
| nummod | numeric modifier |
| parataxis | parataxis |
| punct | punctuation |
| remnant | remnant in ellipsis |
| reparandum | overridden disfluency |
| root | root |
| vocative | vocative |
| xcomp | open clausal complement |
## Localization
### Chinese
| Tag | 简称 |
| :--------------- | ---------: |
| acl | 形容词子句 |
| advcl:loc | 状语从句修饰语 |
| advmod | 状语 |
| advmod:dvp | 状语:地 |
| advmod:loc | 状语:限定 |
| advmod:rcomp | 状语:因果 |
| amod | 形容 |
| amod:ordmod | 形容:数量 |
| appos | 同位 |
| aux:asp | 助语:时态 |
| aux:ba | 助语:把 |
| aux:modal | 助语:情态 |
| aux:prtmod | 助语:分词 |
| auxpass | 被动 |
| case | 条件 |
| cc | 并列连词 |
| ccomp | 从句补语 |
| compound:nn | 复合名词 |
| compound:vc | 复合动词 |
| conj | 连接 |
| cop | 系动 |
| csubj | 从句主语 |
| dep | 未定义 |
| det | 限定 |
| discourse | 语气 |
| dobj | 直接宾语 |
| etc | 省略 |
| mark | 标记 |
| mark:clf | 标记:量词 |
| name | 名称 |
| neg | 否定 |
| nmod | 名词修饰 |
| nmod:assmod | 名词修饰:关联 |
| nmod:poss | 名词修饰:所有格 |
| nmod:prep | 名词修饰:介词 |
| nmod:range | 名词修饰:范围 |
| nmod:tmod | 名词修饰:时间 |
| nmod:topic | 名词修饰:主题 |
| nsubj | 名词主语 |
| nsubj:xsubj | 名词主语:补语 |
| nsubjpass | 被动态主语 |
| nummod | 数量 |
| parataxis:prnmod | 并列 |
| punct | 标点符号 |
| root | 根 |
| xcomp | 从句补语 |
================================================
FILE: docs/annotations/index.md
================================================
# Annotations
```{toctree}
tok/index
pos/index
ner/index
dep/index
sdp/index
srl/index
constituency/index
```
================================================
FILE: docs/annotations/ner/index.md
================================================
# Named Entity Recognition
## Chinese
```{toctree}
pku
msra
```
## Multilingual
```{toctree}
ontonotes
```
================================================
FILE: docs/annotations/ner/msra.md
================================================
# msra
| Category | Subcategory | Tag-set of Format-1 | Tag-set of Format-2 |
|----------|----------------|---------------------|---------------------|
| NAMEX | Person | P | PERSON |
| | Location | L | LOCATION |
| | Organization | O | ORGANIZATION |
| TIMEX | Date | dat | DATE |
| | Duration | dur | DURATION |
| | Time | tim | TIME |
| NUMEX | Percent | per | PERCENT |
| | Money | mon | MONEY |
| | Frequency | fre | FREQUENCY |
| | Integer | int | INTEGER |
| | Fraction | fra | FRACTION |
| | Decimal | dec | DECIMAL |
| | Ordinal | ord | ORDINAL |
| | Rate | rat | RATE |
| MEASUREX | Age | age | AGE |
| | Weight | wei | WEIGHT |
| | Length | len | LENGTH |
| | Temperature | tem | TEMPERATURE |
| | Angle | ang | ANGLE |
| | Area | are | AREA |
| | Capacity | cap | CAPACITY |
| | Speed | spe | SPEED |
| | Acceleration | acc | ACCELERATION |
| | Other measures | mea | MEASURE |
| ADDREX | Email | ema | EMAIL |
| | Phone | pho | PHONE |
| | Fax | fax | FAX |
| | Telex | tel | TELEX |
| | WWW | WWW | WWW |
| | Postalcode | pos | POSTALCODE |
================================================
FILE: docs/annotations/ner/ontonotes.md
================================================
# ontonotes
| TAG | Description |
|--------------|------------------------------------------------------|
| PERSON | People, including fictional |
| NORP | Nationalities or religious or political groups |
| FACILITY | Buildings, airports, highways, bridges, etc. |
| ORGANIZATION | Companies, agencies, institutions, etc. |
| GPE | Countries, cities, states |
| LOCATION | Non-GPE locations, mountain ranges, bodies of water |
| PRODUCT | Vehicles, weapons, foods, etc. (Not services) |
| EVENT | Named hurricanes, battles, wars, sports events, etc. |
| WORK OF ART | Titles of books, songs, etc. |
| LAW | Named documents made into laws |
| DATE | Absolute or relative dates or periods |
| TIME | Times smaller than a day |
| PERCENT | Percentage |
| MONEY | Monetary values, including unit |
| QUANTITY | Measurements, as of weight or distance |
| ORDINAL | “first”, “second” |
| CARDINAL | Numerals that do not fall under another type |
================================================
FILE: docs/annotations/ner/pku.md
================================================
# pku
| 序号 | 词性 | 名称 | 帮助记忆的诠释 | 例子及注解 |
| ---- | ---- | -------- | ------------------------------------------------------ | ------------------------------------------------------------ |
| 1 | nr | 人名 | 名词代码n和“人(ren)”的声母并在一起。 | 1. 汉族人及与汉族起名方式相同的非汉族人的姓和名单独切分,并分别标注为nr。张/nr 仁伟/nr, 欧阳/nr 修/nr, 阮/nr 志雄/nr, 朴/nr 贞爱/nr汉族人除有单姓和复姓外,还有双姓,即有的女子出嫁后,在原来的姓上加上丈夫的姓。如:陈方安生。这种情况切分、标注为:陈/nr 方/nr 安生/nr;唐姜氏,切分、标注为:唐/nr 姜氏/nr。2. 姓名后的职务、职称或称呼要分开。江/nr 主席/n, 小平/nr 同志/n, 江/nr 总书记/n,张/nr 教授/n, 王/nr 部长/n, 陈/nr 老总/n, 李/nr 大娘/n, 刘/nr 阿姨/n, 龙/nr 姑姑/n3. 对人的简称、尊称等若为两个字,则合为一个切分单位,并标以nr。老张/nr, 大李/nr, 小郝/nr, 郭老/nr, 陈总/nr4. 明显带排行的亲属称谓要切分开,分不清楚的则不切开。三/m 哥/n, 大婶/n, 大/a 女儿/n, 大哥/n, 小弟/n, 老爸/n5. 一些著名作者的或不易区分姓和名的笔名通常作为一个切分单位。鲁迅/nr, 茅盾/nr, 巴金/nr, 三毛/nr, 琼瑶/nr, 白桦/nr6. 外国人或少数民族的译名(包括日本人的姓名)不予切分,标注为nr。克林顿/nr, 叶利钦/nr, 才旦卓玛/nr, 小林多喜二/nr, 北研二/nr,华盛顿/nr, 爱因斯坦/nr有些西方人的姓名中有小圆点,也不分开。卡尔·马克思/nr |
| 2 | ns | 地名 | 名词代码n和处所词代码s并在一起。 | 安徽/ns,深圳/ns,杭州/ns,拉萨/ns,哈尔滨/ns, 呼和浩特/ns, 乌鲁木齐/ns,长江/ns,黄海/ns,太平洋/ns, 泰山/ns, 华山/ns,亚洲/ns, 海南岛/ns,太湖/ns,白洋淀/ns, 俄罗斯/ns,哈萨克斯坦/ns,彼得堡/ns, 伏尔加格勒/ns 1. 国名不论长短,作为一个切分单位。中国/ns, 中华人民共和国/ns, 日本国/ns, 美利坚合众国/ns, 美国/ns2. 地名后有“省”、“市”、“县”、“区”、“乡”、“镇”、“村”、“旗”、“州”、“都”、“府”、“道”等单字的行政区划名称时,不切分开,作为一个切分单位。四川省/ns, 天津市/ns,景德镇/ns沙市市/ns, 牡丹江市/ns,正定县/ns,海淀区/ns, 通州区/ns,东升乡/ns, 双桥镇/ns 南化村/ns,华盛顿州/ns,俄亥俄州/ns,东京都/ns, 大阪府/ns,北海道/ns, 长野县/ns,开封府/ns,宣城县/ns3. 地名后的行政区划有两个以上的汉字,则将地名同行政区划名称切开,不过要将地名同行政区划名称用方括号括起来,并标以短语NS。[芜湖/ns 专区/n] NS,[宣城/ns 地区/n]ns,[内蒙古/ns 自治区/n]NS,[深圳/ns 特区/n]NS, [厦门/ns 经济/n 特区/n]NS, [香港/ns 特别/a 行政区/n]NS,[香港/ns 特区/n]NS, [华盛顿/ns 特区/n]NS,4. 地名后有表示地形地貌的一个字的普通名词,如“江、河、山、洋、海、岛、峰、湖”等,不予切分。鸭绿江/ns,亚马逊河/ns, 喜马拉雅山/ns, 珠穆朗玛峰/ns,地中海/ns,大西洋/ns,洞庭湖/ns, 塞普路斯岛/ns 5. 地名后接的表示地形地貌的普通名词若有两个以上汉字,则应切开。然后将地名同该普通名词标成短语NS。[台湾/ns 海峡/n]NS,[华北/ns 平原/n]NS,[帕米尔/ns 高原/n]NS, [南沙/ns 群岛/n]NS,[京东/ns 大/a 峡谷/n]NS [横断/b 山脉/n]NS6.地名后有表示自然区划的一个字的普通名词,如“ 街,路,道,巷,里,町,庄,村,弄,堡”等,不予切分。 中关村/ns,长安街/ns,学院路/ns, 景德镇/ns, 吴家堡/ns, 庞各庄/ns, 三元里/ns,彼得堡/ns, 北菜市巷/ns, 7.地名后接的表示自然区划的普通名词若有两个以上汉字,则应切开。然后将地名同自然区划名词标成短语NS。[米市/ns 大街/n]NS, [蒋家/nz 胡同/n]NS , [陶然亭/ns 公园/n]NS , 8. 大小地名相连时的标注方式为:北京市/ns 海淀区/ns 海淀镇/ns [南/f 大街/n]NS [蒋家/nz 胡同/n]NS 24/m 号/q , |
| 3 | nt | 机构团体 | “团”的声母为t,名词代码n和t并在一起。 | (参见2。短语标记说明--NT)联合国/nt,中共中央/nt,国务院/nt, 北京大学/nt1.大多数团体、机构、组织的专有名称一般是短语型的,较长,且含有地名或人名等专名,再组合,标注为短语NT。[中国/ns 计算机/n 学会/n]NT, [香港/ns 钟表业/n 总会/n]NT, [烟台/ns 大学/n]NT, [香港/ns 理工大学/n]NT, [华东/ns 理工大学/n]NT,[合肥/ns 师范/n 学院/n]NT, [北京/ns 图书馆/n]NT, [富士通/nz 株式会社/n]NT, [香山/ns 植物园/n]NT, [安娜/nz 美容院/n]NT,[上海/ns 手表/n 厂/n]NT, [永和/nz 烧饼铺/n]NT,[北京/ns 国安/nz 队/n]NT,2. 对于在国际或中国范围内的知名的唯一的团体、机构、组织的名称即使前面没有专名,也标为nt或NT。联合国/nt,国务院/nt,外交部/nt, 财政部/nt,教育部/nt, 国防部/nt,[世界/n 贸易/n 组织/n]NT, [国家/n 教育/vn 委员会/n]NT,[信息/n 产业/n 部/n]NT,[全国/n 信息/n 技术/n 标准化/vn 委员会/n]NT,[全国/n 总/b 工会/n]NT,[全国/n 人民/n 代表/n 大会/n]NT,美国的“国务院”,其他国家的“外交部、财政部、教育部”,必须在其所属国的国名之后出现时,才联合标注为NT。[美国/ns 国务院/n]NT,[法国/ns 外交部/n]NT,[美/j 国会/n]NT,日本有些政府机构名称很特别,无论是否出现在“日本”国名之后都标为nt。[日本/ns 外务省/nt]NT,[日/j 通产省/nt]NT通产省/nt 3. 前后相连有上下位关系的团体机构组织名称的处理方式如下:[联合国/nt 教科文/j 组织/n]NT, [中国/ns 银行/n 北京/ns 分行/n]NT,[河北省/ns 正定县/ns 西平乐乡/ns 南化村/ns 党支部/n]NT, 当下位名称含有专名(如“北京/ns 分行/n”、“南化村/ns 党支部/n”、“昌平/ns 分校/n”)时,也可脱离前面的上位名称单独标注为NT。[中国/ns 银行/n]NT [北京/ns 分行/n]NT,北京大学/nt [昌平/ns 分校/n]NT,4. 团体、机构、组织名称中用圆括号加注简称时:[宝山/ns 钢铁/n (/w 宝钢/j )/w 总/b 公司/n]NT,[宝山/ns 钢铁/n 总/b 公司/n]NT,(/w 宝钢/j )/w |
================================================
FILE: docs/annotations/pos/863.md
================================================
# 863
| 词性 | 名称 | 说明 | 例子 |
| :-- | -----: | ---------------------------: | -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
| a | 形容词 | 取英语形容词adjective的第1个字母 | [重要/a 步伐/n]NP ,美丽/a ,看似/v 抽象/a , |
| c | 连词 | 取英语连词conjunction的第1个字母。 | 合作/vn 与/c 伙伴/n |
| d | 副词 | 取adverb的第2个字母,因其第1个字母已用于形容词。 | 进一步/d 发展/v , |
| e | 叹词 | 取英语叹词exclamation的第1个字母。 | 啊/e ,/w 那/r 金灿灿/z 的/u 麦穗/n , |
| f | 方位词 | 取汉字“方”。 | 军人/n 的/u 眼睛/n 里/f 不/d 是/v 没有/v 风景/n , |
| g | 语素字 | | |
| h | 前接成分 | 取英语head的第1个字母。 | 许多/m 非/h 主角/n 人物/n ,办事处/n 的/u “/w 准/h 政府/n ”/w 功能/n 不断/d 加强/v , |
| i | 成语 | 取英语成语idiom的第1个字母。 | 一言一行/i ,义无反顾/i , |
| j | 简称略语 | 取汉字“简”的声母。 | [德/j 外长/n]NP ,文教/j , |
| k | 后接成分 | 后接成分。 | 少年儿童/l 朋友/n 们/k ,身体/n 健康/a 者/k , |
| m | 数词 | 取英语numeral的第3个字母,n,u已有他用。 | 1.数量词组应切分为数词和量词。 三/m 个/q, 10/m 公斤/q, 一/m 盒/q 点心/n ,但少数数量词已是词典的登录单位,则不再切分。 一个/m , 一些/m ,2. 基数、序数、小数、分数、百分数一律不予切分,为一个切分单位,标注为 m 。一百二十三/m,20万/m, 123.54/m, 一个/m, 第一/m, 第三十五/m, 20%/m, 三分之二/m, 千分之三十/m, 几十/m 人/n, 十几万/m 元/q, 第一百零一/m 个/q ,3. 约数,前加副词、形容词或后加“来、多、左右”等助数词的应予分开。约/d 一百/m 多/m 万/m,仅/d 一百/m 个/q, 四十/m 来/m 个/q,二十/m 余/m 只/q, 十几/m 个/q,三十/m 左右/m ,两个数词相连的及“成百”、“上千”等则不予切分。五六/m 年/q, 七八/m 天/q,十七八/m 岁/q, 成百/m 学生/n,上千/m 人/n, 4.表序关系的“数+名”结构,应予切分。二/m 连/n , 三/m 部/n , |
| mq | 数量词 | | |
| n | 名词 | 取英语名词noun的第1个字母。 | (参见 动词--v)岗位/n , 城市/n , 机会/n ,她/r 是/v 责任/n 编辑/n , |
| nd | 方位名词 | 方位名词(nd),表示位置的相对方向 | 上 下 左 右 前 后 里 外 中 东 西 南 北前边 左面 里头 中间 外部 |
| nh | 人名 | 人名(nh),表示人的名称的专有名词 | 华罗庚 阿凡提 诸葛亮 司马相如 松赞干布 卡尔·马克思 |
| nhf | 姓 | | |
| nhs | 名 | | |
| ni | 机构名 | 机构名(ni),表示团体、组织、机构名称的专有名词 | 联合国 教育部 北京大学 中国科学院 |
| nl | 处所名词 | 处所名词(nl),表示处所 | 空中 高处 隔壁 门口 附近 边疆 一旁 野外 |
| ns | 地名 | 地名(ns),表示地理区域名称的专有名词 | 亚洲 大西洋 地中海 阿尔卑斯山 加拿大中国 北京 浙江 景德镇 呼和浩特 中关村 |
| nt | 时间名词 | 时间名词(nt),包括一般所说的时量词 | 年 月 日 分 秒现在 过去 昨天 去年 将来 宋朝 星期一 |
| nz | 其他专有名词 | 其他专有名词(nz) | 五粮液 宫爆鸡丁 桑塔纳 |
| o | 拟声词 | 取英语拟声词onomatopoeia的第1个字母。 | 哈哈/o 一/m 笑/v ,装载机/n 隆隆/o 推进/v , |
| p | 介词 | 取英语介词prepositional的第1个字母。 | 对/p 子孙后代/n 负责/v ,以/p 煤/n 养/v 农/Ng ,为/p 治理/v 荒山/n 服务/v , 把/p 青年/n 推/v 上/v 了/u 领导/vn 岗位/n , |
| q | 量词 | 取英语quantity的第1个字母。 | (参见数词m)首/m 批/q ,一/m 年/q , |
| r | 代词 | 取英语代词pronoun的第2个字母,因p已用于介词。 | 单音节代词“本”、“每”、“各”、“诸”后接单音节名词时,和后接的单音节名词合为代词;当后接双音节名词时,应予切分。本报/r, 每人/r, 本社/r, 本/r 地区/n, 各/r 部门/n |
| u | 助词 | 取英语助词auxiliary。 | [[俄罗斯/ns 和/c 北约/j]NP-BL 之间/f [战略/n 伙伴/n 关系/n]NP 的/u 建立/vn]NP 填平/v 了/u [[欧洲/ns 安全/a 政治/n]NP 的/u 鸿沟/n]NP |
| v | 动词 | 取英语动词verb的第一个字母。 | (参见 名词--n)[[[欧盟/j 扩大/v]S 的/u [历史性/n 决定/n]NP]NP 和/c [北约/j 开放/v]S]NP-BL [为/p [创建/v [一/m 种/q 新/a 的/u 欧洲/ns 安全/a 格局/n]NP]VP-SBI]PP-MD [奠定/v 了/u 基础/n]V-SBI ,, |
| vd | 趋向动词 | 趋向动词(vd),表示趋向 | (走)上 (趴)下 (进)来 (回)去(跑)上来 (掉)下去 (提)起来 (扔)过去 |
| vl | 联系动词 | 联系动词(vl),表示关系的判断 | 是 |
| vu | 能愿动词 | 能愿动词(vu),表示可能、意愿 | 能够 能 应该 可以 可能 情愿 愿意 要 |
| w | 标点符号 | | ”/w :/w |
| ws | 非汉字字符串 | 非汉字字符串(ws),如: | HanLP office windows |
| x | 非语素字 | 非语素字只是一个符号,字母x通常用于代表未知数、符号。 | |
================================================
FILE: docs/annotations/pos/ctb.md
================================================
# ctb
See also [The Part-Of-Speech Tagging Guidelines for the Penn Chinese Treebank (3.0)](https://repository.upenn.edu/cgi/viewcontent.cgi?article=1039&context=ircs_reports).
| Tag | Description | Chinese | Chinese Description | Examples |
|-----|------------------------------------------------------|---------|---------------------------------------------------------|------------------------|
| AD | adverb | 副词 | 副词 | 仍然、很、大大、约 |
| AS | aspect marker | 动态助词 | 助词 | 了、着、过 |
| BA | `bǎ` in ba-construction | 把字句 | 当“把”、“将”出现在结构“NP0 + BA + NP1+VP”时的词性 | 把、将 |
| CC | coordinating conjunction | 并列连接词 | 并列连词 | 与、和、或者、还是 |
| CD | cardinal number | 概数词 | 数词或表达数量的词 | 一百、好些、若干 |
| CS | subordinating conjunction | 从属连词 | 从属连词 | 如果、那么、就 |
| DEC | `de` as a complementizer or a nominalizer | 补语成分“的” | 当“的”或“之”作补语标记或名词化标记时的词性,其结构为:S/VP DEC {NP},如,喜欢旅游的大学生 | 的、之 |
| DEG | `de` as a genitive marker and an associative marker | 属格“的” | 当“的”或“之”作所有格时的词性,其结构为:NP/PP/JJ/DT DEG {NP}, 如,他的车、经济的发展 | 的、之 |
| DER | resultative `de`, `de` in V-de const and V-de-R | 表结果的“得” | 当“得”出现在结构“V-得-R”时的词性,如,他跑得很快 | 得 |
| DEV | manner `de`, `de` before VP | 表方式的“地” | 当“地”出现在结构“X-地-VP”时的词性,如,高兴地说 | 地 |
| DT | determiner | 限定词 | 代冠词,通常用来修饰名词 | 这、那、该、每、各 |
| ETC | for words like "etc." | 表示省略 | “等”、“等等”的词性 | 等、等等 |
| EM | emoji | 表情符 | 表情符、或称颜文字 | :) |
| FW | foreign words | 外来语 | 外来词 | 卡拉、A型 |
| IC | incomplete component | 不完整成分 | 不完整成分,尤指ASR导致的错误 | 好*xin*、那个*ba* |
| IJ | interjection | 句首感叹词 | 感叹词,通常出现在句子首部 | 啊 |
| JJ | other noun-modifier | 其他名词修饰语 | 形容词 | 共同、新 |
| LB | `bèi` in long bei-const | 长句式表被动 | 当“被”、“叫”、“给”出现在结构“NP0 + LB + NP1+ VP”结构时 的词性,如,他被我训了一顿 | 被、叫、给 |
| LC | localizer | 方位词 | 方位词以及表示范围的限定词 | 前、旁、到、在内、以来、为止 |
| M | measure word | 量词 | 量词 | 个、群、公里 |
| MSP | other particle | 其他小品词 | 其他虚词,包括“所”、“以”、“来”和“而”等出现在VP前的词 | 所、以、来、而 |
| NN | common noun | 其他名词 | 除专有名词和时间名词外的所有名词 | 桌子、生活、经济 |
| NOI | noise that characters are written in the wrong order | 噪声 | 汉字顺序颠倒产生的噪声 | 事/NOI 类/NOI 各/NOI 故/NOI |
| NR | proper noun | 专有名词 | 专有名词,通常表示地名、人名、机构名等 | 北京、乔丹、微软 |
| NT | temporal noun | 时间名词 | 表示时间概念的名词 | 一月、汉朝、当今 |
| OD | ordinal number | 序数词 | 序列词 | 第一百 |
| ON | onomatopoeia | 象声词 | 象声词 | 哗哗、呼、咯吱 |
| P | preposition e.g., "from" and "to" | 介词 | 介词 | 从、对、根据 |
| PN | pronoun | 代词 | 代词,通常用来指代名词 | 我、这些、其、自己 |
| PU | punctuation | 标点符号 | 标点符号 | ?、。、; |
| SB | `bèi` in short bei-const | 短句式表被动 | 当“被”、“给”出现在NP0 +SB+ VP结果时的词性,如,他被训了 一顿 | 被、叫 |
| SP | sentence final particle | 句末助词 | 经常出现在句尾的词 | 吧、呢、啊、啊 |
| URL | web address | 网址 | 网址 | www.hankcs.com |
| VA | predicative adjective | 表语形容词 | 可以接在“很”后面的形容词谓语 | 雪白、厉害 |
| VC | copula, be words | 系动词 | 系动词,表示“是”或“非”概念的动词 | 是、为、非 |
| VE | `yǒu` as the main verb | 动词有无 | 表示“有”或“无”概念的动词 | 有、没有、无 |
| VV | other verb | 其他动词 | 其他普通动词,包括情态词、控制动词、动作动词、心理动词等等 | 可能、要、走、喜欢 |
================================================
FILE: docs/annotations/pos/index.md
================================================
# Part-of-Speech Tagging
## Chinese
```{toctree}
ctb
pku
863
```
## Japanese
```{toctree}
npcmj
```
## Multilingual
```{toctree}
ud
```
================================================
FILE: docs/annotations/pos/npcmj.md
================================================
# NPCMJ
| Tag | Description |
|-----------|-----------------------------------|
| ADJI | イ-adjective |
| ADJI-MD | modal イ-adjective |
| ADJN | ナ-adjective |
| ADJN-MD | modal ナ-adjective |
| ADV | adverb |
| AX | auxiliary verb (including copula) |
| AXD | auxiliary verb, past tense |
| CL | classifier |
| CONJ | coordinating conjunction |
| D | determiner |
| FN | formal noun |
| FW | foreign word |
| INTJ | interjection |
| MD | modal element |
| N | noun |
| N-MENTION | mentioned expression |
| NEG | negation |
| NPR | proper noun |
| NUM | numeral |
| P-COMP | complementizer particle |
| P-CONN | conjunctional particle |
| P-FINAL | final particle |
| P-INTJ | interjectional particle |
| P-OPTR | toritate particle |
| P-ROLE | role particle |
| PASS | direct passive |
| PASS2 | indirect passive |
| PNL | prenominal |
| PRO | pronoun |
| PU | punctuation |
| PUL | left bracket |
| PUR | right bracket |
| Q | quantifier |
| QUOT | quote |
| SYM | symbol |
| VB | verb (or verb stem) |
| VB0 | light verb |
| VB2 | secondary verb |
| WADV | indeterminate adverb |
| WD | indeterminate determiner |
| WNUM | indeterminate numeral |
| WPRO | indeterminate pronoun |
================================================
FILE: docs/annotations/pos/pku.md
================================================
# pku
| 序号 | 词性 | 名称 | 帮助记忆的诠释 | 例子及注解 |
| ---- | ---- | -------- | ------------------------------------------------------ | ------------------------------------------------------------ |
| 1 | Ag | 形语素 | 形容词性语素。形容词代码为a,语素代码g前面置以A。 | 绿色/n 似/d 锦/Ag , |
| 2 | a | 形容词 | 取英语形容词adjective的第1个字母 | [重要/a 步伐/n]NP ,美丽/a ,看似/v 抽象/a , |
| 3 | ad | 副形词 | 直接作状语的形容词。形容词代码a和副词代码d并在一起。 | [积极/ad 谋求/v]V-ZZ ,幻象/n 易/ad 逝/Vg , |
| 4 | an | 名形词 | 具有名词功能的形容词。形容词代码a和名词代码n并在一起。 | [外交/n 和/c 安全/an]NP-BL , |
| 5 | Bg | 区别语素 | 区别词性语素。区别词代码为b,语素代码g前面置以B。 | 赤/Ag 橙/Bg 黄/a 绿/a 青/a 蓝/a 紫/a , |
| 6 | b | 区别词 | 取汉字“别”的声母。 | 女/b 司机/n, 金/b 手镯/n, 慢性/b 胃炎/n, 古/b 钱币/n, 副/b 主任/n, 总/b 公司/n 单音节区别词和单音节名词或名语素组合,作为一个词,并标以名词词性n。 |
| 7 | c | 连词 | 取英语连词conjunction的第1个字母。 | 合作/vn 与/c 伙伴/n |
| 8 | Dg | 副语素 | 副词性语素。副词代码为d,语素代码g前面置以D。 | 了解/v 甚/Dg 深/a ,煞/Dg 是/v 喜人/a , |
| 9 | d | 副词 | 取adverb的第2个字母,因其第1个字母已用于形容词。 | 进一步/d 发展/v , |
| 10 | e | 叹词 | 取英语叹词exclamation的第1个字母。 | 啊/e ,/w 那/r 金灿灿/z 的/u 麦穗/n , |
| 11 | f | 方位词 | 取汉字“方”。 | 军人/n 的/u 眼睛/n 里/f 不/d 是/v 没有/v 风景/n , |
| 12 | h | 前接成分 | 取英语head的第1个字母。 | 许多/m 非/h 主角/n 人物/n ,办事处/n 的/u “/w 准/h 政府/n ”/w 功能/n 不断/d 加强/v , |
| 13 | i | 成语 | 取英语成语idiom的第1个字母。 | 一言一行/i ,义无反顾/i , |
| 14 | j | 简称略语 | 取汉字“简”的声母。 | [德/j 外长/n]NP ,文教/j , |
| 15 | k | 后接成分 | 后接成分。 | 少年儿童/l 朋友/n 们/k ,身体/n 健康/a 者/k , |
| 16 | l | 习用语 | 习用语尚未成为成语,有点“临时性”,取“临”的声母。 | 少年儿童/l 朋友/n 们/k ,落到实处/l , |
| 17 | Mg | 数语素 | 数词性语素。数词代码为m,语素代码g前面置以M。 | 甲/Mg 减下/v 的/u 人/n 让/v 乙/Mg 背上/v ,凡/d “/w 寅/Mg 年/n ”/w 中/f 出生/v 的/u 人/n 生肖/n 都/d 属/v 虎/n , |
| 18 | m | 数词 | 取英语numeral的第3个字母,n,u已有他用。 | 1.数量词组应切分为数词和量词。 三/m 个/q, 10/m 公斤/q, 一/m 盒/q 点心/n ,但少数数量词已是词典的登录单位,则不再切分。 一个/m , 一些/m ,2. 基数、序数、小数、分数、百分数一律不予切分,为一个切分单位,标注为 m 。一百二十三/m,20万/m, 123.54/m, 一个/m, 第一/m, 第三十五/m, 20%/m, 三分之二/m, 千分之三十/m, 几十/m 人/n, 十几万/m 元/q, 第一百零一/m 个/q ,3. 约数,前加副词、形容词或后加“来、多、左右”等助数词的应予分开。约/d 一百/m 多/m 万/m,仅/d 一百/m 个/q, 四十/m 来/m 个/q,二十/m 余/m 只/q, 十几/m 个/q,三十/m 左右/m ,两个数词相连的及“成百”、“上千”等则不予切分。五六/m 年/q, 七八/m 天/q,十七八/m 岁/q, 成百/m 学生/n,上千/m 人/n, 4.表序关系的“数+名”结构,应予切分。二/m 连/n , 三/m 部/n , |
| 19 | Ng | 名语素 | 名词性语素。名词代码为n,语素代码g前面置以N。 | 出/v 过/u 两/m 天/q 差/Ng, 理/v 了/u 一/m 次/q 发/Ng, |
| 20 | n | 名词 | 取英语名词noun的第1个字母。 | (参见 动词--v)岗位/n , 城市/n , 机会/n ,她/r 是/v 责任/n 编辑/n , |
| 21 | nr | 人名 | 名词代码n和“人(ren)”的声母并在一起。 | 1. 汉族人及与汉族起名方式相同的非汉族人的姓和名单独切分,并分别标注为nr。张/nr 仁伟/nr, 欧阳/nr 修/nr, 阮/nr 志雄/nr, 朴/nr 贞爱/nr汉族人除有单姓和复姓外,还有双姓,即有的女子出嫁后,在原来的姓上加上丈夫的姓。如:陈方安生。这种情况切分、标注为:陈/nr 方/nr 安生/nr;唐姜氏,切分、标注为:唐/nr 姜氏/nr。2. 姓名后的职务、职称或称呼要分开。江/nr 主席/n, 小平/nr 同志/n, 江/nr 总书记/n,张/nr 教授/n, 王/nr 部长/n, 陈/nr 老总/n, 李/nr 大娘/n, 刘/nr 阿姨/n, 龙/nr 姑姑/n3. 对人的简称、尊称等若为两个字,则合为一个切分单位,并标以nr。老张/nr, 大李/nr, 小郝/nr, 郭老/nr, 陈总/nr4. 明显带排行的亲属称谓要切分开,分不清楚的则不切开。三/m 哥/n, 大婶/n, 大/a 女儿/n, 大哥/n, 小弟/n, 老爸/n5. 一些著名作者的或不易区分姓和名的笔名通常作为一个切分单位。鲁迅/nr, 茅盾/nr, 巴金/nr, 三毛/nr, 琼瑶/nr, 白桦/nr6. 外国人或少数民族的译名(包括日本人的姓名)不予切分,标注为nr。克林顿/nr, 叶利钦/nr, 才旦卓玛/nr, 小林多喜二/nr, 北研二/nr,华盛顿/nr, 爱因斯坦/nr有些西方人的姓名中有小圆点,也不分开。卡尔·马克思/nr |
| 22 | ns | 地名 | 名词代码n和处所词代码s并在一起。 | 安徽/ns,深圳/ns,杭州/ns,拉萨/ns,哈尔滨/ns, 呼和浩特/ns, 乌鲁木齐/ns,长江/ns,黄海/ns,太平洋/ns, 泰山/ns, 华山/ns,亚洲/ns, 海南岛/ns,太湖/ns,白洋淀/ns, 俄罗斯/ns,哈萨克斯坦/ns,彼得堡/ns, 伏尔加格勒/ns 1. 国名不论长短,作为一个切分单位。中国/ns, 中华人民共和国/ns, 日本国/ns, 美利坚合众国/ns, 美国/ns2. 地名后有“省”、“市”、“县”、“区”、“乡”、“镇”、“村”、“旗”、“州”、“都”、“府”、“道”等单字的行政区划名称时,不切分开,作为一个切分单位。四川省/ns, 天津市/ns,景德镇/ns沙市市/ns, 牡丹江市/ns,正定县/ns,海淀区/ns, 通州区/ns,东升乡/ns, 双桥镇/ns 南化村/ns,华盛顿州/ns,俄亥俄州/ns,东京都/ns, 大阪府/ns,北海道/ns, 长野县/ns,开封府/ns,宣城县/ns3. 地名后的行政区划有两个以上的汉字,则将地名同行政区划名称切开,不过要将地名同行政区划名称用方括号括起来,并标以短语NS。[芜湖/ns 专区/n] NS,[宣城/ns 地区/n]ns,[内蒙古/ns 自治区/n]NS,[深圳/ns 特区/n]NS, [厦门/ns 经济/n 特区/n]NS, [香港/ns 特别/a 行政区/n]NS,[香港/ns 特区/n]NS, [华盛顿/ns 特区/n]NS,4. 地名后有表示地形地貌的一个字的普通名词,如“江、河、山、洋、海、岛、峰、湖”等,不予切分。鸭绿江/ns,亚马逊河/ns, 喜马拉雅山/ns, 珠穆朗玛峰/ns,地中海/ns,大西洋/ns,洞庭湖/ns, 塞普路斯岛/ns 5. 地名后接的表示地形地貌的普通名词若有两个以上汉字,则应切开。然后将地名同该普通名词标成短语NS。[台湾/ns 海峡/n]NS,[华北/ns 平原/n]NS,[帕米尔/ns 高原/n]NS, [南沙/ns 群岛/n]NS,[京东/ns 大/a 峡谷/n]NS [横断/b 山脉/n]NS6.地名后有表示自然区划的一个字的普通名词,如“ 街,路,道,巷,里,町,庄,村,弄,堡”等,不予切分。 中关村/ns,长安街/ns,学院路/ns, 景德镇/ns, 吴家堡/ns, 庞各庄/ns, 三元里/ns,彼得堡/ns, 北菜市巷/ns, 7.地名后接的表示自然区划的普通名词若有两个以上汉字,则应切开。然后将地名同自然区划名词标成短语NS。[米市/ns 大街/n]NS, [蒋家/nz 胡同/n]NS , [陶然亭/ns 公园/n]NS , 8. 大小地名相连时的标注方式为:北京市/ns 海淀区/ns 海淀镇/ns [南/f 大街/n]NS [蒋家/nz 胡同/n]NS 24/m 号/q , |
| 23 | nt | 机构团体 | “团”的声母为t,名词代码n和t并在一起。 | (参见2。短语标记说明--NT)联合国/nt,中共中央/nt,国务院/nt, 北京大学/nt1.大多数团体、机构、组织的专有名称一般是短语型的,较长,且含有地名或人名等专名,再组合,标注为短语NT。[中国/ns 计算机/n 学会/n]NT, [香港/ns 钟表业/n 总会/n]NT, [烟台/ns 大学/n]NT, [香港/ns 理工大学/n]NT, [华东/ns 理工大学/n]NT,[合肥/ns 师范/n 学院/n]NT, [北京/ns 图书馆/n]NT, [富士通/nz 株式会社/n]NT, [香山/ns 植物园/n]NT, [安娜/nz 美容院/n]NT,[上海/ns 手表/n 厂/n]NT, [永和/nz 烧饼铺/n]NT,[北京/ns 国安/nz 队/n]NT,2. 对于在国际或中国范围内的知名的唯一的团体、机构、组织的名称即使前面没有专名,也标为nt或NT。联合国/nt,国务院/nt,外交部/nt, 财政部/nt,教育部/nt, 国防部/nt,[世界/n 贸易/n 组织/n]NT, [国家/n 教育/vn 委员会/n]NT,[信息/n 产业/n 部/n]NT,[全国/n 信息/n 技术/n 标准化/vn 委员会/n]NT,[全国/n 总/b 工会/n]NT,[全国/n 人民/n 代表/n 大会/n]NT,美国的“国务院”,其他国家的“外交部、财政部、教育部”,必须在其所属国的国名之后出现时,才联合标注为NT。[美国/ns 国务院/n]NT,[法国/ns 外交部/n]NT,[美/j 国会/n]NT,日本有些政府机构名称很特别,无论是否出现在“日本”国名之后都标为nt。[日本/ns 外务省/nt]NT,[日/j 通产省/nt]NT通产省/nt 3. 前后相连有上下位关系的团体机构组织名称的处理方式如下:[联合国/nt 教科文/j 组织/n]NT, [中国/ns 银行/n 北京/ns 分行/n]NT,[河北省/ns 正定县/ns 西平乐乡/ns 南化村/ns 党支部/n]NT, 当下位名称含有专名(如“北京/ns 分行/n”、“南化村/ns 党支部/n”、“昌平/ns 分校/n”)时,也可脱离前面的上位名称单独标注为NT。[中国/ns 银行/n]NT [北京/ns 分行/n]NT,北京大学/nt [昌平/ns 分校/n]NT,4. 团体、机构、组织名称中用圆括号加注简称时:[宝山/ns 钢铁/n (/w 宝钢/j )/w 总/b 公司/n]NT,[宝山/ns 钢铁/n 总/b 公司/n]NT,(/w 宝钢/j )/w |
| 24 | nx | 外文字符 | 外文字符。 | A/nx 公司/n ,B/nx 先生/n ,X/nx 君/Ng ,24/m K/nx 镀金/n ,C/nx 是/v 光速/n ,Windows98/nx ,PentiumIV/nx ,I LOVE THIS GAME/nx ,HanLP/nx |
| 25 | nz | 其他专名 | “专”的声母的第1个字母为z,名词代码n和z并在一起。 | (参见2。短语标记说明--NZ)除人名、国名、地名、团体、机构、组织以外的其他专有名词都标以nz。满族/nz,俄罗斯族/nz,汉语/nz,罗马利亚语/nz, 捷克语/nz,中文/nz, 英文/nz, 满人/nz, 哈萨克人/nz, 诺贝尔奖/nz, 茅盾奖/nz, 1.包含专有名称(或简称)的交通线,标以nz;短语型的,标为NZ。津浦路/nz, 石太线/nz, [京/j 九/j 铁路/n]NZ, [京/j 津/j 高速/b 公路/n]NZ, 2. 历史上重要事件、运动等专有名称一般是短语型的,按短语型专有名称处理,标以NZ。[卢沟桥/ns 事件/n]NZ, [西安/ns 事变/n]NZ,[五四/t 运动/n]NZ, [明治/nz 维新/n]NZ,[甲午/t 战争/n]NZ,3.专有名称后接多音节的名词,如“语言”、“文学”、“文化”、“方式”、“精神”等,失去专指性,则应分开。欧洲/ns 语言/n, 法国/ns 文学/n, 西方/ns 文化/n, 贝多芬/nr 交响乐/n, 雷锋/nr 精神/n, 美国/ns 方式/n,日本/ns 料理/n, 宋朝/t 古董/n 4. 商标(包括专名及后接的“牌”、“型”等)是专指的,标以nz,但其后所接的商品仍标以普通名词n。康师傅/nr 方便面/n, 中华牌/nz 香烟/n, 牡丹III型/nz 电视机/n, 联想/nz 电脑/n, 鳄鱼/nz 衬衣/n, 耐克/nz 鞋/n5. 以序号命名的名称一般不认为是专有名称。2/m 号/q 国道/n ,十一/m 届/q 三中全会/j如果前面有专名,合起来作为短语型专名。[中国/ns 101/m 国道/n]NZ, [中共/j 十一/m 届/q 三中全会/j]NZ,6. 书、报、杂志、文档、报告、协议、合同等的名称通常有书名号加以标识,不作为专有名词。由于这些名字往往较长,名字本身按常规处理。《/w 宁波/ns 日报/n 》/w ,《/w 鲁迅/nr 全集/n 》/w,中华/nz 读书/vn 报/n, 杜甫/nr 诗选/n,少数书名、报刊名等专有名称,则不切分。红楼梦/nz, 人民日报/nz,儒林外史/nz 7. 当有些专名无法分辨它们是人名还是地名或机构名时,暂标以nz。[巴黎/ns 贝尔希/nz 体育馆/n]NT,其中“贝尔希”只好暂标为nz。 |
| 26 | o | 拟声词 | 取英语拟声词onomatopoeia的第1个字母。 | 哈哈/o 一/m 笑/v ,装载机/n 隆隆/o 推进/v , |
| 27 | p | 介词 | 取英语介词prepositional的第1个字母。 | 对/p 子孙后代/n 负责/v ,以/p 煤/n 养/v 农/Ng ,为/p 治理/v 荒山/n 服务/v , 把/p 青年/n 推/v 上/v 了/u 领导/vn 岗位/n , |
| 28 | q | 量词 | 取英语quantity的第1个字母。 | (参见数词m)首/m 批/q ,一/m 年/q , |
| 29 | Rg | 代语素 | 代词性语素。代词代码为r,在语素的代码g前面置以R。 | 读者/n 就/d 是/v 这/r 两/m 棵/q 小树/n 扎根/v 于/p 斯/Rg 、/w 成长/v 于/p 斯/Rg 的/u 肥田/n 沃土/n , |
| 30 | r | 代词 | 取英语代词pronoun的第2个字母,因p已用于介词。 | 单音节代词“本”、“每”、“各”、“诸”后接单音节名词时,和后接的单音节名词合为代词;当后接双音节名词时,应予切分。本报/r, 每人/r, 本社/r, 本/r 地区/n, 各/r 部门/n |
| 31 | s | 处所词 | 取英语space的第1个字母。 | 家里/s 的/u 电脑/n 都/d 联通/v 了/u 国际/n 互联网/n ,西部/s 交通/n 咽喉/n , |
| 32 | Tg | 时语素 | 时间词性语素。时间词代码为t,在语素的代码g前面置以T。 | 3日/t 晚/Tg 在/p 总统府/n 发表/v 声明/n ,尊重/v 现/Tg 执政/vn 当局/n 的/u 权威/n , |
| 33 | t | 时间词 | 取英语time的第1个字母。 | 1. 年月日时分秒,按年、月、日、时、分、秒切分,标注为t 。1997年/t 3月/t 19日/t 下午/t 2时/t 18分/t若数字后无表示时间的“年、月、日、时、分、秒”等的标为数词m。1998/m 中文/n 信息/n 处理/vn 国际/n 会议/n 2. 历史朝代的名称虽然有专有名词的性质,仍标注为t。西周/t, 秦朝/t, 东汉/t, 南北朝/t, 清代/t“牛年、虎年”等一律不予切分,标注为:牛年/t, 虎年/t, 甲午年/t, 甲午/t 战争/n, 庚子/t 赔款/n, 戊戌/t 变法/n |
| 34 | u | 助词 | 取英语助词auxiliary。 | [[俄罗斯/ns 和/c 北约/j]NP-BL 之间/f [战略/n 伙伴/n 关系/n]NP 的/u 建立/vn]NP 填平/v 了/u [[欧洲/ns 安全/a 政治/n]NP 的/u 鸿沟/n]NP |
| 35 | Vg | 动语素 | 动词性语素。动词代码为v。在语素的代码g前面置以V。 | 洗/v 了/u 一个/m 舒舒服服/z 的/u 澡/Vg |
| 36 | v | 动词 | 取英语动词verb的第一个字母。 | (参见 名词--n)[[[欧盟/j 扩大/v]S 的/u [历史性/n 决定/n]NP]NP 和/c [北约/j 开放/v]S]NP-BL [为/p [创建/v [一/m 种/q 新/a 的/u 欧洲/ns 安全/a 格局/n]NP]VP-SBI]PP-MD [奠定/v 了/u 基础/n]V-SBI ,, |
| 37 | vd | 副动词 | 直接作状语的动词。动词和副词的代码并在一起。 | 形势/n 会/v 持续/vd 好转/v ,认为/v 是/v 电话局/n 收/v 错/vd 了/u 费/n , |
| 38 | vn | 名动词 | 指具有名词功能的动词。动词和名词的代码并在一起。 | 引起/v 人们/n 的/u 关注/vn 和/c 思考/vn ,收费/vn 电话/n 的/u 号码/n , |
| 39 | w | 标点符号 | | ”/w :/w |
| 40 | x | 非语素字 | 非语素字只是一个符号,字母x通常用于代表未知数、符号。 | |
| 41 | Yg | 语气语素 | 语气词性语素。语气词代码为y。在语素的代码g前面置以Y。 | 唯/d 大力/d 者/k 能/v 致/v 之/u 耳/Yg |
| 42 | y | 语气词 | 取汉字“语”的声母。 | 会/v 泄露/v 用户/n 隐私/n 吗/y ,又/d 何在/v 呢/y ? |
| 43 | z | 状态词 | 取汉字“状”的声母的前一个字母。 | 取得/v 扎扎实实/z 的/u 突破性/n 进展/vn ,四季/n 常青/z 的/u 热带/n 树木/n ,短短/z 几/m 年/q 间, |
================================================
FILE: docs/annotations/pos/ud.md
================================================
# Universal Dependencies
See also [Universal Dependencies](https://universaldependencies.org/u/pos/).
| Tag | Description |
|------------|----------------------------------------------|
| ADJ | adjective |
| ADP | adposition |
| ADV | adverb |
| AUX | auxiliary |
| CCONJ | coordinating conjunction |
| DET | determiner |
| INTJ | interjection |
| NOUN | noun |
| NUM | numeral |
| PART | particle |
| PRON | pronoun |
| PROPN | proper noun |
| PUNCT | punctuation |
| SCONJ | subordinating conjunction |
| SYM | symbol |
| VERB | verb |
| X | other |
================================================
FILE: docs/annotations/sdp/dm.md
================================================
# The reduction of Minimal Recursion Semantics
Please refer to [Minimal Recursion Semantics An Introduction](https://www.cl.cam.ac.uk/~aac10/papers/mrs.pdf).
================================================
FILE: docs/annotations/sdp/index.md
================================================
# Semantic Dependency Parsing
## Chinese
```{toctree}
semeval16
```
## English
```{toctree}
dm
pas
psd
```
================================================
FILE: docs/annotations/sdp/pas.md
================================================
# Predicate-Argument Structures
Please refer to [Probabilistic disambiguation models for wide-coverage HPSG parsing](https://www.aclweb.org/anthology/P05-1011.pdf).
================================================
FILE: docs/annotations/sdp/psd.md
================================================
# Prague Czech-English Dependency Treebank
Please refer to [Prague Czech-English Dependency Treebank](http://ufal.mff.cuni.cz/pcedt2.0/en/index.html).
================================================
FILE: docs/annotations/sdp/semeval16.md
================================================
# SemEval2016
## CSDP
SemEval2016 adopts the CSDP guidelines listed below.
### Semantic Relation Annotation Tag Set
| 分类 | 子类 | 粗粒度标签 | 细粒度标签 |
| ------------ | ------------ | --------------- | ------------------------------------------------------------ |
| 语义周边角色 | 主体角色 | 施事AGT; | 施事Agt;感事Aft |
| | | 当事EXP; | 当事Exp;领事Poss |
| | 客体角色 | 受事PAT; | 受事Pat |
| | | 客事CONT; | 客事Cont;成事Prod;结局Cons |
| | | 涉事DATV; | 涉事Datv;比较Comp;源事Orig |
| | | 系事LINK; | 类事Clas;属事Belg |
| | 情境角色 | 工具TOOL; | 工具Tool |
| | | 材料MATL; | 材料Matl |
| | | 方式MANN; | 方式Mann;依据Accd |
| | | 范围SCO; | 范围Sco |
| | | 缘由REAS; | 缘故Reas;意图Int |
| | | 时间TIME; | 时间Time;时间起点Tini;时间终点Tfin;时段Tdur;时距Trang |
| | | 空间LOC; | 空间Loc;原处所Lini;终处所Lfin;通过处所Lthru;趋向Dir |
| | | 度量MEAS; | 数量Quan;起始量Nini;终止量Nfin;数量短语Qp;频率Freq;顺序Seq;变化量Nvar |
| | | 状态STAT; | 状态Stat;起始状态Sini;终止状态Sfin;历经状态Sproc |
| | | 修饰FEAT; | 描写Desc;宿主Host;名词修饰语Nmod;时间修饰语Tmod |
| 语义结构关系 | 反关系 | 反施事rAGT; | 反施事rAgt;反感事rAft |
| | | 反当事rEXP。 | 反当事rExp;反领事rPoss |
| | | 反受事rPAT; | 反受事rPat |
| | | 反客事rCONT; | 反客事rCont;反成事rProd;反结局rCons |
| | | 反涉事rDATV; | 反涉事rDatv;反比较rComp;反源事rOrig |
| | | 反系事rLINK。 | 反类事rClas;反属事rBelg |
| | | 反工具rTOOL; | 反工具rTool |
| | | 反材料rMATL; | 反材料rMatl |
| | | 反方式rMANN; | 反方式rMann;反依据rAccd |
| | | 反范围rSCO; | 反范围rSco |
| | | 反缘由rREAS; | 反缘故rReas;反意图rInt |
| | | 反时间rTIME; | 反时间rTime;反时间起点rTini;反时间终点rTfin;反时段rTdur;反时距rTrang |
| | | 反空间rLOC; | 反空间rLoc;反原处所rLini;反终处所rLfin;反通过处所rLthru;反趋向rDir |
| | | 反度量rMEAS; | 反数量rQuan;反起始量rNini;反终止量rNfin;反数量短语rQp;反频率rFreq;反顺序rSeq;反变化量rNvar |
| | | 反状态rSTAT; | 反状态rStat;反起始状态rSini;反终止状态rSfin;反历经状态rSproc |
| | | 反修饰rFEAT; | 反描写rDesc;反宿主rHost; 反名词修饰语rNmod; 反时间修饰语rTmod |
| | 嵌套事件关系 | 嵌套施事dAGT; | 嵌套施事dAgt;嵌套感事dAft |
| | | 嵌套当事dEXP。 | 嵌套当事dExp;嵌套领事dPoss |
| | | 嵌套受事dPAT; | 嵌套受事dPat |
| | | 嵌套客事dCONT; | 嵌套客事dCont;嵌套成事dProd;嵌套结局dCons |
| | | 嵌套涉事dDATV; | 嵌套涉事dDatv;嵌套比较dComp;嵌套源事dOrig |
| | | 嵌套系事dLINK。 | 嵌套类事dClas;嵌套属事dBelg |
| | | 嵌套工具dTOOL; | 嵌套工具dTool |
| | | 嵌套材料dMATL; | 嵌套材料dMatl |
| | | 嵌套方式dMANN; | 嵌套方式dMann;嵌套依据dAccd |
| | | 嵌套范围dSCO; | 嵌套范围dSco |
| | | 嵌套缘由dREAS; | 嵌套缘故dReas;嵌套意图dInt |
| | | 嵌套时间dTIME; | 嵌套时间dTime;嵌套时间起点dTini;嵌套时间终点dTfin;嵌套时段dTdur;嵌套时距dTrang |
| | | 嵌套空间dLOC; | 嵌套空间dLoc;嵌套原处所dLini;嵌套终处所dLfin;嵌套通过处所dLthru;嵌套趋向dDir |
| | | 嵌套度量dMEAS; | 嵌套数量dQuan;嵌套起始量dNini;嵌套终止量dNfin;嵌套数量短语dQp;嵌套频率dFreq;嵌套顺序dSeq;嵌套变化量dNvar |
| | | 嵌套状态dSTAT; | 嵌套状态dStat;嵌套起始状态dSini;嵌套终止状态dSfin;嵌套历经状态dSproc |
| | | 嵌套修饰dFEAT; | 嵌套描写dDesc;嵌套宿主dHost; 嵌套名词修饰语dNmod; 嵌套时间修饰语dTmod |
| | 事件关系 | 并列关系eCOO; | 并列eCoo;等同eEqu;分叙eRect;选择eSelt;割舍eAban;选取ePref;总括eSum |
| | | 先行关系ePREC; | 先行ePrec;原因eCau;条件eCond;假设eSupp;手段eMetd;让步eConc |
| | | 后继关系eSUCC; | 后继eSucc;递进eProg;转折 eAdvt;目的ePurp;结果eResu;推论eInf |
| 语义依附标记 | 标点标记 | 标点标记mPUNC; | 标点标记mPunc |
| | 依附标记 | 否定标记mNEG; | 否定标记mNeg |
| | | 关系标记mRELA; | 连词标记mConj;介词标记mPrep |
| | | 依附标记mDEPD; | 语气标记mTone;时间标记mTime;范围标记mRang;情态标记mMod;频率标记mFreq;程度标记mDegr;趋向标记mDir;的字标记mAux;多数标记mMaj;插入语标记mPars;离合标记mSepa;实词虚化标记mVain;重复标记mRept |
## SemEval2016
The following table is a subset of CSDP but offers some examples to illustrate the idea.
| 关系类型 | Tag | Description | Example |
|--------|---------------|--------------------|-----------------------------|
| 施事关系 | Agt | Agent | 我送她一束花 (我 <– 送) |
| 当事关系 | Exp | Experiencer | 我跑得快 (跑 –> 我) |
| 感事关系 | Aft | Affection | 我思念家乡 (思念 –> 我) |
| 领事关系 | Poss | Possessor | 他有一本好书 (他 <– 有) |
| 受事关系 | Pat | Patient | 他打了小明 (打 –> 小明) |
| 客事关系 | Cont | Content | 他听到鞭炮声 (听 –> 鞭炮声) |
| 成事关系 | Prod | Product | 他写了本小说 (写 –> 小说) |
| 源事关系 | Orig | Origin | 我军缴获敌人四辆坦克 (缴获 –> 坦克) |
| 涉事关系 | Datv | Dative | 他告诉我个秘密 ( 告诉 –> 我 ) |
| 比较角色 | Comp | Comitative | 他成绩比我好 (他 –> 我) |
| 属事角色 | Belg | Belongings | 老赵有俩女儿 (老赵 <– 有) |
| 类事角色 | Clas | Classification | 他是中学生 (是 –> 中学生) |
| 依据角色 | Accd | According | 本庭依法宣判 (依法 <– 宣判) |
| 缘故角色 | Reas | Reason | 他在愁女儿婚事 (愁 –> 婚事) |
| 意图角色 | Int | Intention | 为了金牌他拼命努力 (金牌 <– 努力) |
| 结局角色 | Cons | Consequence | 他跑了满头大汗 (跑 –> 满头大汗) |
| 方式角色 | Mann | Manner | 球慢慢滚进空门 (慢慢 <– 滚) |
| 工具角色 | Tool | Tool | 她用砂锅熬粥 (砂锅 <– 熬粥) |
| 材料角色 | Matl | Material | 她用小米熬粥 (小米 <– 熬粥) |
| 时间角色 | Time | Time | 唐朝有个李白 (唐朝 <– 有) |
| 空间角色 | Loc | Location | 这房子朝南 (朝 –> 南) |
| 历程角色 | Proc | Process | 火车正在过长江大桥 (过 –> 大桥) |
| 趋向角色 | Dir | Direction | 部队奔向南方 (奔 –> 南) |
| 范围角色 | Sco | Scope | 产品应该比质量 (比 –> 质量) |
| 数量角色 | Quan | Quantity | 一年有365天 (有 –> 天) |
| 数量数组 | Qp | Quantity-phrase | 三本书 (三 –> 本) |
| 频率角色 | Freq | Frequency | 他每天看书 (每天 <– 看) |
| 顺序角色 | Seq | Sequence | 他跑第一 (跑 –> 第一) |
| 描写角色 | Desc(Feat) | Description | 他长得胖 (长 –> 胖) |
| 宿主角色 | Host | Host | 住房面积 (住房 <– 面积) |
| 名字修饰角色 | Nmod | Name-modifier | 果戈里大街 (果戈里 <– 大街) |
| 时间修饰角色 | Tmod | Time-modifier | 星期一上午 (星期一 <– 上午) |
| 反角色 | r + main role | | 打篮球的小姑娘 (打篮球 <– 姑娘) |
| 嵌套角色 | d + main role | | 爷爷看见孙子在跑 (看见 –> 跑) |
| 并列关系 | eCoo | event Coordination | 我喜欢唱歌和跳舞 (唱歌 –> 跳舞) |
| 选择关系 | eSelt | event Selection | 您是喝茶还是喝咖啡 (茶 –> 咖啡) |
| 等同关系 | eEqu | event Equivalent | 他们三个人一起走 (他们 –> 三个人) |
| 先行关系 | ePrec | event Precedent | 首先,先 |
| 顺承关系 | eSucc | event Successor | 随后,然后 |
| 递进关系 | eProg | event Progression | 况且,并且 |
| 转折关系 | eAdvt | event adversative | 却,然而 |
| 原因关系 | eCau | event Cause | 因为,既然 |
| 结果关系 | eResu | event Result | 因此,以致 |
| 推论关系 | eInf | event Inference | 才,则 |
| 条件关系 | eCond | event Condition | 只要,除非 |
| 假设关系 | eSupp | event Supposition | 如果,要是 |
| 让步关系 | eConc | event Concession | 纵使,哪怕 |
| 手段关系 | eMetd | event Method | |
| 目的关系 | ePurp | event Purpose | 为了,以便 |
| 割舍关系 | eAban | event Abandonment | 与其,也不 |
| 选取关系 | ePref | event Preference | 不如,宁愿 |
| 总括关系 | eSum | event Summary | 总而言之 |
| 分叙关系 | eRect | event Recount | 例如,比方说 |
| 连词标记 | mConj | Conjunction | 和,或 |
| 的字标记 | mAux | Auxiliary | 的,地,得 |
| 介词标记 | mPrep | Preposition | 把,被 |
| 语气标记 | mTone | Tone | 吗,呢 |
| 时间标记 | mTime | Time | 才,曾经 |
| 范围标记 | mRang | Range | 都,到处 |
| 程度标记 | mDegr | Degree | 很,稍微 |
| 频率标记 | mFreq | Frequency Marker | 再,常常 |
| 趋向标记 | mDir | Direction Marker | 上去,下来 |
| 插入语标记 | mPars | Parenthesis Marker | 总的来说,众所周知 |
| 否定标记 | mNeg | Negation Marker | 不,没,未 |
| 情态标记 | mMod | Modal Marker | 幸亏,会,能 |
| 标点标记 | mPunc | Punctuation Marker | ,。! |
| 重复标记 | mRept | Repetition Marker | 走啊走 (走 –> 走) |
| 多数标记 | mMaj | Majority Marker | 们,等 |
| 实词虚化标记 | mVain | Vain Marker | |
| 离合标记 | mSepa | Separation Marker | 吃了个饭 (吃 –> 饭) 洗了个澡 (洗 –> 澡) |
| 根节点 | Root | Root | 全句核心节点 |
See also [SemEval-2016 Task 9](https://www.hankcs.com/nlp/sdp-corpus.html) and [CSDP](https://csdp-doc.readthedocs.io/zh_CN/latest/%E9%99%84%E5%BD%95/).
================================================
FILE: docs/annotations/srl/cpb.md
================================================
# Chinese Proposition Bank
| | 标签 | 角色 | 例子 |
|------|----------|-------|-------------------------|
| 中心角色 | ARG0 | 施事者 | (ARG0中国政府)提供援助 |
| | ARG1 | 受事者 | 中国政府提供(ARG1援助) |
| | ARG2 | 依谓词而定 | 失业率控制(ARG2在百分之十内) |
| | ARG3 | 依谓词而定 | (ARG3从城市)扩大到农村 |
| | ARG4 | 依谓词而定 | 提高(ARG4百分之二十) |
| 附属角色 | ARGM-ADV | 状语 | (ARGM-ADV共同)承担 |
| | ARGM-BNF | 受益者 | (ARGM-BNF为其他国家)进行融资 |
| | ARGM-CND | 条件 | (ARGM-CND如果成功),他就留下 |
| | ARGM-DIR | 方向 | (ARGM-DIR向和平)迈出一大步 |
| | ARGM-EXT | 范围 | 在北京逗留(ARGM-EXT两天) |
| | ARGM-FRQ | 频率 | 每半年执行(ARGM-FRQ一次) |
| | ARGM-LOC | 地点、位置 | (ARGM-LOC在机场)被捕获 |
| | ARGM-MNR | 方式 | (ARGM-MNR以中英文)发行 |
| | ARGM-PRP | 目的或原因 | (ARGM-PRP由于危机)而破产 |
| | ARGM-TMP | 时间 | 公司(ARGM-TMP去年)成立 |
| | ARGM-TPC | 主题 | (ARGM-TPC稳定政策),核心是... |
| | ARGM-DIS | 话语标记 | (ARGM-DIS因此),他感到不公 |
| | ARGM-CRD | 并列论元 | (ARGM-CRD与台湾)非正式接触 |
| | ARGM-PRD | 次谓词 | 指控廉政公署五人(ARGM-PRD接受贿赂) |
```{note}
Although ARG0 and ARG1 share general definitions across all predicates, word sense disambiguation is required to find
the corresponding definition of semantic roles. Given the word sense of `变化`, say `变化-2`,
[its second frameset](http://verbs.colorado.edu/chinese/cpb/html_frames/0183-bian-hua.html) can
be found, which defines the following 2 arguments:
1. ARG0: agent/cause
2. ARG1: entity arg0 changes
These definitions are different from those of frameset `变化-1`:
1. ARG0: entity undergoing change
Sometimes, the number of arguments and their definitions can vary a lot across framesets.
In summary, word sense disambiguation is essential if SRL is to be used to best effect in practical applications.
```
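To make the frameset dependence concrete, here is a minimal Python sketch; the nested-dict layout and the `describe_role` helper are inventions of this example (CPB itself distributes framesets as HTML frame files, as linked above), with the role definitions copied from the note:
```python
# Illustrative frameset inventory for the predicate 变化, copied from the note above.
# The dictionary layout is an assumption of this sketch, not a CPB file format.
FRAMESETS = {
    "变化-1": {"ARG0": "entity undergoing change"},
    "变化-2": {"ARG0": "agent/cause", "ARG1": "entity arg0 changes"},
}

def describe_role(frameset: str, role: str) -> str:
    """Resolve a numbered argument to its frameset-specific definition."""
    return FRAMESETS.get(frameset, {}).get(role, "unknown role for this frameset")

# The same label means different things under different word senses:
print(describe_role("变化-1", "ARG0"))  # entity undergoing change
print(describe_role("变化-2", "ARG0"))  # agent/cause
```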
================================================
FILE: docs/annotations/srl/index.md
================================================
# Semantic Role Labeling
## Chinese
```{toctree}
cpb
```
## English
```{toctree}
propbank
```
================================================
FILE: docs/annotations/srl/propbank.md
================================================
# English PropBank
| Role | Description |
|------|----------------------------------------|
| ARG0 | agent |
| ARG1 | patient |
| ARG2 | instrument, benefactive, attribute |
| ARG3 | starting point, benefactive, attribute |
| ARG4 | ending point |
| ARGM | modifier |
| COM | Comitative |
| LOC | Locative |
| DIR | Directional |
| GOL | Goal |
| MNR | Manner |
| TMP | Temporal |
| EXT | Extent |
| REC | Reciprocals |
| PRD | Secondary Predication |
| PRP | Purpose |
| CAU | Cause |
| DIS | Discourse |
| ADV | Adverbials |
| ADJ | Adjectival |
| MOD | Modal |
| NEG | Negation |
| DSP | Direct Speech |
| LVB | Light Verb |
| CXN | Construction |
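In actual annotations the modifier labels above attach to the `ARGM` prefix (e.g. `ARGM-TMP` for a temporal modifier). Below is a minimal, hypothetical sketch of one labeled predicate-argument structure; the sentence, the span layout, and the variable names are all inventions of this example:
```python
# Hypothetical PropBank-style labeling of one predicate and its arguments.
# Spans are (start, end) token offsets; this layout is illustrative only.
tokens = "The company opened a store in Boston last year .".split()
predicate_index = 2  # "opened"
arguments = [
    ((0, 2), "ARG0"),      # "The company" -- agent
    ((3, 5), "ARG1"),      # "a store"     -- patient
    ((5, 7), "ARGM-LOC"),  # "in Boston"   -- locative modifier
    ((7, 9), "ARGM-TMP"),  # "last year"   -- temporal modifier
]

for (start, end), label in arguments:
    print(f"{label}: {' '.join(tokens[start:end])}")
```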
================================================
FILE: docs/annotations/tok/ctb.md
================================================
The Segmentation Guidelines for the Penn Chinese Treebank (3.0)
===============================================================
Fei Xia
*University of Pennsylvania*
This is an OCR version. See also the [PDF version](https://repository.upenn.edu/cgi/viewcontent.cgi?article=1038&context=ircs_reports).
## Abstract
This document describes the segmentation guidelines for the Penn Chinese Treebank Project. The goal of the project is the creation of a 100-thousand-word corpus of Mandarin Chinese text with syntactic bracketing. The Chinese Treebank has been released via the Linguistic Data Consortium (LDC) and is available to the public.
The segmentation guidelines have been revised several times during the two-year period of the project. The previous two versions were completed in December 1998 and March 1999, respectively. This document is the third and final version. We have added an introduction chapter in order to explain some rationale behind certain decisions in the guidelines. We also include the English gloss to the Chinese words in the guidelines.
In this document, we first discuss the notion of word and tests for wordhood that have been proposed in the literature. Then we give the specification for word segmentation. The specification is organized according to the potential Part-of-Speech tag of an expression and the internal structure of the expression. Next, we specify the treatment for some common collocations. Finally, we compare our guidelines with two segmentation standards: the first (Liu et al., 1993) is used in Mainland China and the second (CKIP, 1996) is used in Academia Sinica in Taiwan.
## Chapter 1 Introduction
This document is designed for the Penn Chinese Treebank Project [XPX+00]. The goal of the project is the creation of a 100-thousand-word corpus of Mandarin Chinese text with syntactic bracketing. The annotation consists of two stages: the first stage is word segmentation and part-of-speech (POS) tagging and the second stage is syntactic bracketing. Each stage includes at least two passes, that is, the data are annotated by one annotator, then the resulting files are checked by another annotator.
The segmentation guidelines, like the POS guidelines and bracketing guidelines, have been revised several times during the project. So far, we have released all three versions on our web site: the first draft was completed in December 1998, after the first pass of word segmentation and POS tagging; the second draft in March 1999, after the second pass of word segmentation and POS tagging. This document, which is the third draft, is revised after the second pass of bracketing. The major changes in the third draft, compared with the previous two drafts, are (1) we add an introduction chapter in order to explain some rationale behind the guidelines, (2) we add the gloss to the Chinese words in the guidelines, and (3) we also turn the guidelines into a technical report, which is published by the Institute for Research in Cognitive Science (IRCS) of the University of Pennsylvania.
### 1.1 Notion of *word*
The difficulty in defining the notion of word is not unique to Chinese, but the problem is certainly more severe for Chinese for a number of reasons. First, Chinese is not written with word delimiters so segmenting a sentence into "words" is not a natural task even for a native speaker. Second, Chinese has little inflectional morphology to ease word identification. Third, there is little consensus in the community on difficult constructions that could affect word segmentation. For instance, the segmentation of verb resultative compounds depends on the syntactic analysis of the construction. One view on how a verb resultative compound is formed says that a simple sentence with a compound is actually bi-clausal and the compound is formed by movement; therefore, the compound should be treated as two words. Another view believes that the compound is formed in the lexicon, and therefore should be one word. The segmentation of the verb resultative compounds depends on which view we adopt for this construction. Fourth, many monosyllabic morphemes that used to be able to stand alone in non-Modern Chinese become bound in Modern Chinese. The influence of non-Modern Chinese makes it difficult to draw the line between bound morphemes and free morphemes, the notions which could otherwise have been very useful for deciding word boundaries.
Our approach is based on both linguistic and engineering considerations. The notion of word in our Treebank is roughly a syntactic atom as defined in [SW87], that is, anything that can be inserted into an X° position in syntax. This includes both compounds and simple words.
### 1.2 Tests of wordhood
What tests can be used to decide whether a string of hanzi [Chinese characters] is a word or not? Without loss of generality, we assume the string that we are trying to segment is X-Y, which has two morphemes X and Y. The following tests for establishing word boundaries have been proposed by various authors:
- Bound morpheme: a bound morpheme should be attached to its neighboring morpheme to form a word when possible.
- Productivity: if a rule that combines the expression X-Y does not apply generally (i.e., it is not productive), then X-Y is likely to be a word.
- Frequency of co-occurrence: if the expression X-Y occurs very often, it is likely to be a word.
- Complex internal structure: strings with complex internal structures should be segmented when possible.
- Compositionality: if the meaning of X-Y is not compositional, it is likely to be a word.
- Insertion: if another morpheme can be inserted between X and Y, then X-Y is unlikely to be a word.
- XP-substitution: if a morpheme can not be replaced by a phrase of the same type, then it is likely to be part of a word.
- The number of syllables: several guidelines [LTS93, Chi96] have used syllable numbers on certain cases. For example, in [LTS93], a verb resultative compound is treated as one word if the resultative part is monosyllabic, and it is treated as two words if the resultative part has more than one syllable.
All of these tests are very useful. However, none of them is sufficient by itself for covering the entire range of difficult cases. Either the test is applicable only to limited cases (e.g., the XP-substitution test) or there is no objective way to perform the test as the test refers to vaguely defined properties (e.g., in the productivity test, it is not clear where to draw the line between a productive rule and a non-productive rule). For more discussion on this topic from the linguistics point of view, please refer to [Pac98, SW87].
Since no single test is sufficient, we chose a set of tests for our segmentation guidelines which includes all of the ones mentioned except for the productivity test and the frequency test. Rather than have the annotators try to memorize the entire set and make each decision from these principles, in the guidelines we spell out what the results of applying the tests would be for all of the relevant phenomena. For example, for the treatment of verb resultative compounds, we select the relevant tests (e.g., the number of syllables and the insertion test), and give several examples of the results of applying these tests to verb resultative compounds. This makes it straightforward, and thus efficient, for the annotators to follow the guidelines.
### 1.3 Compatibility with other guidelines
We have studied other groups' guidelines, such as the Segmentation Standard in China [LTS93] and the one in Taiwan [Chi96], and tried to accommodate them in our guidelines if possible.
Since the final result of the Treebank is a list of bracketed sentences, our guidelines have some flexibility with regard to the segmentation of certain constructions. For example, the string 走上来[walk up] is treated as two segments in [LTS93], but one segment in [Chi96]. In our Treebank, we will segment it into two parts, and then group them together as a compound, that is, (走[walk]/V 上来[up]/V)/V. We call 走上来 a word with internal structures. Our annotation, in this case, is compatible with both [LTS93] and [Chi96]. The comparisons of these three guidelines can be found in Appendix A.
Note: For the sake of annotation efficiency, the grouping of the words with internal structure is done at the bracketing stage, rather than at the segmentation stage. In this document, we show the grouping format, but keep in mind that the format is the one AFTER the bracketing is completed. For example, we consider 走上来[walk up] as one word. It is segmented into “走[walk]/V 上来[up]/V” at the segmentation stage, and it will be grouped into (走[walk]/V 上来[up]/V)/V at the bracketing stage. In this paper, we just say 走上来[walk up] should be annotated as (走[walk]/V 上来[up]/V)/V.
Most disagreements among these three guidelines do not make much difference to parsing or sentence interpretation. For most patterns for which the guidelines give different treatments (e.g., numbers and reduplication strings), simple conversion programs can be written to convert the data from one format to another.
Our goal is: in the final output, the word boundary (the highest-level X° in the parse tree) should be as accurate as possible, while the internal structure serves as a bridge for the resource sharing with other systems.
### 1.4 Treatment for unclear cases
There are two types of unclear cases:
- A construction is easy to identify but there is no consensus on its treatment.
Ex: A-not-A, V-de construction, V-R, potential form (i.e., V-de-R). Our approach: we will choose one analysis, and annotate the data according to that analysis, making sure that the annotation is easy to convert to the structures for other analyses if necessary.
- Two constructions are difficult to tell apart by existing tests.
Ex: some N+N are compounds, others are phrases.
Our approach: for the sake of consistency and efficiency, we don't disambiguate the two constructions unless making the distinction is crucial for various reasons.
### 1.5 Organization of the guidelines
The guidelines are organized according to the internal structure of the corresponding expressions (e.g., a verb resultative compound is represented as V+V, while a verb-object expression is represented as V+N), so it is easy for the annotators to search the guidelines for reference. The Part-of-speech tags used in this paper are identical to the ones used in the POS tagging task except that the tags for verbs are merged into V and the ones for nouns are merged into N. For the descriptions of the complete POS tagset, please refer to our Part-of-Speech Tagging Guidelines for the Penn Chinese Treebank (3.0). The list of POS tags can be found in Appendix B.
In these guidelines, we mainly list the decision for each case without elaborating in detail on other alternatives and the reasoning behind each decision.
Chapter 2 Specification
---------
In this chapter, we assume that a sentence has been segmented into large chunks, and the next step is to decide whether each chunk should be further divided. The chapter is arranged by the potential POS of the chunk if the chunk is a word. To search through the chapter, first use the "POS" of the chunk to find the section, then use the "word formation" information to find the subsection; or simply use the "word formation" information.
### 2.1 Common noun: NN
#### 2.1.1 Name of relative
Treat it as one word.
Ex:三叔[uncle]/NN,表叔[uncle]/NN,大姑父[uncle]/NN.
#### 2.1.2 CD+N
If a measure word can be inserted between CD and N without changing the meaning, tag it as CD+N; otherwise, tag it as one word (N).
One word:三排[the third platoon]/NN,一方[one side]/NN,三者[three entities]/NN,一行[a group traveling together]/NN,21世纪[the 21st century]/NT.
Two words:一[one]/CD 学生[student]/NN.
#### 2.1.3 DT+N
Treat it as one word if both DT and N are monosyllabic and either DT or N is bound; otherwise, treat it as two words.
Sometimes, it is difficult to decide whether a morpheme is bound or not because of the influence of non-Modern Chinese. To be consistent, we maintain a list of nouns and a list of determiners. If a morpheme is in one of the lists, we consider it as bound:
- monosyllabic bound nouns:校[school],球 (when it means the earth).
- monosyllabic bound determiners:当[this/that]
We also treat 本人[oneself]/NN as one word and tag it as NN.
One word:本人[oneself]/NN,本校[our school]/NN,全球[whole world]/NN,当地[the place mentioned]/NN,当今[present time]/NT,当代[the contemporary era]/NN.
Two words:本[one’s]/DT 单位[organization]/NN.
#### 2.1.4 PN+N
Treat it as one word if both PN and N are monosyllabic and N is bound; otherwise, treat it as two words.
In this case, the current list of bound nouns is:校[school].
One word:我校[my school]/NN.
Two words:我[my]/PN 单位[organization]/NN.
#### 2.1.5 JJ+N
The pattern is: X+N, where X modifies the N, and X is either a JJ or a prefix.
Note: JJ+N can be a phrase. For example, in one of the files we annotated,全国性[nationwide]/JJ 网络[network]/NN is extended into “全国性[nationwide]/JJ 观测[observe]/VV 苏梅克-列维[Shoemaker-Levy]/NR 9号[number 9]/NN 彗星[comet]/NN 撞击[hit]/VV 木星[Jupiter]/NN 的/DEC 网络[network]/NN”.
Segment X+N according to the type of X:
- X is a prefix: treat X+N as one word.[1](#bookmark93) A list of prefixes:阿,非[non-].
Ex:阿爸[father]/NN,非商业化[non-commercial]/JJ 宗旨[purpose]/NN.
A list of JJs:原[former],前[former]
Ex:原[former]/JJ 在[at]/P 华[China]/NR 老挝[Laos]/NR 难民[refugee]/NN;
前[former]/JJ 民主德国[German Democratic Republic]/NR.
- X is a non-predicate adjective:[2](#bookmark94) if both JJ and N are monosyllabic, tag it as one word; otherwise, treat it as JJ+N.
One word:女人[woman]/NN.
Two words:共同[mutual]/JJ 利益[interest]/NN.
- X is an adjective: treat it as one word if X or N is bound or the meaning of X+N is non-compositional. For unclear cases, if both JJ and N are monosyllabic, treat JJ+N as one word (e.g.,鲜花[fresh flower]/NN,强队[strong team]/NN,红茶[black tea]/NN,好评[favorable comment]/NN).
One word:小媳妇[daughter-in-law]/NN,大洲[continent]/NN,大海[sea]/NN.
Two words:厚[thick]/JJ 书[book]/NN.
#### 2.1.6 LC+N
If both LC and N are monosyllabic, treat the string as one word, and tag it as NN or NT according to its meaning.
Ex:前院[front yard]/NN,前天[day before yesterday]/NT,左肩[left shoulder]/NN.
#### 2.1.7 N+LC
Treat N+LC as one word if:[3](#bookmark95)
- the N and LC are monosyllabic; and
- in this context, the N is non-referential or bound; and
- in this context, the N can not be modified by Det-M or other modifiers.
Otherwise, treat it as two words.
- One word (some of them might be two words in other contexts):室内[indoor] (室内[indoor]/NN 训练[training]/NN),台下[off stage],眼前[at present],境外[foreign] (境外[foreign]/NN 集团[group]/NN,境内外[domestic and international]/NN),海外[oversea] (海外[oversea]/NN 市场[market]/NN),背后[at the back]/NN,天下[world]/NN,国内[domestic]/NN,午后[afternoon]/NT,赛前[before the contest]/NT.
- Two words:中午[noon]/NT 以后[afterwards]/LC.
#### 2.1.8 N+N: N1 modifies N2
If it is 1+1 or 2+1 (i.e., N1 has one or two hanzi and N2 has one hanzi), treat N1+N2 as one word (i.e., we treat all monosyllabic nouns as potential "接尾词"[suffix-like nouns]). If a noun with no more than 2 hanzi is followed by multiple "接尾词", each monosyllabic noun attaches to the preceding string and the whole string is treated as one word (e.g.,物理学家[physicist]/NN). A code sketch of this heuristic follows the examples below.
For other cases, the string is treated as two words.
- One word:北京市[Beijing]/NR,研究室[research lab]/NN,发展史[developmental history]/NN,始祖鸟[proto-bird]/NN, 残疾人[the physically challenged]/NN, 清晰度[visibility]/NN, [sense of urgency]/NN, 大奖赛[tournament]/NN,太阳系[the solar system]/NN.
- Two words:北京[Beijing]/NR 大学[University]/NN,玩具[toy]/NN 工厂[factory]/NN,合作[collaboration]/NN 领域[area]/NN,史学[history]/NN 研究[research]/NN.
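The length condition in this rule is mechanical enough to sketch in code. The snippet below only illustrates the 1+1/2+1 heuristic (real annotation also weighs the wordhood tests of Section 1.2), and the function name is hypothetical:
```python
def merge_nn(n1: str, n2: str) -> bool:
    """Heuristic for the N1+N2 rule: merge into one word when N1 has one or
    two hanzi and N2 is a monosyllabic suffix-like noun (接尾词)."""
    return len(n1) <= 2 and len(n2) == 1

# Examples from the guidelines:
print(merge_nn("北京", "市"))    # True  -> 北京市 is one word
print(merge_nn("研究", "室"))    # True  -> 研究室 is one word
print(merge_nn("北京", "大学"))  # False -> 北京 大学 stays two words
```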
#### 2.1.9 PN+LC
If both PN and LC are monosyllabic, treat PN+LC as one word and tag it as NT or NN.
One word:此间[here]/NN,此前[before this]/NN,其中[among them]/NN,何时[when]/NT.
Two words:这[this]/PN 以后[after]/LC.
#### 2.1.10 V+N
In this pattern, we assume V is VV (for VA+N, please refer to the section for JJ+N). If V modifies N, treat V+N as one word and tag it as a noun.
One word:烤肉[barbecue]/NN,炒菜[stir-fried dishes]/NN,证明信[certificate]/NN,讨论会[symposium]/NN.[4](#bookmark96)
### 2.2 Proper Noun: NR
Currently, if the proper noun is composed of multiple words, we don't group them.
#### 2.2.1 Personal name
Treat it as one word. Don't give the internal structure unless there is a space between two names (in foreign alphabet).
Ex:张胜利/NR,卡尔[Karl]·马克斯[Marx]/NR, John/NR Smith/NR.
#### 2.2.2 Personal name with affixes
Treat it as one word.
Ex:老张/NR,张老/NR
#### 2.2.3 Personal name + title
Treat it as two words.
Ex:张/NR 教授[professor]/NN,张/NR 李/NR 两[two]/CD 位/M 教授[professor]/NN.
#### 2.2.4 Name of Organization/Country/School/..
If the pattern is N1+N2, where N2 is a common noun, then if N2 is monosyllabic, treat N1+N2 as one word, else treat N1+N2 as two words.
Simple names:北京市[Beijing]/NR,黄河[the Yellow River]/NR,沙市[Sha City]/NR,黑龙江省[Heilongjiang Province]/NR.
Complex names:北京[Beijing]/NR 大学[University]/NN,北京[Beijing]/NR 第一[First]/OD 服装厂[Clothing Factory]/NN,美国[the United States]/NR 国会[Congress]/NN.
#### 2.2.5 NR+NR: coordination without conjunction
Treat it as two words.
Ex:中[China]/NR 美[the United States]/NR,中[China]/NR 美[the United States]/NR 关系[relation]/NN, 东[Eastern Asia]/NR 新[Singapore]/NR 澳[Macao]/NR.
### 2.3 Temporal noun: NT
The names of years/months/days/hours and so on are words.
Ex: 1998年[1998]/NT 3月[March]/NT 21日[21st]/NT, 5点钟[5 o’clock]/NT,初一[the first day of a lunar month]/NT,去年[last year]/NT.
#### 2.3.1 CD+N
If CD+N is the name of a time, treat it as one word (NT). If it is the count of the time, treat it as two words (CD+M).
One word: 1998年[1998]/NT, 5点钟[5 o’clock]/NT, 90年代[the 90s]/NT.
Two words: 3/CD 年[year]/M, 3/CD 个/M 月[month]/NN.
### 2.4 Localizer: LC
Localizers are separated from the noun that they attach to except for the case mentioned in Section 2.1.7 (i.e., N+LC).
A localizer is either one or two syllables:
- monosyllabic localizers: e.g.内[in],后[after].
- bisyllabic localizers: e.g.之间[between],以来[since],以后[afterwards],左右[around].
### 2.5 Pronoun: PN
Treat it as one word.
Ex:他们[they]/PN,他自己[himself]/PN,自己[self]/PN.
### 2.6 Determiner: DT
We separate DTs from the succeeding words.
Ex:这[this]/DT 三[three]/CD 个/M 人[people]/NN,各[each]/DT 国[nation]/NN.
Currently, we treat 这些[these] as one word, and tag it as DT.
Some examples of bisyllabic DTs:全体[all],其余[the rest],一切[all],这些[these],那些[those],所
### 2.7 Cardinal number: CD
Treat it as one word. Note: the internal structure of a CD is very easy to recover if needed.
Some examples:
- Pure numbers: 一亿三千万[one hundred and thirty million]/CD, 30.1/CD, 123,456/CD, 35.6%/CD, 30万[three hundred thousand]/CD, 30几[thirty odd]/CD.
- Estimation:三四十[between thirty and forty-nine]/CD 岁[years old]/M.
- CD+X+CD: X is a morpheme such as 余[odd],分之[fraction],点[point]. Ex:三十几亿[three billion odd]/CD,三分之一[one third]/CD,三点一[three point one]/CD,好几[multiple]/CD 个/M.
- CD+X: X is a morpheme such as 余[odd],来[over/odd]:四千一百余[four thousand and one hundred odd]/CD 人[people]/NN,三十来[about thirty]/CD 个/M.
### 2.8 Ordinal number: OD
Treat it as one word.
Ex:第一[first]/OD,第三十一[thirty-first]/OD.
### 2.9 Measure word: M
Treat the measure word, including a reduplicated or a compound measure word, as one word. Treat the string such as 分钟[minute] as one word.
Ex:杯[cup]/M,杯杯[cup-cup]/M,架次[number of flights]/M,分钟[minute]/M.
### 2.10 Verb: VA, VC, VE, and VV
#### 2.10.1 Reduplication: AA, ABAB, AABB, AAB, ABB, ABAC
Treat it as one word.
- AA, A is a verb: AA/V
Ex:看看[see]/VV,红红[vivid red]/VA.
- ABAB: AB is a verb: ABAB/V
Ex:研究研究[research]/VV,雪白雪白[snow white]/VA.
- AABB, AB is a verb: AABB/V
Ex:来来往往[come and go]/VV,高高兴兴[happy]/VA. Note: most of the time, AA or BB is not a word.
- AAB (except for AA-看 in 2.10.2): AAB/V
Ex:蒙蒙亮
Note: most of the time, AA or B is not a word.
- ABB: ABB/V
Ex:绿油油[bright green]/VA,红彤彤[bright red]/VA.
Note: most of the time, A or BB is not a word.
- ABAC, etc.: ABAC/V
Ex:马里马虎[careless]/VA,有条有理[orderly]/VA,一清二楚[very clear]/VA.
#### 2.10.2 “Reduplication”: AA-kan, A-one-A, A-le-one-A, A-le-A
Treat it as one word with internal structure.
- AA-看:(AA/V 看/V)/V
Ex:(说说[say]/VV 看/VV)/V.
The basic meaning of the word 看 is to “see”, but in this context, it roughly means “try to do something”.
- A-one-A: (A/V one/CD A/V)/V
Ex:(想[think]/VV 一[one]/CD 想[think]/VV)/V.
- A-le-A: (A/V le/AS A/V)/V
Ex:(想[think]/VV 了/AS 想[think]/VV)/V.
- A-le-one-A: (A/V le/AS one/CD A/V)/V
Ex:(想[think]/VV 了/AS 一[one]/CD 想[think]/VV)/V.
Note: V+CD+M is treated as three words, e.g. 看[look]/V 一[one]/CD 眼[eye]/M (take a look).
#### 2.10.3 A-not-A
Treat it as one word with internal structure.
Ex:(来[come]/VV 没[not]/AD 来[come]/VV)/V,(高[happy]/VA 不[not]/AD 高兴[happy]/VA)/V, (喜[like]/VV 不[not]/AD 喜欢[like]/VV)/V.
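Since the A-not-A pattern is formally regular, a simple string heuristic can recognize candidates for this treatment. The sketch below is illustrative only (the function name and the restriction to the negators 不/没 are assumptions of this example, and a real annotator would still verify verbhood):
```python
import re

def is_a_not_a(s: str) -> bool:
    """Heuristically check whether a string instantiates the A-not-A pattern."""
    m = re.match(r"^(.+)([不没])(.+)$", s)
    if not m:
        return False
    a1, a2 = m.group(1), m.group(3)
    # Non-reduced form: both halves identical, e.g. 来没来, 高兴不高兴.
    if a1 == a2:
        return True
    # Reduced form: the first half is a prefix of the second, e.g. 喜不喜欢.
    return a2.startswith(a1)

print(is_a_not_a("高兴不高兴"))  # True
print(is_a_not_a("喜不喜欢"))    # True
print(is_a_not_a("差不多"))      # False
```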
#### 2.10.4 AD+V
If one or more of the following hold, treat AD+V as one word (V):
- no free word can intervene between AD and V,
- the V cannot be a predicate without the AD,
- the subcategorization frame of AD+V is different from that of the V.
Otherwise, treat it as two words.
- One word:胡说[talk nonsense],胡来[mess things up],敬献[present with great respect],尚余[remain] (尚余[still remain]/VV 七十五[75]/CD 名/M 难民[refugee]/NN),历任[have served successively as],并列[tie],不畏[not afraid of].
- Two words:已经[already]/AD 采取[take]/VV,不[not]/AD 应该[should]/VV,没[not]/AD 完成[complete]/VV.
#### 2.10.5 MSP+V
If the V can not be a predicate without the MSP, treat MSP+V as one word (V).
One word:以期[in order to]/VV (以期[in order to]/VV 在[at] 与[with] 美国[the United States]、瑞典[Sweden]、挪威[Norway] 这些[these] 世界[world] 强队[strong teams] 交锋[competition] 中[during] ...).
#### 2.10.6 N+V
Some subject-predicate strings can be either a phrase or a word depending on the context.
If a VP-modifier can be inserted between the subject and the predicate part and the “subject” is referential, then the string is a phrase, otherwise it is a word.
One word:头疼[headache]/VA in “他[he]/PN 让[make]/VV 我[me]/PN 很[very]/AD 头疼[headache]/VA〈He gives me a headache〉”.
Two words:头[head]/NN 疼[ache]/VA in “我[I]/PN 头[head]/NN {很[very]/AD} 疼[ache]/VA〈I have a headache〉”.
#### 2.10.7 V+N
If the V and the N are separated (by the aspect markers, by the modifiers of the N, or because the V is reduplicated), treat V+N as two words.
If the V and the N are adjacent,[6](#bookmark98)
- If V-N is semantically transitive and its object can occur after N only when V and N are adjacent (therefore the V is not a ditransitive verb), treat V+N as one word (e.g.,投资[invest]/VV,出席[be present]/VV,关心[care]/VV,为期[scheduled for a specific duration of time]/VV).
- If V and VN have similar meanings and both are semantically intransitive, treat VN as one word (e.g.,睡觉[sleep]/VV).
- If N is “bound”, treat VN as one word (e.g.,游泳[swim]/VV,无望[hopeless]/VV,无效[invalid]/VV,无法[unable to]/VV,辞职[resign]/VV).
- If V-N is 1+1 AND the meaning is non-compositional,treat V-N as one word (e.g.,念书[study]/VV, 流血[bleed]/VV).
Examples of V-N as two words:访[visit]/VV 华[China]/NR in the sentence 他[he]/PN 曾[previously]/AD 七[seven]/CD 次[time]/M 访[visit]/VV 华[China]/NR〈He has visited China seven times〉.
#### 2.10.8 V+R
The tests for verb resultative compounds (V-Rs): both V and R are verbs and the potential forms (V-de-R, V-not-R) exist. So our definition of V-R includes resultative and directional verb compounds (e.g.,看见[see] and 走上来[walk up]), but it does NOT include words such as 改善[improve] and 鼓动[agitate].
- We treat it as one word. For the sake of compatibility with other guidelines, we give the internal structure for the words if they have more than 2 syllables or if the R is the following:完[finish]/VV.
- Words without internal structure:吃掉[eat up]/VV,看见[see]/VV,擦净[wipe clean]/VV.
- Words with internal structures:(做[do]/VV 完[finish]/VV)/V,(擦[wipe]/VV 干净[clean]/VV)/V,(认识[realize]/VV 到[reach]/VV)/V.
#### 2.10.9 Potential form: V-de/bu-R
We treat it as one word.
- If V-R exists, give the internal structure of V-de/bu-R; otherwise, don't give one.
Ex: words with internal structure:(擦[wipe]/VV 不[not]/AD 净[clean]/VA)/V,(擦[wipe]/VV 得/DER 净[clean]/VA)/V.
- words without internal structure:吃不了[unable to eat anymore]/VV,买不起[cannot afford]/VV.
Note: the string “V de R” can be ambiguous between the potential form and the V-de construction. For example, “这[this]张[M]桌子[table]擦[wipe]得[DER]干净[clean]吗[SP]?” can either be a potential form (which means Can this table be wiped clean?), or it could be a V-de construction (which means Has the table been wiped clean?). The two constructions have different syntactic structures. Normally, we can tell them apart by meaning, by the position of the object, or by checking whether adverbs can be inserted between the de and the R.
#### 2.10.10 V+DIR
See Section 2.10.8 (i.e., the section for V+R).
Words with internal structure:(走[walk]/VV 出去[out]/VV)/V,(走[walk]/VV 不[not]/AD 出去[out]/VV)/V. Words without internal structure:走出[walk out of]/VV,想出[think of]/VV.
#### 2.10.11 V+AS
Treat it as two words.[7](#bookmark99)
Ex:走[walk]/VV 了/AS.
#### 2.10.12 V+DER
The pattern is V-de in the V-de construction. We treat V-de as two words.[8](#bookmark100) Ex:走[walk]/VV 得/DER (走[walk]/VV 得/DER 很[very]/AD 快[fast]/VA).
#### 2.10.13 Verb coordination without conjunctive words
If the pattern is 1+1, treat it as a word; otherwise, treat it as multiple words.
One word:修建[build]/VV.
Two words:宣传[propagate]/VV 鼓动[agitate]/VV.
#### 2.10.14 V+coverb
The pattern is V+X, where X is monosyllabic and it is either a P or a V.[9](#bookmark101)
- We first decide whether V+X is a word. If it is, we use its syllable count to decide whether to show its internal structure. That is, if V is monosyllabic, don't give the internal structure; otherwise, give the internal structure.
- treat V+X as one word if X is in the following list:给[give];为[become],成[become],作[treat as],到[arrive],出[out];自[from],向[toward],入[in],以[with].
Ex:
- 给[give]:送给[give/send to]/VV,交给[hand in]/VV,(赠送[give as a gift to]/VV 给[give]/VV)/V.
- 为[to],成[become/into],作[do/as],到[arrive],出[out]:(翻译[translate]/VV 成[become]/VV)/V,当作[treat as]/VV,起到[take effect]/VV,找到[find]/VV,(认识[realize]/VV 到[reach]/VV)/V,决出[decide victors]/VV.
- 自[from],向[toward],入[in],以[with]:来自[come from]/VV,面向[face toward]/VV,流入[flow into]/VV,迈向[step toward]/VV,报以[respond with]/VV,加以[supplement with]/VV.
- treat V+X as two words if X is in the following list:在[at],似[like].
- Ex:生[to be born]/VV 在[at]/P,坐[sit]/VV 在[at]/P,留[stay]/VV 在[at]/P,深[deep]/VA 似[like]/P 海[sea]/NN.
- treat V+X as one word or two words (V+P) according to the meaning of the X, if X is in the following list:于[at].
- If 于 in V+于 can be replaced by 在[at], tag V+于 as two words (V+P). Otherwise, tag it as one word.
- One word:等于[equal to]/VV,缘于[due to]/VV,大于[bigger than]/VV,小于[smaller than]/VV,无助于[of no help to]/VV,低于[lower than]/VV,利于[be beneficial for]/VV,有利于[be beneficial for]/VV.
- Two words:生[to be born]/VV 于[at]/P,建[build]/VV 于[at]/P.
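Because this section reduces to list membership plus one substitution test, it can be sketched as a lookup; the lists are copied from the bullets above, while the function name and string interface are assumptions of this example:
```python
# Membership lists copied from Section 2.10.14.
ONE_WORD_COVERBS = set("给为成作到出自向入以")
TWO_WORD_COVERBS = set("在似")

def segment_v_coverb(verb: str, x: str) -> str:
    """Apply the V+coverb rule to a verb followed by a monosyllabic coverb."""
    if x in ONE_WORD_COVERBS:
        return f"{verb}{x}/VV"     # one word, e.g. 来自/VV
    if x in TWO_WORD_COVERBS:
        return f"{verb}/VV {x}/P"  # two words, e.g. 坐/VV 在/P
    if x == "于":
        # Undecidable from the string alone: 于 is /P when it can be
        # replaced by 在 (生 于), otherwise part of one word (等于/VV).
        return "depends on whether 于 can be replaced by 在"
    return "not covered by the coverb lists"

print(segment_v_coverb("来", "自"))  # 来自/VV
print(segment_v_coverb("坐", "在"))  # 坐/VV 在/P
```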
#### 2.10.15 Others
Generally, in X+V(or V+X) where X modifies V, if X cannot modify other verbs, or V cannot be a predicate without the X, treat X+V as one word.
- Ex:以期[in order to]/VV
### 2.11 Adverb: AD
Adverbs are separated from the XP that they modify.
Adverbs that modify numbers:近[almost]/AD 三十[thirty]/CD,5[five]/CD 分[minute]/M 多[odd]/AD 钟[minute]/NN.[10](#bookmark102)
A string such as 极大[extremely big] is an adverb when it modifies VPs, not AD+VA, because the VA (大[big]) cannot modify VPs without the AD (极[extremely]).
#### 2.11.1 Reduplication
When VA(or AD) reduplicates, the resulting word can be an AD.
Ex:好好[well]/AD 干[do]/VV,常常[always]/AD,仅仅[only]/AD.
#### 2.11.2 DT+M/N
The following are tagged as ADs when they modify a VP/S:这样[this way]/AD (这样[this way]/AD 做[do]/VV),同机[on the same airplane]/AD (同机[on the same airplane]/AD 到达[arrive]/VV).
#### 2.11.3 P+PN
We treat the following as two words:为[for]/P 此[this]/PN.
#### 2.11.4 P+N
The following can be seen as frozen PPs. Since they have the same function as ADs, we treat them as words and tag them as ADs:迄今[until now],沿途[on the way],即席[impromptu],为何[why] (为何[why]/AD 愈演愈烈[get worse and worse]/VA),为什么[why]/AD 来[come]/VV.
#### 2.11.5 PN+LC
If a PN+LC totally loses the function of an NP and the string acts like an adverb, treat it as an adverb.
We treat the following as ADs:此外[in addition]/AD.
#### 2.11.6 Others
If in that context a string totally loses the function of the XP(where X is the head of the string) and the string behaves like an adverb, tag it as AD.
We treat the following as ADs:进一步[a step further]/AD.
### 2.12 Preposition: P
Separate it from NP/S that follows it.
Most prepositions are monosyllabic. Some common bisyllabic prepositions are:为了 [in order to],随着[along with],沿着[along],本着[in conformity with],鉴于[due to],除了[except],经过[through],
作为[being/regard as],截止[until].
When a coverb follows a verb, we have to decide whether the word is part of a verb compound. A list of such coverbs is:于,给,为. See Section 2.10.14 for details.
### 2.13 Subordinating Conjunction: CS
Separate it from the XP that follows it.
A string such as 只有[only] is ambiguous:
- CS:只有[only if]/CS ...才[then]/AD ....
- AD+VE:他[he]只[only]/AD 有[have]/VE 三[three]/CD 块/M 钱[money]/NN〈He only has three dollars〉.
### 2.14 Conjunction: CC
Separate it from the XPs that it conjoins.
Ex:和[and]/CC,与[and]/CC.
### 2.15 Particle: DEC, DEG, DEV, DER, AS, SP, ETC, and MSP
Separate it from the XP that it attaches to.[11](#bookmark103)
Most particles are monosyllabic. One of the bisyllabic particles is 的话[if so]/SP.
### 2.16 Interjection: IJ
Treat it as one word.
Ex: 哈[expressing satisfaction and so on]/IJ.
### 2.17 Onomatopoeia: ON
Treat it as one word.
Ex:哈哈[sound of laughter]/ON,哗啦啦[sound of water/rain]/ON.
### 2.18 Other noun-modifier: JJ
Separate it from the measure word (M) or the noun (N) that it modifies. Ex:三[three]/CD 大[big]/JJ 杯[glass]/M 水[water]/NN
"When JJs modify nouns, the JJs can be adjectives,区别词(非谓形容词),or “phrasal words”. Most of the <4phrasal words,? have two parts: X+Y, both X and Y are monosyllabic, and X or Y is the short-form of the corresponding words. Some examples of the "phrasal words" are as follows:
#### 2.18.1 V+N
V+N:随军[being with the army]/JJ 妓女[prostitute]/NN,旅英[having studied in England]/JJ 学者[scholar]/NN,成套[forming a complete set]/JJ 设备[equipment]/NN,发稿[sending manuscripts to press]/JJ 时间[time]/NN,获奖[receiving award]/JJ 学者[scholar]/NN,驻华[being stationed in China]/JJ 使馆[embassy]/NN,给惠[giving benefit]/JJ 国家[nation]/NN.
#### 2.18.2 AD+VA
AD+VA:最新[the newest]/JJ 消息[news]/NN,超大[extra-large]/JJ 规模[scale]/NN 集成[integrate]/NN 电路[circuit]/NN,较大[relatively big]/JJ 增长[growth]/NN.
The common “AD”:最[the most],超[extra-],较[relatively].
#### 2.18.3 VA+N
VA+N/M:高层[high-ranking]/JJ 人士[official]/NN,高速[high speed]/JJ 公路[highway]/NN,大幅[big size]/JJ 标语[slogan]/NN.
#### 2.18.4 CD+N
CD+N/M:两国[two-nation]/JJ 关系[relation]/NN,多国[multi-nation]/JJ 部队[troop]/NN.
#### 2.18.5 P+N
P+N/LC:对外[foreign]/JJ 政策[policy]/NN
#### 2.18.6 Others
others:关贸[tariff and trade]/JJ 总协定[treaty]/NN,年均[annual average]/JJ 增长率[growth rate]/NN,上述[aforementioned]/JJ 三[three]/CD 国[nation]/NN,历届[all previous sessions]/JJ 世界[world]/NN 体操[gymnastics]/NN 大赛[championship]/NN,有关[related]/JJ 方面[parties]/NN.
### 2.19 Punctuation: PU
Treat it as one word, except when it is part of another word; for example, “,” in a number (e.g., 123,456/CD) or in proper names (e.g.,卡尔[Karl]·马克斯[Marx]/NR).
### 2.20 Foreign word: FW
Treat it as one word, except when it is part of another word (e.g.,卡拉OK[Karaoke]/NN).
### 2.21 Others
#### 2.21.1 Idioms
The frozen idioms (成语) are treated as words when they function as an NP or a VP.
Ex:各有所好[each has his likes and dislikes]/V, 一比高低[compete]/V.
#### 2.21.2 Telescopic strings
Telescopic strings are treated as one word if they are not too long (fewer than four characters). If a string is too long, segment it according to pauses.
Short strings:进出口[imports and exports]/NN 贸易[trade]/NN,国内外[foreign and domestic]/NN 形勢[situation] /NN.
Long strings:交响[symphony]/JJ 乐团[orchestra]/NN,北京[Beijing]/NR 市长[mayor]/NN.
#### 2.21.3 Short form
Ex:三好[three-merit]/JJ 学生[student]/NN,教科文[education, science, and culture]/NN 组织[organization] (UNESCO),七中[the seventh central government]/NN 全会[convention]/NN.
Shortened part is treated as one word. If the shortened part is longer than 3 syllables, segment them according to phonologic evidence (e.g., pauses). The structure of the short form might be different from that of the full form.
Chapter 3 Collocation with Some Morphemes
---------
### 3.1 Strings with zhe5
Some prepositions end with 着.
Ex:随着[along with]/P.
### 3.2 Strings with zhi1
zhi1+LC, where LC is monosyllabic, is treated as one word (LC).
- Ex:之外[aside from]/LC,之中[among]/LC.
- zhi1+CD is treated as DEG+CD (e.g.,方法[method]/NN 之/DEG 一[one]/CD,方法[method]/NN 之/DEG 三[three]/CD).
For simplicity,之一 in a sentence such as 中国是发展中国家之一 is treated as one word and tagged as an NN.
zhi1+N is treated as DEG+N (e.g.,少年[Children]/NN 之/DEG 家[Club/Center]/NN).
### 3.3 Strings with bu4
If X in X+不[not] (or 不[not]+X) must co-occur with bu4, or the meaning of X+不[not] is not compositional, we treat X+bu4 as one word.
Words that include bu4 (不[not]):不到[less than] (不到[less than] 5 分钟[minutes]),不足[less than] (不足[less than] 5 公斤[kilogram]),不便[inconvenient],不久[before long].
### 3.4 Strings with shi4
For simplicity, we treat 特别是[particularly]/AD as one word.
### 3.5 Strings with xie1
The following are treated as one word:这些[these]/PN (or DT),一些[some]/CD.
### 3.6 Strings with you3
V+有[have] is often a verb; for example,刻有[engraved with]/VV,具有[possess]/VV,富有[rich]/VV.
mei2you3(没有) is always treated as one word(VV or VE or SP).
Many idioms include the word 有[have]; for example,若有所思[as if lost in thought]/VV.
The following are two words:有[have]/V 所/MSP,仅[only]/AD 有[have]/V,有[have]/V 可能[possibility]/NN.
The following are ambiguous without the context:
- you3-dian3(有点):V[have]+M or AD[a little bit]
It is V+M when 点 can be dropped or replaced by 一点[a little bit].
you3-dian3 is an AD when it can be replaced by other degree adverbs such as 很[very] or when it is followed by a VP.
- 他[he]/PN 有点[a little bit]/AD 下不了[unable to get off]/VV 台[stage]/NN〈He felt embarrassed〉.
- 这[this]/DT 本/M 书[book]/NN 有[have]/V 点/M 意思[meaning]/NN〈This book is interesting〉.
- 这[this]/DT 本/M 书[book]/NN 有[have]/V 点/M 看头[worth reading]/NN〈This book is worth reading〉.
- you3-de5(有的):V[have]+DEC or DT[some]
- 他[he]有[have]/V 的/DEC 书[book]我[I]也[also]有[have]〈The books that he has, I have, too〉.
- 有的[some]/DT 人[people]已经[already]走[leave] 了[AS]〈Some people have already left〉.
- you3-xie1 (有些):V[have]+M or DT[some]:
- 我[I]只[only]有[have]/VV 些[some]/M 旧书[old books]〈I only have some old books〉.
- 他[he]不[not]像[like]有些[certain]/DT 人[people]专门[especially]爱[like]抬杠[argue]〈He is not like certain people who especially like to argue〉.
- zhi3-you3(只有):AD[only]+V[have] or CS[only if]:
- 你[you]只有[only]/CS 学习[learn]才[then]/AD 能[able to]改进[improve]工作[work]〈You can only improve your work by learning〉.
- 他[he]只[only]/AD 有[have]/VV 10 块/M 钱[dollars]/NN〈He only has ten dollars〉.
### 3.7 Strings with zai4
One word:正在[in the process of]/AD.
### 3.8 Strings with zi4ji3
Always treat PN+zi4ji3 (自己[self]) as one word. Ex:他自己/PN.
Chapter 4 Common Collocations
---------
### 4.1 As one word
- AD:迄今为止[until today],迄今[until now],进一步[one step further],越来越[more and more],同机[on the same airplane],沿途[on the way],即席[impromptu].
- DT:这些[these].
- JJ:对外[foreign] (e.g.,对外[foreign]/JJ 政策[policy]/NN),各界[all circles]/JJ.
- LC:之间[between],在内[inside].
- NN:其中[among them],一行[group traveling together].
- P:为了[in order to].
- V:来自[come from],面向[face toward],流入[flow in],迈向[step toward],报以[respond with],为期[scheduled for a specific duration of time],有利于[be beneficial for].
### 4.2 As two words
- AD-like:并[yet]/AD 未[not]/AD.
- CC-like:及[and]/CC 其[his/its/her]/PN,而[and]/CC 又[in addition]/AD.
- DT-like:各[each]/DT 个/M.
- NN-like:超大[extra-large]/JJ 规模[scale]/NN,我[our]/PN 国[nation]/NN.
- NT-like:零点[midnight]/NT 零一分[one minute]/NT〈one minute past midnight〉.
### 4.3 Other cases
V-V:(迎上[step forward]/VV 前去[go forward]/VV)/V.
Appendix A Comparison with Other Guidelines
----------
In this appendix, we compare our guidelines with the guidelines from the PRC [LTS93] and from Rocling [Chi96]. The grouping of words in our system is done at the bracketing stage.
| | Ours | PRC | Rocling | Example |
| --- | --- | --- | --- | --- |
| Verb | | | | |
| AA | AA | AA | AA | 看看 |
| ABAB | ABAB | ABAB | ABAB | 研究研究 |
| AABB | AABB | AABB | AABB | 高高兴兴 |
| ABB | ABB | ABB | ABB | 绿油油 |
| AAB(excl AA-看) | AAB | AAB | AAB | 蒙蒙亮 |
| ABAC etc. | ABAC | ABAC | ABAC | 有条有理 |
| AA-看 | (AA/V kan/V)/V | AA kan | AA kan | |
| A-yi-A | (A/V yi/CD A/V)/V | A yi A | A yi A | 走一走 |
| A-le-A | (A/V le/AS A/V)/V | A le A | A le A | 走了走 |
| A-le-yi-A | (A/V le/AS yi/CD A/V)/V | A le yi A | A le yi A | 走了一走 |
| nonreduced A-not-A | (A/V not/AD A/V)/V | A not A | A not A | 喜欢不喜欢 |
| reduced A-not-A | (A/V not/AD A/V)/V | A-not-A | A-not-A | 喜不喜欢 |
| V-R (R is monosyl.) | v-r except (v/V 完/V)/V | v-r | v-r | 打破 |
| V-R (R is bisyl.) | (v/V r/V)/V | v r | v r | 扫干净 |
| V-de/bu-R | (v/V de/DER r/V)/V | v de r | v de r | 打得破 |
| (V-R exists) | (v/V bu4/AD r/V)/V | v bu r | v bu r | 打不破 |
| V-de/bu-R | v-de-r/V | ?? | v-de-r | 来得及 |
| (V-R doesn’t exist) | v-bu-r/V | ?? | v-bu-r | 来不及 |
| V-DIR | (v/V dir/V)/V | v dir | v-dir | 走上来 |
| V-x-O | v/V x/X o/N | v x n | v x n | 吃了饭 |
| VO | depends | depends | depends | 关心,吃饭 |
| V-de | v/V de/DER | v de5 | v de5 | 走得 |
| V-AS | v/V as/AS | v as | v as | 走了 |
**Table A.1: Comparison with PRC’s and Rocling’s Guidelines**
| | Ours | PRC | Rocling | Example |
| --- | --- | --- | --- | --- |
| Nouns | | | | |
| Proper Names (NR) | | | | |
| LstNm+FstNm | one seg | two segs | one seg | 王鸣 |
| LstNm+title | name/NR title/NN | name title | name title | 王市长 |
| NR +接尾词 | nr-nn/NR | depends | nr-nn | 北京市 |
| NR + common noun | nr/NR nn/NN | nr nn | nr nn | 北京大学 |
| complex names | several segs | depends | several segs | 北京第一服装厂 |
| Common nouns | | | | |
| N+men5 | one seg | one seg | two segs | 学生们 |
| VA+N | depends | depends | depends | 小媳妇 |
| N+N | depends | depends | depends | 牛肉 |
| Temporal nouns | | | | |
| name of time | cd-year/NT | cd year | cd-year | 1998年 |
| count of time | cd/CD year/NN | cd year | cd year | 3年 |
| DP-related | | | | |
| CD | one seg | ?? | one seg | 一万三千 |
| CD+X+CD | one seg | several | one seg | 三分之一 |
| AD + CD | ad/AD + cd/CD | ad cd | ad cd | 约三百 |
| CD + X | cd-X/CD | cdX | cd-X | 三百多 |
| di4-CD | di4-cd/OD | di4 cd | di4-cd | 第一 |
| CD+M | cd/CD m/M | cd m | cd m | 这个 |
| M + M | m-m/M | m-m | m-m | 片片 |
| yi1+M+M | yi1/CD m-m/M | yi1 m-m | yi1-mm | 一片片 |
| yi1-M-yi1-M | yi1/CD m/M yi1/CD m/M | ?? | yi1 m yi1 m | 一个一个 |
| Markers | | | | |
| V-AS | v/V as/AS | v AS | v AS | 打了 |
| V-de | v/V de/DER | v de5 | v de5 | 走得 |
| SP | one seg | one seg | one seg | 吗 |
| de5(的,地) | one seg | one seg | one seg | 我的,高兴地 |
| zhi1(之)+CD/N | two segs | two segs | two segs | 方法之三 |
| zhi1(之)+LOC | one seg | ?? | one seg | |
| Others | | | | |
| 成语 (no insertion) | one seg | one seg | one seg | 鼠目寸光 |
| ACROM | one seg | one seg | one seg | 北大 |
**Table A.2: Comparison with PRC’s and Rocling’s Guidelines (Ctd)**
Appendix B Treebank Part-of-Speech Tagset
----------
The following is the Part-of-Speech Tagset used in our Penn Chinese Treebank.
| Tag | Description | Example |
| --- | --- | --- |
| AD | adverb | 还 |
| AS | aspect marker | 着 |
| BA | 把 in ba-construction | 把,将 |
| CC | coordinating conjunction | 和 |
| CD | cardinal number | 一百 |
| CS | subordinating conjunction | 虽然 |
| DEC | 的 in a relative-clause | 的 |
| DEG | associative 的 | 的 |
| DER | 得 in V-de const, and V-de-R | 得 |
| DEV | 地 before VP | 地 |
| DT | determiner | 这 |
| ETC | for words等,等等 | 等,等等 |
| FW | foreign words | ISO |
| IJ | interjection | 啊 |
| JJ | other noun-modifier | 男,共同 |
| LB | 被 in long bei-const | 被,给 |
| LC | localizer | 里 |
| M | measure word | 个 |
| MSP | other particle | 所 |
| NN | common noun | 书 |
| NR | proper noun | 美国 |
| NT | temporal noun | 今天 |
| OD | ordinal number | 第一 |
| ON | onomatopoeia | 哔哔 |
| P | preposition excl.被 and 把 | 从 |
| PN | pronoun | 他 |
| PU | punctuation | |
| SB | 被 in short bei-const | 被,给 |
| SP | sentence-final particle | 吗 |
| VA | predicative adjective | 红 |
| VC | 是 | 是 |
| VE | 有 as the main verb | 有 |
| VV | other verb | 走 |
**Table B.1: Our POS tagset in alphabetical order**
Bibliography
------------
[Chi96] Chinese Knowledge Information Processing Group. Shouwen Jiezi - A study of Chinese Word Boundaries and Segmentation Standard for Information Processing (in Chinese). Technical report, Taipei: Academia Sinica, 1996.
[1D92] John Xiang-ling Dai. The Head in Wo Pao De Kuai. Journal of Chinese Linguistics, 1992.
[LTS93] Y. Liu, Q. Tan, and X. Shen. Segmentation Standard for Modern Chinese Information Processing and Automatic Segmentation Methodology, 1993.
[Pac98] Jerome L. Packard, editor. New Approaches to Chinese Word Formation, Mouton de Gruyter, 1998.
[SW87] Anna Maria Di Sciullo and Edwin Williams. On the Definition of Word. The MIT Press, 1987.
[XPX+00] Fei Xia, Martha Palmer, Nianwen Xue, Mary Ellen Okurowski, John Kovarik, Shizhe Huang, Tony Kroch, and Mitch Marcus. Developing Guidelines and Ensuring Consistency for Chinese Text Annotation. In Proc. of the 2nd International Conference on Language Resources and Evaluation (LREC-2000), Athens, Greece, 2000.
[1](#footnote1)
The difference between a JJ and a prefix is that the latter, not the former, is bound. As mentioned before, sometimes it is difficult to tell whether a morpheme is bound or not, so we keep a list of morphemes that we regard as prefixes. In this case, if the N in X+N can be replaced with an NP, we treat X as a JJ, rather than a prefix.
[2](#footnote2)
A word is a non-predicate adjective if it can not appear as a predicate after the subject without the help of 是...的.
[3](#footnote3)
N+LC1+LC2, where LC1 and LC2 denote opposite directions, is treated similarly.
[4](#footnote4)
In either of the last two examples, the first morpheme is bisyllabic, and it could be tagged as nouns in some context. Because the second morpheme is mono-syllabic, the expression should be treated as one word regardless of the POS tag of the first morpheme.
[6](#footnote6)
The V+N combination is among the hardest cases for the word definition. The tests proposed here are not perfect. They tend to treat idiomatic phrases (similar to "kick the bucket" in English) as words. However, those errors can be easily corrected if a dictionary later becomes available.
[7](#footnote7)
It has been argued that aspect markers are affixes (e.g., [1D92]). Right now, we do not group the V and the AS together.
[8](#footnote8)
The function of de in the V-de construction is controversial. It ranges from an affix, a particle, to a verb. We will not get into details here.
[9](#footnote9)
Many of the Xs in this pattern are "coverbs", and it is highly debated which tag, V or P, X should have in this pattern and whether V+X forms a word by a process such as reanalysis.
[10](#footnote10)
Note: 50 多分钟 is segmented as 50多[50-odd]/CD 分钟/M.
[11](#footnote11)
In the literature(e.g., [1D92]), it has been argued that some of the particles such as 得,了 are affixes. For the sake of compatibility with other guidelines and also because it is very easy to automatically group these particles with preceding words, we separate the particles from the preceding words.
================================================
FILE: docs/annotations/tok/index.md
================================================
# Tokenization
## Chinese
```{toctree}
ctb
msr
```
================================================
FILE: docs/annotations/tok/msr.md
================================================
# MSR中文文本标注规范 (5.0 版)
[**Tokenization Guidelines of Chinese Text (V5.0)**](http://sighan.cs.uchicago.edu/bakeoff2006/MSRAsamp/msra-spec.pdf)
黄昌宁 李玉梅 朱晓丹
Chang-Ning Huang, Yumei Li, and Xiaodan Zhu
微软亚洲研究院
Microsoft Research Asia
2006 年 3 月 27 日
March 27, 2006
微软《中文文本标注规范(5.0 版)》
## 第一章 概述
### 1.1 版本说明
微软亚洲研究院《命名实体标注规范》3.0版是为30万词《人民日报》语料的命名实体(NE)标注任务制定的。其英文版‘Guideline on Chinese Named Entity Annotation’成稿于2003年2月,用于LSP(Lexical Service Platform)课题。当时在研究院,命名实体识别(Named Entity Recognition)和自动分词(Word Segmentation)是文本处理中互相独立的两个过程,所以未曾深入考虑分词词表(lexicon)对命名实体标注带来的影响。2005年3月至7月在准备第二届国际自动分词评测(SIGHAN Bakeoff 2005)的237万词训练语料的过程中修订了该规范,形成4.0版。《命名实体标注规范》4.0版的一个最大特点是把命名实体识别有机地融入到中文自动分词的整体过程中去。因此,除了命名实体自身的定义以外,还需要系统地阐明词表词和各类实体之间的复杂关系。本规范是在微软亚洲研究院《命名实体标注规范》4.0版的基础上编制的。由于规范实际上涵盖了文本中词语和各类实体的标注规则与实例,所以更名为《中文文本标注规范》(Tokenization Guidelines of Chinese Text) 5.0版。
### 1.2导读
规范的第一章(概述)、第二章(专有名词标注总则)、第六章(数字串标注总则)以及第九章(分词歧义消解细则)是每个标注人员必读的材料。其它章节收集了大量的实体标注规则与实例,用以补充各类实体定义的不足。凭借这些具有上下文信息的词例化实例可以进一步提高文本标注的精度和一致性,所以它们是供标注人员经常查阅的参考资料。诚恳欢迎读者对本规范和带标语料中的错误提出宝贵意见,以便及时更正。批评和意见请寄[黄昌宁](mailto:cnhuang@msrchina.research.microsoft.com)。
### 1.3标注格式
format-1是面向标注人员的格式:
/十月九日/上午/ ->/[dat十月九日]/[tim上午]/
format-2是基于XML的标注格式:
/十月九日/上午/ -> `<DATE>十月九日</DATE> <TIME>上午</TIME>`
*TIMEX* 是时间表达式,日期 *DATE* 和时间 *TIME* 是它的两个子类。
考虑到本规范主要是为标注人员编写的,以后的例子主要以第一种格式(format-1)表示。想了解更多 XML 格式的读者,请参见 MET-2 Guideline[1]。
[1] MET (Multiple Entity Task)是1997年第七届MUC (Message Understanding Conference)会议多实体识别任务的简称。MET-2是当年美国NIST公布的命名实体标注规范。可查阅:http://www.itl.nist.gov/iaui/894.02/related_projects/muc/proceedings/ne_task.html
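The format-1 markup is regular enough to parse with a short script. Below is a rough sketch (the regular expression and function name are assumptions of this example, not part of this guideline or of HanLP; entity marks embedded inside lexicon words, as in 1.5.2.3, are out of its scope):
```python
import re

# Rough parser for format-1: tokens are delimited by slashes, and a named
# entity is written as [tag + text], e.g. /[dat十月九日]/[tim上午]/.
ENTITY = re.compile(r"\[(P|L|O|[a-z]{3})\s*(.+)\]$")

def parse_format1(line: str):
    """Yield (text, tag) pairs; tag is None for ordinary lexicon words."""
    for token in filter(None, line.split("/")):
        m = ENTITY.match(token)
        yield (m.group(2), m.group(1)) if m else (token, None)

print(list(parse_format1("/[dat十月九日]/[tim上午]/")))
# [('十月九日', 'dat'), ('上午', 'tim')]
```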
### 1.4命名实体标记集
表1-1是本规范定义的全部命名实体标记,包括专有名词(*NAMEX*)、时间表达式(*TIMEX*)、数字表达式(*NUMEX*)、度量表达式(*MEASUREX*)和地址表达式(*ADDREX*)等类五大类及其下属的三十个子类。
| 大类 | 子类 | Format-1标注集 | Format-2标注集 |
| --- | --- | --- | --- |
| NAMEX | Person | P | PERSON |
| | Location | L | LOCATION |
| | Organization | O | ORGANIZATION |
| TIMEX | Date | dat | DATE |
| | Duration | dur | DURATION |
| | Time | tim | TIME |
| NUMEX | Percent | per | PERCENT |
| | Money | mon | MONEY |
| | Frequency | fre | FREQUENCY |
| | Integer | int | INTEGER |
| | Fraction | fra | FRACTION |
| | Decimal | dec | DECIMAL |
| | Ordinal | ord | ORDINAL |
| | Rate | rat | RATE |
| MEASUREX | Age | age | AGE |
| | Weight | wei | WEIGHT |
| | Length | len | LENGTH |
| | Temperature | tem | TEMPERATURE |
| | Angle | ang | ANGLE |
| | Area | are | AREA |
| | Capacity | cap | CAPACITY |
| | Speed | spe | SPEED |
| | Acceleration | acc | ACCELERATION |
| | Other measures | mea | MEASURE |
| ADDREX | Email | ema | EMAIL |
| | Phone | pho | PHONE |
| | Fax | fax | FAX |
| | Telex | tel | TELEX |
| | WWW | www | WWW |
| | Postal code | pos | POSTALCODE |
**表1-1命名实体的标记集**
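For programmatic conversion between the two tag sets, Table 1-1 can be restated as a dictionary. The sketch below lists a representative subset only (the variable name is a choice of this example; see the table for all thirty subtypes):
```python
# Partial format-1 -> format-2 tag mapping, transcribed from Table 1-1.
FORMAT1_TO_FORMAT2 = {
    "P": "PERSON", "L": "LOCATION", "O": "ORGANIZATION",  # NAMEX
    "dat": "DATE", "dur": "DURATION", "tim": "TIME",      # TIMEX
    "per": "PERCENT", "mon": "MONEY", "int": "INTEGER",   # NUMEX (part)
    "age": "AGE", "len": "LENGTH", "spe": "SPEED",        # MEASUREX (part)
    "ema": "EMAIL", "pho": "PHONE", "www": "WWW",         # ADDREX (part)
}

assert FORMAT1_TO_FORMAT2["dat"] == "DATE"
```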
### 1.5基本原则
#### 1.5.1基本考虑
通用性:尽量遵循国际标准MET-2和ER-99,不同之处在本规范中阐明。
实用性:可用于LSP (Lexical Service Platform), TTS (Text To Speech conversion), IR (Information Retrieval), IE (Information Extraction), QA (Question Answering), IME(Input Method Editor)等应用系统。
#### 1.5.2标注对象
##### 1.5.2.1词表词与未登录词
本规范认为:文本中的任何一个词要么是词表词(LW),要么是未登录词(OOV)。两者都是文本的标注对象。未登录词可以进一步分成命名实体(NE)、词法派生词(MDW)和新词(NW)等三部分。本规范定义的命名实体是未登录词的主体。
(1)命名实体(NE)
命名实体可以进一步分成如下五大类共三十个子类(详见表1-1):
- 专有名词(*NAMEX*)包括人名(*P*)、地名(*L*)和机构名(*O*)等3种。
- 时间表达式(*TIMEX*)包括日期(*dat*)、时间(*tim*)和时段(*dur*)等3种。
- 数字表达式(*NUMEX*)包括百分数(*per*)、钱款(*mon*)、频度(*fre*)、整数(*int*)、分数(*fra*)、小数(*dec*)、序数(*ord*)和比率(*rat*)等8种。
- 度量表达式(*MEASUREX*)包括年龄(*age*)、温度(*tem*)、角度(*ang*)、长度(*len*)、面积(*are*)、容积(*cap*)、重量(*wei*)、速度(*spe*)、加速度(*acc*)和其它(*mea*)等10种。
- 地址表达式(*ADDREX*)包括电子邮箱(*ema*)、电话(*pho*)、传真(*fax*)、电报挂号(*tel*)、邮政编码(*pos*)和网址(*www*)等6种。
在标注过的文本中,词的边界一律用斜线(slash)表示。除了词表词以外,每个独立的命名实体(即非嵌入到词表词内部的实体,见1.5.2.3)也被视为一个词,其标注符号及形式详见本规范。
(2)词法派生词(MDW)
以词表词AB的重迭形式AABB和AB/AB为例:
/*转轨*/*哪*/*有*/*像*/*人*/*说*/*得*/*那般*/*轻轻松松*/?
/*积累*/*多*/*了*/*,*/*抽出*/*时间*/*,*/*认真*/*整理*/*整理*/,
(3)新词(NW)
一个新词的左右两侧用符号&标示,其内部的切分符保留,如:
/&*桑拿*&/*浴*/
/*天时地利*/&*人*/*和*&/*;*/
/[L*罗*]/*货币*/&*列*/*伊*&/
以下是一些真实的例句,例句中的实体标注符号请参阅表1-1。
[Example-1]
```
/[dat 6月29日]/、/[dat 30日]/[tim 晚上]/,/[L 北京市]/下/了/[int 两场]/大雨/,/笔者/
居住/的/宿舍/楼/前/,/宽/[len 六七米]/、/长/[len 30多米]/的/路/上/积水/达/膝盖/之上/。
```
[Example-2]
```
/[dat 6月中下旬]/,/笔者/到/[L 意大利]/、/ [L 西班牙]/等/国/访问/时/,/一个/很/深/的/感受
/是/[L 意]/、/[L 西]/两国/的/高速公路/非常/发达/,/东西南北/,/纵横/成/网/,/.四通八达/。
```
[Example-3]
```
/[O 县委]/决定/选派/任/了/[dur 八年]/[O 城建局]/长/的/[P 周欣光]/担任/[O 老干部局]/长/。
```
[Example-4]
```
/[L喇嘛寺村]/地处/[L承德避暑山庄]/,/[L山庄]/寺庙/林立/,/僧侣/穿梭/,/[L山庄]/[L外八庙]/的/[ord第一个]/庙/就/是/[L喇嘛寺]/。
```
##### 1.5.2.2*L*, *P*,*O*, *dat*,*tim*,*dur*等实体的边界允许跨越多个词表词
例如:
/[L*瑞典*]/[O*斯德哥尔摩国际和平研究所*]/ /[O*中国工商银行上海市分行*]/
/[tim*下午当地时间*5*时*59*分*]/
##### 1.5.2.3专名的标记(L,P,O)可以插入到一个词表词的内部
例如,词表词*抗日战争*和*事后诸葛亮*中的地名和人名应分别予以标注。
/*抗*[L*日*]*战争*/----正确标注。
/*抗日战争*/*----未标出*L,是错误标注。
/*抗*/[L*日*]/*战争*/ ----插入分词标记,是错误标注。
/*事后*[P*诸葛亮*]/
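Reading such embedded marks mechanically is straightforward; the sketch below extracts L/P/O spans from inside a single token without touching the word boundary (the regex and return format are assumptions of this example, and the emphasis asterisks used in the running text are omitted for clarity):
```python
import re

# Extract entity marks embedded inside one lexicon word, e.g. 抗[L日]战争,
# where the word boundary stays intact but an L/P/O span is marked inside.
EMBEDDED = re.compile(r"\[([PLO])([^\]]+)\]")

def embedded_entities(word: str):
    """Return (surface_form, [(entity_text, tag), ...]) for one token."""
    entities = [(m.group(2), m.group(1)) for m in EMBEDDED.finditer(word)]
    return EMBEDDED.sub(r"\2", word), entities

print(embedded_entities("抗[L日]战争"))    # ('抗日战争', [('日', 'L')])
print(embedded_entities("事后[P诸葛亮]"))  # ('事后诸葛亮', [('诸葛亮', 'P')])
```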
##### 1.5.2.4数字串(除专名以外的其他四类表达式)的标记不得插入到词表词的内部
###### 1.5.2.4.1dat,tim等标记不得插入到一个词表词的内部
词表词*夏令营、*春耕、*冬训*、*早出晚归*中的*夏、春、冬、早、晚*等词素都有*dat*和*tim*的意思,但不得标注。例如,
/[dat*冬*]*训*/ ---错误标注。
/[tim*早*]*出*[dat*晚*]*归* / ---错误标注。
然而词表词被整体标注为*dat*和*tim*的情况是常有的,例如:
/[dat*初冬*]/ ----*初冬*是词表词。
/*[dat*夏季*]/*----*夏季*是词表词。
/告别/*了*/[dat*冬日*]/*的*/*凝重*/*、*/[dat*春天*]/*的*/*轻盈*/*、*/[dat*夏日*]/*的*/*浪漫*/,
- 注:在文本中具有比喻意义的*春、夏、秋、冬*、历史上的*今天、昨天、明天*不作标注。例如:
/[dat*今年*]/*又*/*迎来*/*了*/*一个*/*科学*/*的*/*春天*/ /"/*在*/*陆地*/*资源*/*日渐*/*减少*/*的*/*今天*/*,*/
/*他们*/*的*/*明天*/*将*/*更加*/*辉煌*/*。*/
###### 1.5.2.4.2int,ord等标记不得插入到一个词表词的内部
词表词*五湖四海*、*不管三七二十一*、*三纲五常中的数词不允许标注*int(整数)。例如,
/*[int*五*]*湖*[int*四*]*海*/*----错误标注。
/*十年动乱*/*结束*/*不久*/*,*/ ----*十年动乱*是词表词。*十年*不标。
/*不管三七二十一*/ /*三纲五常*/
##### 1.5.2.5数词首、半、双、两等
###### 1.5.2.5.1序数词素首
词表中有许多词含有词素*首*,如*首创、首倡、首选、首发、首航、首飞、首演、首映、首战、首展、首席代表、首席科学家、首席执行官、首富、榜首、魁首、居首*等。但不可把词表词中的词素*首*单独作为*ord*(序数)来标注。
/*首席执行官*/----正确标注。
/*[ord首席]执行官*/----在词表词中插标*ord*是错误的。
以下的词表词属于"首+量词"结构,可以整体作为*ord*标注。例如:
*[ord*首届*]*,*[ord*首次*]*,*[ord*首批*]*,*[ord*首位*]*,*[ord*首例*]*
###### 1.5.2.5.2分数词素半
词表中有许多词含有词素*半**如半价、半票、半饱、半身、半世、半辈子、上半时、下半场、半边*等,但不可把上述词表词中的词素*半*标注为*fra*(分数)。
/*下半场*/*比赛*/[O*中国队*]/*未进*/*一*/*球*/
/*上半时*/
/*下*[fra*半*]*场*/----在词表词中插标*fra*是错误的。
以下的词表词可作为不同的数字串(*dur*,*tim*,*fra*,*int*,*age*)标注:
*[dur*半年*]*,*[dur*半天*]*,*[tim|dur*半夜*]*,*[fra*半个*]*,*[int|age*半百*]*
- 注:半个西瓜中的半个,与四半中的半概念不一样,前一个半是指二分之一,
后一个半是量词,所以标注也不同!!/*[int*一个*]*/*西瓜*/*分为*/[int*四半*]/ /[fra*半个*]/*西瓜*/
###### 1.5.2.5.3整数词素双
当数词双成为词表词的一个词素时,如"双方、双边、双手、双打、双杠、双轨、双层、双目、双亲"等,一律不作为整数(*int*)标注。对于非词表词,只标[*int双*]。例如:
/*窗外*/*又*/*起风*/*了*/*,*/*双层*/*的*/*窗*/*硬是*/*阻挡*/*不住*/*沙尘*/*的*/*侵扰*/*。*/
/*双方*/*认为*/*,*/[L*中*][L*美*]/*两国*/*应该*/*从*/*战略*/*的*/*高度*/*和*/*长远*/*的*/*角度*/
- 注:一般情况下,数词和"方"之间不切分整体标为*int*。但"四方"是词表词所以不标。
[*int三方*]/*会谈*/ /*举行*/*四方*/*会谈*/
以下是相关的例子:
/*用*/*任何*/*一*/*部*/[*int双*]/*音频*/*电话*/*只需*/*拨打*/[pho*2580*]/*就*/*可以*/
/*部队*/*进行*/*的*/*海上*/*训练*/*、*/[int*双*]/*机*/*穿云*/*、*/*超低空*/*等*/*高难*/*课目*/*训练*/ /*全村*/[are*700亩*]/*旱地*/*都*/*种上*/*了*/[int*双*]/*膜*/*棉*/*,*/
###### 1.5.2.5.4整数词素两
当数词"两"成为词表词的一个词素时,如"两国、两会、两地、两者、两头、 两手、两边、两旁、两侧"等,一律不作为数位串(*int*)标注。例如:
/*使*/*两国*/*的*/*友好*/*合作*/*得到*/*巩固*/*和*/*发展*/。
- 注:一般情况下,数词和"国"之间是要切分的,如:[*int五*]/*国*/*元首*/
/*前*/*些*/*年*/*我*/*对*/*参加*/*『*/*两会*/*』*/*总是*/*有点*/*发怵*/*。*/
/*大街*/*两旁*/*店铺*/*林立*/
/*戏台*/*两侧*/*立柱*/*上*/*有*/*这样*/*一*/*副*/*对联*/*:*/
/*中间*/[int*两间*]/*是*/*客厅*/*,*/*两边*/*是*/*卧室*/*和*/*书房*/*。*/
/*对*/*分散*/*居住*/*的*/*"*/*五保*/*"*/*户*/*,*/*镇*/*、*/*村*/[int*两*]/*级*/*拨*/*专款*/
/[int*两*]/*车*/*饮料*/*以及*/*办公*/*桌椅*/*,*/
- 注:临时量词"车、船、床、桌、屋子、院子"等不进入int标注。/*成为*/[O*议会*]/[int*两*]/*院*/*审议*/*的*/*重点*/*和*/*舆论*/*关注*/*的*/*焦点*/
- 注:两院不是词表词,所以应当切分标注如上。
/*及早*/*进行*/*政治*/*谈判*/*推动*/[L*两岸*]/*关系*/*发展*/ /*沿江*/[int*两*]/*岸*/*苗家*/*吊脚楼*/*上*/*的*/*观众*/
- 注:词表词两岸是专指台湾海峡两岸的地名。如果泛指江河两岸,则不作为 地名标注,而且要切分并标数词"两"为*int*。
/*一下*/*进*/*了*/[int*两*]/*球*/
#### 1.5.3基本规定
1)标注时,不得在原来的文本中加入回车换行符。
2)对于NIST制定的两个中文NE标准:MET-2和ER-99。前者已有系统参加评测,它们的评测结果可供后来者参考;后者是前者的修订版,但尚未有系统参加测试。本规范与这两种标准不同之处将尽可能在注释中加以说明。例如:
/[dat*去年上半年*]/
- 注:MET-2把去年上半年*整体视为*dat*;ER-99则只将上半年*视为*dat*。
3)对于微软研究院根据自己的需要而加入的标记,本规范也将在注释中加以说明。比如本规范要求的如下标注:
/[P*邓小平*]/*理论*/
- 注:MET-2和ER-99规定,*理论,主义,思想,定律*等词前面的人名均不作为专名标注(见2.8)。
## 第二章 专有名词标注总则
### 2.1专有名词(NAMEX)标注通则
对于人名、地名和机构名这三类专有名词,MET-2和ER-99之间的差异甚微,在它们给出的示例中,只有两处不同:中南美*和长江流域*(具体情况见后)。所以在制订人名、地名、机构名的标注规范时,我们没有刻意去区分这两个标准,而是力图把它们统一地融入本规范。
下面给出人名、地名、机构名的定义。
### 2.2专有名词是具体的、特定的,而不是抽象的、泛指的
比如:*上苍、老外、姑娘,小镇,企业*等就不应视为专有名词。
### 2.3复合专有名词的标注不允许嵌套
在MET-2和ER-99标准中,任何命名实体都不允许嵌套。换句话讲,只标一个实体的最长边界,不标其内部包含的其它实体。
### 2.4人名、地名、机构名中的数字串不单独标出
例如:
/[P*龟山一郎*]/
/[L*德富路二四一至二六三号*]/
/[O*北京*101*中学*]/
/[O*北京*[ord*四*]*中*]/ ----这种嵌套式的标注是错误的!
### 2.5含有外文和数字的命名实体应整体一起标注
例如:
/[O*American航空公司*]/
/[O*SONY公司*]/
### 2.6当两个实体用虚词连接时应分别标注为两个实体
例如:
/[L*美国*]/*的*/[L*纽约*]/
/[L*美国*]/*的*/[P*理查德本森*]/
但当*的*成为实体的一部分时,要整体一起标注。例如:
/[O*美的电器集团*]/
### 2.7实体前后有引号或书名号的情况
如果一个命名实体中间有引号或书名号,则引号或书名号是该实体的一部分。如果一个实体被外面的引号或书名号括起来,那么其引号或书名号就不作为实体的一部分标注。例如:
/[O"*阿克布拉克*"*中哈合资企业*]/
/[O*美国《幸福》杂志*]/
/*《*/[O*星岛日报*]/*》*/*的*/*社论*/*说*/
### 2.8短语内部包含实体、但整体又不是命名实体的情况
ER-99规定:如果一个短语内部包含实体、但整体又不是命名实体,则一律不作标注。本规范则要求对该短语中的实体部分加以标注。例如:
/[L*巴拿马运河*]/*条约*/
- 注:ER-99认为,巴拿马运河条约*整体不能分解,其中的地名不应标注。但本规范把其中的巴拿马运河*单独标为地名。
/[L*巴拿马运河*]/*----巴拿马运河*单独出现时,作为地名标注。
/[L*香港*]/*脚*/
- 注:英文为"Hong Kong foot",类似于"athlete's foot",不可分解,所以ER-99规定整体不标。本规范仍将*香港*标为地名。
/[L*美国*]/*小姐*/
- 注:原文为"Miss America",指选美活动中获全美第一名的小姐。对此ER-99规定整体不标。本规范仍将*美国*标为地名。
/[L*美国*]/*姑娘*/*----ER-99对本例的美国*也是标注的。
/[ord*第四十六届*]/[O*太平洋亚洲旅行协会*]/*年会*/
- 注:此例在ER-99中整体不标,理由是不可分解。本规范认为找不出充分理由说明其不可分解。所以我们把太平洋亚洲旅行协会*标为机构名。第四十六届太平洋亚洲旅行协会年会*整体不是机构名。
/[P*毛泽东*]*思想*/ /[P*马克思*]*主义*/
/[P*马克思*]/*主义*/ ----*错误标注!因为*马克思主义*是词表词。
/[P*阿佛加罗*]/*定律*/
- 注:ER-99规定,在理论、主义、思想、定律*等词前面出现人名时,是整体不可分解的字符串;因此该字符串和其中的人名都不标注。但本规范仍将标注其中的人名。
### 2.9与军队相关的情况
当泛指某个国家的军队(如英军、美军*等)时,不是机构名;当指一个具体的军种(如空军、陆军、海军*等)时,要标注为机构名。例如:
/[L*美*]/*军*/*飞机*/
/[O*斯里兰卡空军*]/
/[O*英国皇家空军*]/
但是,有如下特殊情况:
/[L*济南军区*]/ ----军区是*L*而不是*O*。
/[L*彼得森空军基地*]/ ----军事基地是*L*而不是*O*。
/[L*西非*]/&*维*/*和*&/*部队*/ ----部队不作为机构名标注。
### 2.10多媒体、产品和条约中的人名、地名、机构名
ER-99规定:当人名、地名、机构名属于多媒体、产品和条约时,均不加标注。但本规范对上述实体名还是要标注的。例如:
/[P*邓小平*]/*一*/*片*/*的*/*播出*/
- 注:ER-99规定,电视节目的名字邓小平*不标。本规范仍把它标为人名。此外,邓小平*作为片名,在规范的文本中应当用书名号括出,如《邓小平》。
/*二战*/ ----*二战*是事件,所以不标注。
/[L*香港*]/*百*/*题*/*今天*/*为*/*您*/*解答*/
- 注:ER-99规定,香港百题是电视片的标题,所以专名香港不予标注。但本规范仍把香港标为地名。下面其它的例子就不一一解释了。
/*这*/*本*/*介绍*/[P*毛泽东*]/*的*/*小说*/ ----*毛泽东*要标注。
/*这*/*本*/*名*/*为*/[P*毛泽东*]/*的*/*小说*/ ----ER-99*毛泽东*不标。
/[L*广州*]/*条约*/ ----ER-99*广州*不标。
/[L*辽*][L*沈*]*战役*/ ----ER-99*辽沈*不标。
本规范在后面还要对人名、地名、机构名中不加标注的情况作专门的说明,详见下面的各章节标注细则。
### 2.11别名或简称的标注
对人名、地名、机构名的别名或简称要标注。例如:
/[O*IBM*]/
/[L*深*]/[L*沪*]/*股市*/
/[O*北约*]/
/[L*中*][L*美*]/*首脑*/*互访*/
/[L*中*]/[L*文*]/*双方*/*一致*/*认为*/
- 注:由于中美是词表词,标注地名时不可插入分词标记。中文也是词表词,但这里是指中国和文莱,所以标成地名时需要在两个简称中插入分词符号。这样的词表词还有中意、意中、中巴、日中、中肯、中非等。巴中是一个地名,但表示巴基斯坦和中国时需要用分词符号把两个简称分隔开。
- 注:对于简称中嵌套的人名、地名、机构名不予标注,如:
/[O*中共*]/ ----*中*指*中国*,但不标。
/[O*中共中央政治局*]/ - ---同理,不标注*中*。
## 第三章人名
人名一般包含姓和名两部分,姓是表明家族的字,有单姓和复姓之别;名也就是名字,是一种称号,由一个或几个字组成,跟姓合在一起,用来代表一个人,以区别于别的人。下面将对人名的标注规则进行详解。
### 3.1人名标注规则
正常情况下,人名一般包含姓和名两部分,标注规则如下表所示:
| **序号** | **情况** | **标记方法** |例子 |
| --- | --- | --- | --- |
| 1 | 只含姓,没有名 | 标出姓氏部分 | *[P*庄*]*、*[P*欧阳*]*、*[P*司马*]* |
| 2 | 只包含名字 | 标出名字部分 | *[P*育焜*]* |
| 3 | 姓名 | 姓名整体标出 | *[P*苏宗哲*]*、*[P*萝莉胡吉温*]* |
| 4 | 姓名/姓/名+称谓,称谓+姓名 | 只标姓名部分 | |
| 5 | 前缀+姓名/姓/名 | 整体标出 | |
| 6 | 姓名+姓名 | 分开来标 | *[P*李向东*]/[P*李向阳*]* |
| 7 | 外国人名 | 作为一个整体来标 | *[P*罗马里奥*]*[P*马拉多纳*]*[P*比尔*•*盖茨*]* |
- 说明:当人名中包含•时,整体标注为人名,如[P*比尔•盖茨*]。
### 3.2人名标注细则
#### 3.2.1人名的示例和详细说明
#### 3.2.1.1人名实例
/[P*颜惠忠*]/
/[P*连战*]/
/[P*凡*•*高*]/
/[P*陈方安生*]/
---当妻子与丈夫的名字写在一起时,要作为一个人名整体标注为P!
#### 3.2.1.2称谓、绰号、官职不作为人名的一部分
称谓、绰号、官职(如先生、总理等)不作为人名的一部分。例如,
/[P*张*]/*经理*/
/[P*李*]/*市长*/
/[P*陈*]/*姓*/*游客*/*说*/
/[P*刘*]/[ord*二*]/*嫂*/ /[P*周*]/*总理*/
/[P*雷锋*]/*同志*/
/[P*奥尔布赖特*]/*国务卿*/
#### 3.2.1.3当称谓和姓名不可分时应整体标注为人名
/[P*李主席登辉*]/*先生*/
/*处*/[P*李犯清龙*]/*死刑*/*,*/
/[P*李犯*×*龙*]/*持*/*刀*/*行凶*/*杀害*/*无辜*/*青年*/*,*/
#### 3.2.1.4几世、几代要作为人名的一部分
/[P*十四世达赖丹增加措*]/
/[L*英国*]/*女王*/[P*伊丽莎白二世*]/
#### 3.2.1.5家族实体
/[P*蒋*]/*氏*/*父子*/
/[P*西迪*]/*兄弟*/
#### 3.2.1.6圣人和宗教人物要标注为人名
/[P*释迦牟尼*]/
/[P*达赖*]/*喇嘛*/
### 3.3虚构的人物、动物的名字要标注为人名
#### 3.3.1在童话、小说中虚构人物要标注为人名
/[P*孙悟空*]/
/[P*玉皇大帝*]/
#### 3.3.2虚构的动物和非人的人物要标注为人名
/[P*唐老鸭*]/
/[P*花仙子*]/
/"/[P*盼盼*]/"/*是*/*国内外*/*著名*/*的*/*熊猫*/*明星*/*,*/
/*争相*/*目睹*/*狮*/*王*/[P*木法沙*]/*和*/*王后*/[P*色拉碧*]*产下*/*的*/*小*/*王子*/[P*辛巴*]/
/*走进*/*一家*/*饭馆*/*,*/*发现*/*老板*/*就*/*是*/*大*/*灰*/*狼*/[P*罗克*]/*。*/
#### 3.3.3用称谓或朝代等名号来指称特定人时要标注为人名
例如:
/[P*康熙*]/
/[P*乾隆*]/
/[P*秦始皇*]/
/[P*老子*]/
/[P*孔子*]/
### 3.4不标注为人名的各种情况
#### 3.4.1虚构的非人的植物的名字不作为人名标注
如:
/"/*彩霞*/*,*/"/*石子*/*小声*/*嘟哝*/*着*/*,*/"/*多*/*恶心*/*的*/*名字*/*!*/"/
/*电磨*/*姐姐*/*故意*/*气*/*气*/*小*/*毛驴*/*,*/*说*/*:*/"/*输*/*了*/*,*/*可*/*不能*/*哭鼻子*/*。*/"/
/"/*卡车*/*哥哥*/*,*/*我*/*和*/*你*/*来*/*比*/*一*/*比*/*谁*/*运*/*得*/*多*/*,*/*怎么样*/*?*/"/
/*好像*/*在*/*说*/*:*/"/*荷花*/*姐姐*/*,*/*你*/*好*/*!*/
#### 3.4.2对于嵌套在地名和机构名中的人名,不作标注
如:
/[L*嘉诚广场*]/
/[O*中山大学*]/
/[O*宋庆龄基金会*]/
#### 3.4.3作为书名或画名的人名
作为书名或画名的人名ER-99不标(见2.8),但本规范是要作标注的。如:
/*世界*/*名画*/*《*/[P*蒙娜莉萨*]/*》*/
*/《/*[P*蒋介石*]/*与*/[P*毛泽东*]/*》*/
#### 3.4.4法律、法庭事件、天气形成、疾病和奖金等五种情况
当人名后面紧跟法律名、法庭事件、天气形成、疾病、奖金这五种情况时,人名不标注。
例如:
/*里*/*氏*/[ord*六点二级*]/ -----*里*不标。
/*专家*/*呼吁*/*人们*/*要*/*注意*/*沙*/*氏*/*杆菌*/ -----*沙*不标。
/[P*诺贝尔*]*奖*/ -----ER-99*诺贝尔*不标。
#### 3.4.5在人名后面出现基金会时要整体标注为机构名
/[O*李嘉诚基金会*]/
所以基金会*和奖、奖金*是不同的两种情况,需加以区别。又如
/[O*李嘉诚股份有限公司*]/
/[O*诺贝尔股份有限公司*]/
## 第四章 地名
地名包括洲、海洋、国家、省、市、县、地区、街道、乡、镇、村、机场、军事基地、军区、铁路、公路、桥梁、海峡、海湾、港湾、河流、湖、公园、草原、煤矿、牧场、养殖场、音乐厅、剧院、教堂、寺庙、图书馆、博物馆、美术馆、展览中心、公园、动物园、植物园、火车站、广场、大厦、大楼、体育场(馆)、游泳馆(池)、赛车场、商城、超市、书店(城)等城市公共设施,还包括某些特定的城市建筑和虚构的处所。详见下表。
### 4.1地名标注规则
| **序号** | **情况** | 标记方法 |例子 |
| --- | --- | --- | --- |
| 1 | 只是单独地名 | 标出地名部分 | *[L*中国*]*[L*竹塘乡*] |
| 2 | 地名+地理(行政)单位 | 作为整体标出 | *[L*北京市*]*[L*台北县*]*地理单位如:省、地区、市、县、乡、镇、村、店、庙、沟、屯、坟、崖、海洋、河、川、江、峡谷、海湾、港湾、丘陵、湖、半岛、三角洲、区、街、路、街、街道、社区、小区、公寓、音乐厅、剧院、图书馆、博物馆、美术馆、展览馆、公园、动(植)物园、火车站、广场、大厦、大楼、体育场(馆)、游泳馆(池)、赛车场、商城、超市、书店(城)等城市公共设施及象征性建筑物、军事基地、军区等。*[L*天安门广场*]*[L*艾菲尔铁塔*]* |
| 3 | 包含上、下位的地名(即合成地名)以及并列的地名 | 一律分别单独标出 | *[L*山东省*]/[L*青岛市*]/[L*胜利广场*]*[L*青岛市*]/[L*孙中山广场*]*[L*北京市*]/[L*海淀区*]/[L*知春路*]/[L*希格玛大厦*]*[L*北京*]/*、*/[L*天津*]/*、*/[L*上海*] |
| 4 | 地名简称 | 单独标出 | *[L*鲁*]/*、*/[L*冀*]/*、*/[L*京*]* |
| 5 | 并列的简称 | 单独标出 | [L*中*]/[L*俄*]/*两国*/*领导人*/*进行*/*了*/*会晤*[L*港*][L*澳*][L*台*]/地区 |
| 6 | 地名包含人名以及地名包含地名的情况 | 地名中的人名、地名不标 | *[L*李嘉诚广场*]*[L*南京路*] |
| 7 | 地名+地名关键词表达一个完整的概念时 | 相对完整的地名 | *[L*南非共和国*]*[L*宁夏回族自治区*]*[L*香港特别行政区*]* |
### 4.2地名标注细则
#### 4.2.1地名实体示例
/[L*北京*]/
/[L*亚洲*]/
/[dat*2008年*]/[L*奥*]*运会*/*,*/[L*中国*]*人*/
/[L*中国*]*人民*/ ----*中国人、中国人民*都是词表词。
/[L*朝鲜*]/*南北*/*对话*/ ----*不标注南*,北。
- 注:词表词"京剧、京白、京腔、京味儿"中的"京"字要标注为:
/[L*京*]*剧*/*、*/[L*京*]*白、*/[L*京*]*腔*/*、*/[L*京*]*味儿*/
/[L*台东火车站*]/
/[L*卑南文化公园*]/
/[L*基隆文化中心广场*]/
/[L*高雄港第一港口*]/
/[L*苏澳镇*]/[L*南方澳渔港*]/
/*环*/[L*渤海湾*]/*地区*/*的*/*天然气*/*市场*/
/*来自*/[L*沈阳军区*]/*各*/*集团军*/
/[L*梅狮路后段*]/
/[L*中横公路天祥段*]/
/[L*华禄溪*]/*及*/[L*碧绿隧道*]/
/[L*南二高*]/[L*高雄支线*]/
/[L*台廿一线*]/
/[L*美国空军基地*]/
/[L*上海*]/[L*国际航运大厦*]/
/[L*上海*]/[L*虹口足球场*]/
/[L*上海博物馆*]/
/[L*上海*]/[L*城市规划展示馆*]/
/[L*石家庄*]/[L*富强电力新村*]/
/[L*西安第二长途通讯大楼*]/
/[L*北京市*]/[L*王府井百货大楼*]/
/[L*广深铁路*]/*以及*/[O*深圳发展银行*]/*部分*/*高官*/*也*/*被*/*免职*/
/[L*汉江*]/*上*/*的*/[L*圣水大桥*]/
/[L*新亚欧大陆桥*]/
---根据世界知识可知,此处大陆桥的名字叫*新亚欧大陆桥*,是不可分解的。
#### 4.2.2地名指示词(如国、省、市等)视为地名的一部分一起标注
地名指示词(如国、省、市等)视为地名的一部分一起标注。复杂的、具有包含关系的地名要分开标注,但分开标注时不可把一个有完整意义的地名拆散。以下是正确的标注:
/[L*德国联邦*]/*政府*/*总理*/
/[L*基隆市*]/
/[L*台东县*]/
/[L*南山部落*]/
/[L*美国*]/ [L*马里兰州*]/
/[L*约旦河*]/
/[L*朝鲜半岛*]/
/[L*长江三角洲*]/ -----*长江三角洲*是词表词。
/[L*吉林省*]/[L*延边朝鲜族自治州*]/[L*图们市*]/
以下两例均为错误的标注,因为*延边朝鲜族自治州*是具有完整意义的地名:
/[L*吉林省*]/[L*延边*]/[L*朝鲜族自治州*]/[L*图们市*]/
/[L*吉林省延边朝鲜族自治州*]/[L*图们市*]/
- 注:在ER-99的标准测试集中,把中国西昌卫星发射基地*整体标为地名。我们认为这是错误的,因为在一个地名中不应当包含具有上、下位关系的另一地名。正确的标注是:
/[L*中国*]/[L*西昌卫星发射基地*]/
/[L*美国洛克希德·马丁卫星测控中心*]/*和*/[L*中卫公司测控站*]/
/*从*/[L*法*]/*属*/[L*圭亚那*]/[L*库鲁航天中心*]/*发射*/
- 注:本规范不采用ER-99的标注:*[L*法属圭亚那库鲁航天中心*]*。
/[L*武汉*]/[L*长江大桥*]/
/[L*上海*]/[L*中山公园*]/
- 注:尽管其它城市也有长江大桥和中山公园,但在当地它们已构成完整的地名,所以应单独标注。
/*位于*/[L*朝阳门*]/*外*/*商务*/*区*/*之中*/*,*/
/[L*盛华公寓*]/*坐落*/*于*/[L*西直门*]/*内*/[L*冠英园小区*]/
- 注:内、外都不在标注范围之内,但如果地名中的内、外去掉不能说明是一个完整的地名时,内、外要标注在地名内。如:
/[O*外交部*]/*位于*/[L*北京市*]/[L*朝阳门内南小街*52*号*]/
/[L*西直门外大街*71*号*]/
#### 4.2.3并列的地名应分别标注
对于并列的多个地名应分别标注。对于嵌套在地名中的人名、地名和机构名不再单独标注。例如:
/[L*中*]/[L*意*]/*双方*/ ----*中意*是词表词,作为国名时要切开。
/[L*香港*]/*和*/[L*澳门特别行政区*]/
/*目前*/*已*/*有*/[int*12个*]/[L*中*]/*、*/[L*东欧*]/*国家*/
/[L*北京*]/[L*上海*]/
/[L*科*]/[L*伊*]/*边境*/
#### 4.2.4跨国家的和国家内部的地名
/[L*西非*]/*国家领导人*/
/*从*/[L*陕*]/[L*甘*]/*革命*/*老区*/*到*/*沿海*/*经济特区*/*,*/
/[L*亚太*]/----亚太是词表词,它是一个地名,而不是两个地名。
/[L*近东*]/*和*/[L*北非*]/
##### 4.2.4.1表示地理方位的名词
一些表示地理方位的名词如*南半球、北半球、江南、江北、西南、西北、华南、华北、华中、东北*等虽然不完全具备确指性,也要作为地名标注为*L*。
/[L*汉水*]/*流域*/*、*[L*西南*]/*地区*/*东部*/
/[L*江南*]/*大*/*部*/*、*[L*华南*]/*有*/*小*/*到*/*中雨*/
/*近*/[dur*两天*]/*造成*/[L*东北*]/*、*/[L*华北*]/*地区*/*的*/*降雨*/*天气*/*系统*/
/*迫使*/[L*北半球*]/*的*/*副热带*/*高压带*/*在*/[L*青*][L*藏*]/*地区*/"/*断裂*/"/
- 注:上述地名后面的方位词*南部、北部、东部、西部*不应包括在地名的括号里,
因为其所指的区域是更不确定的。
##### 4.2.4.2方位词修饰地名实体时要整体标注为L
/[L*东西九龙*]/ ----这是一个并列的地名。
/*一代*/*又*/*一代*/*海*/*测*/*官兵*/*犁*/*波*/*耕*/*浪*/*于*/[L*南中国海*]/*,*
/[L*北爱尔兰*]/
/[L*中西伯利亚*]/
- 注:ER-99将此例标为*中*/ [L*西伯利亚*]。我们认为它整体是一个专指性的地名。
/[L*中南美*]/
/[L*东南亚*]/
- 注:ER-99要求把上面两个地名分别标注为[L*中*]/[L*南美*]*和*/[L*东*]/[L*南亚*]/*。其实中南美*指*中美*和*南美*两个地名,而*东南亚*是一个地名。这样的细节需要专门的地理知识才能做出判断。所以我们不遵循ER-99的这条规则。
#### 4.2.5地名实体受时间词修饰时,时间词不标
/*前*/[L*苏联*]/
/*前*/[L*南*]/*地区*----*南*指南斯拉夫,时间词*前*不标。
#### 4.2.6 只有经纬度在一起时才能标注为 **L**
只有经纬度在一起时才能标注为L,否则经度或纬度单独标为角度*ang*。如:
/*震*/*中*/*位于*/[L*北纬三十六点二零度,东经九十点二九度*]/
/*并*/*将*/*卫星*/*定点*/*在*/[L*东经*110.5*度赤道*]/*上空*/*。*/
/*震*/*中*/*位于*/[ang*北纬*30.5*度*]/*,*/
#### 4.2.7天体的标注
/[L*宇宙*]/
/[L*地球*]/
/[L*太阳*]/
/[L*太阳系*]/
/[L*银河*]/
/[L*银河系*]/
/[L*月亮*]/
/[L*海王星*]/
/[L*东方红三号*]/
/[L"*鑫诺1号*"*卫星*]/
- 注:火箭只是卫星的发射工具,故火箭型号不作为星体标注。
/[dat*96年2月15日*]/*长征*/[ord*三号乙*]/*火箭*/*发射*/*失利*/,
/*长*/[ord*二*]/*捆*/*火箭*/ ----*全名为*"*长征二号捆绑式运载火箭*"。
### 4.3不作地名标注的示例
/[L*阿*]/[L*以*]/*冲突*/
- 注:ER-99和MET-2认为阿(阿拉伯)不是一个特定国家的简称,本规范不采纳他们的规定。
/*回答*/*了*/[L*中*]*外*/*记者*/*的*/*提问*/ ---*外*不标。
#### 4.3.1地区一般不作为地名的一部分标注
仅当*地区*特指行政单位时,才被视为地名的一部分。一般情况下,*地区*泛指一片地方,不是地名的一部分。若不能确定时,*地区*不作为地名的一部分标注。
/[L*港*][L*澳*][L*台*]/*地区*/ -----*港澳台*是词表词。
/[L*巴尔干*]*地区*/
/[L*临沂*]/*地区*/*现*/*更名*/*为*/[L*临沂市*]/
#### 4.3.2平原、山脉、山区、盆地、沙漠、流域不在标注范围内
*平原、山脉、山区、盆地、沙漠、戈壁、流域、故里、故居、纪念馆、风景区、开发区、经济区*等都不在地名标注范围内。但当某某故居、故里、纪念馆成为一个对外开放的旅游景点时,才作为地名标注。如:
/[L*云*][L*贵*]*高原*/
----*云贵高原*是词表词不可分割,但云、贵要分别标注*L*。
/[L*成都*]/*平原*/
/[L*秦岭山*]/*脉*/
/[L*秦*]/[L*巴*]/*山区*/
/[L*四川*]/*盆地*/
/[L*撒哈拉*]*沙漠*/ ----*撒哈拉沙漠*是词表词。
/[L*长江*]/*流域*/
/[L*毕加索故居*]/
/*造型*/*典雅*//*毗邻*/[L*青云岩*]/*风景区*/*及*/[L*北山湾*]/*旅游区*/
*[L*约旦河西岸*]*----因为*约旦河西岸*是专指。
/[L*海峡两岸*]/ ----指*台湾海峡两岸*。
/[L*两岸*]/
- 注:词表词*两岸*只有在表示台湾海峡两岸时,才作为地名标注为*L*,当作为*江河、湖泊*的两岸时,*两岸*要切分标注。如:
/[L*长江*]/*的*/*丰姿*/*和*/[int*两*]/*岸*/*的*/*美景*/*尽收眼底*/*。*/
/*祖国*/[L*大陆*]/
- 注:内地虽然指中国大陆,但不作为地名标注,这里遵从了ER-99的规定。特区只有在确指是香港和澳门时才作标注。如:
/*来自*/*内地*/*和*/[L*香港特区*]/
/[L*特区*]/*政府*/*和*/[L*香港*]/*同胞*/*正*/*以*/*喜悦*/*的*/*心情*/
/[L*中国*]/[L*厦门*]/*经济特区*/
#### 4.3.3对语言文字前的单音节地名不标,双音节的地名标注为L
*英语*----对*英*不标注。
*汉语*----对*汉*不标注。
*中文*----对*中*不标注。
/*对*/[L*西藏*]/*地区*/*的*/*藏语*/*广播*/
/*主张*/*台语*/*在*/[L*台*]/
/*用*/[L*四川*]/*话*/ ----如果*语、文*前面的地名为双音节时,就要标注。
/[L*荷兰*]/*语*/
#### 4.3.4以族或裔结尾的词组中地名也要标注
MET-2和ER-99规定:以族或裔结尾的词组中的地名不标注。因此*华裔*、*汉族*中的*华*和*汉(指汉族)*都不作为地名标,但*华人、华侨、华商、中医、中草药、中餐馆、亚运会、奥运会*里的*华、中、亚、奥*仍需标注*L*。本规范不采用这一规则。作为民族的名字,单音节的不标,双音节的标*L*。
下面是一些标准实例:
/[L*美*]*籍*[L*华*]人----"美籍华人"是词表词。
/*目的*/*是*/*促进*/[L*塞浦路斯*]/*西*/*族*/*与*/*土*/*族*/*的*/*和解*
/*她*/*和*/*同*/*是*/[L*日*]/*裔*/[int*三*]/*世*/*的*/*男*/*友*/
/*通过*/*在*/[L*中*]*医药*/*宝库*/*里*/*寻找*/*线索*/
/*人们*/*纷纷*/*拥向*/[L*中*]*餐*/*馆*/*,*/*一时间*/*人满为患*/
/[L*吉普赛*]/*人*/----*吉普赛*不是词表词。
/[L*印地安*]/*民族*/*;*/ ----*印地安人*是词表词。
## 第五章 机构名
机构名包括:股票(证券)交易所、国家或国际组织、商业团体(公司、企业、工厂)、电视台、广播电台、报刊杂志、出版社、政党或党派、学校、科研院所、医院、诊所、邮电局、乐队、体育运动队、联盟、议会或代表大会、军队、咖啡厅、酒吧、饭店、旅馆,以及虚构的机构等。
### 5.1机构名标注规则
机构名的后缀应视为机构名的一部分。
| **序号** | **情况** | 标记方法 |例子 |
| --- | --- | --- | --- |
| 1 | 普通名字+机构名 | 整体标出 | *[O*板桥市胜捷公司*]* |
| 2 | 地名+机构名 | 机构名整体标出 | [O*北京市电信局*]*[O*台北县立莺歌高职*]*[O*台北看守所*]*[O*基隆长庚医院*]*[O*东直门敬老院*]机构名的关键词如:幼儿园、各级学校、科学院、部委、实验室、工厂、公司、报刊杂志、出版社、大使馆、领事馆、咖啡店、快餐店、饭店、酒店、旅馆等 |
| 3 | 人名+机构名 | 机构名整体标出 | *[O*李嘉诚基金会*]* |
| 4 | 简称 | 一律整体标注 | *[O*北约*]*[O*上轮集团*]----*指上海轮胎集团*[O*白宫*]/*官员*/表示 |
### 5.2机构名标注细则
#### 5.2.1机构名标注实体示例
/[O*国防部*]/*长*/[P*迟浩田*]/
/[O*美国国防部*]/*长*/[P*佩里*]/
/[O*台北县地政局地权课*]/
/[O*地政局*]/
/[O*政风室*]/*接*/*获*/*检举*/*调查*/
/[O*国军北投医院*]/
/[O*三重地政事务所*]/
/[O*台湾银行宜兰分行*]/
/[O*省立关山工商*]/
/[O*基隆市光隆家商*]/
/[O*东信国小*]/
/[O*安乐国中*]/
/[O*原住民委员会*]/
/[O*连萧全国竞选总部*]/
/[O*北京钓鱼台国宾馆*]/
/[L*浙江*]/[O*温州大酒店*]/
/[O*松下电工株式会社*]/
/[O*公司*]/*英文*/*名称*/[O*HUNAN FORE SCAPE TECHNOLOGY CO., LTD*]/
/[O*朝鲜人民武装力量部*]/*副*/*部长*/
/[O*美国海军*]/
/[O*欧共体*]/
/[O*中国国家生育委员会*]/
/[O*中国奥林匹克队*]/
/[O*披头四*]/
/[O*飞虎队*]/
/*敢死队*/ -----泛指不标。
/*但是*/[O*共和党*]/*人*/*说*/
/[O*土耳其议会外交关系委员会*]/
/[O*终战*50*周年国会议员联盟*]/
/*记者*/*来到*/[O*中山医科大学第一附属医院住院部*]/
/[O*中共中央政治局*]/*常委*/*、*/[O*中央纪委*]/*书记*/[P*尉健行*]/
- 注:中国共产党的简称中共或共要标注为O。例如:
/[ord*第二次*]/[O*国*]/[O*共*]/*合作*/
- 注:类似的简称党,由于专指性不强,不标,如:
/但/这种/现象/的/产生/,/是/同/党/和/国家/尊师重教/的/方针/背道而驰/的/,
/全国/"/[dat三八]/"/红旗手/、/全国/优秀/共青团员/
- 注:"三八红旗手"是词表词。但如果"三八"在文中被双引号断开,就要单独表为dat。另外,词表词共青团员、共产党员、共产党人、中的机构名不确指,所以一律不标。
/[O*中共中央政治局常委会*]/
- 注:常委会可以是机构名,常委则不是。
/*党*/*的*/[O*十四大*]/*以来*/
- 注:中共的*X中全会*不是机构名,除了词表词*三中全会*什么也不标以外,数词*X*应单独标注为*ord*。例如:
/*根据*/*党*/*的*/[ord*十五届*]/[ord*二*]/*中*/*全会*/
/[O*八届全国人大*]/*代表*/[P*陈妙珍*]/
/[O*西藏政协*]/*委员*/*强调*/*,*/*必须*/*旗帜*/*鲜明*/*地*/*反对*/*民族*/*分裂*/
/[O*澳门中华总商会*]/*会*/*董*/*兼*/[O*青年委员会*]/*副*/*主任*/
/[O*足协*]/*杯赛*/*冠军*/[O*北京国安队*]/ ----*杯赛*是词表词。
/[O*以国家电视一台*]/ ----指以色列国家电视一台
/[L*汉城*]/[O*路透*]/*电*/
/*前*/[L*苏联*]/[O*切尔诺贝利核电站*]/*泄漏*/*事件*/
/*参加*/*这次*/*比赛*/*的*/*还有*/[O*日本*]/*、*/[O*俄罗斯*]/*、*/[O*美国*]/*、*/[O*德国*]/
和/[O*意大利队*]/*。*/
/*前往*/[O*解放军驻港部队总部*]/*慰问*/*驻军*/
/[O*第四届和平小天使台湾访问团*]/*抵达*/[L*重庆直辖市*]/
/[O*塔里班*]/*部队*/*已经*/*到达*/[P*杜斯塔姆*]/*将军*/*的*/*家乡*/
/*用*/*公款*/*购买*/[O*靖国神社*]/*和*/[O*护国神社*]/*的*/*祭祀*/*品*/
/*纪念币*/*正面*/*是*/*由*/[O*解放军*]/*军徽*/*光,*/*八一南昌起义*/*和*/[O*解放军*]/[O*陆*]/[O*海*]/[O*空*]/*三军*/*战士*/*的*/*图案*/
- 注:词表词八一南昌起义*是一个事件,不是机构名。三军*是词表词,所以数字*三*不作为*int*标注。
- 注:股市报导中的企业和公司名不论其前后有没有外文字符,一律作为一个整体
标注成*O*。例如:
/[O*ST辽物资*]/[dec*14.141*]/[O*宁波中百*]/[dec*20.354*]/
/[O*DR沪港机*]/[dec*11.194*]/[O*鲁北化工*]/[dec*8.051*]/
- 注:商城或百货公司本应标注为L,但作为股市中的企业时应标注为O。
- 注:股票指数在没有明确说明是多少元的情况下一律标注为*int*或*dec*。
- 注:被命名的轮船、飞机、机车应标注为*O*。例如:
/*却*/*购*/*回*/*了*/[int*3张*]/[O*"长月"号轮船*]/*船票*/*,*/
/[O*泰坦尼克号游轮*]/*上*/*的*/*这*/*对*/*情人*/*实在*/*浅*/*得*/*很*/*。*/
/[O*美国"哥伦比亚"号航天飞机*]/*上*/*的*/*宇航员*/
#### 5.2.2机构名的后缀是机构名的一部分
机构名的后缀是机构名的一部分,即要准确的标出机构名的最长边界(机构名的全称)。机构名中可以包含人名、地名和机构名,但对于它们不再单独标注。例如:
/[O*苗栗县环保局*]/
/[O*卫生署桃园医院*]/
/[O*兰阳民生医院*]/*前身*/*为*/[O*吴外妇科*]/
/[O*台北爱乐青年管弦乐团*]/
/[O*行政院农委会林业试验所福山分所*]/
/[O*宋庆龄基金会*]/
/[O*上海轮胎橡胶(集团)股份有限公司*]/
/[O*中国驻日本大使馆*]/
/[O*美国白宫*]/
/*前*/[O*中国新华社香港分社*]/*社长*/[P*许家屯*]/
/[O*清华大学计算机系人工智能实验室*]/
/[O*中保财产保险四川省分公司*]/
#### 5.2.3国家(或国际)立法部门或行政部门标注为机构名
/*当选*/[O*国会*]/*议员*/
/[O*内阁*]/*改组*/*将*/*会*/*在*/[dat*八月底*]/*前*/*完成*/
/*前*/[O*内阁官房*]/*长官*/[P*山静六*]/
/[P*刹瓦什*]/*向*/[O*宪政法庭*]/*提出*/*动议*/
#### 5.2.4地名和机构名紧邻时的情况
地名和机构名的关系一般有以下两种情况:
(1)表示所属关系(如:法国航空航天局,航空航天局隶属法国)。
(2)表示地理位置关系(如:北京邮电大学表示大学位于北京,而不是隶属于北京)。
地名和机构名之间还可能有更复杂的情况,这里不予讨论。
##### 5.2.4.1规则一
如果机构名以一个地名开头,而且删除这个地名后所剩部分不再是一个具有特指性的机构名,那么该地名必须留在机构名中作为该机构名的一部分标注;
/[O*北京大学*]/
/[O*深圳中学*]/
/[O*复旦大学专用集成电路与系统实验室*]/
/[O*东南大学*]/[O*深圳宝安设计院*]/
##### 5.2.4.2规则二
如果机构名前面还有一个或多个地名,那么该机构名与前面紧邻的地名应当分开标注。
如:
/[L*中国*]/[O*北京大学*]/
/[L*中国*]/[L*广东*]/[O*深圳中学*]/
/[L*北京*]/[L*昌平*]/[O*十三陵抽水蓄能电站*]/
##### 5.2.4.3规则三
如果一个机构名的开头不是地名,那么当它前面邻接一个或多个地名时,只有其中与该机构名紧邻的那个地名需一起标注。例如:
/[O*上海同济大学*]/
/[L*中国*]/[O*上海同济大学*]/
/[O*湖北省武钢三中*]/
##### 5.2.4.4规则四
如果一个机构名本身以两个或两个以上并列的地名开头,则这些地名都要留在该机构名中。如果在它前面再出现其它地名时,一律同该机构名分开标注。但是如果上一级地名不能管辖下一级地名时,要把上一级地名标注在机构名内。
例如:
/[L*洛杉矶*]/[O*亚太法律中心*]/
/[L*香港*]/[O*中港贸易协会*]/
/[O*广东亚洲大酒店*]/
/[O*澳大利亚维多利亚投资公司上海办事处*]/,
/[O*澳大利亚维多利亚投资公司*]/
- 注:"广东"与"亚洲、澳大利亚与维多利亚"都不属于上、下级管辖关系,所以要把上一级地名标注在机构名内。
##### 5.2.4.5更复杂的情况
在更复杂的情况下,我们可能无法判定某机构名究竟是以一个还是两个地名开头的。这时可按规则5.2.5和5.2.6来处理。
例如,*洛杉矶台北经济文化办事处*
究竟是A:*[L*洛杉矶*]/[O*台北经济文化办事处*]*
还是B:*[O*洛杉矶台北经济文化办事处*]*
这时,默认的标注方式是B(理由见5.2.8)。
##### 5.2.4.6地名概念比较模糊的情况
如果该地名比较模糊,而标注者又没有足够的知识来判断某机构名的开头是否是一个地名。就标注到一个比较明确的地名,
例如:*印度尼西亚莫巴蒂努山打腊航空公司*中的*莫巴蒂*·*努山打腊*不知道是不是地名。但至少知道一旦拿走了这个字符串,剩下的字符串已不构成专指性的地名。此时,按规则2.5的标注方式应是:
/[L*印度尼西亚*]/[O*莫巴蒂*·*努山打腊航空公司*]/
/[O*河北沙岭子电厂*]/
----*沙岭子*是一个乡镇的地名,河北和内蒙古都有一个沙岭子镇,地名的概念比较模糊,故标注在机构名内。
/*国际*/*著名*/*的*/[O*加拿大*B+*H国际建筑师事务所*]/
##### 5.2.4.7紧邻的地名和机构名不构成修饰关系的情况
一个地名后紧邻一个机构名,但它们不构成修饰关系,则一律分开标注。
/*促进*/*了*/[L*中国*]/[O*东盟*]/*的*/*合作*/
/*在*/[L*日内瓦*]/[O*联合国*]/&*人*/*权*&/*会议*/*上*/
更典型的例子需借助上下文来判断,如:
/*促进*/*了*/[L*中国*]/[O*微软*]*的合作*/
/[O*中国微软*]/*即将*/*发布*/*新产品*/
- 注:如果标注者不能判断它们是不是修饰关系,则默认为分开标注,如:
/[L*中国*]/[O*微软*]/
/[O*美国众议院*]/
/[L*重庆*]/[O*长江救助打捞公司*]/
/[L*日本*]/[O*东京股市*]/ ----错误标注!
/[L*日本*]/[L*东京*]/*股市*/ ----正确标注。
/[L*美国*]/[L*华盛顿*]/[O*三普证券公司*]/ ----错误标注!
/[L*美国*]/[O*华盛顿三普证券公司*]/ ----正确标注。
/[L*华盛顿*]/[O*美国国务院*]/
/[L*瑞典*]/[O*斯德哥尔摩国际和平研究所*]/
#### 5.2.5会议、晚会、运动会等以会结尾的短语是事件,不作机构名标注
/*泛*/[L*美*]/*运动会*/
/[L*中国*]/[ord*第一届*]/*人工智能*/*大会*/
/[ord*第四届*]/[L*中*]/[L*法*]/*经济*/*研讨会*/
/[ord*第三届*]/[L*海峡两岸*]/*水利*/*科技*/*交流*/*研讨会*/
----以上几例为事件,不是机构名。
/[O*中国人工智能协会*]/
/[O*中国人工智能联合会*]/ ----为机构名。
当会议指议会(congress)或代表大会(chamberofdeputies)时,应视为机构名。但是要注意:虽然议会或代表大会是机构名,但是议会或代表大会中的某一次会议是一个事件,不是机构名。为了更明确的区分各种情况,我们用以下例子说明:
/*通报*/*了*/[O*八届政协*]/[ord*五次*]/*会议*/*的*/*各*/*项*/*安排*/
/[O*全国政协*]/[ord*八届*]/[ord*五次*]/*会议*/*将*/*于*/
/*听取*/*和*/*审议*/[O*全国政协八届五次会议常务委员会*]/*报告*/
/*审议*/[ord*八届*]/[ord*五次*]/*会议*/*提案*/*审查*/*情况*/*的*/*报告*/
- 注:*八届五次会议*、*五次会议*是一个事件,不应标注为机构名。但是这次会议的组委会、委员会应视为机构名。例如:
/[O*八届全国人大*]/[ord*五次*]/*会议*/
/[O*政协九届一次会议*]/ --错误标注!
/[O*中国共产党第十五次全国代表大会*]/
/[O*九届人大*]/[ord*一次*]/*会议*/
/[O*中国全国人大*]/
/[O*中共十五大*]/
/*各级*/*人大*/*常委会*/ --不是专指,故不标。
/[O*中国科协*]/[ord*第五次*]/*全国代表大会*/
/[L*湖南省*]/[ord*六届*]/[ord*二次*]/*全*/*委*/*会议*/
/*向*/*同级*/*人民代表大会*/*或*/*人民代表大会常务委员会*/*提请*/*审议*/
- 注:*全国人民代表大会*和确指的省、市人民代表大会及其常委会、常务委员会需作为机构名标注。泛指的人大、中央银行、人民银行、&*农*/*发*/*行*&不作为机构名标注。
/[O*临澧县人大*]/*抓*/*村*/*级*/*财务监督*/*一瞥*/*(*/*监督*/*广角*/)
/*由于*/*各级*/*人大*/*代表*/*的*/*有效*/*监督*/*,*/[dat*去年*]/*以来*/*该*/*县*/*各*/*村*/*村*/*务*/*情况*/*出现*/*好转*/*,*/
- 注:在地名*国会大厦*中,*国会*不可作为机构名标注,否则就出现嵌套了。
/[L*国会大厦*]/
- 注:"联合国大会"及其简称"联大"都是词表词,但不可整体标为O。如:
/[O*联合国*]*大会*/*于*/[dat*1992年*]/*批准*/*了*/*这*/*一*/*条约*/*。*
/[P*沈国放*]/[dat*27日*]/*在*/[O*联*]*大*/*全体*/*会议*/*上*/*表示*/,
- 注:*会*也可能出现在一般的机构名中,如:
/[O*红十字协会*]/
#### 5.2.6用我们、我等代词修饰的机构名,只对机构名进行标注
/*我国*/[O*共产党*]/
/*我们*/[O*清华大学*]/
- 注:根据上下文是确指的某公司、单位名称的简称要标注为机构,否则不标注!但如果在公司、集团等词前面有本、我、该等字样时,此处的公司、集团不进行标注。其他特殊情况依据上下文进行标注。如:
/*凡*/*《*/[O*克罗伏特缓冲器股份有限公司*]/*股份*/*》*/*记名*/*的*/*持有*/*人*/*均*/*为*/*本*/*公司*/*股东*/*。*/
/*我*/*公司*/*出资*/*总额*/[mon*50万元*]/
/[O*港资陕西华懋实业公司*]/*总经理*/[P*商铭渔*]/*,*/*受*/[O*公司董事会*]/*委托*/*来到*/[O*咸阳市西北地勘局二一五医院*]/*看望*/[O*公司*]/*保安*/*员*/[P*韩玉刚*]/*,*/
#### 5.2.7大使馆和领事馆的标注
当大使馆(或领事馆或其它外交使团)所代表的国家和所在地区相连时,整体标为机构名。如:
/*后来*/*调*/*任*/[O*美国驻洪都拉斯大使馆*]/
当大使馆(或领事馆或其它外交使团)所代表的国家或所在地没有出现在上下文中,或者在描述范围内不连续,那么存在两种情况:
(1)大使馆所代表的国家和大使馆(领事馆)相连,此地名和大使馆一起标记 为机构名。如:
/*前往*/[L*香港*]/*的*/[O*洪都拉斯领事馆*]/
(2)大使馆所在地和大使馆(领事馆)相连,此地名应单独标记,整体不作为机构名。如:
/[L*美国*]/*在*/*通过*/*驻*/[L*金沙萨*]/*大使馆*/*和*/*其它*/*正常*/*管道*/
- 注:虽然*驻金沙萨大使馆*是一个连续的短语,但它的实际意思是*美国(或*X*国)驻金沙萨大使馆*,而不是什么*金沙萨(的)大使馆*。因此在这里*大使馆*不视为机构名。
#### 5.2.8生产厂家要标注为机构名,产品则不标
这里定义的产品范围较广,不仅包括生产厂家生产出来的产品(如自行车等),还包括计算出来的产品(如:股票指数)、媒体产品(如:电视节目)
/[O*道琼*]/*工业*/*平均*/*指数*/
----因为股票指数可以视为产品,那么*道琼*就可以视为生产厂家。
/[O*纳斯达克*]/*指数*/ ---原因同前。
/[O*太原刚玉*]/[dec*10.581*]/
/[O*咸阳偏转*]/[dec*16.112*]/
/[O*深华发A*]/[dec*15.663*]/
/[O*渝开发*A]/
#### 5.2.9报纸、广播电台、电视台和杂志的名字要标为机构名
新闻媒体(如:报纸、广播电台、电视台和杂志等)的名字要标为*O*,但报刊、电视栏目的名字不标。例如:
/[O*美国之音*]/*记者*/*表示*/
/[O*人民日报*]/*海外*/*版*/[ord*第三版*]/
/*《*/[O*泰晤士报*]/*》*/*援引*/*一个*/*国际*/*专家*/*委员会*/
/[O*中央电视台*]/*《*/*焦点*/*访谈*/*》*/*、*/*《*/*东方*/*时空*/*》*/*主持人*/
/[O*武汉电视台*]/*《*/*科技*/*之*/*光*/*》*/*栏目*/*的*/*《*/*科学家*/*,*/*您好*/*》*/*专栏*/
/[O*美国《科学》杂志*]/
/[O*美国探索电视网*]/
/*创办*/*《*/[O*深圳房地产快讯*]/*》*/
/*办*/*好*/*《*/[O*中外房地产导报*]/*》*/
#### 5.2.10特殊情况
***民族不作为机构名***
***泛指的*部队不作为机构名**
***政府不作为机构名***
***学术或商务会议(conference,meeting)不作为机构名***
***交易会不作为机构名***
***运动会不作为机构名***
***联赛不作为机构名***
#### 5.2.11特殊情况示例
/[L*中国*]/[L*天津*]/*出口*/*商品*/*交易会*/
/[L*中国*]/[O*天津出口商品交易会*]/ ----错误标注!
/[L*中国*]/*政府*/ ----*不把政府*标为机构名。
/[L*非洲*]/*维持*/*和平*/*部队*/ ----*不把部队*标为机构名。
/[L*中国*]/*公安*/*部门*/ ----*不把部门*标为机构名。
/[O*中国公安部门*]/ ----错误的标注!
- 注:标注并列的机构名(*O*)时,连接词和标点符号不进入标注范围。例如:
/[O*上海*]/*、*/[O*北京人类基因组研究中心*]/
/[P*贺国中*]/*分别*/*任*/[O*一*]/*、*/[O*四*]/*、*/[O*七团*]/*党代表*/
- 注:上述情况和标注并列的序数(*ord*)不同,连接词和标点符号是否进入标注范围取决于序数词所修饰的词语。例如:
/*获得*/*个人*/[ord*一、二、三等*]/*奖*/
/[ord*一*]/*、*/[ord*二*]/*、*/[ord*三*]/*产业*/
/*书店*/[ord*三、四层*]/
- 注:*中央*不作为机构名,但党中央*标为机构名。
/*在*/*中央*/*的*/*领导*/*下*/
/*以*/[P*胡锦涛*]/*同志*/*为*/*核心*/*的*/[O*党中央*]/*周围*/
#### 5.2.12地名和机构名容易混淆的情况
/[L*人民大会堂*] ----地名。
/[O*五角大楼*]/*发言人*/*说*/*,*
/[O*白宫*] ----机构名。
/[O*克里姆林宫*]/*表示* ----机构名。
/*在*/[L*总统府*]/*分别*/*约见*/*了*/*多*/*位*/[O*国民党*]/*中*/*常委*/*检察官*/
- 注:*总统府*标注为L而不是*O*。这是因为有的国家有多处总统府,所以不能把它们视为国家或政府的唯一代表。
- 注:下面的例子中出现的类似单位名称的,因不是确指,而且是出现在各种条令、合同中,适合任何一个省、市、县的单位机构名称,所以不能作为一个机构名称标注为*O*。如:
/*本*/*合同*/*正本*/[int*三份*]/*,*/*出租*/*人*/*、*/*承租*/*人*/*、*/*市*/*公证处*/*各*/*执*/*一*/*份*/*。*/*副本*/*若干*/*份*/*,*/*报*/*市*/*经济*/*委员会*/*、*/*市*/*经济体制*/*改革*/*委员会*/*、*/*市*/*财政*/*局*/*、*/*劳动局*/*、*/*税务局*/*、*/*审计*/*局*/*、*/*工商*/*行政管理*/*局*/*、*/[O*中国人民银行*]/*市*/*分行*/*、*/[O*中国工商银行*]/*、*/*市*/*分行*/*等*/*有关*/*部门*/*备案*/ */*本*/*合同*/*在*/*履行*/*中*/*如*/*发生*/*争议*/*,*/*双方*/*应*/*协商*/*解决*/*;/*协商*/*不*/*成*/*时*/ /*任何*/*一方*/*均*/*可*/*向*/*工商*/*行政管理*/*局*/*合同*/*仲裁*/*委员会*/*申请*/*调解*/*或*/*仲裁*/
## 第六章 数字串标注总则
数字串(**Factoid**)包括时间表达式(**TIMEX**)、数字表达式(**NUMEX**)、度量表达式(**MEASUREX**)和地址表达式(**ADDREX**)等**4**大类、**27**个小类,详见表**1-1**。标注数字串的一条重要原则就是:它的标记不得插入到词表词的内部(见**1.5.2.4**)。
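为便于程序化校验,可把本章定义的4大类、27个小类整理成一张映射表。下面的字典是按6.1至6.4各节的定义所做的示意性整理,变量名为假设,并非HanLP的官方常量:

```python
# 按本章定义整理的数字串类别映射(示意草稿)
FACTOID_CLASSES = {
    'TIMEX':    ['dat', 'tim', 'dur'],                        # 时间表达式,3 小类
    'NUMEX':    ['per', 'mon', 'fre', 'int',
                 'fra', 'dec', 'ord', 'rat'],                 # 数字表达式,8 小类
    'MEASUREX': ['age', 'tem', 'ang', 'len', 'are',
                 'cap', 'wei', 'spe', 'acc', 'mea'],          # 度量表达式,10 小类
    'ADDREX':   ['ema', 'pho', 'fax', 'tel', 'pos', 'www'],   # 地址表达式,6 小类
}

# 小类代码到大类的反查表,可用于校验语料中出现的标记是否合法
TAG2CLASS = {tag: cls for cls, tags in FACTOID_CLASSES.items() for tag in tags}
assert sum(len(tags) for tags in FACTOID_CLASSES.values()) == 27
```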
### 6.1时间表达式
时间表达式(*TIMEX*)包括日期(*dat*)、时间(*tim*)和时段(*dur*)三小类。所有小于一天的时间都被定义为时间(*tim*),如秒,分,小时。一天或者大于一天的时间单位则属于日期(*dat*),如*天,日,星期,礼拜,月,季度,年,五年,十年,世纪*等。时段(dur)通常也使用日期和时间中的单位,如月、年、时、分*等。对此标注者要注意区分。
将日期、时间同时段区分开来有时是困难的,下面分别给出它们的定义。
#### 6.1.1日期(dat)和时间(tim)的定义
日期和时间在一维的时间坐标轴上有相对确定的位置。小于一天的时间都被定义为时间。一天或者大于一天的时间则属于日期。
/[tim*8*点30*分*]/
/[dat*今天*]/[tim*晚上*]/ ----*晚上*是词表词。
/[dat*昨天*]/[tim*夜里*]/ ----*昨天*和*夜里*都是词表词。
/[tim*昨夜*]/ ----*昨夜、昨晚*都是词表词,只能整体标*tim*。
/[dat*昨*]/[tim*晚*]/ ----错误的标注!
/[dat*春节*]/---在每一年中,是比较固定的一天或几天。
/[dat*1999*年*]/---以*年*为单位,与别的年份相区别。
/*在*/"/[dat*六五*]/"/*中*/---以*五年*为单位,与别的*五年*相区别
- 注:严格地说,每一个*dat*或*tim*都占据了一个时间段,因此这里出现的*期间*和*中*,不能作为标注时段的理由。
/"/[dat*九五*]/"/计划
/[dat"*九五*"*初*]/
/*仅*/*"*[dat*八五*]/*"*/*期间*/*就*/*达*/[mon一百一十五亿元]/。
/[dat*下半年*]/---以*半年*为单位,与*上半年*相区别。
/[dat*二十世纪*]/---以一百年为单位,与别的*世纪*相区别。
/*为*/*庆祝*/[O*北京大学*]/*建*/*校*/[dat*100周年*]/*,*/
/[dat*民国八十六年*]/
/[dat*民国六十年代*]/
/[dat*八十八年下半年*]/*及*/[dat*八十九年*]/*中央*/*统筹*/*分配*/*款*/*,*/
/[dat*公元二千年*]/
/[dat*今年九月*]/
/*"*/[O*迈特兴华*]/*"*/*杯*/[ord*首届*]/*全国*/*象棋*/*大师*/*赛*/*于*/[dat*今日*]/*收*/*秤*
/[dat*1997年下半年*]/*,*/
/*可*/*于*/[dat*农历年*]/*前*/*迁居*/*。*/
/[tim*第七十三分钟*]/
/[tim*中午*12*点*]/
/[tim*格林威治时间*5*时*59*分*]/----含有地名。
/[dat*第二天*]/[tim*一大早*]/*,*----*一大早*是词表词。
/*在*/[dat*今年暑期*]/*大学生*/*送*/*科技*/*下乡*/*活动*/*中*/,
/*大约*/[tim*七点*]/*到达*/*----大约*不标。
/[tim*晚上大约七点*]/*到达*/
- 注:*大约*被两个*tim*包围,分割不开,所以整体标上。这条标注规则遵照了ER-99和MET-2的标准。
- 注:事件戊戌变法、辛亥革命、甲午战争、五四运动等都是词表词,其中的日期不标注。但当戊戌、辛亥、五四单独出现时,应作为日期来标注。例如:
/*与*/*稍*/*后*/*的*/*辛亥革命*/*,*/*都*/*有*/*相通*/*的*/*地方*/,
/*在*/[L*香港*]/*回归*/[dat*周年*]/*前夕*/*和*/*"*/*七七事变*/*"*/*纪念日*/*,*/[dat*戊戌*]/*思潮*/*与*/*前此*/*的*/*洋务运动*/,
#### 6.1.2时段(dur)的定义
时段既可以长于一天,也可以短于一天。它不同于日期和时间,在一维的时间坐标轴上没有确定的位置。例如:
/[dur*三年*]/
/[dur*半年*]/
/[dur*四分之一个世纪*]/
/[dur*廿四个月*]/
/*时间*/*长*/*达*/[dur*六分钟*]/
/[dur*两个星期*]/
/[dur*一个月*]/*后*/
/*曾*/*在*/[dur*5、6年*]/*前*/*撰文*/*陈述*/
/*早产*/[dur*十二周*]/*左右*/
/*大水*/[dur*十天*]/*后*/*才*/*退*/*尽*/ /[dur*一至两年*]/
/[dur*一小时卅分钟*]/ /*这*/[dur*几天*]/
/[dur*卅天*]/*会期*/*只*/*开*/*了*/[dur*九天*]/
/*虽*/*经*/[dur*一整天*]/*磋商*/*,*----*一整天*不是词表词,但要标为*dur*。
与*/*洪水*/*奋战*/[dur*一天一夜*]/*,*----*一天一夜*也不是词表词。
时间表达式的标注细则详见第七章。
### 6.2数字表达式
数字表达式(*NUMEX*)包括百分数(*per*)、钱款(*mon*)、频度(*fre*)、整数(*int*)、分数(*fra*)、小数(*dec*)、序数(*ord*)、比率(*rat*)等8小类。
#### 6.2.1百分数(per)
/[per*百分之二十五*]/
/[per*百分之一点七*]/ ---虽然是小数,但要标作per。
/[per*六点五百分点*]/ /[per*五成*]/*以上*/ /[per*六折*]/
/[fra*百万分之八*]/ ----注意标的是*fra*而不是*per*。
/*大约*/[per*5%*]/ ----约数*大约*不进入标注。
#### 6.2.2钱款(mon)
/[mon*四亿元台币*]/
/[mon*43.6亿美元*]/
/[mon*卅万元*]/
/[mon*四万五千块钱*]/
/[mon*四万五千元人民币*]/
/*只*/*增加*/*了*/[mon*几元钱*]/*的*/*成本*/
/*决不*/*乱*/*花*/*国家*/*的*/[mon*一分钱*]/。
- 注:同一笔钱的不同货币形式需分开标注。货币中的地名不标。
[mon*26万英镑*]/ (/[mon*43.6亿美元*]/)/
- 注:*约*是一个不确切的概念,故不标注。但*上*、*数*、*好*要和数字串捆绑在一起标注。但*近*作为特例,不与数词捆绑!!
/*约*/[mon*十万元*]/
/*大概*/*需要*/*花费*/[mon*上千万美元*]/*的*/*投资*/*和*/[dur*3*年*]/*左右*/*时间*/*,*/
/*多*/*收入*/[mon*好几十元*]/
#### 6.2.3频度(fre)
/[fre*数度*]/ /[fre*两次*]/ /[fre*26次*]/ /[fre*十多次*]/ /[fre*多次*]/
- 注:动量词次除了一次不标注以外,其余的全部标注为*fre*。
/[fre*一次次*]/
/[fre*再次*]/ /[fre*无数次*]/ /[fre*数次*]/
#### 6.2.4整数(int)
*int*标注的是数词和量词组合成数量词组。
/[int*卅七件*]/ /[int*一百卅项*]/ /[int*三种*]/
/[int*九个*]/*课室*/ /[int*几家*]/
/*后*/[int*几名*]/ /[int*十*]/*多*/*人*/ /[int*四条*]/*断层*/ /[int*五十户*]/ /[int*百余名*]/ /[int*上万*]/*人潮*/
/*"*/[int*双*]/[int*百*]/*"*/*方针*/,
- 注:"双百方针"是词表词,由于文中"双百"用引号括起,而且它们是两个数字,所以要分别按数字串标注。类似情况还有词表词"五四运动",这是个事件不标。但是如果文中日期"五四"被引号括起,就要单独标为:/"/[dat五四]/"/运动/。又如"六一儿童节、六一国际儿童节、六一节"都是词表词。由于"六一"和"儿童节"是同一个日期,即使在文中"六一"被引号括起,也可以整体标为dat,如:/[dat"六一"儿童节]/。
- 注:人次应标注为*mea*而不是*int*,例如:
/*近*/[dur*3年*]/*中*/*,*/*该*/*市*/*采取*/*多*/*形式*/*的*/*农技*/*培训*/*近*/[mea*万人次*]/,
- 注:"*数词*+*强*"不一定表示序数,因此只单独标注数词为*int*。例如:
*/*在*/*这次*/*从*/[int*十六*]/*强*/*到*/*冠*/*、*/*亚军*/*的*/*一次性*/*竞猜*/*中*/*,*/*
/[O*宝钢*]/*为*/*跻身*/*世界*/[int*500*]/*强*/*而*/*采取*/*的*/*重要*/*步骤*/*。*/
#### 6.2.5分数(fra)
/[fra*数倍*]/
/[fra*一半*]/
/[fra*千百倍*]/
/[fra*3/4]/
/[fra*四分之三*]/
/[fra*百万分之三百六十四*]/ *----*注意标记是*fra*而不是*per*。
/[fra*半个*]/ /[fra*4倍半*]/ *----*倍数是分数的一种表示,应标*fra*。
/[fra*4倍半*]/
/[fra*4.5倍*]/ ----*虽然* *4.5*是个小数,但不标*dec*。
/*有效*/*载*/*力*/*提高*/[fra*2至3倍*]/
注:"过半数"是词表词,因此不作为分数fra标注。例如:*/*都*/*难以*/*获得*/*过半数*/*的*/[int*207张*]/*选票*/*,*/*
#### 6.2.6小数(dec)
/[dec*3.14]/
/[dec*三点一四*]/
/*看*/*了*/*那么*/*长*/*时间*/*的*/*电视*/*,*/*视力*/*依旧*/[dec*1*.*5*]/
/*我*/*有着*/*足以*/*令*/*我*/*自豪*/*的*/[dec*1*.*2*]/*视力*/
----视力的多少是一个量级,没有单位,故按数量标注整数或小数。
/*并*/*以*/[dec*6139.69点*]/*收盘*/
/*以*/ [dec*33.8*]/*收盘*/ /*比重*/*:*/[dec*1.02*]/
#### 6.2.7序数(ord)
/[ord*第一任*]/
/[ord*第一期*]/
/[ord*十六楼*]/
/[ord*第三次*]/*世界大战*/
/[ord*首*]/*日*/*销售*/*欠佳*/
/[ord*第二*]/*故乡*/
/[ord*三等*]/*奖*/
/[ord*前*6*名*]/
/*地震烈度*/*不*/*超过*/[ord*8度*]/
/*这*/[ord*第二条*]/*尤为*/*重要*/
/*位居*/*金牌*/*榜*/[ord*第二名*]/
/[O*北京市*]/[ord*首家*]/*就业*/*与*/*创业*/*组合*/*市场*/
/[ord*1174号*]/*文件*/
/[ord*6*路*]/*汽车*/ /[ord*六年级*]/*学生*/
/[dat*今年*]/*读*/[ord*大三*]/
/*发展*/*第一产业*/* ----第一产业*是词表词。
/*发展*/[ord*第一*]*产业*/ ----错误的标注。
/*阵风*/[ord*五级*]
/[ord*一、二、三等*]/*奖*/。
/*他*/*亲手*/*接*/*治*/[L*墨西哥*]/[ord*首例*]/*艾滋病*/*患*/*儿*/
#### 6.2.8比率(rat)
/ [rat*一比廿五*]/
/*以*/[rat*0∶6*]/*失利*/
/*上*/*一*/*届*/*世界杯*/*赛*/*就*/*以*/[rat*1∶0*]/*胜过*/*。*/
/*最终*/*以*/[rat*三比三*]/*握手言和*/*。*/ /*用*/*原液*/*与*/*水*/*稀释*/[rat*1*∶*20*倍*]/*。*/
数字表达式的标注细则详见第八章。
### 6.3度量表达式
度量表达式(*MEASUREX*)包括年龄(*age*)、温度(*tem*)、角度(*ang*)、长度(*len*)、面积(*are*)、容积(*cap*)、重量(*wei*)、速度(*spe*)、加速度(acc)和其它(*mea*)等10小类。
#### 6.3.1年龄(age)
/[age*卅五岁*]/
/[age*廿一岁*]/
/[age*六十五岁*]/
/[age*34岁*]/
/[age*六十寿辰*]/
/[age*花甲*]/*老人*/ ----*花甲*是词表词。
/*如同*/[age*年过半百*]/*的*/*老*/*妇*/*。*/ ----*年过半百*是词表词。
/[P*李元*]/*、*/[P*卞德培*]/[int*两位*]/*先生*/*都*/*已*/[age*年逾古稀*]/。
#### 6.3.2温度(tem)
/*寒流*/*耍*/*酷*/*平地*/[tem*6℃*]/
/*才*/*会*/*微*/*升*/[tem*6.1℃*]/
/*但*/*平地*/*温度*/*还*/*会*/*下*/*探*/[tem*5℃*]/*左右*/
/*积温*/*高*/*(*/[tem*2800度*]/*)*/----注意!
/[tem*零下*5*到*6*摄氏度*]/
- 注:数字范围的标注方式详见7.1.1。
/*大约*/[tem*5~7℃*]/
/*低温*/*反而*/*只*/*有*/[tem*10℃*]/~/[tem*12℃*]/
/[tem*摄氏19*-*24度*]/
/[tem*摄氏*19*度*]/ -/[tem*24度*]/
#### 6.3.3角度(ang)
/*钝角*/*就*/*是*/*大于*/ [ang*90度*]/*的*/*角*/
/*并*/*将*/*卫星*/*定点*/*在*/[ang*东经*110.5*度*]/[L*赤道*]/*上空*/*。*/
/*震*/*中*/*位于*/[ang*北纬*30.5*度*]/*,*/ ----详见4.2.6
#### 6.3.4长度(len)
/*开掘*/*到*/ [len*一米六七*]/*深度*/*时*/
/*高*/ [len*五米*]/*宽*/ [len*一百米*]/ /[len*109×78厘米*]/
/[len*1纳米*]/=/[len*十的负九次方米*]/
/*应用*/*于*/*紧*/*固*/*件*/*直径*/*为*/[len*1/4″*]/*(*/[len*6mm*]/*)*/
/*最高*/*速度*/*每*/*秒*/ [len*360米*]/
/*发生*/*每*/*秒*/*速度*/*达*/[len*四十二米*]/*的*/*大风*/*。*/
/[L*三峡*]/*截流*/*落差*/*在*/[len*0.7-0.8米*]/*之间*/*,*/
#### 6.3.5面积(are)
/[are*廿七公顷*]/*土地*/
/*占*/*地*/[are*六百多公顷*]/
/*兴建*/[are*五千坪*]/*大*/*的*/*厂房*/ /[are*七百余坪*]/
/*每*/*套*/*住宅*/*面积*/[are*140*-*160m2*]/*,*/
/[are*997万平方公里*]/
/*农田*/ [are*20万亩*]/
#### 6.3.6容积(cap)
/*运输量*/*为*/ [cap*34个立方*]/
/[cap*一两箩*]/*谷子*/
/*选定*/*的*/*设计*/*流量*/*是*/*每*/*秒*/[cap*1.4*万至*1.9万立方米*]/*。*/
/*工程*/*已*/*完成*/*土方*/[cap*2300多万方*]/*,*/
/*全国*/*消费*/*了*/[cap*25万升*]/*啤酒*/*。*/
#### 6.3.7重量(wei)
/[wei*九百至一千吨*]/
/[wei*零点三公克*]/
/[wei*三千二百英吨*]/
/*重*/*约*/[wei*五、六公斤*]/*的*/*鲤鱼*/
/[wei*十台斤*]/
/[wei*三点五公吨*]/
/*产量*/*达到*/ [wei*数千万吨*]/ /[wei*几万吨*]/
/[wei*二十万吨*]/*级*/*以上*/
- 注:ER-99把上例标为:[wei*二十万吨级*]/以上。
#### 6.3.8速度(spe)
/*最高*/*速度*/ [spe*360米每秒*]/
/*打印*/*速度*/*:*/[spe12cps]/
----"*cps*"表示"characterspersecond(每秒字符数)"。
#### 6.3.9加速度(acc)
/*抗震*/*能力*/*:*/*地面*/*水平*/*加速度*/*≤*/[acc*0.4m*/*s2*]/ /*地面*/*垂直*/*加速度*/*≤*/[acc*0.2m*/*s2*]/
#### 6.3.10其它度量表达式(mea)
除了上面提到的度量单位元之外,物理、化学及其它度量单位统一标注为*mea*。
/*额定*/*电压*/*至*/[mea*660V*]/
/[mea*5.5瓦特*]/
/*参观*/*人数*/*达*/[mea*620万人次*]/ /*工资*/[mea*3500元*/*人*/*月*]/
/[mea*25元*/*公斤*]/
/*风*/*压*/*不*/*超过*/[mea*700Pa*]/*(*/*相当于*/*风速*/[spe*34m*/*s*]/*)*/*。*
/*迁移*/*到*/[mea*千兆比特*]/*的*/*能力*/*能够*/*降低*/*拥有*/*总*/*成本*/*的*/*管理*/*方案*/
/*这些*/*快速*/*以太*/*网*/*和*/[mea*千兆位*]/*以太*/*网*/*服务器*/
### 6.4地址表达式
地址表达式(*ADDREX*)包括电子邮箱(*ema*)、电话(*pho*)、传真(*fax*)、电报挂号(*tel*)、邮政编码(*pos*)和网址(*www*)等6种。
#### 6.4.1电子邮箱(ema)
/[ema *exp@email.com.cn*]/
/[ema*cnhuang@msrchina.research.microsoft.com*]/
#### 6.4.2电话(pho)
在标注电话号码时,要把国际区号、国内区号、本地区号等作为一个整体标注。如果有分机号码也要一并标注。当有多个分机号码时,要分别标注。如:
*预约*/*订*/*位*/*电话*/[pho*九五一八六二八*]/
/*洽*/*询*/*电话*/[pho*二四九三一零二零*]/
/*订*/*席*/*专线*/[pho*(8610)-78906617*]/
/*查询*/*电话*/*是*/(/[pho*零三八六二一一零零转二五二*]/)/
/*查询*/*电话*/[pho*三六九九七二一转二三三一*]/*或*/[pho*二三三二*]/
/[pho*120*]/
/[pho*119*]/
#### 6.4.3传真(fax)
/*全国*/*客户*/*服务*/*传真*/*:*/[fax*010-58722727*]/
/*传真*/*号码*/:/[fax*86-10-66665555*]/
/*公司*/*传真*/*:*/[fax*86-10-66665555*]/
#### 6.4.4电报挂号(tel)
/[O*搜狐公司*]/*电报挂号*/*是*/*:*/[tel(8610)*62726666*]/
/*电报挂号*/*:*/[tel*86-10-66665555*]/
/*联系*/*电话*/*:*/[tel*86-10-66665555*]/
#### 6.4.5邮政编码(pos)
/[O*清华大学*]/*的*/*邮政编码*/*是*/*:*/ [pos*100080*]/
/[L*安徽*]/[L*阜阳*]*/*地区*/*的*/*邮政编码*/*是*/*:/[pos*233600*]/
#### 6.4.6网址(www)
/*活动*/*报名*/*网址*/*:*/[www http://www.acer.net/event/apply]/
/[O*蕃薯藤*]/*购物*/*网*/*(*/[www http://shopping.yam.com]/*)*/
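值得注意的是,电话、传真和电报挂号的号码形式可能完全相同(如上文的86-10-66665555),只能依靠"传真""电报挂号"等触发词区分。下面是一个极简的识别草稿,正则与函数名均为假设,且只覆盖阿拉伯数字号码,不处理上文出现的中文数字电话:

```python
import re

# 示意草稿:按触发词区分 pho / fax / tel(非 HanLP 官方实现)
NUMBER = r'(?:\(\d{2,4}\))?[\d-]{3,}(?:转\d+)?'
TRIGGERS = [('fax', r'传真(?:号码)?'),
            ('tel', r'电报挂号'),
            ('pho', r'(?:联系|查询|洽询)?电话')]

def tag_numbers(text: str):
    """返回 (标记, 号码) 列表;同一号码依触发词可得到不同标记。"""
    hits = []
    for tag, trig in TRIGGERS:
        for m in re.finditer(trig + r'[::是为]*\s*(' + NUMBER + r')', text):
            hits.append((tag, m.group(1)))
    return hits

print(tag_numbers('公司传真:86-10-66665555 联系电话:86-10-66665555'))
# 期望输出:[('fax', '86-10-66665555'), ('pho', '86-10-66665555')]
```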
## 第七章 时间表达式标注细则
时间表达式(*TIMEX*)包括日期(*dat*)、时间(*tim*)和时段(*dur*)三小类。
### 7.1日期(dat)
/[dat*明治三十九年*]/*(*/[dat*公元一九零六年*]/*)*/
/[dat*大正十四年*]/*(*/[dat*公元一九二五年*]/*)*/
/[dat*昭和二年*]/*(*/[dat*公元一九二七年*]/*)*/
/[dat*清*]/[dat*道光十四年*]/
/[dat*清*]/[dat*咸丰十一年*]/
/[dat*民国六十八年*]/*拆除*/*后*/*迁到*/[L*芦洲*]/*,*/[dat*八十一年*]/*间*/*又*/*扩建*/
/[dat*一九九九*]/
/[dat*一九九九年十二月三十号*]/
/[dat*公元*1990*年*4*月*22*日*]/
/[dat*旧石器时代*]/
/[dat*八十年代*]/
/[dat*下半年*]/
/[dat*1989财年*]/ ----注意!
/[dat*1989*财年第三季度*]/
/[dat*1990*上半财年*]/
/[dat*1991*财政年度*]/
/[dat*秋季*]/*报告*/
/[dat*第四季度*]/
/[dat*十五世纪*]/
/*努力*/*成为*/*一*/*名*/*高*/*素质*/*的*/[dat*跨世纪*]/*人才*/*。*/
/*值*/*此*/[dat*世纪之交*]/*的*/*时候*/*,*
/*走*/*向*/[dat*新世纪*]/*的*/[L*中国*]/*律师*/*业*/
/[dat*新旧世纪交替*]/*之际*/
/*黑色*/[dat*星期一*]----注意!
/[L*北京*]/*在*/[dat*23号*]/*发表*/*了*/*报告*/
- 注:数字串*23号*若不表示日期,则不标。
/[dat*五月上旬*]/ ----*上、中、下旬*要标注。
/*科技*/*之*/[dat*夏*]/ ----注意!
/[dat*夏*]/[dat*秋*]/*之间*/
/[dur*一年*]/*中*/*四季*/*分明*/ ----*四季*是词表词不标注。
/[L*南极*]/*的*/[dat*夏季*]/
/[L*中国*]/[dat*汉代*]/
/[dat*春节*]/ ----日期确定的节日要标注。
/[dat*肉孜节*]/
/[dat*开斋节*]/
/[dat*中秋*]/*时节*/
----注意*时节*不标。
/[L*美国*]/*的*/[dat*独立日*]/----美国独立日为每年7月4日。
/[dat*27年*]/*是*/*一个*/*多*/*事*/*的*/*年份*/
- 注:*27*年*可能表示时段,标注者须根据上下文注意区分。
/*现在*/*是*/[dat*26号*]/*,*/[dat*星期三*]/
----同一个时间的不同表达,要分开标注。
/*现在*/*是*/[dat*二月九号*]/*,*/[dat*农历大年初三*]/
/*大约*/[dat*五月四日*]/*----大约,大致,大概*等词不标。
/[dat*第二个十年*]/ /[dat*第二年3月*]/
/[dat*当年*9*月*]/
/[dat*今春*]/ ----*今春*不是词表词。
#### 7.1.1日期起讫表达式的标注
当日期表达式中有至、到和连结符-时,处在至、到和连结符-前后的日期表达式分别叫做前式和后式。如果前式和后式都是完整的日期表达式,则它们应分别进行*dat*标注;否则前、后式要整体标注为*dat*(判断逻辑可参见下文的代码草稿)。
这条规则同样适合于其它各类数字串的标注,如:*tim*,*dur*,*int*,*tem*,*wei*,*mon*等。其一般表达式为:
/X+量词/到/X+量词/
/X+量词/至/X+量词/
/X+量词/-/X+量词/
/X+至+X+量词/
/X+到+X+量词/
/X+-+X+量词/
/X+、+X+量词/
例如:
/[dat*三月三日*]/*至*/[dat*三月卅一日*]/
/[dat*一月十八日*]/*到*/[dat*廿一日*]/
/[dat*三月三至廿一日*]/
/[dat*二月十八日*]/-/[dat*廿一日*]/
/*于*/[dat*今明两年*]/*陆续*/*推出*/*。*/
/[dat*民国五十五、五十六年*]/
/[dat*今明两天*]/
/[dat*今*]/*、*/[dat*明*]/[dur*两日*]/
/[dat*1980年*]*到*[dat*1990*年*1月*]/
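上述前式、后式的判断逻辑可用如下草稿示意。这里仅以"日"结尾的日期为例,X假定为中文数字或阿拉伯数字;函数与正则均为假设,并非完整实现:

```python
import re

NUM = r'[0-9〇一二三四五六七八九十廿卅百]+'
FULL = re.compile(NUM + r'日$')          # 完整的日期表达式:X+量词

def annotate_range(front: str, back: str) -> str:
    if FULL.search(front) and FULL.search(back):
        return f'/[dat{front}]/至/[dat{back}]/'   # 前式、后式都完整:分别标注
    return f'/[dat{front}至{back}]/'              # 否则整体标注

print(annotate_range('三月三日', '三月卅一日'))  # /[dat三月三日]/至/[dat三月卅一日]/
print(annotate_range('三月三', '廿一日'))        # /[dat三月三至廿一日]/
```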
- 注:含有比喻意义的今天、昨天、明天、今日、昨日、明日全不标注。
*/*"*/*一失足成千古恨*/*,*/*同学*/*们*/*,*/*看到*/*今天*/*的*/*我*/*,*/*你们*/*是否*/*感悟*/*到*/了*/*什么*/*?*/*"*/
/*尽管*/*炮火*/*已*/*消失*/*在*/*昨天*/*那*/*段*/*苦难*/*,*/
/[O"四方"集团*]/*的*/*明天*/*将*/*会*/*更加*/*灿烂*/*美好*/*。*/
- 注:当年、同年、当月等词语后有具体的日期时,要整体标注dat,如果当年、当月、同年等词语单独出现,而其前后有确指的日期时也要标注为dat,否则不作标注!当日、当天等词后有具体的时间时标注为dat,否则不作标注!如:
/[dat*当年7月*]/*在*/[L*莫斯科*]/*举行*/
/*然后*/*于*/[dat*同年8月*]/*奉调*/*回国*/*。*/
/[P*克林顿*]/*在*/[dat*当月13日*]/*表示*/*,*/
/*那*/*是*/[dat*当天*]/[tim*中午1时*]/*的*/*汇率*/
/*发言人*/*于*/[dat*当日*]/[tim*午夜*]/*发表*/*声明*/
#### 7.1.2前、头、下+时段(dur)应整体标注为dat
/[dat*头两个礼拜*]/
/[dat*前3天*]/
/[dat*今年头四个月*]/
/*比*/[dat*上一年*]/*增长*/[per*10.4%*]/*。*/
/*集中*/*研究*/*解决*/[dat*下半年*]/*纠风*/*工作*/*如何*/*突出*/*重点*/*,*/
/[dur*两周*]/*前*/
/[dat*1993年之初*]/ ----注意!
/[dat*公元之初*]/
#### 7.1.3当乾隆、康熙、道光等表示年代时标注为dat
当乾隆、康熙、道光*等表示年代时标注为*dat*,而当*乾隆、康熙、道光*等表示皇帝本人的名字时标为P。如:
/*最近*/*发现*/*一*/*张*/*在*/*农家*/*珍藏*/*的*/[dat*清代*]/[P*康熙*]/*、*/[P*雍正*]/*、*/[P*乾隆*]/*、*/[P*嘉庆*]/*、*/[P*道光*]/[int*五*]/*皇帝*/*诰封*/*圣旨*/[int*九道*]/*,*/ /*收藏*/*了*/*自*/[dat*清代*]/[dat*乾隆*]/*年间*/*至今*/*各个*/*历史*/*时期*/*的*/*鼻烟壶*/*艺术*/*珍品*/*,*/
#### 7.1.4朝代名的默认值为dat
当朝代名被上下文确认为国家名时标注*L*,否则默认为*dat*。如:
/*如果*/[P*刘伯温*]/*不是*/*一直*/*压抑*/*着*/*对*/[dat*元*]/*王朝*/*的*/*不满*/*,*/
/[dat*楚*]/*霸王*/[P*项羽*]/*带领*/[int*两万*]/*兵*/*将*/*,*/
/*只*/*带*/[dur*三天*]/*粮食*/*,*/*渡过*/[L*漳河*]/*去*/*与*/*强大*/*的*/[dat*秦*]/*兵*/*作战*/*。*/*结果*/*,*/[dat*楚*]/*军*/*大败*/[dat*秦*]/*军*/*。*/
/[dat*吴*]/*王*/[P*夫差*]/*战胜*/*了*/[dat*越*]/*王*/[P*勾践*]/,
/[dat*战国*]/*时*/[L*赵国*]/*良*/*相*/[P*蔺相如*]/*曾*/*为*/[L*赵国*]/*立*/*下*/*汗马功劳*/;
*[P*唐睢*]/*出使*/[L*秦国*]/*,*/*
《*/[L*水浒*]/*全传*/*》*/*描述*/*的*/*是*/[dat*北宋末年*]/*震撼*/[dat*宋*]/*室*/*江山*/*的*/[P*宋江*]*起义*/*。*/*
/*从*/*侧面*/*表现*/*了*/[dat*清*]/*政府*/*的*/*腐败*/*无能*/*,*/*激起*/*了*/*深*/*埋*/*在*/*人们*/*心底*
/*对*/*侵略者*/*的*/*敌视*/*和*/*对*/[dat*清*]/*政府*/*的*/*愤怒*/*,*/
/*但是*/*,*/*战争*/*最终*/*因*/[dat*清*]/*政府*/*的*/*妥协*/*、*/*投降*/*而*/*告*/*失败*/*。*/ /*无奈*/*夜郎自大*/*、*/*腐败*/*不堪*/*的*/*大*/[L*清国*]/*武器*/*太*/*落后*/*,*/
#### 7.1.5在"过去、今后、未来+时段(dur)"等修饰成分不进入标注范围
/*过去*/[dur*3年*]/*中*/*,*
/*将*/*在*/*未来*/[dur*几年*]/*内*/*出现*/
/*未来*/[dur*两天*]/*沿江*/*地区*/*仍*/*有*/*中*/*到*/*大雨*/*,*/
/[dat*今年七八月*]/*间*/
#### 7.1.6词表词近年来、近些年、近几年来、近几年、几年来等均不标注
按规定,词表词*近年来、近些年、近几年来、近几年、几年来、多年来*等内部的*dat*、*tim*、*dur*都是不标的。但对非词表词则要分开标注。例如:
/[L*瑞士*]/*多年来*/*是*/[ord*第一次*]/*。*
/近几年/,/[L中]/[L菲]/关系/
/*近*/[dur*五年*]/*来*/
/*时至今日*/*仍*/*在*/*缓刑*/*期间*/*。*/*-----时至今日*是词表词。
### 7.2时间
/[tim*凌晨零时*]/
/[tim*清晨六时卅五分*]/*到*/[tim*四十分*]/
/[tim*凌晨二至四点*]/
/[tim*中午十二时*]/-/[tim*晚上九时*]/
/[tim*上午十一时*]/*至*/[tim*下午二时*]/
/[tim*第七十三分钟*]/
/[tim*格林威治时间*5*时*59*分*]/ ----含有地名。
/[tim*下午当地时间*5*时*59*分*]/
/[tim*九点整*]/*到达*/[L*北京站*]/
/[dat*九月十三日*]/*大约*/[tim*七点*]/*到达*/[L*北京*]/
- 注:这里*大约*不标。因为它虽被一个*dat*和一个*tim*包围,但是仍可以分割开。
### 7.3时段
/[dur*两个星期*]/
/[dur*一个月*]/*后*/
/*曾*/*在*/[dur*5、6年*]/*前*/*撰文*/*陈述*/
/*早产*/[dur*十二周*]/*左右*/
/*大水*/[dur*十天*]/*后*/*才*/*退*/*尽*/
/[dur*一至两年*]/ /[dur*一小时卅分钟*]/
/*这*/[dur*几天*]/
/[dur*卅天*]/*会期*/*只*/*开*/*了*/[dur*九天*]/
/[dur*10个月*]/
/*虽*/*经*/[dur*一整天*]/*磋商*/ ----*一整天*不是词表词,但要标为*dur*。
/*与*/*洪水*/*奋战*/[dur*一天一夜*]/*,*/ ----*一天一夜*也不是词表词。
*/*历经*/[dur*一二十年*]/*创建*/*了*/*庞大*/*的*/*船队*/*,*/*
/*让*/*我们*/*全家*/*人*/*感动*/*了*/[dur*好几天*]/
/*在*/*水门*/*丑闻*/ [dur*四分之一世纪*]/*时*/*发表*/*的*/*评论。*/
- 注:按照前面的原则:*水门*/*丑闻*/ [dur*四分之一世纪*]/*时*在时间坐标轴上有比较固定的位置,因此应当标为*dat*。但这种与事件(水门丑闻)相关的时间表达,在ER-99和MET-2中都是不标注的。这样,只有*四分之一世纪*需要标注为*dur*。
/[dur*十多年*]/
/[dur*几年*]/*以来*/
/*在*/[dur*半年*]/*时间*/*内*/----注意:*上半年*是*dat*。
/*在*/*总结*/[dur*14年*]/*改革开放*/*经验*/*的*/*基础*/*上*/
- 注:*14年*、*30年*也可能表示dat。标注者要注意区分。
/*我们*/*在*/*美国*/*奔波*/*了*/[dur*30年*]/
/[dur*27年*]/*的*/*军旅*/*生涯*/
/*整整*/[dur*十五年*]/ ----*整整*不标。
/*大约*/[dur*十年*]/*的*/*时间*/ ----*大约*不标。
/[dur*十年*]/*来*/
/[dur*十几年*]/*的*/*时间*/ ----注意!
/[dur*十几年*]/*来*/ /[dur*十来年*]/ /[dur*数年*]/
/[dur*多年*]/ ----ER99不标。
#### 7.3.1一年都标为dur
/*新*/*的*/[dur*一年*]/*即将*/*开始*/
/*硬*/*是*/*在*/*地下室*/*干*/*了*/ [dur*一年*]/*的*/*公司*/
/[dur*一年*]/*创*/*产值*/*效益*/…/…/
/*聘金*/*为*/[dur*一年*]/ [mon*900万美元*]/*的*/*价码*/
- 注:*/*这*/*一年*/*、/*那*/*一年*/*中的一年不是确指不作标注。
*/*这*/*一年*/*,*/*企业*/*增收节支*/*达*/[mon*110万元*]/
/*在*/[O*北大*]/*就读*/*的*/*那*/*一年*/*,*/
- 注:整天、整日、整夜一律标注为*dur*,如:
/[dur*整天*]/*都*/*很*/*安静*/*,*/
/*还*/*东奔西走*/[dur*整日*]/*忙*/*个*/*不停*/*,*/
/*让*/*人*/[dur*整夜*]/*不得*/*入睡*/
- 注:当年、月、日、周等词修饰后面的工资、交易(销售)额、创汇等词语时,要作为时段(*dur*)来标注。如:
/[dur*月*]/*收入*/*就*/*在*/[mon*千元*]/*以上*/
/[dur*年*]/*交易额*/*近*/[mon*1000亿元*]/*。*
/*这*/*一*/*工程*/[dur*日*]/*处理*/*污水*/[cap*2万立方米*]/*。*
#### 7.3.2一天的标注有以下三种情况,需区别对待:
##### 7.3.2.1"前一天",不论其前面有没有定语修饰统统标注为dat(参见7.4.1):
/[dat*前一天*]/*还*/*静止*/*的*/*电梯*/[dat*今天*]/*动*/*起来*/*了*/*,*/
/[L*香港*]/[O*恒生*]/*指数*/*比*/[dat*前一天*]/*下跌*/[int*412点*]/*,*/
/*这次*/[L*中*]/[L*韩*]/*足球*/*对抗赛*/*是*/*在*/[O*韩国队*]/*准备*/*赴*/[L*法*]/*出征*/
*世界杯*/*的*/[dat*前一天*]/*举行*/*的*/*,*/
##### 7.3.2.2"一天"的意思是指时间段(24小时),标注为dur:
/*每人*/*每月*/*接待*/*来访*/[dur*一天*]/*,*
/[P*汤*]/[P*尤*]/*杯*/[dur*一天*]/*不*/*拿*/*回来*/*,*/
/*仅*/[dat*5月31日*]/[dur*一天*]/*,*/[L*莫斯科市*]/*税*/*警*/*就*/*查出*/[int*1600个*]/*违法*/*经营者*/*。*/
/*青年人*/*辛苦*/*忙碌*/*了*/[dur*一天*]/*来*/*此*/*坐*/*坐*/*,*/ /*在*/[L*墨西哥*]/*最后*/[dur*一天*]/*的*/*访问*/*中*/*,*/
/*每*/*枚*/*多*/*赚*/[mon*7分钱*]/*,*/[dur*一天*]/*下来*/*能*/*多*/*收入*/[mon*好几十元*]/*。*
##### 7.3.2.3"一天"的意思相当于"有一天",由于不是确指的日期所以什么也不标:
/*但愿*/*有一天*/*我们*/*轻松*/*地*/*说*/*:*/*消费*/*着*/*是*/*美丽*/*的*/*。*/
/[dat*1997年*]/*的*/*一天*/*,*/[P*吴佩民*]/*在*/*办公室*/*热情*/*接待*/*了*/*一个*/*素不相识*/*的*/*中年*/*妇女*/*。*/
/*一天*/[tim*下午*]/*,*/*记者*/*到*/*那*/*店*/*里*/*专门*/*拜访*/*了*/[P*佛朗科*]/*师傅*/*。*/
/*一天*/*,*/[P*列宁*]/*收到*/*一*/*封*/*前线*/*发*/*来*/*的*/*要求*/*支援*/*武器*/*和*/*服装*/*的*/*电报*/*。*/
/*一天*/*上*/*晚*/*自习*/*回来*/*,*/*有*/*一*/*条*/*狗*/*总*/*跟着*/*她*/*,*/
/*一天*/[tim*深夜*]/*,*/*一*/*人*/*酒后*/*拦截*/*过往*/*的*/*外地*/*车辆*/*,*/ /*一天*/*,*/*我*/*走过*/*他*/*的*/*门前*/*,*/
/*一天*/[tim*晚上*]/*,*/*新*/*上任*/*的*/[L*河北省*]/[O*栾城县委*]/*书记*/[dat*六月八日*]/*,*/
- 注:"这/一天、那/一天"中的"一天"也非确指,所以也不标。
/*记住*/*这*/*一天*/*,*/*也是*/*表达*/*我*/*对*/[L*香港*]/*回归祖国*/*的*/*预祝*/*。*/
/[P*王龙雨*]/*从*/*上任*/*的*/*那*/*一天*/*起*/*,*/
### 7.4有关时间表达式的规则
#### 7.4.1前(后)+日期|时间要整体标注
/[dat*今年前五个月*]/
/[dat*前三天*]/
- 注:以下的标注是正确的:
/*在*/*上半时*/*结束*/*前*/[dur*1分钟*]/ ----*上半时*是词表词。
/*比赛*/*前*/[dur*十分钟*]/
/*在*/*上*/*半场*/[tim*第27分钟*]/*时*/
#### 7.4.2反例——不应该标注的例子
刚才、最近、开始军备谈判以来、一会儿*等表示不确定时间的词语,不标。如果节日没有确定的时间,也不标。如:
/[L*印度*]/*国际*/*电影节*/
/[L*中国*]/*旅游年*/
#### 7.4.3特例
若两个短语属于不同的子类*dat*和*tim*,就需分开标注。
/[dat*2*月*12日*]/[tim*上午*8*点*]/
/[dat*星期一*]/[tim*8点*]/
- 注1:时间中的地名,如北京时间下午*5*点,在ER-99中不标注,而在MET-2中要标注。本规范按MET-2标注(参照前面的例子)。如果*dat*和*tim*分不开,就整体标注。
/[tim*北京时间*1997*年*2*月*9*号*19*点*28*分*]/
- 注2:*去年、昨天、今早*等词在MET-2中要标,在ER-99中不标。本规范只参照MET-2:
/[dat*去年上半年*]/
/[dat*今年夏天*]/
/[dat*今年三月一日*]/
/[dat*去年春夏之交*]/
/[dat*昨天*]/[tim*夜里*]/ ---*夜里*是词表词。
/[dat*今天*]/[tim*晚上*]/ ---*晚上*是词表词。
/[dat*今*]/[tim*早六点*]/ ---*今早*不是词表词。
/[tim*早上六点*]/ ---*早上*是词表词。
/[dat*5月份*]/*产品*/*出口*/*和*/*转口*/*总值*/*比*/[dat*去年同月*]/*下降*/[per*3.2%*]/*,*
/[dat*同一天*][tim*晚上*]/
/[dat*当日*]/[tim*下午*]/
- 注3:当日是词表词。如果在上下文中能确定*当日、当天*或*同一天*的具体日期时,就标注;否则不标。
/*每日*/[tim*上午11时*]/*至*/[tim*深夜3时*]/ ----*深夜*是词表词。
/[tim*昨夜*]/ ----*昨夜*是词表词。
/*每*/[dat*周四,二,一*]/
- 注:MET-2和ER-99对*早上六点*的标注是相同的。但ER-99认为*早上六点*与*今早六点*不同。原因可以从英语的表达来理解:前者是"6:00am",后者是"6:00thismorning"。"thismorning"在ER-99中被视为"相对时间",不标注。但在MET-2中,"相对时间"是要标的。本规范遵循MET-2。
/[dat*11月24至27日*]/
/[dat*3*月*15日*]/*至*/[dat*17日*]/
/[dat*1949年*]/-/[dat*1972年*]/
/[L*美国*]/*南北战争*/*(*/[dat*1861—1865年*]/*)*/*中*/
/*软件*/*最*/*长*/*的*/*寿命*/*为*/[dur*两到三年*]/*,*/
---请注意这里日期范围的标注方式。
*迄今*----词表词不标,MET-2标*今*。
*今后*----词表词不标,MET-2标*今*。
*晨练*----词表词中的*晨*不标。
*晚宴*----词表词中的*晚*不标。
*春联*----词表词中的*春*不标。
*他们*/*的*/*今天*/*,*/*仿佛*/*就是*/*我们*/*的*/*明天*/*。*----泛指不标。
*参加*/*半决赛*----*半决赛*是词表词,*半*不标。
*双边*/*会谈*----*双边*是词表词,因此*双*不标。
#### 7.4.4每年和年不标注
本规则也适用于*月,天,小时*等其它时间单位。例如:
*/*年产值*/*…*/*…*/*
/*每年*/*创*/*产值*/*效益*/*…*/*…*/
/*每年*/*收入*/*…*/*…*/
## 第八章 数字表达式标注细则
数字表达式(*NUMEX*)包括百分数(*per*)、钱款(*mon*)、频度(*fre*)、整数(*int*)、分数(*fra*)、小数(*dec*)、序数(*ord*)、比率(*rat*)等8小类。以下是数字表达式的一些标注规则。
### 8.1如果整数、分数、小数、序数后面有量词,数量短语要整体标注
例如:
/[int*几千万盆*]/
/[int*几家*]/*工厂*/
/*一*/*家*/ [int*5*]/*人*/
/*一*/*家*/ [int*5口*]/*人*/
/*铁人*/[int*三项*]/*比赛*/*是*/*多*/*项目*/*的*/*综合*/*运动*/*,*
/*计算机*/*配置*/*:*/586/*以上*/*,*/[int*8兆*]/*内存*/*以上*/
/*打印*/*分辨率*/*:*/[mea*180dpi*]/
注:*dpi*表示每英寸的点数,所以作为*mea*标注。
/*评为*/*"*/[int*十*]/*星*/*级*/*乡镇*/*"*/*、*/*"*/[int*十*]/*星*/*级*/*支部*/*"*
### 8.2单纯的数字、词表词(包括俗语)中的数字都不作标注
如:
/*自然数*/5/*和*/6/*都是*/*整数*/
/*大家*/*听*/*口令*/*,*/*齐步走*/*,*/*一*/*二*/*一*/*,一*/*二*/*一*/*,*/*一*/*二*/*三*/*四*/*,/*
/*但是*/*卷子*/*上*/*的*/"/6/"/*还是*/*颠*/*巍巍*/*地*/*变成*/*了*/"/8/"/*。*/
/[L*瑞士*]/*、*/[L*西班牙*]/*、*/[L*比利时*]/*、*/[L*丹麦*]/[int*四*]/*国*/
/*并*/*促进*/*了*/[L*中*][L*美*]/*两国*/*的*/*交流*/*与*/*合作*/ ----*两国*是词表词。
/*并*/*促进*/*了*/[L*中*][L*美*]/[int*两*]*国*/*的*/*交流*/*与*/*合作*/*,* ----错误!
/*垄断*/*了*/[L*神奈川*]/*、*/[L*青森*]/*等*/[int*5*]/*县*/*的*/*交通*/*信号*/*维修*/*业务*/*。*
/[L*两岸*]/*经济*/*合作*/*和*/*直接*/*三通*/ ----*三通*是词表词。
/[L*两岸*]/*经济*/*合作*/*和*/*直接*/[int*三*]*通*/ ----错误!
/*到*/[L*云*]/[L*贵*]/[L*川*]/*的*/*大三线*/*地区*/*,----大三线*是词表词。
/*到*/[L*云*]/[L*贵*]/[L*川*]/*的*/*大*[int*三*]*线*/*地区*/*,----错误!
/*十年寒窗*/ ----*词表词中的十年*不标。
/*千载难逢*/ ----*词表词中的千载*不标。
/*十*/*年*/*九*/*旱*/*----非词表词。虚指的十年*不标。
/*眼*/*观*/*六*/*路*/*,耳*/*听*/*八*/*方*/ ----非词表词。虚指的六、八不标。
/*利*/*在*/*千秋*/*的*/*大事*/ ----*虚指的*"*千秋*"不标。
/*十*/*年*/*如*/*一*/*日*/ ---*-虚指的十年*和*一日*,不标。
/*万里*[L*长城*]/ ---*-虚指的万里*,不标。
/*三皇五帝*/----*三皇五帝*是词表词。
/*乌七八糟*/*的*/*东西*/*几乎*/*扫荡*/*殆尽*/*----乌七八糟*是词表词。
/*三大球*/*在*/*走*/*向*/*市场*/*时*/----*三大球*是词表词。
/*第二次世界大战*/*的*/*反法西斯*/*斗争*/----*第二次世界大战*是词表词。
/*三五成群*/*地*/*散落*/*着*/*警察*/*,*----*三五成群*是词表词。
- 注:*一会儿,一起,唯一,付之一炬,一流,千方百计,一分为二,一切,二娃*等词表词中的数字一律不标。
/*本职*/*创*/"/*一流*/"/*活动*/ /[int*亿万*]/*人民*/
/[int*百万*]/*民众*/
- 注:按照ER-99,*亿万、百万*不是一个抽象的数字,因此是要标注的。
### 8.3约、近是不确切的概念,故不与后面的数字串一起标注
*上*、*数*、*几*、*好*则要和数字串捆绑在一起标注,而*约、近*作为特例,不与数词捆绑。
/*大约*/[int*12亿*]/*人口*/
/*约*/[int*四五千*]/*人*/*在*/[L*金边奥林匹克运动中心*]/*举行*/*集会*/*,*/
/*约*/[mon*十万元*]/
/*近*/[mon*千万元*]/
/*大概*/*需要*/*花费*/[mon*上千万美元*]/*的*/*投资*/*和*/[dur*3年*]/*左右*/*时间*/*,*/
/[O*省电力公司*]/*还*/*投资*/[mon*好几百万元*]/*,*/
/*多于*/[mon$90,000]/ /[mon*几百万新元*]/
/*统计*/*了*/[int*上百种*]/*数字*/*,*/
/*每年*/*都*/*要*/*花费*/*大量*/*外汇*/*引进*/[int*上百套*]/*系统*/
/*每年*/*搞*/[int*一两个*]/*工程*/*,*/
/*邀请*/*全国*/*近*/[int*百名*]/*书法*/*名家*/*,*/
/*近*/[int*千名*]/*员工*/
- 注:*余*、*多*本不应标注,但当它们位于量词前时分割不开,所以整体加以标注。
/[mon*二十七万余元*]/
/[mon*五百多万元*]/
### 8.4钱款式中的地名
钱款表达式中的地名不论是单音节还是多音节的,ER-99和MET-2都不标,否则就形成嵌套。
如果货币字符串在文本中单独出现,字符串中没有数字修饰,那么双音节的地名要标注为*L*,单音节的地名不标注。例如非词表词*泰铢*中的*泰*不标。注意词表词*日元*、*美元*中的单音节的地名也不标。
/[mon*2000新元*]/
/[mon*2000新加坡元*]/
/*泰*/*铢*/*汇率*/*稳定*/*在*/[mon*38铢*]/—/[mon*39铢*]/*兑*/[mon*美元*]/*水平*/
/*纷纷*/*抛*/*出*/*日元*/*购*/*进*/[L*德国*]/*马克*/*,*/
/[L*菲律宾*]/*比索*/*对*/*美元*/*汇率*/*也*/*下跌*/*。*/
### 8.5钱款标注中的特例
MET-2规定:如果没有表示钱款的单位,则不标。ER-99则不然。本规范采用ER-99的规定。
/*这*/*辆*/*汽车*/*值*/[mon*20万*]/
/*卷标*/*上*/*的*/*价格*/*是*/ [mon*50*]/
/[O*纳斯达克*]/*跌*/ [int*140*]/
### 8.6频率的特例
/[fre*四年一度*]/ ----*四年一度*并非词表词,但整体标注为*fre*。
/[fre*一年一度*]/
----*一年一度*是词表词,整体标注为*fre*。
*/*主要*/*在*/*交流*/[fre*50Hz*]/*,/*额定*/*电压*/*至*/[mea*660V*]/
*---*交流电的频率是*50Hz*(赫兹)*,*即每秒变化*50*周。所以理应标成*fre*而不是*mea*。
/*频率*/*高*/*(*/[fre*30*-*60KHz*]/)
/*卫星*/*每年*/*发射*/[fre*6至7次*]/。
- 注:又一次、再一次全部标注为fre,但/*一次*/*又*/*一次*/例外,不作标注。
如:
/*此间*/*舆论*/[fre*又一次*]/*注意*/*到*/[L*亚*]/[L*非*]/*足球*/*的*/*差距*/
/*精湛*/*演技*/*,*/[fre*再一次*]/*赢得*/*了*/*首都*/*观众*/*的*/*由衷*/*赞赏*/
### 8.7名词方没有与之搭配的量词,因此可以和前面的数词直接结合
在我方、校方中的名词方没有与之搭配的量词,因此可以和前面的数词直接结合,如:
/[int*三方*]/*已*/*就*/[O*劳斯莱斯*]/*汽车*/*的*/*前景*/*达成*/*协定*/*,*/
### 8.8一相当于英语的冠词a,一般不标
一相当于英语的冠词a,一般不标,但一倍是例外,要标fra。例如:
/*一个*/*条件*/
/*一*/*座*/*城市*/
/*最大*/*的*/*企业*/*之一*/
/*荣立*/*一等功*/ ----*一等功*是词表词,不可标注。
/荣立*/[ord*一等*]*功*/----错误的标注!
/*获*/*县*/*政府*/*新技术*/*推广*/[ord*一、二等*]/*奖*/*。*/
/*我*/*的*/*收入*/*是*/*她*/*的*/[fra*一倍*]/ ----*一倍*是要标的。
### 8.9一(1)+量词不标注*int*
#### 8.9.1一+量词是词表词的情况
词表词一个、一种、一类、一批、一次、一套、一阵等作为数量短语不予切分,也不标注*int*。其中有些量词重迭形式也是词表词,如一个个、一天天,应保持其整词形式,而其它非词表词的数量短语和量词重迭形式都是要切开的。
/*一个*/*人*/
/*一个个*/*观众*/
/*一种*/*算法*/
/*一套*/*特种*/*邮票*/
/*一次*/*讨论*/
/*一*/*匹*/*黄骠马*/
/*一*/*栋*/*栋*/*楼房*/
/*一天天*/*暖和*/*起来*/
#### 8.9.2词表词一起、一块、一道、一面用作数量短语时应切开
词表词*一起、一块、一道、一面*有副词和其它词性的用法,但当它们用作数量短语时一律切开,而且不标注*int*。
/*一*/*块*/*石头*/
/*一*/*起*/*交通*/*事故*/
/*一*/*面*/*镜子*/
### 8.10一(1)"+物理单位元需按度量表达式标注
一(1)"+物理单位元(如米、公斤、摄氏度等)需按度量表达式(见6.3)标注。如:
/[wei*一公斤*]/*大米*/
/[mea*一度*]/*电*/
### 8.11分数词素半
#### 8.11.1词表词中的词素半不可标注为fra(分数)
词表词*如半价、半票、半饱、半身、半世、半辈子、上半时、下半场、半边*等,但不可把上述词表词中的词素*半*标注为*fra*(分数)。
/*上*/[fra*半*]/*场*/*比赛*/[L*中国*]*队*/*未进*/*一*/*球*/
/*下半场*/----词表词,是正确标注。
/*下*[fra*半*]*场*/----在词表词中插标*fra*是错误的。
/*目前*/*还*/*空闲*/*着*/[fra*一大半*]/*的*/*营业*/*面积*/*。*/
/*他们*/*之中*/*肯定*/*有*/[fra*一多半*]/*人*/*没有*/*球*/*票*/
/*有*/[fra*大半个*]/*篮球*/*场*/*那么*/*大*/
- 注:当半作为一个独立的词时要标注,标注的原则是:半+量词或名词时标注,半+动词或形容词时不作标注,如:
/*下半场*/*后*/[fra*半*]/*段*/
/*地处*/*偏僻*/[fra*半*]/*山区*/
/*部分*/*企业*/*停产*/*或*/*半*/*停产*/
/*而*/*处于*/*半*/*死亡*/*或*/*休眠*/*状态*/*,*/
/*干旱*/*半*/*干旱*/*地区*/*径流*/*造林*/*技术*/*、*/
#### 8.11.2以下的词表词不作为分数标注,而作为其它不同的数字串标注
/[dur*半年*]/
/[dur*半天*]/
/[tim|dur*半夜*]/
/[int|age*半百*]/
#### 8.11.3例外
半个西瓜中的半个,与四半中的半概念不一样,前一个半是指二分之一,后一个半是量词,所以标注也不同。
/[fra*半个*]/*西瓜*/
/[int*一个*]/*西瓜*/*分为*/[int*四半*]/
### 8.12序数词素首
#### 8.12.1词表词中的词素首不可标注为ord(序数)
词表中有许多词含有词素*首*,如*首创、首倡、首选、首发、首航、首飞、首演、首映、首战、首展、首席代表、首席科学家、首席执行官、首富、榜首、魁首、居首*等。但不可把词表词中的词素*首*单独作为*ord*(序数)来标注。
/*首席执行官*/----正确标注。
/[*ord首席*]*执行官*/----在词表词中插标*ord*是错误的。
#### 8.12.2具有首+量词结构的词表词或非词表词,应整体作为ord标注
具有"首+量词"结构的词表词有:*[ord*首届*]*,*[ord*首次*]*,*[ord*首批*]*,*[ord*首位*]*,[ord*首例*]等。
具有首+量词结构的非词表词,如:
/[L*北京市*]/[ord*首家*]/*就业*/*与*/*创业*/*组合*/*市场*/
/[P*满文军*]/*则*/*以*/*自己*/*的*/[ord*首张*]/*个人*/*专辑*/
/[dat*首日*]/*销售*/*欠佳*/
----这里首日不能作序数词来标注,应标注为日期*dat*。(详见7.1)。
- 注:头版、头条是词表词。它们和头一回统统标注为*ord*。如:
/*在*/[dat*4月11日*]/*的*/*《*/[O*人民日报*]/*》*/[ord*头版*]/[ord*头条*]/*社论*/*位置*/*发表*/*出来*/*,*
/*由于*/*是*/[ord*头一回*]/*,*/*总*/*怕*/*有*/*个*/*闪失*/*,*
- 注:"头"的上述标注不可类推到其它词组中,例如,
*上*/*半场*/*表现*/*不好*/*,*/*头*/[dur*10分钟*]/*甚至*/*有些*/*拖泥带水*/*。*
----注:这里*半场*是词表词,但不标注为*fra*。
### 8.13序数词+量词结构,应整体作为ord标注
/[ord*第一期*]/
/[ord*第二*]/*故乡*/
/[ord*三等*]/*奖*/
/[dat*第一天*]/ ----相对日期,标*dat*,而不是[ord*第一*]/*天*。
/[dat*第二年*]/ ----相对日期,标*dat*,而不是[ord*第二*]/*年*。
/[O*波音*]/747/ ----产品序号不标。
/*地震烈度*/*不*/*超过*/[ord*8度*]//
/*这*/[ord*第二条*]/*尤为*/*重要*/*,*/
/*位居*/*金牌*/*榜*/[ord*第二名*]/*。*/
/*作为*/*大豆*/*行动*/*计划*/*的*/[ord*第二步*]/
/[ord*1174号*]/*文件*/
/[ord*6路*]/*汽车*/ /[ord*六年级*]/*学生*/
/[dat*今年*]/*读*/[ord*大三*]/
/*发展*/*第一产业*/ ----*第一产业*是词表词。
/*发展*/[ord*第一*]*产业*/ ----错误的标注。
/*阵风*/[ord*五级*]/
/*通过*/*大学*/*英语*/[ord*六级*]/
- 注:联赛中的A/组、B/组等不作为序数字串标注。如:
/*在*/[L*里昂*]/*进行*/*的*/*世界杯*/*G*/*组*/*比赛*/*中*/
- 注:"甲级、甲/A、乙/级、乙/A"等不作为序数ord标注。如:
*/*当即*/*停止*/*该*/*场*/*比赛*/*主*/*裁判员*/*执法*/*全国*/*足球*/*甲*/*A*/*联赛*/*;
/*获得*/[ord*前两名*]/*的*/*球队*/*晋级*/*甲*/*A*/*行列*/*。*/
/[dat*1998年*]/*全国*/[O*男篮*]/*甲*/*B*/*联赛*/
/*判处*/*以*/[P*东条英机*]/*为首*/*的*/[int*7名*]/*甲级*/*战犯*/*死刑*/*。*/
### 8.14仅当*前*表示比赛名次时才和后面的序数结构一起标注
仅当*前*表示比赛名次,如*前6名*、*前四*(指前四名)时,才和后面的序数结构一起标注为*ord*。其余的情况如*前两次、前三组、前三场、前两项*等,*前*都不得进入被标注的数字表达式。
/*获得*/[ord*前十名*]/*的*/*是*/*:*/*在*/*前*/[int*两轮*]/*小组*/*赛*/*中*/
/*列*/*前*/[int*两位*]/*的*/*是*/[O*澳大利亚队*]/*和*/[O*日本队*]/*。*
### 8.15文本中表示标号的数字不标
规范、条例中的条款标号,包括一、二、三、Ⅰ、Ⅱ、Ⅲ、1,2,3、第一条、第二条、第三条等,一律不予标注。只有当这些条款被正文引用时,才作为序号ord被标注。例如:
/*第二*/*,*/*制定*/*必要*/*的*/*行规*/*、*/*行约*/*,*/*共同*/*规范*/*,*/*共同*/*遵守*/*,*/
/*一*/*无*/*资金*/*,*/*二*/*无*/*场地*/
/*一*/*靠*/*政策*/*调动*/*农民*/*的*/*积极性;*/ /*二*/*靠*/*科技;*/
/*一*/*是*/*继续*/*加强*/*农业;*/
/*二*/*是*/*采取*/*措施*/*稳定*/*物价*/*,*/*抑制*/*通货膨胀;*/
/*1*/*.*/*自卑*/*的*/*羞耻*/*感*/*。*/
/*2*/*.*/*依赖*/*的*/*恐惧*/*感*/*。*/
/*(1)*/*加强*/*爱国主义*/*的*/*宣传*/*教育。*/
/*(2)*/*加强*/*正确*/*的*/*理想*/*、*/*信念*/*、*/*人生观*/*、*/*价值观*/*的*/*宣传*/*教育*/*。*
"*第*+*数词*+*条*"视为词表词,但作为文中陈述的标号时不标注*ord*。仅当其在文中被引用时才作为*ord*标注。例如:
/*第一条*/*、*/*消费者*/*永远*/*是*/*对*/*的*/*;*
/*第二条*/*、*/*如果*/*消费者*/*真*/*的*/*错*/*了*/*,*/*请*/*参照*/[ord*第一条*]/*。*/
- 注:当上述数字表示等级序号时,则要标注为*ord*。例如:
*污秽*/*等级*/*:*/[ord*Ⅰ*]/*、*/[ord*Ⅱ*]/*、*/[ord*Ⅲ*]/*、*/[ord*Ⅳ*]/*。*
### 8.16人名、地名、机构名中的数字,不单独标注int
/[P*佐腾一郎*]/
/[L*梅竹蹊六十七号茶花庄*]/
/[O*子弟一中*]/
/[O*三明市*]/
/*任*/*队长*/*的*/ [O*1205钻井队*]/
### 8.17外文字符串的标注
由于外文的词与词之间都有空格作为分隔符,因此无需再去切分,只在标点符号的前后加切分标记。遇到字母词、名称缩写等情况也不作切分,如:/COM/经济/(网络经济)、/E/产品/(电子产品)、/卡拉/OK/等。
/Good morning/ ,/everyone/./
/*最近*/*引进*/*一*/*台*/JT-ESWL-*Ⅲ*/*型*/*体*/*外*/*震波*/*粉碎*/*肾结石*/*机*/*,*/
"*/[L *ZHONG* HUA *REN* MIN *GONG* HE*GUO]/"/*,*/*这是*/[L*中华人民共和国*]/*的*/*汉语拼音*/*。*/*
"*/Brother/*,*/I *love* you *all* the *time/*,*/ Thank *you* very *much/*!*/"/ "/Happy *birthday* to*you/*!*/"
/Dip *one* end *of* a *straw* in *the* solution/./Blow *gently* through *the* straw/./ */A* soap *bubble* forms/./What *happens* when *you* keep *on* blowing/?/
/The *bubble* bursts *because* the *pressure* inside *the* bubble *is* more *than* the*pressure *outside* the*bubble/./
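上述"外文串内部不切分、仅在标点前后加切分标记"的处理,可用一个极简草稿示意。这里假设输入外文串的词与标点之间已用空格分开,函数名为假设:

```python
import re

# 示意草稿:仅在标点前后加"/",词与词之间不再切分
def mark_foreign(s: str) -> str:
    body = re.sub(r'\s*([,.!?;:,。!?;:])\s*', r'/\1/', s.strip())
    return '/' + body.strip('/') + '/'

print(mark_foreign('Good morning , everyone .'))
# 期望输出:/Good morning/,/everyone/./
```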
### 8.18数学公式和机型标号均作为一个整体来切分和标注
例如:
/*△*S/=/[len*12*(*S1*+*S2*)*mm*]/
/*IEC298*.*265*.*129*.*694. *420*.*56.* 529*.*932*/
/*GB3804.* *3906*.*11022*/
/IEC60129A2/*(*/[dat*1996*]/)*UES*-*K3*/*2/ /UEMC40K8U*/*1*/ */1* V/FJ220001R2/
/*SFL12*/*17.5*/IVD *P575303RI/ /S*FL24A/IVDP5753/O2RI/
## 第九章 分词歧义消解细则
本章中的歧义切分实例是从微软亚洲研究院237万词训练语料、10万词测试语料和20万词散页语料中抽取出来的。这些歧义字段可粗分为交集型歧义(OAS)和组合型歧义(CAS)两大类。交集型歧义又包含用正反向最大匹配(MM)算法侦查不到的所谓隐藏的OAS。下面就分别介绍不同歧义字段的消解规则。
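正、反向最大匹配(MM)以及据此侦查交集型歧义的基本思路,可用下面的草稿示意(词表与例句均为假设,并非HanLP的实现)。当两种切分结果不一致时,即提示该字段存在交集型歧义;而所谓隐藏的OAS正是两种切分一致却仍然错误、因而无法用此法侦查到的情形(见9.1.2):

```python
def fmm(text, lexicon, max_len=5):
    """正向最大匹配切分(示意草稿)。"""
    i, out = 0, []
    while i < len(text):
        for j in range(min(len(text), i + max_len), i, -1):
            if text[i:j] in lexicon or j == i + 1:   # 未命中词表时退化为单字
                out.append(text[i:j])
                i = j
                break
    return out

def bmm(text, lexicon, max_len=5):
    """反向最大匹配切分(示意草稿)。"""
    j, out = len(text), []
    while j > 0:
        for i in range(max(0, j - max_len), j):
            if text[i:j] in lexicon or i == j - 1:
                out.insert(0, text[i:j])
                j = i
                break
    return out

lexicon = {'结合', '成分', '分子', '结', '合', '成', '分', '子'}
text = '结合成分子'
f, b = fmm(text, lexicon), bmm(text, lexicon)
print(f, b)        # ['结合', '成分', '子'] ['结合', '成', '分子']
if f != b:
    print('检测到交集型歧义字段:', text)
```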
### 9.1交集型歧义字段(OAS)
#### 9.1.1交集型歧义字段示例
由于交集型歧义字段的例子太多,不便穷举,所以下面只列举少量实例供参考。
(1)/矛头/所/指/正是/以/包/代/管/、/负/盈/不/负/亏/、/
(2)/[L四川]/一/私营企业/家/向/下岗/女工/捐款/
(3)/柚木/购/进/后/市场价格/大/跌/,/
(4)/图/为/[O保险公司]/向/受灾/企业/赔/付/现场/
(5)/地方政府/亟需/在/加强/压/锭/监管/力度/方面/下功夫/,/
(6)/与/厂/内/存留/的/旧/纱/机/一并/销毁/。/
(7)经/请示/,/自行/将/本/厂/经/改造/的/应/压缩/设备
(8)擅自/新/增/棉纺/生产能力/,
(9)/有人/钻/政策/空子/、/骗/财政补贴/。
(10)日益/猖獗/的/走私/犯罪/活动/,
(11)/他/在/教务/活动/中/积极/研究/、
(12)对/全/山/的/商业/网点/和/摊/区/重新/进行/了/规划/和/建设/。
(13)/加强/各级/领导班子/建设/。
(14)全体/员工/开展/了/"/人家/学/我们/,/我们/怎么办/"/的/大/讨论/,
(15)/[O欧佩克]/提高/原油/配额/和/暖冬/等/因素/影响/,
(16)/保护/国家/和/人民群众/的/生命/财产/安全/。
(17)/以/维护/民族团结/为/己任/,
(18)/各级/领导干部/要/站/在/党/和/国家/全局/的/高度/,
(19)/只有/坚持/解放/思想/、/实事求是/的/思想/路线/,
(20)/呈现/了/"/部队/添/战斗力/,/企业/增/生产力/,
(21)/共建/双方/通过/自上而下/层/层/签约/,
(22)/电力/部门/还/专门/建立/了/正规/的/转业军人/业务/培训/机制/,
(23)/这/条/线/不/停电/,/官兵/跳伞/太/危险/了/。
(24)如同/[L华中]/电网/强大/的/发电机/群/按照/同一/频率/转动/一样/,
(25)通过/举办/一些/全/集团/参与/的/拥军/活动/,
(26)/在/全国/工业/[ord500]/强/中/名列前茅/的/大型/企业集团/。
(27)/本/次/检测/中/性能/系数/最高/者/。
(28)/不是/主张/所有/的/会议/都/开/成/电视电话/会议/。
#### 9.1.2隐藏的交集型歧义字段
隐藏的交集型歧义字段是指那些用正、反向最大匹配(MM)算法无法侦查到的交集型歧义字段。
注:以下例句中,每组第一行为原始的错误切分,第二行为改正后的切分。
(1)/[L新疆]/经济/社会/发展/一定/会展/现出/越来越/美好/的/前景/
/[L新疆]/经济/社会/发展/一定/会/展现/出/越来越/美好/的/前景/
(2)/成立/了/专/司空/中和/地面/服务/质量/监管/的/服务/质量/督察/办公室/,/
/成立/了/专/司/空中/和/地面/服务/质量/监管/的/服务/质量/督察/办公室/,/
(4)/其内/容或/规则/已/译/成/[int15]/国/语言/,/
/其/内容/或/规则/已/译/成/[int15]/国/语言/,/
(5)/这/一发/现有/可能/加速/艾滋病/新药/和/疫苗/的/研制/。/
这/一/发现/有/可能/加速/艾滋病/新药/和/疫苗/的/研制/。/
(6)/[L韩国]/对/日出/口中/,/
/[L韩国]/对/[L日]/出口/中/,/
(7)/恰/在/此时/,/奉/党委/派赴/[O共产国际]/工作/的/[P张太雷]/于/[dat8月]/回国/
/恰/在/此时/,/奉/党/委派/赴/[O共产国际]/工作/的/[P张太雷]/于/[dat8月]/回国
(8)/站/在建/设有/[L中国]/特色/社会主义/全局/
/站/在/建设/有/[L中国]/特色/社会主义/全局/
(9)/金融/危机/就/可能/会演/变为/经济危机/
金融/危机/就/可能/会/演变/为/经济危机/
(10)/如/少数/司机/在/东侧/门楼/外道/路上/违章/占/道/停车/,/
如/少数/司机/在/东侧/门楼/外/道路/上/违章/占/道/停车/,/
(11)/有关/部门/要/下决心/下力/气管/好/电子/游戏/室/。/
有关/部门/要/下决心/下/力气/管/好/电子/游戏/室/。/
(12)/相近/似的/设施/化/保护/菜地/面积/达/[are1300万亩]/;/
/相/近似/的/设施/化/保护/菜地/面积/达/[are1300万亩]/;/
(13)/表明/了/财政部/门对/落实/科教/兴/国/战略/采取/的/实际/行动/。/
/表明/了/财政/部门/对/落实/科教/兴/国/战略/采取/的/实际/行动/。/
(14)儿时/站/在家/门口/向/四面/望/,/
儿时/站/在/家门/口/向/四面/望/,/
(15)/就/不可能/正确/地理/解和/执行/党/的/路线/方针/政策/,/
/就/不可能/正确/地/理解/和/执行/党/的/路线/方针/政策/,/
(16)/使得/高校/中原/有的/个别/的/知识/物化/行为/迅速/扩展/为/一种/专门/职能/。/
/使得/高校/中/原有/的/个别/的/知识/物化/行为/迅速/扩展/为/一种/专门/职能/。/
(17)/这是/大都/市里/的/一个/皮货/修理/店/,/
/这是/大/都市/里/的/一个/皮货/修理/店/,/
(18)/需/招集/体制/女性/业务员/[int四名]/,/
/需/招/集体/制/女性/业务员/[int四名]/,/
(19)/连同/应/交费/用以/划/支票/寄/还/。/
/连同/应/交/费用/以/划/支票/寄/还/。/
(20)/我们/一口气/跑/到家/门口/的/一/棵/大树/前/,/
/我们/一口气/跑/到/家门/口/的/一/棵/大树/前/,/
(21)/"/唉/,/要是/好/好/复习/,/可不/会考/得/这样/糟/。/"/
"/唉/,/要是/好/好/复习/,/可/不会/考/得/这样/糟/。/"/
(22)/特制/定本/规定/。/
/特/制定/本/规定/。/
(23)/股东/会所/议事/项/作/成/会议/记录/,/
/股东/会/所/议/事项/作/成/会议/记录/,/
### 9.2组合型歧义字段(CAS)
组合型歧义字段在真实文本中大量出现,有的是比较常见的,有的是非常罕见的。尤其是有的CAS即使根据上下文也很难判断其正确切分,如正在、就是、还是、只有、只是、一道、一起等等。因此有必要针对那些高频的CAS逐条加以说明。
#### 9.2.1常见的组合型歧义字段
下面对一些常见的组合型歧义字段加以解释。
##### 9.2.1.1数词一和量词组成的CAS
词表词一个、一种、一类、一批、一次、一套、一阵等作为数量短语不予切分,也不标注int。其中有些量词重迭形式也是词表词,如一个个、一天天,应保持其整词形式,而其它非词表词的数量短语和量词重迭形式都是要切开的。(详见8.9)
/*一个*/*人*/
/*一个个*/*观众*/
/*一天天*/*暖和*/*起来*/
/*一套*/*特种*/*邮票*/
/*一次*/*讨论*/
/*一*/*匹*/*黄骠马*/
/*一*/*栋*/*栋*/*楼房*/
词表词一起、一道、一样、一手、一面、一口、一头、一气等既可以用作连词、副词、名词或形容词等,又可以切分开来成为数量短语。但像一套这样的词表词,除了数量短语的用法以外,不再有其它用法,因此不存在切分问题。词表词有一套是有本事的意思时,也不切分。这类词的切分问题只能逐个加以描述。
##### 9.2.1.2动量词次与频率fre的标注
动量词中只有*次*被标注为频率*fre*,如[fre*再次*]*、*[fre*数次*]*、*[fre*一次次*]*、*[fre*无数次*]*、*[fre*好几次*]*,而*遍、回、趟*不标注为频率,一*/*遍、一*/*回、一*/*趟、一次(词表词不切分)、一*/*遍*/*又*/*一*/*遍、一*/*回*/*又*/*一*/*回、一*/*趟*/*又*/*一*/*趟,一次*/*又*/*一次*也不标注为*fre*。这条规则的理由如下:
(1)遍表达的是动作从开始到结束的全过程;次、回描写动作的重复;趟只用于表示行走意义的动词。*去一趟*可以说成*去一次*、*去一回*,但*做一次*、*做一回*不能说成*做一趟*。
*遍、次、回*有时可通用,如*你再唱一遍*,可以说成*你再唱一次*或*你再唱一回*而意思不变。但单纯表示动作数量时,只用*次*,不用*遍*,如*他表示了多次*、*敌人的三次进攻都被击退了*。
*次*与*回*区别在于,*次*既用于书面语又用于口语;*回*只用于口语。如*多次、数次*等带文言色彩的短语,就不能说成*多回、数回*。
(2)*这本书我看了一遍*,是指从书的开头到末尾的全过程。*这本书我看了一次*,着重指看的次数,不指看的全过程。
##### 9.2.1.3一(1)+物理单位元量词构成度量表达式
当一(1)后面是长度、重量等物理单位元时应分别按度量表达式标注为*len*,*wei*,如[*len*一米*]*、*[wei*1*公斤*](见8.10)。
#### 9.2.2CAS示例
下面是一些常见CAS的切分规则和示例。
(1)人为作形容词时不切分。
(1a)而是/深究/灾难/中/的/人为/因素/。(1b)以/人/为/本/
(2)为人:
(2a)也/包括/最/基本/的/为人/处事/的/行为/准则/
(2b)/始终/主宰/着/他/的/为人/之/道/和/为/艺/之/方/。(2c)/我/把/不大/为/人/所/知/的/一些/往事/写/下来/,
(3)一起:作名词和副词使用时不切分,作为"数+量词"时切分。(3a)/和/市民/一起/聊天/,/听取/群众/反映/。
(3b)/[dat4月17日]/发生/在/[L北京]/[L海淀区]/[L阜石路]/的/一/起/车祸/,/
(4)一点:形容词,意思是少许,不切分。但作为数量短语时要切开。(4a)/文/中/还有/一点/小/差误/,/也/顺便/提/提/。/
(4b)/都/清楚/地/意识到/了/这/一/点/,/
(5)一道:作副词使用时不切分;作为"数+量词"时切分。
(5a)/而且/要求/未来/的/丈夫/同/她/一道/挑起/照顾/[P穆]/大爷/的/担子/。
(5b)/已/成为/百里/油田/的/一/道/风景/线/。
(5c)/在/我/的/前额/刻下/了/一/道/道/弯曲/的/青春/印记/。
(6)一面:作名词和副词使用时不切分,作为"数+量词"时切分。
(6a)/虽然/在/现代汉语/里/含有/贬义/,/但/其/积极/的/一面/应该/肯定/。
(6b)/一面/学习/,/一面/实践/,/贯彻/到/筹组/[L澳门特别行政区]/的/工作/中/去/。(6c)/爱/是/一/面/辽阔/光滑/的/回音壁/,/微小/的/爱/意/反复/回响/着/,
(7)一口:作形容词和副词使用时不切分,作为"数+量词"时切分。(7a)/[P崔]/又/一口/回绝/并/与/其/发生/争吵/。
(7b)要不/则/是/一/脸/匪/相/或者/一口/痞/气/,
(7c)不由得/倒/吸/了/一/口/冷气/打/了/一个/寒战/,
(7d)一/口/大/锅/解决/了/[int两家]/的/再就业/难题/。
(8)一手既有名词和副词的用法,又有"数+量词"的用法,但在文本中一律不予区分。
(8a)所有/的/为/官/为/政/者/都/能/写/一手/好/文章/,(8b)/整个/事件/是/他/一手/策划/的/,
(8c)/他/一手/划水/,/一手/搂/着/女/青年/游/向/岸边/。
(9)一头:作名词和副词使用时不切分,作为"数+量词"时切分。(9a)/二来/街道/一头/联/着/片/内/的/企业单位/,/一头/联/着/居民/,
(9b)/此后/,/他/一头/钻进/常年/云雾/缭绕/的/云雾山/,/拜访/民间/郎中/,(9c)/一/头/经过/救助/已/恢复/健康/的/灰/鲸/『/[P杰杰]/』/
(10)一路:作名词和副词使用时不切分;作为"数+量词"时切分。
(10a)有时/公共汽车/挤/不/上/,/干脆/快步/当/车/一路/小跑/。
(10b)我们/一路/攀登/来到/[P王永祥]/简陋/的/护林/小屋/。
(10c)另/一/路/是/探索/[L火星]/、/[L木星]/等/星球/。
(11)一下:用作副词和数量词使用时不切分;当一作副词下作动词时要切分。
(11a)/只要/通融/一下/,/既/能/得到/一/笔/大钱/,/又/能/保持/友情/。
(11b)/相互/拍打/一下/:/"/你/猜/[rat几比几]/?/"
(11c)/书包/斜/背/在/肩/上/,/带子/太/长/,/随着/步子/一/上/一/下/跳跃/着/拍打/在/屁股/上/。
(12)一片:作形容词使用时不切分;作数量短语时切分。
(12a)/台上/台/下/那/一片/亲切/和谐/的/气氛/,
(12b)/融入/一片/[dat夏日]/的/浓绿/之中
(12c)/地板/上/看/不/到/一/片/碎/纸屑/。
(12d)/宽宽大大/的/粽/叶/,/她/总/要/一/片/片/洗/净/。
(13)一则:作副词使用时不切分;作数量短语时切分。
(13a)/一则/表达/对/同乡/画/马/大师/[P徐悲鸿]/的/敬仰/,/二/则/愿/家乡/建设/如/骏马/奔腾/一日千里/。
(13b)/[L法国]/报纸/刊/出/一/则/特写/,
(14)不见:是动词*见*的否定形式,不切分。当它同前面的动词形成*V/得/见*、*V/不/见*的可能式动补结构时,要切分。类似的可能式动补结构还有*V/得/下去*、*V/不/下去*,*V/得/来*、*V/不/来*,*V/得/起*、*V/不/起*,*V/得/了*、*V/不/了*,*V/得/成*、*V/不/成*,*长/得/大*、*长/不/大*等。
(14a)/全/都是/"/不见/兔子/不/撒/鹰/"/。
(14b)/人武部/就/看/不/见/一/盏/长明灯/,
(15)不对:作形容词表示不正确时不切分;如果对作为介词,就要切开。
(15a)父母/这么/想/当然/不对/,/可/也/不能/全/怪/他们/的/愚钝/和/落后/。
(15b)/中国/主张/和平/的/外交/政策/,/中国/不/对/任何/国家/构成/威胁/。
(16)不等:作形容词表示不相同时不切分;如果等作为动词,就要切开。
(16a)/按照/用户/要求/生产/大小/不等/的/编织/塑料/袋/,
(16b)不/等/妻子/说/什么/,/他/自己/悄悄/地/找/开/了/出路/。
(16c)/时间/不/等/人/
(17)不下:表示不少于时不切;作为动词下的否定式和可能式动补结构(见14),就要切开。
(17a)/每天/她/经手/的/业务/不下/[int百笔]/,
(17b)/架子/还/放/不/下/,/面子/还/丢/不/开/,
(17c)/刑/不/上/大夫/,/礼/不/下/庶人/
(17d)/[L俄罗斯]/整个/国家/开支/居/高/不/下/,
(18)不成:作动词、形容词和助词使用时不切分;当它作为可能式动补结构(见14)时,一律切开。
(18a)/难道/自己/这/一辈子/就/这么/过/不成/?
(18b)/毛虾/已/不成/汛/,
(18c)攀/"/亲/"/不成/反/折本/,
(18d)/往往/是/有/点/而/形/不/成/网/,
(19)上下:用作动词时一律切开,如"上/下/火车";用作名词(包括并列意义)时则不切,如"上下/两册"。
(19a)/经过/上下/的/共同/努力/,
(19b)/上/下/车/、/船/,/须/待/车/、/船/停/稳/后/先/下/客/后/上/客/,
(20)从前:作时间名词时不切分;如果从作介词前作方位词,就要切开。
(20a)/有的/是/从前/在/队/中/当/板凳/球员/,
(20b)/导致/美元/对/马克/的/汇价/从/前/一/交易/日/的/[rat1比1·7766]/降/至/[rat1比1·7762]/。
(20c)/从/前/不久/[L深圳]/一家/公司/大规模/地/恶意/抢/注/商标/案/,
(21)以为:作动词时不切分。
(21a)/以为/强大/的/[P卡斯珀罗夫]/恢复/了/他/的/本来面目/。
(21b)有/一些/干部/想/不/通/,/以为/是/搞/形式/,/出风头/。
(21c)/我们/引/以/为/自豪/的/风格/多少/应/有些/改变/了/。
(21d)/代表/们/以/为/人民/高度/负责/的/精神/,/提出/批评/和/意见/。
(22)正当:作形容词时不切分。
(22a)/我们/是否/能/以/某/种/不/正当/的/方式/反对/,
(22b)/正/当/禾苗/生长/关键/时期/,/
(23)正在:
(23a)/对/各地/已/建成/尚未/售出/和/正在/建设/的/住房/,
(23b)/记者/正/在/回/[L巴黎]/的/高速/列车/上/。
(23c)/[O世界卫生组织]/正/在/[L科特迪瓦]/召开/国际/会议/,
(24)会上:
(24a)/[L苏州]/等/省市/及/有关单位/在/会上/介绍/了/经验/,
(24b)/在/[O国际泳联]/[dat二十四日]/举行/的/听证/会/上/,
(25)台上:
(25a)/表演/完/节目/后/竟/在/台上/掩/面/而/泣/。/
(25b)/预赛/是/在/有/围/绳/的/拳击/台/上/
(26)走向:用作名词时不切分;用作动词+介词时,一律切分。
(26a)/[L北京]/输/气/管道/工程/线路/走向/示意图/(/示意图/:/[P孙伟]/绘/)/
(26b)/迈出/了/我国/航天/事业/走/向/世界/的/[ord第一步]/。
(27)才能:用作名词时不切分;用作副词+能愿动词时,必须切开。
(27a)/但/如果/施展/才能/的/空间/很/大/,/而且/能/充分/发挥/所/学/专长/,/不妨/一/试/。
(27b)/勤奋/才/能/有/真知灼见/;
(28)人才:用作名词时不切分。
(28a)要/想/成为/[dat跨世纪]/人才/,/光/有/专业知识/不够/,
(28b)/这/恐怕/只有/浪漫/的/[L法国]/人/才/想/得/出来/。
(29)上来:作趋向动词和动词时不切分;当上作方位词时,就要切开。
(29a)/一/届/新/班子/上来/以后/,/
(29b)/把/工作/重点/转移/到/社会主义/现代化/建设/上/来/,/
(30)上去:作趋向动词和动词时不切分;当上作方位词时,就要切开。
(30a)/显然/是/上/了/学/的/[L瑶族]/娃子/写/上去/的/。/
(30b)/把/科研/技术/成果/转移/到/社会/应用/上/去/。/
(31)上前:
(31a)/他/的/四个/弟兄/挨次/伸出/手/来/上前/祝贺/
(31b)/当即/冲/上/前/去/,/扭/住/一/名/歹徒/不/放/,/
(32)上路:作动词时不切分;*上*作动词时要切分。
(32a)/我/背/起/你/的/薄被/送/你/上路/,
(32b)/过去/村里/也是/上/路/打场/,/
(33)得了:取助词用法时不切分;但作为动词+助词(了)和可能式动补结构(见14)时要切分。
(33a)/没/叫/到/你/的/时候/,/安心/等/着/就/得了/。
(33b)/经/医生/诊断/他/得/了/胃癌/。
(33c)/书记/何以/承受/得/了/,
(34)得出:作动词时不切;作可能式动补结构(见14)时要切分。
(34a)[L天津]/近几年/的/实践/得出/了/肯定/的/答案/。
(34b)为了/让/[dat今年]/蒜农/的/产品/卖/得/出/、/卖/出/好/价钱/,/
(35)人称:作名词时不切分;当*称*作动词时要切分。
(35a)作者/用/第一/人称/的/叙述/手法/,
(35b)据/用/过/的/人/称/,/打/国际/长途/如/从/[L北京]/到/[L美国]/,/每/分钟/只需/传统/电话/费用/的/[fra1/4]/。
(36)同行:用作名词时不切分,读作tonghang;用作动词时读作tongxing,一律切分。
(36a)/这时/恰/有/同行/到来/,/只好/借/[mon一元钱]/给/他/。
(36b)/笔者/与/她/骑车/同/行/。
(37)从小:
(37a)/图文并茂/、/声/形/兼备/的/写作/能力/将要/从小/培养/,
(37b)/企业/从无到有/,/从/小/到/大/,
(38)中学:
(38a)/在/长期/的/中学/教学/实践/中/我/体会/到/,
(38b)/引导/他们/在/实践/中/学/会/正确/行使/民主/权利/。
(39)上门:
(39a)/营业员/们/便/主动/上门/收款/。
(39b)/打/出/了/名气/,/找/上/门/来/的/工程/一个/接/一个/。
(39c)我/冲/出/门/去/,/随手/拉/上/门/。/
(40)声响:作名词使用时不切分。
(40a)/而/轻轻/地/挪动/椅子/走开/,/无/一点/声响/。/
(40b)/"/哗哗/"/的/潮水/声/响/成/一片/,
(41)就此:作副词使用时不切分。
(41a)/国际/足球/界/一些/有识之士/就此/产生/一种/忧虑/,
(41b)我们/也/欢迎/科技界/人士/就/此/问题/发表/意见/和/建议/。
(42)高层次作形容词时不切分;当该词前有副词修饰时需切分。
(42a)/着眼点/放/在/培养/造就/大批/高层次/科技/人才/上/。
(42b)/实现/更/大/规模/、/更/高/层次/的/扩张/和/发展/。/
(43)有的:
(43a)/有的/用/汉语/,/有的/用/俄语/,
(43b)/是/[L北大荒]/独/有/的/风味/。
(44)的话:作助词使用时不切。
(44a)/如果/要/使/谈判/取得/迅速/进展/的话/,
(44b)/[P卡比拉]/先生/对/我/的/话/是/持/认真/态度/的/。
(45)话说:整体作动词使用时不切。
(45a)/话说/当年/,/他/言语/铿锵/:/"/在/当时/,/一切/都/得/打破常规/。
(45b)用/他/自己/的/话/说/,
(46)标本:意思为生物/标本时不切;表示"直接和根本"并列的意思时要切开。
(46a)/他们/还/结合/挂图/、/标本/进行/讲解/。
(46b)/反/腐败/要/坚持/标/本/兼/治/,
(47)上将:作为军衔使用时不切分。
(47a)/[O中央军委]/副/主席/、/国务委员/兼/[O国防部]/长/[P迟浩田]/上将/
(47b)/[L中国]/在/人口/问题/上/将/面临/新/的/挑战/。
(48)将军:作为军衔使用时不切分。
(48a)/党/和/国家/领导人/、/解放军/元帅/、/将军/、/政府/省/部级/干部
(48b)/将/军体/与/群体/紧密/结合/,/开办/体育/知识/讲座
(49)之一:
(49a)/企业/领导班子/不/适应/社会主义/市场经济/的/要求/是/主要/原因/之一/。
(49b)/游人/视线/随/之/一/收/,/"/[L太和宫]/"/[int三个]/大字/豁然/在/目/。/
(49c)/我/不禁/为/之/一/震/。
(50)到家:作为形容词不切分。
(50a)/现在/不行/,/你/技术/不/过关/,/说明/练/得/还/不/到家/,/
(50b)/果不其然/,/此/儿/到/家/就/猝不及防/地/给/了/他/妈/一/刀/。
(50c)/[P赵匡胤]/终于/将/义/妹/[P京娘]/送/到/家/。
(51)在家:
(51a)/一直/在家/等待/厂子/通知/上班/的/她/再/也/沉/不住/气/了/
(51b)/实现/访客/在/家/门口/与/住户/可/视/
(51c)/把/她/一/人/放/在/家/中/[P孙威锋]/放心不下/,
(52)人均:
(52a)/学生/拥有/计算机/的/人均/占有率/最高/
(52b)/[int两]/人/均/未/达到/[fra2/3]/的/当选/票/数/,
(53)中用:
(53a)/"/察/古/知/今/"/基本上/不/中用/了/,
(53b)/天文学/上/把/[L宇宙]/中/用/光学/方法/看/不/到/的/物质/称/做/暗/物质/,
(53c)/西/体/[L中]/用/我/也/反对/,
(54)前去:
(54a)/让/[P胡洁青]/前去/扶持/、/帮忙/。
(54b)一/名/应邀/到会/的/[L北京]/小学生/激动/地/跑/上/前/去/请/他/签名/。
(55)词表词"受过"只有"代人受过"的意思。当动词受和助词过构成"动+助"结构时,一律切开。
(55a)/它们/代/四/奸/受过/,
(55b)/[P鲁迅]/虽然/在/[dat二十年代中期]/受/过/[P托洛茨基]/的/一定/影响/,
(56)结果:有名词和动词两种用法,都不切分,动词结果的意思是杀死,而不是结出果实的意思。作为后一个意思,名词果是动词结的宾语,所以需切分。
(56a)/矫枉过正/的/结果/,/是/大家/几乎/忘/了/怎么/吃/,/
(56b)/种/果树/一般/要/三年/才/能/结/果/,
#### 9.2.4就是、只有、只是、还是的切分规则
##### 9.2.4.1就是
就是作副词、连词、助词使用时不切分。但作动词时,*就*是副词、*是*是动词,一律切分。
(A)作副词时,共有6个义项:
(i)单用,表示同意,对;
(ii)表示坚决,不可更改;
(iii)强调肯定某种性质和状态,含有反驳意味;
(iv)强调迅速果断;
(v)确定范围,排除其它;
(vi)表示没有别的情况。
(1)我/一定/办到/,您/放心/就是/。/
(2)/反正/姥爷/就是/看/我/不/顺心/,/一点/也/不/喜欢/我/。/
(3)/望/着/车/来/车/往/的/马路/,/一/站/就是/[int几个小时]/。/
(4)/就是/节目/诉/求/为/非常/鲜明/的/单一/主题/,
(B)作连词有2个义项:
(i)表示假设的让步;即使(后面常用也作呼应);
(ii)表示一种极端情况;纵然。如:
(5)不是/播种/,/就是/锄地/;/不是/下/田/挖/野菜/,/就是/上山/打柴/。
(6)这个/建议/好/倒/是/好/,/就是/远水/不解/近/渴/。/
(C)就是作动词时,一律切分:*就*是副词,*是*是动词。如:
(7)/[O光华国中]/职员/[P杨一中]/就/是/买/菜/变成/[O慈德]/会员/的/一/例/。
(8)/多元化/的/意思/就/是/有/了/更/多/的/选择/,
(9)/最/特别/的/就/是/黄金/压制/的/邮票/。
(10)/最/关键/的/原则/就/是/「/避/凶/趋/吉/」/,
(11)这/就/是/[L海尔-波普彗星]/。/
##### 9.2.4.2只有
只有作为一个词表词有副词和连词两个义项。当它用作动词时,一律切开。
(A)只有作副词,相当于只好,表示唯一的选择。如:
(1)/家属/最后/只有/寄/望/对岸/[O海协会]/能/请/[L大陆]/渔船/协/寻/。
(2)/[L中国]/的/体育/长期/是/国家/一/家/办/,/发达国家/是/国家/不/办/,/只有/社会/办/,/现在/国际/体育/的/潮流/是/国家/与/社会/共同/兴办/。/
(3)无/雪/的/[dat冬天]/是/难挨/的/,/我/只有/在/心中/落/着/一/场/场/大雪/。/
(4)协办员/和/见习员/在/通过/[int三道]/关/后/,/还要/经由/主办员/挑选/,/没有/主办员/
挑选/的/也/只有/待岗/。/
(5)如果/"/邪恶/的/敌人/对/[L伊]/发动/侵略/,/[L伊拉克]/将/别无选择/,/只有/用/其/全部/的/潜力/、/经验/和/信仰/进行/自卫/"/。/
(B)"只有"作连词用表示必要条件,下文常用副词才、方呼应。如:
(6)只有/掌握/了/最/先进/的/科学/,/我们/才/能/有/巩固/的/国防/。/
(7)高尚/的/世界/只/对/高尚/的/人们/存在/,/高尚/的/精神/境界/只有/高尚/的/人们/才/有/
资格/领略/。/
(C)"只有"用作动词时一律切开。这时"只"做副词、"有"作动词。如:
(8)/完成/管理/的/比率/只/有/[per百分之八十九]/,
(9)/车行/时速/只/有/[len卅到五十公里]/左右/;
(10)/目前/[O基隆邮局]/只/有/一个/集邮/柜台/,
(11)/因/[O中嵙国小]/整个/学区/只/有/一个/[L中嵙里]/,
##### 9.2.4.3还是
还是有三种用法:连词、副词和动词。作动词时一律切分。
(A)作连词用时表示选择,通常跟无论、不管等连用。带连词还是的句子,除疑问句外,还是都可以换成或者,意思不变。例如:
(1)无论是/说/新/话/,/提/新/观点/,/还是/放弃/前人/和/本本/上/的/过时/的/观点/、/错误/的/结论/,/都/需要/勇气/。/
(2)农民/[P张戎梅]/说/:/"/我们/村/不论/是/养猪/还是/种菜/的/,/现在/都/把/眼睛/盯/在/了/铁路/两头/。/"/
(3)不管/是/开工/还是/竣工/,/既/有/庆典/,/又/有/报导/,/或/称/世纪/工程/,
(4)他们/不但/是/我们/公司/发展/的/"/动力/之/源/"/,/还是/我们/学习/的/好榜样/!/
(B)还是作副词用时有三个义项:
(i)表示行为、动作或状态继续保持不变,相当于"仍然"、"依然"。
(ii)表示经过比较后做出的选择。
(iii)加强语气,相当于"到底"、"究竟"、"毕竟"。
副词还是用在动词、形容词前,可以省作*还*;而用在主语前不能省作*还*。
(5)/但/现场/交通/还是/十分/杂乱/。
(6)/该/基金/还是/可以/支应/灾民/最高/[mon一百万元]/的/贷款/额/,
(7)/很多/居民/还是/使用/地下水/,
(8)/[P陈]/还是/不/改/顽皮/个性/,
(9)/多数人/还是/喜欢/为/宝宝/选/个/金/饰/,
(10)/[P陈小弟]/的/父/母亲/还是/勇敢/地/生/下/他/,
(C)还是用作动词时一律都要切开,即*还*作副词使用,*是*作动词使用。句型"是/v……的"可以帮助我们判断还是在句中是不是一种动词的用法。
(11)/关键/还/是/在/府/会/双方/态度/,
(12)/初/到/部队/,/[age十五六岁]/,/还/是/个/没/见/过/世面/的/毛孩子/。/
(13)/她/不/相信/歌剧/这/门/综合/艺术/会/落入/低谷/,/认为/关键/还/是/提高/歌剧/自身/的
/品质/。/
(14)但/在/日常/工作/中/,/我/深感/除了/忙/还/是/忙/,/搞/得/焦头烂额/,/一天到晚/自己/不/属于/自己/。/
##### 9.2.4.4只是
只是有三种用法:副词、连词和动词。作动词时统统切分。
(A)只是作副词使用时有两个义项:
(i)表示限定某种情况或范围,相当于仅仅是。句末用而已或罢了等配合,
表示语气更为缓和。
(ii)强调在任何条件下情况都不变,有总是的意思。
(1)/只是/作为/预定/分娩/日/的/参考/。
(2)/只是/没有/焢/窑/经验/的/[P张]/课/长/,
(3)/他/虽/表示/民意调查/结果/数据/只是/具有/参考/价值/,
(4)/施工/初期/只是/修剪/树枝/,
(B)只是作连词用,用在后一分句,表示轻微的转折,补充修正上文的意思,与不过的用法相近。
(5)/记者/在/重灾区/[L大河乡]/注意/到/,/群众/有/饭/吃/,/有/衣/穿/,/有/伤病/能/医治/,/只是/搭建/的/小/窝棚/难以/抵御/坝上/呼啸/的/寒风/。/
(6)[dat唐朝]/著名/诗人/[P李商隐]/『/夕阳/无限/好/,/只是/近/黄昏/』/的/诗句/是/对/黄昏/的/叹息/和/无奈/,/
(C)只是用作动词时一律要切开,即只作副词,是作动词。如:
(7)/他/只/是/[dur一个月]/领/[mon二万多元]/的/工人/
(8)/事实上/[L盐埔乡]/公所/的/薪水/无/着落/只/是/冰山/一/角/,
(9)/这些/需求/不只/是/钱/或/资源/,/
================================================
FILE: docs/api/common/configurable.rst
================================================
.. _api/configurable:
configurable
====================
.. autoclass:: hanlp_common.configurable.Configurable
:members:
.. autoclass:: hanlp_common.configurable.AutoConfigurable
:members:
================================================
FILE: docs/api/common/conll.rst
================================================
.. _api/conll:
conll
====================
.. autoclass:: hanlp_common.conll.CoNLLWord
:members:
.. autoclass:: hanlp_common.conll.CoNLLUWord
:members:
.. autoclass:: hanlp_common.conll.CoNLLSentence
:members:
================================================
FILE: docs/api/common/constant.rst
================================================
constant
====================
.. automodule:: hanlp_common.constant
:members:
================================================
FILE: docs/api/common/document.rst
================================================
.. _api/document:
document
====================
.. currentmodule:: hanlp_common
.. autoclass:: hanlp_common.document.Document
:members:
================================================
FILE: docs/api/common/index.md
================================================
# hanlp_common
Common APIs shared between `hanlp` and `restful`.
```{toctree}
document
conll
configurable
constant
```
================================================
FILE: docs/api/hanlp/common/component.rst
================================================
component
=================
.. currentmodule:: hanlp.common
.. autoclass:: hanlp.common.component.Component
:members:
================================================
FILE: docs/api/hanlp/common/dataset.md
================================================
# dataset
This module provides base definitions for datasets, dataloaders and samplers.
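As a taste of the dataloaders documented below, here is a toy sketch. The sample dicts and the `token_id` field name are made up for illustration; the padding behavior follows the class docs rather than any specific corpus.
```python
from hanlp.common.dataset import PadSequenceDataLoader

# Toy samples: variable-length token-id sequences under an arbitrary key.
samples = [{'token_id': [1, 2, 3]}, {'token_id': [4, 5]}]
loader = PadSequenceDataLoader(samples, batch_size=2)
for batch in loader:
    print(batch['token_id'])  # sequences padded into a single batch tensor
```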
## datasets
```{eval-rst}
.. currentmodule:: hanlp.common
.. autoclass:: hanlp.common.dataset.Transformable
:members:
.. autoclass:: hanlp.common.dataset.TransformableDataset
:members:
:special-members:
:exclude-members: __init__, __repr__
```
## dataloaders
```{eval-rst}
.. currentmodule:: hanlp.common
.. autoclass:: hanlp.common.dataset.PadSequenceDataLoader
:members:
:special-members:
:exclude-members: __init__, __repr__
.. autoclass:: hanlp.common.dataset.PrefetchDataLoader
:members:
:special-members:
:exclude-members: __init__, __repr__
```
## samplers
```{eval-rst}
.. currentmodule:: hanlp.common
.. autoclass:: hanlp.common.dataset.BucketSampler
:members:
.. autoclass:: hanlp.common.dataset.KMeansSampler
:members:
.. autoclass:: hanlp.common.dataset.SortingSampler
:members:
```
## sampler builders
```{eval-rst}
.. currentmodule:: hanlp.common
.. autoclass:: hanlp.common.dataset.SamplerBuilder
:members:
.. autoclass:: hanlp.common.dataset.SortingSamplerBuilder
:members:
.. autoclass:: hanlp.common.dataset.KMeansSamplerBuilder
:members:
```
================================================
FILE: docs/api/hanlp/common/index.md
================================================
# common
Common base classes.
```{toctree}
structure
vocab
transform
dataset
component
torch_component
```
================================================
FILE: docs/api/hanlp/common/structure.md
================================================
# structure
```{eval-rst}
.. currentmodule:: hanlp.common
.. autoclass:: hanlp.common.structure.ConfigTracker
:members:
.. autoclass:: hanlp.common.structure.History
:members:
```
================================================
FILE: docs/api/hanlp/common/torch_component.md
================================================
# torch_component
```{eval-rst}
.. currentmodule:: hanlp.common.torch_component
.. autoclass:: hanlp.common.torch_component.TorchComponent
:members:
```
================================================
FILE: docs/api/hanlp/common/transform.md
================================================
# transform
```{eval-rst}
.. currentmodule:: hanlp.common
.. autoclass:: hanlp.common.transform.VocabDict
:members:
```
================================================
FILE: docs/api/hanlp/common/vocab.md
================================================
# vocab
```{eval-rst}
.. currentmodule:: hanlp.common
.. autoclass:: hanlp.common.transform.Vocab
:members:
:special-members:
:exclude-members: __init__, __repr__, __call__, __str__
```
================================================
FILE: docs/api/hanlp/components/classifiers.md
================================================
# classifiers
```{eval-rst}
.. currentmodule:: hanlp.components.classifiers
.. autoclass:: hanlp.components.classifiers.transformer_classifier.TransformerClassifier
:members:
```
================================================
FILE: docs/api/hanlp/components/eos.md
================================================
# eos
```{eval-rst}
.. currentmodule:: hanlp.components.eos
.. autoclass:: hanlp.components.eos.ngram.NgramSentenceBoundaryDetector
:members:
```
================================================
FILE: docs/api/hanlp/components/index.md
================================================
# components
NLP components.
```{toctree}
mtl/index
classifiers
eos
tokenizers/index
lemmatizer
taggers/index
ner/index
parsers/index
srl/index
pipeline
sts
```
================================================
FILE: docs/api/hanlp/components/lemmatizer.md
================================================
# lemmatizer
```{eval-rst}
.. currentmodule:: hanlp.components.lemmatizer
.. autoclass:: TransformerLemmatizer
:members:
```
================================================
FILE: docs/api/hanlp/components/mtl/index.md
================================================
# mtl
Multi-Task Learning (MTL) framework.
```{toctree}
mtl
tasks/index
```
================================================
FILE: docs/api/hanlp/components/mtl/mtl.md
================================================
# MultiTaskLearning
```{eval-rst}
.. currentmodule:: hanlp.components.mtl
.. autoclass:: hanlp.components.mtl.multi_task_learning.MultiTaskLearning
:members:
:special-members:
:exclude-members: __init__, __repr__
```
================================================
FILE: docs/api/hanlp/components/mtl/tasks/constituency.md
================================================
# con
Constituency parsing.
```{eval-rst}
.. currentmodule:: hanlp.components.mtl
.. autoclass:: hanlp.components.mtl.tasks.constituency.CRFConstituencyParsing
:members:
:exclude-members: execute_training_loop, fit_dataloader
```
================================================
FILE: docs/api/hanlp/components/mtl/tasks/dep.md
================================================
# dep
Dependency parsing.
```{eval-rst}
.. currentmodule:: hanlp.components.mtl
.. autoclass:: hanlp.components.mtl.tasks.dep.BiaffineDependencyParsing
:members:
:exclude-members: execute_training_loop, fit_dataloader
```
================================================
FILE: docs/api/hanlp/components/mtl/tasks/index.md
================================================
# tasks
Multi-Task Learning (MTL) tasks.
```{toctree}
task
constituency
dep
sdp
ud
lem
pos
tok
ner/index
srl/index
```
================================================
FILE: docs/api/hanlp/components/mtl/tasks/lem.md
================================================
# lem
Lemmatization.
```{eval-rst}
.. currentmodule:: hanlp.components.mtl
.. autoclass:: hanlp.components.mtl.tasks.lem.TransformerLemmatization
:members:
:exclude-members: execute_training_loop, fit_dataloader
```
================================================
FILE: docs/api/hanlp/components/mtl/tasks/ner/biaffine_ner.md
================================================
# biaffine_ner
Biaffine Named Entity Recognition.
```{eval-rst}
.. currentmodule:: hanlp.components.mtl
.. autoclass:: hanlp.components.mtl.tasks.ner.biaffine_ner.BiaffineNamedEntityRecognition
:members:
:exclude-members: execute_training_loop, fit_dataloader
```
================================================
FILE: docs/api/hanlp/components/mtl/tasks/ner/index.md
================================================
# ner
Named Entity Recognition.
```{toctree}
tag_ner
biaffine_ner
```
================================================
FILE: docs/api/hanlp/components/mtl/tasks/ner/tag_ner.md
================================================
# tag_ner
Tagging based Named Entity Recognition.
```{eval-rst}
.. currentmodule:: hanlp.components.mtl
.. autoclass:: hanlp.components.mtl.tasks.ner.tag_ner.TaggingNamedEntityRecognition
:members:
:exclude-members: execute_training_loop, fit_dataloader
```
================================================
FILE: docs/api/hanlp/components/mtl/tasks/pos.md
================================================
# pos
Part-of-speech tagging.
```{eval-rst}
.. currentmodule:: hanlp.components.mtl
.. autoclass:: hanlp.components.mtl.tasks.pos.TransformerTagging
:members:
:exclude-members: execute_training_loop, fit_dataloader
```
================================================
FILE: docs/api/hanlp/components/mtl/tasks/sdp.md
================================================
# sdp
Semantic Dependency Parsing.
```{eval-rst}
.. currentmodule:: hanlp.components.mtl
.. autoclass:: hanlp.components.mtl.tasks.sdp.BiaffineSemanticDependencyParsing
:members:
:exclude-members: execute_training_loop, fit_dataloader
```
================================================
FILE: docs/api/hanlp/components/mtl/tasks/srl/bio_srl.md
================================================
# bio_srl
BIO Tagging based Semantic Role Labeling.
```{eval-rst}
.. currentmodule:: hanlp.components.mtl
.. autoclass:: hanlp.components.mtl.tasks.srl.bio_srl.SpanBIOSemanticRoleLabeling
:members:
:exclude-members: execute_training_loop, fit_dataloader
```
================================================
FILE: docs/api/hanlp/components/mtl/tasks/srl/index.md
================================================
# srl
Semantic Role Labeling.
```{toctree}
bio_srl
rank_srl
```
================================================
FILE: docs/api/hanlp/components/mtl/tasks/srl/rank_srl.md
================================================
# rank_srl
Span Ranking Semantic Role Labeling.
```{eval-rst}
.. currentmodule:: hanlp.components.mtl
.. autoclass:: hanlp.components.mtl.tasks.srl.rank_srl.SpanRankingSemanticRoleLabeling
:members:
:exclude-members: execute_training_loop, fit_dataloader
```
================================================
FILE: docs/api/hanlp/components/mtl/tasks/task.md
================================================
# Task
```{eval-rst}
.. currentmodule:: hanlp.components.mtl
.. autoclass:: hanlp.components.mtl.tasks.Task
:members:
:exclude-members: execute_training_loop, fit_dataloader
```
================================================
FILE: docs/api/hanlp/components/mtl/tasks/tok.md
================================================
# tok
Tokenization.
```{eval-rst}
.. currentmodule:: hanlp.components.mtl
.. autoclass:: hanlp.components.mtl.tasks.tok.tag_tok.TaggingTokenization
:members:
:exclude-members: execute_training_loop, fit_dataloader
```
================================================
FILE: docs/api/hanlp/components/mtl/tasks/ud.md
================================================
# ud
Universal Dependencies Parsing (lemmatization, features, PoS tagging and dependency parsing).
```{eval-rst}
.. currentmodule:: hanlp.components.mtl
.. autoclass:: hanlp.components.mtl.tasks.ud.UniversalDependenciesParsing
:members:
:exclude-members: execute_training_loop, fit_dataloader
```
================================================
FILE: docs/api/hanlp/components/ner/biaffine_ner.md
================================================
# biaffine_ner
Biaffine Named Entity Recognition.
```{eval-rst}
.. currentmodule:: hanlp.components.ner.biaffine_ner.biaffine_ner
.. autoclass:: hanlp.components.ner.biaffine_ner.biaffine_ner.BiaffineNamedEntityRecognizer
:members:
```
================================================
FILE: docs/api/hanlp/components/ner/index.md
================================================
# ner
Named Entity Recognition.
```{toctree}
transformer_ner
rnn_ner
biaffine_ner
```
================================================
FILE: docs/api/hanlp/components/ner/rnn_ner.md
================================================
# rnn_ner
Tagging based Named Entity Recognition.
```{eval-rst}
.. currentmodule:: hanlp.components.ner.rnn_ner
.. autoclass:: hanlp.components.ner.rnn_ner.RNNNamedEntityRecognizer
:members:
```
================================================
FILE: docs/api/hanlp/components/ner/transformer_ner.md
================================================
# transformer_ner
Tagging based Named Entity Recognition.
```{eval-rst}
.. currentmodule:: hanlp.components.ner.transformer_ner
.. autoclass:: hanlp.components.ner.transformer_ner.TransformerNamedEntityRecognizer
:members:
```
================================================
FILE: docs/api/hanlp/components/parsers/biaffine_dep.md
================================================
# biaffine_dep
Biaffine dependency parser.
```{eval-rst}
.. currentmodule:: hanlp.components
.. autoclass:: hanlp.components.parsers.biaffine.biaffine_dep.BiaffineDependencyParser
:members:
```
================================================
FILE: docs/api/hanlp/components/parsers/biaffine_sdp.md
================================================
# biaffine_sdp
Biaffine semantic dependency parser.
```{eval-rst}
.. currentmodule:: hanlp.components
.. autoclass:: hanlp.components.parsers.biaffine.biaffine_sdp.BiaffineSemanticDependencyParser
:members:
```
================================================
FILE: docs/api/hanlp/components/parsers/crf_constituency_parser.md
================================================
# crf_constituency_parser
CRF constituency parser.
```{eval-rst}
.. currentmodule:: hanlp.components
.. autoclass:: hanlp.components.parsers.constituency.crf_constituency_parser.CRFConstituencyParser
:members:
```
================================================
FILE: docs/api/hanlp/components/parsers/index.md
================================================
# parsers
Parsers.
```{toctree}
biaffine_dep
biaffine_sdp
ud_parser
crf_constituency_parser
```
================================================
FILE: docs/api/hanlp/components/parsers/ud_parser.md
================================================
# ud_parser
Universal Dependencies Parsing (lemmatization, features, PoS tagging and dependency parsing).
```{eval-rst}
.. currentmodule:: hanlp.components
.. autoclass:: hanlp.components.parsers.ud.ud_parser.UniversalDependenciesParser
:members:
```
================================================
FILE: docs/api/hanlp/components/pipeline.md
================================================
# pipeline
```{eval-rst}
.. currentmodule:: hanlp.components.pipeline
.. autoclass:: hanlp.components.pipeline.Pipe
:members:
.. autoclass:: hanlp.components.pipeline.Pipeline
:members:
```
================================================
FILE: docs/api/hanlp/components/srl/index.md
================================================
# srl
Semantic Role Labelers.
```{toctree}
span_rank
span_bio
```
================================================
FILE: docs/api/hanlp/components/srl/span_bio.md
================================================
# span_bio
Span BIO tagging based SRL.
```{eval-rst}
.. currentmodule:: hanlp.components.srl.span_bio.span_bio
.. autoclass:: SpanBIOSemanticRoleLabeler
:members:
```
================================================
FILE: docs/api/hanlp/components/srl/span_rank.md
================================================
# span_rank
Span Rank based SRL.
```{eval-rst}
.. currentmodule:: hanlp.components.srl.span_rank.span_rank
.. autoclass:: SpanRankingSemanticRoleLabeler
:members:
```
================================================
FILE: docs/api/hanlp/components/sts.md
================================================
# sts
```{eval-rst}
.. currentmodule:: hanlp.components.sts
.. autoclass:: hanlp.components.sts.transformer_sts.TransformerSemanticTextualSimilarity
:members:
```
================================================
FILE: docs/api/hanlp/components/taggers/index.md
================================================
# taggers
Taggers.
```{toctree}
transformer_tagger
rnn_tagger
```
================================================
FILE: docs/api/hanlp/components/taggers/rnn_tagger.md
================================================
# rnn_tagger
RNN based tagger.
```{eval-rst}
.. currentmodule:: hanlp.components
.. autoclass:: hanlp.components.taggers.rnn_tagger.RNNTagger
:members:
```
================================================
FILE: docs/api/hanlp/components/taggers/transformer_tagger.md
================================================
# transformer_tagger
Transformer based tagger.
```{eval-rst}
.. currentmodule:: hanlp.components
.. autoclass:: hanlp.components.taggers.transformers.transformer_tagger.TransformerTagger
:members:
```
================================================
FILE: docs/api/hanlp/components/tokenizers/index.md
================================================
# tokenizers
Tokenizers.
```{toctree}
transformer
multi_criteria
```
================================================
FILE: docs/api/hanlp/components/tokenizers/multi_criteria.md
================================================
# multi_criteria
Transformer based multi-criteria word tokenizer.
```{eval-rst}
.. currentmodule:: hanlp.components.tokenizers.multi_criteria_cws_transformer
.. autoclass:: hanlp.components.tokenizers.multi_criteria_cws_transformer.MultiCriteriaTransformerTaggingTokenizer
:members:
```
================================================
FILE: docs/api/hanlp/components/tokenizers/transformer.md
================================================
# transformer
Transformer based tokenizer.
```{eval-rst}
.. currentmodule:: hanlp.components.tokenizers.transformer
.. autoclass:: hanlp.components.tokenizers.transformer.TransformerTaggingTokenizer
:members:
```
================================================
FILE: docs/api/hanlp/datasets/constituency/constituency_dataset.md
================================================
# constituency_dataset
```{eval-rst}
.. autoclass:: hanlp.datasets.parsing.loaders.constituency_dataset.ConstituencyDataset
:members:
```
================================================
FILE: docs/api/hanlp/datasets/constituency/index.md
================================================
# con
Constituency parsing datasets.
```{toctree}
constituency_dataset
resources
```
================================================
FILE: docs/api/hanlp/datasets/constituency/resources.md
================================================
# resources
## Chinese Treebank
### CTB8
````{margin} **Discussion**
```{seealso}
About our data split on [our forum](https://bbs.hankcs.com/t/topic/3024).
```
````
```{eval-rst}
.. autodata:: hanlp.datasets.parsing.ctb8.CTB8_BRACKET_LINE_NOEC_TRAIN
.. autodata:: hanlp.datasets.parsing.ctb8.CTB8_BRACKET_LINE_NOEC_DEV
.. autodata:: hanlp.datasets.parsing.ctb8.CTB8_BRACKET_LINE_NOEC_TEST
```
### CTB9
````{margin} **Discussion**
```{seealso}
About our data split on [our forum](https://bbs.hankcs.com/t/topic/3024).
```
````
```{eval-rst}
.. autodata:: hanlp.datasets.parsing.ctb9.CTB9_BRACKET_LINE_NOEC_TRAIN
.. autodata:: hanlp.datasets.parsing.ctb9.CTB9_BRACKET_LINE_NOEC_DEV
.. autodata:: hanlp.datasets.parsing.ctb9.CTB9_BRACKET_LINE_NOEC_TEST
```
## English Treebank
### PTB
```{eval-rst}
.. autodata:: hanlp.datasets.parsing.ptb.PTB_TRAIN
.. autodata:: hanlp.datasets.parsing.ptb.PTB_DEV
.. autodata:: hanlp.datasets.parsing.ptb.PTB_TEST
```
================================================
FILE: docs/api/hanlp/datasets/dep/conll_dataset.md
================================================
# conll
```{eval-rst}
.. currentmodule:: hanlp.datasets.parsing.loaders.conll_dataset
.. autoclass:: CoNLLParsingDataset
:members:
```
================================================
FILE: docs/api/hanlp/datasets/dep/index.md
================================================
# dep
Dependency parsing datasets.
```{toctree}
conll_dataset
resources
```
================================================
FILE: docs/api/hanlp/datasets/dep/resources.md
================================================
# resources
## PKU Multiview Treebank
PKU Multi-view Chinese Treebank, released by PKU-ICL. It contains 14,463 sentences from People's Daily (19980101-19980110).
```{eval-rst}
.. automodule:: hanlp.datasets.parsing.pmt1
:members:
```
## Chinese Treebank
### CTB5
```{eval-rst}
.. automodule:: hanlp.datasets.parsing.ctb5
:members:
```
### CTB7
```{eval-rst}
.. automodule:: hanlp.datasets.parsing.ctb7
:members:
```
### CTB8
```{eval-rst}
.. Attention::
We propose a new data split for CTB which differs from the academic convention, with the following 3 advantages.
- Easy to reproduce. Files ending with ``8`` go to the dev set, files ending with ``9`` go to the test set, and the rest go to the training set (see the toy sketch after this note).
- Full use of CTB8. The conventional academic split omits 50 gold files, while we recall them.
- More balanced split across genres. The proportions of samples in each genre are similar.
We also use Stanford Dependencies 3.3.0, which offers fine-grained relations and richer grammar than the conventional
head-finding rules introduced by :cite:`zhang-clark-2008-tale`.
Therefore, scores on our preprocessed CTB8 are not directly comparable to those in most of the literature. We have
experimented with the same model on the conventionally prepared CTB8, and the scores could be 4~5 points higher.
We believe it's worthwhile since HanLP is made for practical purposes, not just for producing pretty numbers.
```
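To make the split rule above concrete, here is a toy sketch; the `chtb_XXXX.genre` file-naming pattern is an assumption about CTB8 file names, not part of the rule itself.
```python
def ctb8_split(filename: str) -> str:
    """Toy sketch: the last digit of the file stem decides the portion."""
    stem = filename.rsplit('.', 1)[0]
    if stem.endswith('8'):
        return 'dev'
    if stem.endswith('9'):
        return 'test'
    return 'train'

assert ctb8_split('chtb_0008.nw') == 'dev'
assert ctb8_split('chtb_0019.nw') == 'test'
assert ctb8_split('chtb_0001.nw') == 'train'
```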
````{margin} **Discussion**
```{seealso}
We have a discussion on [our forum](https://bbs.hankcs.com/t/topic/3024).
```
````
```{eval-rst}
.. autodata:: hanlp.datasets.parsing.ctb8.CTB8_SD330_TRAIN
.. autodata:: hanlp.datasets.parsing.ctb8.CTB8_SD330_DEV
.. autodata:: hanlp.datasets.parsing.ctb8.CTB8_SD330_TEST
```
### CTB9
```{eval-rst}
.. Attention::
Similar preprocessing and splits with CTB8 are applied. See the notice above.
```
```{eval-rst}
.. autodata:: hanlp.datasets.parsing.ctb9.CTB9_SD330_TRAIN
.. autodata:: hanlp.datasets.parsing.ctb9.CTB9_SD330_DEV
.. autodata:: hanlp.datasets.parsing.ctb9.CTB9_SD330_TEST
```
## English Treebank
### PTB
```{eval-rst}
.. autodata:: hanlp.datasets.parsing.ptb.PTB_SD330_TRAIN
.. autodata:: hanlp.datasets.parsing.ptb.PTB_SD330_DEV
.. autodata:: hanlp.datasets.parsing.ptb.PTB_SD330_TEST
```
## Universal Dependencies
### Languages
```{eval-rst}
.. automodule:: hanlp.datasets.parsing.ud.ud27
:members:
```
### Multilingual
```{eval-rst}
.. automodule:: hanlp.datasets.parsing.ud.ud27m
:members:
```
================================================
FILE: docs/api/hanlp/datasets/eos/eos.md
================================================
# eos
```{eval-rst}
.. currentmodule:: hanlp.datasets.eos.eos
.. autoclass:: SentenceBoundaryDetectionDataset
:members:
```
================================================
FILE: docs/api/hanlp/datasets/eos/index.md
================================================
# eos
Sentence boundary detection datasets.
```{toctree}
eos
resources
```
================================================
FILE: docs/api/hanlp/datasets/eos/resources.md
================================================
# resources
## nn_eos
```{eval-rst}
.. automodule:: hanlp.datasets.eos.loaders.nn_eos
:members:
```
================================================
FILE: docs/api/hanlp/datasets/index.md
================================================
# datasets
```{eval-rst}
NLP datasets grouped by tasks. For each task, we provide at least one ``torch.utils.data.Dataset`` compatible class
and several open-source resources. Their file formats and descriptions can be found in the corresponding
``Dataset.load_file`` documentation. Their contents are split into ``TRAIN``, ``DEV`` and ``TEST`` portions, each of
which is stored in a Python constant that can be fetched using :meth:`~hanlp.utils.io_util.get_resource`.
```
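For example, a corpus constant can be resolved to a local path as in the following minimal sketch; the `SIGHAN2005_MSR_TRAIN` constant name is an assumption based on the tok resources page.
```python
from hanlp.datasets.tokenization.sighan2005.msr import SIGHAN2005_MSR_TRAIN
from hanlp.utils.io_util import get_resource

# get_resource downloads the corpus on first use, caches it under
# HANLP_HOME, and returns the local path of the cached file.
print(get_resource(SIGHAN2005_MSR_TRAIN))
```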
````{margin} **Professionals use Linux**
```{note}
Many preprocessing scripts written by professionals make heavy use of Linux/Unix tool chains like shell, perl, gcc,
etc., which are not available, or are buggy, on Windows. You may need a *nix environment to run these scripts.
```
````
```{toctree}
eos/index
tok/index
pos/index
ner/index
dep/index
srl/index
constituency/index
```
================================================
FILE: docs/api/hanlp/datasets/ner/index.md
================================================
# ner
NER datasets.
```{toctree}
tsv
json
resources
```
================================================
FILE: docs/api/hanlp/datasets/ner/json.md
================================================
# json
```{eval-rst}
.. currentmodule:: hanlp.datasets.ner.loaders.json_ner
.. autoclass:: JsonNERDataset
:members:
```
================================================
FILE: docs/api/hanlp/datasets/ner/resources.md
================================================
# resources
## CoNLL 2003
```{eval-rst}
.. automodule:: hanlp.datasets.ner.conll03
:members:
```
## MSRA
```{eval-rst}
.. automodule:: hanlp.datasets.ner.msra
:members:
```
## OntoNotes5
```{eval-rst}
.. autodata:: hanlp.datasets.srl.ontonotes5.chinese.ONTONOTES5_CONLL12_CHINESE_TRAIN
.. autodata:: hanlp.datasets.srl.ontonotes5.chinese.ONTONOTES5_CONLL12_CHINESE_DEV
.. autodata:: hanlp.datasets.srl.ontonotes5.chinese.ONTONOTES5_CONLL12_CHINESE_TEST
.. autodata:: hanlp.datasets.srl.ontonotes5.chinese.ONTONOTES5_CONLL12_NER_CHINESE_TRAIN
.. autodata:: hanlp.datasets.srl.ontonotes5.chinese.ONTONOTES5_CONLL12_NER_CHINESE_DEV
.. autodata:: hanlp.datasets.srl.ontonotes5.chinese.ONTONOTES5_CONLL12_NER_CHINESE_TEST
```
## Resume
```{eval-rst}
.. automodule:: hanlp.datasets.ner.resume
:members:
```
## Weibo
```{eval-rst}
.. automodule:: hanlp.datasets.ner.weibo
:members:
```
================================================
FILE: docs/api/hanlp/datasets/ner/tsv.md
================================================
# tsv
```{eval-rst}
.. currentmodule:: hanlp.datasets.ner.loaders.tsv
.. autoclass:: TSVTaggingDataset
:members:
```
================================================
FILE: docs/api/hanlp/datasets/pos/index.md
================================================
# pos
PoS datasets.
```{eval-rst}
PoS tagging is a standard tagging task, which uses :class:`hanlp.datasets.ner.loaders.tsv.TSVTaggingDataset` for loading.
```
```{toctree}
resources
```
================================================
FILE: docs/api/hanlp/datasets/pos/resources.md
================================================
# resources
## CTB5
```{eval-rst}
.. automodule:: hanlp.datasets.pos.ctb5
:members:
```
## CTB8
```{eval-rst}
.. autodata:: hanlp.datasets.parsing.ctb8.CTB8_POS_TRAIN
.. autodata:: hanlp.datasets.parsing.ctb8.CTB8_POS_DEV
.. autodata:: hanlp.datasets.parsing.ctb8.CTB8_POS_TEST
```
## CTB9
```{eval-rst}
.. autodata:: hanlp.datasets.parsing.ctb9.CTB9_POS_TRAIN
.. autodata:: hanlp.datasets.parsing.ctb9.CTB9_POS_DEV
.. autodata:: hanlp.datasets.parsing.ctb9.CTB9_POS_TEST
```
================================================
FILE: docs/api/hanlp/datasets/srl/conll2012_dataset.md
================================================
# conll2012_dataset
```{eval-rst}
.. autoclass:: hanlp.datasets.srl.loaders.conll2012.CoNLL2012SRLDataset
:members:
```
================================================
FILE: docs/api/hanlp/datasets/srl/index.md
================================================
# srl
Semantic Role Labeling datasets.
```{toctree}
conll2012_dataset
resources
```
================================================
FILE: docs/api/hanlp/datasets/srl/resources.md
================================================
# resources
## OntoNotes 5
### Chinese
```{eval-rst}
.. autodata:: hanlp.datasets.srl.ontonotes5.chinese.ONTONOTES5_CONLL12_CHINESE_TRAIN
:noindex:
.. autodata:: hanlp.datasets.srl.ontonotes5.chinese.ONTONOTES5_CONLL12_CHINESE_DEV
:noindex:
.. autodata:: hanlp.datasets.srl.ontonotes5.chinese.ONTONOTES5_CONLL12_CHINESE_TEST
:noindex:
```
================================================
FILE: docs/api/hanlp/datasets/tok/index.md
================================================
# tok
Tokenization datasets.
```{toctree}
txt
mcws_dataset
resources
```
================================================
FILE: docs/api/hanlp/datasets/tok/mcws_dataset.md
================================================
# mcws_dataset
```{eval-rst}
.. currentmodule:: hanlp.datasets.tokenization.loaders.multi_criteria_cws.mcws_dataset
.. autoclass:: MultiCriteriaTextTokenizingDataset
:members:
```
================================================
FILE: docs/api/hanlp/datasets/tok/resources.md
================================================
# resources
## sighan2005
[The Second International Chinese Word Segmentation Bakeoff](http://sighan.cs.uchicago.edu/bakeoff2005/) took place over the summer of 2005.
### pku
```{eval-rst}
.. automodule:: hanlp.datasets.tokenization.sighan2005.pku
:members:
```
### msr
```{eval-rst}
.. automodule:: hanlp.datasets.tokenization.sighan2005.msr
:members:
```
### as
```{eval-rst}
.. automodule:: hanlp.datasets.tokenization.sighan2005.as_
:members:
```
### cityu
```{eval-rst}
.. automodule:: hanlp.datasets.tokenization.sighan2005.cityu
:members:
```
## CTB6
```{eval-rst}
.. automodule:: hanlp.datasets.tokenization.ctb6
:members:
```
## CTB8
```{eval-rst}
.. automodule:: hanlp.datasets.parsing.ctb8
.. autodata:: CTB8_CWS_TRAIN
.. autodata:: CTB8_CWS_DEV
.. autodata:: CTB8_CWS_TEST
```
## CTB9
```{eval-rst}
.. automodule:: hanlp.datasets.parsing.ctb9
.. autodata:: CTB9_CWS_TRAIN
.. autodata:: CTB9_CWS_DEV
.. autodata:: CTB9_CWS_TEST
```
================================================
FILE: docs/api/hanlp/datasets/tok/txt.md
================================================
# txt
```{eval-rst}
.. currentmodule:: hanlp.datasets.tokenization.loaders.txt
.. autoclass:: TextTokenizingDataset
:members:
```
================================================
FILE: docs/api/hanlp/hanlp.rst
================================================
.. _api/main:
hanlp
==========
.. currentmodule:: hanlp
.. autofunction:: load
.. autofunction:: pipeline
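A minimal usage sketch combining both functions (the pretrained constant below is illustrative; any constant from the pretrained pages works):

.. code-block:: python

    import hanlp
    from hanlp.utils.rules import split_sentence

    # Load a tokenizer, then chain a rule-based sentence splitter with it.
    tok = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH)
    pipe = hanlp.pipeline() \
        .append(split_sentence, output_key='sentences') \
        .append(tok, input_key='sentences', output_key='tok')
    print(pipe('商品和服务。晓美焰来到北京立方庭参观自然语义科技公司。'))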
================================================
FILE: docs/api/hanlp/index.md
================================================
# hanlp
Core APIs for `hanlp`.
```{toctree}
hanlp
common/index
components/index
pretrained/index
datasets/index
utils/index
layers/index
```
================================================
FILE: docs/api/hanlp/layers/decoders/biaffine_ner.md
================================================
# biaffine_ner
```{eval-rst}
.. autoclass:: hanlp.components.ner.biaffine_ner.biaffine_ner_model.BiaffineNamedEntityRecognitionDecoder
:members:
```
================================================
FILE: docs/api/hanlp/layers/decoders/index.md
================================================
# decoders
```{toctree}
linear_crf
biaffine_ner
```
================================================
FILE: docs/api/hanlp/layers/decoders/linear_crf.md
================================================
# linear_crf
```{eval-rst}
.. autoclass:: hanlp.components.mtl.tasks.pos.LinearCRFDecoder
:members:
```
================================================
FILE: docs/api/hanlp/layers/embeddings/char_cnn.md
================================================
# char_cnn
```{eval-rst}
.. autoclass:: hanlp.layers.embeddings.char_cnn.CharCNN
:members:
.. autoclass:: hanlp.layers.embeddings.char_cnn.CharCNNEmbedding
:members:
```
================================================
FILE: docs/api/hanlp/layers/embeddings/char_rnn.md
================================================
# char_rnn
```{eval-rst}
.. autoclass:: hanlp.layers.embeddings.char_rnn.CharRNN
:members:
.. autoclass:: hanlp.layers.embeddings.char_rnn.CharRNNEmbedding
:members:
```
================================================
FILE: docs/api/hanlp/layers/embeddings/embedding.md
================================================
# embedding
```{eval-rst}
.. autoclass:: hanlp.layers.embeddings.embedding.Embedding
:members:
.. autoclass:: hanlp.layers.embeddings.embedding.ConcatModuleList
:members:
.. autoclass:: hanlp.layers.embeddings.embedding.EmbeddingList
:members:
```
================================================
FILE: docs/api/hanlp/layers/embeddings/fasttext.md
================================================
# fasttext
```{eval-rst}
.. autoclass:: hanlp.layers.embeddings.fast_text.FastTextEmbedding
:members:
.. autoclass:: hanlp.layers.embeddings.fast_text.FastTextEmbeddingModule
:members:
```
================================================
FILE: docs/api/hanlp/layers/embeddings/index.md
================================================
# embeddings
```{toctree}
embedding
word2vec
fasttext
char_cnn
char_rnn
transformer
```
================================================
FILE: docs/api/hanlp/layers/embeddings/transformer.md
================================================
# transformer
```{eval-rst}
.. autoclass:: hanlp.layers.embeddings.contextual_word_embedding.ContextualWordEmbedding
:members:
.. autoclass:: hanlp.layers.embeddings.contextual_word_embedding.ContextualWordEmbeddingModule
:members:
```
================================================
FILE: docs/api/hanlp/layers/embeddings/word2vec.md
================================================
# word2vec
```{eval-rst}
.. autoclass:: hanlp.layers.embeddings.word2vec.Word2VecEmbedding
:members:
.. autoclass:: hanlp.layers.embeddings.word2vec.Word2VecEmbeddingModule
:members:
```
================================================
FILE: docs/api/hanlp/layers/index.md
================================================
# layers
```{toctree}
embeddings/index
transformers/index
decoders/index
```
================================================
FILE: docs/api/hanlp/layers/transformers/encoder.md
================================================
# encoder
```{eval-rst}
.. autoclass:: hanlp.layers.transformers.encoder.TransformerEncoder
:members:
```
================================================
FILE: docs/api/hanlp/layers/transformers/index.md
================================================
# transformers
```{toctree}
encoder
tokenizer
```
================================================
FILE: docs/api/hanlp/layers/transformers/tokenizer.md
================================================
# tokenizer
```{eval-rst}
.. autoclass:: hanlp.transform.transformer_tokenizer.TransformerSequenceTokenizer
:members:
```
================================================
FILE: docs/api/hanlp/pretrained/amr.md
================================================
---
jupytext:
formats: ipynb,md:myst
text_representation:
extension: .md
format_name: myst
format_version: '0.8'
jupytext_version: 1.4.2
kernelspec:
display_name: Python 3
language: python
name: python3
---
# amr
AMR captures “who is doing what to whom” in a sentence. Each sentence is represented as a rooted, directed, acyclic graph with labels on edges (relations) and leaves (concepts).
Before loading an AMR model, make sure to install HanLP with the `amr` dependencies:
```shell
pip install hanlp[amr] -U
```
To parse a raw sentence into AMR:
```{eval-rst}
.. margin:: Batching is Faster
.. Hint:: Parse multiple sentences at once for faster speed!
```
```{code-cell} ipython3
:tags: [output_scroll]
import hanlp
amr_parser = hanlp.load(hanlp.pretrained.amr.AMR3_SEQ2SEQ_BART_LARGE)
amr = amr_parser('The boy wants the girl to believe him.')
print(amr)
```
All the pre-trained parsers and their scores are listed below.
```{eval-rst}
.. automodule:: hanlp.pretrained.amr
:members:
```
================================================
FILE: docs/api/hanlp/pretrained/amr2text.md
================================================
---
jupytext:
formats: ipynb,md:myst
text_representation:
extension: .md
format_name: myst
format_version: '0.8'
jupytext_version: 1.4.2
kernelspec:
display_name: Python 3
language: python
name: python3
---
# amr2text
AMR captures “who is doing what to whom” in a sentence. Each sentence is represented as a rooted, directed, acyclic graph with labels on edges (relations) and leaves (concepts).
The goal of AMR-to-Text Generation is to recover the original sentence realization given an AMR. This task can be seen as the reverse of the structured prediction found in AMR parsing.
Before loading an AMR model, make sure to install HanLP with the `amr` dependencies:
```shell
pip install hanlp[amr] -U
```
To generate a sentence given an AMR:
```{eval-rst}
.. margin:: Batching is Faster
.. Hint:: Generate multiple sentences at once for faster speed!
```
```{code-cell} ipython3
:tags: [output_scroll]
import hanlp
generation = hanlp.load(hanlp.pretrained.amr2text.AMR3_GRAPH_PRETRAIN_GENERATION)
print(generation('''
(z0 / want-01
    :ARG0 (z1 / boy)
    :ARG1 (z2 / believe-01
        :ARG0 (z3 / girl)
        :ARG1 z1))
'''))
```
All the pre-trained parsers and their scores are listed below.
```{eval-rst}
.. automodule:: hanlp.pretrained.amr2text
:members:
```
================================================
FILE: docs/api/hanlp/pretrained/constituency.md
================================================
---
jupytext:
formats: ipynb,md:myst
text_representation:
extension: .md
format_name: myst
format_version: '0.8'
jupytext_version: 1.4.2
kernelspec:
display_name: Python 3
language: python
name: python3
---
# constituency
Constituency parsing is the process of analyzing a sentence by breaking it down into sub-phrases, also known as constituents.
To parse a tokenized sentence into a constituency tree, first load a parser:
```{eval-rst}
.. margin:: Batching is Faster
.. Hint:: To speed up, parse multiple sentences at once, and use a GPU.
```
```{code-cell} ipython3
:tags: [output_scroll]
import hanlp
con = hanlp.load(hanlp.pretrained.constituency.CTB9_CON_FULL_TAG_ELECTRA_SMALL)
```
Then feed it a sequence, or multiple sequences, of tokens.
```{code-cell} ipython3
:tags: [output_scroll]
tree = con(["2021年", "HanLPv2.1", "带来", "最", "先进", "的", "多", "语种", "NLP", "技术", "。"])
```
The constituency tree is a nested list of constituencies:
```{code-cell} ipython3
:tags: [output_scroll]
tree
```
You can `str` or `print` it to get its bracketed form:
```{code-cell} ipython3
:tags: [output_scroll]
print(tree)
```
All the pre-trained parsers and their scores are listed below.
```{eval-rst}
.. automodule:: hanlp.pretrained.constituency
:members:
```
================================================
FILE: docs/api/hanlp/pretrained/dep.md
================================================
# dep
```{eval-rst}
.. automodule:: hanlp.pretrained.dep
:members:
```
================================================
FILE: docs/api/hanlp/pretrained/eos.md
================================================
# eos
```{eval-rst}
.. automodule:: hanlp.pretrained.eos
:members:
```
================================================
FILE: docs/api/hanlp/pretrained/fasttext.md
================================================
# fasttext
```{eval-rst}
.. automodule:: hanlp.pretrained.fasttext
:members:
```
================================================
FILE: docs/api/hanlp/pretrained/glove.md
================================================
# glove
```{eval-rst}
.. automodule:: hanlp.pretrained.glove
:members:
```
================================================
FILE: docs/api/hanlp/pretrained/index.md
================================================
# pretrained
```{eval-rst}
NLP components grouped by tasks. For each task, we provide at least one :class:`~hanlp.common.component.Component`
compatible class and several pretrained models. Each of them is stored in a Python constant which can be fetched
using :meth:`hanlp.load`.
```
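For example, here is a minimal sketch using a tokenizer constant; any constant listed in the pages below works the same way.
```python
import hanlp

# Every pretrained model is exposed as a constant holding its identifier;
# hanlp.load downloads it on first use and returns a ready-to-use component.
tok = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH)
print(tok('商品和服务'))
```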
```{toctree}
mtl
eos
tok
pos
ner
dep
constituency
srl
sdp
amr
amr2text
sts
word2vec
glove
fasttext
mlm
```
================================================
FILE: docs/api/hanlp/pretrained/mlm.md
================================================
---
jupytext:
formats: ipynb,md:myst
text_representation:
extension: .md
format_name: myst
format_version: '0.8'
jupytext_version: 1.4.2
kernelspec:
display_name: Python 3
language: python
name: python3
---
# mlm
Masked Language Model (MLM) predicts words that were originally hidden intentionally in a sentence.
To perform such prediction, first load a pre-trained MLM (e.g., `bert-base-chinese`):
````{margin} Batching is Faster
```{hint}
Predict multiple sentences in batch mode for faster speed!
```
````
````{margin} Multilingual Support
```{note}
HanLP has always supported multilingual processing. Feel free to use any multilingual model listed [here](https://huggingface.co/models?pipeline_tag=fill-mask&sort=downloads).
```
````
```{code-cell} ipython3
:tags: [output_scroll]
from hanlp.components.lm.mlm import MaskedLanguageModel
mlm = MaskedLanguageModel()
mlm.load('bert-base-chinese')
```
Represent blanks (masked tokens) with `[MASK]` and let the MLM fill them in:
```{code-cell} ipython3
:tags: [output_scroll]
mlm('生活的真谛是[MASK]。')
```
Batching is always faster:
```{code-cell} ipython3
:tags: [output_scroll]
mlm(['生活的真谛是[MASK]。', '巴黎是[MASK][MASK]的首都。'])
```
All the pre-trained MLM models and their details are listed in the [docs](https://huggingface.co/models?pipeline_tag=fill-mask&sort=downloads) of Hugging Face 🤗 Transformers.
================================================
FILE: docs/api/hanlp/pretrained/mtl.md
================================================
# mtl
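A minimal usage sketch: the constant below is one of the models listed by the directive that follows, and the returned `Document` is keyed by task name.
```python
import hanlp

# Load a multi-task model and run tokenization, tagging, parsing, etc. in one call.
HanLP = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH)
doc = HanLP(['晓美焰来到北京立方庭参观自然语义科技公司。'])
print(doc)
```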
```{eval-rst}
.. automodule:: hanlp.pretrained.mtl
:members:
```
================================================
FILE: docs/api/hanlp/pretrained/ner.md
================================================
# ner
```{eval-rst}
.. automodule:: hanlp.pretrained.ner
:members:
```
================================================
FILE: docs/api/hanlp/pretrained/pos.md
================================================
---
jupytext:
formats: ipynb,md:myst
text_representation:
extension: .md
format_name: myst
format_version: '0.8'
jupytext_version: 1.4.2
kernelspec:
display_name: Python 3
language: python
name: python3
---
# pos
The process of classifying words into their **parts of speech** and labeling them accordingly is known as **part-of-speech tagging**, **POS-tagging**, or simply **tagging**.
To tag a tokenized sentence:
````{margin} Batching is Faster
```{hint}
Tag multiple sentences at once for faster speed!
```
````
```{code-cell} ipython3
:tags: [output_scroll]
import hanlp
pos = hanlp.load(hanlp.pretrained.pos.CTB9_POS_ELECTRA_SMALL)
pos(['我', '的', '希望', '是', '希望', '世界', '和平'])
```
````{margin} Custom Dictionary Supported
```{seealso}
See [this tutorial](https://github.com/hankcs/HanLP/blob/master/plugins/hanlp_demo/hanlp_demo/zh/demo_pos_dict.py) for custom dictionary.
```
````
All the pre-trained taggers and their details are listed below.
```{eval-rst}
.. automodule:: hanlp.pretrained.pos
:members:
```
================================================
FILE: docs/api/hanlp/pretrained/sdp.md
================================================
# sdp
```{eval-rst}
.. automodule:: hanlp.pretrained.sdp
:members:
```
================================================
FILE: docs/api/hanlp/pretrained/srl.md
================================================
---
jupytext:
formats: ipynb,md:myst
text_representation:
extension: .md
format_name: myst
format_version: '0.8'
jupytext_version: 1.4.2
kernelspec:
display_name: Python 3
language: python
name: python3
---
# srl
Semantic Role Labeling (SRL) is a shallow semantic parsing task that produces predicate-argument structures: semantic roles (or participants) such as agent, patient, and theme associated with verbs.
Inputs to SRL are tokenized sentences:
````{margin} Batching is Faster
```{hint}
Feed in multiple sentences at once for faster speed!
```
````
```{code-cell} ipython3
:tags: [output_scroll]
import hanlp
srl = hanlp.load(hanlp.pretrained.srl.CPB3_SRL_ELECTRA_SMALL)
srl(['男孩', '希望', '女孩', '相信', '他', '。'])
```
All the pre-trained labelers and their details are listed below.
```{eval-rst}
.. automodule:: hanlp.pretrained.srl
:members:
```
================================================
FILE: docs/api/hanlp/pretrained/sts.md
================================================
---
jupytext:
formats: ipynb,md:myst
text_representation:
extension: .md
format_name: myst
format_version: '0.8'
jupytext_version: 1.4.2
kernelspec:
display_name: Python 3
language: python
name: python3
---
# sts
The `sts` package holds pre-trained Semantic Textual Similarity (STS) models. We surveyed both supervised and
unsupervised models, and we believe that unsupervised models are still immature at this moment. Unsupervised STS works
well for IR but not for NLP, especially on sentences with little lexical overlap.
```{eval-rst}
.. automodule:: hanlp.pretrained.sts
:members:
```
```{code-cell} ipython3
import hanlp
sim = hanlp.load(hanlp.pretrained.sts.STS_ELECTRA_BASE_ZH)
sim([
    ['看图猜一电影名', '看图猜电影'],
    ['无线路由器怎么无线上网', '无线上网卡和无线路由器怎么用'],
    ['北京到上海的动车票', '上海到北京的动车票'],
])
```
================================================
FILE: docs/api/hanlp/pretrained/tok.md
================================================
---
jupytext:
formats: ipynb,md:myst
text_representation:
extension: .md
format_name: myst
format_version: '0.8'
jupytext_version: 1.4.2
kernelspec:
display_name: Python 3
language: python
name: python3
---
# tok
Tokenization is a way of separating a sentence into smaller units called tokens. In lexical analysis, tokens usually refer to words.
````{margin} Batching is Faster
```{hint}
Tokenize multiple sentences at once for faster speed!
```
````
````{margin} Custom Dictionary Supported
```{seealso}
See [this tutorial](https://github.com/hankcs/HanLP/blob/master/plugins/hanlp_demo/hanlp_demo/zh/demo_custom_dict.py) for custom dictionary.
```
````
To tokenize raw sentences:
```{code-cell} ipython3
:tags: [output_scroll]
import hanlp
tok = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH)
tok(['商品和服务。', '晓美焰来到北京立方庭参观自然语义科技公司'])
```
All the pre-trained tokenizers and their details are listed below.
```{eval-rst}
.. automodule:: hanlp.pretrained.tok
:members:
```
================================================
FILE: docs/api/hanlp/pretrained/word2vec.md
================================================
---
jupytext:
formats: ipynb,md:myst
text_representation:
extension: .md
format_name: myst
format_version: '0.8'
jupytext_version: 1.4.2
kernelspec:
display_name: Python 3
language: python
name: python3
---
# word2vec
Word2Vec is a family of model architectures and optimizations that can be used to learn word embeddings from large unlabeled datasets. In this document, it is narrowly defined as a component to map discrete words to distributed representations which are dense vectors.
To perform such mapping:
````{margin} Batching is Faster
```{hint}
Map multiple tokens in batch mode for faster speed!
```
````
````{margin} Multilingual Support
```{note}
HanLP has always supported multilingual processing. Feel free to use any multilingual model listed [here](http://vectors.nlpl.eu/repository/).
```
````
```{code-cell} ipython3
:tags: [output_scroll]
import hanlp
word2vec = hanlp.load(hanlp.pretrained.word2vec.CONVSEG_W2V_NEWS_TENSITE_WORD_PKU)
word2vec('先进')
```
These vectors have already been normalized to facilitate similarity computation:
```{code-cell} ipython3
:tags: [output_scroll]
import torch
print(torch.nn.functional.cosine_similarity(word2vec('先进'), word2vec('优秀'), dim=0))
print(torch.nn.functional.cosine_similarity(word2vec('先进'), word2vec('水果'), dim=0))
```
Using these similarity scores, the most similar words can be found:
```{code-cell} ipython3
:tags: [output_scroll]
word2vec.most_similar('上海')
```
Word2Vec usually cannot handle OOV words or phrases:
```{code-cell} ipython3
:tags: [output_scroll]
word2vec.most_similar('非常寒冷') # phrases are usually OOV
```
Doc2Vec, as opposed to Word2Vec, can create a vectorized representation by averaging a group of words. To enable Doc2Vec for OOV words and phrases, pass `doc2vec=True`:
```{code-cell} ipython3
:tags: [output_scroll]
word2vec.most_similar('非常寒冷', doc2vec=True)
```
All the pre-trained word2vec models and their details are listed below.
```{eval-rst}
.. automodule:: hanlp.pretrained.word2vec
:members:
```
================================================
FILE: docs/api/hanlp/utils/index.md
================================================
# utils
Utilities.
```{toctree}
io_util
```
================================================
FILE: docs/api/hanlp/utils/io_util.md
================================================
# io_util
```{eval-rst}
.. currentmodule:: hanlp.utils
.. automodule:: hanlp.utils.io_util
:members:
```
================================================
FILE: docs/api/restful.rst
================================================
.. _api/hanlp_restful:
hanlp_restful
====================
.. currentmodule:: hanlp_restful
.. autoclass:: HanLPClient
:members:
:special-members:
:exclude-members: __init__, __repr__, __weakref__
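A minimal usage sketch (obtain an ``auth`` from a compatible service provider, as on the Java and Golang pages; anonymous users may pass ``None``):

.. code-block:: python

    from hanlp_restful import HanLPClient

    # Replace None with the auth key you applied for.
    HanLP = HanLPClient('https://hanlp.hankcs.com/api', auth=None, language='zh')
    print(HanLP.parse('2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。'))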
================================================
FILE: docs/api/restful_golang.md
================================================
# Golang RESTful API
## Install
```shell script
go get -u github.com/hankcs/gohanlp@main
```
## Quick Start
Obtain an `auth` from any compatible service provider like our [free service](https://bbs.hankcs.com/t/apply-for-free-hanlp-restful-apis/3178), then initiate a `HanLPClient` and call its `Parse` interface.
```go
package main

import (
	"fmt"

	"github.com/hankcs/gohanlp/hanlp"
)

func main() {
	client := hanlp.HanLPClient(hanlp.WithAuth("The auth you applied for")) // anonymous users can skip auth
	s, _ := client.Parse("In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environments.", hanlp.WithLanguage("mul"))
	fmt.Println(s)
}
```
Refer to our [testcases](https://github.com/hankcs/gohanlp/blob/main/main_test.go) and [data format](../data_format) for more details.
================================================
FILE: docs/api/restful_java.md
================================================
# Java RESTful API
Add the following dependency into the `pom.xml` file of your project.
```xml
<dependency>
    <groupId>com.hankcs.hanlp.restful</groupId>
    <artifactId>hanlp-restful</artifactId>
    <version>0.0.15</version>
</dependency>
```
Obtain an `auth` from any compatible service provider like our [free service](https://bbs.hankcs.com/t/apply-for-free-hanlp-restful-apis/3178), then initiate a `HanLPClient` and call its `parse` interface.
```java
HanLPClient client = new HanLPClient("https://hanlp.hankcs.com/api", null); // Replace null with your auth
System.out.println(client.parse("2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。晓美焰来到北京立方庭参观自然语义科技公司。"));
```
Refer to our [testcases](https://github.com/hankcs/HanLP/blob/master/plugins/hanlp_restful_java/src/test/java/com/hankcs/hanlp/restful/HanLPClientTest.java) and [data format](../data_format) for more details.
================================================
FILE: docs/api/trie/dictionary.md
================================================
# dictionary
```{eval-rst}
.. currentmodule:: hanlp_trie
.. autoclass:: hanlp_trie.dictionary.DictInterface
:members:
.. autoclass:: hanlp_trie.dictionary.TrieDict
:members:
```
================================================
FILE: docs/api/trie/index.md
================================================
# hanlp_trie
HanLP trie/dictionary interface and referential implementation.
```{toctree}
trie
dictionary
```
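A minimal usage sketch; the dict-style constructor and the `parse_longest` return shape are assumptions based on the API pages above.
```python
from hanlp_trie import Trie

# Build a trie mapping words to arbitrary values, then scan a sentence
# for the longest matches; each hit is a (begin, end, value) triple.
trie = Trie({'自然': 'nature', '自然语言': 'NLP'})
print(trie.parse_longest('自然语言处理'))
```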
================================================
FILE: docs/api/trie/trie.md
================================================
# trie
```{eval-rst}
.. currentmodule:: hanlp_trie
.. autoclass:: hanlp_trie.trie.Node
:members:
.. autoclass:: hanlp_trie.trie.Trie
:members:
```
================================================
FILE: docs/conf.py
================================================
# -- Project information -----------------------------------------------------
import sys
import os
from datetime import datetime
sys.path.append(os.path.abspath('..'))
sys.path.append(os.path.abspath('../plugins/hanlp_common'))
sys.path.append(os.path.abspath('../plugins/hanlp_trie'))
sys.path.append(os.path.abspath('../plugins/hanlp_restful'))
import hanlp
project = 'HanLP'
copyright = f'2020-{datetime.now().year}, hankcs'
author = 'hankcs'
# The short X.Y version.
version = hanlp.__version__
# The full version, including alpha/beta/rc tags.
release = hanlp.__version__
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
language = 'en'
master_doc = "index"
# -- General configuration ---------------------------------------------------
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
"myst_nb",
"sphinx_copybutton",
"sphinx_togglebutton",
"sphinxcontrib.bibtex",
'sphinx_astrorefs', # astrophysics style, similar to ACL
"sphinx_thebe",
"sphinx.ext.autodoc",
"sphinx.ext.intersphinx",
"sphinx.ext.viewcode",
"ablog",
'sphinx.ext.napoleon',
]
# Add any paths that contain templates here, relative to this directory.
templates_path = ["_templates"]
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
intersphinx_mapping = {
"python": ("https://docs.python.org/3.8", None),
"sphinx": ("https://www.sphinx-doc.org/en/3.x", None),
}
nitpick_ignore = [
("py:class", "docutils.nodes.document"),
("py:class", "docutils.parsers.rst.directives.body.Sidebar"),
]
autoclass_content = 'both'
numfig = True
myst_admonition_enable = True
myst_deflist_enable = True
myst_url_schemes = ("http", "https", "mailto")
panels_add_bootstrap_css = False
# -- Options for HTML output -------------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = "sphinx_book_theme"
html_title = "HanLP Documentation"
html_logo = "_static/logo.png"
html_favicon = "_static/favicon.png"
html_copy_source = True
html_sourcelink_suffix = ""
html_sidebars = {
# "reference/blog/*": [
# "sidebar-search-bs.html",
# "postcard.html",
# "recentposts.html",
# "tagcloud.html",
# "categories.html",
# "archives.html",
# "sbt-sidebar-nav.html",
# "sbt-sidebar-footer.html",
# ]
}
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ["_static"]
jupyter_execute_notebooks = "cache"
thebe_config = {
"repository_url": "https://github.com/binder-examples/jupyter-stacks-datascience",
"repository_branch": "master",
}
html_theme_options = {
"theme_dev_mode": False,
"path_to_docs": "docs",
"repository_url": "https://github.com/hankcs/HanLP",
# "repository_branch": "gh-pages", # For testing
# "launch_buttons": {
# # "binderhub_url": "https://mybinder.org",
# # "jupyterhub_url": "https://datahub.berkeley.edu", # For testing
# "colab_url": "https://colab.research.google.com/",
# "notebook_interface": "jupyterlab",
# "thebe": True,
# },
"use_edit_page_button": True,
"use_issues_button": True,
"use_repository_button": True,
"use_download_button": True,
# For testing
# "home_page_in_toc": True,
# "single_page": True,
# "extra_footer": "Test ", # DEPRECATED KEY
# "extra_navbar": "Test ",
}
html_baseurl = "https://hanlp.hankcs.com/docs/"
# -- ABlog config -------------------------------------------------
blog_path = "reference/blog"
blog_post_pattern = "reference/blog/*.md"
blog_baseurl = "https://hanlp.hankcs.com/docs/"
fontawesome_included = True
post_auto_image = 1
post_auto_excerpt = 2
execution_show_tb = "READTHEDOCS" in os.environ
# Localization
nb_render_priority = {
"gettext": (
"application/vnd.jupyter.widget-view+json",
"application/javascript",
"text/html",
"image/svg+xml",
"image/png",
"image/jpeg",
"text/markdown",
"text/latex",
"text/plain",
)
}
locale_dirs = ['locale/']
# bibtex
bibtex_default_style = 'unsrtalpha'
================================================
FILE: docs/configure.md
================================================
# Configuration
## Customize ``HANLP_HOME``
All resources HanLP uses will be cached in a directory called `HANLP_HOME`.
It is an environment variable which you can customize to any path you like.
By default, `HANLP_HOME` resolves to `~/.hanlp` and `%appdata%\hanlp` on *nix and Windows respectively.
If you want to redirect `HANLP_HOME` to a different location, say `/data/hanlp`, the following shell command can be very helpful.
```bash
export HANLP_HOME=/data/hanlp
```
## Use GPUs
By default, HanLP tries to use the least occupied GPU, so in most cases you don't need to worry about device placement: HanLP makes the best choice for you. This behavior is very useful when you're using a public server shared with colleagues across your lab or company.
HanLP also honors the ``CUDA_VISIBLE_DEVICES`` environment variable used by PyTorch and TensorFlow to limit which devices HanLP can choose from. For example, the following command keeps only the `0`th and `1`st GPUs visible.
```bash
export CUDA_VISIBLE_DEVICES=0,1
```
```{eval-rst}
If you need fine-grained control over each component, ``hanlp.load(..., devices=...)`` is what you're looking for.
See documents for :meth:`hanlp.load`.
```
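For instance, here is a minimal sketch of pinning a component to a particular device; the pretrained identifier is only illustrative, and the `-1`-for-CPU convention is an assumption based on the `hanlp.load` documentation:
```python
import hanlp

# Pin this tokenizer to GPU 0 (assumes a CUDA-capable device 0).
tok = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH, devices=0)

# devices=-1 forces CPU; a list such as devices=[0, 1] spreads the
# component across multiple GPUs.
```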
### External Resources
For deep learning beginners, you might need to learn how to set up a working GPU environment first. Here are some
resources.
- [CUDA Toolkit](https://developer.nvidia.com/cuda-toolkit)
- It's a good practice to install the driver shipped with a CUDA package.
- [PyTorch](https://pytorch.org/get-started/locally/)
- If no existing PyTorch installation is found, `pip install hanlp` installs the CPU-only PyTorch, which is universal and has no GPU or CUDA dependencies.
- To run on GPUs, you will need to install a GPU-enabled PyTorch matching your CUDA and OS versions.
- Cloud servers
- There are many cloud services providing out-of-the-box deep learning images. HanLP works fine on these platforms,
and they can save you time and effort.
- Google Colab
- Colab allows you to write executable notebooks with full GPU support. PyTorch and TensorFlow come pre-installed and properly configured.
- In fact, you can click [](https://colab.research.google.com/drive/1KPX6t1y36TOzRIeB4Kt3uJ1twuj6WuFv?usp=sharing) to play with the GPU-enabled HanLP tutorial right now.
## Use Mirror Sites
By default, models are downloaded from a global CDN we maintain. However, in some regions the download speed
can occasionally be slow. If you happen to be in one of those regions, you can find some third-party mirror sites
on our [bbs](https://bbs.hankcs.com/). When you find a working URL, say
[https://ftp.hankcs.com/hanlp/](https://ftp.hankcs.com/hanlp/), set the `HANLP_URL`
environment variable and HanLP will pick it up at the next startup.
```bash
export HANLP_URL=https://ftp.hankcs.com/hanlp/
```
## Control Verbosity
By default, HanLP prints progress messages to the console when you load a model. If you want to silence them, use the
following environment variable.
```bash
export HANLP_VERBOSE=0
```
================================================
FILE: docs/contributing.md
================================================
# Contributing Guide
Thank you for being interested in contributing to `HanLP`! You
are awesome ✨.
This guide describes our conventions around coding style, the pull request workflow, commit messages and more.
It also contains information to help you get started with development on this
project.
## Development
### Set-up
Get the source code of this project using git:
```bash
git clone https://github.com/hankcs/HanLP --branch master
cd HanLP
pip install -e plugins/hanlp_trie
pip install -e plugins/hanlp_common
pip install -e plugins/hanlp_restful
pip install -e .
```
To work on this project, you need Python 3.6 or newer.
### Running Tests
This project has a test suite to ensure certain important APIs work properly. The tests can be run using:
```bash
python -m unittest discover ./tests
```
```{tip}
It's hard to cover every API, especially those of deep learning models, due to the limited computational resources of CI. However, we suggest that at least all inference APIs be tested.
```
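If you add a new inference API, a test along the following lines is welcome. This is a minimal sketch in the style of the existing suite; the pretrained identifier below is only illustrative:
```python
import unittest

import hanlp


class TestTokenizer(unittest.TestCase):
    def test_tokenize(self):
        # Load a small pretrained tokenizer and check its output.
        tok = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH)
        self.assertEqual(tok('商品和服务'), ['商品', '和', '服务'])


if __name__ == '__main__':
    unittest.main()
```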
## Repository Structure
This repository is split into a few critical folders:
hanlp/
: The HanLP core package, containing the Python code.
plugins/
: Contains code shared across several individual packages, as well as non-core APIs.
docs/
: The documentation for HanLP, mostly in Markdown format.
: The build configuration is contained in `conf.py`.
tests/
: Testing infrastructure that uses `unittest` to ensure the output of each API is what we expect it to be.
.github/
: Contains continuous-integration (CI) workflows that run on commits/PRs to the GitHub repository.
================================================
FILE: docs/data_format.md
================================================
---
jupytext:
formats: ipynb,md:myst
text_representation:
extension: .md
format_name: myst
format_version: '0.8'
jupytext_version: 1.4.2
kernelspec:
display_name: Python 3
language: python
name: python3
---
# Data Format
## Input Format
### RESTful Input
#### Definition
To make a RESTful call, one needs to send a `json` HTTP POST request to the server, which contains at least a `text`
field or a `tokens` field. The input to the RESTful API is very flexible. It can be in one of the following 3 formats:
1. It can be a document of raw `str` filled into `text`. The server will split it into sentences.
1. It can be a `list` of sentences, each sentence is a raw `str`, filled into `text`.
1. It can be a `list` of tokenized sentences, each sentence is a list of `str` typed tokens, filled into `tokens`.
```{eval-rst}
Additionally, fine-grained controls are performed with the arguments defined in
:meth:`hanlp_restful.HanLPClient.parse`.
```
#### Examples
```shell script
curl -X 'POST' \
'https://hanlp.hankcs.com/api/parse' \
-H 'accept: application/json' \
-H 'Content-Type: application/json' \
-d '{
"language": "zh",
"text": "HanLP为生产环境带来次世代最先进的多语种NLP技术。晓美焰来到北京参观自然语义科技公司。"
}'
```
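The same request can be issued from Python; here is a minimal sketch using the third-party `requests` library (production usage requires an auth key, omitted here just like in the `curl` example above):
```python
import requests

response = requests.post(
    'https://hanlp.hankcs.com/api/parse',
    headers={'accept': 'application/json'},
    json={
        'language': 'zh',
        'text': 'HanLP为生产环境带来次世代最先进的多语种NLP技术。晓美焰来到北京参观自然语义科技公司。',
    },
)
print(response.json())  # a dict keyed by task names; see Output Format below
```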
### Model Input
````{margin} **How about training inputs?**
```{seealso}
We mostly follow the conventional file format of each NLP task instead of re-inventing them. Thus, we use `.tsv` for tagging and
`.conllu` for parsing etc. For more details, refer to [datasets](https://hanlp.hankcs.com/docs/api/hanlp/datasets/index.html).
```
````
The input format to models is specified per model and per task. Generally speaking, if a model has no tokenizer built in, then its input is
a sentence in `list[str]` form (a list of tokens), or multiple such sentences nested in a `list`.
If a model has a tokenizer built in, each sentence is in raw `str` form.
Additionally, you can pass `skip_tasks='tok*'` to ask the model to use your tokenized inputs instead of tokenizing
them, in which case each of your sentences needs to be in `list[str]` form, as if there were no tokenizer.
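The following sketch contrasts these two input forms; the multi-task pretrained identifier is an assumption for illustration:
```python
import hanlp

HanLP = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH)

# With the built-in tokenizer: each sentence is a raw str.
HanLP('晓美焰来到北京立方庭参观自然语义科技公司。')

# With your own tokens: each sentence is a list[str] and 'tok*' is skipped.
HanLP([['晓美焰', '来到', '北京立方庭', '参观', '自然语义科技公司', '。']],
      skip_tasks='tok*')
```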
```{eval-rst}
For any model, its input is at the sentence level, which means you have to split a document into sentences beforehand.
You may want to try :class:`~hanlp.components.eos.ngram.NgramSentenceBoundaryDetector` for sentence splitting.
```
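As a minimal sketch, a pretrained sentence splitter can be loaded as below; the identifier `UD_CTB_EOS_MUL` is an assumption based on the `hanlp.pretrained.eos` catalog:
```python
import hanlp

# Split a document into sentences before feeding them to a model.
eos = hanlp.load(hanlp.pretrained.eos.UD_CTB_EOS_MUL)
print(eos('HanLP为生产环境带来次世代最先进的多语种NLP技术。晓美焰来到北京立方庭参观自然语义科技公司。'))
```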
## Output Format
```{eval-rst}
The outputs of both :class:`~hanlp_restful.HanLPClient` and
:class:`~hanlp.components.mtl.multi_task_learning.MultiTaskLearning` are unified as the same
:class:`~hanlp_common.document.Document` format.
```
For example, the following RESTful code will output such an instance.
```{code-cell} ipython3
:tags: [output_scroll]
from hanlp_restful import HanLPClient
HanLP = HanLPClient('https://hanlp.hankcs.com/api', auth=None) # Fill in your auth
print(HanLP('2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。晓美焰来到北京立方庭参观自然语义科技公司。'))
```
The output above is represented as a `json` dictionary where each key is a task name and its value is
the output of the corresponding task.
For each output, if it's a nested `list` then it contains multiple sentences; otherwise, it's just one single sentence.
We adopt the following naming convention for NLP tasks, each of which consists of 3 letters.
````{margin} **How about annotations?**
```{seealso}
Each NLP task can exploit multiple datasets with their annotations, see our [annotations](annotations/index) for details.
```
````
### Naming Convention
| key | Task | Chinese |
| ---- | ------------------------------------------------------------ | ------------ |
| tok | Tokenization. Each element is a token. | 分词 |
| pos | Part-of-Speech Tagging. Each element is a tag. | 词性标注 |
| lem | Lemmatization. Each element is a lemma. | 词干提取 |
| fea | Features of Universal Dependencies. Each element is a feature. | 词法语法特征 |
| ner | Named Entity Recognition. Each element is a tuple of `(entity, type, begin, end)`, where `end`s are exclusive offsets. | 命名实体识别 |
| dep | Dependency Parsing. Each element is a tuple of `(head, relation)` where `head` starts with index `0` (which is `ROOT`). | 依存句法分析 |
| con | Constituency Parsing. Each list is a bracketed constituent. | 短语成分分析 |
| srl | Semantic Role Labeling. Similar to `ner`, each element is a tuple of `(arg/pred, label, begin, end)`, where the predicate is labeled as `PRED`. | 语义角色标注 |
| sdp | Semantic Dependency Parsing. Similar to `dep`, however each token can have any number (including zero) of heads and corresponding relations. | 语义依存分析 |
| amr | Abstract Meaning Representation. Each AMR graph is represented as list of logical triples. See [AMR guidelines](https://github.com/amrisi/amr-guidelines/blob/master/amr.md#example). | 抽象意义表示 |
When there are multiple models performing the same task, their keys are appended with a secondary identifier.
For example, `tok/fine` and `tok/coarse` denote a fine-grained tokenization model and a coarse-grained one, respectively.
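A sketch of selecting the output of a particular model by its key, reusing the `HanLPClient` instance from above and assuming the service exposes both tokenization granularities:
```python
# Request both tokenizers explicitly, then index the Document (dict-like)
# by the task keys defined above.
doc = HanLP('晓美焰来到北京立方庭参观自然语义科技公司。',
            tasks=['tok/fine', 'tok/coarse'])
print(doc['tok/fine'])    # fine-grained tokens
print(doc['tok/coarse'])  # coarse-grained tokens
```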
================================================
FILE: docs/index.md
================================================
# HanLP: Han Language Processing
[](https://github.com/hankcs/HanLP/stargazers) [](https://github.com/hankcs/HanLP/network)  [](https://pepy.tech/project/HanLP) [](https://github.com/hankcs/HanLP/blob/master/LICENSE) [](https://colab.research.google.com/drive/1KPX6t1y36TOzRIeB4Kt3uJ1twuj6WuFv?usp=sharing)
The multilingual NLP library for researchers and companies, built on PyTorch and TensorFlow 2.x, for advancing
state-of-the-art deep learning techniques in both academia and industry. HanLP was designed from day one to be
efficient, user-friendly and extendable. It comes with pretrained models for various human languages
including English, Chinese, Japanese and many others.
## Tutorials
```{toctree}
:maxdepth: 1
:caption: Introduction
tutorial
install
configure
data_format
annotations/index
contributing
Live Demo
```
## Python API
```{toctree}
:caption: Python API
:maxdepth: 2
api/hanlp/index
api/common/index
api/restful
api/trie/index
```
## Java API
```{toctree}
:maxdepth: 1
:caption: Java API
1.x API
api/restful_java
```
## Golang API
```{toctree}
:maxdepth: 1
:caption: Golang API
api/restful_golang
```
## References
```{toctree}
:caption: References
:maxdepth: 2
references
```
## Acknowledgements
HanLPv2.1 is heavily inspired by [AllenNLP](https://allennlp.org/) and [SuPar](https://pypi.org/project/supar/).
================================================
FILE: docs/install.md
================================================
# Install
```{figure} _static/install-versions.svg
---
width: 100%
figclass: caption
alt: HanLP versions
name: hanlp-versions
---
Choose your HanLP version
```
## Install RESTful Packages
[](https://pepy.tech/project/hanlp-restful) [](https://pepy.tech/project/hanlp-restful) [](https://pepy.tech/project/hanlp-restful)
```{eval-rst}
.. margin:: **Beginners Attention**
.. Hint:: New to NLP? Just install RESTful packages and call :meth:`~hanlp_restful.HanLPClient.parse` without pain.
```
For beginners, the recommended RESTful packages are easier to start with.
The only requirement is [an auth key](https://bbs.hankcs.com/t/apply-for-free-hanlp-restful-apis/3178).
We officially released the following language bindings:
### Python
```shell script
pip install hanlp_restful
```
### Java
See [Java instructions](https://hanlp.hankcs.com/docs/api/restful_java.html).
### Golang
See [Golang instructions](https://hanlp.hankcs.com/docs/api/restful_golang.html).
## Install Native Package
[](https://pepy.tech/project/hanlp) [](https://pepy.tech/project/hanlp) [](https://pepy.tech/project/hanlp)
The native package running locally can be installed via pip.
````{margin} **Install from Source**
```{note}
See [developer guideline](https://hanlp.hankcs.com/docs/contributing.html#development).
```
````
```shell script
pip install hanlp
```
HanLP requires Python 3.6 or later. GPU/TPU is suggested but not mandatory. Depending on your preference, HanLP offers the following flavors:
````{margin} **Windows Support**
```{note}
Installation on Windows is **perfectly** supported. No need to install Microsoft Visual C++ Build Tools anymore.
```
````
````{margin} **Apple Silicon**
```{note}
HanLP also perfectly supports accelerating on Apple Silicon M1 chips, see [tutorial](https://www.hankcs.com/nlp/hanlp-official-m1-support.html).
```
````
| Flavor | Description |
| ------- | ------------------------------------------------------------ |
| default | This installs the default version which delivers the most commonly used functionalities. However, some heavy dependencies like TensorFlow are not installed. |
| tf | This installs TensorFlow and fastText. |
| amr | To support Abstract Meaning Representation (AMR) models, this installs AMR related dependencies like `penman`. |
| full | For experts who seek to maximize the efficiency via TensorFlow and C++ extensions, `pip install hanlp[full]` installs all the above dependencies. |
## Install Models
In short, you don't need to manually install any model. Instead, they are automatically downloaded to a directory called [`HANLP_HOME`](https://hanlp.hankcs.com/docs/configure.html#customize-hanlp-home) when you call `hanlp.load`.
Occasionally, some errors might occur the first time you load a model, in which case you can refer to the following tips.
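For example, the first call below fetches the model into `HANLP_HOME`, and later calls reuse the local cache; a sketch with an illustrative pretrained identifier:
```python
import hanlp

# Downloads on the first call, loads from HANLP_HOME afterwards.
tok = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH)
print(tok('商品和服务'))
```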
### Download Error
#### HanLP Models
If the auto-download of a HanLP model fails, you can try one of the following:
1. Retry as our file server might be busy serving users from all over the world.
1. Follow the message on your terminal, which often guides you to manually download a `zip` file to a particular path.
1. Use a [mirror site](https://hanlp.hankcs.com/docs/configure.html#use-mirror-sites) which could be faster and more stable in your region.
#### Hugging Face 🤗 Transformers Models
If the auto-download of a Hugging Face 🤗 Transformers model fails, e.g., the following exception is thrown:
```bash
lib/python3.8/site-packages/transformers/file_utils.py", line 2102, in get_from_cache
raise ValueError(
ValueError: Connection error, and we cannot find the requested files in the cached
path. Please try again or make sure your Internet connection is on.
```
You can either:
1. Retry as the Internet is quite unstable in some regions (e.g., China).
2. Force Hugging Face 🤗 Transformers to use cached models instead of checking updates from the Internet **if you have ever successfully loaded it before**, by setting the following environment variable:
```bash
export TRANSFORMERS_OFFLINE=1
```
### Server without Internet
If your server has no Internet access at all, just debug your code on your local PC, then copy the following directories to your server via a USB disk or similar.
1. `~/.hanlp`: the home directory for HanLP models.
1. `~/.cache/huggingface`: the home directory for Hugging Face 🤗 Transformers.
### Import Error
Some TensorFlow/fastText models will ask you to install the missing TensorFlow/fastText modules, in which case you'll need to install the full version:
```shell script
pip install hanlp[full]
```
```{danger}
NEVER install third-party packages (TensorFlow/fastText etc.) by yourself, as higher or lower versions of third-party packages have not been tested and might not work properly.
```
================================================
FILE: docs/references.bib
================================================
%% This BibTeX bibliography file was created using BibDesk.
%% https://bibdesk.sourceforge.io/
%% Created for hankcs at 2022-12-07 15:02:16 -0500
%% Saved with string encoding Unicode (UTF-8)
@inproceedings{bai-etal-2022-graph,
address = {Dublin, Ireland},
author = {Bai, Xuefeng and Chen, Yulong and Zhang, Yue},
booktitle = {Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
date-added = {2022-12-07 15:02:15 -0500},
date-modified = {2022-12-07 15:02:15 -0500},
month = may,
pages = {6001--6015},
publisher = {Association for Computational Linguistics},
title = {Graph Pre-training for {AMR} Parsing and Generation},
url = {https://aclanthology.org/2022.acl-long.415},
year = {2022},
bdsk-url-1 = {https://aclanthology.org/2022.acl-long.415}}
@inproceedings{wang-etal-2021-minilmv2,
address = {Online},
author = {Wang, Wenhui and Bao, Hangbo and Huang, Shaohan and Dong, Li and Wei, Furu},
booktitle = {Findings of the Association for Computational Linguistics: ACL-IJCNLP 2021},
date-added = {2022-06-14 20:10:18 -0400},
date-modified = {2022-06-14 20:10:18 -0400},
doi = {10.18653/v1/2021.findings-acl.188},
month = aug,
pages = {2140--2151},
publisher = {Association for Computational Linguistics},
title = {{M}ini{LM}v2: Multi-Head Self-Attention Relation Distillation for Compressing Pretrained Transformers},
url = {https://aclanthology.org/2021.findings-acl.188},
year = {2021},
bdsk-url-1 = {https://aclanthology.org/2021.findings-acl.188},
bdsk-url-2 = {https://doi.org/10.18653/v1/2021.findings-acl.188}}
@article{zhang2021mengzi,
author = {Zhang, Zhuosheng and Zhang, Hanqing and Chen, Keming and Guo, Yuhang and Hua, Jingyun and Wang, Yulong and Zhou, Ming},
date-added = {2022-04-15 10:32:14 -0400},
date-modified = {2022-04-15 10:32:14 -0400},
journal = {arXiv preprint arXiv:2110.06696},
title = {Mengzi: Towards Lightweight yet Ingenious Pre-trained Models for Chinese},
year = {2021}}
@inproceedings{samuel-straka-2020-ufal,
abstract = {We present PERIN, a novel permutation-invariant approach to sentence-to-graph semantic parsing. PERIN is a versatile, cross-framework and language independent architecture for universal modeling of semantic structures. Our system participated in the CoNLL 2020 shared task, Cross-Framework Meaning Representation Parsing (MRP 2020), where it was evaluated on five different frameworks (AMR, DRG, EDS, PTG and UCCA) across four languages. PERIN was one of the winners of the shared task. The source code and pretrained models are available at http://www.github.com/ufal/perin.},
address = {Online},
author = {Samuel, David and Straka, Milan},
booktitle = {Proceedings of the CoNLL 2020 Shared Task: Cross-Framework Meaning Representation Parsing},
date-added = {2022-04-12 22:36:23 -0400},
date-modified = {2022-04-12 22:36:23 -0400},
doi = {10.18653/v1/2020.conll-shared.5},
month = nov,
pages = {53--64},
publisher = {Association for Computational Linguistics},
title = {{{\'U}FAL} at {MRP} 2020: Permutation-invariant Semantic Parsing in {PERIN}},
url = {https://aclanthology.org/2020.conll-shared.5},
year = {2020},
bdsk-url-1 = {https://aclanthology.org/2020.conll-shared.5},
bdsk-url-2 = {https://doi.org/10.18653/v1/2020.conll-shared.5}}
@inproceedings{qiu-etal-2014-multi,
address = {Dublin, Ireland},
author = {Qiu, Likun and Zhang, Yue and Jin, Peng and Wang, Houfeng},
booktitle = {Proceedings of {COLING} 2014, the 25th International Conference on Computational Linguistics: Technical Papers},
date-added = {2022-02-15 04:42:58 -0500},
date-modified = {2022-02-15 04:42:58 -0500},
month = aug,
pages = {257--268},
publisher = {Dublin City University and Association for Computational Linguistics},
title = {Multi-view {C}hinese Treebanking},
url = {https://aclanthology.org/C14-1026},
year = {2014},
bdsk-url-1 = {https://aclanthology.org/C14-1026}}
@inproceedings{li-etal-2018-analogical,
abstract = {Analogical reasoning is effective in capturing linguistic regularities. This paper proposes an analogical reasoning task on Chinese. After delving into Chinese lexical knowledge, we sketch 68 implicit morphological relations and 28 explicit semantic relations. A big and balanced dataset CA8 is then built for this task, including 17813 questions. Furthermore, we systematically explore the influences of vector representations, context features, and corpora on analogical reasoning. With the experiments, CA8 is proved to be a reliable benchmark for evaluating Chinese word embeddings.},
address = {Melbourne, Australia},
author = {Li, Shen and Zhao, Zhe and Hu, Renfen and Li, Wensi and Liu, Tao and Du, Xiaoyong},
booktitle = {Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)},
date-added = {2022-01-30 22:52:52 -0500},
date-modified = {2022-01-30 22:52:52 -0500},
doi = {10.18653/v1/P18-2023},
month = jul,
pages = {138--143},
publisher = {Association for Computational Linguistics},
title = {Analogical Reasoning on {C}hinese Morphological and Semantic Relations},
url = {https://aclanthology.org/P18-2023},
year = {2018},
bdsk-url-1 = {https://aclanthology.org/P18-2023},
bdsk-url-2 = {https://doi.org/10.18653/v1/P18-2023}}
@inproceedings{NIPS2013_9aa42b31,
author = {Mikolov, Tomas and Sutskever, Ilya and Chen, Kai and Corrado, Greg S and Dean, Jeff},
booktitle = {Advances in Neural Information Processing Systems},
date-added = {2022-01-30 18:17:28 -0500},
date-modified = {2022-01-30 18:17:28 -0500},
editor = {C. J. C. Burges and L. Bottou and M. Welling and Z. Ghahramani and K. Q. Weinberger},
publisher = {Curran Associates, Inc.},
title = {Distributed Representations of Words and Phrases and their Compositionality},
url = {https://proceedings.neurips.cc/paper/2013/file/9aa42b31882ec039965f3c4923ce901b-Paper.pdf},
volume = {26},
year = {2013},
bdsk-url-1 = {https://proceedings.neurips.cc/paper/2013/file/9aa42b31882ec039965f3c4923ce901b-Paper.pdf}}
@inproceedings{bevilacqua-etal-2021-one,
author = {Bevilacqua, Michele and Blloshmi, Rexhina and Navigli, Roberto},
booktitle = {Proceedings of AAAI},
date-added = {2022-01-25 11:58:03 -0500},
date-modified = {2022-01-25 11:58:03 -0500},
title = {One {SPRING} to Rule Them Both: {S}ymmetric {AMR} Semantic Parsing and Generation without a Complex Pipeline},
year = {2021}}
@inproceedings{lewis-etal-2020-bart,
abstract = {We present BART, a denoising autoencoder for pretraining sequence-to-sequence models. BART is trained by (1) corrupting text with an arbitrary noising function, and (2) learning a model to reconstruct the original text. It uses a standard Tranformer-based neural machine translation architecture which, despite its simplicity, can be seen as generalizing BERT (due to the bidirectional encoder), GPT (with the left-to-right decoder), and other recent pretraining schemes. We evaluate a number of noising approaches, finding the best performance by both randomly shuffling the order of sentences and using a novel in-filling scheme, where spans of text are replaced with a single mask token. BART is particularly effective when fine tuned for text generation but also works well for comprehension tasks. It matches the performance of RoBERTa on GLUE and SQuAD, and achieves new state-of-the-art results on a range of abstractive dialogue, question answering, and summarization tasks, with gains of up to 3.5 ROUGE. BART also provides a 1.1 BLEU increase over a back-translation system for machine translation, with only target language pretraining. We also replicate other pretraining schemes within the BART framework, to understand their effect on end-task performance.},
address = {Online},
author = {Lewis, Mike and Liu, Yinhan and Goyal, Naman and Ghazvininejad, Marjan and Mohamed, Abdelrahman and Levy, Omer and Stoyanov, Veselin and Zettlemoyer, Luke},
booktitle = {Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics},
date-added = {2022-01-25 11:56:10 -0500},
date-modified = {2022-01-25 11:56:10 -0500},
doi = {10.18653/v1/2020.acl-main.703},
month = jul,
pages = {7871--7880},
publisher = {Association for Computational Linguistics},
title = {{BART}: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension},
url = {https://www.aclweb.org/anthology/2020.acl-main.703},
year = {2020},
bdsk-url-1 = {https://www.aclweb.org/anthology/2020.acl-main.703},
bdsk-url-2 = {https://doi.org/10.18653/v1/2020.acl-main.703}}
@article{knight2014abstract,
author = {Knight, Kevin and Banarescu, Laura and Bonial, Claire and Georgescu, Madalina and Griffitt, Kira and Hermjakob, Ulf and Marcu, Daniel and Palmer, Martha and Schneider, Nathan},
date-added = {2022-01-25 11:54:11 -0500},
date-modified = {2022-01-25 11:54:11 -0500},
journal = {Web download},
title = {Abstract meaning representation (amr) annotation release 1.0},
year = {2014}}
@inproceedings{he-choi-2021-stem,
abstract = {Multi-task learning with transformer encoders (MTL) has emerged as a powerful technique to improve performance on closely-related tasks for both accuracy and efficiency while a question still remains whether or not it would perform as well on tasks that are distinct in nature. We first present MTL results on five NLP tasks, POS, NER, DEP, CON, and SRL, and depict its deficiency over single-task learning. We then conduct an extensive pruning analysis to show that a certain set of attention heads get claimed by most tasks during MTL, who interfere with one another to fine-tune those heads for their own objectives. Based on this finding, we propose the Stem Cell Hypothesis to reveal the existence of attention heads naturally talented for many tasks that cannot be jointly trained to create adequate embeddings for all of those tasks. Finally, we design novel parameter-free probes to justify our hypothesis and demonstrate how attention heads are transformed across the five tasks during MTL through label analysis.},
address = {Online and Punta Cana, Dominican Republic},
author = {He, Han and Choi, Jinho D.},
booktitle = {Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing},
date-added = {2021-11-06 18:24:44 -0400},
date-modified = {2021-11-06 18:24:44 -0400},
month = nov,
pages = {5555--5577},
publisher = {Association for Computational Linguistics},
title = {The Stem Cell Hypothesis: Dilemma behind Multi-Task Learning with Transformer Encoders},
url = {https://aclanthology.org/2021.emnlp-main.451},
year = {2021},
bdsk-url-1 = {https://aclanthology.org/2021.emnlp-main.451}}
@inproceedings{he-choi-2019,
abstract = {This paper presents new state-of-the-art models for three tasks, part-of-speech tagging, syntactic parsing, and semantic parsing, using the cutting-edge contextualized embedding framework known as BERT. For each task, we first replicate and simplify the current state-of-the-art approach to enhance its model efficiency. We then evaluate our simplified approaches on those three tasks using token embeddings generated by BERT. 12 datasets in both English and Chinese are used for our experiments. The BERT models outperform the previously best-performing models by 2.5\% on average (7.5\% for the most significant case). All models and source codes are available in public so that researchers can improve upon and utilize them to establish strong baselines for the next decade.},
author = {Han He and Jinho Choi},
booktitle = {The Thirty-Third International Flairs Conference},
conference = {Florida Artificial Intelligence Research Society Conference},
date-added = {2021-10-16 21:09:00 -0400},
date-modified = {2021-10-16 21:09:00 -0400},
keywords = {part-of-speech tagging, syntactic parsing, semantic parsing, Transformer, BERT},
title = {Establishing Strong Baselines for the New Decade: Sequence Tagging, Syntactic and Semantic Parsing with BERT},
url = {https://www.aaai.org/ocs/index.php/FLAIRS/FLAIRS20/paper/view/18438},
year = {2020},
bdsk-url-1 = {https://www.aaai.org/ocs/index.php/FLAIRS/FLAIRS20/paper/view/18438}}
@inproceedings{xiao-etal-2021-ernie,
abstract = {Coarse-grained linguistic information, such as named entities or phrases, facilitates adequately representation learning in pre-training. Previous works mainly focus on extending the objective of BERT{'}s Masked Language Modeling (MLM) from masking individual tokens to contiguous sequences of n tokens. We argue that such contiguously masking method neglects to model the intra-dependencies and inter-relation of coarse-grained linguistic information. As an alternative, we propose ERNIE-Gram, an explicitly n-gram masking method to enhance the integration of coarse-grained information into pre-training. In ERNIE-Gram, n-grams are masked and predicted directly using explicit n-gram identities rather than contiguous sequences of n tokens. Furthermore, ERNIE-Gram employs a generator model to sample plausible n-gram identities as optional n-gram masks and predict them in both coarse-grained and fine-grained manners to enable comprehensive n-gram prediction and relation modeling. We pre-train ERNIE-Gram on English and Chinese text corpora and fine-tune on 19 downstream tasks. Experimental results show that ERNIE-Gram outperforms previous pre-training models like XLNet and RoBERTa by a large margin, and achieves comparable results with state-of-the-art methods. The source codes and pre-trained models have been released at https://github.com/PaddlePaddle/ERNIE.},
address = {Online},
author = {Xiao, Dongling and Li, Yu-Kun and Zhang, Han and Sun, Yu and Tian, Hao and Wu, Hua and Wang, Haifeng},
booktitle = {Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},
date-added = {2021-09-04 14:09:52 -0400},
date-modified = {2021-09-04 14:09:52 -0400},
doi = {10.18653/v1/2021.naacl-main.136},
month = jun,
pages = {1702--1715},
publisher = {Association for Computational Linguistics},
title = {{ERNIE}-Gram: Pre-Training with Explicitly N-Gram Masked Language Modeling for Natural Language Understanding},
url = {https://aclanthology.org/2021.naacl-main.136},
year = {2021},
bdsk-url-1 = {https://aclanthology.org/2021.naacl-main.136},
bdsk-url-2 = {https://doi.org/10.18653/v1/2021.naacl-main.136}}
@inproceedings{akbik-etal-2018-contextual,
abstract = {Recent advances in language modeling using recurrent neural networks have made it viable to model language as distributions over characters. By learning to predict the next character on the basis of previous characters, such models have been shown to automatically internalize linguistic concepts such as words, sentences, subclauses and even sentiment. In this paper, we propose to leverage the internal states of a trained character language model to produce a novel type of word embedding which we refer to as contextual string embeddings. Our proposed embeddings have the distinct properties that they (a) are trained without any explicit notion of words and thus fundamentally model words as sequences of characters, and (b) are contextualized by their surrounding text, meaning that the same word will have different embeddings depending on its contextual use. We conduct a comparative evaluation against previous embeddings and find that our embeddings are highly useful for downstream tasks: across four classic sequence labeling tasks we consistently outperform the previous state-of-the-art. In particular, we significantly outperform previous work on English and German named entity recognition (NER), allowing us to report new state-of-the-art F1-scores on the CoNLL03 shared task. We release all code and pre-trained language models in a simple-to-use framework to the research community, to enable reproduction of these experiments and application of our proposed embeddings to other tasks: https://github.com/zalandoresearch/flair},
address = {Santa Fe, New Mexico, USA},
author = {Akbik, Alan and Blythe, Duncan and Vollgraf, Roland},
booktitle = {Proceedings of the 27th International Conference on Computational Linguistics},
date-added = {2021-09-01 13:10:59 -0400},
date-modified = {2021-09-01 13:10:59 -0400},
month = aug,
pages = {1638--1649},
publisher = {Association for Computational Linguistics},
title = {Contextual String Embeddings for Sequence Labeling},
url = {https://aclanthology.org/C18-1139},
year = {2018},
bdsk-url-1 = {https://aclanthology.org/C18-1139}}
@inproceedings{he-choi-2021-levi,
abstract = {Coupled with biaffine decoders, transformers have been effectively adapted to text-to-graph transduction and achieved state-of-the-art performance on AMR parsing. Many prior works, however, rely on the biaffine decoder for either or both arc and label predictions although most features used by the decoder may be learned by the transformer already. This paper presents a novel approach to AMR parsing by combining heterogeneous data (tokens, concepts, labels) as one input to a transformer to learn attention, and use only attention matrices from the transformer to predict all elements in AMR graphs (concepts, arcs, labels). Although our models use significantly fewer parameters than the previous state-of-the-art graph parser, they show similar or better accuracy on AMR 2.0 and 3.0.},
address = {Online},
author = {He, Han and Choi, Jinho D.},
booktitle = {Proceedings of the 17th International Conference on Parsing Technologies and the IWPT 2021 Shared Task on Parsing into Enhanced Universal Dependencies (IWPT 2021)},
date-added = {2021-09-01 13:09:14 -0400},
date-modified = {2021-09-01 13:09:14 -0400},
doi = {10.18653/v1/2021.iwpt-1.5},
month = aug,
pages = {50--57},
publisher = {Association for Computational Linguistics},
title = {Levi Graph {AMR} Parser using Heterogeneous Attention},
url = {https://aclanthology.org/2021.iwpt-1.5},
year = {2021},
bdsk-url-1 = {https://aclanthology.org/2021.iwpt-1.5},
bdsk-url-2 = {https://doi.org/10.18653/v1/2021.iwpt-1.5}}
@inproceedings{conneau-etal-2020-unsupervised,
abstract = {This paper shows that pretraining multilingual language models at scale leads to significant performance gains for a wide range of cross-lingual transfer tasks. We train a Transformer-based masked language model on one hundred languages, using more than two terabytes of filtered CommonCrawl data. Our model, dubbed XLM-R, significantly outperforms multilingual BERT (mBERT) on a variety of cross-lingual benchmarks, including +14.6{\%} average accuracy on XNLI, +13{\%} average F1 score on MLQA, and +2.4{\%} F1 score on NER. XLM-R performs particularly well on low-resource languages, improving 15.7{\%} in XNLI accuracy for Swahili and 11.4{\%} for Urdu over previous XLM models. We also present a detailed empirical analysis of the key factors that are required to achieve these gains, including the trade-offs between (1) positive transfer and capacity dilution and (2) the performance of high and low resource languages at scale. Finally, we show, for the first time, the possibility of multilingual modeling without sacrificing per-language performance; XLM-R is very competitive with strong monolingual models on the GLUE and XNLI benchmarks. We will make our code and models publicly available.},
address = {Online},
author = {Conneau, Alexis and Khandelwal, Kartikay and Goyal, Naman and Chaudhary, Vishrav and Wenzek, Guillaume and Guzm{\'a}n, Francisco and Grave, Edouard and Ott, Myle and Zettlemoyer, Luke and Stoyanov, Veselin},
booktitle = {Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics},
date-added = {2021-09-01 12:41:50 -0400},
date-modified = {2021-09-01 12:41:50 -0400},
doi = {10.18653/v1/2020.acl-main.747},
month = jul,
pages = {8440--8451},
publisher = {Association for Computational Linguistics},
title = {Unsupervised Cross-lingual Representation Learning at Scale},
url = {https://aclanthology.org/2020.acl-main.747},
year = {2020},
bdsk-url-1 = {https://aclanthology.org/2020.acl-main.747},
bdsk-url-2 = {https://doi.org/10.18653/v1/2020.acl-main.747}}
@inproceedings{xue-etal-2021-mt5,
abstract = {The recent {``}Text-to-Text Transfer Transformer{''} (T5) leveraged a unified text-to-text format and scale to attain state-of-the-art results on a wide variety of English-language NLP tasks. In this paper, we introduce mT5, a multilingual variant of T5 that was pre-trained on a new Common Crawl-based dataset covering 101 languages. We detail the design and modified training of mT5 and demonstrate its state-of-the-art performance on many multilingual benchmarks. We also describe a simple technique to prevent {``}accidental translation{''} in the zero-shot setting, where a generative model chooses to (partially) translate its prediction into the wrong language. All of the code and model checkpoints used in this work are publicly available.},
address = {Online},
author = {Xue, Linting and Constant, Noah and Roberts, Adam and Kale, Mihir and Al-Rfou, Rami and Siddhant, Aditya and Barua, Aditya and Raffel, Colin},
booktitle = {Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},
date-added = {2021-09-01 12:40:34 -0400},
date-modified = {2021-09-01 12:40:34 -0400},
doi = {10.18653/v1/2021.naacl-main.41},
month = jun,
pages = {483--498},
publisher = {Association for Computational Linguistics},
title = {m{T}5: A Massively Multilingual Pre-trained Text-to-Text Transformer},
url = {https://aclanthology.org/2021.naacl-main.41},
year = {2021},
bdsk-url-1 = {https://aclanthology.org/2021.naacl-main.41},
bdsk-url-2 = {https://doi.org/10.18653/v1/2021.naacl-main.41}}
@misc{https://doi.org/10.35111/gvd0-xk91,
author = {Xue, Nianwen and Zhang, Xiuhong and Jiang, Zixin and Palmer, Martha and Xia, Fei and Chiou, Fu-Dong and Chang, Meiyu},
date-added = {2021-09-01 12:32:05 -0400},
date-modified = {2021-09-01 12:36:22 -0400},
doi = {10.35111/GVD0-XK91},
publisher = {Linguistic Data Consortium},
title = {Chinese Treebank 9.0},
url = {https://catalog.ldc.upenn.edu/LDC2016T13},
year = {2016},
bdsk-url-1 = {https://catalog.ldc.upenn.edu/LDC2016T13},
bdsk-url-2 = {https://doi.org/10.35111/GVD0-XK91}}
@inproceedings{clark2020electra,
author = {Kevin Clark and Minh-Thang Luong and Quoc V. Le and Christopher D. Manning},
booktitle = {ICLR},
date-added = {2021-08-07 15:53:27 -0400},
date-modified = {2021-08-07 15:53:27 -0400},
title = {{ELECTRA}: Pre-training Text Encoders as Discriminators Rather Than Generators},
url = {https://openreview.net/pdf?id=r1xMH1BtvB},
year = {2020},
bdsk-url-1 = {https://openreview.net/pdf?id=r1xMH1BtvB}}
@inproceedings{chang-etal-2009-discriminative,
address = {Boulder, Colorado},
author = {Chang, Pi-Chuan and Tseng, Huihsin and Jurafsky, Dan and Manning, Christopher D.},
booktitle = {Proceedings of the Third Workshop on Syntax and Structure in Statistical Translation ({SSST}-3) at {NAACL} {HLT} 2009},
date-added = {2021-03-17 13:37:03 -0400},
date-modified = {2021-03-17 13:37:03 -0400},
month = jun,
pages = {51--59},
publisher = {Association for Computational Linguistics},
title = {Discriminative Reordering with {C}hinese Grammatical Relations Features},
url = {https://www.aclweb.org/anthology/W09-2307},
year = {2009},
bdsk-url-1 = {https://www.aclweb.org/anthology/W09-2307}}
@inproceedings{pennington-etal-2014-glove,
address = {Doha, Qatar},
author = {Pennington, Jeffrey and Socher, Richard and Manning, Christopher},
booktitle = {Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing ({EMNLP})},
date-added = {2020-12-31 15:07:29 -0500},
date-modified = {2020-12-31 15:07:29 -0500},
doi = {10.3115/v1/D14-1162},
month = oct,
pages = {1532--1543},
publisher = {Association for Computational Linguistics},
title = {{G}lo{V}e: Global Vectors for Word Representation},
url = {https://www.aclweb.org/anthology/D14-1162},
year = {2014},
bdsk-url-1 = {https://www.aclweb.org/anthology/D14-1162},
bdsk-url-2 = {https://doi.org/10.3115/v1/D14-1162}}
@incollection{he2018dual,
author = {He, Han and Wu, Lei and Yang, Xiaokun and Yan, Hua and Gao, Zhimin and Feng, Yi and Townsend, George},
booktitle = {Information Technology-New Generations},
date-added = {2020-12-31 15:03:58 -0500},
date-modified = {2020-12-31 15:03:58 -0500},
pages = {421--426},
publisher = {Springer},
title = {Dual long short-term memory networks for sub-character representation learning},
year = {2018}}
@inproceedings{devlin-etal-2019-bert,
abstract = {We introduce a new language representation model called BERT, which stands for Bidirectional Encoder Representations from Transformers. Unlike recent language representation models (Peters et al., 2018a; Radford et al., 2018), BERT is designed to pre-train deep bidirectional representations from unlabeled text by jointly conditioning on both left and right context in all layers. As a result, the pre-trained BERT model can be fine-tuned with just one additional output layer to create state-of-the-art models for a wide range of tasks, such as question answering and language inference, without substantial task-specific architecture modifications. BERT is conceptually simple and empirically powerful. It obtains new state-of-the-art results on eleven natural language processing tasks, including pushing the GLUE score to 80.5 (7.7 point absolute improvement), MultiNLI accuracy to 86.7{\%} (4.6{\%} absolute improvement), SQuAD v1.1 question answering Test F1 to 93.2 (1.5 point absolute improvement) and SQuAD v2.0 Test F1 to 83.1 (5.1 point absolute improvement).},
address = {Minneapolis, Minnesota},
author = {Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina},
booktitle = {Proceedings of the 2019 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)},
date-added = {2020-12-31 14:46:54 -0500},
date-modified = {2020-12-31 14:46:54 -0500},
doi = {10.18653/v1/N19-1423},
month = jun,
pages = {4171--4186},
publisher = {Association for Computational Linguistics},
title = {{BERT}: Pre-training of Deep Bidirectional Transformers for Language Understanding},
url = {https://www.aclweb.org/anthology/N19-1423},
year = {2019},
bdsk-url-1 = {https://www.aclweb.org/anthology/N19-1423},
bdsk-url-2 = {https://doi.org/10.18653/v1/N19-1423}}
@inproceedings{Lan2020ALBERT:,
author = {Zhenzhong Lan and Mingda Chen and Sebastian Goodman and Kevin Gimpel and Piyush Sharma and Radu Soricut},
booktitle = {International Conference on Learning Representations},
date-added = {2020-12-31 14:44:52 -0500},
date-modified = {2020-12-31 14:44:52 -0500},
title = {ALBERT: A Lite BERT for Self-supervised Learning of Language Representations},
url = {https://openreview.net/forum?id=H1eA7AEtvS},
year = {2020},
bdsk-url-1 = {https://openreview.net/forum?id=H1eA7AEtvS}}
@inproceedings{wang-xu-2017-convolutional,
abstract = {Character-based sequence labeling framework is flexible and efficient for Chinese word segmentation (CWS). Recently, many character-based neural models have been applied to CWS. While they obtain good performance, they have two obvious weaknesses. The first is that they heavily rely on manually designed bigram feature, i.e. they are not good at capturing $n$-gram features automatically. The second is that they make no use of full word information. For the first weakness, we propose a convolutional neural model, which is able to capture rich $n$-gram features without any feature engineering. For the second one, we propose an effective approach to integrate the proposed model with word embeddings. We evaluate the model on two benchmark datasets: PKU and MSR. Without any feature engineering, the model obtains competitive performance {---} 95.7{\%} on PKU and 97.3{\%} on MSR. Armed with word embeddings, the model achieves state-of-the-art performance on both datasets {---} 96.5{\%} on PKU and 98.0{\%} on MSR, without using any external labeled resource.},
address = {Taipei, Taiwan},
author = {Wang, Chunqi and Xu, Bo},
booktitle = {Proceedings of the Eighth International Joint Conference on Natural Language Processing (Volume 1: Long Papers)},
date-added = {2020-12-31 14:42:35 -0500},
date-modified = {2020-12-31 14:42:35 -0500},
month = nov,
pages = {163--172},
publisher = {Asian Federation of Natural Language Processing},
title = {Convolutional Neural Network with Word Embeddings for {C}hinese Word Segmentation},
url = {https://www.aclweb.org/anthology/I17-1017},
year = {2017},
bdsk-url-1 = {https://www.aclweb.org/anthology/I17-1017}}
@article{bojanowski2017enriching,
author = {Bojanowski, Piotr and Grave, Edouard and Joulin, Armand and Mikolov, Tomas},
date-added = {2020-12-25 22:31:59 -0500},
date-modified = {2020-12-25 22:31:59 -0500},
issn = {2307-387X},
journal = {Transactions of the Association for Computational Linguistics},
pages = {135--146},
title = {Enriching Word Vectors with Subword Information},
volume = {5},
year = {2017}}
@article{collins-koo-2005-discriminative,
author = {Collins, Michael and Koo, Terry},
date-added = {2020-12-25 17:25:59 -0500},
date-modified = {2020-12-25 17:25:59 -0500},
doi = {10.1162/0891201053630273},
journal = {Computational Linguistics},
number = {1},
pages = {25--70},
title = {Discriminative Reranking for Natural Language Parsing},
url = {https://www.aclweb.org/anthology/J05-1003},
volume = {31},
year = {2005},
bdsk-url-1 = {https://www.aclweb.org/anthology/J05-1003},
bdsk-url-2 = {https://doi.org/10.1162/0891201053630273}}
@inproceedings{zhang-clark-2008-tale,
address = {Honolulu, Hawaii},
author = {Zhang, Yue and Clark, Stephen},
booktitle = {Proceedings of the 2008 Conference on Empirical Methods in Natural Language Processing},
date-added = {2020-12-25 15:10:10 -0500},
date-modified = {2020-12-25 15:10:10 -0500},
month = oct,
pages = {562--571},
publisher = {Association for Computational Linguistics},
title = {A Tale of Two Parsers: {I}nvestigating and Combining Graph-based and Transition-based Dependency Parsing},
url = {https://www.aclweb.org/anthology/D08-1059},
year = {2008},
bdsk-url-1 = {https://www.aclweb.org/anthology/D08-1059}}
@inproceedings{pradhan-etal-2012-conll,
address = {Jeju Island, Korea},
author = {Pradhan, Sameer and Moschitti, Alessandro and Xue, Nianwen and Uryupina, Olga and Zhang, Yuchen},
booktitle = {Joint Conference on {EMNLP} and {C}o{NLL} - Shared Task},
date-added = {2020-12-24 23:42:41 -0500},
date-modified = {2020-12-24 23:42:41 -0500},
month = jul,
pages = {1--40},
publisher = {Association for Computational Linguistics},
title = {{C}o{NLL}-2012 Shared Task: Modeling Multilingual Unrestricted Coreference in {O}nto{N}otes},
url = {https://www.aclweb.org/anthology/W12-4501},
year = {2012},
bdsk-url-1 = {https://www.aclweb.org/anthology/W12-4501}}
@inproceedings{levow-2006-third,
address = {Sydney, Australia},
author = {Levow, Gina-Anne},
booktitle = {Proceedings of the Fifth {SIGHAN} Workshop on {C}hinese Language Processing},
date-added = {2020-12-24 23:21:14 -0500},
date-modified = {2020-12-24 23:21:14 -0500},
month = jul,
pages = {108--117},
publisher = {Association for Computational Linguistics},
title = {The Third International {C}hinese Language Processing Bakeoff: Word Segmentation and Named Entity Recognition},
url = {https://www.aclweb.org/anthology/W06-0115},
year = {2006},
bdsk-url-1 = {https://www.aclweb.org/anthology/W06-0115}}
@inproceedings{tjong-kim-sang-de-meulder-2003-introduction,
author = {Tjong Kim Sang, Erik F. and De Meulder, Fien},
booktitle = {Proceedings of the Seventh Conference on Natural Language Learning at {HLT}-{NAACL} 2003},
date-added = {2020-12-24 23:19:00 -0500},
date-modified = {2020-12-24 23:19:00 -0500},
pages = {142--147},
title = {Introduction to the {C}o{NLL}-2003 Shared Task: Language-Independent Named Entity Recognition},
url = {https://www.aclweb.org/anthology/W03-0419},
year = {2003},
bdsk-url-1 = {https://www.aclweb.org/anthology/W03-0419}}
@inproceedings{koehn2005europarl,
author = {Koehn, Philipp},
booktitle = {MT summit},
date-added = {2020-12-24 23:06:03 -0500},
date-modified = {2020-12-24 23:06:03 -0500},
organization = {Citeseer},
pages = {79--86},
title = {Europarl: A parallel corpus for statistical machine translation},
volume = {5},
year = {2005}}
@inproceedings{Schweter:Ahmed:2019,
author = {Stefan Schweter and Sajawel Ahmed},
booktitle = {Proceedings of the 15th Conference on Natural Language Processing (KONVENS)},
date-added = {2020-12-24 23:03:23 -0500},
date-modified = {2020-12-24 23:03:23 -0500},
location = {Erlangen, Germany},
note = {accepted},
title = {{Deep-EOS: General-Purpose Neural Networks for Sentence Boundary Detection}},
year = 2019}
@incollection{he2019effective,
author = {He, Han and Wu, Lei and Yan, Hua and Gao, Zhimin and Feng, Yi and Townsend, George},
booktitle = {Smart Intelligent Computing and Applications},
date-added = {2020-12-24 19:35:03 -0500},
date-modified = {2020-12-24 19:35:03 -0500},
pages = {133--142},
publisher = {Springer},
title = {Effective neural solution for multi-criteria word segmentation},
year = {2019}}
@inproceedings{dozat2017stanford,
author = {Dozat, Timothy and Qi, Peng and Manning, Christopher D},
booktitle = {Proceedings of the CoNLL 2017 Shared Task: Multilingual Parsing from Raw Text to Universal Dependencies},
date-added = {2020-12-24 15:02:18 -0500},
date-modified = {2020-12-24 15:02:18 -0500},
pages = {20--30},
title = {Stanford's graph-based neural dependency parser at the conll 2017 shared task},
year = {2017}}
@inproceedings{he-etal-2018-jointly,
abstract = {Recent BIO-tagging-based neural semantic role labeling models are very high performing, but assume gold predicates as part of the input and cannot incorporate span-level features. We propose an end-to-end approach for jointly predicting all predicates, arguments spans, and the relations between them. The model makes independent decisions about what relationship, if any, holds between every possible word-span pair, and learns contextualized span representations that provide rich, shared input features for each decision. Experiments demonstrate that this approach sets a new state of the art on PropBank SRL without gold predicates.},
address = {Melbourne, Australia},
author = {He, Luheng and Lee, Kenton and Levy, Omer and Zettlemoyer, Luke},
booktitle = {Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)},
date-added = {2020-12-24 14:23:45 -0500},
date-modified = {2020-12-24 14:23:45 -0500},
doi = {10.18653/v1/P18-2058},
month = jul,
pages = {364--369},
publisher = {Association for Computational Linguistics},
title = {Jointly Predicting Predicates and Arguments in Neural Semantic Role Labeling},
url = {https://www.aclweb.org/anthology/P18-2058},
year = {2018},
bdsk-url-1 = {https://www.aclweb.org/anthology/P18-2058},
bdsk-url-2 = {https://doi.org/10.18653/v1/P18-2058}}
@inproceedings{yu-etal-2020-named,
abstract = {Named Entity Recognition (NER) is a fundamental task in Natural Language Processing, concerned with identifying spans of text expressing references to entities. NER research is often focused on flat entities only (flat NER), ignoring the fact that entity references can be nested, as in [Bank of [China]] (Finkel and Manning, 2009). In this paper, we use ideas from graph-based dependency parsing to provide our model a global view on the input via a biaffine model (Dozat and Manning, 2017). The biaffine model scores pairs of start and end tokens in a sentence which we use to explore all spans, so that the model is able to predict named entities accurately. We show that the model works well for both nested and flat NER through evaluation on 8 corpora and achieving SoTA performance on all of them, with accuracy gains of up to 2.2 percentage points.},
address = {Online},
author = {Yu, Juntao and Bohnet, Bernd and Poesio, Massimo},
booktitle = {Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics},
date-added = {2020-12-24 13:35:09 -0500},
date-modified = {2020-12-24 13:35:09 -0500},
doi = {10.18653/v1/2020.acl-main.577},
month = jul,
pages = {6470--6476},
publisher = {Association for Computational Linguistics},
title = {Named Entity Recognition as Dependency Parsing},
url = {https://www.aclweb.org/anthology/2020.acl-main.577},
year = {2020},
bdsk-url-1 = {https://www.aclweb.org/anthology/2020.acl-main.577},
bdsk-url-2 = {https://doi.org/10.18653/v1/2020.acl-main.577}}
@inproceedings{10.1145/1457838.1457895,
abstract = {Many computer applications require the storage of large amounts of information within the computer's memory where it will be readily available for reference and updating. Quite commonly, more storage space is required than is available in the computer's high-speed working memory. It is, therefore, a common practice to equip computers with magnetic tapes, disks, or drums, or a combination of these to provide additional storage. This additional storage is always slower in operation than the computer's working memory and therefore care must be taken when using it to avoid excessive operating time.},
address = {New York, NY, USA},
author = {De La Briandais, Rene},
booktitle = {Papers Presented at the the March 3-5, 1959, Western Joint Computer Conference},
date-added = {2020-12-24 13:07:31 -0500},
date-modified = {2020-12-24 13:07:31 -0500},
doi = {10.1145/1457838.1457895},
isbn = {9781450378659},
location = {San Francisco, California},
numpages = {4},
pages = {295--298},
publisher = {Association for Computing Machinery},
series = {IRE-AIEE-ACM '59 (Western)},
title = {File Searching Using Variable Length Keys},
url = {https://doi.org/10.1145/1457838.1457895},
year = {1959},
bdsk-url-1 = {https://doi.org/10.1145/1457838.1457895}}
@article{lafferty2001conditional,
author = {Lafferty, John and McCallum, Andrew and Pereira, Fernando CN},
date-added = {2020-12-24 11:46:30 -0500},
date-modified = {2020-12-24 12:08:29 -0500},
journal = {Departmental Papers (CIS)},
title = {Conditional random fields: Probabilistic models for segmenting and labeling sequence data},
year = {2001}}
@inproceedings{clark-etal-2019-bam,
abstract = {It can be challenging to train multi-task neural networks that outperform or even match their single-task counterparts. To help address this, we propose using knowledge distillation where single-task models teach a multi-task model. We enhance this training with teacher annealing, a novel method that gradually transitions the model from distillation to supervised learning, helping the multi-task model surpass its single-task teachers. We evaluate our approach by multi-task fine-tuning BERT on the GLUE benchmark. Our method consistently improves over standard single-task and multi-task training.},
address = {Florence, Italy},
author = {Clark, Kevin and Luong, Minh-Thang and Khandelwal, Urvashi and Manning, Christopher D. and Le, Quoc V.},
booktitle = {Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics},
date-added = {2020-12-24 11:26:54 -0500},
date-modified = {2020-12-24 11:26:54 -0500},
doi = {10.18653/v1/P19-1595},
month = jul,
pages = {5931--5937},
publisher = {Association for Computational Linguistics},
title = {{BAM}! Born-Again Multi-Task Networks for Natural Language Understanding},
url = {https://www.aclweb.org/anthology/P19-1595},
year = {2019},
bdsk-url-1 = {https://www.aclweb.org/anthology/P19-1595},
bdsk-url-2 = {https://doi.org/10.18653/v1/P19-1595}}
@inproceedings{kondratyuk-straka-2019-75,
address = {Hong Kong, China},
author = {Kondratyuk, Dan and Straka, Milan},
booktitle = {Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)},
date-added = {2020-12-23 23:51:07 -0500},
date-modified = {2020-12-23 23:51:07 -0500},
pages = {2779--2795},
publisher = {Association for Computational Linguistics},
title = {75 Languages, 1 Model: Parsing Universal Dependencies Universally},
url = {https://www.aclweb.org/anthology/D19-1279},
year = {2019},
bdsk-url-1 = {https://www.aclweb.org/anthology/D19-1279}}
@inproceedings{dozat:17a,
author = {Dozat, Timothy and Manning, Christopher D.},
booktitle = {Proceedings of the 5th International Conference on Learning Representations},
date-added = {2020-12-23 23:46:20 -0500},
date-modified = {2020-12-23 23:46:20 -0500},
series = {ICLR'17},
title = {{Deep Biaffine Attention for Neural Dependency Parsing}},
url = {https://openreview.net/pdf?id=Hk95PK9le},
year = {2017},
bdsk-url-1 = {http://arxiv.org/abs/1611.01734},
bdsk-url-2 = {https://openreview.net/pdf?id=Hk95PK9le}}
@inproceedings{smith-smith-2007-probabilistic,
address = {Prague, Czech Republic},
author = {Smith, David A. and Smith, Noah A.},
booktitle = {Proceedings of the 2007 Joint Conference on Empirical Methods in Natural Language Processing and Computational Natural Language Learning ({EMNLP}-{C}o{NLL})},
date-added = {2020-12-23 21:46:06 -0500},
date-modified = {2020-12-23 21:46:06 -0500},
month = jun,
pages = {132--140},
publisher = {Association for Computational Linguistics},
title = {Probabilistic Models of Nonprojective Dependency Trees},
url = {https://www.aclweb.org/anthology/D07-1014},
year = {2007},
bdsk-url-1 = {https://www.aclweb.org/anthology/D07-1014}}
@inproceedings{ijcai2020-560,
author = {Zhang, Yu and Zhou, Houquan and Li, Zhenghua},
booktitle = {Proceedings of the Twenty-Ninth International Joint Conference on Artificial Intelligence, {IJCAI-20}},
date-added = {2020-12-23 21:36:56 -0500},
date-modified = {2020-12-23 21:36:56 -0500},
doi = {10.24963/ijcai.2020/560},
editor = {Christian Bessiere},
month = jul,
note = {Main track},
pages = {4046--4053},
publisher = {International Joint Conferences on Artificial Intelligence Organization},
title = {Fast and Accurate Neural CRF Constituency Parsing},
url = {https://doi.org/10.24963/ijcai.2020/560},
year = {2020},
bdsk-url-1 = {https://doi.org/10.24963/ijcai.2020/560}}
@inproceedings{buchholz-marsi-2006-conll,
address = {New York City},
author = {Buchholz, Sabine and Marsi, Erwin},
booktitle = {Proceedings of the Tenth Conference on Computational Natural Language Learning ({C}o{NLL}-X)},
date-added = {2020-12-22 22:57:41 -0500},
date-modified = {2020-12-22 22:57:41 -0500},
month = jun,
pages = {149--164},
publisher = {Association for Computational Linguistics},
title = {{C}o{NLL}-{X} Shared Task on Multilingual Dependency Parsing},
url = {https://www.aclweb.org/anthology/W06-2920},
year = {2006},
bdsk-url-1 = {https://www.aclweb.org/anthology/W06-2920}}
================================================
FILE: docs/references.rst
================================================
References
==================
.. bibliography:: references.bib
:cited:
:style: astrostyle
================================================
FILE: docs/tutorial.md
================================================
---
jupytext:
formats: ipynb,md:myst
text_representation:
extension: .md
format_name: myst
format_version: '0.8'
jupytext_version: 1.4.2
kernelspec:
display_name: Python 3
language: python
name: python3
---
# Tutorial
Natural Language Processing is an exciting field consisting of many closely related tasks like lexical analysis
and parsing. Each task involves many datasets and models, all requiring a high degree of expertise.
Things become even more complex when dealing with multilingual text, as there are simply no datasets for some
low-resource languages. However, with HanLP 2.1, core NLP tasks have been made easy to access and efficient in
production environments. In this tutorial, we'll walk through the APIs in HanLP step by step.
HanLP offers an out-of-the-box RESTful API and a native Python API which share very similar interfaces
although they are designed for different scenarios.
```{code-cell} ipython3
:tags: [remove_cell]
import hanlp_common.constant
hanlp_common.constant.IPYTHON = False # Avoid pretty_print prints html which doesn't play well with this theme
```
## RESTful API
The RESTful API is an endpoint to which you send your documents and from which you get the parsed annotations back.
We are hosting a **non-commercial** API service and you are welcome to [apply for an auth key](https://bbs.hankcs.com/t/apply-for-free-hanlp-restful-apis/3178).
An auth key is a password which gives you access to our API and protects our server from being abused.
Once you have obtained such an auth key, you can parse your documents with our RESTful client, which can be installed via:
````{margin} **Non-Commercial**
```{seealso}
Our models and RESTful APIs are under the [CC BY-NC-SA 4.0](https://creativecommons.org/licenses/by-nc-sa/4.0/) licence.
```
````
````{margin} **Zero-Shot Learning**
```{note}
Although UD covers 104 languages, OntoNotes (NER, CON, SRL) covers only English, Chinese and Arabic.
So NER/CON/SRL for languages other than these three are considered zero-shot, and their accuracies can be very low.
```
````
```bash
pip install hanlp_restful
```
```{eval-rst}
Then initiate a :class:`~hanlp_restful.HanLPClient` with your auth key and send a document to have it parsed.
```
```{code-cell} ipython3
:tags: [output_scroll]
from hanlp_restful import HanLPClient
# Fill in your auth, set language='zh' to use Chinese models
HanLP = HanLPClient('https://hanlp.hankcs.com/api', auth=None, language='mul')
doc = HanLP('In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environments. ' \
'2021年、HanLPv2.1は次世代の最先端多言語NLP技術を本番環境に導入します。' \
'2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。')
print(doc)
```
````{margin} **But what do these annotations mean?**
```{seealso}
See our [data format](data_format) and [annotations](annotations/index) for details.
```
````
## Visualization
```{eval-rst}
The returned :class:`~hanlp_common.document.Document` has a handy method :meth:`~hanlp_common.document.Document.pretty_print`
which offers visualization in any mono-width text environment.
```
````{margin} **Non-ASCII**
```{note}
Non-ASCII text might be skewed in terminals but in Jupyter Notebook it will align correctly.
You can also use our [live demo](https://hanlp.hankcs.com/).
```
````
````{margin} **Non-Projective**
```{note}
Non-projective dependency trees cannot be visualized and won't be printed out at this moment.
```
````
```{code-cell} ipython3
doc.pretty_print()
```
## Native API
### Multi-Task Learning
If you want to run our models locally or you want to implement your own RESTful server,
you can [install the native API](https://hanlp.hankcs.com/docs/install.html#install-native-package)
and call it just like the RESTful one.
````{margin} **Sentences Required**
```{seealso}
As MTL doesn't predict sentence boundaries, inputs have to be split beforehand.
See our [data format](data_format) for details.
```
````
```{code-cell} ipython3
:tags: [output_scroll]
import hanlp
HanLP = hanlp.load(hanlp.pretrained.mtl.UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_XLMR_BASE)
print(HanLP(['In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environments.',
'2021年、HanLPv2.1は次世代の最先端多言語NLP技術を本番環境に導入します。',
'2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。']))
```
Because the service provider is very likely running a different model or different settings, the
RESTful and native results might differ slightly.
To process Chinese or Japanese, HanLP provides mono-lingual models for each language which significantly outperform the multilingual model. See the [docs](https://hanlp.hankcs.com/docs/api/hanlp/pretrained/mtl.html) for the list of models.
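For instance, a Chinese mono-lingual MTL model can be loaded in exactly the same way; the identifier below is one of the Chinese models listed in those docs at the time of writing.
```{code-cell} ipython3
:tags: [output_scroll]
HanLP_zh = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH)
HanLP_zh(['2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。'])
```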
### Single-Task Learning
HanLP also provides a full spectrum of single-task learning models for core NLP tasks including tagging and parsing. Please refer to the documentation of [`pretrained`](https://hanlp.hankcs.com/docs/api/hanlp/pretrained/index.html) models for details.
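For example, a single-task Chinese tokenizer can be loaded and called on its own. The identifier below is one of the pretrained tokenizers at the time of writing; any identifier from the `pretrained` docs works the same way.
```{code-cell} ipython3
tok = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH)
tok('商品和服务')
```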
================================================
FILE: hanlp/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-06-13 18:05
import hanlp.common
import hanlp.components
import hanlp.pretrained
import hanlp.utils
from hanlp.version import __version__
hanlp.utils.ls_resource_in_module(hanlp.pretrained)
def load(save_dir: str, verbose=None, **kwargs) -> hanlp.common.component.Component:
"""Load a pretrained component from an identifier.
Args:
save_dir (str): The identifier to the saved component. It could be a remote URL or a local path.
verbose: ``True`` to print loading progress.
**kwargs: Arguments passed to :func:`hanlp.common.torch_component.TorchComponent.load`, e.g.,
``devices`` is a useful argument to specify which GPU devices a PyTorch component will use.
Examples::
import hanlp
# Load component onto the 0-th GPU.
hanlp.load(..., devices=0)
# Load component onto the 0-th and 1-st GPUs using data parallelization.
hanlp.load(..., devices=[0, 1])
.. Note::
A component can have dependencies on other components or resources, which will be recursively loaded. So it's
common to see multiple downloading messages per single load.
Returns:
hanlp.common.component.Component: A pretrained component.
"""
save_dir = hanlp.pretrained.ALL.get(save_dir, save_dir)
from hanlp.utils.component_util import load_from_meta_file
if verbose is None:
from hanlp_common.constant import HANLP_VERBOSE
verbose = HANLP_VERBOSE
return load_from_meta_file(save_dir, 'meta.json', verbose=verbose, **kwargs)
def pipeline(*pipes) -> hanlp.components.pipeline.Pipeline:
"""Creates a pipeline of components. It's made for bundling `KerasComponents`. For `TorchComponent`, use
:class:`~hanlp.components.mtl.multi_task_learning.MultiTaskLearning` instead.
Args:
*pipes: Pre-defined components, if any.
Returns:
hanlp.components.pipeline.Pipeline: A pipeline, which is a list of components in order.
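Examples::
# A minimal sketch, assuming a tokenizer ``tok`` has been loaded via ``hanlp.load``:
pipe = hanlp.pipeline() \
.append(hanlp.utils.rules.split_sentence, output_key='sentences') \
.append(tok, output_key='tok')
pipe('2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。')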
"""
return hanlp.components.pipeline.Pipeline(*pipes)
================================================
FILE: hanlp/callbacks/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-05 02:10
================================================
FILE: hanlp/callbacks/fine_csv_logger.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-05 02:12
import copy
from io import TextIOWrapper
from typing import List
import numpy as np
import tensorflow as tf
class StreamTableFormatter(object):
def __init__(self) -> None:
super().__init__()
self.col_widths = None
def format_row(self, cells) -> List[str]:
if not isinstance(cells, list):
cells = list(cells)
if not self.col_widths:
self.col_widths = [0] * len(cells)
for i, c in enumerate(cells):
self.col_widths[i] = max(self.col_widths[i], len(self.format_cell(c, self.col_widths[i])))
return list(self.format_cell(cell, width) for cell, width in zip(cells, self.col_widths))
def format_cell(self, cell: str, min_width) -> str:
if isinstance(cell, (np.floating, float)):
return '{:>{}.4f}'.format(cell, min_width)
return '{:>{}}'.format(cell, min_width)
class FineCSVLogger(tf.keras.callbacks.History):
def __init__(self, filename, separator=',', append=False):
super().__init__()
self.append = append
self.separator = separator
self.filename = filename
self.out: TextIOWrapper = None
self.keys = []
self.formatter = StreamTableFormatter()
def on_train_begin(self, logs=None):
super().on_train_begin(logs)
self.out = open(self.filename, 'a' if self.append else 'w')
def on_train_end(self, logs=None):
self.out.close()
def on_epoch_end(self, epoch, logs=None):
super().on_epoch_end(epoch, logs)
if not self.keys:
self.keys = sorted(logs.keys())
if getattr(self.model, 'stop_training', None):
# We set NA so that csv parsers do not fail for this last epoch.
logs = dict([(k, logs[k]) if k in logs else (k, 'NA') for k in self.keys])
# feed them twice to decide the actual width
values = self.formatter.format_row([epoch + 1] + [logs.get(k, 'NA') for k in self.keys])
headers = self.formatter.format_row(['epoch'] + self.keys)
# print headers and bars
self.out.write(self.separator.join(headers) + '\n')
# bars for markdown style
bars = [''.join(['-'] * width) for width in self.formatter.col_widths]
self.out.write(self.separator.join(bars) + '\n')
values = self.formatter.format_row([epoch + 1] + [logs.get(k, 'NA') for k in self.keys])
self.out.write(self.separator.join(values) + '\n')
self.out.flush()
================================================
FILE: hanlp/common/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-08-26 14:45
================================================
FILE: hanlp/common/component.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-08-26 14:45
import inspect
from abc import ABC, abstractmethod
from typing import Any
from hanlp_common.configurable import Configurable
class Component(Configurable, ABC):
@abstractmethod
def predict(self, *args, **kwargs):
"""Predict on data. This is the base class for all components, including rule based and statistical ones.
Args:
*args: Any type of data subject to sub-classes
**kwargs: Additional arguments
Returns: Any predicted annotations.
"""
raise NotImplementedError('%s.%s()' % (self.__class__.__name__, inspect.stack()[0][3]))
def __call__(self, *args, **kwargs):
"""
A shortcut for :meth:`~hanlp.common.component.Component.predict`.
Args:
*args: Any type of data subject to sub-classes
**kwargs: Additional arguments
Returns: Any predicted annotations.
"""
return self.predict(*args, **kwargs)
================================================
FILE: hanlp/common/dataset.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-05-09 20:27
import math
import os
import random
import tempfile
import warnings
from abc import ABC, abstractmethod
from copy import copy
from logging import Logger
from typing import Union, List, Callable, Iterable, Dict, Any
import torch
import torch.multiprocessing as mp
from hanlp.common.transform import TransformList, VocabDict, EmbeddingNamedTransform
from hanlp.common.vocab import Vocab
from hanlp.components.parsers.alg import kmeans
from hanlp.utils.io_util import read_cells, get_resource
from hanlp.utils.time_util import CountdownTimer
from hanlp.utils.torch_util import dtype_of
from hanlp_common.configurable import AutoConfigurable
from hanlp_common.constant import IDX, HANLP_VERBOSE
from hanlp_common.util import isdebugging, merge_list_of_dict, k_fold
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader, Sampler
from torch.utils.data.dataset import IterableDataset
class Transformable(ABC):
def __init__(self, transform: Union[Callable, List] = None) -> None:
"""An object which can be transformed with a list of functions. It is the final result of an object being passed
through a list of functions, while these functions are kept in a list.
Args:
transform: A transform function or a list of functions.
"""
super().__init__()
if isinstance(transform, list) and not isinstance(transform, TransformList):
transform = TransformList(*transform)
self.transform: Union[Callable, TransformList] = transform
def append_transform(self, transform: Callable):
"""Append a transform to its list of transforms.
Args:
transform: A new transform to be appended.
Returns:
Itself.
"""
assert transform is not None, 'None transform not allowed'
if not self.transform:
self.transform = TransformList(transform)
elif not isinstance(self.transform, TransformList):
if self.transform != transform:
self.transform = TransformList(self.transform, transform)
else:
if transform not in self.transform:
self.transform.append(transform)
return self
def insert_transform(self, index: int, transform: Callable):
"""Insert a transform to a certain position.
Args:
index: A certain position.
transform: A new transform.
Returns:
Itself.
"""
assert transform is not None, 'None transform not allowed'
if not self.transform:
self.transform = TransformList(transform)
elif not isinstance(self.transform, TransformList):
if self.transform != transform:
self.transform = TransformList(self.transform)
self.transform.insert(index, transform)
else:
if transform not in self.transform:
self.transform.insert(index, transform)
return self
def transform_sample(self, sample: dict, inplace=False) -> dict:
"""Apply transforms to a sample.
Args:
sample: A sample, which is a ``dict`` holding features.
inplace: ``True`` to apply transforms inplace.
.. Attention::
If any transform modifies existing features, it will modify them again and again when ``inplace=True``.
For example, if a transform inserts a ``BOS`` token into a list inplace and it is called twice,
then 2 ``BOS`` tokens will be inserted, which might not be the intended result.
Returns:
Transformed sample.
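Examples::
# A minimal sketch of the pitfall above, assuming a hypothetical transform
# that prepends a '<bos>' token to the 'token' field inplace:
sample = {'token': ['HanLP']}
dataset.transform_sample(sample, inplace=True)
dataset.transform_sample(sample, inplace=True)
# sample['token'] is now ['<bos>', '<bos>', 'HanLP']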
"""
if not inplace:
sample = copy(sample)
if self.transform:
sample = self.transform(sample)
return sample
class TransformableDataset(Transformable, Dataset, ABC):
def __init__(self,
data: Union[str, List],
transform: Union[Callable, List] = None,
cache=None,
generate_idx=None) -> None:
"""A :class:`~torch.utils.data.Dataset` which can be applied with a list of transform functions.
Args:
data: The local or remote path to a dataset, or a list of samples where each sample is a dict.
transform: Predefined transform(s).
cache: ``True`` to enable caching, so that transforms won't be called twice.
generate_idx: Create a :const:`~hanlp_common.constant.IDX` field for each sample to store its order in the dataset. Useful for prediction when
samples are re-ordered by a sampler.
"""
super().__init__(transform)
if generate_idx is None:
generate_idx = isinstance(data, list)
data_ = self.load_data(data, generate_idx)
# assert data_, f'No samples loaded from {data}'
if data_:
assert isinstance(data_[0], dict
), f'TransformableDataset expects each sample to be a dict but got {type(data_[0])} instead.'
self.data = data_
if cache:
self.cache = [None] * len(data_)
else:
self.cache = None
def load_data(self, data, generate_idx=False):
"""A intermediate step between constructor and calling the actual file loading method.
Args:
data: If data is a file, this method calls :meth:`~hanlp.common.dataset.TransformableDataset.load_file`
to load it.
generate_idx: Create a :const:`~hanlp_common.constant.IDX` field for each sample to store its order in the dataset. Useful for prediction when
samples are re-ordered by a sampler.
Returns: Loaded samples.
"""
if self.should_load_file(data):
if isinstance(data, str):
data = get_resource(data)
data = list(self.load_file(data))
if generate_idx:
for i, each in enumerate(data):
each[IDX] = i
# elif isinstance(data, list):
# data = self.load_list(data)
return data
# noinspection PyMethodMayBeStatic
# def load_list(self, data: list) -> List[Dict[str, Any]]:
# return data
def should_load_file(self, data) -> bool:
"""Determines whether data is a filepath.
Args:
data: Data to check.
Returns: ``True`` to indicate it's a filepath.
"""
return isinstance(data, str)
@abstractmethod
def load_file(self, filepath: str):
"""The actual file loading logic.
Args:
filepath: The path to a dataset.
"""
pass
def __getitem__(self, index: Union[int, slice]) -> Union[dict, List[dict]]:
""" Get the index-th sample in this dataset.
Args:
index: Either an integer index or a slice.
Returns: Either a single sample or a list of samples depending on the index type.
"""
# if isinstance(index, (list, tuple)):
# assert len(index) == 1
# index = index[0]
if isinstance(index, slice):
indices = range(*index.indices(len(self)))
return [self[i] for i in indices]
if self.cache:
cache = self.cache[index]
if cache:
return cache
sample = self.data[index]
sample = self.transform_sample(sample)
if self.cache:
self.cache[index] = sample
return sample
def __len__(self) -> int:
return len(self.data)
def __repr__(self) -> str:
return f'{len(self)} samples: {self[0] if len(self) else ""} ...'
def purge_cache(self):
"""Purges all cache. If cache is not enabled, this method enables it.
"""
self.cache = [None] * len(self.data)
def split(self, *ratios):
"""Split dataset into subsets.
Args:
*ratios: The ratios for each subset. They can be any type of numbers which will be normalized. For example,
``8, 1, 1`` are equivalent to ``0.8, 0.1, 0.1``.
Returns:
list[TransformableDataset]: A list of subsets.
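Examples::
# A minimal sketch: split a dataset into train/dev/test by 8:1:1.
trn, dev, tst = dataset.split(8, 1, 1)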
"""
ratios = [x / sum(ratios) for x in ratios]
chunks = []
prev = 0
for r in ratios:
cur = prev + math.ceil(len(self) * r)
chunks.append([prev, cur])
prev = cur
chunks[-1][1] = len(self)
outputs = []
for b, e in chunks:
dataset = copy(self)
dataset.data = dataset.data[b:e]
if dataset.cache:
dataset.cache = dataset.cache[b:e]
outputs.append(dataset)
return outputs
def k_fold(self, k, i):
"""Perform k-fold sampling.
Args:
k (int): Number of folds.
i (int): The i-th fold.
Returns:
TransformableDataset: The i-th fold subset of this dataset.
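Examples::
# A minimal sketch: the first fold of 10-fold cross validation.
trn, tst = dataset.k_fold(10, 0)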
"""
assert 0 <= i <= k, f'Invalid split {i}'
train_indices, test_indices = k_fold(k, len(self), i)
return self.subset(train_indices), self.subset(test_indices)
def subset(self, indices):
"""Create a subset given indices of samples.
Args:
indices: Indices of samples.
Returns:
TransformableDataset: A subset of this dataset.
"""
dataset = copy(self)
dataset.data = [dataset.data[i] for i in indices]
if dataset.cache:
dataset.cache = [dataset.cache[i] for i in indices]
return dataset
def shuffle(self):
"""Shuffle this dataset inplace.
"""
if not self.cache:
random.shuffle(self.data)
else:
z = list(zip(self.data, self.cache))
random.shuffle(z)
self.data, self.cache = zip(*z)
def prune(self, criterion: Callable, logger: Logger = None):
"""Prune (to discard) samples according to a criterion.
Args:
criterion: A function that takes a sample as input and outputs ``True`` if the sample needs to be pruned.
logger: If any, log statistical messages using it.
Returns:
int: Size before pruning.
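Examples::
# A hypothetical criterion: discard samples whose 'token' field exceeds 128 tokens.
dataset.prune(lambda sample: len(sample.get('token', [])) > 128)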
"""
# noinspection PyTypeChecker
size_before = len(self)
good_ones = [i for i, s in enumerate(self) if not criterion(s)]
self.data = [self.data[i] for i in good_ones]
if self.cache:
self.cache = [self.cache[i] for i in good_ones]
if logger:
size_after = len(self)
num_pruned = size_before - size_after
logger.info(f'Pruned [yellow]{num_pruned} ({num_pruned / size_before:.1%})[/yellow] '
f'samples out of {size_before}.')
return size_before
class TransformSequentialDataset(Transformable, IterableDataset, ABC):
pass
class DeviceDataLoader(DataLoader):
def __init__(self, dataset, batch_size=32, shuffle=False, sampler=None,
batch_sampler=None, num_workers=None, collate_fn=None,
pin_memory=False, drop_last=False, timeout=0,
worker_init_fn=None, multiprocessing_context=None,
device=None, **kwargs):
if batch_sampler is not None:
batch_size = 1
if num_workers is None:
if isdebugging():
num_workers = 0
else:
num_workers = 2
# noinspection PyArgumentList
super(DeviceDataLoader, self).__init__(dataset=dataset, batch_size=batch_size, shuffle=shuffle,
sampler=sampler,
batch_sampler=batch_sampler, num_workers=num_workers,
collate_fn=collate_fn,
pin_memory=pin_memory, drop_last=drop_last, timeout=timeout,
worker_init_fn=worker_init_fn,
multiprocessing_context=multiprocessing_context, **kwargs)
self.device = device
def __iter__(self):
for raw_batch in super(DeviceDataLoader, self).__iter__():
if self.device is not None:
for field, data in raw_batch.items():
if isinstance(data, torch.Tensor):
data = data.to(self.device)
raw_batch[field] = data
yield raw_batch
def collate_fn(self, samples):
return merge_list_of_dict(samples)
class PadSequenceDataLoader(DataLoader):
def __init__(self, dataset, batch_size=32, shuffle=False, sampler=None,
batch_sampler=None, num_workers=0, collate_fn=None,
pin_memory=False, drop_last=False, timeout=0,
worker_init_fn=None, multiprocessing_context=None,
pad: dict = None, vocabs: VocabDict = None, device=None, **kwargs):
""" A dataloader commonly used for NLP tasks. It offers the following convenience.
- Bachify each field of samples into a :class:`~torch.Tensor` if the field name satisfies the following criterion.
- Name ends with _id, _ids, _count, _offset, _span, mask
- Name is in `pad` dict.
- Pad each field according to field name, the vocabs and pad dict.
- Move :class:`~torch.Tensor` onto device.
Args:
dataset: A :class:`~torch.utils.data.Dataset` to be batchified.
batch_size: Max size of each batch.
shuffle: ``True`` to shuffle batches.
sampler: A :class:`~torch.utils.data.Sampler` to sample samples from data.
batch_sampler: A :class:`~torch.utils.data.Sampler` to sample batches from all batches.
num_workers: Number of workers for multi-thread loading. Note that multi-thread loading isn't always
faster.
collate_fn: A function to perform batchifying. It must be set to ``None`` in order to make use of the
features this class offers.
pin_memory: If samples are loaded in the Dataset on CPU and need to be pushed to
the GPU, enabling pin_memory can speed up the transfer. It's usually not useful here since most data
fields are not of Tensor type.
drop_last: Drop the last batch since it could be half-empty.
timeout: For multi-worker loading, set a timeout to wait for a worker.
worker_init_fn: Init function for multi-worker.
multiprocessing_context: Context for multiprocessing.
pad: A dict holding field names and their padding values.
vocabs: A dict of vocabs so padding value can be fetched from it.
device: The device tensors will be moved onto.
**kwargs: Other arguments will be passed to :meth:`torch.utils.data.DataLoader.__init__`
"""
if device == -1:
device = None
if collate_fn is None:
collate_fn = self.collate_fn
if num_workers is None:
if isdebugging():
num_workers = 0
else:
num_workers = 2
if batch_sampler is None:
assert batch_size, 'batch_size has to be specified when batch_sampler is None'
else:
batch_size = 1
shuffle = None
drop_last = None
# noinspection PyArgumentList
super(PadSequenceDataLoader, self).__init__(dataset=dataset, batch_size=batch_size, shuffle=shuffle,
sampler=sampler,
batch_sampler=batch_sampler, num_workers=num_workers,
collate_fn=collate_fn,
pin_memory=pin_memory, drop_last=drop_last, timeout=timeout,
worker_init_fn=worker_init_fn,
multiprocessing_context=multiprocessing_context, **kwargs)
self.vocabs = vocabs
if isinstance(dataset, TransformableDataset) and dataset.transform:
transform = dataset.transform
if not isinstance(transform, TransformList):
transform = [transform]
for each in transform:
if isinstance(each, EmbeddingNamedTransform):
if pad is None:
pad = {}
if each.dst not in pad:
pad[each.dst] = 0
self.pad = pad
self.device = device
def __iter__(self):
for raw_batch in super(PadSequenceDataLoader, self).__iter__():
yield self.tensorize(raw_batch, vocabs=self.vocabs, pad_dict=self.pad, device=self.device)
@staticmethod
def tensorize(raw_batch: Dict[str, Any], vocabs: VocabDict, pad_dict: Dict[str, int] = None, device=None):
for field, data in raw_batch.items():
if isinstance(data, torch.Tensor):
continue
vocab_key = field[:-len('_id')] if field.endswith('_id') else None
vocab: Vocab = vocabs.get(vocab_key, None) if vocabs and vocab_key else None
if vocab:
pad = vocab.safe_pad_token_idx
dtype = torch.long
elif pad_dict is not None and pad_dict.get(field, None) is not None:
pad = pad_dict[field]
dtype = dtype_of(pad)
elif field.endswith('_offset') or field.endswith('_id') or field.endswith(
'_count') or field.endswith('_ids') or field.endswith('_score') or field.endswith(
'_length') or field.endswith('_span'):
# guess some common fields to pad
pad = 0
dtype = torch.long
elif field.endswith('_mask'):
pad = False
dtype = torch.bool
else:
# no need to pad
continue
data = PadSequenceDataLoader.pad_data(data, pad, dtype)
raw_batch[field] = data
if device is not None:
for field, data in raw_batch.items():
if isinstance(data, torch.Tensor):
data = data.to(device)
raw_batch[field] = data
return raw_batch
@staticmethod
def pad_data(data: Union[torch.Tensor, Iterable], pad, dtype=None, device=None):
"""Perform the actual padding for a given data.
Args:
data: Data to be padded.
pad: Padding value.
dtype: Data type.
device: Device to be moved onto.
Returns:
torch.Tensor: A ``torch.Tensor``.
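Examples::
# A minimal sketch: pad two id sequences of different lengths into one tensor.
PadSequenceDataLoader.pad_data([[1, 2, 3], [4, 5]], pad=0, dtype=torch.long)
# tensor([[1, 2, 3],
#         [4, 5, 0]])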
"""
if isinstance(data[0], torch.Tensor):
data = pad_sequence(data, True, pad)
elif isinstance(data[0], Iterable):
inner_is_iterable = False
for each in data:
if len(each):
if isinstance(each[0], Iterable):
inner_is_iterable = True
if len(each[0]):
if not dtype:
dtype = dtype_of(each[0][0])
else:
inner_is_iterable = False
if not dtype:
dtype = dtype_of(each[0])
break
if inner_is_iterable:
max_seq_len = len(max(data, key=len))
max_word_len = len(max([chars for words in data for chars in words], key=len))
ids = torch.zeros(len(data), max_seq_len, max_word_len, dtype=dtype, device=device)
for i, words in enumerate(data):
for j, chars in enumerate(words):
ids[i][j][:len(chars)] = torch.tensor(chars, dtype=dtype, device=device)
data = ids
else:
data = pad_sequence([torch.tensor(x, dtype=dtype, device=device) for x in data], True, pad)
elif isinstance(data, list):
data = torch.tensor(data, dtype=dtype, device=device)
return data
def collate_fn(self, samples):
return merge_list_of_dict(samples)
class CachedDataLoader(object):
def __init__(self, dataloader: torch.utils.data.DataLoader, filename=None):
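"""Cache a dataloader's batches into a binary file via :func:`torch.save`, so that subsequent iterations
replay the cached batches without re-running transforms or batchification.
Args:
dataloader: The dataloader to be cached.
filename: Path to the cache file. A temporary file is created when omitted.
"""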
if not filename:
filename = tempfile.NamedTemporaryFile(prefix='hanlp-cache-', delete=False).name
self.filename = filename
self.size = len(dataloader)
self._build_cache(dataloader)
def _build_cache(self, dataset, verbose=HANLP_VERBOSE):
timer = CountdownTimer(self.size)
with open(self.filename, "wb") as f:
for i, batch in enumerate(dataset):
torch.save(batch, f, _use_new_zipfile_serialization=False)
if verbose:
timer.log(f'Caching {self.filename} [blink][yellow]...[/yellow][/blink]')
def close(self):
if os.path.isfile(self.filename):
os.remove(self.filename)
def __iter__(self):
with open(self.filename, "rb") as f:
for i in range(self.size):
batch = torch.load(f)
yield batch
def __len__(self):
return self.size
def _prefetch_generator(dataloader, queue, batchify=None):
while True:
for batch in dataloader:
if batchify:
batch = batchify(batch)
queue.put(batch)
class PrefetchDataLoader(DataLoader):
def __init__(self, dataloader: torch.utils.data.DataLoader, prefetch: int = 10, batchify: Callable = None) -> None:
""" A dataloader wrapper which speeds up bachifying using multi-processing. It works best for dataloaders
of which the bachify takes very long time. But it introduces extra GPU memory consumption since prefetched
batches are stored in a ``Queue`` on GPU.
.. Caution::
PrefetchDataLoader only works in spawn mode with the following initialization code:
Examples::
if __name__ == '__main__':
import torch
torch.multiprocessing.set_start_method('spawn')
And these 2 lines **MUST** be put into ``if __name__ == '__main__':`` block.
Args:
dataloader: A :class:`~torch.utils.data.DataLoader` to be prefetched.
prefetch: Number of batches to prefetch.
batchify: A batchify function called on each batch of samples, in which case the inner dataloader shall
return samples without really batchifying them.
"""
super().__init__(dataset=dataloader)
self._batchify = batchify
self.prefetch = None if isdebugging() else prefetch
if self.prefetch:
self._fire_process(dataloader, prefetch)
def _fire_process(self, dataloader, prefetch):
self.queue = mp.Queue(prefetch)
self.process = mp.Process(target=_prefetch_generator, args=(dataloader, self.queue, self._batchify))
self.process.start()
def __iter__(self):
if not self.prefetch:
for batch in self.dataset:
if self._batchify:
batch = self._batchify(batch)
yield batch
else:
size = len(self)
while size:
batch = self.queue.get()
yield batch
size -= 1
def close(self):
"""Close this dataloader and terminates internal processes and queue. It's recommended to call this method to
ensure a program can gracefully shutdown.
"""
if self.prefetch:
self.queue.close()
self.process.terminate()
@property
def batchify(self):
return self._batchify
@batchify.setter
def batchify(self, batchify):
self._batchify = batchify
if self.prefetch:
prefetch = vars(self.queue).get('maxsize', 10)
self.close()
self._fire_process(self.dataset, prefetch)
class BucketSampler(Sampler):
# noinspection PyMissingConstructor
def __init__(self, buckets: Dict[float, List[int]], batch_max_tokens, batch_size=None, shuffle=False):
"""A bucketing based sampler which groups samples into buckets then creates batches from each bucket.
Args:
buckets: A dict of which keys are some statistical numbers of each bucket, and values are the indices of
samples in each bucket.
batch_max_tokens: Maximum tokens per batch.
batch_size: Maximum samples per batch.
shuffle: ``True`` to shuffle batches and samples in a batch.
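Examples::
# A minimal sketch: two buckets keyed by their average token length.
sampler = BucketSampler({10.0: [0, 1, 2], 20.0: [3, 4]}, batch_max_tokens=30)
list(sampler)  # [[0, 1, 2], [3, 4]] when shuffle=False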
"""
self.shuffle = shuffle
self.sizes, self.buckets = zip(*[
(size, bucket) for size, bucket in buckets.items()
])
# the number of chunks in each bucket, which is clipped by
# range [1, len(bucket)]
if batch_size:
self.chunks = [
max(batch_size, min(len(bucket), max(round(size * len(bucket) / batch_max_tokens), 1)))
for size, bucket in zip(self.sizes, self.buckets)
]
else:
self.chunks = [
min(len(bucket), max(round(size * len(bucket) / batch_max_tokens), 1))
for size, bucket in zip(self.sizes, self.buckets)
]
def __iter__(self):
# if shuffle, shuffle both the buckets and samples in each bucket
range_fn = torch.randperm if self.shuffle else torch.arange
for i in range_fn(len(self.buckets)).tolist():
split_sizes = [(len(self.buckets[i]) - j - 1) // self.chunks[i] + 1 for j in range(self.chunks[i])]
# DON'T use `torch.chunk` which may return wrong number of chunks
for batch in range_fn(len(self.buckets[i])).split(split_sizes):
yield [self.buckets[i][j] for j in batch.tolist()]
def __len__(self):
return sum(self.chunks)
class KMeansSampler(BucketSampler):
def __init__(self, lengths, batch_max_tokens, batch_size=None, shuffle=False, n_buckets=1):
"""A bucket sampler which groups samples using KMeans on their lengths.
Args:
lengths: Lengths of each sample, usually measured by number of tokens.
batch_max_tokens: Maximum tokens per batch.
batch_size: Maximum samples per batch.
shuffle: ``True`` to shuffle batches. Samples within the same batch won't be shuffled since keeping them
ordered helps speed up RNNs.
n_buckets: Number of buckets, i.e., the number of clusters in terms of KMeans.
"""
if n_buckets > len(lengths):
n_buckets = 1
self.n_buckets = n_buckets
self.lengths = lengths
buckets = dict(zip(*kmeans(self.lengths, n_buckets)))
super().__init__(buckets, batch_max_tokens, batch_size, shuffle)
class SortingSampler(Sampler):
# noinspection PyMissingConstructor
def __init__(self, lengths: List[int], batch_size=None, batch_max_tokens=None, use_effective_tokens=False,
shuffle=False) -> None:
"""A sampler which sorts samples according to their lengths. It takes a continuous chunk of sorted samples to
make a batch. The effective batch size is determined by ``batch_size``, ``batch_max_tokens`` and
``use_effective_tokens``.
Args:
lengths: Lengths of each sample, usually measured by number of tokens.
batch_max_tokens: Maximum tokens per batch.
use_effective_tokens: Whether to calculate the effective number of tokens after padding when applying the
``batch_max_tokens``.
batch_size: Maximum samples per batch.
shuffle: ``True`` to shuffle batches and samples in a batch.
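Examples::
# A minimal sketch: lengths are visited in descending order, so index 0 (length 3)
# and index 2 (length 2) fill a 5-token batch, leaving index 1 for the next one.
sampler = SortingSampler([3, 1, 2], batch_max_tokens=5)
list(sampler)  # [[0, 2], [1]] when shuffle=False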
"""
# assert any([batch_size, batch_max_tokens]), 'At least one of batch_size and batch_max_tokens is required'
self.shuffle = shuffle
self.batch_size = batch_size
# self.batch_max_tokens = batch_max_tokens
self.batch_indices = []
num_tokens = 0
mini_batch = []
for i in torch.argsort(torch.tensor(lengths), descending=True).tolist():
# if batch_max_tokens:
effective_tokens = lengths[i] if (not mini_batch or not use_effective_tokens) else lengths[mini_batch[0]]
if (batch_max_tokens is None or num_tokens + effective_tokens <= batch_max_tokens) and (
batch_size is None or len(mini_batch) < batch_size):
mini_batch.append(i)
num_tokens += effective_tokens
else:
if not mini_batch: # this sequence is longer than batch_max_tokens
mini_batch.append(i)
self.batch_indices.append(mini_batch)
mini_batch = []
num_tokens = 0
else:
self.batch_indices.append(mini_batch)
mini_batch = [i]
num_tokens = effective_tokens
if mini_batch:
self.batch_indices.append(mini_batch)
# print(len(max(self.batch_indices, key=len)))
def __iter__(self):
if self.shuffle:
random.shuffle(self.batch_indices)
for batch in self.batch_indices:
yield batch
def __len__(self) -> int:
return len(self.batch_indices)
class SamplerBuilder(AutoConfigurable, ABC):
@abstractmethod
def build(self, lengths: List[int], shuffle=False, gradient_accumulation=1, **kwargs) -> Sampler:
"""Build a ``Sampler`` given statistics of samples and other arguments.
Args:
lengths: The lengths of samples.
shuffle: ``True`` to shuffle batches. Note samples in each mini-batch are not necessarily shuffled.
gradient_accumulation: Number of mini-batches per update step.
**kwargs: Other arguments to be passed to the constructor of the sampler.
"""
pass
def __call__(self, lengths: List[int], shuffle=False, **kwargs) -> Sampler:
return self.build(lengths, shuffle, **kwargs)
def scale(self, gradient_accumulation):
r"""Scale down the ``batch_size`` and ``batch_max_tokens`` to :math:`\frac{1}{\text{gradient_accumulation}}`
of them respectively.
Args:
gradient_accumulation: Number of mini-batches per update step.
Returns:
tuple(int,int): batch_size, batch_max_tokens
"""
batch_size = self.batch_size
batch_max_tokens = self.batch_max_tokens
if gradient_accumulation:
if batch_size:
batch_size //= gradient_accumulation
if batch_max_tokens:
batch_max_tokens //= gradient_accumulation
return batch_size, batch_max_tokens
class SortingSamplerBuilder(SortingSampler, SamplerBuilder):
# noinspection PyMissingConstructor
def __init__(self, batch_size=None, batch_max_tokens=None, use_effective_tokens=False) -> None:
"""Builds a :class:`~hanlp.common.dataset.SortingSampler`.
Args:
batch_max_tokens: Maximum tokens per batch.
use_effective_tokens: Whether to calculate effective number of tokens when applying the `batch_max_tokens`.
batch_size: Maximum samples per batch.
"""
self.use_effective_tokens = use_effective_tokens
self.batch_max_tokens = batch_max_tokens
self.batch_size = batch_size
def build(self, lengths: List[int], shuffle=False, gradient_accumulation=1, **kwargs) -> Sampler:
batch_size, batch_max_tokens = self.scale(gradient_accumulation)
return SortingSampler(lengths, batch_size, batch_max_tokens, self.use_effective_tokens, shuffle)
def __len__(self) -> int:
return 1
class KMeansSamplerBuilder(KMeansSampler, SamplerBuilder):
# noinspection PyMissingConstructor
def __init__(self, batch_max_tokens, batch_size=None, n_buckets=1):
"""Builds a :class:`~hanlp.common.dataset.KMeansSampler`.
Args:
batch_max_tokens: Maximum tokens per batch.
batch_size: Maximum samples per batch.
n_buckets: Number of buckets, i.e., the number of clusters in terms of KMeans.
"""
self.n_buckets = n_buckets
self.batch_size = batch_size
self.batch_max_tokens = batch_max_tokens
def build(self, lengths: List[int], shuffle=False, gradient_accumulation=1, **kwargs) -> Sampler:
batch_size, batch_max_tokens = self.scale(gradient_accumulation)
return KMeansSampler(lengths, batch_max_tokens, batch_size, shuffle, self.n_buckets)
def __len__(self) -> int:
return 1
class TableDataset(TransformableDataset):
def __init__(self,
data: Union[str, List],
transform: Union[Callable, List] = None,
cache=None,
delimiter='auto',
strip=True,
headers=None) -> None:
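"""A dataset for tabular text files such as ``.tsv`` and ``.csv``, where each non-header line is zipped
with the headers into a dict sample.
Args:
data: The local or remote path to a dataset, or a list of samples where each sample is a dict.
transform: Predefined transform(s).
cache: ``True`` to enable caching, so that transforms won't be called twice.
delimiter: The delimiter between cells, or ``'auto'`` to let the loader decide.
strip: ``True`` to strip whitespace off each cell.
headers: A list of headers; when ``None``, the first line is regarded as headers.
"""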
self.headers = headers
self.strip = strip
self.delimiter = delimiter
super().__init__(data, transform, cache)
def load_file(self, filepath: str):
for idx, cells in enumerate(read_cells(filepath, strip=self.strip, delimiter=self.delimiter)):
if not idx and not self.headers:
self.headers = cells
if any(len(h) > 32 for h in self.headers):
warnings.warn('As you did not pass in `headers` to `TableDataset`, the first line is regarded as '
'headers. However, some headers are longer than 32 characters, which might be '
'wrong. To make sure, pass `headers=...` explicitly.')
else:
yield dict(zip(self.headers, cells))
================================================
FILE: hanlp/common/keras_component.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-08-26 14:45
import logging
import math
import os
import sys
from abc import ABC, abstractmethod
from typing import Optional, List, Any, Dict
import numpy as np
import tensorflow as tf
import hanlp.utils
from hanlp_common.io import save_json, load_json
from hanlp.callbacks.fine_csv_logger import FineCSVLogger
from hanlp.common.component import Component
from hanlp.common.transform_tf import Transform
from hanlp.common.vocab_tf import VocabTF
from hanlp.metrics.chunking.iobes_tf import IOBES_F1_TF
from hanlp.optimizers.adamw import AdamWeightDecay
from hanlp.utils import io_util
from hanlp.utils.io_util import get_resource, tempdir_human
from hanlp.utils.log_util import init_logger, logger
from hanlp.utils.string_util import format_scores
from hanlp.utils.tf_util import format_metrics, size_of_dataset, summary_of_model, get_callback_by_class, NumpyEncoder
from hanlp.utils.time_util import Timer, now_datetime
from hanlp_common.reflection import str_to_type, classpath_of
from hanlp_common.structure import SerializableDict
from hanlp_common.util import merge_dict
class KerasComponent(Component, ABC):
def __init__(self, transform: Transform) -> None:
super().__init__()
self.meta = {
'class_path': classpath_of(self),
'hanlp_version': hanlp.version.__version__,
}
self.model: Optional[tf.keras.Model] = None
self.config = SerializableDict()
self.transform = transform
# share config with transform for convenience, so we don't need to pass args around
if self.transform.config:
for k, v in self.transform.config.items():
self.config[k] = v
self.transform.config = self.config
def evaluate(self, input_path: str, save_dir=None, output=False, batch_size=128, logger: logging.Logger = None,
callbacks: List[tf.keras.callbacks.Callback] = None, warm_up=True, verbose=True, **kwargs):
input_path = get_resource(input_path)
file_prefix, ext = os.path.splitext(input_path)
name = os.path.basename(file_prefix)
if not name:
name = 'evaluate'
if save_dir and not logger:
logger = init_logger(name=name, root_dir=save_dir, level=logging.INFO if verbose else logging.WARN,
mode='w')
tst_data = self.transform.file_to_dataset(input_path, batch_size=batch_size)
samples = self.num_samples_in(tst_data)
num_batches = math.ceil(samples / batch_size)
if warm_up:
for x, y in tst_data:
self.model.predict_on_batch(x)
break
if output:
assert save_dir, 'Must pass save_dir in order to output'
if isinstance(output, bool):
output = os.path.join(save_dir, name) + '.predict' + ext
elif not isinstance(output, str):
raise RuntimeError('output ({}) must be of type bool or str'.format(repr(output)))
timer = Timer()
eval_outputs = self.evaluate_dataset(tst_data, callbacks, output, num_batches, **kwargs)
loss, score, output = eval_outputs[0], eval_outputs[1], eval_outputs[2]
delta_time = timer.stop()
speed = samples / delta_time.delta_seconds
if logger:
f1: IOBES_F1_TF = None
for metric in self.model.metrics:
if isinstance(metric, IOBES_F1_TF):
f1 = metric
break
extra_report = ''
if f1:
overall, by_type, extra_report = f1.state.result(full=True, verbose=False)
extra_report = ' \n' + extra_report
logger.info('Evaluation results for {} - '
'loss: {:.4f} - {} - speed: {:.2f} sample/sec{}'
.format(name + ext, loss,
format_scores(score) if isinstance(score, dict) else format_metrics(self.model.metrics),
speed, extra_report))
if output:
logger.info('Saving output to {}'.format(output))
with open(output, 'w', encoding='utf-8') as out:
self.evaluate_output(tst_data, out, num_batches, self.model.metrics)
return loss, score, speed
def num_samples_in(self, dataset):
return size_of_dataset(dataset)
def evaluate_dataset(self, tst_data, callbacks, output, num_batches, **kwargs):
loss, score = self.model.evaluate(tst_data, callbacks=callbacks, steps=num_batches)
return loss, score, output
def evaluate_output(self, tst_data, out, num_batches, metrics: List[tf.keras.metrics.Metric]):
# out.write('x\ty_true\ty_pred\n')
for metric in metrics:
metric.reset_states()
for idx, batch in enumerate(tst_data):
outputs = self.model.predict_on_batch(batch[0])
for metric in metrics:
metric(batch[1], outputs, outputs._keras_mask if hasattr(outputs, '_keras_mask') else None)
self.evaluate_output_to_file(batch, outputs, out)
print('\r{}/{} {}'.format(idx + 1, num_batches, format_metrics(metrics)), end='')
print()
def evaluate_output_to_file(self, batch, outputs, out):
for x, y_gold, y_pred in zip(self.transform.X_to_inputs(batch[0]),
self.transform.Y_to_outputs(batch[1], gold=True),
self.transform.Y_to_outputs(outputs, gold=False)):
out.write(self.transform.input_truth_output_to_str(x, y_gold, y_pred))
def _capture_config(self, config: Dict,
exclude=(
'trn_data', 'dev_data', 'save_dir', 'kwargs', 'self', 'logger', 'verbose',
'dev_batch_size', '__class__')):
"""
Save arguments to config
Parameters
----------
config
`locals()`
exclude
"""
if 'kwargs' in config:
config.update(config['kwargs'])
config = dict(
(key, tf.keras.utils.serialize_keras_object(value)) if hasattr(value, 'get_config') else (key, value) for
key, value in config.items())
for key in exclude:
config.pop(key, None)
self.config.update(config)
def save_meta(self, save_dir, filename='meta.json', **kwargs):
self.meta['create_time'] = now_datetime()
self.meta.update(kwargs)
save_json(self.meta, os.path.join(save_dir, filename))
def load_meta(self, save_dir, filename='meta.json'):
save_dir = get_resource(save_dir)
metapath = os.path.join(save_dir, filename)
if os.path.isfile(metapath):
self.meta.update(load_json(metapath))
def save_config(self, save_dir, filename='config.json'):
self.config.save_json(os.path.join(save_dir, filename))
def load_config(self, save_dir, filename='config.json'):
save_dir = get_resource(save_dir)
self.config.load_json(os.path.join(save_dir, filename))
def save_weights(self, save_dir, filename='model.h5'):
self.model.save_weights(os.path.join(save_dir, filename))
def load_weights(self, save_dir, filename='model.h5', **kwargs):
assert self.model.built or self.model.weights, 'You must build the model in build_model() ' \
'in order to load its weights'
save_dir = get_resource(save_dir)
self.model.load_weights(os.path.join(save_dir, filename))
def save_vocabs(self, save_dir, filename='vocabs.json'):
vocabs = SerializableDict()
for key, value in vars(self.transform).items():
if isinstance(value, VocabTF):
vocabs[key] = value.to_dict()
vocabs.save_json(os.path.join(save_dir, filename))
def load_vocabs(self, save_dir, filename='vocabs.json'):
save_dir = get_resource(save_dir)
vocabs = SerializableDict()
vocabs.load_json(os.path.join(save_dir, filename))
for key, value in vocabs.items():
vocab = VocabTF()
vocab.copy_from(value)
setattr(self.transform, key, vocab)
def load_transform(self, save_dir) -> Transform:
"""
Try to load the transform only. This method might fail since it avoids building the model.
If it does fail, you have to use `load`, which might be too heavy, but that's the best we can do.
:param save_dir: The path to load from.
"""
save_dir = get_resource(save_dir)
self.load_config(save_dir)
self.load_vocabs(save_dir)
self.transform.build_config()
self.transform.lock_vocabs()
return self.transform
def save(self, save_dir: str, **kwargs):
self.save_config(save_dir)
self.save_vocabs(save_dir)
self.save_weights(save_dir)
def load(self, save_dir: str, logger=hanlp.utils.log_util.logger, **kwargs):
self.meta['load_path'] = save_dir
save_dir = get_resource(save_dir)
self.load_config(save_dir)
self.load_vocabs(save_dir)
self.build(**merge_dict(self.config, training=False, logger=logger, **kwargs, overwrite=True, inplace=True))
self.load_weights(save_dir, **kwargs)
self.load_meta(save_dir)
@property
def input_shape(self) -> List:
return self.transform.output_shapes[0]
def build(self, logger, **kwargs):
self.transform.build_config()
self.model = self.build_model(**merge_dict(self.config, training=kwargs.get('training', None),
loss=kwargs.get('loss', None)))
self.transform.lock_vocabs()
optimizer = self.build_optimizer(**self.config)
loss = self.build_loss(
**self.config if 'loss' in self.config else dict(list(self.config.items()) + [('loss', None)]))
# allow for different metrics passed in through kwargs
metrics = self.build_metrics(**merge_dict(self.config, metrics=kwargs.get('metrics', 'accuracy'),
logger=logger, overwrite=True))
if not isinstance(metrics, list):
if isinstance(metrics, tf.keras.metrics.Metric):
metrics = [metrics]
if not self.model.built:
sample_inputs = self.sample_data
if sample_inputs is not None:
self.model(sample_inputs)
else:
if len(self.transform.output_shapes[0]) == 1 and self.transform.output_shapes[0][0] is None:
x_shape = self.transform.output_shapes[0]
else:
x_shape = list(self.transform.output_shapes[0])
for i, shape in enumerate(x_shape):
x_shape[i] = [None] + shape # batch + X.shape
self.model.build(input_shape=x_shape)
self.compile_model(optimizer, loss, metrics)
return self.model, optimizer, loss, metrics
def compile_model(self, optimizer, loss, metrics):
try:
self.model.compile(optimizer=optimizer, loss=loss, metrics=metrics, run_eagerly=self.config.run_eagerly)
except ValueError:
from keras.saving.object_registration import CustomObjectScope
with CustomObjectScope({'adamweightdecay': AdamWeightDecay}):
self.model.compile(optimizer=optimizer, loss=loss, metrics=metrics, run_eagerly=self.config.run_eagerly)
def build_optimizer(self, optimizer, **kwargs) -> tf.keras.optimizers.Optimizer:
if isinstance(optimizer, (str, dict)):
custom_objects = {'AdamWeightDecay': AdamWeightDecay}
try:
optimizer = tf.keras.utils.deserialize_keras_object(optimizer, module_objects=vars(tf.keras.optimizers),
custom_objects=custom_objects)
except ValueError:
optimizer['config'].pop('decay', None)
optimizer = tf.keras.utils.deserialize_keras_object(optimizer, module_objects=vars(tf.keras.optimizers),
custom_objects=custom_objects)
self.config.optimizer = tf.keras.utils.serialize_keras_object(optimizer)
return optimizer
def build_loss(self, loss, **kwargs):
if not loss:
loss = tf.keras.losses.SparseCategoricalCrossentropy(
reduction=tf.keras.losses.Reduction.SUM_OVER_BATCH_SIZE,
from_logits=True)
elif isinstance(loss, (str, dict)):
loss = tf.keras.utils.deserialize_keras_object(loss, module_objects=vars(tf.keras.losses))
if isinstance(loss, tf.keras.losses.Loss):
self.config.loss = tf.keras.utils.serialize_keras_object(loss)
return loss
def build_transform(self, **kwargs):
return self.transform
def build_vocab(self, trn_data, logger):
train_examples = self.transform.fit(trn_data, **self.config)
self.transform.summarize_vocabs(logger)
return train_examples
def build_metrics(self, metrics, logger: logging.Logger, **kwargs):
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
return [metric]
@abstractmethod
def build_model(self, **kwargs) -> tf.keras.Model:
pass
def fit(self, trn_data, dev_data, save_dir, batch_size, epochs, run_eagerly=False, logger=None, verbose=True,
finetune: str = None, **kwargs):
self._capture_config(locals())
if sys.version_info >= (3, 10):
logger.warning(f'Training with TensorFlow {tf.__version__} has not been tested on Python '
f'{sys.version_info.major}.{sys.version_info.minor}. Please downgrade to '
f'Python<=3.9 in case any compatibility issues arise.')
self.transform = self.build_transform(**self.config)
if not save_dir:
save_dir = tempdir_human()
if not logger:
logger = init_logger(name='train', root_dir=save_dir, level=logging.INFO if verbose else logging.WARN)
logger.info('Hyperparameter:\n' + self.config.to_json())
num_examples = self.build_vocab(trn_data, logger)
# assert num_examples, 'You forgot to return the number of training examples in your build_vocab'
logger.info('Building...')
train_steps_per_epoch = math.ceil(num_examples / batch_size) if num_examples else None
self.config.train_steps = train_steps_per_epoch * epochs if num_examples else None
model, optimizer, loss, metrics = self.build(**merge_dict(self.config, logger=logger, training=True))
logger.info('Model built:\n' + summary_of_model(self.model))
if finetune:
finetune = get_resource(finetune)
if os.path.isdir(finetune):
finetune = os.path.join(finetune, 'model.h5')
model.load_weights(finetune, by_name=True, skip_mismatch=True)
logger.info(f'Loaded pretrained weights from {finetune} for finetuning')
self.save_config(save_dir)
self.save_vocabs(save_dir)
self.save_meta(save_dir)
trn_data = self.build_train_dataset(trn_data, batch_size, num_examples)
dev_data = self.build_valid_dataset(dev_data, batch_size)
callbacks = self.build_callbacks(save_dir, **merge_dict(self.config, overwrite=True, logger=logger))
# need to know #batches, otherwise progbar crashes
dev_steps = math.ceil(self.num_samples_in(dev_data) / batch_size)
checkpoint = get_callback_by_class(callbacks, tf.keras.callbacks.ModelCheckpoint)
timer = Timer()
try:
history = self.train_loop(**merge_dict(self.config, trn_data=trn_data, dev_data=dev_data, epochs=epochs,
num_examples=num_examples,
train_steps_per_epoch=train_steps_per_epoch, dev_steps=dev_steps,
callbacks=callbacks, logger=logger, model=model, optimizer=optimizer,
loss=loss,
metrics=metrics, overwrite=True))
except KeyboardInterrupt:
print()
if not checkpoint or checkpoint.best in (np.inf, -np.inf):
self.save_weights(save_dir)
logger.info('Aborted with model saved')
else:
logger.info(f'Aborted with model saved with best {checkpoint.monitor} = {checkpoint.best:.4f}')
# noinspection PyTypeChecker
history: tf.keras.callbacks.History = get_callback_by_class(callbacks, tf.keras.callbacks.History)
delta_time = timer.stop()
best_epoch_ago = 0
if history and hasattr(history, 'epoch'):
trained_epoch = len(history.epoch)
logger.info('Trained {} epochs in {}, each epoch takes {}'.
format(trained_epoch, delta_time, delta_time / trained_epoch if trained_epoch else delta_time))
save_json(history.history, io_util.path_join(save_dir, 'history.json'), cls=NumpyEncoder)
monitor_history: List = history.history.get(checkpoint.monitor, None)
if monitor_history:
best_epoch_ago = len(monitor_history) - monitor_history.index(checkpoint.best)
if checkpoint and monitor_history and checkpoint.best != monitor_history[-1]:
logger.info(f'Restored the best model saved with best '
f'{checkpoint.monitor} = {checkpoint.best:.4f} '
f'saved {best_epoch_ago} epochs ago')
self.load_weights(save_dir) # restore best model
return history
def train_loop(self, trn_data, dev_data, epochs, num_examples, train_steps_per_epoch, dev_steps, model, optimizer,
loss, metrics, callbacks,
logger, **kwargs):
history = self.model.fit(trn_data, epochs=epochs, steps_per_epoch=train_steps_per_epoch,
validation_data=dev_data,
callbacks=callbacks,
validation_steps=dev_steps,
) # type:tf.keras.callbacks.History
return history
def build_valid_dataset(self, dev_data, batch_size):
dev_data = self.transform.file_to_dataset(dev_data, batch_size=batch_size, shuffle=False)
return dev_data
def build_train_dataset(self, trn_data, batch_size, num_examples):
trn_data = self.transform.file_to_dataset(trn_data, batch_size=batch_size,
shuffle=True,
repeat=-1 if self.config.train_steps else None)
return trn_data
def build_callbacks(self, save_dir, logger, **kwargs):
metrics = kwargs.get('metrics', 'accuracy')
if isinstance(metrics, (list, tuple)):
metrics = metrics[-1]
monitor = f'val_{metrics}'
checkpoint = tf.keras.callbacks.ModelCheckpoint(
os.path.join(save_dir, 'model.h5'),
# verbose=1,
monitor=monitor, save_best_only=True,
mode='max',
save_weights_only=True)
logger.debug(f'Monitor {checkpoint.monitor} for checkpoint')
tensorboard_callback = tf.keras.callbacks.TensorBoard(
log_dir=io_util.makedirs(io_util.path_join(save_dir, 'logs')))
csv_logger = FineCSVLogger(os.path.join(save_dir, 'train.log'), separator=' | ', append=True)
callbacks = [checkpoint, tensorboard_callback, csv_logger]
lr_decay_per_epoch = self.config.get('lr_decay_per_epoch', None)
if lr_decay_per_epoch:
learning_rate = self.model.optimizer.get_config().get('learning_rate', None)
if not learning_rate:
logger.warning('Learning rate decay not supported for optimizer={}'.format(repr(self.model.optimizer)))
else:
logger.debug(f'Created LearningRateScheduler with lr_decay_per_epoch={lr_decay_per_epoch}')
callbacks.append(tf.keras.callbacks.LearningRateScheduler(
lambda epoch: learning_rate / (1 + lr_decay_per_epoch * epoch)))
anneal_factor = self.config.get('anneal_factor', None)
if anneal_factor:
callbacks.append(tf.keras.callbacks.ReduceLROnPlateau(factor=anneal_factor,
patience=self.config.get('anneal_patience', 10)))
early_stopping_patience = self.config.get('early_stopping_patience', None)
if early_stopping_patience:
callbacks.append(tf.keras.callbacks.EarlyStopping(monitor=monitor, mode='max',
verbose=1,
patience=early_stopping_patience))
return callbacks
def on_train_begin(self):
"""
Callback before the training starts
"""
pass
def predict(self, data: Any, batch_size=None, **kwargs):
assert self.model, 'Please call fit or load before predict'
if not data:
return []
data, flat = self.transform.input_to_inputs(data)
if not batch_size:
batch_size = self.config.batch_size
dataset = self.transform.inputs_to_dataset(data, batch_size=batch_size, gold=kwargs.get('gold', False))
results = []
num_samples = 0
data_is_list = isinstance(data, list)
for idx, batch in enumerate(dataset):
samples_in_batch = tf.shape(batch[-1] if isinstance(batch[-1], tf.Tensor) else batch[-1][0])[0]
if data_is_list:
inputs = data[num_samples:num_samples + samples_in_batch]
else:
inputs = None # if data is a generator, it's usually one-time, not able to transform into a list
for output in self.predict_batch(batch, inputs=inputs, **kwargs):
results.append(output)
num_samples += samples_in_batch
self.transform.cleanup()
if flat:
return results[0]
return results
def predict_batch(self, batch, inputs=None, **kwargs):
X = batch[0]
Y = self.model.predict_on_batch(X)
for output in self.transform.Y_to_outputs(Y, X=X, inputs=inputs, batch=batch, **kwargs):
yield output
@property
def sample_data(self):
return None
@staticmethod
def from_meta(meta: dict, **kwargs):
"""
Parameters
----------
meta
kwargs
Returns
-------
KerasComponent
"""
cls = str_to_type(meta['class_path'])
obj: KerasComponent = cls()
assert 'load_path' in meta, f'{meta} doesn\'t contain load_path field'
obj.load(meta['load_path'])
return obj
def export_model_for_serving(self, export_dir=None, version=1, overwrite=False, show_hint=False):
assert self.model, 'You have to fit or load a model before exporting it'
if not export_dir:
assert 'load_path' in self.meta, 'When not specifying export_dir, load_path has to be present'
export_dir = get_resource(self.meta['load_path'])
model_path = os.path.join(export_dir, str(version))
if os.path.isdir(model_path) and not overwrite:
logger.info(f'{model_path} exists, skip since overwrite = {overwrite}')
return export_dir
logger.info(f'Exporting to {export_dir} ...')
tf.saved_model.save(self.model, model_path)
logger.info(f'Successfully exported model to {export_dir}')
if show_hint:
logger.info(f'You can serve it through \n'
f'tensorflow_model_server --model_name={os.path.splitext(os.path.basename(self.meta["load_path"]))[0]} '
f'--model_base_path={export_dir} --rest_api_port=8888')
return export_dir
def serve(self, export_dir=None, grpc_port=8500, rest_api_port=0, overwrite=False, dry_run=False):
export_dir = self.export_model_for_serving(export_dir, show_hint=False, overwrite=overwrite)
if not dry_run:
del self.model # free memory
logger.info('The inputs of the exported model are shown below.')
os.system(f'saved_model_cli show --all --dir {export_dir}/1')
cmd = f'nohup tensorflow_model_server --model_name={os.path.splitext(os.path.basename(self.meta["load_path"]))[0]} ' \
f'--model_base_path={export_dir} --port={grpc_port} --rest_api_port={rest_api_port} ' \
f'>serve.log 2>&1 &'
logger.info(f'Running ...\n{cmd}')
if not dry_run:
os.system(cmd)
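# A hedged usage sketch for exporting and serving; paths and ports are illustrative:
#   component.export_model_for_serving('data/model/export', version=1)
#   component.serve(grpc_port=8500, rest_api_port=8501, dry_run=True)  # dry_run skips launching the server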
================================================
FILE: hanlp/common/structure.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-08-26 14:58
from typing import Dict
from hanlp_common.configurable import Configurable
from hanlp_common.reflection import classpath_of
from hanlp_common.structure import SerializableDict
class ConfigTracker(Configurable):
def __init__(self, locals_: Dict, exclude=('kwargs', 'self', '__class__', 'locals_')) -> None:
"""This base class helps sub-classes to capture their arguments passed to ``__init__``, and also their types so
that they can be deserialized from a config in dict form.
Args:
locals_: Obtained by :meth:`locals`.
exclude: Arguments to be excluded.
Examples:
>>> class MyClass(ConfigTracker):
>>> def __init__(self, i_need_this='yes') -> None:
>>> super().__init__(locals())
>>> obj = MyClass()
>>> print(obj.config)
{'i_need_this': 'yes', 'classpath': 'test_config_tracker.MyClass'}
"""
if 'kwargs' in locals_:
locals_.update(locals_['kwargs'])
self.config = SerializableDict(
(k, v.config if hasattr(v, 'config') else v) for k, v in locals_.items() if k not in exclude)
self.config['classpath'] = classpath_of(self)
class History(object):
def __init__(self):
""" A history of training context. It records how many steps have passed and provides methods to decide whether
an update should be performed, and to calculate the number of training steps given the dataloader size and
``gradient_accumulation``.
"""
self.num_mini_batches = 0
def step(self, gradient_accumulation):
""" Whether the training procedure should perform an update.
Args:
gradient_accumulation: Number of batches per update.
Returns:
bool: ``True`` to update.
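Examples:
A minimal sketch of gradient accumulation bookkeeping (values are illustrative):
>>> history = History()
>>> [history.step(gradient_accumulation=2) for _ in range(4)]
[False, True, False, True]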
"""
self.num_mini_batches += 1
return self.num_mini_batches % gradient_accumulation == 0
def num_training_steps(self, num_batches, gradient_accumulation):
""" Caculate number of training steps.
Args:
num_batches: Size of dataloader.
gradient_accumulation: Number of batches per update.
Returns:
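int: The number of calls to :meth:`step` that would trigger a parameter update.
Examples:
A minimal sketch (values are illustrative):
>>> History().num_training_steps(num_batches=10, gradient_accumulation=3)
3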
"""
return len(
[i for i in range(self.num_mini_batches + 1, self.num_mini_batches + num_batches + 1) if
i % gradient_accumulation == 0])
================================================
FILE: hanlp/common/torch_component.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-05-08 21:20
import logging
import os
import re
import time
from abc import ABC, abstractmethod
from typing import Optional, Dict, List, Union, Callable
import torch
from torch import nn
from torch.utils.data import DataLoader
import hanlp
from hanlp.common.component import Component
from hanlp.common.dataset import TransformableDataset
from hanlp.common.transform import VocabDict
from hanlp.utils.io_util import get_resource, basename_no_ext
from hanlp.utils.log_util import init_logger, flash
from hanlp.utils.torch_util import cuda_devices, set_seed
from hanlp_common.configurable import Configurable
from hanlp_common.constant import IDX, HANLP_VERBOSE
from hanlp_common.reflection import classpath_of
from hanlp_common.structure import SerializableDict
from hanlp_common.util import merge_dict, isdebugging
class TorchComponent(Component, ABC):
def __init__(self, **kwargs) -> None:
"""The base class for all components using PyTorch as backend. It provides common workflows of building vocabs,
datasets, dataloaders and models. These workflows are more of a conventional guideline than enforced
protocols, which means subclasses have the freedom to override or completely skip some steps.
Args:
**kwargs: Additional arguments to be stored in the ``config`` property.
"""
super().__init__()
self.model: Optional[torch.nn.Module] = None
self.config = SerializableDict(**kwargs)
self.vocabs = VocabDict()
def _capture_config(self, locals_: Dict,
exclude=(
'trn_data', 'dev_data', 'save_dir', 'kwargs', 'self', 'logger', 'verbose',
'dev_batch_size', '__class__', 'devices', 'eval_trn')):
"""Save arguments to config
Args:
locals_: Dict:
exclude: (Default value = ('trn_data')
'dev_data':
'save_dir':
'kwargs':
'self':
'logger':
'verbose':
'dev_batch_size':
'__class__':
'devices'):
Returns:
"""
if 'kwargs' in locals_:
locals_.update(locals_['kwargs'])
locals_ = dict((k, v) for k, v in locals_.items() if k not in exclude and not k.startswith('_'))
self.config.update(locals_)
return self.config
def save_weights(self, save_dir, filename='model.pt', trainable_only=True, **kwargs):
"""Save model weights to a directory.
Args:
save_dir: The directory to save weights into.
filename: A file name for weights.
trainable_only: ``True`` to only save trainable weights. Useful when the model contains lots of static
embeddings.
**kwargs: Not used for now.
"""
model = self.model_
state_dict = model.state_dict()
if trainable_only:
trainable_names = set(n for n, p in model.named_parameters() if p.requires_grad)
state_dict = dict((n, p) for n, p in state_dict.items() if n in trainable_names)
torch.save(state_dict, os.path.join(save_dir, filename))
def load_weights(self, save_dir, filename='model.pt', **kwargs):
"""Load weights from a directory.
Args:
save_dir: The directory to load weights from.
filename: A file name for weights.
**kwargs: Not used.
"""
save_dir = get_resource(save_dir)
filename = os.path.join(save_dir, filename)
# flash(f'Loading model: {filename} [blink]...[/blink][/yellow]')
try:
self.model_.load_state_dict(torch.load(filename, map_location='cpu', weights_only=True), strict=False)
except TypeError:
self.model_.load_state_dict(torch.load(filename, map_location='cpu'), strict=False)
# flash('')
def save_config(self, save_dir, filename='config.json'):
"""Save config into a directory.
Args:
save_dir: The directory to save config.
filename: A file name for config.
"""
self._savable_config.save_json(os.path.join(save_dir, filename))
def load_config(self, save_dir, filename='config.json', **kwargs):
"""Load config from a directory.
Args:
save_dir: The directory to load config.
filename: A file name for config.
**kwargs: K-V pairs to override config.
"""
save_dir = get_resource(save_dir)
self.config.load_json(os.path.join(save_dir, filename))
self.config.update(kwargs) # overwrite config loaded from disk
for k, v in self.config.items():
if isinstance(v, dict) and 'classpath' in v:
self.config[k] = Configurable.from_config(v)
self.on_config_ready(**self.config, save_dir=save_dir)
def save_vocabs(self, save_dir, filename='vocabs.json'):
"""Save vocabularies to a directory.
Args:
save_dir: The directory to save vocabularies.
filename: The name for vocabularies.
"""
if hasattr(self, 'vocabs'):
self.vocabs.save_vocabs(save_dir, filename)
def load_vocabs(self, save_dir, filename='vocabs.json'):
"""Load vocabularies from a directory.
Args:
save_dir: The directory to load vocabularies.
filename: The name for vocabularies.
"""
if hasattr(self, 'vocabs'):
self.vocabs = VocabDict()
self.vocabs.load_vocabs(save_dir, filename)
def save(self, save_dir: str, **kwargs):
"""Save this component to a directory.
Args:
save_dir: The directory to save this component.
**kwargs: Not used.
"""
self.save_config(save_dir)
self.save_vocabs(save_dir)
self.save_weights(save_dir)
def load(self, save_dir: str, devices=None, verbose=HANLP_VERBOSE, **kwargs):
"""Load from a local/remote component.
Args:
save_dir: An identifier which can be a local path or a remote URL or a pre-defined string.
devices: The devices this component will be moved onto.
verbose: ``True`` to log loading progress.
**kwargs: To override some configs.
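Examples:
A hedged sketch; the path is illustrative rather than a real identifier:
>>> component.load('/path/to/saved_component', devices=-1)  # doctest: +SKIP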
"""
save_dir = get_resource(save_dir)
# flash('Loading config and vocabs [blink][yellow]...[/yellow][/blink]')
if devices is None and self.model:
devices = self.devices
self.load_config(save_dir, **kwargs)
self.load_vocabs(save_dir)
if verbose:
flash('Building model [blink][yellow]...[/yellow][/blink]')
self.config.pop('training', None) # Some legacy versions accidentally put training into config file
self.model = self.build_model(
**merge_dict(self.config, **kwargs, overwrite=True, inplace=True), training=False, save_dir=save_dir)
if verbose:
flash('')
self.load_weights(save_dir, **kwargs)
self.to(devices, verbose=verbose)
self.model.eval()
def fit(self,
trn_data,
dev_data,
save_dir,
batch_size,
epochs,
devices=None,
logger=None,
seed=None,
finetune: Union[bool, str] = False,
eval_trn=True,
_device_placeholder=False,
**kwargs):
"""Fit to data, triggers the training procedure. For training set and dev set, they shall be local or remote
files.
Args:
trn_data: Training set.
dev_data: Development set.
save_dir: The directory to save trained component.
batch_size: The number of samples in a batch.
epochs: Number of epochs.
devices: Devices this component will live on.
logger: Any :class:`logging.Logger` instance.
seed: Random seed to reproduce this training.
finetune: ``True`` to load from ``save_dir`` instead of creating a randomly initialized component. ``str``
to specify a different ``save_dir`` to load from.
eval_trn: Evaluate training set after each update. This can slow down the training but provides a quick
diagnostic for debugging.
_device_placeholder: ``True`` to create a placeholder tensor which triggers PyTorch to occupy devices so
other components won't take these devices as first choices.
**kwargs: Hyperparameters used by sub-classes.
Returns:
Any results sub-classes would like to return. Usually the best metrics on training set.
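Examples:
A hedged sketch; paths and hyperparameters are illustrative:
>>> component.fit('path/to/trn', 'path/to/dev', 'data/model', batch_size=32, epochs=10)  # doctest: +SKIP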
"""
# Common initialization steps
config = self._capture_config(locals())
if not logger:
logger = self.build_logger('train', save_dir)
if seed is None:
self.config.seed = 233 if isdebugging() else int(time.time())
set_seed(self.config.seed)
logger.info(self._savable_config.to_json(sort=True))
if isinstance(devices, list) or devices is None or isinstance(devices, float):
flash('[yellow]Querying CUDA devices [blink]...[/blink][/yellow]')
devices = -1 if isdebugging() else cuda_devices(devices)
flash('')
# flash(f'Available GPUs: {devices}')
if isinstance(devices, list):
first_device = (devices[0] if devices else -1)
elif isinstance(devices, dict):
first_device = next(iter(devices.values()))
elif isinstance(devices, int):
first_device = devices
else:
first_device = -1
if _device_placeholder and first_device >= 0:
_dummy_placeholder = self._create_dummy_placeholder_on(first_device)
if finetune:
if isinstance(finetune, str):
self.load(finetune, devices=devices)
else:
self.load(save_dir, devices=devices)
self.config.finetune = finetune
self.vocabs.unlock() # For extending vocabs
logger.info(
f'Finetune model loaded with {sum(p.numel() for p in self.model.parameters() if p.requires_grad)}'
f'/{sum(p.numel() for p in self.model.parameters())} trainable/total parameters.')
self.on_config_ready(**self.config, save_dir=save_dir)
trn = self.build_dataloader(**merge_dict(config, data=trn_data, batch_size=batch_size, shuffle=True,
training=True, device=first_device, logger=logger, vocabs=self.vocabs,
overwrite=True))
dev = self.build_dataloader(**merge_dict(config, data=dev_data, batch_size=batch_size, shuffle=False,
training=None, device=first_device, logger=logger, vocabs=self.vocabs,
overwrite=True)) if dev_data else None
flash('[yellow]Building model [blink]...[/blink][/yellow]')
self.model = self.build_model(**merge_dict(config, training=True), logger=logger)
flash('')
logger.info(f'Model built with {sum(p.numel() for p in self.model.parameters() if p.requires_grad)}'
f'/{sum(p.numel() for p in self.model.parameters())} trainable/total parameters.')
assert self.model, 'build_model is not properly implemented.'
_description = repr(self.model)
if len(_description.split('\n')) < 10:
logger.info(_description)
self.save_config(save_dir)
self.save_vocabs(save_dir)
self.to(devices, logger)
if _device_placeholder and first_device >= 0:
del _dummy_placeholder
criterion = self.build_criterion(**merge_dict(config, trn=trn))
optimizer = self.build_optimizer(**merge_dict(config, trn=trn, criterion=criterion))
metric = self.build_metric(**self.config)
if hasattr(trn, 'dataset') and dev and hasattr(dev, 'dataset'):
if trn.dataset and dev.dataset:
logger.info(f'{len(trn.dataset)}/{len(dev.dataset)} samples in trn/dev set.')
if hasattr(trn, '__len__') and dev and hasattr(dev, '__len__'):
trn_size = len(trn) // self.config.get('gradient_accumulation', 1)
ratio_width = len(f'{trn_size}/{trn_size}')
else:
ratio_width = None
return self.execute_training_loop(**merge_dict(config, trn=trn, dev=dev, epochs=epochs, criterion=criterion,
optimizer=optimizer, metric=metric, logger=logger,
save_dir=save_dir,
devices=devices,
ratio_width=ratio_width,
trn_data=trn_data,
dev_data=dev_data,
eval_trn=eval_trn,
overwrite=True))
def build_logger(self, name, save_dir):
"""Build a :class:`logging.Logger`.
Args:
name: The name of this logger.
save_dir: The directory this logger should save logs into.
Returns:
logging.Logger: A logger.
"""
logger = init_logger(name=name, root_dir=save_dir, level=logging.INFO, fmt="%(message)s")
return logger
@abstractmethod
def build_dataloader(self, data, batch_size, shuffle=False, device=None, logger: logging.Logger = None,
**kwargs) -> DataLoader:
"""Build dataloader for training, dev and test sets. It's suggested to build vocabs in this method if they are
not built yet.
Args:
data: Data representing samples, which can be a path or a list of samples.
batch_size: Number of samples per batch.
shuffle: Whether to shuffle this dataloader.
device: Device tensors should be loaded onto.
logger: Logger for reporting messages when the dataloader takes a long time or when vocabs have to be built.
**kwargs: Arguments from ``**self.config``.
"""
pass
def build_vocabs(self, trn: torch.utils.data.Dataset, logger: logging.Logger):
"""Override this method to build vocabs.
Args:
trn: Training set.
logger: Logger for reporting progress.
"""
pass
@property
def _savable_config(self):
def convert(k, v):
if not isinstance(v, SerializableDict) and hasattr(v, 'config'):
v = v.config
elif isinstance(v, (set, tuple)):
v = list(v)
if isinstance(v, dict):
v = dict(convert(_k, _v) for _k, _v in v.items())
return k, v
config = SerializableDict(
convert(k, v) for k, v in sorted(self.config.items()))
config.update({
# 'create_time': now_datetime(),
'classpath': classpath_of(self),
'hanlp_version': hanlp.__version__,
})
return config
@abstractmethod
def build_optimizer(self, **kwargs):
"""Implement this method to build an optimizer.
Args:
**kwargs: The subclass decides the method signature.
"""
pass
@abstractmethod
def build_criterion(self, **kwargs):
"""Implement this method to build criterion (loss function).
Args:
**kwargs: The subclass decides the method signature.
"""
pass
@abstractmethod
def build_metric(self, **kwargs):
"""Implement this to build metric(s).
Args:
**kwargs: The subclass decides the method signature.
"""
pass
@abstractmethod
def execute_training_loop(self, trn: DataLoader, dev: DataLoader, epochs, criterion, optimizer, metric, save_dir,
logger: logging.Logger, devices, ratio_width=None,
**kwargs):
"""Implement this to run training loop.
Args:
trn: Training set.
dev: Development set.
epochs: Number of epochs.
criterion: Loss function.
optimizer: Optimizer(s).
metric: Metric(s)
save_dir: The directory to save this component.
logger: Logger for reporting progress.
devices: Devices this component and dataloader will live on.
ratio_width: The width of dataset size measured in number of characters. Used for logger to align messages.
**kwargs: Other hyper-parameters passed from sub-class.
"""
pass
@abstractmethod
def fit_dataloader(self, trn: DataLoader, criterion, optimizer, metric, logger: logging.Logger, **kwargs):
"""Fit onto a dataloader.
Args:
trn: Training set.
criterion: Loss function.
optimizer: Optimizer.
metric: Metric(s).
logger: Logger for reporting progress.
**kwargs: Other hyper-parameters passed from sub-class.
"""
pass
@abstractmethod
def evaluate_dataloader(self, data: DataLoader, criterion: Callable, metric=None, output=False, **kwargs):
"""Evaluate on a dataloader.
Args:
data: Dataloader which can build from any data source.
criterion: Loss function.
metric: Metric(s).
output: Whether to save outputs into some file.
**kwargs: Not used.
"""
pass
@abstractmethod
def build_model(self, training=True, **kwargs) -> torch.nn.Module:
"""Build model.
Args:
training: ``True`` if called during training.
**kwargs: ``**self.config``.
"""
raise NotImplementedError
def evaluate(self, tst_data, save_dir=None, logger: logging.Logger = None, batch_size=None, output=False, **kwargs):
"""Evaluate test set.
Args:
tst_data: Test set, which is usually a file path.
save_dir: The directory to save evaluation scores or predictions.
logger: Logger for reporting progress.
batch_size: Batch size for test dataloader.
output: Whether to save outputs into some file.
**kwargs: Not used.
Returns:
(metric, outputs) where outputs are the return values of ``evaluate_dataloader``.
"""
if not self.model:
raise RuntimeError('Call fit or load before evaluate.')
if isinstance(tst_data, str):
tst_data = get_resource(tst_data)
filename = os.path.basename(tst_data)
else:
filename = None
if output is True:
output = self.generate_prediction_filename(tst_data if isinstance(tst_data, str) else 'test.txt', save_dir)
if logger is None:
_logger_name = basename_no_ext(filename) if filename else None
logger = self.build_logger(_logger_name, save_dir)
if not batch_size:
batch_size = self.config.get('batch_size', 32)
data = self.build_dataloader(**merge_dict(self.config, data=tst_data, batch_size=batch_size, shuffle=False,
device=self.devices[0], logger=logger, overwrite=True))
dataset = data
while dataset and hasattr(dataset, 'dataset'):
dataset = dataset.dataset
num_samples = len(dataset) if dataset else None
if output and isinstance(dataset, TransformableDataset):
def add_idx(samples):
for idx, sample in enumerate(samples):
if sample:
sample[IDX] = idx
add_idx(dataset.data)
if dataset.cache:
add_idx(dataset.cache)
criterion = self.build_criterion(**self.config)
metric = self.build_metric(**self.config)
start = time.time()
outputs = self.evaluate_dataloader(data, criterion=criterion, filename=filename, output=output, input=tst_data,
save_dir=save_dir,
test=True,
num_samples=num_samples,
**merge_dict(self.config, batch_size=batch_size, metric=metric,
logger=logger, **kwargs))
elapsed = time.time() - start
if logger:
if num_samples:
logger.info(f'speed: {num_samples / elapsed:.0f} samples/second')
else:
logger.info(f'speed: {len(data) / elapsed:.0f} batches/second')
return metric, outputs
def generate_prediction_filename(self, tst_data, save_dir):
assert isinstance(tst_data,
str), 'tst_data has to be a str in order to infer the output name'
output = os.path.splitext(os.path.basename(tst_data))
output = os.path.join(save_dir, output[0] + '.pred' + output[1])
return output
def to(self,
devices: Union[int, float, List[int], Dict[str, Union[int, torch.device]]] = None,
logger: logging.Logger = None, verbose=HANLP_VERBOSE):
"""Move this component to devices.
Args:
devices: Target devices.
logger: Logger for printing progress reports, as copying a model from CPU to GPU can take several seconds.
verbose: ``True`` to print progress when logger is None.
"""
if devices is None:
# if getattr(torch, 'has_mps', None): # mac M1 chips
# devices = torch.device('mps:0')
# else:
devices = cuda_devices(devices)
elif devices == -1 or devices == [-1]:
devices = []
elif isinstance(devices, (int, float)):
devices = cuda_devices(devices)
if devices:
if logger:
logger.info(f'Using GPUs: [on_blue][cyan][bold]{devices}[/bold][/cyan][/on_blue]')
if isinstance(devices, list):
if verbose:
flash(f'Moving model to GPUs {devices} [blink][yellow]...[/yellow][/blink]')
self.model = self.model.to(devices[0])
if len(devices) > 1 and not isdebugging() and not isinstance(self.model, nn.DataParallel):
self.model = self.parallelize(devices)
elif isinstance(devices, dict):
for name, module in self.model.named_modules():
for regex, device in devices.items():
try:
on_device: torch.device = next(module.parameters()).device
except StopIteration:
continue
if on_device == device:
continue
if isinstance(device, int):
if on_device.index == device:
continue
if re.match(regex, name):
if not name:
name = '*'
flash(f'Moving module [yellow]{name}[/yellow] to [on_yellow][magenta][bold]{device}'
f'[/bold][/magenta][/on_yellow]: [red]{regex}[/red]\n')
module.to(device)
elif isinstance(devices, torch.device):
if verbose:
flash(f'Moving model to {devices} [blink][yellow]...[/yellow][/blink]')
self.model = self.model.to(devices)
else:
raise ValueError(f'Unrecognized devices {devices}')
if verbose:
flash('')
else:
if logger:
logger.info('Using [red]CPU[/red]')
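# A hedged sketch of heterogeneous placement; module names are illustrative and matched as regexes:
#   component.to({'encoder': 0, 'decoder': 1})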
def parallelize(self, devices: List[Union[int, torch.device]]):
return nn.DataParallel(self.model, device_ids=devices)
@property
def devices(self):
"""The devices this component lives on.
"""
if self.model is None:
return None
# next(parser.model.parameters()).device
if hasattr(self.model, 'device_ids'):
return self.model.device_ids
device: torch.device = next(self.model.parameters()).device
return [device]
@property
def device(self):
"""The first device this component lives on.
"""
devices = self.devices
if not devices:
return None
return devices[0]
def on_config_ready(self, **kwargs):
"""Called when config is ready, either during ``fit`` or ``load``. Subclass can perform extra initialization
tasks in this callback.
Args:
**kwargs: Not used.
"""
pass
@property
def model_(self) -> nn.Module:
"""
The underlying model, unwrapped from ``DataParallel`` when applicable.
Returns: The "real" model.
"""
if isinstance(self.model, nn.DataParallel):
return self.model.module
return self.model
# noinspection PyMethodOverriding
@abstractmethod
def predict(self, *args, **kwargs):
"""Predict on data fed by user. Users shall avoid directly call this method since it is not guarded with
``torch.no_grad`` and will introduces unnecessary gradient computation. Use ``__call__`` instead.
Args:
*args: Sentences or tokens.
**kwargs: Used in sub-classes.
"""
pass
@staticmethod
def _create_dummy_placeholder_on(device):
if device < 0:
device = 'cpu:0'
return torch.zeros(16, 16, device=device)
@torch.no_grad()
def __call__(self, *args, **kwargs):
"""Predict on data fed by user. This method calls :meth:`~hanlp.common.torch_component.predict` but decorates
it with ``torch.no_grad``.
Args:
*args: Sentences or tokens.
**kwargs: Used in sub-classes.
"""
return super().__call__(*args, **merge_dict(self.config, overwrite=True, **kwargs))
================================================
FILE: hanlp/common/transform.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-05-03 14:44
import logging
import os
from abc import ABC, abstractmethod
from typing import Tuple, Union, List
from hanlp_common.constant import EOS, PAD
from hanlp_common.structure import SerializableDict
from hanlp_common.configurable import Configurable
from hanlp.common.vocab import Vocab
from hanlp.utils.io_util import get_resource
from hanlp_common.io import load_json
from hanlp_common.reflection import classpath_of, str_to_type
from hanlp.utils.string_util import ispunct
class ToIndex(ABC):
def __init__(self, vocab: Vocab = None) -> None:
super().__init__()
if vocab is None:
vocab = Vocab()
self.vocab = vocab
@abstractmethod
def __call__(self, sample):
pass
def save_vocab(self, save_dir, filename='vocab.json'):
vocab = SerializableDict()
vocab.update(self.vocab.to_dict())
vocab.save_json(os.path.join(save_dir, filename))
def load_vocab(self, save_dir, filename='vocab.json'):
save_dir = get_resource(save_dir)
vocab = SerializableDict()
vocab.load_json(os.path.join(save_dir, filename))
self.vocab.copy_from(vocab)
class FieldToIndex(ToIndex):
def __init__(self, src, vocab: Vocab, dst=None) -> None:
super().__init__(vocab)
self.src = src
if not dst:
dst = f'{src}_id'
self.dst = dst
def __call__(self, sample: dict):
sample[self.dst] = self.vocab(sample[self.src])
return sample
def save_vocab(self, save_dir, filename=None):
if not filename:
filename = f'{self.dst}_vocab.json'
super().save_vocab(save_dir, filename)
def load_vocab(self, save_dir, filename=None):
if not filename:
filename = f'{self.dst}_vocab.json'
super().load_vocab(save_dir, filename)
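# A minimal sketch for FieldToIndex; the field name and tokens are illustrative:
#   f2i = FieldToIndex('pos', Vocab())
#   f2i({'pos': ['NN', 'VV']})  # adds 'pos_id': [2, 3] since 0 and 1 are taken by pad/unk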
class VocabList(list):
def __init__(self, *fields) -> None:
super().__init__()
for each in fields:
self.append(FieldToIndex(each))
def append(self, item: Union[str, Tuple[str, Vocab], Tuple[str, str, Vocab], FieldToIndex]) -> None:
if isinstance(item, str):
item = FieldToIndex(item)
elif isinstance(item, (list, tuple)):
if len(item) == 2:
item = FieldToIndex(src=item[0], vocab=item[1])
elif len(item) == 3:
item = FieldToIndex(src=item[0], dst=item[1], vocab=item[2])
else:
raise ValueError(f'Unsupported argument length: {item}')
elif isinstance(item, FieldToIndex):
pass
else:
raise ValueError(f'Unsupported argument type: {item}')
super().append(item)
def save_vocab(self, save_dir):
for each in self:
each.save_vocab(save_dir, None)
def load_vocab(self, save_dir):
for each in self:
each.load_vocab(save_dir, None)
class VocabDict(SerializableDict):
def __init__(self, *args, **kwargs) -> None:
"""A dict holding :class:`hanlp.common.vocab.Vocab` instances. When used as a transform, it transforms the field
corresponding to each :class:`hanlp.common.vocab.Vocab` into indices.
Args:
*args: A list of vocab names.
**kwargs: Names and corresponding :class:`hanlp.common.vocab.Vocab` instances.
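Examples:
A minimal sketch; the vocab name and tokens are illustrative:
>>> vd = VocabDict('token')
>>> vd(dict(token=['I', 'love', 'HanLP']))
{'token': ['I', 'love', 'HanLP'], 'token_id': [2, 3, 4]}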
"""
vocabs = dict(kwargs)
for each in args:
vocabs[each] = Vocab()
super().__init__(vocabs)
def save_vocabs(self, save_dir, filename='vocabs.json'):
"""Save vocabularies to a directory.
Args:
save_dir: The directory to save vocabularies.
filename: The name for vocabularies.
"""
vocabs = SerializableDict()
for key, value in self.items():
if isinstance(value, Vocab):
vocabs[key] = value.to_dict()
vocabs.save_json(os.path.join(save_dir, filename))
def load_vocabs(self, save_dir, filename='vocabs.json', vocab_cls=Vocab):
"""Load vocabularies from a directory.
Args:
save_dir: The directory to load vocabularies.
filename: The name for vocabularies.
"""
save_dir = get_resource(save_dir)
vocabs = SerializableDict()
vocabs.load_json(os.path.join(save_dir, filename))
self._load_vocabs(self, vocabs, vocab_cls)
@staticmethod
def _load_vocabs(vd, vocabs: dict, vocab_cls=Vocab):
"""
Args:
vd:
vocabs:
vocab_cls: Default class for the new vocab
"""
for key, value in vocabs.items():
if 'idx_to_token' in value:
cls = value.get('type', None)
if cls:
cls = str_to_type(cls)
else:
cls = vocab_cls
vocab = cls()
vocab.copy_from(value)
vd[key] = vocab
else: # nested Vocab
# noinspection PyTypeChecker
vd[key] = nested = VocabDict()
VocabDict._load_vocabs(nested, value, vocab_cls)
def lock(self):
"""
Lock each vocab.
"""
for key, value in self.items():
if isinstance(value, Vocab):
value.lock()
def unlock(self):
"""
Unlock each vocab.
"""
for key, value in self.items():
if isinstance(value, Vocab):
value.unlock()
@property
def mutable(self):
status = [v.mutable for v in self.values() if isinstance(v, Vocab)]
return len(status) == 0 or any(status)
def __call__(self, sample: dict):
for key, value in self.items():
if isinstance(value, Vocab):
field = sample.get(key, None)
if field is not None:
sample[f'{key}_id'] = value(field)
return sample
def __getattr__(self, key):
if key.startswith('__'):
raise AttributeError(key)  # avoid hijacking dunder lookups (e.g., during copy or pickle)
return self.__getitem__(key)
def __setattr__(self, key, value):
return self.__setitem__(key, value)
def __getitem__(self, k: str) -> Vocab:
return super().__getitem__(k)
def __setitem__(self, k: str, v: Vocab) -> None:
super().__setitem__(k, v)
def summary(self, logger: logging.Logger = None):
"""Log a summary of vocabs using a given logger.
Args:
logger: The logger to use.
"""
for key, value in self.items():
if isinstance(value, Vocab):
report = value.summary(verbose=False)
if logger:
logger.info(f'{key}{report}')
else:
print(f'{key}{report}')
def put(self, **kwargs):
"""Put names and corresponding :class:`hanlp.common.vocab.Vocab` instances into self.
Args:
**kwargs: Names and corresponding :class:`hanlp.common.vocab.Vocab` instances.
"""
for k, v in kwargs.items():
self[k] = v
class NamedTransform(ABC):
def __init__(self, src: str, dst: str = None) -> None:
if dst is None:
dst = src
self.dst = dst
self.src = src
@abstractmethod
def __call__(self, sample: dict) -> dict:
return sample
class ConfigurableTransform(Configurable, ABC):
@property
def config(self):
return dict([('classpath', classpath_of(self))] +
[(k, v) for k, v in self.__dict__.items() if not k.startswith('_')])
@classmethod
def from_config(cls, config: dict):
"""
Args:
config:
kwargs:
config: dict:
Returns:
"""
cls = config.get('classpath', None)
assert cls, f'{config} doesn\'t contain classpath field'
cls = str_to_type(cls)
config = dict(config)
config.pop('classpath')
return cls(**config)
class ConfigurableNamedTransform(NamedTransform, ConfigurableTransform, ABC):
pass
class EmbeddingNamedTransform(ConfigurableNamedTransform, ABC):
def __init__(self, output_dim: int, src: str, dst: str) -> None:
super().__init__(src, dst)
self.output_dim = output_dim
class RenameField(NamedTransform):
def __call__(self, sample: dict):
sample[self.dst] = sample.pop(self.src)
return sample
class CopyField(object):
def __init__(self, src, dst) -> None:
self.dst = dst
self.src = src
def __call__(self, sample: dict) -> dict:
sample[self.dst] = sample[self.src]
return sample
class FilterField(object):
def __init__(self, *keys) -> None:
self.keys = keys
def __call__(self, sample: dict):
sample = dict((k, sample[k]) for k in self.keys)
return sample
class TransformList(list):
"""Composes several transforms together.
Args:
transforms(list of ``Transform`` objects): list of transforms to compose.
Example:
Returns:
>>> transforms.TransformList(
>>> transforms.CenterCrop(10),
>>> transforms.ToTensor(),
>>> )
"""
def __init__(self, *transforms) -> None:
super().__init__()
self.extend(transforms)
def __call__(self, sample):
for t in self:
sample = t(sample)
return sample
def index_by_type(self, t):
for i, trans in enumerate(self):
if isinstance(trans, t):
return i
class LowerCase(object):
def __init__(self, src, dst=None) -> None:
if dst is None:
dst = src
self.src = src
self.dst = dst
def __call__(self, sample: dict) -> dict:
src = sample[self.src]
if isinstance(src, str):
sample[self.dst] = src.lower()
elif isinstance(src, list):
sample[self.dst] = [x.lower() for x in src]
return sample
class LowerCase3D(LowerCase):
def __call__(self, sample: dict) -> dict:
src = sample[self.src]
sample[self.dst] = [[y.lower() for y in x] for x in src]
return sample
class ToChar(object):
def __init__(self, src, dst='char', max_word_length=None, min_word_length=None, pad=PAD) -> None:
if dst is None:
dst = src
self.src = src
self.dst = dst
self.max_word_length = max_word_length
self.min_word_length = min_word_length
self.pad = pad
def __call__(self, sample: dict) -> dict:
src = sample[self.src]
if isinstance(src, str):
sample[self.dst] = self.to_chars(src)
elif isinstance(src, list):
sample[self.dst] = [self.to_chars(x) for x in src]
return sample
def to_chars(self, word: str):
chars = list(word)
if self.min_word_length and len(chars) < self.min_word_length:
chars = chars + [self.pad] * (self.min_word_length - len(chars))
if self.max_word_length:
chars = chars[:self.max_word_length]
return chars
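# A minimal sketch for ToChar; parameters are illustrative:
#   ToChar('token', max_word_length=4)({'token': ['hello']})
#   # adds 'char': [['h', 'e', 'l', 'l']]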
class AppendEOS(NamedTransform):
def __init__(self, src: str, dst: str = None, eos=EOS) -> None:
super().__init__(src, dst)
self.eos = eos
def __call__(self, sample: dict) -> dict:
sample[self.dst] = sample[self.src] + [self.eos]
return sample
class WhitespaceTokenizer(NamedTransform):
def __call__(self, sample: dict) -> dict:
src = sample[self.src]
if isinstance(src, str):
sample[self.dst] = self.tokenize(src)
elif isinstance(src, list):
sample[self.dst] = [self.tokenize(x) for x in src]
return sample
@staticmethod
def tokenize(text: str):
return text.split()
class NormalizeDigit(object):
def __init__(self, src, dst=None) -> None:
if dst is None:
dst = src
self.src = src
self.dst = dst
@staticmethod
def transform(word: str):
new_word = ""
for char in word:
if char.isdigit():
new_word += '0'
else:
new_word += char
return new_word
def __call__(self, sample: dict) -> dict:
src = sample[self.src]
if isinstance(src, str):
sample[self.dst] = self.transform(src)
elif isinstance(src, list):
sample[self.dst] = [self.transform(x) for x in src]
return sample
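# A minimal sketch for NormalizeDigit; the token is illustrative:
#   NormalizeDigit('token')({'token': 'GPT2'})  # -> {'token': 'GPT0'}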
class Bigram(NamedTransform):
def __init__(self, src: str, dst: str = None) -> None:
if not dst:
dst = f'{src}_bigram'
super().__init__(src, dst)
def __call__(self, sample: dict) -> dict:
src: List = sample[self.src]
dst = src + [EOS]
dst = [dst[i] + dst[i + 1] for i in range(len(src))]
sample[self.dst] = dst
return sample
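# A minimal sketch for Bigram; tokens are illustrative:
#   Bigram('token')({'token': ['A', 'B']})  # adds 'token_bigram': ['AB', 'B' + EOS]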
class FieldLength(NamedTransform):
def __init__(self, src: str, dst: str = None, delta=0) -> None:
self.delta = delta
if not dst:
dst = f'{src}_length'
super().__init__(src, dst)
def __call__(self, sample: dict) -> dict:
sample[self.dst] = len(sample[self.src]) + self.delta
return sample
class BMESOtoIOBES(object):
def __init__(self, field='tag') -> None:
self.field = field
def __call__(self, sample: dict) -> dict:
sample[self.field] = [self.convert(y) for y in sample[self.field]]
return sample
@staticmethod
def convert(y: str):
if y.startswith('M-'):
return 'I-' + y[2:]
return y
class NormalizeToken(ConfigurableNamedTransform):
def __init__(self, mapper: Union[str, dict], src: str, dst: str = None) -> None:
super().__init__(src, dst)
self.mapper = mapper
if isinstance(mapper, str):
mapper = get_resource(mapper)
if isinstance(mapper, str):
self._table = load_json(mapper)
elif isinstance(mapper, dict):
self._table = mapper
else:
raise ValueError(f'Unrecognized mapper type {mapper}')
def __call__(self, sample: dict) -> dict:
src = sample[self.src]
if self.src == self.dst:
sample[f'{self.src}_'] = src
if isinstance(src, str):
src = self.convert(src)
else:
src = [self.convert(x) for x in src]
sample[self.dst] = src
return sample
def convert(self, token) -> str:
return self._table.get(token, token)
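# A minimal sketch for NormalizeToken; the mapping is illustrative:
#   NormalizeToken({'licence': 'license'}, 'token')({'token': ['licence']})
#   # backs up the original under 'token_' and writes the normalized form to 'token'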
class PunctuationMask(ConfigurableNamedTransform):
def __init__(self, src: str, dst: str = None) -> None:
"""Mask out all punctuations (set mask of punctuations to False)
Args:
src:
dst:
Returns:
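Examples:
A minimal sketch; tokens are illustrative:
>>> PunctuationMask('token')({'token': ['Hi', '!']})
{'token': ['Hi', '!'], 'token_punct_mask': [True, False]}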
"""
if not dst:
dst = f'{src}_punct_mask'
super().__init__(src, dst)
def __call__(self, sample: dict) -> dict:
src = sample[self.src]
if isinstance(src, str):
dst = not ispunct(src)
else:
dst = [not ispunct(x) for x in src]
sample[self.dst] = dst
return sample
class NormalizeCharacter(NormalizeToken):
def convert(self, token) -> str:
return ''.join([NormalizeToken.convert(self, c) for c in token])
================================================
FILE: hanlp/common/transform_tf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-10-27 14:22
import inspect
from abc import ABC, abstractmethod
from typing import Generator, Tuple, Union, Iterable, Any
import tensorflow as tf
from hanlp_common.structure import SerializableDict
from hanlp.common.vocab_tf import VocabTF
from hanlp.utils.io_util import get_resource
from hanlp.utils.log_util import logger
class Transform(ABC):
def __init__(self, config: SerializableDict = None, map_x=True, map_y=True, **kwargs) -> None:
super().__init__()
self.map_y = map_y
self.map_x = map_x
if kwargs:
if not config:
config = SerializableDict()
for k, v in kwargs.items():
config[k] = v
self.config = config
self.output_types = None
self.output_shapes = None
self.padding_values = None
# Fix tf memory leak: https://github.com/tensorflow/tensorflow/issues/37653#issuecomment-1000517720
self.py_func_set_to_cleanup = set()
@abstractmethod
def fit(self, trn_path: str, **kwargs) -> int:
"""
Build the vocabulary from training file
Parameters
----------
trn_path : path to training set
kwargs
Returns
-------
int
How many samples in the training set
"""
raise NotImplementedError('%s.%s()' % (self.__class__.__name__, inspect.stack()[0][3]))
def build_config(self):
"""
By default, call build_types_shapes_values, usually called in component's build method.
You can perform other building task here. Remember to call super().build_config
"""
self.output_types, self.output_shapes, self.padding_values = self.create_types_shapes_values()
# We prefer list over shape here, as it's easier to type [] than ()
# if isinstance(self.output_shapes, tuple):
# self.output_shapes = list(self.output_shapes)
# for i, shapes in enumerate(self.output_shapes):
# if isinstance(shapes, tuple):
# self.output_shapes[i] = list(shapes)
# for j, shape in enumerate(shapes):
# if isinstance(shape, tuple):
# shapes[j] = list(shape)
@abstractmethod
def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]:
"""
Create dataset related values,
"""
raise NotImplementedError('%s.%s()' % (self.__class__.__name__, inspect.stack()[0][3]))
@abstractmethod
def file_to_inputs(self, filepath: str, gold=True):
"""
Transform file to inputs. The inputs are defined as raw features (e.g. words) to be processed into more
features (e.g. forms and characters)
Parameters
----------
filepath
gold
"""
raise NotImplementedError('%s.%s()' % (self.__class__.__name__, inspect.stack()[0][3]))
def inputs_to_samples(self, inputs, gold=False):
if gold:
yield from inputs
else:
for x in inputs:
yield x, self.padding_values[-1]
def file_to_samples(self, filepath: str, gold=True):
"""
Transform file to samples
Parameters
----------
filepath
gold
"""
filepath = get_resource(filepath)
inputs = self.file_to_inputs(filepath, gold)
yield from self.inputs_to_samples(inputs, gold)
def file_to_dataset(self, filepath: str, gold=True, map_x=None, map_y=None, batch_size=32, shuffle=None,
repeat=None,
drop_remainder=False,
prefetch=1,
cache=True,
**kwargs) -> tf.data.Dataset:
"""
Transform file to dataset
Parameters
----------
filepath
gold : bool
Whether it's processing gold data or not. Example: there is usually a column for gold answer
when gold = True.
map_x : bool
Whether call map_x or not. Default to self.map_x
map_y : bool
Whether call map_y or not. Default to self.map_y
batch_size
shuffle
repeat
prefetch
kwargs
Returns
-------
"""
# debug
# for sample in self.file_to_samples(filepath):
# pass
def generator():
inputs = self.file_to_inputs(filepath, gold)
samples = self.inputs_to_samples(inputs, gold)
yield from samples
return self.samples_to_dataset(generator, map_x, map_y, batch_size, shuffle, repeat, drop_remainder, prefetch,
cache)
def inputs_to_dataset(self, inputs, gold=False, map_x=None, map_y=None, batch_size=32, shuffle=None, repeat=None,
drop_remainder=False,
prefetch=1, cache=False, **kwargs) -> tf.data.Dataset:
# debug
# for sample in self.inputs_to_samples(inputs):
# pass
def generator():
samples = self.inputs_to_samples(inputs, gold)
yield from samples
return self.samples_to_dataset(generator, map_x, map_y, batch_size, shuffle, repeat, drop_remainder, prefetch,
cache)
def samples_to_dataset(self, samples: Generator, map_x=None, map_y=None, batch_size=32, shuffle=None, repeat=None,
drop_remainder=False,
prefetch=1, cache=True) -> tf.data.Dataset:
output_types, output_shapes, padding_values = self.output_types, self.output_shapes, self.padding_values
if not all(v for v in [output_types, output_shapes,
padding_values]):
# print('Did you forget to call build_config() on your transform?')
self.build_config()
output_types, output_shapes, padding_values = self.output_types, self.output_shapes, self.padding_values
assert all(v for v in [output_types, output_shapes,
padding_values]), 'Your create_types_shapes_values returned None, which is not allowed'
# if not callable(samples):
# samples = Transform.generator_to_callable(samples)
if not hasattr(tf.compat.v1.get_default_graph(), '_py_funcs_used_in_graph'):
tf.compat.v1.get_default_graph()._py_funcs_used_in_graph = []
py_func_set_before = set(tf.compat.v1.get_default_graph()._py_funcs_used_in_graph)
dataset = tf.data.Dataset.from_generator(samples, output_types=output_types, output_shapes=output_shapes)
if cache:
logger.debug('Dataset cache enabled')
dataset = dataset.cache(cache if isinstance(cache, str) else '')
if shuffle:
if isinstance(shuffle, bool):
shuffle = 1024
dataset = dataset.shuffle(shuffle)
if repeat:
dataset = dataset.repeat(repeat)
if batch_size:
dataset = dataset.padded_batch(batch_size, output_shapes, padding_values, drop_remainder)
if prefetch:
dataset = dataset.prefetch(prefetch)
if map_x is None:
map_x = self.map_x
if map_y is None:
map_y = self.map_y
if map_x or map_y:
def mapper(X, Y):
if map_x:
X = self.x_to_idx(X)
if map_y:
Y = self.y_to_idx(Y)
return X, Y
dataset = dataset.map(mapper, num_parallel_calls=tf.data.experimental.AUTOTUNE)
py_func_set_after = set(tf.compat.v1.get_default_graph()._py_funcs_used_in_graph) - py_func_set_before
self.py_func_set_to_cleanup |= py_func_set_after
return dataset
@abstractmethod
def x_to_idx(self, x) -> Union[tf.Tensor, Tuple]:
raise NotImplementedError('%s.%s()' % (self.__class__.__name__, inspect.stack()[0][3]))
@abstractmethod
def y_to_idx(self, y) -> tf.Tensor:
raise NotImplementedError('%s.%s()' % (self.__class__.__name__, inspect.stack()[0][3]))
def lock_vocabs(self):
for key, value in vars(self).items():
if isinstance(value, VocabTF):
value.lock()
def summarize_vocabs(self, logger=None, header='Vocab summary:'):
output = header + '\n'
vocabs = {}
for key, value in vars(self).items():
if isinstance(value, VocabTF):
vocabs[key] = value
# tag vocab comes last usually
for key, value in sorted(vocabs.items(), key=lambda kv: len(kv[1]), reverse=True):
output += f'{key}' + value.summary(verbose=False) + '\n'
output = output.strip()
if logger:
logger.info(output)
else:
print(output)
@staticmethod
def generator_to_callable(generator: Generator):
return lambda: (x for x in generator)
def str_to_idx(self, X, Y) -> Tuple[Union[tf.Tensor, Tuple], tf.Tensor]:
return self.x_to_idx(X), self.y_to_idx(Y)
def X_to_inputs(self, X: Union[tf.Tensor, Tuple[tf.Tensor]]) -> Iterable:
return [repr(x) for x in X]
def Y_to_outputs(self, Y: Union[tf.Tensor, Tuple[tf.Tensor]], gold=False, inputs=None, X=None,
batch=None) -> Iterable:
return [repr(y) for y in Y]
def XY_to_inputs_outputs(self, X: Union[tf.Tensor, Tuple[tf.Tensor]],
Y: Union[tf.Tensor, Tuple[tf.Tensor]], gold=False) -> Iterable:
"""
Convert predicted tensors to outputs
Parameters
----------
X : Union[tf.Tensor, Tuple[tf.Tensor]]
The inputs of model
Y : Union[tf.Tensor, Tuple[tf.Tensor]]
The outputs of model
Returns
-------
"""
return [(x, y) for x, y in zip(self.X_to_inputs(X), self.Y_to_outputs(Y, gold))]
def input_is_single_sample(self, input: Any) -> bool:
return False
def input_to_inputs(self, input: Any) -> Tuple[Any, bool]:
"""
If the input is a single sample, convert it to a list containing this single sample
Parameters
----------
input :
sample or samples
Returns
-------
(inputs, converted) : Tuple[Any, bool]
"""
flat = self.input_is_single_sample(input)
if flat:
input = [input]
return input, flat
def input_truth_output_to_str(self, input, truth, output):
"""
Convert (input, truth, output) to a string representation, usually for writing to a file during evaluation
Parameters
----------
input
truth
output
Returns
-------
"""
return '\t'.join([input, truth, output]) + '\n'
def cleanup(self):
new_py_funcs = set(tf.compat.v1.get_default_graph()._py_funcs_used_in_graph) - self.py_func_set_to_cleanup
tf.compat.v1.get_default_graph()._py_funcs_used_in_graph = list(new_py_funcs)
self.py_func_set_to_cleanup = set()
================================================
FILE: hanlp/common/vocab.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-06-13 22:42
from collections import Counter
from typing import List, Dict, Union, Iterable
from hanlp_common.constant import UNK, PAD
from hanlp_common.structure import Serializable
from hanlp_common.reflection import classpath_of
class Vocab(Serializable):
def __init__(self, idx_to_token: List[str] = None, token_to_idx: Dict = None, mutable=True, pad_token=PAD,
unk_token=UNK) -> None:
"""Vocabulary base class which converts tokens to indices and vice versa.
Args:
idx_to_token: id to token mapping.
token_to_idx: token to id mapping.
mutable: ``True`` to allow adding new tokens, ``False`` to map OOV to ``unk``.
pad_token: The token representing padding.
unk_token: The token representing OOV.
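Examples:
A minimal sketch; tokens are illustrative:
>>> vocab = Vocab()
>>> vocab.update(['HanLP', 'NLP'])
>>> vocab['HanLP']
2
>>> vocab.lock().get_idx('OOV') == vocab.unk_idx
True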
"""
super().__init__()
if idx_to_token:
t2i = dict((token, idx) for idx, token in enumerate(idx_to_token))
if token_to_idx:
t2i.update(token_to_idx)
token_to_idx = t2i
if token_to_idx is None:
token_to_idx = {}
if pad_token is not None:
token_to_idx[pad_token] = len(token_to_idx)
if unk_token is not None:
token_to_idx[unk_token] = token_to_idx.get(unk_token, len(token_to_idx))
self.token_to_idx = token_to_idx
self.idx_to_token: List[str] = None
self.mutable = mutable
self.pad_token = pad_token
self.unk_token = unk_token
def __setitem__(self, token: str, idx: int):
assert self.mutable, 'Update an immutable Vocab object is not allowed'
self.token_to_idx[token] = idx
def __getitem__(self, key: Union[str, int, List]) -> Union[int, str, List]:
""" Get the index/indices associated with a token or a list of tokens or vice versa.
Args:
key: ``str`` for token(s) and ``int`` for index/indices.
Returns: Associated indices or tokens.
"""
if isinstance(key, str):
return self.get_idx(key)
elif isinstance(key, int):
return self.get_token(key)
elif isinstance(key, list):
if len(key) == 0:
return []
elif isinstance(key[0], str):
return [self.get_idx(x) for x in key]
elif isinstance(key[0], int):
return [self.get_token(x) for x in key]
def __contains__(self, key: Union[str, int]):
if isinstance(key, str):
return key in self.token_to_idx
elif isinstance(key, int):
return 0 <= key < len(self.idx_to_token)
else:
return False
def add(self, token: str) -> int:
""" Tries to add a token into a vocab and returns its id. If it has already been there, its id will be returned
and the vocab won't be updated. If the vocab is locked, an assertion failure will occur.
Args:
token: A new or existing token.
Returns:
Its associated id.
"""
assert self.mutable, 'It is not allowed to call add on an immutable Vocab'
assert token is not None, 'Token must not be None'
assert isinstance(token, str), f'Token type must be str but got {type(token)} from {token}'
idx = self.token_to_idx.get(token, None)
if idx is None:
idx = len(self.token_to_idx)
self.token_to_idx[token] = idx
return idx
def update(self, tokens: Iterable[str]) -> None:
"""Update the vocab with these tokens by adding them to vocab one by one.
Args:
tokens (Iterable[str]): A list of tokens.
"""
assert self.mutable, 'It is not allowed to update an immutable Vocab'
for token in tokens:
self.add(token)
def get_idx(self, token: str) -> int:
"""Get the idx of a token. If it's not there, it will be added to the vocab when the vocab is locked otherwise
the id of UNK will be returned.
Args:
token: A token.
Returns:
The id of that token.
"""
assert isinstance(token, str), 'token has to be `str`'
idx = self.token_to_idx.get(token, None)
if idx is None:
if self.mutable:
idx = len(self.token_to_idx)
self.token_to_idx[token] = idx
else:
idx = self.token_to_idx.get(self.unk_token, None)
return idx
def get_idx_without_add(self, token: str) -> int:
idx = self.token_to_idx.get(token, None)
if idx is None:
idx = self.token_to_idx.get(self.safe_unk_token, None)
return idx
def get_token(self, idx: int) -> str:
"""Get the token using its index.
Args:
idx: The index to a token.
Returns:
"""
if self.idx_to_token:
return self.idx_to_token[idx]
if self.mutable:
for token in self.token_to_idx:
if self.token_to_idx[token] == idx:
return token
def has_key(self, token):
return token in self.token_to_idx
def __len__(self):
return len(self.token_to_idx)
def lock(self):
"""Lock this vocab up so that it won't accept new tokens.
Returns:
Itself.
"""
if self.locked:
return self
self.mutable = False
self.build_idx_to_token()
return self
def build_idx_to_token(self):
max_idx = max(self.token_to_idx.values())
self.idx_to_token = [None] * (max_idx + 1)
for token, idx in self.token_to_idx.items():
self.idx_to_token[idx] = token
def unlock(self):
"""Unlock this vocab so that new tokens can be added in.
Returns:
Itself.
"""
if not self.locked:
return self
self.mutable = True
self.idx_to_token = None
return self
@property
def locked(self):
"""
``True`` indicates this vocab is locked.
"""
return not self.mutable
@property
def unk_idx(self):
"""
The index of ``UNK`` token.
"""
if self.unk_token is None:
return None
else:
return self.token_to_idx.get(self.unk_token, None)
@property
def pad_idx(self):
"""
The index of ``PAD`` token.
"""
if self.pad_token is None:
return None
else:
return self.token_to_idx.get(self.pad_token, None)
@property
def tokens(self):
"""
A set of all tokens in this vocab.
"""
return self.token_to_idx.keys()
def __str__(self) -> str:
return self.token_to_idx.__str__()
def summary(self, verbose=True) -> str:
"""Get or print a summary of this vocab.
Args:
verbose: ``True`` to print the summary to stdout.
Returns:
Summary in text form.
"""
# report = 'Length: {}\n'.format(len(self))
# report += 'Samples: {}\n'.format(str(list(self.token_to_idx.keys())[:min(50, len(self))]))
# report += 'Mutable: {}'.format(self.mutable)
# report = report.strip()
report = '[{}] = '.format(len(self))
report += str(list(self.token_to_idx.keys())[:min(50, len(self))])
if verbose:
print(report)
return report
def __call__(self, some_token: Union[str, Iterable[str]]) -> Union[int, List[int]]:
if isinstance(some_token, (list, tuple, set)):
indices = []
if len(some_token) and isinstance(some_token[0], (list, tuple, set)):
for sent in some_token:
inside = []
for token in sent:
inside.append(self.get_idx(token))
indices.append(inside)
return indices
for token in some_token:
indices.append(self.get_idx(token))
return indices
else:
return self.get_idx(some_token)
def to_dict(self) -> dict:
"""Convert this vocab to a dict so that it can be json serialized.
Returns:
A dict.
"""
idx_to_token = self.idx_to_token
pad_token = self.pad_token
unk_token = self.unk_token
mutable = self.mutable
items = locals().copy()
items.pop('self')
return items
def copy_from(self, item: dict):
"""Copy properties from a dict so that it can json de-serialized.
Args:
item: A dict holding ``token_to_idx``
Returns:
Itself.
"""
for key, value in item.items():
setattr(self, key, value)
self.token_to_idx = {k: v for v, k in enumerate(self.idx_to_token)}
return self
def lower(self):
"""Convert all tokens to lower case.
Returns:
Itself.
"""
self.unlock()
token_to_idx = self.token_to_idx
self.token_to_idx = {}
for token in token_to_idx.keys():
self.add(token.lower())
return self
@property
def first_token(self):
"""The first token in this vocab.
"""
if self.idx_to_token:
return self.idx_to_token[0]
if self.token_to_idx:
return next(iter(self.token_to_idx))
return None
def merge(self, other):
"""Merge this with another vocab inplace.
Args:
other (Vocab): Another vocab.
"""
for word, idx in other.token_to_idx.items():
self.get_idx(word)
@property
def safe_pad_token(self) -> str:
"""Get the pad token safely. It always returns a pad token, which is the pad token or the first token
if pad is not present in the vocab.
"""
if self.pad_token:
return self.pad_token
if self.first_token:
return self.first_token
return PAD
@property
def safe_pad_token_idx(self) -> int:
"""Get the idx to the pad token safely. It always returns an index, which corresponds to the pad token or the
first token if pad is not present in the vocab.
"""
return self.token_to_idx.get(self.safe_pad_token, 0)
@property
def safe_unk_token(self) -> str:
"""Get the unk token safely. It always returns a unk token, which is the unk token or the first token if unk
does not presented in the vocab.
"""
if self.unk_token:
return self.unk_token
if self.first_token:
return self.first_token
return UNK
def __repr__(self) -> str:
if self.idx_to_token is not None:
return self.idx_to_token.__repr__()
return self.token_to_idx.__repr__()
def extend(self, tokens: Iterable[str]):
self.unlock()
self(tokens)
def reload_idx_to_token(self, idx_to_token: List[str], pad_idx=0, unk_idx=1):
self.idx_to_token = idx_to_token
self.token_to_idx = dict((s, i) for i, s in enumerate(idx_to_token))
if pad_idx is not None:
self.pad_token = idx_to_token[pad_idx]
if unk_idx is not None:
self.unk_token = idx_to_token[unk_idx]
def set_unk_as_safe_unk(self):
"""Set ``self.unk_token = self.safe_unk_token``. It's useful when the dev/test set contains OOV labels.
"""
self.unk_token = self.safe_unk_token
def clear(self):
self.unlock()
self.token_to_idx.clear()
class CustomVocab(Vocab):
def to_dict(self) -> dict:
d = super().to_dict()
d['type'] = classpath_of(self)
return d
class LowercaseVocab(CustomVocab):
def get_idx(self, token: str) -> int:
idx = self.token_to_idx.get(token, None)
if idx is None:
idx = self.token_to_idx.get(token.lower(), None)
if idx is None:
if self.mutable:
idx = len(self.token_to_idx)
self.token_to_idx[token] = idx
else:
idx = self.token_to_idx.get(self.unk_token, None)
return idx
class VocabWithNone(CustomVocab):
def get_idx(self, token: str) -> int:
if token is None:
return -1
return super().get_idx(token)
class VocabWithFrequency(CustomVocab):
def __init__(self, counter: Counter = None, min_occur_cnt=0, pad_token=PAD, unk_token=UNK, specials=None) -> None:
super().__init__(None, None, True, pad_token, unk_token)
if specials:
for each in specials:
counter.pop(each, None)
self.add(each)
self.frequencies = [1] * len(self)
if counter:
for token, freq in counter.most_common():
if freq >= min_occur_cnt:
self.add(token)
self.frequencies.append(freq)
self.lock()
def to_dict(self) -> dict:
d = super().to_dict()
d['frequencies'] = self.frequencies
return d
def copy_from(self, item: dict):
super().copy_from(item)
self.frequencies = item['frequencies']
def get_frequency(self, token):
idx = self.get_idx(token)
if idx is not None:
return self.frequencies[idx]
return 0
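# A minimal sketch for VocabWithFrequency; counts are illustrative:
#   v = VocabWithFrequency(Counter({'a': 5, 'b': 1}), min_occur_cnt=2)
#   v.get_frequency('a')  # -> 5; 'b' falls below min_occur_cnt and maps to unk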
class VocabCounter(CustomVocab):
def __init__(self, idx_to_token: List[str] = None, token_to_idx: Dict = None, mutable=True, pad_token=PAD,
unk_token=UNK) -> None:
super().__init__(idx_to_token, token_to_idx, mutable, pad_token, unk_token)
self.counter = Counter()
def get_idx(self, token: str) -> int:
if self.mutable:
self.counter[token] += 1
return super().get_idx(token)
def trim(self, min_frequency):
assert self.mutable
specials = {self.unk_token, self.pad_token}
survivors = list((token, freq) for token, freq in self.counter.most_common()
if freq >= min_frequency and token not in specials)
survivors = [(x, -1) for x in specials if x] + survivors
self.counter = Counter(dict(survivors))
self.token_to_idx = dict()
self.idx_to_token = None
for token, freq in survivors:
idx = len(self.token_to_idx)
self.token_to_idx[token] = idx
def copy_from(self, item: dict):
super().copy_from(item)
self.counter = Counter(item['counter'].items()) if 'counter' in item else Counter()
def to_dict(self) -> dict:
d = super().to_dict()
d['counter'] = dict(self.counter.items())
return d
class Vocab3D(CustomVocab):
def __call__(self, some_token: Union[str, Iterable[str], Iterable[Iterable[str]]]) \
-> Union[int, List[int], List[List[int]]]:
"""It supports 3D arrays of tokens.
Args:
some_token: Tokens in nested lists of 1 to 3 dimensions.
Returns:
A list of indices.
"""
if isinstance(some_token, (list, tuple, set)):
indices = []
if len(some_token) and isinstance(some_token[0], (list, tuple, set)):
for sent in some_token:
inside = []
for token in sent:
inside.append(self.get_idx(token))
indices.append(inside)
return indices
for token in some_token:
if isinstance(token, str):
indices.append(self.get_idx(token))
else:
indices.append([self.get_idx(x) for x in token])
return indices
else:
return self.get_idx(some_token)
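def _vocab3d_sketch():
    # Illustrative sketch, not part of the original source: the same callable
    # accepts a token, a sentence, or a batch of sentences, and returns ids
    # nested to the matching depth.
    vocab = Vocab3D()
    ids = vocab([['I', 'like', 'tea'], ['You', 'too']])
    assert len(ids) == 2 and len(ids[0]) == 3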
def create_label_vocab() -> Vocab:
return Vocab(pad_token=None, unk_token=None)
================================================
FILE: hanlp/common/vocab_tf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-06-13 22:42
from typing import List, Dict, Union, Iterable
from hanlp_common.structure import Serializable
from hanlp_common.constant import PAD, UNK
import tensorflow as tf
from tensorflow.python.ops.lookup_ops import index_table_from_tensor
class VocabTF(Serializable):
def __init__(self, idx_to_token: List[str] = None, token_to_idx: Dict = None, mutable=True, pad_token=PAD,
unk_token=UNK) -> None:
super().__init__()
if idx_to_token:
t2i = dict((token, idx) for idx, token in enumerate(idx_to_token))
if token_to_idx:
t2i.update(token_to_idx)
token_to_idx = t2i
if token_to_idx is None:
token_to_idx = {}
if pad_token:
token_to_idx[pad_token] = len(token_to_idx)
if unk_token:
token_to_idx[unk_token] = len(token_to_idx)
self.token_to_idx = token_to_idx
self.idx_to_token: list = None
self.mutable = mutable
self.pad_token = pad_token
self.unk_token = unk_token
self.token_to_idx_table: tf.lookup.StaticHashTable = None
self.idx_to_token_table = None
def __setitem__(self, token: str, idx: int):
        assert self.mutable, 'Updating an immutable Vocab is not allowed'
self.token_to_idx[token] = idx
def __getitem__(self, key: Union[str, int, List]) -> Union[int, str, List]:
if isinstance(key, str):
return self.get_idx(key)
elif isinstance(key, int):
return self.get_token(key)
elif isinstance(key, list):
if len(key) == 0:
return []
elif isinstance(key[0], str):
return [self.get_idx(x) for x in key]
elif isinstance(key[0], int):
return [self.get_token(x) for x in key]
    def __contains__(self, key: Union[str, int]):
        if isinstance(key, str):
            return key in self.token_to_idx
        elif isinstance(key, int):
            if self.idx_to_token is None:  # not locked yet; ids are contiguous
                return 0 <= key < len(self.token_to_idx)
            return 0 <= key < len(self.idx_to_token)
        else:
            return False
def add(self, token: str) -> int:
assert self.mutable, 'It is not allowed to call add on an immutable Vocab'
assert isinstance(token, str), f'Token type must be str but got {type(token)} from {token}'
assert token, 'Token must not be None or length 0'
idx = self.token_to_idx.get(token, None)
if idx is None:
idx = len(self.token_to_idx)
self.token_to_idx[token] = idx
return idx
def update(self, tokens: Iterable[str]) -> None:
"""Update the vocab with these tokens by adding them to vocab one by one.
Args:
tokens: Iterable[str]:
Returns:
"""
assert self.mutable, 'It is not allowed to update an immutable Vocab'
for token in tokens:
self.add(token)
def get_idx(self, token: str) -> int:
idx = self.token_to_idx.get(token, None)
if idx is None:
if self.mutable:
idx = len(self.token_to_idx)
self.token_to_idx[token] = idx
else:
idx = self.token_to_idx.get(self.unk_token, None)
return idx
def get_idx_without_add(self, token: str) -> int:
idx = self.token_to_idx.get(token, None)
if idx is None:
idx = self.token_to_idx.get(self.safe_unk_token, None)
return idx
def get_token(self, idx: int) -> str:
if self.idx_to_token:
return self.idx_to_token[idx]
if self.mutable:
for token in self.token_to_idx:
if self.token_to_idx[token] == idx:
return token
def has_key(self, token):
return token in self.token_to_idx
def __len__(self):
return len(self.token_to_idx)
def lock(self):
if self.locked:
return self
self.mutable = False
self.build_idx_to_token()
self.build_lookup_table()
return self
def build_idx_to_token(self):
max_idx = max(self.token_to_idx.values())
self.idx_to_token = [None] * (max_idx + 1)
for token, idx in self.token_to_idx.items():
self.idx_to_token[idx] = token
def build_lookup_table(self):
tensor = tf.constant(self.idx_to_token, dtype=tf.string)
self.token_to_idx_table = index_table_from_tensor(tensor, num_oov_buckets=1 if self.unk_idx is None else 0,
default_value=-1 if self.unk_idx is None else self.unk_idx)
# self.idx_to_token_table = index_to_string_table_from_tensor(self.idx_to_token, self.safe_unk_token)
def unlock(self):
if not self.locked:
return
self.mutable = True
self.idx_to_token = None
self.idx_to_token_table = None
self.token_to_idx_table = None
return self
@property
def locked(self):
return not self.mutable
@property
def unk_idx(self):
if self.unk_token is None:
return None
else:
return self.token_to_idx.get(self.unk_token, None)
@property
def pad_idx(self):
if self.pad_token is None:
return None
else:
return self.token_to_idx.get(self.pad_token, None)
@property
def tokens(self):
return self.token_to_idx.keys()
def __str__(self) -> str:
return self.token_to_idx.__str__()
def summary(self, verbose=True) -> str:
# report = 'Length: {}\n'.format(len(self))
# report += 'Samples: {}\n'.format(str(list(self.token_to_idx.keys())[:min(50, len(self))]))
# report += 'Mutable: {}'.format(self.mutable)
# report = report.strip()
report = '[{}] = '.format(len(self))
report += str(list(self.token_to_idx.keys())[:min(50, len(self))])
if verbose:
print(report)
return report
def __call__(self, some_token: Union[str, List[str]]) -> Union[int, List[int]]:
if isinstance(some_token, list):
indices = []
for token in some_token:
indices.append(self.get_idx(token))
return indices
else:
return self.get_idx(some_token)
def lookup(self, token_tensor: tf.Tensor) -> tf.Tensor:
if self.mutable:
self.lock()
return self.token_to_idx_table.lookup(token_tensor)
def to_dict(self) -> dict:
idx_to_token = self.idx_to_token
pad_token = self.pad_token
unk_token = self.unk_token
mutable = self.mutable
items = locals().copy()
items.pop('self')
return items
def copy_from(self, item: dict):
for key, value in item.items():
setattr(self, key, value)
self.token_to_idx = {k: v for v, k in enumerate(self.idx_to_token)}
if not self.mutable:
self.build_lookup_table()
def lower(self):
self.unlock()
token_to_idx = self.token_to_idx
self.token_to_idx = {}
for token in token_to_idx.keys():
self.add(token.lower())
return self
@property
def first_token(self):
if self.idx_to_token:
return self.idx_to_token[0]
if self.token_to_idx:
return next(iter(self.token_to_idx))
return None
    def merge(self, other):
        for word in other.token_to_idx:
            self.get_idx(word)
@property
def safe_pad_token(self) -> str:
"""Get the pad token safely. It always returns a pad token, which is the token
closest to pad if not presented in the vocab.
Args:
Returns:
"""
if self.pad_token:
return self.pad_token
if self.first_token:
return self.first_token
return PAD
@property
def safe_pad_token_idx(self) -> int:
return self.token_to_idx.get(self.safe_pad_token, 0)
@property
def safe_unk_token(self) -> str:
"""Get the unk token safely. It always returns a unk token, which is the token
closest to unk if not presented in the vocab.
Args:
Returns:
"""
if self.unk_token:
return self.unk_token
if self.first_token:
return self.first_token
return UNK
def create_label_vocab() -> VocabTF:
return VocabTF(pad_token=None, unk_token=None)
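def _vocab_tf_sketch():
    # Illustrative sketch, not part of the original source: once locked, the
    # vocab is backed by a TF lookup table, so token tensors can be mapped to
    # id tensors inside a tf.data pipeline.
    vocab = VocabTF()
    vocab.update(['hello', 'world'])
    # lookup() locks the vocab on first use; OOV strings map to the UNK id.
    return vocab.lookup(tf.constant(['hello', 'unseen']))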
================================================
FILE: hanlp/components/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-08-26 16:10
from .pipeline import Pipeline
================================================
FILE: hanlp/components/amr/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-08-20 17:35
================================================
FILE: hanlp/components/amr/amrbart/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2022-12-05 17:53
================================================
FILE: hanlp/components/amr/amrbart/bart_amr_generation.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2022-12-05 17:56
import logging
import os.path
from typing import Callable, Union, List
import penman
import torch
from torch.utils.data import DataLoader
from hanlp.components.amr.amrbart.data_interface.dataset import AMR2TextDataSet
from hanlp.common.dataset import SortingSamplerBuilder, PadSequenceDataLoader
from hanlp.common.torch_component import TorchComponent
from hanlp.components.amr.seq2seq.dataset.dataset import AMRDataset
from hanlp.layers.transformers.pt_imports import AutoConfig_
from hanlp.utils.time_util import CountdownTimer
from hanlp_common.constant import IDX
from hanlp_common.util import reorder
from hanlp.components.amr.amrbart.model_interface.modeling_bart import BartForConditionalGeneration
from hanlp.components.amr.amrbart.model_interface.tokenization_bart import AMRBartTokenizer
from hanlp.components.amr.amrbart.preprocess.read_and_process import dfs_linearize
class BART_AMR_Generation(TorchComponent):
def __init__(self, **kwargs) -> None:
super().__init__(**kwargs)
self.tokenizer: AMRBartTokenizer = None
self.transformer_config = None
self.model: BartForConditionalGeneration = None
def build_dataloader(self, data, batch_size=32, shuffle=False, device=None, logger: logging.Logger = None,
sampler_builder=None,
**kwargs) -> DataLoader:
dataset = AMRDataset(data, generate_idx=True, cache=True)
dataset.append_transform(lambda x: {**x, 'lamr': ' '.join(dfs_linearize(x['amr']))})
dataset.append_transform(
lambda x: AMR2TextDataSet.tokenize(x, tokenizer=self.tokenizer, text='text', amr='lamr')
)
if not sampler_builder:
sampler_builder = SortingSamplerBuilder(batch_max_tokens=500)
sampler = sampler_builder.build([len(x['input_ids']) for x in dataset], shuffle, 1)
return PadSequenceDataLoader(dataset, batch_size, shuffle, device=device, batch_sampler=sampler,
pad={'input_ids': self.transformer_config.pad_token_id,
'labels': self.transformer_config.pad_token_id})
def build_optimizer(self, **kwargs):
pass
def build_criterion(self, **kwargs):
pass
def build_metric(self, **kwargs):
pass
def execute_training_loop(self, trn: DataLoader, dev: DataLoader, epochs, criterion, optimizer, metric, save_dir,
logger: logging.Logger, devices, ratio_width=None, **kwargs):
pass
def fit_dataloader(self, trn: DataLoader, criterion, optimizer, metric, logger: logging.Logger, **kwargs):
pass
def evaluate_dataloader(self, data: DataLoader, criterion: Callable, metric=None, output=False, **kwargs):
pass
def build_model(self, training=True, transformer=None, **kwargs) -> torch.nn.Module:
model = BartForConditionalGeneration.from_pretrained(
transformer,
config=self.transformer_config,
)
if not training:
model.eval()
model.resize_token_embeddings(len(self.tokenizer))
return model
def input_is_flat(self, data):
return isinstance(data, (str, penman.Graph))
def predict(
self,
data: Union[str, List[str]], num_beams=5, max_length=1024, beautiful_amr_graph=True, verbose=False,
**kwargs
):
flat = self.input_is_flat(data)
if flat:
data = [data]
dataloader = self.build_dataloader([{'amr': penman.loads(x)[0] if isinstance(x, str) else x} for x in data],
**self.config, device=self.device)
orders = []
results = []
if verbose:
timer = CountdownTimer(len(dataloader))
for batch in dataloader:
pieces = self.predict_batch(batch, num_beams, max_length)
results.extend(pieces)
orders.extend(batch[IDX])
if verbose:
# noinspection PyUnboundLocalVariable
timer.log()
results = reorder(results, orders)
if flat:
results = results[0]
return results
def predict_batch(self, batch, num_beams, max_length):
tokenizer = self.tokenizer
input_ids = batch['input_ids']
preds = self.model.generate(
input_ids,
num_beams=num_beams,
use_cache=True,
decoder_start_token_id=tokenizer.eos_token_id,
eos_token_id=tokenizer.eos_token_id,
no_repeat_ngram_size=0,
max_length=max_length,
min_length=0,
length_penalty=1.0,
)
# tokens = batch['tgt']
decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
decoded_preds = [x.strip() for x in decoded_preds]
return decoded_preds
def load_config(self, save_dir: str, filename='config.json', **kwargs):
if os.path.isdir(save_dir):
super().load_config(save_dir, filename, **kwargs)
transformer = self.config.transformer
else:
self.config.transformer = transformer = save_dir
self.transformer_config = AutoConfig_.from_pretrained(transformer)
def load_vocabs(self, save_dir, filename='vocabs.json'):
self.tokenizer = AMRBartTokenizer.from_pretrained(
self.config.transformer,
use_fast=True,
)
def load_weights(self, save_dir, filename='model.pt', **kwargs):
pass
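# Usage sketch (illustrative; the path below is hypothetical): the component is
# restored through TorchComponent.load, after which AMR graphs (penman strings
# or penman.Graph objects) can be turned back into text:
#
#     gen = BART_AMR_Generation()
#     gen.load('/path/to/amrbart-generation')
#     text = gen.predict('(w / want-01 :ARG0 (b / boy))')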
================================================
FILE: hanlp/components/amr/amrbart/bart_amr_parser.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2022-12-05 17:56
import logging
import os.path
from typing import Callable, Union, List
import datetime
import torch
from torch.utils.data import DataLoader
from hanlp.components.amr.amrbart.data_interface.dataset import AMRParsingDataSet
from hanlp.common.dataset import SortingSamplerBuilder, PadSequenceDataLoader
from hanlp.common.torch_component import TorchComponent
from hanlp.components.amr.seq2seq.dataset.dataset import AMRDataset
from hanlp.components.amr.seq2seq.dataset.penman import AMRGraph
from hanlp.components.amr.seq2seq.evaluation import write_predictions, compute_smatch
from hanlp.layers.transformers.pt_imports import AutoConfig_
from hanlp.metrics.amr.smatch_eval import smatch_eval
from hanlp.metrics.mtl import MetricDict
from hanlp.utils.time_util import CountdownTimer
from hanlp_common.constant import IDX
from hanlp_common.util import reorder
from hanlp.components.amr.amrbart.model_interface.modeling_bart import BartForConditionalGeneration
from hanlp.components.amr.amrbart.model_interface.tokenization_bart import AMRBartTokenizer
class BART_AMR_Parser(TorchComponent):
def __init__(self, **kwargs) -> None:
super().__init__(**kwargs)
self.tokenizer: AMRBartTokenizer = None
self.transformer_config = None
self.model: BartForConditionalGeneration = None
def build_dataloader(self, data, batch_size=32, shuffle=False, device=None, logger: logging.Logger = None,
sampler_builder=None,
**kwargs) -> DataLoader:
dataset = AMRDataset(data, generate_idx=True, cache=True)
if isinstance(data, str):
dataset.append_transform(lambda x: {**x, 'text': x['amr'].metadata['snt']})
dataset.append_transform(
lambda x: AMRParsingDataSet.tokenize(x, tokenizer=self.tokenizer, text='text')
)
if not sampler_builder:
sampler_builder = SortingSamplerBuilder(batch_max_tokens=500)
sampler = sampler_builder.build([len(x['input_ids']) for x in dataset], shuffle, 1)
return PadSequenceDataLoader(dataset, batch_size, shuffle, device=device, batch_sampler=sampler,
pad={'input_ids': self.transformer_config.pad_token_id,
'labels': self.transformer_config.pad_token_id})
def build_optimizer(self, **kwargs):
pass
def build_criterion(self, **kwargs):
pass
def build_metric(self, **kwargs):
pass
def execute_training_loop(self, trn: DataLoader, dev: DataLoader, epochs, criterion, optimizer, metric, save_dir,
logger: logging.Logger, devices, ratio_width=None, **kwargs):
pass
def fit_dataloader(self, trn: DataLoader, criterion, optimizer, metric, logger: logging.Logger, **kwargs):
pass
def build_model(self, training=True, transformer=None, **kwargs) -> torch.nn.Module:
model = BartForConditionalGeneration.from_pretrained(
transformer,
config=self.transformer_config,
)
if not training:
model.eval()
model.resize_token_embeddings(len(self.tokenizer))
return model
def input_is_flat(self, data):
return isinstance(data, str)
def predict(
self,
data: Union[str, List[str]], num_beams=5, max_length=1024, beautiful_amr_graph=True, verbose=False,
**kwargs
):
flat = self.input_is_flat(data)
if flat:
data = [data]
dataloader = self.build_dataloader([{'text': x} for x in data], **self.config, device=self.device)
orders = []
results = []
# inputs, logits, labels, loss = torch.load('/local/scratch/hhe43/amrbart/batch.pt')
if verbose:
timer = CountdownTimer(len(dataloader))
for batch in dataloader:
pieces = self.predict_batch(batch, num_beams, max_length)
results.extend(pieces)
orders.extend(batch[IDX])
if verbose:
# noinspection PyUnboundLocalVariable
timer.log()
results = reorder(results, orders)
if flat:
results = results[0]
return results
def predict_batch(self, batch, num_beams, max_length):
tokenizer = self.tokenizer
input_ids = batch['input_ids']
preds = self.model.generate(
input_ids,
num_beams=num_beams,
num_return_sequences=num_beams,
use_cache=True,
decoder_start_token_id=tokenizer.amr_bos_token_id,
eos_token_id=tokenizer.amr_eos_token_id,
no_repeat_ngram_size=0,
max_length=max_length,
min_length=0,
length_penalty=1.0,
).tolist()
# tokens = batch['tgt']
graphs = []
for i in range(0, len(preds), num_beams):
graphs_same_source = []
for j in range(i, i + num_beams):
ith_pred = preds[j]
ith_pred[0] = tokenizer.bos_token_id
ith_pred = [
tokenizer.eos_token_id if itm == tokenizer.amr_eos_token_id else itm
for itm in ith_pred if itm != tokenizer.pad_token_id
]
graph, status, (lin, backr) = tokenizer.decode_amr(
ith_pred, restore_name_ops=False
)
graph.status = status
graph.nodes = lin
graph.backreferences = backr
graph.tokens = ith_pred
graphs_same_source.append(graph)
graphs_same_source[:] = \
tuple(zip(*sorted(enumerate(graphs_same_source), key=lambda x: (x[1].status.value, x[0]))))[1]
graphs.append(graphs_same_source)
# assert len(graphs) == len(tokens), f"inconsistent lengths {len(graphs)} vs {len(tokens)}"
# for idx, gps, snt in zip(batch[IDX], graphs, tokens):
# for gp in gps:
# gp.metadata = {"id": str(idx), "annotator": "bart-amr",
# "snt": snt.replace("", '').replace(" ", '').strip()}
pieces = [AMRGraph(g.triples, g.top, g.epidata, g.metadata) for g in [gs[0] for gs in graphs]]
return pieces
def load_config(self, save_dir: str, filename='config.json', **kwargs):
if os.path.isdir(save_dir):
super().load_config(save_dir, filename, **kwargs)
transformer = self.config.transformer
else:
self.config.transformer = transformer = save_dir
self.transformer_config = AutoConfig_.from_pretrained(transformer)
def load_vocabs(self, save_dir, filename='vocabs.json'):
self.tokenizer = AMRBartTokenizer.from_pretrained(
self.config.transformer,
use_fast=True,
)
def load_weights(self, save_dir, filename='model.pt', **kwargs):
pass
@torch.no_grad()
def evaluate_dataloader(self, data: DataLoader, criterion: Callable, metric=None, output=False, ratio_width=None,
logger=None, input=None, use_fast=False, num_beams=5, max_length=1024,
**kwargs):
self.model.eval()
timer = CountdownTimer(len(data))
graphs = []
orders = []
smatch = 0
for idx, batch in enumerate(data):
graphs_per_batch = self.predict_batch(batch, num_beams, max_length)
# Copy meta data from gold graph
for gp, gg in zip(graphs_per_batch, batch['amr']):
metadata = gg.metadata.copy()
metadata['annotator'] = f'{self.transformer_config.name_or_path}-amr'
metadata['date'] = str(datetime.datetime.now())
if 'save-date' in metadata:
del metadata['save-date']
gp.metadata = metadata
graphs.extend(graphs_per_batch)
orders.extend(batch[IDX])
if idx == timer.total - 1:
graphs = reorder(graphs, orders)
write_predictions(output, None, graphs)
try:
if use_fast:
smatch = compute_smatch(output, input)
else:
smatch = smatch_eval(output, input, use_fast=False)
                except Exception:
                    # Smatch evaluation is best-effort; keep the default 0 on failure.
                    pass
timer.log(smatch.cstr() if isinstance(smatch, MetricDict) else f'{smatch:.2%}', ratio_percentage=False,
logger=logger)
else:
timer.log(ratio_percentage=False, logger=logger)
return smatch
def evaluate(self, tst_data, save_dir=None, logger: logging.Logger = None, batch_size=None, output=True, **kwargs):
return super().evaluate(tst_data, save_dir, logger, batch_size, output, **kwargs)
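# Usage sketch (illustrative; the path below is hypothetical): parsing mirrors
# generation, mapping raw sentences to penman graphs. predict_batch() decodes
# num_beams candidates per sentence, ranks them by parse status, and keeps the
# best one:
#
#     parser = BART_AMR_Parser()
#     parser.load('/path/to/amrbart-parser')
#     graph = parser.predict('The boy wants to go.')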
================================================
FILE: hanlp/components/amr/amrbart/common/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2022-12-05 17:53
================================================
FILE: hanlp/components/amr/amrbart/common/constant.py
================================================
# coding:utf-8
# MIT License
#
# Copyright (c) 2022 xfbai
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
from transformers import (
AutoTokenizer,
AutoModelForSeq2SeqLM,
BartTokenizer,
BartForConditionalGeneration,
T5Tokenizer,
T5Model,
T5ForConditionalGeneration,
)
from transformers.optimization import (
get_cosine_schedule_with_warmup,
get_cosine_with_hard_restarts_schedule_with_warmup,
get_linear_schedule_with_warmup,
get_polynomial_decay_schedule_with_warmup,
get_constant_schedule_with_warmup,
)
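# Note: the 'Ġ' prefix below is the byte-level BPE marker that GPT-2/BART
# tokenizers use for a leading space, so each entry registers a whole AMR frame
# or relation (e.g. ' cause-01') as a single vocabulary item.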
raw_special_tokens = ['Ġcause-01', 'Ġpossible-01', 'Ġcontrast-01', 'Ġsay-01', 'Ġhave-03', 'Ġgovern-01', 'Ġstate-01',
'Ġthink-01', 'Ġdo-02', 'Ġwant-01', 'Ġknow-01', 'Ġrecommend-01', 'Ġsee-01', 'Ġresemble-01',
'Ġmean-01', 'Ġobligate-01', 'Ġuse-01', 'Ġgood-02', 'Ġneed-01', 'Ġwork-01', 'Ġpay-01', 'Ġget-01',
'Ġattack-01', 'Ġreal-04', 'Ġbelieve-01', 'Ġsupport-01', 'Ġreport-01', 'Ġtry-01', 'Ġsame-01',
'Ġtax-01', 'Ġoppose-01', 'Ġlive-01', 'Ġtell-01', 'Ġmake-02', 'Ġdie-01', 'Ġkill-01', 'Ġnew-01',
'Ġgive-01', 'Ġincrease-01', 'Ġagree-01', 'Ġactual-02', 'Ġgo-02', 'Ġright-05', 'Ġvote-01',
'Ġmake-01', 'Ġtake-01', 'Ġseem-01', 'Ġtalk-01', 'Ġissue-02', 'Ġbecome-01', 'Ġpost-01', 'Ġhelp-01',
'Ġstart-01', 'Ġend-01', 'Ġdevelop-02', 'Ġdecide-01', 'Ġfind-01', 'Ġclaim-01', 'Ġdefend-01',
'Ġlead-02', 'Ġhigh-02', 'Ġcontrol-01', 'Ġfree-04', 'Ġtraffic-01', 'Ġlong-03', 'Ġprovide-01',
'Ġcome-01', 'Ġplan-01', 'Ġproduce-01', 'Ġchange-01', 'Ġdiffer-02', 'Ġmarry-01', 'Ġemploy-01',
'Ġchoose-01', 'Ġfight-01', 'Ġmeet-03', 'Ġcall-01', 'Ġread-01', 'Ġunderstand-01', 'Ġsure-02',
'Ġcapable-01', 'Ġallow-01', 'Ġcrime-02', 'Ġinclude-01', 'Ġsell-01', 'Ġinfer-01', 'Ġshow-01',
'Ġfeel-01', 'Ġwar-01', 'Ġquestion-01', 'Ġlook-01', 'Ġopine-01', 'Ġlegal-02', 'Ġlose-02',
'Ġstop-01', 'Ġcreate-01', 'Ġcost-01', 'Ġcontinue-01', 'Ġbad-07', 'Ġact-02', 'Ġcare-03', 'Ġwin-01',
'Ġdiscuss-01', 'Ġdestroy-01', 'Ġpolicy-01', 'Ġelect-01', 'Ġgo-01', 'Ġtrue-01', 'Ġlie-08',
'Ġbase-02', 'Ġinsure-02', 'Ġinvest-01', 'Ġfund-01', 'Ġliberal-02', 'Ġtrade-01', 'Ġspeak-01',
'Ġinvolve-01', 'Ġfail-01', 'Ġhear-01', 'Ġlet-01', 'Ġhope-01', 'Ġinterest-01', 'Ġthreaten-01',
'Ġgrow-01', 'Ġdeal-01', 'Ġspend-01', 'Ġexist-01', 'Ġbegin-01', 'Ġdepend-01', 'Ġarrest-01',
'Ġprove-01', 'Ġbuy-01', 'Ġput-01', 'Ġget-05', 'Ġactivity-06', 'Ġoffer-01', 'Ġpersonal-02',
'Ġprotect-01', 'Ġquote-01', 'Ġwrite-01', 'Ġown-01', 'Ġbuild-01', 'Ġbenefit-01', 'Ġrelation-03',
'Ġequal-01', 'Ġsurrender-01', 'Ġexpect-01', 'Ġlike-01', 'Ġcooperate-01', 'Ġmove-01', 'Ġexcept-01',
'Ġrealize-01', 'Ġstrong-02', 'Ġhate-01', 'Ġargue-01', 'Ġask-01', 'Ġanswer-01', 'Ġlow-04',
'Ġcase-03', 'Ġresult-01', 'Ġeasy-05', 'Ġhard-02', 'Ġconcern-01', 'Ġsuspect-01', 'Ġbear-02',
'Ġserve-01', 'Ġaccept-01', 'Ġclear-06', 'Ġlove-01', 'Ġdemand-01', 'Ġlaunch-01', 'Ġexplain-01',
'Ġwrong-04', 'Ġright-06', 'Ġrequire-01', 'Ġaffect-01', 'Ġeffort-01', 'Ġforce-01', 'Ġlook-02',
'Ġwatch-01', 'Ġout-06', 'Ġoperate-01', 'Ġattempt-01', 'Ġban-01', 'Ġstudy-01', 'Ġsuggest-01',
'Ġlikely-01', 'Ġconcern-02', 'Ġthank-01', 'Ġpublic-02', 'Ġwork-09', 'Ġexemplify-01', 'Ġintend-01',
'Ġprice-01', 'Ġrespond-01', 'Ġpropose-01', 'Ġvisit-01', 'Ġcomplete-02', 'Ġtransfer-01',
'Ġaccuse-01', 'Ġcounter-01', 'Ġcut-02', 'Ġsimple-02', 'Ġcare-01', 'Ġcharge-05', 'Ġrepresent-01',
'Ġsucceed-01', 'Ġlocal-02', 'Ġmurder-01', 'Ġremember-01', 'Ġsend-01', 'Ġevidence-01',
'Ġresearch-01', 'Ġmajor-02', 'Ġwait-01', 'Ġestablish-01', 'Ġremain-01', 'Ġtest-01', 'Ġkeep-02',
'Ġexport-01', 'Ġannounce-01', 'Ġbomb-01', 'Ġfavor-01', 'Ġdeny-01', 'Ġrun-01', 'Ġexperience-01',
'Ġexpert-01', 'Ġprevent-01', 'Ġfair-01', 'Ġknow-02', 'Ġgeneral-02', 'Ġapprove-01', 'Ġwhite-02',
'Ġdescribe-01', 'Ġshare-01', 'Ġconsider-01', 'Ġcase-04', 'Ġreceive-01', 'Ġignore-01', 'Ġlink-01',
'Ġkeep-01', 'Ġcomment-01', 'Ġsex-01', 'Ġlaugh-01', 'Ġinvestigate-01', 'Ġview-02',
'Ġproliferate-01', 'Ġrefuse-01', 'Ġfear-01', 'Ġget-03', 'Ġwill-02', 'Ġrape-01', 'Ġallege-01',
'Ġget-04', 'Ġstay-01', 'Ġrise-01', 'Ġsupply-01', 'Ġdirect-02', 'Ġhonest-01', 'Ġdebate-01',
'Ġobvious-01', 'Ġappear-02', 'Ġcampaign-01', 'Ġblack-05', 'Ġreduce-01', 'Ġask-02',
'Ġcriticize-01', 'Ġguess-01', 'Ġlearn-01', 'Ġseek-01', 'Ġaccess-01', 'Ġsafe-01', 'Ġwish-01',
'Ġwrong-02', 'Ġeducate-01', 'Ġconflict-01', 'Ġrespect-01', 'Ġreach-01', 'Ġage-01', 'Ġmention-01',
'Ġexecute-01', 'Ġfind-02', 'Ġjudge-01', 'Ġbring-01', 'Ġblame-01', 'Ġhead-01', 'Ġwell-09',
'Ġensure-01', 'Ġarm-01', 'Ġcover-01', 'Ġserious-02', 'Ġtreat-01', 'Ġteach-01', 'Ġdoubt-01',
'Ġimmigrate-01', 'Ġinvade-01', 'Ġsmuggle-01', 'Ġlack-01', 'Ġearn-01', 'Ġhold-01', 'Ġlimit-01',
'Ġparticipate-01', 'Ġsentence-01', 'Ġdamage-01', 'Ġconsider-02', 'Ġname-01', 'Ġsorry-01',
'Ġrelate-01', 'Ġcriminal-03', 'Ġleft-19', 'Ġadmit-01', 'Ġadministrate-01', 'Ġtarget-01',
'Ġrun-02', 'Ġgo-06', 'Ġimprove-01', 'Ġconstruct-01', 'Ġmoral-02', 'Ġfollow-01', 'Ġcorrect-02',
'Ġprotest-01', 'Ġleave-11', 'Ġaid-01', 'Ġvalue-01', 'Ġsense-02', 'Ġdrop-01', 'Ġface-01',
'Ġserious-01', 'Ġseize-01', 'Ġtrain-01', 'Ġwarn-01', 'Ġavoid-01', 'Ġeffective-04', 'Ġdeserve-01',
'Ġplay-01', 'Ġenter-01', 'Ġregulate-01', 'Ġnear-02', 'Ġborder-01', 'Ġsolve-01', 'Ġprefer-01',
'Ġviolate-01', 'Ġrelease-01', 'Ġcite-01', 'Ġfocus-01', 'Ġadvise-01', 'Ġsound-01', 'Ġrisk-01',
'Ġreturn-01', 'Ġlist-01', 'Ġsignificant-02', 'Ġhire-01', 'Ġsurprise-01', 'Ġopen-01', 'Ġnice-01',
'Ġraise-01', 'Ġmaintain-01', 'Ġprivate-03', 'Ġimplement-01', 'Ġassist-01', 'Ġcall-02',
'Ġcompare-01', 'Ġprofit-01', 'Ġcontribute-01', 'Ġhave-to-do-with-04', 'Ġcorrupt-01', 'Ġclose-10',
'Ġsuffer-01', 'Ġexpand-01', 'Ġwonder-01', 'Ġresponsible-01', 'Ġtotal-01', 'Ġspecific-02',
'Ġpass-01', 'Ġhappy-01', 'Ġassume-02', 'Ġchance-02', 'Ġremove-01', 'Ġadd-02', 'Ġmanufacture-01',
'Ġexpress-01', 'Ġinspect-01', 'Ġwalk-01', 'Ġgood-03', 'Ġrule-01', 'Ġmanage-01', 'Ġhold-04',
'Ġspecial-02', 'Ġinfluence-01', 'Ġexchange-01', 'Ġtake-10', 'Ġconvict-01', 'Ġprocess-02',
'Ġtravel-01', 'Ġcarry-01', 'Ġdefine-01', 'Ġdisagree-01', 'Ġsave-02', 'Ġpermit-01', 'Ġestimate-01',
'Ġrate-01', 'Ġcall-03', 'Ġsingle-02', 'Ġabuse-01', 'Ġsign-01', 'Ġrule-03', 'Ġact-01',
'Ġachieve-01', 'Ġintervene-01', 'Ġfall-01', 'Ġattend-02', 'Ġfeel-02', 'Ġadopt-01', 'Ġfollow-02',
'Ġgo-on-15', 'Ġloan-01', 'Ġnegotiate-01', 'Ġhit-01', 'Ġcondition-01', 'Ġshort-07', 'Ġpromise-01',
'Ġrebel-01', 'Ġpromote-02', 'Ġstrengthen-01', 'Ġsanction-02', 'Ġwarm-01', 'Ġbehave-01',
'Ġhave-06', 'Ġsuffice-01', 'Ġlead-03', 'Ġtry-02', 'Ġlike-02', 'Ġfire-01', 'Ġdrive-01', 'Ġfly-01',
'Ġgain-02', 'Ġafford-01', 'Ġexplode-01', 'Ġpoint-out-02', 'Ġconsume-01', 'Ġmeasure-02',
'Ġreform-01', 'Ġenjoy-01', 'Ġsit-01', 'Ġavailable-02', 'Ġstrike-01', 'Ġsign-02', 'Ġcome-03',
'Ġnatural-03', 'Ġorganize-01', 'Ġprepare-02', 'Ġreplace-01', 'Ġhanging-07', 'Ġleave-15',
'Ġretire-01', 'Ġimport-01', 'Ġrange-01', 'Ġokay-04', 'Ġcover-03', 'Ġimagine-01', 'Ġkey-02',
'Ġsurvive-01', 'Ġfree-03', 'Ġbase-01', 'Ġcomplain-01', 'Ġnormal-02', 'Ġcomplete-01', 'Ġreveal-01',
'Ġenforce-01', 'Ġdetermine-01', 'Ġvictimize-01', 'Ġrepeat-01', 'Ġinterview-01', 'Ġmake-05',
'Ġdonate-01', 'Ġsteal-01', 'Ġquick-02', 'Ġattract-01', 'Ġanalyze-01', 'Ġally-01', 'Ġsuppose-01',
'Ġresponsible-03', 'Ġclose-01', 'Ġcombat-01', 'Ġidentify-01', 'Ġsuppose-02', 'Ġrecord-01',
'Ġnominate-01', 'Ġrely-01', 'Ġturn-02', 'Ġhandle-01', 'Ġprocess-01', 'Ġpredict-01', 'Ġdeploy-01',
'Ġfortunate-01', 'Ġeat-01', 'Ġjustify-01', 'Ġexpend-01', 'Ġbullshit-01', 'Ġdiscover-01',
'Ġenrich-01', 'Ġcommit-02', 'Ġshoot-02', 'Ġcheap-02', 'Ġreject-01', 'Ġweak-02', 'Ġpowerful-02',
'Ġdispute-01', 'Ġlegislate-01', 'Ġissue-01', 'Ġarrive-01', 'Ġjoin-01', 'Ġapply-02',
'Ġindicate-01', 'Ġengage-01', 'Ġinnocent-01', 'Ġfast-02', 'Ġpressure-01', 'Ġpublish-01',
'Ġobtain-01', 'Ġsad-02', 'Ġconfirm-01', 'Ġtreat-03', 'Ġlead-01', 'Ġlisten-01', 'Ġoffend-01',
'Ġaddress-02', 'Ġword-01', 'Ġright-08', 'Ġnote-01', 'Ġcontain-01', 'Ġpurchase-01', 'Ġrequest-01',
'Ġgood-04', 'Ġdesign-01', 'Ġnotice-01', 'Ġpresent-01', 'Ġshock-01', 'Ġright-02', 'Ġtransport-01',
'Ġdeliver-01', 'Ġburn-01', 'Ġfault-01', 'Ġmatter-01', 'Ġabort-01', 'Ġstick-01', 'Ġconnect-01',
'Ġconclude-01', 'Ġcontract-02', 'Ġpossess-01', 'Ġend-up-03', 'Ġsearch-01', 'Ġget-02',
'Ġqualify-02', 'Ġreact-01', 'Ġconfuse-01', 'Ġanger-01', 'Ġpursue-01', 'Ġreside-01',
'Ġrelevant-01', 'Ġoccupy-01', 'Ġwithdraw-01', 'Ġokay-01', 'Ġconform-01', 'Ġdemonstrate-01',
'Ġwear-01', 'Ġhave-04', 'Ġdecrease-01', 'Ġpunish-01', 'Ġpractice-01', 'Ġcapture-01', 'Ġgo-03',
'Ġpoll-01', 'Ġshow-04', 'Ġrefer-01', 'Ġcommit-01', 'Ġdisarm-01', 'Ġbelong-01', 'Ġdivide-02',
'Ġdrink-01', 'Ġdesire-01', 'Ġsave-01', 'Ġignorant-02', 'Ġperfect-02', 'Ġposition-02', 'Ġcrap-01',
'Ġinsult-01', 'Ġprivate-02', 'Ġwaste-01', 'Ġguilty-01', 'Ġeliminate-01', 'Ġmortgage-01',
'Ġworth-01', 'Ġinherit-01', 'Ġthrow-01', 'Ġtour-01', 'Ġsuspend-01', 'Ġharm-01', 'Ġimpose-01',
'Ġimprison-01', 'Ġrecognize-01', 'Ġprosecute-01', 'Ġview-01', 'Ġforget-01', 'Ġfound-01',
'Ġchallenge-01', 'Ġtrouble-01', 'Ġsecure-02', 'Ġorder-01', 'Ġpartner-01', 'Ġspend-02',
'Ġprogressive-02', 'Ġaccount-01', 'Ġblock-01', 'Ġguarantee-01', 'Ġconvince-01', 'Ġworry-02',
'Ġendanger-01', 'Ġmovement-07', 'Ġfuck-01', 'Ġextend-01', 'Ġseparate-02', 'Ġbalance-01',
'Ġlose-03', 'Ġpower-01', 'Ġsue-02', 'Ġurge-01', 'Ġcheck-01', 'Ġpoint-01', 'Ġturn-01',
'Ġprogress-01', 'Ġrecover-01', 'Ġridiculous-02', 'Ġaccompany-01', 'Ġappear-01', 'Ġworry-01',
'Ġplace-01', 'Ġattend-01', 'Ġsleep-01', 'Ġbreak-01', 'Ġfind-out-03', 'Ġbias-01', 'Ġaccord-03',
'Ġwide-02', 'Ġenable-01', 'Ġaffair-02', 'Ġhide-01', 'Ġhold-02', 'Ġrecognize-02', 'Ġback-01',
'Ġbet-01', 'Ġhack-04', 'Ġacquire-01', 'Ġtake-04', 'Ġpenalize-01', 'Ġmessage-01', 'Ġready-02',
'Ġcease-01', 'Ġcrazy-03', 'Ġbad-04', 'Ġcompete-02', 'Ġcontact-01', 'Ġsource-01', 'Ġset-up-03',
'Ġrestrict-01', 'Ġregard-01', 'Ġwitness-01', 'Ġlabor-01', 'Ġsmoke-02', 'Ġkick-01', 'Ġcompete-01',
'Ġhouse-01', 'Ġhurt-01', 'Ġimprovise-01', 'Ġfinance-01', 'Ġinsist-01', 'Ġfarm-01', 'Ġapply-01',
'Ġstep-01', 'Ġdeep-02', 'Ġpride-01', 'Ġbill-01', 'Ġpretend-01', 'Ġfill-01', 'Ġfine-04',
'Ġstop-03', 'Ġoffend-03', 'Ġadvertise-01', 'Ġstand-01', 'Ġaim-02', 'Ġimpact-01', 'Ġfeed-01',
'Ġgrant-01', 'Ġlast-01', 'Ġform-01', 'Ġdrive-02', 'Ġengineer-01', 'Ġinjure-01', 'Ġdevelop-01',
'Ġpresent-02', 'Ġsubsidize-01', 'Ġbring-up-02', 'Ġintelligent-01', 'Ġwelcome-01', 'Ġtake-away-05',
'Ġresolve-01', 'Ġappropriate-02', 'Ġencourage-01', 'Ġperform-02', 'Ġgo-back-19', 'Ġdeclare-02',
'Ġfull-09', 'Ġhopeful-03', 'Ġconduct-01', 'Ġsurgery-01', 'Ġdetain-01', 'Ġrelative-05',
'Ġcount-01', 'Ġglad-02', 'Ġrare-02', 'Ġcome-out-09', 'Ġapproach-02', 'Ġrace-02', 'Ġbattle-01',
'Ġcross-02', 'Ġmove-02', 'Ġquestion-03', 'Ġadminister-01', 'Ġgrow-03', 'Ġmeet-02', 'Ġdown-03',
'Ġmeet-01', 'Ġcondemn-01', 'Ġreason-01', 'Ġcarry-out-03', 'Ġworth-02', 'Ġinform-01', 'Ġstable-03',
'Ġstand-11', 'Ġutilize-01', 'Ġperpetrate-01', 'Ġassociate-01', 'Ġapologize-01', 'Ġcredit-01',
'Ġdisgust-01', 'Ġspread-03', 'Ġcommand-02', 'Ġsense-01', 'Ġdetail-01', 'Ġdefeat-01',
'Ġdistribute-01', 'Ġgive-up-07', 'Ġpain-01', 'Ġship-01', 'Ġkeep-04', 'Ġaddict-01',
'Ġcompromise-01', 'Ġlegitimate-02', 'Ġregular-02', 'Ġpick-01', 'Ġsource-02', 'Ġraid-01',
'Ġhard-04', 'Ġrain-01', 'Ġcommunicate-01', 'Ġmarket-01', 'Ġlower-05', 'Ġill-01', 'Ġdefraud-01',
'Ġposition-01', 'Ġterrible-01', 'Ġdivorce-01', 'Ġamaze-01', 'Ġedit-01', 'Ġspread-02',
'Ġclarify-10', 'Ġargue-02', 'Ġpush-01', 'Ġmiss-01', 'Ġimply-01', 'Ġdiscriminate-02', 'Ġlight-06',
'Ġappoint-01', 'Ġdelay-01', 'Ġgross-03', 'Ġput-03', 'Ġintroduce-02', 'Ġstandard-02', 'Ġpull-01',
'Ġdraw-02', 'Ġgo-08', 'Ġaim-01', 'Ġmodern-02', 'Ġdare-01', 'Ġneighbor-01', 'Ġconfront-01',
'Ġsuperior-01', 'Ġreasonable-02', 'Ġschedule-01', 'Ġadd-01', 'Ġnew-02', 'Ġlend-01', 'Ġdouble-01',
'Ġfinish-01', 'Ġraise-03', 'Ġexcuse-02', 'Ġmonitor-01', 'Ġobserve-01', 'Ġpopular-02',
'Ġcharge-01', 'Ġbudget-01', 'Ġnegative-03', 'Ġdirect-01', 'Ġrid-01', 'Ġmake-18', 'Ġmean-02',
'Ġfame-01', 'Ġjoke-01', 'Ġbeautiful-02', 'Ġtend-02', 'Ġrob-01', 'Ġriot-01', 'Ġsponsor-01',
'Ġentitle-01', 'Ġlobby-01', 'Ġbad-02', 'Ġcollapse-01', 'Ġexpose-01', 'Ġemphasize-01',
'Ġfriendly-01', 'Ġplay-02', 'Ġinitiate-01', 'Ġappreciate-02', 'Ġremind-01', 'Ġblack-04',
'Ġefficient-01', 'Ġconverse-01', 'Ġresponsible-02', 'Ġmeasure-01', 'Ġcome-04', 'Ġeffect-03',
'Ġsubject-01', 'Ġmistake-02', 'Ġpass-03', 'Ġsignal-07', 'Ġguard-01', 'Ġopen-04', 'Ġset-02',
'Ġfun-01', 'Ġcome-up-11', 'Ġflee-05', 'Ġlabel-01', 'Ġsize-01', 'Ġconfident-01', 'Ġsmart-06',
'Ġhost-01', 'Ġtough-02', 'Ġrecall-02', 'Ġscare-01', 'Ġdream-01', 'Ġassault-01', 'Ġfreeze-02',
'Ġtake-over-12', 'Ġrecession-02', 'Ġfunction-01', 'Ġwhine-01', 'Ġshort-06', 'Ġprosper-01',
'Ġadvanced-02', 'Ġvalue-02', 'Ġbother-01', 'Ġcomply-01', 'Ġright-04', 'Ġrevolution-03',
'Ġaccomplish-01', 'Ġgo-out-17', 'Ġfigure-out-05', 'Ġslow-05', 'Ġaccountable-02', 'Ġcool-01',
'Ġdocument-01', 'Ġauthorize-01', 'Ġembargo-01', 'Ġvolunteer-01', 'Ġregister-02', 'Ġfrequent-02',
'Ġrank-01', 'Ġresist-01', 'Ġbreak-up-08', 'Ġred-02', 'Ġcomfortable-02', 'Ġexamine-01',
'Ġadjust-01', 'Ġoriginate-01', 'Ġreply-01', 'Ġbreak-18', 'Ġshoot-01', 'Ġmiss-02', 'Ġdismiss-01',
'Ġcollect-01', 'Ġdraft-01', 'Ġsubmit-01', 'Ġrelieve-01', 'Ġembarrass-01', 'Ġreturn-02',
'Ġvoluntary-02', 'Ġpure-02', 'Ġbeat-01', 'Ġbear-01', 'Ġvary-01', 'Ġsick-05', 'Ġaffair-01',
'Ġtypical-02', 'Ġnegative-02', 'Ġserve-02', 'Ġeradicate-01', 'Ġrealize-02', 'Ġperceive-01',
'Ġleave-14', 'Ġgive-16', 'Ġback-up-04', 'Ġgenerate-01', 'Ġbail-out-02', 'Ġtouch-01',
'Ġcultivate-01', 'Ġconvert-01', 'Ġdismantle-01', 'Ġservice-05', 'Ġstraight-04', 'Ġbad-05',
'Ġforce-04', 'Ġadvocate-01', 'Ġpray-01', 'Ġdecline-01', 'Ġinfect-01', 'Ġtitle-01',
'Ġdesperate-02', 'Ġupset-01', 'Ġtolerate-01', 'Ġprohibit-01', 'Ġmind-05', 'Ġbeat-03', 'Ġveto-01',
'Ġcrash-01', 'Ġside-01', 'Ġcombine-01', 'Ġclose-13', 'Ġgo-10', 'Ġequip-01', 'Ġrant-01',
'Ġjail-01', 'Ġcopy-01', 'Ġdrop-05', 'Ġconsistent-02', 'Ġspend-04', 'Ġsend-03', 'Ġcritical-02',
'Ġcarry-on-02', 'Ġraise-02', 'Ġmotivate-01', 'Ġguide-01', 'Ġwonderful-03', 'Ġtrust-01',
'Ġreverse-01', 'Ġjust-02', 'Ġclaim-02', 'Ġsurvey-01', 'Ġspy-01', 'Ġget-22', 'Ġhave-05',
'Ġcool-04', 'Ġpicture-01', 'Ġunion-02', 'Ġmanage-02', 'Ġinstruct-01', 'Ġblow-03', 'Ġsacrifice-01',
'Ġowe-01', 'Ġappeal-01', 'Ġexceed-01', 'Ġradiate-01', 'Ġhonor-01', 'Ġseparate-01', 'Ġarrange-01',
'Ġdominate-01', 'Ġtransact-01', 'Ġgrow-up-04', 'Ġverify-01', 'Ġgo-05', 'Ġfamiliarize-01',
'Ġrenew-01', 'Ġfire-02', 'Ġtake-out-11', 'Ġinterpret-01', 'Ġvalid-02', 'Ġshow-up-02',
'Ġconfiscate-01', 'Ġshut-down-05', 'Ġcheat-03', 'Ġharass-01', 'Ġtie-01', 'Ġabuse-02',
'Ġassess-01', 'Ġcompensate-01', 'Ġsensitive-03', 'Ġsettle-02', 'Ġencounter-01', 'Ġmatch-01',
'Ġrecover-02', 'Ġtrust-02', 'Ġperform-01', 'Ġborrow-01', 'Ġselect-01', 'Ġbetray-01', 'Ġride-01',
'Ġuseful-05', 'Ġsplit-01', 'Ġshift-01', 'Ġannoy-01', 'Ġmind-01', 'Ġfair-04', 'Ġoppress-01',
'Ġinterfere-01', 'Ġcredit-02', 'Ġlaunder-01', 'Ġamount-01', 'Ġleave-13', 'Ġrescue-01',
'Ġstaff-01', 'Ġplay-11', 'Ġkind-01', 'Ġauthor-01', 'Ġsympathize-01', 'Ġupgrade-02',
'Ġsuppress-01', 'Ġwake-up-02', 'Ġinvite-01', 'Ġcome-12', 'Ġdeter-01', 'Ġbrainwash-01', 'Ġshit-01',
'Ġfix-02', 'Ġwhite-03', 'Ġgroup-01', 'Ġabsent-01', 'Ġarmor-01', 'Ġup-03', 'Ġpraise-01',
'Ġreview-01', 'Ġdry-02', 'Ġintercept-01', 'Ġbroadcast-01', 'Ġworship-01', 'Ġterm-01',
'Ġobject-01', 'Ġpledge-01', 'Ġprepare-01', 'Ġopen-up-03', 'Ġlay-01', 'Ġfile-01', 'Ġcheck-out-05',
'Ġattach-01', 'Ġsatisfy-01', 'Ġdepart-01', 'Ġopposite-01', 'Ġworsen-01', 'Ġaward-01',
'Ġpollute-01', 'Ġretaliate-01', 'Ġdisrupt-01', 'Ġreturn-05', 'Ġpopulate-01', 'Ġenvision-01',
'Ġplease-01', 'Ġrepair-01', 'Ġslaughter-01', 'Ġsin-01', 'Ġconstitute-01', 'Ġshop-01',
'Ġtranslate-01', 'Ġassure-01', 'Ġpay-off-02', 'Ġstimulate-01', 'Ġdamn-01', 'Ġswitch-01',
'Ġdisappear-01', 'Ġreelect-01', 'Ġspin-03', 'Ġtestify-01', 'Ġlegalize-01', 'Ġprint-01',
'Ġaverage-01', 'Ġright-03', 'Ġfix-03', 'Ġundermine-01', 'Ġcome-on-25', 'Ġlicense-01',
'Ġindict-01', 'Ġtransit-01', 'Ġwash-01', 'Ġbreathe-01', 'Ġbroad-02', 'Ġleave-17', 'Ġorder-02',
'Ġhead-02', 'Ġsing-01', 'Ġentertain-01', 'Ġcomplicate-01', 'Ġpush-02', 'Ġrealistic-03',
'Ġdisappoint-01', 'Ġbother-02', 'Ġtough-03', 'Ġdisplay-01', 'Ġflow-01', 'Ġdiffer-01', 'Ġlie-07',
'Ġpremise-01', 'Ġrelocate-01', 'Ġcorrect-01', 'Ġcoordinate-01', 'Ġabandon-01', 'Ġdictate-01',
'Ġplay-08', 'Ġrebuild-01', 'Ġclean-04', 'Ġwork-out-02', 'Ġrun-13', 'Ġcurious-01', 'Ġpromote-01',
'Ġspecialize-01', 'Ġstarve-01', 'Ġshame-02', 'Ġfit-06', 'Ġflaw-01', 'Ġfigure-01', 'Ġhunt-01',
'Ġexperiment-01', 'Ġmix-01', 'Ġregular-03', 'Ġfree-01', 'Ġdeclare-01', 'Ġescape-01', 'Ġput-02',
'Ġobsess-01', 'Ġbuild-up-05', 'Ġshut-up-06', 'Ġrally-01', 'Ġdissent-01', 'Ġprogram-01',
'Ġamend-01', 'Ġinvent-01', 'Ġleak-01', 'Ġtrigger-01', 'Ġdistinguish-01', 'Ġsymbolize-01',
'Ġexcellent-02', 'Ġlook-04', 'Ġcry-02', 'Ġassign-01', 'Ġrecruit-01', 'Ġcope-01', 'Ġmigrate-01',
'Ġtake-on-09', 'Ġbless-01', 'Ġsharp-02', 'Ġuse-02', 'Ġdisturb-01', 'Ġconsult-01', 'Ġlay-off-02',
'Ġbid-01', 'Ġaccord-02', 'Ġbusy-01', 'Ġprovoke-01', 'Ġisolate-01', 'Ġdirty-02', 'Ġblind-02',
'Ġstage-01', 'Ġboost-01', 'Ġoutrage-01', 'Ġtrack-01', 'Ġretard-01', 'Ġexclude-01', 'Ġpatent-01',
'Ġblog-01', 'Ġtorture-01', 'Ġplot-01', 'Ġcut-01', 'Ġhunger-01', 'Ġoverwhelm-01', 'Ġexploit-01',
'Ġland-01', 'Ġreserve-01', 'Ġbetter-01', 'Ġup-02', 'Ġremark-01', 'Ġpiss-03', 'Ġexcuse-01',
'Ġparalyze-01', 'Ġsummarize-01', 'Ġload-01', 'Ġdevote-01', 'Ġbury-01', 'Ġsurround-01',
'Ġdance-01', 'Ġdistort-01', 'Ġretain-01', 'Ġoverthrow-01', 'Ġrival-01', 'Ġready-01', 'Ġevolve-01',
'Ġimpoverish-01', 'Ġalarm-01', 'Ġunify-01', 'Ġrepay-01', 'Ġassume-01', 'Ġclose-06', 'Ġadmire-01',
'Ġvow-01', 'Ġaverage-04', 'Ġsight-01', 'Ġinflate-01', 'Ġreference-04', 'Ġlook-up-05',
'Ġcivilize-01', 'Ġsuitable-04', 'Ġdetect-01', 'Ġpiss-off-02', 'Ġassassinate-01', 'Ġopen-05',
'Ġshave-01', 'Ġemail-01', 'Ġfuel-01', 'Ġincentivize-01', 'Ġmark-01', 'Ġsustain-01',
'Ġspeculate-01', 'Ġsurveil-01', 'Ġswim-01', 'Ġconquer-01', 'Ġgenocide-01', 'Ġhoax-01',
'Ġnotice-03', 'Ġbe-done-08', 'Ġopt-01', 'Ġbait-01', 'Ġcompile-01', 'Ġinnovate-01', 'Ġallocate-01',
'Ġshelter-01', 'Ġcontrary-01', 'Ġburden-01', 'Ġfreeze-01', 'Ġinspire-01', 'Ġgraduate-01',
'Ġwipe-out-02', 'Ġfall-05', 'Ġcover-up-04', 'Ġrepute-01', 'Ġenhance-01', 'Ġclassify-01',
'Ġgreen-03', 'Ġscore-01', 'Ġmodify-01', 'Ġreflect-01', 'Ġforce-02', 'Ġequate-01',
'Ġmerchandise-01', 'Ġregret-01', 'Ġovercome-01', 'Ġprocure-01', 'Ġscam-01', 'Ġquit-01',
'Ġdrill-01', 'Ġdisable-01', 'Ġgrasp-01', 'Ġorbit-01', 'Ġlaughable-03', 'Ġconsent-01',
'Ġendorse-01', 'Ġcatch-02', 'Ġleave-02', 'Ġweigh-01', 'Ġroll-01', 'Ġrestore-01', 'Ġshape-01',
'Ġcomprehend-01', 'Ġtrip-03', 'Ġget-away-08', 'Ġsingle-03', 'Ġphone-01', 'Ġintimidate-01',
'Ġinstall-01', 'Ġsuck-03', 'Ġback-02', 'Ġdeem-01', 'Ġmake-up-10', 'Ġplant-01', 'Ġhand-out-03',
'Ġgo-off-16', 'Ġspeed-01', 'Ġrefute-01', 'Ġimplicate-01', 'Ġdock-01', 'Ġcrack-down-06',
'Ġforecast-01', 'Ġrush-01', 'Ġgenerous-01', 'Ġunite-01', 'Ġgrab-01', 'Ġcompetent-01',
'Ġground-02', 'Ġevaluate-01', 'Ġadvance-01', 'Ġmainstream-02', 'Ġdiagnose-01', 'Ġpass-05',
'Ġuphold-01', 'Ġhalt-01', 'Ġhinder-01', 'Ġbefriend-01', 'Ġconvene-01', 'Ġawe-01', 'Ġapplaud-01',
'Ġmodernize-01', 'Ġintegrate-01', 'Ġexecute-02', 'Ġwound-01', 'Ġprostitute-01', 'Ġexercise-01',
'Ġbind-01', 'Ġphotograph-01', 'Ġfascinate-01', 'Ġreward-01', 'Ġclean-up-02', 'Ġrepeal-01',
'Ġtwist-01', 'Ġmodel-01', 'Ġmandate-01', 'Ġconspire-01', 'Ġtear-01', 'Ġbrutal-02', 'Ġcharge-08',
'Ġdry-08', 'Ġwow-01', 'Ġbank-01', 'Ġfuck-up-02', 'Ġstand-up-07', 'Ġportray-01', 'Ġnationalize-01',
'Ġliberate-01', 'Ġexempt-01', 'Ġdefy-01', 'Ġshout-01', 'Ġdevastate-01', 'Ġhijack-01',
'Ġacknowledge-01', 'Ġcompromise-02', 'Ġconsist-01', 'Ġcoach-01', 'Ġintense-02', 'Ġdrag-01',
'Ġminor-01', 'Ġfulfill-01', 'Ġclear-01', 'Ġdeceive-01', 'Ġshake-01', 'Ġcold-01', 'Ġalign-01',
'Ġsupervise-01', 'Ġinternal-02', 'Ġgift-01', 'Ġstruggle-01', 'Ġcast-01', 'Ġfeature-01',
'Ġharsh-02', 'Ġemerge-01', 'Ġfollow-04', 'Ġcut-off-04', 'Ġmistake-01', 'Ġlocate-01', 'Ġslow-01',
'Ġaccelerate-01', 'Ġcover-02', 'Ġsoft-02', 'Ġidentical-01', 'Ġsail-01', 'Ġjump-03',
'Ġfacilitate-01', 'Ġexcessive-02', 'Ġalter-01', 'Ġescalate-01', 'Ġmad-04', 'Ġkid-01', 'Ġfloat-01',
'Ġmess-up-02', 'Ġkidnap-01', 'Ġbore-02', 'Ġclean-01', 'Ġforgive-01', 'Ġgo-through-20', 'Ġcare-04',
'Ġmeet-up-04', 'Ġmoisturize-01', 'Ġhighlight-01', 'Ġdislike-01', 'Ġboom-02', 'Ġblow-up-06',
'Ġappeal-02', 'Ġadhere-02', 'Ġcontradict-01', 'Ġleave-12', 'Ġdialogue-01', 'Ġpush-04',
'Ġcontaminate-01', 'Ġfinalize-01', 'Ġtape-02', 'Ġpatrol-01', 'Ġincite-01', 'Ġrenounce-01',
'Ġhallucinate-01', 'Ġundertake-01', 'Ġaverage-03', 'Ġcompel-01', 'Ġstruggle-02', 'Ġgo-12',
'Ġtrap-01', 'Ġquiet-04', 'Ġconvey-01', 'Ġopen-02', 'Ġclothe-01', 'Ġexclusive-02', 'Ġgather-03',
'Ġextensive-03', 'Ġapproach-01', 'Ġmanipulate-02', 'Ġinfringe-01', 'Ġruin-01', 'Ġstrive-01',
'Ġproductive-03', 'Ġexplore-01', 'Ġinhabit-01', 'Ġpress-01', 'Ġforbid-01', 'Ġhit-02',
'Ġabolish-01', 'Ġimpress-01', 'Ġprospect-02', 'Ġgoogle-01', 'Ġsink-01', 'Ġresign-01',
'Ġpull-out-02', 'Ġstation-01', 'Ġcenter-02', 'Ġindustrialize-01', 'Ġcounsel-01', 'Ġpropel-01',
'Ġsmell-01', 'Ġmoderate-03', 'Ġpresume-01', 'Ġrun-09', 'Ġkeep-up-10', 'Ġdeal-03', 'Ġapprehend-01',
'Ġsick-02', 'Ġsmell-02', 'Ġhave-11', 'Ġfrustrate-01', 'Ġcatch-01', 'Ġimpression-03',
'Ġspecify-01', 'Ġemploy-02', 'Ġthankful-02', 'Ġman-01', 'Ġprioritize-01', 'Ġattribute-01',
'Ġproject-01', 'Ġparrot-01', 'Ġbitch-01', 'Ġstand-04', 'Ġvoice-01', 'Ġpreserve-01',
'Ġpublicize-01', 'Ġexhibit-01', 'Ġundergo-28', 'Ġhelp-02', 'Ġbankrupt-01', 'Ġflood-01',
'Ġprecede-01', 'Ġreinforce-01', 'Ġtask-01', 'Ġtype-03', 'Ġtransform-01', 'Ġdespair-01',
'Ġchase-01', 'Ġspread-01', 'Ġappall-01', 'Ġrestrain-01', 'Ġterrify-01', 'Ġfool-01', 'Ġaspire-01',
'Ġwarm-07', 'Ġbring-up-08', 'Ġbleed-01', 'Ġdepress-01', 'Ġcare-02', 'Ġalert-01', 'Ġwonder-02',
'Ġdrop-out-04', 'Ġspoil-01', 'Ġstink-01', 'Ġdrug-01', 'Ġoverturn-01', 'Ġheat-01', 'Ġmerge-01',
'Ġpeak-01', 'Ġset-01', 'Ġsolid-02', 'Ġinteract-01', 'Ġthrow-out-06', 'Ġholiday-01', 'Ġrefine-01',
'Ġallow-02', 'Ġsign-up-03', 'Ġbribe-01', 'Ġappease-01', 'Ġstress-02', 'Ġfine-01', 'Ġminor-02',
'Ġmine-01', 'Ġlove-02', 'Ġnetwork-01', 'Ġdeposit-01', 'Ġstore-01', 'Ġextract-01',
'Ġinterrogate-01', 'Ġturn-out-11', 'Ġimpregnate-01', 'Ġfake-02', 'Ġwhore-01', 'Ġconceal-01',
'Ġfire-03', 'Ġlean-01', 'Ġharmful-02', 'Ġout-05', 'Ġfall-07', 'Ġdodge-01', 'Ġorient-01',
'Ġbrand-01', 'Ġsocial-03', 'Ġcut-03', 'Ġcap-01', 'Ġoverpay-01', 'Ġbridge-01', 'Ġcollaborate-01',
'Ġaddress-03', 'Ġdivert-01', 'Ġpull-09', 'Ġrevise-01', 'Ġmolest-01', 'Ġextradite-01',
'Ġdismiss-02', 'Ġreprocess-01', 'Ġaccumulate-01', 'Ġoccasion-02', 'Ġobstruct-01',
'Ġbreak-down-12', 'Ġrumor-01', 'Ġfirm-03', 'Ġsettle-03', 'Ġorder-03', 'Ġstipulate-01',
'Ġaudit-01', 'Ġenact-01', 'Ġcelebrate-02', 'Ġbargain-01', 'Ġsucceed-03', 'Ġinject-01',
'Ġexcite-01', 'Ġgreet-01', 'Ġblack-07', 'Ġterminate-01', 'Ġdescend-01', 'Ġemerge-02', 'Ġwreck-01',
'Ġabsorb-01', 'Ġblow-01', 'Ġfine-03', 'Ġcirculate-01', 'Ġtight-05', 'Ġoffense-02', 'Ġactivate-01',
'Ġsecure-01', 'Ġpass-by-17', 'Ġbash-01', 'Ġprop-up-01', 'Ġcount-04', 'Ġslap-01', 'Ġbring-down-03',
'Ġamuse-01', 'Ġfilm-01', 'Ġintroduce-01', 'Ġdesignate-01', 'Ġhang-01', 'Ġwave-04',
'Ġprivilege-01', 'Ġtake-02', 'Ġcycle-02', 'Ġcancel-01', 'Ġbuy-05', 'Ġsweep-01', 'Ġhelp-out-03',
'Ġleft-20', 'Ġsuit-01', 'Ġenslave-01', 'Ġrest-01', 'Ġambush-01', 'Ġmean-04', 'Ġdistract-01',
'Ġmatch-03', 'Ġwarrant-01', 'Ġdisguise-01', 'Ġmake-up-07', 'Ġparty-01', 'Ġclose-11', 'Ġfall-10',
'Ġpump-01', 'Ġresort-01', 'Ġget-back-10', 'Ġregain-01', 'Ġlose-01', 'Ġerr-01', 'Ġrun-out-05',
'Ġthat-is-it-00', 'Ġaggravate-01', 'Ġloot-01', 'Ġhappen-02', 'Ġscrew-02', 'Ġmake-it-14',
'Ġpick-up-04', 'Ġrefer-02', 'Ġbreak-13', 'Ġupdate-01', 'Ġshine-01', 'Ġcongratulate-01',
'Ġpilot-01', 'Ġdisgrace-01', 'Ġfabricate-01', 'Ġsicken-01', 'Ġcriticism-04', 'Ġpreach-01',
'Ġdeport-01', 'Ġdeal-02', 'Ġinflict-01', 'Ġgain-01', 'Ġresume-01', 'Ġoutlaw-01', 'Ġshoot-down-05',
'Ġpartition-01', 'Ġaddress-01', 'Ġenvy-01', 'Ġbreak-02', 'Ġspeak-out-03', 'Ġbroaden-01',
'Ġstress-01', 'Ġinfiltrate-01', 'Ġflat-06', 'Ġimpeach-01', 'Ġtransgress-01', 'Ġpardon-01',
'Ġuncover-01', 'Ġcomprise-01', 'Ġreconstruct-01', 'Ġlibel-01', 'Ġhand-01', 'Ġhint-01',
'Ġencourage-02', 'Ġprevail-02', 'Ġbrave-02', 'Ġforesee-01', 'Ġconcede-01', 'Ġdeteriorate-01',
'Ġtopple-01', 'Ġmobile-02', 'Ġpanic-01', 'Ġmisunderstand-01', 'Ġtire-01', 'Ġenthusiastic-03',
'Ġexercise-02', 'Ġpersist-01', 'Ġinferior-01', 'Ġbrilliant-01', 'Ġbuild-02', 'Ġscream-01',
'Ġanticipate-01', 'Ġout-03', 'Ġration-01', 'Ġcount-02', 'Ġconsistent-01', 'Ġawait-01',
'Ġschool-01', 'Ġrent-01', 'Ġarise-02', 'Ġappeal-03', 'Ġhelpful-04', 'Ġsee-03', 'Ġlock-01',
'Ġstereotype-01', 'Ġjoin-in-05', 'Ġscrew-up-01', 'Ġwithhold-01', 'Ġmoderate-01', 'Ġaffiliate-01',
'Ġwaive-01', 'Ġsuck-01', 'Ġgolf-01', 'Ġturn-out-17', 'Ġput-up-11', 'Ġkeep-up-05', 'Ġstraight-05',
'Ġdress-01', 'Ġdig-01', 'Ġplead-02', 'Ġlecture-01', 'Ġgo-09', 'Ġpervert-01', 'Ġcry-01',
'Ġmitigate-01', 'Ġsubstitute-01', 'Ġsend-02', 'Ġdown-01', 'Ġwesternize-01', 'Ġcolor-01',
'Ġrefer-03', 'Ġpersecute-01', 'Ġscheme-01', 'Ġreactionary-02', 'Ġsubscribe-01', 'Ġshield-01',
'Ġexile-01', 'Ġdetonate-01', 'Ġstall-01', 'Ġbroker-01', 'Ġcalculate-01', 'Ġnarrow-02',
'Ġstock-01', 'Ġturn-down-05', 'Ġparole-01', 'Ġjoin-04', 'Ġinstitute-01', 'Ġdisprove-01',
'Ġpass-20', 'Ġspew-01', 'Ġbid-03', 'Ġwage-01', 'Ġsample-01', 'Ġretail-01', 'Ġratify-01',
'Ġspank-01', 'Ġdispatch-01', 'Ġharvest-01', 'Ġrot-01', 'Ġdelude-01', 'Ġclimb-01', 'Ġfrighten-01',
'Ġyell-01', 'Ġcoerce-01', 'Ġscary-03', 'Ġstretch-01', 'Ġdestabilize-01', 'Ġblood-02',
'Ġconfine-01', 'Ġoutrageous-02', 'Ġbeg-01', 'Ġwield-01', 'Ġscrap-01', 'Ġprivatize-01', 'Ġcure-01',
'Ġmature-02', 'Ġcoexist-01', 'Ġassert-02', 'Ġget-along-18', 'Ġreunify-01', 'Ġlook-forward-03',
'Ġnumber-01', 'Ġtrash-01', 'Ġrun-04', 'Ġgive-up-08', 'Ġbright-02', 'Ġout-01', 'Ġheal-01',
'Ġmassacre-01', 'Ġtackle-01', 'Ġstake-01', 'Ġopen-09', 'Ġknow-04', 'Ġcorrespond-02',
'Ġdisregard-01', 'Ġalienate-01', 'Ġinsure-01', 'Ġdisapprove-01', 'Ġdrain-01', 'Ġdeflect-01',
'Ġexit-01', 'Ġvacation-01', 'Ġcook-01', 'Ġadapt-01', 'Ġdissolve-01', 'Ġlift-01', 'Ġclose-down-04',
'Ġcome-down-23', 'Ġbully-01', 'Ġdenounce-01', 'Ġstab-01', 'Ġexpel-01', 'Ġabstain-01',
'Ġcut-out-06', 'Ġswallow-01', 'Ġcome-in-07', 'Ġstep-in-02', 'Ġseek-out-02', 'Ġpace-01', 'Ġwed-01',
'Ġgo-on-25', 'Ġsave-03', 'Ġcome-up-13', 'Ġsort-out-02', 'Ġtattoo-01', 'Ġleave-out-03', 'Ġkiss-01',
'Ġchance-01', 'Ġprolong-01', 'Ġtroll-01', 'Ġconcentrate-01', 'Ġchannel-01', 'Ġrecreation-02',
'Ġcenter-01', 'Ġweaponize-01', 'Ġexplicit-03', 'Ġdraft-02', 'Ġpose-02', 'Ġcrush-01',
'Ġdiscredit-01', 'Ġfurther-01', 'Ġdedicate-01', 'Ġsit-down-02', 'Ġleave-10', 'Ġforge-02',
'Ġcensor-01', 'Ġparade-02', 'Ġpaint-02', 'Ġcatch-03', 'Ġremortgage-01', 'Ġslow-down-03',
'Ġadmit-02', 'Ġbreak-19', 'Ġcounterfeit-01', 'Ġrun-10', 'Ġupgrade-01', 'Ġdeduct-01',
'Ġconfess-01', 'Ġdecline-02', 'Ġbar-01', 'Ġbrief-01', 'Ġconduct-02', 'Ġlynch-01', 'Ġacquit-01',
'Ġhyperlink-01', 'Ġlight-04', 'Ġconcrete-02', 'Ġreach-02', 'Ġmarch-01', 'Ġpurport-01',
'Ġcall-on-05', 'Ġpaddle-01', 'Ġfilter-02', 'Ġstrip-01', 'Ġcompose-01', 'Ġerupt-01', 'Ġwipe-01',
'Ġtrace-02', 'Ġdespise-01', 'Ġminimize-01', 'Ġneglect-01', 'Ġloyal-01', 'Ġslip-01', 'Ġrevive-01',
'Ġwork-07', 'Ġbeat-up-05', 'Ġdetermined-02', 'Ġpass-07', 'Ġprescribe-02', 'Ġfuss-01',
'Ġdemolish-01', 'Ġavail-01', 'Ġput-in-05', 'Ġlease-01', 'Ġembrace-01', 'Ġmerit-01',
'Ġintensify-01', 'Ġhearing-02', 'Ġweaken-01', 'Ġcolonize-01', 'Ġoffset-01', 'Ġgather-01',
'Ġtake-off-07', 'Ġbright-03', 'Ġextend-02', 'Ġget-30', 'Ġpreexist-01', 'Ġsnow-01', 'Ġstrike-02',
'Ġgross-06', 'Ġdiminish-01', 'Ġprejudice-01', 'Ġrage-02', 'Ġnotify-01', 'Ġcontest-02', 'Ġhype-01',
'Ġrevisit-01', 'Ġdark-02', 'Ġstand-08', 'Ġcertify-01', 'Ġoversee-01', 'Ġname-02', 'Ġlock-up-03',
'Ġknow-03', 'Ġminimal-02', 'Ġtell-02', 'Ġrotate-01', 'Ġoperate-02', 'Ġfat-03', 'Ġindulge-01',
'Ġfeel-06', 'Ġset-08', 'Ġsurpass-01', 'Ġpull-06', 'Ġget-06', 'Ġcamp-02', 'Ġgut-01', 'Ġchair-01',
'Ġqualify-01', 'Ġspare-01', 'Ġblunt-02', 'Ġproceed-01', 'Ġdump-01', 'Ġreckon-01', 'Ġpierce-01',
'Ġmelt-01', 'Ġfeel-05', 'Ġstand-03', 'Ġelaborate-01', 'Ġreach-03', 'Ġspark-01', 'Ġcoincide-01',
'Ġslander-01', 'Ġjoin-up-02', 'Ġshame-01', 'Ġboard-01', 'Ġrule-out-02', 'Ġblockade-01',
'Ġincinerate-01', 'Ġderive-01', 'Ġget-by-17', 'Ġcharacterize-01', 'Ġstockpile-01', 'Ġpersuade-01',
'Ġdecapitate-01', 'Ġrun-08', 'Ġpack-01', 'Ġbust-01', 'Ġpolice-01', 'Ġtrick-01', 'Ġblast-05',
'Ġtreat-04', 'Ġrun-off-24', 'Ġapprentice-01', 'Ġdispose-01', 'Ġinhibit-01', 'Ġwire-01', 'Ġtop-01',
'Ġhand-over-02', 'Ġknow-06', 'Ġabet-01', 'Ġcatch-up-04', 'Ġsleep-02', 'Ġslam-02', 'Ġbreed-01',
'Ġcontend-02', 'Ġperjure-01', 'Ġmanipulate-01', 'Ġprobe-01', 'Ġtrend-01', 'Ġtighten-01',
'Ġboycott-01', 'Ġtable-01', 'Ġindoctrinate-01', 'Ġsafeguard-01', 'Ġevacuate-01', 'Ġinterdict-01',
'Ġpetition-01', 'Ġformulate-01', 'Ġpartake-01', 'Ġpass-04', 'Ġoverride-01', 'Ġemit-01',
'Ġcharacteristic-02', 'Ġtimely-03', 'Ġstun-01', 'Ġcrumble-01', 'Ġmaximize-01', 'Ġpass-away-16',
'Ġrun-07', 'Ġsmile-01', 'Ġinquire-01', 'Ġlag-01', 'Ġlive-up-04', 'Ġdistance-01', 'Ġcold-02',
'Ġdeep-03', 'Ġrelax-01', 'Ġill-02', 'Ġsignify-01', 'Ġhold-back-07', 'Ġtransplant-01', 'Ġsmoke-01',
'Ġcurb-01', 'Ġdelegate-01', 'Ġseal-01', 'Ġlure-01', 'Ġintimate-02', 'Ġfresh-04', 'Ġseat-01',
'Ġmove-03', 'Ġkeep-03', 'Ġoutweigh-01', 'Ġrevere-01', 'Ġclone-01', 'Ġenlist-01', 'Ġclick-01',
'Ġempty-02', 'Ġfire-04', 'Ġcontend-01', 'Ġabide-01', 'Ġcraft-01', 'Ġtip-05', 'Ġwrap-01',
'Ġbite-01', 'Ġtoss-01', 'Ġpolite-01', 'Ġdesirable-02', 'Ġdefuse-01', 'Ġthrill-01', 'Ġproduce-02',
'Ġoblige-02', 'Ġdate-02', 'Ġalternate-01', 'Ġget-on-21', 'Ġramble-02', 'Ġhurt-02', 'Ġdistant-02',
'Ġhot-05', 'Ġpale-03', 'Ġproclaim-01', 'Ġclass-01', 'Ġcome-across-21', 'Ġsneak-01', 'Ġerode-01',
'Ġchampion-01', 'Ġneutral-02', 'Ġalien-01', 'Ġgrieve-01', 'Ġswear-01', 'Ġgo-21',
'Ġunderestimate-01', 'Ġaddictive-02', 'Ġpropagate-01', 'Ġlast-04', 'Ġcommence-01', 'Ġair-01',
'Ġmark-02', 'Ġaccommodate-01', 'Ġdemonize-01', 'Ġmock-01', 'Ġnuke-01', 'Ġswell-01', 'Ġbrag-01',
'Ġassert-03', 'Ġdisrespect-01', 'Ġwork-12', 'Ġremarkable-02', 'Ġpool-01', 'Ġpaint-03', 'Ġpour-01',
'Ġdecommission-01', 'Ġamplify-01', 'Ġmad-02', 'Ġcorrelate-01', 'Ġautomate-01', 'Ġmoney-01',
'Ġcontent-02', 'Ġstorm-01', 'Ġthrive-01', 'Ġliable-01', 'Ġhopeful-02', 'Ġexpire-01', 'Ġwork-06',
'Ġdisperse-01', 'Ġlay-04', 'Ġfall-apart-09', 'Ġterror-02', 'Ġphilander-01', 'Ġscrutinize-01',
'Ġfathom-01', 'Ġmake-up-08', 'Ġhumiliate-01', 'Ġcharge-06', 'Ġnatural-02', 'Ġfollow-up-03',
'Ġbend-01', 'Ġgrade-01', 'Ġenter-02', 'Ġpend-01', 'Ġprey-01', 'Ġmediate-01', 'Ġconclude-02',
'Ġmask-01', 'Ġreactivate-01', 'Ġevolve-02', 'Ġrestart-01', 'Ġencrypt-01', 'Ġget-through-12',
'Ġgrow-02', 'Ġbestow-01', 'Ġput-out-10', 'Ġdisplace-01', 'Ġcount-03', 'Ġstabilize-01',
'Ġembezzle-01', 'Ġpass-on-09', 'Ġform-02', 'Ġroot-02', 'Ġtrample-01', 'Ġmake-out-23',
'Ġfit-in-02', 'Ġhospitalize-01', 'Ġcut-down-11', 'Ġconstrain-01', 'Ġclash-01', 'Ġconsolidate-01',
'Ġmeddle-01', 'Ġreproduce-01', 'Ġclever-01', 'Ġdiversify-01', 'Ġpostpone-01', 'Ġstructure-01',
'Ġnarrow-01', 'Ġincur-01', 'Ġdraw-up-03', 'Ġdrive-04', 'Ġpin-01', 'Ġdelight-01', 'Ġput-on-08',
'Ġcoverage-06', 'Ġbring-about-05', 'Ġstir-up-04', 'Ġlet-down-04', 'Ġsigh-02', 'Ġspace-01',
'Ġcheat-02', 'Ġlessen-01', 'Ġrender-02', 'Ġrender-01', 'Ġmenace-01', 'Ġprevail-01', 'Ġreclaim-01',
'Ġpuzzle-01', 'Ġhesitate-01', 'Ġgo-23', 'Ġcharm-01', 'Ġturn-over-12', 'Ġwander-01',
'Ġrenovate-01', 'Ġpackage-01', 'Ġheadquarter-01', 'Ġline-01', 'Ġstraight-06', 'Ġpark-01',
'Ġturn-on-13', 'Ġarbitrary-02', 'Ġconceive-01', 'Ġexert-01', 'Ġspell-01', 'Ġdye-01', 'Ġtune-01',
'Ġrip-01', 'Ġgarner-01', 'Ġsick-04', 'Ġshove-01', 'Ġwave-01', 'Ġrust-01', 'Ġkneel-01',
'Ġcelebrate-01', 'Ġmisrepresent-01', 'Ġincarcerate-01', 'Ġawake-03', 'Ġup-01', 'Ġslip-02',
'Ġconcentrate-02', 'Ġround-05', 'Ġloose-04', 'Ġcripple-01', 'Ġpart-01', 'Ġhoard-01', 'Ġchain-01',
'Ġtricky-02', 'Ġhook-up-02', 'Ġtype-01', 'Ġglance-01', 'Ġprize-01', 'Ġtransmit-01', 'Ġhold-03',
'Ġsurge-01', 'Ġheadline-01', 'Ġvote-02', 'Ġdraw-01', 'Ġtext-01', 'Ġshower-01', 'Ġcalm-down-02',
'Ġfeed-up-03', 'Ġslide-01', 'Ġgo-down-27', 'Ġforward-01', 'Ġproject-02', 'Ġempower-01',
'Ġmind-04', 'Ġpass-02', 'Ġneutralize-01', 'Ġrepress-01', 'Ġserve-04', 'Ġeye-01',
'Ġdiscriminate-01', 'Ġoverlook-01', 'Ġtop-02', 'Ġmobilize-01', 'Ġstart-out-05', 'Ġpunishable-02',
'Ġunderlie-01', 'Ġpenetrate-01', 'Ġgrind-01', 'Ġjump-01', 'Ġpertain-01', 'Ġincline-01',
'Ġhumble-01', 'Ġmoderate-02', 'Ġmeaningful-05', 'Ġmislead-01', 'Ġfinish-07', 'Ġdisgruntle-01',
'Ġturn-up-15', 'Ġknock-01', 'Ġtake-03', 'Ġlunch-01', 'Ġadd-03', 'Ġcommend-01', 'Ġpatient-01',
'Ġattain-01', 'Ġhike-02', 'Ġlurk-01', 'Ġbe-02', 'Ġblackmail-01', 'Ġdubious-02', 'Ġentrench-01',
'Ġget-off-23', 'Ġflame-01', 'Ġstand-02', 'Ġsurvive-02', 'Ġafford-02', 'Ġlive-02', 'Ġmoan-01',
'Ġportion-01', 'Ġslash-02', 'Ġbreak-through-22', 'Ġplague-01', 'Ġblunt-01', 'Ġabominable-02',
'Ġhonorable-03', 'Ġrelated-04', 'Ġdeprive-01', 'Ġdecay-01', 'Ġdistress-01', 'Ġredistribute-01',
'Ġforeclose-01', 'Ġwarm-06', 'Ġjealous-02', 'Ġcohere-01', 'Ġpaste-01', 'Ġprompt-01',
'Ġcurtail-01', 'Ġtrack-down-02', 'Ġpity-01', 'Ġticket-02', 'Ġtransition-01', 'Ġburst-02',
'Ġbroke-23', 'Ġrewrite-01', 'Ġdeliberate-01', 'Ġdisclose-01', 'Ġsituate-01', 'Ġreiterate-01',
'Ġprofess-01', 'Ġbabble-01', 'Ġlift-02', 'Ġdeclassify-01', 'Ġremand-01', 'Ġreconcile-01',
'Ġassemble-01', 'Ġextort-01', 'Ġcorroborate-01', 'Ġsnip-01', 'Ġnormalize-01', 'Ġclose-03',
'Ġremit-01', 'Ġsweep-06', 'Ġbreach-01', 'Ġbehead-01', 'Ġsimulate-01', 'Ġastonish-01',
'Ġdeviate-01', 'Ġsmear-02', 'Ġgive-away-02', 'Ġdifferentiate-01', 'Ġintersect-01', 'Ġrectify-01',
'Ġlose-out-06', 'Ġtelephone-01', 'Ġrevolutionary-04', 'Ġblow-14', 'Ġexaggerate-01', 'Ġsoar-01',
'Ġcontent-01', 'Ġpreside-01', 'Ġcheck-07', 'Ġrefrain-01', 'Ġcrack-02', 'Ġdisintegrate-01',
'Ġexterminate-01', 'Ġridicule-01', 'Ġobey-01', 'Ġbundle-01', 'Ġcompound-01', 'Ġwine-01',
'Ġdine-01', 'Ġresent-01', 'Ġjeopardize-01', 'Ġusher-in-01', 'Ġcrowd-01', 'Ġelevate-01',
'Ġtear-down-05', 'Ġresolve-02', 'Ġearnest-01', 'Ġirritate-01', 'Ġgreen-02', 'Ġheed-01',
'Ġplay-10', 'Ġspread-out-04', 'Ġcruise-01', 'Ġcater-01', 'Ġstay-on-02', 'Ġstick-around-03',
'Ġcall-13', 'Ġbicker-01', 'Ġcurse-02', 'Ġopen-07', 'Ġrun-up-19', 'Ġtrump-01', 'Ġhappy-02',
'Ġredeem-01', 'Ġstrike-04', 'Ġbring-on-06', 'Ġenlighten-01', 'Ġgray-02', 'Ġnote-02', 'Ġshred-01',
'Ġgas-03', 'Ġlevy-01', 'Ġturn-18', 'Ġlevel-04', 'Ġbow-01', 'Ġturn-14', 'Ġrehabilitate-01',
'Ġcouple-01', 'Ġdent-01', 'Ġcautious-02', 'Ġbust-02', 'Ġshut-01', 'Ġflip-01', 'Ġvalidate-01',
'Ġkill-03', 'Ġhot-04', 'Ġchat-01', 'Ġcurious-02', 'Ġlump-01', 'Ġexacerbate-01', 'Ġsneaky-03',
'Ġconviction-02', 'Ġproceeding-02', 'Ġreorganize-01', 'Ġfit-05', 'Ġsee-05', 'Ġacquaint-01',
'Ġvile-02', 'Ġzap-01', 'Ġuniform-01', 'Ġreplicate-01', 'Ġintent-02', 'Ġgrip-01', 'Ġswear-02',
'Ġdecry-01', 'Ġsegregate-01', 'Ġspur-01', 'Ġstorm-02', 'Ġcap-02', 'Ġslant-01', 'Ġspan-01',
'Ġcut-back-05', 'Ġfledge-01', 'Ġfoster-01', 'Ġgripe-01', 'Ġquest-01', 'Ġpunch-01',
'Ġderegulate-01', 'Ġloathe-01', 'Ġimitate-01', 'Ġhang-out-06', 'Ġbaffle-01', 'Ġsuck-up-04',
'Ġtempt-01', 'Ġcondone-01', 'Ġassemble-02', 'Ġoust-01', 'Ġvent-01', 'Ġspout-01', 'Ġsound-02',
'Ġevade-01', 'Ġendure-01', 'Ġinvoke-01', 'Ġdevalue-01', 'Ġpose-01', 'Ġbear-06', 'Ġhypothesize-01',
'Ġspot-01', 'Ġdiscount-02', 'Ġrail-01', 'Ġhaul-01', 'Ġgauge-01', 'Ġcopyright-01', 'Ġgive-in-09',
'Ġimpede-01', 'Ġblast-01', 'Ġtrue-02', 'Ġbeware-01', 'Ġrestore-02', 'Ġnegative-05', 'Ġsteady-01',
'Ġfluctuate-01', 'Ġdate-01', 'Ġbathe-01', 'Ġgo-22', 'Ġrestructure-01', 'Ġpile-01', 'Ġspin-01',
'Ġtake-down-22', 'Ġbake-01', 'Ġtriple-01', 'Ġdowngrade-02', 'Ġordain-01', 'Ġmultiply-01',
'Ġskip-01', 'Ġincorporate-02', 'Ġsettle-01', 'Ġpass-on-14', 'Ġcreepy-04', 'Ġstuff-01',
'Ġline-up-02', 'Ġimmune-02', 'Ġlust-01', 'Ġnotable-04', 'Ġbuy-into-04', 'Ġimpair-01',
'Ġfigure-04', 'Ġpiss-01', 'Ġgive-back-03', 'Ġboast-01', 'Ġlay-off-06', 'Ġdive-01', 'Ġcommute-02',
'Ġracket-02', 'Ġdip-01', 'Ġrotate-02', 'Ġdemagogue-01', 'Ġchange-02', 'Ġbarter-01', 'Ġalike-05',
'Ġbind-03', 'Ġwhip-up-03', 'Ġmanifest-01', 'Ġcheck-03', 'Ġyield-01', 'Ġslay-01', 'Ġtally-01',
'Ġget-through-13', 'Ġbreak-through-26', 'Ġrelinquish-01', 'Ġreopen-01', 'Ġdefame-01',
'Ġinterrupt-01', 'Ġcast-03', 'Ġpattern-01', 'Ġdose-01', 'Ġreenter-01', 'Ġmotivate-02',
'Ġstandardize-01', 'Ġdate-entity', 'Ġgovernment-organization', 'Ġtemporal-quantity',
'Ġamr-unknown', 'Ġmulti-sentence', 'Ġpolitical-party', 'Ġ:compared-to', 'Ġmonetary-quantity',
'Ġordinal-entity', 'Ġreligious-group', 'Ġpercentage-entity', 'Ġworld-region', 'Ġ:consist',
'Ġurl-entity', 'Ġpolitical-movement', 'Ġet-cetera', 'Ġat-least', 'Ġmass-quantity',
'Ġhave-org-role-91', 'Ġhave-rel-role-91', 'Ġinclude-91', 'Ġhave-concession-91',
'Ġhave-condition-91', 'Ġbe-located-at-91', 'Ġrate-entity-91', 'Ġinstead-of-91', 'Ġhyperlink-91',
'Ġrequest-confirmation-91', 'Ġhave-purpose-91', 'Ġbe-temporally-at-91', 'Ġregardless-91',
'Ġhave-polarity-91', 'Ġbyline-91', 'Ġhave-manner-91', 'Ġhave-part-91', 'Ġhave-quant-91',
'Ġpublication-91', 'Ġbe-from-91', 'Ġhave-mod-91', 'Ġhave-frequency-91', 'Ġscore-on-scale-91',
'Ġhave-li-91', 'Ġbe-compared-to-91', 'Ġbe-destined-for-91', 'Ġcourse-91', 'Ġhave-subevent-91',
'Ġstreet-address-91', 'Ġhave-extent-91', 'Ġstatistical-test-91', 'Ġhave-instrument-91',
'Ġhave-name-91', 'Ġbe-polite-91', '-00', '-01', '-02', '-03', '-04', '-05', '-06', '-07', '-08',
'-09', '-10', '-11', '-12', '-13', '-14', '-15', '-16', '-17', '-18', '-19', '-20', '-21', '-22',
'-23', '-24', '-25', '-26', '-27', '-28', '-29', '-30', '-31', '-32', '-33', '-34', '-35', '-36',
'-37', '-38', '-39', '-40', '-41', '-42', '-43', '-44', '-45', '-46', '-47', '-48', '-49', '-50',
'-51', '-52', '-53', '-54', '-55', '-56', '-57', '-58', '-59', '-60', '-61', '-62', '-63', '-64',
'-65', '-66', '-67', '-68', '-69', '-70', '-71', '-72', '-73', '-74', '-75', '-76', '-77', '-78',
'-79', '-80', '-81', '-82', '-83', '-84', '-85', '-86', '-87', '-88', '-89', '-90', '-91', '-92',
'-93', '-94', '-95', '-96', '-97', '-98', '-of', 'Ġ:op1', 'Ġ:op2', 'Ġ:op3', 'Ġ:op4', 'Ġ:op5',
'Ġ:ARG0', 'Ġ:ARG1', 'Ġ:ARG2', 'Ġ:ARG3', 'Ġ:ARG4', 'Ġ:ARG5', 'Ġ:ARG6', 'Ġ:ARG7', 'Ġ:ARG8',
'Ġ:ARG9', 'Ġ:ARG10', 'Ġ:ARG11', 'Ġ:ARG12', 'Ġ:ARG13', 'Ġ:ARG14', 'Ġ:ARG15', 'Ġ:ARG16', 'Ġ:ARG17',
'Ġ:ARG18', 'Ġ:ARG19', 'Ġ:ARG20', 'Ġ:accompanier', 'Ġ:age', 'Ġ:beneficiary', 'Ġ:calendar',
'Ġ:cause', 'Ġ:century', 'Ġ:concession', 'Ġ:condition', 'Ġ:conj-as-if', 'Ġ:consist-of', 'Ġ:cost',
'Ġ:day', 'Ġ:dayperiod', 'Ġ:decade', 'Ġ:degree', 'Ġ:destination', 'Ġ:direction', 'Ġ:domain',
'Ġ:duration', 'Ġ:employed-by', 'Ġ:era', 'Ġ:example', 'Ġ:extent', 'Ġ:frequency', 'Ġ:instrument',
'Ġ:li', 'Ġ:location', 'Ġ:manner', 'Ġ:meaning', 'Ġ:medium', 'Ġ:mod', 'Ġ:mode', 'Ġ:month', 'Ġ:name',
'Ġ:ord', 'Ġ:part', 'Ġ:path', 'Ġ:polarity', 'Ġ:polite', 'Ġ:poss', 'Ġ:purpose', 'Ġ:quant',
'Ġ:quarter', 'Ġ:range', 'Ġ:relation', 'Ġ:role', 'Ġ:scale', 'Ġ:season', 'Ġ:source', 'Ġ:subevent',
'Ġ:subset', 'Ġ:superset', 'Ġ:time', 'Ġ:timezone', 'Ġ:topic', 'Ġ:unit', 'Ġ:value', 'Ġ:weekday',
'Ġ:wiki', 'Ġ:year', 'Ġ:year2', 'Ġ:snt0', 'Ġ:snt1', 'Ġ:snt2', 'Ġ:snt3', 'Ġ:snt4', 'Ġ:snt5',
'ĠCOUNTRY', 'ĠQUANTITY', 'ĠORGANIZATION', 'ĠDATE_ATTRS', 'ĠNATIONALITY', 'ĠLOCATION', 'ĠENTITY',
'ĠMISC', 'ĠORDINAL_ENTITY', 'ĠIDEOLOGY', 'ĠRELIGION', 'ĠSTATE_OR_PROVINCE', 'ĠCAUSE_OF_DEATH',
'ĠTITLE', 'ĠDATE', 'ĠNUMBER', 'ĠHANDLE', 'ĠSCORE_ENTITY', 'ĠDURATION', 'ĠORDINAL', 'ĠMONEY',
'ĠCRIMINAL_CHARGE', '_1', '_2', '_3', '_4', '_5', '_6', '_7', '_8', '_9', '_10', '_11',
'_12', '_13', '_14', '_15',
# The remaining ~160 entries were AMRBART's angle-bracketed special tokens
# ('Ġ<pointer:0>', 'Ġ<pointer:1>', ..., 'Ġ<lit>', 'Ġ</lit>', 'Ġ<stop>',
# '<AMR>', '</AMR>'); their <...> spans were stripped when this extract was
# produced, leaving indistinguishable bare 'Ġ' entries, so they are elided here.
]
special_tokens = [itm.lstrip("Ġ") for itm in raw_special_tokens]
recategorizations = [
"\u0120COUNTRY",
"\u0120QUANTITY",
"\u0120ORGANIZATION",
"\u0120DATE_ATTRS",
"\u0120NATIONALITY",
"\u0120LOCATION",
"\u0120ENTITY",
"\u0120MISC",
"\u0120ORDINAL_ENTITY",
"\u0120IDEOLOGY",
"\u0120RELIGION",
"\u0120STATE_OR_PROVINCE",
"\u0120CAUSE_OF_DEATH",
"\u0120TITLE",
"\u0120DATE",
"\u0120NUMBER",
"\u0120HANDLE",
"\u0120SCORE_ENTITY",
"\u0120DURATION",
"\u0120ORDINAL",
"\u0120MONEY",
"\u0120CRIMINAL_CHARGE",
]
# special_tokens = ["<AMR>", "</AMR>"]
arg_to_scheduler = {
"linear": get_linear_schedule_with_warmup,
"cosine": get_cosine_schedule_with_warmup,
"cosine_w_restarts": get_cosine_with_hard_restarts_schedule_with_warmup,
"polynomial": get_polynomial_decay_schedule_with_warmup,
"constant": get_constant_schedule_with_warmup,
}
arg_to_scheduler_choices = sorted(arg_to_scheduler.keys())
arg_to_scheduler_metavar = "{" + ", ".join(arg_to_scheduler_choices) + "}"
ROUGE_KEYS = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
arg_to_tokenizer = {
"AutoTokenizer": AutoTokenizer,
"BartTokenizer": BartTokenizer,
"T5Tokenizer": T5Tokenizer,
}
arg_to_plm_model = {
"AutoModelForSeq2SeqLM": AutoModelForSeq2SeqLM,
"BartForConditionalGeneration": BartForConditionalGeneration,
"T5Model": T5Model,
"T5ForConditionalGeneration": T5ForConditionalGeneration,
}
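# Usage sketch (editorial illustration, not part of the upstream module):
# every scheduler factory above shares the Hugging Face signature
# (optimizer, num_warmup_steps, num_training_steps), except "constant", which
# takes no num_training_steps. `opt`, `warmup_steps` and `total_steps` are
# hypothetical names.
#
#     >>> scheduler_fn = arg_to_scheduler["linear"]
#     >>> scheduler = scheduler_fn(opt, num_warmup_steps=warmup_steps,
#     ...                          num_training_steps=total_steps)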
================================================
FILE: hanlp/components/amr/amrbart/common/penman_interface.py
================================================
# coding:utf-8
# MIT License
#
# Copyright (c) 2022 xfbai
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
from penman import load as load_, Graph, Triple
from penman import loads as loads_
from penman import encode as encode_
from penman.model import Model
from penman.models.noop import NoOpModel
from penman.models import amr
op_model = Model()
noop_model = NoOpModel()
amr_model = amr.model
DEFAULT = op_model
def _get_model(dereify):
if dereify is None:
return DEFAULT
elif dereify:
return op_model
else:
return noop_model
def _remove_wiki(graph):
metadata = graph.metadata
triples = []
for t in graph.triples:
v1, rel, v2 = t
if rel == ":wiki":
t = Triple(v1, rel, "+")
triples.append(t)
graph = Graph(triples)
graph.metadata = metadata
return graph
def load(source, dereify=None, remove_wiki=False):
model = _get_model(dereify)
out = load_(source=source, model=model)
if remove_wiki:
for i in range(len(out)):
out[i] = _remove_wiki(out[i])
return out
def loads(string, dereify=None, remove_wiki=False):
model = _get_model(dereify)
out = loads_(string=string, model=model)
if remove_wiki:
for i in range(len(out)):
out[i] = _remove_wiki(out[i])
return out
def encode(g, top=None, indent=-1, compact=False):
model = amr_model
return encode_(g=g, top=top, indent=indent, compact=compact, model=model)
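# Round-trip sketch (illustrative, not part of the upstream file): parse a
# linearised AMR, optionally dropping :wiki triples, then re-encode it.
#
#     >>> g = loads("(b / bark-01 :ARG0 (d / dog))", remove_wiki=True)[0]
#     >>> sorted(g.variables())
#     ['b', 'd']
#     >>> encode(g).startswith("(b / bark-01")
#     True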
================================================
FILE: hanlp/components/amr/amrbart/common/postprocessing.py
================================================
# coding:utf-8
# MIT License
#
# Copyright (c) 2022 xfbai
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import re
import enum
import penman
import networkx as nx
from hanlp.components.amr.amrbart.common.penman_interface import encode
from collections import defaultdict, Counter
BACKOFF = penman.Graph(
[
penman.Triple("d2", ":instance", "dog"),
penman.Triple("b1", ":instance", "bark-01"),
penman.Triple("b1", ":ARG0", "d2"),
]
)
def token_processing(tok):
if tok is None:
return None
elif tok.isdigit():
try:
# digit-only strings become Python ints
return int(tok)
except ValueError:
return tok
elif tok.startswith('"') and (not tok.endswith('"')):
return tok + '"'
elif tok.endswith('"') and (not tok.startswith('"')):
return '"' + tok
else:
return tok
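# Behaviour sketch (illustrative doctest, not in the upstream file): digits
# become ints and unbalanced quotes are closed; everything else passes
# through unchanged.
#
#     >>> token_processing("42")
#     42
#     >>> token_processing('"Barack')
#     '"Barack"'
#     >>> token_processing('Obama"')
#     '"Obama"'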
def decode_into_node_and_backreferences(subtoken_ids, tokenizer):
rex_arg = re.compile(f"^{tokenizer.INIT}(op|snt|conj|prep)")
rex_spc = re.compile(r"<(s|/s|lit|/lit|stop|unk|pad|mask)>")
# subtoken_ids.insert(1,36) # add "(" id
# subtoken_ids.insert(-1, 4839) # add ")" id
# get strings
subtokens = [tokenizer.decoder.get(t) for t in subtoken_ids]
# print("subtokens:", subtokens)
# fix backreferences
subtoken_backreferences = [max(t - len(tokenizer.encoder), -1) for t in subtoken_ids]
# strip padding
subtokens, subtoken_backreferences = zip(
*[
(s, b)
for s, b in zip(subtokens, subtoken_backreferences)
if s != ("")
]
)
# subword collapse
tokens = []
backreferences = []
subword_to_token_map = {}
current_token_i = 0
for subw_i, (subw_backr, subtok) in enumerate(zip(subtoken_backreferences, subtokens)):
subword_to_token_map[subw_i] = current_token_i
# if empty you cannot do anything but add a new word
if not tokens:
tokens.append(subtok.lstrip(tokenizer.INIT))
backreferences.append(-1)
current_token_i += 1
# a backreference can't be split
elif subw_backr > -1:
tokens.append(None)
backreferences.append(subword_to_token_map[subw_backr])
current_token_i += 1
# after a special token release
elif isinstance(tokens[-1], str) and rex_spc.match(tokens[-1]):
tokens.append(subtok.lstrip(tokenizer.INIT))
backreferences.append(-1)
current_token_i += 1
# after a subtoken ':' (which should be followed by the rest of the edge) ignore tokenizer.INIT
# TODO: this is an ugly patch due to the fact that BART tokenizer splits after ':'
elif (tokens[-1] == ":") and rex_arg.match(subtok):
tokens[-1] = tokens[-1] + subtok[1:]
# leading tokenizer.INIT
elif subtok.startswith(tokenizer.INIT):
tokens.append(subtok.lstrip(tokenizer.INIT))
backreferences.append(-1)
current_token_i += 1
# very ugly patch for cases in which tokenizer.INIT does not start the token following the edge
elif (
isinstance(tokens[-1], str)
and tokens[-1].startswith(":")
and tokens[-1][-1].isdigit()
and (subtok != "-of")
):
tokens.append(subtok.lstrip(tokenizer.INIT))
backreferences.append(-1)
current_token_i += 1
# in any other case attach to the previous
else:
tokens[-1] = tokens[-1] + subtok
# strip INIT and fix byte-level
tokens = [
tokenizer.convert_tokens_to_string(list(t)).lstrip() if isinstance(t, str) else t
for t in tokens
]
# tokens = [t.replace(tokenizer.INIT, '') if isinstance(t, str) else t for t in tokens]
# unks are substituted with thing
tokens = [t if t != "" else "thing" for t in tokens]
old_tokens = tokens
old_backreferences = backreferences
# Barack Obama -> "Barack Obama"
tokens = []
backreferences = []
token_to_token_map = {}
start_search = 0
removed = 0
while True:
try:
lit_start = old_tokens.index("<lit>", start_search)
token_addition = old_tokens[start_search:lit_start]
for i, t in enumerate(token_addition, start=start_search):
token_to_token_map[i] = i - removed
tokens += token_addition
backreferences_addition = [
token_to_token_map[b] if b > -1 else -1
for b in old_backreferences[start_search:lit_start]
]
backreferences += backreferences_addition
lit_end = min(lit_start + 2, len(old_tokens) - 1)
while lit_end < len(old_tokens):
old_tok = old_tokens[lit_end]
if isinstance(old_tok, str) and (
(old_tok.startswith(":") and len(old_tok) > 3) or (old_tok == "<stop>")
):
res_tok = old_tokens[lit_start + 1 : lit_end]
for i in range(lit_start, lit_end):
token_to_token_map[i] = len(tokens)
# Remove possible wrong None
res = old_tokens[lit_start + 1 : lit_end]
res = [str(r) for r in res if r is not None]
res = '"' + "_".join(res) + '"'
removed += len(res_tok)
start_search = lit_end
tokens += [res, old_tok]
backreferences += [-1, -1]
break
elif old_tok == " ":
res_tok = old_tokens[lit_start + 1 : lit_end]
for i in range(lit_start, lit_end + 1):
token_to_token_map[i] = len(tokens)
# Remove possible wrong None
res = old_tokens[lit_start + 1 : lit_end]
res = [str(r) for r in res if r is not None]
res = '"' + "_".join(res) + '"'
removed += len(res_tok) + 1
start_search = lit_end + 1
tokens.append(res)
backreferences.append(-1)
break
else:
lit_end += 1
start_search = lit_end
except ValueError:
token_addition = old_tokens[start_search:]
for i, t in enumerate(token_addition, start=start_search):
token_to_token_map[i] = i - removed
backreferences_addition = [
token_to_token_map[b] if b > -1 else b for b in old_backreferences[start_search:]
]
tokens += token_addition
backreferences += backreferences_addition
break
tokens = [token_processing(t) for t in tokens]
shift = 1
if tokens[1] == "":
shift = 2
tokens = tokens[shift:]
backreferences = [b if b == -1 else b - shift for b in backreferences[shift:]]
if tokens[-1] == " ":
tokens.pop()
backreferences.pop()
return tokens, backreferences
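# Summary (editorial): this routine turns decoder output ids back into AMR
# tokens in four passes -- (1) ids -> BPE strings, (2) subword merge into
# whole tokens, (3) re-joining <lit> ... </lit> spans into one quoted
# literal, (4) stripping the leading <s> and trailing </s> markers -- and
# returns, alongside the tokens, a backreference index per token (-1 = none).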
def index_of(element, iterable, default=None, start=None, end=None):
if not callable(element):
def check(x):
return element == x
else:
check = element
if start is None:
start = 0
if end is None:
end = len(iterable)
item = start
while item < end:
if check(iterable[item]):
return item
item += 1
return default
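# Illustrative doctest (not in the upstream file): `element` may be a plain
# value or a predicate, and a miss returns `default` instead of raising.
#
#     >>> index_of(":ARG0", ["dog", ":ARG0", "cat"])
#     1
#     >>> index_of(lambda x: x.startswith(":"), ["dog", ":ARG0", "cat"])
#     1
#     >>> index_of("missing", ["dog"], default=-1)
#     -1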
def separate_edges_nodes(edges_nodes_slice, *other):
is_arg = lambda x: isinstance(x, str) and x.startswith(":")
start = 0
edges = []
nodes = []
l = len(edges_nodes_slice)
while start < l:
edge_index = index_of(is_arg, edges_nodes_slice, start=start)
if edge_index is None or edge_index == (l - 1):
break
if is_arg(edges_nodes_slice[edge_index + 1]):
start = edge_index + 1
continue
edges.append(edge_index)
nodes.append(edge_index + 1)
start = edge_index + 2
ret = []
for oth in other:
edges_oth = [oth[i] for i in edges]
nodes_oth = [oth[i] for i in nodes]
ret.append((edges_oth, nodes_oth))
return ret
def _split_name_ops(graph):
# identify name triples
name_vars = {}
for i, (v1, rel, v2) in enumerate(graph.triples):
if rel == ":instance" and v2 == "name":
name_vars[v1] = 1
# check if they have ops
name_vars_to_ops = defaultdict(list)
for i, (v1, rel, v2) in enumerate(graph.triples):
if v1 in name_vars and rel.startswith(":op"):
name_vars_to_ops[v1].append((i, rel, v2.strip('"')))
triples = graph.triples.copy()
for nv, ops in name_vars_to_ops.items():
ops = sorted(ops, key=lambda x: int(x[1][3:]))
idx, _, lits = zip(*ops)
for i in idx:
triples[i] = None
lits = ['"' + l + '"' for lit in lits for l in lit.split("_")]
tt = []
for i, l in enumerate(lits, start=1):
rel = ":op" + str(i)
tt.append(penman.Triple(nv, rel, l))
triples[min(idx)] = tt
triples = [t if isinstance(t, list) else [t] for t in triples if t is not None]
triples = [t for tt in triples for t in tt]
graph_ = penman.Graph(triples)
graph_.metadata = graph.metadata
return graph_
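# Illustrative effect (a sketch): for a name node whose literal was collapsed
# during decoding, e.g. (n / name :op1 "Barack_Obama"), this rewrites the
# single :op1 into per-word ops: (n / name :op1 "Barack" :op2 "Obama").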
def _reconstruct_graph_from_nodes(nodes, backreferences):
triples = []
triples_added = set()
variable2index = {}
index2variable = {}
start_index = 0
cnt = defaultdict(Counter)
while start_index < len(nodes):
stop_index = index_of("<stop>", nodes, default=len(nodes) + 1, start=start_index)
old_start_index = start_index
start_index = stop_index + 1
src_node, src_backr = nodes[old_start_index], backreferences[old_start_index]
if src_node == "":
continue
trg_nodes_edges = nodes[old_start_index:stop_index]
trg_nodes_edges_backr = backreferences[old_start_index:stop_index]
trg_nodes_edges_indices = list(range(old_start_index, stop_index))
if isinstance(src_node, str):
if src_node in ("<s>", "</s>", "<stop>"):
continue
elif ("/" in src_node) or (":" in src_node) or ("(" in src_node) or (")" in src_node):
src_node = "thing"
if src_node is not None:
src_node = str(src_node)
src_var = src_node[0].lower()
if src_var not in "abcdefghijklmnopqrstuvwxyz":
src_var = "x"
# src_var = f'{src_var}_{len(variable2index)}'
src_var = f"{src_var}{len(variable2index)}"
src_var_i = old_start_index
variable2index[src_var] = src_var_i
index2variable[src_var_i] = src_var
triple = penman.Triple(src_var, ":instance", src_node)
if triple not in triples_added:
triples.append(triple)
triples_added.add(triple)
else:
if src_backr in index2variable:
src_var = index2variable[src_backr]
# more resilient logic here
(trg_edges, trg_nodes), (_, trg_nodes_backr), (_, trg_nodes_indices) = separate_edges_nodes(
trg_nodes_edges, trg_nodes_edges, trg_nodes_edges_backr, trg_nodes_edges_indices
)
for n, e, nb, ni in zip(trg_nodes, trg_edges, trg_nodes_backr, trg_nodes_indices):
if isinstance(n, str) and n.startswith(":"):
continue
if isinstance(n, str) and n.startswith("<") and n.endswith(">"):
continue
if e == ":li":
pass
elif len(e) < 4 or (not e.startswith(":")):
continue
# same edge more than once
num = cnt[src_var][e]
# num = 0
if num:
if e.startswith(":op") or e.startswith(":snt"):
continue
# elif e.startswith(':ARG'):
# continue
elif num > 3:
continue
if n is None:
if nb not in index2variable:
continue
trg_var = index2variable[nb]
trg = trg_var
elif e == ":mode":
trg = n
elif (
(not isinstance(n, str))
or re.match(r"^[+-]?\d+\.?\d*$", n)
or (n == "-")
or (n == "+")
):
trg = str(n)
elif n.startswith('"') and n.endswith('"') and len(n) > 2:
trg = '"' + n.replace('"', "") + '"'
elif ("/" in n) or (":" in n) or ("(" in n) or (")" in n) or ("=" in n):
trg = f'"{n}"'
elif n == '"':
continue
elif (
(n.startswith('"') and (not n.endswith('"')))
or (not n.startswith('"') and (n.endswith('"')))
or ('"' in n)
):
trg = '"' + n.replace('"', "") + '"'
else:
trg_var = n[0].lower()
if trg_var not in "abcdefghijklmnopqrstuvwxyz":
trg_var = "x"
# trg_var = f'{trg_var}_{len(variable2index)}'
trg_var = f"{trg_var}{len(variable2index)}"
trg_var_i = ni
variable2index[trg_var] = trg_var_i
index2variable[trg_var_i] = trg_var
triple = penman.Triple(trg_var, ":instance", n)
if triple not in triples_added:
triples.append(triple)
triples_added.add(triple)
trg = trg_var
triple = penman.Triple(src_var, e, trg)
if triple not in triples_added:
triples.append(triple)
triples_added.add(triple)
cnt[src_var][e] += 1
return penman.Graph(triples)
def build_graph(nodes, backreferences, restore_name_ops=False):
graph = _reconstruct_graph_from_nodes(nodes, backreferences)
if restore_name_ops:
graph = _split_name_ops(graph)
return graph
class ParsedStatus(enum.Enum):
OK = 0
FIXED = 1
BACKOFF = 2
def connect_graph_if_not_connected(graph):
try:
encoded = encode(graph)
return graph, ParsedStatus.OK
except Exception:
pass
nxgraph = nx.MultiGraph()
variables = graph.variables()
for v1, _, v2 in graph.triples:
if v1 in variables and v2 in variables:
nxgraph.add_edge(v1, v2)
elif v1 in variables:
nxgraph.add_edge(v1, v1)
triples = graph.triples.copy()
new_triples = []
addition = f"a{len(variables) + 1}"
triples.append(penman.Triple(addition, ":instance", "and"))
for i, conn_set in enumerate(nx.connected_components(nxgraph), start=1):
edge = f":op{i}"
conn_set = sorted(conn_set, key=lambda x: int(x[1:]))
conn_set = [c for c in conn_set if c in variables]
node = conn_set[0]
new_triples.append(penman.Triple(addition, edge, node))
triples = new_triples + triples
metadata = graph.metadata
graph = penman.Graph(triples)
graph.metadata.update(metadata)
encode(graph)
return graph, ParsedStatus.FIXED
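# Repair sketch (illustrative doctest, assuming penman raises on disconnected
# graphs): fragments with no shared variable cannot be linearised, so they
# are attached as :op1, :op2, ... under a fresh "and" node.
#
#     >>> g = penman.Graph([penman.Triple("d0", ":instance", "dog"),
#     ...                   penman.Triple("c1", ":instance", "cat")])
#     >>> fixed, status = connect_graph_if_not_connected(g)
#     >>> status
#     <ParsedStatus.FIXED: 1>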
def restore_backreferences_from_pointers(nodes):
new_nodes, new_backreferences = [], []
prev_pointer = None
pointer2i = {}
for n in nodes:
is_pointer = isinstance(n, str) and n.startswith("<pointer:")
if not is_pointer:
if prev_pointer is not None:
if prev_pointer in pointer2i:
new_nodes.append(None)
new_backreferences.append(pointer2i[prev_pointer])
new_nodes.append(n)
new_backreferences.append(-1)
else:
pointer2i[prev_pointer] = len(new_nodes)
new_nodes.append(n)
new_backreferences.append(-1)
else:
new_nodes.append(n)
new_backreferences.append(-1)
prev_pointer = None
else:
prev_pointer = n
return new_nodes, new_backreferences
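# Pointer sketch (illustrative doctest, not in the upstream file): the first
# mention of a <pointer:k> token introduces the node after it; a repeated
# <pointer:k> becomes a None node whose backreference points at that position.
#
#     >>> restore_backreferences_from_pointers(
#     ...     ["<pointer:0>", "dog", ":ARG0-of", "<pointer:0>", "bark-01"])
#     (['dog', ':ARG0-of', None, 'bark-01'], [-1, -1, 0, -1])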
================================================
FILE: hanlp/components/amr/amrbart/data_interface/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2022-12-07 14:36
================================================
FILE: hanlp/components/amr/amrbart/data_interface/dataset.py
================================================
# coding:utf-8
# MIT License
#
# Copyright (c) 2022 xfbai
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
class AMRParsingDataSet(object):
@staticmethod
def tokenize(sample: dict, tokenizer, max_src_length=400, max_tgt_length=1024, unified_input=True, amr="src",
text="tgt"):
amr = sample.get(amr, None) # AMR tokens
txt = sample[text] # Text tokens
if amr is not None:
sample['labels'] = tokenizer.tokenize_amr(amr.split())[:max_src_length - 2] + [tokenizer.amr_eos_token_id]
raw_txt_ids = tokenizer(
txt, max_length=max_tgt_length, padding=False, truncation=True
)["input_ids"]
if unified_input:
txt_ids = raw_txt_ids[:max_tgt_length - 3] + [tokenizer.amr_bos_token_id, tokenizer.mask_token_id,
tokenizer.amr_eos_token_id]
else:
txt_ids = raw_txt_ids
sample['input_ids'] = txt_ids
return sample
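# Field sketch (illustrative; follows the defaults above): a sample like
# {"src": "( bark-01 :ARG0 ( dog ) )", "tgt": "The dog barked ."} gains
# "labels" (AMR ids truncated to max_src_length - 2, ending in
# tokenizer.amr_eos_token_id) and "input_ids" (text ids, suffixed with the
# <AMR> <mask> </AMR> ids when unified_input=True).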
class AMR2TextDataSet(object):
@staticmethod
def tokenize(sample: dict, tokenizer, max_src_length=400, max_tgt_length=1024, unified_input=True, amr="src",
text="tgt"):
src = sample[amr] # AMR tokens
tgt = sample.get(text, None) # Text tokens
if not unified_input:
src_ids = [tokenizer.amr_bos_token_id] + tokenizer.tokenize_amr(src.split())[
:max_src_length - 2] + [tokenizer.amr_eos_token_id]
else:
# <s> <mask> </s> <AMR> xxx </AMR>
src_ids = [tokenizer.bos_token_id, tokenizer.mask_token_id, tokenizer.eos_token_id] + [
tokenizer.amr_bos_token_id] + tokenizer.tokenize_amr(src.split())[:max_src_length - 5] + [
tokenizer.amr_eos_token_id]
sample["input_ids"] = src_ids
if tgt is not None:
with tokenizer.as_target_tokenizer():
tgt_ids = tokenizer(
tgt, max_length=max_tgt_length, padding=False, truncation=True
)
tgt_ids["input_ids"] = [
label[1:] for label in tgt_ids["input_ids"]
]
sample["labels"] = tgt_ids["input_ids"]
return sample
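# Mirror of the parsing direction (illustrative): the linearised AMR becomes
# "input_ids" (prefixed with the <s> <mask> </s> ids when unified_input=True)
# and the text, when present, becomes "labels" with each sequence's leading
# token dropped.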
================================================
FILE: hanlp/components/amr/amrbart/model_interface/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2022-12-03 20:33
================================================
FILE: hanlp/components/amr/amrbart/model_interface/modeling_bart.py
================================================
# coding=utf-8
# Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch BART model."""
import copy
import math
import random
import warnings
from typing import List, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from transformers.activations import ACT2FN
from transformers.modeling_outputs import (
BaseModelOutput,
BaseModelOutputWithPastAndCrossAttentions,
CausalLMOutputWithCrossAttentions,
Seq2SeqLMOutput,
Seq2SeqModelOutput,
Seq2SeqQuestionAnsweringModelOutput,
Seq2SeqSequenceClassifierOutput,
)
from transformers.modeling_utils import PreTrainedModel
from transformers.utils import (
add_code_sample_docstrings,
add_end_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from transformers.models.bart.configuration_bart import BartConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "facebook/bart-base"
_CONFIG_FOR_DOC = "BartConfig"
_TOKENIZER_FOR_DOC = "BartTokenizer"
# Base model docstring
_EXPECTED_OUTPUT_SHAPE = [1, 8, 768]
# SequenceClassification docstring
_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION = "valhalla/bart-large-sst2"
_SEQ_CLASS_EXPECTED_LOSS = 0.0
_SEQ_CLASS_EXPECTED_OUTPUT = "'POSITIVE'"
# QuestionAnswering docstring
_CHECKPOINT_FOR_QA = "valhalla/bart-large-finetuned-squadv1"
_QA_EXPECTED_LOSS = 0.59
_QA_EXPECTED_OUTPUT = "' nice puppet'"
BART_PRETRAINED_MODEL_ARCHIVE_LIST = [
"facebook/bart-large",
# see all BART models at https://huggingface.co/models?filter=bart
]
def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
"""
Shift input ids one token to the right.
"""
shifted_input_ids = input_ids.new_zeros(input_ids.shape)
shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
shifted_input_ids[:, 0] = decoder_start_token_id
if pad_token_id is None:
raise ValueError("self.model.config.pad_token_id has to be defined.")
# replace possible -100 values in labels by `pad_token_id`
shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
return shifted_input_ids
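# Illustrative doctest (not in the upstream file): the whole batch is shifted
# right and the decoder start token fills position 0.
#
#     >>> shift_tokens_right(torch.tensor([[5, 6, 7]]), pad_token_id=1, decoder_start_token_id=2)
#     tensor([[2, 5, 6]])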
def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, past_key_values_length: int = 0):
"""
Make causal mask used for bi-directional self-attention.
"""
bsz, tgt_len = input_ids_shape
mask = torch.full((tgt_len, tgt_len), torch.tensor(torch.finfo(dtype).min))
mask_cond = torch.arange(mask.size(-1))
mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
mask = mask.to(dtype)
if past_key_values_length > 0:
mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype), mask], dim=-1)
return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
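# Shape sketch (illustrative): the mask broadcasts as
# [bsz, 1, tgt_len, tgt_len + past_key_values_length]; future positions hold
# the dtype minimum (an additive -inf), past/current positions hold 0, so
# position i can only attend to positions <= i.
#
#     >>> _make_causal_mask(torch.Size([1, 3]), torch.float32).shape
#     torch.Size([1, 1, 3, 3])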
def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
"""
Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
"""
bsz, src_len = mask.size()
tgt_len = tgt_len if tgt_len is not None else src_len
expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
inverted_mask = 1.0 - expanded_mask
return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
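# Value sketch (illustrative): kept positions (mask == 1) become 0.0 and
# padded positions become the dtype minimum, ready to be added to raw
# attention scores.
#
#     >>> m = _expand_mask(torch.tensor([[1, 1, 0]]), torch.float32)
#     >>> m.shape
#     torch.Size([1, 1, 3, 3])
#     >>> m[0, 0, 0, 0].item() == 0.0
#     True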
class BartLearnedPositionalEmbedding(nn.Embedding):
"""
This module learns positional embeddings up to a fixed maximum size.
"""
def __init__(self, num_embeddings: int, embedding_dim: int):
# Bart is set up so that if padding_idx is specified then offset the embedding ids by 2
# and adjust num_embeddings appropriately. Other models don't have this hack
self.offset = 2
super().__init__(num_embeddings + self.offset, embedding_dim)
def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0):
"""`input_ids_shape` is expected to be [bsz x seqlen]."""
bsz, seq_len = input_ids_shape[:2]
positions = torch.arange(
past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device
)
return super().forward(positions + self.offset)
class BartAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(
self,
embed_dim: int,
num_heads: int,
dropout: float = 0.0,
is_decoder: bool = False,
bias: bool = True,
):
super().__init__()
self.embed_dim = embed_dim
self.num_heads = num_heads
self.dropout = dropout
self.head_dim = embed_dim // num_heads
if (self.head_dim * num_heads) != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {num_heads})."
)
self.scaling = self.head_dim**-0.5
self.is_decoder = is_decoder
self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def forward(
self,
hidden_states: torch.Tensor,
key_value_states: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
"""Input shape: Batch x Time x Channel"""
# if key_value_states are provided this layer is used as a cross-attention layer
# for the decoder
is_cross_attention = key_value_states is not None
bsz, tgt_len, _ = hidden_states.size()
# get query proj
query_states = self.q_proj(hidden_states) * self.scaling
# get key, value proj
if is_cross_attention and past_key_value is not None:
# reuse k,v, cross_attentions
key_states = past_key_value[0]
value_states = past_key_value[1]
elif is_cross_attention:
# cross_attentions
key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
elif past_key_value is not None:
# reuse k, v, self_attention
key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
key_states = torch.cat([past_key_value[0], key_states], dim=2)
value_states = torch.cat([past_key_value[1], value_states], dim=2)
else:
# self_attention
key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
if self.is_decoder:
# if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
# Further calls to cross_attention layer can then reuse all cross-attention
# key/value_states (first "if" case)
# if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
# all previous decoder key/value_states. Further calls to uni-directional self-attention
# can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
# if encoder bi-directional self-attention `past_key_value` is always `None`
past_key_value = (key_states, value_states)
proj_shape = (bsz * self.num_heads, -1, self.head_dim)
query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
key_states = key_states.view(*proj_shape)
value_states = value_states.view(*proj_shape)
src_len = key_states.size(1)
attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
raise ValueError(
f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
f" {attn_weights.size()}"
)
if attention_mask is not None:
if attention_mask.size() != (bsz, 1, tgt_len, src_len):
raise ValueError(
f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
)
attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
attn_weights = nn.functional.softmax(attn_weights, dim=-1)
if layer_head_mask is not None:
if layer_head_mask.size() != (self.num_heads,):
raise ValueError(
f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
f" {layer_head_mask.size()}"
)
attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
if output_attentions:
# this operation is a bit awkward, but it's required to
# make sure that attn_weights keeps its gradient.
# In order to do so, attn_weights have to be reshaped
# twice and have to be reused in the following
attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
else:
attn_weights_reshaped = None
attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
attn_output = torch.bmm(attn_probs, value_states)
if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
raise ValueError(
f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
f" {attn_output.size()}"
)
attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
attn_output = attn_output.transpose(1, 2)
# Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
# partitioned across GPUs when using tensor-parallelism.
attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
attn_output = self.out_proj(attn_output)
return attn_output, attn_weights_reshaped, past_key_value
class BartEncoderLayer(nn.Module):
def __init__(self, config: BartConfig):
super().__init__()
self.embed_dim = config.d_model
self.self_attn = BartAttention(
embed_dim=self.embed_dim,
num_heads=config.encoder_attention_heads,
dropout=config.attention_dropout,
)
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.dropout = config.dropout
self.activation_fn = ACT2FN[config.activation_function]
self.activation_dropout = config.activation_dropout
self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
self.final_layer_norm = nn.LayerNorm(self.embed_dim)
def forward(
self,
hidden_states: torch.FloatTensor,
attention_mask: torch.FloatTensor,
layer_head_mask: torch.FloatTensor,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]:
"""
Args:
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
attention_mask (`torch.FloatTensor`): attention mask of size
`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
`(encoder_attention_heads,)`.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
"""
residual = hidden_states
hidden_states, attn_weights, _ = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
layer_head_mask=layer_head_mask,
output_attentions=output_attentions,
)
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
hidden_states = residual + hidden_states
hidden_states = self.self_attn_layer_norm(hidden_states)
residual = hidden_states
hidden_states = self.activation_fn(self.fc1(hidden_states))
hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
hidden_states = self.fc2(hidden_states)
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
hidden_states = residual + hidden_states
hidden_states = self.final_layer_norm(hidden_states)
if hidden_states.dtype == torch.float16 and (
torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()
):
clamp_value = torch.finfo(hidden_states.dtype).max - 1000
hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
outputs = (hidden_states,)
if output_attentions:
outputs += (attn_weights,)
return outputs
class BartDecoderLayer(nn.Module):
def __init__(self, config: BartConfig):
super().__init__()
self.embed_dim = config.d_model
self.self_attn = BartAttention(
embed_dim=self.embed_dim,
num_heads=config.decoder_attention_heads,
dropout=config.attention_dropout,
is_decoder=True,
)
self.dropout = config.dropout
self.activation_fn = ACT2FN[config.activation_function]
self.activation_dropout = config.activation_dropout
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.encoder_attn = BartAttention(
self.embed_dim,
config.decoder_attention_heads,
dropout=config.attention_dropout,
is_decoder=True,
)
self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
self.final_layer_norm = nn.LayerNorm(self.embed_dim)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: Optional[bool] = False,
use_cache: Optional[bool] = True,
) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
"""
Args:
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
attention_mask (`torch.FloatTensor`): attention mask of size
`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
encoder_hidden_states (`torch.FloatTensor`):
cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
`(decoder_attention_heads,)`.
cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of
size `(decoder_attention_heads,)`.
past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
"""
residual = hidden_states
# Self Attention
# decoder uni-directional self-attention cached key/values tuple is at positions 1,2
self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
# add present self-attn cache to positions 1,2 of present_key_value tuple
hidden_states, self_attn_weights, present_key_value = self.self_attn(
hidden_states=hidden_states,
past_key_value=self_attn_past_key_value,
attention_mask=attention_mask,
layer_head_mask=layer_head_mask,
output_attentions=output_attentions,
)
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
hidden_states = residual + hidden_states
hidden_states = self.self_attn_layer_norm(hidden_states)
# Cross-Attention Block
cross_attn_present_key_value = None
cross_attn_weights = None
if encoder_hidden_states is not None:
residual = hidden_states
# cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple
cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn(
hidden_states=hidden_states,
key_value_states=encoder_hidden_states,
attention_mask=encoder_attention_mask,
layer_head_mask=cross_attn_layer_head_mask,
past_key_value=cross_attn_past_key_value,
output_attentions=output_attentions,
)
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
hidden_states = residual + hidden_states
hidden_states = self.encoder_attn_layer_norm(hidden_states)
# add cross-attn to positions 3,4 of present_key_value tuple
present_key_value = present_key_value + cross_attn_present_key_value
# Fully Connected
residual = hidden_states
hidden_states = self.activation_fn(self.fc1(hidden_states))
hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
hidden_states = self.fc2(hidden_states)
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
hidden_states = residual + hidden_states
hidden_states = self.final_layer_norm(hidden_states)
outputs = (hidden_states,)
if output_attentions:
outputs += (self_attn_weights, cross_attn_weights)
if use_cache:
outputs += (present_key_value,)
return outputs
class BartClassificationHead(nn.Module):
"""Head for sentence-level classification tasks."""
def __init__(
self,
input_dim: int,
inner_dim: int,
num_classes: int,
pooler_dropout: float,
):
super().__init__()
self.dense = nn.Linear(input_dim, inner_dim)
self.dropout = nn.Dropout(p=pooler_dropout)
self.out_proj = nn.Linear(inner_dim, num_classes)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dropout(hidden_states)
hidden_states = self.dense(hidden_states)
hidden_states = torch.tanh(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.out_proj(hidden_states)
return hidden_states
class BartPretrainedModel(PreTrainedModel):
config_class = BartConfig
base_model_prefix = "model"
supports_gradient_checkpointing = True
_keys_to_ignore_on_load_unexpected = [r"encoder.version", r"decoder.version"]
def _init_weights(self, module):
std = self.config.init_std
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=std)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=std)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
def _set_gradient_checkpointing(self, module, value=False):
if isinstance(module, (BartDecoder, BartEncoder)):
module.gradient_checkpointing = value
@property
def dummy_inputs(self):
pad_token = self.config.pad_token_id
input_ids = torch.tensor([[0, 6, 10, 4, 2], [0, 8, 12, 2, pad_token]], device=self.device)
dummy_inputs = {
"attention_mask": input_ids.ne(pad_token),
"input_ids": input_ids,
}
return dummy_inputs
class PretrainedBartModel(BartPretrainedModel):
def __init_subclass__(self):
warnings.warn(
"The class `PretrainedBartModel` has been depreciated, please use `BartPretrainedModel` instead.",
FutureWarning,
)
BART_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning
heads, etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general
usage and behavior.
Parameters:
config ([`BartConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
BART_GENERATION_EXAMPLE = r"""
Summarization example:
```python
>>> from transformers import BartTokenizer, BartForConditionalGeneration
>>> model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
>>> tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
>>> ARTICLE_TO_SUMMARIZE = (
... "PG&E stated it scheduled the blackouts in response to forecasts for high winds "
... "amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were "
... "scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow."
... )
>>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors="pt")
>>> # Generate Summary
>>> summary_ids = model.generate(inputs["input_ids"], num_beams=2, min_length=0, max_length=20)
>>> tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
'PG&E scheduled the blackouts in response to forecasts for high winds amid dry conditions'
```
Mask filling example:
```python
>>> from transformers import BartTokenizer, BartForConditionalGeneration
>>> tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
>>> model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")
>>> TXT = "My friends are but they eat too many carbs."
>>> input_ids = tokenizer([TXT], return_tensors="pt")["input_ids"]
>>> logits = model(input_ids).logits
>>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item()
>>> probs = logits[0, masked_index].softmax(dim=0)
>>> values, predictions = probs.topk(5)
>>> tokenizer.decode(predictions).split()
['not', 'good', 'healthy', 'great', 'very']
```
"""
BART_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
Indices can be obtained using [`BartTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Indices of decoder input sequence tokens in the vocabulary.
Indices can be obtained using [`BartTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are decoder input IDs?](../glossary#decoder-input-ids)
Bart uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).
For translation and summarization training, `decoder_input_ids` should be provided. If no
`decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
for denoising pre-training following the paper.
decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
be used by default.
If you want to change padding behavior, you should read [`modeling_bart._prepare_decoder_attention_mask`]
and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
information on the default strategy.
head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
decoder_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0,
1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
encoder_outputs (`tuple(tuple(torch.FloatTensor))`, *optional*):
Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`)
`last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of
hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
`decoder_input_ids` of shape `(batch_size, sequence_length)`.
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This is
useful if you want more control over how to convert `input_ids` indices into associated vectors than the
model's internal embedding lookup matrix.
decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*):
Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded
representation. If `past_key_values` is used, optionally only the last `decoder_inputs_embeds` have to be
input (see `past_key_values`). This is useful if you want more control over how to convert
`decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix.
If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, `decoder_inputs_embeds` takes the value
of `inputs_embeds`.
use_cache (`bool`, *optional*):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
`past_key_values`).
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
class BartEncoder(BartPretrainedModel):
"""
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
[`BartEncoderLayer`].
Args:
config: BartConfig
embed_tokens (nn.Embedding): output embedding
"""
def __init__(self, config: BartConfig, embed_tokens: Optional[nn.Embedding] = None):
super().__init__(config)
self.dropout = config.dropout
self.layerdrop = config.encoder_layerdrop
embed_dim = config.d_model
self.padding_idx = config.pad_token_id
self.max_source_positions = config.max_position_embeddings
self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
if embed_tokens is not None:
self.embed_tokens = embed_tokens
else:
self.embed_tokens = nn.Embedding(config.vocab_size, embed_dim, self.padding_idx)
self.embed_positions = BartLearnedPositionalEmbedding(
config.max_position_embeddings,
embed_dim,
)
self.layers = nn.ModuleList([BartEncoderLayer(config) for _ in range(config.encoder_layers)])
self.layernorm_embedding = nn.LayerNorm(embed_dim)
self.gradient_checkpointing = False
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.embed_tokens
def set_input_embeddings(self, value):
self.embed_tokens = value
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutput]:
r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
provide it.
Indices can be obtained using [`BartTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
than the model's internal embedding lookup matrix.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# retrieve input_ids and inputs_embeds
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif input_ids is not None:
input_shape = input_ids.size()
input_ids = input_ids.view(-1, input_shape[-1])
elif inputs_embeds is not None:
input_shape = inputs_embeds.size()[:-1]
else:
raise ValueError("You have to specify either input_ids or inputs_embeds")
if inputs_embeds is None:
inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
embed_pos = self.embed_positions(input_shape)
hidden_states = inputs_embeds + embed_pos
hidden_states = self.layernorm_embedding(hidden_states)
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
# expand attention_mask
if attention_mask is not None:
# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
attention_mask = _expand_mask(attention_mask, inputs_embeds.dtype)
encoder_states = () if output_hidden_states else None
all_attentions = () if output_attentions else None
# check if head_mask has a correct number of layers specified if desired
if head_mask is not None:
if head_mask.size()[0] != (len(self.layers)):
raise ValueError(
f"The head_mask should be specified for {len(self.layers)} layers, but it is for"
f" {head_mask.size()[0]}."
)
for idx, encoder_layer in enumerate(self.layers):
if output_hidden_states:
encoder_states = encoder_states + (hidden_states,)
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
dropout_probability = random.uniform(0, 1)
if self.training and (dropout_probability < self.layerdrop): # skip the layer
layer_outputs = (None, None)
else:
if self.gradient_checkpointing and self.training:
def create_custom_forward(module):
def custom_forward(*inputs):
return module(*inputs, output_attentions)
return custom_forward
layer_outputs = torch.utils.checkpoint.checkpoint(
create_custom_forward(encoder_layer),
hidden_states,
attention_mask,
(head_mask[idx] if head_mask is not None else None),
)
else:
layer_outputs = encoder_layer(
hidden_states,
attention_mask,
layer_head_mask=(head_mask[idx] if head_mask is not None else None),
output_attentions=output_attentions,
)
hidden_states = layer_outputs[0]
if output_attentions:
all_attentions = all_attentions + (layer_outputs[1],)
if output_hidden_states:
encoder_states = encoder_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
return BaseModelOutput(
last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
)
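# Usage sketch (illustrative, not part of the vendored module): running the
# standalone encoder on a padded batch. The config values below are assumptions
# for the example, not the checkpoints HanLP actually loads.
#
#   config = BartConfig(vocab_size=50265, d_model=768, encoder_layers=6,
#                       encoder_attention_heads=12)
#   encoder = BartEncoder(config)
#   input_ids = torch.tensor([[0, 31414, 232, 2], [0, 31414, 2, 1]])  # 1 == <pad>
#   attention_mask = input_ids.ne(config.pad_token_id).long()
#   out = encoder(input_ids=input_ids, attention_mask=attention_mask)
#   assert out.last_hidden_state.shape == (2, 4, config.d_model)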
class BartDecoder(BartPretrainedModel):
"""
Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`BartDecoderLayer`]
Args:
config: BartConfig
embed_tokens (nn.Embedding): output embedding
"""
def __init__(self, config: BartConfig, embed_tokens: Optional[nn.Embedding] = None):
super().__init__(config)
self.dropout = config.dropout
self.layerdrop = config.decoder_layerdrop
self.padding_idx = config.pad_token_id
self.max_target_positions = config.max_position_embeddings
self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
if embed_tokens is not None:
self.embed_tokens = embed_tokens
else:
self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model, self.padding_idx)
self.embed_positions = BartLearnedPositionalEmbedding(
config.max_position_embeddings,
config.d_model,
)
self.layers = nn.ModuleList([BartDecoderLayer(config) for _ in range(config.decoder_layers)])
self.layernorm_embedding = nn.LayerNorm(config.d_model)
self.gradient_checkpointing = False
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.embed_tokens
def set_input_embeddings(self, value):
self.embed_tokens = value
def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
# create causal mask
# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
combined_attention_mask = None
if input_shape[-1] > 1:
combined_attention_mask = _make_causal_mask(
input_shape, inputs_embeds.dtype, past_key_values_length=past_key_values_length
).to(inputs_embeds.device)
if attention_mask is not None:
# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1])
combined_attention_mask = (
expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
)
return combined_attention_mask
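    # Shape sketch (illustrative): for a 3-token target with no padding and no
    # cache, the causal component built by _make_causal_mask is additive,
    # keeping positions j <= i and filling the future with the dtype minimum
    # (shown here as -inf):
    #     [[  0, -inf, -inf],
    #      [  0,    0, -inf],
    #      [  0,    0,    0]]
    # Any padding mask produced by _expand_mask is simply added on top of this.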
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
provide it.
Indices can be obtained using [`BartTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
of the decoder.
encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
Mask to nullify selected heads of the cross-attention modules in the decoder to avoid performing
cross-attention on hidden heads. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
                all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
use_cache = use_cache if use_cache is not None else self.config.use_cache
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# retrieve input_ids and inputs_embeds
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
elif input_ids is not None:
input_shape = input_ids.size()
input_ids = input_ids.view(-1, input_shape[-1])
elif inputs_embeds is not None:
input_shape = inputs_embeds.size()[:-1]
else:
raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
# past_key_values_length
past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
if inputs_embeds is None:
inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
attention_mask = self._prepare_decoder_attention_mask(
attention_mask, input_shape, inputs_embeds, past_key_values_length
)
# expand encoder attention mask
if encoder_hidden_states is not None and encoder_attention_mask is not None:
# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
encoder_attention_mask = _expand_mask(encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1])
# embed positions
positions = self.embed_positions(input_shape, past_key_values_length)
hidden_states = inputs_embeds + positions
hidden_states = self.layernorm_embedding(hidden_states)
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
# decoder layers
all_hidden_states = () if output_hidden_states else None
all_self_attns = () if output_attentions else None
all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
next_decoder_cache = () if use_cache else None
# check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]):
if attn_mask is not None:
if attn_mask.size()[0] != (len(self.layers)):
raise ValueError(
f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
f" {head_mask.size()[0]}."
)
for idx, decoder_layer in enumerate(self.layers):
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
if output_hidden_states:
all_hidden_states += (hidden_states,)
dropout_probability = random.uniform(0, 1)
if self.training and (dropout_probability < self.layerdrop):
continue
past_key_value = past_key_values[idx] if past_key_values is not None else None
if self.gradient_checkpointing and self.training:
if use_cache:
logger.warning(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache = False
def create_custom_forward(module):
def custom_forward(*inputs):
# None for past_key_value
return module(*inputs, output_attentions, use_cache)
return custom_forward
layer_outputs = torch.utils.checkpoint.checkpoint(
create_custom_forward(decoder_layer),
hidden_states,
attention_mask,
encoder_hidden_states,
encoder_attention_mask,
head_mask[idx] if head_mask is not None else None,
cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None,
None,
)
else:
layer_outputs = decoder_layer(
hidden_states,
attention_mask=attention_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
layer_head_mask=(head_mask[idx] if head_mask is not None else None),
cross_attn_layer_head_mask=(
cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None
),
past_key_value=past_key_value,
output_attentions=output_attentions,
use_cache=use_cache,
)
hidden_states = layer_outputs[0]
if use_cache:
next_decoder_cache += (layer_outputs[3 if output_attentions else 1],)
if output_attentions:
all_self_attns += (layer_outputs[1],)
if encoder_hidden_states is not None:
all_cross_attentions += (layer_outputs[2],)
# add hidden states from the last decoder layer
if output_hidden_states:
all_hidden_states += (hidden_states,)
next_cache = next_decoder_cache if use_cache else None
if not return_dict:
return tuple(
v
for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions]
if v is not None
)
return BaseModelOutputWithPastAndCrossAttentions(
last_hidden_state=hidden_states,
past_key_values=next_cache,
hidden_states=all_hidden_states,
attentions=all_self_attns,
cross_attentions=all_cross_attentions,
)
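# Incremental-decoding sketch (illustrative; `config` and `enc` are assumptions,
# not values from this file): each forward step with use_cache=True returns
# past_key_values, which lets the next step feed only the newest token.
#
#   decoder = BartDecoder(config)
#   enc = torch.randn(2, 4, config.d_model)  # pretend encoder output
#   step1 = decoder(input_ids=torch.tensor([[2], [2]]), encoder_hidden_states=enc,
#                   use_cache=True)
#   step2 = decoder(input_ids=torch.tensor([[0], [0]]), encoder_hidden_states=enc,
#                   past_key_values=step1.past_key_values, use_cache=True)
#   # step2 attends over both tokens although only one new id was passed in.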
@add_start_docstrings(
"The bare BART Model outputting raw hidden-states without any specific head on top.",
BART_START_DOCSTRING,
)
class BartModel(BartPretrainedModel):
def __init__(self, config: BartConfig):
super().__init__(config)
padding_idx, vocab_size = config.pad_token_id, config.vocab_size
self.shared = nn.Embedding(vocab_size, config.d_model, padding_idx)
self.encoder = BartEncoder(config, self.shared)
self.decoder = BartDecoder(config, self.shared)
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.shared
def set_input_embeddings(self, value):
self.shared = value
self.encoder.embed_tokens = self.shared
self.decoder.embed_tokens = self.shared
def get_encoder(self):
return self.encoder
def get_decoder(self):
return self.decoder
@add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_TOKENIZER_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=Seq2SeqModelOutput,
config_class=_CONFIG_FOR_DOC,
expected_output=_EXPECTED_OUTPUT_SHAPE,
)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
decoder_head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[List[torch.FloatTensor]] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, Seq2SeqModelOutput]:
        # Unlike other models, Bart automatically creates decoder_input_ids from
        # input_ids when no decoder_input_ids are provided
if decoder_input_ids is None and decoder_inputs_embeds is None:
if input_ids is None:
raise ValueError(
"If no `decoder_input_ids` or `decoder_inputs_embeds` are "
"passed, `input_ids` cannot be `None`. Please pass either "
"`input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`."
)
decoder_input_ids = shift_tokens_right(
input_ids, self.config.pad_token_id, self.config.decoder_start_token_id
)
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
use_cache = use_cache if use_cache is not None else self.config.use_cache
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if encoder_outputs is None:
encoder_outputs = self.encoder(
input_ids=input_ids,
attention_mask=attention_mask,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
encoder_outputs = BaseModelOutput(
last_hidden_state=encoder_outputs[0],
hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
)
        # decoder outputs consist of (dec_features, past_key_value, dec_hidden, dec_attn)
decoder_outputs = self.decoder(
input_ids=decoder_input_ids,
attention_mask=decoder_attention_mask,
encoder_hidden_states=encoder_outputs[0],
encoder_attention_mask=attention_mask,
head_mask=decoder_head_mask,
cross_attn_head_mask=cross_attn_head_mask,
past_key_values=past_key_values,
inputs_embeds=decoder_inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
if not return_dict:
return decoder_outputs + encoder_outputs
return Seq2SeqModelOutput(
last_hidden_state=decoder_outputs.last_hidden_state,
past_key_values=decoder_outputs.past_key_values,
decoder_hidden_states=decoder_outputs.hidden_states,
decoder_attentions=decoder_outputs.attentions,
cross_attentions=decoder_outputs.cross_attentions,
encoder_last_hidden_state=encoder_outputs.last_hidden_state,
encoder_hidden_states=encoder_outputs.hidden_states,
encoder_attentions=encoder_outputs.attentions,
)
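# Sketch of the decoder_input_ids shortcut noted above (illustrative): when
# neither decoder_input_ids nor decoder_inputs_embeds is given, BartModel derives
# them from input_ids via shift_tokens_right, e.g. with decoder_start_token_id=2:
#
#   input_ids         = [[0, 31414, 2]]
#   decoder_input_ids = [[2, 0, 31414]]  # start token prepended, last id dropped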
@add_start_docstrings(
"The BART Model with a language modeling head. Can be used for summarization.", BART_START_DOCSTRING
)
class BartForConditionalGeneration(BartPretrainedModel):
base_model_prefix = "model"
_keys_to_ignore_on_load_missing = [r"final_logits_bias", r"lm_head.weight"]
def __init__(self, config: BartConfig):
super().__init__(config)
self.model = BartModel(config)
self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings)))
self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False)
# Initialize weights and apply final processing
self.post_init()
def get_encoder(self):
return self.model.get_encoder()
def get_decoder(self):
return self.model.get_decoder()
def resize_token_embeddings(self, new_num_tokens: int) -> nn.Embedding:
new_embeddings = super().resize_token_embeddings(new_num_tokens)
self._resize_final_logits_bias(new_num_tokens)
return new_embeddings
def _resize_final_logits_bias(self, new_num_tokens: int) -> None:
old_num_tokens = self.final_logits_bias.shape[-1]
if new_num_tokens <= old_num_tokens:
new_bias = self.final_logits_bias[:, :new_num_tokens]
else:
extra_bias = torch.zeros((1, new_num_tokens - old_num_tokens), device=self.final_logits_bias.device)
new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1)
self.register_buffer("final_logits_bias", new_bias)
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
@add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
@add_end_docstrings(BART_GENERATION_EXAMPLE)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
decoder_head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[List[torch.FloatTensor]] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, Seq2SeqLMOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Returns:
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if labels is not None:
if use_cache:
logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.")
use_cache = False
if decoder_input_ids is None and decoder_inputs_embeds is None:
decoder_input_ids = shift_tokens_right(
labels, self.config.pad_token_id, self.config.decoder_start_token_id
)
outputs = self.model(
input_ids,
attention_mask=attention_mask,
decoder_input_ids=decoder_input_ids,
encoder_outputs=encoder_outputs,
decoder_attention_mask=decoder_attention_mask,
head_mask=head_mask,
decoder_head_mask=decoder_head_mask,
cross_attn_head_mask=cross_attn_head_mask,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
decoder_inputs_embeds=decoder_inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
lm_logits = self.lm_head(outputs[0]) + self.final_logits_bias
masked_lm_loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))
if not return_dict:
output = (lm_logits,) + outputs[1:]
return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
return Seq2SeqLMOutput(
loss=masked_lm_loss,
logits=lm_logits,
past_key_values=outputs.past_key_values,
decoder_hidden_states=outputs.decoder_hidden_states,
decoder_attentions=outputs.decoder_attentions,
cross_attentions=outputs.cross_attentions,
encoder_last_hidden_state=outputs.encoder_last_hidden_state,
encoder_hidden_states=outputs.encoder_hidden_states,
encoder_attentions=outputs.encoder_attentions,
)
def prepare_inputs_for_generation(
self,
decoder_input_ids,
past=None,
attention_mask=None,
head_mask=None,
decoder_head_mask=None,
cross_attn_head_mask=None,
use_cache=None,
encoder_outputs=None,
**kwargs
):
# cut decoder_input_ids if past is used
if past is not None:
decoder_input_ids = decoder_input_ids[:, -1:]
return {
"input_ids": None, # encoder_outputs is defined. input_ids not needed
"encoder_outputs": encoder_outputs,
"past_key_values": past,
"decoder_input_ids": decoder_input_ids,
"attention_mask": attention_mask,
"head_mask": head_mask,
"decoder_head_mask": decoder_head_mask,
"cross_attn_head_mask": cross_attn_head_mask,
"use_cache": use_cache, # change this to avoid caching (presumably for debugging)
}
def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id)
@staticmethod
def _reorder_cache(past, beam_idx):
reordered_past = ()
for layer_past in past:
# cached cross_attention states don't have to be reordered -> they are always the same
reordered_past += (
tuple(past_state.index_select(0, beam_idx) for past_state in layer_past[:2]) + layer_past[2:],
)
return reordered_past
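# Training/generation sketch (illustrative; `config` and `input_ids` are
# assumptions): passing labels makes the model derive decoder inputs from the
# labels and return a cross-entropy loss; generate() then relies on
# prepare_inputs_for_generation and _reorder_cache above for cached beam search.
#
#   model = BartForConditionalGeneration(config)
#   out = model(input_ids=input_ids, labels=torch.tensor([[0, 31414, 2]]))
#   out.loss.backward()
#   summary_ids = model.generate(input_ids, num_beams=4, max_length=20)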
@add_start_docstrings(
"""
    Bart model with a sequence classification head on top (a linear layer on top of the pooled output), e.g. for GLUE
    tasks.
""",
BART_START_DOCSTRING,
)
class BartForSequenceClassification(BartPretrainedModel):
def __init__(self, config: BartConfig, **kwargs):
super().__init__(config, **kwargs)
self.model = BartModel(config)
self.classification_head = BartClassificationHead(
config.d_model,
config.d_model,
config.num_labels,
config.classifier_dropout,
)
self.model._init_weights(self.classification_head.dense)
self.model._init_weights(self.classification_head.out_proj)
@add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_TOKENIZER_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION,
output_type=Seq2SeqSequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
expected_output=_SEQ_CLASS_EXPECTED_OUTPUT,
expected_loss=_SEQ_CLASS_EXPECTED_LOSS,
)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
decoder_head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, Seq2SeqSequenceClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if labels is not None:
use_cache = False
if input_ids is None and inputs_embeds is not None:
raise NotImplementedError(
f"Passing input embeddings is currently not supported for {self.__class__.__name__}"
)
outputs = self.model(
input_ids,
attention_mask=attention_mask,
decoder_input_ids=decoder_input_ids,
decoder_attention_mask=decoder_attention_mask,
head_mask=head_mask,
decoder_head_mask=decoder_head_mask,
cross_attn_head_mask=cross_attn_head_mask,
encoder_outputs=encoder_outputs,
inputs_embeds=inputs_embeds,
decoder_inputs_embeds=decoder_inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = outputs[0] # last hidden state
eos_mask = input_ids.eq(self.config.eos_token_id)
if len(torch.unique_consecutive(eos_mask.sum(1))) > 1:
raise ValueError("All examples must have the same number of tokens.")
sentence_representation = hidden_states[eos_mask, :].view(hidden_states.size(0), -1, hidden_states.size(-1))[
:, -1, :
]
logits = self.classification_head(sentence_representation)
loss = None
if labels is not None:
if self.config.problem_type is None:
if self.config.num_labels == 1:
self.config.problem_type = "regression"
elif self.config.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.config.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
if not return_dict:
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
return Seq2SeqSequenceClassifierOutput(
loss=loss,
logits=logits,
past_key_values=outputs.past_key_values,
decoder_hidden_states=outputs.decoder_hidden_states,
decoder_attentions=outputs.decoder_attentions,
cross_attentions=outputs.cross_attentions,
encoder_last_hidden_state=outputs.encoder_last_hidden_state,
encoder_hidden_states=outputs.encoder_hidden_states,
encoder_attentions=outputs.encoder_attentions,
)
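# Pooling sketch (illustrative): classification uses the decoder hidden state at
# the final </s> of each sequence, so every example in a batch must contain the
# same number of eos tokens (enforced above). With eos_token_id == 2:
#
#   clf = BartForSequenceClassification(config)
#   input_ids = torch.tensor([[0, 31414, 2], [0, 9064, 2]])
#   logits = clf(input_ids=input_ids).logits  # shape (2, config.num_labels)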
@add_start_docstrings(
"""
BART Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
BART_START_DOCSTRING,
)
class BartForQuestionAnswering(BartPretrainedModel):
def __init__(self, config):
super().__init__(config)
config.num_labels = 2
self.num_labels = config.num_labels
self.model = BartModel(config)
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
self.model._init_weights(self.qa_outputs)
@add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_TOKENIZER_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_QA,
output_type=Seq2SeqQuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
expected_loss=_QA_EXPECTED_LOSS,
expected_output=_QA_EXPECTED_OUTPUT,
)
def forward(
self,
input_ids: torch.Tensor = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
decoder_head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[List[torch.FloatTensor]] = None,
start_positions: Optional[torch.LongTensor] = None,
end_positions: Optional[torch.LongTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, Seq2SeqQuestionAnsweringModelOutput]:
r"""
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (*sequence_length*). Positions outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (*sequence_length*). Positions outside of the sequence
            are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if start_positions is not None and end_positions is not None:
use_cache = False
outputs = self.model(
input_ids,
attention_mask=attention_mask,
decoder_input_ids=decoder_input_ids,
decoder_attention_mask=decoder_attention_mask,
head_mask=head_mask,
decoder_head_mask=decoder_head_mask,
cross_attn_head_mask=cross_attn_head_mask,
encoder_outputs=encoder_outputs,
inputs_embeds=inputs_embeds,
decoder_inputs_embeds=decoder_inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
logits = self.qa_outputs(sequence_output)
start_logits, end_logits = logits.split(1, dim=-1)
start_logits = start_logits.squeeze(-1).contiguous()
end_logits = end_logits.squeeze(-1).contiguous()
total_loss = None
if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, squeeze the extra dimension added to the position tensors
if len(start_positions.size()) > 1:
start_positions = start_positions.squeeze(-1)
if len(end_positions.size()) > 1:
end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs; we ignore these terms
ignored_index = start_logits.size(1)
start_positions = start_positions.clamp(0, ignored_index)
end_positions = end_positions.clamp(0, ignored_index)
loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
if not return_dict:
output = (
start_logits,
end_logits,
) + outputs[1:]
return ((total_loss,) + output) if total_loss is not None else output
return Seq2SeqQuestionAnsweringModelOutput(
loss=total_loss,
start_logits=start_logits,
end_logits=end_logits,
past_key_values=outputs.past_key_values,
decoder_hidden_states=outputs.decoder_hidden_states,
decoder_attentions=outputs.decoder_attentions,
cross_attentions=outputs.cross_attentions,
encoder_last_hidden_state=outputs.encoder_last_hidden_state,
encoder_hidden_states=outputs.encoder_hidden_states,
encoder_attentions=outputs.encoder_attentions,
)
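# Span-extraction sketch (illustrative): the two logit heads are decoded
# independently, typically by taking an argmax per example.
#
#   qa = BartForQuestionAnswering(config)
#   out = qa(input_ids=input_ids)
#   start = out.start_logits.argmax(-1)  # predicted span start per example
#   end = out.end_logits.argmax(-1)      # predicted span end per example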
class BartDecoderWrapper(BartPretrainedModel):
"""
This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is
used in combination with the [`EncoderDecoderModel`] framework.
"""
def __init__(self, config):
super().__init__(config)
self.decoder = BartDecoder(config)
def forward(self, *args, **kwargs):
return self.decoder(*args, **kwargs)
class BartForCausalLM(BartPretrainedModel):
def __init__(self, config):
config = copy.deepcopy(config)
config.is_decoder = True
config.is_encoder_decoder = False
super().__init__(config)
self.model = BartDecoderWrapper(config)
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.model.decoder.embed_tokens
def set_input_embeddings(self, value):
self.model.decoder.embed_tokens = value
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
def set_decoder(self, decoder):
self.model.decoder = decoder
def get_decoder(self):
return self.model.decoder
@replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, CausalLMOutputWithCrossAttentions]:
r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
provide it.
Indices can be obtained using [`BartTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
if the model is configured as a decoder.
            encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used
                in the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. The two additional
tensors are only required when the model is used as a decoder in a Sequence to Sequence model.
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
Returns:
Example:
```python
>>> from transformers import BartTokenizer, BartForCausalLM
>>> tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
>>> model = BartForCausalLM.from_pretrained("facebook/bart-base", add_cross_attention=False)
>>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs)
>>> logits = outputs.logits
>>> expected_shape = [1, inputs.input_ids.shape[-1], model.config.vocab_size]
>>> list(logits.shape) == expected_shape
True
```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        # decoder outputs consist of (dec_features, layer_state, dec_hidden, dec_attn)
outputs = self.model.decoder(
input_ids=input_ids,
attention_mask=attention_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
head_mask=head_mask,
cross_attn_head_mask=cross_attn_head_mask,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
logits = self.lm_head(outputs[0])
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.config.vocab_size), labels.view(-1))
if not return_dict:
output = (logits,) + outputs[1:]
return (loss,) + output if loss is not None else output
return CausalLMOutputWithCrossAttentions(
loss=loss,
logits=logits,
past_key_values=outputs.past_key_values,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
cross_attentions=outputs.cross_attentions,
)
def prepare_inputs_for_generation(self, input_ids, past=None, attention_mask=None, use_cache=None, **kwargs):
# if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
if attention_mask is None:
attention_mask = input_ids.new_ones(input_ids.shape)
if past:
input_ids = input_ids[:, -1:]
# first step, decoder_cached_states are empty
return {
"input_ids": input_ids, # encoder_outputs is defined. input_ids not needed
"attention_mask": attention_mask,
"past_key_values": past,
"use_cache": use_cache,
}
@staticmethod
def _reorder_cache(past, beam_idx):
reordered_past = ()
for layer_past in past:
reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),)
return reordered_past
================================================
FILE: hanlp/components/amr/amrbart/model_interface/tokenization_bart.py
================================================
# coding:utf-8
# this is a simplified version of "https://github.com/SapienzaNLP/spring/blob/main/spring_amr/tokenization_bart.py"
# MIT License
#
# Copyright (c) 2022 xfbai
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import penman
import regex as re
from transformers import BartTokenizer
from hanlp.components.amr.amrbart.common import postprocessing
from hanlp.components.amr.amrbart.common.constant import raw_special_tokens, recategorizations
from hanlp.components.amr.amrbart.common.penman_interface import encode
class AMRBartTokenizer(BartTokenizer):
INIT = 'Ġ'
    def __init__(self, vocab_file, merges_file, errors="replace", bos_token="<s>", eos_token="</s>", sep_token="</s>", cls_token="<s>", unk_token="<unk>", pad_token="<pad>", mask_token="<mask>", add_prefix_space=False, **kwargs):
        super().__init__(vocab_file, merges_file, errors, bos_token, eos_token, sep_token, cls_token, unk_token, pad_token, mask_token, add_prefix_space, **kwargs)
self.modified = 0
self.recategorizations = set(recategorizations)
self.patterns = re.compile(r""" ?<[a-z]+:?\d*>| ?:[^\s]+|'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
self.remove_pars = False
@classmethod
def from_pretrained(cls, pretrained_model_path, *args, **kwargs):
inst = super().from_pretrained(pretrained_model_path, *args, **kwargs)
inst.init_amr_vocabulary()
return inst
def init_amr_vocabulary(self):
self.old_enc_size = old_enc_size = len(self.encoder)
tokens = [t for t in raw_special_tokens if t not in self.encoder]
for i, t in enumerate(tokens, start=old_enc_size):
self.encoder[t] = i
self.encoder = {k: i for i, (k,v) in enumerate(sorted(self.encoder.items(), key=lambda x: x[1]))}
self.decoder = {v: k for k, v in sorted(self.encoder.items(), key=lambda x: x[1])}
self.modified = len(tokens)
        self.amr_bos_token = "<AMR>"
        self.amr_bos_token_id = self.encoder[self.amr_bos_token]
        self.amr_eos_token = "</AMR>"
        self.amr_eos_token_id = self.encoder[self.amr_eos_token]
# print(f"Added {self.modified} AMR tokens")
def _tokenize(self, text):
""" Tokenize a string. Modified in order to handle sentences with recategorization pointers"""
bpe_tokens = []
for tok_span in text.lstrip().split(' '):
tok_span = tok_span.strip()
recats = tok_span.rsplit('_', 1)
if len(recats) == 2 and recats[0] in self.recategorizations and ('_' + recats[1]) in self.encoder:
bpe_tokens.extend([self.INIT + recats[0], '_' + recats[1]])
else:
for token in re.findall(self.pat, ' ' + tok_span):
token = "".join(
self.byte_encoder[b] for b in token.encode("utf-8")
                    )  # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
return bpe_tokens
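    # Sketch (assuming 'PERSON' is among the recategorizations and '_1' has a
    # dedicated vocabulary entry, as in the AMRBART token lists): a span such as
    # 'PERSON_1' is emitted as ['ĠPERSON', '_1'] instead of going through BPE.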
def _tok_bpe(self, token):
tokk = []
tok = token.strip()
recats = tok.rsplit('_', 1)
if len(recats) == 2 and recats[0] in self.recategorizations and ('_' + recats[1]) in self.encoder:
tokk.extend([self.INIT + recats[0], '_' + recats[1]])
else:
for tok in self.patterns.findall(' ' + token):
tok = "".join(
self.byte_encoder[b] for b in tok.encode("utf-8"))
toks = self.bpe(tok).split(' ')
tokk.extend(toks)
return tokk
def tokenize_amr(self, amr_tokens):
bpe_tokens = []
for i, tokk in enumerate(amr_tokens):
is_in_enc = self.INIT + tokk in self.encoder
is_rel = tokk.startswith(':') and len(tokk) > 1
is_spc = tokk.startswith('<') and tokk.endswith('>')
is_of = tokk.startswith(':') and tokk.endswith('-of')
is_frame = re.match(r'.+-\d\d', tokk) is not None
if tokk.startswith('"') and tokk.endswith('"'): # dealing with examples like "The_United_Kingdom_of_xxx"
tokk = tokk[1:-1].replace('_', ' ')
bpe_toks = [self.INIT + ""]
bpe_toks += self._tok_bpe(tokk)
bpe_toks.append(self.INIT + " ")
elif (is_rel or is_spc or is_frame or is_of):
if is_in_enc:
bpe_toks = [self.INIT + tokk]
elif is_frame:
bpe_toks = self._tok_bpe(tokk[:-3]) + [tokk[-3:]]
elif is_of:
rel = tokk[:-3]
if self.INIT + rel in self.encoder:
bpe_toks = [self.INIT + rel, '-of']
else:
bpe_toks = [self.INIT + ':'] + self._tok_bpe(rel[1:]) + ['-of']
elif is_rel:
bpe_toks = [self.INIT + ':'] + self._tok_bpe(tokk[1:])
else:
print("tok:", tokk)
print(f"is_rel:{is_rel}, is_spc:{is_spc}, is_frame:{is_frame}, is_of:{is_of}")
exit()
raise
else:
if is_in_enc:
bpe_toks = [self.INIT + tokk]
else:
bpe_toks = self._tok_bpe(tokk)
bpe_tokens.append(bpe_toks)
bpe_tokens = [b for bb in bpe_tokens for b in bb]
bpe_token_ids = [self.encoder.get(b, self.unk_token_id) for b in bpe_tokens]
return bpe_token_ids
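    # Sketch (illustrative; "facebook/bart-base" is an assumed checkpoint): a
    # linearized AMR is mapped to subword ids, with relations, frames and the
    # tokens added by init_amr_vocabulary() kept atomic and concepts BPE-split.
    #
    #   tokenizer = AMRBartTokenizer.from_pretrained("facebook/bart-base")
    #   ids = tokenizer.tokenize_amr(
    #       ['(', '<pointer:0>', 'want-01', ':ARG0', '(', '<pointer:1>', 'boy', ')', ')'])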
def decode_amr(self, tokens, restore_name_ops=None):
try:
nodes, backreferences = postprocessing.decode_into_node_and_backreferences(tokens, self)
except Exception as e:
# print('Decoding failure:', file=sys.stderr)
# print(e, file=sys.stderr)
return postprocessing.BACKOFF, postprocessing.ParsedStatus.BACKOFF, (None, None)
try:
graph_ = graph = self._fix_and_make_graph(nodes)
# if collapse_name_ops:
# graph_ = graph = postprocessing._split_name_ops(graph)
except Exception as e:
# print('Building failure:', file=sys.stderr)
# print(nodes, file=sys.stderr)
# print(backreferences, file=sys.stderr)
# print(e, file=sys.stderr)
return postprocessing.BACKOFF, postprocessing.ParsedStatus.BACKOFF, (None, None)
try:
graph, status = postprocessing.connect_graph_if_not_connected(graph)
# if status == postprocessing.ParsedStatus.BACKOFF:
# print('Reconnection 1 failure:')
# print(nodes, file=sys.stderr)
# print(backreferences, file=sys.stderr)
# print(graph_, file=sys.stderr)
return graph, status, (nodes, backreferences)
except Exception as e:
            # print('Reconnection 2 failure:', file=sys.stderr)
# print(e, file=sys.stderr)
# print(nodes, file=sys.stderr)
# print(backreferences, file=sys.stderr)
# print(graph_, file=sys.stderr)
return postprocessing.BACKOFF, postprocessing.ParsedStatus.BACKOFF, (nodes, backreferences)
    def _fix_and_make_graph(self, nodes):
        nodes_ = []
        for n in nodes:
            if isinstance(n, str):
                if n.startswith('<') and n.endswith('>') and (not n.startswith('<pointer:')):
                    pass
                else:
                    nodes_.append(n)
            else:
                nodes_.append(n)
        nodes = nodes_
        # split tokens that glue extra material onto a pointer, e.g. '<pointer:3>boy'
        i = 0
        nodes_ = []
        while i < len(nodes):
            nxt = nodes[i]
            pst = None
            if isinstance(nxt, str) and nxt.startswith('<pointer:'):
                e = nxt.find('>')
                if e != len(nxt) - 1:
                    pst = nxt[e + 1:]
                    nxt = nxt[:e + 1]
                nodes_.append(nxt)
                if pst is not None:
                    nodes_.append(pst)
            else:
                nodes_.append(nxt)
            i += 1
        nodes = nodes_
        # turn '<pointer:N>' back into a variable 'zN', restoring '(' and '/' for expansions
        i = 1
        nodes_ = [nodes[0]]
        while i < len(nodes):
            nxt = nodes[i]
            if isinstance(nxt, str) and nxt.startswith('<pointer:'):
                nxt = 'z' + nxt[9:-1]
                fol = nodes[i + 1]
                # a re-entrancy (not an expansion) is followed by an edge or a closing parenthesis
                if isinstance(fol, str) and (fol.startswith(':') or fol == ')'):
                    nodes_.append(nxt)
                else:
                    if self.remove_pars:
                        nodes_.append('(')
                    elif nodes_[-1] != '(':
                        nodes_.append('(')
                    nodes_.append(nxt)
                    nodes_.append('/')
            else:
                nodes_.append(nxt)
            i += 1
        nodes = nodes_
        # NB: simplified relative to upstream SPRING, which additionally merges
        # '/'-concept pairs and balances parentheses before decoding
        linearized = re.sub(r'\s+', ' ', ' '.join(str(n) for n in nodes)).strip()
        """
        line = linearized
        # make sure parentheses match
        # copied from https://github.com/RikVN/AMR/blob/master/restoreAMR/restore_amr.py
        open_count = 0
        close_count = 0
        for i, c in enumerate(line):
            if c == '(':
                open_count += 1
            elif c == ')':
                close_count += 1
            if open_count == close_count and open_count > 0:
                line = line[:i].strip()
                break
        old_line = line
        while True:
            open_count = len(re.findall(r'\(', line))
            close_count = len(re.findall(r'\)', line))
            if open_count > close_count:
                line += ')' * (open_count - close_count)
            elif close_count > open_count:
                for i in range(close_count - open_count):
                    line = line.rstrip(')')
                    line = line.rstrip(' ')
            if old_line == line:
                break
            old_line = line
        """
graph = penman.decode(linearized + ' ')
triples = []
newvars = 2000
for triple in graph.triples:
x, rel, y = triple
if x is None:
pass
elif rel == ':instance' and y is None:
triples.append(penman.Triple(x, rel, 'thing'))
elif y is None:
var = f'z{newvars}'
newvars += 1
triples.append(penman.Triple(x, rel, var))
triples.append(penman.Triple(var, ':instance', 'thing'))
else:
triples.append(triple)
graph = penman.Graph(triples)
linearized = encode(graph)
def fix_text(linearized=linearized):
n = 0
def _repl1(match):
nonlocal n
out = match.group(1) + match.group(2) + str(3000 + n) + ' / ' + match.group(2) + match.group(3)
n += 1
return out
linearized = re.sub(r'(\(\s?)([a-z])([^\/:\)]+[:\)])', _repl1, linearized,
flags=re.IGNORECASE | re.MULTILINE)
def _repl2(match):
return match.group(1)
linearized = re.sub(r'(\(\s*[a-z][\d+]\s*\/\s*[^\s\)\(:\/]+\s*)((?:/\s*[^\s\)\(:\/]+\s*)+)', _repl2,
linearized,
flags=re.IGNORECASE | re.MULTILINE)
# adds a ':' to args w/o it
linearized = re.sub(r'([^:])(ARG)', r'\1 :\2', linearized)
# removes edges with no node
# linearized = re.sub(r':[^\s\)\(:\/]+?\s*\)', ')', linearized, flags=re.MULTILINE)
return linearized
linearized = fix_text(linearized)
g = penman.decode(linearized)
return g
def _classify(self, node):
if not isinstance(node, str):
return "CONST"
elif node == 'i':
return "I"
elif re.match(r'^[a-z]\d*$', node) is not None:
return "VAR"
elif node[0].isdigit():
return "CONST"
elif node.startswith('"') and node.endswith('"'):
return "CONST"
elif node in ('+', '-'):
return "CONST"
elif node == ':mode':
return 'MODE'
elif node.startswith(':'):
return "EDGE"
elif node in ['/', '(', ')']:
return node
elif node[0].isalpha():
for char in (',', ':', '/', '(', ')', '.', '!', '?', '\\'):
if char in node:
return "CONST"
return "INST"
else:
return 'CONST'
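# Illustrative outputs of _classify (derived from the branches above):
#   'z12'      -> 'VAR'    (an AMR variable)
#   ':ARG0'    -> 'EDGE'   (a relation)
#   '"Obama"'  -> 'CONST'  (a quoted literal)
#   'want-01'  -> 'INST'   (a concept)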
================================================
FILE: hanlp/components/amr/amrbart/preprocess/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2022-12-03 20:33
================================================
FILE: hanlp/components/amr/amrbart/preprocess/amr_io.py
================================================
# coding:utf-8
# the code is migrated from https://github.com/SapienzaNLP/spring
# MIT License
#
# Copyright (c) 2022 xfbai
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import glob
from pathlib import Path
from typing import List, Union, Iterable
from hanlp.components.amr.amrbart.preprocess.penman_interface import load as pm_load
def read_raw_amr_data(
paths: List[Union[str, Path]], use_recategorization=False, dereify=True, remove_wiki=False,
):
""" code for loading AMR from a set of files
- use_recategorization: use graph recategorization trick
- dereify: Dereify edges in g that have reifications in model.
- remove_wiki: remove wiki links
"""
assert paths
if not isinstance(paths, Iterable):
paths = [paths]
graphs = []
for path_ in paths:
for path in glob.glob(str(path_)):
path = Path(path)
graphs.extend(pm_load(path, dereify=dereify, remove_wiki=remove_wiki))
assert graphs
if use_recategorization:
for g in graphs:
metadata = g.metadata
metadata["snt_orig"] = metadata["snt"]
tokens = eval(metadata["tokens"])
metadata["snt"] = " ".join(
[
t
for t in tokens
if not ((t.startswith("-L") or t.startswith("-R")) and t.endswith("-"))
]
)
return graphs
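# Usage sketch (the glob below is a hypothetical path, not shipped data):
#
#   graphs = read_raw_amr_data(['data/amr/*.txt'], remove_wiki=True)
#   graphs[0].metadata['snt']  # the sentence stored in the AMR header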
================================================
FILE: hanlp/components/amr/amrbart/preprocess/penman_interface.py
================================================
# coding:utf-8
# MIT License
#
# Copyright (c) 2022 xfbai
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
from penman import load as load_, Graph, Triple
from penman import loads as loads_
from penman import encode as encode_
from penman.model import Model
from penman.models.noop import NoOpModel
from penman.models import amr
op_model = Model()
noop_model = NoOpModel()
amr_model = amr.model
DEFAULT = op_model
def _get_model(dereify):
if dereify is None:
return DEFAULT
elif dereify:
return op_model
else:
return noop_model
def _remove_wiki(graph):
metadata = graph.metadata
triples = []
for t in graph.triples:
v1, rel, v2 = t
if rel == ":wiki":
t = Triple(v1, rel, "+")
triples.append(t)
graph = Graph(triples)
graph.metadata = metadata
return graph
def load(source, dereify=None, remove_wiki=False):
model = _get_model(dereify)
out = load_(source=source, model=model)
if remove_wiki:
for i in range(len(out)):
out[i] = _remove_wiki(out[i])
return out
def loads(string, dereify=None, remove_wiki=False):
model = _get_model(dereify)
out = loads_(string=string, model=model)
if remove_wiki:
for i in range(len(out)):
out[i] = _remove_wiki(out[i])
return out
def encode(g, top=None, indent=-1, compact=False):
model = amr_model
return encode_(g=g, top=top, indent=indent, compact=compact, model=model)
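# Round-trip sketch ('corpus.amr' is a hypothetical file): load() picks the
# dereifying model unless dereify=False, while encode() always serializes with
# the official AMR model.
#
#   graphs = load('corpus.amr', dereify=True, remove_wiki=True)
#   text = encode(graphs[0])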
================================================
FILE: hanlp/components/amr/amrbart/preprocess/read_and_process.py
================================================
# coding:utf-8
# MIT License
#
# Copyright (c) 2022 xfbai
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import re
import copy
import json
import yaml
import penman
from tqdm import tqdm
from pathlib import Path
from hanlp.components.amr.amrbart.preprocess.amr_io import read_raw_amr_data
def _tokenize_encoded_graph(encoded):
linearized = re.sub(r"(\".+?\")", r" \1 ", encoded)
pieces = []
for piece in linearized.split():
if piece.startswith('"') and piece.endswith('"'):
pieces.append(piece)
else:
piece = piece.replace("(", " ( ")
piece = piece.replace(")", " ) ")
piece = piece.replace(":", " :")
piece = piece.replace("/", " / ")
piece = piece.strip()
pieces.append(piece)
linearized = re.sub(r"\s+", " ", " ".join(pieces)).strip()
return linearized.split(" ")
def dfs_linearize(graph, remove_pars=False, use_pointer_tokens=True):
graph_ = copy.deepcopy(graph)
graph_.metadata = {}
linearized = penman.encode(graph_)
linearized_nodes = _tokenize_encoded_graph(linearized)
if use_pointer_tokens:
remap = {}
for i in range(1, len(linearized_nodes)):
nxt = linearized_nodes[i]
lst = linearized_nodes[i - 1]
if nxt == "/":
                remap[lst] = f"<pointer:{len(remap)}>"
i = 1
linearized_nodes_ = [linearized_nodes[0]]
while i < (len(linearized_nodes)):
nxt = linearized_nodes[i]
lst = linearized_nodes_[-1]
if nxt in remap:
if lst == "(" and linearized_nodes[i + 1] == "/":
nxt = remap[nxt]
i += 1
elif lst.startswith(":"):
nxt = remap[nxt]
linearized_nodes_.append(nxt)
i += 1
linearized_nodes = linearized_nodes_
if remove_pars:
linearized_nodes = [n for n in linearized_nodes if n != "("]
return linearized_nodes
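# Example (editor's note, not in the original source): a hypothetical toy graph
# linearized with the default pointer tokens.
# >>> dfs_linearize(penman.decode('(b / bark-01 :ARG0 (d / dog))'))
# ['(', '<pointer:0>', 'bark-01', ':ARG0', '(', '<pointer:1>', 'dog', ')', ')']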
def main():
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
parser = ArgumentParser(
description="AMR processing script",
formatter_class=ArgumentDefaultsHelpFormatter,
)
parser.add_argument('--config', type=Path, default='default.yaml',
help='Use the following config for hparams.')
parser.add_argument('--input_file', type=str,
help='The input AMR file.')
parser.add_argument('--output_prefix', type=str,
help='The output_prefix.')
args, unknown = parser.parse_known_args()
with args.config.open() as y:
config = yaml.load(y, Loader=yaml.FullLoader)
remove_pars = False
use_pointer_tokens = True
graphs = read_raw_amr_data(
[args.input_file],
use_recategorization=config["use_recategorization"],
remove_wiki=config["remove_wiki"],
dereify=config["dereify"],
)
line_amr, sentences = [], []
for g in tqdm(graphs):
lin_tokens = dfs_linearize(g)
sentences.append(g.metadata["snt"])
# line_amr.append(" ".join(lin_tokens[1:-1]))
line_amr.append(" ".join(lin_tokens))
print(f"all {len(line_amr)} AMRs processed")
with open(args.output_prefix + ".amr", "w", encoding="utf-8") as fout:
fout.write("\n".join(line_amr) + "\n")
with open(args.output_prefix + ".txt", "w", encoding="utf-8") as fout:
fout.write("\n".join(sentences) + "\n")
res_out = [json.dumps({"sent": sent, "amr": lamr}) for lamr, sent in zip(line_amr, sentences)]
with open(args.output_prefix + ".jsonl", "w", encoding="utf-8") as fout:
fout.write("\n".join(res_out) + "\n")
if __name__ == '__main__':
main()
================================================
FILE: hanlp/components/amr/seq2seq/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-04-27 19:24
================================================
FILE: hanlp/components/amr/seq2seq/dataset/IO.py
================================================
import glob
from typing import List, Union, Iterable
from pathlib import Path
from .penman import pm_load as pm_load
def read_raw_amr_data(
paths: List[Union[str, Path]],
use_recategorization=False,
dereify=True,
remove_wiki=False,
):
assert paths
if not isinstance(paths, Iterable):
paths = [paths]
graphs = []
for path_ in paths:
for path in glob.glob(str(path_)):
path = Path(path)
            assert path.exists(), f'{path} does not exist'
graphs.extend(pm_load(path, dereify=dereify, remove_wiki=remove_wiki))
assert graphs, 'No graphs loaded'
if use_recategorization:
for g in graphs:
metadata = g.metadata
metadata['snt_orig'] = metadata['snt']
tokens = eval(metadata['tokens'])
metadata['snt'] = ' '.join(
[t for t in tokens if not ((t.startswith('-L') or t.startswith('-R')) and t.endswith('-'))])
return graphs
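# Usage sketch (editor's addition, not part of the original file); the glob
# pattern is a hypothetical example.
# >>> graphs = read_raw_amr_data(['data/amr/*.txt'], remove_wiki=True)
# >>> graphs[0].metadata['snt']  # sentence paired with the first graph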
================================================
FILE: hanlp/components/amr/seq2seq/dataset/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-04-27 19:29
================================================
FILE: hanlp/components/amr/seq2seq/dataset/dataset.py
================================================
from collections import Counter
from typing import Union, List, Callable, Tuple
import torch
import penman
from penman import Graph
from hanlp.common.dataset import TransformableDataset
from hanlp.components.amr.seq2seq.dataset.IO import read_raw_amr_data
from hanlp.components.amr.seq2seq.dataset.penman import role_is_reverted
from hanlp.components.amr.seq2seq.dataset.tokenization_bart import PENMANBartTokenizer
from phrasetree.tree import Tree
import json
from hanlp_common.constant import BOS, EOS, ROOT
from hanlp_common.io import load_pickle
class AMRDataset(TransformableDataset):
def __init__(self,
data: Union[str, List],
use_recategorization=False,
remove_wiki=False,
dereify=False,
transform: Union[Callable, List] = None,
cache=None,
generate_idx=None) -> None:
self.dereify = dereify
self.remove_wiki = remove_wiki
self.use_recategorization = use_recategorization
super().__init__(data, transform, cache, generate_idx)
def load_file(self, filepath: str):
graphs = read_raw_amr_data([filepath], self.use_recategorization, remove_wiki=self.remove_wiki,
dereify=self.dereify)
for g in graphs:
yield {'amr': g}
def get_roles(self):
roles = Counter()
for sample in self.data:
g: Graph = sample['amr']
for s, r, t in g.triples:
if role_is_reverted(r):
r = r[:-3]
roles[r] += 1
return roles
def get_frames(self):
frames = Counter()
for sample in self.data:
g: Graph = sample['amr']
for i in g.instances():
t = i.target
cells = t.split('-')
if len(cells) == 2 and len(cells[1]) == 2 and cells[1].isdigit():
frames[t] += 1
return frames
class AMRPickleDataset(AMRDataset):
def load_file(self, filepath: str):
items = torch.load(filepath)
for each in items:
each['amr'] = penman.decode(each['amr'])
yield each
def dfs_linearize_tokenize(sample: dict, tokenizer: PENMANBartTokenizer, remove_space=False, text_key='snt') -> dict:
amr = sample.get('amr', None)
if amr:
l, e = tokenizer.linearize(amr)
sample['graph_tokens'] = e['linearized_graphs']
sample['graph_token_ids'] = l
text = amr.metadata[text_key]
else:
text = sample['text']
if remove_space:
text = ''.join(text.split())
sample['text'] = text
sample['text_token_ids'] = tokenizer.encode(text)
return sample
def dfs_linearize_levi(sample: dict, tokenizer: PENMANBartTokenizer, remove_space=False) -> dict:
amr = sample.get('amr', None)
if amr:
l, e = tokenizer.linearize(amr)
sample['graph_tokens'] = e['linearized_graphs']
sample['graph_token_ids'] = l
tok = json.loads(amr.metadata['tok'])
dep = json.loads(amr.metadata['dep'])
levi = dep_to_levi(tok, dep)
sample['text'] = ' '.join(levi)
# ids = sum(tokenizer.batch_encode_plus([' ' + x for x in levi], add_special_tokens=False).input_ids, [])
ids = []
idx = 0
for t in levi:
if t in ('(', ')'):
ids.append(tokenizer.convert_tokens_to_ids(tokenizer.INIT + t))
else:
if idx % 2:
ids.extend(tokenizer.encode(t, add_special_tokens=False))
else:
ids.append(tokenizer.convert_tokens_to_ids(tokenizer.INIT + t))
idx += 1
sample['text_token_ids'] = [tokenizer.bos_token_id] + ids + [tokenizer.eos_token_id]
return sample
def dfs_linearize_rgcn(sample: dict, tokenizer: PENMANBartTokenizer) -> dict:
amr = sample.get('amr', None)
if amr:
l, e = tokenizer.linearize(amr)
sample['graph_tokens'] = e['linearized_graphs']
sample['graph_token_ids'] = l
tok = sample['tok']
sample['text'] = [tokenizer.cls_token] + [' ' + x for x in tok]
arc_scores = sample['dep']['scores']['arc_scores']
rel_scores = sample['dep']['scores']['rel_scores']
dep_graph = arc_scores[:, :, None] * rel_scores
root = torch.zeros((1,) + dep_graph.shape[1:])
sample['dep_graph'] = torch.cat([root, dep_graph], dim=0)
return sample
def dfs_linearize_constituency(sample: dict, tokenizer: PENMANBartTokenizer, remove_space=False) -> dict:
amr = sample.get('amr', None)
if amr:
l, e = tokenizer.linearize(amr)
sample['graph_tokens'] = e['linearized_graphs']
sample['graph_token_ids'] = l
tree = Tree.from_list(json.loads(sample['amr'].metadata['con_list']))
for each in tree.subtrees(lambda x: x.height() == 2):
        # NOTE (editorial): the placeholder strings below were destroyed by
        # angle-bracket stripping during extraction; '<LBR>'/'<RBR>' are
        # assumed stand-ins, mirrored in restore_bracket further down.
        if each[0] == '(':
            each[0] = '<LBR>'
        elif each[0] == ')':
            each[0] = '<RBR>'
text = tree.pformat(margin=10e7)
tokens = []
buffer = []
for c in text:
if c == '(' or c == ')':
tokens.append(''.join(buffer))
tokens.append(c)
buffer.clear()
continue
buffer.append(c)
if buffer:
tokens.append(''.join(buffer))
tokens = [x.strip() for x in tokens]
tokens = [x for x in tokens if x]
    restore_bracket = {'<LBR>': '(', '<RBR>': ')'}  # assumed stand-ins (see note above)
tokens = [restore_bracket.get(x, x) for x in tokens]
ids = []
for each in tokens:
pairs = each.split(' ', 1)
if len(pairs) == 2:
con, token = pairs
ids.append(tokenizer.convert_tokens_to_ids(tokenizer.INIT + con))
ids.extend(tokenizer.encode(token, add_special_tokens=False))
else:
ids.append(tokenizer.convert_tokens_to_ids(tokenizer.INIT + each))
if remove_space:
text = ''.join(text.split())
sample['text'] = text
sample['text_token_ids'] = [tokenizer.bos_token_id] + ids + [tokenizer.eos_token_id]
return sample
def dfs_linearize_tokenize_with_linguistic_structures(sample: dict, tokenizer: PENMANBartTokenizer,
remove_space=False,
text_key='snt') -> dict:
amr = sample.get('amr', None)
if amr:
l, e = tokenizer.linearize(amr)
sample['graph_tokens'] = e['linearized_graphs']
sample['graph_token_ids'] = l
text = amr.metadata[text_key]
if remove_space:
text = ''.join(text.split())
sample['text'] = text
tok = json.loads(amr.metadata['tok'])
text_token_ids = tokenizer.batch_encode_plus(tok, add_special_tokens=False).input_ids
sample['text_token_ids'] = [tokenizer.bos_token_id] + sum(text_token_ids, []) + [tokenizer.eos_token_id]
pos = amr.metadata.get('pos', None)
if pos:
flat_pos = []
pos = json.loads(pos)
for subtokens, tag in zip(text_token_ids, pos):
flat_pos.extend([tag] * len(subtokens))
sample['pos'] = [BOS] + flat_pos + [EOS]
ner = amr.metadata.get('ner', None)
if ner is not None:
flat_ner = []
ner_spans = json.loads(ner)
ner = ['O'] * len(text_token_ids)
for form, tag, start, end in ner_spans:
ner[start:end] = [tag] * (end - start)
for subtokens, tag in zip(text_token_ids, ner):
flat_ner.extend([tag] * len(subtokens))
sample['ner'] = [BOS] + flat_ner + [EOS]
dep = amr.metadata.get('dep', None)
if dep:
token_to_1st_subtoken = [0]
num_subtokens = 1 # 1 for BOS
for subtokens in text_token_ids:
token_to_1st_subtoken.append(num_subtokens)
num_subtokens += len(subtokens)
flat_arc, flat_rel = [0], [BOS]
dep = json.loads(dep)
for subtokens, (arc, rel) in zip(text_token_ids, dep):
flat_arc.extend([token_to_1st_subtoken[arc]] * len(subtokens))
flat_rel.extend([rel] * len(subtokens))
sample['dep_arc'] = flat_arc + [0]
sample['dep_rel'] = flat_rel + [EOS]
return sample
def dep_to_levi(tok: List[str], dep: List[Tuple[int, str]]):
root = [i for i, x in enumerate(dep) if x[0] == 0][0]
seq = []
dfs(tok, dep, root, seq)
return seq
def dfs(tok: List[str], dep: List[Tuple[int, str]], s, seq):
seq.append(dep[s][1])
seq.append(tok[s])
children = [i for i, x in enumerate(dep) if x[0] == s + 1]
if children:
seq.append('(')
for child in children:
dfs(tok, dep, child, seq)
seq.append(')')
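# Worked example (editor's note, not in the original source): heads are
# 1-indexed and 0 marks the root.
# >>> dep_to_levi(['The', 'dog', 'barks'], [(2, 'det'), (3, 'nsubj'), (0, 'root')])
# ['root', 'barks', '(', 'nsubj', 'dog', '(', 'det', 'The', ')', ')']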
================================================
FILE: hanlp/components/amr/seq2seq/dataset/linearization.py
================================================
import abc
import itertools
from collections import deque, defaultdict
import re
from typing import List, Optional, Dict, Any, Set, TypeVar
from dataclasses import dataclass
import networkx as nx
import penman
@dataclass
class SemanticGraph:
nodes_var: List[str]
"""
List of linearized nodes, with special tokens.
"""
edges: Optional[List[str]]
"""
List of linearized edges, with special tokens.
"""
backreferences: List[int]
"""
    List of backpointers to handle reentrancies and cycles.
"""
var2instance: Dict[str, str]
"""
Dict from var ids to 'lemmatized' readable strings qualifying the node (collapsing the :instance edge for AMR).
"""
extra: Dict[str, Any]
"""
Holds extra stuff that might be useful, e.g. alignments, NER, EL.
"""
# @cached_property
@property
def variables(self) -> Set[str]:
"""Set of variables in this semantic graph"""
variables = {v for v in self.nodes_var if not v.startswith('<')}
return variables
@property
def resolved_nodes_var(self) -> List[str]:
return [self.nodes_var[b] for b in self.backreferences]
# @cached_property
@property
def nodes(self) -> List[str]:
"""Linearized nodes with varids replaced by instances"""
return [self.var2instance.get(node, node) for node in self.nodes_var]
@property
def resolved_nodes(self) -> List[str]:
return [self.nodes[b] for b in self.backreferences]
def src_occurrence(self, var: str) -> int:
pass
class BaseLinearizer(metaclass=abc.ABCMeta):
@abc.abstractmethod
def linearize(self, *args, **kwargs) -> SemanticGraph:
pass
class AMRTokens:
START, END = '<', '>'
_TEMPL = START + '{}' + END
BOS_N = _TEMPL.format('s')
EOS_N = _TEMPL.format('/s')
START_N = _TEMPL.format('start')
STOP_N = _TEMPL.format('stop')
PNTR_N = _TEMPL.format('pointer')
LIT_START = _TEMPL.format('lit')
LIT_END = _TEMPL.format('/lit')
BACKR_SRC_N = _TEMPL.format('backr:src:XXX')
BACKR_TRG_N = _TEMPL.format('backr:trg:XXX')
BOS_E = _TEMPL.format('s')
EOS_E = _TEMPL.format('/s')
START_E = _TEMPL.format('start')
STOP_E = _TEMPL.format('stop')
_FIXED_SPECIAL_TOKENS_N = {
BOS_N, EOS_N, START_N, STOP_N}
_FIXED_SPECIAL_TOKENS_E = {
BOS_E, EOS_E, START_E, STOP_E}
_FIXED_SPECIAL_TOKENS = _FIXED_SPECIAL_TOKENS_N | _FIXED_SPECIAL_TOKENS_E
# match and read backreferences
_re_BACKR_SRC_N = re.compile(BACKR_SRC_N.replace('XXX', r'([0-9]+)'))
_re_BACKR_TRG_N = re.compile(BACKR_TRG_N.replace('XXX', r'([0-9]+)'))
@classmethod
def is_node(cls, string: str) -> bool:
if isinstance(string, str) and string.startswith(':'):
return False
elif string in cls._FIXED_SPECIAL_TOKENS_E:
return False
return True
@classmethod
def read_backr(cls, string: str) -> Optional:
m_src = cls._re_BACKR_SRC_N.search(string)
if m_src is not None:
return m_src
m_trg = cls._re_BACKR_TRG_N.search(string)
if m_trg is not None:
return m_trg
return None
T = TypeVar('T')
def index_default(
item: T, list_: List[T],
start: Optional[int] = None,
stop: Optional[int] = None,
default: Optional[int] = None
):
if start is None:
start = 0
if stop is None:
stop = len(list_)
return next((i for i, x in enumerate(list_[start:stop], start=start) if x == item), default)
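# Example (editor's note): first index of the item in the slice, or the default.
# >>> index_default('x', ['a', 'x', 'b'], default=-1)
# 1
# >>> index_default('z', ['a', 'x', 'b'], default=-1)
# -1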
class AMRLinearizer(BaseLinearizer):
def __init__(
self,
use_pointer_tokens: bool = True,
collapse_name_ops: bool = False,
):
self.collapse_name_ops = collapse_name_ops
self.interleave_edges = False
self.use_pointer_tokens = use_pointer_tokens
def _collapse_name_ops(self, amr):
# identify name triples
name_vars = {}
for i, (v1, rel, v2) in enumerate(amr.triples):
if rel == ':instance' and v2 == 'name':
name_vars[v1] = 1
# check if they have ops
name_vars_to_ops = defaultdict(list)
for i, (v1, rel, v2) in enumerate(amr.triples):
if v1 in name_vars and rel.startswith(':op'):
name_vars_to_ops[v1].append((i, rel, v2.strip('"')))
triples = amr.triples.copy()
for nv, ops in name_vars_to_ops.items():
ops = sorted(ops, key=lambda x: int(x[1][3:]))
idx, _, lits = zip(*ops)
for i in idx:
triples[i] = None
lit = '"' + '_'.join(lits) + '"'
triples[min(idx)] = penman.Triple(nv, ':op1', lit)
triples = [t for t in triples if t is not None]
amr_ = penman.Graph(triples)
amr_.metadata = amr.metadata
return amr_
def linearize(self, amr: penman.Graph) -> SemanticGraph:
if self.collapse_name_ops:
amr = self._collapse_name_ops(amr)
linearized = self._linearize(amr)
linearized = self._interleave(linearized)
if self.use_pointer_tokens:
linearized = self._add_pointer_tokens(linearized)
return linearized
def _linearize(self, amr: penman.Graph) -> SemanticGraph:
variables = set(amr.variables())
variables = {'var:' + v for v in variables}
var2instance = {}
graph = nx.MultiDiGraph()
triples2order = {k: i for i, k in enumerate(amr.triples)}
for triple in amr.triples:
var, rel, instance = triple
order = triples2order[triple]
if rel != ':instance':
continue
            for expansion_candidate in itertools.chain(range(order - 1, -1, -1), range(order + 1, len(amr.triples))):
if var == amr.triples[expansion_candidate][2]:
expansion = expansion_candidate
break
else:
expansion = 0
var = 'var:' + var
var2instance[var] = instance
graph.add_node(var, instance=instance, order=order, expansion=expansion)
for triple in amr.edges():
var1, rel, var2 = triple
order = triples2order[triple]
if rel == ':instance':
continue
var1 = 'var:' + var1
var2 = 'var:' + var2
graph.add_edge(var1, var2, rel=rel, order=order)
for triple in amr.attributes():
var, rel, attr = triple
order = triples2order[triple]
if rel == ':instance':
continue
var = 'var:' + var
graph.add_edge(var, attr, rel=rel, order=order)
# nodes that are not reachable from the root (e.g. because of reification)
# will be present in the not_explored queue
# undirected_graph = graph.to_undirected()
# print(amr.variables())
not_explored = deque(sorted(variables, key=lambda x: nx.get_node_attributes(graph, 'order')[x]))
# (
# len(nx.shortest_path(undirected_graph, 'var:' + amr.top, x)),
# -graph.out_degree(x),
# )
first_index = {}
explored = set()
added_to_queue = set()
nodes_visit = [AMRTokens.BOS_N]
edges_visit = [AMRTokens.BOS_E]
backreferences = [0]
queue = deque()
queue.append('var:' + amr.top)
while queue or not_explored:
if queue:
node1 = queue.popleft()
else:
node1 = not_explored.popleft()
if node1 in added_to_queue:
continue
if not list(graph.successors(node1)):
continue
if node1 in variables:
if node1 in explored:
continue
if node1 in first_index:
nodes_visit.append(AMRTokens.BACKR_TRG_N)
backreferences.append(first_index[node1])
else:
backreferences.append(len(nodes_visit))
first_index[node1] = len(nodes_visit)
nodes_visit.append(node1)
edges_visit.append(AMRTokens.START_E)
successors = []
for node2 in graph.successors(node1):
for edge_data in graph.get_edge_data(node1, node2).values():
rel = edge_data['rel']
order = edge_data['order']
successors.append((order, rel, node2))
successors = sorted(successors)
for order, rel, node2 in successors:
edges_visit.append(rel)
# node2 is a variable
if node2 in variables:
# ... which was mentioned before
if node2 in first_index:
nodes_visit.append(AMRTokens.BACKR_TRG_N)
backreferences.append(first_index[node2])
# .. which is mentioned for the first time
else:
backreferences.append(len(nodes_visit))
first_index[node2] = len(nodes_visit)
nodes_visit.append(node2)
# 1) not already in Q
# 2) has children
# 3) the edge right before its expansion has been encountered
if (node2 not in added_to_queue) and list(graph.successors(node2)) and (
nx.get_node_attributes(graph, 'expansion')[node2] <= order):
queue.append(node2)
added_to_queue.add(node2)
# node2 is a constant
else:
backreferences.append(len(nodes_visit))
nodes_visit.append(node2)
backreferences.append(len(nodes_visit))
nodes_visit.append(AMRTokens.STOP_N)
edges_visit.append(AMRTokens.STOP_E)
explored.add(node1)
else:
backreferences.append(len(nodes_visit))
nodes_visit.append(node1)
explored.add(node1)
backreferences.append(len(nodes_visit))
nodes_visit.append(AMRTokens.EOS_N)
edges_visit.append(AMRTokens.EOS_E)
assert len(nodes_visit) == len(edges_visit) == len(backreferences)
return SemanticGraph(
nodes_visit,
edges_visit,
backreferences,
var2instance,
extra={'graph': graph, 'amr': amr}
)
def _interleave(self, graph: SemanticGraph) -> SemanticGraph:
new_backreferences_map = []
new_nodes = []
new_edges = None
new_backreferences = []
        # isolate the sublist up to the stop token
start_i = 1
end_i = index_default(AMRTokens.STOP_N, graph.nodes_var, start_i, -1, -1)
def add_node(node, backr=None):
old_n_node = len(new_backreferences_map)
new_n_node = len(new_nodes)
if backr is None:
backr = old_n_node
new_backreferences_map.append(new_n_node)
new_nodes.append(node)
if old_n_node == backr:
new_backreferences.append(new_n_node)
else:
new_backreferences.append(new_backreferences_map[backr])
def add_edge(edge):
new_nodes.append(edge)
new_backreferences.append(len(new_backreferences))
add_node(AMRTokens.BOS_N)
while end_i > -1:
# src node
add_node(graph.nodes_var[start_i], graph.backreferences[start_i])
# edges and trg nodes, interleaved
nodes = graph.nodes_var[start_i + 1:end_i]
edges = graph.edges[start_i + 1:end_i]
backr = graph.backreferences[start_i + 1:end_i]
for n, e, b in zip(nodes, edges, backr):
add_edge(e)
add_node(n, b)
# stop
add_node(graph.nodes_var[end_i], graph.backreferences[end_i])
start_i = end_i + 1
end_i = index_default(AMRTokens.STOP_N, graph.nodes_var, start_i, -1, -1)
add_node(AMRTokens.EOS_N)
new_graph = SemanticGraph(
new_nodes,
None,
new_backreferences,
graph.var2instance,
extra=graph.extra,
)
return new_graph
def _add_pointer_tokens(self, graph: SemanticGraph) -> SemanticGraph:
new_nodes = []
var2pointer = {}
for node, backr in zip(graph.nodes_var, graph.backreferences):
if node == AMRTokens.BACKR_TRG_N:
node = graph.nodes_var[backr]
pointer = var2pointer[node]
new_nodes.append(pointer)
elif node in graph.var2instance:
                pointer = var2pointer.setdefault(node, f'<pointer:{len(var2pointer)}>')
new_nodes.append(pointer)
new_nodes.append(node)
else:
new_nodes.append(node)
new_backreferences = list(range(len(new_nodes)))
new_graph = SemanticGraph(
new_nodes,
None,
new_backreferences,
graph.var2instance,
extra=graph.extra,
)
return new_graph
================================================
FILE: hanlp/components/amr/seq2seq/dataset/penman.py
================================================
from typing import List
from penman import load as load_, Graph, Triple
from penman import loads as loads_
from penman import encode as encode_
from penman.model import Model
from penman.models.noop import NoOpModel
from penman.models import amr
import penman
import logging
op_model = Model()
noop_model = NoOpModel()
amr_model = amr.model
DEFAULT = op_model
# Mute loggers
penman.layout.logger.setLevel(logging.CRITICAL)
penman._parse.logger.setLevel(logging.CRITICAL)
def _get_model(dereify):
if dereify is None:
return DEFAULT
elif dereify:
return op_model
else:
return noop_model
def _remove_wiki(graph):
metadata = graph.metadata
triples = []
for t in graph.triples:
v1, rel, v2 = t
if rel == ':wiki':
t = Triple(v1, rel, '+')
triples.append(t)
graph = Graph(triples)
graph.metadata = metadata
return graph
def pm_load(source, dereify=None, remove_wiki=False) -> List[penman.Graph]:
"""
Args:
source:
dereify: Restore reverted relations
remove_wiki:
Returns:
"""
model = _get_model(dereify)
out = load_(source=source, model=model)
if remove_wiki:
for i in range(len(out)):
out[i] = _remove_wiki(out[i])
return out
def loads(string, dereify=None, remove_wiki=False):
model = _get_model(dereify)
out = loads_(string=string, model=model)
if remove_wiki:
for i in range(len(out)):
out[i] = _remove_wiki(out[i])
return out
def pm_encode(g, top=None, indent=-1, compact=False):
model = amr_model
return encode_(g=g, top=top, indent=indent, compact=compact, model=model)
def role_is_reverted(role: str):
if role.endswith('consist-of'):
return False
return role.endswith('-of')
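# Examples (editor's note): ':consist-of' is a canonical role, not an inverse.
# >>> role_is_reverted(':ARG0-of'), role_is_reverted(':consist-of')
# (True, False)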
class AMRGraph(penman.Graph):
def __str__(self):
return penman.encode(self)
================================================
FILE: hanlp/components/amr/seq2seq/dataset/postprocessing.py
================================================
from collections import defaultdict, Counter
import enum
import re
import networkx as nx
import penman
from hanlp.components.amr.seq2seq.dataset.penman import pm_encode
BACKOFF = penman.Graph([
penman.Triple('d2', ':instance', 'dog'),
penman.Triple('b1', ':instance', 'bark-01'),
penman.Triple('b1', ':ARG0', 'd2'), ])
def token_processing(tok):
if tok is None:
return None
elif tok.isdigit():
try:
return eval(tok)
except:
return tok
elif tok.startswith('"') and (not tok.endswith('"')):
return tok + '"'
elif tok.endswith('"') and (not tok.startswith('"')):
return '"' + tok
else:
return tok
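# Examples (editor's note): unbalanced quotes get closed, digit strings become
# numbers.
# >>> token_processing('"Barack_Obama'), token_processing('42')
# ('"Barack_Obama"', 42)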
def decode_into_node_and_backreferences(subtoken_ids, tokenizer):
rex_arg = re.compile(f"^{tokenizer.INIT}(op|snt|conj|prep)")
rex_spc = re.compile(r"<(s|/s|lit|/lit|stop|unk|pad|mask)>")
# get strings
subtokens = tokenizer.convert_ids_to_tokens(subtoken_ids)
# fix backreferences
subtoken_backreferences = [max(t - len(tokenizer), -1) for t in subtoken_ids]
# strip padding
    no_pad = [(s, b) for s, b in zip(subtokens, subtoken_backreferences) if s != (tokenizer.INIT + '<pad>')]
if no_pad:
subtokens, subtoken_backreferences = zip(*no_pad)
else:
subtokens, subtoken_backreferences = [''], [-1]
# subword collapse
tokens = []
backreferences = []
subword_to_token_map = {}
current_token_i = 0
for subw_i, (subw_backr, subtok) in enumerate(zip(subtoken_backreferences, subtokens)):
subword_to_token_map[subw_i] = current_token_i
# if empty you cannot do anything but add a new word
if not tokens:
tokens.append(subtok.lstrip(tokenizer.INIT))
backreferences.append(-1)
current_token_i += 1
        # backref can't be split
elif subw_backr > -1:
tokens.append(None)
backreferences.append(subword_to_token_map[subw_backr])
current_token_i += 1
# after a special token release
elif isinstance(tokens[-1], str) and rex_spc.match(tokens[-1]):
tokens.append(subtok.lstrip(tokenizer.INIT))
backreferences.append(-1)
current_token_i += 1
# after a subtoken ':' (which should be followed by the rest of the edge) ignore tokenizer.INIT
# TODO: this is an ugly patch due to the fact that BART tokenizer splits after ':'
elif (tokens[-1] == ':') and rex_arg.match(subtok):
tokens[-1] = tokens[-1] + subtok[1:]
# leading tokenizer.INIT
elif subtok.startswith(tokenizer.INIT):
tokens.append(subtok.lstrip(tokenizer.INIT))
backreferences.append(-1)
current_token_i += 1
# very ugly patch for some cases in which tokenizer.INIT is not in the following token to the edge
elif isinstance(tokens[-1], str) and tokens[-1].startswith(':') and tokens[-1][-1].isdigit() and (
subtok != '-of'):
tokens.append(subtok.lstrip(tokenizer.INIT))
backreferences.append(-1)
current_token_i += 1
# in any other case attach to the previous
else:
tokens[-1] = tokens[-1] + subtok
# strip INIT and fix byte-level
tokens = [tokenizer.convert_tokens_to_string(list(t)).lstrip() if isinstance(t, str) else t for t in tokens]
# tokens = [t.replace(tokenizer.INIT, '') if isinstance(t, str) else t for t in tokens]
# unks are substituted with thing
    tokens = [t if t != '<unk>' else 'thing' for t in tokens]
old_tokens = tokens
old_backreferences = backreferences
    # <lit> Barack Obama </lit> -> "Barack Obama"
tokens = []
backreferences = []
token_to_token_map = {}
start_search = 0
removed = 0
while True:
try:
            lit_start = old_tokens.index('<lit>', start_search)
token_addition = old_tokens[start_search:lit_start]
for i, t in enumerate(token_addition, start=start_search):
token_to_token_map[i] = i - removed
tokens += token_addition
backreferences_addition = [token_to_token_map[b] if b > -1 else -1 for b in
old_backreferences[start_search:lit_start]]
backreferences += backreferences_addition
lit_end = min(lit_start + 2, len(old_tokens) - 1)
while lit_end < len(old_tokens):
old_tok = old_tokens[lit_end]
if isinstance(old_tok, str) and (
                        (old_tok.startswith(':') and len(old_tok) > 3) or (old_tok == '<stop>')):
res_tok = old_tokens[lit_start + 1:lit_end]
for i in range(lit_start, lit_end):
token_to_token_map[i] = len(tokens)
# Remove possible wrong None
res = old_tokens[lit_start + 1:lit_end]
res = [str(r) for r in res if r is not None]
res = '"' + '_'.join(res) + '"'
removed += len(res_tok)
start_search = lit_end
tokens += [res, old_tok]
backreferences += [-1, -1]
break
                elif old_tok == '</lit>':
res_tok = old_tokens[lit_start + 1:lit_end]
for i in range(lit_start, lit_end + 1):
token_to_token_map[i] = len(tokens)
# Remove possible wrong None
res = old_tokens[lit_start + 1:lit_end]
res = [str(r) for r in res if r is not None]
res = '"' + '_'.join(res) + '"'
removed += len(res_tok) + 1
start_search = lit_end + 1
tokens.append(res)
backreferences.append(-1)
break
else:
lit_end += 1
start_search = lit_end
except ValueError:
token_addition = old_tokens[start_search:]
for i, t in enumerate(token_addition, start=start_search):
token_to_token_map[i] = i - removed
backreferences_addition = [token_to_token_map[b] if b > -1 else b for b in
old_backreferences[start_search:]]
tokens += token_addition
backreferences += backreferences_addition
break
tokens = [token_processing(t) for t in tokens]
shift = 1
    if len(tokens) > 1 and tokens[1] == '<s>':
shift = 2
tokens = tokens[shift:]
backreferences = [b if b == -1 else b - shift for b in backreferences[shift:]]
    if tokens and tokens[-1] == '</s>':
tokens.pop()
backreferences.pop()
return tokens, backreferences
def decode_into_node_and_backreferences_without_space(subtoken_ids, tokenizer):
rex_arg = re.compile(f"^{tokenizer.INIT}(op|snt|conj|prep)")
rex_spc = re.compile(r"<(s|/s|lit|/lit|stop|unk|pad|mask)>")
# get strings
subtokens = tokenizer.convert_ids_to_tokens(subtoken_ids)
# fix backreferences
subtoken_backreferences = [max(t - len(tokenizer), -1) for t in subtoken_ids]
# strip padding
    no_pad = [(s, b) for s, b in zip(subtokens, subtoken_backreferences) if s != (tokenizer.INIT + '<pad>')]
if no_pad:
subtokens, subtoken_backreferences = zip(*no_pad)
else:
subtokens, subtoken_backreferences = [''], [-1]
# subword collapse
tokens = []
backreferences = []
subword_to_token_map = {}
current_token_i = 0
prev_is_pointer = False
prev_is_rel = False
for subw_i, (subw_backr, subtok) in enumerate(zip(subtoken_backreferences, subtokens)):
subword_to_token_map[subw_i] = current_token_i
        is_pointer = subtok.startswith('<pointer:')
is_rel = subtok.startswith(':') and len(subtok) > 1
is_bracket = subtok in '()'
# if empty you cannot do anything but add a new word
if not tokens:
tokens.append(subtok)
backreferences.append(-1)
current_token_i += 1
        # backref can't be split
elif subw_backr > -1:
tokens.append(None)
backreferences.append(subword_to_token_map[subw_backr])
current_token_i += 1
# after a special token release
elif isinstance(tokens[-1], str) and rex_spc.match(tokens[-1]):
tokens.append(subtok)
backreferences.append(-1)
current_token_i += 1
# after a subtoken ':' (which should be followed by the rest of the edge) ignore tokenizer.INIT
# TODO: this is an ugly patch due to the fact that BART tokenizer splits after ':'
elif (tokens[-1] == ':') and rex_arg.match(subtok):
tokens[-1] = tokens[-1] + subtok[1:]
# current or prev is a control token
elif (is_pointer or is_rel or prev_is_pointer or prev_is_rel or is_bracket or subtok == ' ') \
and subtok != '-of':
tokens.append(subtok)
backreferences.append(-1)
current_token_i += 1
# very ugly patch for some cases in which tokenizer.INIT is not in the following token to the edge
elif isinstance(tokens[-1], str) and tokens[-1].startswith(':') and tokens[-1][-1].isdigit() and (
subtok != '-of'):
tokens.append(subtok)
backreferences.append(-1)
current_token_i += 1
# in any other case attach to the previous
else:
tokens[-1] = tokens[-1] + subtok
prev_is_pointer = is_pointer
prev_is_rel = is_rel
# strip INIT and fix byte-level
tokens = [tokenizer.convert_tokens_to_string(list(t)).lstrip() if isinstance(t, str) else t for t in tokens]
# tokens = [t.replace(tokenizer.INIT, '') if isinstance(t, str) else t for t in tokens]
# unks are substituted with thing
    tokens = [t if t != '<unk>' else 'thing' for t in tokens]
old_tokens = tokens
old_backreferences = backreferences
    # <lit> Barack Obama </lit> -> "Barack Obama"
tokens = []
backreferences = []
token_to_token_map = {}
start_search = 0
removed = 0
while True:
try:
            lit_start = old_tokens.index('<lit>', start_search)
token_addition = old_tokens[start_search:lit_start]
for i, t in enumerate(token_addition, start=start_search):
token_to_token_map[i] = i - removed
tokens += token_addition
backreferences_addition = [token_to_token_map[b] if b > -1 else -1 for b in
old_backreferences[start_search:lit_start]]
backreferences += backreferences_addition
lit_end = min(lit_start + 2, len(old_tokens) - 1)
while lit_end < len(old_tokens):
old_tok = old_tokens[lit_end]
if isinstance(old_tok, str) and (
                        (old_tok.startswith(':') and len(old_tok) > 3) or (old_tok == '<stop>')):
res_tok = old_tokens[lit_start + 1:lit_end]
for i in range(lit_start, lit_end):
token_to_token_map[i] = len(tokens)
# Remove possible wrong None
res = old_tokens[lit_start + 1:lit_end]
res = [str(r) for r in res if r is not None]
res = '"' + '_'.join(res) + '"'
removed += len(res_tok)
start_search = lit_end
tokens += [res, old_tok]
backreferences += [-1, -1]
break
                elif old_tok == '</lit>':
res_tok = old_tokens[lit_start + 1:lit_end]
for i in range(lit_start, lit_end + 1):
token_to_token_map[i] = len(tokens)
# Remove possible wrong None
res = old_tokens[lit_start + 1:lit_end]
res = [str(r) for r in res if r is not None]
res = '"' + '_'.join(res) + '"'
removed += len(res_tok) + 1
start_search = lit_end + 1
tokens.append(res)
backreferences.append(-1)
break
else:
lit_end += 1
start_search = lit_end
except ValueError:
token_addition = old_tokens[start_search:]
for i, t in enumerate(token_addition, start=start_search):
token_to_token_map[i] = i - removed
backreferences_addition = [token_to_token_map[b] if b > -1 else b for b in
old_backreferences[start_search:]]
tokens += token_addition
backreferences += backreferences_addition
break
tokens = [token_processing(t) for t in tokens]
shift = 0
    if len(tokens) > 1 and tokens[1] == '<s>':
shift = 1
tokens = tokens[shift:]
backreferences = [b if b == -1 else b - shift for b in backreferences[shift:]]
    if tokens and tokens[-1] == '</s>':
tokens.pop()
backreferences.pop()
return tokens, backreferences
def index_of(element, iterable, default=None, start=None, end=None):
if not callable(element):
def check(x):
return element == x
else:
check = element
if start is None:
start = 0
if end is None:
end = len(iterable)
item = start
while item < end:
if check(iterable[item]):
return item
item += 1
return default
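# Example (editor's note): `element` may also be a predicate.
# >>> index_of(lambda x: isinstance(x, str) and x.startswith(':'), ['z0', ':ARG0', 'z1'])
# 1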
def separate_edges_nodes(edges_nodes_slice, *other):
is_arg = lambda x: isinstance(x, str) and x.startswith(':')
start = 0
edges = []
nodes = []
l = len(edges_nodes_slice)
while start < l:
edge_index = index_of(
is_arg,
edges_nodes_slice,
start=start)
if edge_index is None or edge_index == (l - 1):
break
if is_arg(edges_nodes_slice[edge_index + 1]):
start = edge_index + 1
continue
edges.append(edge_index)
nodes.append(edge_index + 1)
start = edge_index + 2
ret = []
for oth in other:
edges_oth = [oth[i] for i in edges]
nodes_oth = [oth[i] for i in nodes]
ret.append((edges_oth, nodes_oth))
return ret
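# Example (editor's note): pairs each edge with the node that follows it and
# projects those positions onto the extra sequences.
# >>> seq = ['z0', ':ARG0', 'z1', ':mod', 'big']
# >>> separate_edges_nodes(seq, seq)
# [([':ARG0', ':mod'], ['z1', 'big'])]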
def _split_name_ops(graph):
# identify name triples
name_vars = {}
for i, (v1, rel, v2) in enumerate(graph.triples):
if rel == ':instance' and v2 == 'name':
name_vars[v1] = 1
# check if they have ops
name_vars_to_ops = defaultdict(list)
for i, (v1, rel, v2) in enumerate(graph.triples):
if v1 in name_vars and rel.startswith(':op'):
name_vars_to_ops[v1].append((i, rel, v2.strip('"')))
triples = graph.triples.copy()
for nv, ops in name_vars_to_ops.items():
ops = sorted(ops, key=lambda x: int(x[1][3:]))
idx, _, lits = zip(*ops)
for i in idx:
triples[i] = None
lits = ['"' + l + '"' for lit in lits for l in lit.split('_')]
tt = []
for i, l in enumerate(lits, start=1):
rel = ':op' + str(i)
tt.append(penman.Triple(nv, rel, l))
triples[min(idx)] = tt
triples = [t if isinstance(t, list) else [t] for t in triples if t is not None]
triples = [t for tt in triples for t in tt]
graph_ = penman.Graph(triples)
graph_.metadata = graph.metadata
return graph_
def _reconstruct_graph_from_nodes(nodes, backreferences):
triples = []
triples_added = set()
variable2index = {}
index2variable = {}
start_index = 0
cnt = defaultdict(Counter)
while start_index < len(nodes):
        stop_index = index_of('<stop>', nodes, default=len(nodes) + 1, start=start_index)
old_start_index = start_index
start_index = stop_index + 1
src_node, src_backr = nodes[old_start_index], backreferences[old_start_index]
        if src_node == '<s>':
continue
trg_nodes_edges = nodes[old_start_index:stop_index]
trg_nodes_edges_backr = backreferences[old_start_index:stop_index]
trg_nodes_edges_indices = list(range(old_start_index, stop_index))
if isinstance(src_node, str):
            if src_node in ('<s>', '</s>', '<stop>'):
continue
elif ('/' in src_node) or (':' in src_node) or ('(' in src_node) or (')' in src_node):
src_node = 'thing'
if src_node is not None:
src_node = str(src_node)
src_var = src_node[0].lower()
            if src_var not in 'abcdefghijklmnopqrstuvwxyz':
src_var = 'x'
# src_var = f'{src_var}_{len(variable2index)}'
src_var = f'{src_var}{len(variable2index)}'
src_var_i = old_start_index
variable2index[src_var] = src_var_i
index2variable[src_var_i] = src_var
triple = penman.Triple(src_var, ':instance', src_node)
if triple not in triples_added:
triples.append(triple)
triples_added.add(triple)
else:
if src_backr in index2variable:
src_var = index2variable[src_backr]
# more resilient logic here
(trg_edges, trg_nodes), (_, trg_nodes_backr), (_, trg_nodes_indices) = \
separate_edges_nodes(
trg_nodes_edges,
trg_nodes_edges,
trg_nodes_edges_backr,
trg_nodes_edges_indices)
for n, e, nb, ni in zip(trg_nodes, trg_edges, trg_nodes_backr, trg_nodes_indices):
if isinstance(n, str) and n.startswith(':'):
continue
if isinstance(n, str) and n.startswith('<') and n.endswith('>'):
continue
if e == ':li':
pass
elif len(e) < 4 or (not e.startswith(':')):
continue
# same edge more than once
num = cnt[src_var][e]
# num = 0
if num:
if e.startswith(':op') or e.startswith(':snt'):
continue
# elif e.startswith(':ARG'):
# continue
elif num > 3:
continue
if n is None:
if nb not in index2variable:
continue
trg_var = index2variable[nb]
trg = trg_var
elif e == ':mode':
trg = n
elif (not isinstance(n, str)) or re.match(r"^[+-]?\d+\.?\d*$", n) or (n == '-') or (n == '+'):
trg = str(n)
elif (n.startswith('"') and n.endswith('"') and len(n) > 2):
trg = '"' + n.replace('"', '') + '"'
elif ('/' in n) or (':' in n) or ('(' in n) or (')' in n) or ('=' in n):
trg = f'"{n}"'
elif n == '"':
continue
elif (n.startswith('"') and (not n.endswith('"'))) or (not n.startswith('"') and (n.endswith('"'))) or (
'"' in n):
trg = '"' + n.replace('"', '') + '"'
else:
trg_var = n[0].lower()
if trg_var not in 'abcdefghijklmnopqrstuvwxyz':
trg_var = 'x'
# trg_var = f'{trg_var}_{len(variable2index)}'
trg_var = f'{trg_var}{len(variable2index)}'
trg_var_i = ni
variable2index[trg_var] = trg_var_i
index2variable[trg_var_i] = trg_var
triple = penman.Triple(trg_var, ':instance', n)
if triple not in triples_added:
triples.append(triple)
triples_added.add(triple)
trg = trg_var
triple = penman.Triple(src_var, e, trg)
if triple not in triples_added:
triples.append(triple)
triples_added.add(triple)
cnt[src_var][e] += 1
return penman.Graph(triples)
def build_graph(nodes, backreferences, restore_name_ops=False):
graph = _reconstruct_graph_from_nodes(nodes, backreferences)
if restore_name_ops:
graph = _split_name_ops(graph)
return graph
class ParsedStatus(enum.Enum):
OK = 0
FIXED = 1
BACKOFF = 2
def connect_graph_if_not_connected(graph):
try:
encoded = pm_encode(graph)
return graph, ParsedStatus.OK
except:
pass
nxgraph = nx.MultiGraph()
variables = graph.variables()
for v1, _, v2 in graph.triples:
if v1 in variables and v2 in variables:
nxgraph.add_edge(v1, v2)
elif v1 in variables:
nxgraph.add_edge(v1, v1)
triples = graph.triples.copy()
new_triples = []
addition = f'a{len(variables) + 1}'
triples.append(penman.Triple(addition, ':instance', 'and'))
for i, conn_set in enumerate(nx.connected_components(nxgraph), start=1):
edge = f':op{i}'
conn_set = sorted(conn_set, key=lambda x: int(x[1:]))
conn_set = [c for c in conn_set if c in variables]
node = conn_set[0]
new_triples.append(penman.Triple(addition, edge, node))
triples = new_triples + triples
metadata = graph.metadata
graph = penman.Graph(triples)
graph.metadata.update(metadata)
pm_encode(graph)
return graph, ParsedStatus.FIXED
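# Example (editor's note, variables named z* as produced upstream): two
# disconnected components get joined under a fresh 'and' node.
# >>> g = penman.Graph([penman.Triple('z0', ':instance', 'dog'),
# ...                   penman.Triple('z1', ':instance', 'cat')])
# >>> _, status = connect_graph_if_not_connected(g)
# >>> status
# <ParsedStatus.FIXED: 1>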
def restore_backreferences_from_pointers(nodes):
new_nodes, new_backreferences = [], []
prev_pointer = None
pointer2i = {}
for n in nodes:
        is_pointer = isinstance(n, str) and n.startswith('<pointer:')
if not is_pointer:
if prev_pointer is not None:
if prev_pointer in pointer2i:
new_nodes.append(None)
new_backreferences.append(pointer2i[prev_pointer])
new_nodes.append(n)
new_backreferences.append(-1)
else:
pointer2i[prev_pointer] = len(new_nodes)
new_nodes.append(n)
new_backreferences.append(-1)
else:
new_nodes.append(n)
new_backreferences.append(-1)
prev_pointer = None
else:
prev_pointer = n
return new_nodes, new_backreferences
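# Example (editor's note): a repeated pointer becomes a None node plus a
# backreference to the first occurrence.
# >>> restore_backreferences_from_pointers(
# ...     ['(', '<pointer:0>', 'dog', ':ARG0-of', '<pointer:0>', ')'])
# (['(', 'dog', ':ARG0-of', None, ')'], [-1, -1, -1, 1, -1])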
================================================
FILE: hanlp/components/amr/seq2seq/dataset/tokenization_bart.py
================================================
import copy
import sys
from typing import Set, Iterable
import penman
import regex as re
import torch
from transformers import BartTokenizer
from . import postprocessing
from .linearization import AMRTokens, AMRLinearizer
from .penman import pm_encode
class AMRBartTokenizer(BartTokenizer):
ADDITIONAL = [
AMRTokens.PNTR_N,
AMRTokens.STOP_N,
AMRTokens.LIT_START,
AMRTokens.LIT_END,
AMRTokens.BACKR_SRC_N,
AMRTokens.BACKR_TRG_N, ]
def __init__(self, *args, use_pointer_tokens=False, collapse_name_ops=False, INIT='Ġ', **kwargs):
super().__init__(*args, **kwargs)
self.INIT = INIT
self.patterns = re.compile(
r""" ?<[a-z]+:?\d*>| ?:[^\s]+|'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
self.linearizer = AMRLinearizer(use_pointer_tokens=use_pointer_tokens, collapse_name_ops=collapse_name_ops)
self.use_pointer_tokens = use_pointer_tokens
self.collapse_name_ops = collapse_name_ops
self.recategorizations = set()
self.modified = 0
@classmethod
def from_pretrained(cls, pretrained_model_path, additional_tokens: Iterable[str] = None,
recategorization_tokens: Iterable[str] = None,
*args, **kwargs):
inst = super().from_pretrained(pretrained_model_path, *args, **kwargs)
inst.init_amr_vocabulary(additions=additional_tokens, recategorization_tokens=recategorization_tokens)
return inst
def init_amr_vocabulary(self, additions: Set[str] = None, recategorization_tokens: Iterable[str] = None):
for tok in self.all_special_tokens:
ntok = self.INIT + tok
i = self.encoder[tok]
self.decoder[i] = ntok
del self.encoder[tok]
self.encoder[ntok] = i
tokens = []
if additions:
tokens.extend(additions)
if recategorization_tokens:
for tok in recategorization_tokens:
if not tok.startswith('_'):
self.recategorizations.add(tok)
tokens.append(tok)
if self.use_pointer_tokens:
for cnt in range(512):
tokens.append(f"")
tokens += self.ADDITIONAL
tokens = [self.INIT + t if t[0] not in ('_', '-') else t for t in tokens]
tokens = [t for t in tokens if t not in self.encoder]
self.old_enc_size = old_enc_size = len(self.encoder)
for i, t in enumerate(tokens, start=old_enc_size):
self.encoder[t] = i
self.encoder = {k: i for i, (k, v) in enumerate(sorted(self.encoder.items(), key=lambda x: x[1]))}
self.decoder = {v: k for k, v in sorted(self.encoder.items(), key=lambda x: x[1])}
self.modified = len(tokens)
self.bos_token = self.INIT + self.bos_token
self.pad_token = self.INIT + self.pad_token
self.eos_token = self.INIT + self.eos_token
self.unk_token = self.INIT + self.unk_token
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
output = [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
if token_ids_1 is None:
return output
return output + [self.eos_token_id] + token_ids_1 + [self.eos_token_id]
def _tokenize(self, text):
""" Tokenize a string. Modified in order to handle sentences with recategorization pointers"""
bpe_tokens = []
for tok_span in text.lstrip().split(' '):
tok_span = tok_span.strip()
recats = tok_span.rsplit('_', 1)
if len(recats) == 2 and recats[0] in self.recategorizations and ('_' + recats[1]) in self.encoder:
bpe_tokens.extend([self.INIT + recats[0], '_' + recats[1]])
else:
for token in re.findall(self.pat, ' ' + tok_span):
token = "".join(
self.byte_encoder[b] for b in token.encode("utf-8")
                )  # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
return bpe_tokens
def _tok_bpe(self, token, add_space=True):
# if add_space:
# token = ' ' + token.lstrip()
tokk = []
tok = token.strip()
recats = tok.rsplit('_', 1)
if len(recats) == 2 and recats[0] in self.recategorizations and ('_' + recats[1]) in self.encoder:
tokk.extend([self.INIT + recats[0], '_' + recats[1]])
else:
for tok in self.patterns.findall(' ' + token):
tok = "".join(
self.byte_encoder[b] for b in tok.encode("utf-8"))
toks = self.bpe(tok).split(' ')
tokk.extend(toks)
return tokk
def _get_nodes_and_backreferences(self, graph):
lin = self.linearizer.linearize(graph)
linearized_nodes, backreferences = lin.nodes, lin.backreferences
return linearized_nodes, backreferences
def tokenize_amr(self, graph):
linearized_nodes, backreferences = self._get_nodes_and_backreferences(graph)
bpe_tokens = []
bpe_backreferences = []
counter = 0
for i, (backr, tokk) in enumerate(zip(backreferences, linearized_nodes)):
is_in_enc = self.INIT + tokk in self.encoder
is_rel = tokk.startswith(':') and len(tokk) > 1
is_spc = tokk.startswith('<') and tokk.endswith('>')
is_of = tokk.startswith(':') and tokk.endswith('-of')
is_frame = re.match(r'.+-\d\d', tokk) is not None
if tokk.startswith('"') and tokk.endswith('"'):
tokk = tokk[1:-1].replace('_', ' ')
bpe_toks = [self.INIT + AMRTokens.LIT_START]
bpe_toks += self._tok_bpe(tokk, add_space=True)
bpe_toks.append(self.INIT + AMRTokens.LIT_END)
elif (is_rel or is_spc or is_frame or is_of):
if is_in_enc:
bpe_toks = [self.INIT + tokk]
elif is_frame:
bpe_toks = self._tok_bpe(tokk[:-3], add_space=True) + [tokk[-3:]]
elif is_of:
rel = tokk[:-3]
if self.INIT + rel in self.encoder:
bpe_toks = [self.INIT + rel, '-of']
else:
bpe_toks = [self.INIT + ':'] + self._tok_bpe(rel[1:], add_space=True) + ['-of']
elif is_rel:
bpe_toks = [self.INIT + ':'] + self._tok_bpe(tokk[1:], add_space=True)
else:
raise
else:
if is_in_enc:
bpe_toks = [self.INIT + tokk]
else:
bpe_toks = self._tok_bpe(tokk, add_space=True)
bpe_tokens.append(bpe_toks)
if i == backr:
bpe_backr = list(range(counter, counter + len(bpe_toks)))
counter += len(bpe_toks)
bpe_backreferences.append(bpe_backr)
else:
bpe_backreferences.append(bpe_backreferences[backr][0:1])
counter += 1
bpe_tokens = [b for bb in bpe_tokens for b in bb]
bpe_token_ids = [self.encoder.get(b, self.unk_token_id) for b in bpe_tokens]
bpe_backreferences = [b for bb in bpe_backreferences for b in bb]
return bpe_tokens, bpe_token_ids, bpe_backreferences
def batch_encode_sentences(self, sentences, device=torch.device('cpu')):
sentences = [s for s in sentences]
extra = {'sentences': sentences}
batch = super().batch_encode_plus(sentences, return_tensors='pt', pad_to_max_length=True)
batch = {k: v.to(device) for k, v in batch.items()}
return batch, extra
def linearize(self, graph):
shift = len(self.encoder)
tokens, token_ids, backreferences = self.tokenize_amr(graph)
extra = {'linearized_graphs': tokens, 'graphs': graph}
token_uni_ids = \
[idx if i == b else b + shift for i, (idx, b) in enumerate(zip(token_ids, backreferences))]
        if tokens[-1] != (self.INIT + AMRTokens.EOS_N):
tokens.append(self.INIT + AMRTokens.EOS_N)
token_ids.append(self.eos_token_id)
token_uni_ids.append(self.eos_token_id)
backreferences.append(len(backreferences))
return token_uni_ids, extra
def batch_encode_graphs(self, graphs, device=torch.device('cpu')):
linearized, extras = zip(*[self.linearize(g) for g in graphs])
return self.batch_encode_graphs_from_linearized(linearized, extras, device=device)
def batch_encode_graphs_from_linearized(self, linearized, extras=None, device=torch.device('cpu')):
if extras is not None:
batch_extra = {'linearized_graphs': [], 'graphs': []}
for extra in extras:
batch_extra['graphs'].append(extra['graphs'])
batch_extra['linearized_graphs'].append(extra['linearized_graphs'])
else:
batch_extra = {}
maxlen = 0
batch = []
for token_uni_ids in linearized:
maxlen = max(len(token_uni_ids), maxlen)
batch.append(token_uni_ids)
batch = [x + [self.pad_token_id] * (maxlen - len(x)) for x in batch]
batch = torch.tensor(batch).to(device)
batch = {'decoder_input_ids': batch[:, :-1], 'lm_labels': batch[:, 1:]}
return batch, batch_extra
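    # Shape sketch (editor's note, not in the original source): with
    # linearized = [[0, 5, 6, 2], [0, 7, 2]] and pad_token_id == 1, the padded
    # batch is [[0, 5, 6, 2], [0, 7, 2, 1]]; decoder_input_ids drops the last
    # column and lm_labels drops the first, the usual one-token shift for
    # teacher forcing.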
def decode_amr(self, tokens, restore_name_ops=False):
try:
nodes, backreferences = postprocessing.decode_into_node_and_backreferences(tokens, self)
except Exception as e:
print('Decoding failure:', file=sys.stderr)
print(e, file=sys.stderr)
return postprocessing.BACKOFF, postprocessing.ParsedStatus.BACKOFF, (None, None)
if self.use_pointer_tokens:
nodes, backreferences = postprocessing.restore_backreferences_from_pointers(nodes)
try:
graph_ = graph = postprocessing.build_graph(nodes, backreferences, restore_name_ops=restore_name_ops)
except Exception as e:
print('Building failure:', file=sys.stderr)
print(nodes, file=sys.stderr)
print(backreferences, file=sys.stderr)
print(e, file=sys.stderr)
return postprocessing.BACKOFF, postprocessing.ParsedStatus.BACKOFF, (None, None)
try:
graph, status = postprocessing.connect_graph_if_not_connected(graph)
if status == postprocessing.ParsedStatus.BACKOFF:
                print('Reconnection 1 failure:', file=sys.stderr)
print(nodes, file=sys.stderr)
print(backreferences, file=sys.stderr)
print(graph_, file=sys.stderr)
return graph, status, (nodes, backreferences)
except Exception as e:
            print('Reconnection 2 failure:', file=sys.stderr)
print(e, file=sys.stderr)
print(nodes, file=sys.stderr)
print(backreferences, file=sys.stderr)
print(graph_, file=sys.stderr)
return postprocessing.BACKOFF, postprocessing.ParsedStatus.BACKOFF, (nodes, backreferences)
class PENMANBartTokenizer(AMRBartTokenizer):
def __init__(self, *args, raw_graph=False, **kwargs):
super().__init__(*args, **kwargs)
self.linearizer = None
self.remove_pars = False
self.raw_graph = raw_graph
def _tokenize_encoded_graph(self, encoded):
linearized = re.sub(r"(\".+?\")", r' \1 ', encoded)
pieces = []
for piece in linearized.split():
if piece.startswith('"') and piece.endswith('"'):
pieces.append(piece)
else:
piece = piece.replace('(', ' ( ')
piece = piece.replace(')', ' ) ')
piece = piece.replace(':', ' :')
piece = piece.replace('/', ' / ')
piece = piece.strip()
pieces.append(piece)
linearized = re.sub(r'\s+', ' ', ' '.join(pieces)).strip()
linearized_nodes = [AMRTokens.BOS_N] + linearized.split(' ')
return linearized_nodes
def tokenize_amr(self, graph):
if self.raw_graph:
graph_ = copy.deepcopy(graph)
graph_.metadata = {}
linearized = penman.encode(graph_)
linearized = re.sub(r"\s+", ' ', linearized)
bpe_tokens = [self.bos_token] + self._tokenize(linearized)[:1022]
bpe_token_ids = [self.encoder.get(b, self.unk_token_id) for b in bpe_tokens]
bpe_backreferences = list(range(len(bpe_token_ids)))
return bpe_tokens, bpe_token_ids, bpe_backreferences
else:
return super().tokenize_amr(graph)
def _get_nodes_and_backreferences(self, graph):
graph_ = copy.deepcopy(graph)
graph_.metadata = {}
linearized = penman.encode(graph_)
linearized_nodes = self._tokenize_encoded_graph(linearized)
if self.use_pointer_tokens:
remap = {}
for i in range(1, len(linearized_nodes)):
nxt = linearized_nodes[i]
lst = linearized_nodes[i - 1]
if nxt == '/':
                    remap[lst] = f'<pointer:{len(remap)}>'
i = 1
linearized_nodes_ = [linearized_nodes[0]]
while i < (len(linearized_nodes)):
nxt = linearized_nodes[i]
lst = linearized_nodes_[-1]
if nxt in remap:
if lst == '(' and linearized_nodes[i + 1] == '/':
nxt = remap[nxt]
i += 1
elif lst.startswith(':'):
nxt = remap[nxt]
linearized_nodes_.append(nxt)
i += 1
linearized_nodes = linearized_nodes_
if self.remove_pars:
linearized_nodes = [n for n in linearized_nodes if n != '(']
backreferences = list(range(len(linearized_nodes)))
return linearized_nodes, backreferences
def _classify(self, node):
if not isinstance(node, str):
return "CONST"
elif node == 'i':
return "I"
elif re.match(r'^[a-z]\d*$', node) is not None:
return "VAR"
elif node[0].isdigit():
return "CONST"
elif node.startswith('"') and node.endswith('"'):
return "CONST"
elif node in ('+', '-'):
return "CONST"
elif node == ':mode':
return 'MODE'
elif node.startswith(':'):
return "EDGE"
elif node in ['/', '(', ')']:
return node
elif node[0].isalpha():
for char in (',', ':', '/', '(', ')', '.', '!', '?', '\\'):
if char in node:
return "CONST"
return "INST"
else:
return 'CONST'
def _fix_and_make_graph(self, nodes):
nodes_ = []
for n in nodes:
if isinstance(n, str):
                if n.startswith('<') and n.endswith('>') and (not n.startswith('<pointer:')):
                    pass  # drop special tokens other than pointer tokens
                else:
                    nodes_.append(n)
            else:
                nodes_.append(n)
        nodes = nodes_
        # NOTE (editorial reconstruction): the stretch from here to the pointer
        # handling below was mangled by angle-bracket stripping during
        # extraction and has been restored to match the surviving control flow.
        if self.use_pointer_tokens:
            i = 0
            nodes_ = []
            while i < len(nodes):
                nxt = nodes[i]
                pst = None
                if isinstance(nxt, str) and nxt.startswith('<pointer:'):
                    e = nxt.find('>')
if e != len(nxt) - 1:
pst = nxt[e + 1:]
nxt = nxt[:e + 1]
nodes_.append(nxt)
if pst is not None:
nodes_.append(pst)
else:
nodes_.append(nxt)
i += 1
nodes = nodes_
i = 1
nodes_ = [nodes[0]]
while i < len(nodes):
nxt = nodes[i]
                if isinstance(nxt, str) and nxt.startswith('<pointer:'):
                    nxt = 'z' + nxt[9:-1]  # '<pointer:3>' -> variable 'z3'
                    fol = nodes[i + 1]
                    # is not an expansion if followed by an edge or closing paren
                    if isinstance(fol, str) and (fol.startswith(':') or (fol == ')')):
                        nodes_.append(nxt)
                    else:
                        if self.remove_pars:
                            nodes_.append('(')
                        elif nodes_[-1] != '(':
                            nodes_.append('(')
                        nodes_.append(nxt)
                        nodes_.append('/')
                else:
                    nodes_.append(nxt)
                i += 1
            nodes = nodes_
        # NOTE (editorial reconstruction): the original pass that merges
        # <lit> ... </lit> spans back into quoted literals and the exact code
        # that joins the nodes into a PENMAN string were destroyed by
        # angle-bracket stripping during extraction; the version below is a
        # minimal stand-in.
        i = 0
        nodes_ = []
        while i < len(nodes):
            nxt = nodes[i]
            if isinstance(nxt, str) and nxt == '<lit>':
                nxt = '"'
                i += 1
                while i < len(nodes) and nodes[i] != '</lit>':
                    nxt += ('' if nxt == '"' else ' ') + str(nodes[i])
                    i += 1
                nxt = nxt + '"'
            nodes_.append(nxt)
            i += 1
        nodes = nodes_
        linearized = re.sub(r'\s+', ' ', ' '.join(str(n) for n in nodes)).strip()
        """
        line = linearized
        # balance parentheses (this block is disabled in the original source)
        open_count = 0
        close_count = 0
        for i, c in enumerate(line):
            if c == '(':
                open_count += 1
            elif c == ')':
                close_count += 1
            if open_count == close_count and open_count > 0:
line = line[:i].strip()
break
old_line = line
while True:
open_count = len(re.findall(r'\(', line))
close_count = len(re.findall(r'\)', line))
if open_count > close_count:
line += ')' * (open_count - close_count)
elif close_count > open_count:
for i in range(close_count - open_count):
line = line.rstrip(')')
line = line.rstrip(' ')
if old_line == line:
break
old_line = line
"""
graph = penman.decode(linearized + ' ')
triples = []
newvars = 2000
for triple in graph.triples:
x, rel, y = triple
if x is None:
pass
elif rel == ':instance' and y is None:
triples.append(penman.Triple(x, rel, 'thing'))
elif y is None:
var = f'x{newvars}'
newvars += 1
triples.append(penman.Triple(x, rel, var))
triples.append(penman.Triple(var, ':instance', 'thing'))
else:
triples.append(triple)
graph = penman.Graph(triples)
linearized = pm_encode(graph)
def fix_text(linearized=linearized):
n = 0
def _repl1(match):
nonlocal n
out = match.group(1) + match.group(2) + str(3000 + n) + ' / ' + match.group(2) + match.group(3)
n += 1
return out
linearized = re.sub(r'(\(\s?)([a-z])([^\/:\)]+[:\)])', _repl1, linearized,
flags=re.IGNORECASE | re.MULTILINE)
def _repl2(match):
return match.group(1)
linearized = re.sub(r'(\(\s*[a-z][\d+]\s*\/\s*[^\s\)\(:\/]+\s*)((?:/\s*[^\s\)\(:\/]+\s*)+)', _repl2,
linearized,
flags=re.IGNORECASE | re.MULTILINE)
# adds a ':' to args w/o it
linearized = re.sub(r'([^:])(ARG)', r'\1 :\2', linearized)
# removes edges with no node
# linearized = re.sub(r':[^\s\)\(:\/]+?\s*\)', ')', linearized, flags=re.MULTILINE)
return linearized
linearized = fix_text(linearized)
g = penman.decode(linearized)
return g
def decode_amr(self, tokens, restore_name_ops=None):
try:
if self.raw_graph:
nodes = self._tokenize_encoded_graph(self.decode(tokens))
backreferences = list(range(len(nodes)))
else:
nodes, backreferences = postprocessing.decode_into_node_and_backreferences(tokens, self)
nodes_ = nodes
except Exception as e:
print('Decoding failure:', file=sys.stderr)
print(e, file=sys.stderr)
return postprocessing.BACKOFF, postprocessing.ParsedStatus.BACKOFF, (None, None)
try:
graph_ = graph = self._fix_and_make_graph(nodes)
if self.collapse_name_ops:
graph_ = graph = postprocessing._split_name_ops(graph)
except Exception as e:
print('Building failure:', file=sys.stderr)
print(nodes, file=sys.stderr)
print(backreferences, file=sys.stderr)
print(e, file=sys.stderr)
return postprocessing.BACKOFF, postprocessing.ParsedStatus.BACKOFF, (None, None)
try:
graph, status = postprocessing.connect_graph_if_not_connected(graph)
if status == postprocessing.ParsedStatus.BACKOFF:
                print('Reconnection 1 failure:', file=sys.stderr)
print(nodes, file=sys.stderr)
print(backreferences, file=sys.stderr)
print(graph_, file=sys.stderr)
return graph, status, (nodes_, backreferences)
except Exception as e:
            print('Reconnection 2 failure:', file=sys.stderr)
print(e, file=sys.stderr)
print(nodes, file=sys.stderr)
print(backreferences, file=sys.stderr)
print(graph_, file=sys.stderr)
return postprocessing.BACKOFF, postprocessing.ParsedStatus.BACKOFF, (nodes_, backreferences)
================================================
FILE: hanlp/components/amr/seq2seq/dataset/tokenization_t5.py
================================================
import copy
import sys
from typing import Set, Iterable, Dict
import penman
import regex as re
import torch
import traceback
from transformers import T5Tokenizer, T5TokenizerFast
from . import postprocessing
from .linearization import AMRTokens, AMRLinearizer
from .penman import pm_encode
class AMRT5Tokenizer(T5TokenizerFast):
ADDITIONAL = [
AMRTokens.PNTR_N,
AMRTokens.STOP_N,
AMRTokens.LIT_START,
AMRTokens.LIT_END,
AMRTokens.BACKR_SRC_N,
AMRTokens.BACKR_TRG_N, ]
def __init__(self, *args, use_pointer_tokens=False, collapse_name_ops=False, INIT='', **kwargs):
super().__init__(*args, **kwargs)
self.INIT = INIT
self.patterns = re.compile(
r""" ?<[a-z]+:?\d*>| ?:[^\s]+|'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
self.linearizer = AMRLinearizer(use_pointer_tokens=use_pointer_tokens, collapse_name_ops=collapse_name_ops)
self.use_pointer_tokens = use_pointer_tokens
self.collapse_name_ops = collapse_name_ops
self.recategorizations = set()
self.modified = 0
@classmethod
def from_pretrained(cls, pretrained_model_path, additional_tokens: Iterable[str] = None,
recategorization_tokens: Iterable[str] = None,
*args, **kwargs):
inst = super().from_pretrained(pretrained_model_path, *args, **kwargs)
inst.init_amr_vocabulary(additions=additional_tokens, recategorization_tokens=recategorization_tokens)
return inst
def init_amr_vocabulary(self, additions: Set[str] = None, recategorization_tokens: Iterable[str] = None):
# The T5 tokenizer has no encoder/decoder BPE maps to patch here (unlike BART), which is not a problem for Chinese
# for tok in self.all_special_tokens:
# ntok = self.INIT + tok
# i = self.encoder[tok]
# self.decoder[i] = ntok
# del self.encoder[tok]
# self.encoder[ntok] = i
tokens = [AMRTokens.BOS_N]
if additions:
tokens.extend(additions)
if recategorization_tokens:
for tok in recategorization_tokens:
if not tok.startswith('_'):
self.recategorizations.add(tok)
tokens.append(tok)
if self.use_pointer_tokens:
for cnt in range(512):
tokens.append(f"")
tokens += self.ADDITIONAL
tokens = [self.INIT + t if t[0] not in ('_', '-') else t for t in tokens]
self.old_enc_size = len(self)
self.add_tokens(tokens)
self.modified = len(tokens)
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
output = [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
if token_ids_1 is None:
return output
return output + [self.eos_token_id] + token_ids_1 + [self.eos_token_id]
def _tokenize(self, text):
""" Tokenize a string. Modified in order to handle sentences with recategorization pointers"""
bpe_tokens = []
for tok_span in text.lstrip().split(' '):
tok_span = tok_span.strip()
recats = tok_span.rsplit('_', 1)
if len(recats) == 2 and recats[0] in self.recategorizations and ('_' + recats[1]) in self.encoder:
bpe_tokens.extend([self.INIT + recats[0], '_' + recats[1]])
else:
for token in re.findall(self.pat, ' ' + tok_span):
token = "".join(
self.byte_encoder[b] for b in token.encode("utf-8")
) # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
return bpe_tokens
def _tok_bpe(self, token, add_space=True):
# if add_space:
# token = ' ' + token.lstrip()
tokk = []
tok = token.strip()
recats = tok.rsplit('_', 1)
if len(recats) == 2 and recats[0] in self.recategorizations and ('_' + recats[1]) in self.encoder:
tokk.extend([self.INIT + recats[0], '_' + recats[1]])
else:
for tok in self.patterns.findall(token):
tokk.extend(self.tokenize(tok))
return tokk
def _get_nodes_and_backreferences(self, graph):
lin = self.linearizer.linearize(graph)
linearized_nodes, backreferences = lin.nodes, lin.backreferences
return linearized_nodes, backreferences
def tokenize_amr(self, graph):
linearized_nodes, backreferences = self._get_nodes_and_backreferences(graph)
bpe_tokens = []
bpe_backreferences = []
counter = 0
encoder = self.encoder
for i, (backr, tokk) in enumerate(zip(backreferences, linearized_nodes)):
is_in_enc = self.INIT + tokk in encoder
is_rel = tokk.startswith(':') and len(tokk) > 1
is_spc = tokk.startswith('<') and tokk.endswith('>')
is_of = tokk.startswith(':') and tokk.endswith('-of')
is_frame = re.match(r'.+-\d\d', tokk) is not None
if tokk.startswith('"') and tokk.endswith('"'):
tokk = tokk[1:-1].replace('_', ' ')
bpe_toks = [self.INIT + AMRTokens.LIT_START]
bpe_toks += self._tok_bpe(tokk, add_space=True)
bpe_toks.append(self.INIT + AMRTokens.LIT_END)
elif (is_rel or is_spc or is_frame or is_of):
if is_in_enc:
bpe_toks = [self.INIT + tokk]
elif is_frame:
bpe_toks = self._tok_bpe(tokk[:-3], add_space=True) + [tokk[-3:]]
elif is_of:
rel = tokk[:-3]
if self.INIT + rel in encoder:
bpe_toks = [self.INIT + rel, '-of']
else:
bpe_toks = [self.INIT + ':'] + self._tok_bpe(rel[1:], add_space=True) + ['-of']
elif is_rel:
bpe_toks = [self.INIT + ':'] + self._tok_bpe(tokk[1:], add_space=True)
else:
raise ValueError(f'Unexpected AMR token: {tokk}')
else:
if is_in_enc:
bpe_toks = [self.INIT + tokk]
else:
bpe_toks = self._tok_bpe(tokk, add_space=True)
bpe_tokens.append(bpe_toks)
if i == backr:
bpe_backr = list(range(counter, counter + len(bpe_toks)))
counter += len(bpe_toks)
bpe_backreferences.append(bpe_backr)
else:
bpe_backreferences.append(bpe_backreferences[backr][0:1])
counter += 1
bpe_tokens = [b for bb in bpe_tokens for b in bb]
bpe_token_ids = self.convert_tokens_to_ids(bpe_tokens)
bpe_backreferences = [b for bb in bpe_backreferences for b in bb]
return bpe_tokens, bpe_token_ids, bpe_backreferences
def batch_encode_sentences(self, sentences, device=torch.device('cpu')):
sentences = [s for s in sentences]
extra = {'sentences': sentences}
batch = super().batch_encode_plus(sentences, return_tensors='pt', padding=True)
batch = {k: v.to(device) for k, v in batch.items()}
return batch, extra
def linearize(self, graph):
shift = len(self)
tokens, token_ids, backreferences = self.tokenize_amr(graph)
extra = {'linearized_graphs': tokens, 'graphs': graph}
token_uni_ids = \
[idx if i == b else b + shift for i, (idx, b) in enumerate(zip(token_ids, backreferences))]
if tokens[-1] != (self.INIT + AMRTokens.EOS_N):
tokens.append(self.INIT + AMRTokens.EOS_N)
token_ids.append(self.eos_token_id)
token_uni_ids.append(self.eos_token_id)
backreferences.append(len(backreferences))
return token_uni_ids, extra
def batch_encode_graphs(self, graphs, device=torch.device('cpu')):
linearized, extras = zip(*[self.linearize(g) for g in graphs])
return self.batch_encode_graphs_from_linearized(linearized, extras, device=device)
def batch_encode_graphs_from_linearized(self, linearized, extras=None, device=torch.device('cpu')):
if extras is not None:
batch_extra = {'linearized_graphs': [], 'graphs': []}
for extra in extras:
batch_extra['graphs'].append(extra['graphs'])
batch_extra['linearized_graphs'].append(extra['linearized_graphs'])
else:
batch_extra = {}
maxlen = 0
batch = []
for token_uni_ids in linearized:
maxlen = max(len(token_uni_ids), maxlen)
batch.append(token_uni_ids)
batch = [x + [self.pad_token_id] * (maxlen - len(x)) for x in batch]
batch = torch.tensor(batch).to(device)
batch = {'decoder_input_ids': batch[:, :-1], 'lm_labels': batch[:, 1:]}
return batch, batch_extra
def decode_amr(self, tokens, restore_name_ops=False):
try:
nodes, backreferences = postprocessing.decode_into_node_and_backreferences(tokens, self)
except Exception as e:
print('Decoding failure:', file=sys.stderr)
traceback.print_exc()
return postprocessing.BACKOFF, postprocessing.ParsedStatus.BACKOFF, (None, None)
if self.use_pointer_tokens:
nodes, backreferences = postprocessing.restore_backreferences_from_pointers(nodes)
try:
graph_ = graph = postprocessing.build_graph(nodes, backreferences, restore_name_ops=restore_name_ops)
except Exception as e:
print('Building failure:', file=sys.stderr)
traceback.print_exc()
print(nodes, file=sys.stderr)
print(backreferences, file=sys.stderr)
print(e, file=sys.stderr)
return postprocessing.BACKOFF, postprocessing.ParsedStatus.BACKOFF, (None, None)
try:
graph, status = postprocessing.connect_graph_if_not_connected(graph)
if status == postprocessing.ParsedStatus.BACKOFF:
print('Reconnection 1 failure:', file=sys.stderr)
print(nodes, file=sys.stderr)
print(backreferences, file=sys.stderr)
print(graph_, file=sys.stderr)
return graph, status, (nodes, backreferences)
except Exception as e:
print('Reconnction 2 failure:', file=sys.stderr)
traceback.print_exc()
print(nodes, file=sys.stderr)
print(backreferences, file=sys.stderr)
print(graph_, file=sys.stderr)
return postprocessing.BACKOFF, postprocessing.ParsedStatus.BACKOFF, (nodes, backreferences)
class PENMANT5Tokenizer(AMRT5Tokenizer):
def __init__(self, *args, raw_graph=False, **kwargs):
super().__init__(*args, **kwargs)
self.linearizer = None
self.remove_pars = False
self.raw_graph = raw_graph
def _tokenize_encoded_graph(self, encoded):
linearized = re.sub(r"(\".+?\")", r' \1 ', encoded)
pieces = []
for piece in linearized.split():
if piece.startswith('"') and piece.endswith('"'):
pieces.append(piece)
else:
piece = piece.replace('(', ' ( ')
piece = piece.replace(')', ' ) ')
piece = piece.replace(':', ' :')
piece = piece.replace('/', ' / ')
piece = piece.strip()
pieces.append(piece)
linearized = re.sub(r'\s+', ' ', ' '.join(pieces)).strip()
# T5 uses <pad> instead of <s> (AMRTokens.BOS_N) as the leading token
# linearized_nodes = [AMRTokens.BOS_N] + linearized.split(' ')
linearized_nodes = [self.pad_token] + linearized.split(' ')
return linearized_nodes
def tokenize_amr(self, graph):
if self.raw_graph:
graph_ = copy.deepcopy(graph)
graph_.metadata = {}
linearized = penman.encode(graph_)
linearized = re.sub(r"\s+", ' ', linearized)
bpe_tokens = [self.bos_token] + self._tokenize(linearized)[:1022]
bpe_token_ids = [self.encoder.get(b, self.unk_token_id) for b in bpe_tokens]
bpe_backreferences = list(range(len(bpe_token_ids)))
return bpe_tokens, bpe_token_ids, bpe_backreferences
else:
return super().tokenize_amr(graph)
def _get_nodes_and_backreferences(self, graph):
graph_ = copy.deepcopy(graph)
graph_.metadata = {}
linearized = penman.encode(graph_)
linearized_nodes = self._tokenize_encoded_graph(linearized)
if self.use_pointer_tokens:
remap = {}
for i in range(1, len(linearized_nodes)):
nxt = linearized_nodes[i]
lst = linearized_nodes[i - 1]
if nxt == '/':
remap[lst] = f'<pointer:{len(remap)}>'
i = 1
linearized_nodes_ = [linearized_nodes[0]]
while i < (len(linearized_nodes)):
nxt = linearized_nodes[i]
lst = linearized_nodes_[-1]
if nxt in remap:
if lst == '(' and linearized_nodes[i + 1] == '/':
nxt = remap[nxt]
i += 1
elif lst.startswith(':'):
nxt = remap[nxt]
linearized_nodes_.append(nxt)
i += 1
linearized_nodes = linearized_nodes_
if self.remove_pars:
linearized_nodes = [n for n in linearized_nodes if n != '(']
backreferences = list(range(len(linearized_nodes)))
return linearized_nodes, backreferences
def _classify(self, node):
if not isinstance(node, str):
return "CONST"
elif node == 'i':
return "I"
elif re.match(r'^[a-z]\d*$', node) is not None:
return "VAR"
elif node[0].isdigit():
return "CONST"
elif node.startswith('"') and node.endswith('"'):
return "CONST"
elif node in ('+', '-'):
return "CONST"
elif node == ':mode':
return 'MODE'
elif node.startswith(':'):
return "EDGE"
elif node in ['/', '(', ')']:
return node
elif node[0].isalpha():
for char in (',', ':', '/', '(', ')', '.', '!', '?', '\\'):
if char in node:
return "CONST"
return "INST"
else:
return 'CONST'
def _fix_and_make_graph(self, nodes):
nodes_ = []
for n in nodes:
if isinstance(n, str):
                if n.startswith('<') and n.endswith('>') and (not n.startswith('<pointer:')):
                    pass
                else:
                    nodes_.append(n)
            else:
                nodes_.append(n)
        nodes = nodes_
        i = 0
        nodes_ = []
        while i < len(nodes):
            nxt = nodes[i]
            pst = None
            if isinstance(nxt, str) and nxt.startswith('<pointer:'):
                e = nxt.find('>')
                if e != len(nxt) - 1:
                    pst = nxt[e + 1:]
                    nxt = nxt[:e + 1]
                nodes_.append(nxt)
                if pst is not None:
                    nodes_.append(pst)
            else:
                nodes_.append(nxt)
            i += 1
        nodes = nodes_
        i = 1
        nodes_ = [nodes[0]]
        while i < len(nodes):
            nxt = nodes[i]
            if isinstance(nxt, str) and nxt.startswith('<pointer:'):
                nxt = 'z' + nxt[9:-1]
                fol = nodes[i + 1]
                # a pointer followed by an edge or ')' is a re-entrant mention, not an expansion
                if isinstance(fol, str) and (fol.startswith(':') or (fol == ')')):
                    nodes_.append(nxt)
                else:
                    if self.remove_pars:
                        nodes_.append('(')
                    elif nodes_[-1] != '(':
                        nodes_.append('(')
                    nodes_.append(nxt)
                    nodes_.append('/')
            else:
                nodes_.append(nxt)
            i += 1
        nodes = nodes_
        i = 0
        nodes_ = []
        while i < (len(nodes) - 1):
            if nodes[i] == ':':
                nodes_.append(nodes[i] + nodes[i + 1])
                i += 2
            else:
                nodes_.append(nodes[i])
                i += 1
        nodes_.append(nodes[-1])
        nodes = nodes_
        i = 0
        nodes_ = []
        while i < len(nodes):
            if i < 2:
                nodes_.append(nodes[i])
                i += 1
            elif nodes_[-2] == '/' and nodes[i] == '/':
                i += 2
            else:
                nodes_.append(nodes[i])
                i += 1
        nodes = nodes_
        i = 0
        newvars = 0
        variables = set()
        remap = {}
        nodes_ = []
        while i < len(nodes):
            next = nodes[i]
            if next == '/':
                last = nodes_[-1]
                if last in variables:
                    last_remap = f'z{newvars + 1000}'
                    newvars += 1
                    nodes_[-1] = last_remap
                    remap[last] = last_remap
                variables.add(last)
                nodes_.append(next)
            elif self._classify(next) == 'VAR' and next in remap and (i < len(nodes) - 1) and nodes[i + 1] != '/':
                next = remap[next]
                nodes_.append(next)
            else:
                nodes_.append(next)
            i += 1
        nodes = nodes_
        pieces = []
        for piece in nodes:
            if not pieces:
                pieces.append('(')
            else:
                piece = str(piece)
                if piece.startswith('"') or '"' in piece.strip('"'):
                    piece = '"' + piece.replace('"', '') + '"'
                prev = self._classify(pieces[-1])
                next = self._classify(piece)
                if next == 'CONST':
                    quote = False
                    for char in (',', ':', '/', '(', ')', '.', '!', '?', '\\', '_', '='):
                        if char in piece:
                            quote = True
                            break
                    if quote:
                        piece = '"' + piece.strip('"') + '"'
                if prev == '(':
                    if next in ('VAR', 'I'):
                        pieces.append(piece)
                elif prev == ')':
                    if next in (')', 'EDGE', 'MODE'):
                        pieces.append(piece)
                elif prev == 'VAR':
                    if next in ('/', 'EDGE', 'MODE', ')'):
                        pieces.append(piece)
                elif prev == '/':
                    if next in ('INST', 'I'):
                        pieces.append(piece)
                elif prev == 'INST':
                    if next in (')', 'EDGE', 'MODE'):
                        pieces.append(piece)
                elif prev == 'I':
                    if next in ('/', ')', 'EDGE', 'MODE'):
                        pieces.append(piece)
                elif prev == 'EDGE':
                    if next in ('(', 'VAR', 'CONST', 'I'):
                        pieces.append(piece)
                    elif next in (')', 'EDGE', 'MODE'):
                        pieces[-1] = piece
                elif prev == 'MODE':
                    if next == 'INST':
                        pieces.append(piece)
                elif prev == 'CONST':
                    if next in (')', 'EDGE', 'MODE'):
                        pieces.append(piece)
        pieces_ = []
        open_cnt = 0
        closed_cnt = 0
        if pieces[0] != '(':
            pieces_.append('(')
            open_cnt += 1
        for p in pieces:
            if p == '(':
                open_cnt += 1
            elif p == ')':
                closed_cnt += 1
            pieces_.append(p)
            if open_cnt == closed_cnt:
                break
        pieces = pieces_ + [')'] * (open_cnt - closed_cnt)
        linearized = re.sub(r'\s+', ' ', ' '.join(pieces)).strip()
        """
        line = linearized
        # make sure parentheses match
        open_count = 0
        close_count = 0
        for i, c in enumerate(line):
            if c == '(':
                open_count += 1
            elif c == ')':
                close_count += 1
            if open_count == close_count and open_count > 0:
line = line[:i].strip()
break
old_line = line
while True:
open_count = len(re.findall(r'\(', line))
close_count = len(re.findall(r'\)', line))
if open_count > close_count:
line += ')' * (open_count - close_count)
elif close_count > open_count:
for i in range(close_count - open_count):
line = line.rstrip(')')
line = line.rstrip(' ')
if old_line == line:
break
old_line = line
"""
graph = penman.decode(linearized + ' ')
triples = []
newvars = 2000
for triple in graph.triples:
x, rel, y = triple
if x is None:
pass
elif rel == ':instance' and y is None:
triples.append(penman.Triple(x, rel, 'thing'))
elif y is None:
var = f'z{newvars}'
newvars += 1
triples.append(penman.Triple(x, rel, var))
triples.append(penman.Triple(var, ':instance', 'thing'))
else:
triples.append(triple)
graph = penman.Graph(triples)
linearized = pm_encode(graph)
def fix_text(linearized=linearized):
n = 0
def _repl1(match):
nonlocal n
out = match.group(1) + match.group(2) + str(3000 + n) + ' / ' + match.group(2) + match.group(3)
n += 1
return out
linearized = re.sub(r'(\(\s?)([a-z])([^\/:\)]+[:\)])', _repl1, linearized,
flags=re.IGNORECASE | re.MULTILINE)
def _repl2(match):
return match.group(1)
linearized = re.sub(r'(\(\s*[a-z][\d+]\s*\/\s*[^\s\)\(:\/]+\s*)((?:/\s*[^\s\)\(:\/]+\s*)+)', _repl2,
linearized,
flags=re.IGNORECASE | re.MULTILINE)
# adds a ':' to args w/o it
linearized = re.sub(r'([^:])(ARG)', r'\1 :\2', linearized)
# removes edges with no node
# linearized = re.sub(r':[^\s\)\(:\/]+?\s*\)', ')', linearized, flags=re.MULTILINE)
return linearized
linearized = fix_text(linearized)
g = penman.decode(linearized)
return g
def decode_amr(self, tokens, restore_name_ops=None):
try:
if self.raw_graph:
nodes = self._tokenize_encoded_graph(self.decode(tokens))
backreferences = list(range(len(nodes)))
else:
nodes, backreferences = postprocessing.decode_into_node_and_backreferences_without_space(tokens, self) \
if not self.INIT else postprocessing.decode_into_node_and_backreferences(tokens, self)
nodes_ = nodes
except Exception as e:
print('Decoding failure:', file=sys.stderr)
traceback.print_exc()
return postprocessing.BACKOFF, postprocessing.ParsedStatus.BACKOFF, (None, None)
try:
graph_ = graph = self._fix_and_make_graph(nodes)
if self.collapse_name_ops:
graph_ = graph = postprocessing._split_name_ops(graph)
except Exception as e:
print('Building failure:', file=sys.stderr)
traceback.print_exc()
print(nodes, file=sys.stderr)
print(backreferences, file=sys.stderr)
print(e, file=sys.stderr)
return postprocessing.BACKOFF, postprocessing.ParsedStatus.BACKOFF, (None, None)
try:
graph, status = postprocessing.connect_graph_if_not_connected(graph)
if status == postprocessing.ParsedStatus.BACKOFF:
print('Reconnection 1 failure:', file=sys.stderr)
print(nodes, file=sys.stderr)
print(backreferences, file=sys.stderr)
print(graph_, file=sys.stderr)
return graph, status, (nodes_, backreferences)
except Exception as e:
print('Reconnection 2 failure:', file=sys.stderr)
print(e, file=sys.stderr)
traceback.print_exc()
print(nodes, file=sys.stderr)
print(backreferences, file=sys.stderr)
print(graph_, file=sys.stderr)
return postprocessing.BACKOFF, postprocessing.ParsedStatus.BACKOFF, (nodes_, backreferences)
@property
def encoder(self) -> Dict[str, int]:
return self.get_vocab()
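# A self-contained sketch (not part of the HanLP API) of the pointer rewrite
# performed in _get_nodes_and_backreferences above: the variable defined before
# each '/' is mapped to '<pointer:k>', the '/' of the defining occurrence is
# dropped, and a later re-entrant mention after an edge label reuses the same
# pointer (here '<pointer:1>' for the boy).
if __name__ == '__main__':
    tokens = '( w / want-01 :ARG0 ( b / boy ) :ARG1 ( g / go-02 :ARG0 b ) )'.split()
    remap = {}
    for prev, nxt in zip(tokens, tokens[1:]):
        if nxt == '/':
            remap.setdefault(prev, f'<pointer:{len(remap)}>')
    out, i = [tokens[0]], 1
    while i < len(tokens):
        nxt = tokens[i]
        if nxt in remap:
            if out[-1] == '(' and tokens[i + 1] == '/':
                nxt = remap[nxt]
                i += 1  # drop the '/' together with the defining variable
            elif out[-1].startswith(':'):
                nxt = remap[nxt]
        out.append(nxt)
        i += 1
    print(' '.join(out))
    # ( <pointer:0> want-01 :ARG0 ( <pointer:1> boy ) :ARG1 ( <pointer:2> go-02 :ARG0 <pointer:1> ) )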
================================================
FILE: hanlp/components/amr/seq2seq/evaluation.py
================================================
from pathlib import Path
import penman
def write_predictions(predictions_path, tokenizer, graphs):
pieces = [penman.encode(g) for g in graphs]
text = '\n\n'.join(pieces)
if tokenizer:
text = text.replace(tokenizer.INIT, '')
Path(predictions_path).write_text(text)
return predictions_path
def compute_smatch(pred, gold):
from perin_parser.thirdparty.mtool import smatch
with Path(pred).open() as p, Path(gold).open() as g:
score = next(smatch.score_amr_pairs(p, g))
return score[2]
def compute_bleu(gold_sentences, pred_sentences):
from sacrebleu import corpus_bleu
return corpus_bleu(pred_sentences, [gold_sentences])
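if __name__ == '__main__':
    # Quick sanity check of compute_bleu above: sacrebleu's corpus_bleu takes
    # the hypotheses first and a list of reference streams second, so an
    # identical corpus scores 100.
    gold = ['the boy wants to go', 'hello world']
    print(compute_bleu(gold, list(gold)).score)  # 100.0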
================================================
FILE: hanlp/components/amr/seq2seq/optim.py
================================================
# taken from https://github.com/LiyuanLucasLiu/RAdam
import math
import torch
from torch.optim.optimizer import Optimizer
class RAdam(Optimizer):
def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, degenerated_to_sgd=True):
if not 0.0 <= lr:
raise ValueError("Invalid learning rate: {}".format(lr))
if not 0.0 <= eps:
raise ValueError("Invalid epsilon value: {}".format(eps))
if not 0.0 <= betas[0] < 1.0:
raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
if not 0.0 <= betas[1] < 1.0:
raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
self.degenerated_to_sgd = degenerated_to_sgd
if isinstance(params, (list, tuple)) and len(params) > 0 and isinstance(params[0], dict):
for param in params:
if 'betas' in param and (param['betas'][0] != betas[0] or param['betas'][1] != betas[1]):
param['buffer'] = [[None, None, None] for _ in range(10)]
defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay,
buffer=[[None, None, None] for _ in range(10)])
super(RAdam, self).__init__(params, defaults)
def __setstate__(self, state):
super(RAdam, self).__setstate__(state)
def step(self, closure=None):
loss = None
if closure is not None:
loss = closure()
for group in self.param_groups:
for p in group['params']:
if p.grad is None:
continue
grad = p.grad.data.float()
if grad.is_sparse:
raise RuntimeError('RAdam does not support sparse gradients')
p_data_fp32 = p.data.float()
state = self.state[p]
if len(state) == 0:
state['step'] = 0
state['exp_avg'] = torch.zeros_like(p_data_fp32)
state['exp_avg_sq'] = torch.zeros_like(p_data_fp32)
else:
state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32)
state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32)
exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
beta1, beta2 = group['betas']
exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
state['step'] += 1
buffered = group['buffer'][int(state['step'] % 10)]
if state['step'] == buffered[0]:
N_sma, step_size = buffered[1], buffered[2]
else:
buffered[0] = state['step']
beta2_t = beta2 ** state['step']
N_sma_max = 2 / (1 - beta2) - 1
N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t)
buffered[1] = N_sma
# more conservative since it's an approximated value
if N_sma >= 5:
step_size = math.sqrt(
(1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (
N_sma_max - 2)) / (1 - beta1 ** state['step'])
elif self.degenerated_to_sgd:
step_size = 1.0 / (1 - beta1 ** state['step'])
else:
step_size = -1
buffered[2] = step_size
# more conservative since it's an approximated value
if N_sma >= 5:
if group['weight_decay'] != 0:
p_data_fp32.add_(p_data_fp32, alpha=-group['weight_decay'] * group['lr'])
denom = exp_avg_sq.sqrt().add_(group['eps'])
p_data_fp32.addcdiv_(exp_avg, denom, value=-step_size * group['lr'])
p.data.copy_(p_data_fp32)
elif step_size > 0:
if group['weight_decay'] != 0:
p_data_fp32.add_(p_data_fp32, alpha=-group['weight_decay'] * group['lr'])
p_data_fp32.add_(exp_avg, alpha=-step_size * group['lr'])
p.data.copy_(p_data_fp32)
return loss
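if __name__ == '__main__':
    # Smoke test for the RAdam implementation above: fit y = 2x + 1 with a
    # single linear layer. The loss printed at the end should be close to zero.
    torch.manual_seed(0)
    model = torch.nn.Linear(1, 1)
    optimizer = RAdam(model.parameters(), lr=1e-2)
    xs = torch.randn(256, 1)
    ys = 2 * xs + 1
    for _ in range(500):
        optimizer.zero_grad()
        loss = torch.nn.functional.mse_loss(model(xs), ys)
        loss.backward()
        optimizer.step()
    print(f'final loss: {loss.item():.6f}')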
================================================
FILE: hanlp/components/amr/seq2seq/seq2seq_amr_parser.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-04-28 17:33
import datetime
import functools
import logging
import os
from typing import Union, List, Callable
import torch
from torch.utils.data import DataLoader
from transformers import get_constant_schedule_with_warmup, T5ForConditionalGeneration
from transformers.models.bart.modeling_bart import BartForConditionalGeneration
from hanlp.common.dataset import SamplerBuilder, SortingSamplerBuilder, PadSequenceDataLoader
from hanlp.common.structure import History
from hanlp.common.torch_component import TorchComponent
from hanlp.common.vocab import Vocab
from hanlp.components.amr.seq2seq.dataset.dataset import AMRDataset, dfs_linearize_tokenize
from hanlp.components.amr.seq2seq.dataset.penman import AMRGraph
from hanlp.components.amr.seq2seq.dataset.tokenization_bart import PENMANBartTokenizer
from hanlp.components.amr.seq2seq.dataset.tokenization_t5 import PENMANT5Tokenizer
from hanlp.components.amr.seq2seq.evaluation import write_predictions, compute_smatch
from hanlp.components.amr.seq2seq.optim import RAdam
from hanlp.layers.transformers.pt_imports import PretrainedConfig, AutoConfig_
from hanlp.layers.transformers.resource import get_model_mirror, get_tokenizer_mirror
from hanlp.metrics.amr.smatch_eval import smatch_eval
from hanlp.metrics.mtl import MetricDict
from hanlp.utils.time_util import CountdownTimer
from hanlp_common.constant import IDX
from hanlp_common.util import merge_locals_kwargs, reorder
class Seq2seq_AMR_Parser(TorchComponent):
def __init__(self, **kwargs):
super().__init__(**kwargs)
self._transformer_config: PretrainedConfig = None
self._tokenizer: PENMANBartTokenizer = None
self.model: BartForConditionalGeneration = None
def build_dataloader(self, data, batch_size,
gradient_accumulation=1,
shuffle=False,
sampler_builder: SamplerBuilder = None,
device=None,
logger: logging.Logger = None,
**kwargs) -> DataLoader:
dataset = self.build_dataset(data, not shuffle)
if self.vocabs.mutable:
self.build_vocabs(dataset, logger)
self.finalize_dataset(dataset, logger)
if isinstance(data, str):
dataset.purge_cache()
timer = CountdownTimer(len(dataset))
max_num_tokens = 0
# lc = Counter()
for each in dataset:
max_num_tokens = max(max_num_tokens, len(each['text_token_ids']))
# lc[len(each['text_token_ids'])] += 1
timer.log(f'Preprocessing and caching samples (longest sequence {max_num_tokens})'
f'[blink][yellow]...[/yellow][/blink]')
# print(lc.most_common())
if self.vocabs.mutable:
self.vocabs.lock()
self.vocabs.summary(logger)
if not sampler_builder:
sampler_builder = SortingSamplerBuilder(batch_max_tokens=500)
sampler = sampler_builder.build([len(x['text_token_ids']) for x in dataset], shuffle,
gradient_accumulation if dataset.cache else 1)
return self._create_dataloader(dataset, batch_size, device, sampler, shuffle)
def _create_dataloader(self, dataset, batch_size, device, sampler, shuffle):
return PadSequenceDataLoader(dataset, batch_size, shuffle, device=device, batch_sampler=sampler,
pad=self._get_pad_dict())
def _get_pad_dict(self):
return {'text_token_ids': self._transformer_config.pad_token_id,
'graph_token_ids': self._transformer_config.pad_token_id}
def finalize_dataset(self, dataset, logger: logging.Logger = None):
dataset.append_transform(functools.partial(dfs_linearize_tokenize, tokenizer=self._tokenizer,
remove_space='chinese' in self.config.transformer))
def build_dataset(self, data, generate_idx):
dataset = AMRDataset(data, generate_idx=generate_idx)
return dataset
def collect_additional_tokens(self, additional_tokens, dataset):
pred_min = self.config.pred_min
frames = dataset.get_frames()
for token, freq in frames.items():
if freq >= pred_min:
additional_tokens.add(token)
for token, freq in dataset.get_roles().items():
additional_tokens.add(token)
additional_tokens.update(self.config.additional_tokens)
def build_tokenizer(self, additional_tokens) -> PENMANBartTokenizer:
transformer = self.config.transformer
if 't5-' in transformer:
cls = PENMANT5Tokenizer
elif 'bart-' in transformer:
cls = PENMANBartTokenizer
else:
raise NotImplementedError(f'Unsupported transformer {transformer}')
transformer = get_tokenizer_mirror(transformer)
self._tokenizer = cls.from_pretrained(
transformer,
collapse_name_ops=self.config.collapse_name_ops,
use_pointer_tokens=self.config.use_pointer_tokens,
raw_graph=self.config.raw_graph,
additional_tokens=additional_tokens,
recategorization_tokens=self.config.recategorization_tokens,
config=self._transformer_config,
)
return self._tokenizer
def build_optimizer(self, trn, lr, epochs, gradient_accumulation, warmup_steps, weight_decay, **kwargs):
num_training_steps = len(trn) * epochs // gradient_accumulation
if isinstance(warmup_steps, float):
warmup_steps = int(num_training_steps * warmup_steps)
optimizer = RAdam(
self.model.parameters(),
lr=lr,
weight_decay=weight_decay)
scheduler = get_constant_schedule_with_warmup(
optimizer,
num_warmup_steps=warmup_steps)
return optimizer, scheduler
def build_criterion(self, **kwargs):
pass
def build_metric(self, **kwargs):
pass
def execute_training_loop(self, trn: DataLoader, dev: DataLoader, epochs, criterion, optimizer, metric, save_dir,
logger: logging.Logger, devices, ratio_width=None, dev_data=None, eval_after=None,
**kwargs):
best_epoch, best_metric = 0, -1
if isinstance(eval_after, float):
eval_after = int(epochs * eval_after)
timer = CountdownTimer(epochs)
history = History()
for epoch in range(1, epochs + 1):
logger.info(f"[yellow]Epoch {epoch} / {epochs}:[/yellow]")
self.fit_dataloader(trn, criterion, optimizer, metric, logger, history=history, ratio_width=ratio_width,
**self.config)
if epoch > eval_after:
dev_metric = self.evaluate_dataloader(dev, criterion, logger=logger, ratio_width=ratio_width,
output=os.path.join(save_dir, 'dev.pred.txt'),
input=dev_data, use_fast=True)
timer.update()
report = f"{timer.elapsed_human} / {timer.total_time_human} ETA: {timer.eta_human}"
if epoch > eval_after:
if dev_metric > best_metric:
best_epoch, best_metric = epoch, dev_metric
self.save_weights(save_dir)
report += ' [red](saved)[/red]'
else:
report += f' ({epoch - best_epoch})'
# if epoch - best_epoch >= patience:
# report += ' early stop'
logger.info(report)
# if epoch - best_epoch >= patience:
# break
if not best_epoch:
self.save_weights(save_dir)
elif best_epoch != epoch:
self.load_weights(save_dir)
logger.info(f"Max score of dev is {best_metric} at epoch {best_epoch}")
logger.info(f"Average time of each epoch is {timer.elapsed_average_human}")
logger.info(f"{timer.elapsed_human} elapsed")
return best_metric
def fit_dataloader(self, trn: DataLoader, criterion, optimizer, metric, logger: logging.Logger,
history: History = None, gradient_accumulation=1, ratio_percentage=None, **kwargs):
optimizer, scheduler = optimizer
self.model.train()
timer = CountdownTimer(history.num_training_steps(len(trn), gradient_accumulation=gradient_accumulation))
total_loss = 0
for batch in trn:
output_dict = self.feed_batch(batch)
loss = output_dict['loss']
if gradient_accumulation and gradient_accumulation > 1:
loss /= gradient_accumulation
loss.backward()
total_loss += loss.item()
if history.step(gradient_accumulation):
self._step(optimizer, scheduler)
timer.log(self.report_metrics(total_loss / (timer.current + 1)),
ratio_percentage=ratio_percentage, logger=logger)
del loss
del output_dict
return total_loss / max(timer.total, 1)
def _step(self, optimizer, scheduler):
if self.config.grad_norm:
torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.config.grad_norm)
optimizer.step()
if scheduler:
scheduler.step()
optimizer.zero_grad()
def report_metrics(self, loss):
return f'loss: {loss:.4f}'
def feed_batch(self, batch):
input_ids, labels = batch['text_token_ids'], batch.get('graph_token_ids')
attention_mask = input_ids.ne(self.model.config.pad_token_id).to(torch.long)
if labels is not None:
decoder_input_ids = labels[:, :-1]
labels = labels[:, 1:].contiguous()
else:
decoder_input_ids = None
return self.model(input_ids=input_ids, attention_mask=attention_mask, decoder_input_ids=decoder_input_ids,
labels=labels)
@torch.no_grad()
def evaluate_dataloader(self, data: DataLoader, criterion: Callable, metric=None, output=False, ratio_width=None,
logger=None, input=None, use_fast=False,
**kwargs):
self.model.eval()
timer = CountdownTimer(len(data))
graphs = []
orders = []
smatch = 0
for idx, batch in enumerate(data):
graphs_per_batch = self.predict_amrs(batch)
graphs_per_batch = [x[0] for x in graphs_per_batch]
# Copy metadata from the gold graph
for gp, gg in zip(graphs_per_batch, batch['amr']):
metadata = gg.metadata.copy()
metadata['annotator'] = f'{self.config.transformer}-amr'
metadata['date'] = str(datetime.datetime.now())
if 'save-date' in metadata:
del metadata['save-date']
gp.metadata = metadata
graphs.extend(graphs_per_batch)
orders.extend(batch[IDX])
if idx == timer.total - 1:
graphs = reorder(graphs, orders)
write_predictions(output, self._tokenizer, graphs)
try:
if use_fast:
smatch = compute_smatch(output, input)
else:
smatch = smatch_eval(output, input, use_fast=False)
except:
pass
timer.log(smatch.cstr() if isinstance(smatch, MetricDict) else f'{smatch:.2%}', ratio_percentage=False,
logger=logger)
else:
timer.log(ratio_percentage=False, logger=logger)
return smatch
def predict_amrs(self, batch, beam_size=1):
out = self._model_generate(batch, beam_size)
tokens = []
for i1 in range(0, out.size(0), beam_size):
tokens_same_source = []
tokens.append(tokens_same_source)
for i2 in range(i1, i1 + beam_size):
tokk = out[i2].tolist()
tokens_same_source.append(tokk)
tokens = [t for tt in tokens for t in tt]
graphs = []
tokenizer = self._tokenizer
for i1 in range(0, len(tokens), beam_size):
graphs_same_source = []
graphs.append(graphs_same_source)
for i2 in range(i1, i1 + beam_size):
tokk = tokens[i2]
graph, status, (lin, backr) = tokenizer.decode_amr(tokk, restore_name_ops=False)
graph.status = status
graph.nodes = lin
graph.backreferences = backr
graph.tokens = tokk
graphs_same_source.append(graph)
graphs_same_source[:] = \
tuple(zip(*sorted(enumerate(graphs_same_source), key=lambda x: (x[1].status.value, x[0]))))[1]
return graphs
def _model_generate(self, batch, beam_size):
input_ids = batch['text_token_ids']
attention_mask = input_ids.ne(self.model.config.pad_token_id).to(torch.long)
out = self.model.generate(
input_ids=input_ids,
attention_mask=attention_mask,
max_length=1024,
decoder_start_token_id=0,
num_beams=beam_size,
num_return_sequences=beam_size)
return out
def build_model(self, training=True, **kwargs) -> torch.nn.Module:
# noinspection PyTypeChecker
transformer = self.config.transformer
cls = self._get_model_cls(transformer)
transformer = get_model_mirror(self.config.transformer)
model: cls = cls.from_pretrained(
transformer,
config=self._transformer_config) if training else cls(self._transformer_config)
if not training:
self.build_tokenizer(self.vocabs['additional_tokens'])
tokenizer = self._tokenizer
model.resize_token_embeddings(len(tokenizer.encoder))
if training:
self._init_new_embeddings(model if cls == T5ForConditionalGeneration else model.model, tokenizer)
return model
def _get_model_cls(self, transformer: str):
if 't5-' in transformer:
cls = T5ForConditionalGeneration
elif 'bart-' in transformer:
cls = BartForConditionalGeneration
else:
raise NotImplementedError(f'Unsupported transformer {transformer}')
return cls
@staticmethod
def _init_new_embeddings(model, tokenizer):
modified = 0
encoder = tokenizer.encoder
for tok, idx in encoder.items():
tok = tok.lstrip(tokenizer.INIT)
if idx < tokenizer.old_enc_size:
continue
elif tok.startswith('<pointer:'):
tok_split = ['pointer', str(tok.split(':')[1].strip('>'))]
elif tok.startswith('<'):
continue
elif tok.startswith(':'):
if tok.startswith(':op'):
tok_split = ['relation', 'operator', str(int(tok[3:]))]
elif tok.startswith(':snt'):
tok_split = ['relation', 'sentence', str(int(tok[4:]))]
elif tok.startswith(':ARG'):
tok_split = ['relation', 'argument', str(int(tok[4:]))]
else:
tok_split = ['relation'] + tok.lstrip(':').split('-')
else:
tok_split = tok.split('-')
tok_split_ = tok_split
tok_split = []
for s in tok_split_:
s_ = s + tokenizer.INIT
if s_ in encoder:
tok_split.append(s_)
else:
tok_split.extend(tokenizer._tok_bpe(s))
vecs = []
for s in tok_split:
idx_split = encoder.get(s, -1)
if idx_split > -1:
vec_split = model.encoder.embed_tokens.weight.data[idx_split].clone()
vecs.append(vec_split)
if vecs:
vec = torch.stack(vecs, 0).mean(0)
noise = torch.empty_like(vec)
noise.uniform_(-0.1, +0.1)
model.encoder.embed_tokens.weight.data[idx] = vec + noise
modified += 1
def input_is_flat(self, data):
return isinstance(data, str)
def predict(self, data: Union[str, List[str]], beautiful_amr_graph=True, **kwargs):
flat = self.input_is_flat(data)
if flat:
data = [data]
dataloader = self.build_dataloader([{'text': x} for x in data], **self.config, device=self.device)
orders = []
results = []
for batch in dataloader:
graphs = self.predict_amrs(batch)
graphs = [x[0] for x in graphs]
if beautiful_amr_graph:
graphs = [AMRGraph(x.triples, x.top, x.epidata, x.metadata) for x in graphs]
results.extend(graphs)
orders.extend(batch[IDX])
results = reorder(results, orders)
if flat:
results = results[0]
return results
def fit(self, trn_data, dev_data, save_dir, batch_size=32, epochs=30,
transformer='facebook/bart-base',
lr=5e-05,
grad_norm=2.5,
weight_decay=0.004,
warmup_steps=1,
dropout=0.25,
attention_dropout=0.0,
pred_min=5,
eval_after=0.5,
collapse_name_ops=False,
use_pointer_tokens=True,
raw_graph=False,
gradient_accumulation=1,
recategorization_tokens=(
'PERSON', 'COUNTRY', 'QUANTITY', 'ORGANIZATION', 'DATE_ATTRS', 'NATIONALITY', 'LOCATION', 'ENTITY',
'CITY',
'MISC', 'ORDINAL_ENTITY', 'IDEOLOGY', 'RELIGION', 'STATE_OR_PROVINCE', 'URL', 'CAUSE_OF_DEATH', 'O',
'TITLE', 'DATE', 'NUMBER', 'HANDLE', 'SCORE_ENTITY', 'DURATION', 'ORDINAL', 'MONEY', 'SET',
'CRIMINAL_CHARGE', '_1', '_2', '_3', '_4', '_5', '_6', '_7', '_8', '_9', '_10', '_11', '_12',
'_13',
'_14', '_15'),
additional_tokens=(
'date-entity', 'government-organization', 'temporal-quantity', 'amr-unknown', 'multi-sentence',
'political-party', 'monetary-quantity', 'ordinal-entity', 'religious-group', 'percentage-entity',
'world-region', 'url-entity', 'political-movement', 'et-cetera', 'at-least', 'mass-quantity',
'have-org-role-91', 'have-rel-role-91', 'include-91', 'have-concession-91', 'have-condition-91',
'be-located-at-91', 'rate-entity-91', 'instead-of-91', 'hyperlink-91', 'request-confirmation-91',
'have-purpose-91', 'be-temporally-at-91', 'regardless-91', 'have-polarity-91', 'byline-91',
'have-manner-91', 'have-part-91', 'have-quant-91', 'publication-91', 'be-from-91', 'have-mod-91',
'have-frequency-91', 'score-on-scale-91', 'have-li-91', 'be-compared-to-91', 'be-destined-for-91',
'course-91', 'have-subevent-91', 'street-address-91', 'have-extent-91', 'statistical-test-91',
'have-instrument-91', 'have-name-91', 'be-polite-91', '-00', '-01', '-02', '-03', '-04', '-05',
'-06',
'-07', '-08', '-09', '-10', '-11', '-12', '-13', '-14', '-15', '-16', '-17', '-18', '-19', '-20',
'-21',
'-22', '-23', '-24', '-25', '-26', '-27', '-28', '-29', '-30', '-31', '-32', '-33', '-34', '-35',
'-36',
'-37', '-38', '-39', '-40', '-41', '-42', '-43', '-44', '-45', '-46', '-47', '-48', '-49', '-50',
'-51',
'-52', '-53', '-54', '-55', '-56', '-57', '-58', '-59', '-60', '-61', '-62', '-63', '-64', '-65',
'-66',
'-67', '-68', '-69', '-70', '-71', '-72', '-73', '-74', '-75', '-76', '-77', '-78', '-79', '-80',
'-81',
'-82', '-83', '-84', '-85', '-86', '-87', '-88', '-89', '-90', '-91', '-92', '-93', '-94', '-95',
'-96',
'-97', '-98', '-of'),
devices=None,
logger=None,
seed=None,
finetune: Union[bool, str] = False,
eval_trn=True,
_device_placeholder=False,
**kwargs):
"""
Args:
trn_data:
dev_data:
save_dir:
batch_size:
epochs:
transformer:
lr:
grad_norm:
weight_decay:
warmup_steps:
dropout:
attention_dropout:
pred_min:
eval_after:
collapse_name_ops: ``True`` to merge name ops.
use_pointer_tokens: ``True`` to use pointer tokens to represent variables.
raw_graph: ``True`` to use the raw graph as input and skip all pre/post-processing steps.
gradient_accumulation:
recategorization_tokens: Tokens used in re-categorization. They will be added to tokenizer too but do not
put them into ``additional_tokens``.
additional_tokens: Tokens to be added to the tokenizer vocab.
devices:
logger:
seed:
finetune:
eval_trn:
_device_placeholder:
**kwargs:
Returns:
"""
return super().fit(**merge_locals_kwargs(locals(), kwargs))
def on_config_ready(self, **kwargs):
super().on_config_ready(**kwargs)
config = AutoConfig_.from_pretrained(self.config.transformer)
config.output_past = False
config.no_repeat_ngram_size = 0
config.prefix = " "
# config.output_attentions = True
config.dropout = self.config.dropout
config.attention_dropout = self.config.attention_dropout
self._transformer_config = config
def evaluate(self, tst_data, save_dir=None, logger: logging.Logger = None, batch_size=None, output=True,
cache=None, ret_speed=False, **kwargs):
return super().evaluate(tst_data, save_dir, logger, batch_size, output, cache, ret_speed, **kwargs)
def build_vocabs(self, trn: torch.utils.data.Dataset, logger: logging.Logger):
additional_tokens = set()
self.collect_additional_tokens(additional_tokens, trn)
additional_tokens = sorted(additional_tokens)
self.build_tokenizer(additional_tokens)
self.vocabs['additional_tokens'] = Vocab(idx_to_token=list(additional_tokens))
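if __name__ == '__main__':
    # A hedged usage sketch: the corpus paths and save directory below are
    # placeholders, not resources shipped with HanLP.
    parser = Seq2seq_AMR_Parser()
    parser.fit('data/amr/train.amr.txt', 'data/amr/dev.amr.txt',
               'data/model/amr-seq2seq', transformer='facebook/bart-base')
    print(parser.predict('The boy wants the girl to believe him.'))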
================================================
FILE: hanlp/components/classifiers/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-11-10 13:18
================================================
FILE: hanlp/components/classifiers/fasttext_classifier.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2022-09-28 13:31
import os
import sys
from typing import List, Union
import fasttext
from fasttext.FastText import _FastText
import hanlp
from hanlp.common.component import Component
from hanlp.utils.io_util import get_resource, stdout_redirected
from hanlp_common.io import load_json
from hanlp_common.reflection import classpath_of
from hanlp_common.structure import SerializableDict
class FastTextClassifier(Component):
def __init__(self) -> None:
super().__init__()
self._model: _FastText = None
self.config = SerializableDict({
'classpath': classpath_of(self),
'hanlp_version': hanlp.__version__,
})
def load(self, save_dir, model_path=None, **kwargs):
config_path = os.path.join(save_dir, 'config.json')
if os.path.isfile(config_path):
self.config: dict = load_json(config_path)
model_path = self.config.get('model_path', model_path)
else:
model_path = model_path or save_dir
self.config['model_path'] = model_path
filepath = get_resource(model_path)
with stdout_redirected(to=os.devnull, stdout=sys.stderr):
self._model = fasttext.load_model(filepath)
def predict(self, text: Union[str, List[str]], topk=False, prob=False, max_len=None, **kwargs):
"""
Classify text.
Args:
text: A document or a list of documents.
topk: ``True`` or ``int`` to return the top-k labels.
prob: Whether to also return the probability of each label.
max_len: Truncate long documents to their first ``max_len`` characters for faster prediction.
**kwargs: Not used.
Returns:
Classification results.
"""
num_labels = len(self._model.get_labels())
flat = isinstance(text, str)
if flat:
text = [text]
if not isinstance(topk, list):
topk = [topk] * len(text)
if not isinstance(prob, list):
prob = [prob] * len(text)
if max_len:
text = [x[:max_len] for x in text]
text = [x.replace('\n', ' ') for x in text]
batch_labels, batch_probs = self._model.predict(text, k=num_labels)
results = []
for labels, probs, k, p in zip(batch_labels, batch_probs, topk, prob):
labels = [self._strip_prefix(x) for x in labels]
if k is False:
labels = labels[0]
elif k is True:
pass
elif k:
labels = labels[:k]
if p:
probs = probs.tolist()
if k is False:
result = labels, probs[0]
else:
result = dict(zip(labels, probs))
else:
result = labels
results.append(result)
if flat:
results = results[0]
return results
@property
def labels(self):
return [self._strip_prefix(x) for x in self._model.get_labels()]
@staticmethod
def _strip_prefix(label: str):
return label[len('__label__'):]
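if __name__ == '__main__':
    # A hedged usage sketch: the path below is a placeholder for any fastText
    # supervised classification model in .bin format.
    classifier = FastTextClassifier()
    classifier.load('data/model/classifier.bin')
    print(classifier.predict('fastText is fast', topk=2, prob=True))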
================================================
FILE: hanlp/components/classifiers/transformer_classifier.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-06-08 16:31
import logging
from abc import ABC
from typing import Callable, Union
from typing import List
import torch
from torch import nn
from torch.utils.data import DataLoader
from hanlp_common.constant import IDX
from hanlp.common.dataset import TableDataset, SortingSampler, PadSequenceDataLoader, TransformableDataset
from hanlp.common.torch_component import TorchComponent
from hanlp.common.vocab import Vocab
from hanlp.components.distillation.schedulers import LinearTeacherAnnealingScheduler
from hanlp.layers.scalar_mix import ScalarMixWithDropoutBuilder
from hanlp.layers.transformers.encoder import TransformerEncoder
from hanlp.layers.transformers.pt_imports import PreTrainedModel, AutoTokenizer, BertTokenizer, AutoTokenizer_
from hanlp.layers.transformers.utils import transformer_sliding_window, build_optimizer_scheduler_with_transformer
from hanlp.metrics.accuracy import CategoricalAccuracy
from hanlp.transform.transformer_tokenizer import TransformerTextTokenizer
from hanlp.utils.time_util import CountdownTimer
from hanlp_common.util import merge_locals_kwargs, merge_dict, isdebugging
class TransformerClassificationModel(nn.Module):
def __init__(self,
transformer: PreTrainedModel,
num_labels: int,
max_seq_length=512) -> None:
super().__init__()
self.max_seq_length = max_seq_length
self.transformer = transformer
self.dropout = nn.Dropout(transformer.config.hidden_dropout_prob)
self.classifier = nn.Linear(transformer.config.hidden_size, num_labels)
def forward(self, input_ids, attention_mask, token_type_ids):
seq_length = input_ids.size(-1)
if seq_length > self.max_seq_length:
sequence_output = transformer_sliding_window(self.transformer, input_ids,
max_pieces=self.max_seq_length, ret_cls='max')
else:
sequence_output = self.transformer(input_ids, attention_mask, token_type_ids)[0][:, 0, :]
sequence_output = self.dropout(sequence_output)
logits = self.classifier(sequence_output)
return logits
class TransformerComponent(TorchComponent, ABC):
def __init__(self, **kwargs) -> None:
""" The base class for transorfmer based components. If offers methods to build transformer tokenizers
, optimizers and models.
Args:
**kwargs: Passed to config.
"""
super().__init__(**kwargs)
self.transformer_tokenizer = None
def build_optimizer(self,
trn,
epochs,
lr,
adam_epsilon,
weight_decay,
warmup_steps,
transformer_lr=None,
teacher=None,
**kwargs):
num_training_steps = len(trn) * epochs // self.config.get('gradient_accumulation', 1)
if transformer_lr is None:
transformer_lr = lr
transformer = self.model.encoder.transformer
optimizer, scheduler = build_optimizer_scheduler_with_transformer(self.model, transformer,
lr, transformer_lr,
num_training_steps, warmup_steps,
weight_decay, adam_epsilon)
if teacher:
lambda_scheduler = LinearTeacherAnnealingScheduler(num_training_steps)
scheduler = (scheduler, lambda_scheduler)
return optimizer, scheduler
def fit(self, trn_data, dev_data, save_dir,
transformer=None,
lr=5e-5,
transformer_lr=None,
adam_epsilon=1e-8,
weight_decay=0,
warmup_steps=0.1,
batch_size=32,
gradient_accumulation=1,
grad_norm=5.0,
transformer_grad_norm=None,
average_subwords=False,
scalar_mix: Union[ScalarMixWithDropoutBuilder, int] = None,
word_dropout=None,
hidden_dropout=None,
max_seq_len=None,
ret_raw_hidden_states=False,
batch_max_tokens=None,
epochs=3,
logger=None,
devices: Union[float, int, List[int]] = None,
**kwargs):
return super().fit(**merge_locals_kwargs(locals(), kwargs))
def on_config_ready(self, **kwargs):
super().on_config_ready(**kwargs)
if 'albert_chinese' in self.config.transformer:
self.transformer_tokenizer = BertTokenizer.from_pretrained(self.config.transformer, use_fast=True)
else:
self.transformer_tokenizer = AutoTokenizer_.from_pretrained(self.config.transformer, use_fast=True)
def build_transformer(self, training=True):
transformer = TransformerEncoder(self.config.transformer, self.transformer_tokenizer,
self.config.average_subwords,
self.config.scalar_mix, self.config.word_dropout,
ret_raw_hidden_states=self.config.ret_raw_hidden_states,
training=training)
transformer_layers = self.config.get('transformer_layers', None)
if transformer_layers:
transformer.transformer.encoder.layer = transformer.transformer.encoder.layer[:transformer_layers]
return transformer
class TransformerClassifier(TransformerComponent):
def __init__(self, **kwargs) -> None:
"""A classifier using transformer as encoder.
Args:
**kwargs: Passed to config.
"""
super().__init__(**kwargs)
self.model: TransformerClassificationModel = None
def build_criterion(self, **kwargs):
criterion = nn.CrossEntropyLoss()
return criterion
def build_metric(self, **kwargs):
return CategoricalAccuracy()
def execute_training_loop(self, trn: DataLoader, dev: DataLoader, epochs, criterion, optimizer, metric, save_dir,
logger: logging.Logger, devices, **kwargs):
best_epoch, best_metric = 0, -1
timer = CountdownTimer(epochs)
ratio_width = len(f'{len(trn)}/{len(trn)}')
for epoch in range(1, epochs + 1):
logger.info(f"[yellow]Epoch {epoch} / {epochs}:[/yellow]")
self.fit_dataloader(trn, criterion, optimizer, metric, logger)
if dev:
self.evaluate_dataloader(dev, criterion, metric, logger, ratio_width=ratio_width)
report = f'{timer.elapsed_human}/{timer.total_time_human}'
dev_score = metric.get_metric()
if dev_score > best_metric:
self.save_weights(save_dir)
best_metric = dev_score
report += ' [red]saved[/red]'
timer.log(report, ratio_percentage=False, newline=True, ratio=False)
@property
def label_vocab(self):
return self.vocabs[self.config.label_key]
def fit_dataloader(self, trn: DataLoader, criterion, optimizer, metric, logger: logging.Logger, **kwargs):
self.model.train()
timer = CountdownTimer(len(trn))
optimizer, scheduler = optimizer
total_loss = 0
metric.reset()
for batch in trn:
optimizer.zero_grad()
logits = self.feed_batch(batch)
target = batch['label_id']
loss = self.compute_loss(criterion, logits, target, batch)
loss.backward()
optimizer.step()
scheduler.step()
total_loss += loss.item()
self.update_metric(metric, logits, target)
timer.log(f'loss: {total_loss / (timer.current + 1):.4f} acc: {metric.get_metric():.2%}',
ratio_percentage=None,
logger=logger)
del loss
return total_loss / timer.total
def update_metric(self, metric, logits: torch.Tensor, target, output=None):
metric(logits, target)
if output:
label_ids = logits.argmax(-1)
return label_ids
def compute_loss(self, criterion, logits, target, batch):
loss = criterion(logits, target)
return loss
def feed_batch(self, batch) -> torch.LongTensor:
logits = self.model(*[batch[key] for key in ['input_ids', 'attention_mask', 'token_type_ids']])
return logits
# noinspection PyMethodOverriding
def evaluate_dataloader(self,
data: DataLoader,
criterion: Callable,
metric,
logger,
ratio_width=None,
filename=None,
output=None,
**kwargs):
self.model.eval()
timer = CountdownTimer(len(data))
total_loss = 0
metric.reset()
num_samples = 0
if output:
output = open(output, 'w')
for batch in data:
logits = self.feed_batch(batch)
target = batch['label_id']
loss = self.compute_loss(criterion, logits, target, batch)
total_loss += loss.item()
label_ids = self.update_metric(metric, logits, target, output)
if output:
labels = [self.vocabs[self.config.label_key].idx_to_token[i] for i in label_ids.tolist()]
for i, label in enumerate(labels):
# text_a text_b pred gold
columns = [batch[self.config.text_a_key][i]]
if self.config.text_b_key:
columns.append(batch[self.config.text_b_key][i])
columns.append(label)
columns.append(batch[self.config.label_key][i])
output.write('\t'.join(columns))
output.write('\n')
num_samples += len(target)
report = f'loss: {total_loss / (timer.current + 1):.4f} acc: {metric.get_metric():.2%}'
if filename:
report = f'{filename} {report} {num_samples / timer.elapsed:.0f} samples/sec'
timer.log(report, ratio_percentage=None, logger=logger, ratio_width=ratio_width)
if output:
output.close()
return total_loss / timer.total
# noinspection PyMethodOverriding
def build_model(self, transformer, training=True, **kwargs) -> torch.nn.Module:
# config: PretrainedConfig = AutoConfig.from_pretrained(transformer)
# config.num_labels = len(self.vocabs.label)
# config.hidden_dropout_prob = self.config.hidden_dropout_prob
transformer = self.build_transformer(training=training).transformer
model = TransformerClassificationModel(transformer, len(self.vocabs.label))
# truncated_normal_(model.classifier.weight, mean=0.02, std=0.05)
return model
# noinspection PyMethodOverriding
def build_dataloader(self, data, batch_size, shuffle, device, text_a_key, text_b_key,
label_key,
logger: logging.Logger = None,
sorting=True,
**kwargs) -> DataLoader:
if not batch_size:
batch_size = self.config.batch_size
dataset = self.build_dataset(data)
dataset.append_transform(self.vocabs)
if self.vocabs.mutable:
if not any([text_a_key, text_b_key]):
if len(dataset.headers) == 2:
self.config.text_a_key = dataset.headers[0]
self.config.label_key = dataset.headers[1]
elif len(dataset.headers) >= 3:
self.config.text_a_key, self.config.text_b_key, self.config.label_key = dataset.headers[0], \
dataset.headers[1], \
dataset.headers[-1]
else:
raise ValueError('Wrong dataset format')
report = {'text_a_key', 'text_b_key', 'label_key'}
report = dict((k, self.config[k]) for k in report)
report = [f'{k}={v}' for k, v in report.items() if v]
report = ', '.join(report)
logger.info(f'Guess [bold][blue]{report}[/blue][/bold] according to the headers of training dataset: '
f'[blue]{dataset}[/blue]')
self.build_vocabs(dataset, logger)
dataset.purge_cache()
# if self.config.transform:
# dataset.append_transform(self.config.transform)
dataset.append_transform(TransformerTextTokenizer(tokenizer=self.transformer_tokenizer,
text_a_key=self.config.text_a_key,
text_b_key=self.config.text_b_key,
max_seq_length=self.config.max_seq_length,
truncate_long_sequences=self.config.truncate_long_sequences,
output_key=''))
batch_sampler = None
if sorting and not isdebugging():
if dataset.cache and len(dataset) > 1000:
timer = CountdownTimer(len(dataset))
lens = []
for idx, sample in enumerate(dataset):
lens.append(len(sample['input_ids']))
timer.log('Pre-processing and caching dataset [blink][yellow]...[/yellow][/blink]',
ratio_percentage=None)
else:
lens = [len(sample['input_ids']) for sample in dataset]
batch_sampler = SortingSampler(lens, batch_size=batch_size, shuffle=shuffle,
batch_max_tokens=self.config.batch_max_tokens)
return PadSequenceDataLoader(dataset, batch_size, shuffle, batch_sampler=batch_sampler, device=device)
def build_dataset(self, data) -> TransformableDataset:
if isinstance(data, str):
dataset = TableDataset(data, cache=True)
elif isinstance(data, TableDataset):
dataset = data
elif isinstance(data, list):
dataset = TableDataset(data)
else:
raise ValueError(f'Unsupported data {data}')
return dataset
def predict(self, data: Union[str, List[str]], batch_size: int = None, **kwargs):
if not data:
return []
flat = isinstance(data, str) or isinstance(data, tuple)
if flat:
data = [data]
samples = []
for idx, d in enumerate(data):
sample = {IDX: idx}
if self.config.text_b_key:
sample[self.config.text_a_key] = d[0]
sample[self.config.text_b_key] = d[1]
else:
sample[self.config.text_a_key] = d
samples.append(sample)
dataloader = self.build_dataloader(samples,
sorting=False,
**merge_dict(self.config,
batch_size=batch_size,
shuffle=False,
device=self.device,
overwrite=True)
)
labels = [None] * len(data)
vocab = self.vocabs.label
for batch in dataloader:
logits = self.feed_batch(batch)
pred = logits.argmax(-1)
pred = pred.tolist()
for idx, tag in zip(batch[IDX], pred):
labels[idx] = vocab.idx_to_token[tag]
if flat:
return labels[0]
return labels
def fit(self, trn_data, dev_data, save_dir,
text_a_key=None,
text_b_key=None,
label_key=None,
transformer=None,
max_seq_len=512,
truncate_long_sequences=True,
# hidden_dropout_prob=0.0,
lr=5e-5,
transformer_lr=None,
adam_epsilon=1e-6,
weight_decay=0,
warmup_steps=0.1,
batch_size=32,
batch_max_tokens=None,
epochs=3,
logger=None,
# transform=None,
devices: Union[float, int, List[int]] = None,
**kwargs):
return super().fit(**merge_locals_kwargs(locals(), kwargs))
def build_vocabs(self, trn, logger, **kwargs):
self.vocabs.label = Vocab(pad_token=None, unk_token=None)
# Iterating the dataset runs the vocab transform appended in build_dataloader,
# which collects labels into self.vocabs.label as a side effect.
for each in trn:
pass
self.vocabs.lock()
self.vocabs.summary(logger)
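if __name__ == '__main__':
    # A hedged usage sketch: trn.tsv and dev.tsv are placeholders for
    # tab-separated files; build_dataloader guesses text_a_key / label_key
    # from their headers when they are not given.
    classifier = TransformerClassifier()
    classifier.fit('data/classification/trn.tsv', 'data/classification/dev.tsv',
                   'data/model/classifier', transformer='bert-base-uncased')
    print(classifier.predict('A fascinating movie.'))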
================================================
FILE: hanlp/components/classifiers/transformer_classifier_hf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2023-02-17 17:54
import logging
from typing import List, Union, Callable
import torch
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification, PreTrainedTokenizer, AutoTokenizer
from hanlp.common.dataset import TableDataset, PadSequenceDataLoader, SortingSamplerBuilder
from hanlp.common.torch_component import TorchComponent
from hanlp_common.constant import IDX
from hanlp_common.util import split_dict, reorder
class TransformerClassifierHF(TorchComponent):
def __init__(self, **kwargs) -> None:
super().__init__(**kwargs)
self._tokenizer: PreTrainedTokenizer = None
def build_dataloader(self, data, sampler_builder=None, shuffle=False, device=None,
logger: logging.Logger = None,
**kwargs) -> DataLoader:
dataset = TableDataset(data)
lens = [len(sample['input_ids']) for sample in dataset]
if sampler_builder:
sampler = sampler_builder.build(lens, shuffle, 1)
else:
sampler = SortingSamplerBuilder(batch_size=32).build(lens, shuffle, 1)
loader = PadSequenceDataLoader(dataset=dataset,
batch_sampler=sampler,
pad={'input_ids': self._tokenizer.pad_token_id},
device=device,
vocabs=self.vocabs)
return loader
def build_optimizer(self, **kwargs):
raise NotImplementedError()
def build_criterion(self, **kwargs):
raise NotImplementedError()
def build_metric(self, **kwargs):
raise NotImplementedError()
def execute_training_loop(self, trn: DataLoader, dev: DataLoader, epochs, criterion, optimizer, metric, save_dir,
logger: logging.Logger, devices, ratio_width=None, **kwargs):
raise NotImplementedError()
def fit_dataloader(self, trn: DataLoader, criterion, optimizer, metric, logger: logging.Logger, **kwargs):
raise NotImplementedError()
def evaluate_dataloader(self, data: DataLoader, criterion: Callable, metric=None, output=False, **kwargs):
raise NotImplementedError()
def load_vocabs(self, save_dir, filename='vocabs.json'):
self._tokenizer = AutoTokenizer.from_pretrained(save_dir)
def load_weights(self, save_dir, filename='model.pt', **kwargs):
pass
def build_model(self, training=True, save_dir=None, **kwargs) -> torch.nn.Module:
return AutoModelForSequenceClassification.from_pretrained(save_dir)
def predict(self, text: Union[str, List[str]], topk=False, prob=False, **kwargs):
"""
Classify text.
Args:
text: A document or a list of documents.
topk: ``True`` or ``int`` to return the top-k labels.
prob: Whether to also return the probability of each label.
**kwargs: Not used.
Returns:
Classification results.
"""
flat = isinstance(text, str)
if flat:
text = [text]
if not isinstance(topk, list):
topk = [topk] * len(text)
if not isinstance(prob, list):
prob = [prob] * len(text)
# noinspection PyTypeChecker
dataloader = self.build_dataloader(
split_dict(self._tokenizer(text, max_length=self.model.config.max_position_embeddings, truncation=True,
return_token_type_ids=False, return_attention_mask=False)),
device=self.device)
results = []
order = []
id2label = self.model.config.id2label
for batch in dataloader:
logits = self.model(input_ids=batch['input_ids']).logits
logits, batch_labels = logits.sort(descending=True)
batch_labels = [[id2label[l] for l in ls] for ls in batch_labels.tolist()]
batch_probs = logits.softmax(dim=-1).tolist()
for labels, probs, i in zip(batch_labels, batch_probs, batch[IDX]):
k = topk[i]
p = prob[i]
if k is False:
labels = labels[0]
elif k is True:
pass
elif k:
labels = labels[:k]
if p:
if k is False:
result = labels, probs[0]
else:
result = dict(zip(labels, probs))
else:
result = labels
results.append(result)
order.extend(batch[IDX])
results = reorder(results, order)
if flat:
results = results[0]
return results
@property
def labels(self):
return [x[1] for x in sorted(self.model.config.id2label.items())]
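if __name__ == '__main__':
    # What predict() wraps, sketched with plain transformers; the checkpoint id
    # below is illustrative and will be downloaded from the HF hub.
    name = 'distilbert-base-uncased-finetuned-sst-2-english'
    tokenizer = AutoTokenizer.from_pretrained(name)
    model = AutoModelForSequenceClassification.from_pretrained(name)
    batch = tokenizer(['This movie is great!'], return_tensors='pt')
    with torch.no_grad():
        probs = model(**batch).logits.softmax(-1)
    top = probs[0].argmax().item()
    print(model.config.id2label[top], round(probs[0, top].item(), 4))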
================================================
FILE: hanlp/components/classifiers/transformer_classifier_tf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-11-10 13:19
import math
from typing import Union, Tuple, Any, Iterable
import tensorflow as tf
from hanlp.common.keras_component import KerasComponent
from hanlp_common.structure import SerializableDict
from hanlp.layers.transformers.loader_tf import build_transformer
from hanlp.optimizers.adamw import create_optimizer
from hanlp.transform.table_tf import TableTransform
from hanlp.utils.log_util import logger
from hanlp_common.util import merge_locals_kwargs
from transformers.tokenization_utils import PreTrainedTokenizer
class TransformerTextTransform(TableTransform):
def __init__(self, config: SerializableDict = None, map_x=False, map_y=True, x_columns=None,
y_column=-1, skip_header=True, delimiter='auto', multi_label=False, **kwargs) -> None:
super().__init__(config, map_x, map_y, x_columns, y_column, multi_label, skip_header, delimiter, **kwargs)
self.tokenizer: PreTrainedTokenizer = None
def inputs_to_samples(self, inputs, gold=False):
tokenizer = self.tokenizer
max_length = self.config.max_length
num_features = None
pad_token = None if self.label_vocab.mutable else tokenizer.convert_tokens_to_ids(['[PAD]'])[0]
for (X, Y) in super().inputs_to_samples(inputs, gold):
if self.label_vocab.mutable:
yield None, Y
continue
if isinstance(X, str):
X = (X,)
if num_features is None:
num_features = self.config.num_features
assert num_features == len(X), f'Numbers of features {num_features} ' \
f'inconsistent with current {len(X)}={X}'
text_a = X[0]
text_b = X[1] if len(X) > 1 else None
tokens_a = self.tokenizer.tokenize(text_a)
tokens_b = self.tokenizer.tokenize(text_b) if text_b else None
tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
segment_ids = [0] * len(tokens)
if tokens_b:
tokens += tokens_b
segment_ids += [1] * len(tokens_b)
token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
attention_mask = [1] * len(token_ids)
diff = max_length - len(token_ids)
if diff < 0:
# logger.warning(
# f'Input tokens {tokens} exceed the max sequence length of {max_length - 2}. '
# f'The exceeded part will be truncated and ignored. '
# f'You are recommended to split your long text into several sentences within '
# f'{max_length - 2} tokens beforehand.')
token_ids = token_ids[:max_length]
attention_mask = attention_mask[:max_length]
segment_ids = segment_ids[:max_length]
elif diff > 0:
token_ids += [pad_token] * diff
attention_mask += [0] * diff
segment_ids += [0] * diff
assert len(token_ids) == max_length, "Error with input length {} vs {}".format(len(token_ids), max_length)
assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask),
max_length)
assert len(segment_ids) == max_length, "Error with input length {} vs {}".format(len(segment_ids),
max_length)
label = Y
yield (token_ids, attention_mask, segment_ids), label
def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]:
max_length = self.config.max_length
types = (tf.int32, tf.int32, tf.int32), tf.string
shapes = ([max_length], [max_length], [max_length]), [None, ] if self.config.get('multi_label', None) else []
values = (0, 0, 0), self.label_vocab.safe_pad_token
return types, shapes, values
def x_to_idx(self, x) -> Union[tf.Tensor, Tuple]:
logger.fatal('map_x should always be set to True')
exit(1)
def y_to_idx(self, y) -> tf.Tensor:
if self.config.get('multi_label', None):
# need to change index to binary vector
mapped = tf.map_fn(fn=lambda x: tf.cast(self.label_vocab.lookup(x), tf.int32), elems=y,
fn_output_signature=tf.TensorSpec(dtype=tf.dtypes.int32, shape=[None, ]))
one_hots = tf.one_hot(mapped, len(self.label_vocab))
idx = tf.reduce_sum(one_hots, -2)
else:
idx = self.label_vocab.lookup(y)
return idx
def Y_to_outputs(self, Y: Union[tf.Tensor, Tuple[tf.Tensor]], gold=False, inputs=None, X=None,
batch=None) -> Iterable:
        # For multi-label, predictions are raw logits; positive labels are those with Y > 0
if self.config.get('multi_label', None):
preds = Y
else:
preds = tf.argmax(Y, axis=-1)
for y in preds:
yield self.label_vocab.idx_to_token[y]
def input_is_single_sample(self, input: Any) -> bool:
return isinstance(input, (str, tuple))
class TransformerClassifierTF(KerasComponent):
def __init__(self, bert_text_transform=None) -> None:
if not bert_text_transform:
bert_text_transform = TransformerTextTransform()
super().__init__(bert_text_transform)
self.model: tf.keras.Model
self.transform: TransformerTextTransform = bert_text_transform
# noinspection PyMethodOverriding
def fit(self, trn_data: Any, dev_data: Any, save_dir: str, transformer: str, max_length: int = 128,
optimizer='adamw', warmup_steps_ratio=0.1, use_amp=False, batch_size=32,
epochs=3, logger=None, verbose=1, **kwargs):
return super().fit(**merge_locals_kwargs(locals(), kwargs))
def evaluate_output(self, tst_data, out, num_batches, metric):
out.write('sentence\tpred\tgold\n')
total, correct, score = 0, 0, 0
for idx, batch in enumerate(tst_data):
outputs = self.model.predict_on_batch(batch[0])
outputs = tf.argmax(outputs, axis=1)
for X, Y_pred, Y_gold, in zip(batch[0][0], outputs, batch[1]):
feature = ' '.join(self.transform.tokenizer.convert_ids_to_tokens(X.numpy()))
feature = feature.replace(' ##', '') # fix sub-word generated by BERT tagger
out.write('{}\t{}\t{}\n'.format(feature,
self._y_id_to_str(Y_pred),
self._y_id_to_str(Y_gold)))
total += 1
correct += int(tf.equal(Y_pred, Y_gold).numpy())
score = correct / total
print('\r{}/{} {}: {:.2f}'.format(idx + 1, num_batches, metric, score * 100), end='')
print()
return score
def _y_id_to_str(self, Y_pred) -> str:
return self.transform.label_vocab.idx_to_token[Y_pred.numpy()]
def build_loss(self, loss, **kwargs):
if loss:
            assert isinstance(loss, tf.keras.losses.Loss), 'Must specify loss as an instance of tf.keras.losses.Loss'
return loss
elif self.config.get('multi_label', None):
# Loss to be BinaryCrossentropy for multi-label:
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
else:
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
return loss
# noinspection PyMethodOverriding
def build_optimizer(self, optimizer, use_amp, train_steps, warmup_steps, **kwargs):
if optimizer == 'adamw':
opt = create_optimizer(init_lr=5e-5, num_train_steps=train_steps, num_warmup_steps=warmup_steps)
# opt = tfa.optimizers.AdamW(learning_rate=3e-5, epsilon=1e-08, weight_decay=0.01)
# opt = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08)
self.config.optimizer = tf.keras.utils.serialize_keras_object(opt)
lr_config = self.config.optimizer['config']['learning_rate']['config']
if hasattr(lr_config['decay_schedule_fn'], 'get_config'):
lr_config['decay_schedule_fn'] = dict(
                    (k, v) for k, v in lr_config['decay_schedule_fn'].get_config().items() if not k.startswith('_'))
else:
opt = super().build_optimizer(optimizer)
if use_amp:
# loss scaling is currently required when using mixed precision
opt = tf.keras.mixed_precision.experimental.LossScaleOptimizer(opt, 'dynamic')
return opt
# noinspection PyMethodOverriding
def build_model(self, transformer, max_length, **kwargs):
model, self.transform.tokenizer = build_transformer(transformer, max_length, len(self.transform.label_vocab),
tagging=False)
return model
def build_vocab(self, trn_data, logger):
train_examples = super().build_vocab(trn_data, logger)
warmup_steps_per_epoch = math.ceil(train_examples * self.config.warmup_steps_ratio / self.config.batch_size)
self.config.warmup_steps = warmup_steps_per_epoch * self.config.epochs
return train_examples
def build_metrics(self, metrics, logger, **kwargs):
if self.config.get('multi_label', None):
metric = tf.keras.metrics.BinaryAccuracy('binary_accuracy')
else:
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
return [metric]
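if __name__ == '__main__':
    # Editor's sketch (not part of upstream HanLP): the multi-hot target built by
    # TransformerTextTransform.y_to_idx, shown on raw integer ids instead of a
    # Vocab lookup. With 5 classes, gold labels {1, 3} become [0, 1, 0, 1, 0].
    ids = tf.constant([1, 3])
    multi_hot = tf.reduce_sum(tf.one_hot(ids, depth=5), axis=-2)
    print(multi_hot.numpy())  # [0. 1. 0. 1. 0.]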
================================================
FILE: hanlp/components/classifiers/transformer_regression_hf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2023-02-17 17:54
import logging
from typing import List, Union, Callable
import torch
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification, PreTrainedTokenizer, AutoTokenizer
from hanlp.common.dataset import TableDataset, PadSequenceDataLoader, SortingSamplerBuilder
from hanlp.common.torch_component import TorchComponent
from hanlp_common.constant import IDX
from hanlp_common.util import split_dict, reorder
class TransformerRegressionHF(TorchComponent):
def __init__(self, **kwargs) -> None:
super().__init__(**kwargs)
self._tokenizer: PreTrainedTokenizer = None
def build_dataloader(self, data, sampler_builder=None, shuffle=False, device=None,
logger: logging.Logger = None,
**kwargs) -> DataLoader:
dataset = TableDataset(data)
lens = [len(sample['input_ids']) for sample in dataset]
if sampler_builder:
sampler = sampler_builder.build(lens, shuffle, 1)
else:
sampler = SortingSamplerBuilder(batch_size=32).build(lens, shuffle, 1)
loader = PadSequenceDataLoader(dataset=dataset,
batch_sampler=sampler,
pad={'input_ids': self._tokenizer.pad_token_id},
device=device,
vocabs=self.vocabs)
return loader
def build_optimizer(self, **kwargs):
raise NotImplementedError()
def build_criterion(self, **kwargs):
raise NotImplementedError()
def build_metric(self, **kwargs):
raise NotImplementedError()
def execute_training_loop(self, trn: DataLoader, dev: DataLoader, epochs, criterion, optimizer, metric, save_dir,
logger: logging.Logger, devices, ratio_width=None, **kwargs):
raise NotImplementedError()
def fit_dataloader(self, trn: DataLoader, criterion, optimizer, metric, logger: logging.Logger, **kwargs):
raise NotImplementedError()
def evaluate_dataloader(self, data: DataLoader, criterion: Callable, metric=None, output=False, **kwargs):
raise NotImplementedError()
def load_vocabs(self, save_dir, filename='vocabs.json'):
self._tokenizer = AutoTokenizer.from_pretrained(save_dir)
def load_weights(self, save_dir, filename='model.pt', **kwargs):
pass
def build_model(self, training=True, save_dir=None, **kwargs) -> torch.nn.Module:
return AutoModelForSequenceClassification.from_pretrained(save_dir)
def predict(self, text: Union[str, List[str]], **kwargs):
"""
Classify text.
Args:
text: A document or a list of documents.
topk: ``True`` or ``int`` to return the top-k labels.
prob: Return also probabilities.
max_len: Strip long document into ``max_len`` characters for faster prediction.
**kwargs: Not used
Returns:
Classification results.
"""
flat = isinstance(text, str)
if flat:
text = [text]
# noinspection PyTypeChecker
dataloader = self.build_dataloader(
split_dict(self._tokenizer(text, max_length=self.model.config.max_position_embeddings, truncation=True,
return_token_type_ids=False, return_attention_mask=False)),
device=self.device)
results = []
order = []
for batch in dataloader:
logits = self.model(input_ids=batch['input_ids']).logits
logits = logits.squeeze(-1).clip(-1, 1)
logits = logits.tolist()
results.extend(logits)
order.extend(batch[IDX])
results = reorder(results, order)
if flat:
results = results[0]
return results
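if __name__ == '__main__':
    # Editor's sketch (not part of upstream HanLP): the squeeze-and-clip step of
    # predict() above on dummy (batch, 1) regression logits; scores land in [-1, 1].
    dummy = torch.tensor([[1.7], [-0.2], [-3.0]])
    print(dummy.squeeze(-1).clip(-1, 1).tolist())  # [1.0, -0.2, -1.0]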
================================================
FILE: hanlp/components/distillation/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-10-17 20:29
================================================
FILE: hanlp/components/distillation/distillable_component.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-10-17 20:30
from abc import ABC
from copy import copy
import hanlp
from hanlp.common.torch_component import TorchComponent
from hanlp.components.distillation.losses import KnowledgeDistillationLoss
from hanlp.components.distillation.schedulers import TemperatureScheduler
from hanlp.utils.torch_util import cuda_devices
from hanlp_common.util import merge_locals_kwargs
class DistillableComponent(TorchComponent, ABC):
# noinspection PyMethodMayBeStatic,PyTypeChecker
def build_teacher(self, teacher: str, devices) -> TorchComponent:
return hanlp.load(teacher, load_kwargs={'devices': devices})
def distill(self,
teacher: str,
trn_data,
dev_data,
save_dir,
batch_size=None,
epochs=None,
kd_criterion='kd_ce_loss',
temperature_scheduler='flsw',
devices=None,
logger=None,
seed=None,
**kwargs):
devices = devices or cuda_devices()
if isinstance(kd_criterion, str):
kd_criterion = KnowledgeDistillationLoss(kd_criterion)
if isinstance(temperature_scheduler, str):
temperature_scheduler = TemperatureScheduler.from_name(temperature_scheduler)
teacher = self.build_teacher(teacher, devices=devices)
self.vocabs = teacher.vocabs
config = copy(teacher.config)
batch_size = batch_size or config.get('batch_size', None)
epochs = epochs or config.get('epochs', None)
config.update(kwargs)
return super().fit(**merge_locals_kwargs(locals(),
config,
excludes=('self', 'kwargs', '__class__', 'config')))
@property
def _savable_config(self):
config = super(DistillableComponent, self)._savable_config
if 'teacher' in config:
config.teacher = config.teacher.load_path
return config
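if __name__ == '__main__':
    # Editor's sketch (not part of upstream HanLP): distill() resolves its string
    # arguments into these objects before delegating to fit(). A typical call looks
    # like student.distill(teacher='path/to/teacher', trn_data=..., dev_data=...,
    # save_dir='student'), where all paths are hypothetical placeholders.
    criterion = KnowledgeDistillationLoss('kd_ce_loss')
    scheduler = TemperatureScheduler.from_name('flsw')
    print(type(criterion).__name__, type(scheduler).__name__)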
================================================
FILE: hanlp/components/distillation/losses.py
================================================
# Adapted from https://github.com/airaria/TextBrewer
# Apache License Version 2.0
import torch
import torch.nn.functional as F
from hanlp_common.configurable import AutoConfigurable
def kd_mse_loss(logits_S, logits_T, temperature=1):
'''
Calculate the mse loss between logits_S and logits_T
:param logits_S: Tensor of shape (batch_size, length, num_labels) or (batch_size, num_labels)
:param logits_T: Tensor of shape (batch_size, length, num_labels) or (batch_size, num_labels)
:param temperature: A float or a tensor of shape (batch_size, length) or (batch_size,)
'''
if isinstance(temperature, torch.Tensor) and temperature.dim() > 0:
temperature = temperature.unsqueeze(-1)
beta_logits_T = logits_T / temperature
beta_logits_S = logits_S / temperature
loss = F.mse_loss(beta_logits_S, beta_logits_T)
return loss
def kd_ce_loss(logits_S, logits_T, temperature=1):
'''
Calculate the cross entropy between logits_S and logits_T
:param logits_S: Tensor of shape (batch_size, length, num_labels) or (batch_size, num_labels)
:param logits_T: Tensor of shape (batch_size, length, num_labels) or (batch_size, num_labels)
:param temperature: A float or a tensor of shape (batch_size, length) or (batch_size,)
'''
if isinstance(temperature, torch.Tensor) and temperature.dim() > 0:
temperature = temperature.unsqueeze(-1)
beta_logits_T = logits_T / temperature
beta_logits_S = logits_S / temperature
p_T = F.softmax(beta_logits_T, dim=-1)
loss = -(p_T * F.log_softmax(beta_logits_S, dim=-1)).sum(dim=-1).mean()
return loss
def att_mse_loss(attention_S, attention_T, mask=None):
'''
* Calculates the mse loss between `attention_S` and `attention_T`.
* If the `inputs_mask` is given, masks the positions where ``input_mask==0``.
    :param torch.Tensor attention_S: tensor of shape (*batch_size*, *num_heads*, *length*, *length*)
    :param torch.Tensor attention_T: tensor of shape (*batch_size*, *num_heads*, *length*, *length*)
    :param torch.Tensor mask: tensor of shape (*batch_size*, *length*)
'''
if mask is None:
attention_S_select = torch.where(attention_S <= -1e-3, torch.zeros_like(attention_S), attention_S)
attention_T_select = torch.where(attention_T <= -1e-3, torch.zeros_like(attention_T), attention_T)
loss = F.mse_loss(attention_S_select, attention_T_select)
else:
mask = mask.to(attention_S).unsqueeze(1).expand(-1, attention_S.size(1), -1) # (bs, num_of_heads, len)
valid_count = torch.pow(mask.sum(dim=2), 2).sum()
loss = (F.mse_loss(attention_S, attention_T, reduction='none') * mask.unsqueeze(-1) * mask.unsqueeze(
2)).sum() / valid_count
return loss
def att_mse_sum_loss(attention_S, attention_T, mask=None):
'''
* Calculates the mse loss between `attention_S` and `attention_T`.
    * If the shape is (*batch_size*, *num_heads*, *length*, *length*), sums along the `num_heads` dimension and then calculates the mse loss between the two matrices.
    * If the `inputs_mask` is given, masks the positions where ``input_mask==0``.
    :param torch.Tensor attention_S: tensor of shape (*batch_size*, *num_heads*, *length*, *length*) or (*batch_size*, *length*, *length*)
    :param torch.Tensor attention_T: tensor of shape (*batch_size*, *num_heads*, *length*, *length*) or (*batch_size*, *length*, *length*)
    :param torch.Tensor mask: tensor of shape (*batch_size*, *length*)
'''
if len(attention_S.size()) == 4:
attention_T = attention_T.sum(dim=1)
attention_S = attention_S.sum(dim=1)
if mask is None:
attention_S_select = torch.where(attention_S <= -1e-3, torch.zeros_like(attention_S), attention_S)
attention_T_select = torch.where(attention_T <= -1e-3, torch.zeros_like(attention_T), attention_T)
loss = F.mse_loss(attention_S_select, attention_T_select)
else:
mask = mask.to(attention_S)
valid_count = torch.pow(mask.sum(dim=1), 2).sum()
loss = (F.mse_loss(attention_S, attention_T, reduction='none') * mask.unsqueeze(-1) * mask.unsqueeze(
1)).sum() / valid_count
return loss
def att_ce_loss(attention_S, attention_T, mask=None):
'''
    * Calculates the cross-entropy loss between `attention_S` and `attention_T`, where softmax is applied on ``dim=-1``.
    * If the `inputs_mask` is given, masks the positions where ``input_mask==0``.
    :param torch.Tensor attention_S: tensor of shape (*batch_size*, *num_heads*, *length*, *length*)
    :param torch.Tensor attention_T: tensor of shape (*batch_size*, *num_heads*, *length*, *length*)
    :param torch.Tensor mask: tensor of shape (*batch_size*, *length*)
'''
probs_T = F.softmax(attention_T, dim=-1)
if mask is None:
probs_T_select = torch.where(attention_T <= -1e-3, torch.zeros_like(attention_T), probs_T)
loss = -((probs_T_select * F.log_softmax(attention_S, dim=-1)).sum(dim=-1)).mean()
else:
mask = mask.to(attention_S).unsqueeze(1).expand(-1, attention_S.size(1), -1) # (bs, num_of_heads, len)
loss = -((probs_T * F.log_softmax(attention_S, dim=-1) * mask.unsqueeze(2)).sum(
dim=-1) * mask).sum() / mask.sum()
return loss
def att_ce_mean_loss(attention_S, attention_T, mask=None):
'''
    * Calculates the cross-entropy loss between `attention_S` and `attention_T`, where softmax is applied on ``dim=-1``.
    * If the shape is (*batch_size*, *num_heads*, *length*, *length*), averages over the `num_heads` dimension and then computes the cross-entropy loss between the two matrices.
    * If the `inputs_mask` is given, masks the positions where ``input_mask==0``.
    :param torch.Tensor attention_S: tensor of shape (*batch_size*, *num_heads*, *length*, *length*) or (*batch_size*, *length*, *length*)
    :param torch.Tensor attention_T: tensor of shape (*batch_size*, *num_heads*, *length*, *length*) or (*batch_size*, *length*, *length*)
    :param torch.Tensor mask: tensor of shape (*batch_size*, *length*)
'''
if len(attention_S.size()) == 4:
attention_S = attention_S.mean(dim=1) # (bs, len, len)
attention_T = attention_T.mean(dim=1)
probs_T = F.softmax(attention_T, dim=-1)
if mask is None:
probs_T_select = torch.where(attention_T <= -1e-3, torch.zeros_like(attention_T), probs_T)
loss = -((probs_T_select * F.log_softmax(attention_S, dim=-1)).sum(dim=-1)).mean()
else:
mask = mask.to(attention_S)
loss = -((probs_T * F.log_softmax(attention_S, dim=-1) * mask.unsqueeze(1)).sum(
dim=-1) * mask).sum() / mask.sum()
return loss
def hid_mse_loss(state_S, state_T, mask=None):
'''
* Calculates the mse loss between `state_S` and `state_T`, which are the hidden state of the models.
* If the `inputs_mask` is given, masks the positions where ``input_mask==0``.
    * If the hidden sizes of student and teacher are different, 'proj' option is required in `intermediate_matches` to match the dimensions.
:param torch.Tensor state_S: tensor of shape (*batch_size*, *length*, *hidden_size*)
:param torch.Tensor state_T: tensor of shape (*batch_size*, *length*, *hidden_size*)
:param torch.Tensor mask: tensor of shape (*batch_size*, *length*)
'''
if mask is None:
loss = F.mse_loss(state_S, state_T)
else:
mask = mask.to(state_S)
valid_count = mask.sum() * state_S.size(-1)
loss = (F.mse_loss(state_S, state_T, reduction='none') * mask.unsqueeze(-1)).sum() / valid_count
return loss
def cos_loss(state_S, state_T, mask=None):
'''
    * Computes the cosine similarity loss between the inputs. This is the loss used in DistilBERT, see `DistilBERT <https://arxiv.org/abs/1910.01108>`_.
    * If the `inputs_mask` is given, masks the positions where ``input_mask==0``.
    * If the hidden sizes of student and teacher are different, 'proj' option is required in `intermediate_matches` to match the dimensions.
:param torch.Tensor state_S: tensor of shape (*batch_size*, *length*, *hidden_size*)
:param torch.Tensor state_T: tensor of shape (*batch_size*, *length*, *hidden_size*)
:param torch.Tensor mask: tensor of shape (*batch_size*, *length*)
'''
if mask is None:
state_S = state_S.view(-1, state_S.size(-1))
state_T = state_T.view(-1, state_T.size(-1))
else:
mask = mask.to(state_S).unsqueeze(-1).expand_as(state_S) # (bs,len,dim)
state_S = torch.masked_select(state_S, mask).view(-1, mask.size(-1)) # (bs * select, dim)
state_T = torch.masked_select(state_T, mask).view(-1, mask.size(-1)) # (bs * select, dim)
target = state_S.new(state_S.size(0)).fill_(1)
loss = F.cosine_embedding_loss(state_S, state_T, target, reduction='mean')
return loss
def pkd_loss(state_S, state_T, mask=None):
'''
    * Computes the normalized vector mse loss at position 0 along the `length` dimension. This is the loss used in BERT-PKD, see `Patient Knowledge Distillation for BERT Model Compression <https://arxiv.org/abs/1908.09355>`_.
    * If the hidden sizes of student and teacher are different, 'proj' option is required in `intermediate_matches` to match the dimensions.
:param torch.Tensor state_S: tensor of shape (*batch_size*, *length*, *hidden_size*)
:param torch.Tensor state_T: tensor of shape (*batch_size*, *length*, *hidden_size*)
:param mask: not used.
'''
cls_T = state_T[:, 0] # (batch_size, hidden_dim)
cls_S = state_S[:, 0] # (batch_size, hidden_dim)
normed_cls_T = cls_T / torch.norm(cls_T, dim=1, keepdim=True)
normed_cls_S = cls_S / torch.norm(cls_S, dim=1, keepdim=True)
loss = (normed_cls_S - normed_cls_T).pow(2).sum(dim=-1).mean()
return loss
def fsp_loss(state_S, state_T, mask=None):
r'''
    * Takes in two lists of matrices `state_S` and `state_T`. Each list contains two matrices of the shape (*batch_size*, *length*, *hidden_size*). Computes the similarity matrix between the two matrices in `state_S` ( with the resulting shape (*batch_size*, *hidden_size*, *hidden_size*) ) and the ones in `state_T` ( with the resulting shape (*batch_size*, *hidden_size*, *hidden_size*) ), then computes the mse loss between the similarity matrices:
.. math::
loss = mean((S_{1}^T \cdot S_{2} - T_{1}^T \cdot T_{2})^2)
    * It is a variant of the FSP loss in `A Gift from Knowledge Distillation: Fast Optimization, Network Minimization and Transfer Learning` (CVPR 2017).
    * If the `inputs_mask` is given, masks the positions where ``input_mask==0``.
    * If the hidden sizes of student and teacher are different, 'proj' option is required in `intermediate_matches` to match the dimensions.
    :param torch.Tensor state_S: list of two tensors, each of the shape (*batch_size*, *length*, *hidden_size*)
    :param torch.Tensor state_T: list of two tensors, each of the shape (*batch_size*, *length*, *hidden_size*)
    :param torch.Tensor mask: tensor of the shape (*batch_size*, *length*)
Example in `intermediate_matches`::
intermediate_matches = [
{'layer_T':[0,0], 'layer_S':[0,0], 'feature':'hidden','loss': 'fsp', 'weight' : 1, 'proj':['linear',384,768]},
...]
'''
if mask is None:
state_S_0 = state_S[0] # (batch_size , length, hidden_dim)
state_S_1 = state_S[1] # (batch_size, length, hidden_dim)
state_T_0 = state_T[0]
state_T_1 = state_T[1]
gram_S = torch.bmm(state_S_0.transpose(1, 2), state_S_1) / state_S_1.size(
1) # (batch_size, hidden_dim, hidden_dim)
gram_T = torch.bmm(state_T_0.transpose(1, 2), state_T_1) / state_T_1.size(1)
else:
mask = mask.to(state_S[0]).unsqueeze(-1)
lengths = mask.sum(dim=1, keepdim=True)
state_S_0 = state_S[0] * mask
state_S_1 = state_S[1] * mask
state_T_0 = state_T[0] * mask
state_T_1 = state_T[1] * mask
gram_S = torch.bmm(state_S_0.transpose(1, 2), state_S_1) / lengths
gram_T = torch.bmm(state_T_0.transpose(1, 2), state_T_1) / lengths
loss = F.mse_loss(gram_S, gram_T)
return loss
def mmd_loss(state_S, state_T, mask=None):
r'''
    * Takes in two lists of matrices `state_S` and `state_T`. Each list contains two matrices of the shape (*batch_size*, *length*, *hidden_size*). The `hidden_size` of the matrices in `state_S` doesn't need to be the same as that of `state_T`. Computes the similarity matrix between the two matrices in `state_S` ( with the resulting shape (*batch_size*, *length*, *length*) ) and the ones in `state_T` ( with the resulting shape (*batch_size*, *length*, *length*) ), then computes the mse loss between the similarity matrices:
.. math::
loss = mean((S_{1} \cdot S_{2}^T - T_{1} \cdot T_{2}^T)^2)
    * It is a variant of the NST loss in `Like What You Like: Knowledge Distill via Neuron Selectivity Transfer <https://arxiv.org/abs/1707.01219>`_.
    * If the `inputs_mask` is given, masks the positions where ``input_mask==0``.
    :param torch.Tensor state_S: list of two tensors, each of the shape (*batch_size*, *length*, *hidden_size*)
    :param torch.Tensor state_T: list of two tensors, each of the shape (*batch_size*, *length*, *hidden_size*)
    :param torch.Tensor mask: tensor of the shape (*batch_size*, *length*)
Example in `intermediate_matches`::
intermediate_matches = [
{'layer_T':[0,0], 'layer_S':[0,0], 'feature':'hidden','loss': 'nst', 'weight' : 1},
...]
'''
state_S_0 = state_S[0] # (batch_size , length, hidden_dim_S)
state_S_1 = state_S[1] # (batch_size , length, hidden_dim_S)
state_T_0 = state_T[0] # (batch_size , length, hidden_dim_T)
state_T_1 = state_T[1] # (batch_size , length, hidden_dim_T)
if mask is None:
gram_S = torch.bmm(state_S_0, state_S_1.transpose(1, 2)) / state_S_1.size(2) # (batch_size, length, length)
gram_T = torch.bmm(state_T_0, state_T_1.transpose(1, 2)) / state_T_1.size(2)
loss = F.mse_loss(gram_S, gram_T)
else:
mask = mask.to(state_S[0])
valid_count = torch.pow(mask.sum(dim=1), 2).sum()
gram_S = torch.bmm(state_S_0, state_S_1.transpose(1, 2)) / state_S_1.size(1) # (batch_size, length, length)
gram_T = torch.bmm(state_T_0, state_T_1.transpose(1, 2)) / state_T_1.size(1)
loss = (F.mse_loss(gram_S, gram_T, reduction='none') * mask.unsqueeze(-1) * mask.unsqueeze(
1)).sum() / valid_count
return loss
class KnowledgeDistillationLoss(AutoConfigurable):
def __init__(self, name) -> None:
super().__init__()
self.name = name
import sys
thismodule = sys.modules[__name__]
self._loss = getattr(thismodule, name)
def __call__(self, *args, **kwargs):
return self._loss(*args, **kwargs)
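if __name__ == '__main__':
    # Editor's sketch (not part of upstream HanLP): soft-target cross entropy
    # between random student/teacher logits. A higher temperature flattens the
    # teacher distribution, yielding softer targets.
    logits_S, logits_T = torch.randn(4, 10), torch.randn(4, 10)
    print(float(kd_ce_loss(logits_S, logits_T, temperature=1)))
    print(float(KnowledgeDistillationLoss('kd_ce_loss')(logits_S, logits_T, temperature=4)))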
================================================
FILE: hanlp/components/distillation/schedulers.py
================================================
# Adapted from https://github.com/airaria/TextBrewer
# Apache License Version 2.0
from abc import ABC, abstractmethod
import torch
# x is between 0 and 1
from hanlp_common.configurable import AutoConfigurable
def linear_growth_weight_scheduler(x):
return x
def linear_decay_weight_scheduler(x):
return 1 - x
def constant_temperature_scheduler(logits_S, logits_T, base_temperature):
'''
Remember to detach logits_S
'''
return base_temperature
def flsw_temperature_scheduler_builder(beta, gamma, eps=1e-4, *args):
'''
adapted from arXiv:1911.07471
'''
def flsw_temperature_scheduler(logits_S, logits_T, base_temperature):
v = logits_S.detach()
t = logits_T.detach()
with torch.no_grad():
v = v / (torch.norm(v, dim=-1, keepdim=True) + eps)
t = t / (torch.norm(t, dim=-1, keepdim=True) + eps)
w = torch.pow((1 - (v * t).sum(dim=-1)), gamma)
tau = base_temperature + (w.mean() - w) * beta
return tau
return flsw_temperature_scheduler
def cwsm_temperature_scheduler_builder(beta, *args):
'''
adapted from arXiv:1911.07471
'''
def cwsm_temperature_scheduler(logits_S, logits_T, base_temperature):
v = logits_S.detach()
with torch.no_grad():
v = torch.softmax(v, dim=-1)
v_max = v.max(dim=-1)[0]
w = 1 / (v_max + 1e-3)
tau = base_temperature + (w.mean() - w) * beta
return tau
return cwsm_temperature_scheduler
class LinearTeacherAnnealingScheduler(object):
def __init__(self, num_training_steps: int) -> None:
super().__init__()
self._num_training_steps = num_training_steps
self._current_training_steps = 0
def step(self):
self._current_training_steps += 1
def __float__(self):
return self._current_training_steps / self._num_training_steps
class TemperatureScheduler(ABC, AutoConfigurable):
def __init__(self, base_temperature) -> None:
super().__init__()
self.base_temperature = base_temperature
def __call__(self, logits_S, logits_T):
return self.forward(logits_S, logits_T)
@abstractmethod
def forward(self, logits_S, logits_T):
raise NotImplementedError()
@staticmethod
def from_name(name):
classes = {
'constant': ConstantScheduler,
'flsw': FlswScheduler,
'cwsm': CwsmScheduler,
}
assert name in classes, f'Unsupported temperature scheduler {name}. Expect one from {list(classes.keys())}.'
return classes[name]()
class FunctionalScheduler(TemperatureScheduler):
def __init__(self, scheduler_func, base_temperature) -> None:
super().__init__(base_temperature)
self._scheduler_func = scheduler_func
def forward(self, logits_S, logits_T):
return self._scheduler_func(logits_S, logits_T, self.base_temperature)
class ConstantScheduler(TemperatureScheduler):
def forward(self, logits_S, logits_T):
return self.base_temperature
class FlswScheduler(FunctionalScheduler):
def __init__(self, beta=1, gamma=1, eps=1e-4, base_temperature=8):
super().__init__(flsw_temperature_scheduler_builder(beta, gamma, eps), base_temperature)
self.beta = beta
self.gamma = gamma
self.eps = eps
class CwsmScheduler(FunctionalScheduler):
def __init__(self, beta=1, base_temperature=8):
super().__init__(cwsm_temperature_scheduler_builder(beta), base_temperature)
self.beta = beta
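if __name__ == '__main__':
    # Editor's sketch (not part of upstream HanLP): FLSW assigns each sample its
    # own temperature around base_temperature=8, based on student/teacher agreement.
    scheduler = TemperatureScheduler.from_name('flsw')  # -> FlswScheduler()
    tau = scheduler(torch.randn(4, 10), torch.randn(4, 10))
    print(tau.shape)  # torch.Size([4]): one temperature per sample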
================================================
FILE: hanlp/components/eos/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-07-26 20:19
================================================
FILE: hanlp/components/eos/ngram.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-07-26 20:19
import logging
from collections import Counter
from typing import Union, List, Callable
import torch
from torch import nn, optim
from torch.nn import BCEWithLogitsLoss
from torch.utils.data import DataLoader
from hanlp.common.dataset import PadSequenceDataLoader
from hanlp.common.torch_component import TorchComponent
from hanlp.common.vocab import Vocab
from hanlp.datasets.eos.eos import SentenceBoundaryDetectionDataset
from hanlp.metrics.f1 import F1
from hanlp.utils.time_util import CountdownTimer
from hanlp_common.util import merge_locals_kwargs
class NgramSentenceBoundaryDetectionModel(nn.Module):
def __init__(self,
char_vocab_size,
embedding_size=128,
rnn_type: str = 'LSTM',
rnn_size=256,
rnn_layers=1,
rnn_bidirectional=False,
dropout=0.2,
**kwargs
):
super(NgramSentenceBoundaryDetectionModel, self).__init__()
self.embed = nn.Embedding(num_embeddings=char_vocab_size,
embedding_dim=embedding_size)
rnn_type = rnn_type.lower()
if rnn_type == 'lstm':
self.rnn = nn.LSTM(input_size=embedding_size,
hidden_size=rnn_size,
num_layers=rnn_layers,
                               dropout=dropout if rnn_layers > 1 else 0.0,
bidirectional=rnn_bidirectional,
batch_first=True)
elif rnn_type == 'gru':
            self.rnn = nn.GRU(input_size=embedding_size,
hidden_size=rnn_size,
num_layers=rnn_layers,
                              dropout=dropout if rnn_layers > 1 else 0.0,
bidirectional=rnn_bidirectional,
batch_first=True)
else:
raise NotImplementedError(f"'{rnn_type}' has to be one of [LSTM, GRU]")
self.dropout = nn.Dropout(p=dropout) if dropout else None
self.dense = nn.Linear(in_features=rnn_size * (2 if rnn_bidirectional else 1),
out_features=1)
def forward(self, x: torch.Tensor):
output = self.embed(x)
self.rnn.flatten_parameters()
output, _ = self.rnn(output)
        output = output[:, -1, :]  # take the hidden state at the last timestep
        if self.dropout:
            output = self.dropout(output)
output = self.dense(output).squeeze(-1)
return output
class NgramSentenceBoundaryDetector(TorchComponent):
def __init__(self, **kwargs) -> None:
"""A sentence boundary detector using ngram as features and LSTM as encoder (:cite:`Schweter:Ahmed:2019`).
        It predicts whether a punctuation character marks the end of a sentence (``EOS``).
.. Note::
            This component won't work on text without the punctuation characters defined in its config. It's always
            recommended to understand how it works before using it. The predefined punctuation characters can be
            listed with the following code.
>>> print(eos.config.eos_chars)
Args:
**kwargs: Passed to config.
"""
super().__init__(**kwargs)
def build_optimizer(self, **kwargs):
optimizer = optim.Adam(self.model.parameters(), lr=self.config.lr)
return optimizer
def build_criterion(self, **kwargs):
return BCEWithLogitsLoss()
def build_metric(self, **kwargs):
return F1()
def execute_training_loop(self,
trn: DataLoader,
dev: DataLoader,
epochs,
criterion,
optimizer,
metric,
save_dir,
logger: logging.Logger,
devices,
**kwargs):
best_epoch, best_metric = 0, -1
timer = CountdownTimer(epochs)
ratio_width = len(f'{len(trn)}/{len(trn)}')
for epoch in range(1, epochs + 1):
logger.info(f"[yellow]Epoch {epoch} / {epochs}:[/yellow]")
self.fit_dataloader(trn, criterion, optimizer, metric, logger)
if dev:
self.evaluate_dataloader(dev, criterion, metric, logger, ratio_width=ratio_width)
report = f'{timer.elapsed_human}/{timer.total_time_human}'
dev_score = metric.score
if dev_score > best_metric:
self.save_weights(save_dir)
best_metric = dev_score
report += ' [red]saved[/red]'
timer.log(report, ratio_percentage=False, newline=True, ratio=False)
def fit_dataloader(self,
trn: DataLoader,
criterion,
optimizer,
metric,
logger: logging.Logger,
**kwargs):
self.model.train()
timer = CountdownTimer(len(trn))
total_loss = 0
self.reset_metrics(metric)
for batch in trn:
optimizer.zero_grad()
prediction = self.feed_batch(batch)
loss = self.compute_loss(prediction, batch, criterion)
self.update_metrics(batch, prediction, metric)
loss.backward()
if self.config.grad_norm:
torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.config.grad_norm)
optimizer.step()
total_loss += loss.item()
timer.log(self.report_metrics(total_loss / (timer.current + 1), metric), ratio_percentage=None,
logger=logger)
del loss
return total_loss / timer.total
def compute_loss(self, prediction, batch, criterion):
loss = criterion(prediction, batch['label_id'])
return loss
# noinspection PyMethodOverriding
def evaluate_dataloader(self,
data: DataLoader,
criterion: Callable,
metric,
logger,
ratio_width=None,
output=False,
**kwargs):
self.model.eval()
self.reset_metrics(metric)
timer = CountdownTimer(len(data))
total_loss = 0
for batch in data:
prediction = self.feed_batch(batch)
self.update_metrics(batch, prediction, metric)
loss = self.compute_loss(prediction, batch, criterion)
total_loss += loss.item()
timer.log(self.report_metrics(total_loss / (timer.current + 1), metric), ratio_percentage=None,
logger=logger,
ratio_width=ratio_width)
del loss
return total_loss / timer.total, metric
def build_model(self, training=True, **kwargs) -> torch.nn.Module:
model = NgramSentenceBoundaryDetectionModel(**self.config, char_vocab_size=len(self.vocabs.char))
return model
def build_dataloader(self, data, batch_size, shuffle, device, logger: logging.Logger, **kwargs) -> DataLoader:
dataset = SentenceBoundaryDetectionDataset(data, **self.config, transform=[self.vocabs])
if isinstance(data, str):
dataset.purge_cache()
if not self.vocabs:
self.build_vocabs(dataset, logger)
return PadSequenceDataLoader(dataset, batch_size=batch_size, shuffle=shuffle, device=device,
pad={'label_id': .0})
def predict(self, data: Union[str, List[str]], batch_size: int = None, strip=True, **kwargs):
"""Sentence split.
Args:
data: A paragraph or a list of paragraphs.
batch_size: Number of samples per batch.
            strip: Strip whitespace from the beginning and end of each sentence.
Returns:
A list of sentences or a list of lists of sentences.
"""
if not data:
return []
self.model.eval()
flat = isinstance(data, str)
if flat:
data = [data]
samples = []
eos_chars = self.config.eos_chars
window_size = self.config.window_size
for doc_id_, corpus in enumerate(data):
corpus = list(corpus)
for i, c in enumerate(corpus):
if c in eos_chars:
window = corpus[max(0, i - window_size): i + window_size + 1]
samples.append({'char': window, 'offset_': i, 'doc_id_': doc_id_})
eos_prediction = [[] for _ in range(len(data))]
if samples:
dataloader = self.build_dataloader(samples, **self.config, device=self.device, shuffle=False, logger=None)
for batch in dataloader:
logits = self.feed_batch(batch)
prediction = (logits > 0).tolist()
for doc_id_, offset_, eos in zip(batch['doc_id_'], batch['offset_'], prediction):
if eos:
eos_prediction[doc_id_].append(offset_)
outputs = []
for corpus, output in zip(data, eos_prediction):
sents_per_document = []
prev_offset = 0
for offset in output:
offset += 1
sents_per_document.append(corpus[prev_offset:offset])
prev_offset = offset
if prev_offset != len(corpus):
sents_per_document.append(corpus[prev_offset:])
if strip:
sents_per_document = [x.strip() for x in sents_per_document]
sents_per_document = [x for x in sents_per_document if x]
outputs.append(sents_per_document)
if flat:
outputs = outputs[0]
return outputs
# noinspection PyMethodOverriding
def fit(self,
trn_data,
dev_data,
save_dir,
epochs=5,
append_after_sentence=None,
eos_chars=None,
eos_char_min_freq=200,
eos_char_is_punct=True,
char_min_freq=None,
window_size=5,
batch_size=32,
lr=0.001,
grad_norm=None,
loss_reduction='sum',
embedding_size=128,
rnn_type: str = 'LSTM',
rnn_size=256,
rnn_layers=1,
rnn_bidirectional=False,
dropout=0.2,
devices=None,
logger=None,
seed=None,
**kwargs
):
return super().fit(**merge_locals_kwargs(locals(), kwargs))
def build_vocabs(self, dataset: SentenceBoundaryDetectionDataset, logger, **kwargs):
char_min_freq = self.config.char_min_freq
if char_min_freq:
has_cache = dataset.cache is not None
char_counter = Counter()
for each in dataset:
for c in each['char']:
char_counter[c] += 1
self.vocabs.char = vocab = Vocab()
for c, f in char_counter.items():
if f >= char_min_freq:
vocab.add(c)
if has_cache:
dataset.purge_cache()
for each in dataset:
pass
else:
self.vocabs.char = Vocab()
for each in dataset:
pass
self.config.eos_chars = dataset.eos_chars
self.vocabs.lock()
self.vocabs.summary(logger)
def reset_metrics(self, metrics):
metrics.reset()
def report_metrics(self, loss, metrics):
return f'loss: {loss:.4f} {metrics}'
def update_metrics(self, batch: dict, prediction: torch.FloatTensor, metrics):
def nonzero_offsets(y):
return set(y.nonzero().squeeze(-1).tolist())
metrics(nonzero_offsets(prediction > 0), nonzero_offsets(batch['label_id']))
def feed_batch(self, batch):
prediction = self.model(batch['char_id'])
return prediction
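if __name__ == '__main__':
    # Editor's usage sketch (not part of upstream HanLP). Loading downloads a
    # pretrained checkpoint; UD_CTB_EOS_MUL is assumed to exist in
    # hanlp.pretrained.eos -- substitute any EOS identifier listed there.
    import hanlp
    eos = hanlp.load(hanlp.pretrained.eos.UD_CTB_EOS_MUL)
    print(eos.config.eos_chars)  # the punctuation this model can split on
    print(eos('叶子的尽头是阳光。风是自由的方向！'))  # -> two sentences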
================================================
FILE: hanlp/components/lambda_wrapper.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-31 18:36
from typing import Callable, Any
from hanlp.common.component import Component
from hanlp_common.reflection import classpath_of, object_from_classpath, str_to_type
class LambdaComponent(Component):
def __init__(self, function: Callable) -> None:
super().__init__()
self.config = {}
self.function = function
self.config['function'] = classpath_of(function)
self.config['classpath'] = classpath_of(self)
def predict(self, data: Any, **kwargs):
unpack = kwargs.pop('_hanlp_unpack', None)
if unpack:
return self.function(*data, **kwargs)
return self.function(data, **kwargs)
@staticmethod
def from_config(meta: dict, **kwargs):
cls = str_to_type(meta['classpath'])
function = meta['function']
function = object_from_classpath(function)
return cls(function)
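if __name__ == '__main__':
    # Editor's sketch (not part of upstream HanLP): wrap a plain function so that
    # it can participate in a HanLP pipeline like any other component.
    def to_lower(text):
        return text.lower()
    component = LambdaComponent(to_lower)
    print(component.predict('HanLP'))  # 'hanlp'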
================================================
FILE: hanlp/components/lemmatizer.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-08 18:35
from typing import List
from hanlp.common.transform import TransformList
from hanlp.components.parsers.ud.lemma_edit import gen_lemma_rule, apply_lemma_rule
from hanlp.components.taggers.transformers.transformer_tagger import TransformerTagger
def add_lemma_rules_to_sample(sample: dict):
if 'tag' in sample and 'lemma' not in sample:
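        # During training, the 'tag' column of a lemmatization corpus holds the gold
        # lemma; it is rewritten in place into an edit-script rule for the tagger.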
lemma_rules = [gen_lemma_rule(word, lemma)
if lemma != "_" else "_"
for word, lemma in zip(sample['token'], sample['tag'])]
sample['lemma'] = sample['tag'] = lemma_rules
return sample
class TransformerLemmatizer(TransformerTagger):
def __init__(self, **kwargs) -> None:
"""A transition based lemmatizer using transformer as encoder.
Args:
**kwargs: Predefined config.
"""
super().__init__(**kwargs)
def build_dataset(self, data, transform=None, **kwargs):
if not isinstance(transform, list):
transform = TransformList()
transform.append(add_lemma_rules_to_sample)
return super().build_dataset(data, transform, **kwargs)
def prediction_to_human(self, pred, vocab: List[str], batch, token=None):
if token is None:
token = batch['token']
rules = super().prediction_to_human(pred, vocab, batch)
for token_per_sent, rule_per_sent in zip(token, rules):
lemma_per_sent = [apply_lemma_rule(t, r) for t, r in zip(token_per_sent, rule_per_sent)]
yield lemma_per_sent
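if __name__ == '__main__':
    # Editor's sketch (not part of upstream HanLP): the rule round-trip behind the
    # tagger. By construction, applying a generated rule recovers the lemma.
    rule = gen_lemma_rule('studies', 'study')  # a compact edit-script string
    print(apply_lemma_rule('studies', rule))   # 'study'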
================================================
FILE: hanlp/components/lm/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2022-01-29 21:07
================================================
FILE: hanlp/components/lm/mlm.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2022-01-29 21:07
import logging
import math
from typing import Callable, Union, List
import torch
from hanlp_common.constant import IDX
from hanlp_common.util import reorder
from torch.utils.data import DataLoader
from transformers import AutoModelForMaskedLM
from transformers.tokenization_utils import PreTrainedTokenizer
from hanlp.common.dataset import TransformableDataset, PadSequenceDataLoader, SortingSampler
from hanlp.common.torch_component import TorchComponent
from hanlp.layers.transformers.pt_imports import AutoTokenizer_
from hanlp.transform.transformer_tokenizer import TransformerTextTokenizer
from hanlp.utils.time_util import CountdownTimer
class MaskedLanguageModelDataset(TransformableDataset):
def load_file(self, filepath: str):
raise NotImplementedError()
class MaskedLanguageModel(TorchComponent):
def __init__(self, **kwargs) -> None:
super().__init__(**kwargs)
self.tokenizer: PreTrainedTokenizer = None
def build_dataloader(self, data, batch_size, shuffle=False, device=None, logger: logging.Logger = None,
verbose=False, **kwargs) -> DataLoader:
dataset = MaskedLanguageModelDataset([{'token': x} for x in data], generate_idx=True,
transform=TransformerTextTokenizer(self.tokenizer, text_a_key='token'))
if verbose:
verbose = CountdownTimer(len(dataset))
lens = []
for each in dataset:
lens.append(len(each['token_input_ids']))
if verbose:
verbose.log('Preprocessing and caching samples [blink][yellow]...[/yellow][/blink]')
dataloader = PadSequenceDataLoader(dataset, batch_sampler=SortingSampler(lens, batch_size=batch_size),
device=device)
return dataloader
def build_optimizer(self, **kwargs):
raise NotImplementedError()
def build_criterion(self, **kwargs):
raise NotImplementedError()
def build_metric(self, **kwargs):
raise NotImplementedError()
def execute_training_loop(self, trn: DataLoader, dev: DataLoader, epochs, criterion, optimizer, metric, save_dir,
logger: logging.Logger, devices, ratio_width=None, **kwargs):
raise NotImplementedError()
def fit_dataloader(self, trn: DataLoader, criterion, optimizer, metric, logger: logging.Logger, **kwargs):
raise NotImplementedError()
def evaluate_dataloader(self, data: DataLoader, criterion: Callable, metric=None, output=False, **kwargs):
raise NotImplementedError()
def build_model(self, training=True, transformer=None, **kwargs) -> torch.nn.Module:
return AutoModelForMaskedLM.from_pretrained(transformer)
def input_is_flat(self, masked_sents):
return isinstance(masked_sents, str)
def predict(self, masked_sents: Union[str, List[str]], batch_size=32, topk=10, **kwargs):
flat = self.input_is_flat(masked_sents)
if flat:
masked_sents = [masked_sents]
dataloader = self.build_dataloader(masked_sents, **self.config, device=self.device, batch_size=batch_size)
orders = []
results = []
for batch in dataloader:
input_ids = batch['token_input_ids']
outputs = self.model(input_ids=input_ids, attention_mask=batch['token_attention_mask'])
mask = input_ids == self.tokenizer.mask_token_id
if mask.any():
num_masks = mask.sum(dim=-1).tolist()
masked_logits = outputs.logits[mask]
masked_logits[:, self.tokenizer.all_special_ids] = -math.inf
probs, indices = torch.nn.functional.softmax(masked_logits, dim=-1).topk(topk)
br = []
for p, index in zip(probs.tolist(), indices.tolist()):
br.append(dict(zip(self.tokenizer.convert_ids_to_tokens(index), p)))
offset = 0
for n in num_masks:
results.append(br[offset:offset + n])
offset += n
else:
results.extend([[]] * input_ids.size(0))
orders.extend(batch[IDX])
results = reorder(results, orders)
if flat:
results = results[0]
return results
def load_config(self, save_dir, filename='config.json', **kwargs):
self.config.transformer = save_dir
def load_vocabs(self, save_dir, filename='vocabs.json'):
self.tokenizer = AutoTokenizer_.from_pretrained(self.config.transformer)
def load_weights(self, save_dir, filename='model.pt', **kwargs):
pass
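if __name__ == '__main__':
    # Editor's usage sketch (not part of upstream HanLP): loading a raw
    # HuggingFace checkpoint works because load_config() treats save_dir as the
    # transformer name; 'bert-base-chinese' is fetched from the HF hub.
    mlm = MaskedLanguageModel()
    mlm.load('bert-base-chinese')
    print(mlm('生活的真谛是[MASK]。'))  # -> top-10 fillers with probabilities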
================================================
FILE: hanlp/components/mtl/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-06-20 19:54
================================================
FILE: hanlp/components/mtl/multi_task_learning.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-06-20 19:55
import functools
import itertools
import logging
import os
from collections import defaultdict
from copy import copy
from itertools import chain
from typing import Union, List, Callable, Dict, Optional, Any, Iterable, Tuple
import numpy as np
import torch
from hanlp_common.constant import IDX, BOS, EOS
from hanlp_common.document import Document
from hanlp_common.util import merge_locals_kwargs, topological_sort, reorder, prefix_match
from hanlp_common.visualization import markdown_table
from toposort import toposort
from torch.utils.data import DataLoader
import hanlp.utils.torch_util
from hanlp.common.dataset import PadSequenceDataLoader, PrefetchDataLoader, CachedDataLoader
from hanlp.common.structure import History
from hanlp.common.torch_component import TorchComponent
from hanlp.common.transform import FieldLength, TransformList
from hanlp.components.mtl.tasks import Task
from hanlp.layers.embeddings.contextual_word_embedding import ContextualWordEmbedding, ContextualWordEmbeddingModule
from hanlp.layers.embeddings.embedding import Embedding
from hanlp.layers.transformers.utils import pick_tensor_for_each_token
from hanlp.metrics.metric import Metric
from hanlp.metrics.mtl import MetricDict
from hanlp.transform.transformer_tokenizer import TransformerSequenceTokenizer
from hanlp.utils.time_util import CountdownTimer
from hanlp.utils.torch_util import clip_grad_norm
class MultiTaskModel(torch.nn.Module):
def __init__(self,
encoder: torch.nn.Module,
scalar_mixes: torch.nn.ModuleDict,
decoders: torch.nn.ModuleDict,
use_raw_hidden_states: dict) -> None:
super().__init__()
self.use_raw_hidden_states = use_raw_hidden_states
self.encoder: ContextualWordEmbeddingModule = encoder
self.scalar_mixes = scalar_mixes
self.decoders = decoders
class MultiTaskDataLoader(DataLoader):
def __init__(self, training=True, tau: float = 0.8, **dataloaders) -> None:
# noinspection PyTypeChecker
super().__init__(None)
self.tau = tau
self.training = training
self.dataloaders: Dict[str, DataLoader] = dataloaders if dataloaders else {}
# self.iterators = dict((k, iter(v)) for k, v in dataloaders.items())
def __len__(self) -> int:
if self.dataloaders:
return sum(len(x) for x in self.dataloaders.values())
return 0
def __iter__(self):
if self.training:
sampling_weights, total_size = self.sampling_weights
task_names = list(self.dataloaders.keys())
iterators = dict((k, itertools.cycle(v)) for k, v in self.dataloaders.items())
for i in range(total_size):
task_name = np.random.choice(task_names, p=sampling_weights)
yield task_name, next(iterators[task_name])
else:
for task_name, dataloader in self.dataloaders.items():
for batch in dataloader:
yield task_name, batch
@property
def sampling_weights(self):
sampling_weights = self.sizes
total_size = sum(sampling_weights)
Z = sum(pow(v, self.tau) for v in sampling_weights)
sampling_weights = [pow(v, self.tau) / Z for v in sampling_weights]
return sampling_weights, total_size
@property
def sizes(self):
return [len(v) for v in self.dataloaders.values()]
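if __name__ == '__main__':
    # Editor's sketch (not part of upstream HanLP): with tau < 1, sampling_weights
    # above flattens the task distribution, over-sampling smaller tasks.
    sizes, tau = [10000, 1000], 0.8
    Z = sum(pow(v, tau) for v in sizes)
    print([round(pow(v, tau) / Z, 3) for v in sizes])  # [0.863, 0.137] vs raw [0.909, 0.091]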
class MultiTaskLearning(TorchComponent):
def __init__(self, **kwargs) -> None:
""" A multi-task learning (MTL) framework. It shares the same encoder across multiple decoders. These decoders
can have dependencies on each other which will be properly handled during decoding. To integrate a component
into this MTL framework, a component needs to implement the :class:`~hanlp.components.mtl.tasks.Task` interface.
This framework mostly follows the architecture of :cite:`clark-etal-2019-bam` and :cite:`he-choi-2021-stem`, with additional scalar mix
tricks (:cite:`kondratyuk-straka-2019-75`) allowing each task to attend to any subset of layers. We also
        experimented with knowledge distillation on single tasks, but the performance gain was insignificant on large
        datasets. We have no near-term plans to invest more effort in distillation, since most datasets HanLP
        uses are relatively large, and our hardware is relatively powerful.
Args:
**kwargs: Arguments passed to config.
"""
super().__init__(**kwargs)
self.model: Optional[MultiTaskModel] = None
self.tasks: Dict[str, Task] = None
self.vocabs = None
def build_dataloader(self,
data,
batch_size,
shuffle=False,
device=None,
logger: logging.Logger = None,
gradient_accumulation=1,
tau: float = 0.8,
prune=None,
prefetch=None,
tasks_need_custom_eval=None,
cache=False,
debug=False,
**kwargs) -> DataLoader:
# This method is only called during training or evaluation but not prediction
dataloader = MultiTaskDataLoader(training=shuffle, tau=tau)
for i, (task_name, task) in enumerate(self.tasks.items()):
encoder_transform, transform = self.build_transform(task)
training = None
if data == 'trn':
if debug:
_data = task.dev
else:
_data = task.trn
training = True
elif data == 'dev':
_data = task.dev
training = False
elif data == 'tst':
_data = task.tst
training = False
else:
_data = data
if isinstance(data, str):
logger.info(f'[yellow]{i + 1} / {len(self.tasks)}[/yellow] Building [blue]{data}[/blue] dataset for '
f'[cyan]{task_name}[/cyan] ...')
# Adjust Tokenizer according to task config
config = copy(task.config)
config.pop('transform', None)
task_dataloader: DataLoader = task.build_dataloader(_data, transform, training, device, logger,
tokenizer=encoder_transform.tokenizer,
gradient_accumulation=gradient_accumulation,
cache=isinstance(data, str), **config)
# if prune:
# # noinspection PyTypeChecker
# task_dataset: TransformDataset = task_dataloader.dataset
# size_before = len(task_dataset)
# task_dataset.prune(prune)
# size_after = len(task_dataset)
# num_pruned = size_before - size_after
# logger.info(f'Pruned [yellow]{num_pruned} ({num_pruned / size_before:.1%})[/yellow] '
# f'samples out of {size_before}.')
if cache and data in ('trn', 'dev'):
task_dataloader: CachedDataLoader = CachedDataLoader(
task_dataloader,
f'{cache}/{os.getpid()}-{data}-{task_name.replace("/", "-")}-cache.pt' if isinstance(cache,
str) else None
)
dataloader.dataloaders[task_name] = task_dataloader
if data == 'trn':
sampling_weights, total_size = dataloader.sampling_weights
headings = ['task', '#batches', '%batches', '#scaled', '%scaled', '#epoch']
matrix = []
min_epochs = []
for (task_name, dataset), weight in zip(dataloader.dataloaders.items(), sampling_weights):
epochs = len(dataset) / weight / total_size
matrix.append(
[f'{task_name}', len(dataset), f'{len(dataset) / total_size:.2%}', int(total_size * weight),
f'{weight:.2%}', f'{epochs:.2f}'])
min_epochs.append(epochs)
longest = int(torch.argmax(torch.tensor(min_epochs)))
table = markdown_table(headings, matrix)
rows = table.splitlines()
cells = rows[longest + 2].split('|')
cells[-2] = cells[-2].replace(f'{min_epochs[longest]:.2f}',
f'[bold][red]{min_epochs[longest]:.2f}[/red][/bold]')
rows[longest + 2] = '|'.join(cells)
logger.info(f'[bold][yellow]{"Samples Distribution": ^{len(rows[0])}}[/yellow][/bold]')
logger.info('\n'.join(rows))
if prefetch and (data == 'trn' or not tasks_need_custom_eval):
dataloader = PrefetchDataLoader(dataloader, prefetch=prefetch)
return dataloader
def build_transform(self, task: Task) -> Tuple[TransformerSequenceTokenizer, TransformList]:
encoder: ContextualWordEmbedding = self.config.encoder
encoder_transform: TransformerSequenceTokenizer = task.build_tokenizer(encoder.transform())
length_transform = FieldLength('token', 'token_length')
transform = TransformList(encoder_transform, length_transform)
extra_transform = self.config.get('transform', None)
if extra_transform:
transform.insert(0, extra_transform)
return encoder_transform, transform
def build_optimizer(self,
trn,
epochs,
adam_epsilon,
weight_decay,
warmup_steps,
lr,
encoder_lr,
**kwargs):
model = self.model_
encoder = model.encoder
num_training_steps = len(trn) * epochs // self.config.get('gradient_accumulation', 1)
encoder_parameters = list(encoder.parameters())
parameter_groups: List[Dict[str, Any]] = []
decoders = model.decoders
decoder_optimizers = dict()
for k, task in self.tasks.items():
decoder: torch.nn.Module = decoders[k]
decoder_parameters = list(decoder.parameters())
if task.separate_optimizer:
decoder_optimizers[k] = task.build_optimizer(decoder=decoder, **kwargs)
else:
task_lr = task.lr or lr
parameter_groups.append({"params": decoder_parameters, 'lr': task_lr})
parameter_groups.append({"params": encoder_parameters, 'lr': encoder_lr})
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
no_decay_parameters = set()
for n, p in model.named_parameters():
if any(nd in n for nd in no_decay):
no_decay_parameters.add(p)
no_decay_by_lr = defaultdict(list)
for group in parameter_groups:
_lr = group['lr']
ps = group['params']
group['params'] = decay_parameters = []
group['weight_decay'] = weight_decay
for p in ps:
if p in no_decay_parameters:
no_decay_by_lr[_lr].append(p)
else:
decay_parameters.append(p)
for _lr, ps in no_decay_by_lr.items():
parameter_groups.append({"params": ps, 'lr': _lr, 'weight_decay': 0.0})
# noinspection PyTypeChecker
from transformers import optimization
encoder_optimizer = optimization.AdamW(
parameter_groups,
lr=lr,
weight_decay=weight_decay,
eps=adam_epsilon,
)
encoder_scheduler = optimization.get_linear_schedule_with_warmup(encoder_optimizer,
num_training_steps * warmup_steps,
num_training_steps)
return encoder_optimizer, encoder_scheduler, decoder_optimizers
def build_criterion(self, **kwargs):
return dict((k, v.build_criterion(decoder=self.model_.decoders[k], **kwargs)) for k, v in self.tasks.items())
def build_metric(self, **kwargs):
metrics = MetricDict()
for key, task in self.tasks.items():
metric = task.build_metric(**kwargs)
assert metric, f'Please implement `build_metric` of {type(task)} to return a metric.'
metrics[key] = metric
return metrics
def execute_training_loop(self, trn: DataLoader, dev: DataLoader, epochs, criterion, optimizer, metric, save_dir,
logger: logging.Logger, devices, patience=0.5, **kwargs):
if isinstance(patience, float):
patience = int(patience * epochs)
best_epoch, best_metric = 0, -1
timer = CountdownTimer(epochs)
ratio_width = len(f'{len(trn)}/{len(trn)}')
epoch = 0
history = History()
for epoch in range(1, epochs + 1):
logger.info(f"[yellow]Epoch {epoch} / {epochs}:[/yellow]")
self.fit_dataloader(trn, criterion, optimizer, metric, logger, history, ratio_width=ratio_width,
**self.config)
if dev:
self.evaluate_dataloader(dev, criterion, metric, logger, ratio_width=ratio_width, input='dev')
report = f'{timer.elapsed_human}/{timer.total_time_human}'
dev_score = metric.score
if dev_score > best_metric:
self.save_weights(save_dir)
best_metric = dev_score
best_epoch = epoch
report += ' [red]saved[/red]'
else:
report += f' ({epoch - best_epoch})'
if epoch - best_epoch >= patience:
report += ' early stop'
break
timer.log(report, ratio_percentage=False, newline=True, ratio=False)
for d in [trn, dev]:
self._close_dataloader(d)
if best_epoch != epoch:
logger.info(f'Restoring best model saved [red]{epoch - best_epoch}[/red] epochs ago')
self.load_weights(save_dir)
return best_metric
def _close_dataloader(self, d):
if isinstance(d, PrefetchDataLoader):
d.close()
if hasattr(d.dataset, 'close'):
self._close_dataloader(d.dataset)
elif isinstance(d, CachedDataLoader):
d.close()
elif isinstance(d, MultiTaskDataLoader):
for d in d.dataloaders.values():
self._close_dataloader(d)
# noinspection PyMethodOverriding
def fit_dataloader(self,
trn: DataLoader,
criterion,
optimizer,
metric,
logger: logging.Logger,
history: History,
ratio_width=None,
gradient_accumulation=1,
encoder_grad_norm=None,
decoder_grad_norm=None,
patience=0.5,
eval_trn=False,
**kwargs):
self.model.train()
encoder_optimizer, encoder_scheduler, decoder_optimizers = optimizer
timer = CountdownTimer(len(trn))
total_loss = 0
self.reset_metrics(metric)
model = self.model_
encoder_parameters = model.encoder.parameters()
decoder_parameters = model.decoders.parameters()
for idx, (task_name, batch) in enumerate(trn):
decoder_optimizer = decoder_optimizers.get(task_name, None)
output_dict, _ = self.feed_batch(batch, task_name)
loss = self.compute_loss(batch, output_dict[task_name]['output'], criterion[task_name],
self.tasks[task_name])
if gradient_accumulation and gradient_accumulation > 1:
loss /= gradient_accumulation
loss.backward()
total_loss += float(loss.item())
if history.step(gradient_accumulation):
if self.config.get('grad_norm', None):
clip_grad_norm(model, self.config.grad_norm)
if encoder_grad_norm:
torch.nn.utils.clip_grad_norm_(encoder_parameters, encoder_grad_norm)
if decoder_grad_norm:
torch.nn.utils.clip_grad_norm_(decoder_parameters, decoder_grad_norm)
encoder_optimizer.step()
encoder_optimizer.zero_grad()
encoder_scheduler.step()
if decoder_optimizer:
if isinstance(decoder_optimizer, tuple):
decoder_optimizer, decoder_scheduler = decoder_optimizer
else:
decoder_scheduler = None
decoder_optimizer.step()
decoder_optimizer.zero_grad()
if decoder_scheduler:
decoder_scheduler.step()
if eval_trn:
self.decode_output(output_dict, batch, task_name)
self.update_metrics(batch, output_dict, metric, task_name)
timer.log(self.report_metrics(total_loss / (timer.current + 1), metric if eval_trn else None),
ratio_percentage=None,
ratio_width=ratio_width,
logger=logger)
del loss
del output_dict
return total_loss / timer.total
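# A sketch of the gradient accumulation above (illustrative numbers, assuming
# history.step(n) returns True on every n-th backward pass): each batch loss is
# divided by gradient_accumulation, so with batches of 8 samples and
# gradient_accumulation=4 the optimizers step once per 32 effective samples,
# keeping the update scale comparable to one big batch.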
def report_metrics(self, loss, metrics: MetricDict):
return f'loss: {loss:.4f} {metrics.cstr()}' if metrics else f'loss: {loss:.4f}'
# noinspection PyMethodOverriding
@torch.no_grad()
def evaluate_dataloader(self,
data: MultiTaskDataLoader,
criterion,
metric: MetricDict,
logger,
ratio_width=None,
input: str = None,
**kwargs):
self.model.eval()
self.reset_metrics(metric)
tasks_need_custom_eval = self.config.get('tasks_need_custom_eval', None)
tasks_need_custom_eval = tasks_need_custom_eval or {}
tasks_need_custom_eval = dict((k, None) for k in tasks_need_custom_eval)
for each in tasks_need_custom_eval:
tasks_need_custom_eval[each] = data.dataloaders.pop(each)
timer = CountdownTimer(len(data) + len(tasks_need_custom_eval))
total_loss = 0
for idx, (task_name, batch) in enumerate(data):
output_dict, _ = self.feed_batch(batch, task_name)
loss = self.compute_loss(batch, output_dict[task_name]['output'], criterion[task_name],
self.tasks[task_name])
total_loss += loss.item()
self.decode_output(output_dict, batch, task_name)
self.update_metrics(batch, output_dict, metric, task_name)
timer.log(self.report_metrics(total_loss / (timer.current + 1), metric), ratio_percentage=None,
logger=logger,
ratio_width=ratio_width)
del loss
del output_dict
for task_name, dataset in tasks_need_custom_eval.items():
task = self.tasks[task_name]
decoder = self.model_.decoders[task_name]
task.evaluate_dataloader(
dataset, task.build_criterion(decoder=decoder),
metric=metric[task_name],
input=task.dev if input == 'dev' else task.tst,
split=input,
decoder=decoder,
h=functools.partial(self._encode, task_name=task_name,
cls_is_bos=task.cls_is_bos, sep_is_eos=task.sep_is_eos)
)
data.dataloaders[task_name] = dataset
timer.log(self.report_metrics(total_loss / (timer.current + 1), metric), ratio_percentage=None,
logger=logger,
ratio_width=ratio_width)
return total_loss / timer.total, metric, data
def build_model(self, training=False, **kwargs) -> torch.nn.Module:
tasks = self.tasks
encoder: ContextualWordEmbedding = self.config.encoder
transformer_module = encoder.module(training=training)
encoder_size = transformer_module.get_output_dim()
scalar_mixes = torch.nn.ModuleDict()
decoders = torch.nn.ModuleDict()
use_raw_hidden_states = dict()
for task_name, task in tasks.items():
decoder = task.build_model(encoder_size, training=training, **task.config)
assert decoder, f'Please implement `build_model` of {type(task)} to return a decoder.'
decoders[task_name] = decoder
if task.scalar_mix:
scalar_mix = task.scalar_mix.build()
scalar_mixes[task_name] = scalar_mix
# Activate scalar mix starting from the 0-th layer
encoder.scalar_mix = 0
use_raw_hidden_states[task_name] = task.use_raw_hidden_states
encoder.ret_raw_hidden_states = any(use_raw_hidden_states.values())
return MultiTaskModel(transformer_module, scalar_mixes, decoders, use_raw_hidden_states)
def predict(self,
data: Union[str, List[str]],
tasks: Optional[Union[str, List[str]]] = None,
skip_tasks: Optional[Union[str, List[str]]] = None,
resolved_tasks=None,
**kwargs) -> Document:
"""Predict on data.
Args:
data: A sentence or a list of sentences.
tasks: The tasks to predict.
skip_tasks: The tasks to skip.
resolved_tasks: The resolved tasks to override ``tasks`` and ``skip_tasks``.
**kwargs: Not used.
Returns:
A :class:`~hanlp_common.document.Document`.
"""
doc = Document()
target_tasks = resolved_tasks or self.resolve_tasks(tasks, skip_tasks)
if data == []:
for group in target_tasks:
for task_name in group:
doc[task_name] = []
return doc
flatten_target_tasks = [self.tasks[t] for group in target_tasks for t in group]
cls_is_bos = any([x.cls_is_bos for x in flatten_target_tasks])
sep_is_eos = any([x.sep_is_eos for x in flatten_target_tasks])
# Now build the dataloaders and execute tasks
first_task_name: str = list(target_tasks[0])[0]
first_task: Task = self.tasks[first_task_name]
encoder_transform, transform = self.build_transform(first_task)
# Override the tokenizer config of the 1st task
encoder_transform.sep_is_eos = sep_is_eos
encoder_transform.cls_is_bos = cls_is_bos
average_subwords = self.model.encoder.average_subwords
flat = first_task.input_is_flat(data)
if flat:
data = [data]
device = self.device
samples = first_task.build_samples(data, cls_is_bos=cls_is_bos, sep_is_eos=sep_is_eos)
dataloader = first_task.build_dataloader(samples, transform=transform, device=device)
results = defaultdict(list)
order = []
for batch in dataloader:
order.extend(batch[IDX])
# Run the first task, let it make the initial batch for the successors
output_dict = self.predict_task(first_task, first_task_name, batch, results, run_transform=True,
cls_is_bos=cls_is_bos, sep_is_eos=sep_is_eos)
# Run each task group in order
for group_id, group in enumerate(target_tasks):
# We could parallelize this in the future
for task_name in group:
if task_name == first_task_name:
continue
output_dict = self.predict_task(self.tasks[task_name], task_name, batch, results, output_dict,
run_transform=True, cls_is_bos=cls_is_bos, sep_is_eos=sep_is_eos)
if group_id == 0:
# This is somewhat hard-coded: if the first task is a tokenizer,
# we need to convert the hidden states and mask from subword level to token level
if first_task_name.startswith('tok'):
spans = []
tokens = []
output_spans = first_task.config.get('output_spans', None)
for span_per_sent, token_per_sent in zip(output_dict[first_task_name]['prediction'],
results[first_task_name][-len(batch[IDX]):]):
if output_spans:
token_per_sent = [x[0] for x in token_per_sent]
if cls_is_bos:
span_per_sent = [(-1, 0)] + span_per_sent
token_per_sent = [BOS] + token_per_sent
if sep_is_eos:
span_per_sent = span_per_sent + [(span_per_sent[-1][0] + 1, span_per_sent[-1][1] + 1)]
token_per_sent = token_per_sent + [EOS]
# The predicted offsets start at 0, but subword position 0 is [CLS], so shift every offset by 1
if average_subwords:
span_per_sent = [list(range(x[0] + 1, x[1] + 1)) for x in span_per_sent]
else:
span_per_sent = [x[0] + 1 for x in span_per_sent]
spans.append(span_per_sent)
tokens.append(token_per_sent)
spans = PadSequenceDataLoader.pad_data(spans, 0, torch.long, device=device)
output_dict['hidden'] = pick_tensor_for_each_token(output_dict['hidden'], spans,
average_subwords)
batch['token_token_span'] = spans
batch['token'] = tokens
# noinspection PyTypeChecker
batch['token_length'] = torch.tensor([len(x) for x in tokens], dtype=torch.long, device=device)
batch.pop('mask', None)
# Put results into doc in the order of tasks
for k in self.config.task_names:
v = results.get(k, None)
if v is None:
continue
doc[k] = reorder(v, order)
# Allow task to perform finalization on document
for group in target_tasks:
for task_name in group:
task = self.tasks[task_name]
task.finalize_document(doc, task_name)
# If no tok in doc, use raw input as tok
if not any(k.startswith('tok') for k in doc):
doc['tok'] = data
if flat:
for k, v in list(doc.items()):
doc[k] = v[0]
# If there is only one field, don't bother to wrap it
# if len(doc) == 1:
# return list(doc.values())[0]
return doc
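# Hedged usage sketch; assuming the pretrained identifier below exists in your
# HanLP version, and the sentence/tasks are illustrative:
#     >>> import hanlp
#     >>> mtl = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH)
#     >>> doc = mtl('晓美焰来到北京立方庭参观自然语义科技公司', tasks='tok*')
#     >>> doc['tok/fine']  # results are keyed by task name in the returned Document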
def resolve_tasks(self, tasks, skip_tasks) -> List[Iterable[str]]:
# Now we decide which tasks to perform and their orders
tasks_in_topological_order = self._tasks_in_topological_order
task_topological_order = self._task_topological_order
computation_graph = self._computation_graph
target_tasks = self._resolve_task_name(tasks)
if not target_tasks:
target_tasks = tasks_in_topological_order
else:
target_topological_order = defaultdict(set)
for task_name in target_tasks:
for dependency in topological_sort(computation_graph, task_name):
target_topological_order[task_topological_order[dependency]].add(dependency)
target_tasks = [item[1] for item in sorted(target_topological_order.items())]
if skip_tasks:
skip_tasks = self._resolve_task_name(skip_tasks)
target_tasks = [x - skip_tasks for x in target_tasks]
target_tasks = [x for x in target_tasks if x]
assert target_tasks, f'No task to perform due to `tasks = {tasks}`.'
# Sort target tasks within the same group in a defined order
target_tasks = [sorted(x, key=lambda _x: self.config.task_names.index(_x)) for x in target_tasks]
return target_tasks
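# Illustrative resolution (hypothetical task names): with tasks {'tok', 'pos', 'ner'}
# where 'pos' and 'ner' both depend on 'tok', resolve_tasks('ner') returns
# [['tok'], ['ner']]: dependencies come first, and tasks on the same topological
# level are grouped so they can share one pass over the encoder output.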
def predict_task(self, task: Task, output_key, batch, results, output_dict=None, run_transform=True,
cls_is_bos=True, sep_is_eos=True):
output_dict, batch = self.feed_batch(batch, output_key, output_dict, run_transform, cls_is_bos, sep_is_eos,
results)
self.decode_output(output_dict, batch, output_key)
results[output_key].extend(task.prediction_to_result(output_dict[output_key]['prediction'], batch))
return output_dict
def _resolve_task_name(self, dependencies):
resolved_dependencies = set()
if isinstance(dependencies, str):
if dependencies in self.tasks:
resolved_dependencies.add(dependencies)
elif dependencies.endswith('*'):
resolved_dependencies.update(x for x in self.tasks if x.startswith(dependencies[:-1]))
else:
prefix_matched = prefix_match(dependencies, self.config.task_names)
assert prefix_matched, f'No prefix matching for {dependencies}. ' \
f'Check your dependencies definition: {list(self.tasks.values())}'
resolved_dependencies.add(prefix_matched)
elif isinstance(dependencies, Iterable):
resolved_dependencies.update(set(chain.from_iterable(self._resolve_task_name(x) for x in dependencies)))
return resolved_dependencies
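# E.g. (hypothetical names): with tasks {'tok/fine', 'tok/coarse', 'pos'},
# _resolve_task_name('tok*') returns {'tok/fine', 'tok/coarse'}, while a prefix
# such as 'po' falls through to prefix_match and resolves to {'pos'}.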
def fit(self,
encoder: Embedding,
tasks: Dict[str, Task],
save_dir,
epochs,
patience=0.5,
lr=1e-3,
encoder_lr=5e-5,
adam_epsilon=1e-8,
weight_decay=0.0,
warmup_steps=0.1,
gradient_accumulation=1,
grad_norm=5.0,
encoder_grad_norm=None,
decoder_grad_norm=None,
tau: float = 0.8,
transform=None,
# prune: Callable = None,
eval_trn=True,
prefetch=None,
tasks_need_custom_eval=None,
_device_placeholder=False,
cache=False,
devices=None,
logger=None,
seed=None,
**kwargs):
trn_data, dev_data, batch_size = 'trn', 'dev', None
task_names = list(tasks.keys())
return super().fit(**merge_locals_kwargs(locals(), kwargs, excludes=('self', 'kwargs', '__class__', 'tasks')),
**tasks)
# noinspection PyAttributeOutsideInit
def on_config_ready(self, **kwargs):
self.tasks = dict((key, task) for key, task in self.config.items() if isinstance(task, Task))
computation_graph = dict()
for task_name, task in self.tasks.items():
dependencies = task.dependencies
resolved_dependencies = self._resolve_task_name(dependencies)
computation_graph[task_name] = resolved_dependencies
# We can cache this order
tasks_in_topological_order = list(toposort(computation_graph))
task_topological_order = dict()
for i, group in enumerate(tasks_in_topological_order):
for task_name in group:
task_topological_order[task_name] = i
self._tasks_in_topological_order = tasks_in_topological_order
self._task_topological_order = task_topological_order
self._computation_graph = computation_graph
@staticmethod
def reset_metrics(metrics: Dict[str, Metric]):
for metric in metrics.values():
metric.reset()
def feed_batch(self,
batch: Dict[str, Any],
task_name,
output_dict=None,
run_transform=False,
cls_is_bos=False,
sep_is_eos=False,
results=None) -> Tuple[Dict[str, Any], Dict[str, Any]]:
h, output_dict = self._encode(batch, task_name, output_dict, cls_is_bos, sep_is_eos)
task = self.tasks[task_name]
if run_transform:
batch = task.transform_batch(batch, results=results, cls_is_bos=cls_is_bos, sep_is_eos=sep_is_eos)
batch['mask'] = mask = hanlp.utils.torch_util.lengths_to_mask(batch['token_length'])
output_dict[task_name] = {
'output': task.feed_batch(h,
batch=batch,
mask=mask,
decoder=self.model.decoders[task_name]),
'mask': mask
}
return output_dict, batch
def _encode(self, batch, task_name, output_dict=None, cls_is_bos=False, sep_is_eos=False):
model = self.model
if output_dict:
hidden, raw_hidden = output_dict['hidden'], output_dict['raw_hidden']
else:
hidden = model.encoder(batch)
if isinstance(hidden, tuple):
hidden, raw_hidden = hidden
else:
raw_hidden = None
output_dict = {'hidden': hidden, 'raw_hidden': raw_hidden}
hidden_states = raw_hidden if model.use_raw_hidden_states[task_name] else hidden
if task_name in model.scalar_mixes:
scalar_mix = model.scalar_mixes[task_name]
h = scalar_mix(hidden_states)
else:
if model.scalar_mixes: # If any task enables scalar_mix, hidden_states will be a 4d tensor
hidden_states = hidden_states[-1, :, :, :]
h = hidden_states
# If the task doesn't need cls while h has cls, remove cls
task = self.tasks[task_name]
if cls_is_bos and not task.cls_is_bos:
h = h[:, 1:, :]
if sep_is_eos and not task.sep_is_eos:
h = h[:, :-1, :]
return h, output_dict
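# Shape sketch (illustrative dims): without scalar mix the encoder yields hidden of
# shape [batch, seq, hidden]; once any task enables scalar_mix, the encoder returns
# all layers as a 4-d tensor [layers, batch, seq, hidden], from which a task either
# mixes the layers via scalar_mixes[task_name] or takes the last layer with
# hidden_states[-1, :, :, :].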
def decode_output(self, output_dict, batch, task_name=None):
if not task_name:
for task_name, task in self.tasks.items():
output_per_task = output_dict.get(task_name, None)
if output_per_task is not None:
output_per_task['prediction'] = task.decode_output(
output_per_task['output'],
output_per_task['mask'],
batch, self.model.decoders[task_name])
else:
output_per_task = output_dict[task_name]
output_per_task['prediction'] = self.tasks[task_name].decode_output(
output_per_task['output'],
output_per_task['mask'],
batch,
self.model.decoders[task_name])
def update_metrics(self, batch: Dict[str, Any], output_dict: Dict[str, Any], metrics: MetricDict, task_name):
task = self.tasks[task_name]
output_per_task = output_dict.get(task_name, None)
if output_per_task:
output = output_per_task['output']
prediction = output_per_task['prediction']
metric = metrics.get(task_name, None)
task.update_metrics(batch, output, prediction, metric)
def compute_loss(self,
batch: Dict[str, Any],
output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
criterion: Callable,
task: Task) -> torch.FloatTensor:
return task.compute_loss(batch, output, criterion)
def evaluate(self, save_dir=None, logger: logging.Logger = None, batch_size=None, output=False, **kwargs):
rets = super().evaluate('tst', save_dir, logger, batch_size, output, **kwargs)
tst = rets[-1]
self._close_dataloader(tst)
return rets
def save_vocabs(self, save_dir, filename='vocabs.json'):
for task_name, task in self.tasks.items():
task.save_vocabs(save_dir, f'{task_name}_{filename}')
def load_vocabs(self, save_dir, filename='vocabs.json'):
for task_name, task in self.tasks.items():
task.load_vocabs(save_dir, f'{task_name}_{filename}')
def parallelize(self, devices: List[Union[int, torch.device]]):
raise NotImplementedError('Parallelization is not implemented yet.')
def __call__(self, data, **kwargs) -> Document:
return super().__call__(data, **kwargs)
def __getitem__(self, task_name: str) -> Task:
return self.tasks[task_name]
def __delitem__(self, task_name: str):
"""Delete a task (and every resource it owns) from this component.
Args:
task_name: The name of the task to be deleted.
Examples:
>>> del mtl['dep'] # Delete dep from MTL
"""
del self.config[task_name]
self.config.task_names.remove(task_name)
del self.tasks[task_name]
del self.model.decoders[task_name]
del self._computation_graph[task_name]
self._task_topological_order.pop(task_name)
for group in self._tasks_in_topological_order:
group: set = group
group.discard(task_name)
def __repr__(self):
return repr(self.config)
def items(self):
yield from self.tasks.items()
def __setattr__(self, key: str, value):
if key and key.startswith('dict') and not hasattr(self, key):
please_read_the_doc_ok = f'This MTL component has no {key}.'
matched_children = []
for name in self.config.task_names:
if hasattr(self[name], key):
matched_children.append(name)
if matched_children:
please_read_the_doc_ok += f' Maybe you are looking for one of its tasks: {matched_children}. ' \
f'For example, HanLP["{matched_children[0]}"].{key} = ...'
raise TypeError(please_read_the_doc_ok)
object.__setattr__(self, key, value)
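# E.g. (illustrative): ``mtl.dict_force = {'自然'}`` raises a TypeError suggesting
# ``mtl['tok/fine'].dict_force = {'自然'}`` instead, assuming a task named 'tok/fine'
# actually owns a ``dict_force`` attribute.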
================================================
FILE: hanlp/components/mtl/tasks/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-08-02 16:51
import logging
import os
import warnings
from abc import ABC, abstractmethod
from copy import copy
from typing import Callable, Dict, Any, Union, Iterable, List
import torch
from hanlp_common.util import merge_locals_kwargs
from torch.utils.data import DataLoader
from hanlp_common.constant import BOS, EOS
from hanlp.common.dataset import SamplerBuilder, SortingSamplerBuilder, TransformableDataset, KMeansSamplerBuilder
from hanlp_common.document import Document
from hanlp.common.structure import ConfigTracker
from hanlp.common.torch_component import TorchComponent
from hanlp.layers.scalar_mix import ScalarMixWithDropoutBuilder
from hanlp.metrics.metric import Metric
from hanlp.metrics.mtl import MetricDict
from hanlp.transform.transformer_tokenizer import TransformerSequenceTokenizer
from hanlp.utils.time_util import CountdownTimer
class Task(ConfigTracker, TorchComponent, ABC):
# noinspection PyMissingConstructor
def __init__(self,
trn: str = None,
dev: str = None,
tst: str = None,
sampler_builder: SamplerBuilder = None,
dependencies: str = None,
scalar_mix: ScalarMixWithDropoutBuilder = None,
use_raw_hidden_states=False,
lr=None,
separate_optimizer=False,
cls_is_bos=False,
sep_is_eos=False,
**kwargs) -> None:
"""
A task in the multi-task learning framework.
Args:
trn: Path to training set.
dev: Path to dev set.
tst: Path to test set.
sampler_builder: A builder which builds a sampler.
dependencies: Its dependencies on other tasks.
scalar_mix: A builder which builds a `ScalarMixWithDropout` object.
use_raw_hidden_states: Whether to use raw hidden states from transformer without any pooling.
lr: Learning rate for this task.
separate_optimizer: Use customized separate optimizer for this task.
cls_is_bos: ``True`` to treat the first token as ``BOS``.
sep_is_eos: ``True`` to treat the last token as ``EOS``.
**kwargs: Additional config.
"""
ConfigTracker.__init__(self, merge_locals_kwargs(locals(), kwargs))
for f, n in zip([trn, dev, tst], ['trn', 'dev', 'tst']):
if f and os.path.isfile(f): # anonymize local file names
self.config.pop(n)
self.separate_optimizer = separate_optimizer
self.lr = lr
self.use_raw_hidden_states = use_raw_hidden_states
if sampler_builder is None:
sampler_builder = SortingSamplerBuilder(batch_size=32)
self.sampler_builder: Union[SortingSamplerBuilder, KMeansSamplerBuilder] = sampler_builder
self.dependencies = dependencies
self.tst = tst
self.dev = dev
self.trn = trn
self.scalar_mix = scalar_mix
self.cls_is_bos = cls_is_bos
self.sep_is_eos = sep_is_eos
@abstractmethod
def build_dataloader(self,
data,
transform: Callable = None,
training=False,
device=None,
logger: logging.Logger = None,
cache=False,
gradient_accumulation=1,
**kwargs) -> DataLoader:
"""
Build a dataloader for training or evaluation.
Args:
data: Either a path or a list of samples.
transform: The transform from MTL, which is usually [TransformerSequenceTokenizer, FieldLength('token')].
training: Whether this method is called on the training set.
device: The device the dataloader is intended to work with.
logger: Logger for printing messages indicating progress.
cache: Whether the dataloader should be cached.
gradient_accumulation: Gradient accumulation to be passed to the sampler builder.
**kwargs: Additional experimental arguments.
"""
pass
def build_optimizer(self, decoder: torch.nn.Module, **kwargs):
pass
def build_batch_wise_scheduler(self, decoder: torch.nn.Module, **kwargs):
pass
@abstractmethod
def compute_loss(self,
batch: Dict[str, Any],
output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
criterion,
) -> Union[torch.FloatTensor, Dict[str, torch.FloatTensor]]:
pass
@abstractmethod
def decode_output(self,
output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
mask: torch.BoolTensor,
batch: Dict[str, Any], decoder: torch.nn.Module, **kwargs) -> Union[Dict[str, Any], Any]:
pass
@abstractmethod
def update_metrics(self,
batch: Dict[str, Any],
output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
prediction: Dict[str, Any],
metric: Union[MetricDict, Metric]):
pass
# noinspection PyMethodOverriding
@abstractmethod
def build_model(self, encoder_size, training=True, **kwargs) -> torch.nn.Module:
pass
@abstractmethod
def build_metric(self, **kwargs):
pass
def fit_dataloader(self, trn: DataLoader, criterion, optimizer, metric, logger: logging.Logger, **kwargs):
pass
def evaluate_dataloader(self, data: DataLoader, criterion: Callable, output=False, **kwargs):
pass
def execute_training_loop(self, trn: DataLoader, dev: DataLoader, epochs, criterion, optimizer, metric, save_dir,
logger: logging.Logger, devices, **kwargs):
pass
# noinspection PyMethodMayBeStatic
def compute_lens(self, data: Union[List[Dict[str, Any]], str], dataset: TransformableDataset,
input_ids='token_input_ids'):
"""
Args:
data: Samples to be measured, or a path to the dataset during training time.
dataset: During training time, use this dataset to measure the length of each sample inside.
input_ids: Field name corresponding to input ids.
Returns:
A list of lengths, one per sample.
"""
if dataset.cache is None:
warnings.warn(f'Caching for the dataset is not enabled, '
f'try `dataset.purge_cache()` if possible. The dataset is {dataset}.')
if isinstance(data, str):
timer = CountdownTimer(len(dataset))
for each in dataset:
timer.log('Preprocessing and caching samples [blink][yellow]...[/yellow][/blink]')
timer.erase()
return [len(x[input_ids]) for x in dataset]
def feed_batch(self,
h: torch.FloatTensor,
batch: Dict[str, torch.Tensor],
mask: torch.BoolTensor,
decoder: torch.nn.Module):
return decoder(h, batch=batch, mask=mask)
def input_is_flat(self, data) -> bool:
"""
Check whether the data is flat (meaning that it's only a single sample, not even batched).
Returns:
bool: ``True`` to indicate the input data is flat.
"""
raise NotImplementedError(
'`input_is_flat()` needs to be implemented for the task component to accept raw input from user.'
)
@abstractmethod
def prediction_to_result(self, prediction: Dict[str, Any], batch: Dict[str, Any]) -> List:
raise NotImplementedError()
# noinspection PyMethodMayBeStatic
def transform_batch(self,
batch: Dict[str, Any],
# inputs: List[List[str]],
results: Dict[str, Any] = None,
cls_is_bos=False,
sep_is_eos=False) -> Dict[str, Any]:
"""
Let the task transform the batch before feeding it into its decoder. The default behavior is to
adjust the head and tail of the token sequences according to the ``cls_is_bos`` and ``sep_is_eos``
passed in and the corresponding settings of the task itself.
Args:
batch: A batch of samples.
results: Predicted results from other tasks which might be useful for this task to utilize. For example,
a dep task that uses both token and pos as features will need both tok and pos results to make a batch.
cls_is_bos: Whether the first token in this batch is BOS.
sep_is_eos: Whether the last token in this batch is EOS.
Returns:
A batch.
"""
if cls_is_bos != self.cls_is_bos or sep_is_eos != self.sep_is_eos:
batch = copy(batch)
tokens = self._adjust_token(batch, cls_is_bos, sep_is_eos, 'token')
delta = len(tokens[0]) - len(batch['token'][0])
batch['token_length'] = batch['token_length'] + delta
batch['token'] = tokens
if 'token_' in batch:
if isinstance(batch['token_'][0], list):
batch['token_'] = self._adjust_token(batch, cls_is_bos, sep_is_eos, 'token_')
else:
batch['token_'] = tokens
return batch
def _adjust_token(self, batch, cls_is_bos, sep_is_eos, token_key):
tokens = []
for sent in batch[token_key]:
if cls_is_bos:
if not self.cls_is_bos:
sent = sent[1:]
elif self.cls_is_bos:
sent = [BOS] + sent
if sep_is_eos:
if not self.sep_is_eos:
sent = sent[:-1]
elif self.sep_is_eos:
sent = sent + [EOS]
tokens.append(sent)
return tokens
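# E.g. (illustrative): a batch sentence [BOS, '商品', '和', '服务'] produced with
# cls_is_bos=True becomes ['商品', '和', '服务'] for a task whose own cls_is_bos is
# False; the symmetric rules apply to EOS at the tail.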
# noinspection PyMethodMayBeStatic
def build_samples(self, inputs, cls_is_bos=False, sep_is_eos=False):
"""
Build samples for this task. Called when this task is the first task to run. The default behaviour is to
treat the inputs as lists of tokens and wrap each list in a per-sample dict.
Args:
inputs: Inputs from users, usually a list of lists of tokens.
cls_is_bos: Insert BOS to the head of each sentence.
sep_is_eos: Append EOS to the tail of each sentence.
Returns:
List of samples.
"""
if cls_is_bos:
inputs = [[BOS] + x for x in inputs]
if sep_is_eos:
inputs = [x + [EOS] for x in inputs]
return [{'token': token} for token in inputs]
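# E.g. build_samples([['商品', '和', '服务']]) returns [{'token': ['商品', '和', '服务']}];
# BOS/EOS are prepended/appended first when cls_is_bos/sep_is_eos are set.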
def build_tokenizer(self, tokenizer: TransformerSequenceTokenizer):
"""Build a transformer tokenizer for this task.
Args:
tokenizer: The shared tokenizer, which will be copied and adjusted if this task requires different settings.
Returns:
A TransformerSequenceTokenizer.
"""
if tokenizer.cls_is_bos != self.cls_is_bos or tokenizer.sep_is_eos != self.sep_is_eos:
tokenizer = copy(tokenizer)
tokenizer.cls_is_bos = self.cls_is_bos
tokenizer.sep_is_eos = self.sep_is_eos
return tokenizer
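# Design note: the shared tokenizer is copied only when this task's BOS/EOS settings
# differ from the shared ones, so tasks with identical settings keep sharing a single
# tokenizer instance.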
# noinspection PyMethodMayBeStatic
def finalize_document(self, doc: Document, task_name: str):
pass
================================================
FILE: hanlp/components/mtl/tasks/amr.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-12 16:05
import logging
from typing import Dict, Any, List, Union, Iterable, Callable
import torch
from stog.data.dataset_readers.amr_parsing.amr import AMRGraph
from stog.data.dataset_readers.amr_parsing.node_utils import NodeUtilities
from stog.data.dataset_readers.amr_parsing.postprocess.node_restore import NodeRestore
from torch.utils.data import DataLoader
from hanlp_common.constant import CLS
from hanlp.common.dataset import PrefetchDataLoader, SamplerBuilder
from hanlp.common.transform import VocabDict
from hanlp.components.amr.amr_parser.graph_amr_decoder import GraphAbstractMeaningRepresentationDecoder
from hanlp.components.amr.amr_parser.graph_parser import GraphAbstractMeaningRepresentationParser
from hanlp.components.amr.amr_parser.postprocess import PostProcessor
from hanlp.components.amr.amr_parser.work import parse_batch
from hanlp.components.mtl.tasks import Task
from hanlp.datasets.parsing.amr import batchify, get_concepts
from hanlp.layers.scalar_mix import ScalarMixWithDropoutBuilder
from hanlp.metrics.amr.smatch_eval import SmatchScores, get_amr_utils
from hanlp.metrics.f1 import F1_
from hanlp.metrics.metric import Metric
from hanlp.metrics.mtl import MetricDict
from hanlp.utils.io_util import get_resource
from hanlp_common.util import merge_list_of_dict, merge_locals_kwargs
class GraphAbstractMeaningRepresentationParsing(Task, GraphAbstractMeaningRepresentationParser):
def __init__(self,
trn: str = None,
dev: str = None,
tst: str = None,
sampler_builder: SamplerBuilder = None,
dependencies: str = None,
scalar_mix: ScalarMixWithDropoutBuilder = None,
use_raw_hidden_states=False,
lr=1e-3,
separate_optimizer=False,
cls_is_bos=True,
sep_is_eos=False,
char2concept_dim=128,
cnn_filters=((3, 256),),
concept_char_dim=32,
concept_dim=300,
dropout=0.2,
embed_dim=512,
eval_every=20,
ff_embed_dim=1024,
graph_layers=2,
inference_layers=4,
num_heads=8,
rel_dim=100,
snt_layers=4,
unk_rate=0.33,
vocab_min_freq=5,
beam_size=8,
alpha=0.6,
max_time_step=100,
amr_version='2.0',
**kwargs) -> None:
super().__init__(**merge_locals_kwargs(locals(), kwargs))
self.vocabs = VocabDict()
utils_dir = get_resource(get_amr_utils(amr_version))
self.sense_restore = NodeRestore(NodeUtilities.from_json(utils_dir))
def build_dataloader(self,
data,
transform: Callable = None,
training=False,
device=None,
logger: logging.Logger = None,
cache=False,
gradient_accumulation=1,
**kwargs) -> DataLoader:
if isinstance(data, list):
data = GraphAbstractMeaningRepresentationParser.build_samples(self, data)
dataset, lens = GraphAbstractMeaningRepresentationParser.build_dataset(self, data, logger=logger,
transform=transform, training=training)
if self.vocabs.mutable:
GraphAbstractMeaningRepresentationParser.build_vocabs(self, dataset, logger)
dataloader = PrefetchDataLoader(
DataLoader(batch_sampler=self.sampler_builder.build(lens, shuffle=training,
gradient_accumulation=gradient_accumulation),
dataset=dataset,
collate_fn=merge_list_of_dict,
num_workers=0), batchify=self.build_batchify(device, training),
prefetch=None)
return dataloader
def compute_loss(self,
batch: Dict[str, Any],
output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
criterion) -> Union[torch.FloatTensor, Dict[str, torch.FloatTensor]]:
concept_loss, arc_loss, rel_loss, graph_arc_loss = output
concept_loss, concept_correct, concept_total = concept_loss
rel_loss, rel_correct, rel_total = rel_loss
loss = concept_loss + arc_loss + rel_loss
return loss
def decode_output(self,
output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
mask: torch.BoolTensor,
batch: Dict[str, Any],
decoder: torch.nn.Module, **kwargs) -> Union[Dict[str, Any], Any]:
return output
def update_metrics(self,
batch: Dict[str, Any],
output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
prediction: Dict[str, Any],
metric: Union[MetricDict, Metric]):
pass
def build_model(self, encoder_size, training=True, **kwargs) -> torch.nn.Module:
return GraphAbstractMeaningRepresentationDecoder(vocabs=self.vocabs, encoder_size=encoder_size, **self.config)
def build_metric(self, **kwargs):
return SmatchScores({'Smatch': F1_(0, 0, 0)})
def input_is_flat(self, data) -> bool:
return GraphAbstractMeaningRepresentationParser.input_is_flat(self, data)
def prediction_to_result(self, prediction: Dict[str, Any], batch: Dict[str, Any]) -> List:
pp = PostProcessor(self.vocabs['rel'])
for concept, relation, score in zip(prediction['concept'], prediction['relation'], prediction['score']):
amr = pp.to_amr(concept, relation)
amr_graph = AMRGraph(amr)
self.sense_restore.restore_graph(amr_graph)
yield amr_graph
def evaluate_dataloader(self,
data: DataLoader,
criterion: Callable,
metric=None,
output=False,
input=None,
decoder=None,
h=None,
split=None,
**kwargs):
# noinspection PyTypeChecker
GraphAbstractMeaningRepresentationParser.evaluate_dataloader(self, data, logger=None, metric=metric,
input=input, model=decoder, h=lambda x: h(x)[0],
use_fast=True)
def feed_batch(self,
h: torch.FloatTensor,
batch: Dict[str, torch.Tensor],
mask: torch.BoolTensor,
decoder: torch.nn.Module):
if decoder.training:
return super().feed_batch(h, batch, mask, decoder)
beam_size = self.config.get('beam_size', 8)
alpha = self.config.get('alpha', 0.6)
max_time_step = self.config.get('max_time_step', 100)
res = parse_batch(decoder, batch, beam_size, alpha, max_time_step, h=h)
return res
def transform_batch(self, batch: Dict[str, Any], results: Dict[str, Any] = None, cls_is_bos=False,
sep_is_eos=False) -> Dict[str, Any]:
batch = super().transform_batch(batch, results, cls_is_bos, sep_is_eos)
batch['lemma'] = [[CLS] + x for x in results['lem']]
copy_seq = merge_list_of_dict(
[get_concepts({'token': t[1:], 'lemma': l[1:]}, self.vocabs.predictable_concept) for t, l in
zip(batch['token'], batch['lemma'])])
copy_seq.pop('token')
copy_seq.pop('lemma')
batch.update(copy_seq)
ret = batchify(batch, self.vocabs, device=batch['token_input_ids'].device)
return ret
================================================
FILE: hanlp/components/mtl/tasks/constituency.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-11-29 16:52
import logging
from typing import Dict, Any, List, Union, Iterable, Callable
import torch
from phrasetree.tree import Tree
from hanlp_common.constant import BOS, EOS
from hanlp_common.document import Document
from hanlp.components.parsers.biaffine.biaffine_dep import BiaffineDependencyParser
from torch.utils.data import DataLoader
from hanlp.common.dataset import SamplerBuilder, PadSequenceDataLoader
from hanlp.common.transform import VocabDict
from hanlp.components.mtl.tasks import Task
from hanlp.components.parsers.constituency.crf_constituency_model import CRFConstituencyDecoder
from hanlp.components.parsers.constituency.crf_constituency_parser import CRFConstituencyParser
from hanlp.layers.scalar_mix import ScalarMixWithDropoutBuilder
from hanlp.metrics.metric import Metric
from hanlp.metrics.mtl import MetricDict
from hanlp.utils.time_util import CountdownTimer
from hanlp_common.util import merge_locals_kwargs, prefix_match
class CRFConstituencyParsing(Task, CRFConstituencyParser):
def __init__(self,
trn: str = None,
dev: str = None,
tst: str = None,
sampler_builder: SamplerBuilder = None,
dependencies: str = None,
scalar_mix: ScalarMixWithDropoutBuilder = None,
use_raw_hidden_states=False,
lr=None,
separate_optimizer=False,
cls_is_bos=True,
sep_is_eos=True,
delete=('', ':', '``', "''", '.', '?', '!', '-NONE-', 'TOP', ',', 'S1'),
equal=(('ADVP', 'PRT'),),
mbr=True,
n_mlp_span=500,
n_mlp_label=100,
mlp_dropout=.33,
no_subcategory=True,
**kwargs
) -> None:
r"""Two-stage CRF Parsing (:cite:`ijcai2020-560`).
Args:
trn: Path to training set.
dev: Path to dev set.
tst: Path to test set.
sampler_builder: A builder which builds a sampler.
dependencies: Its dependencies on other tasks.
scalar_mix: A builder which builds a `ScalarMixWithDropout` object.
use_raw_hidden_states: Whether to use raw hidden states from transformer without any pooling.
lr: Learning rate for this task.
separate_optimizer: Use customized separate optimizer for this task.
cls_is_bos: ``True`` to treat the first token as ``BOS``.
sep_is_eos: ``True`` to treat the last token as ``EOS``.
delete: Constituencies to be deleted from training and evaluation.
equal: Constituencies that are regarded as equal during evaluation.
mbr: ``True`` to enable Minimum Bayes Risk (MBR) decoding (:cite:`smith-smith-2007-probabilistic`).
n_mlp_span: Number of features for span decoder.
n_mlp_label: Number of features for label decoder.
mlp_dropout: Dropout applied to MLPs.
no_subcategory: Strip out subcategories.
**kwargs: Not used.
"""
if isinstance(equal, tuple):
equal = dict(equal)
super().__init__(**merge_locals_kwargs(locals(), kwargs))
self.vocabs = VocabDict()
# noinspection DuplicatedCode
def build_dataloader(self,
data,
transform: Callable = None,
training=False,
device=None,
logger: logging.Logger = None,
cache=False,
gradient_accumulation=1,
**kwargs) -> DataLoader:
dataset = CRFConstituencyParsing.build_dataset(self, data, transform)
dataset.purge_cache()
if self.vocabs.mutable:
CRFConstituencyParsing.build_vocabs(self, dataset, logger)
if isinstance(data, str):
timer = CountdownTimer(len(dataset))
# noinspection PyCallByClass
BiaffineDependencyParser.cache_dataset(self, dataset, timer, training, logger)
return PadSequenceDataLoader(
batch_sampler=self.sampler_builder.build(self.compute_lens(data, dataset), shuffle=training,
gradient_accumulation=gradient_accumulation),
device=device,
dataset=dataset)
def feed_batch(self,
h: torch.FloatTensor,
batch: Dict[str, torch.Tensor],
mask: torch.BoolTensor,
decoder: torch.nn.Module):
return {
'output': decoder(h),
'mask': CRFConstituencyParser.compute_mask(
self, batch, offset=1 if 'constituency' in batch or batch['token'][0][-1] == EOS else -1)
}
def compute_loss(self,
batch: Dict[str, Any],
output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
criterion) -> Union[torch.FloatTensor, Dict[str, torch.FloatTensor]]:
out, mask = output['output'], output['mask']
loss, span_probs = CRFConstituencyParser.compute_loss(self, out, batch['chart_id'], mask, crf_decoder=criterion)
output['span_probs'] = span_probs
return loss
def decode_output(self,
output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
mask: torch.BoolTensor,
batch: Dict[str, Any],
decoder: torch.nn.Module, **kwargs) -> Union[Dict[str, Any], Any]:
out, mask = output['output'], output['mask']
tokens = []
for sent in batch['token']:
if sent[0] == BOS:
sent = sent[1:]
if sent[-1] == EOS:
sent = sent[:-1]
tokens.append(sent)
return CRFConstituencyParser.decode_output(self, out, mask, batch, output.get('span_probs', None),
decoder=decoder, tokens=tokens)
def update_metrics(self,
batch: Dict[str, Any],
output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
prediction: Dict[str, Any], metric: Union[MetricDict, Metric]):
return CRFConstituencyParser.update_metrics(self, metric, batch, prediction)
def build_model(self, encoder_size, training=True, **kwargs) -> torch.nn.Module:
return CRFConstituencyDecoder(n_labels=len(self.vocabs.chart), n_hidden=encoder_size)
def build_metric(self, **kwargs):
return CRFConstituencyParser.build_metric(self)
def input_is_flat(self, data) -> bool:
return CRFConstituencyParser.input_is_flat(self, data)
def prediction_to_result(self, prediction: List, batch: Dict[str, Any]) -> List:
return prediction
def finalize_document(self, doc: Document, task_name: str):
pos_key = prefix_match('pos', doc)
pos: List[List[str]] = doc.get(pos_key, None)
if pos:
for tree, pos_per_sent in zip(doc[task_name], pos):
tree: Tree = tree
offset = 0
for subtree in tree.subtrees(lambda t: t.height() == 2):
tag = subtree.label()
if tag == '_':
subtree.set_label(pos_per_sent[offset])
offset += 1
def build_samples(self, inputs, cls_is_bos=False, sep_is_eos=False):
return CRFConstituencyParser.build_samples(self, inputs)
================================================
FILE: hanlp/components/mtl/tasks/dep.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-08-13 21:39
import logging
from typing import Dict, Any, Union, Iterable, List
import torch
from torch.optim import Adam
from torch.optim.lr_scheduler import ExponentialLR
from torch.utils.data import DataLoader
from hanlp.common.dataset import SamplerBuilder, PadSequenceDataLoader
from hanlp.common.transform import VocabDict, TransformList
from hanlp.components.mtl.tasks import Task
from hanlp.components.parsers.biaffine.biaffine_dep import BiaffineDependencyParser
from hanlp.components.parsers.biaffine.biaffine_model import BiaffineDecoder
from hanlp.datasets.parsing.loaders.conll_dataset import append_bos
from hanlp.layers.scalar_mix import ScalarMixWithDropoutBuilder
from hanlp.metrics.metric import Metric
from hanlp.metrics.mtl import MetricDict
from hanlp.utils.time_util import CountdownTimer
from hanlp_common.constant import EOS
from hanlp_common.util import merge_locals_kwargs
class BiaffineDependencyParsing(Task, BiaffineDependencyParser):
def __init__(self,
trn: str = None,
dev: str = None,
tst: str = None,
sampler_builder: SamplerBuilder = None,
dependencies: str = None,
scalar_mix: ScalarMixWithDropoutBuilder = None,
use_raw_hidden_states=False,
lr=2e-3, separate_optimizer=False,
cls_is_bos=True,
sep_is_eos=False,
punct=False,
tree=False,
proj=False,
n_mlp_arc=500,
n_mlp_rel=100,
mlp_dropout=.33,
mu=.9,
nu=.9,
epsilon=1e-12,
decay=.75,
decay_steps=5000,
use_pos=False,
max_seq_len=None,
**kwargs) -> None:
"""Biaffine dependency parsing (:cite:`dozat:17a`).
Args:
trn: Path to training set.
dev: Path to dev set.
tst: Path to test set.
sampler_builder: A builder which builds a sampler.
dependencies: Its dependencies on other tasks.
scalar_mix: A builder which builds a `ScalarMixWithDropout` object.
use_raw_hidden_states: Whether to use raw hidden states from transformer without any pooling.
lr: Learning rate for this task.
separate_optimizer: Use customized separate optimizer for this task.
cls_is_bos: ``True`` to treat the first token as ``BOS``.
sep_is_eos: ``True`` to treat the last token as ``EOS``.
punct: ``True`` to include punctuations in evaluation.
tree: ``True`` to enforce tree constraint.
proj: ``True`` for projective parsing.
n_mlp_arc: Number of features for arc representation.
n_mlp_rel: Number of features for rel representation.
mlp_dropout: Dropout applied to MLPs.
mu: First coefficient used for computing running averages of gradient and its square in Adam.
nu: Second coefficient used for computing running averages of gradient and its square in Adam.
epsilon: Term added to the denominator to improve numerical stability.
decay: Decay rate for the exponential lr scheduler.
decay_steps: Decay every ``decay_steps`` steps.
use_pos: Use pos feature.
max_seq_len: Prune samples longer than this length.
**kwargs: Not used.
"""
super().__init__(**merge_locals_kwargs(locals(), kwargs))
self.vocabs = VocabDict()
def update_metrics(self, batch: Dict[str, Any],
output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
prediction: Dict[str, Any], metric: Union[MetricDict, Metric]):
BiaffineDependencyParser.update_metric(self, *prediction, batch['arc'], batch['rel_id'], output[1],
batch.get('punct_mask', None), metric, batch)
def decode_output(self,
output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
mask: torch.BoolTensor,
batch: Dict[str, Any],
decoder, **kwargs) -> Union[Dict[str, Any], Any]:
(arc_scores, rel_scores), mask = output
return BiaffineDependencyParser.decode(self, arc_scores, rel_scores, mask, batch)
def compute_loss(self, batch: Dict[str, Any],
output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any], criterion) -> \
Union[torch.FloatTensor, Dict[str, torch.FloatTensor]]:
(arc_scores, rel_scores), mask = output
return BiaffineDependencyParser.compute_loss(self, arc_scores, rel_scores, batch['arc'], batch['rel_id'], mask,
criterion,
batch)
def build_model(self, encoder_size, training=True, **kwargs) -> torch.nn.Module:
return BiaffineDecoder(encoder_size, self.config.n_mlp_arc, self.config.n_mlp_rel, self.config.mlp_dropout,
len(self.vocabs.rel))
def build_metric(self, **kwargs):
return BiaffineDependencyParser.build_metric(self, **kwargs)
def build_dataloader(self, data, transform: TransformList = None, training=False, device=None,
logger: logging.Logger = None, gradient_accumulation=1, **kwargs) -> DataLoader:
transform.insert(0, append_bos)
dataset = BiaffineDependencyParser.build_dataset(self, data, transform)
dataset.purge_cache()
if self.vocabs.mutable:
BiaffineDependencyParser.build_vocabs(self, dataset, logger, transformer=True)
if isinstance(data, str):
timer = CountdownTimer(len(dataset))
BiaffineDependencyParser.cache_dataset(self, dataset, timer, training, logger)
max_seq_len = self.config.get('max_seq_len', None)
if max_seq_len and isinstance(data, str):
dataset.prune(lambda x: len(x['token_input_ids']) > max_seq_len, logger)
return PadSequenceDataLoader(
batch_sampler=self.sampler_builder.build(self.compute_lens(data, dataset),
shuffle=training, gradient_accumulation=gradient_accumulation),
device=device,
dataset=dataset,
pad=self.get_pad_dict())
def feed_batch(self, h: torch.FloatTensor, batch: Dict[str, torch.Tensor], mask: torch.BoolTensor,
decoder: torch.nn.Module):
logits = super().feed_batch(h, batch, mask, decoder)
mask = mask.clone()
mask[:, 0] = 0
return logits, mask
def build_optimizer(self, decoder: torch.nn.Module, **kwargs):
config = self.config
optimizer = Adam(decoder.parameters(),
config.lr,
(config.mu, config.nu),
config.epsilon)
scheduler = ExponentialLR(optimizer, config.decay ** (1 / config.decay_steps))
return optimizer, scheduler
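# Scheduler arithmetic (worked example): gamma = decay ** (1 / decay_steps), so after
# decay_steps scheduler steps the lr is multiplied by gamma ** decay_steps == decay.
# With the defaults decay=.75 and decay_steps=5000, the lr shrinks to 75% every 5000 steps.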
def input_is_flat(self, data) -> bool:
return BiaffineDependencyParser.input_is_flat(self, data, self.config.use_pos)
def prediction_to_result(self, prediction: Dict[str, Any], batch: Dict[str, Any]) -> List:
arcs, rels = prediction
arcs = arcs[:, 1:] # Skip the ROOT
rels = rels[:, 1:]
arcs = arcs.tolist()
rels = rels.tolist()
vocab = self.vocabs['rel'].idx_to_token
for arcs_per_sent, rels_per_sent, tokens in zip(arcs, rels, batch['token']):
tokens = tokens[1:]
sent_len = len(tokens)
result = list(zip(arcs_per_sent[:sent_len], [vocab[r] for r in rels_per_sent[:sent_len]]))
yield result
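# Illustrative output: each sentence yields [(head, rel), ...] aligned with its tokens,
# e.g. [(2, 'nsubj'), (0, 'root')] for a two-token sentence, where heads are 1-based
# and 0 denotes the virtual ROOT.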
def build_samples(self, inputs, cls_is_bos=False, sep_is_eos=False):
return [{'FORM': token + ([EOS] if sep_is_eos else [])} for token in inputs]
================================================
FILE: hanlp/components/mtl/tasks/dep_2nd.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-08-07 14:14
import logging
from typing import Dict, Any, Union, Iterable, Callable, List
import torch
from hanlp_common.util import merge_locals_kwargs
from torch.utils.data import DataLoader
import hanlp.utils.torch_util
from hanlp.common.dataset import SamplerBuilder, PadSequenceDataLoader
from hanlp.common.transform import VocabDict
from hanlp.components.mtl.tasks import Task
from hanlp.components.parsers.biaffine.biaffine_2nd_dep import BiaffineSecondaryParser, BiaffineJointDecoder, \
BiaffineSeparateDecoder
from hanlp.layers.scalar_mix import ScalarMixWithDropoutBuilder
from hanlp.metrics.metric import Metric
from hanlp.metrics.mtl import MetricDict
class BiaffineSecondaryDependencyDecoder(torch.nn.Module):
def __init__(self, hidden_size, config) -> None:
super().__init__()
self.decoder = BiaffineJointDecoder(hidden_size, config) if config.joint \
else BiaffineSeparateDecoder(hidden_size, config)
def forward(self, contextualized_embeddings: torch.FloatTensor, batch: Dict[str, torch.Tensor], mask=None):
if mask is None:
mask = hanlp.utils.torch_util.lengths_to_mask(batch['token_length'])
else:
mask = mask.clone()
scores = self.decoder(contextualized_embeddings, mask)
mask[:, 0] = 0
return scores, mask
class BiaffineSecondaryDependencyParsing(Task, BiaffineSecondaryParser):
def __init__(self, trn: str = None, dev: str = None, tst: str = None, sampler_builder: SamplerBuilder = None,
dependencies: str = None, scalar_mix: ScalarMixWithDropoutBuilder = None, use_raw_hidden_states=False,
lr=2e-3, separate_optimizer=False,
punct=False,
tree=False,
apply_constraint=True,
n_mlp_arc=500,
n_mlp_rel=100,
mlp_dropout=.33,
pad_rel=None,
joint=True,
mu=.9,
nu=.9,
epsilon=1e-12,
cls_is_bos=True,
**kwargs) -> None:
super().__init__(**merge_locals_kwargs(locals(), kwargs))
self.vocabs = VocabDict()
def build_dataloader(self, data, transform: Callable = None, training=False, device=None,
logger: logging.Logger = None, gradient_accumulation=1, **kwargs) -> DataLoader:
dataset = BiaffineSecondaryParser.build_dataset(self, data, transform)
dataset.purge_cache()
if self.vocabs.mutable:
BiaffineSecondaryParser.build_vocabs(self, dataset, logger, transformer=True)
return PadSequenceDataLoader(
batch_sampler=self.sampler_builder.build(self.compute_lens(data, dataset), shuffle=training,
gradient_accumulation=gradient_accumulation),
device=device,
dataset=dataset,
pad={'arc': 0, 'arc_2nd': False})
def update_metrics(self, batch: Dict[str, Any],
output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
prediction: Dict[str, Any], metric: Union[MetricDict, Metric]):
BiaffineSecondaryParser.update_metric(self, *prediction, batch['arc'], batch['rel_id'], output[1],
batch['punct_mask'], metric, batch)
def decode_output(self, output: Dict[str, Any], batch: Dict[str, Any], decoder, **kwargs) \
-> Union[Dict[str, Any], Any]:
return BiaffineSecondaryParser.decode(self, *output[0], output[1], batch)
def compute_loss(self, batch: Dict[str, Any],
output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any], criterion) -> \
Union[torch.FloatTensor, Dict[str, torch.FloatTensor]]:
return BiaffineSecondaryParser.compute_loss(self, *output[0], batch['arc'], batch['rel_id'], output[1],
criterion, batch)
def build_model(self, encoder_size, training=True, **kwargs) -> torch.nn.Module:
return BiaffineSecondaryDependencyDecoder(encoder_size, self.config)
def build_metric(self, **kwargs):
return BiaffineSecondaryParser.build_metric(self, **kwargs)
def build_criterion(self, **kwargs):
return BiaffineSecondaryParser.build_criterion(self, **kwargs)
def build_optimizer(self, decoder: torch.nn.Module, **kwargs):
config = self.config
optimizer = torch.optim.Adam(decoder.parameters(),
config.lr,
(config.mu, config.nu),
config.epsilon)
return optimizer
def input_is_flat(self, data) -> bool:
return BiaffineSecondaryParser.input_is_flat(self, data)
def prediction_to_result(self, prediction: Dict[str, Any], batch: Dict[str, Any]) -> List:
outputs = []
return BiaffineSecondaryParser.predictions_to_human(self, prediction, outputs, batch['token'], use_pos=False)
================================================
FILE: hanlp/components/mtl/tasks/lem.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-09 16:37
import logging
from typing import Dict, Any, Union, Iterable, Callable, List
import torch
from hanlp.common.dataset import SamplerBuilder, PadSequenceDataLoader
from hanlp.common.transform import VocabDict
from hanlp.components.lemmatizer import TransformerLemmatizer
from hanlp.components.mtl.tasks import Task
from hanlp.layers.scalar_mix import ScalarMixWithDropoutBuilder
from hanlp.metrics.metric import Metric
from hanlp.metrics.mtl import MetricDict
from hanlp_common.util import merge_locals_kwargs
from torch.utils.data import DataLoader
class LinearDecoder(torch.nn.Module):
def __init__(self,
hidden_size,
num_labels) -> None:
super().__init__()
self.classifier = torch.nn.Linear(hidden_size, num_labels)
def forward(self, contextualized_embeddings: torch.FloatTensor, batch: Dict[str, torch.Tensor], mask=None):
return self.classifier(contextualized_embeddings)
class TransformerLemmatization(Task, TransformerLemmatizer):
def __init__(self,
trn: str = None,
dev: str = None,
tst: str = None,
sampler_builder: SamplerBuilder = None,
dependencies: str = None,
scalar_mix: ScalarMixWithDropoutBuilder = None,
use_raw_hidden_states=False,
lr=1e-3,
separate_optimizer=False,
cls_is_bos=False,
sep_is_eos=False,
max_seq_len=None,
sent_delimiter=None,
char_level=False,
hard_constraint=False,
token_key='token', **kwargs) -> None:
""" Transition based lemmatization (:cite:`kondratyuk-straka-2019-75`).
Args:
trn: Path to training set.
dev: Path to dev set.
tst: Path to test set.
sampler_builder: A builder which builds a sampler.
dependencies: Its dependencies on other tasks.
scalar_mix: A builder which builds a `ScalarMixWithDropout` object.
use_raw_hidden_states: Whether to use raw hidden states from transformer without any pooling.
lr: Learning rate for this task.
separate_optimizer: Use customized separate optimizer for this task.
cls_is_bos: ``True`` to treat the first token as ``BOS``.
sep_is_eos: ``True`` to treat the last token as ``EOS``.
max_seq_len: Sentences longer than ``max_seq_len`` will be split into shorter ones if possible.
sent_delimiter: Delimiter between sentences, like period or comma, which indicates a long sentence can
be split here.
char_level: Whether the sequence length is measured at char level, which is never the case for
lemmatization.
hard_constraint: Whether to enforce hard length constraint on sentences. If there is no ``sent_delimiter``
in a sentence, it will be split at a token anyway.
token_key: The key to tokens in dataset. This should always be set to ``token`` in MTL.
**kwargs: Not used.
"""
super().__init__(**merge_locals_kwargs(locals(), kwargs))
self.vocabs = VocabDict()
def build_dataloader(self,
data: List[List[str]],
transform: Callable = None,
training=False,
device=None,
logger: logging.Logger = None,
cache=False,
gradient_accumulation=1,
**kwargs) -> DataLoader:
args = dict((k, self.config[k]) for k in
['delimiter', 'max_seq_len', 'sent_delimiter', 'char_level', 'hard_constraint'] if k in self.config)
dataset = self.build_dataset(data, cache=True, transform=transform, **args)
dataset.append_transform(self.vocabs)
if self.vocabs.mutable:
self.build_vocabs(dataset, logger)
return PadSequenceDataLoader(
batch_sampler=self.sampler_builder.build(self.compute_lens(data, dataset),
shuffle=training, gradient_accumulation=gradient_accumulation),
device=device,
dataset=dataset)
def compute_loss(self,
batch: Dict[str, Any],
output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
criterion) -> Union[torch.FloatTensor, Dict[str, torch.FloatTensor]]:
return TransformerLemmatizer.compute_loss(self, criterion, output, batch['tag_id'], batch['mask'])
def decode_output(self,
output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
mask: torch.BoolTensor,
batch: Dict[str, Any],
decoder,
**kwargs) -> Union[Dict[str, Any], Any]:
return TransformerLemmatizer.decode_output(self, output, mask, batch, decoder)
def update_metrics(self,
batch: Dict[str, Any],
output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
prediction: Dict[str, Any],
metric: Union[MetricDict, Metric]):
return TransformerLemmatizer.update_metrics(self, metric, output, batch['tag_id'], batch['mask'])
def build_model(self, encoder_size, training=True, **kwargs) -> torch.nn.Module:
return LinearDecoder(encoder_size, len(self.vocabs['tag']))
def build_metric(self, **kwargs):
return TransformerLemmatizer.build_metric(self, **kwargs)
def input_is_flat(self, data) -> bool:
return TransformerLemmatizer.input_is_flat(self, data)
def prediction_to_result(self, prediction: Dict[str, Any], batch: Dict[str, Any]) -> Union[List, Dict]:
return TransformerLemmatizer.prediction_to_human(self, prediction, self.vocabs['tag'].idx_to_token, batch,
token=batch['token'])
================================================
FILE: hanlp/components/mtl/tasks/ner/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-03 14:34
================================================
FILE: hanlp/components/mtl/tasks/ner/biaffine_ner.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-08-05 01:49
import logging
from copy import copy
from typing import Dict, Any, Union, Iterable, List
import torch
from torch.utils.data import DataLoader
from hanlp.common.dataset import SamplerBuilder, PadSequenceDataLoader
from hanlp.common.transform import VocabDict, TransformList
from hanlp.components.mtl.tasks import Task
from hanlp.components.ner.biaffine_ner.biaffine_ner import BiaffineNamedEntityRecognizer
from hanlp.components.ner.biaffine_ner.biaffine_ner_model import BiaffineNamedEntityRecognitionDecoder
from hanlp.datasets.ner.loaders.json_ner import unpack_ner
from hanlp.layers.scalar_mix import ScalarMixWithDropoutBuilder
from hanlp.metrics.metric import Metric
from hanlp.metrics.mtl import MetricDict
from hanlp_common.util import merge_locals_kwargs
class BiaffineNamedEntityRecognition(Task, BiaffineNamedEntityRecognizer):
def __init__(self, trn: str = None, dev: str = None, tst: str = None, sampler_builder: SamplerBuilder = None,
dependencies: str = None, scalar_mix: ScalarMixWithDropoutBuilder = None, use_raw_hidden_states=False,
lr=None, separate_optimizer=False,
doc_level_offset=True, is_flat_ner=True, tagset=None, ret_tokens=' ',
ffnn_size=150, loss_reduction='mean', **kwargs) -> None:
"""An implementation of Named Entity Recognition as Dependency Parsing (:cite:`yu-etal-2020-named`). It treats
every possible span as a candidate of entity and predicts its entity label. Non-entity spans are assigned NULL
label to be excluded. The label prediction is done with a biaffine layer (:cite:`dozat:17a`). As it makes no
assumption about the spans, it naturally supports flat NER and nested NER.
Args:
trn: Path to training set.
dev: Path to dev set.
tst: Path to test set.
sampler_builder: A builder which builds a sampler.
dependencies: Its dependencies on other tasks.
scalar_mix: A builder which builds a `ScalarMixWithDropout` object.
use_raw_hidden_states: Whether to use raw hidden states from transformer without any pooling.
lr: Learning rate for this task.
separate_optimizer: Use customized separate optimizer for this task.
doc_level_offset: ``True`` to indicate the offsets in ``jsonlines`` are of document level.
is_flat_ner: ``True`` for flat NER, otherwise nested NER.
tagset: Optional tagset to prune entities outside of this tagset from datasets.
ret_tokens: A delimiter between tokens in entities so that the surface form of an entity can be rebuilt.
ffnn_size: Feedforward size for MLPs extracting the head/tail representations.
loss_reduction: The loss reduction used in aggregating losses.
**kwargs: Not used.
"""
super().__init__(**merge_locals_kwargs(locals(), kwargs))
self.vocabs = VocabDict()
def update_metrics(self, batch: Dict[str, Any],
output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
prediction: Dict[str, Any], metric: Union[MetricDict, Metric]):
BiaffineNamedEntityRecognizer.update_metrics(self, batch, prediction, metric)
def decode_output(self,
output: Dict[str, Any],
mask: torch.BoolTensor,
batch: Dict[str, Any],
decoder,
**kwargs) -> Union[Dict[str, Any], Any]:
return self.get_pred_ner(batch['token'], output['candidate_ner_scores'])
def compute_loss(self, batch: Dict[str, Any],
output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any], criterion) -> \
Union[torch.FloatTensor, Dict[str, torch.FloatTensor]]:
return output['loss']
def build_dataloader(self, data,
transform: TransformList = None,
training=False,
device=None,
logger: logging.Logger = None,
gradient_accumulation=1,
**kwargs) -> DataLoader:
transform = copy(transform)
transform.append(unpack_ner)
dataset = BiaffineNamedEntityRecognizer.build_dataset(self, data, self.vocabs, transform)
dataset.purge_cache()
if self.vocabs.mutable:
BiaffineNamedEntityRecognizer.build_vocabs(self, dataset, logger, self.vocabs)
return PadSequenceDataLoader(
batch_sampler=self.sampler_builder.build(self.compute_lens(data, dataset), shuffle=training,
gradient_accumulation=gradient_accumulation),
device=device,
dataset=dataset)
def build_model(self, encoder_size, training=True, **kwargs) -> torch.nn.Module:
return BiaffineNamedEntityRecognitionDecoder(encoder_size, self.config.ffnn_size, len(self.vocabs.label),
self.config.loss_reduction)
def build_metric(self, **kwargs):
return BiaffineNamedEntityRecognizer.build_metric(self, **kwargs)
def input_is_flat(self, data) -> bool:
return BiaffineNamedEntityRecognizer.input_is_flat(data)
def prediction_to_result(self, prediction: Dict[str, Any], batch: Dict[str, Any]) -> List:
results = []
BiaffineNamedEntityRecognizer.prediction_to_result(batch['token'], prediction, results,
ret_tokens=self.config.get('ret_tokens', ' '))
return results
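# A minimal, self-contained sketch (not HanLP's actual decoder) of the idea described
# in the docstring above: score every span, drop NULL, then greedily keep the
# best-scoring spans while rejecting clashes. For flat NER any overlap clashes;
# for nested NER only partial overlaps do. All names and scores are illustrative.
def _greedy_span_decode(span_scores, is_flat_ner=True):
    # span_scores: {(begin, end): (label, score)} with NULL spans already removed
    kept = []
    for (b, e), (label, score) in sorted(span_scores.items(), key=lambda x: -x[1][1]):
        def _clashes(ob, oe):
            if is_flat_ner:
                return b <= oe and ob <= e  # any overlap clashes
            # nested NER allows containment but forbids partial overlap
            return (b < ob <= e < oe) or (ob < b <= oe < e)
        if not any(_clashes(ob, oe) for (ob, oe), _ in kept):
            kept.append(((b, e), (label, score)))
    return kept
assert _greedy_span_decode({(0, 2): ('GPE', .9), (1, 2): ('GPE', .4)}) == [((0, 2), ('GPE', .9))]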
================================================
FILE: hanlp/components/mtl/tasks/ner/tag_ner.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-03 14:35
import logging
from typing import Union, List, Dict, Any, Iterable, Callable, Set, Sequence
import torch
from hanlp_trie import DictInterface
from torch.utils.data import DataLoader
from hanlp.common.dataset import SamplerBuilder, PadSequenceDataLoader
from hanlp.common.transform import VocabDict
from hanlp.components.mtl.tasks import Task
from hanlp.components.ner.transformer_ner import TransformerNamedEntityRecognizer
from hanlp.layers.crf.crf import CRF
from hanlp.layers.scalar_mix import ScalarMixWithDropoutBuilder
from hanlp.metrics.metric import Metric
from hanlp.metrics.mtl import MetricDict
from hanlp_common.util import merge_locals_kwargs
class LinearCRFDecoder(torch.nn.Module):
def __init__(self,
hidden_size,
num_labels,
secondary_encoder=None,
crf=False) -> None:
super().__init__()
self.secondary_encoder = secondary_encoder
self.classifier = torch.nn.Linear(hidden_size, num_labels)
self.crf = CRF(num_labels) if crf else None
def forward(self, contextualized_embeddings: torch.FloatTensor, batch: Dict[str, torch.Tensor], mask=None):
if self.secondary_encoder:
contextualized_embeddings = self.secondary_encoder(contextualized_embeddings, mask=mask)
return self.classifier(contextualized_embeddings)
class TaggingNamedEntityRecognition(Task, TransformerNamedEntityRecognizer):
def __init__(self,
trn: str = None,
dev: str = None,
tst: str = None,
sampler_builder: SamplerBuilder = None,
dependencies: str = None,
scalar_mix: ScalarMixWithDropoutBuilder = None,
use_raw_hidden_states=False,
lr=1e-3,
separate_optimizer=False,
max_seq_len=None,
sent_delimiter=None,
char_level=False,
hard_constraint=False,
tagging_scheme=None,
crf=False,
delimiter_in_entity=None,
merge_types: List[str] = None,
secondary_encoder=None,
token_key='token',
dict_whitelist: Union[DictInterface, Union[Dict[str, Any], Set[str]]] = None,
dict_blacklist: Union[DictInterface, Union[Dict[str, Any], Set[str]]] = None,
dict_tags: Union[
DictInterface, Union[Dict[Union[str, Sequence[str]], Union[str, Sequence[str]]]]] = None,
**kwargs) -> None:
r"""A simple tagger using a linear layer with an optional CRF (:cite:`lafferty2001conditional`) layer for
NER task. It can utilize whitelist gazetteers which is dict mapping from entity name to entity type.
During decoding, it performs longest-prefix-matching of these words to override the prediction from
underlying statistical model. It also uses a blacklist to mask out mis-predicted entities.
.. Note:: For algorithm beginners, longest-prefix-matching is the prerequisite to understand what dictionary can
do and what it can't do. The tutorial in `this book `_ can be very helpful.
Args:
trn: Path to training set.
dev: Path to dev set.
tst: Path to test set.
sampler_builder: A builder which builds a sampler.
dependencies: Its dependencies on other tasks.
scalar_mix: A builder which builds a `ScalarMixWithDropout` object.
use_raw_hidden_states: Whether to use raw hidden states from transformer without any pooling.
lr: Learning rate for this task.
separate_optimizer: Use customized separate optimizer for this task.
max_seq_len: Sentences longer than ``max_seq_len`` will be split into shorter ones if possible.
sent_delimiter: Delimiter between sentences, like period or comma, which indicates a long sentence can
be split here.
char_level: Whether the sequence length is measured at char level.
hard_constraint: Whether to enforce hard length constraint on sentences. If there is no ``sent_delimiter``
in a sentence, it will be split at a token anyway.
crf: ``True`` to enable CRF (:cite:`lafferty2001conditional`).
delimiter_in_entity: The delimiter between tokens in entity, which is used to rebuild entity by joining
tokens during decoding.
merge_types: The types of consecutive entities to be merged.
secondary_encoder: An optional secondary encoder to provide enhanced representation by taking the hidden
states from the main encoder as input.
token_key: The key to tokens in dataset. This should always be set to ``token`` in MTL.
dict_whitelist: A :class:`dict` or a :class:`~hanlp_trie.dictionary.DictInterface` of gazetteers to be
included into the final results.
dict_blacklist: A :class:`set` or a :class:`~hanlp_trie.dictionary.DictInterface` of badcases to be
excluded from the final results.
dict_tags: A custom dictionary to override predicted tags by performing longest-prefix-matching.
**kwargs: Not used.
"""
super().__init__(**merge_locals_kwargs(locals(), kwargs))
self.vocabs = VocabDict()
self.secondary_encoder = secondary_encoder
self.dict_whitelist = dict_whitelist
self.dict_blacklist = dict_blacklist
self.dict_tags = dict_tags
def build_dataloader(self,
data,
transform: Callable = None,
training=False,
device=None,
logger: logging.Logger = None,
cache=False,
gradient_accumulation=1,
**kwargs) -> DataLoader:
args = dict((k, self.config[k]) for k in
['delimiter', 'max_seq_len', 'sent_delimiter', 'char_level', 'hard_constraint'] if k in self.config)
dataset = self.build_dataset(data, cache=cache, transform=transform, **args)
dataset.append_transform(self.vocabs)
dataset.purge_cache()
if self.vocabs.mutable:
self.build_vocabs(dataset, logger)
return PadSequenceDataLoader(
batch_sampler=self.sampler_builder.build(
self.compute_lens(data, dataset),
shuffle=training, gradient_accumulation=gradient_accumulation),
device=device,
dataset=dataset)
def compute_loss(self,
batch: Dict[str, Any],
output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
criterion) -> Union[torch.FloatTensor, Dict[str, torch.FloatTensor]]:
return TransformerNamedEntityRecognizer.compute_loss(self, criterion, output, batch['tag_id'], batch['mask'])
def decode_output(self,
output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
mask: torch.BoolTensor,
batch: Dict[str, Any],
decoder,
**kwargs) -> Union[Dict[str, Any], Any]:
return TransformerNamedEntityRecognizer.decode_output(self, output, batch['mask'], batch, decoder)
def update_metrics(self,
batch: Dict[str, Any],
output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
prediction: Dict[str, Any],
metric: Union[MetricDict, Metric]):
return TransformerNamedEntityRecognizer.update_metrics(self, metric, output, batch['tag_id'], batch['mask'],
batch, prediction)
def build_model(self, encoder_size, training=True, **kwargs) -> torch.nn.Module:
return LinearCRFDecoder(encoder_size, len(self.vocabs['tag']), self.secondary_encoder, self.config.crf)
def build_metric(self, **kwargs):
return TransformerNamedEntityRecognizer.build_metric(self, **kwargs)
def input_is_flat(self, data) -> bool:
return TransformerNamedEntityRecognizer.input_is_flat(self, data)
def prediction_to_result(self, prediction: Dict[str, Any], batch: Dict[str, Any]) -> Union[List, Dict]:
return TransformerNamedEntityRecognizer.prediction_to_human(self, prediction, self.vocabs['tag'].idx_to_token,
batch)
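# The whitelist mechanics described in the docstring above boil down to
# longest-prefix-matching over the input. hanlp_trie implements this with a real
# trie; the pure-Python sketch below just scans a dict and is for illustration only.
def _longest_prefix_match(tokens, gazetteer):
    # gazetteer: {entity surface form: entity type}, e.g. {'纽约': 'LOCATION'}
    i, matches = 0, []
    while i < len(tokens):
        best = None
        for j in range(len(tokens), i, -1):  # prefer the longest match starting at i
            if ''.join(tokens[i:j]) in gazetteer:
                best = (i, j, gazetteer[''.join(tokens[i:j])])
                break
        if best:
            matches.append(best)
            i = best[1]
        else:
            i += 1
    return matches
# The longer entry wins, overriding whatever the statistical model predicted:
assert _longest_prefix_match(list('纽约时报'), {'纽约': 'LOCATION', '纽约时报': 'ORG'}) == [(0, 4, 'ORG')]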
================================================
FILE: hanlp/components/mtl/tasks/pos.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-10-19 18:56
import logging
from typing import Dict, Any, Union, Iterable, Callable, List, Tuple, Sequence
import torch
from torch.utils.data import DataLoader
from hanlp.common.dataset import SamplerBuilder, PadSequenceDataLoader
from hanlp.common.transform import VocabDict
from hanlp.components.mtl.tasks import Task
from hanlp.components.taggers.transformers.transformer_tagger import TransformerTagger
from hanlp.layers.crf.crf import CRF
from hanlp.layers.scalar_mix import ScalarMixWithDropoutBuilder
from hanlp.metrics.metric import Metric
from hanlp.metrics.mtl import MetricDict
from hanlp_common.util import merge_locals_kwargs
from hanlp_trie import DictInterface, TrieDict
class LinearCRFDecoder(torch.nn.Module):
def __init__(self,
hidden_size,
num_labels,
crf=False) -> None:
"""A linear layer with an optional CRF (:cite:`lafferty2001conditional`) layer on top of it.
Args:
hidden_size: Size of hidden states.
num_labels: Size of tag set.
crf: ``True`` to enable CRF (:cite:`lafferty2001conditional`).
"""
super().__init__()
self.classifier = torch.nn.Linear(hidden_size, num_labels)
self.crf = CRF(num_labels) if crf else None
def forward(self, contextualized_embeddings: torch.FloatTensor, batch: Dict[str, torch.Tensor], mask=None):
"""
Args:
contextualized_embeddings: Hidden states for contextual layer.
batch: A dict of a batch.
mask: Mask for tokens.
Returns:
Logits. Users are expected to call ``CRF.decode`` on these emissions during decoding and ``CRF.forward``
during training.
"""
return self.classifier(contextualized_embeddings)
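# A runnable sketch of the decoder above: the linear layer only produces emissions,
# and (as the docstring notes) it is up to the caller to feed them to ``CRF.forward``
# during training or ``CRF.decode`` during decoding. Shapes below are illustrative.
_demo_decoder = LinearCRFDecoder(hidden_size=8, num_labels=4, crf=False)
_demo_emissions = _demo_decoder(torch.randn(2, 5, 8), batch={}, mask=None)
assert _demo_emissions.shape == (2, 5, 4)  # [batch, seq_len, num_labels]
del _demo_decoder, _demo_emissions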
class TransformerTagging(Task, TransformerTagger):
def __init__(self,
trn: str = None,
dev: str = None,
tst: str = None,
sampler_builder: SamplerBuilder = None,
dependencies: str = None,
scalar_mix: ScalarMixWithDropoutBuilder = None,
use_raw_hidden_states=False,
lr=1e-3,
separate_optimizer=False,
cls_is_bos=False,
sep_is_eos=False,
max_seq_len=None,
sent_delimiter=None,
char_level=False,
hard_constraint=False,
crf=False,
token_key='token',
dict_tags: Union[
DictInterface, Union[Dict[Union[str, Sequence[str]], Union[str, Sequence[str]]]]] = None,
**kwargs) -> None:
"""A simple tagger using a linear layer with an optional CRF (:cite:`lafferty2001conditional`) layer for
any tagging task including PoS tagging and many others. It also features a custom dictionary ``dict_tags``
that performs longest-prefix-matching to replace the tags of matched tokens.
.. Note:: For algorithm beginners, longest-prefix-matching is a prerequisite for understanding what a dictionary
can and cannot do. The tutorial in `this book `_ can be very helpful.
Args:
trn: Path to training set.
dev: Path to dev set.
tst: Path to test set.
sampler_builder: A builder which builds a sampler.
dependencies: Its dependencies on other tasks.
scalar_mix: A builder which builds a `ScalarMixWithDropout` object.
use_raw_hidden_states: Whether to use raw hidden states from transformer without any pooling.
lr: Learning rate for this task.
separate_optimizer: Use customized separate optimizer for this task.
cls_is_bos: ``True`` to treat the first token as ``BOS``.
sep_is_eos: ``True`` to treat the last token as ``EOS``.
max_seq_len: Sentences longer than ``max_seq_len`` will be split into shorter ones if possible.
sent_delimiter: Delimiter between sentences, like period or comma, which indicates a long sentence can
be split here.
char_level: Whether the sequence length is measured at char level.
hard_constraint: Whether to enforce hard length constraint on sentences. If there is no ``sent_delimiter``
in a sentence, it will be split at a token anyway.
crf: ``True`` to enable CRF (:cite:`lafferty2001conditional`).
token_key: The key to tokens in dataset. This should always be set to ``token`` in MTL.
dict_tags: A custom dictionary to override predicted tags by performing longest-prefix-matching.
**kwargs: Not used.
"""
super().__init__(**merge_locals_kwargs(locals(), kwargs))
self.vocabs = VocabDict()
self.dict_tags = dict_tags
def build_dataloader(self,
data,
transform: Callable = None,
training=False,
device=None,
logger: logging.Logger = None,
cache=False,
gradient_accumulation=1,
**kwargs) -> DataLoader:
args = dict((k, self.config[k]) for k in
['delimiter', 'max_seq_len', 'sent_delimiter', 'char_level', 'hard_constraint'] if k in self.config)
dataset = self.build_dataset(data, cache=True, transform=transform, **args)
dataset.append_transform(self.vocabs)
if self.vocabs.mutable:
self.build_vocabs(dataset, logger)
return PadSequenceDataLoader(
batch_sampler=self.sampler_builder.build(self.compute_lens(data, dataset),
shuffle=training, gradient_accumulation=gradient_accumulation),
device=device,
dataset=dataset)
def compute_loss(self,
batch: Dict[str, Any],
output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
criterion) -> Union[torch.FloatTensor, Dict[str, torch.FloatTensor]]:
return TransformerTagger.compute_loss(self, criterion, output, batch['tag_id'], batch['mask'])
def decode_output(self,
output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
mask: torch.BoolTensor,
batch: Dict[str, Any],
decoder,
**kwargs) -> Union[Dict[str, Any], Any]:
return TransformerTagger.decode_output(self, output, mask, batch, decoder)
def update_metrics(self,
batch: Dict[str, Any],
output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
prediction: Dict[str, Any],
metric: Union[MetricDict, Metric]):
return TransformerTagger.update_metrics(self, metric, output, batch['tag_id'], batch['mask'])
def build_model(self, encoder_size, training=True, **kwargs) -> torch.nn.Module:
return LinearCRFDecoder(encoder_size, len(self.vocabs['tag']), self.config.crf)
def build_metric(self, **kwargs):
return TransformerTagger.build_metric(self, **kwargs)
def input_is_flat(self, data) -> bool:
return TransformerTagger.input_is_flat(self, data)
def prediction_to_result(self, prediction: Dict[str, Any], batch: Dict[str, Any]) -> Union[List, Dict]:
return TransformerTagger.prediction_to_human(self, prediction, self.vocabs['tag'].idx_to_token, batch)
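# A hedged usage sketch for ``dict_tags`` above: keys may be a single surface form
# or a sequence of tokens, values the tag(s) to force on matches. The model constant
# is real, but the dictionary entries and tags below are illustrative assumptions:
#
#   import hanlp
#   mtl = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH)
#   mtl['pos/ctb'].dict_tags = {'HanLP': 'NR', ('的', '希望'): ('DEG', 'NN')}
#   mtl('HanLP为生产环境带来次世代最先进的多语种NLP技术。', tasks='pos/ctb')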
================================================
FILE: hanlp/components/mtl/tasks/sdp.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-08-13 21:39
import logging
from typing import Dict, Any, Union, Iterable, List
import torch
from torch.optim import Adam
from torch.optim.lr_scheduler import ExponentialLR
from torch.utils.data import DataLoader
from hanlp.common.dataset import SamplerBuilder, PadSequenceDataLoader
from hanlp.common.transform import VocabDict, TransformList
from hanlp.components.mtl.tasks import Task
from hanlp.components.parsers.biaffine.biaffine_model import BiaffineDecoder
from hanlp.components.parsers.biaffine.biaffine_sdp import BiaffineSemanticDependencyParser
from hanlp.layers.scalar_mix import ScalarMixWithDropoutBuilder
from hanlp.metrics.metric import Metric
from hanlp.metrics.mtl import MetricDict
from hanlp.utils.time_util import CountdownTimer
from hanlp_common.util import merge_locals_kwargs
class BiaffineSemanticDependencyParsing(Task, BiaffineSemanticDependencyParser):
def __init__(self,
trn: str = None,
dev: str = None,
tst: str = None,
sampler_builder: SamplerBuilder = None,
dependencies: str = None,
scalar_mix: ScalarMixWithDropoutBuilder = None,
use_raw_hidden_states=False,
lr=2e-3, separate_optimizer=False,
punct=False,
tree=True,
pad_rel=None,
apply_constraint=False,
single_root=True,
no_zero_head=None,
n_mlp_arc=500,
n_mlp_rel=100,
mlp_dropout=.33,
mu=.9,
nu=.9,
epsilon=1e-12,
decay=.75,
decay_steps=5000,
cls_is_bos=True,
use_pos=False,
**kwargs) -> None:
r"""Implementation of "Stanford's graph-based neural dependency parser at
the CoNLL 2017 shared task" (:cite:`dozat2017stanford`) and "Establishing Strong Baselines for the New Decade"
(:cite:`he-choi-2019`).
Args:
trn: Path to training set.
dev: Path to dev set.
tst: Path to test set.
sampler_builder: A builder which builds a sampler.
dependencies: Its dependencies on other tasks.
scalar_mix: A builder which builds a `ScalarMixWithDropout` object.
use_raw_hidden_states: Whether to use raw hidden states from transformer without any pooling.
lr: Learning rate for this task.
separate_optimizer: Use customized separate optimizer for this task.
punct: ``True`` to include punctuations in evaluation.
tree: ``True`` to enforce tree constraint.
pad_rel: Padding token for relations.
apply_constraint: Enforce constraints (see the following parameters).
single_root: Force a single root.
no_zero_head: Enforce that every token has at least one head.
n_mlp_arc: Number of features for arc representation.
n_mlp_rel: Number of features for rel representation.
mlp_dropout: Dropout applied to MLPs.
mu: First coefficient used for computing running averages of gradient and its square in Adam.
nu: Second coefficient used for computing running averages of gradient and its square in Adam.
epsilon: Term added to the denominator to improve numerical stability.
decay: Decay rate for the exponential lr scheduler.
decay_steps: Decay every ``decay_steps`` steps.
cls_is_bos: ``True`` to treat the first token as ``BOS``.
use_pos: Use pos feature.
**kwargs: Not used.
"""
super().__init__(**merge_locals_kwargs(locals(), kwargs))
self.vocabs = VocabDict()
def update_metrics(self, batch: Dict[str, Any],
output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
prediction: Dict[str, Any], metric: Union[MetricDict, Metric]):
BiaffineSemanticDependencyParser.update_metric(self, *prediction, batch['arc'], batch['rel_id'], output[1],
output[-1], metric, batch)
def decode_output(self,
output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
mask: torch.BoolTensor,
batch: Dict[str, Any],
decoder, **kwargs) -> Union[Dict[str, Any], Any]:
(arc_scores, rel_scores), mask, punct_mask = output
return BiaffineSemanticDependencyParser.decode(self, arc_scores, rel_scores, mask, batch)
def compute_loss(self, batch: Dict[str, Any],
output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any], criterion) -> \
Union[torch.FloatTensor, Dict[str, torch.FloatTensor]]:
(arc_scores, rel_scores), mask, punct_mask = output
return BiaffineSemanticDependencyParser.compute_loss(self, arc_scores, rel_scores, batch['arc'],
batch['rel_id'], mask, criterion,
batch)
def build_model(self, encoder_size, training=True, **kwargs) -> torch.nn.Module:
return BiaffineDecoder(encoder_size, self.config.n_mlp_arc, self.config.n_mlp_rel, self.config.mlp_dropout,
len(self.vocabs.rel))
def build_metric(self, **kwargs):
return BiaffineSemanticDependencyParser.build_metric(self, **kwargs)
def build_dataloader(self, data, transform: TransformList = None, training=False, device=None,
logger: logging.Logger = None, gradient_accumulation=1, **kwargs) -> DataLoader:
dataset = BiaffineSemanticDependencyParser.build_dataset(self, data, transform)
dataset.purge_cache()
if self.vocabs.mutable:
BiaffineSemanticDependencyParser.build_vocabs(self, dataset, logger, transformer=True)
if isinstance(data, str):
timer = CountdownTimer(len(dataset))
BiaffineSemanticDependencyParser.cache_dataset(self, dataset, timer, training, logger)
return PadSequenceDataLoader(
batch_sampler=self.sampler_builder.build(self.compute_lens(data, dataset),
shuffle=training, gradient_accumulation=gradient_accumulation),
device=device,
dataset=dataset,
pad=self.get_pad_dict())
def feed_batch(self, h: torch.FloatTensor, batch: Dict[str, torch.Tensor], mask: torch.BoolTensor,
decoder: torch.nn.Module):
logits = super().feed_batch(h, batch, mask, decoder)
arc_scores = logits[0]
mask = mask.clone()
mask[:, 0] = 0
mask = self.convert_to_3d_mask(arc_scores, mask)
punct_mask = self.convert_to_3d_puncts(batch.get('punct_mask', None), mask)
return logits, mask, punct_mask
def build_optimizer(self, decoder: torch.nn.Module, **kwargs):
config = self.config
optimizer = Adam(decoder.parameters(),
config.lr,
(config.mu, config.nu),
config.epsilon)
scheduler = ExponentialLR(optimizer, config.decay ** (1 / config.decay_steps))
return optimizer, scheduler
def input_is_flat(self, data) -> bool:
return BiaffineSemanticDependencyParser.input_is_flat(self, data, self.config.use_pos)
def prediction_to_result(self, prediction: Dict[str, Any], batch: Dict[str, Any]) -> List:
arcs, rels = prediction
arcs = arcs[:, 1:, :] # Skip the ROOT
rels = rels[:, 1:, :]
arcs = arcs.tolist()
rels = rels.tolist()
vocab = self.vocabs['rel'].idx_to_token
for arcs_per_sent, rels_per_sent, tokens in zip(arcs, rels, batch['token']):
tokens = tokens[1:]
sent_len = len(tokens)
result = []
for a, r in zip(arcs_per_sent[:sent_len], rels_per_sent[:sent_len]):
heads = [i for i in range(sent_len + 1) if a[i]]
deprels = [vocab[r[i]] for i in range(sent_len + 1) if a[i]]
result.append(list(zip(heads, deprels)))
yield result
def build_samples(self, inputs, cls_is_bos=False, sep_is_eos=False):
return BiaffineSemanticDependencyParser.build_samples(self, inputs, self.config.use_pos)
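# ``prediction_to_result`` above yields, per sentence, one list of (head, deprel)
# pairs per token; unlike tree parsing, a token in a semantic dependency graph may
# have zero or several heads. A hand-made (not model-produced) result for a
# 3-token sentence, using SemEval-16 style labels:
#
#   [[(2, 'Agt')],                 # token 1 depends on token 2 as its agent
#    [(0, 'Root')],                # token 2 is the root
#    [(2, 'Cont'), (1, 'Poss')]]   # token 3 has two heads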
================================================
FILE: hanlp/components/mtl/tasks/srl/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-04 16:49
================================================
FILE: hanlp/components/mtl/tasks/srl/bio_srl.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-04 16:50
import logging
from typing import Dict, Any, List, Union, Iterable, Callable
import torch
from torch.utils.data import DataLoader
from hanlp.common.dataset import PadSequenceDataLoader, SamplerBuilder
from hanlp.common.transform import VocabDict
from hanlp.components.mtl.tasks import Task
from hanlp.components.srl.span_bio.baffine_tagging import BiaffineTaggingDecoder
from hanlp.components.srl.span_bio.span_bio import SpanBIOSemanticRoleLabeler
from hanlp.layers.scalar_mix import ScalarMixWithDropoutBuilder
from hanlp.metrics.metric import Metric
from hanlp.metrics.mtl import MetricDict
from hanlp_common.util import merge_locals_kwargs
import torch.nn.functional as F
class SpanBIOSemanticRoleLabeling(Task, SpanBIOSemanticRoleLabeler):
def __init__(self,
trn: str = None,
dev: str = None,
tst: str = None,
sampler_builder: SamplerBuilder = None,
dependencies: str = None,
scalar_mix: ScalarMixWithDropoutBuilder = None,
use_raw_hidden_states=False,
lr=None,
separate_optimizer=False,
cls_is_bos=False,
sep_is_eos=False,
crf=False,
n_mlp_rel=300,
mlp_dropout=0.2,
loss_reduction='mean',
doc_level_offset=True,
**kwargs) -> None:
"""A span based Semantic Role Labeling task using BIO scheme for tagging the role of each token. Given a
predicate and a token, it uses biaffine (:cite:`dozat:17a`) to predict their relations as one of BIO-ROLE.
Args:
trn: Path to training set.
dev: Path to dev set.
tst: Path to test set.
sampler_builder: A builder which builds a sampler.
dependencies: Its dependencies on other tasks.
scalar_mix: A builder which builds a `ScalarMixWithDropout` object.
use_raw_hidden_states: Whether to use raw hidden states from transformer without any pooling.
lr: Learning rate for this task.
separate_optimizer: Use customized separate optimizer for this task.
cls_is_bos: ``True`` to treat the first token as ``BOS``.
sep_is_eos: ``True`` to treat the last token as ``EOS``.
crf: ``True`` to enable CRF (:cite:`lafferty2001conditional`).
n_mlp_rel: Output size of MLPs for representing predicate and tokens.
mlp_dropout: Dropout applied to MLPs.
loss_reduction: Loss reduction for aggregating losses.
doc_level_offset: ``True`` to indicate the offsets in ``jsonlines`` are of document level.
**kwargs: Not used.
"""
super().__init__(**merge_locals_kwargs(locals(), kwargs))
self.vocabs = VocabDict()
def build_dataloader(self, data, transform: Callable = None, training=False, device=None,
logger: logging.Logger = None, cache=False, gradient_accumulation=1, **kwargs) -> DataLoader:
dataset = self.build_dataset(data, transform=[transform, self.vocabs])
dataset.purge_cache()
if self.vocabs.mutable:
SpanBIOSemanticRoleLabeler.build_vocabs(self, dataset, logger)
return PadSequenceDataLoader(
batch_sampler=self.sampler_builder.build(self.compute_lens(data, dataset), shuffle=training,
gradient_accumulation=gradient_accumulation),
device=device,
dataset=dataset)
def compute_loss(self, batch: Dict[str, Any],
output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any], criterion) -> \
Union[torch.FloatTensor, Dict[str, torch.FloatTensor]]:
pred, mask = output
return SpanBIOSemanticRoleLabeler.compute_loss(self, criterion, pred, batch['srl_id'], mask)
def decode_output(self,
output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
mask: torch.BoolTensor,
batch: Dict[str, Any],
decoder: torch.nn.Module, **kwargs) -> Union[Dict[str, Any], Any]:
pred, mask = output
return SpanBIOSemanticRoleLabeler.decode_output(self, pred, mask, batch, decoder)
def update_metrics(self, batch: Dict[str, Any],
output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
prediction: Dict[str, Any], metric: Union[MetricDict, Metric]):
return SpanBIOSemanticRoleLabeler.update_metrics(self, metric, prediction, batch)
def build_model(self, encoder_size, training=True, **kwargs) -> torch.nn.Module:
return BiaffineTaggingDecoder(
len(self.vocabs['srl']),
encoder_size,
self.config.n_mlp_rel,
self.config.mlp_dropout,
self.config.crf,
)
def feed_batch(self, h: torch.FloatTensor, batch: Dict[str, torch.Tensor], mask: torch.BoolTensor,
decoder: torch.nn.Module):
if not h.numel(): # No tokens, don't bother to run the decoder
return [], None
pred = decoder(h)
mask3d = self.compute_mask(mask)
if self.config.crf:
token_index = mask3d[0]
pred = pred.flatten(end_dim=1)[token_index]
pred = F.log_softmax(pred, dim=-1)
return pred, mask3d
def build_metric(self, **kwargs):
return SpanBIOSemanticRoleLabeler.build_metric(self)
def input_is_flat(self, data) -> bool:
return SpanBIOSemanticRoleLabeler.input_is_flat(self, data)
def prediction_to_result(self, prediction: List, batch: Dict[str, Any]) -> List:
yield from SpanBIOSemanticRoleLabeler.prediction_to_result(self, prediction, batch)
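# A minimal sketch of the BIO-ROLE scheme the docstring above describes: per
# predicate, each token receives B-ROLE/I-ROLE/O, and contiguous runs collapse into
# argument spans. Pure Python, for illustration only.
def _bio_to_spans(tags):
    spans, start, role = [], None, None
    for i, tag in enumerate(tags + ['O']):  # the sentinel flushes the last span
        if tag.startswith('B-') or tag == 'O':
            if role is not None:
                spans.append((start, i, role))
                role = None
            if tag.startswith('B-'):
                start, role = i, tag[2:]
        # an 'I-*' tag simply extends the current span
    return spans
assert _bio_to_spans(['B-ARG0', 'O', 'B-ARG1', 'I-ARG1']) == [(0, 1, 'ARG0'), (2, 4, 'ARG1')]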
================================================
FILE: hanlp/components/mtl/tasks/srl/rank_srl.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-08-05 15:43
import logging
from typing import Union, List, Dict, Any, Iterable, Callable
import torch
from torch.utils.data import DataLoader
from hanlp.common.dataset import SamplerBuilder, PadSequenceDataLoader
from hanlp.common.transform import VocabDict
from hanlp.components.mtl.tasks import Task
from hanlp.components.srl.span_rank.span_rank import SpanRankingSemanticRoleLabeler
from hanlp.components.srl.span_rank.span_ranking_srl_model import SpanRankingSRLDecoder
from hanlp.layers.scalar_mix import ScalarMixWithDropoutBuilder
from hanlp.metrics.metric import Metric
from hanlp.metrics.mtl import MetricDict
from hanlp_common.util import merge_locals_kwargs
class SpanRankingSemanticRoleLabeling(Task, SpanRankingSemanticRoleLabeler):
def __init__(self, trn: str = None, dev: str = None, tst: str = None, sampler_builder: SamplerBuilder = None,
dependencies: str = None, scalar_mix: ScalarMixWithDropoutBuilder = None, use_raw_hidden_states=False,
lr=1e-3, separate_optimizer=False,
lexical_dropout=0.5,
dropout=0.2,
span_width_feature_size=20,
ffnn_size=150,
ffnn_depth=2,
argument_ratio=0.8,
predicate_ratio=0.4,
max_arg_width=30,
mlp_label_size=100,
enforce_srl_constraint=False,
use_gold_predicates=False,
doc_level_offset=True,
use_biaffine=False,
loss_reduction='mean',
with_argument=' ',
**kwargs) -> None:
r""" An implementation of "Jointly Predicting Predicates and Arguments in Neural Semantic Role Labeling"
(:cite:`he-etal-2018-jointly`). It generates candidates triples of (predicate, arg_start, arg_end) and rank them.
Args:
trn: Path to training set.
dev: Path to dev set.
tst: Path to test set.
sampler_builder: A builder which builds a sampler.
dependencies: Its dependencies on other tasks.
scalar_mix: A builder which builds a `ScalarMixWithDropout` object.
use_raw_hidden_states: Whether to use raw hidden states from transformer without any pooling.
lr: Learning rate for this task.
separate_optimizer: Use customized separate optimizer for this task.
lexical_dropout: Dropout applied to hidden states of encoder.
dropout: Dropout used for other layers except the encoder.
span_width_feature_size: Span width feature size.
ffnn_size: Feedforward size.
ffnn_depth: Number of layers of feedforward MLPs.
argument_ratio: Ratio of candidate arguments over number of tokens.
predicate_ratio: Ratio of candidate predicates over number of tokens.
max_arg_width: Maximum argument width.
mlp_label_size: Feature size for label representation.
enforce_srl_constraint: Enforce SRL constraints (number of core ARGs etc.).
use_gold_predicates: Use gold predicates instead of predicting them.
doc_level_offset: ``True`` to indicate the offsets in ``jsonlines`` are of document level.
use_biaffine: ``True`` to use biaffine (:cite:`dozat:17a`) instead of a linear layer for label prediction.
loss_reduction: The loss reduction used in aggregating losses.
with_argument: The delimiter used to join the tokens of an argument when building outputs.
**kwargs: Not used.
"""
super().__init__(**merge_locals_kwargs(locals(), kwargs))
self.vocabs = VocabDict()
def build_dataloader(self, data, transform: Callable = None, training=False, device=None,
logger: logging.Logger = None, gradient_accumulation=1, **kwargs) -> DataLoader:
dataset = self.build_dataset(data, isinstance(data, list), logger, transform)
dataset.purge_cache()
return PadSequenceDataLoader(
batch_sampler=self.sampler_builder.build(self.compute_lens(data, dataset), shuffle=training,
gradient_accumulation=gradient_accumulation),
device=device,
dataset=dataset)
def update_metrics(self, batch: Dict[str, Any],
output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
prediction: Dict[str, Any], metric: Union[MetricDict, Metric]):
return SpanRankingSemanticRoleLabeler.update_metrics(self, batch, {'prediction': prediction},
tuple(metric.values()))
def decode_output(self,
output: Dict[str, Any],
mask: torch.BoolTensor,
batch: Dict[str, Any],
decoder, **kwargs) -> Union[Dict[str, Any], Any]:
return SpanRankingSemanticRoleLabeler.decode_output(self, output, batch)
def compute_loss(self, batch: Dict[str, Any],
output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any], criterion) -> \
Union[torch.FloatTensor, Dict[str, torch.FloatTensor]]:
return output['loss']
def build_model(self, encoder_size, training=True, **kwargs) -> torch.nn.Module:
return SpanRankingSRLDecoder(encoder_size, len(self.vocabs.srl_label), self.config)
def build_metric(self, **kwargs):
predicate_f1, end_to_end_f1 = SpanRankingSemanticRoleLabeler.build_metric(self, **kwargs)
return MetricDict({'predicate': predicate_f1, 'e2e': end_to_end_f1})
def build_criterion(self, **kwargs):
pass
def input_is_flat(self, data) -> bool:
return SpanRankingSemanticRoleLabeler.input_is_flat(self, data)
def prediction_to_result(self, prediction: Dict[str, Any], batch: Dict[str, Any]) -> List:
return SpanRankingSemanticRoleLabeler.format_dict_to_results(batch['token'], prediction, exclusive_offset=True,
with_predicate=True,
with_argument=self.config.get('with_argument',
' '),
label_first=True)
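# A back-of-the-envelope for the pruning parameters above, following the candidate
# generation scheme of :cite:`he-etal-2018-jointly` (figures are illustrative):
# for a 100-token sentence, spans are capped at ``max_arg_width`` and the top
# ``argument_ratio * n`` arguments / ``predicate_ratio * n`` predicates are kept.
_n = 100
_spans = sum(min(30, _n - start) for start in range(_n))  # 2565 candidate spans
_pairs = int(0.8 * _n) * int(0.4 * _n)                    # 80 * 40 = 3200 scored pairs
assert (_spans, _pairs) == (2565, 3200)
del _n, _spans, _pairs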
================================================
FILE: hanlp/components/mtl/tasks/tok/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-08-11 16:34
================================================
FILE: hanlp/components/mtl/tasks/tok/reg_tok.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-08-02 16:51
import logging
from typing import Union, List, Dict, Any, Iterable, Tuple
import torch
from hanlp_common.util import merge_locals_kwargs
from torch import Tensor
from torch.utils.data import DataLoader
import hanlp.utils.torch_util
from hanlp.common.dataset import SamplerBuilder, PadSequenceDataLoader
from hanlp.common.transform import FieldLength, TransformList
from hanlp.components.mtl.tasks import Task
from hanlp.datasets.tokenization.loaders.txt import TextTokenizingDataset
from hanlp.layers.scalar_mix import ScalarMixWithDropoutBuilder
from hanlp.layers.transformers.pt_imports import PreTrainedTokenizer
from hanlp.metrics.chunking.binary_chunking_f1 import BinaryChunkingF1
from hanlp.transform.transformer_tokenizer import TransformerSequenceTokenizer
def generate_token_span_tuple(sample: dict):
prefix_mask = sample.get('text_prefix_mask', None)
if prefix_mask:
sample['span_tuple'] = spans = []
previous_prefix = 0
prefix_mask_ = prefix_mask[1:-1]
for i, mask in enumerate(prefix_mask_):
if i and mask:
spans.append((previous_prefix, i))
previous_prefix = i
spans.append((previous_prefix, len(prefix_mask_)))
return sample
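# A worked example of the transform above: ``text_prefix_mask`` marks subtokens that
# begin a token, with positions 0 and -1 being [CLS]/[SEP]. Here subtokens 1 and 3
# start tokens, so in [CLS]-stripped coordinates the two token spans are (0, 2)
# and (2, 4).
_demo = generate_token_span_tuple({'text_prefix_mask': [False, True, False, True, False, False]})
assert _demo['span_tuple'] == [(0, 2), (2, 4)]
del _demo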
class RegressionTokenizingDecoder(torch.nn.Linear):
def __init__(self, in_features: int, out_features: int = 1, bias: bool = True) -> None:
super().__init__(in_features, out_features, bias)
# noinspection PyMethodOverriding
def forward(self, input: Tensor, **kwargs) -> Tensor:
return super().forward(input[:, 1:-1, :]).squeeze_(-1)
class RegressionTokenization(Task):
def __init__(self, trn: str = None, dev: str = None, tst: str = None, sampler_builder: SamplerBuilder = None,
dependencies: str = None, scalar_mix: ScalarMixWithDropoutBuilder = None,
use_raw_hidden_states=True, lr=1e-3, separate_optimizer=False, delimiter=None,
max_seq_len=None, sent_delimiter=None) -> None:
super().__init__(**merge_locals_kwargs(locals()))
def build_criterion(self, **kwargs):
return torch.nn.BCEWithLogitsLoss(reduction='mean')
def build_metric(self, **kwargs):
return BinaryChunkingF1()
# noinspection PyMethodOverriding
def build_model(self, encoder_size, training=True, **kwargs) -> torch.nn.Module:
return RegressionTokenizingDecoder(encoder_size)
def predict(self, data: Union[str, List[str]], batch_size: int = None, **kwargs):
pass
def build_dataloader(self,
data,
transform: TransformList = None,
training=False,
device=None,
logger: logging.Logger = None,
tokenizer: PreTrainedTokenizer = None,
**kwargs) -> DataLoader:
assert tokenizer
dataset = TextTokenizingDataset(data, cache=True, delimiter=self.config.sent_delimiter,
generate_idx=isinstance(data, list),
max_seq_len=self.config.max_seq_len,
sent_delimiter=self.config.sent_delimiter,
transform=[
TransformerSequenceTokenizer(tokenizer,
'text',
ret_prefix_mask=True,
ret_subtokens=True,
),
FieldLength('text_input_ids', 'text_input_ids_length', delta=-2),
generate_token_span_tuple])
return PadSequenceDataLoader(
batch_sampler=self.sampler_builder.build(self.compute_lens(data, dataset, 'text_input_ids'),
shuffle=training),
device=device,
dataset=dataset)
def decode_output(self,
output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
batch: Dict[str, Any], **kwargs) -> List[Tuple[int, int]]:
spans = BinaryChunkingF1.decode_spans(output > 0, batch['text_input_ids_length'])
return spans
def update_metrics(self, batch: Dict[str, Any],
output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
prediction: List[Tuple[int, int]], metric: BinaryChunkingF1):
metric.update(prediction, batch['span_tuple'])
def compute_loss(self, batch: Dict[str, Any],
output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any], criterion):
mask = hanlp.utils.torch_util.lengths_to_mask(batch['text_input_ids_length'])
return criterion(output[mask], batch['text_prefix_mask'][:, 1:-1][mask].to(torch.float))
================================================
FILE: hanlp/components/mtl/tasks/tok/tag_tok.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-08-11 16:35
import logging
from typing import Dict, Any, Union, Iterable, List, Set
import torch
from torch.utils.data import DataLoader
from hanlp.common.dataset import SamplerBuilder, PadSequenceDataLoader
from hanlp.common.transform import VocabDict, TransformList
from hanlp.components.mtl.tasks import Task
from hanlp.components.tokenizers.transformer import TransformerTaggingTokenizer
from hanlp.layers.crf.crf import CRF
from hanlp.layers.scalar_mix import ScalarMixWithDropoutBuilder
from hanlp.metrics.metric import Metric
from hanlp.metrics.mtl import MetricDict
from hanlp.transform.transformer_tokenizer import TransformerSequenceTokenizer
from hanlp_common.util import merge_locals_kwargs
from hanlp_trie import DictInterface, TrieDict
class LinearCRFDecoder(torch.nn.Module):
def __init__(self,
hidden_size,
num_labels,
crf=False) -> None:
super().__init__()
self.classifier = torch.nn.Linear(hidden_size, num_labels)
self.crf = CRF(num_labels) if crf else None
def forward(self, contextualized_embeddings: torch.FloatTensor, batch: Dict[str, torch.Tensor], mask=None):
return self.classifier(contextualized_embeddings[:, 1:-1, :])
class TaggingTokenization(Task, TransformerTaggingTokenizer):
def __init__(self,
trn: str = None,
dev: str = None,
tst: str = None,
sampler_builder: SamplerBuilder = None,
dependencies: str = None,
scalar_mix: ScalarMixWithDropoutBuilder = None,
use_raw_hidden_states=False,
lr=1e-3, separate_optimizer=False,
cls_is_bos=True,
sep_is_eos=True,
delimiter=None,
max_seq_len=None, sent_delimiter=None, char_level=False, hard_constraint=False,
transform=None,
tagging_scheme='BMES',
crf=False,
token_key='token',
dict_force: Union[DictInterface, Union[Dict[str, Any], Set[str]]] = None,
dict_combine: Union[DictInterface, Union[Dict[str, Any], Set[str]]] = None,
**kwargs) -> None:
"""Tokenization which casts a chunking problem into a tagging problem.
This task has to create batch of tokens containing both [CLS] and [SEP] since it's usually the first task
and later tasks might need them.
Args:
trn: Path to training set.
dev: Path to dev set.
tst: Path to test set.
sampler_builder: A builder which builds a sampler.
dependencies: Its dependencies on other tasks.
scalar_mix: A builder which builds a `ScalarMixWithDropout` object.
use_raw_hidden_states: Whether to use raw hidden states from transformer without any pooling.
lr: Learning rate for this task.
separate_optimizer: Use customized separate optimizer for this task.
cls_is_bos: ``True`` to treat the first token as ``BOS``.
sep_is_eos: ``True`` to treat the last token as ``EOS``.
delimiter: Delimiter used to split a line in the corpus.
max_seq_len: Sentences longer than ``max_seq_len`` will be split into shorter ones if possible.
sent_delimiter: Delimiter between sentences, like period or comma, which indicates a long sentence can
be split here.
char_level: Whether the sequence length is measured at char level.
hard_constraint: Whether to enforce hard length constraint on sentences. If there is no ``sent_delimiter``
in a sentence, it will be split at a token anyway.
transform: An optional transform to be applied to samples. Usually a character normalization transform is
passed in.
tagging_scheme: Either ``BMES`` or ``BI``.
crf: ``True`` to enable CRF (:cite:`lafferty2001conditional`).
token_key: The key to tokens in dataset. This should always be set to ``token`` in MTL.
**kwargs: Not used.
"""
super().__init__(**merge_locals_kwargs(locals(), kwargs, excludes=(
'self', 'kwargs', '__class__', 'dict_force', 'dict_combine')))  # excluded so the property setters below can normalize them
self.transform = transform
self.vocabs = VocabDict()
self.dict_force = dict_force
self.dict_combine = dict_combine
def build_dataloader(self, data, transform: TransformList = None, training=False, device=None,
logger: logging.Logger = None, cache=False, gradient_accumulation=1, **kwargs) -> DataLoader:
args = dict((k, self.config[k]) for k in
['delimiter', 'max_seq_len', 'sent_delimiter', 'char_level', 'hard_constraint'] if k in self.config)
# We only need those transforms before TransformerTokenizer
transformer_index = transform.index_by_type(TransformerSequenceTokenizer)
assert transformer_index is not None
transform = transform[:transformer_index + 1]
if self.transform:
transform.insert(0, self.transform)
transform.append(self.last_transform())
dataset = self.build_dataset(data, cache=cache, transform=transform, **args)
dataset.purge_cache()
if self.vocabs.mutable:
self.build_vocabs(dataset, logger)
return PadSequenceDataLoader(
batch_sampler=self.sampler_builder.build(self.compute_lens(data, dataset),
shuffle=training, gradient_accumulation=gradient_accumulation),
device=device,
dataset=dataset)
def compute_loss(self,
batch: Dict[str, Any],
output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
criterion) -> Union[torch.FloatTensor, Dict[str, torch.FloatTensor]]:
return TransformerTaggingTokenizer.compute_loss(self, criterion, output, batch['tag_id'], batch['mask'])
def decode_output(self, output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
mask: torch.BoolTensor, batch: Dict[str, Any], decoder, **kwargs) -> Union[Dict[str, Any], Any]:
return TransformerTaggingTokenizer.decode_output(self, output, mask, batch, decoder)
def update_metrics(self, batch: Dict[str, Any],
output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
prediction: Dict[str, Any], metric: Union[MetricDict, Metric]):
TransformerTaggingTokenizer.update_metrics(self, metric, output, batch['tag_id'], None, batch, prediction)
def build_model(self, encoder_size, training=True, **kwargs) -> torch.nn.Module:
return LinearCRFDecoder(encoder_size, len(self.vocabs['tag']), self.config.crf)
def build_metric(self, **kwargs):
return TransformerTaggingTokenizer.build_metric(self)
def build_criterion(self, model=None, **kwargs):
return TransformerTaggingTokenizer.build_criterion(self, model=model, reduction='mean')
def input_is_flat(self, data) -> bool:
return TransformerTaggingTokenizer.input_is_flat(self, data)
def prediction_to_result(self, prediction: Dict[str, Any], batch: Dict[str, Any]) -> Union[List, Dict]:
return TransformerTaggingTokenizer.prediction_to_human(self, prediction, None, batch, rebuild_span=True)
def build_tokenizer(self, tokenizer: TransformerSequenceTokenizer):
# The transform for the tokenizer needs special settings; ensure they are set properly.
return TransformerSequenceTokenizer(
tokenizer.tokenizer,
tokenizer.input_key,
tokenizer.output_key,
tokenizer.max_seq_length,
tokenizer.truncate_long_sequences,
ret_subtokens=True,
ret_subtokens_group=True,
ret_token_span=True,
cls_is_bos=True,
sep_is_eos=True,
use_fast=tokenizer.tokenizer.is_fast,
dict_force=self.dict_force,
strip_cls_sep=False,
)
def build_samples(self, inputs, cls_is_bos=False, sep_is_eos=False):
return [{self.config.token_key: sent} for sent in inputs]
@property
def dict_force(self) -> DictInterface:
return TransformerTaggingTokenizer.dict_force.fget(self)
@dict_force.setter
def dict_force(self, dictionary: Union[DictInterface, Union[Dict[str, Any], Set[str]]]):
if dictionary is not None and not isinstance(dictionary, DictInterface):
dictionary = TrieDict(dictionary)
self.config.dict_force = dictionary
@property
def dict_combine(self) -> DictInterface:
return TransformerTaggingTokenizer.dict_combine.fget(self)
@dict_combine.setter
def dict_combine(self, dictionary: Union[DictInterface, Union[Dict[str, Any], Set[str]]]):
# noinspection PyArgumentList
TransformerTaggingTokenizer.dict_combine.fset(self, dictionary)
def transform_batch(self, batch: Dict[str, Any], results: Dict[str, Any] = None, cls_is_bos=False,
sep_is_eos=False) -> Dict[str, Any]:
"""
This method is overridden to honor the zero-indexed tokens used in the custom dict. Although for a tokenizer
cls_is_bos = sep_is_eos = True, its tokens don't contain [CLS] or [SEP]. This behaviour is adopted from
early versions and is kept to avoid migration efforts.
Args:
batch: A batch of samples.
results: Predicted results from other tasks which might be useful for this task to utilize. Say a dep task
uses both token and pos as features; then it needs both tok and pos results to make a batch.
cls_is_bos: First token in this batch is BOS.
sep_is_eos: Last token in this batch is EOS.
Returns:
A batch.
"""
return batch
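# A minimal sketch of the BMES scheme the task above tags with: each character gets
# B(egin)/M(iddle)/E(nd)/S(ingle) and words are cut after every E or S. Pure Python,
# for illustration; the real decoder also honors dict_force/dict_combine spans.
def _bmes_to_words(chars, tags):
    words, buf = [], ''
    for char, tag in zip(chars, tags):
        buf += char
        if tag in ('E', 'S'):
            words.append(buf)
            buf = ''
    if buf:
        words.append(buf)  # tolerate a dangling B/M at the end
    return words
assert _bmes_to_words(list('商品和服务'), ['B', 'E', 'S', 'B', 'E']) == ['商品', '和', '服务']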
================================================
FILE: hanlp/components/mtl/tasks/ud.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-17 21:54
import logging
from typing import Dict, Any, List, Union, Iterable, Callable
import torch
from torch.utils.data import DataLoader
from hanlp.common.dataset import SamplerBuilder, PadSequenceDataLoader
from hanlp_common.document import Document
from hanlp.common.transform import VocabDict, PunctuationMask
from hanlp.components.mtl.tasks import Task
from hanlp_common.conll import CoNLLUWord
from hanlp.components.parsers.ud.ud_model import UniversalDependenciesDecoder
from hanlp.components.parsers.ud.ud_parser import UniversalDependenciesParser
from hanlp.components.parsers.ud.util import generate_lemma_rule, append_bos
from hanlp.layers.scalar_mix import ScalarMixWithDropoutBuilder
from hanlp.metrics.metric import Metric
from hanlp.metrics.mtl import MetricDict
from hanlp_common.util import merge_locals_kwargs
class UniversalDependenciesParsing(Task, UniversalDependenciesParser):
def __init__(self,
trn: str = None,
dev: str = None,
tst: str = None,
sampler_builder: SamplerBuilder = None,
dependencies: str = None,
scalar_mix: ScalarMixWithDropoutBuilder = None,
use_raw_hidden_states=False,
lr=None,
separate_optimizer=False,
cls_is_bos=True,
sep_is_eos=False,
n_mlp_arc=768,
n_mlp_rel=256,
mlp_dropout=.33,
tree=False,
proj=False,
punct=False,
max_seq_len=None,
**kwargs) -> None:
r"""Universal Dependencies Parsing (lemmatization, features, PoS tagging and dependency parsing) implementation
of "75 Languages, 1 Model: Parsing Universal Dependencies Universally" (:cite:`kondratyuk-straka-2019-75`).
Args:
trn: Path to training set.
dev: Path to dev set.
tst: Path to test set.
sampler_builder: A builder which builds a sampler.
dependencies: Its dependencies on other tasks.
scalar_mix: A builder which builds a `ScalarMixWithDropout` object.
use_raw_hidden_states: Whether to use raw hidden states from transformer without any pooling.
lr: Learning rate for this task.
separate_optimizer: Use customized separate optimizer for this task.
cls_is_bos: ``True`` to treat the first token as ``BOS``.
sep_is_eos: ``True`` to treat the last token as ``EOS``.
n_mlp_arc: Number of features for arc representation.
n_mlp_rel: Number of features for rel representation.
mlp_dropout: Dropout applied to MLPs.
tree: ``True`` to enforce tree constraint.
proj: ``True`` for projective parsing.
punct: ``True`` to include punctuations in evaluation.
max_seq_len: Prune samples longer than this length. Useful for reducing GPU consumption.
**kwargs: Not used.
"""
super().__init__(**merge_locals_kwargs(locals(), kwargs))
self.vocabs = VocabDict()
def build_dataloader(self, data, transform: Callable = None, training=False, device=None,
logger: logging.Logger = None, cache=False, gradient_accumulation=1, **kwargs) -> DataLoader:
_transform = [generate_lemma_rule, append_bos, self.vocabs, transform]
if isinstance(data, str) and not self.config.punct:
_transform.append(PunctuationMask('token', 'punct_mask'))
dataset = UniversalDependenciesParser.build_dataset(self, data, _transform)
dataset.purge_cache()
if self.vocabs.mutable:
UniversalDependenciesParser.build_vocabs(self, dataset, logger, transformer=True)
max_seq_len = self.config.get('max_seq_len', None)
if max_seq_len and isinstance(data, str):
dataset.prune(lambda x: len(x['token_input_ids']) > max_seq_len, logger)
return PadSequenceDataLoader(
batch_sampler=self.sampler_builder.build(self.compute_lens(data, dataset),
shuffle=training, gradient_accumulation=gradient_accumulation),
device=device,
dataset=dataset,
pad={'arc': 0})
def compute_loss(self, batch: Dict[str, Any],
output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any], criterion) -> \
Union[torch.FloatTensor, Dict[str, torch.FloatTensor]]:
return output[0]['loss'] / 4  # average over the 4 UD subtasks (lemma, upos, feats, dep)
def decode_output(self, output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
mask: torch.BoolTensor, batch: Dict[str, Any], decoder: torch.nn.Module, **kwargs) -> Union[
Dict[str, Any], Any]:
return UniversalDependenciesParser.decode_output(self, *output, batch)
def update_metrics(self, batch: Dict[str, Any],
output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
prediction: Dict[str, Any], metric: Union[MetricDict, Metric]):
UniversalDependenciesParser.update_metrics(self, metric, batch, *output)
# noinspection PyMethodOverriding
def build_model(self,
encoder_size,
n_mlp_arc,
n_mlp_rel,
mlp_dropout,
training=True,
**kwargs) -> torch.nn.Module:
return UniversalDependenciesDecoder(
encoder_size,
n_mlp_arc,
n_mlp_rel,
mlp_dropout,
len(self.vocabs.rel),
len(self.vocabs.lemma),
len(self.vocabs.pos),
len(self.vocabs.feat),
0,
0
)
def build_metric(self, **kwargs):
return UniversalDependenciesParser.build_metric(self)
def input_is_flat(self, data) -> bool:
return UniversalDependenciesParser.input_is_flat(self, data)
def prediction_to_result(self, prediction: Dict[str, Any], batch: Dict[str, Any]) -> List:
yield from UniversalDependenciesParser.prediction_to_human(self, prediction, batch)
def feed_batch(self, h: torch.FloatTensor, batch: Dict[str, torch.Tensor], mask: torch.BoolTensor,
decoder: torch.nn.Module):
mask = self.compute_mask(batch)
output_dict = decoder(h, batch, mask)
if decoder.training:
mask = mask.clone()
mask[:, 0] = 0
return output_dict, mask
def finalize_document(self, doc: Document, task_name: str):
lem = []
pos = []
feat = []
dep = []
for sent in doc[task_name]:
sent: List[CoNLLUWord] = sent
lem.append([x.lemma for x in sent])
pos.append([x.upos for x in sent])
feat.append([x.feats for x in sent])
dep.append([(x.head, x.deprel) for x in sent])
promoted = 0
if 'lem' not in doc:
doc['lem'] = lem
promoted += 1
if 'pos' not in doc:
doc['pos'] = pos
promoted += 1
if 'fea' not in doc:
doc['fea'] = feat
promoted += 1
if 'dep' not in doc:
doc['dep'] = dep
promoted += 1
if promoted == 4:
doc.pop(task_name)
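# Illustratively, ``finalize_document`` above promotes the per-word CoNLL-U
# attributes to top-level Document fields when no other task has claimed them, then
# drops the raw task output. A hand-made before/after for one sentence:
#
#   before: {'ud': [[CoNLLUWord(form='He', lemma='he', upos='PRON', head=2, deprel='nsubj'), ...]]}
#   after:  {'lem': [['he', ...]], 'pos': [['PRON', ...]],
#            'fea': [[None, ...]], 'dep': [[(2, 'nsubj'), ...]]}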
================================================
FILE: hanlp/components/ner/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-07-21 17:22
================================================
FILE: hanlp/components/ner/biaffine_ner/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-07-21 18:41
================================================
FILE: hanlp/components/ner/biaffine_ner/biaffine_ner.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-07-09 18:13
import logging
from typing import Union, List, Callable, Dict, Any
from hanlp_common.constant import IDX
from hanlp.common.structure import History
from hanlp.components.ner.biaffine_ner.biaffine_ner_model import BiaffineNamedEntityRecognitionModel
from hanlp.datasets.ner.loaders.json_ner import JsonNERDataset, unpack_ner
from hanlp.layers.transformers.utils import build_optimizer_scheduler_with_transformer
import torch
from torch.utils.data import DataLoader
from hanlp.common.dataset import PadSequenceDataLoader
from hanlp.common.torch_component import TorchComponent
from hanlp.common.transform import FieldLength, TransformList
from hanlp.common.vocab import Vocab
from hanlp.layers.embeddings.embedding import Embedding
from hanlp.metrics.f1 import F1
from hanlp.utils.time_util import CountdownTimer
from hanlp_common.util import merge_locals_kwargs, reorder
class BiaffineNamedEntityRecognizer(TorchComponent):
def __init__(self, **kwargs) -> None:
"""An implementation of Named Entity Recognition as Dependency Parsing (:cite:`yu-etal-2020-named`). It treats
every possible span as a candidate of entity and predicts its entity label. Non-entity spans are assigned NULL
label to be excluded. The label prediction is done with a biaffine layer (:cite:`dozat:17a`). As it makes no
assumption about the spans, it naturally supports flat NER and nested NER.
Args:
**kwargs: Predefined config.
"""
super().__init__(**kwargs)
self.model: BiaffineNamedEntityRecognitionModel = None
def build_optimizer(self,
trn,
epochs,
lr,
adam_epsilon,
weight_decay,
warmup_steps,
transformer_lr,
**kwargs):
# noinspection PyProtectedMember
if self.use_transformer:
num_training_steps = len(trn) * epochs // self.config.get('gradient_accumulation', 1)
optimizer, scheduler = build_optimizer_scheduler_with_transformer(self.model,
self._get_transformer(),
lr, transformer_lr,
num_training_steps, warmup_steps,
weight_decay, adam_epsilon)
else:
optimizer = torch.optim.Adam(self.model.parameters(), self.config.lr)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
optimizer=optimizer,
mode='max',
factor=0.5,
patience=2,
verbose=True,
)
return optimizer, scheduler
@property
def use_transformer(self):
return 'token' not in self.vocabs
def _get_transformer(self):
return getattr(self.model_.embed, 'transformer', None)
def build_criterion(self, **kwargs):
pass
# noinspection PyProtectedMember
def build_metric(self, **kwargs) -> F1:
return F1()
def execute_training_loop(self,
trn: DataLoader,
dev: DataLoader,
epochs,
criterion,
optimizer,
metric,
save_dir,
logger: logging.Logger,
devices,
gradient_accumulation=1,
**kwargs):
best_epoch, best_metric = 0, -1
optimizer, scheduler = optimizer
history = History()
timer = CountdownTimer(epochs)
ratio_width = len(f'{len(trn)}/{len(trn)}')
for epoch in range(1, epochs + 1):
logger.info(f"[yellow]Epoch {epoch} / {epochs}:[/yellow]")
self.fit_dataloader(trn, criterion, optimizer, metric, logger, history=history,
gradient_accumulation=gradient_accumulation,
linear_scheduler=scheduler if self._get_transformer() else None)
if dev:
self.evaluate_dataloader(dev, criterion, metric, logger, ratio_width=ratio_width)
report = f'{timer.elapsed_human}/{timer.total_time_human}'
dev_score = metric.score
if not self._get_transformer():
scheduler.step(dev_score)
if dev_score > best_metric:
self.save_weights(save_dir)
best_metric = dev_score
report += ' [red]saved[/red]'
timer.log(report, ratio_percentage=False, newline=True, ratio=False)
return best_metric
def fit_dataloader(self,
trn: DataLoader,
criterion,
optimizer,
metric,
logger: logging.Logger,
linear_scheduler=None,
history: History = None,
gradient_accumulation=1,
**kwargs):
self.model.train()
timer = CountdownTimer(history.num_training_steps(len(trn), gradient_accumulation=gradient_accumulation))
total_loss = 0
self.reset_metrics(metric)
for batch in trn:
optimizer.zero_grad()
output_dict = self.feed_batch(batch)
self.update_metrics(batch, output_dict, metric)
loss = output_dict['loss']
if gradient_accumulation and gradient_accumulation > 1:
loss /= gradient_accumulation
loss.backward()
total_loss += loss.item()
if history.step(gradient_accumulation):
if self.config.grad_norm:
torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.config.grad_norm)
optimizer.step()
if linear_scheduler:
linear_scheduler.step()
timer.log(self.report_metrics(total_loss / (timer.current + 1), metric), ratio_percentage=None,
logger=logger)
del loss
return total_loss / timer.total
# noinspection PyMethodOverriding
@torch.no_grad()
def evaluate_dataloader(self,
data: DataLoader,
criterion: Callable,
metric,
logger,
ratio_width=None,
output=False,
**kwargs):
self.model.eval()
self.reset_metrics(metric)
timer = CountdownTimer(len(data))
total_loss = 0
if output:
fp = open(output, 'w')
for batch in data:
output_dict = self.feed_batch(batch)
if output:
for sent, pred, gold in zip(batch['token'], output_dict['prediction'], batch['ner']):
fp.write('Tokens\t' + ' '.join(sent) + '\n')
fp.write('Pred\t' + '\t'.join(
['[' + ' '.join(sent[x:y + 1]) + f']/{label}' for x, y, label in pred]) + '\n')
fp.write('Gold\t' + '\t'.join(
['[' + ' '.join(sent[x:y + 1]) + f']/{label}' for x, y, label in gold]) + '\n')
fp.write('\n')
self.update_metrics(batch, output_dict, metric)
loss = output_dict['loss']
total_loss += loss.item()
timer.log(self.report_metrics(total_loss / (timer.current + 1), metric), ratio_percentage=None,
logger=logger,
ratio_width=ratio_width)
del loss
if output:
fp.close()
return total_loss / timer.total, metric
def build_model(self,
training=True,
**kwargs) -> torch.nn.Module:
# noinspection PyTypeChecker
# embed: torch.nn.Embedding = self.config.embed.module(vocabs=self.vocabs)[0].embed
model = BiaffineNamedEntityRecognitionModel(self.config,
self.config.embed.module(vocabs=self.vocabs),
self.config.context_layer,
len(self.vocabs.label))
return model
# noinspection PyMethodOverriding
def build_dataloader(self, data, batch_size, shuffle, device, logger: logging.Logger = None, vocabs=None,
sampler_builder=None,
gradient_accumulation=1,
**kwargs) -> DataLoader:
if vocabs is None:
vocabs = self.vocabs
transform = TransformList(unpack_ner, FieldLength('token'))
if isinstance(self.config.embed, Embedding):
transform.append(self.config.embed.transform(vocabs=vocabs))
transform.append(self.vocabs)
dataset = self.build_dataset(data, vocabs, transform)
if vocabs.mutable:
self.build_vocabs(dataset, logger, vocabs)
if 'token' in vocabs:
            # Bucket by sentence length (number of tokens)
            lens = [len(x['token']) for x in dataset]
else:
lens = [len(x['token_input_ids']) for x in dataset]
if sampler_builder:
sampler = sampler_builder.build(lens, shuffle, gradient_accumulation)
else:
sampler = None
return PadSequenceDataLoader(batch_sampler=sampler,
device=device,
dataset=dataset)
def build_dataset(self, data, vocabs, transform):
dataset = JsonNERDataset(data, transform=transform,
doc_level_offset=self.config.get('doc_level_offset', True),
tagset=self.config.get('tagset', None))
dataset.append_transform(vocabs)
if isinstance(data, str):
dataset.purge_cache() # Enable cache
return dataset
def predict(self, data: Union[List[str], List[List[str]]], batch_size: int = None, ret_tokens=True, **kwargs):
if not data:
return []
flat = self.input_is_flat(data)
if flat:
data = [data]
dataloader = self.build_dataloader([{'token': x} for x in data], batch_size, False, self.device)
predictions = []
orders = []
for batch in dataloader:
output_dict = self.feed_batch(batch)
token = batch['token']
prediction = output_dict['prediction']
self.prediction_to_result(token, prediction, predictions, ret_tokens)
orders.extend(batch[IDX])
predictions = reorder(predictions, orders)
if flat:
return predictions[0]
return predictions
@staticmethod
def prediction_to_result(token, prediction, predictions: List, ret_tokens: Union[bool, str]):
for tokens, ner in zip(token, prediction):
prediction_per_sent = []
for i, (b, e, l) in enumerate(ner):
if ret_tokens is not None:
entity = tokens[b: e + 1]
if isinstance(ret_tokens, str):
entity = ret_tokens.join(entity)
prediction_per_sent.append((entity, l, b, e + 1))
else:
prediction_per_sent.append((b, e + 1, l))
predictions.append(prediction_per_sent)
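    # Illustrative behaviour of ``prediction_to_result`` (hypothetical inputs):
    # given tokens ['West', 'Bank'] and prediction [(0, 1, 'LOC')],
    # ret_tokens=True yields [(['West', 'Bank'], 'LOC', 0, 2)], while
    # ret_tokens=' ' joins the tokens and yields [('West Bank', 'LOC', 0, 2)].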
@staticmethod
def input_is_flat(data):
return isinstance(data[0], str)
# noinspection PyMethodOverriding
def fit(self,
trn_data,
dev_data,
save_dir,
embed: Embedding,
context_layer,
sampler='sorting',
n_buckets=32,
batch_size=50,
lexical_dropout=0.5,
ffnn_size=150,
is_flat_ner=True,
doc_level_offset=True,
lr=1e-3,
transformer_lr=1e-5,
adam_epsilon=1e-6,
weight_decay=0.01,
warmup_steps=0.1,
grad_norm=5.0,
epochs=50,
loss_reduction='sum',
gradient_accumulation=1,
ret_tokens=True,
tagset=None,
sampler_builder=None,
devices=None,
logger=None,
seed=None,
**kwargs
):
"""
Args:
trn_data: Path to training set.
dev_data: Path to dev set.
save_dir: The directory to save trained component.
embed: Embeddings to use.
context_layer: A contextualization layer (transformer or RNN).
sampler: Sampler to use.
n_buckets: Number of buckets to use in KMeans sampler.
batch_size: The number of samples in a batch.
lexical_dropout: Dropout applied to hidden states of context layer.
ffnn_size: Feedforward size for MLPs extracting the head/tail representations.
is_flat_ner: ``True`` for flat NER, otherwise nested NER.
doc_level_offset: ``True`` to indicate the offsets in ``jsonlines`` are of document level.
lr: Learning rate for decoder.
transformer_lr: Learning rate for encoder.
adam_epsilon: The epsilon to use in Adam.
weight_decay: The weight decay to use.
warmup_steps: The number of warmup steps.
grad_norm: Gradient norm for clipping.
epochs: The number of epochs to train.
loss_reduction: The loss reduction used in aggregating losses.
gradient_accumulation: Number of mini-batches per update step.
ret_tokens: A delimiter between tokens in entities so that the surface form of an entity can be rebuilt.
tagset: Optional tagset to prune entities outside of this tagset from datasets.
sampler_builder: The builder to build sampler, which will override batch_size.
devices: Devices this component will live on.
logger: Any :class:`logging.Logger` instance.
seed: Random seed to reproduce this training.
**kwargs: Not used.
Returns:
            The best metrics on the dev set.
"""
return super().fit(**merge_locals_kwargs(locals(), kwargs))
def build_vocabs(self, dataset, logger, vocabs, lock=True, label_vocab_name='label', **kwargs):
vocabs[label_vocab_name] = label_vocab = Vocab(pad_token=None, unk_token=None)
        # Use the empty string label to indicate non-entity spans
label_vocab.add('')
timer = CountdownTimer(len(dataset))
for each in dataset:
timer.log('Building NER vocab [blink][yellow]...[/yellow][/blink]')
label_vocab.set_unk_as_safe_unk()
if lock:
vocabs.lock()
vocabs.summary(logger)
def reset_metrics(self, metrics):
metrics.reset()
def report_metrics(self, loss, metrics):
return f'loss: {loss:.4f} {metrics}'
def feed_batch(self, batch) -> Dict[str, Any]:
output_dict = self.model(batch)
output_dict['prediction'] = self.get_pred_ner(batch['token'], output_dict['candidate_ner_scores'])
return output_dict
def update_metrics(self, batch: dict, prediction: Union[Dict, List], metrics):
if isinstance(prediction, dict):
prediction = prediction['prediction']
assert len(prediction) == len(batch['ner'])
for pred, gold in zip(prediction, batch['ner']):
metrics(set(pred), set(gold))
def get_pred_ner(self, sentences, span_scores):
is_flat_ner = self.config.is_flat_ner
candidates = []
for sid, sent in enumerate(sentences):
for s in range(len(sent)):
for e in range(s, len(sent)):
candidates.append((sid, s, e))
top_spans = [[] for _ in range(len(sentences))]
span_scores_cpu = span_scores.tolist()
for i, type in enumerate(torch.argmax(span_scores, dim=-1).tolist()):
if type > 0:
sid, s, e = candidates[i]
top_spans[sid].append((s, e, type, span_scores_cpu[i][type]))
top_spans = [sorted(top_span, reverse=True, key=lambda x: x[3]) for top_span in top_spans]
sent_pred_mentions = [[] for _ in range(len(sentences))]
for sid, top_span in enumerate(top_spans):
for ns, ne, t, _ in top_span:
for ts, te, _ in sent_pred_mentions[sid]:
if ns < ts <= ne < te or ts < ns <= te < ne:
# for both nested and flat ner no clash is allowed
break
if is_flat_ner and (ns <= ts <= te <= ne or ts <= ns <= ne <= te):
# for flat ner nested mentions are not allowed
break
else:
sent_pred_mentions[sid].append((ns, ne, t))
pred_mentions = set((sid, s, e, t) for sid, spr in enumerate(sent_pred_mentions) for s, e, t in spr)
prediction = [[] for _ in range(len(sentences))]
idx_to_label = self.vocabs['label'].idx_to_token
for sid, s, e, t in sorted(pred_mentions):
prediction[sid].append((s, e, idx_to_label[t]))
return prediction
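    # A worked example of the decoding policy above (hypothetical spans, shown
    # as (start, end, label, score) after sorting by score):
    #   (1, 3, PER, 9.0) -> kept
    #   (2, 4, LOC, 8.0) -> dropped: it crosses the kept span (1, 3)
    #   (1, 2, ORG, 7.0) -> dropped when is_flat_ner=True because it nests
    #                       inside (1, 3); kept when is_flat_ner=False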
================================================
FILE: hanlp/components/ner/biaffine_ner/biaffine_ner_model.py
================================================
from typing import Dict
import torch
import torch.nn.functional as F
from torch import nn
import hanlp.utils.torch_util
from hanlp.layers.time_distributed import TimeDistributed
from ...parsers.biaffine.biaffine import Biaffine
def initializer_1d(input_tensor, initializer):
assert len(input_tensor.size()) == 1
input_tensor = input_tensor.view(-1, 1)
input_tensor = initializer(input_tensor)
return input_tensor.view(-1)
class BiaffineNamedEntityRecognitionModel(nn.Module):
def __init__(self, config, embed: torch.nn.Module, context_layer: torch.nn.Module, label_space_size):
super(BiaffineNamedEntityRecognitionModel, self).__init__()
self.config = config
self.lexical_dropout = float(self.config.lexical_dropout)
self.label_space_size = label_space_size
# Initialize layers and parameters
self.word_embedding_dim = embed.get_output_dim() # get the embedding dim
self.embed = embed
# Initialize context layer
self.context_layer = context_layer
context_layer_output_dim = context_layer.get_output_dim()
self.decoder = BiaffineNamedEntityRecognitionDecoder(context_layer_output_dim, config.ffnn_size,
label_space_size, config.loss_reduction)
def forward(self,
batch: Dict[str, torch.Tensor]
):
keys = 'token_length', 'begin_offset', 'end_offset', 'label_id'
sent_lengths, gold_starts, gold_ends, gold_labels = [batch.get(k, None) for k in keys]
masks = hanlp.utils.torch_util.lengths_to_mask(sent_lengths)
num_sentences, max_sent_length = masks.size()
raw_embeddings = self.embed(batch, mask=masks)
raw_embeddings = F.dropout(raw_embeddings, self.lexical_dropout, self.training)
contextualized_embeddings = self.context_layer(raw_embeddings, masks)
return self.decoder.decode(contextualized_embeddings, gold_starts, gold_ends, gold_labels, masks,
max_sent_length,
num_sentences, sent_lengths)
class BiaffineNamedEntityRecognitionDecoder(nn.Module):
def __init__(self, hidden_size, ffnn_size, label_space_size, loss_reduction='sum') -> None:
"""An implementation of the biaffine decoder in "Named Entity Recognition as Dependency Parsing"
(:cite:`yu-etal-2020-named`).
Args:
hidden_size: Size of hidden states.
ffnn_size: Feedforward size for MLPs extracting the head/tail representations.
label_space_size: Size of tag set.
loss_reduction: The loss reduction used in aggregating losses.
"""
super().__init__()
self.loss_reduction = loss_reduction
# MLPs
def new_mlp():
return TimeDistributed(nn.Linear(hidden_size, ffnn_size))
self.start_mlp = new_mlp()
self.end_mlp = new_mlp()
self.biaffine = Biaffine(ffnn_size, label_space_size)
def forward(self, contextualized_embeddings: torch.FloatTensor, batch: Dict[str, torch.Tensor], mask=None):
keys = 'token_length', 'begin_offset', 'end_offset', 'label_id'
sent_lengths, gold_starts, gold_ends, gold_labels = [batch.get(k, None) for k in keys]
if mask is None:
mask = hanlp.utils.torch_util.lengths_to_mask(sent_lengths)
num_sentences, max_sent_length = mask.size()
return self.decode(contextualized_embeddings, gold_starts, gold_ends, gold_labels, mask,
max_sent_length,
num_sentences, sent_lengths)
def get_dense_span_labels(self, span_starts, span_ends, span_labels, max_sentence_length):
num_sentences, max_spans_num = span_starts.size()
sentence_indices = torch.arange(0, num_sentences, device=span_starts.device).unsqueeze(1).expand(-1,
max_spans_num)
sparse_indices = torch.cat([sentence_indices.unsqueeze(2), span_starts.unsqueeze(2), span_ends.unsqueeze(2)],
dim=2)
rank = 3
dense_labels = torch.sparse.LongTensor(sparse_indices.view(num_sentences * max_spans_num, rank).t(),
span_labels.view(-1),
torch.Size([num_sentences] + [max_sentence_length] * (rank - 1))) \
.to_dense()
return dense_labels
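    # A minimal sketch of what ``get_dense_span_labels`` computes (hypothetical
    # values): for one sentence with a single gold span (1, 2) labelled 3 and
    # max_sentence_length=4,
    # >>> starts, ends, labels = torch.tensor([[1]]), torch.tensor([[2]]), torch.tensor([[3]])
    # the returned dense tensor has shape [1, 4, 4] and is zero everywhere
    # except dense_labels[0, 1, 2] == 3.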
def decode(self, contextualized_embeddings, gold_starts, gold_ends, gold_labels, masks, max_sent_length,
num_sentences, sent_lengths):
        # Apply MLPs to starts and ends: [num_sentences, max_sent_length, emb]
candidate_starts_emb = self.start_mlp(contextualized_embeddings)
candidate_ends_emb = self.end_mlp(contextualized_embeddings)
candidate_ner_scores = self.biaffine(candidate_starts_emb, candidate_ends_emb).permute([0, 2, 3, 1])
"""generate candidate spans with argument pruning"""
# Generate masks
candidate_scores_mask = masks.unsqueeze(1) & masks.unsqueeze(2)
device = sent_lengths.device
sentence_ends_leq_starts = (
~hanlp.utils.torch_util.lengths_to_mask(torch.arange(max_sent_length, device=device), max_sent_length)) \
.unsqueeze_(0).expand(num_sentences, -1, -1)
candidate_scores_mask &= sentence_ends_leq_starts
candidate_ner_scores = candidate_ner_scores[candidate_scores_mask]
predict_dict = {
"candidate_ner_scores": candidate_ner_scores,
}
if gold_starts is not None:
gold_ner_labels = self.get_dense_span_labels(gold_starts, gold_ends, gold_labels, max_sent_length)
loss = torch.nn.functional.cross_entropy(candidate_ner_scores,
gold_ner_labels[candidate_scores_mask],
reduction=self.loss_reduction)
predict_dict['loss'] = loss
return predict_dict
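    # The candidate mask built in ``decode`` keeps exactly the spans with
    # start <= end inside the sentence. For a sentence of length 3
    # (hypothetical), the surviving entries of its 3x3 score grid form the
    # upper triangle including the diagonal:
    #   (0,0) (0,1) (0,2)
    #         (1,1) (1,2)
    #               (2,2)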
================================================
FILE: hanlp/components/ner/ner_tf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-09-14 20:33
from abc import ABC
from typing import Union, Any, Tuple, Iterable
import tensorflow as tf
from hanlp.components.taggers.transformers.transformer_transform_tf import TransformerTransform
from hanlp.common.transform_tf import Transform
from hanlp.common.keras_component import KerasComponent
from hanlp.components.taggers.ngram_conv.ngram_conv_tagger import NgramConvTaggerTF
from hanlp.components.taggers.rnn_tagger_tf import RNNTaggerTF
from hanlp.components.taggers.transformers.transformer_tagger_tf import TransformerTaggerTF
from hanlp.metrics.chunking.sequence_labeling import iobes_to_span
from hanlp_common.util import merge_locals_kwargs
class IOBES_NamedEntityRecognizer(KerasComponent, ABC):
def predict_batch(self, batch, inputs=None):
for words, tags in zip(inputs, super().predict_batch(batch, inputs)):
yield from iobes_to_span(words, tags)
class IOBES_Transform(Transform):
def Y_to_outputs(self, Y: Union[tf.Tensor, Tuple[tf.Tensor]], gold=False, inputs=None, X=None,
batch=None) -> Iterable:
for words, tags in zip(inputs, super().Y_to_outputs(Y, gold, inputs=inputs, X=X, batch=batch)):
yield from iobes_to_span(words, tags)
class RNNNamedEntityRecognizerTF(RNNTaggerTF, IOBES_NamedEntityRecognizer):
def fit(self, trn_data: str, dev_data: str = None, save_dir: str = None, embeddings=100, embedding_trainable=False,
rnn_input_dropout=0.2, rnn_units=100, rnn_output_dropout=0.2, epochs=20, logger=None,
loss: Union[tf.keras.losses.Loss, str] = None,
optimizer: Union[str, tf.keras.optimizers.Optimizer] = 'adam', metrics='f1', batch_size=32,
dev_batch_size=32, lr_decay_per_epoch=None,
run_eagerly=False,
verbose=True, **kwargs):
# assert kwargs.get('run_eagerly', True), 'This component can only run eagerly'
# kwargs['run_eagerly'] = True
return super().fit(**merge_locals_kwargs(locals(), kwargs))
def build_loss(self, loss, **kwargs):
if not loss:
loss = tf.keras.losses.SparseCategoricalCrossentropy(
reduction=tf.keras.losses.Reduction.SUM_OVER_BATCH_SIZE,
from_logits=True)
return super().build_loss(loss, **kwargs)
class NgramConvNamedEntityRecognizerTF(NgramConvTaggerTF, IOBES_NamedEntityRecognizer):
def fit(self, trn_data: Any, dev_data: Any, save_dir: str, word_embed: Union[str, int, dict] = 200,
ngram_embed: Union[str, int, dict] = 50, embedding_trainable=True, window_size=4, kernel_size=3,
filters=(200, 200, 200, 200, 200), dropout_embed=0.2, dropout_hidden=0.2, weight_norm=True,
loss: Union[tf.keras.losses.Loss, str] = None,
optimizer: Union[str, tf.keras.optimizers.Optimizer] = 'adam', metrics='f1', batch_size=100,
epochs=100, logger=None, verbose=True, **kwargs):
return super().fit(trn_data, dev_data, save_dir, word_embed, ngram_embed, embedding_trainable, window_size,
kernel_size, filters, dropout_embed, dropout_hidden, weight_norm, loss, optimizer, metrics,
batch_size, epochs, logger, verbose, **kwargs)
class IOBES_TransformerTransform(IOBES_Transform, TransformerTransform):
pass
class TransformerNamedEntityRecognizerTF(TransformerTaggerTF):
def __init__(self, transform: TransformerTransform = None) -> None:
if not transform:
transform = IOBES_TransformerTransform()
super().__init__(transform)
def fit(self, trn_data, dev_data, save_dir, transformer, optimizer='adamw', learning_rate=5e-5, weight_decay_rate=0,
epsilon=1e-8, clipnorm=1.0, warmup_steps_ratio=0, use_amp=False, max_seq_length=128, batch_size=32,
epochs=3, metrics='f1', run_eagerly=False, logger=None, verbose=True, **kwargs):
return super().fit(**merge_locals_kwargs(locals(), kwargs))
================================================
FILE: hanlp/components/ner/rnn_ner.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-06-12 18:00
from typing import Any
import torch
from hanlp_common.util import merge_locals_kwargs
import hanlp.utils.span_util
from hanlp.components.taggers.rnn_tagger import RNNTagger
from hanlp.metrics.chunking.conlleval import SpanF1
class RNNNamedEntityRecognizer(RNNTagger):
def __init__(self, **kwargs) -> None:
"""An old-school RNN tagger using word2vec or fasttext embeddings.
Args:
**kwargs: Predefined config.
"""
super().__init__(**kwargs)
def build_metric(self, **kwargs):
return SpanF1(self.tagging_scheme)
def evaluate_dataloader(self, data, criterion, logger=None, ratio_width=None, **kwargs):
loss, metric = super().evaluate_dataloader(data, criterion, logger, ratio_width, **kwargs)
if logger:
logger.info(metric.result(True, False)[-1])
return loss, metric
def fit(self, trn_data, dev_data, save_dir, batch_size=50, epochs=100, embed=100, rnn_input=None, rnn_hidden=256,
drop=0.5, lr=0.001, patience=10, crf=True, optimizer='adam', token_key='token', tagging_scheme=None,
anneal_factor: float = 0.5, delimiter=None, anneal_patience=2, devices=None,
token_delimiter=None,
logger=None,
verbose=True, **kwargs):
return super().fit(**merge_locals_kwargs(locals(), kwargs))
def update_metrics(self, metric, logits, y, mask, batch, prediction):
logits = self.decode_output(logits, mask, batch)
if isinstance(logits, torch.Tensor):
logits = logits.tolist()
metric(self._id_to_tags(logits), batch['tag'])
def predict(self, tokens: Any, batch_size: int = None, **kwargs):
return super().predict(tokens, batch_size, **kwargs)
def predict_data(self, data, batch_size, **kwargs):
outputs = super().predict_data(data, batch_size)
tagging_scheme = self.tagging_scheme
if tagging_scheme == 'IOBES':
entities = [hanlp.utils.span_util.iobes_tags_to_spans(y) for y in outputs]
elif tagging_scheme == 'BIO':
entities = [hanlp.utils.span_util.bio_tags_to_spans(y) for y in outputs]
elif tagging_scheme == 'BIOUL':
entities = [hanlp.utils.span_util.bioul_tags_to_spans(y) for y in outputs]
else:
raise ValueError(f'Unrecognized tag scheme {tagging_scheme}')
for i, (tokens, es) in enumerate(zip(data, entities)):
outputs[i] = [(self.config.token_delimiter.join(tokens[b:e + 1]), t, b, e + 1) for t, (b, e) in es]
return outputs
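    # Illustrative conversion performed by ``predict_data`` (hypothetical tags,
    # assuming the inclusive-span convention of ``hanlp.utils.span_util``):
    # tokens ['Barack', 'Obama', 'visited', 'Paris'] with IOBES tags
    # ['B-PER', 'E-PER', 'O', 'S-LOC'] decode to spans
    # [('PER', (0, 1)), ('LOC', (3, 3))], rendered as
    # [('Barack Obama', 'PER', 0, 2), ('Paris', 'LOC', 3, 4)]
    # when config.token_delimiter is ' '.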
def save_config(self, save_dir, filename='config.json'):
if self.config.token_delimiter is None:
self.config.token_delimiter = '' if all(
[len(x) == 1 for x in self.vocabs[self.config.token_key].idx_to_token[-100:]]) else ' '
super().save_config(save_dir, filename)
================================================
FILE: hanlp/components/ner/transformer_ner.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-10-07 11:08
import functools
from typing import Union, List, Dict, Any, Set
from hanlp_trie import DictInterface, TrieDict
from hanlp.common.dataset import SamplerBuilder
from hanlp.components.taggers.transformers.transformer_tagger import TransformerTagger
from hanlp.metrics.chunking.sequence_labeling import get_entities
from hanlp.metrics.f1 import F1
from hanlp.datasets.ner.loaders.json_ner import prune_ner_tagset
from hanlp.utils.string_util import guess_delimiter
from hanlp_common.util import merge_locals_kwargs
class TransformerNamedEntityRecognizer(TransformerTagger):
def __init__(self, **kwargs) -> None:
r"""A simple tagger using transformers and a linear layer with an optional CRF
(:cite:`lafferty2001conditional`) layer for
        NER task. It can utilize a whitelist gazetteer, which is a dict mapping entity names to entity types.
        During decoding, it performs longest-prefix-matching over these entries to override the predictions of the
        underlying statistical model. It also uses a blacklist to mask out mis-predicted entities.
        .. Note:: For algorithm beginners, longest-prefix-matching is the prerequisite to understanding what a
            dictionary can and cannot do. The tutorial in `this book `_ can be very helpful.
Args:
**kwargs: Not used.
"""
super().__init__(**kwargs)
def build_metric(self, **kwargs):
return F1()
# noinspection PyMethodOverriding
def update_metrics(self, metric, logits, y, mask, batch, prediction):
for p, g in zip(prediction, self.tag_to_span(batch['tag'], batch)):
pred = set(p)
gold = set(g)
metric(pred, gold)
# noinspection PyMethodOverriding
def decode_output(self, logits, mask, batch, model=None):
output = super().decode_output(logits, mask, batch, model)
prediction = super().prediction_to_human(output, self.vocabs['tag'].idx_to_token, batch)
return self.tag_to_span(prediction, batch)
def tag_to_span(self, batch_tags, batch):
spans = []
sents = batch[self.config.token_key]
dict_whitelist = self.dict_whitelist
dict_blacklist = self.dict_blacklist
merge_types = self.config.get('merge_types', None)
for tags, tokens in zip(batch_tags, sents):
entities = get_entities(tags)
if dict_whitelist:
matches = dict_whitelist.tokenize(tokens)
if matches:
# Fix O E-LOC O like predictions
entities = get_entities(tags)
for label, start, end in entities:
if end - start == 1:
tags[start] = 'S-' + label
else:
tags[start] = 'B-' + label
for i in range(start + 1, end - 1):
tags[i] = 'I-' + label
tags[end - 1] = 'E-' + label
for start, end, label in matches:
if (not tags[start][0] in 'ME') and (not tags[end - 1][0] in 'BM'):
if end - start == 1:
tags[start] = 'S-' + label
else:
tags[start] = 'B-' + label
for i in range(start + 1, end - 1):
tags[i] = 'I-' + label
tags[end - 1] = 'E-' + label
entities = get_entities(tags)
if merge_types and len(entities) > 1:
merged_entities = []
begin = 0
for i in range(1, len(entities)):
if entities[begin][0] != entities[i][0] or entities[i - 1][2] != entities[i][1] \
or entities[i][0] not in merge_types:
merged_entities.append((entities[begin][0], entities[begin][1], entities[i - 1][2]))
begin = i
merged_entities.append((entities[begin][0], entities[begin][1], entities[-1][2]))
entities = merged_entities
if dict_blacklist:
pruned = []
delimiter_in_entity = self.config.get('delimiter_in_entity', ' ')
for label, start, end in entities:
entity = delimiter_in_entity.join(tokens[start:end])
if entity not in dict_blacklist:
pruned.append((label, start, end))
entities = pruned
spans.append(entities)
return spans
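    # A sketch of how the whitelist interacts with decoding above (hypothetical
    # dictionary and tags): for tokens ['I', 'live', 'in', 'New', 'York'] all
    # tagged 'O', a gazetteer entry {'New York': 'LOC'} producing the match
    # (3, 5, 'LOC') rewrites the tags to ['O', 'O', 'O', 'B-LOC', 'E-LOC'],
    # so get_entities() then yields [('LOC', 3, 5)].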
def decorate_spans(self, spans, batch):
batch_ner = []
delimiter_in_entity = self.config.get('delimiter_in_entity', ' ')
for spans_per_sent, tokens in zip(spans, batch.get(f'{self.config.token_key}_', batch[self.config.token_key])):
ner_per_sent = []
for label, start, end in spans_per_sent:
ner_per_sent.append((delimiter_in_entity.join(tokens[start:end]), label, start, end))
batch_ner.append(ner_per_sent)
return batch_ner
def generate_prediction_filename(self, tst_data, save_dir):
return super().generate_prediction_filename(tst_data.replace('.tsv', '.txt'), save_dir)
def prediction_to_human(self, pred, vocab, batch):
return self.decorate_spans(pred, batch)
def input_is_flat(self, tokens):
return tokens and isinstance(tokens, list) and isinstance(tokens[0], str)
def fit(self, trn_data, dev_data, save_dir, transformer,
delimiter_in_entity=None,
merge_types: List[str] = None,
average_subwords=False,
word_dropout: float = 0.2,
hidden_dropout=None,
layer_dropout=0,
scalar_mix=None,
grad_norm=5.0,
lr=5e-5,
transformer_lr=None,
adam_epsilon=1e-8,
weight_decay=0,
warmup_steps=0.1,
crf=False,
secondary_encoder=None,
reduction='sum',
batch_size=32,
sampler_builder: SamplerBuilder = None,
epochs=3,
tagset=None,
token_key='token',
max_seq_len=None,
sent_delimiter=None,
char_level=False,
hard_constraint=False,
transform=None,
logger=None,
seed=None,
devices: Union[float, int, List[int]] = None,
**kwargs):
"""Fit component to training set.
Args:
trn_data: Training set.
dev_data: Development set.
save_dir: The directory to save trained component.
transformer: An identifier of a pre-trained transformer.
            delimiter_in_entity: The delimiter between tokens in an entity, used to rebuild the surface form of an
                entity by joining its tokens during decoding.
merge_types: The types of consecutive entities to be merged.
average_subwords: ``True`` to average subword representations.
word_dropout: Dropout rate to randomly replace a subword with MASK.
hidden_dropout: Dropout rate applied to hidden states.
layer_dropout: Randomly zero out hidden states of a transformer layer.
scalar_mix: Layer attention.
grad_norm: Gradient norm for clipping.
lr: Learning rate for decoder.
            transformer_lr: Learning rate for encoder.
adam_epsilon: The epsilon to use in Adam.
weight_decay: The weight decay to use.
warmup_steps: The number of warmup steps.
crf: ``True`` to enable CRF (:cite:`lafferty2001conditional`).
secondary_encoder: An optional secondary encoder to provide enhanced representation by taking the hidden
states from the main encoder as input.
reduction: The loss reduction used in aggregating losses.
batch_size: The number of samples in a batch.
sampler_builder: The builder to build sampler, which will override batch_size.
epochs: The number of epochs to train.
tagset: Optional tagset to prune entities outside of this tagset from datasets.
token_key: The key to tokens in dataset.
max_seq_len: The maximum sequence length. Sequence longer than this will be handled by sliding
window.
sent_delimiter: Delimiter between sentences, like period or comma, which indicates a long sentence can
be split here.
            char_level: Whether the sequence length is measured at char level.
hard_constraint: Whether to enforce hard length constraint on sentences. If there is no ``sent_delimiter``
in a sentence, it will be split at a token anyway.
transform: An optional transform to be applied to samples. Usually a character normalization transform is
passed in.
devices: Devices this component will live on.
logger: Any :class:`logging.Logger` instance.
seed: Random seed to reproduce this training.
**kwargs: Not used.
Returns:
            The best metrics on the dev set.
"""
return super().fit(**merge_locals_kwargs(locals(), kwargs))
def build_vocabs(self, trn, logger, **kwargs):
super().build_vocabs(trn, logger, **kwargs)
if self.config.get('delimiter_in_entity', None) is None:
# Check the first sample to guess the delimiter between tokens in a NE
tokens = trn[0][self.config.token_key]
delimiter_in_entity = guess_delimiter(tokens)
            logger.info(f'Guessed that the delimiter between tokens in a named entity could be '
                        f'[blue]"{delimiter_in_entity}"[/blue]. If not, specify `delimiter_in_entity` in `fit()`.')
self.config.delimiter_in_entity = delimiter_in_entity
def build_dataset(self, data, transform=None, **kwargs):
dataset = super().build_dataset(data, transform, **kwargs)
if isinstance(data, str):
tagset = self.config.get('tagset', None)
if tagset:
dataset.append_transform(functools.partial(prune_ner_tagset, tagset=tagset))
return dataset
@property
def dict_whitelist(self) -> DictInterface:
return self.config.get('dict_whitelist', None)
@dict_whitelist.setter
def dict_whitelist(self, dictionary: Union[DictInterface, Union[Dict[str, Any], Set[str]]]):
if dictionary is not None and not isinstance(dictionary, DictInterface):
dictionary = TrieDict(dictionary)
self.config.dict_whitelist = dictionary
@property
def dict_blacklist(self) -> DictInterface:
return self.config.get('dict_blacklist', None)
@dict_blacklist.setter
def dict_blacklist(self, dictionary: Union[DictInterface, Union[Dict[str, Any], Set[str]]]):
if dictionary is not None and not isinstance(dictionary, DictInterface):
dictionary = TrieDict(dictionary)
self.config.dict_blacklist = dictionary
================================================
FILE: hanlp/components/parsers/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-22 12:46
================================================
FILE: hanlp/components/parsers/alg.py
================================================
# MIT License
#
# Copyright (c) 2020 Yu Zhang
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import torch
from hanlp_common.conll import isprojective
def kmeans(x, k, max_it=32):
r"""
KMeans algorithm for clustering the sentences by length.
Args:
x (list[int]):
The list of sentence lengths.
k (int):
The number of clusters.
            This is an approximate value. The final number of clusters can be less than or equal to `k`.
max_it (int):
Maximum number of iterations.
            If the centroids do not converge within `max_it` iterations, the algorithm stops early.
Returns:
list[float], list[list[int]]:
The first list contains average lengths of sentences in each cluster.
The second is the list of clusters holding indices of data points.
Examples:
>>> x = torch.randint(10,20,(10,)).tolist()
>>> x
[15, 10, 17, 11, 18, 13, 17, 19, 18, 14]
>>> centroids, clusters = kmeans(x, 3)
>>> centroids
[10.5, 14.0, 17.799999237060547]
>>> clusters
[[1, 3], [0, 5, 9], [2, 4, 6, 7, 8]]
"""
# the number of clusters must not be greater than the number of datapoints
x, k = torch.tensor(x, dtype=torch.float), min(len(x), k)
# collect unique datapoints
d = x.unique()
# initialize k centroids randomly
c = d[torch.randperm(len(d))[:k]]
# assign each datapoint to the cluster with the closest centroid
dists, y = torch.abs_(x.unsqueeze(-1) - c).min(-1)
for _ in range(max_it):
# if an empty cluster is encountered,
        # choose the farthest datapoint from the biggest cluster and move it to the empty one
mask = torch.arange(k).unsqueeze(-1).eq(y)
none = torch.where(~mask.any(-1))[0].tolist()
while len(none) > 0:
for i in none:
# the biggest cluster
b = torch.where(mask[mask.sum(-1).argmax()])[0]
# the datapoint farthest from the centroid of cluster b
f = dists[b].argmax()
# update the assigned cluster of f
y[b[f]] = i
# re-calculate the mask
mask = torch.arange(k).unsqueeze(-1).eq(y)
none = torch.where(~mask.any(-1))[0].tolist()
# update the centroids
c, old = (x * mask).sum(-1) / mask.sum(-1), c
# re-assign all datapoints to clusters
dists, y = torch.abs_(x.unsqueeze(-1) - c).min(-1)
# stop iteration early if the centroids converge
if c.equal(old):
break
    # assign all datapoints to the newly generated clusters
# the empty ones are discarded
assigned = y.unique().tolist()
# get the centroids of the assigned clusters
centroids = c[assigned].tolist()
# map all values of datapoints to buckets
clusters = [torch.where(y.eq(i))[0].tolist() for i in assigned]
return centroids, clusters
def eisner(scores, mask):
r"""
First-order Eisner algorithm for projective decoding.
References:
- Ryan McDonald, Koby Crammer and Fernando Pereira. 2005.
`Online Large-Margin Training of Dependency Parsers`_.
Args:
scores (~torch.Tensor): ``[batch_size, seq_len, seq_len]``.
Scores of all dependent-head pairs.
mask (~torch.BoolTensor): ``[batch_size, seq_len]``.
The mask to avoid parsing over padding tokens.
The first column serving as pseudo words for roots should be ``False``.
Returns:
~torch.Tensor:
A tensor with shape ``[batch_size, seq_len]`` for the resulting projective parse trees.
Examples:
>>> scores = torch.tensor([[[-13.5026, -18.3700, -13.0033, -16.6809],
[-36.5235, -28.6344, -28.4696, -31.6750],
[ -2.9084, -7.4825, -1.4861, -6.8709],
[-29.4880, -27.6905, -26.1498, -27.0233]]])
>>> mask = torch.tensor([[False, True, True, True]])
>>> eisner(scores, mask)
tensor([[0, 2, 0, 2]])
.. _Online Large-Margin Training of Dependency Parsers:
https://www.aclweb.org/anthology/P05-1012/
"""
lens = mask.sum(1)
batch_size, seq_len, _ = scores.shape
scores = scores.permute(2, 1, 0)
s_i = torch.full_like(scores, float('-inf'))
s_c = torch.full_like(scores, float('-inf'))
p_i = scores.new_zeros(seq_len, seq_len, batch_size).long()
p_c = scores.new_zeros(seq_len, seq_len, batch_size).long()
s_c.diagonal().fill_(0)
for w in range(1, seq_len):
n = seq_len - w
starts = p_i.new_tensor(range(n)).unsqueeze(0)
# ilr = C(i->r) + C(j->r+1)
ilr = stripe(s_c, n, w) + stripe(s_c, n, w, (w, 1))
# [batch_size, n, w]
il = ir = ilr.permute(2, 0, 1)
# I(j->i) = max(C(i->r) + C(j->r+1) + s(j->i)), i <= r < j
il_span, il_path = il.max(-1)
s_i.diagonal(-w).copy_(il_span + scores.diagonal(-w))
p_i.diagonal(-w).copy_(il_path + starts)
# I(i->j) = max(C(i->r) + C(j->r+1) + s(i->j)), i <= r < j
ir_span, ir_path = ir.max(-1)
s_i.diagonal(w).copy_(ir_span + scores.diagonal(w))
p_i.diagonal(w).copy_(ir_path + starts)
# C(j->i) = max(C(r->i) + I(j->r)), i <= r < j
cl = stripe(s_c, n, w, (0, 0), 0) + stripe(s_i, n, w, (w, 0))
cl_span, cl_path = cl.permute(2, 0, 1).max(-1)
s_c.diagonal(-w).copy_(cl_span)
p_c.diagonal(-w).copy_(cl_path + starts)
# C(i->j) = max(I(i->r) + C(r->j)), i < r <= j
cr = stripe(s_i, n, w, (0, 1)) + stripe(s_c, n, w, (1, w), 0)
cr_span, cr_path = cr.permute(2, 0, 1).max(-1)
s_c.diagonal(w).copy_(cr_span)
s_c[0, w][lens.ne(w)] = float('-inf')
p_c.diagonal(w).copy_(cr_path + starts + 1)
def backtrack(p_i, p_c, heads, i, j, complete):
if i == j:
return
if complete:
r = p_c[i, j]
backtrack(p_i, p_c, heads, i, r, False)
backtrack(p_i, p_c, heads, r, j, True)
else:
r, heads[j] = p_i[i, j], i
i, j = sorted((i, j))
backtrack(p_i, p_c, heads, i, r, True)
backtrack(p_i, p_c, heads, j, r + 1, True)
preds = []
p_c = p_c.permute(2, 0, 1).cpu()
p_i = p_i.permute(2, 0, 1).cpu()
for i, length in enumerate(lens.tolist()):
heads = p_c.new_zeros(length + 1, dtype=torch.long)
backtrack(p_i[i], p_c[i], heads, 0, length, True)
preds.append(heads.to(mask.device))
return pad(preds, total_length=seq_len).to(mask.device)
def backtrack(p_i, p_c, heads, i, j, complete):
if i == j:
return
if complete:
r = p_c[i, j]
backtrack(p_i, p_c, heads, i, r, False)
backtrack(p_i, p_c, heads, r, j, True)
else:
r, heads[j] = p_i[i, j], i
i, j = sorted((i, j))
backtrack(p_i, p_c, heads, i, r, True)
backtrack(p_i, p_c, heads, j, r + 1, True)
def stripe(x, n, w, offset=(0, 0), dim=1):
"""r'''Returns a diagonal stripe of the tensor.
Args:
x: Tensor
n: int
w: int
offset: tuple (Default value = (0)
dim: int (Default value = 1)
Example:
0):
Returns:
>>> x = torch.arange(25).view(5, 5)
>>> x
tensor([[ 0, 1, 2, 3, 4],
[ 5, 6, 7, 8, 9],
[10, 11, 12, 13, 14],
[15, 16, 17, 18, 19],
[20, 21, 22, 23, 24]])
>>> stripe(x, 2, 3, (1, 1))
tensor([[ 6, 7, 8],
[12, 13, 14]])
>>> stripe(x, 2, 3, dim=0)
tensor([[ 0, 5, 10],
[ 6, 11, 16]])
"""
x, seq_len = x.contiguous(), x.size(1)
stride, numel = list(x.stride()), x[0, 0].numel()
stride[0] = (seq_len + 1) * numel
stride[1] = (1 if dim == 1 else seq_len) * numel
return x.as_strided(size=(n, w, *x.shape[2:]),
stride=stride,
storage_offset=(offset[0] * seq_len + offset[1]) * numel)
def cky(scores, mask):
r"""
The implementation of `Cocke-Kasami-Younger`_ (CKY) algorithm to parse constituency trees.
References:
- Yu Zhang, Houquan Zhou and Zhenghua Li. 2020.
`Fast and Accurate Neural CRF Constituency Parsing`_.
Args:
scores (~torch.Tensor): ``[batch_size, seq_len, seq_len]``.
Scores of all candidate constituents.
mask (~torch.BoolTensor): ``[batch_size, seq_len, seq_len]``.
The mask to avoid parsing over padding tokens.
For each square matrix in a batch, the positions except upper triangular part should be masked out.
Returns:
Sequences of factorized predicted bracketed trees that are traversed in pre-order.
Examples:
>>> scores = torch.tensor([[[ 2.5659, 1.4253, -2.5272, 3.3011],
[ 1.3687, -0.5869, 1.0011, 3.3020],
[ 1.2297, 0.4862, 1.1975, 2.5387],
[-0.0511, -1.2541, -0.7577, 0.2659]]])
>>> mask = torch.tensor([[[False, True, True, True],
[False, False, True, True],
[False, False, False, True],
[False, False, False, False]]])
>>> cky(scores, mask)
[[(0, 3), (0, 1), (1, 3), (1, 2), (2, 3)]]
.. _Cocke-Kasami-Younger:
https://en.wikipedia.org/wiki/CYK_algorithm
.. _Fast and Accurate Neural CRF Constituency Parsing:
https://www.ijcai.org/Proceedings/2020/560/
"""
lens = mask[:, 0].sum(-1)
scores = scores.permute(1, 2, 0)
seq_len, seq_len, batch_size = scores.shape
s = scores.new_zeros(seq_len, seq_len, batch_size)
p = scores.new_zeros(seq_len, seq_len, batch_size).long()
for w in range(1, seq_len):
n = seq_len - w
starts = p.new_tensor(range(n)).unsqueeze(0)
if w == 1:
s.diagonal(w).copy_(scores.diagonal(w))
continue
# [n, w, batch_size]
s_span = stripe(s, n, w - 1, (0, 1)) + stripe(s, n, w - 1, (1, w), 0)
# [batch_size, n, w]
s_span = s_span.permute(2, 0, 1)
# [batch_size, n]
s_span, p_span = s_span.max(-1)
s.diagonal(w).copy_(s_span + scores.diagonal(w))
p.diagonal(w).copy_(p_span + starts + 1)
def backtrack(p, i, j):
if j == i + 1:
return [(i, j)]
split = p[i][j]
ltree = backtrack(p, i, split)
rtree = backtrack(p, split, j)
return [(i, j)] + ltree + rtree
p = p.permute(2, 0, 1).tolist()
trees = [backtrack(p[i], 0, length) if length else [] for i, length in enumerate(lens.tolist())]
return trees
def istree(sequence, proj=False, multiroot=False):
r"""
    Checks if the arcs form a valid dependency tree.
Args:
sequence (list[int]):
A list of head indices.
proj (bool):
If ``True``, requires the tree to be projective. Default: ``False``.
multiroot (bool):
            If ``False``, requires the tree to contain only a single root. Default: ``False``.
Returns:
        ``True`` if the arcs form a valid tree, ``False`` otherwise.
Examples:
>>> istree([3, 0, 0, 3], multiroot=True)
True
>>> istree([3, 0, 0, 3], proj=True)
False
"""
if proj and not isprojective(sequence):
return False
n_roots = sum(head == 0 for head in sequence)
if n_roots == 0:
return False
if not multiroot and n_roots > 1:
return False
if any(i == head for i, head in enumerate(sequence, 1)):
return False
return next(tarjan(sequence), None) is None
def tarjan(sequence):
r"""
Tarjan algorithm for finding Strongly Connected Components (SCCs) of a graph.
Args:
sequence (list):
List of head indices.
Yields:
        A list of indices that make up an SCC. All self-loops are ignored.
Examples:
>>> next(tarjan([2, 5, 0, 3, 1])) # (1 -> 5 -> 2 -> 1) is a cycle
[2, 5, 1]
"""
sequence = [-1] + sequence
# record the search order, i.e., the timestep
dfn = [-1] * len(sequence)
    # record the smallest timestep in an SCC
low = [-1] * len(sequence)
# push the visited into the stack
stack, onstack = [], [False] * len(sequence)
def connect(i, timestep):
dfn[i] = low[i] = timestep[0]
timestep[0] += 1
stack.append(i)
onstack[i] = True
for j, head in enumerate(sequence):
if head != i:
continue
if dfn[j] == -1:
yield from connect(j, timestep)
low[i] = min(low[i], low[j])
elif onstack[j]:
low[i] = min(low[i], dfn[j])
# a SCC is completed
if low[i] == dfn[i]:
cycle = [stack.pop()]
while cycle[-1] != i:
onstack[cycle[-1]] = False
cycle.append(stack.pop())
onstack[i] = False
# ignore the self-loop
if len(cycle) > 1:
yield cycle
timestep = [0]
for i in range(len(sequence)):
if dfn[i] == -1:
yield from connect(i, timestep)
def chuliu_edmonds(s):
r"""
ChuLiu/Edmonds algorithm for non-projective decoding.
Some code is borrowed from `tdozat's implementation`_.
Descriptions of notations and formulas can be found in
`Non-projective Dependency Parsing using Spanning Tree Algorithms`_.
Notes:
The algorithm does not guarantee to parse a single-root tree.
References:
- Ryan McDonald, Fernando Pereira, Kiril Ribarov and Jan Hajic. 2005.
`Non-projective Dependency Parsing using Spanning Tree Algorithms`_.
Args:
s (~torch.Tensor): ``[seq_len, seq_len]``.
Scores of all dependent-head pairs.
Returns:
~torch.Tensor:
A tensor with shape ``[seq_len]`` for the resulting non-projective parse tree.
.. _tdozat's implementation:
https://github.com/tdozat/Parser-v3
.. _Non-projective Dependency Parsing using Spanning Tree Algorithms:
https://www.aclweb.org/anthology/H05-1066/
"""
s[0, 1:] = float('-inf')
# prevent self-loops
s.diagonal()[1:].fill_(float('-inf'))
# select heads with highest scores
tree = s.argmax(-1)
    # lazily return the first cycle found by Tarjan's algorithm
cycle = next(tarjan(tree.tolist()[1:]), None)
# if the tree has no cycles, then it is a MST
if not cycle:
return tree
# indices of cycle in the original tree
cycle = torch.tensor(cycle)
# indices of noncycle in the original tree
noncycle = torch.ones(len(s)).index_fill_(0, cycle, 0)
noncycle = torch.where(noncycle.gt(0))[0]
def contract(s):
# heads of cycle in original tree
cycle_heads = tree[cycle]
# scores of cycle in original tree
s_cycle = s[cycle, cycle_heads]
# calculate the scores of cycle's potential dependents
# s(c->x) = max(s(x'->x)), x in noncycle and x' in cycle
s_dep = s[noncycle][:, cycle]
# find the best cycle head for each noncycle dependent
deps = s_dep.argmax(1)
# calculate the scores of cycle's potential heads
# s(x->c) = max(s(x'->x) - s(a(x')->x') + s(cycle)), x in noncycle and x' in cycle
# a(v) is the predecessor of v in cycle
# s(cycle) = sum(s(a(v)->v))
s_head = s[cycle][:, noncycle] - s_cycle.view(-1, 1) + s_cycle.sum()
# find the best noncycle head for each cycle dependent
heads = s_head.argmax(0)
contracted = torch.cat((noncycle, torch.tensor([-1])))
# calculate the scores of contracted graph
s = s[contracted][:, contracted]
# set the contracted graph scores of cycle's potential dependents
s[:-1, -1] = s_dep[range(len(deps)), deps]
# set the contracted graph scores of cycle's potential heads
s[-1, :-1] = s_head[heads, range(len(heads))]
return s, heads, deps
# keep track of the endpoints of the edges into and out of cycle for reconstruction later
s, heads, deps = contract(s)
# y is the contracted tree
y = chuliu_edmonds(s)
# exclude head of cycle from y
y, cycle_head = y[:-1], y[-1]
# fix the subtree with no heads coming from the cycle
# len(y) denotes heads coming from the cycle
subtree = y < len(y)
# add the nodes to the new tree
tree[noncycle[subtree]] = noncycle[y[subtree]]
# fix the subtree with heads coming from the cycle
subtree = ~subtree
# add the nodes to the tree
tree[noncycle[subtree]] = cycle[deps[subtree]]
# fix the root of the cycle
cycle_root = heads[cycle_head]
# break the cycle and add the root of the cycle to the tree
tree[cycle[cycle_root]] = noncycle[cycle_head]
return tree
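# A tiny illustrative run (hypothetical scores; index 0 is the pseudo ROOT):
# >>> s = torch.tensor([[0., 0., 0.],
# ...                   [9., 0., 5.],
# ...                   [3., 8., 0.]])
# >>> chuliu_edmonds(s)
# tensor([0, 0, 1])
# Token 1 attaches to ROOT and token 2 to token 1; the greedy head choices
# already form a tree, so no cycle contraction is needed.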
def mst(scores, mask, multiroot=False):
r"""
    MST algorithm for decoding non-projective trees.
    This is a wrapper for the ChuLiu/Edmonds algorithm.
    The algorithm first runs ChuLiu/Edmonds to parse a tree, then checks for multiple roots.
    If ``multiroot=False`` and multiple roots do exist, the algorithm seeks the best
    single-root tree by iterating over all possible single-root trees parsed by ChuLiu/Edmonds.
    Otherwise the resulting trees are directly taken as the final outputs.
Args:
scores (~torch.Tensor): ``[batch_size, seq_len, seq_len]``.
Scores of all dependent-head pairs.
mask (~torch.BoolTensor): ``[batch_size, seq_len]``.
The mask to avoid parsing over padding tokens.
The first column serving as pseudo words for roots should be ``False``.
        multiroot (bool):
            If ``False``, ensures the parse is a single-root tree. Default: ``False``.
Returns:
~torch.Tensor:
A tensor with shape ``[batch_size, seq_len]`` for the resulting non-projective parse trees.
Examples:
>>> scores = torch.tensor([[[-11.9436, -13.1464, -6.4789, -13.8917],
[-60.6957, -60.2866, -48.6457, -63.8125],
[-38.1747, -49.9296, -45.2733, -49.5571],
[-19.7504, -23.9066, -9.9139, -16.2088]]])
>>> scores[:, 0, 1:] = float('-inf')
>>> scores.diagonal(0, 1, 2)[1:].fill_(float('-inf'))
>>> mask = torch.tensor([[False, True, True, True]])
>>> mst(scores, mask)
tensor([[0, 2, 0, 2]])
"""
batch_size, seq_len, _ = scores.shape
scores = scores.detach().cpu().unbind()
preds = []
for i, length in enumerate(mask.sum(1).tolist()):
s = scores[i][:length + 1, :length + 1]
tree = chuliu_edmonds(s)
roots = torch.where(tree[1:].eq(0))[0] + 1
if not multiroot and len(roots) > 1:
s_root = s[:, 0]
s_best = float('-inf')
s = s.index_fill(1, torch.tensor(0), float('-inf'))
for root in roots:
s[:, 0] = float('-inf')
s[root, 0] = s_root[root]
t = chuliu_edmonds(s)
s_tree = s[1:].gather(1, t[1:].unsqueeze(-1)).sum()
if s_tree > s_best:
s_best, tree = s_tree, t
preds.append(tree)
return pad(preds, total_length=seq_len).to(mask.device)
def eisner2o(scores, mask):
r"""
Second-order Eisner algorithm for projective decoding.
This is an extension of the first-order one that further incorporates sibling scores into tree scoring.
References:
- Ryan McDonald and Fernando Pereira. 2006.
`Online Learning of Approximate Dependency Parsing Algorithms`_.
Args:
scores (~torch.Tensor, ~torch.Tensor):
            A tuple of two tensors representing the first-order and second-order scores respectively.
The first (``[batch_size, seq_len, seq_len]``) holds scores of all dependent-head pairs.
The second (``[batch_size, seq_len, seq_len, seq_len]``) holds scores of all dependent-head-sibling triples.
mask (~torch.BoolTensor): ``[batch_size, seq_len]``.
The mask to avoid parsing over padding tokens.
The first column serving as pseudo words for roots should be ``False``.
Returns:
~torch.Tensor:
A tensor with shape ``[batch_size, seq_len]`` for the resulting projective parse trees.
Examples:
>>> s_arc = torch.tensor([[[ -2.8092, -7.9104, -0.9414, -5.4360],
[-10.3494, -7.9298, -3.6929, -7.3985],
[ 1.1815, -3.8291, 2.3166, -2.7183],
[ -3.9776, -3.9063, -1.6762, -3.1861]]])
>>> s_sib = torch.tensor([[[[ 0.4719, 0.4154, 1.1333, 0.6946],
[ 1.1252, 1.3043, 2.1128, 1.4621],
[ 0.5974, 0.5635, 1.0115, 0.7550],
[ 1.1174, 1.3794, 2.2567, 1.4043]],
[[-2.1480, -4.1830, -2.5519, -1.8020],
[-1.2496, -1.7859, -0.0665, -0.4938],
[-2.6171, -4.0142, -2.9428, -2.2121],
[-0.5166, -1.0925, 0.5190, 0.1371]],
[[ 0.5827, -1.2499, -0.0648, -0.0497],
[ 1.4695, 0.3522, 1.5614, 1.0236],
[ 0.4647, -0.7996, -0.3801, 0.0046],
[ 1.5611, 0.3875, 1.8285, 1.0766]],
[[-1.3053, -2.9423, -1.5779, -1.2142],
[-0.1908, -0.9699, 0.3085, 0.1061],
[-1.6783, -2.8199, -1.8853, -1.5653],
[ 0.3629, -0.3488, 0.9011, 0.5674]]]])
>>> mask = torch.tensor([[False, True, True, True]])
>>> eisner2o((s_arc, s_sib), mask)
tensor([[0, 2, 0, 2]])
.. _Online Learning of Approximate Dependency Parsing Algorithms:
https://www.aclweb.org/anthology/E06-1011/
"""
# the end position of each sentence in a batch
lens = mask.sum(1)
s_arc, s_sib = scores
batch_size, seq_len, _ = s_arc.shape
# [seq_len, seq_len, batch_size]
s_arc = s_arc.permute(2, 1, 0)
# [seq_len, seq_len, seq_len, batch_size]
s_sib = s_sib.permute(2, 1, 3, 0)
s_i = torch.full_like(s_arc, float('-inf'))
s_s = torch.full_like(s_arc, float('-inf'))
s_c = torch.full_like(s_arc, float('-inf'))
p_i = s_arc.new_zeros(seq_len, seq_len, batch_size).long()
p_s = s_arc.new_zeros(seq_len, seq_len, batch_size).long()
p_c = s_arc.new_zeros(seq_len, seq_len, batch_size).long()
s_c.diagonal().fill_(0)
for w in range(1, seq_len):
# n denotes the number of spans to iterate,
# from span (0, w) to span (n, n+w) given width w
n = seq_len - w
starts = p_i.new_tensor(range(n)).unsqueeze(0)
# I(j->i) = max(I(j->r) + S(j->r, i)), i < r < j |
# C(j->j) + C(i->j-1))
# + s(j->i)
# [n, w, batch_size]
il = stripe(s_i, n, w, (w, 1)) + stripe(s_s, n, w, (1, 0), 0)
il += stripe(s_sib[range(w, n + w), range(n)], n, w, (0, 1))
# [n, 1, batch_size]
il0 = stripe(s_c, n, 1, (w, w)) + stripe(s_c, n, 1, (0, w - 1))
# il0[0] are set to zeros since the scores of the complete spans starting from 0 are always -inf
il[:, -1] = il0.index_fill_(0, lens.new_tensor(0), 0).squeeze(1)
il_span, il_path = il.permute(2, 0, 1).max(-1)
s_i.diagonal(-w).copy_(il_span + s_arc.diagonal(-w))
p_i.diagonal(-w).copy_(il_path + starts + 1)
# I(i->j) = max(I(i->r) + S(i->r, j), i < r < j |
# C(i->i) + C(j->i+1))
# + s(i->j)
# [n, w, batch_size]
ir = stripe(s_i, n, w) + stripe(s_s, n, w, (0, w), 0)
ir += stripe(s_sib[range(n), range(w, n + w)], n, w)
ir[0] = float('-inf')
# [n, 1, batch_size]
ir0 = stripe(s_c, n, 1) + stripe(s_c, n, 1, (w, 1))
ir[:, 0] = ir0.squeeze(1)
ir_span, ir_path = ir.permute(2, 0, 1).max(-1)
s_i.diagonal(w).copy_(ir_span + s_arc.diagonal(w))
p_i.diagonal(w).copy_(ir_path + starts)
# [n, w, batch_size]
slr = stripe(s_c, n, w) + stripe(s_c, n, w, (w, 1))
slr_span, slr_path = slr.permute(2, 0, 1).max(-1)
# S(j, i) = max(C(i->r) + C(j->r+1)), i <= r < j
s_s.diagonal(-w).copy_(slr_span)
p_s.diagonal(-w).copy_(slr_path + starts)
# S(i, j) = max(C(i->r) + C(j->r+1)), i <= r < j
s_s.diagonal(w).copy_(slr_span)
p_s.diagonal(w).copy_(slr_path + starts)
# C(j->i) = max(C(r->i) + I(j->r)), i <= r < j
cl = stripe(s_c, n, w, (0, 0), 0) + stripe(s_i, n, w, (w, 0))
cl_span, cl_path = cl.permute(2, 0, 1).max(-1)
s_c.diagonal(-w).copy_(cl_span)
p_c.diagonal(-w).copy_(cl_path + starts)
# C(i->j) = max(I(i->r) + C(r->j)), i < r <= j
cr = stripe(s_i, n, w, (0, 1)) + stripe(s_c, n, w, (1, w), 0)
cr_span, cr_path = cr.permute(2, 0, 1).max(-1)
s_c.diagonal(w).copy_(cr_span)
# disable multi words to modify the root
s_c[0, w][lens.ne(w)] = float('-inf')
p_c.diagonal(w).copy_(cr_path + starts + 1)
def backtrack(p_i, p_s, p_c, heads, i, j, flag):
if i == j:
return
if flag == 'c':
r = p_c[i, j]
backtrack(p_i, p_s, p_c, heads, i, r, 'i')
backtrack(p_i, p_s, p_c, heads, r, j, 'c')
elif flag == 's':
r = p_s[i, j]
i, j = sorted((i, j))
backtrack(p_i, p_s, p_c, heads, i, r, 'c')
backtrack(p_i, p_s, p_c, heads, j, r + 1, 'c')
elif flag == 'i':
r, heads[j] = p_i[i, j], i
if r == i:
r = i + 1 if i < j else i - 1
backtrack(p_i, p_s, p_c, heads, j, r, 'c')
else:
backtrack(p_i, p_s, p_c, heads, i, r, 'i')
backtrack(p_i, p_s, p_c, heads, r, j, 's')
preds = []
p_i = p_i.permute(2, 0, 1).cpu()
p_s = p_s.permute(2, 0, 1).cpu()
p_c = p_c.permute(2, 0, 1).cpu()
for i, length in enumerate(lens.tolist()):
heads = p_c.new_zeros(length + 1, dtype=torch.long)
backtrack(p_i[i], p_s[i], p_c[i], heads, 0, length, 'c')
preds.append(heads.to(mask.device))
return pad(preds, total_length=seq_len).to(mask.device)
def pad(tensors, padding_value=0, total_length=None):
size = [len(tensors)] + [max(tensor.size(i) for tensor in tensors)
for i in range(len(tensors[0].size()))]
if total_length is not None:
assert total_length >= size[1]
size[1] = total_length
out_tensor = tensors[0].data.new(*size).fill_(padding_value)
for i, tensor in enumerate(tensors):
out_tensor[i][[slice(0, i) for i in tensor.size()]] = tensor
return out_tensor
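# Illustrative behaviour of ``pad`` (hypothetical tensors):
# >>> pad([torch.tensor([1, 2]), torch.tensor([3, 4, 5])])
# tensor([[1, 2, 0],
#         [3, 4, 5]])
# Passing ``total_length=4`` would stretch the second dim to 4 instead.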
def decode_dep(s_arc, mask, tree=False, proj=False):
r"""
Args:
s_arc (~torch.Tensor): ``[batch_size, seq_len, seq_len]``.
Scores of all possible arcs.
mask (~torch.BoolTensor): ``[batch_size, seq_len]``.
The mask for covering the unpadded tokens.
tree (bool):
If ``True``, ensures to output well-formed trees. Default: ``False``.
proj (bool):
If ``True``, ensures to output projective trees. Default: ``False``.
Returns:
~torch.Tensor, ~torch.Tensor:
Predicted arcs and labels of shape ``[batch_size, seq_len]``.
"""
lens = mask.sum(1)
arc_preds = s_arc.argmax(-1)
bad = [not istree(seq[1:i + 1], proj) for i, seq in zip(lens.tolist(), arc_preds.tolist())]
if tree and any(bad):
if proj:
alg = eisner
else:
alg = mst
s_arc.diagonal(0, 1, 2)[1:].fill_(float('-inf'))
arc_preds[bad] = alg(s_arc[bad], mask[bad])
return arc_preds
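# A hedged usage sketch: with ``tree=True``, any prediction failing ``istree``
# is re-decoded, projectively via ``eisner`` when ``proj=True`` and
# non-projectively via ``mst`` otherwise; well-formed predictions are kept.
# >>> arc_preds = decode_dep(s_arc, mask, tree=True, proj=False)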
================================================
FILE: hanlp/components/parsers/alg_tf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-26 19:49
# Ported from the PyTorch implementation https://github.com/zysite/biaffine-parser
from typing import List
import numpy as np
import tensorflow as tf
from collections import defaultdict
def nonzero(t: tf.Tensor) -> tf.Tensor:
return tf.where(t > 0)
def view(t: tf.Tensor, *dims) -> tf.Tensor:
return tf.reshape(t, dims)
def arange(n: int) -> tf.Tensor:
return tf.range(n)
def randperm(n: int) -> tf.Tensor:
return tf.random.shuffle(arange(n))
def tolist(t: tf.Tensor) -> List:
if isinstance(t, tf.Tensor):
t = t.numpy()
return t.tolist()
def kmeans(x, k, seed=None):
"""See https://github.com/zysite/biaffine-parser/blob/master/parser/utils/alg.py#L7
Args:
x(list): Lengths of sentences
k(int):
seed: (Default value = None)
Returns:
"""
x = tf.constant(x, dtype=tf.float32)
# count the frequency of each datapoint
d, indices, f = tf.unique_with_counts(x, tf.int32)
f = tf.cast(f, tf.float32)
# calculate the sum of the values of the same datapoints
total = d * f
# initialize k centroids randomly
c, old = tf.random.shuffle(d, seed)[:k], None
# assign labels to each datapoint based on centroids
dists = tf.abs(tf.expand_dims(d, -1) - c)
y = tf.argmin(dists, axis=-1, output_type=tf.int32)
dists = tf.gather_nd(dists, tf.transpose(tf.stack([tf.range(tf.shape(dists)[0], dtype=tf.int32), y])))
# make sure number of datapoints is greater than that of clusters
assert len(d) >= k, f"unable to assign {len(d)} datapoints to {k} clusters"
while old is None or not tf.reduce_all(c == old):
# if an empty cluster is encountered,
# choose the farthest datapoint from the biggest cluster
        # and move it to the empty one
for i in range(k):
if not tf.reduce_any(y == i):
mask = tf.cast(y == tf.expand_dims(tf.range(k, dtype=tf.int32), -1), tf.float32)
lens = tf.reduce_sum(mask, axis=-1)
biggest = view(nonzero(mask[tf.argmax(lens)]), -1)
farthest = tf.argmax(tf.gather(dists, biggest))
tf.tensor_scatter_nd_update(y, tf.expand_dims(tf.expand_dims(biggest[farthest], -1), -1), [i])
mask = tf.cast(y == tf.expand_dims(tf.range(k, dtype=tf.int32), -1), tf.float32)
# update the centroids
c, old = tf.cast(tf.reduce_sum(total * mask, axis=-1), tf.float32) / tf.cast(tf.reduce_sum(f * mask, axis=-1),
tf.float32), c
# re-assign all datapoints to clusters
dists = tf.abs(tf.expand_dims(d, -1) - c)
y = tf.argmin(dists, axis=-1, output_type=tf.int32)
dists = tf.gather_nd(dists, tf.transpose(tf.stack([tf.range(tf.shape(dists)[0], dtype=tf.int32), y])))
    # assign all datapoints to the newly generated clusters
# without considering the empty ones
y, (assigned, _) = tf.gather(y, indices), tf.unique(y)
# get the centroids of the assigned clusters
centroids = tf.gather(c, assigned).numpy().tolist()
# map all values of datapoints to buckets
clusters = [tf.squeeze(tf.where(y == i), axis=-1).numpy().tolist() for i in assigned]
return centroids, clusters
# ***************************************************************
class Tarjan:
"""Computes Tarjan's algorithm for finding strongly connected components (cycles) of a graph"""
def __init__(self, prediction, tokens):
"""
Parameters
----------
prediction : numpy.ndarray
a predicted dependency tree where prediction[dep_idx] = head_idx
tokens : numpy.ndarray
the tokens we care about (i.e. exclude _GO, _EOS, and _PAD)
"""
self._edges = defaultdict(set)
self._vertices = set((0,))
for dep, head in enumerate(prediction[tokens]):
self._vertices.add(dep + 1)
self._edges[head].add(dep + 1)
self._indices = {}
self._lowlinks = {}
self._onstack = defaultdict(lambda: False)
self._SCCs = []
index = 0
stack = []
for v in self.vertices:
if v not in self.indices:
self.strongconnect(v, index, stack)
# =============================================================
def strongconnect(self, v, index, stack):
"""
Args:
v:
index:
stack:
Returns:
"""
self._indices[v] = index
self._lowlinks[v] = index
index += 1
stack.append(v)
self._onstack[v] = True
for w in self.edges[v]:
if w not in self.indices:
self.strongconnect(w, index, stack)
self._lowlinks[v] = min(self._lowlinks[v], self._lowlinks[w])
elif self._onstack[w]:
self._lowlinks[v] = min(self._lowlinks[v], self._indices[w])
if self._lowlinks[v] == self._indices[v]:
self._SCCs.append(set())
while stack[-1] != v:
w = stack.pop()
self._onstack[w] = False
self._SCCs[-1].add(w)
w = stack.pop()
self._onstack[w] = False
self._SCCs[-1].add(w)
return
# ======================
@property
def edges(self):
return self._edges
@property
def vertices(self):
return self._vertices
@property
def indices(self):
return self._indices
@property
def SCCs(self):
return self._SCCs
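# Editorial sketch (illustrative numbers only): detecting a cycle in a
# predicted head array with the class above.
#
#   prediction = np.array([0, 2, 1])  # token 1 and token 2 head each other
#   tokens = np.arange(1, 3)          # exclude the ROOT at index 0
#   cycles = [scc for scc in Tarjan(prediction, tokens).SCCs if len(scc) > 1]
#   # -> [{1, 2}]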
def tarjan(parse_probs, length, tokens_to_keep, ensure_tree=True):
"""Adapted from Timothy Dozat https://github.com/tdozat/Parser/blob/master/lib/models/nn.py
Args:
parse_probs(NDArray): seq_len x seq_len, the probability of arcs
length(NDArray): sentence length including ROOT
tokens_to_keep(NDArray): mask matrix
ensure_tree: (Default value = True)
Returns:
parse_preds(NDArray): the predicted head for each token; when ensure_tree is True, the result is guaranteed to be a single-rooted tree without cycles
"""
if ensure_tree:
I = np.eye(len(tokens_to_keep))
# block loops and pad heads
parse_probs = parse_probs * tokens_to_keep * (1 - I)
parse_preds = np.argmax(parse_probs, axis=1)
tokens = np.arange(1, length)
roots = np.where(parse_preds[tokens] == 0)[0] + 1
# ensure at least one root
if len(roots) < 1:
# The current root probabilities
root_probs = parse_probs[tokens, 0]
# The current head probabilities
old_head_probs = parse_probs[tokens, parse_preds[tokens]]
# Get new potential root probabilities
new_root_probs = root_probs / old_head_probs
# Select the most probable root
new_root = tokens[np.argmax(new_root_probs)]
# Make the change
parse_preds[new_root] = 0
# ensure at most one root
elif len(roots) > 1:
# The probabilities of the current heads
root_probs = parse_probs[roots, 0]
# Set the probability of depending on the root zero
parse_probs[roots, 0] = 0
# Get new potential heads and their probabilities
new_heads = np.argmax(parse_probs[roots][:, tokens], axis=1) + 1
new_head_probs = parse_probs[roots, new_heads] / root_probs
# Select the most probable root
new_root = roots[np.argmin(new_head_probs)]
# Make the change
parse_preds[roots] = new_heads
parse_preds[new_root] = 0
# remove cycles
tarjan = Tarjan(parse_preds, tokens)
for SCC in tarjan.SCCs:
if len(SCC) > 1:
dependents = set()
to_visit = set(SCC)
while len(to_visit) > 0:
node = to_visit.pop()
if node not in dependents:
dependents.add(node)
to_visit.update(tarjan.edges[node])
# The indices of the nodes that participate in the cycle
cycle = np.array(list(SCC))
# The probabilities of the current heads
old_heads = parse_preds[cycle]
old_head_probs = parse_probs[cycle, old_heads]
# Set the probability of depending on a non-head to zero
non_heads = np.array(list(dependents))
parse_probs[np.repeat(cycle, len(non_heads)), np.repeat([non_heads], len(cycle), axis=0).flatten()] = 0
# Get new potential heads and their probabilities
new_heads = np.argmax(parse_probs[cycle][:, tokens], axis=1) + 1
new_head_probs = parse_probs[cycle, new_heads] / old_head_probs
# Select the most probable change
change = np.argmax(new_head_probs)
changed_cycle = cycle[change]
old_head = old_heads[change]
new_head = new_heads[change]
# Make the change
parse_preds[changed_cycle] = new_head
tarjan.edges[new_head].add(changed_cycle)
tarjan.edges[old_head].remove(changed_cycle)
return parse_preds
else:
# block and pad heads
parse_probs = parse_probs * tokens_to_keep
parse_preds = np.argmax(parse_probs, axis=1)
return parse_preds
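# Editorial worked example (probabilities are made up): a 1 <-> 2 cycle
# plus a missing root, both repaired by the function above.
#
#   parse_probs = np.array([[.1, .1, .1],
#                           [.1, .1, .8],   # token 1 prefers token 2
#                           [.1, .8, .1]])  # token 2 prefers token 1
#   heads = tarjan(parse_probs, length=3, tokens_to_keep=np.ones(3))
#   # token 1 is re-attached to ROOT, giving heads [_, 0, 1]
#   # (index 0 is the ROOT placeholder and its value is ignored)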
def rel_argmax(rel_probs, length, root, ensure_tree=True):
"""Fix the relation predictions by heuristic rules.
Args:
rel_probs(NDArray): seq_len x rel_size
length: real sentence length
root: the id of the ROOT relation
ensure_tree: (Default value = True)
Returns:
rel_preds(NDArray): the predicted relation id for each token, with exactly one ROOT relation when ensure_tree is True
"""
if ensure_tree:
tokens = np.arange(1, length)
rel_preds = np.argmax(rel_probs, axis=1)
roots = np.where(rel_preds[tokens] == root)[0] + 1
if len(roots) < 1:
rel_preds[1 + np.argmax(rel_probs[tokens, root])] = root
elif len(roots) > 1:
root_probs = rel_probs[roots, root]
rel_probs[roots, root] = 0
new_rel_preds = np.argmax(rel_probs[roots], axis=1)
new_rel_probs = rel_probs[roots, new_rel_preds] / root_probs
new_root = roots[np.argmin(new_rel_probs)]
rel_preds[roots] = new_rel_preds
rel_preds[new_root] = root
return rel_preds
else:
rel_preds = np.argmax(rel_probs, axis=1)
return rel_preds
================================================
FILE: hanlp/components/parsers/biaffine/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-05-08 20:43
================================================
FILE: hanlp/components/parsers/biaffine/biaffine.py
================================================
# MIT License
#
# Copyright (c) 2020 Yu Zhang
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import torch
import torch.nn as nn
class Biaffine(nn.Module):
r"""
Biaffine layer for first-order scoring.
This function has a tensor of weights :math:`W` and bias terms if needed.
The score :math:`s(x, y)` of the vector pair :math:`(x, y)` is computed as :math:`x^T W y`,
in which :math:`x` and :math:`y` can be concatenated with bias terms.
References:
- Timothy Dozat and Christopher D. Manning. 2017.
`Deep Biaffine Attention for Neural Dependency Parsing`_.
Args:
n_in (int):
The size of the input feature.
n_out (int):
The number of output channels.
bias_x (bool):
If ``True``, adds a bias term for tensor :math:`x`. Default: ``True``.
bias_y (bool):
If ``True``, adds a bias term for tensor :math:`y`. Default: ``True``.
.. _Deep Biaffine Attention for Neural Dependency Parsing:
https://openreview.net/forum?id=Hk95PK9le
"""
def __init__(self, n_in, n_out=1, bias_x=True, bias_y=True):
super().__init__()
self.n_in = n_in
self.n_out = n_out
self.bias_x = bias_x
self.bias_y = bias_y
self.weight = nn.Parameter(torch.Tensor(n_out, n_in + bias_x, n_in + bias_y))
self.reset_parameters()
def __repr__(self):
s = f"n_in={self.n_in}, n_out={self.n_out}"
if self.bias_x:
s += f", bias_x={self.bias_x}"
if self.bias_y:
s += f", bias_y={self.bias_y}"
return f"{self.__class__.__name__}({s})"
def reset_parameters(self):
nn.init.zeros_(self.weight)
def forward(self, x, y):
r"""
Args:
x (torch.Tensor): ``[batch_size, seq_len, n_in]``.
y (torch.Tensor): ``[batch_size, seq_len, n_in]``.
Returns:
~torch.Tensor:
A scoring tensor of shape ``[batch_size, n_out, seq_len, seq_len]``.
If ``n_out=1``, the dimension for ``n_out`` will be squeezed automatically.
"""
if self.bias_x:
x = torch.cat((x, torch.ones_like(x[..., :1])), -1)
if self.bias_y:
y = torch.cat((y, torch.ones_like(y[..., :1])), -1)
# [batch_size, n_out, seq_len, seq_len]
s = torch.einsum('bxi,oij,byj->boxy', x, self.weight, y)
# remove dim 1 if n_out == 1
s = s.squeeze(1)
return s
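# Editorial shape walk-through for the einsum above:
#   x: [batch, seq, n_in + bias_x]            -> 'bxi'
#   W: [n_out, n_in + bias_x, n_in + bias_y]  -> 'oij'
#   y: [batch, seq, n_in + bias_y]            -> 'byj'
#   s[b, o, x, y] = sum_{i, j} x[b, x, i] * W[o, i, j] * y[b, y, j]
# e.g. Biaffine(n_in=4, n_out=3)(torch.randn(2, 5, 4), torch.randn(2, 5, 4))
# yields a [2, 3, 5, 5] score tensor (squeeze(1) is a no-op for n_out > 1).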
================================================
FILE: hanlp/components/parsers/biaffine/biaffine_2nd_dep.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-08-06 13:57
import functools
from typing import Union, List, Any
import torch
from hanlp_common.constant import UNK
from hanlp.common.transform import TransformList
from hanlp.common.vocab import Vocab
from hanlp.components.parsers.biaffine.biaffine import Biaffine
from hanlp.components.parsers.biaffine.biaffine_model import BiaffineDecoder, \
EncoderWithContextualLayer
from hanlp.components.parsers.biaffine.biaffine_dep import BiaffineDependencyParser
from hanlp.components.parsers.biaffine.biaffine_sdp import BiaffineSemanticDependencyParser
from hanlp_common.conll import CoNLLUWord, CoNLLSentence
from hanlp.components.parsers.parse_alg import add_secondary_arcs_by_preds
from hanlp.datasets.parsing.loaders.conll_dataset import append_bos
from hanlp.datasets.parsing.semeval15 import unpack_deps_to_head_deprel, merge_head_deprel_with_2nd
from hanlp.metrics.mtl import MetricDict
from hanlp_common.util import merge_locals_kwargs
from transformers import PreTrainedModel, PreTrainedTokenizer
class BiaffineSeparateDecoder(torch.nn.Module):
def __init__(self, hidden_size, config) -> None:
super().__init__()
self.biaffine_decoder = BiaffineDecoder(hidden_size,
config.n_mlp_arc,
config.n_mlp_rel,
config.mlp_dropout,
config.n_rels)
self.biaffine_decoder_2nd = BiaffineDecoder(hidden_size,
config.n_mlp_arc,
config.n_mlp_rel,
config.mlp_dropout,
config.n_rels_2nd)
def forward(self, x, mask):
return tuple(zip(self.biaffine_decoder(x, mask), self.biaffine_decoder_2nd(x, mask)))
class BiaffineJointDecoder(BiaffineDecoder):
def __init__(self, hidden_size, config) -> None:
super().__init__(hidden_size, config.n_mlp_arc, config.n_mlp_rel, config.mlp_dropout, config.n_rels)
# the Biaffine layers for secondary dep
self.arc_attn_2nd = Biaffine(n_in=config.n_mlp_arc,
bias_x=True,
bias_y=False)
self.rel_attn_2nd = Biaffine(n_in=config.n_mlp_rel,
n_out=config.n_rels,
bias_x=True,
bias_y=True)
def forward(self, x, mask=None, **kwargs: Any):
arc_d, arc_h, rel_d, rel_h = self.apply_mlps(x)
s_arc, s_rel = self.decode(arc_d, arc_h, rel_d, rel_h, mask, self.arc_attn, self.rel_attn)
s_arc_2nd, s_rel_2nd = self.decode(arc_d, arc_h, rel_d, rel_h, mask, self.arc_attn_2nd, self.rel_attn_2nd)
return (s_arc, s_arc_2nd), (s_rel, s_rel_2nd)
class BiaffineSecondaryModel(torch.nn.Module):
def __init__(self, config, pretrained_embed: torch.Tensor = None, transformer: PreTrainedModel = None,
transformer_tokenizer: PreTrainedTokenizer = None):
super().__init__()
self.encoder = EncoderWithContextualLayer(config, pretrained_embed, transformer, transformer_tokenizer)
self.decoder = BiaffineJointDecoder(self.encoder.hidden_size, config) if config.joint \
else BiaffineSeparateDecoder(self.encoder.hidden_size, config)
def forward(self,
words=None,
feats=None,
input_ids=None,
token_span=None,
mask=None, lens=None, **kwargs):
x, mask = self.encoder(words, feats, input_ids, token_span, mask, lens)
return self.decoder(x, mask)
class BiaffineSecondaryParser(BiaffineDependencyParser):
def __init__(self) -> None:
super().__init__()
self.model: BiaffineSecondaryModel = None
def build_dataset(self, data, bos_transform=None):
transform = TransformList(functools.partial(append_bos, pos_key='UPOS'),
functools.partial(unpack_deps_to_head_deprel, pad_rel=self.config.pad_rel,
arc_key='arc_2nd',
rel_key='rel_2nd'))
if self.config.joint:
transform.append(merge_head_deprel_with_2nd)
if bos_transform:
transform.append(bos_transform)
return super().build_dataset(data, transform)
def build_criterion(self, **kwargs):
# noinspection PyCallByClass
return super().build_criterion(**kwargs), (BiaffineSemanticDependencyParser.build_criterion(self, **kwargs))
def fit(self, trn_data, dev_data, save_dir, feat=None, n_embed=100, pretrained_embed=None, transformer=None,
average_subwords=False, word_dropout: float = 0.2, transformer_hidden_dropout=None, layer_dropout=0,
scalar_mix: int = None, embed_dropout=.33, n_lstm_hidden=400, n_lstm_layers=3, hidden_dropout=.33,
n_mlp_arc=500, n_mlp_rel=100, mlp_dropout=.33, lr=2e-3, transformer_lr=5e-5, mu=.9, nu=.9, epsilon=1e-12,
clip=5.0, decay=.75, decay_steps=5000, patience=100, batch_size=None, sampler_builder=None,
lowercase=False, epochs=50000, tree=False, punct=False, min_freq=2,
apply_constraint=True, joint=False, no_cycle=False, root=None,
logger=None,
verbose=True, unk=UNK, pad_rel=None, max_sequence_length=512, devices: Union[float, int, List[int]] = None,
transform=None, **kwargs):
return super().fit(**merge_locals_kwargs(locals(), kwargs))
def build_vocabs(self, dataset, logger=None, transformer=None):
self.vocabs['rel_2nd'] = rel_2nd = Vocab(pad_token=self.config.pad_rel, unk_token=self.config.pad_rel)
if self.config.joint:
self.vocabs['rel'] = rel_2nd
super().build_vocabs(dataset, logger, transformer)
self.config.n_rels_2nd = len(rel_2nd)
def create_model(self, pretrained_embed, transformer):
return BiaffineSecondaryModel(self.config, pretrained_embed, transformer, self.transformer_tokenizer)
def compute_loss(self, arc_scores, rel_scores, arcs, rels, mask, criterion, batch=None):
arc_scores_1st, arc_scores_2nd, rel_scores_1st, rel_scores_2nd = self.unpack_scores(arc_scores, rel_scores)
loss_1st = super().compute_loss(arc_scores_1st, rel_scores_1st, arcs, rels, mask, criterion[0], batch)
mask = self.compute_mask(arc_scores_2nd, batch, mask)
# noinspection PyCallByClass
loss_2nd = BiaffineSemanticDependencyParser.compute_loss(self, arc_scores_2nd, rel_scores_2nd,
batch['arc_2nd'], batch['rel_2nd_id'], mask,
criterion[1], batch)
return loss_1st + loss_2nd
@staticmethod
def compute_mask(arc_scores_2nd, batch, mask_1st):
mask = batch.get('mask_2nd', None)
if mask is None:
batch['mask_2nd'] = mask = BiaffineSemanticDependencyParser.convert_to_3d_mask(arc_scores_2nd, mask_1st)
return mask
def unpack_scores(self, arc_scores, rel_scores):
arc_scores_1st, arc_scores_2nd = arc_scores
rel_scores_1st, rel_scores_2nd = rel_scores
return arc_scores_1st, arc_scores_2nd, rel_scores_1st, rel_scores_2nd
def get_pad_dict(self):
d = super(BiaffineSecondaryParser, self).get_pad_dict()
d.update({'arc_2nd': False})
return d
def decode(self, arc_scores, rel_scores, mask, batch=None, predicting=None):
output_1st, output_2nd = batch.get('outputs', (None, None))
if output_1st is None:
arc_scores_1st, arc_scores_2nd, rel_scores_1st, rel_scores_2nd = self.unpack_scores(arc_scores, rel_scores)
output_1st = super().decode(arc_scores_1st, rel_scores_1st, mask)
mask = self.compute_mask(arc_scores_2nd, batch, mask)
# noinspection PyCallByClass
output_2nd = BiaffineSemanticDependencyParser.decode(self, arc_scores_2nd, rel_scores_2nd, mask, batch)
if self.config.get('no_cycle'):
assert predicting, 'The no-cycle constraint is not implemented for evaluation yet. If you are ' \
'interested, you are welcome to submit a pull request.'
root_rel_idx = self.vocabs['rel'].token_to_idx.get(self.config.get('root', None), None)
arc_pred_1st, rel_pred_1st, arc_pred_2nd, rel_pred_2nd = *output_1st, *output_2nd
arc_scores_2nd = arc_scores_2nd.transpose(1, 2).cpu().detach().numpy()
arc_pred_2nd = arc_pred_2nd.cpu().detach().numpy()
rel_pred_2nd = rel_pred_2nd.cpu().detach().numpy()
trees = arc_pred_1st.cpu().detach().numpy()
graphs = []
for i, (arc_scores, arc_preds, rel_preds, tree, tokens) in enumerate(
zip(arc_scores_2nd, arc_pred_2nd, rel_pred_2nd, trees, batch['token'])):
sent_len = len(tokens)
graph = add_secondary_arcs_by_preds(arc_scores, arc_preds[:sent_len, :sent_len], rel_preds,
tree[:sent_len], root_rel_idx)
graphs.append(graph[1:]) # Remove root
# if not predicting:
# # Write back to torch Tensor
# for d, hr in zip(graph):
# pass
output_2nd = None, graphs
return tuple(zip(output_1st, output_2nd))
def update_metric(self, arc_preds, rel_preds, arcs, rels, mask, puncts, metric, batch=None):
super().update_metric(arc_preds[0], rel_preds[0], arcs, rels, mask, puncts, metric['1st'], batch)
puncts = BiaffineSemanticDependencyParser.convert_to_3d_puncts(puncts, batch['mask_2nd'])
# noinspection PyCallByClass
BiaffineSemanticDependencyParser.update_metric(self, arc_preds[1], rel_preds[1], batch['arc_2nd'],
batch['rel_2nd_id'], batch['mask_2nd'], puncts, metric['2nd'],
batch)
def build_metric(self, **kwargs):
# noinspection PyCallByClass
return MetricDict({'1st': super().build_metric(**kwargs),
'2nd': BiaffineSemanticDependencyParser.build_metric(self, **kwargs)})
def collect_outputs_extend(self, predictions: list, arc_preds, rel_preds, lens, mask):
predictions.extend(rel_preds[1])
def predictions_to_human(self, predictions, outputs, data, use_pos, conll=True):
rel_vocab = self.vocabs['rel'].idx_to_token
for d, graph in zip(data, predictions):
sent = CoNLLSentence()
for idx, (cell, hrs) in enumerate(zip(d, graph)):
if use_pos:
token, pos = cell
else:
token, pos = cell, None
head = hrs[0][0]
deprel = rel_vocab[hrs[0][1]]
deps = [(h, rel_vocab[r]) for h, r in hrs[1:]]
sent.append(CoNLLUWord(idx + 1, token, upos=pos, head=head, deprel=deprel, deps=deps))
outputs.append(sent)
================================================
FILE: hanlp/components/parsers/biaffine/biaffine_dep.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-05-08 20:51
import os
from collections import Counter
from typing import Union, Any, List
from hanlp.layers.transformers.pt_imports import PreTrainedTokenizer, AutoModel_, AutoTokenizer_
import torch
from hanlp.utils.torch_util import lengths_to_mask
from torch import nn
from torch.optim import Adam
from torch.optim.lr_scheduler import ExponentialLR
from torch.utils.data import DataLoader
from hanlp_common.constant import ROOT, UNK, IDX
from hanlp.common.dataset import PadSequenceDataLoader
from hanlp.common.structure import History
from hanlp.common.torch_component import TorchComponent
from hanlp.common.transform import LowerCase, FieldLength, PunctuationMask
from hanlp.common.vocab import Vocab
from hanlp.components.parsers.alg import decode_dep
from hanlp.components.parsers.biaffine.biaffine_model import BiaffineDependencyModel
from hanlp_common.conll import CoNLLWord, CoNLLSentence
from hanlp.datasets.parsing.loaders.conll_dataset import CoNLLParsingDataset, append_bos
from hanlp.layers.embeddings.util import index_word2vec_with_vocab
from hanlp.layers.transformers.utils import build_optimizer_scheduler_with_transformer
from hanlp.metrics.parsing.attachmentscore import AttachmentScore
from hanlp.transform.transformer_tokenizer import TransformerSequenceTokenizer
from hanlp.utils.time_util import CountdownTimer
from hanlp_common.util import isdebugging, merge_locals_kwargs, merge_dict, reorder
class BiaffineDependencyParser(TorchComponent):
def __init__(self) -> None:
"""Biaffine dependency parsing (:cite:`dozat:17a`).
"""
super().__init__()
self.model: BiaffineDependencyModel = None
self.transformer_tokenizer: PreTrainedTokenizer = None
def predict(self, data: Any, batch_size=None, batch_max_tokens=None, conll=True, **kwargs):
if not data:
return []
use_pos = self.use_pos
flat = self.input_is_flat(data, use_pos)
if flat:
data = [data]
samples = self.build_samples(data, use_pos)
if not batch_max_tokens:
batch_max_tokens = self.config.get('batch_max_tokens', None)
if not batch_size:
batch_size = self.config.batch_size
dataloader = self.build_dataloader(samples,
device=self.devices[0], shuffle=False,
**merge_dict(self.config,
batch_size=batch_size,
batch_max_tokens=batch_max_tokens,
overwrite=True,
**kwargs))
predictions, build_data, data, order = self.before_outputs(data)
for batch in dataloader:
arc_scores, rel_scores, mask, puncts = self.feed_batch(batch)
self.collect_outputs(arc_scores, rel_scores, mask, batch, predictions, order, data, use_pos,
build_data)
outputs = self.post_outputs(predictions, data, order, use_pos, build_data, conll=conll)
if flat:
return outputs[0]
return outputs
def build_samples(self, data, use_pos=None):
samples = []
pos_key = 'CPOS' if 'CPOS' in self.vocabs else 'UPOS'
for idx, each in enumerate(data):
sample = {IDX: idx}
if use_pos:
token, pos = zip(*each)
sample.update({'FORM': list(token), pos_key: list(pos)})
else:
token = each
sample.update({'FORM': list(token)})
samples.append(sample)
return samples
def input_is_flat(self, data, use_pos=None):
if use_pos is None:
use_pos = 'CPOS' in self.vocabs
if use_pos:
flat = isinstance(data[0], (list, tuple)) and isinstance(data[0][0], str)
else:
flat = isinstance(data[0], str)
return flat
def before_outputs(self, data):
predictions, order = [], []
build_data = data is None
if build_data:
data = []
return predictions, build_data, data, order
def post_outputs(self, predictions, data, order, use_pos, build_data, conll=True):
predictions = reorder(predictions, order)
if build_data:
data = reorder(data, order)
outputs = []
self.predictions_to_human(predictions, outputs, data, use_pos, conll=conll)
return outputs
def predictions_to_human(self, predictions, outputs, data, use_pos, conll=True):
if conll:
for d, (arcs, rels) in zip(data, predictions):
sent = CoNLLSentence()
for idx, (cell, a, r) in enumerate(zip(d, arcs, rels)):
if use_pos:
token, pos = cell
else:
token, pos = cell, None
sent.append(CoNLLWord(idx + 1, token, cpos=pos, head=a, deprel=self.vocabs['rel'][r]))
outputs.append(sent)
else:
for d, (arcs, rels) in zip(data, predictions):
sent = []
for idx, (a, r) in enumerate(zip(arcs, rels)):
sent.append((a, self.vocabs['rel'][r]))
outputs.append(sent)
def collect_outputs(self, arc_scores, rel_scores, mask, batch, predictions, order, data, use_pos,
build_data):
lens = [len(token) - 1 for token in batch['token']]
arc_preds, rel_preds = self.decode(arc_scores, rel_scores, mask, batch)
self.collect_outputs_extend(predictions, arc_preds, rel_preds, lens, mask)
order.extend(batch[IDX])
if build_data:
if use_pos:
data.extend(zip(batch['FORM'], batch['CPOS']))
else:
data.extend(batch['FORM'])
def collect_outputs_extend(self, predictions: list, arc_preds, rel_preds, lens, mask):
predictions.extend(zip([seq.tolist() for seq in arc_preds[mask].split(lens)],
[seq.tolist() for seq in rel_preds[mask].split(lens)]))
@property
def use_pos(self):
return self.config.get('feat', None) == 'pos'
def fit(self, trn_data, dev_data, save_dir,
feat=None,
n_embed=100,
pretrained_embed=None,
transformer=None,
average_subwords=False,
word_dropout=0.2,
transformer_hidden_dropout=None,
layer_dropout=0,
scalar_mix: int = None,
embed_dropout=.33,
n_lstm_hidden=400,
n_lstm_layers=3,
hidden_dropout=.33,
n_mlp_arc=500,
n_mlp_rel=100,
mlp_dropout=.33,
lr=2e-3,
transformer_lr=5e-5,
mu=.9,
nu=.9,
epsilon=1e-12,
grad_norm=5.0,
decay=.75,
decay_steps=5000,
weight_decay=0,
warmup_steps=0.1,
separate_optimizer=False,
patience=100,
lowercase=False,
epochs=50000,
tree=False,
proj=False,
punct=False,
min_freq=2,
logger=None,
verbose=True,
unk=UNK,
max_sequence_length=512,
batch_size=None,
sampler_builder=None,
gradient_accumulation=1,
devices: Union[float, int, List[int]] = None,
transform=None,
secondary_encoder=None,
**kwargs):
return super().fit(**merge_locals_kwargs(locals(), kwargs))
def execute_training_loop(self, trn, dev, devices, epochs, logger, patience, save_dir, optimizer,
gradient_accumulation, **kwargs):
optimizer, scheduler, transformer_optimizer, transformer_scheduler = optimizer
criterion = self.build_criterion()
best_e, best_metric = 0, self.build_metric()
timer = CountdownTimer(epochs)
history = History()
ratio_width = len(f'{len(trn) // gradient_accumulation}/{len(trn) // gradient_accumulation}')
for epoch in range(1, epochs + 1):
# train one epoch and update the parameters
logger.info(f"[yellow]Epoch {epoch} / {epochs}:[/yellow]")
self.fit_dataloader(trn, optimizer, scheduler, criterion, epoch, logger, history,
transformer_optimizer, transformer_scheduler,
gradient_accumulation=gradient_accumulation)
loss, dev_metric = self.evaluate_dataloader(dev, criterion, ratio_width=ratio_width, logger=logger)
timer.update()
# logger.info(f"{'Dev' + ' ' * ratio_width} loss: {loss:.4f} {dev_metric}")
# save the model if it is the best so far
report = f"{timer.elapsed_human} / {timer.total_time_human} ETA: {timer.eta_human}"
if dev_metric > best_metric:
best_e, best_metric = epoch, dev_metric
self.save_weights(save_dir)
report += ' ([red]saved[/red])'
else:
if patience != epochs:
report += f' ({epoch - best_e}/{patience})'
else:
report += f' ({epoch - best_e})'
logger.info(report)
if patience is not None and epoch - best_e >= patience:
logger.info(f'LAS has stopped improving for {patience} epochs, early stop.')
break
timer.stop()
if not best_e:
self.save_weights(save_dir)
elif best_e != epoch:
self.load_weights(save_dir)
logger.info(f"Max score of dev is {best_metric.score:.2%} at epoch {best_e}")
logger.info(f"Average time of each epoch is {timer.elapsed_average_human}")
logger.info(f"{timer.elapsed_human} elapsed")
def build_optimizer(self, epochs, trn, gradient_accumulation, **kwargs):
config = self.config
model = self.model
if isinstance(model, nn.DataParallel):
model = model.module
if self.config.transformer:
transformer = model.encoder.transformer
optimizer = Adam(set(model.parameters()) - set(transformer.parameters()),
config.lr,
(config.mu, config.nu),
config.epsilon)
if self.config.transformer_lr:
num_training_steps = len(trn) * epochs // gradient_accumulation
if self.config.separate_optimizer:
transformer_optimizer, transformer_scheduler = \
build_optimizer_scheduler_with_transformer(transformer,
transformer,
config.transformer_lr,
config.transformer_lr,
num_training_steps,
config.warmup_steps,
config.weight_decay,
adam_epsilon=1e-8)
else:
optimizer, scheduler = build_optimizer_scheduler_with_transformer(model,
transformer,
config.lr,
config.transformer_lr,
num_training_steps,
config.warmup_steps,
config.weight_decay,
adam_epsilon=1e-8)
transformer_optimizer, transformer_scheduler = None, None
else:
transformer.requires_grad_(False)
transformer_optimizer, transformer_scheduler = None, None
else:
optimizer = Adam(model.parameters(),
config.lr,
(config.mu, config.nu),
config.epsilon)
transformer_optimizer, transformer_scheduler = None, None
if self.config.separate_optimizer:
scheduler = ExponentialLR(optimizer, config.decay ** (1 / config.decay_steps))
# noinspection PyUnboundLocalVariable
return optimizer, scheduler, transformer_optimizer, transformer_scheduler
def build_transformer_tokenizer(self):
transformer = self.config.transformer
if transformer:
transformer_tokenizer: PreTrainedTokenizer = AutoTokenizer_.from_pretrained(transformer, use_fast=True)
else:
transformer_tokenizer = None
self.transformer_tokenizer = transformer_tokenizer
return transformer_tokenizer
# noinspection PyMethodOverriding
def build_dataloader(self,
data,
shuffle,
device,
training=False,
logger=None,
gradient_accumulation=1,
sampler_builder=None,
batch_size=None,
**kwargs) -> DataLoader:
dataset = self.build_dataset(data)
if self.vocabs.mutable:
self.build_vocabs(dataset, logger, self.config.transformer)
transformer_tokenizer = self.transformer_tokenizer
if transformer_tokenizer:
dataset.transform.append(self.build_tokenizer_transform())
dataset.append_transform(FieldLength('token', 'sent_length'))
if isinstance(data, str):
dataset.purge_cache()
if len(dataset) > 1000 and isinstance(data, str):
timer = CountdownTimer(len(dataset))
self.cache_dataset(dataset, timer, training, logger)
if self.config.transformer:
lens = [len(sample['input_ids']) for sample in dataset]
else:
lens = [sample['sent_length'] for sample in dataset]
if sampler_builder:
sampler = sampler_builder.build(lens, shuffle, gradient_accumulation)
else:
sampler = None
loader = PadSequenceDataLoader(dataset=dataset,
batch_sampler=sampler,
batch_size=batch_size,
pad=self.get_pad_dict(),
device=device,
vocabs=self.vocabs)
return loader
def cache_dataset(self, dataset, timer, training=False, logger=None):
for each in dataset:
timer.log('Preprocessing and caching samples [blink][yellow]...[/yellow][/blink]')
def get_pad_dict(self):
return {'arc': 0}
def build_dataset(self, data, bos_transform=None):
if not bos_transform:
bos_transform = append_bos
transform = [bos_transform]
if self.config.get('transform', None):
transform.append(self.config.transform)
if self.config.get('lowercase', False):
transform.append(LowerCase('token'))
transform.append(self.vocabs)
if not self.config.punct:
transform.append(PunctuationMask('token', 'punct_mask'))
return CoNLLParsingDataset(data, transform=transform)
def build_tokenizer_transform(self):
return TransformerSequenceTokenizer(self.transformer_tokenizer, 'token', '',
ret_token_span=True, cls_is_bos=True,
max_seq_length=self.config.get('max_sequence_length',
512),
truncate_long_sequences=False)
def build_vocabs(self, dataset, logger=None, transformer=None):
rel_vocab = self.vocabs.get('rel', None)
if rel_vocab is None:
rel_vocab = Vocab(unk_token=None, pad_token=self.config.get('pad_rel', None))
self.vocabs.put(rel=rel_vocab)
if self.config.get('feat', None) == 'pos' or self.config.get('use_pos', False):
self.vocabs['pos'] = Vocab(unk_token=None, pad_token=None)
timer = CountdownTimer(len(dataset))
if transformer:
token_vocab = None
else:
token_vocab = Vocab()
self.vocabs.token = token_vocab
unk = self.config.get('unk', None)
if unk is not None:
token_vocab.unk_token = unk
if token_vocab and self.config.get('min_freq', None):
counter = Counter()
for sample in dataset:
for form in sample['token']:
counter[form] += 1
reserved_token = [token_vocab.pad_token, token_vocab.unk_token]
if ROOT in token_vocab:
reserved_token.append(ROOT)
freq_words = reserved_token + [token for token, freq in counter.items() if
freq >= self.config.min_freq]
token_vocab.token_to_idx.clear()
for word in freq_words:
token_vocab(word)
else:
for i, sample in enumerate(dataset):
timer.log('vocab building [blink][yellow]...[/yellow][/blink]', ratio_percentage=True)
rel_vocab.set_unk_as_safe_unk()  # some relations in the dev set may be OOV
self.vocabs.lock()
self.vocabs.summary(logger=logger)
if token_vocab:
self.config.n_words = len(self.vocabs['token'])
if 'pos' in self.vocabs:
self.config.n_feats = len(self.vocabs['pos'])
self.vocabs['pos'].set_unk_as_safe_unk()
self.config.n_rels = len(self.vocabs['rel'])
if token_vocab:
self.config.pad_index = self.vocabs['token'].pad_idx
self.config.unk_index = self.vocabs['token'].unk_idx
def build_model(self, training=True, **kwargs) -> torch.nn.Module:
pretrained_embed, transformer = self.build_embeddings(training=training)
if pretrained_embed is not None:
self.config.n_embed = pretrained_embed.size(-1)
model = self.create_model(pretrained_embed, transformer)
return model
def create_model(self, pretrained_embed, transformer):
return BiaffineDependencyModel(self.config,
pretrained_embed,
transformer,
self.transformer_tokenizer)
def build_embeddings(self, training=True):
pretrained_embed = None
if self.config.get('pretrained_embed', None):
pretrained_embed = index_word2vec_with_vocab(self.config.pretrained_embed, self.vocabs['token'],
init='zeros', normalize=True)
transformer = self.config.transformer
if transformer:
transformer = AutoModel_.from_pretrained(transformer, training=training)
return pretrained_embed, transformer
# noinspection PyMethodOverriding
def fit_dataloader(self,
trn,
optimizer,
scheduler,
criterion,
epoch,
logger,
history: History,
transformer_optimizer=None,
transformer_scheduler=None,
gradient_accumulation=1,
**kwargs):
self.model.train()
timer = CountdownTimer(history.num_training_steps(len(trn), gradient_accumulation))
metric = self.build_metric(training=True)
total_loss = 0
for idx, batch in enumerate(trn):
arc_scores, rel_scores, mask, puncts = self.feed_batch(batch)
arcs, rels = batch['arc'], batch['rel_id']
loss = self.compute_loss(arc_scores, rel_scores, arcs, rels, mask, criterion, batch)
if gradient_accumulation > 1:
loss /= gradient_accumulation
loss.backward()
total_loss += loss.item()
arc_preds, rel_preds = self.decode(arc_scores, rel_scores, mask, batch)
self.update_metric(arc_preds, rel_preds, arcs, rels, mask, puncts, metric, batch)
if history.step(gradient_accumulation):
self._step(optimizer, scheduler, transformer_optimizer, transformer_scheduler)
report = self._report(total_loss / (timer.current + 1), metric)
timer.log(report, ratio_percentage=False, logger=logger)
del loss
def _step(self, optimizer, scheduler, transformer_optimizer, transformer_scheduler):
if self.config.get('grad_norm', None):
nn.utils.clip_grad_norm_(self.model.parameters(),
self.config.grad_norm)
optimizer.step()
optimizer.zero_grad()
scheduler.step()
if self.config.transformer and self.config.transformer_lr and transformer_optimizer:
transformer_optimizer.step()
transformer_optimizer.zero_grad()
transformer_scheduler.step()
def feed_batch(self, batch):
words, feats, lens, puncts = batch.get('token_id', None), batch.get('pos_id', None), batch['sent_length'], \
batch.get('punct_mask', None)
mask = lengths_to_mask(lens)
arc_scores, rel_scores = self.model(words=words, feats=feats, mask=mask, batch=batch, **batch)
# ignore the first token of each sentence; clone the mask first, otherwise the
# in-place modification below raises:
# RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation
if self.model.training:
mask = mask.clone()
mask[:, 0] = 0
return arc_scores, rel_scores, mask, puncts
def _report(self, loss, metric: AttachmentScore):
return f'loss: {loss:.4f} {metric}'
def compute_loss(self, arc_scores, rel_scores, arcs, rels, mask, criterion, batch=None):
arc_scores, arcs = arc_scores[mask], arcs[mask]
rel_scores, rels = rel_scores[mask], rels[mask]
rel_scores = rel_scores[torch.arange(len(arcs)), arcs]
arc_loss = criterion(arc_scores, arcs)
rel_loss = criterion(rel_scores, rels)
loss = arc_loss + rel_loss
return loss
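# Editorial note on the loss above: both terms are plain cross-entropy.
# arc_scores[mask] is [n_tokens, seq_len] scored against gold head indices,
# while rel_scores is first narrowed to each token's gold head before being
# scored against gold relation ids, i.e. the relation loss is conditioned on
# the gold arcs during training.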
# noinspection PyUnboundLocalVariable
@torch.no_grad()
def evaluate_dataloader(self, loader: PadSequenceDataLoader, criterion, logger=None, filename=None, output=False,
ratio_width=None,
metric=None,
**kwargs):
self.model.eval()
loss = 0
if not metric:
metric = self.build_metric()
if output:
fp = open(output, 'w')
predictions, build_data, data, order = self.before_outputs(None)
timer = CountdownTimer(len(loader))
use_pos = self.use_pos
for batch in loader:
arc_scores, rel_scores, mask, puncts = self.feed_batch(batch)
if output:
self.collect_outputs(arc_scores, rel_scores, mask, batch, predictions, order, data, use_pos,
build_data)
arcs, rels = batch['arc'], batch['rel_id']
loss += self.compute_loss(arc_scores, rel_scores, arcs, rels, mask, criterion, batch).item()
arc_preds, rel_preds = self.decode(arc_scores, rel_scores, mask, batch)
self.update_metric(arc_preds, rel_preds, arcs, rels, mask, puncts, metric, batch)
report = self._report(loss / (timer.current + 1), metric)
if filename:
report = f'{os.path.basename(filename)} ' + report
timer.log(report, ratio_percentage=False, logger=logger, ratio_width=ratio_width)
loss /= len(loader)
if output:
outputs = self.post_outputs(predictions, data, order, use_pos, build_data)
for each in outputs:
fp.write(f'{each}\n\n')
fp.close()
logger.info(f'Predictions saved in [underline][yellow]{output}[/yellow][/underline]')
return loss, metric
def update_metric(self, arc_preds, rel_preds, arcs, rels, mask, puncts, metric, batch=None):
# ignore all punctuation if not specified
if not self.config.punct:
mask &= puncts
metric(arc_preds, rel_preds, arcs, rels, mask)
def decode(self, arc_scores, rel_scores, mask, batch=None):
tree, proj = self.config.tree, self.config.get('proj', False)
if tree:
arc_preds = decode_dep(arc_scores, mask, tree, proj)
else:
arc_preds = arc_scores.argmax(-1)
rel_preds = rel_scores.argmax(-1)
rel_preds = rel_preds.gather(-1, arc_preds.unsqueeze(-1)).squeeze(-1)
return arc_preds, rel_preds
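# Editorial note on the decoding above: rel_scores.argmax(-1) gives the best
# relation for every (dependent, head) pair as a [batch, seq, seq] tensor;
# the gather then keeps, for each dependent i, only the relation at its
# predicted head, so rel_preds[b, i] labels the arc (arc_preds[b, i] -> i).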
def build_criterion(self, **kwargs):
criterion = nn.CrossEntropyLoss()
return criterion
def build_metric(self, **kwargs):
return AttachmentScore()
def on_config_ready(self, **kwargs):
self.build_transformer_tokenizer() # We have to build tokenizer before building the dataloader and model
self.config.patience = min(self.config.patience, self.config.epochs)
def prediction_to_head_rel(self, arcs: torch.LongTensor, rels: torch.LongTensor, batch: dict):
arcs = arcs[:, 1:] # Skip the ROOT
rels = rels[:, 1:]
arcs = arcs.tolist()
rels = rels.tolist()
vocab = self.vocabs['rel'].idx_to_token
for arcs_per_sent, rels_per_sent, tokens in zip(arcs, rels, batch['token']):
tokens = tokens[1:]
sent_len = len(tokens)
result = list(zip(arcs_per_sent[:sent_len], [vocab[r] for r in rels_per_sent[:sent_len]]))
yield result
================================================
FILE: hanlp/components/parsers/biaffine/biaffine_model.py
================================================
# -*- coding: utf-8 -*-
from typing import Any, Tuple
import torch
import torch.nn as nn
from torch.nn.utils.rnn import (pack_padded_sequence, pad_packed_sequence,
pad_sequence)
from hanlp.components.parsers.biaffine.biaffine import Biaffine
from hanlp.components.parsers.biaffine.mlp import MLP
from hanlp.components.parsers.biaffine.variationalbilstm import VariationalLSTM
from hanlp.layers.dropout import IndependentDropout, SharedDropout, WordDropout
from hanlp.layers.transformers.encoder import TransformerEncoder
from hanlp.layers.transformers.pt_imports import PreTrainedModel, PreTrainedTokenizer
from hanlp.layers.transformers.utils import transformer_encode
class EncoderWithContextualLayer(nn.Module):
def __init__(self,
config,
pretrained_embed: torch.Tensor = None,
transformer: PreTrainedModel = None,
transformer_tokenizer: PreTrainedTokenizer = None,
):
super(EncoderWithContextualLayer, self).__init__()
self.secondary_encoder = config.get('secondary_encoder', None)
self.config = config
if not transformer:
self.pad_index = config.pad_index
self.unk_index = config.unk_index
if config.word_dropout:
oov = self.unk_index
excludes = [self.pad_index]
self.word_dropout = WordDropout(p=config.word_dropout, oov_token=oov, exclude_tokens=excludes)
else:
self.word_dropout = None
if transformer:
input_size = 0
if self.config.transformer_lr:
hidden_size = transformer.config.hidden_size
else:
input_size = transformer.config.hidden_size
hidden_size = config.n_lstm_hidden * 2
if config.feat == 'pos':
self.feat_embed = nn.Embedding(num_embeddings=config.n_feats,
embedding_dim=config.n_embed)
self.embed_dropout = IndependentDropout(p=config.embed_dropout)
if self.config.transformer_lr:
hidden_size += config.n_embed
else:
input_size += config.n_embed
if not self.config.transformer_lr:
self.lstm = VariationalLSTM(input_size=input_size,
hidden_size=config.n_lstm_hidden,
num_layers=config.n_lstm_layers,
dropout=config.hidden_dropout, bidirectional=True)
else:
# the embedding layer
input_size = config.n_embed
self.word_embed = nn.Embedding(num_embeddings=config.n_words,
embedding_dim=config.n_embed)
if pretrained_embed is not None:
if not isinstance(pretrained_embed, torch.Tensor):
pretrained_embed = torch.Tensor(pretrained_embed)
self.pretrained = nn.Embedding.from_pretrained(pretrained_embed)
nn.init.zeros_(self.word_embed.weight)
if config.feat == 'pos':
self.feat_embed = nn.Embedding(num_embeddings=config.n_feats,
embedding_dim=config.n_embed)
self.embed_dropout = IndependentDropout(p=config.embed_dropout)
input_size += config.n_embed
# the word-lstm layer
hidden_size = config.n_lstm_hidden * 2
self.lstm = VariationalLSTM(input_size=input_size,
hidden_size=config.n_lstm_hidden,
num_layers=config.n_lstm_layers,
dropout=config.hidden_dropout, bidirectional=True)
self.hidden_size = hidden_size
self.hidden_dropout = SharedDropout(p=config.hidden_dropout)
if transformer:
transformer = TransformerEncoder(transformer, transformer_tokenizer, config.average_subwords,
word_dropout=config.word_dropout,
max_sequence_length=config.max_sequence_length)
self.transformer = transformer
def forward(self, words, feats, input_ids, token_span, mask, lens):
if mask is None:
# get the mask and lengths of given batch
mask = words.ne(self.pad_index)
if lens is None:
lens = mask.sum(dim=1)
batch_size, seq_len = mask.shape
if self.config.transformer:
# trans_embed = self.run_transformer(input_ids, token_span=token_span)
trans_embed = self.transformer.forward(input_ids, token_span=token_span)
if hasattr(self, 'feat_embed'):
feat_embed = self.feat_embed(feats)
trans_embed, feat_embed = self.embed_dropout(trans_embed, feat_embed)
embed = torch.cat((trans_embed, feat_embed), dim=-1)
else:
embed = trans_embed
if hasattr(self, 'lstm'):
x = self.run_rnn(embed, lens, seq_len)
else:
x = embed
if self.secondary_encoder:
x = self.secondary_encoder(x, mask)
x = self.hidden_dropout(x)
else:
if self.word_dropout:
words = self.word_dropout(words)
# set the indices larger than num_embeddings to unk_index
ext_mask = words.ge(self.word_embed.num_embeddings)
ext_words = words.masked_fill(ext_mask, self.unk_index)
# get outputs from embedding layers
word_embed = self.word_embed(ext_words)
if hasattr(self, 'pretrained'):
word_embed += self.pretrained(words)
if self.config.feat == 'char':
feat_embed = self.feat_embed(feats[mask])
feat_embed = pad_sequence(feat_embed.split(lens.tolist()), True)
elif self.config.feat == 'bert':
feat_embed = self.feat_embed(*feats)
elif hasattr(self, 'feat_embed'):
feat_embed = self.feat_embed(feats)
else:
feat_embed = None
if feat_embed is not None:
word_embed, feat_embed = self.embed_dropout(word_embed, feat_embed)
# concatenate the word and feat representations
embed = torch.cat((word_embed, feat_embed), dim=-1)
else:
embed = word_embed
x = self.run_rnn(embed, lens, seq_len)
x = self.hidden_dropout(x)
return x, mask
def run_rnn(self, embed, lens, seq_len):
x = pack_padded_sequence(embed, lens, True, False)
x, _ = self.lstm(x)
x, _ = pad_packed_sequence(x, True, total_length=seq_len)
return x
def run_transformer(self, input_ids, token_span):
return transformer_encode(self.transformer, input_ids, None, None, token_span,
average_subwords=self.config.average_subwords)
class BiaffineDecoder(nn.Module):
def __init__(self, hidden_size, n_mlp_arc, n_mlp_rel, mlp_dropout, n_rels, arc_dropout=None,
rel_dropout=None) -> None:
super().__init__()
# the MLP layers
self.mlp_arc_h = MLP(hidden_size,
n_mlp_arc,
dropout=arc_dropout or mlp_dropout)
self.mlp_arc_d = MLP(hidden_size,
n_mlp_arc,
dropout=arc_dropout or mlp_dropout)
self.mlp_rel_h = MLP(hidden_size,
n_mlp_rel,
dropout=rel_dropout or mlp_dropout)
self.mlp_rel_d = MLP(hidden_size,
n_mlp_rel,
dropout=rel_dropout or mlp_dropout)
# the Biaffine layers
self.arc_attn = Biaffine(n_in=n_mlp_arc,
bias_x=True,
bias_y=False)
self.rel_attn = Biaffine(n_in=n_mlp_rel,
n_out=n_rels,
bias_x=True,
bias_y=True)
def forward(self, x, mask=None, **kwargs: Any) -> Tuple[torch.Tensor, torch.Tensor]:
arc_d, arc_h, rel_d, rel_h = self.apply_mlps(x)
s_arc, s_rel = self.decode(arc_d, arc_h, rel_d, rel_h, mask, self.arc_attn, self.rel_attn)
return s_arc, s_rel
@staticmethod
def decode(arc_d, arc_h, rel_d, rel_h, mask, arc_attn, rel_attn):
# get arc and rel scores from the bilinear attention
# [batch_size, seq_len, seq_len]
s_arc = arc_attn(arc_d, arc_h)
# [batch_size, seq_len, seq_len, n_rels]
s_rel = rel_attn(rel_d, rel_h).permute(0, 2, 3, 1)
if mask is not None:
# set the scores that exceed the length of each sentence to -inf
s_arc.masked_fill_(~mask.unsqueeze(1), float('-inf'))
return s_arc, s_rel
def apply_mlps(self, x):
# apply MLPs to the hidden states
arc_d = self.mlp_arc_d(x)
arc_h = self.mlp_arc_h(x)
rel_d = self.mlp_rel_d(x)
rel_h = self.mlp_rel_h(x)
return arc_d, arc_h, rel_d, rel_h
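# Editorial sketch of the decoder data flow (B = batch, L = seq_len):
#   x [B, L, hidden] --mlp_arc_d / mlp_arc_h--> arc_d, arc_h [B, L, n_mlp_arc]
#                    --mlp_rel_d / mlp_rel_h--> rel_d, rel_h [B, L, n_mlp_rel]
#   s_arc = arc_attn(arc_d, arc_h)                       # [B, L, L]
#   s_rel = rel_attn(rel_d, rel_h).permute(0, 2, 3, 1)   # [B, L, L, n_rels]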
class BiaffineDependencyModel(nn.Module):
def __init__(self, config, pretrained_embed: torch.Tensor = None, transformer: PreTrainedModel = None,
transformer_tokenizer: PreTrainedTokenizer = None):
super().__init__()
self.encoder = EncoderWithContextualLayer(config, pretrained_embed, transformer, transformer_tokenizer)
self.biaffine_decoder = BiaffineDecoder(self.encoder.hidden_size,
config.n_mlp_arc,
config.n_mlp_rel,
config.mlp_dropout,
config.n_rels)
def forward(self,
words=None,
feats=None,
input_ids=None,
token_span=None,
mask=None, lens=None, **kwargs):
x, mask = self.encoder(words, feats, input_ids, token_span, mask, lens)
s_arc, s_rel = self.biaffine_decoder(x, mask)
return s_arc, s_rel
================================================
FILE: hanlp/components/parsers/biaffine/biaffine_sdp.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-07-28 15:12
import functools
from collections import Counter
from typing import Union, List
import torch
from torch import nn
from hanlp_common.constant import UNK
from hanlp.common.transform import TransformList
from hanlp.components.parsers.biaffine.biaffine_dep import BiaffineDependencyParser
from hanlp_common.conll import CoNLLUWord, CoNLLSentence
from hanlp.datasets.parsing.semeval15 import unpack_deps_to_head_deprel, append_bos_to_form_pos
from hanlp.metrics.parsing.labeled_f1 import LabeledF1
from hanlp_common.util import merge_locals_kwargs
class BiaffineSemanticDependencyParser(BiaffineDependencyParser):
def __init__(self) -> None:
r"""Implementation of "Stanford's Graph-based Neural Dependency Parser at
the CoNLL 2017 Shared Task" (:cite:`dozat2017stanford`) and "Establishing Strong Baselines for the New Decade"
(:cite:`he-choi-2019`).
"""
super().__init__()
def get_pad_dict(self):
return {'arc': False}
def build_metric(self, **kwargs):
return LabeledF1()
# noinspection PyMethodOverriding
def build_dataset(self, data, transform=None):
transforms = TransformList(functools.partial(append_bos_to_form_pos, pos_key='UPOS'),
functools.partial(unpack_deps_to_head_deprel, pad_rel=self.config.pad_rel))
if transform:
transforms.append(transform)
return super(BiaffineSemanticDependencyParser, self).build_dataset(data, transforms)
def build_criterion(self, **kwargs):
return nn.BCEWithLogitsLoss(), nn.CrossEntropyLoss()
def feed_batch(self, batch):
arc_scores, rel_scores, mask, puncts = super().feed_batch(batch)
mask = self.convert_to_3d_mask(arc_scores, mask)
puncts = self.convert_to_3d_puncts(puncts, mask)
return arc_scores, rel_scores, mask, puncts
@staticmethod
def convert_to_3d_puncts(puncts, mask):
if puncts is not None:
puncts = puncts.unsqueeze(-1).expand_as(mask)
return puncts
@staticmethod
def convert_to_3d_mask(arc_scores, mask):
# 3d masks
mask = mask.unsqueeze(-1).expand_as(arc_scores).clone()
mask[:, :, 1:] = mask[:, :, 1:] & mask.transpose(1, 2)[:, :, 1:]  # keep the 1st column because it predicts root
return mask
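# Editorial note: the 2d token mask [B, L] is broadcast to [B, L, L] so a
# cell (dependent i, head j) stays valid only when both positions are real
# tokens; column 0 is left untouched because arc_scores[:, :, 0] scores
# attachment to ROOT and must remain predictable for every token.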
def compute_loss(self, arc_scores, rel_scores, arcs, rels, mask: torch.BoolTensor, criterion, batch=None):
bce, ce = criterion
arc_scores, arcs = arc_scores[mask], arcs[mask]
rel_scores, rels = rel_scores[mask], rels[mask]
rel_scores, rels = rel_scores[arcs], rels[arcs]
arc_loss = bce(arc_scores, arcs.to(torch.float))
arc_loss_interpolation = self.config.get('arc_loss_interpolation', None)
loss = arc_loss * arc_loss_interpolation if arc_loss_interpolation else arc_loss
if len(rels):
rel_loss = ce(rel_scores, rels)
loss += (rel_loss * (1 - arc_loss_interpolation)) if arc_loss_interpolation else rel_loss
if arc_loss_interpolation:
loss *= 2
return loss
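# Editorial note on the interpolation above: with arc_loss_interpolation = a,
# the final loss is 2 * (a * arc_loss + (1 - a) * rel_loss); the factor of 2
# keeps its scale comparable to the plain sum arc_loss + rel_loss at a = 0.5.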
def cache_dataset(self, dataset, timer, training=False, logger=None):
if not self.config.apply_constraint:
return super(BiaffineSemanticDependencyParser, self).cache_dataset(dataset, timer, training)
num_roots = Counter()
no_zero_head = True
root_rels = Counter()
for each in dataset:
if training:
num_roots[sum([x[0] for x in each['arc']])] += 1
no_zero_head &= all([x != '_' for x in each['DEPS']])
head_is_root = [i for i in range(len(each['arc'])) if each['arc'][i][0]]
if head_is_root:
for i in head_is_root:
root_rels[each['rel'][i][0]] += 1
timer.log('Preprocessing and caching samples [blink][yellow]...[/yellow][/blink]')
if training:
if self.config.single_root is None:
self.config.single_root = len(num_roots) == 1 and num_roots.most_common()[0][0] == 1
if self.config.no_zero_head is None:
self.config.no_zero_head = no_zero_head
root_rel = root_rels.most_common()[0][0]
self.config.root_rel_id = self.vocabs['rel'].get_idx(root_rel)
if logger:
logger.info(f'Training set properties: [blue]single_root = {self.config.single_root}[/blue], '
f'[blue]no_zero_head = {no_zero_head}[/blue], '
f'[blue]root_rel = {root_rel}[/blue]')
def decode(self, arc_scores, rel_scores, mask, batch=None):
eye = torch.arange(0, arc_scores.size(1), device=arc_scores.device).view(1, 1, -1).expand(
arc_scores.size(0), -1, -1)
inf = float('inf')
arc_scores.scatter_(dim=1, index=eye, value=-inf)
if self.config.apply_constraint:
if self.config.get('single_root', False):
arc_scores[~mask] = -inf # the biaffine decoder doesn't apply 3d mask for now
root_mask = arc_scores[:, :, 0].argmax(dim=-1).unsqueeze_(-1).expand_as(arc_scores[:, :, 0])
arc_scores[:, :, 0] = -inf
arc_scores[:, :, 0].scatter_(dim=-1, index=root_mask, value=inf)
root_rel_id = self.config.root_rel_id
rel_scores[:, :, 0, root_rel_id] = inf
rel_scores[:, :, 1:, root_rel_id] = -inf
arc_scores_T = arc_scores.transpose(-1, -2)
arc = ((arc_scores > 0) & (arc_scores_T < arc_scores))
if self.config.get('no_zero_head', False):
arc_scores_T[arc] = -inf # avoid cycle between a pair of nodes
arc_scores_fix = arc_scores_T.argmax(dim=-2).unsqueeze_(-1).expand_as(arc_scores)
arc.scatter_(dim=-1, index=arc_scores_fix, value=True)
else:
arc = arc_scores > 0
rel = rel_scores.argmax(dim=-1)
return arc, rel
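# Editorial note: unlike tree decoding, every cell with arc_scores > 0 is
# kept, so a token may receive multiple heads (a graph, not a tree). When
# apply_constraint is on, the optional constraints force exactly one ROOT
# attachment (single_root), drop the weaker direction of any mutual arc pair
# via the transposed-score comparison, and guarantee at least one head per
# token (no_zero_head).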
def collect_outputs_extend(self, predictions, arc_preds, rel_preds, lens, mask):
predictions.extend(zip(arc_preds.tolist(), rel_preds.tolist(), mask.tolist()))
# all_arcs.extend(seq.tolist() for seq in arc_preds[mask].split([x * x for x in lens]))
# all_rels.extend(seq.tolist() for seq in rel_preds[mask].split([x * x for x in lens]))
def predictions_to_human(self, predictions, outputs, data, use_pos, conll=True):
for d, (arcs, rels, masks) in zip(data, predictions):
sent = CoNLLSentence()
for idx, (cell, a, r) in enumerate(zip(d, arcs[1:], rels[1:])):
if use_pos:
token, pos = cell
else:
token, pos = cell, None
heads = [i for i in range(len(d) + 1) if a[i]]
deprels = [self.vocabs['rel'][r[i]] for i in range(len(d) + 1) if a[i]]
sent.append(
CoNLLUWord(idx + 1, token, upos=pos, head=None, deprel=None, deps=list(zip(heads, deprels))))
outputs.append(sent)
def fit(self, trn_data, dev_data, save_dir,
feat=None,
n_embed=100,
pretrained_embed=None,
transformer=None,
average_subwords=False,
word_dropout: float = 0.2,
transformer_hidden_dropout=None,
layer_dropout=0,
mix_embedding: int = None,
embed_dropout=.33,
n_lstm_hidden=400,
n_lstm_layers=3,
hidden_dropout=.33,
n_mlp_arc=500,
n_mlp_rel=100,
mlp_dropout=.33,
arc_dropout=None,
rel_dropout=None,
arc_loss_interpolation=0.4,
lr=2e-3,
transformer_lr=5e-5,
mu=.9,
nu=.9,
epsilon=1e-12,
clip=5.0,
decay=.75,
decay_steps=5000,
weight_decay=0,
warmup_steps=0.1,
separate_optimizer=True,
patience=100,
batch_size=None,
sampler_builder=None,
lowercase=False,
epochs=50000,
apply_constraint=False,
single_root=None,
no_zero_head=None,
punct=False,
min_freq=2,
logger=None,
verbose=True,
unk=UNK,
pad_rel=None,
max_sequence_length=512,
gradient_accumulation=1,
devices: Union[float, int, List[int]] = None,
transform=None,
**kwargs):
return super().fit(**merge_locals_kwargs(locals(), kwargs))
================================================
FILE: hanlp/components/parsers/biaffine/mlp.py
================================================
# MIT License
#
# Copyright (c) 2020 Yu Zhang
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import torch.nn as nn
from hanlp.layers.dropout import SharedDropout
class MLP(nn.Module):
r"""
Applies a linear transformation together with a non-linear activation to the incoming tensor:
:math:`y = \mathrm{Activation}(x A^T + b)`
Args:
n_in (~torch.Tensor):
The size of each input feature.
n_out (~torch.Tensor):
The size of each output feature.
dropout (float):
If non-zero, introduce a :class:`SharedDropout` layer on the output with this dropout ratio. Default: 0.
activation (bool):
Whether to use activations. Default: True.
"""
def __init__(self, n_in, n_out, dropout=0, activation=True):
super().__init__()
self.n_in = n_in
self.n_out = n_out
self.linear = nn.Linear(n_in, n_out)
self.activation = nn.LeakyReLU(negative_slope=0.1) if activation else nn.Identity()
self.dropout = SharedDropout(p=dropout)
self.reset_parameters()
def __repr__(self):
s = f"n_in={self.n_in}, n_out={self.n_out}"
if self.dropout.p > 0:
s += f", dropout={self.dropout.p}"
return f"{self.__class__.__name__}({s})"
def reset_parameters(self):
nn.init.orthogonal_(self.linear.weight)
nn.init.zeros_(self.linear.bias)
def forward(self, x):
r"""
Args:
x (~torch.Tensor):
The size of each input feature is `n_in`.
Returns:
A tensor with the size of each output feature `n_out`.
"""
x = self.linear(x)
x = self.activation(x)
x = self.dropout(x)
return x
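# Minimal usage sketch (editorial):
#   mlp = MLP(n_in=768, n_out=500, dropout=0.33)
#   h = mlp(torch.randn(2, 10, 768))  # -> [2, 10, 500]
# SharedDropout reuses one dropout mask across the sequence dimension, so
# the same channels are dropped for every token of a sentence.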
================================================
FILE: hanlp/components/parsers/biaffine/structual_attention.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-06-26 10:40
from typing import Union, List
import torch
import torch.nn.functional as F
from hanlp.utils.torch_util import lengths_to_mask
from torch import nn
from hanlp.common.torch_component import TorchComponent
from hanlp.components.parsers.biaffine.biaffine_dep import BiaffineDependencyParser
from hanlp.components.parsers.biaffine.biaffine_model import BiaffineDecoder
from hanlp.layers.transformers.encoder import TransformerEncoder
from hanlp.layers.transformers.pt_imports import PreTrainedModel, PreTrainedTokenizer
from hanlp.metrics.accuracy import CategoricalAccuracy
from hanlp.transform.transformer_tokenizer import TransformerSequenceTokenizer
from hanlp_common.util import merge_locals_kwargs
class StructuralAttentionLayer(nn.Module):
def __init__(self, hidden_size, n_mlp_arc, n_mlp_rel, mlp_dropout, n_rels, projection=None) -> None:
super().__init__()
self.biaffine = BiaffineDecoder(hidden_size, n_mlp_arc, n_mlp_rel, mlp_dropout, n_rels)
if projection:
self.projection = nn.Linear(hidden_size, projection)
hidden_size = projection
else:
self.projection = None
self.head_WV = nn.Parameter(torch.randn(n_rels, hidden_size, hidden_size))
self.dense = nn.Linear(hidden_size * n_rels, hidden_size)
self.activation = nn.GELU()
def forward(self, x, mask):
s_arc, s_rel = self.biaffine(x, mask)
p_arc = F.softmax(s_arc, dim=-1) * mask.unsqueeze(-1)
p_rel = F.softmax(s_rel, -1)
A = p_arc.unsqueeze(-1) * p_rel
if self.projection:
x = self.projection(x)
Ax = torch.einsum('bijk,bih->bihk', A, x)
AxW = torch.einsum('bihk,khm->bihk', Ax, self.head_WV)
AxW = AxW.flatten(2)
x = self.dense(AxW)
x = self.activation(x)
return s_arc, s_rel, x
class StructuralAttentionModel(nn.Module):
def __init__(self,
config,
transformer: PreTrainedModel = None,
transformer_tokenizer: PreTrainedTokenizer = None
) -> None:
super().__init__()
self.encoder = TransformerEncoder(transformer,
transformer_tokenizer,
config.average_subwords,
config.scalar_mix,
None, # No word_dropout since SA is predicting masked tokens
config.transformer_hidden_dropout,
config.layer_dropout,
config.max_sequence_length)
hidden_size = transformer.config.hidden_size
self.sa = StructuralAttentionLayer(hidden_size,
config.n_mlp_arc,
config.n_mlp_rel,
config.mlp_dropout,
config.n_rels,
config.projection
)
if config.projection:
hidden_size = config.projection
self.mlm = nn.Linear(hidden_size, transformer_tokenizer.vocab_size)
def forward(self,
input_ids: torch.LongTensor,
attention_mask=None,
token_type_ids=None,
token_span=None,
mask=None,
batch=None,
**kwargs):
h = self.encoder(input_ids, attention_mask, token_type_ids, token_span)
s_arc, s_rel, h = self.sa(h, mask)
x = self.mlm(h)
return s_arc, s_rel, x
class MaskedTokenGenerator(object):
def __init__(self, transformer_tokenizer: PreTrainedTokenizer, mask_prob=0.15) -> None:
super().__init__()
self.mask_prob = mask_prob
self.transformer_tokenizer = transformer_tokenizer
self.oov = transformer_tokenizer.mask_token_id
self.pad = transformer_tokenizer.pad_token_id
self.cls = transformer_tokenizer.cls_token_id
self.sep = transformer_tokenizer.sep_token_id
self.excludes = [self.pad, self.cls, self.sep]
def __call__(self, tokens: torch.LongTensor, prefix_mask: torch.LongTensor):
padding_mask = tokens.new_ones(tokens.size(), dtype=torch.bool)
for pad in self.excludes:
padding_mask &= (tokens != pad)
padding_mask &= prefix_mask # Only mask prefixes since the others won't be attended
# Create a uniformly random mask selecting either the original words or OOV tokens
dropout_mask = (tokens.new_empty(tokens.size(), dtype=torch.float).uniform_() < self.mask_prob)
oov_mask = dropout_mask & padding_mask
oov_fill = tokens.new_empty(tokens.size(), dtype=torch.long).fill_(self.oov)
result = torch.where(oov_mask, oov_fill, tokens)
return result, oov_mask
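# Illustrative usage sketch (added for exposition; the tokenizer name is just
# an example, any HuggingFace tokenizer with [MASK]/[CLS]/[SEP]/[PAD] works):
def _demo_masked_token_generator():
    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')
    generator = MaskedTokenGenerator(tokenizer, mask_prob=0.15)
    tokens = tokenizer('商品和服务', return_tensors='pt')['input_ids']
    # Treat every position as a maskable prefix subword for this demo.
    prefix_mask = torch.ones_like(tokens, dtype=torch.bool)
    masked, oov_mask = generator(tokens, prefix_mask)
    # Positions flagged by oov_mask now hold tokenizer.mask_token_id, while
    # [CLS], [SEP] and [PAD] are never selected for masking.
    return masked, oov_mask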
class StructuralAttentionParser(BiaffineDependencyParser):
def __init__(self) -> None:
super().__init__()
self.model: StructuralAttentionModel = None
self.mlm_generator: MaskedTokenGenerator = None
def build_model(self, training=True, **kwargs) -> torch.nn.Module:
transformer = TransformerEncoder.build_transformer(config=self.config, training=training)
model = StructuralAttentionModel(self.config, transformer, self.transformer_tokenizer)
return model
def fit(self, trn_data, dev_data, save_dir,
transformer=None,
mask_prob=0.15,
projection=None,
average_subwords=False,
transformer_hidden_dropout=None,
layer_dropout=0,
mix_embedding: int = None,
embed_dropout=.33,
n_mlp_arc=500,
n_mlp_rel=100,
mlp_dropout=.33,
lr=2e-3,
transformer_lr=5e-5,
mu=.9,
nu=.9,
epsilon=1e-12,
clip=5.0,
decay=.75,
decay_steps=5000,
patience=100,
sampler='kmeans',
n_buckets=32,
batch_max_tokens=5000,
batch_size=None,
epochs=50000,
tree=False,
punct=False,
logger=None,
verbose=True,
max_sequence_length=512,
devices: Union[float, int, List[int]] = None,
transform=None,
**kwargs):
return TorchComponent.fit(self, **merge_locals_kwargs(locals(), kwargs))
def feed_batch(self, batch):
if self.model.training:
input_ids = batch['input_ids']
prefix_mask = batch['prefix_mask']
batch['gold_input_ids'] = input_ids
batch['input_ids'], batch['input_ids_mask'] = self.mlm_generator(input_ids, prefix_mask)
words, feats, lens, puncts = batch.get('token_id', None), batch.get('pos_id', None), batch['sent_length'], \
batch.get('punct_mask', None)
mask = lengths_to_mask(lens)
arc_scores, rel_scores, pred_input_ids = self.model(words=words, feats=feats, mask=mask, batch=batch, **batch)
batch['pred_input_ids'] = pred_input_ids
# ignore the first token of each sentence; clone the mask before the in-place
# write below, otherwise autograd raises "RuntimeError: one of the variables
# needed for gradient computation has been modified by an inplace operation"
if self.model.training:
mask = mask.clone()
mask[:, 0] = 0
return arc_scores, rel_scores, mask, puncts
def on_config_ready(self, **kwargs):
super().on_config_ready(**kwargs)
self.mlm_generator = MaskedTokenGenerator(self.transformer_tokenizer, self.config.mask_prob)
def compute_loss(self, arc_scores, rel_scores, arcs, rels, mask, criterion, batch=None):
parse_loss = BiaffineDependencyParser.compute_loss(self, arc_scores, rel_scores, arcs, rels, mask, criterion, batch)
if self.model.training:
gold_input_ids = batch['gold_input_ids']
pred_input_ids = batch['pred_input_ids']
input_ids_mask = batch['input_ids_mask']
token_span = batch['token_span']
gold_input_ids = batch['gold_input_ids'] = gold_input_ids.gather(1, token_span[:, :, 0])
input_ids_mask = batch['input_ids_mask'] = input_ids_mask.gather(1, token_span[:, :, 0])
mlm_loss = F.cross_entropy(pred_input_ids[input_ids_mask], gold_input_ids[input_ids_mask])
loss = parse_loss + mlm_loss
return loss
return parse_loss
def build_tokenizer_transform(self):
return TransformerSequenceTokenizer(self.transformer_tokenizer, 'token', '', ret_prefix_mask=True,
ret_token_span=True, cls_is_bos=True,
max_seq_length=self.config.get('max_sequence_length',
512),
truncate_long_sequences=False)
def build_metric(self, training=None, **kwargs):
parse_metric = super().build_metric(**kwargs)
if training:
mlm_metric = CategoricalAccuracy()
return parse_metric, mlm_metric
return parse_metric
def update_metric(self, arc_scores, rel_scores, arcs, rels, mask, puncts, metric, batch=None):
if isinstance(metric, tuple):
parse_metric, mlm_metric = metric
super().update_metric(arc_scores, rel_scores, arcs, rels, mask, puncts, parse_metric)
gold_input_ids = batch['gold_input_ids']
input_ids_mask = batch['input_ids_mask']
pred_input_ids = batch['pred_input_ids']
pred_input_ids = pred_input_ids[input_ids_mask]
gold_input_ids = gold_input_ids[input_ids_mask]
if len(pred_input_ids):
mlm_metric(pred_input_ids, gold_input_ids)
else:
super().update_metric(arc_scores, rel_scores, arcs, rels, mask, puncts, metric)
def _report(self, loss, metric):
if isinstance(metric, tuple):
parse_metric, mlm_metric = metric
return super()._report(loss, parse_metric) + f' {mlm_metric}'
else:
return super()._report(loss, metric)
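# Illustrative training sketch (added for exposition; paths are hypothetical,
# any CoNLL-style dependency treebank works):
#   parser = StructuralAttentionParser()
#   parser.fit('data/dep/train.conllx', 'data/dep/dev.conllx', 'save/sa_dep',
#              transformer='bert-base-chinese', mask_prob=0.15)
# During training, feed_batch replaces ~15% of prefix subwords with [MASK]
# and compute_loss adds the masked-token cross-entropy to the parsing loss.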
================================================
FILE: hanlp/components/parsers/biaffine/variationalbilstm.py
================================================
# MIT License
#
# Copyright (c) 2020 Yu Zhang
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import torch
import torch.nn as nn
from torch.nn.modules.rnn import apply_permutation
from torch.nn.utils.rnn import PackedSequence, pack_padded_sequence, pad_packed_sequence
from hanlp.common.structure import ConfigTracker
from hanlp.layers.dropout import SharedDropout
class VariationalLSTM(nn.Module):
r"""
This is a variant of the vanilla bidirectional LSTM adopted by the Biaffine Parser,
differing only in its dropout strategy.
It drops nodes in the LSTM layers (input and recurrent connections)
and applies the same dropout mask at every recurrent timestep.
APIs are roughly the same as :class:`~torch.nn.LSTM` except that we only allow
:class:`~torch.nn.utils.rnn.PackedSequence` as input.
References:
- Timothy Dozat and Christopher D. Manning. 2017.
`Deep Biaffine Attention for Neural Dependency Parsing`_.
Args:
input_size (int):
The number of expected features in the input.
hidden_size (int):
The number of features in the hidden state `h`.
num_layers (int):
The number of recurrent layers. Default: 1.
bidirectional (bool):
If ``True``, becomes a bidirectional LSTM. Default: ``False``
dropout (float):
If non-zero, introduces a :class:`SharedDropout` layer on the outputs of each LSTM layer except the last layer.
Default: 0.
.. _Deep Biaffine Attention for Neural Dependency Parsing:
https://openreview.net/forum?id=Hk95PK9le
"""
def __init__(self, input_size, hidden_size, num_layers=1, bidirectional=False, dropout=0):
super().__init__()
self.input_size = input_size
self.hidden_size = hidden_size
self.num_layers = num_layers
self.bidirectional = bidirectional
self.dropout = dropout
self.num_directions = 1 + self.bidirectional
self.f_cells = nn.ModuleList()
if bidirectional:
self.b_cells = nn.ModuleList()
for _ in range(self.num_layers):
self.f_cells.append(nn.LSTMCell(input_size=input_size, hidden_size=hidden_size))
if bidirectional:
self.b_cells.append(nn.LSTMCell(input_size=input_size, hidden_size=hidden_size))
input_size = hidden_size * self.num_directions
self.reset_parameters()
def __repr__(self):
s = f"{self.input_size}, {self.hidden_size}"
if self.num_layers > 1:
s += f", num_layers={self.num_layers}"
if self.bidirectional:
s += f", bidirectional={self.bidirectional}"
if self.dropout > 0:
s += f", dropout={self.dropout}"
return f"{self.__class__.__name__}({s})"
def reset_parameters(self):
for param in self.parameters():
# apply orthogonal_ to weight
if len(param.shape) > 1:
nn.init.orthogonal_(param)
# apply zeros_ to bias
else:
nn.init.zeros_(param)
def permute_hidden(self, hx, permutation):
if permutation is None:
return hx
h = apply_permutation(hx[0], permutation)
c = apply_permutation(hx[1], permutation)
return h, c
def layer_forward(self, x, hx, cell, batch_sizes, reverse=False):
hx_0 = hx_i = hx
hx_n, output = [], []
steps = reversed(range(len(x))) if reverse else range(len(x))
if self.training:
hid_mask = SharedDropout.get_mask(hx_0[0], self.dropout)
for t in steps:
last_batch_size, batch_size = len(hx_i[0]), batch_sizes[t]
if last_batch_size < batch_size:
hx_i = [torch.cat((h, ih[last_batch_size:batch_size])) for h, ih in zip(hx_i, hx_0)]
else:
hx_n.append([h[batch_size:] for h in hx_i])
hx_i = [h[:batch_size] for h in hx_i]
hx_i = [h for h in cell(x[t], hx_i)]
output.append(hx_i[0])
if self.training:
hx_i[0] = hx_i[0] * hid_mask[:batch_size]
if reverse:
hx_n = hx_i
output.reverse()
else:
hx_n.append(hx_i)
hx_n = [torch.cat(h) for h in zip(*reversed(hx_n))]
output = torch.cat(output)
return output, hx_n
def forward(self, sequence, hx=None):
r"""
Args:
sequence (~torch.nn.utils.rnn.PackedSequence):
A packed variable length sequence.
hx (~torch.Tensor, ~torch.Tensor):
A tuple composed of two tensors `h` and `c`.
`h` of shape ``[num_layers*num_directions, batch_size, hidden_size]`` holds the initial hidden state
for each element in the batch.
`c` of shape ``[num_layers*num_directions, batch_size, hidden_size]`` holds the initial cell state
for each element in the batch.
If `hx` is not provided, both `h` and `c` default to zero.
Default: ``None``.
Returns:
~torch.nn.utils.rnn.PackedSequence, (~torch.Tensor, ~torch.Tensor):
The first is a packed variable length sequence.
The second is a tuple of tensors `h` and `c`.
`h` of shape ``[num_layers*num_directions, batch_size, hidden_size]`` holds the hidden state for `t=seq_len`.
Like output, the layers can be separated using ``h.view(num_layers, num_directions, batch_size, hidden_size)``
and similarly for c.
`c` of shape ``[num_layers*num_directions, batch_size, hidden_size]`` holds the cell state for `t=seq_len`.
"""
x, batch_sizes = sequence.data, sequence.batch_sizes.tolist()
batch_size = batch_sizes[0]
h_n, c_n = [], []
if hx is None:
ih = x.new_zeros(self.num_layers * self.num_directions, batch_size, self.hidden_size)
h, c = ih, ih
else:
h, c = self.permute_hidden(hx, sequence.sorted_indices)
h = h.view(self.num_layers, self.num_directions, batch_size, self.hidden_size)
c = c.view(self.num_layers, self.num_directions, batch_size, self.hidden_size)
for i in range(self.num_layers):
x = torch.split(x, batch_sizes)
if self.training:
mask = SharedDropout.get_mask(x[0], self.dropout)
x = [i * mask[:len(i)] for i in x]
x_i, (h_i, c_i) = self.layer_forward(x=x,
hx=(h[i, 0], c[i, 0]),
cell=self.f_cells[i],
batch_sizes=batch_sizes)
if self.bidirectional:
x_b, (h_b, c_b) = self.layer_forward(x=x,
hx=(h[i, 1], c[i, 1]),
cell=self.b_cells[i],
batch_sizes=batch_sizes,
reverse=True)
x_i = torch.cat((x_i, x_b), -1)
h_i = torch.stack((h_i, h_b))
c_i = torch.stack((c_i, c_b))
x = x_i
h_n.append(h_i)
c_n.append(c_i)  # collect the cell state here, not the hidden state
x = PackedSequence(x,
sequence.batch_sizes,
sequence.sorted_indices,
sequence.unsorted_indices)
hx = torch.cat(h_n, 0), torch.cat(c_n, 0)
hx = self.permute_hidden(hx, sequence.unsorted_indices)
return x, hx
class VariationalLSTMEncoder(VariationalLSTM, ConfigTracker):
def __init__(self,
input_size,
hidden_size,
num_layers=1,
bidirectional=False,
variational_dropout=0,
word_dropout=0,
):
super().__init__(input_size, hidden_size, num_layers, bidirectional, variational_dropout)
ConfigTracker.__init__(self, locals())
self.lstm_dropout = SharedDropout(p=word_dropout)
# noinspection PyMethodOverriding
def forward(self, embed, mask):
batch_size, seq_len = mask.shape
x = pack_padded_sequence(embed, mask.sum(1), True, False)
x, _ = super().forward(x)
x, _ = pad_packed_sequence(x, True, total_length=seq_len)
x = self.lstm_dropout(x)
return x
def get_output_dim(self):
return self.hidden_size * self.num_directions
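# Illustrative usage sketch (added for exposition; sizes are arbitrary):
def _demo_variational_lstm():
    lstm = VariationalLSTM(input_size=100, hidden_size=200, num_layers=3,
                           bidirectional=True, dropout=0.33)
    embed = torch.randn(2, 5, 100)    # [batch, seq_len, n_feats]
    lengths = torch.tensor([5, 3])    # must be in descending order here
    packed = pack_padded_sequence(embed, lengths, batch_first=True)
    out, (h, c) = lstm(packed)
    out, _ = pad_packed_sequence(out, batch_first=True)
    assert out.shape == (2, 5, 400)   # hidden_size * num_directions
    return out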
================================================
FILE: hanlp/components/parsers/biaffine_parser_tf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-22 12:47
import logging
import math
import os
from typing import List
import numpy as np
import tensorflow as tf
from hanlp.components.parsers.parse_alg import unique_root, adjust_root_score, chu_liu_edmonds
from hanlp.layers.transformers.loader_tf import build_transformer
from hanlp.common.keras_component import KerasComponent
from hanlp.components.parsers.alg_tf import tarjan
from hanlp.components.parsers.biaffine_tf.model import BiaffineModelTF
from hanlp.transform.conll_tf import CoNLL_DEP_Transform, CoNLL_Transformer_Transform, CoNLL_SDP_Transform
from hanlp.layers.embeddings.util_tf import build_embedding
from hanlp.layers.transformers.tf_imports import PreTrainedTokenizer, TFAutoModel, TFPreTrainedModel, AutoTokenizer, \
TFAutoModelWithLMHead, BertTokenizerFast, AlbertConfig, BertTokenizer, TFBertModel
from hanlp.layers.transformers.utils_tf import build_adamw_optimizer
from hanlp.metrics.parsing.labeled_f1_tf import LabeledF1TF
from hanlp.metrics.parsing.labeled_score import LabeledScore
from hanlp_common.util import merge_locals_kwargs
class BiaffineDependencyParserTF(KerasComponent):
def __init__(self, transform: CoNLL_DEP_Transform = None) -> None:
if not transform:
transform = CoNLL_DEP_Transform()
super().__init__(transform)
self.transform: CoNLL_DEP_Transform = transform
self.model: BiaffineModelTF = None
def build_model(self, pretrained_embed, n_embed, training, **kwargs) -> tf.keras.Model:
if training:
self.config.n_words = len(self.transform.form_vocab)
else:
self.config.lstm_dropout = 0.  # Keras uses the fast cuDNN LSTM kernel when lstm_dropout is 0
self.config.n_feats = len(self.transform.cpos_vocab)
self._init_config()
pretrained: tf.keras.layers.Embedding = build_embedding(pretrained_embed, self.transform.form_vocab,
self.transform) if pretrained_embed else None
if pretrained_embed:
self.config.n_embed = pretrained.output_dim
model = BiaffineModelTF(self.config, pretrained)
return model
def _init_config(self):
self.config.n_rels = len(self.transform.rel_vocab)
self.config.pad_index = self.transform.form_vocab.pad_idx
self.config.unk_index = self.transform.form_vocab.unk_idx
self.config.bos_index = 2
def load_weights(self, save_dir, filename='model.h5', functional=False, **kwargs):
super().load_weights(save_dir, filename)
if functional:
self.model = self.model.to_functional()
def fit(self, trn_data, dev_data, save_dir,
n_embed=100,
pretrained_embed=None,
embed_dropout=.33,
n_lstm_hidden=400,
n_lstm_layers=3,
lstm_dropout=.33,
n_mlp_arc=500,
n_mlp_rel=100,
mlp_dropout=.33,
optimizer='adam',
lr=2e-3,
mu=.9,
nu=.9,
epsilon=1e-12,
clip=5.0,
decay=.75,
decay_steps=5000,
patience=100,
arc_loss='sparse_categorical_crossentropy',
rel_loss='sparse_categorical_crossentropy',
metrics=('UAS', 'LAS'),
n_buckets=32,
batch_size=5000,
epochs=50000,
early_stopping_patience=100,
tree=False,
punct=False,
min_freq=2,
run_eagerly=False, logger=None, verbose=True,
**kwargs):
return super().fit(**merge_locals_kwargs(locals(), kwargs))
# noinspection PyMethodOverriding
def train_loop(self, trn_data, dev_data, epochs, num_examples,
train_steps_per_epoch, dev_steps, model, optimizer, loss, metrics,
callbacks, logger: logging.Logger, arc_loss, rel_loss,
**kwargs):
arc_loss, rel_loss = loss
# because we are customizing batching
train_steps_per_epoch = len(list(iter(trn_data)))
# progbar: tf.keras.callbacks.ProgbarLogger = callbacks[-1]
c: tf.keras.callbacks.Callback = None
metric = self._build_metrics()
for c in callbacks:
if not hasattr(c, 'params'):
c.params = dict()
c.params['epochs'] = epochs
c.params['trn_data'] = trn_data
c.params['metrics'] = ['loss'] + self.config.metrics
c.params['metrics'] = c.params['metrics'] + [f'val_{k}' for k in c.params['metrics']]
c.on_train_begin()
for epoch in range(epochs):
metric.reset_states()
for c in callbacks:
c.params['steps'] = train_steps_per_epoch
c.on_epoch_begin(epoch)
for idx, ((words, feats), (arcs, rels)) in enumerate(iter(trn_data)):
logs = {}
for c in callbacks:
c.on_batch_begin(idx, logs)
mask = tf.not_equal(words, self.config.pad_index) & tf.not_equal(words, self.config.bos_index)
loss, arc_scores, rel_scores = self.train_batch(words, feats, arcs, rels, mask,
optimizer, arc_loss, rel_loss)
self.run_metrics(arcs, rels, arc_scores, rel_scores, words, mask, metric)
logs['loss'] = loss
logs.update(metric.to_dict())
if epoch == epochs - 1:
self.model.stop_training = True
for c in callbacks:
c.on_batch_end(idx, logs)
# evaluate on dev
metric.reset_states()
logs = {}
for idx, ((words, feats), (arcs, rels)) in enumerate(iter(dev_data)):
arc_scores, rel_scores, loss, mask, arc_preds, rel_preds = self.evaluate_batch(words, feats, arcs, rels,
arc_loss, rel_loss,
metric)
logs['val_loss'] = loss
logs.update((f'val_{k}', v) for k, v in metric.to_dict().items())
for c in callbacks:
c.on_epoch_end(epoch, logs)
if getattr(self.model, 'stop_training', None):
break
for c in callbacks:
c.on_train_end()
def evaluate(self, input_path: str, save_dir=None, output=False, batch_size=None, logger: logging.Logger = None,
callbacks: List[tf.keras.callbacks.Callback] = None, warm_up=False, verbose=True, **kwargs):
if batch_size is None:
batch_size = self.config.batch_size
return super().evaluate(input_path, save_dir, output, batch_size, logger, callbacks, warm_up, verbose, **kwargs)
def evaluate_batch(self, words, feats, arcs, rels, arc_loss, rel_loss, metric):
mask = tf.not_equal(words, self.config.pad_index) & tf.not_equal(words, self.config.bos_index)
arc_scores, rel_scores = self.model((words, feats))
loss = self.get_loss(arc_scores, rel_scores, arcs, rels, mask, arc_loss, rel_loss)
arc_preds, rel_preds = self.run_metrics(arcs, rels, arc_scores, rel_scores, words, mask, metric)
return arc_scores, rel_scores, loss, mask, arc_preds, rel_preds
def _build_metrics(self):
if isinstance(self.config.metrics, tuple):
self.config.metrics = list(self.config.metrics)
if self.config.metrics == ['UAS', 'LAS']:
metric = LabeledScore()
else:
metric = LabeledF1TF()
return metric
def run_metrics(self, arcs, rels, arc_scores, rel_scores, words, mask, metric):
arc_preds, rel_preds = self.decode(arc_scores, rel_scores, mask)
# ignore all punctuation if not specified
if not self.config.punct:
mask &= tf.reduce_all(tf.not_equal(tf.expand_dims(words, axis=-1), self.transform.puncts), axis=-1)
metric(arc_preds, rel_preds, arcs, rels, mask)
return arc_preds, rel_preds
def train_batch(self, words, feats, arcs, rels, mask, optimizer, arc_loss, rel_loss):
with tf.GradientTape() as tape:
arc_scores, rel_scores = self.model((words, feats), training=True)
loss = self.get_loss(arc_scores, rel_scores, arcs, rels, mask, arc_loss, rel_loss)
grads = tape.gradient(loss, self.model.trainable_variables)
optimizer.apply_gradients(zip(grads, self.model.trainable_variables))
return loss, arc_scores, rel_scores
def get_loss(self, arc_scores, rel_scores, arcs, rels, mask, arc_loss, rel_loss):
arc_scores, arcs = arc_scores[mask], arcs[mask]
rel_scores, rels = rel_scores[mask], rels[mask]
rel_scores = tf.gather_nd(rel_scores, tf.stack([tf.range(len(arcs), dtype=tf.int64), arcs], axis=1))
arc_loss = arc_loss(arcs, arc_scores)
rel_loss = rel_loss(rels, rel_scores)
loss = arc_loss + rel_loss
return loss
def build_optimizer(self, optimizer='adam', lr=2e-3, mu=.9, nu=.9, epsilon=1e-12, clip=5.0, decay=.75,
decay_steps=5000, **kwargs):
if optimizer == 'adam':
scheduler = tf.keras.optimizers.schedules.ExponentialDecay(initial_learning_rate=lr,
decay_steps=decay_steps,
decay_rate=decay)
from hanlp.optimizers.adamw.optimization import AdamTF
optimizer = AdamTF(learning_rate=scheduler,
beta_1=mu,
beta_2=nu,
epsilon=epsilon,
clipnorm=clip)
return optimizer
return super().build_optimizer(optimizer, **kwargs)
# noinspection PyMethodOverriding
def build_loss(self, arc_loss, rel_loss, **kwargs):
if arc_loss == 'binary_crossentropy':
arc_loss = tf.losses.BinaryCrossentropy(from_logits=True)
else:
arc_loss = tf.keras.losses.SparseCategoricalCrossentropy(
from_logits=True) if arc_loss == 'sparse_categorical_crossentropy' else super().build_loss(arc_loss)
rel_loss = tf.keras.losses.SparseCategoricalCrossentropy(
from_logits=True) if rel_loss == 'sparse_categorical_crossentropy' else super().build_loss(rel_loss)
return arc_loss, rel_loss
@property
def sample_data(self):
return tf.constant([[2, 3, 4], [2, 5, 0]], dtype=tf.int64), tf.constant([[1, 2, 3], [4, 5, 0]], dtype=tf.int64)
def num_samples_in(self, dataset):
return sum(len(x[0][0]) for x in iter(dataset))
def build_train_dataset(self, trn_data, batch_size, num_examples):
trn_data = self.transform.file_to_dataset(trn_data, batch_size=batch_size,
shuffle=True,
repeat=None)
return trn_data
# noinspection PyMethodOverriding
def build_callbacks(self, save_dir, logger, metrics, **kwargs):
callbacks = super().build_callbacks(save_dir, logger, metrics=metrics, **kwargs)
if isinstance(metrics, tuple):
metrics = list(metrics)
callbacks.append(self.build_progbar(metrics))
params = {'verbose': 1, 'epochs': 1}
for c in callbacks:
c.set_params(params)
c.set_model(self.model)
return callbacks
def build_progbar(self, metrics, training=True):
return tf.keras.callbacks.ProgbarLogger(count_mode='steps',
stateful_metrics=metrics + [f'val_{k}' for k in metrics] if training
else [])
def decode(self, arc_scores, rel_scores, mask):
if self.config.tree:
root_rel_idx = self.transform.root_rel_idx
root_rel_onehot = np.eye(len(self.transform.rel_vocab))[root_rel_idx]
arc_preds = np.zeros_like(mask, dtype=np.int64)
rel_preds = np.zeros_like(mask, dtype=np.int64)
for arc, rel, m, arc_pred, rel_pred in zip(arc_scores, rel_scores, mask, arc_preds, rel_preds):
length = int(tf.math.count_nonzero(m)) + 1
arc = arc[:length, :length]
arc_probs = tf.nn.softmax(arc).numpy()
m = np.expand_dims(m.numpy()[:length], -1)
if self.config.tree == 'tarjan':
heads = tarjan(arc_probs, length, m)
elif self.config.tree == 'mst':
heads, head_probs, tokens = unique_root(arc_probs, m, length)
arc = arc.numpy()
adjust_root_score(arc, heads, root_rel_idx)
heads = chu_liu_edmonds(arc, length)
else:
raise ValueError(f'Unknown tree algorithm {self.config.tree}')
arc_pred[:length] = heads
root = np.where(heads[np.arange(1, length)] == 0)[0] + 1
rel_prob = tf.nn.softmax(rel[:length, :length, :]).numpy()
rel_prob = rel_prob[np.arange(length), heads]
rel_prob[root] = root_rel_onehot
rel_prob[np.arange(length) != root, np.arange(len(self.transform.rel_vocab)) == root_rel_idx] = 0
# rels = rel_argmax(rel_prob, length, root_rel_idx)
rels = np.argmax(rel_prob, axis=1)
rel_pred[:length] = rels
arc_preds = tf.constant(arc_preds)
rel_preds = tf.constant(rel_preds)
else:
arc_preds = tf.argmax(arc_scores, -1)
rel_preds = tf.argmax(rel_scores, -1)
rel_preds = tf.squeeze(tf.gather(rel_preds, tf.expand_dims(arc_preds, -1), batch_dims=2), axis=-1)
return arc_preds, rel_preds
def evaluate_dataset(self, tst_data, callbacks, output, num_batches, ret_scores=None, **kwargs):
if 'mask_p' in self.config:
self.config['mask_p'] = None
arc_loss, rel_loss = self.build_loss(**self.config)
callbacks = [self.build_progbar(self.config['metrics'])]
steps_per_epoch = len(list(iter(tst_data)))
metric = self._build_metrics()
params = {'verbose': 1, 'epochs': 1, 'metrics': ['loss'] + self.config.metrics, 'steps': steps_per_epoch}
for c in callbacks:
c.set_params(params)
c.on_test_begin()
c.on_epoch_end(0)
logs = {}
if ret_scores:
scores = []
if output:
ext = os.path.splitext(output)[-1]
output = open(output, 'w', encoding='utf-8')
for idx, ((words, feats), Y) in enumerate(iter(tst_data)):
arcs, rels = Y[0], Y[1]
for c in callbacks:
c.on_test_batch_begin(idx, logs)
arc_scores, rel_scores, loss, mask, arc_preds, rel_preds = self.evaluate_batch(words, feats, arcs, rels,
arc_loss, rel_loss, metric)
if ret_scores:
scores.append((arc_scores.numpy(), rel_scores.numpy(), mask.numpy()))
if output:
for sent in self.transform.XY_to_inputs_outputs((words, feats, mask), (arc_preds, rel_preds),
conll=ext, arc_scores=arc_scores,
rel_scores=rel_scores):
output.write(str(sent))
output.write('\n\n')
logs['loss'] = loss
logs.update(metric.to_dict())
for c in callbacks:
c.on_test_batch_end(idx, logs)
for c in callbacks:
c.on_epoch_end(0)
c.on_test_end()
if output:
output.close()
loss = float(c.progbar._values['loss'][0] / c.progbar._values['loss'][1])
outputs = loss, metric.to_dict(), False
if ret_scores:
outputs += (scores,)
return outputs
def predict_batch(self, batch, inputs=None, conll=True, **kwargs):
((words, feats), (arcs, rels)) = batch
mask = tf.not_equal(words, self.config.pad_index) & tf.not_equal(words, self.config.bos_index)
arc_scores, rel_scores = self.model((words, feats))
arc_preds, rel_preds = self.decode(arc_scores, rel_scores, mask)
for sent in self.transform.XY_to_inputs_outputs((words, feats, mask), (arc_preds, rel_preds), gold=False,
inputs=inputs, conll=conll):
yield sent
def compile_model(self, optimizer, loss, metrics):
super().compile_model(optimizer, loss, metrics)
class BiaffineSemanticDependencyParserTF(BiaffineDependencyParserTF):
def __init__(self, transform: CoNLL_SDP_Transform = None) -> None:
if not transform:
transform = CoNLL_SDP_Transform()
# noinspection PyTypeChecker
super().__init__(transform)
self.transform: CoNLL_SDP_Transform = transform
def fit(self, trn_data, dev_data, save_dir, n_embed=100, pretrained_embed=None, embed_dropout=.33,
n_lstm_hidden=400, n_lstm_layers=3, lstm_dropout=.33, n_mlp_arc=500, n_mlp_rel=100, mlp_dropout=.33,
optimizer='adam', lr=2e-3, mu=.9, nu=.9, epsilon=1e-12, clip=5.0, decay=.75, decay_steps=5000, patience=100,
arc_loss='binary_crossentropy', rel_loss='sparse_categorical_crossentropy',
metrics=('UF', 'LF'), n_buckets=32, batch_size=5000, epochs=50000, early_stopping_patience=100,
tree=False, punct=False, min_freq=2, run_eagerly=False, logger=None, verbose=True, **kwargs):
return super().fit(trn_data, dev_data, save_dir, n_embed, pretrained_embed, embed_dropout, n_lstm_hidden,
n_lstm_layers, lstm_dropout, n_mlp_arc, n_mlp_rel, mlp_dropout, optimizer, lr, mu, nu,
epsilon, clip, decay, decay_steps, patience, arc_loss, rel_loss, metrics, n_buckets,
batch_size, epochs, early_stopping_patience, tree, punct, min_freq, run_eagerly, logger,
verbose, **kwargs)
def get_loss(self, arc_scores, rel_scores, arcs, rels, mask, arc_loss, rel_loss):
mask = tf.tile(tf.expand_dims(mask, -1), [1, 1, tf.shape(mask)[-1]])
mask &= tf.transpose(mask, [0, 2, 1])
arc_scores, arcs = arc_scores[mask], arcs[mask]
rel_scores, rels = rel_scores[mask], rels[mask]
rel_scores, rels = rel_scores[arcs], rels[arcs]
arc_loss = arc_loss(arcs, arc_scores)
rel_loss = rel_loss(rels, rel_scores)
loss = arc_loss + rel_loss
return loss
def decode(self, arc_scores, rel_scores, mask):
arc_preds = arc_scores > 0
rel_preds = tf.argmax(rel_scores, -1)
return arc_preds, rel_preds
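# Reader's note (added for exposition): unlike the tree parser above, semantic
# dependency decoding makes an independent binary decision per (dependent,
# head) pair, keeping every arc whose score is positive, so the output is a
# graph in which a token may end up with zero or several heads.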
class BiaffineTransformerDependencyParserTF(BiaffineDependencyParserTF, tf.keras.callbacks.Callback):
def __init__(self, transform: CoNLL_Transformer_Transform = None) -> None:
if not transform:
transform = CoNLL_Transformer_Transform()
super().__init__(transform)
self.transform: CoNLL_Transformer_Transform = transform
def build_model(self, transformer, training, **kwargs) -> tf.keras.Model:
transformer = self.build_transformer(training, transformer)
model = BiaffineModelTF(self.config, transformer=transformer)
return model
def build_transformer(self, training, transformer):
if training:
self.config.n_words = len(self.transform.form_vocab)
self._init_config()
if isinstance(transformer, str):
if 'albert_chinese' in transformer:
tokenizer = BertTokenizerFast.from_pretrained(transformer, add_special_tokens=False)
transformer: TFPreTrainedModel = TFAutoModel.from_pretrained(transformer, name=transformer,
from_pt=True)
elif transformer.startswith('albert') and transformer.endswith('zh'):
transformer, tokenizer, path = build_transformer(transformer)
transformer.config = AlbertConfig.from_json_file(os.path.join(path, "albert_config.json"))
tokenizer = BertTokenizer.from_pretrained(os.path.join(path, "vocab_chinese.txt"),
add_special_tokens=False)
elif 'chinese-roberta' in transformer:
tokenizer = BertTokenizer.from_pretrained(transformer)
transformer = TFBertModel.from_pretrained(transformer, name=transformer, from_pt=True)
else:
tokenizer: PreTrainedTokenizer = AutoTokenizer.from_pretrained(transformer)
try:
transformer: TFPreTrainedModel = TFAutoModel.from_pretrained(transformer, name=transformer)
except (TypeError, OSError):
transformer: TFPreTrainedModel = TFAutoModel.from_pretrained(transformer, name=transformer,
from_pt=True)
elif transformer[0] == 'AutoModelWithLMHead':
tokenizer: PreTrainedTokenizer = AutoTokenizer.from_pretrained(transformer[1])
transformer: TFAutoModelWithLMHead = TFAutoModelWithLMHead.from_pretrained(transformer[1])
else:
raise ValueError(f'Unknown identifier {transformer}')
self.transform.tokenizer = tokenizer
if self.config.get('fp16', None) or self.config.get('use_amp', None):
policy = tf.keras.mixed_precision.experimental.Policy('mixed_float16')
tf.keras.mixed_precision.experimental.set_policy(policy)
# tf.config.optimizer.set_experimental_options({"auto_mixed_precision": True})
transformer.set_weights([w.astype('float16') for w in transformer.get_weights()])
self.transform.transformer_config = transformer.config
return transformer
# noinspection PyMethodOverriding
def fit(self, trn_data, dev_data, save_dir, transformer, max_seq_length=256, transformer_dropout=.33,
d_positional=None,
n_mlp_arc=500, n_mlp_rel=100, mlp_dropout=.33,
optimizer='adamw',
learning_rate=5e-5,
learning_rate_transformer=None,
weight_decay_rate=0,
epsilon=1e-8,
clipnorm=None,
fp16=False,
warmup_steps_ratio=0,
arc_loss='sparse_categorical_crossentropy', rel_loss='sparse_categorical_crossentropy',
metrics=('UAS', 'LAS'),
batch_size=3000,
samples_per_batch=150,
max_samples_per_batch=None,
epochs=100,
tree=False, punct=False, token_mapping=None, run_eagerly=False, logger=None, verbose=True, **kwargs):
self.set_params({})
return KerasComponent.fit(self, **merge_locals_kwargs(locals(), kwargs))
@property
def sample_data(self):
dataset = self.transform.inputs_to_dataset(
[[('Hello', 'NN'), ('world', 'NN')], [('HanLP', 'NN'), ('is', 'NN'), ('good', 'NN')]] if self.config.get(
'use_pos', None) else
[['Hello', 'world'], ['HanLP', 'is', 'good']])
return next(iter(dataset))[0]
# noinspection PyMethodOverriding
def build_optimizer(self, optimizer, learning_rate, epsilon, weight_decay_rate, clipnorm, fp16, train_steps,
**kwargs):
if optimizer == 'adamw':
epochs = self.config['epochs']
learning_rate_transformer = kwargs.get('learning_rate_transformer', None)
train_steps = math.ceil(self.config.train_examples * epochs / self.config.samples_per_batch)
warmup_steps = math.ceil(train_steps * self.config['warmup_steps_ratio'])
if learning_rate_transformer is not None:
if learning_rate_transformer > 0:
self.params['optimizer_transformer'] = build_adamw_optimizer(self.config, learning_rate_transformer,
epsilon,
clipnorm, train_steps, fp16,
math.ceil(warmup_steps),
weight_decay_rate)
else:
self.model.transformer.trainable = False
return super().build_optimizer(lr=learning_rate) # use a normal adam for biaffine
else:
return build_adamw_optimizer(self.config, learning_rate, epsilon, clipnorm, train_steps, fp16,
math.ceil(warmup_steps), weight_decay_rate)
return super().build_optimizer(optimizer, **kwargs)
def build_vocab(self, trn_data, logger):
self.config.train_examples = train_examples = super().build_vocab(trn_data, logger)
return train_examples
def build_callbacks(self, save_dir, logger, metrics, **kwargs):
callbacks = super().build_callbacks(save_dir, logger, metrics=metrics, **kwargs)
callbacks.append(self)
if not self.params:
self.set_params({})
return callbacks
def on_train_begin(self):
self.params['accum_grads'] = [tf.Variable(tf.zeros_like(tv.read_value()), trainable=False) for tv in
self.model.trainable_variables]
self.params['trained_samples'] = 0
self.params['transformer_variable_names'] = {x.name for x in self.model.transformer.trainable_variables}
def train_batch(self, words, feats, arcs, rels, mask, optimizer, arc_loss, rel_loss):
with tf.GradientTape() as tape:
arc_scores, rel_scores = self.model((words, feats), training=True)
loss = self.get_loss(arc_scores, rel_scores, arcs, rels, mask, arc_loss, rel_loss)
grads = tape.gradient(loss, self.model.trainable_variables)
accum_grads = self.params['accum_grads']
for i, grad in enumerate(grads):
if grad is not None:
accum_grads[i].assign_add(grad)
self.params['trained_samples'] += tf.shape(words)[0]
if self.params['trained_samples'] >= self.config.samples_per_batch:
self._apply_grads(accum_grads)
return loss, arc_scores, rel_scores
def _apply_grads(self, accum_grads):
optimizer_transformer = self.params.get('optimizer_transformer', None)
if optimizer_transformer:
transformer = self.params['transformer_variable_names']
trainable_variables = self.model.trainable_variables
optimizer_transformer.apply_gradients(
(g, w) for g, w in zip(accum_grads, trainable_variables) if w.name in transformer)
self.model.optimizer.apply_gradients(
(g, w) for g, w in zip(accum_grads, trainable_variables) if w.name not in transformer)
else:
self.model.optimizer.apply_gradients(zip(accum_grads, self.model.trainable_variables))
for tv in accum_grads:
tv.assign(tf.zeros_like(tv))
# print('Apply grads after', self.params['trained_samples'], 'samples')
self.params['trained_samples'] = 0
def on_epoch_end(self, epoch, logs=None):
if self.params['trained_samples']:
self._apply_grads(self.params['accum_grads'])
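# Reader's note (added for exposition): train_batch above implements gradient
# accumulation. Gradients are summed into params['accum_grads'] over
# mini-batches and only applied, then zeroed, once `trained_samples` reaches
# config.samples_per_batch, emulating a larger effective batch size;
# on_epoch_end flushes whatever remains at the end of each epoch.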
class BiaffineTransformerSemanticDependencyParser(BiaffineTransformerDependencyParserTF):
def __init__(self, transform: CoNLL_Transformer_Transform = None) -> None:
if not transform:
transform = CoNLL_Transformer_Transform(graph=True)
super().__init__(transform)
def get_loss(self, arc_scores, rel_scores, arcs, rels, mask, arc_loss, rel_loss):
return BiaffineSemanticDependencyParserTF.get_loss(self, arc_scores, rel_scores, arcs, rels, mask, arc_loss,
rel_loss)
def fit(self, trn_data, dev_data, save_dir, transformer, max_seq_length=256, transformer_dropout=.33,
d_positional=None, n_mlp_arc=500, n_mlp_rel=100, mlp_dropout=.33, optimizer='adamw', learning_rate=5e-5,
learning_rate_transformer=None, weight_decay_rate=0, epsilon=1e-8, clipnorm=None, fp16=False,
warmup_steps_ratio=0, arc_loss='binary_crossentropy',
rel_loss='sparse_categorical_crossentropy', metrics=('UF', 'LF'), batch_size=3000, samples_per_batch=150,
max_samples_per_batch=None, epochs=100, tree=False, punct=False, token_mapping=None, enhanced_only=False,
run_eagerly=False,
logger=None, verbose=True, **kwargs):
return super().fit(**merge_locals_kwargs(locals(), kwargs))
def decode(self, arc_scores, rel_scores, mask):
return BiaffineSemanticDependencyParserTF.decode(self, arc_scores, rel_scores, mask)
================================================
FILE: hanlp/components/parsers/biaffine_tf/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-26 23:03
================================================
FILE: hanlp/components/parsers/biaffine_tf/alg.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-26 19:49
# Ported from the PyTorch implementation https://github.com/zysite/biaffine-parser
from typing import List
import numpy as np
import tensorflow as tf
from collections import defaultdict
def nonzero(t: tf.Tensor) -> tf.Tensor:
return tf.where(t > 0)
def view(t: tf.Tensor, *dims) -> tf.Tensor:
return tf.reshape(t, dims)
def arange(n: int) -> tf.Tensor:
return tf.range(n)
def randperm(n: int) -> tf.Tensor:
return tf.random.shuffle(arange(n))
def tolist(t: tf.Tensor) -> List:
if isinstance(t, tf.Tensor):
t = t.numpy()
return t.tolist()
def kmeans(x, k, seed=None):
"""See https://github.com/zysite/biaffine-parser/blob/master/parser/utils/alg.py#L7
Args:
x(list): lengths of sentences
k(int): the number of clusters (length buckets)
seed: random seed for the initial centroid shuffle (Default value = None)
Returns:
a ``(centroids, clusters)`` pair, where ``clusters[i]`` holds the indices of the datapoints assigned to ``centroids[i]``
"""
x = tf.constant(x, dtype=tf.float32)
# count the frequency of each datapoint
d, indices, f = tf.unique_with_counts(x, tf.int32)
f = tf.cast(f, tf.float32)
# calculate the sum of the values of the same datapoints
total = d * f
# initialize k centroids randomly
c, old = tf.random.shuffle(d, seed)[:k], None
# assign labels to each datapoint based on centroids
dists = tf.abs(tf.expand_dims(d, -1) - c)
y = tf.argmin(dists, axis=-1, output_type=tf.int32)
dists = tf.gather_nd(dists, tf.transpose(tf.stack([tf.range(tf.shape(dists)[0], dtype=tf.int32), y])))
# make sure number of datapoints is greater than that of clusters
assert len(d) >= k, f"unable to assign {len(d)} datapoints to {k} clusters"
while old is None or not tf.reduce_all(c == old):
# if an empty cluster is encountered,
# choose the farthest datapoint from the biggest cluster
# and move it to the empty one
for i in range(k):
if not tf.reduce_any(y == i):
mask = tf.cast(y == tf.expand_dims(tf.range(k, dtype=tf.int32), -1), tf.float32)
lens = tf.reduce_sum(mask, axis=-1)
biggest = view(nonzero(mask[tf.argmax(lens)]), -1)
farthest = tf.argmax(tf.gather(dists, biggest))
tf.tensor_scatter_nd_update(y, tf.expand_dims(tf.expand_dims(biggest[farthest], -1), -1), [i])
mask = tf.cast(y == tf.expand_dims(tf.range(k, dtype=tf.int32), -1), tf.float32)
# update the centroids
c, old = tf.cast(tf.reduce_sum(total * mask, axis=-1), tf.float32) / tf.cast(tf.reduce_sum(f * mask, axis=-1),
tf.float32), c
# re-assign all datapoints to clusters
dists = tf.abs(tf.expand_dims(d, -1) - c)
y = tf.argmin(dists, axis=-1, output_type=tf.int32)
dists = tf.gather_nd(dists, tf.transpose(tf.stack([tf.range(tf.shape(dists)[0], dtype=tf.int32), y])))
# assign all datapoints to the new-generated clusters
# without considering the empty ones
y, (assigned, _) = tf.gather(y, indices), tf.unique(y)
# get the centroids of the assigned clusters
centroids = tf.gather(c, assigned).numpy().tolist()
# map all values of datapoints to buckets
clusters = [tf.squeeze(tf.where(y == i), axis=-1).numpy().tolist() for i in assigned]
return centroids, clusters
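# Illustrative usage sketch (added for exposition; the lengths are made up):
def _demo_kmeans():
    lengths = [3, 5, 7, 12, 13, 40, 41, 42]
    centroids, clusters = kmeans(lengths, k=3, seed=0)
    # clusters[i] holds the indices into `lengths` assigned to centroids[i],
    # so sentences of similar length land in the same bucket for batching.
    assert sum(len(c) for c in clusters) == len(lengths)
    return centroids, clusters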
# ***************************************************************
class Tarjan:
"""Computes Tarjan's algorithm for finding strongly connected components (cycles) of a graph"""
def __init__(self, prediction, tokens):
"""
Parameters
----------
prediction : numpy.ndarray
a predicted dependency tree where prediction[dep_idx] = head_idx
tokens : numpy.ndarray
the tokens we care about (i.e. exclude _GO, _EOS, and _PAD)
"""
self._edges = defaultdict(set)
self._vertices = set((0,))
for dep, head in enumerate(prediction[tokens]):
self._vertices.add(dep + 1)
self._edges[head].add(dep + 1)
self._indices = {}
self._lowlinks = {}
self._onstack = defaultdict(lambda: False)
self._SCCs = []
index = 0
stack = []
for v in self.vertices:
if v not in self.indices:
self.strongconnect(v, index, stack)
# =============================================================
def strongconnect(self, v, index, stack):
"""
Args:
v:
index:
stack:
Returns:
"""
self._indices[v] = index
self._lowlinks[v] = index
index += 1
stack.append(v)
self._onstack[v] = True
for w in self.edges[v]:
if w not in self.indices:
self.strongconnect(w, index, stack)
self._lowlinks[v] = min(self._lowlinks[v], self._lowlinks[w])
elif self._onstack[w]:
self._lowlinks[v] = min(self._lowlinks[v], self._indices[w])
if self._lowlinks[v] == self._indices[v]:
self._SCCs.append(set())
while stack[-1] != v:
w = stack.pop()
self._onstack[w] = False
self._SCCs[-1].add(w)
w = stack.pop()
self._onstack[w] = False
self._SCCs[-1].add(w)
return
# ======================
@property
def edges(self):
return self._edges
@property
def vertices(self):
return self._vertices
@property
def indices(self):
return self._indices
@property
def SCCs(self):
return self._SCCs
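# Illustrative usage sketch (added for exposition; the head array is a toy
# parse containing one cycle):
def _demo_tarjan_class():
    prediction = np.array([0, 2, 3, 2])  # prediction[dep] = head
    tokens = np.arange(1, 4)             # exclude the ROOT at index 0
    sccs = Tarjan(prediction, tokens).SCCs
    # The 2 -> 3 -> 2 cycle surfaces as the only SCC with more than one node.
    assert {2, 3} in [scc for scc in sccs if len(scc) > 1]
    return sccs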
def tarjan(parse_probs, length, tokens_to_keep, ensure_tree=True):
"""Adopted from Timothy Dozat https://github.com/tdozat/Parser/blob/master/lib/models/nn.py
Args:
parse_probs(NDArray): seq_len x seq_len, the probability of arcs
length(int): sentence length including ROOT
tokens_to_keep(NDArray): mask matrix
ensure_tree: (Default value = True)
Returns:
parse_preds(NDArray): the predicted head index of each token
"""
if ensure_tree:
I = np.eye(len(tokens_to_keep))
# block loops and pad heads
parse_probs = parse_probs * tokens_to_keep * (1 - I)
parse_preds = np.argmax(parse_probs, axis=1)
tokens = np.arange(1, length)
roots = np.where(parse_preds[tokens] == 0)[0] + 1
# ensure at least one root
if len(roots) < 1:
# The current root probabilities
root_probs = parse_probs[tokens, 0]
# The current head probabilities
old_head_probs = parse_probs[tokens, parse_preds[tokens]]
# Get new potential root probabilities
new_root_probs = root_probs / old_head_probs
# Select the most probable root
new_root = tokens[np.argmax(new_root_probs)]
# Make the change
parse_preds[new_root] = 0
# ensure at most one root
elif len(roots) > 1:
# The probabilities of the current heads
root_probs = parse_probs[roots, 0]
# Set the probability of depending on the root zero
parse_probs[roots, 0] = 0
# Get new potential heads and their probabilities
new_heads = np.argmax(parse_probs[roots][:, tokens], axis=1) + 1
new_head_probs = parse_probs[roots, new_heads] / root_probs
# Select the most probable root
new_root = roots[np.argmin(new_head_probs)]
# Make the change
parse_preds[roots] = new_heads
parse_preds[new_root] = 0
# remove cycles
tarjan = Tarjan(parse_preds, tokens)
for SCC in tarjan.SCCs:
if len(SCC) > 1:
dependents = set()
to_visit = set(SCC)
while len(to_visit) > 0:
node = to_visit.pop()
if node not in dependents:
dependents.add(node)
to_visit.update(tarjan.edges[node])
# The indices of the nodes that participate in the cycle
cycle = np.array(list(SCC))
# The probabilities of the current heads
old_heads = parse_preds[cycle]
old_head_probs = parse_probs[cycle, old_heads]
# Set the probability of depending on a non-head to zero
non_heads = np.array(list(dependents))
parse_probs[np.repeat(cycle, len(non_heads)), np.repeat([non_heads], len(cycle), axis=0).flatten()] = 0
# Get new potential heads and their probabilities
new_heads = np.argmax(parse_probs[cycle][:, tokens], axis=1) + 1
new_head_probs = parse_probs[cycle, new_heads] / old_head_probs
# Select the most probable change
change = np.argmax(new_head_probs)
changed_cycle = cycle[change]
old_head = old_heads[change]
new_head = new_heads[change]
# Make the change
parse_preds[changed_cycle] = new_head
tarjan.edges[new_head].add(changed_cycle)
tarjan.edges[old_head].remove(changed_cycle)
return parse_preds
else:
# block and pad heads
parse_probs = parse_probs * tokens_to_keep
parse_preds = np.argmax(parse_probs, axis=1)
return parse_preds
def rel_argmax(rel_probs, length, root, ensure_tree=True):
"""Fix the relation prediction by heuristic rules
Args:
rel_probs(NDArray): seq_len x rel_size
length: real sentence length
ensure_tree: (Default value = True)
root: index of the root relation in the label vocabulary
Returns:
rel_preds(NDArray): the fixed relation predictions
"""
if ensure_tree:
tokens = np.arange(1, length)
rel_preds = np.argmax(rel_probs, axis=1)
roots = np.where(rel_preds[tokens] == root)[0] + 1
if len(roots) < 1:
rel_preds[1 + np.argmax(rel_probs[tokens, root])] = root
elif len(roots) > 1:
root_probs = rel_probs[roots, root]
rel_probs[roots, root] = 0
new_rel_preds = np.argmax(rel_probs[roots], axis=1)
new_rel_probs = rel_probs[roots, new_rel_preds] / root_probs
new_root = roots[np.argmin(new_rel_probs)]
rel_preds[roots] = new_rel_preds
rel_preds[new_root] = root
return rel_preds
else:
rel_preds = np.argmax(rel_probs, axis=1)
return rel_preds
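# Reader's note (added for exposition): rel_argmax enforces that exactly one
# token carries the root relation. If none does, the token most likely to be
# the root is promoted; if several do, the one whose root probability most
# dominates its best alternative keeps the root label while the rest are
# relabeled with their best non-root relation.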
================================================
FILE: hanlp/components/parsers/biaffine_tf/layers.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-26 23:05
# Ported from the PyTorch implementation https://github.com/zysite/biaffine-parser
import tensorflow as tf
from hanlp.utils.tf_util import tf_bernoulli
class Biaffine(tf.keras.layers.Layer):
def __init__(self, n_in, n_out=1, bias_x=True, bias_y=True, trainable=True, name=None, dtype=None, dynamic=False,
**kwargs):
super().__init__(trainable, name, dtype, dynamic, **kwargs)
self.n_in = n_in
self.n_out = n_out
self.bias_x = bias_x
self.bias_y = bias_y
self.weight = None
def build(self, input_shape):
self.weight = self.add_weight(name='kernel',
shape=(self.n_out,
self.n_in + self.bias_x,
self.n_in + self.bias_y),
initializer='zero')
def extra_repr(self):
s = f"n_in={self.n_in}, n_out={self.n_out}"
if self.bias_x:
s += f", bias_x={self.bias_x}"
if self.bias_y:
s += f", bias_y={self.bias_y}"
return s
# noinspection PyMethodOverriding
def call(self, x, y, **kwargs):
if self.bias_x:
x = tf.concat((x, tf.ones_like(x[..., :1])), -1)
if self.bias_y:
y = tf.concat((y, tf.ones_like(y[..., :1])), -1)
# [batch_size, n_out, seq_len, seq_len]
s = tf.einsum('bxi,oij,byj->boxy', x, self.weight, y)
# remove dim 1 if n_out == 1
if self.n_out == 1:
s = tf.squeeze(s, axis=1)
return s
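# Illustrative shape sketch (added for exposition; sizes are arbitrary):
#   biaffine = Biaffine(n_in=4)          # n_out defaults to 1
#   x = tf.random.normal([2, 5, 4])      # e.g. dependent representations
#   y = tf.random.normal([2, 5, 4])      # e.g. head representations
#   s = biaffine(x, y)                   # [2, 5, 5]; with n_out=r: [2, r, 5, 5]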
class MLP(tf.keras.layers.Layer):
def __init__(self, n_hidden, dropout=0, trainable=True, name=None, dtype=None, dynamic=False, **kwargs):
super().__init__(trainable, name, dtype, dynamic, **kwargs)
self.linear = tf.keras.layers.Dense(n_hidden, kernel_initializer='orthogonal')
self.activation = tf.keras.layers.LeakyReLU(0.1)
self.dropout = SharedDropout(p=dropout)
def call(self, x, **kwargs):
x = self.linear(x)
x = self.activation(x)
x = self.dropout(x)
return x
class SharedDropout(tf.keras.layers.Layer):
def __init__(self, p=0.5, batch_first=True, trainable=True, name=None, dtype=None, dynamic=False, **kwargs):
"""Dropout on timesteps with bernoulli distribution"""
super().__init__(trainable, name, dtype, dynamic, **kwargs)
self.p = p
self.batch_first = batch_first
def extra_repr(self):
s = f"p={self.p}"
if self.batch_first:
s += f", batch_first={self.batch_first}"
return s
def call(self, x, training=None, **kwargs):
if training and self.p > 0:
if self.batch_first:
mask = self.get_mask(x[:, 0], self.p)
else:
mask = self.get_mask(x[0], self.p)
x *= tf.expand_dims(mask, axis=1) if self.batch_first else mask
return x
@staticmethod
def get_mask(x, p):
mask = tf_bernoulli(tf.shape(x), 1 - p, x.dtype)
mask = mask / (1 - p)
return mask
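# Reader's note (added for exposition): SharedDropout samples one Bernoulli
# mask per sequence (from a single timestep) and broadcasts it across the
# whole time axis, so a dropped unit stays dropped for every timestep;
# dividing by (1 - p) keeps the expected activation unchanged.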
class IndependentDropout(tf.keras.layers.Layer):
def __init__(self, p=0.5, trainable=True, name=None, dtype=None, dynamic=False, **kwargs):
"""Dropout on the first two dimensions"""
super().__init__(trainable, name, dtype, dynamic, **kwargs)
self.p = p
def extra_repr(self):
return f"p={self.p}"
def call(self, inputs, training=None, **kwargs):
if training and self.p > 0:
masks = [tf_bernoulli(tf.shape(x)[:2], 1 - self.p)
for x in inputs]
total = sum(masks)
# rescale by how many inputs survive at each position; the elementwise
# maximum with ones guards against division by zero when all are dropped
scale = len(inputs) / tf.maximum(total, tf.ones_like(total))
masks = [mask * scale for mask in masks]
inputs = [item * tf.expand_dims(mask, axis=-1)
for item, mask in zip(inputs, masks)]
return inputs
================================================
FILE: hanlp/components/parsers/biaffine_tf/model.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-26 23:04
import tensorflow as tf
from hanlp.layers.transformers.tf_imports import TFPreTrainedModel
from hanlp.components.parsers.biaffine_tf.layers import IndependentDropout, SharedDropout, Biaffine, MLP
class BiaffineModelTF(tf.keras.Model):
def __init__(self, config, embed=None, transformer: TFPreTrainedModel = None):
"""An implementation of T. Dozat and C. D. Manning, “Deep Biaffine Attention for Neural Dependency Parsing.,” ICLR, 2017.
Although I have my MXNet implementation, I found zysite's PyTorch implementation is cleaner so I port it to TensorFlow
Args:
config: param embed:
Returns:
"""
super(BiaffineModelTF, self).__init__()
assert not (embed and transformer), 'Either a pre-trained word embedding or a transformer is supported, but not both'
normal = tf.keras.initializers.RandomNormal(stddev=1.)
if not transformer:
# the embedding layer
self.word_embed = tf.keras.layers.Embedding(input_dim=config.n_words,
output_dim=config.n_embed,
embeddings_initializer=tf.keras.initializers.zeros() if embed
else normal,
name='word_embed')
self.feat_embed = tf.keras.layers.Embedding(input_dim=config.n_feats,
output_dim=config.n_embed,
embeddings_initializer=tf.keras.initializers.zeros() if embed
else normal,
name='feat_embed')
self.embed_dropout = IndependentDropout(p=config.embed_dropout, name='embed_dropout')
# the word-lstm layer
self.lstm = tf.keras.models.Sequential(name='lstm')
for _ in range(config.n_lstm_layers):
self.lstm.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(
units=config.n_lstm_hidden,
dropout=config.lstm_dropout,
recurrent_dropout=config.lstm_dropout,
return_sequences=True,
kernel_initializer='orthogonal',
unit_forget_bias=False, # turns out to hinder performance
)))
self.lstm_dropout = SharedDropout(p=config.lstm_dropout, name='lstm_dropout')
else:
self.transformer = transformer
transformer_dropout = config.get('transformer_dropout', None)
if transformer_dropout:
self.transformer_dropout = SharedDropout(p=config.transformer_dropout, name='transformer_dropout')
d_positional = config.get('d_positional', None)
if d_positional:
max_seq_length = config.get('max_seq_length', 256)
self.position_table = self.add_weight(shape=(max_seq_length, d_positional),
initializer='random_normal',
trainable=True)
# the MLP layers
self.mlp_arc_h = MLP(n_hidden=config.n_mlp_arc,
dropout=config.mlp_dropout, name='mlp_arc_h')
self.mlp_arc_d = MLP(n_hidden=config.n_mlp_arc,
dropout=config.mlp_dropout, name='mlp_arc_d')
self.mlp_rel_h = MLP(n_hidden=config.n_mlp_rel,
dropout=config.mlp_dropout, name='mlp_rel_h')
self.mlp_rel_d = MLP(n_hidden=config.n_mlp_rel,
dropout=config.mlp_dropout, name='mlp_rel_d')
# the Biaffine layers
self.arc_attn = Biaffine(n_in=config.n_mlp_arc,
bias_x=True,
bias_y=False, name='arc_attn')
self.rel_attn = Biaffine(n_in=config.n_mlp_rel,
n_out=config.n_rels,
bias_x=True,
bias_y=True, name='rel_attn')
if embed is not None:
self.pretrained = embed
self.pad_index = tf.constant(config.pad_index, dtype=tf.int64)
self.unk_index = tf.constant(config.unk_index, dtype=tf.int64)
# noinspection PyMethodOverriding
def call(self, inputs, mask_inf=True, **kwargs):
# batch_size, seq_len = words.shape
# get the mask and lengths of given batch
# mask = words.ne(self.pad_index)
if hasattr(self, 'lstm'):
words, feats = inputs
mask = tf.not_equal(words, self.pad_index)
# set the indices larger than num_embeddings to unk_index
# ext_mask = words.ge(self.word_embed.num_embeddings)
ext_mask = tf.greater_equal(words, self.word_embed.input_dim)
ext_words = tf.where(ext_mask, self.unk_index, words)
# get outputs from embedding layers
word_embed = self.word_embed(ext_words)
if hasattr(self, 'pretrained'):
word_embed += self.pretrained(words)
feat_embed = self.feat_embed(feats)
word_embed, feat_embed = self.embed_dropout([word_embed, feat_embed])
# concatenate the word and feat representations
embed = tf.concat((word_embed, feat_embed), axis=-1)
x = self.lstm(embed, mask=mask)
x = self.lstm_dropout(x)
else:
words, (input_ids, input_mask, prefix_offset) = inputs
mask = tf.not_equal(words, self.pad_index)
x = self.run_transformer(input_ids, input_mask, prefix_offset)
# apply MLPs to the BiLSTM output states
arc_h = self.mlp_arc_h(x)
arc_d = self.mlp_arc_d(x)
rel_h = self.mlp_rel_h(x)
rel_d = self.mlp_rel_d(x)
# get arc and rel scores from the bilinear attention
# [batch_size, seq_len, seq_len]
s_arc = self.arc_attn(arc_d, arc_h)
# [batch_size, seq_len, seq_len, n_rels]
s_rel = tf.transpose(self.rel_attn(rel_d, rel_h), [0, 2, 3, 1])
# set the scores that exceed the length of each sentence to -inf
if mask_inf:
s_arc = tf.where(tf.expand_dims(mask, 1), s_arc, float('-inf'))
return s_arc, s_rel
def run_transformer(self, input_ids, input_mask, prefix_offset):
if isinstance(self.transformer, TFPreTrainedModel):
sequence_output = self.transformer([input_ids, input_mask])
sequence_output = sequence_output[0]
else:
sequence_output = self.transformer([input_ids, tf.zeros_like(input_ids)], mask=input_mask)
x = tf.gather(sequence_output, prefix_offset, batch_dims=1)
if hasattr(self, 'transformer_dropout'):
x = self.transformer_dropout(x)
if hasattr(self, 'position_table'):
batch_size, seq_length = tf.shape(x)[:2]
timing_signal = tf.broadcast_to(self.position_table[:seq_length],
[batch_size, seq_length, self.position_table.shape[-1]])
x = tf.concat([x, timing_signal], axis=-1)
return x
def to_functional(self):
words = tf.keras.Input(shape=[None], dtype=tf.int64, name='words')
feats = tf.keras.Input(shape=[None], dtype=tf.int64, name='feats')
s_arc, s_rel = self.call([words, feats], mask_inf=False)
return tf.keras.Model(inputs=[words, feats], outputs=[s_arc, s_rel])
================================================
FILE: hanlp/components/parsers/chu_liu_edmonds.py
================================================
# Adapted from https://github.com/allenai/allennlp under Apache License 2.0.
# Changed the packaging.
from typing import List, Set, Tuple, Dict
import numpy
def decode_mst(
energy: numpy.ndarray, length: int, has_labels: bool = True
) -> Tuple[numpy.ndarray, numpy.ndarray]:
"""Note: Counter to typical intuition, this function decodes the _maximum_
spanning tree.
Decode the optimal MST tree with the Chu-Liu-Edmonds algorithm for
maximum spanning arborescences on graphs.
Adapted from https://github.com/allenai/allennlp/blob/master/allennlp/nn/chu_liu_edmonds.py
which is licensed under the Apache License 2.0
# Parameters
energy : `numpy.ndarray`, required.
A tensor with shape (num_labels, timesteps, timesteps)
containing the energy of each edge. If has_labels is `False`,
the tensor should have shape (timesteps, timesteps) instead.
length : `int`, required.
The length of this sequence, as the energy may have come
from a padded batch.
has_labels : `bool`, optional, (default = True)
Whether the graph has labels or not.
Returns:
A ``(heads, head_type)`` pair where ``heads[child] = parent`` and ``head_type`` holds the best label id per edge (``None`` when ``has_labels`` is ``False``).
"""
if has_labels and energy.ndim != 3:
raise ValueError("The dimension of the energy array is not equal to 3.")
elif not has_labels and energy.ndim != 2:
raise ValueError("The dimension of the energy array is not equal to 2.")
input_shape = energy.shape
max_length = input_shape[-1]
# Our energy matrix might have been batched -
# here we clip it to contain only non padded tokens.
if has_labels:
energy = energy[:, :length, :length]
# get best label for each edge.
label_id_matrix = energy.argmax(axis=0)
energy = energy.max(axis=0)
else:
energy = energy[:length, :length]
label_id_matrix = None
# get original score matrix
original_score_matrix = energy
# initialize score matrix to original score matrix
score_matrix = numpy.array(original_score_matrix, copy=True)
old_input = numpy.zeros([length, length], dtype=numpy.int32)
old_output = numpy.zeros([length, length], dtype=numpy.int32)
current_nodes = [True for _ in range(length)]
representatives: List[Set[int]] = []
for node1 in range(length):
original_score_matrix[node1, node1] = 0.0
score_matrix[node1, node1] = 0.0
representatives.append({node1})
for node2 in range(node1 + 1, length):
old_input[node1, node2] = node1
old_output[node1, node2] = node2
old_input[node2, node1] = node2
old_output[node2, node1] = node1
final_edges: Dict[int, int] = {}
# The main algorithm operates inplace.
chu_liu_edmonds(
length, score_matrix, current_nodes, final_edges, old_input, old_output, representatives
)
heads = numpy.zeros([max_length], numpy.int32)
if has_labels:
head_type = numpy.ones([max_length], numpy.int32)
else:
head_type = None
for child, parent in final_edges.items():
heads[child] = parent
if has_labels:
head_type[child] = label_id_matrix[parent, child]
return heads, head_type
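# Illustrative usage sketch (added for exposition; the scores are made up and
# follow the energy[head, dependent] convention used by chu_liu_edmonds below):
def _demo_decode_mst():
    energy = numpy.array([[0., 9., 1., 1.],
                          [0., 0., 8., 1.],
                          [0., 1., 0., 7.],
                          [0., 1., 1., 0.]])
    heads, _ = decode_mst(energy, length=4, has_labels=False)
    # heads[d] is the predicted head of node d; node 0 is the artificial root.
    assert heads.tolist()[1:] == [0, 1, 2]
    return heads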
def chu_liu_edmonds(
length: int,
score_matrix: numpy.ndarray,
current_nodes: List[bool],
final_edges: Dict[int, int],
old_input: numpy.ndarray,
old_output: numpy.ndarray,
representatives: List[Set[int]],
):
"""Applies the chu-liu-edmonds algorithm recursively
to a graph with edge weights defined by score_matrix.
Note that this function operates in place, so variables
will be modified.
# Parameters
length : `int`, required.
The number of nodes.
score_matrix : `numpy.ndarray`, required.
The score matrix representing the scores for pairs
of nodes.
current_nodes : `List[bool]`, required.
The nodes which are representatives in the graph.
A representative at its most basic represents a node,
but as the algorithm progresses, individual nodes will
represent collapsed cycles in the graph.
final_edges : `Dict[int, int]`, required.
An empty dictionary which will be populated with the
nodes which are connected in the maximum spanning tree.
old_input : `numpy.ndarray`, required.
old_output : `numpy.ndarray`, required.
representatives : `List[Set[int]]`, required.
A list containing the nodes that a particular node
is representing at this iteration in the graph.
# Returns
Nothing - all variables are modified in place.
Args:
length: int:
score_matrix: numpy.ndarray:
current_nodes: List[bool]:
final_edges: Dict[int:
int]:
old_input: numpy.ndarray:
old_output: numpy.ndarray:
representatives: List[Set[int]]:
Returns:
"""
# Set the initial graph to be the greedy best one.
parents = [-1]
for node1 in range(1, length):
parents.append(0)
if current_nodes[node1]:
max_score = score_matrix[0, node1]
for node2 in range(1, length):
if node2 == node1 or not current_nodes[node2]:
continue
new_score = score_matrix[node2, node1]
if new_score > max_score:
max_score = new_score
parents[node1] = node2
# Check if this solution has a cycle.
has_cycle, cycle = _find_cycle(parents, length, current_nodes)
# If there are no cycles, find all edges and return.
if not has_cycle:
final_edges[0] = -1
for node in range(1, length):
if not current_nodes[node]:
continue
parent = old_input[parents[node], node]
child = old_output[parents[node], node]
final_edges[child] = parent
return
# Otherwise, we have a cycle so we need to remove an edge.
# From here until the recursive call is the contraction stage of the algorithm.
cycle_weight = 0.0
# Find the weight of the cycle.
for node in cycle:
cycle_weight += score_matrix[parents[node], node]
# For each node in the graph, find the maximum weight incoming
# and outgoing edge into the cycle.
cycle_representative = cycle[0]
for node in range(length):
if not current_nodes[node] or node in cycle:
continue
in_edge_weight = float("-inf")
in_edge = -1
out_edge_weight = float("-inf")
out_edge = -1
for node_in_cycle in cycle:
if score_matrix[node_in_cycle, node] > in_edge_weight:
in_edge_weight = score_matrix[node_in_cycle, node]
in_edge = node_in_cycle
# Add the new edge score to the cycle weight
# and subtract the edge we're considering removing.
score = (
cycle_weight
+ score_matrix[node, node_in_cycle]
- score_matrix[parents[node_in_cycle], node_in_cycle]
)
if score > out_edge_weight:
out_edge_weight = score
out_edge = node_in_cycle
score_matrix[cycle_representative, node] = in_edge_weight
old_input[cycle_representative, node] = old_input[in_edge, node]
old_output[cycle_representative, node] = old_output[in_edge, node]
score_matrix[node, cycle_representative] = out_edge_weight
old_output[node, cycle_representative] = old_output[node, out_edge]
old_input[node, cycle_representative] = old_input[node, out_edge]
# For the next recursive iteration, we want to consider the cycle as a
# single node. Here we collapse the cycle into the first node in the
# cycle (first node is arbitrary), set all the other nodes not be
# considered in the next iteration. We also keep track of which
# representatives we are considering this iteration because we need
# them below to check if we're done.
considered_representatives: List[Set[int]] = []
for i, node_in_cycle in enumerate(cycle):
considered_representatives.append(set())
if i > 0:
# We need to consider at least one
# node in the cycle, arbitrarily choose
# the first.
current_nodes[node_in_cycle] = False
for node in representatives[node_in_cycle]:
considered_representatives[i].add(node)
if i > 0:
representatives[cycle_representative].add(node)
chu_liu_edmonds(
length, score_matrix, current_nodes, final_edges, old_input, old_output, representatives
)
# Expansion stage.
# check each node in cycle, if one of its representatives
# is a key in the final_edges, it is the one we need.
found = False
key_node = -1
for i, node in enumerate(cycle):
for cycle_rep in considered_representatives[i]:
if cycle_rep in final_edges:
key_node = node
found = True
break
if found:
break
previous = parents[key_node]
while previous != key_node:
child = old_output[parents[previous], previous]
parent = old_input[parents[previous], previous]
final_edges[child] = parent
previous = parents[previous]
def _find_cycle(
parents: List[int], length: int, current_nodes: List[bool]
) -> Tuple[bool, List[int]]:
added = [False for _ in range(length)]
added[0] = True
cycle = set()
has_cycle = False
for i in range(1, length):
if has_cycle:
break
# don't redo nodes we've already
# visited or aren't considering.
if added[i] or not current_nodes[i]:
continue
# Initialize a new possible cycle.
this_cycle = set()
this_cycle.add(i)
added[i] = True
has_cycle = True
next_node = i
while parents[next_node] not in this_cycle:
next_node = parents[next_node]
# If we see a node we've already processed,
# we can stop, because the node we are
# processing would have been in that cycle.
if added[next_node]:
has_cycle = False
break
added[next_node] = True
this_cycle.add(next_node)
if has_cycle:
original = next_node
cycle.add(original)
next_node = parents[original]
while next_node != original:
cycle.add(next_node)
next_node = parents[next_node]
break
return has_cycle, list(cycle)
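# A minimal usage sketch for `decode_mst` (illustrative only; the sizes and
# random energies below are arbitrary assumptions, not repository fixtures):
if __name__ == '__main__':
energy = numpy.random.rand(3, 5, 5)  # (num_labels, timesteps, timesteps)
heads, labels = decode_mst(energy, length=5, has_labels=True)
print(heads)  # heads[child] = parent; the ROOT entry heads[0] is -1
print(labels)  # label id chosen for each selected edge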
================================================
FILE: hanlp/components/parsers/conll.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-26 15:37
from typing import Union
from hanlp.utils.io_util import get_resource, TimingFileIterator
from hanlp.utils.log_util import logger
def collapse_enhanced_empty_nodes(sent: list):
collapsed = []
for cells in sent:
if isinstance(cells[0], float):
id = cells[0]
head, deprel = cells[8].split(':', 1)
for x in sent:
arrows = [s.split(':', 1) for s in x[8].split('|')]
arrows = [(head, f'{head}:{deprel}>{r}') if h == str(id) else (h, r) for h, r in arrows]
arrows = sorted(arrows)
x[8] = '|'.join(f'{h}:{r}' for h, r in arrows)
sent[head][7] += f'>{cells[7]}'
else:
collapsed.append(cells)
return collapsed
def read_conll(filepath: Union[str, TimingFileIterator], underline_to_none=False, enhanced_collapse_empty_nodes=None):
# enhanced_collapse_empty_nodes defaults to None, meaning "auto": it is enabled below for .conllu files.
sent = []
if isinstance(filepath, str):
filepath: str = get_resource(filepath)
if filepath.endswith('.conllu') and enhanced_collapse_empty_nodes is None:
enhanced_collapse_empty_nodes = True
src = open(filepath, encoding='utf-8')
else:
src = filepath
for idx, line in enumerate(src):
if line.startswith('#'):
continue
line = line.strip()
cells = line.split('\t')
if line and cells:
if enhanced_collapse_empty_nodes and '.' in cells[0]:
cells[0] = float(cells[0])
cells[6] = None
else:
if '-' in cells[0] or '.' in cells[0]:
# sent[-1][1] += cells[1]
continue
cells[0] = int(cells[0])
if cells[6] != '_':
try:
cells[6] = int(cells[6])
except ValueError:
cells[6] = 0
logger.exception(f'Wrong CoNLL format {filepath}:{idx + 1}\n{line}')
if underline_to_none:
for i, x in enumerate(cells):
if x == '_':
cells[i] = None
sent.append(cells)
else:
if enhanced_collapse_empty_nodes:
sent = collapse_enhanced_empty_nodes(sent)
yield sent
sent = []
if sent:
if enhanced_collapse_empty_nodes:
sent = collapse_enhanced_empty_nodes(sent)
yield sent
src.close()
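# A minimal round-trip sketch (the two-token sentence is made up; assumes
# `get_resource` passes existing local paths through unchanged):
if __name__ == '__main__':
import os
import tempfile
with tempfile.NamedTemporaryFile('w', suffix='.conll', delete=False, encoding='utf-8') as f:
f.write('1\tHello\t_\t_\t_\t_\t2\tdiscourse\t_\t_\n')
f.write('2\tworld\t_\t_\t_\t_\t0\troot\t_\t_\n')
path = f.name
for sent in read_conll(path):
print(sent)  # ID and HEAD cells are converted to ints
os.remove(path)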
================================================
FILE: hanlp/components/parsers/constituency/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-11-28 19:26
================================================
FILE: hanlp/components/parsers/constituency/crf_constituency_model.py
================================================
# -*- coding:utf-8 -*-
# Adapted from https://github.com/yzhangcs/parser
# MIT License
#
# Copyright (c) 2020 Yu Zhang
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import torch
from torch import nn
from hanlp.components.parsers.constituency.treecrf import CRFConstituency
from hanlp.components.parsers.alg import cky
from hanlp.components.parsers.biaffine.biaffine import Biaffine
from hanlp.components.parsers.biaffine.mlp import MLP
class CRFConstituencyDecoder(nn.Module):
r"""
The implementation of CRF Constituency Parser,
also called FANCY (abbr. of Fast and Accurate Neural Crf constituencY) Parser.
References:
- Yu Zhang, Houquan Zhou and Zhenghua Li. 2020.
`Fast and Accurate Neural CRF Constituency Parsing`_.
Args:
n_labels (int):
The number of labels.
n_hidden (int):
The size of encoder hidden states. Default: 400.
n_mlp_span (int):
Span MLP size. Default: 500.
n_mlp_label (int):
Label MLP size. Default: 100.
mlp_dropout (float):
The dropout ratio of MLP layers. Default: .33.
**kwargs:
Extra arguments not used by this decoder, accepted for config compatibility.
.. _Fast and Accurate Neural CRF Constituency Parsing:
https://www.ijcai.org/Proceedings/2020/560/
"""
def __init__(self,
n_labels,
n_hidden=400,
n_mlp_span=500,
n_mlp_label=100,
mlp_dropout=.33,
**kwargs
):
super().__init__()
# the MLP layers
self.mlp_span_l = MLP(n_in=n_hidden, n_out=n_mlp_span, dropout=mlp_dropout)
self.mlp_span_r = MLP(n_in=n_hidden, n_out=n_mlp_span, dropout=mlp_dropout)
self.mlp_label_l = MLP(n_in=n_hidden, n_out=n_mlp_label, dropout=mlp_dropout)
self.mlp_label_r = MLP(n_in=n_hidden, n_out=n_mlp_label, dropout=mlp_dropout)
# the Biaffine layers
self.span_attn = Biaffine(n_in=n_mlp_span, bias_x=True, bias_y=False)
self.label_attn = Biaffine(n_in=n_mlp_label, n_out=n_labels, bias_x=True, bias_y=True)
self.crf = CRFConstituency()
self.criterion = nn.CrossEntropyLoss()
def forward(self, x, **kwargs):
r"""
Args:
x (~torch.FloatTensor): ``[batch_size, seq_len, hidden_dim]``.
Hidden states from encoder.
Returns:
~torch.Tensor, ~torch.Tensor:
The first tensor of shape ``[batch_size, seq_len, seq_len]`` holds scores of all possible spans.
The second of shape ``[batch_size, seq_len, seq_len, n_labels]`` holds
scores of all possible labels on each span.
"""
x_f, x_b = x.chunk(2, -1)
x = torch.cat((x_f[:, :-1], x_b[:, 1:]), -1)
# apply MLPs to the BiLSTM output states
span_l = self.mlp_span_l(x)
span_r = self.mlp_span_r(x)
label_l = self.mlp_label_l(x)
label_r = self.mlp_label_r(x)
# [batch_size, seq_len, seq_len]
s_span = self.span_attn(span_l, span_r)
# [batch_size, seq_len, seq_len, n_labels]
s_label = self.label_attn(label_l, label_r).permute(0, 2, 3, 1)
return s_span, s_label
def loss(self, s_span, s_label, charts, mask, mbr=True):
r"""
Args:
s_span (~torch.Tensor): ``[batch_size, seq_len, seq_len]``.
Scores of all spans
s_label (~torch.Tensor): ``[batch_size, seq_len, seq_len, n_labels]``.
Scores of all labels on each span.
charts (~torch.LongTensor): ``[batch_size, seq_len, seq_len]``.
The tensor of gold-standard labels, in which positions without labels are filled with -1.
mask (~torch.BoolTensor): ``[batch_size, seq_len, seq_len]``.
The mask for covering the unpadded tokens in each chart.
mbr (bool):
If ``True``, returns marginals for MBR decoding. Default: ``True``.
Returns:
~torch.Tensor, ~torch.Tensor:
The training loss and
original span scores of shape ``[batch_size, seq_len, seq_len]`` if ``mbr=False``, or marginals otherwise.
"""
span_mask = charts.ge(0) & mask
span_loss, span_probs = self.crf(s_span, mask, span_mask, mbr)
label_loss = self.criterion(s_label[span_mask], charts[span_mask])
loss = span_loss + label_loss
return loss, span_probs
def decode(self, s_span, s_label, mask):
r"""
Args:
s_span (~torch.Tensor): ``[batch_size, seq_len, seq_len]``.
Scores of all spans.
s_label (~torch.Tensor): ``[batch_size, seq_len, seq_len, n_labels]``.
Scores of all labels on each span.
mask (~torch.BoolTensor): ``[batch_size, seq_len, seq_len]``.
The mask for covering the unpadded tokens in each chart.
Returns:
list[list[tuple]]:
Sequences of factorized labeled trees traversed in pre-order.
"""
span_preds = cky(s_span, mask)
label_preds = s_label.argmax(-1).tolist()
return [[(i, j, labels[i][j]) for i, j in spans] for spans, labels in zip(span_preds, label_preds)]
class CRFConstituencyModel(nn.Module):
def __init__(self, encoder, decoder: CRFConstituencyDecoder) -> None:
super().__init__()
self.encoder = encoder
self.decoder = decoder
def forward(self, batch):
r"""
Args:
batch (~dict):
Batch of input data.
Returns:
~torch.Tensor, ~torch.Tensor:
The first tensor of shape ``[batch_size, seq_len, seq_len]`` holds scores of all possible spans.
The second of shape ``[batch_size, seq_len, seq_len, n_labels]`` holds
scores of all possible labels on each span.
"""
x = self.encoder(batch)
return self.decoder(x)
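# A shape sketch with random inputs (all sizes below are arbitrary
# assumptions; in HanLP a real encoder supplies `x`):
if __name__ == '__main__':
batch_size, seq_len, n_hidden, n_labels = 2, 6, 400, 10
decoder = CRFConstituencyDecoder(n_labels=n_labels, n_hidden=n_hidden)
x = torch.randn(batch_size, seq_len, n_hidden)
s_span, s_label = decoder(x)  # fences reduce the sequence length by 1
assert s_span.shape == (batch_size, seq_len - 1, seq_len - 1)
assert s_label.shape == (batch_size, seq_len - 1, seq_len - 1, n_labels)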
================================================
FILE: hanlp/components/parsers/constituency/crf_constituency_parser.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-11-28 21:24
import logging
from typing import Union, List
import torch
from phrasetree.tree import Tree
from torch.utils.data import DataLoader
from hanlp_common.constant import BOS, EOS, IDX
from hanlp.common.dataset import TransformableDataset, SamplerBuilder, PadSequenceDataLoader
from hanlp.common.structure import History
from hanlp.common.torch_component import TorchComponent
from hanlp.common.transform import FieldLength, TransformList
from hanlp.common.vocab import VocabWithNone
from hanlp.components.classifiers.transformer_classifier import TransformerComponent
from hanlp.datasets.parsing.loaders.constituency_dataset import ConstituencyDataset, unpack_tree_to_features, \
build_tree, factorize, remove_subcategory
from hanlp.components.parsers.constituency.crf_constituency_model import CRFConstituencyDecoder, CRFConstituencyModel
from hanlp.metrics.parsing.span import SpanMetric
from hanlp.utils.time_util import CountdownTimer
from hanlp.utils.torch_util import clip_grad_norm
from hanlp_common.util import merge_locals_kwargs, merge_dict, reorder
class CRFConstituencyParser(TorchComponent):
def __init__(self, **kwargs) -> None:
"""Two-stage CRF Parsing (:cite:`ijcai2020-560`).
Args:
**kwargs: Predefined config.
"""
super().__init__(**kwargs)
self.model: CRFConstituencyModel = self.model
def build_optimizer(self, trn, **kwargs):
# noinspection PyCallByClass,PyTypeChecker
return TransformerComponent.build_optimizer(self, trn, **kwargs)
def build_criterion(self, decoder=None, **kwargs):
return decoder
def build_metric(self, **kwargs):
return SpanMetric()
def execute_training_loop(self, trn: DataLoader, dev: DataLoader, epochs, criterion, optimizer, metric, save_dir,
logger: logging.Logger, devices, ratio_width=None, patience=0.5, eval_trn=True, **kwargs):
if isinstance(patience, float):
patience = int(patience * epochs)
best_epoch, best_metric = 0, -1
timer = CountdownTimer(epochs)
history = History()
for epoch in range(1, epochs + 1):
logger.info(f"[yellow]Epoch {epoch} / {epochs}:[/yellow]")
self.fit_dataloader(trn, criterion, optimizer, metric, logger, history=history, ratio_width=ratio_width,
eval_trn=eval_trn, **self.config)
loss, dev_metric = self.evaluate_dataloader(dev, criterion, logger=logger, ratio_width=ratio_width)
timer.update()
report = f"{timer.elapsed_human} / {timer.total_time_human} ETA: {timer.eta_human}"
if dev_metric > best_metric:
best_epoch, best_metric = epoch, dev_metric
self.save_weights(save_dir)
report += ' [red](saved)[/red]'
else:
report += f' ({epoch - best_epoch})'
if epoch - best_epoch >= patience:
report += ' early stop'
logger.info(report)
if epoch - best_epoch >= patience:
break
if not best_epoch:
self.save_weights(save_dir)
elif best_epoch != epoch:
self.load_weights(save_dir)
logger.info(f"Max score of dev is {best_metric} at epoch {best_epoch}")
logger.info(f"Average time of each epoch is {timer.elapsed_average_human}")
logger.info(f"{timer.elapsed_human} elapsed")
# noinspection PyMethodOverriding
def fit_dataloader(self,
trn: DataLoader,
criterion,
optimizer,
metric: SpanMetric,
logger: logging.Logger,
history: History,
gradient_accumulation=1,
grad_norm=None,
ratio_width=None,
eval_trn=True,
**kwargs):
optimizer, scheduler = optimizer
metric.reset()
self.model.train()
timer = CountdownTimer(history.num_training_steps(len(trn), gradient_accumulation=gradient_accumulation))
total_loss = 0
for idx, batch in enumerate(trn):
out, mask = self.feed_batch(batch)
y = batch['chart_id']
loss, span_probs = self.compute_loss(out, y, mask)
if gradient_accumulation and gradient_accumulation > 1:
loss /= gradient_accumulation
loss.backward()
total_loss += loss.item()
if eval_trn:
prediction = self.decode_output(out, mask, batch, span_probs)
self.update_metrics(metric, batch, prediction)
if history.step(gradient_accumulation):
self._step(optimizer, scheduler, grad_norm)
report = f'loss: {total_loss / (idx + 1):.4f} {metric}' if eval_trn \
else f'loss: {total_loss / (idx + 1):.4f}'
timer.log(report, logger=logger, ratio_percentage=False, ratio_width=ratio_width)
del loss
del out
del mask
def decode_output(self, out, mask, batch, span_probs=None, decoder=None, tokens=None):
s_span, s_label = out
if not decoder:
decoder = self.model.decoder
if mask.any().item():
if span_probs is None:
if self.config.mbr:
s_span = decoder.crf(s_span, mask, mbr=True)
else:
s_span = span_probs
chart_preds = decoder.decode(s_span, s_label, mask)
else:
chart_preds = [[]] * len(tokens)
idx_to_token = self.vocabs.chart.idx_to_token
if tokens is None:
tokens = batch.get('token_', None) # Use the original tokens if any
if tokens is None:
tokens = batch['token']
tokens = [x[1:-1] for x in tokens]
trees = [build_tree(token, [(i, j, idx_to_token[label]) for i, j, label in chart]) for token, chart in
zip(tokens, chart_preds)]
# probs = [prob[:i - 1, 1:i].cpu() for i, prob in zip(lens, s_span.unbind())]
return trees
def update_metrics(self, metric, batch, prediction):
# Add pre-terminals (pos tags) back to prediction for safe factorization (deletion based on pos)
for pred, gold in zip(prediction, batch['constituency']):
pred: Tree = pred
gold: Tree = gold
for p, g in zip(pred.subtrees(lambda t: t.height() == 2), gold.pos()):
token, pos = g
p: Tree = p
assert p.label() == '_'
p.set_label(pos)
metric([factorize(tree, self.config.delete, self.config.equal) for tree in prediction],
[factorize(tree, self.config.delete, self.config.equal) for tree in batch['constituency']])
return metric
def feed_batch(self, batch: dict):
mask = self.compute_mask(batch)
s_span, s_label = self.model(batch)
return (s_span, s_label), mask
def compute_mask(self, batch, offset=1):
lens = batch['token_length'] - offset
seq_len = lens.max()
mask = lens.new_tensor(range(seq_len)) < lens.view(-1, 1, 1)
mask = mask & mask.new_ones(seq_len, seq_len).triu_(1)
return mask
def compute_loss(self, out, y, mask, crf_decoder=None):
if not crf_decoder:
crf_decoder = self.model.decoder
loss, span_probs = crf_decoder.loss(out[0], out[1], y, mask, self.config.mbr)
if loss < 0:  # weird negative loss
loss *= 0
return loss, span_probs
def _step(self, optimizer, scheduler, grad_norm):
clip_grad_norm(self.model, grad_norm)
optimizer.step()
scheduler.step()
optimizer.zero_grad()
@torch.no_grad()
def evaluate_dataloader(self, data, criterion, logger=None, ratio_width=None, metric=None, output=None, **kwargs):
self.model.eval()
total_loss = 0
if not metric:
metric = self.build_metric()
else:
metric.reset()
timer = CountdownTimer(len(data))
for idx, batch in enumerate(data):
out, mask = self.feed_batch(batch)
y = batch['chart_id']
loss, span_probs = self.compute_loss(out, y, mask)
total_loss += loss.item()
prediction = self.decode_output(out, mask, batch, span_probs)
self.update_metrics(metric, batch, prediction)
timer.log(f'loss: {total_loss / (idx + 1):.4f} {metric}', ratio_percentage=False, logger=logger,
ratio_width=ratio_width)
total_loss /= len(data)
if output:
output.close()
return total_loss, metric
# noinspection PyMethodOverriding
def build_model(self, encoder, training=True, **kwargs) -> torch.nn.Module:
decoder = CRFConstituencyDecoder(n_labels=len(self.vocabs.chart), n_hidden=encoder.get_output_dim(), **kwargs)
encoder = encoder.module(vocabs=self.vocabs, training=training)
return CRFConstituencyModel(encoder, decoder)
def build_dataloader(self,
data,
batch_size,
sampler_builder: SamplerBuilder = None,
gradient_accumulation=1,
shuffle=False,
device=None,
logger: logging.Logger = None,
**kwargs) -> DataLoader:
if isinstance(data, TransformableDataset):
dataset = data
else:
transform = self.config.encoder.transform()
if self.config.get('transform', None):
transform = TransformList(self.config.transform, transform)
dataset = self.build_dataset(data, transform, logger)
if self.vocabs.mutable:
# noinspection PyTypeChecker
self.build_vocabs(dataset, logger)
lens = [len(x['token_input_ids']) for x in dataset]
if sampler_builder:
sampler = sampler_builder.build(lens, shuffle, gradient_accumulation)
else:
sampler = None
return PadSequenceDataLoader(dataset, batch_size, shuffle, device=device, batch_sampler=sampler)
def predict(self, data: Union[str, List[str]], **kwargs):
if not data:
return []
flat = self.input_is_flat(data)
if flat:
data = [data]
samples = self.build_samples(data)
dataloader = self.build_dataloader(samples, device=self.device, **kwargs)
outputs = []
orders = []
for idx, batch in enumerate(dataloader):
out, mask = self.feed_batch(batch)
prediction = self.decode_output(out, mask, batch, span_probs=None)
# prediction = [x[0] for x in prediction]
outputs.extend(prediction)
orders.extend(batch[IDX])
outputs = reorder(outputs, orders)
if flat:
return outputs[0]
return outputs
def input_is_flat(self, data):
return isinstance(data[0], str)
def build_samples(self, data):
return [{'token': [BOS] + token + [EOS]} for token in data]
# noinspection PyMethodOverriding
def fit(self,
trn_data,
dev_data,
save_dir,
encoder,
lr=5e-5,
transformer_lr=None,
adam_epsilon=1e-8,
weight_decay=0,
warmup_steps=0.1,
grad_norm=1.0,
n_mlp_span=500,
n_mlp_label=100,
mlp_dropout=.33,
batch_size=None,
batch_max_tokens=5000,
gradient_accumulation=1,
epochs=30,
patience=0.5,
mbr=True,
sampler_builder=None,
delete=('', ':', '``', "''", '.', '?', '!', '-NONE-', 'TOP', ',', 'S1'),
equal=(('ADVP', 'PRT'),),
no_subcategory=True,
eval_trn=True,
transform=None,
devices=None,
logger=None,
seed=None,
**kwargs):
if isinstance(equal, tuple):
equal = dict(equal)
return super().fit(**merge_locals_kwargs(locals(), kwargs))
def build_dataset(self, data, transform, logger=None):
_transform = [
unpack_tree_to_features,
self.vocabs,
FieldLength('token'),
transform
]
if self.config.get('no_subcategory', True):
_transform.insert(0, remove_subcategory)
dataset = ConstituencyDataset(data,
transform=_transform,
cache=isinstance(data, str))
return dataset
def build_vocabs(self, trn, logger, **kwargs):
self.vocabs.chart = VocabWithNone(pad_token=None, unk_token=None)
timer = CountdownTimer(len(trn))
max_seq_len = 0
for each in trn:
max_seq_len = max(max_seq_len, len(each['token_input_ids']))
timer.log(f'Building vocab [blink][yellow]...[/yellow][/blink] (longest sequence: {max_seq_len})')
self.vocabs.chart.set_unk_as_safe_unk()
self.vocabs.lock()
self.vocabs.summary(logger)
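# A hypothetical prediction sketch (the save directory below is a
# placeholder, not a published model; `load` restores config and weights):
#
#     parser = CRFConstituencyParser()
#     parser.load('path/to/a/trained/model')
#     tree = parser.predict(['HanLP', 'is', 'a', 'parser'], batch_size=8)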
================================================
FILE: hanlp/components/parsers/constituency/treecrf.py
================================================
# -*- coding:utf-8 -*-
# Adapted from https://github.com/yzhangcs/parser
# MIT License
#
# Copyright (c) 2020 Yu Zhang
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import torch
import torch.autograd as autograd
import torch.nn as nn
from hanlp.components.parsers.alg import stripe, istree, eisner, mst, eisner2o
class CRFConstituency(nn.Module):
r"""
TreeCRF for calculating partition functions and marginals in :math:`O(n^3)` for constituency trees.
References:
- Yu Zhang, Houquan Zhou and Zhenghua Li. 2020.
`Fast and Accurate Neural CRF Constituency Parsing`_.
.. _Fast and Accurate Neural CRF Constituency Parsing:
https://www.ijcai.org/Proceedings/2020/560/
"""
@torch.enable_grad()
def forward(self, scores, mask, target=None, mbr=False):
r"""
Args:
scores (~torch.Tensor): ``[batch_size, seq_len, seq_len]``.
Scores of all possible constituents.
mask (~torch.BoolTensor): ``[batch_size, seq_len, seq_len]``.
The mask to avoid parsing over padding tokens.
For each square matrix in a batch, the positions except upper triangular part should be masked out.
target (~torch.BoolTensor): ``[batch_size, seq_len, seq_len]``.
The tensor of gold-standard constituents. ``True`` if a constituent exists. Default: ``None``.
mbr (bool):
If ``True``, marginals will be returned to perform minimum Bayes-risk (MBR) decoding. Default: ``False``.
Returns:
~torch.Tensor, ~torch.Tensor:
The first is the training loss averaged by the number of tokens, which won't be returned if ``target=None``.
The second is a tensor of shape ``[batch_size, seq_len, seq_len]``, in which are marginals if ``mbr=True``,
or original scores otherwise.
"""
training = scores.requires_grad
# always enable the gradient computation of scores in order for the computation of marginals
logZ = self.inside(scores.requires_grad_(), mask)
# marginals are used for decoding, and can be computed by combining the inside pass and autograd mechanism
probs = scores
if mbr:
probs, = autograd.grad(logZ, scores, retain_graph=training)
if target is None:
return probs
loss = (logZ - scores[mask & target].sum()) / mask[:, 0].sum()
return loss, probs
def inside(self, scores, mask):
lens = mask[:, 0].sum(-1)
batch_size, seq_len, _ = scores.shape
# [seq_len, seq_len, batch_size]
scores, mask = scores.permute(1, 2, 0), mask.permute(1, 2, 0)
s = torch.full_like(scores, float('-inf'))
for w in range(1, seq_len):
# n denotes the number of spans to iterate,
# from span (0, w) to span (n, n+w) given width w
n = seq_len - w
if w == 1:
s.diagonal(w).copy_(scores.diagonal(w))
continue
# [n, w, batch_size]
s_s = stripe(s, n, w - 1, (0, 1)) + stripe(s, n, w - 1, (1, w), 0)
# [batch_size, n, w]
s_s = s_s.permute(2, 0, 1)
if s_s.requires_grad:
s_s.register_hook(lambda x: x.masked_fill_(torch.isnan(x), 0))
s_s = s_s.logsumexp(-1)
s.diagonal(w).copy_(s_s + scores.diagonal(w))
return s[0].gather(0, lens.unsqueeze(0)).sum()
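# A marginal-computation sketch (kept as a comment so the module stays
# side-effect free; shapes and lengths are arbitrary assumptions):
#
#     crf = CRFConstituency()
#     scores = torch.randn(2, 6, 6)
#     lens = torch.tensor([5, 3])
#     mask = (torch.arange(6) < lens.view(-1, 1, 1)) \
#            & torch.ones(6, 6, dtype=torch.bool).triu(1)
#     marginals = crf(scores, mask, mbr=True)  # [2, 6, 6]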
class CRF2oDependency(nn.Module):
r"""
Second-order TreeCRF for calculating partition functions and marginals in :math:`O(n^3)` for projective dependency trees.
References:
- Yu Zhang, Zhenghua Li and Min Zhang. 2020.
`Efficient Second-Order TreeCRF for Neural Dependency Parsing`_.
.. _Efficient Second-Order TreeCRF for Neural Dependency Parsing:
https://www.aclweb.org/anthology/2020.acl-main.302/
"""
def __init__(self):
super().__init__()
self.criterion = nn.CrossEntropyLoss()
@torch.enable_grad()
def forward(self, scores, mask, target=None, mbr=True, partial=False):
r"""
Args:
scores (~torch.Tensor, ~torch.Tensor):
Tuple of two tensors `s_arc` and `s_sib`.
`s_arc` (``[batch_size, seq_len, seq_len]``) holds scores of all possible dependent-head pairs.
`s_sib` (``[batch_size, seq_len, seq_len, seq_len]``) holds the scores of dependent-head-sibling triples.
mask (~torch.BoolTensor): ``[batch_size, seq_len]``.
The mask to avoid aggregation on padding tokens.
The first column serving as pseudo words for roots should be ``False``.
target (~torch.LongTensor): ``[batch_size, seq_len]``.
Tensors of gold-standard dependent-head pairs and dependent-head-sibling triples.
If partially annotated, the unannotated positions should be filled with -1.
Default: ``None``.
mbr (bool):
If ``True``, marginals will be returned to perform minimum Bayes-risk (MBR) decoding. Default: ``True``.
partial (bool):
``True`` indicates that the trees are partially annotated. Default: ``False``.
Returns:
~torch.Tensor, ~torch.Tensor:
The first is the training loss averaged by the number of tokens, which won't be returned if ``target=None``.
The second is a tensor of shape ``[batch_size, seq_len, seq_len]``, in which are marginals if ``mbr=True``,
or original scores otherwise.
"""
s_arc, s_sib = scores
training = s_arc.requires_grad
batch_size, seq_len, _ = s_arc.shape
# always enable the gradient computation of scores in order for the computation of marginals
logZ = self.inside((s.requires_grad_() for s in scores), mask)
# marginals are used for decoding, and can be computed by combining the inside pass and autograd mechanism
probs = s_arc
if mbr:
probs, = autograd.grad(logZ, s_arc, retain_graph=training)
if target is None:
return probs
arcs, sibs = target
# the second inside process is needed if use partial annotation
if partial:
score = self.inside(scores, mask, arcs)
else:
arc_seq, sib_seq = arcs[mask], sibs[mask]
arc_mask, sib_mask = mask, sib_seq.gt(0)
sib_seq = sib_seq[sib_mask]
s_sib = s_sib[mask][torch.arange(len(arc_seq)), arc_seq]
s_arc = s_arc[arc_mask].gather(-1, arc_seq.unsqueeze(-1))
s_sib = s_sib[sib_mask].gather(-1, sib_seq.unsqueeze(-1))
score = s_arc.sum() + s_sib.sum()
loss = (logZ - score) / mask.sum()
return loss, probs
def inside(self, scores, mask, cands=None):
# the end position of each sentence in a batch
lens = mask.sum(1)
s_arc, s_sib = scores
batch_size, seq_len, _ = s_arc.shape
# [seq_len, seq_len, batch_size]
s_arc = s_arc.permute(2, 1, 0)
# [seq_len, seq_len, seq_len, batch_size]
s_sib = s_sib.permute(2, 1, 3, 0)
s_i = torch.full_like(s_arc, float('-inf'))
s_s = torch.full_like(s_arc, float('-inf'))
s_c = torch.full_like(s_arc, float('-inf'))
s_c.diagonal().fill_(0)
# set the scores of arcs excluded by cands to -inf
if cands is not None:
mask = mask.index_fill(1, lens.new_tensor(0), 1)
mask = (mask.unsqueeze(1) & mask.unsqueeze(-1)).permute(2, 1, 0)
cands = cands.unsqueeze(-1).index_fill(1, lens.new_tensor(0), -1)
cands = cands.eq(lens.new_tensor(range(seq_len))) | cands.lt(0)
cands = cands.permute(2, 1, 0) & mask
s_arc = s_arc.masked_fill(~cands, float('-inf'))
for w in range(1, seq_len):
# n denotes the number of spans to iterate,
# from span (0, w) to span (n, n+w) given width w
n = seq_len - w
# I(j->i) = logsum(exp(I(j->r) + S(j->r, i)) +, i < r < j
# exp(C(j->j) + C(i->j-1)))
# + s(j->i)
# [n, w, batch_size]
il = stripe(s_i, n, w, (w, 1)) + stripe(s_s, n, w, (1, 0), 0)
il += stripe(s_sib[range(w, n + w), range(n)], n, w, (0, 1))
# [n, 1, batch_size]
il0 = stripe(s_c, n, 1, (w, w)) + stripe(s_c, n, 1, (0, w - 1))
# il0[0] are set to zeros since the scores of the complete spans starting from 0 are always -inf
il[:, -1] = il0.index_fill_(0, lens.new_tensor(0), 0).squeeze(1)
if il.requires_grad:
il.register_hook(lambda x: x.masked_fill_(torch.isnan(x), 0))
il = il.permute(2, 0, 1).logsumexp(-1)
s_i.diagonal(-w).copy_(il + s_arc.diagonal(-w))
# I(i->j) = logsum(exp(I(i->r) + S(i->r, j)) +, i < r < j
# exp(C(i->i) + C(j->i+1)))
# + s(i->j)
# [n, w, batch_size]
ir = stripe(s_i, n, w) + stripe(s_s, n, w, (0, w), 0)
ir += stripe(s_sib[range(n), range(w, n + w)], n, w)
ir[0] = float('-inf')
# [n, 1, batch_size]
ir0 = stripe(s_c, n, 1) + stripe(s_c, n, 1, (w, 1))
ir[:, 0] = ir0.squeeze(1)
if ir.requires_grad:
ir.register_hook(lambda x: x.masked_fill_(torch.isnan(x), 0))
ir = ir.permute(2, 0, 1).logsumexp(-1)
s_i.diagonal(w).copy_(ir + s_arc.diagonal(w))
# [n, w, batch_size]
slr = stripe(s_c, n, w) + stripe(s_c, n, w, (w, 1))
if slr.requires_grad:
slr.register_hook(lambda x: x.masked_fill_(torch.isnan(x), 0))
slr = slr.permute(2, 0, 1).logsumexp(-1)
# S(j, i) = logsumexp(C(i->r) + C(j->r+1)), i <= r < j
s_s.diagonal(-w).copy_(slr)
# S(i, j) = logsumexp(C(i->r) + C(j->r+1)), i <= r < j
s_s.diagonal(w).copy_(slr)
# C(j->i) = logsumexp(C(r->i) + I(j->r)), i <= r < j
cl = stripe(s_c, n, w, (0, 0), 0) + stripe(s_i, n, w, (w, 0))
cl.register_hook(lambda x: x.masked_fill_(torch.isnan(x), 0))
s_c.diagonal(-w).copy_(cl.permute(2, 0, 1).logsumexp(-1))
# C(i->j) = logsumexp(I(i->r) + C(r->j)), i < r <= j
cr = stripe(s_i, n, w, (0, 1)) + stripe(s_c, n, w, (1, w), 0)
cr.register_hook(lambda x: x.masked_fill_(torch.isnan(x), 0))
s_c.diagonal(w).copy_(cr.permute(2, 0, 1).logsumexp(-1))
# disable multi words to modify the root
s_c[0, w][lens.ne(w)] = float('-inf')
return s_c[0].gather(0, lens.unsqueeze(0)).sum()
def loss(self, s_arc, s_sib, s_rel, arcs, sibs, rels, mask, mbr=True, partial=False):
r"""
Args:
s_arc (~torch.Tensor): ``[batch_size, seq_len, seq_len]``.
Scores of all possible arcs.
s_sib (~torch.Tensor): ``[batch_size, seq_len, seq_len, seq_len]``.
Scores of all possible dependent-head-sibling triples.
s_rel (~torch.Tensor): ``[batch_size, seq_len, seq_len, n_labels]``.
Scores of all possible labels on each arc.
arcs (~torch.LongTensor): ``[batch_size, seq_len]``.
The tensor of gold-standard arcs.
sibs (~torch.LongTensor): ``[batch_size, seq_len]``.
The tensor of gold-standard siblings.
rels (~torch.LongTensor): ``[batch_size, seq_len]``.
The tensor of gold-standard labels.
mask (~torch.BoolTensor): ``[batch_size, seq_len]``.
The mask for covering the unpadded tokens.
mbr (bool):
If ``True``, returns marginals for MBR decoding. Default: ``True``.
partial (bool):
``True`` denotes the trees are partially annotated. Default: ``False``.
Returns:
~torch.Tensor, ~torch.Tensor:
The training loss and
original arc scores of shape ``[batch_size, seq_len, seq_len]`` if ``mbr=False``, or marginals otherwise.
"""
scores, target = (s_arc, s_sib), (arcs, sibs)
arc_loss, arc_probs = self.forward(scores, mask, target, mbr, partial)
# -1 denotes un-annotated arcs
if partial:
mask = mask & arcs.ge(0)
s_rel, rels = s_rel[mask], rels[mask]
s_rel = s_rel[torch.arange(len(rels)), arcs[mask]]
rel_loss = self.criterion(s_rel, rels)
loss = arc_loss + rel_loss
return loss, arc_probs
# def decode(self, s_arc, s_rel, mask, tree=False, proj=False, alg=None):
# r"""
# Args:
# s_arc (~torch.Tensor): ``[batch_size, seq_len, seq_len]``.
# Scores of all possible arcs.
# s_rel (~torch.Tensor): ``[batch_size, seq_len, seq_len, n_labels]``.
# Scores of all possible labels on each arc.
# mask (~torch.BoolTensor): ``[batch_size, seq_len]``.
# The mask for covering the unpadded tokens.
# tree (bool):
# If ``True``, ensures to output well-formed trees. Default: ``False``.
# proj (bool):
# If ``True``, ensures to output projective trees. Default: ``False``.
#
# Returns:
# ~torch.Tensor, ~torch.Tensor:
# Predicted arcs and labels of shape ``[batch_size, seq_len]``.
# """
#
# lens = mask.sum(1)
# arc_preds = s_arc.argmax(-1)
# if tree and not alg:
# bad = [not istree(seq[1:i + 1], proj)
# for i, seq in zip(lens.tolist(), arc_preds.tolist())]
# if any(bad):
# alg = eisner if proj else mst
# arc_preds[bad] = alg(s_arc[bad], mask[bad])
# rel_preds = s_rel.argmax(-1).gather(-1, arc_preds.unsqueeze(-1)).squeeze(-1)
#
# return arc_preds, rel_preds
def decode(self, s_arc, s_sib, s_rel, mask, tree=False, mbr=True, proj=False):
r"""
Args:
s_arc (~torch.Tensor): ``[batch_size, seq_len, seq_len]``.
Scores of all possible arcs.
s_sib (~torch.Tensor): ``[batch_size, seq_len, seq_len, seq_len]``.
Scores of all possible dependent-head-sibling triples.
s_rel (~torch.Tensor): ``[batch_size, seq_len, seq_len, n_labels]``.
Scores of all possible labels on each arc.
mask (~torch.BoolTensor): ``[batch_size, seq_len]``.
The mask for covering the unpadded tokens.
tree (bool):
If ``True``, ensures to output well-formed trees. Default: ``False``.
mbr (bool):
If ``True``, performs MBR decoding. Default: ``True``.
proj (bool):
If ``True``, ensures to output projective trees. Default: ``False``.
Returns:
~torch.Tensor, ~torch.Tensor:
Predicted arcs and labels of shape ``[batch_size, seq_len]``.
"""
lens = mask.sum(1)
arc_preds = s_arc.argmax(-1)
if tree:
bad = [not istree(seq[1:i + 1], proj)
for i, seq in zip(lens.tolist(), arc_preds.tolist())]
if any(bad):
if proj and not mbr:
arc_preds = eisner2o((s_arc, s_sib), mask)
else:
alg = eisner if proj else mst
arc_preds[bad] = alg(s_arc[bad], mask[bad])
rel_preds = s_rel.argmax(-1).gather(-1, arc_preds.unsqueeze(-1)).squeeze(-1)
return arc_preds, rel_preds
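# A decoding sketch on random scores with the tree constraint disabled
# (all sizes are arbitrary assumptions):
if __name__ == '__main__':
crf = CRF2oDependency()
s_arc = torch.randn(2, 5, 5)
s_sib = torch.randn(2, 5, 5, 5)
s_rel = torch.randn(2, 5, 5, 4)
mask = torch.ones(2, 5, dtype=torch.bool)
mask[:, 0] = False  # the first position serves as the pseudo ROOT
arc_preds, rel_preds = crf.decode(s_arc, s_sib, s_rel, mask)
assert arc_preds.shape == rel_preds.shape == (2, 5)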
================================================
FILE: hanlp/components/parsers/parse_alg.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-04-02 23:20
from collections import defaultdict
from hanlp.components.parsers.chu_liu_edmonds import decode_mst
import numpy as np
class Tarjan:
"""Computes Tarjan's algorithm for finding strongly connected components (cycles) of a graph"""
def __init__(self, prediction, tokens):
"""
Parameters
----------
prediction : numpy.ndarray
a predicted dependency tree where prediction[dep_idx] = head_idx
tokens : numpy.ndarray
the tokens we care about (i.e. exclude _GO, _EOS, and _PAD)
"""
self._edges = defaultdict(set)
self._vertices = set((0,))
for dep, head in enumerate(prediction[tokens]):
self._vertices.add(dep + 1)
self._edges[head].add(dep + 1)
self._indices = {}
self._lowlinks = {}
self._onstack = defaultdict(lambda: False)
self._SCCs = []
index = 0
stack = []
for v in self.vertices:
if v not in self.indices:
self.strongconnect(v, index, stack)
# =============================================================
def strongconnect(self, v, index, stack):
"""
Args:
v:
index:
stack:
Returns:
"""
self._indices[v] = index
self._lowlinks[v] = index
index += 1
stack.append(v)
self._onstack[v] = True
for w in self.edges[v]:
if w not in self.indices:
self.strongconnect(w, index, stack)
self._lowlinks[v] = min(self._lowlinks[v], self._lowlinks[w])
elif self._onstack[w]:
self._lowlinks[v] = min(self._lowlinks[v], self._indices[w])
if self._lowlinks[v] == self._indices[v]:
self._SCCs.append(set())
while stack[-1] != v:
w = stack.pop()
self._onstack[w] = False
self._SCCs[-1].add(w)
w = stack.pop()
self._onstack[w] = False
self._SCCs[-1].add(w)
return
# ======================
@property
def edges(self):
return self._edges
@property
def vertices(self):
return self._vertices
@property
def indices(self):
return self._indices
@property
def SCCs(self):
return self._SCCs
class UnionFind(object):
def __init__(self, n) -> None:
super().__init__()
self.parent = [x for x in range(n)]
self.height = [0] * n
def find(self, x):
if self.parent[x] == x:
return x
self.parent[x] = self.find(self.parent[x])
return self.parent[x]
def unite(self, x, y):
x = self.find(x)
y = self.find(y)
if x == y:
return
if self.height[x] < self.height[y]:
self.parent[x] = y
else:
self.parent[y] = x
if self.height[x] == self.height[y]:
self.height[x] += 1
def same(self, x, y):
return self.find(x) == self.find(y)
def tarjan(parse_probs, length, tokens_to_keep, ensure_tree=True):
"""Adopted from Timothy Dozat https://github.com/tdozat/Parser/blob/master/lib/models/nn.py
Args:
parse_probs(NDArray): seq_len x seq_len, the probability of arcs
length(NDArray): sentence length including ROOT
tokens_to_keep(NDArray): mask matrix
ensure_tree: (Default value = True)
Returns:
"""
if ensure_tree:
parse_preds, parse_probs, tokens = unique_root(parse_probs, tokens_to_keep, length)
# remove cycles
tarjan = Tarjan(parse_preds, tokens)
for SCC in tarjan.SCCs:
if len(SCC) > 1:
dependents = set()
to_visit = set(SCC)
while len(to_visit) > 0:
node = to_visit.pop()
if not node in dependents:
dependents.add(node)
to_visit.update(tarjan.edges[node])
# The indices of the nodes that participate in the cycle
cycle = np.array(list(SCC))
# The probabilities of the current heads
old_heads = parse_preds[cycle]
old_head_probs = parse_probs[cycle, old_heads]
# Set the probability of depending on a non-head to zero
non_heads = np.array(list(dependents))
parse_probs[np.repeat(cycle, len(non_heads)), np.repeat([non_heads], len(cycle), axis=0).flatten()] = 0
# Get new potential heads and their probabilities
new_heads = np.argmax(parse_probs[cycle][:, tokens], axis=1) + 1
new_head_probs = parse_probs[cycle, new_heads] / old_head_probs
# Select the most probable change
change = np.argmax(new_head_probs)
changed_cycle = cycle[change]
old_head = old_heads[change]
new_head = new_heads[change]
# Make the change
parse_preds[changed_cycle] = new_head
tarjan.edges[new_head].add(changed_cycle)
tarjan.edges[old_head].remove(changed_cycle)
return parse_preds
else:
# block and pad heads
parse_probs = parse_probs * tokens_to_keep
parse_preds = np.argmax(parse_probs, axis=1)
return parse_preds
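# A decoding sketch (random probabilities stand in for a parser's softmax
# over heads; ROOT occupies row/column 0):
#
#     probs = np.random.rand(4, 4)       # seq_len x seq_len, row = dependent
#     mask = np.ones(4)                  # keep ROOT plus three tokens
#     heads = tarjan(probs, 4, mask)     # heads[dep] = head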
def chu_liu_edmonds(parse_probs, length):
tree = decode_mst(parse_probs.T, length, False)[0]
tree[0] = 0
return tree
def unique_root(parse_probs, tokens_to_keep: np.ndarray, length):
I = np.eye(len(tokens_to_keep))
# block loops and pad heads
if tokens_to_keep.ndim == 1:
tokens_to_keep = np.expand_dims(tokens_to_keep, -1)
parse_probs = parse_probs * tokens_to_keep * (1 - I)
parse_preds = np.argmax(parse_probs, axis=1)
tokens = np.arange(1, length)
roots = np.where(parse_preds[tokens] == 0)[0] + 1
# ensure at least one root
if len(roots) < 1:
# The current root probabilities
root_probs = parse_probs[tokens, 0]
# The current head probabilities
old_head_probs = parse_probs[tokens, parse_preds[tokens]]
# Get new potential root probabilities
new_root_probs = root_probs / old_head_probs
# Select the most probable root
new_root = tokens[np.argmax(new_root_probs)]
# Make the change
parse_preds[new_root] = 0
# ensure at most one root
elif len(roots) > 1:
# The probabilities of the current heads
root_probs = parse_probs[roots, 0]
# Set the probability of depending on the root zero
parse_probs[roots, 0] = 0
# Get new potential heads and their probabilities
new_heads = np.argmax(parse_probs[roots][:, tokens], axis=1) + 1
new_head_probs = parse_probs[roots, new_heads] / root_probs
# Select the most probable root
new_root = roots[np.argmin(new_head_probs)]
# Make the change
parse_preds[roots] = new_heads
parse_preds[new_root] = 0
return parse_preds, parse_probs, tokens
def dfs(graph, start, end):
fringe = [(start, [])]
while fringe:
state, path = fringe.pop()
if path and state == end:
yield path
continue
for next_state in graph[state]:
if next_state in path:
continue
fringe.append((next_state, path + [next_state]))
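# A path-enumeration sketch on a tiny made-up adjacency list:
#
#     graph = [[1, 2], [2], [0]]
#     list(dfs(graph, 0, 2))  # [[2], [1, 2]]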
def mst_then_greedy(arc_scores, rel_scores, mask, root_rel_idx, rel_idx=None):
from scipy.special import softmax
from scipy.special import expit as sigmoid
length = sum(mask) + 1
mask = mask[:length]
arc_scores = arc_scores[:length, :length]
arc_pred = arc_scores > 0
arc_probs = sigmoid(arc_scores)
rel_scores = rel_scores[:length, :length, :]
rel_probs = softmax(rel_scores, -1)
if not any(arc_pred[:, 0][1:]): # no root
root = np.argmax(rel_probs[1:, 0, root_rel_idx]) + 1
arc_probs[root, 0] = 1
parse_preds, parse_probs, tokens = unique_root(arc_probs, mask, length)
root = adjust_root_score(arc_scores, parse_preds, root_rel_idx, rel_scores)
tree = chu_liu_edmonds(arc_scores, length)
if rel_idx is not None: # Unknown DEPREL label: 'ref'
rel_scores[np.arange(len(tree)), tree, rel_idx] = -float('inf')
return tree, add_secondary_arcs_by_scores(arc_scores, rel_scores, tree, root_rel_idx)
def adjust_root_score(arc_scores, parse_preds, root_rel_idx, rel_scores=None):
root = np.where(parse_preds[1:] == 0)[0] + 1
arc_scores[:, 0] = min(np.min(arc_scores), -1000)
arc_scores[root, 0] = max(np.max(arc_scores), 1000)
if rel_scores is not None:
rel_scores[:, :, root_rel_idx] = -float('inf')
rel_scores[root, 0, root_rel_idx] = float('inf')
return root
def add_secondary_arcs_by_scores(arc_scores, rel_scores, tree, root_rel_idx, arc_preds=None):
if not isinstance(tree, np.ndarray):
tree = np.array(tree)
if arc_preds is None:
arc_preds = arc_scores > 0
rel_pred = np.argmax(rel_scores, axis=-1)
return add_secondary_arcs_by_preds(arc_scores, arc_preds, rel_pred, tree, root_rel_idx)
def add_secondary_arcs_by_preds(arc_scores, arc_preds, rel_preds, tree, root_rel_idx=None):
dh = np.argwhere(arc_preds)
sdh = sorted([(arc_scores[x[0], x[1]], list(x)) for x in dh], reverse=True)
graph = [[] for _ in range(len(tree))]
for d, h in enumerate(tree):
if d:
graph[h].append(d)
for s, (d, h) in sdh:
if not d or not h or d in graph[h]:
continue
try:
path = next(dfs(graph, d, h))
except StopIteration:
# no path from d to h
graph[h].append(d)
parse_graph = [[] for _ in range(len(tree))]
num_root = 0
for h in range(len(tree)):
for d in graph[h]:
rel = rel_preds[d, h]
if h == 0 and root_rel_idx is not None:
rel = root_rel_idx
assert num_root == 0
num_root += 1
parse_graph[d].append((h, rel))
parse_graph[d] = sorted(parse_graph[d])
return parse_graph
def adjust_root_score_then_add_secondary_arcs(arc_scores, rel_scores, tree, root_rel_idx):
if len(arc_scores) != len(tree):
arc_scores = arc_scores[:len(tree), :len(tree)]
rel_scores = rel_scores[:len(tree), :len(tree), :]
parse_preds = arc_scores > 0
# adjust_root_score(arc_scores, parse_preds, rel_scores)
parse_preds[:, 0] = False # set heads to False
rel_scores[:, :, root_rel_idx] = -float('inf')
return add_secondary_arcs_by_scores(arc_scores, rel_scores, tree, root_rel_idx, parse_preds)
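# An MST sketch over random head probabilities for ROOT plus three tokens
# (values are arbitrary; `decode_mst` expects scores indexed [head, dep],
# hence the transpose inside the wrapper above):
if __name__ == '__main__':
probs = np.random.rand(4, 4)
tree = chu_liu_edmonds(probs, 4)
print(tree)  # tree[dep] = head, with tree[0] pinned to 0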
================================================
FILE: hanlp/components/parsers/ud/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-14 20:34
================================================
FILE: hanlp/components/parsers/ud/lemma_edit.py
================================================
"""
Utilities for processing lemmas
Adapted from UDPipe Future
https://github.com/CoNLL-UD-2018/UDPipe-Future
"""
def min_edit_script(source, target, allow_copy=False):
"""Finds the minimum edit script to transform the source to the target
Args:
source:
target:
allow_copy: (Default value = False)
Returns:
"""
a = [[(len(source) + len(target) + 1, None)] * (len(target) + 1) for _ in range(len(source) + 1)]
for i in range(0, len(source) + 1):
for j in range(0, len(target) + 1):
if i == 0 and j == 0:
a[i][j] = (0, "")
else:
if allow_copy and i and j and source[i - 1] == target[j - 1] and a[i - 1][j - 1][0] < a[i][j][0]:
a[i][j] = (a[i - 1][j - 1][0], a[i - 1][j - 1][1] + "→")
if i and a[i - 1][j][0] < a[i][j][0]:
a[i][j] = (a[i - 1][j][0] + 1, a[i - 1][j][1] + "-")
if j and a[i][j - 1][0] < a[i][j][0]:
a[i][j] = (a[i][j - 1][0] + 1, a[i][j - 1][1] + "+" + target[j - 1])
return a[-1][-1][1]
def gen_lemma_rule(form, lemma, allow_copy=False):
"""Generates a lemma rule to transform the source to the target
Args:
form:
lemma:
allow_copy: (Default value = False)
Returns:
"""
form = form.lower()
previous_case = -1
lemma_casing = ""
for i, c in enumerate(lemma):
case = "↑" if c.lower() != c else "↓"
if case != previous_case:
lemma_casing += "{}{}{}".format("¦" if lemma_casing else "", case,
i if i <= len(lemma) // 2 else i - len(lemma))
previous_case = case
lemma = lemma.lower()
best, best_form, best_lemma = 0, 0, 0
for l in range(len(lemma)):
for f in range(len(form)):
cpl = 0
while f + cpl < len(form) and l + cpl < len(lemma) and form[f + cpl] == lemma[l + cpl]:
cpl += 1
if cpl > best:
best = cpl
best_form = f
best_lemma = l
rule = lemma_casing + ";"
if not best:
rule += "a" + lemma
else:
rule += "d{}¦{}".format(
min_edit_script(form[:best_form], lemma[:best_lemma], allow_copy),
min_edit_script(form[best_form + best:], lemma[best_lemma + best:], allow_copy),
)
return rule
def apply_lemma_rule(form, lemma_rule):
"""Applies the lemma rule to the form to generate the lemma
Args:
form:
lemma_rule:
Returns:
"""
cells = lemma_rule.split(";", 1)
if len(cells) == 1: # Some predicted lemma rules are _, which might be due to partial annotation
return form.lower()
casing, rule = cells
if rule.startswith("a"):
lemma = rule[1:]
else:
form = form.lower()
rules, rule_sources = rule[1:].split("¦"), []
assert len(rules) == 2
for rule in rules:
source, i = 0, 0
while i < len(rule):
if rule[i] == "→" or rule[i] == "-":
source += 1
else:
assert rule[i] == "+"
i += 1
i += 1
rule_sources.append(source)
try:
lemma, form_offset = "", 0
for i in range(2):
j, offset = 0, (0 if i == 0 else len(form) - rule_sources[1])
while j < len(rules[i]):
if rules[i][j] == "→":
lemma += form[offset]
offset += 1
elif rules[i][j] == "-":
offset += 1
else:
assert (rules[i][j] == "+")
lemma += rules[i][j + 1]
j += 1
j += 1
if i == 0:
lemma += form[rule_sources[0]: len(form) - rule_sources[1]]
except Exception:
# Fall back to the surface form when a malformed rule cannot be replayed.
lemma = form
for rule in casing.split("¦"):
if rule == "↓0": continue # The lemma is lowercased initially
case, offset = rule[0], int(rule[1:])
lemma = lemma[:offset] + (lemma[offset:].upper() if case == "↑" else lemma[offset:].lower())
return lemma
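# A round-trip sketch: a rule generated from (form, lemma) must replay the
# form back into the lemma (the English pairs below are arbitrary examples):
if __name__ == '__main__':
for form, lemma in [('Means', 'mean'), ('was', 'be'), ('Dogs', 'dog')]:
rule = gen_lemma_rule(form, lemma, allow_copy=True)
assert apply_lemma_rule(form, rule) == lemma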
================================================
FILE: hanlp/components/parsers/ud/tag_decoder.py
================================================
# This file is modified from udify, which is licensed under the MIT license:
# MIT License
#
# Copyright (c) 2019 Dan Kondratyuk
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
"""
Decodes sequences of tags, e.g., POS tags, given a list of contextualized word embeddings
"""
from typing import Dict
import numpy
import torch
import torch.nn.functional as F
from torch.nn.modules.adaptive import AdaptiveLogSoftmaxWithLoss
from torch.nn.modules.linear import Linear
from hanlp.components.parsers.ud.lemma_edit import apply_lemma_rule
from hanlp.components.parsers.ud.udify_util import sequence_cross_entropy, sequence_cross_entropy_with_logits
class TagDecoder(torch.nn.Module):
"""A basic sequence tagger that decodes from inputs of word embeddings"""
def __init__(self,
input_dim,
num_classes,
label_smoothing: float = 0.03,
adaptive: bool = False) -> None:
super(TagDecoder, self).__init__()
self.label_smoothing = label_smoothing
self.num_classes = num_classes
self.adaptive = adaptive
if self.adaptive:
adaptive_cutoffs = [round(self.num_classes / 15), 3 * round(self.num_classes / 15)]
self.task_output = AdaptiveLogSoftmaxWithLoss(input_dim,
self.num_classes,
cutoffs=adaptive_cutoffs,
div_value=4.0)
else:
self.task_output = Linear(input_dim, self.num_classes)
def forward(self,
encoded_text: torch.FloatTensor,
mask: torch.LongTensor,
gold_tags: torch.LongTensor,
) -> Dict[str, torch.Tensor]:
hidden = encoded_text
batch_size, sequence_length, _ = hidden.size()
output_dim = [batch_size, sequence_length, self.num_classes]
loss_fn = self._adaptive_loss if self.adaptive else self._loss
output_dict = loss_fn(hidden, mask, gold_tags, output_dim)
return output_dict
def _adaptive_loss(self, hidden, mask, gold_tags, output_dim):
logits = hidden
reshaped_log_probs = logits.reshape(-1, logits.size(2))
class_probabilities = self.task_output.log_prob(reshaped_log_probs).view(output_dim)
output_dict = {"logits": logits, "class_probabilities": class_probabilities}
if gold_tags is not None:
output_dict["loss"] = sequence_cross_entropy(class_probabilities,
gold_tags,
mask,
label_smoothing=self.label_smoothing)
return output_dict
def _loss(self, hidden, mask, gold_tags, output_dim):
logits = self.task_output(hidden)
reshaped_log_probs = logits.view(-1, self.num_classes)
class_probabilities = F.softmax(reshaped_log_probs, dim=-1).view(output_dim)
output_dict = {"logits": logits, "class_probabilities": class_probabilities}
if gold_tags is not None:
output_dict["loss"] = sequence_cross_entropy_with_logits(logits,
gold_tags,
mask,
label_smoothing=self.label_smoothing)
return output_dict
def decode(self, output_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
# Legacy udify path: expects `self.task` and `self.vocab` to be attached
# externally before use; HanLP's UD parser performs decoding elsewhere.
all_words = output_dict["words"]
all_predictions = output_dict["class_probabilities"][self.task].cpu().data.numpy()
if all_predictions.ndim == 3:
predictions_list = [all_predictions[i] for i in range(all_predictions.shape[0])]
else:
predictions_list = [all_predictions]
all_tags = []
for predictions, words in zip(predictions_list, all_words):
argmax_indices = numpy.argmax(predictions, axis=-1)
tags = [self.vocab.get_token_from_index(x, namespace=self.task)
for x in argmax_indices]
if self.task == "lemmas":
def decode_lemma(word, rule):
if rule == "_":
return "_"
if rule == "@@UNKNOWN@@":
return word
return apply_lemma_rule(word, rule)
tags = [decode_lemma(word, rule) for word, rule in zip(words, tags)]
all_tags.append(tags)
output_dict[self.task] = all_tags
return output_dict
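# A forward-pass sketch with random features (all dimensions are arbitrary
# assumptions; in HanLP the hidden states come from a shared encoder):
if __name__ == '__main__':
decoder = TagDecoder(input_dim=16, num_classes=5)
hidden = torch.randn(2, 7, 16)
mask = torch.ones(2, 7)
gold = torch.randint(0, 5, (2, 7))
output = decoder(hidden, mask, gold)
print(output['loss'], output['class_probabilities'].shape)  # scalar, [2, 7, 5]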
================================================
FILE: hanlp/components/parsers/ud/ud_model.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-15 14:21
from typing import Dict, Any
import torch
from hanlp.components.parsers.biaffine.biaffine_dep import BiaffineDependencyParser
from hanlp.components.parsers.biaffine.biaffine_model import BiaffineDecoder
from hanlp.components.parsers.ud.tag_decoder import TagDecoder
from hanlp.layers.embeddings.contextual_word_embedding import ContextualWordEmbeddingModule
from hanlp.layers.scalar_mix import ScalarMixWithDropout
class UniversalDependenciesModel(torch.nn.Module):
def __init__(self,
encoder: ContextualWordEmbeddingModule,
n_mlp_arc,
n_mlp_rel,
mlp_dropout,
num_rels,
num_lemmas,
num_upos,
num_feats,
mix_embedding: int = 13,
layer_dropout: float = 0.0):
super().__init__()
self.encoder = encoder
self.decoder = UniversalDependenciesDecoder(
encoder.get_output_dim(),
n_mlp_arc,
n_mlp_rel,
mlp_dropout,
num_rels,
num_lemmas,
num_upos,
num_feats,
mix_embedding,
layer_dropout
)
def forward(self,
batch: Dict[str, torch.Tensor],
mask,
):
hidden = self.encoder(batch)
return self.decoder(hidden, batch=batch, mask=mask)
class UniversalDependenciesDecoder(torch.nn.Module):
def __init__(self,
hidden_size,
n_mlp_arc,
n_mlp_rel,
mlp_dropout,
num_rels,
num_lemmas,
num_upos,
num_feats,
                 mix_embedding: int = 13,
                 layer_dropout: float = 0.0,
) -> None:
super(UniversalDependenciesDecoder, self).__init__()
# decoders
self.decoders = torch.nn.ModuleDict({
'lemmas': TagDecoder(hidden_size, num_lemmas, label_smoothing=0.03, adaptive=True),
'upos': TagDecoder(hidden_size, num_upos, label_smoothing=0.03, adaptive=True),
'deps': BiaffineDecoder(hidden_size, n_mlp_arc, n_mlp_rel, mlp_dropout, num_rels),
'feats': TagDecoder(hidden_size, num_feats, label_smoothing=0.03, adaptive=True),
})
self.gold_keys = {
'lemmas': 'lemma_id',
'upos': 'pos_id',
'feats': 'feat_id',
}
if mix_embedding:
self.scalar_mix = torch.nn.ModuleDict({
task: ScalarMixWithDropout((1, mix_embedding),
do_layer_norm=False,
dropout=layer_dropout)
for task in self.decoders
})
else:
self.scalar_mix = None
def forward(self,
hidden,
batch: Dict[str, torch.Tensor],
mask) -> Dict[str, Any]:
mask_without_root = mask.clone()
mask_without_root[:, 0] = False
logits = {}
class_probabilities = {}
output_dict = {"logits": logits,
"class_probabilities": class_probabilities}
loss = 0
arc = batch.get('arc', None)
# Run through each of the tasks on the shared encoder and save predictions
for task in self.decoders:
if self.scalar_mix:
decoder_input = self.scalar_mix[task](hidden, mask)
else:
decoder_input = hidden
if task == "deps":
s_arc, s_rel = self.decoders[task](decoder_input, mask)
pred_output = {'class_probabilities': {'s_arc': s_arc, 's_rel': s_rel}}
if arc is not None:
# noinspection PyTypeChecker
pred_output['loss'] = BiaffineDependencyParser.compute_loss(None, s_arc, s_rel, arc,
batch['rel_id'],
mask_without_root,
torch.nn.functional.cross_entropy)
else:
pred_output = self.decoders[task](decoder_input, mask_without_root,
batch.get(self.gold_keys[task], None))
if 'logits' in pred_output:
logits[task] = pred_output["logits"]
if 'class_probabilities' in pred_output:
class_probabilities[task] = pred_output["class_probabilities"]
if 'loss' in pred_output:
# Keep track of the loss if we have the gold tags available
loss += pred_output["loss"]
if arc is not None:
output_dict["loss"] = loss
return output_dict
def decode(self, output_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
        for task in self.decoders:  # fixed: the decoders dict holds every task; `self.tasks` is never defined
self.decoders[task].decode(output_dict)
return output_dict
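# Editor's sketch (not part of the original file): the per-task ScalarMixWithDropout
# above realizes the UDify idea of giving each task its own softmax-weighted average of
# the transformer's layers. The toy code below shows the core computation in plain
# torch, assuming 13 stacked layers of shape (batch, seq, dim); the real module adds
# layer dropout and optional layer norm.
if __name__ == '__main__':
    layers = torch.randn(13, 2, 7, 768)            # (num_layers, batch, seq, dim)
    scalars = torch.nn.Parameter(torch.zeros(13))  # learned per-layer scalars
    gamma = torch.nn.Parameter(torch.ones(1))      # learned global scale
    weights = torch.softmax(scalars, dim=0)
    mixed = gamma * (weights.view(-1, 1, 1, 1) * layers).sum(0)
    print(mixed.shape)                             # torch.Size([2, 7, 768])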
================================================
FILE: hanlp/components/parsers/ud/ud_parser.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-14 20:34
import logging
from copy import deepcopy
from typing import Union, List, Callable
import torch
from torch.utils.data import DataLoader
from hanlp_common.constant import IDX
from hanlp.common.dataset import PadSequenceDataLoader, SortingSamplerBuilder
from hanlp.common.structure import History
from hanlp.common.torch_component import TorchComponent
from hanlp.common.transform import FieldLength, PunctuationMask
from hanlp.common.vocab import Vocab
from hanlp.components.classifiers.transformer_classifier import TransformerComponent
from hanlp.components.parsers.biaffine.biaffine_dep import BiaffineDependencyParser
from hanlp_common.conll import CoNLLUWord, CoNLLSentence
from hanlp.components.parsers.ud.ud_model import UniversalDependenciesModel
from hanlp.components.parsers.ud.util import generate_lemma_rule, append_bos, sample_form_missing
from hanlp.components.parsers.ud.lemma_edit import apply_lemma_rule
from hanlp.datasets.parsing.loaders.conll_dataset import CoNLLParsingDataset
from hanlp.layers.embeddings.contextual_word_embedding import ContextualWordEmbedding
from hanlp.metrics.accuracy import CategoricalAccuracy
from hanlp.metrics.metric import Metric
from hanlp.metrics.mtl import MetricDict
from hanlp.metrics.parsing.attachmentscore import AttachmentScore
from hanlp.utils.time_util import CountdownTimer
from hanlp.utils.torch_util import clip_grad_norm, lengths_to_mask
from hanlp_common.util import merge_locals_kwargs, merge_dict, reorder
class UniversalDependenciesParser(TorchComponent):
def __init__(self, **kwargs) -> None:
"""Universal Dependencies Parsing (lemmatization, features, PoS tagging and dependency parsing) implementation
of "75 Languages, 1 Model: Parsing Universal Dependencies Universally" (:cite:`kondratyuk-straka-2019-75`).
Args:
**kwargs: Predefined config.
"""
super().__init__(**kwargs)
self.model: UniversalDependenciesModel = self.model
def build_dataloader(self,
data,
batch_size,
shuffle=False,
device=None,
logger: logging.Logger = None,
sampler_builder=None,
gradient_accumulation=1,
transformer: ContextualWordEmbedding = None,
**kwargs) -> DataLoader:
transform = [generate_lemma_rule, append_bos, self.vocabs, transformer.transform(), FieldLength('token')]
if not self.config.punct:
transform.append(PunctuationMask('token', 'punct_mask'))
dataset = self.build_dataset(data, transform)
if self.vocabs.mutable:
# noinspection PyTypeChecker
self.build_vocabs(dataset, logger)
lens = [len(x['token_input_ids']) for x in dataset]
if sampler_builder:
sampler = sampler_builder.build(lens, shuffle, gradient_accumulation)
else:
sampler = SortingSamplerBuilder(batch_size).build(lens, shuffle, gradient_accumulation)
return PadSequenceDataLoader(dataset, batch_size, shuffle, device=device, batch_sampler=sampler,
pad={'arc': 0}, )
def build_vocabs(self, trn, logger, **kwargs):
self.vocabs.pos = Vocab(unk_token=None, pad_token=None)
self.vocabs.rel = Vocab(unk_token=None, pad_token=None)
self.vocabs.lemma = Vocab(unk_token=None, pad_token=None)
self.vocabs.feat = Vocab(unk_token=None, pad_token=None)
timer = CountdownTimer(len(trn))
max_seq_len = 0
        for each in trn:
            # Iterating the dataset runs its transforms, which populate the mutable vocabs as a side effect.
            max_seq_len = max(max_seq_len, len(each['token']))
            timer.log(f'Building vocab [blink][yellow]...[/yellow][/blink] (longest sequence: {max_seq_len})')
for v in self.vocabs.values():
v.set_unk_as_safe_unk()
self.vocabs.lock()
self.vocabs.summary(logger)
def build_dataset(self, data, transform):
dataset = CoNLLParsingDataset(data, transform=transform, prune=sample_form_missing, cache=isinstance(data, str))
return dataset
def build_optimizer(self, trn, **kwargs):
# noinspection PyCallByClass,PyTypeChecker
return TransformerComponent.build_optimizer(self, trn, **kwargs)
def build_criterion(self, **kwargs):
pass
def build_metric(self, **kwargs):
return MetricDict({
'lemmas': CategoricalAccuracy(),
'upos': CategoricalAccuracy(),
'deps': AttachmentScore(),
'feats': CategoricalAccuracy(),
})
def evaluate_dataloader(self,
data: DataLoader,
criterion: Callable,
metric: MetricDict = None,
output=False,
logger=None,
ratio_width=None,
**kwargs):
metric.reset()
self.model.eval()
timer = CountdownTimer(len(data))
total_loss = 0
for idx, batch in enumerate(data):
out, mask = self.feed_batch(batch)
loss = out['loss']
total_loss += loss.item()
self.decode_output(out, mask, batch)
self.update_metrics(metric, batch, out, mask)
report = f'loss: {total_loss / (idx + 1):.4f} {metric.cstr()}'
timer.log(report, logger=logger, ratio_percentage=False, ratio_width=ratio_width)
del loss
del out
del mask
return total_loss / len(data), metric
# noinspection PyMethodOverriding
def build_model(self,
transformer: ContextualWordEmbedding,
n_mlp_arc,
n_mlp_rel,
mlp_dropout,
mix_embedding,
layer_dropout,
training=True,
**kwargs) -> torch.nn.Module:
        assert bool(transformer.scalar_mix) == bool(mix_embedding), \
            'transformer.scalar_mix and mix_embedding must be both set or both unset.'
# noinspection PyTypeChecker
return UniversalDependenciesModel(transformer.module(training=training),
n_mlp_arc,
n_mlp_rel,
mlp_dropout,
len(self.vocabs.rel),
len(self.vocabs.lemma),
len(self.vocabs.pos),
len(self.vocabs.feat),
mix_embedding,
layer_dropout)
def predict(self, data: Union[List[str], List[List[str]]], batch_size: int = None, **kwargs):
if not data:
return []
flat = self.input_is_flat(data)
if flat:
data = [data]
samples = self.build_samples(data)
if not batch_size:
batch_size = self.config.batch_size
dataloader = self.build_dataloader(samples,
device=self.devices[0], shuffle=False,
**merge_dict(self.config,
batch_size=batch_size,
overwrite=True,
**kwargs))
order = []
outputs = []
for batch in dataloader:
out, mask = self.feed_batch(batch)
self.decode_output(out, mask, batch)
outputs.extend(self.prediction_to_human(out, batch))
order.extend(batch[IDX])
outputs = reorder(outputs, order)
if flat:
return outputs[0]
return outputs
def build_samples(self, data: List[List[str]]):
return [{'FORM': x} for x in data]
def fit(self,
trn_data,
dev_data,
save_dir,
transformer: ContextualWordEmbedding,
sampler_builder=None,
mix_embedding: int = 13,
            layer_dropout: float = 0.1,
n_mlp_arc=768,
n_mlp_rel=256,
mlp_dropout=.33,
lr=1e-3,
transformer_lr=2.5e-5,
patience=0.1,
batch_size=32,
epochs=30,
gradient_accumulation=1,
adam_epsilon=1e-8,
weight_decay=0,
warmup_steps=0.1,
grad_norm=1.0,
tree=False,
proj=False,
punct=False,
logger=None,
verbose=True,
devices: Union[float, int, List[int]] = None, **kwargs):
return super().fit(**merge_locals_kwargs(locals(), kwargs))
def execute_training_loop(self, trn: DataLoader, dev: DataLoader, epochs, criterion, optimizer, metric, save_dir,
logger: logging.Logger, devices, ratio_width=None, patience=0.5, eval_trn=True, **kwargs):
if isinstance(patience, float):
patience = int(patience * epochs)
best_epoch, best_metric = 0, -1
timer = CountdownTimer(epochs)
history = History()
for epoch in range(1, epochs + 1):
logger.info(f"[yellow]Epoch {epoch} / {epochs}:[/yellow]")
self.fit_dataloader(trn, criterion, optimizer, metric, logger, history=history, ratio_width=ratio_width,
eval_trn=eval_trn, **self.config)
loss, dev_metric = self.evaluate_dataloader(dev, criterion, metric, logger=logger, ratio_width=ratio_width)
timer.update()
report = f"{timer.elapsed_human} / {timer.total_time_human} ETA: {timer.eta_human}"
if dev_metric > best_metric:
best_epoch, best_metric = epoch, deepcopy(dev_metric)
self.save_weights(save_dir)
report += ' [red](saved)[/red]'
else:
report += f' ({epoch - best_epoch})'
if epoch - best_epoch >= patience:
report += ' early stop'
logger.info(report)
if epoch - best_epoch >= patience:
break
if not best_epoch:
self.save_weights(save_dir)
elif best_epoch != epoch:
self.load_weights(save_dir)
logger.info(f"Max score of dev is {best_metric.cstr()} at epoch {best_epoch}")
logger.info(f"Average time of each epoch is {timer.elapsed_average_human}")
logger.info(f"{timer.elapsed_human} elapsed")
# noinspection PyMethodOverriding
def fit_dataloader(self,
trn: DataLoader,
criterion,
optimizer,
metric: MetricDict,
logger: logging.Logger,
history: History,
gradient_accumulation=1,
grad_norm=None,
ratio_width=None,
eval_trn=True,
**kwargs):
optimizer, scheduler = optimizer
metric.reset()
self.model.train()
timer = CountdownTimer(history.num_training_steps(len(trn), gradient_accumulation=gradient_accumulation))
total_loss = 0
for idx, batch in enumerate(trn):
out, mask = self.feed_batch(batch)
loss = out['loss']
if gradient_accumulation and gradient_accumulation > 1:
loss /= gradient_accumulation
loss.backward()
total_loss += loss.item()
if eval_trn:
self.decode_output(out, mask, batch)
self.update_metrics(metric, batch, out, mask)
if history.step(gradient_accumulation):
self._step(optimizer, scheduler, grad_norm)
report = f'loss: {total_loss / (idx + 1):.4f} {metric.cstr()}' if eval_trn \
else f'loss: {total_loss / (idx + 1):.4f}'
timer.log(report, logger=logger, ratio_percentage=False, ratio_width=ratio_width)
del loss
del out
del mask
def decode_output(self, outputs, mask, batch):
arc_scores, rel_scores = outputs['class_probabilities']['deps']['s_arc'], \
outputs['class_probabilities']['deps']['s_rel']
arc_preds, rel_preds = BiaffineDependencyParser.decode(self, arc_scores, rel_scores, mask, batch)
outputs['arc_preds'], outputs['rel_preds'] = arc_preds, rel_preds
return outputs
def update_metrics(self, metrics, batch, outputs, mask):
arc_preds, rel_preds, puncts = outputs['arc_preds'], outputs['rel_preds'], batch.get('punct_mask', None)
BiaffineDependencyParser.update_metric(self, arc_preds, rel_preds, batch['arc'], batch['rel_id'], mask, puncts,
metrics['deps'], batch)
for task, key in zip(['lemmas', 'upos', 'feats'], ['lemma_id', 'pos_id', 'feat_id']):
metric: Metric = metrics[task]
pred = outputs['class_probabilities'][task]
gold = batch[key]
metric(pred.detach(), gold, mask=mask)
return metrics
def feed_batch(self, batch: dict):
mask = self.compute_mask(batch)
output_dict = self.model(batch, mask)
if self.model.training:
mask = mask.clone()
mask[:, 0] = 0
return output_dict, mask
def compute_mask(self, batch):
lens = batch['token_length']
mask = lengths_to_mask(lens)
return mask
def _step(self, optimizer, scheduler, grad_norm):
clip_grad_norm(self.model, grad_norm)
optimizer.step()
scheduler.step()
optimizer.zero_grad()
def input_is_flat(self, data):
# noinspection PyCallByClass,PyTypeChecker
return BiaffineDependencyParser.input_is_flat(self, data, False)
def prediction_to_human(self, outputs: dict, batch):
arcs, rels = outputs['arc_preds'], outputs['rel_preds']
upos = outputs['class_probabilities']['upos'][:, 1:, :].argmax(-1).tolist()
feats = outputs['class_probabilities']['feats'][:, 1:, :].argmax(-1).tolist()
lemmas = outputs['class_probabilities']['lemmas'][:, 1:, :].argmax(-1).tolist()
lem_vocab = self.vocabs['lemma'].idx_to_token
pos_vocab = self.vocabs['pos'].idx_to_token
feat_vocab = self.vocabs['feat'].idx_to_token
# noinspection PyCallByClass,PyTypeChecker
for tree, form, lemma, pos, feat in zip(BiaffineDependencyParser.prediction_to_head_rel(
self, arcs, rels, batch), batch['token'], lemmas, upos, feats):
form = form[1:]
assert len(form) == len(tree)
lemma = [apply_lemma_rule(t, lem_vocab[r]) for t, r in zip(form, lemma)]
pos = [pos_vocab[x] for x in pos]
feat = [feat_vocab[x] for x in feat]
yield CoNLLSentence(
[CoNLLUWord(id=i + 1, form=fo, lemma=l, upos=p, feats=fe, head=a, deprel=r) for
i, (fo, (a, r), l, p, fe) in enumerate(zip(form, tree, lemma, pos, feat))])
def __call__(self, data, batch_size=None, **kwargs) -> Union[CoNLLSentence, List[CoNLLSentence]]:
return super().__call__(data, batch_size, **kwargs)
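# Editor's usage sketch (hypothetical, not part of the original file): once fitted, or
# loaded from a save_dir produced by fit(), the parser accepts one tokenized sentence
# (or a list of them) and yields CoNLLSentence objects carrying lemma, upos, feats,
# head and deprel for every token. The save_dir below is an assumed placeholder.
#
#     parser = UniversalDependenciesParser()
#     parser.load('path/to/ud_parser_save_dir')  # hypothetical path
#     sent = parser(['The', 'quick', 'brown', 'fox', 'jumps'])
#     print(sent)  # prints CoNLL-U formatted rows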
================================================
FILE: hanlp/components/parsers/ud/udify_util.py
================================================
# This file is modified from udify and allennlp, which are licensed under the MIT license:
# MIT License
#
# Copyright (c) 2019 Dan Kondratyuk and allennlp
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import os
from typing import List, Dict, Tuple, Union
import numpy
import torch
def get_ud_treebank_files(dataset_dir: str, treebanks: List[str] = None) -> Dict[str, Tuple[str, str, str]]:
"""Retrieves all treebank data paths in the given directory.
    Adapted from https://github.com/Hyperparticle/udify (MIT License).

    Args:
        dataset_dir: The directory containing one folder per treebank.
        treebanks: Names of the treebanks to load; defaults to every folder under ``dataset_dir``.

    Returns:
        A dict mapping each treebank name to its ``(train, dev, test)`` ``.conllu`` paths,
        with ``None`` standing in for any missing split.
    """
datasets = {}
treebanks = os.listdir(dataset_dir) if not treebanks else treebanks
for treebank in treebanks:
treebank_path = os.path.join(dataset_dir, treebank)
conllu_files = [file for file in sorted(os.listdir(treebank_path)) if file.endswith(".conllu")]
train_file = [file for file in conllu_files if file.endswith("train.conllu")]
dev_file = [file for file in conllu_files if file.endswith("dev.conllu")]
test_file = [file for file in conllu_files if file.endswith("test.conllu")]
train_file = os.path.join(treebank_path, train_file[0]) if train_file else None
dev_file = os.path.join(treebank_path, dev_file[0]) if dev_file else None
test_file = os.path.join(treebank_path, test_file[0]) if test_file else None
datasets[treebank] = (train_file, dev_file, test_file)
return datasets
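# Editor's note: the expected on-disk layout is one folder per treebank, each holding
# CoNLL-U files whose names end in train/dev/test.conllu (paths illustrative only):
#
#     ud-treebanks/
#         UD_English-EWT/
#             en_ewt-ud-train.conllu
#             en_ewt-ud-dev.conllu
#             en_ewt-ud-test.conllu
#
# for which get_ud_treebank_files('ud-treebanks') returns
#     {'UD_English-EWT': ('.../en_ewt-ud-train.conllu',
#                         '.../en_ewt-ud-dev.conllu',
#                         '.../en_ewt-ud-test.conllu')}
# with None standing in for any missing split.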
def sequence_cross_entropy(log_probs: torch.FloatTensor,
targets: torch.LongTensor,
weights: torch.FloatTensor,
average: str = "batch",
label_smoothing: float = None) -> torch.FloatTensor:
if average not in {None, "token", "batch"}:
raise ValueError("Got average f{average}, expected one of "
"None, 'token', or 'batch'")
# shape : (batch * sequence_length, num_classes)
log_probs_flat = log_probs.view(-1, log_probs.size(2))
# shape : (batch * max_len, 1)
targets_flat = targets.view(-1, 1).long()
if label_smoothing is not None and label_smoothing > 0.0:
num_classes = log_probs.size(-1)
smoothing_value = label_smoothing / num_classes
# Fill all the correct indices with 1 - smoothing value.
one_hot_targets = torch.zeros_like(log_probs_flat).scatter_(-1, targets_flat, 1.0 - label_smoothing)
smoothed_targets = one_hot_targets + smoothing_value
negative_log_likelihood_flat = - log_probs_flat * smoothed_targets
negative_log_likelihood_flat = negative_log_likelihood_flat.sum(-1, keepdim=True)
else:
# Contribution to the negative log likelihood only comes from the exact indices
# of the targets, as the target distributions are one-hot. Here we use torch.gather
# to extract the indices of the num_classes dimension which contribute to the loss.
# shape : (batch * sequence_length, 1)
negative_log_likelihood_flat = - torch.gather(log_probs_flat, dim=1, index=targets_flat)
# shape : (batch, sequence_length)
negative_log_likelihood = negative_log_likelihood_flat.view(*targets.size())
# shape : (batch, sequence_length)
negative_log_likelihood = negative_log_likelihood * weights.float()
if average == "batch":
# shape : (batch_size,)
per_batch_loss = negative_log_likelihood.sum(1) / (weights.sum(1).float() + 1e-13)
num_non_empty_sequences = ((weights.sum(1) > 0).float().sum() + 1e-13)
return per_batch_loss.sum() / num_non_empty_sequences
elif average == "token":
return negative_log_likelihood.sum() / (weights.sum().float() + 1e-13)
else:
# shape : (batch_size,)
per_batch_loss = negative_log_likelihood.sum(1) / (weights.sum(1).float() + 1e-13)
return per_batch_loss
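# Editor's worked example: with label_smoothing = 0.2 and num_classes = 4, the one-hot
# target for gold class 2 becomes [0.05, 0.05, 0.85, 0.05] (1 - 0.2 at the gold index,
# plus 0.2 / 4 everywhere), so the per-token loss is
#     -(0.05 * lp[0] + 0.05 * lp[1] + 0.85 * lp[2] + 0.05 * lp[3])
# instead of just -lp[2], where lp are the token's log-probabilities.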
def sequence_cross_entropy_with_logits(
logits: torch.FloatTensor,
targets: torch.LongTensor,
weights: Union[torch.FloatTensor, torch.BoolTensor],
average: str = "batch",
label_smoothing: float = None,
gamma: float = None,
alpha: Union[float, List[float], torch.FloatTensor] = None,
) -> torch.FloatTensor:
"""Computes the cross entropy loss of a sequence, weighted with respect to
some user provided weights. Note that the weighting here is not the same as
in the `torch.nn.CrossEntropyLoss()` criterion, which is weighting
classes; here we are weighting the loss contribution from particular elements
in the sequence. This allows loss computations for models which use padding.
# Parameters
logits : `torch.FloatTensor`, required.
A `torch.FloatTensor` of size (batch_size, sequence_length, num_classes)
which contains the unnormalized probability for each class.
targets : `torch.LongTensor`, required.
A `torch.LongTensor` of size (batch, sequence_length) which contains the
index of the true class for each corresponding step.
weights : `Union[torch.FloatTensor, torch.BoolTensor]`, required.
A `torch.FloatTensor` of size (batch, sequence_length)
average: `str`, optional (default = `"batch"`)
If "batch", average the loss across the batches. If "token", average
the loss across each item in the input. If `None`, return a vector
of losses per batch element.
label_smoothing : `float`, optional (default = `None`)
Whether or not to apply label smoothing to the cross-entropy loss.
For example, with a label smoothing value of 0.2, a 4 class classification
target would look like `[0.05, 0.05, 0.85, 0.05]` if the 3rd class was
the correct label.
gamma : `float`, optional (default = `None`)
        Focal loss[*] focusing parameter `gamma` reduces the relative loss for
        well-classified examples and puts more focus on hard ones. The greater the
        value of `gamma`, the more focus is put on hard examples.
alpha : `Union[float, List[float]]`, optional (default = `None`)
        Focal loss[*] weighting factor `alpha` to balance between classes. Can be
        used independently of `gamma`. If a single `float` is provided, the binary
        case is assumed, using `alpha` for the positive class and `1 - alpha` for the
        negative one. If a list of `float` with the same length as the number of
        classes is provided, the weights match the classes.
[*] T. Lin, P. Goyal, R. Girshick, K. He and P. Dollár, "Focal Loss for
Dense Object Detection," 2017 IEEE International Conference on Computer
Vision (ICCV), Venice, 2017, pp. 2999-3007.
# Returns
`torch.FloatTensor`
A torch.FloatTensor representing the cross entropy loss.
If `average=="batch"` or `average=="token"`, the returned loss is a scalar.
If `average is None`, the returned loss is a vector of shape (batch_size,).
"""
if average not in {None, "token", "batch"}:
raise ValueError("Got average f{average}, expected one of None, 'token', or 'batch'")
# make sure weights are float
weights = weights.to(logits.dtype)
# sum all dim except batch
non_batch_dims = tuple(range(1, len(weights.shape)))
# shape : (batch_size,)
weights_batch_sum = weights.sum(dim=non_batch_dims)
# shape : (batch * sequence_length, num_classes)
logits_flat = logits.view(-1, logits.size(-1))
# shape : (batch * sequence_length, num_classes)
log_probs_flat = torch.nn.functional.log_softmax(logits_flat, dim=-1)
# shape : (batch * max_len, 1)
targets_flat = targets.view(-1, 1).long()
# focal loss coefficient
if gamma:
# shape : (batch * sequence_length, num_classes)
probs_flat = log_probs_flat.exp()
# shape : (batch * sequence_length,)
probs_flat = torch.gather(probs_flat, dim=1, index=targets_flat)
# shape : (batch * sequence_length,)
focal_factor = (1.0 - probs_flat) ** gamma
# shape : (batch, sequence_length)
focal_factor = focal_factor.view(*targets.size())
weights = weights * focal_factor
if alpha is not None:
# shape : () / (num_classes,)
if isinstance(alpha, (float, int)):
# shape : (2,)
alpha_factor = torch.tensor(
[1.0 - float(alpha), float(alpha)], dtype=weights.dtype, device=weights.device
)
elif isinstance(alpha, (list, numpy.ndarray, torch.Tensor)):
# shape : (c,)
alpha_factor = torch.tensor(alpha, dtype=weights.dtype, device=weights.device)
if not alpha_factor.size():
# shape : (1,)
alpha_factor = alpha_factor.view(1)
# shape : (2,)
alpha_factor = torch.cat([1 - alpha_factor, alpha_factor])
else:
raise TypeError(
("alpha must be float, list of float, or torch.FloatTensor, {} provided.").format(
type(alpha)
)
)
# shape : (batch, max_len)
alpha_factor = torch.gather(alpha_factor, dim=0, index=targets_flat.view(-1)).view(
*targets.size()
)
weights = weights * alpha_factor
if label_smoothing is not None and label_smoothing > 0.0:
num_classes = logits.size(-1)
smoothing_value = label_smoothing / num_classes
# Fill all the correct indices with 1 - smoothing value.
one_hot_targets = torch.zeros_like(log_probs_flat).scatter_(
-1, targets_flat, 1.0 - label_smoothing
)
smoothed_targets = one_hot_targets + smoothing_value
negative_log_likelihood_flat = -log_probs_flat * smoothed_targets
negative_log_likelihood_flat = negative_log_likelihood_flat.sum(-1, keepdim=True)
else:
# Contribution to the negative log likelihood only comes from the exact indices
# of the targets, as the target distributions are one-hot. Here we use torch.gather
# to extract the indices of the num_classes dimension which contribute to the loss.
# shape : (batch * sequence_length, 1)
negative_log_likelihood_flat = -torch.gather(log_probs_flat, dim=1, index=targets_flat)
# shape : (batch, sequence_length)
negative_log_likelihood = negative_log_likelihood_flat.view(*targets.size())
# shape : (batch, sequence_length)
negative_log_likelihood = negative_log_likelihood * weights
if average == "batch":
# shape : (batch_size,)
per_batch_loss = negative_log_likelihood.sum(non_batch_dims) / (
weights_batch_sum + tiny_value_of_dtype(negative_log_likelihood.dtype)
)
num_non_empty_sequences = (weights_batch_sum > 0).sum() + tiny_value_of_dtype(
negative_log_likelihood.dtype
)
return per_batch_loss.sum() / num_non_empty_sequences
elif average == "token":
return negative_log_likelihood.sum() / (
weights_batch_sum.sum() + tiny_value_of_dtype(negative_log_likelihood.dtype)
)
else:
# shape : (batch_size,)
per_batch_loss = negative_log_likelihood.sum(non_batch_dims) / (
weights_batch_sum + tiny_value_of_dtype(negative_log_likelihood.dtype)
)
return per_batch_loss
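# Editor's note on the focal factor above: with gamma = 2, a token the model already
# classifies well (gold-class probability 0.9) is down-weighted by (1 - 0.9) ** 2 = 0.01,
# i.e. it contributes 1% of its plain cross-entropy, while a hard token with gold-class
# probability 0.1 keeps (1 - 0.1) ** 2 = 81% of its loss.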
def tiny_value_of_dtype(dtype: torch.dtype):
"""Returns a moderately tiny value for a given PyTorch data type that is used to avoid numerical
issues such as division by zero.
This is different from `info_value_of_dtype(dtype).tiny` because it causes some NaN bugs.
Only supports floating point dtypes.
Args:
dtype: torch.dtype:
Returns:
"""
if not dtype.is_floating_point:
raise TypeError("Only supports floating point dtypes.")
if dtype == torch.float or dtype == torch.double:
return 1e-13
elif dtype == torch.half:
return 1e-4
else:
raise TypeError("Does not support dtype " + str(dtype))
def combine_initial_dims_to_1d_or_2d(tensor: torch.Tensor) -> torch.Tensor:
"""Given a (possibly higher order) tensor of ids with shape
(d1, ..., dn, sequence_length)
Args:
tensor: torch.Tensor:
Returns:
If original tensor is 1-d or 2-d, return it as is.
"""
if tensor.dim() <= 2:
return tensor
else:
return tensor.view(-1, tensor.size(-1))
def uncombine_initial_dims(tensor: torch.Tensor, original_size: torch.Size) -> torch.Tensor:
"""Given a tensor of embeddings with shape
(d1 * ... * dn, sequence_length, embedding_dim)
and the original shape
(d1, ..., dn, sequence_length),
Args:
tensor: torch.Tensor:
original_size: torch.Size:
Returns:
(d1, ..., dn, sequence_length, embedding_dim).
If original size is 1-d or 2-d, return it as is.
"""
if len(original_size) <= 2:
return tensor
else:
view_args = list(original_size) + [tensor.size(-1)]
return tensor.view(*view_args)
def get_range_vector(size: int, device: int) -> torch.Tensor:
"""Returns a range vector with the desired size, starting at 0. The CUDA implementation
is meant to avoid copy data from CPU to GPU.
Args:
size: int:
device: int:
Returns:
"""
if device > -1:
return torch.cuda.LongTensor(size, device=device).fill_(1).cumsum(0) - 1
else:
return torch.arange(0, size, dtype=torch.long)
def get_device_of(tensor: torch.Tensor) -> int:
"""Returns the device of the tensor.
Args:
tensor: torch.Tensor:
Returns:
"""
if not tensor.is_cuda:
return -1
else:
return tensor.get_device()
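# Editor's runnable sketch (illustrative values): a toy call to
# sequence_cross_entropy_with_logits with one padded position masked out via weights.
if __name__ == '__main__':
    logits = torch.randn(2, 3, 5)                         # (batch, seq, num_classes)
    targets = torch.randint(0, 5, (2, 3))                 # gold class indices
    weights = torch.tensor([[1., 1., 1.], [1., 1., 0.]])  # last token of sentence 2 is padding
    loss = sequence_cross_entropy_with_logits(logits, targets, weights,
                                              label_smoothing=0.03, gamma=2.0)
    print(loss)                                           # scalar averaged over the batch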
================================================
FILE: hanlp/components/parsers/ud/util.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-14 20:44
from hanlp_common.constant import ROOT
from hanlp.components.parsers.ud.lemma_edit import gen_lemma_rule
def generate_lemma_rule(sample: dict):
if 'LEMMA' in sample:
sample['lemma'] = [gen_lemma_rule(word, lemma) if lemma != "_" else "_" for word, lemma in
zip(sample['FORM'], sample['LEMMA'])]
return sample
def append_bos(sample: dict):
if 'FORM' in sample:
sample['token'] = [ROOT] + sample['FORM']
if 'UPOS' in sample:
sample['pos'] = sample['UPOS'][:1] + sample['UPOS']
sample['arc'] = [0] + sample['HEAD']
sample['rel'] = sample['DEPREL'][:1] + sample['DEPREL']
sample['lemma'] = sample['lemma'][:1] + sample['lemma']
sample['feat'] = sample['FEATS'][:1] + sample['FEATS']
return sample
def sample_form_missing(sample: dict):
return all(t == '_' for t in sample['FORM'])
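# Editor's worked example (toy values, not part of the original file): generate_lemma_rule
# turns each (form, lemma) pair into an edit script, and append_bos prepends the pseudo
# ROOT token, duplicating the first gold label of each field as a placeholder so that all
# columns stay aligned with token[0] == ROOT. sample_form_missing is used as a pruning
# predicate for sentences whose every FORM is '_'.
if __name__ == '__main__':
    sample = {'FORM': ['dogs'], 'LEMMA': ['dog'], 'UPOS': ['NOUN'],
              'HEAD': [0], 'DEPREL': ['root'], 'FEATS': ['Number=Plur']}
    print(append_bos(generate_lemma_rule(sample)))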
================================================
FILE: hanlp/components/pipeline.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-31 00:22
import types
from typing import Callable, Union, Iterable, Any
from hanlp.components.lambda_wrapper import LambdaComponent
from hanlp.common.component import Component
from hanlp_common.document import Document
from hanlp.utils.component_util import load_from_meta
from hanlp_common.io import save_json, load_json
from hanlp_common.reflection import str_to_type, classpath_of
import hanlp
class Pipe(Component):
def __init__(self, component: Component, input_key: str = None, output_key: str = None, **kwargs) -> None:
super().__init__()
if not hasattr(self, 'config'):
self.config = {'classpath': classpath_of(self)}
self.output_key = output_key
self.input_key = input_key
self.component = component
self.kwargs = kwargs
self.config.update({
'component': component.config,
'input_key': self.input_key,
'output_key': self.output_key,
'kwargs': self.kwargs
})
# noinspection PyShadowingBuiltins
def predict(self, doc: Document, **kwargs) -> Document:
unpack = False
if self.input_key:
if isinstance(self.input_key, (tuple, list)):
if isinstance(self.component, LambdaComponent): # assume functions take multiple arguments
input = [doc[key] for key in self.input_key]
unpack = True
else:
input = list(list(zip(*sent)) for sent in zip(*[doc[key] for key in self.input_key]))
else:
input = doc[self.input_key]
else:
input = doc
if self.kwargs:
kwargs.update(self.kwargs)
if unpack:
kwargs['_hanlp_unpack'] = True
output = self.component(input, **kwargs)
if isinstance(output, types.GeneratorType):
output = list(output)
if self.output_key:
if not isinstance(doc, Document):
doc = Document()
if isinstance(self.output_key, tuple):
for key, value in zip(self.output_key, output):
doc[key] = value
else:
doc[self.output_key] = output
return doc
return output
def __repr__(self):
name = self.component.function.__name__ if isinstance(self.component, LambdaComponent) \
else self.component.__class__.__name__
return f'{self.input_key}->{name}->{self.output_key}'
@staticmethod
def from_config(meta: dict, **kwargs):
cls = str_to_type(meta['classpath'])
component = load_from_meta(meta['component'])
return cls(component, meta['input_key'], meta['output_key'], **meta['kwargs'])
class Pipeline(Component, list):
def __init__(self, *pipes: Pipe) -> None:
super().__init__()
if not hasattr(self, 'config'):
self.config = {'classpath': classpath_of(self)}
if pipes:
self.extend(pipes)
def append(self, component: Callable, input_key: Union[str, Iterable[str]] = None,
output_key: Union[str, Iterable[str]] = None, **kwargs):
"""
Append a pipe to the tail of this pipeline.
Args:
component: A callable function.
            input_key: The input key indicating which fields will be fed to the pipe. ``None``: inherit from the
                previous pipe; ``*``: use all the outputs from previous pipes wrapped in a
                :class:`~hanlp_common.document.Document`.
            output_key: The output key indicating where to store the outputs.
            **kwargs: Extra arguments passed to the ``Pipe`` constructor.
Returns:
Pipeline: A pipeline.
"""
self.insert(len(self), component, input_key, output_key, **kwargs)
return self
def insert(self, index: int, component: Callable, input_key: Union[str, Iterable[str]] = None,
output_key: Union[str, Iterable[str]] = None,
**kwargs):
"""
        Args:
            index: The index at which to insert the new pipe.
            component: A callable function or component.
            input_key: The input key indicating which fields will be fed to the pipe. ``None``: inherit from the
                previous pipe; ``*``: use all the outputs from previous pipes wrapped in a
                :class:`~hanlp_common.document.Document`.
            output_key: The output key indicating where to store the outputs.
            **kwargs: Extra arguments passed to the ``Pipe`` constructor.
Returns:
Pipeline: A pipeline.
"""
if input_key == '*':
input_key = None
elif not input_key and len(self) and index:
input_key = self[index - 1].output_key
if not isinstance(component, Component):
component = LambdaComponent(component)
super().insert(index, Pipe(component, input_key, output_key, **kwargs))
return self
def __call__(self, doc: Union[Document, Any] = None, **kwargs) -> Document:
"""Run the pipeline as a function.
Args:
doc: A :class:`~hanlp_common.document.Document` or other data types.
**kwargs: If `doc` is set to None then create a :class:`~hanlp_common.document.Document` as the
input to the first pipe using all the parameters in ``kwargs``.
Returns:
A :class:`~hanlp_common.document.Document`.
"""
if doc is None:
doc = Document(**kwargs)
for component in self:
doc = component(doc)
return doc
def copy(self):
return self.__copy__()
def __copy__(self):
config = self.meta
return Pipeline.from_config(config)
@property
def meta(self):
return {
'classpath': classpath_of(self),
'hanlp_version': hanlp.version.__version__,
'pipes': [pipe.config for pipe in self]
}
@meta.setter
def meta(self, value):
pass
def save(self, filepath):
save_json(self.meta, filepath)
def load(self, filepath):
meta = load_json(filepath)
self.clear()
self.extend(Pipeline.from_config(meta))
@staticmethod
def from_config(meta: Union[dict, str], **kwargs):
if isinstance(meta, str):
meta = load_json(meta)
return Pipeline(*[load_from_meta(pipe) for pipe in meta['pipes']])
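# Editor's usage sketch (assumption: plain Python callables wrapped as LambdaComponent;
# no pretrained models involved). Each pipe reads input_key from the Document and stores
# its result under output_key; a pipe whose input_key is None inherits the previous
# pipe's output_key.
if __name__ == '__main__':
    pipeline = Pipeline()
    pipeline.append(lambda text: text.split('.'), output_key='sentences')
    pipeline.append(lambda sents: [s.strip() for s in sents if s.strip()], output_key='stripped')
    doc = pipeline('Hello world. How are you.')
    print(doc['stripped'])  # ['Hello world', 'How are you']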
================================================
FILE: hanlp/components/rnn_language_model_tf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-04 17:28
from typing import List, Union
import tensorflow as tf
from hanlp.common.keras_component import KerasComponent
from hanlp.transform.text_tf import TextTransform
class RNNLanguageModel(KerasComponent):
def __init__(self, transform: TextTransform = None) -> None:
if not transform:
transform = TextTransform()
super().__init__(transform)
self.transform: TextTransform = transform
def fit(self, trn_data, dev_data, save_dir,
forward=True,
embedding=100,
rnn_input_dropout=0.1,
rnn_units: int = 1024,
rnn_output_dropout=0.1,
seq_len: int = 250,
optimizer='sgd',
learning_rate=20,
anneal_factor: float = 0.25,
anneal_patience: int = 10,
clipnorm=0.25,
batch_size: int = 100, epochs=1000, run_eagerly=False, logger=None, verbose=True,
**kwargs):
return super().fit(**dict((k, v) for k, v in locals().items() if k not in ('self', 'kwargs')))
def build_model(self, embedding, rnn_input_dropout, rnn_units, rnn_output_dropout, batch_size, seq_len, training,
**kwargs) -> tf.keras.Model:
model = tf.keras.Sequential()
extra_args = {}
if training:
extra_args['batch_input_shape'] = [batch_size, seq_len]
embedding = tf.keras.layers.Embedding(input_dim=len(self.transform.vocab), output_dim=embedding,
trainable=True, mask_zero=True, **extra_args)
model.add(embedding)
if rnn_input_dropout:
model.add(tf.keras.layers.Dropout(rnn_input_dropout, name='rnn_input_dropout'))
model.add(tf.keras.layers.LSTM(units=rnn_units, return_sequences=True, stateful=training, name='encoder'))
if rnn_output_dropout:
model.add(tf.keras.layers.Dropout(rnn_output_dropout, name='rnn_output_dropout'))
model.add(tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(len(self.transform.vocab)), name='decoder'))
return model
# noinspection PyMethodOverriding
def build_optimizer(self, optimizer, learning_rate, clipnorm, **kwargs):
if optimizer == 'sgd':
optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate, clipnorm=clipnorm)
return super().build_optimizer(optimizer, **kwargs)
def build_train_dataset(self, trn_data, batch_size):
trn_data = self.transform.file_to_dataset(trn_data, batch_size=batch_size, shuffle=False, repeat=-1)
return trn_data
def build_valid_dataset(self, dev_data, batch_size):
dev_data = self.transform.file_to_dataset(dev_data, batch_size=batch_size, shuffle=False, drop_remainder=True)
return dev_data
def generate_text(self, text: Union[str, List[str]] = '\n', num_steps=50):
char_mode = False
if isinstance(text, str):
text = list(text)
char_mode = True
forward = self.config['forward']
        # A slow implementation; it might be better to let the LSTM return its states.
        # This interface is just for fun, though, so let's keep it simple.
for step in range(num_steps):
output = self.predict(text)
first_or_last_token = output[-1]
if forward:
text += first_or_last_token
else:
text = [first_or_last_token] + text
if char_mode:
text = ''.join(text)
return text
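# Editor's usage sketch (hypothetical path; requires a fitted model): generate_text
# extends its input one token per step, forwards or backwards depending on the
# `forward` flag the model was trained with.
#
#     lm = RNNLanguageModel()
#     lm.load('path/to/lm_save_dir')  # hypothetical save_dir
#     print(lm.generate_text('hello wor', num_steps=10))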
================================================
FILE: hanlp/components/srl/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-06-22 20:50
================================================
FILE: hanlp/components/srl/span_bio/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-04 13:59
================================================
FILE: hanlp/components/srl/span_bio/baffine_tagging.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-04 13:59
import math
import torch
from torch import nn
from hanlp.components.parsers.biaffine.biaffine import Biaffine
from hanlp.components.parsers.biaffine.mlp import MLP
from hanlp.layers.crf.crf import CRF
class BiaffineTaggingDecoder(nn.Module):
def __init__(self,
n_rels,
hidden_size,
n_mlp_rel=300,
mlp_dropout=0.2,
crf=False) -> None:
super().__init__()
self.mlp_rel_h = MLP(n_in=hidden_size,
n_out=n_mlp_rel,
dropout=mlp_dropout)
self.mlp_rel_d = MLP(n_in=hidden_size,
n_out=n_mlp_rel,
dropout=mlp_dropout)
self.rel_attn = Biaffine(n_in=n_mlp_rel,
n_out=n_rels,
bias_x=True,
bias_y=True)
bias = 1 / math.sqrt(self.rel_attn.weight.size(1))
nn.init.uniform_(self.rel_attn.weight, -bias, bias)
self.crf = CRF(n_rels) if crf else None
# noinspection PyUnusedLocal
def forward(self, x: torch.Tensor, **kwargs):
rel_h = self.mlp_rel_h(x)
rel_d = self.mlp_rel_d(x)
# get arc and rel scores from the bilinear attention
# [batch_size, seq_len, seq_len, n_rels]
s_rel = self.rel_attn(rel_d, rel_h).permute(0, 2, 3, 1)
return s_rel
class SpanBIOSemanticRoleLabelingModel(nn.Module):
def __init__(self,
embed,
encoder,
num_labels: int,
n_mlp_rel,
mlp_dropout,
crf=False,
) -> None:
super().__init__()
self.embed = embed
self.encoder = encoder
hidden_size = encoder.get_output_dim() if encoder else embed.get_output_dim()
self.decoder = BiaffineTaggingDecoder(
num_labels,
hidden_size,
n_mlp_rel,
mlp_dropout,
crf,
)
def forward(self, batch, mask):
x = self.embed(batch)
if self.encoder:
x = self.encoder(x, mask=mask)
x = self.decoder(x)
return x
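# Editor's shape walkthrough (illustrative, not part of the original file): for a batch
# of 2 sentences of length 7 with hidden_size 768, n_mlp_rel 300 and 9 BIO labels,
# rel_h and rel_d are (2, 7, 300); the biaffine product scores every (predicate, token)
# pair per label, yielding (2, 9, 7, 7), which the permute in forward() reorders to
# (batch, seq_len, seq_len, n_rels) = (2, 7, 7, 9).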
================================================
FILE: hanlp/components/srl/span_bio/span_bio.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-06-22 20:54
import logging
from copy import copy
from typing import Union, List, Callable, Dict, Any
from bisect import bisect
import torch
import torch.nn.functional as F
from torch import nn
from torch.utils.data import DataLoader
from hanlp_common.constant import IDX, PRED
from hanlp.common.dataset import PadSequenceDataLoader, SamplerBuilder, TransformableDataset
from hanlp.common.structure import History
from hanlp.common.torch_component import TorchComponent
from hanlp.common.transform import FieldLength
from hanlp.common.vocab import Vocab
from hanlp.components.srl.span_bio.baffine_tagging import SpanBIOSemanticRoleLabelingModel
from hanlp.datasets.srl.loaders.conll2012 import CoNLL2012SRLBIODataset
from hanlp.layers.crf.crf import CRF
from hanlp.layers.embeddings.contextual_word_embedding import find_transformer
from hanlp.layers.embeddings.embedding import Embedding
from hanlp.layers.transformers.utils import build_optimizer_scheduler_with_transformer
from hanlp.metrics.chunking.sequence_labeling import get_entities
from hanlp.metrics.f1 import F1
from hanlp.utils.string_util import guess_delimiter
from hanlp.utils.time_util import CountdownTimer
from hanlp.utils.torch_util import clip_grad_norm, lengths_to_mask
from hanlp_common.util import merge_locals_kwargs, reorder
class SpanBIOSemanticRoleLabeler(TorchComponent):
def __init__(self, **kwargs) -> None:
"""A span based Semantic Role Labeling task using BIO scheme for tagging the role of each token. Given a
predicate and a token, it uses biaffine (:cite:`dozat:17a`) to predict their relations as one of BIO-ROLE.
Args:
**kwargs: Predefined config.
"""
super().__init__(**kwargs)
self.model: SpanBIOSemanticRoleLabelingModel = None
def build_optimizer(self,
trn,
epochs,
lr,
adam_epsilon,
weight_decay,
warmup_steps,
transformer_lr=None,
gradient_accumulation=1,
**kwargs):
num_training_steps = len(trn) * epochs // gradient_accumulation
if transformer_lr is None:
transformer_lr = lr
transformer = find_transformer(self.model.embed)
optimizer, scheduler = build_optimizer_scheduler_with_transformer(self.model, transformer,
lr, transformer_lr,
num_training_steps, warmup_steps,
weight_decay, adam_epsilon)
return optimizer, scheduler
def build_criterion(self, decoder=None, **kwargs):
if self.config.crf:
if not decoder:
decoder = self.model.decoder
if isinstance(decoder, torch.nn.DataParallel):
decoder = decoder.module
return decoder.crf
else:
return nn.CrossEntropyLoss(reduction=self.config.loss_reduction)
def build_metric(self, **kwargs):
return F1()
def execute_training_loop(self,
trn: DataLoader,
dev: DataLoader,
epochs,
criterion,
optimizer,
metric,
save_dir,
logger: logging.Logger,
devices,
ratio_width=None,
patience=0.5,
**kwargs):
if isinstance(patience, float):
patience = int(patience * epochs)
best_epoch, best_metric = 0, -1
timer = CountdownTimer(epochs)
history = History()
for epoch in range(1, epochs + 1):
logger.info(f"[yellow]Epoch {epoch} / {epochs}:[/yellow]")
self.fit_dataloader(trn, criterion, optimizer, metric, logger, history=history, ratio_width=ratio_width,
**self.config)
loss, dev_metric = self.evaluate_dataloader(dev, criterion, metric, logger=logger, ratio_width=ratio_width)
timer.update()
report = f"{timer.elapsed_human} / {timer.total_time_human} ETA: {timer.eta_human}"
if dev_metric > best_metric:
best_epoch, best_metric = epoch, copy(dev_metric)
self.save_weights(save_dir)
report += ' [red](saved)[/red]'
else:
report += f' ({epoch - best_epoch})'
if epoch - best_epoch >= patience:
report += ' early stop'
logger.info(report)
if epoch - best_epoch >= patience:
break
if not best_epoch:
self.save_weights(save_dir)
elif best_epoch != epoch:
self.load_weights(save_dir)
logger.info(f"Max score of dev is {best_metric} at epoch {best_epoch}")
logger.info(f"Average time of each epoch is {timer.elapsed_average_human}")
logger.info(f"{timer.elapsed_human} elapsed")
# noinspection PyMethodOverriding
def fit_dataloader(self,
trn: DataLoader,
criterion,
optimizer,
metric,
logger: logging.Logger,
history: History,
gradient_accumulation=1,
grad_norm=None,
ratio_width=None,
eval_trn=False,
**kwargs):
optimizer, scheduler = optimizer
self.model.train()
timer = CountdownTimer(history.num_training_steps(len(trn), gradient_accumulation=gradient_accumulation))
total_loss = 0
for idx, batch in enumerate(trn):
pred, mask = self.feed_batch(batch)
loss = self.compute_loss(criterion, pred, batch['srl_id'], mask)
if gradient_accumulation and gradient_accumulation > 1:
loss /= gradient_accumulation
loss.backward()
total_loss += loss.item()
if eval_trn:
prediction = self.decode_output(pred, mask, batch)
self.update_metrics(metric, prediction, batch)
if history.step(gradient_accumulation):
self._step(optimizer, scheduler, grad_norm)
report = f'loss: {total_loss / (idx + 1):.4f} {metric}' if eval_trn else f'loss: {total_loss / (idx + 1):.4f}'
timer.log(report, logger=logger, ratio_percentage=False, ratio_width=ratio_width)
del loss
del pred
del mask
def naive_decode(self, pred, mask, batch, decoder=None):
vocab = self.vocabs['srl'].idx_to_token
results = []
for sent, matrix in zip(batch['token'], pred.argmax(-1).tolist()):
results.append([])
for token, tags_per_token in zip(sent, matrix):
tags_per_token = [vocab[x] for x in tags_per_token][:len(sent)]
srl_per_token = get_entities(tags_per_token)
results[-1].append(srl_per_token)
return results
def decode_output(self, pred, mask, batch, decoder=None):
# naive = self.naive_decode(pred, mask, batch, decoder)
vocab = self.vocabs['srl'].idx_to_token
if mask is not None:
if self.config.crf:
if not decoder:
decoder = self.model.decoder
crf: CRF = decoder.crf
token_index, mask = mask
pred = crf.decode(pred, mask)
pred = sum(pred, [])
else:
pred = pred[mask].argmax(-1)
pred = pred.tolist()
pred = [vocab[x] for x in pred]
results = []
offset = 0
for sent in batch['token']:
results.append([])
for token in sent:
tags_per_token = pred[offset:offset + len(sent)]
srl_per_token = get_entities(tags_per_token)
results[-1].append(srl_per_token)
offset += len(sent)
assert offset == len(pred)
# assert results == naive
return results
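    # Editor's worked example: get_entities turns a BIO tag sequence into labeled spans
    # with exclusive end offsets, which is what the slicing in prediction_to_result
    # below assumes, e.g.
    #     get_entities(['B-ARG0', 'I-ARG0', 'O', 'B-V']) == [('ARG0', 0, 2), ('V', 3, 4)]
    # so results[i][j] holds the argument spans predicted with token j as the predicate.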
def update_metrics(self, metric, prediction, batch):
for p, g in zip(prediction, batch['srl_set']):
srl = set()
for i, args in enumerate(p):
srl.update((i, start, end, label) for (label, start, end) in args)
metric(srl, g)
return metric
def feed_batch(self, batch: dict):
lens = batch['token_length']
mask2d = lengths_to_mask(lens)
pred = self.model(batch, mask=mask2d)
mask3d = self.compute_mask(mask2d)
if self.config.crf:
token_index = mask3d[0]
pred = pred.flatten(end_dim=1)[token_index]
pred = F.log_softmax(pred, dim=-1)
return pred, mask3d
def compute_mask(self, mask2d):
mask3d = mask2d.unsqueeze_(-1).expand(-1, -1, mask2d.size(1))
mask3d = mask3d & mask3d.transpose(1, 2)
if self.config.crf:
mask3d = mask3d.flatten(end_dim=1)
token_index = mask3d[:, 0]
mask3d = mask3d[token_index]
return token_index, mask3d
else:
return mask3d
def _step(self, optimizer, scheduler, grad_norm):
clip_grad_norm(self.model, grad_norm)
optimizer.step()
scheduler.step()
optimizer.zero_grad()
# noinspection PyMethodOverriding
def build_model(self, embed: Embedding, encoder, training, **kwargs) -> torch.nn.Module:
# noinspection PyCallByClass
model = SpanBIOSemanticRoleLabelingModel(
embed.module(training=training, vocabs=self.vocabs),
encoder,
len(self.vocabs.srl),
self.config.n_mlp_rel,
self.config.mlp_dropout,
self.config.crf,
)
return model
# noinspection PyMethodOverriding
def build_dataloader(self, data, batch_size,
sampler_builder: SamplerBuilder = None,
gradient_accumulation=1,
shuffle=False, device=None, logger: logging.Logger = None,
transform=None,
**kwargs) -> DataLoader:
if isinstance(data, TransformableDataset):
dataset = data
else:
transforms = [self.config.embed.transform(vocabs=self.vocabs), self.vocabs, FieldLength('token')]
if transform:
transforms.insert(0, transform)
dataset = self.build_dataset(data, transforms)
if self.vocabs.mutable:
# noinspection PyTypeChecker
self.build_vocabs(dataset, logger)
lens = [len(x['token_input_ids']) for x in dataset]
if sampler_builder:
sampler = sampler_builder.build(lens, shuffle, gradient_accumulation)
else:
sampler = None
return PadSequenceDataLoader(dataset, batch_size, shuffle, device=device, batch_sampler=sampler)
def build_dataset(self, data, transform):
dataset = CoNLL2012SRLBIODataset(data,
transform=transform,
doc_level_offset=self.config.get('doc_level_offset', True),
cache=isinstance(data, str))
return dataset
def build_vocabs(self, dataset, logger, **kwargs):
self.vocabs.srl = Vocab(pad_token=None, unk_token=None)
timer = CountdownTimer(len(dataset))
max_seq_len = 0
for sample in dataset:
max_seq_len = max(max_seq_len, len(sample['token_input_ids']))
timer.log(f'Building vocab [blink][yellow]...[/yellow][/blink] (longest sequence: {max_seq_len})')
self.vocabs['srl'].set_unk_as_safe_unk() # C-ARGM-FRQ appears only in test set
self.vocabs.lock()
self.vocabs.summary(logger)
if self.config.get('delimiter') is None:
tokens = dataset[0]['token']
self.config.delimiter = guess_delimiter(tokens)
            logger.info(f'Guessed the delimiter between tokens to be [blue]"{self.config.delimiter}"[/blue]. '
                        f'If this is wrong, specify `delimiter` in `fit()`.')
def predict(self, data: Union[str, List[str]], batch_size: int = None, **kwargs):
if not data:
return []
flat = self.input_is_flat(data)
if flat:
data = [data]
dataloader = self.build_dataloader(self.build_samples(data), batch_size, device=self.device, **kwargs)
results = []
order = []
for batch in dataloader:
pred, mask = self.feed_batch(batch)
prediction = self.decode_output(pred, mask, batch)
results.extend(self.prediction_to_result(prediction, batch))
order.extend(batch[IDX])
results = reorder(results, order)
if flat:
return results[0]
return results
def build_samples(self, data):
return [{'token': token} for token in data]
# noinspection PyMethodOverriding
def fit(self,
trn_data,
dev_data,
save_dir,
embed,
encoder=None,
lr=1e-3,
transformer_lr=1e-4,
adam_epsilon=1e-8,
warmup_steps=0.1,
weight_decay=0,
crf=False,
n_mlp_rel=300,
mlp_dropout=0.2,
batch_size=32,
gradient_accumulation=1,
grad_norm=1,
loss_reduction='mean',
epochs=30,
delimiter=None,
doc_level_offset=True,
eval_trn=False,
logger=None,
devices: Union[float, int, List[int]] = None,
transform=None,
**kwargs):
return super().fit(**merge_locals_kwargs(locals(), kwargs))
def compute_loss(self, criterion, pred, srl, mask):
if self.config.crf:
token_index, mask = mask
criterion: CRF = criterion
loss = -criterion.forward(pred, srl.flatten(end_dim=1)[token_index], mask,
reduction=self.config.loss_reduction)
else:
loss = criterion(pred[mask], srl[mask])
return loss
# noinspection PyMethodOverriding
@torch.no_grad()
def evaluate_dataloader(self, data: DataLoader, criterion: Callable, metric, logger, ratio_width=None,
filename=None, **kwargs):
self.model.eval()
timer = CountdownTimer(len(data))
total_loss = 0
metric.reset()
for idx, batch in enumerate(data):
pred, mask = self.feed_batch(batch)
loss = self.compute_loss(criterion, pred, batch['srl_id'], mask)
total_loss += loss.item()
prediction = self.decode_output(pred, mask, batch)
self.update_metrics(metric, prediction, batch)
report = f'loss: {total_loss / (idx + 1):.4f} {metric}'
timer.log(report, logger=logger, ratio_percentage=False, ratio_width=ratio_width)
return total_loss / timer.total, metric
def input_is_flat(self, data) -> bool:
return isinstance(data[0], str)
def prediction_to_result(self, prediction: List, batch: Dict[str, Any], delimiter=None) -> List:
if delimiter is None:
delimiter = self.config.delimiter
for matrix, tokens in zip(prediction, batch['token']):
result = []
for i, arguments in enumerate(matrix):
if arguments:
pas = [(delimiter.join(tokens[x[1]:x[2]]),) + x for x in arguments]
pas.insert(bisect([a[1] for a in arguments], i), (tokens[i], PRED, i, i + 1))
result.append(pas)
yield result
================================================
FILE: hanlp/components/srl/span_rank/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-07-19 22:22
================================================
FILE: hanlp/components/srl/span_rank/highway_variational_lstm.py
================================================
# Adapted from https://github.com/KiroSummer/A_Syntax-aware_MTL_Framework_for_Chinese_SRL
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.init as init
from torch.autograd import Variable
from .layer import DropoutLayer, HighwayLSTMCell, VariationalLSTMCell
def initializer_1d(input_tensor, initializer):
assert len(input_tensor.size()) == 1
input_tensor = input_tensor.view(-1, 1)
input_tensor = initializer(input_tensor)
return input_tensor.view(-1)
class HighwayBiLSTM(nn.Module):
"""A module that runs multiple steps of HighwayBiLSTM."""
def __init__(self, input_size, hidden_size, num_layers=1, batch_first=False, bidirectional=False, dropout_in=0,
dropout_out=0):
super(HighwayBiLSTM, self).__init__()
self.input_size = input_size
self.hidden_size = hidden_size
self.num_layers = num_layers
self.batch_first = batch_first
self.bidirectional = bidirectional
self.dropout_in = dropout_in
self.dropout_out = dropout_out
self.num_directions = 2 if bidirectional else 1
self.fcells, self.f_dropout, self.f_hidden_dropout = [], [], []
self.bcells, self.b_dropout, self.b_hidden_dropout = [], [], []
for layer in range(num_layers):
layer_input_size = input_size if layer == 0 else hidden_size
self.fcells.append(HighwayLSTMCell(input_size=layer_input_size, hidden_size=hidden_size))
self.f_dropout.append(DropoutLayer(hidden_size, self.dropout_out))
self.f_hidden_dropout.append(DropoutLayer(hidden_size, self.dropout_out))
if self.bidirectional:
self.bcells.append(HighwayLSTMCell(input_size=hidden_size, hidden_size=hidden_size))
self.b_dropout.append(DropoutLayer(hidden_size, self.dropout_out))
self.b_hidden_dropout.append(DropoutLayer(hidden_size, self.dropout_out))
self.fcells, self.bcells = nn.ModuleList(self.fcells), nn.ModuleList(self.bcells)
self.f_dropout, self.b_dropout = nn.ModuleList(self.f_dropout), nn.ModuleList(self.b_dropout)
def reset_dropout_layer(self, batch_size):
for layer in range(self.num_layers):
self.f_dropout[layer].reset_dropout_mask(batch_size)
if self.bidirectional:
self.b_dropout[layer].reset_dropout_mask(batch_size)
@staticmethod
def _forward_rnn(cell, gate, input, masks, initial, drop_masks=None, hidden_drop=None):
max_time = input.size(0)
output = []
hx = initial
for time in range(max_time):
h_next, c_next = cell(input[time], mask=masks[time], hx=hx, dropout=drop_masks)
hx = (h_next, c_next)
output.append(h_next)
output = torch.stack(output, 0)
return output, hx
@staticmethod
def _forward_brnn(cell, gate, input, masks, initial, drop_masks=None, hidden_drop=None):
max_time = input.size(0)
output = []
hx = initial
for time in reversed(list(range(max_time))):
h_next, c_next = cell(input[time], mask=masks[time], hx=hx, dropout=drop_masks)
hx = (h_next, c_next)
output.append(h_next)
output.reverse()
output = torch.stack(output, 0)
return output, hx
def forward(self, input, masks, initial=None):
if self.batch_first:
            input = input.transpose(0, 1)  # (batch, time, dim) -> (time, batch, dim)
masks = torch.unsqueeze(masks.transpose(0, 1), dim=2)
max_time, batch_size, _ = input.size()
        self.reset_dropout_layer(batch_size)  # reset the dropout masks for each forward pass
        masks = masks.expand(-1, -1, self.hidden_size)  # expand with -1 keeps that dimension unchanged
if initial is None:
initial = Variable(input.data.new(batch_size, self.hidden_size).zero_())
initial = (initial, initial) # h0, c0
h_n, c_n = [], []
for layer in range(self.num_layers):
# hidden_mask, hidden_drop = None, None
hidden_mask, hidden_drop = self.f_dropout[layer], self.f_hidden_dropout[layer]
layer_output, (layer_h_n, layer_c_n) = HighwayBiLSTM._forward_rnn(cell=self.fcells[layer], \
gate=None, input=input, masks=masks,
initial=initial, \
drop_masks=hidden_mask,
hidden_drop=hidden_drop)
h_n.append(layer_h_n)
c_n.append(layer_c_n)
if self.bidirectional:
hidden_mask, hidden_drop = self.b_dropout[layer], self.b_hidden_dropout[layer]
blayer_output, (blayer_h_n, blayer_c_n) = HighwayBiLSTM._forward_brnn(cell=self.bcells[layer], \
gate=None, input=layer_output,
masks=masks, initial=initial, \
drop_masks=hidden_mask,
hidden_drop=hidden_drop)
h_n.append(blayer_h_n)
c_n.append(blayer_c_n)
input = blayer_output if self.bidirectional else layer_output
h_n, c_n = torch.stack(h_n, 0), torch.stack(c_n, 0)
if self.batch_first:
            input = input.transpose(1, 0)  # (time, batch, dim) -> (batch, time, dim)
return input, (h_n, c_n)
class StackedHighwayBiLSTM(nn.Module):
"""A module that runs multiple steps of HighwayBiLSTM."""
def __init__(self, input_size, hidden_size, num_layers=1, batch_first=False, \
bidirectional=False, dropout_in=0, dropout_out=0):
super(StackedHighwayBiLSTM, self).__init__()
self.input_size = input_size
self.hidden_size = hidden_size
self.num_layers = num_layers
self.batch_first = batch_first
self.bidirectional = bidirectional
self.dropout_in = dropout_in
self.dropout_out = dropout_out
self.num_directions = 2 if bidirectional else 1
self.fcells, self.f_dropout, self.f_hidden_dropout = [], [], []
self.bcells, self.b_dropout, self.b_hidden_dropout = [], [], []
self.f_initial, self.b_initial = [], []
for layer in range(num_layers):
layer_input_size = input_size if layer == 0 else 2 * hidden_size if self.bidirectional else hidden_size
self.fcells.append(VariationalLSTMCell(input_size=layer_input_size, hidden_size=hidden_size))
self.f_dropout.append(DropoutLayer(hidden_size, self.dropout_out))
self.f_hidden_dropout.append(DropoutLayer(hidden_size, self.dropout_out))
self.f_initial.append(nn.Parameter(torch.Tensor(2, self.hidden_size)))
assert self.bidirectional is True
self.bcells.append(VariationalLSTMCell(input_size=layer_input_size, hidden_size=hidden_size))
self.b_dropout.append(DropoutLayer(hidden_size, self.dropout_out))
self.b_hidden_dropout.append(DropoutLayer(hidden_size, self.dropout_out))
self.b_initial.append(nn.Parameter(torch.Tensor(2, self.hidden_size)))
self.lstm_project_layer = nn.ModuleList([nn.Linear(2 * self.hidden_size, 2 * self.hidden_size)
for _ in range(num_layers - 1)])
self.fcells, self.bcells = nn.ModuleList(self.fcells), nn.ModuleList(self.bcells)
self.f_dropout, self.b_dropout = nn.ModuleList(self.f_dropout), nn.ModuleList(self.b_dropout)
self.f_hidden_dropout, self.b_hidden_dropout = \
nn.ModuleList(self.f_hidden_dropout), nn.ModuleList(self.b_hidden_dropout)
self.f_initial, self.b_initial = nn.ParameterList(self.f_initial), nn.ParameterList(self.b_initial)
self.reset_parameters()
def reset_parameters(self):
for layer_initial in [self.f_initial, self.b_initial]:
for initial in layer_initial:
init.xavier_uniform_(initial)
for layer in self.lstm_project_layer:
init.xavier_uniform_(layer.weight)
initializer_1d(layer.bias, init.xavier_uniform_)
def reset_dropout_layer(self, batch_size):
for layer in range(self.num_layers):
self.f_dropout[layer].reset_dropout_mask(batch_size)
self.f_hidden_dropout[layer].reset_dropout_mask(batch_size)
if self.bidirectional:
self.b_dropout[layer].reset_dropout_mask(batch_size)
self.b_hidden_dropout[layer].reset_dropout_mask(batch_size)
def reset_state(self, batch_size):
f_states, b_states = [], []
for f_layer_initial, b_layer_initial in zip(self.f_initial, self.b_initial):
f_states.append([f_layer_initial[0].expand(batch_size, -1), f_layer_initial[1].expand(batch_size, -1)])
b_states.append([b_layer_initial[0].expand(batch_size, -1), b_layer_initial[1].expand(batch_size, -1)])
return f_states, b_states
@staticmethod
def _forward_rnn(cell, gate, input, masks, initial, drop_masks=None, hidden_drop=None):
max_time = input.size(0)
output = []
hx = initial
for time in range(max_time):
h_next, c_next = cell(input[time], mask=masks[time], hx=hx, dropout=drop_masks)
hx = (h_next, c_next)
output.append(h_next)
output = torch.stack(output, 0)
return output, hx
@staticmethod
def _forward_brnn(cell, gate, input, masks, initial, drop_masks=None, hidden_drop=None):
max_time = input.size(0)
output = []
hx = initial
for time in reversed(list(range(max_time))):
h_next, c_next = cell(input[time], mask=masks[time], hx=hx, dropout=drop_masks)
hx = (h_next, c_next)
output.append(h_next)
output.reverse()
output = torch.stack(output, 0)
return output, hx
def forward(self, input, masks, initial=None):
if self.batch_first:
            input = input.transpose(0, 1)  # convert to time-major: [max_time, batch_size, dim]
            masks = torch.unsqueeze(masks.transpose(0, 1), dim=2)
        max_time, batch_size, _ = input.size()
        self.reset_dropout_layer(batch_size)  # re-sample the dropout masks for each batch
        f_states, b_states = self.reset_state(batch_size)
        masks = masks.expand(-1, -1, self.hidden_size)  # -1 keeps that dimension unchanged
h_n, c_n = [], []
outputs = []
for layer in range(self.num_layers):
hidden_mask, hidden_drop = self.f_dropout[layer], self.f_hidden_dropout[layer]
layer_output, (layer_h_n, layer_c_n) = \
StackedHighwayBiLSTM._forward_rnn(cell=self.fcells[layer],
gate=None, input=input, masks=masks, initial=f_states[layer],
drop_masks=hidden_mask, hidden_drop=hidden_drop)
h_n.append(layer_h_n)
c_n.append(layer_c_n)
assert self.bidirectional is True
hidden_mask, hidden_drop = self.b_dropout[layer], self.b_hidden_dropout[layer]
blayer_output, (blayer_h_n, blayer_c_n) = \
StackedHighwayBiLSTM._forward_brnn(cell=self.bcells[layer],
gate=None, input=input, masks=masks, initial=b_states[layer],
drop_masks=hidden_mask, hidden_drop=hidden_drop)
h_n.append(blayer_h_n)
c_n.append(blayer_c_n)
output = torch.cat([layer_output, blayer_output], 2) if self.bidirectional else layer_output
output = F.dropout(output, self.dropout_out, self.training)
if layer > 0: # Highway
highway_gates = torch.sigmoid(self.lstm_project_layer[layer - 1].forward(output))
output = highway_gates * output + (1 - highway_gates) * input
if self.batch_first:
outputs.append(output.transpose(1, 0))
else:
outputs.append(output)
input = output
h_n, c_n = torch.stack(h_n, 0), torch.stack(c_n, 0)
if self.batch_first:
            output = output.transpose(1, 0)  # back to batch-major: [batch_size, max_time, dim]
return output, (h_n, c_n), outputs
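# The sketch below is illustrative only and not part of the original module: it
# shows the expected shapes when running the stacked encoder on a toy batch
# (hypothetical sizes; masks of ones mean "no padding").
def _demo_stacked_highway_bilstm():
    encoder = StackedHighwayBiLSTM(input_size=8, hidden_size=4, num_layers=2,
                                   batch_first=True, bidirectional=True,
                                   dropout_in=0.0, dropout_out=0.0)
    x = torch.randn(2, 5, 8)   # [batch, time, input_size]
    masks = torch.ones(2, 5)   # [batch, time]
    output, (h_n, c_n), outputs = encoder(x, masks)
    return output.shape        # torch.Size([2, 5, 8]): forward and backward states concatenated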
================================================
FILE: hanlp/components/srl/span_rank/inference_utils.py
================================================
# Adopted from https://github.com/KiroSummer/A_Syntax-aware_MTL_Framework_for_Chinese_SRL
# Inference functions for the SRL model.
import numpy as np
def decode_spans(span_starts, span_ends, span_scores, labels_inv):
"""
Args:
span_starts: [num_candidates,]
span_scores: [num_candidates, num_labels]
span_ends:
labels_inv:
Returns:
"""
pred_spans = []
span_labels = np.argmax(span_scores, axis=1) # [num_candidates]
spans_list = list(zip(span_starts, span_ends, span_labels, span_scores))
spans_list = sorted(spans_list, key=lambda x: x[3][x[2]], reverse=True)
predicted_spans = {}
for start, end, label, _ in spans_list:
# Skip invalid span.
if label == 0 or (start, end) in predicted_spans:
continue
pred_spans.append((start, end, labels_inv[label]))
predicted_spans[(start, end)] = label
return pred_spans
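# Illustrative usage, not part of the original module: two candidate spans and
# three labels, where label id 0 is the null label.
def _demo_decode_spans():
    span_starts = np.array([0, 2])
    span_ends = np.array([1, 3])
    span_scores = np.array([[0., 2., .5],    # best label 1 with score 2.0
                            [0., .1, 3.]])   # best label 2 with score 3.0
    labels_inv = ['O', 'ARG0', 'ARG1']
    return decode_spans(span_starts, span_ends, span_scores, labels_inv)
    # -> [(2, 3, 'ARG1'), (0, 1, 'ARG0')], highest-scoring spans first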
def greedy_decode(predict_dict, srl_labels_inv):
"""Greedy decoding for SRL predicate-argument structures.
Args:
predict_dict: Dictionary of name to numpy arrays.
srl_labels_inv: SRL label id to string name.
suppress_overlap: Whether to greedily suppress overlapping arguments for the same predicate.
Returns:
"""
arg_starts = predict_dict["arg_starts"]
arg_ends = predict_dict["arg_ends"]
predicates = predict_dict["predicates"]
arg_labels = predict_dict["arg_labels"]
scores = predict_dict["srl_scores"]
num_suppressed_args = 0
# Map from predicates to a list of labeled spans.
pred_to_args = {}
if len(arg_ends) > 0 and len(predicates) > 0:
max_len = max(np.max(arg_ends), np.max(predicates)) + 1
else:
max_len = 1
for j, pred_id in enumerate(predicates):
args_list = []
for i, (arg_start, arg_end) in enumerate(zip(arg_starts, arg_ends)):
# If label is not null.
if arg_labels[i][j] == 0:
continue
label = srl_labels_inv[arg_labels[i][j]]
# if label not in ["V", "C-V"]:
args_list.append((arg_start, arg_end, label, scores[i][j][arg_labels[i][j]]))
# Sort arguments by highest score first.
args_list = sorted(args_list, key=lambda x: x[3], reverse=True)
new_args_list = []
flags = [False for _ in range(max_len)]
# Predicate will not overlap with arguments either.
flags[pred_id] = True
for (arg_start, arg_end, label, score) in args_list:
# If none of the tokens has been covered:
if not max(flags[arg_start:arg_end + 1]):
new_args_list.append((arg_start, arg_end, label))
for k in range(arg_start, arg_end + 1):
flags[k] = True
# Only add predicate if it has any argument.
if new_args_list:
pred_to_args[pred_id] = new_args_list
num_suppressed_args += len(args_list) - len(new_args_list)
return pred_to_args, num_suppressed_args
_CORE_ARGS = {"ARG0": 1, "ARG1": 2, "ARG2": 4, "ARG3": 8, "ARG4": 16, "ARG5": 32, "ARGA": 64,
"A0": 1, "A1": 2, "A2": 4, "A3": 8, "A4": 16, "A5": 32, "AA": 64}
def get_predicted_clusters(top_span_starts, top_span_ends, predicted_antecedents):
mention_to_predicted = {}
predicted_clusters = []
for i, predicted_index in enumerate(predicted_antecedents):
if predicted_index < 0:
continue
assert i > predicted_index
predicted_antecedent = (int(top_span_starts[predicted_index]), int(top_span_ends[predicted_index]))
if predicted_antecedent in mention_to_predicted:
predicted_cluster = mention_to_predicted[predicted_antecedent]
else:
predicted_cluster = len(predicted_clusters)
predicted_clusters.append([predicted_antecedent])
mention_to_predicted[predicted_antecedent] = predicted_cluster
mention = (int(top_span_starts[i]), int(top_span_ends[i]))
predicted_clusters[predicted_cluster].append(mention)
mention_to_predicted[mention] = predicted_cluster
predicted_clusters = [tuple(pc) for pc in predicted_clusters]
mention_to_predicted = {m: predicted_clusters[i] for m, i in list(mention_to_predicted.items())}
return predicted_clusters, mention_to_predicted
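# Illustrative usage, not part of the original module: mention 1 points back to
# mention 0 and mention 2 to mention 1, so all three spans form one cluster.
def _demo_get_predicted_clusters():
    starts, ends = [0, 3, 7], [1, 4, 8]
    antecedents = [-1, 0, 1]   # -1 means "no antecedent"
    return get_predicted_clusters(starts, ends, antecedents)
    # -> ([((0, 1), (3, 4), (7, 8))], {mention: its cluster tuple})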
def _decode_non_overlapping_spans(starts, ends, scores, max_len, labels_inv, pred_id):
labels = np.argmax(scores, axis=1)
spans = []
for i, (start, end, label) in enumerate(zip(starts, ends, labels)):
if label <= 0:
continue
label_str = labels_inv[label]
if pred_id is not None and label_str == "V":
continue
spans.append((start, end, label_str, scores[i][label]))
spans = sorted(spans, key=lambda x: x[3], reverse=True)
flags = np.zeros([max_len], dtype=bool)
if pred_id is not None:
flags[pred_id] = True
new_spans = []
for start, end, label_str, score in spans:
if not max(flags[start:end + 1]):
new_spans.append((start, end, label_str)) # , score))
for k in range(start, end + 1):
flags[k] = True
return new_spans
def _dp_decode_non_overlapping_spans(starts, ends, scores, max_len, labels_inv, pred_id, u_constraint=False):
num_roles = scores.shape[1] # [num_arg, num_roles]
labels = np.argmax(scores, axis=1).astype(np.int64)
spans = list(zip(starts, ends, list(range(len(starts)))))
spans = sorted(spans, key=lambda x: (x[0], x[1])) # sort according to the span start index
if u_constraint:
f = np.zeros([max_len + 1, 128], dtype=float) - 0.1
    else:  # default: no unique-core-argument constraint
f = np.zeros([max_len + 1, 1], dtype=float) - 0.1
f[0, 0] = 0
    states = {0: {0}}  # map from end position t to the set of core-arg bitmask states reachable at t
pointers = {} # A dictionary from states to (arg_id, role, prev_t, prev_rs)
best_state = [(0, 0)]
def _update_state(t0, rs0, t1, rs1, delta, arg_id, role):
if f[t0][rs0] + delta > f[t1][rs1]:
f[t1][rs1] = f[t0][rs0] + delta
if t1 not in states:
states[t1] = set()
states[t1].update([rs1])
            pointers[(t1, rs1)] = (arg_id, role, t0, rs0)  # backpointer for reconstructing the best assignment
if f[t1][rs1] > f[best_state[0][0]][best_state[0][1]]:
best_state[0] = (t1, rs1)
for start, end, i in spans: # [arg_start, arg_end, arg_span_id]
assert scores[i][0] == 0 # dummy score
        # The dummy score is the same for all states, so we can safely skip arguments
        # that overlap with the predicate.
        if pred_id is not None and start <= pred_id <= end:  # skip spans that contain the predicate
continue
r0 = labels[i] # Locally best role assignment.
# Strictly better to incorporate a dummy span if it has the highest local score.
if r0 == 0: # labels_inv[r0] == "O"
continue
r0_str = labels_inv[r0]
# Enumerate explored states.
        t_states = [t for t in states.keys() if t <= start]  # explored positions no later than this span's start
for t in t_states: # for each state
role_states = states[t]
# Update states if best role is not a core arg.
            if not u_constraint or r0_str not in _CORE_ARGS:  # default path when the core-arg constraint is off
                for rs in role_states:  # each reachable core-arg bitmask state at position t
                    _update_state(t, rs, end + 1, rs, scores[i][r0], i, r0)
else:
for rs in role_states:
for r in range(1, num_roles):
if scores[i][r] > 0:
r_str = labels_inv[r]
core_state = _CORE_ARGS.get(r_str, 0)
if core_state & rs == 0:
_update_state(t, rs, end + 1, rs | core_state, scores[i][r], i, r)
# Backtrack to decode.
new_spans = []
t, rs = best_state[0]
while (t, rs) in pointers:
i, r, t0, rs0 = pointers[(t, rs)]
new_spans.append((int(starts[i]), int(ends[i]), labels_inv[r]))
t = t0
rs = rs0
return new_spans[::-1]
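# Illustrative usage, not part of the original module: span (0, 2) overlaps both
# (0, 1) and (2, 3), so the DP keeps the pair with the higher total score
# (2.0 + 2.0 > 3.0). Column 0 must be the dummy (null) label with score 0.
def _demo_dp_decode():
    starts = np.array([0, 0, 2])
    ends = np.array([1, 2, 3])
    scores = np.array([[0., 2.],    # (0, 1): ARG0 score 2.0
                       [0., 3.],    # (0, 2): ARG0 score 3.0
                       [0., 2.]])   # (2, 3): ARG0 score 2.0
    return _dp_decode_non_overlapping_spans(starts, ends, scores, max_len=4,
                                            labels_inv=['O', 'ARG0'], pred_id=None)
    # -> [(0, 1, 'ARG0'), (2, 3, 'ARG0')]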
def srl_decode(sentence_lengths, predict_dict, srl_labels_inv, config): # decode the predictions.
# Decode sentence-level tasks.
num_sentences = len(sentence_lengths)
predictions = [{} for _ in range(num_sentences)]
    # Sentence-level predictions. Move the tensors to numpy once, before the loop.
    predict_dict_num_args_ = predict_dict["num_args"].cpu().numpy()
    predict_dict_num_preds_ = predict_dict["num_preds"].cpu().numpy()
    predict_dict_predicates_ = predict_dict["predicates"].cpu().numpy()
    predict_dict_arg_starts_ = predict_dict["arg_starts"].cpu().numpy()
    predict_dict_arg_ends_ = predict_dict["arg_ends"].cpu().numpy()
    predict_dict_srl_scores_ = predict_dict["srl_scores"].detach().cpu().numpy()
    for i in range(num_sentences):  # for each sentence
        num_args = predict_dict_num_args_[i]  # number of candidate argument spans
        num_preds = predict_dict_num_preds_[i]  # number of candidate predicates
        # For each predicate, decode its argument spans.
        for j, pred_id in enumerate(predict_dict_predicates_[i][:num_preds]):
            # Pick non-overlapping argument spans by dynamic programming;
            # enforce_srl_constraint toggles the unique-core-argument constraint.
arg_spans = _dp_decode_non_overlapping_spans(
predict_dict_arg_starts_[i][:num_args],
predict_dict_arg_ends_[i][:num_args],
predict_dict_srl_scores_[i, :num_args, j, :],
sentence_lengths[i], srl_labels_inv, pred_id, config.enforce_srl_constraint)
# To avoid warnings in the eval script.
            if config.use_gold_predicates:
arg_spans.append((pred_id, pred_id, "V"))
if arg_spans:
predictions[i][int(pred_id)] = sorted(arg_spans, key=lambda x: (x[0], x[1]))
return predictions
================================================
FILE: hanlp/components/srl/span_rank/layer.py
================================================
# Adopted from https://github.com/KiroSummer/A_Syntax-aware_MTL_Framework_for_Chinese_SRL
import torch
import torch.nn as nn
from torch.autograd import Variable
import numpy as np
import torch.nn.functional as F
from hanlp.components.srl.span_rank.util import block_orth_normal_initializer
def get_tensor_np(t):
return t.data.cpu().numpy()
def orthonormal_initializer(output_size, input_size):
"""adopted from Timothy Dozat https://github.com/tdozat/Parser/blob/master/lib/linalg.py
Args:
output_size:
input_size:
Returns:
"""
print((output_size, input_size))
I = np.eye(output_size)
lr = .1
eps = .05 / (output_size + input_size)
success = False
tries = 0
while not success and tries < 10:
Q = np.random.randn(input_size, output_size) / np.sqrt(output_size)
for i in range(100):
QTQmI = Q.T.dot(Q) - I
loss = np.sum(QTQmI ** 2 / 2)
Q2 = Q ** 2
Q -= lr * Q.dot(QTQmI) / (
np.abs(Q2 + Q2.sum(axis=0, keepdims=True) + Q2.sum(axis=1, keepdims=True) - 1) + eps)
if np.max(Q) > 1e6 or loss > 1e6 or not np.isfinite(loss):
tries += 1
lr /= 2
break
success = True
if success:
        print('Orthogonal pretrainer loss: %.2e' % loss)
else:
print('Orthogonal pretrainer failed, using non-orthogonal random matrix')
Q = np.random.randn(input_size, output_size) / np.sqrt(output_size)
return np.transpose(Q.astype(np.float32))
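# Illustrative check, not part of the original module: the rows of the returned
# [output_size, input_size] matrix are approximately orthonormal, i.e. W @ W.T ≈ I.
def _demo_orthonormal_initializer():
    W = orthonormal_initializer(4, 8)
    # Usually True; the function falls back to a plain random matrix if the
    # pretrainer fails to converge.
    return np.allclose(W.dot(W.T), np.eye(4), atol=1e-2)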
class LayerNorm(nn.Module):
def __init__(self, features, eps=1e-8):
super(LayerNorm, self).__init__()
self.gamma = nn.Parameter(torch.ones(features))
self.beta = nn.Parameter(torch.zeros(features))
self.eps = eps
def forward(self, x):
mean = x.mean(-1, keepdim=True)
std = x.std(-1, keepdim=True)
return self.gamma * (x - mean) / (std + self.eps) + self.beta
class DropoutLayer3D(nn.Module):
def __init__(self, input_size, dropout_rate=0.0):
super(DropoutLayer3D, self).__init__()
self.dropout_rate = dropout_rate
self.input_size = input_size
self.drop_mask = torch.FloatTensor(self.input_size).fill_(1 - self.dropout_rate)
self.drop_mask = Variable(torch.bernoulli(self.drop_mask), requires_grad=False)
if torch.cuda.is_available():
self.drop_mask = self.drop_mask.cuda()
def reset_dropout_mask(self, batch_size, length):
self.drop_mask = torch.FloatTensor(batch_size, length, self.input_size).fill_(1 - self.dropout_rate)
self.drop_mask = Variable(torch.bernoulli(self.drop_mask), requires_grad=False)
if torch.cuda.is_available():
self.drop_mask = self.drop_mask.cuda()
def forward(self, x):
if self.training:
return torch.mul(x, self.drop_mask)
else: # eval
return x * (1.0 - self.dropout_rate)
class DropoutLayer(nn.Module):
def __init__(self, input_size, dropout_rate=0.0):
super(DropoutLayer, self).__init__()
self.dropout_rate = dropout_rate
self.input_size = input_size
self.drop_mask = torch.Tensor(self.input_size).fill_(1 - self.dropout_rate)
self.drop_mask = torch.bernoulli(self.drop_mask)
def reset_dropout_mask(self, batch_size):
self.drop_mask = torch.Tensor(batch_size, self.input_size).fill_(1 - self.dropout_rate)
self.drop_mask = torch.bernoulli(self.drop_mask)
def forward(self, x):
if self.training:
return torch.mul(x, self.drop_mask.to(x.device))
else: # eval
return x * (1.0 - self.dropout_rate)
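# Illustrative usage, not part of the original module: unlike nn.Dropout, this
# layer keeps one fixed mask until reset_dropout_mask() is called, so every
# timestep of a sequence drops the same hidden units (variational dropout).
def _demo_dropout_layer():
    layer = DropoutLayer(input_size=8, dropout_rate=0.5)
    layer.train()
    layer.reset_dropout_mask(batch_size=2)
    x = torch.ones(2, 8)
    same_mask = torch.equal(layer(x), layer(x))   # True: the mask does not change between calls
    layer.eval()
    expectation = layer(x)                        # at eval time, scale by (1 - dropout_rate)
    return same_mask, expectation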
class NonLinear(nn.Module):
def __init__(self, input_size, hidden_size, activation=None):
super(NonLinear, self).__init__()
self.input_size = input_size
self.hidden_size = hidden_size
self.linear = nn.Linear(in_features=input_size, out_features=hidden_size)
if activation is None:
self._activate = lambda x: x
else:
if not callable(activation):
raise ValueError("activation must be callable: type={}".format(type(activation)))
self._activate = activation
self.reset_parameters()
def forward(self, x):
y = self.linear(x)
return self._activate(y)
def reset_parameters(self):
nn.init.xavier_uniform_(self.linear.weight)
nn.init.zeros_(self.linear.bias)
class Biaffine(nn.Module):
def __init__(self, in1_features, in2_features, out_features,
bias=(True, True)):
super(Biaffine, self).__init__()
self.in1_features = in1_features
self.in2_features = in2_features
self.out_features = out_features
self.bias = bias
self.linear_input_size = in1_features + int(bias[0])
self.linear_output_size = out_features * (in2_features + int(bias[1]))
self.linear = nn.Linear(in_features=self.linear_input_size,
out_features=self.linear_output_size,
bias=False)
self.reset_parameters()
def reset_parameters(self):
torch.nn.init.xavier_uniform_(self.linear.weight)
def forward(self, input1, input2):
batch_size, len1, dim1 = input1.size()
batch_size, len2, dim2 = input2.size()
if self.bias[0]:
            ones = input1.data.new(batch_size, len1, 1).fill_(1)  # constant 1 column absorbs the bias term
input1 = torch.cat((input1, Variable(ones)), dim=2)
dim1 += 1
if self.bias[1]:
            ones = input2.data.new(batch_size, len2, 1).fill_(1)
input2 = torch.cat((input2, Variable(ones)), dim=2)
dim2 += 1
affine = self.linear(input1)
affine = affine.view(batch_size, len1 * self.out_features, dim2)
input2 = torch.transpose(input2, 1, 2)
# torch.bmm: Performs a batch matrix-matrix product of matrices stored in batch1 and batch2.
biaffine = torch.transpose(torch.bmm(affine, input2), 1, 2)
# view: Returns a new tensor with the same data as the self tensor but of a different size.
biaffine = biaffine.contiguous().view(batch_size, len2, len1, self.out_features)
return biaffine
def __repr__(self):
return self.__class__.__name__ + ' (' \
+ 'in1_features=' + str(self.in1_features) \
+ ', in2_features=' + str(self.in2_features) \
+ ', out_features=' + str(self.out_features) + ')'
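# Illustrative shape check, not part of the original module: Biaffine scores
# every pair of positions between two sequences, one score per output label.
def _demo_biaffine():
    layer = Biaffine(in1_features=5, in2_features=6, out_features=3)
    input1 = torch.randn(2, 7, 5)        # [batch, len1, in1_features]
    input2 = torch.randn(2, 9, 6)        # [batch, len2, in2_features]
    return layer(input1, input2).shape   # torch.Size([2, 9, 7, 3])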
class HighwayLSTMCell(nn.Module):
def __init__(self, input_size, hidden_size):
super(HighwayLSTMCell, self).__init__()
self.input_size = input_size
self.hidden_size = hidden_size
self.linear_ih = nn.Linear(in_features=input_size,
out_features=6 * hidden_size)
self.linear_hh = nn.Linear(in_features=hidden_size,
out_features=5 * hidden_size,
bias=False)
        self.reset_parameters()  # initialize all parameters of the cell
def reset_parameters(self):
weight_ih = block_orth_normal_initializer([self.input_size, ], [self.hidden_size] * 6)
self.linear_ih.weight.data.copy_(weight_ih)
weight_hh = block_orth_normal_initializer([self.hidden_size, ], [self.hidden_size] * 5)
self.linear_hh.weight.data.copy_(weight_hh)
        nn.init.constant_(self.linear_ih.bias, 0.0)
def forward(self, x, mask=None, hx=None, dropout=None):
assert mask is not None and hx is not None
_h, _c = hx
        _x = self.linear_ih(x)  # input projections for all six gates
        preact = self.linear_hh(_h) + _x[:, :self.hidden_size * 5]
        i, f, o, t, j = preact.chunk(chunks=5, dim=1)
        i, f, o, t, j = torch.sigmoid(i), torch.sigmoid(f + 1.0), torch.sigmoid(o), torch.sigmoid(t), torch.tanh(j)
        k = _x[:, self.hidden_size * 5:]
        c = f * _c + i * j
        c = mask * c + (1.0 - mask) * _c
        h = t * o * torch.tanh(c) + (1.0 - t) * k
if dropout is not None:
h = dropout(h)
h = mask * h + (1.0 - mask) * _h
return h, c
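# Illustrative usage, not part of the original module: one step of the highway
# LSTM cell. The transform gate t interpolates between the LSTM output and a
# linear projection k of the input (the highway connection).
def _demo_highway_lstm_cell():
    cell = HighwayLSTMCell(input_size=4, hidden_size=3)
    x = torch.randn(2, 4)
    h0 = c0 = torch.zeros(2, 3)
    mask = torch.ones(2, 3)     # 1 = real token, 0 = padding
    h1, c1 = cell(x, mask=mask, hx=(h0, c0))
    return h1.shape, c1.shape   # (torch.Size([2, 3]), torch.Size([2, 3]))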
class VariationalLSTMCell(nn.Module):
def __init__(self, input_size, hidden_size):
super(VariationalLSTMCell, self).__init__()
self.input_size = input_size
self.hidden_size = hidden_size
self.linear = nn.Linear(in_features=input_size + self.hidden_size, out_features=3 * hidden_size)
        self.reset_parameters()  # initialize all parameters of the cell
def reset_parameters(self):
weight = block_orth_normal_initializer([self.input_size + self.hidden_size, ], [self.hidden_size] * 3)
self.linear.weight.data.copy_(weight)
nn.init.constant_(self.linear.bias, 0.0)
def forward(self, x, mask=None, hx=None, dropout=None):
assert mask is not None and hx is not None
_h, _c = hx
_h = dropout(_h)
        _x = self.linear(torch.cat([x, _h], 1))  # joint projection of input and previous hidden state
i, j, o = _x.chunk(3, dim=1)
i = torch.sigmoid(i)
c = (1.0 - i) * _c + i * torch.tanh(j)
c = mask * c # + (1.0 - mask) * _c
h = torch.tanh(c) * torch.sigmoid(o)
h = mask * h # + (1.0 - mask) * _h
return h, c
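# Illustrative usage, not part of the original module: the cell couples the
# input and forget gates, c = (1 - i) * c_prev + i * tanh(j), and applies a
# caller-supplied dropout to the previous hidden state (here the identity).
def _demo_variational_lstm_cell():
    cell = VariationalLSTMCell(input_size=4, hidden_size=3)
    x = torch.randn(2, 4)
    h0 = c0 = torch.zeros(2, 3)
    mask = torch.ones(2, 3)
    h1, c1 = cell(x, mask=mask, hx=(h0, c0), dropout=lambda t: t)
    return h1.shape, c1.shape   # (torch.Size([2, 3]), torch.Size([2, 3]))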
class VariationalLSTM(nn.Module):
"""A module that runs multiple steps of LSTM."""
def __init__(self, input_size, hidden_size, num_layers=1, batch_first=False, \
bidirectional=False, dropout_in=0, dropout_out=0):
super(VariationalLSTM, self).__init__()
self.input_size = input_size
self.hidden_size = hidden_size
self.num_layers = num_layers
self.batch_first = batch_first
self.bidirectional = bidirectional
self.dropout_in = dropout_in
self.dropout_out = dropout_out
self.num_directions = 2 if bidirectional else 1
self.fcells = []
self.bcells = []
for layer in range(num_layers):
layer_input_size = input_size if layer == 0 else hidden_size * self.num_directions
self.fcells.append(nn.LSTMCell(input_size=layer_input_size, hidden_size=hidden_size))
if self.bidirectional:
self.bcells.append(nn.LSTMCell(input_size=layer_input_size, hidden_size=hidden_size))
self._all_weights = []
for layer in range(num_layers):
layer_params = (self.fcells[layer].weight_ih, self.fcells[layer].weight_hh, \
self.fcells[layer].bias_ih, self.fcells[layer].bias_hh)
suffix = ''
param_names = ['weight_ih_l{}{}', 'weight_hh_l{}{}']
param_names += ['bias_ih_l{}{}', 'bias_hh_l{}{}']
param_names = [x.format(layer, suffix) for x in param_names]
for name, param in zip(param_names, layer_params):
setattr(self, name, param)
self._all_weights.append(param_names)
if self.bidirectional:
layer_params = (self.bcells[layer].weight_ih, self.bcells[layer].weight_hh, \
self.bcells[layer].bias_ih, self.bcells[layer].bias_hh)
suffix = '_reverse'
param_names = ['weight_ih_l{}{}', 'weight_hh_l{}{}']
param_names += ['bias_ih_l{}{}', 'bias_hh_l{}{}']
param_names = [x.format(layer, suffix) for x in param_names]
for name, param in zip(param_names, layer_params):
setattr(self, name, param)
self._all_weights.append(param_names)
self.reset_parameters()
    def reset_parameters(self):  # modified by kiro
        for name, param in self.named_parameters():
            if "weight" in name:
                nn.init.orthogonal_(self.__getattr__(name))
            if "bias" in name:
                nn.init.normal_(self.__getattr__(name), 0.0, 0.01)
@staticmethod
def _forward_rnn(cell, input, masks, initial, drop_masks):
max_time = input.size(0)
output = []
hx = initial
for time in range(max_time):
h_next, c_next = cell(input=input[time], hx=hx)
h_next = h_next * masks[time] + initial[0] * (1 - masks[time])
c_next = c_next * masks[time] + initial[1] * (1 - masks[time])
output.append(h_next)
if drop_masks is not None: h_next = h_next * drop_masks
hx = (h_next, c_next)
output = torch.stack(output, 0)
return output, hx
@staticmethod
def _forward_brnn(cell, input, masks, initial, drop_masks):
max_time = input.size(0)
output = []
hx = initial
for time in reversed(list(range(max_time))):
h_next, c_next = cell(input=input[time], hx=hx)
h_next = h_next * masks[time] + initial[0] * (1 - masks[time])
c_next = c_next * masks[time] + initial[1] * (1 - masks[time])
output.append(h_next)
if drop_masks is not None: h_next = h_next * drop_masks
hx = (h_next, c_next)
output.reverse()
output = torch.stack(output, 0)
return output, hx
def forward(self, input, masks, initial=None):
if self.batch_first:
            input = input.transpose(0, 1)  # convert to time-major: [max_time, batch_size, dim]
masks = torch.unsqueeze(masks.transpose(0, 1), dim=2)
max_time, batch_size, _ = input.size()
        masks = masks.expand(-1, -1, self.hidden_size)  # -1 keeps that dimension unchanged
if initial is None:
initial = Variable(input.data.new(batch_size, self.hidden_size).zero_())
initial = (initial, initial) # h0, c0
h_n = []
c_n = []
for layer in range(self.num_layers):
max_time, batch_size, input_size = input.size()
input_mask, hidden_mask = None, None
if self.training: # when training, use the dropout
input_mask = input.data.new(batch_size, input_size).fill_(1 - self.dropout_in)
input_mask = Variable(torch.bernoulli(input_mask), requires_grad=False)
input_mask = input_mask / (1 - self.dropout_in)
                # broadcast the per-batch mask across all timesteps: [max_time, batch, input_size]
                input_mask = torch.unsqueeze(input_mask, dim=2).expand(-1, -1, max_time).permute(2, 0, 1)
input = input * input_mask
hidden_mask = input.data.new(batch_size, self.hidden_size).fill_(1 - self.dropout_out)
hidden_mask = Variable(torch.bernoulli(hidden_mask), requires_grad=False)
hidden_mask = hidden_mask / (1 - self.dropout_out)
layer_output, (layer_h_n, layer_c_n) = VariationalLSTM._forward_rnn(cell=self.fcells[layer], \
input=input, masks=masks,
initial=initial,
drop_masks=hidden_mask)
if self.bidirectional:
blayer_output, (blayer_h_n, blayer_c_n) = VariationalLSTM._forward_brnn(cell=self.bcells[layer], \
input=input, masks=masks,
initial=initial,
drop_masks=hidden_mask)
h_n.append(torch.cat([layer_h_n, blayer_h_n], 1) if self.bidirectional else layer_h_n)
c_n.append(torch.cat([layer_c_n, blayer_c_n], 1) if self.bidirectional else layer_c_n)
input = torch.cat([layer_output, blayer_output], 2) if self.bidirectional else layer_output
h_n = torch.stack(h_n, 0)
c_n = torch.stack(c_n, 0)
if self.batch_first:
            input = input.transpose(1, 0)  # back to batch-major: [batch_size, max_time, dim]
return input, (h_n, c_n)
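# Illustrative usage, not part of the original module: expected shapes for a
# bidirectional encoder on a toy batch without padding.
def _demo_variational_lstm():
    encoder = VariationalLSTM(input_size=6, hidden_size=4, num_layers=1,
                              batch_first=True, bidirectional=True)
    x = torch.randn(2, 5, 6)
    masks = torch.ones(2, 5)
    output, (h_n, c_n) = encoder(x, masks)
    return output.shape   # torch.Size([2, 5, 8])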
================================================
FILE: hanlp/components/srl/span_rank/span_rank.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-07-09 18:13
import logging
from bisect import bisect
from typing import Union, List, Callable, Tuple, Dict, Any
from hanlp_common.constant import IDX
from hanlp.layers.transformers.utils import build_optimizer_scheduler_with_transformer
import torch
from torch.utils.data import DataLoader
from hanlp.common.dataset import PadSequenceDataLoader, SortingSampler
from hanlp.common.torch_component import TorchComponent
from hanlp.common.transform import FieldLength
from hanlp.common.vocab import Vocab
from hanlp.components.srl.span_rank.inference_utils import srl_decode
from hanlp.components.srl.span_rank.span_ranking_srl_model import SpanRankingSRLModel
from hanlp.components.srl.span_rank.srl_eval_utils import compute_srl_f1
from hanlp.datasets.srl.loaders.conll2012 import CoNLL2012SRLDataset, filter_v_args, unpack_srl, \
group_pa_by_p
from hanlp.layers.embeddings.embedding import Embedding
from hanlp.metrics.f1 import F1
from hanlp_common.visualization import markdown_table
from hanlp.utils.time_util import CountdownTimer
from hanlp_common.util import merge_locals_kwargs, reorder
class SpanRankingSemanticRoleLabeler(TorchComponent):
def __init__(self, **kwargs) -> None:
"""An implementation of "Jointly Predicting Predicates and Arguments in Neural Semantic Role Labeling"
        (:cite:`he-etal-2018-jointly`). It generates candidate (predicate, arg_start, arg_end) triples and ranks them.
Args:
**kwargs: Predefined config.
"""
super().__init__(**kwargs)
self.model: SpanRankingSRLModel = None
def build_optimizer(self,
trn,
epochs,
lr,
adam_epsilon,
weight_decay,
warmup_steps,
transformer_lr,
**kwargs):
# noinspection PyProtectedMember
transformer = self._get_transformer()
if transformer:
num_training_steps = len(trn) * epochs // self.config.get('gradient_accumulation', 1)
optimizer, scheduler = build_optimizer_scheduler_with_transformer(self.model,
transformer,
lr, transformer_lr,
num_training_steps, warmup_steps,
weight_decay, adam_epsilon)
else:
optimizer = torch.optim.Adam(self.model.parameters(), self.config.lr)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
optimizer=optimizer,
mode='max',
factor=0.5,
patience=2,
verbose=True,
)
return optimizer, scheduler
def _get_transformer(self):
return getattr(self.model_.embed, 'transformer', None)
def build_criterion(self, **kwargs):
pass
# noinspection PyProtectedMember
def build_metric(self, **kwargs) -> Tuple[F1, F1]:
predicate_f1 = F1()
end_to_end_f1 = F1()
return predicate_f1, end_to_end_f1
def execute_training_loop(self,
trn: DataLoader,
dev: DataLoader,
epochs,
criterion,
optimizer,
metric,
save_dir,
logger: logging.Logger,
devices,
**kwargs):
best_epoch, best_metric = 0, -1
predicate, end_to_end = metric
optimizer, scheduler = optimizer
timer = CountdownTimer(epochs)
ratio_width = len(f'{len(trn)}/{len(trn)}')
for epoch in range(1, epochs + 1):
logger.info(f"[yellow]Epoch {epoch} / {epochs}:[/yellow]")
self.fit_dataloader(trn, criterion, optimizer, metric, logger,
linear_scheduler=scheduler if self._get_transformer() else None)
if dev:
self.evaluate_dataloader(dev, criterion, metric, logger, ratio_width=ratio_width)
report = f'{timer.elapsed_human}/{timer.total_time_human}'
dev_score = end_to_end.score
if not self._get_transformer():
scheduler.step(dev_score)
if dev_score > best_metric:
self.save_weights(save_dir)
best_metric = dev_score
report += ' [red]saved[/red]'
timer.log(report, ratio_percentage=False, newline=True, ratio=False)
def fit_dataloader(self,
trn: DataLoader,
criterion,
optimizer,
metric,
logger: logging.Logger,
linear_scheduler=None,
gradient_accumulation=1,
**kwargs):
self.model.train()
timer = CountdownTimer(len(trn) // gradient_accumulation)
total_loss = 0
self.reset_metrics(metric)
for idx, batch in enumerate(trn):
output_dict = self.feed_batch(batch)
self.update_metrics(batch, output_dict, metric)
loss = output_dict['loss']
loss = loss.sum() # For data parallel
if torch.isnan(loss): # w/ gold pred, some batches do not have PAs at all, resulting in empty scores
loss = torch.zeros((1,), device=loss.device)
else:
loss.backward()
if gradient_accumulation and gradient_accumulation > 1:
loss /= gradient_accumulation
if self.config.grad_norm:
torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.config.grad_norm)
if (idx + 1) % gradient_accumulation == 0:
self._step(optimizer, linear_scheduler)
timer.log(self.report_metrics(total_loss / (timer.current + 1), metric), ratio_percentage=None,
logger=logger)
total_loss += loss.item()
del loss
if len(trn) % gradient_accumulation:
self._step(optimizer, linear_scheduler)
return total_loss / timer.total
def _step(self, optimizer, linear_scheduler):
optimizer.step()
optimizer.zero_grad()
if linear_scheduler:
linear_scheduler.step()
# noinspection PyMethodOverriding
@torch.no_grad()
def evaluate_dataloader(self,
data: DataLoader,
criterion: Callable,
metric,
logger,
ratio_width=None,
output=False,
official=False,
confusion_matrix=False,
**kwargs):
self.model.eval()
self.reset_metrics(metric)
timer = CountdownTimer(len(data))
total_loss = 0
if official:
sentences = []
gold = []
pred = []
for batch in data:
output_dict = self.feed_batch(batch)
if official:
sentences += batch['token']
gold += batch['srl']
pred += output_dict['prediction']
self.update_metrics(batch, output_dict, metric)
loss = output_dict['loss']
total_loss += loss.item()
timer.log(self.report_metrics(total_loss / (timer.current + 1), metric), ratio_percentage=None,
logger=logger,
ratio_width=ratio_width)
del loss
if official:
scores = compute_srl_f1(sentences, gold, pred)
if logger:
if confusion_matrix:
labels = sorted(set(y for x in scores.label_confusions.keys() for y in x))
headings = ['GOLD↓PRED→'] + labels
matrix = []
for i, gold in enumerate(labels):
row = [gold]
matrix.append(row)
for j, pred in enumerate(labels):
row.append(scores.label_confusions.get((gold, pred), 0))
matrix = markdown_table(headings, matrix)
logger.info(f'{"Confusion Matrix": ^{len(matrix.splitlines()[0])}}')
logger.info(matrix)
headings = ['Settings', 'Precision', 'Recall', 'F1']
data = []
for h, (p, r, f) in zip(['Unlabeled', 'Labeled', 'Official'], [
[scores.unlabeled_precision, scores.unlabeled_recall, scores.unlabeled_f1],
[scores.precision, scores.recall, scores.f1],
[scores.conll_precision, scores.conll_recall, scores.conll_f1],
]):
data.append([h] + [f'{x:.2%}' for x in [p, r, f]])
table = markdown_table(headings, data)
logger.info(f'{"Scores": ^{len(table.splitlines()[0])}}')
logger.info(table)
else:
scores = metric
return total_loss / timer.total, scores
def build_model(self,
training=True,
**kwargs) -> torch.nn.Module:
# noinspection PyTypeChecker
# embed: torch.nn.Embedding = self.config.embed.module(vocabs=self.vocabs)[0].embed
model = SpanRankingSRLModel(self.config,
self.config.embed.module(vocabs=self.vocabs, training=training),
self.config.context_layer,
len(self.vocabs.srl_label))
return model
# noinspection PyMethodOverriding
def build_dataloader(self, data, batch_size, shuffle, device, logger: logging.Logger,
generate_idx=False, transform=None, **kwargs) -> DataLoader:
batch_max_tokens = self.config.batch_max_tokens
gradient_accumulation = self.config.get('gradient_accumulation', 1)
if batch_size:
batch_size //= gradient_accumulation
if batch_max_tokens:
batch_max_tokens //= gradient_accumulation
dataset = self.build_dataset(data, generate_idx, logger, transform)
sampler = SortingSampler([x['token_length'] for x in dataset],
batch_size=batch_size,
batch_max_tokens=batch_max_tokens,
shuffle=shuffle)
return PadSequenceDataLoader(batch_sampler=sampler,
device=device,
dataset=dataset)
def build_dataset(self, data, generate_idx, logger, transform=None):
dataset = CoNLL2012SRLDataset(data, transform=[filter_v_args, unpack_srl, group_pa_by_p],
doc_level_offset=self.config.doc_level_offset, generate_idx=generate_idx)
if transform:
dataset.append_transform(transform)
if isinstance(self.config.get('embed', None), Embedding):
transform = self.config.embed.transform(vocabs=self.vocabs)
if transform:
dataset.append_transform(transform)
dataset.append_transform(self.vocabs)
dataset.append_transform(FieldLength('token'))
if isinstance(data, str):
dataset.purge_cache() # Enable cache
if self.vocabs.mutable:
self.build_vocabs(dataset, logger)
return dataset
def predict(self, data: Union[str, List[str]], batch_size: int = None, fmt='dict', **kwargs):
if not data:
return []
flat = self.input_is_flat(data)
if flat:
data = [data]
samples = []
for token in data:
sample = dict()
sample['token'] = token
samples.append(sample)
batch_size = batch_size or self.config.batch_size
dataloader = self.build_dataloader(samples, batch_size, False, self.device, None, generate_idx=True)
outputs = []
order = []
for batch in dataloader:
output_dict = self.feed_batch(batch)
outputs.extend(output_dict['prediction'])
order.extend(batch[IDX])
outputs = reorder(outputs, order)
if fmt == 'list':
outputs = self.format_dict_to_results(data, outputs)
if flat:
return outputs[0]
return outputs
@staticmethod
def format_dict_to_results(data, outputs, exclusive_offset=False, with_predicate=False, with_argument=False,
label_first=False):
results = []
for i in range(len(outputs)):
tokens = data[i]
output = []
for p, a in outputs[i].items():
# a: [(0, 0, 'ARG0')]
if with_predicate:
a.insert(bisect([x[0] for x in a], p), (p, p, 'PRED'))
if with_argument is not False:
a = [x + (tokens[x[0]:x[1] + 1],) for x in a]
if isinstance(with_argument, str):
a = [x[:-1] + (with_argument.join(x[-1]),) for x in a]
if exclusive_offset:
a = [(x[0], x[1] + 1) + x[2:] for x in a]
if label_first:
a = [tuple(reversed(x[2:])) + x[:2] for x in a]
output.append(a)
results.append(output)
return results
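    # Illustrative example (hypothetical values, not part of the original class):
    # with data = [['他', '喜欢', '跳舞']] and outputs = [{1: [(0, 0, 'ARG0'), (2, 2, 'ARG1')]}],
    # format_dict_to_results(data, outputs, with_predicate=True) returns
    # [[[(0, 0, 'ARG0'), (1, 1, 'PRED'), (2, 2, 'ARG1')]]], i.e. the predicate span
    # is inserted at its sorted position among its argument spans.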
def input_is_flat(self, data):
return isinstance(data[0], str)
# noinspection PyMethodOverriding
def fit(self,
trn_data,
dev_data,
save_dir,
embed,
context_layer,
batch_size=40,
batch_max_tokens=700,
lexical_dropout=0.5,
dropout=0.2,
span_width_feature_size=20,
ffnn_size=150,
ffnn_depth=2,
argument_ratio=0.8,
predicate_ratio=0.4,
max_arg_width=30,
mlp_label_size=100,
enforce_srl_constraint=False,
use_gold_predicates=False,
doc_level_offset=True,
use_biaffine=False,
lr=1e-3,
transformer_lr=1e-5,
adam_epsilon=1e-6,
weight_decay=0.01,
warmup_steps=0.1,
grad_norm=5.0,
gradient_accumulation=1,
loss_reduction='sum',
transform=None,
devices=None,
logger=None,
seed=None,
**kwargs
):
return super().fit(**merge_locals_kwargs(locals(), kwargs))
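    # Illustrative usage (paths, embedding and context layer are hypothetical,
    # not taken from this repository):
    #     srl = SpanRankingSemanticRoleLabeler()
    #     srl.fit('data/srl/train.jsonlines', 'data/srl/dev.jsonlines', 'save/srl',
    #             embed=my_embedding, context_layer=my_encoder, batch_size=40)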
def build_vocabs(self, dataset, logger, **kwargs):
self.vocabs.srl_label = Vocab(pad_token=None, unk_token=None)
# Use null to indicate no relationship
self.vocabs.srl_label.add('')
timer = CountdownTimer(len(dataset))
max_seq_len = 0
for each in dataset:
max_seq_len = max(max_seq_len, len(each['token_input_ids']))
timer.log(f'Building vocabs (max sequence length {max_seq_len}) [blink][yellow]...[/yellow][/blink]')
timer.stop()
timer.erase()
self.vocabs['srl_label'].set_unk_as_safe_unk()
self.vocabs.lock()
self.vocabs.summary(logger)
def reset_metrics(self, metrics):
for each in metrics:
each.reset()
def report_metrics(self, loss, metrics):
predicate, end_to_end = metrics
return f'loss: {loss:.4f} predicate: {predicate.score:.2%} end_to_end: {end_to_end.score:.2%}'
def feed_batch(self, batch) -> Dict[str, Any]:
output_dict = self.model(batch)
prediction = self.decode_output(output_dict, batch, self.model.training)
output_dict['prediction'] = prediction
return output_dict
def decode_output(self, output_dict, batch, training=False):
idx_to_label = self.vocabs['srl_label'].idx_to_token
if training:
# Use fast decoding during training,
prediction = []
top_predicate_indices = output_dict['predicates'].tolist()
top_spans = torch.stack([output_dict['arg_starts'], output_dict['arg_ends']], dim=-1).tolist()
srl_mask = output_dict['srl_mask'].tolist()
srl_scores = output_dict['srl_scores']
pal_list = srl_scores.argmax(-1).tolist() if srl_scores.numel() else []
for n, (pal, predicate_indices, argument_spans) in enumerate(
zip(pal_list, top_predicate_indices, top_spans)):
srl_per_sentence = {}
for p, (al, predicate_index) in enumerate(zip(pal, predicate_indices)):
for a, (l, argument_span) in enumerate(zip(al, argument_spans)):
if l and srl_mask[n][p][a]:
args = srl_per_sentence.get(p, None)
if args is None:
args = srl_per_sentence[p] = []
args.append((*argument_span, idx_to_label[l]))
prediction.append(srl_per_sentence)
else:
prediction = srl_decode(batch['token_length'], output_dict, idx_to_label, self.config)
return prediction
def update_metrics(self, batch: dict, output_dict: dict, metrics):
def unpack(y: dict):
return set((p, bel) for p, a in y.items() for bel in a)
predicate, end_to_end = metrics
for pred, gold in zip(output_dict['prediction'], batch['srl']):
predicate(pred.keys(), gold.keys())
end_to_end(unpack(pred), unpack(gold))
================================================
FILE: hanlp/components/srl/span_rank/span_ranking_srl_model.py
================================================
from typing import Dict
import hanlp.utils.torch_util
from hanlp.layers.feedforward import FeedForward
from hanlp.layers.time_distributed import TimeDistributed
from .highway_variational_lstm import *
import torch
from ...parsers.biaffine.biaffine import Biaffine
def initializer_1d(input_tensor, initializer):
assert len(input_tensor.size()) == 1
input_tensor = input_tensor.view(-1, 1)
input_tensor = initializer(input_tensor)
return input_tensor.view(-1)
class SpanRankingSRLDecoder(nn.Module):
def __init__(self, context_layer_output_dim, label_space_size, config) -> None:
super().__init__()
self.config = config
self.label_space_size = label_space_size
self.dropout = float(config.dropout)
self.use_gold_predicates = config.use_gold_predicates
# span width feature embedding
self.span_width_embedding = nn.Embedding(self.config.max_arg_width, self.config.span_width_feature_size)
# self.context_projective_layer = nn.Linear(2 * self.lstm_hidden_size, self.config.num_attention_heads)
# span scores
self.span_emb_size = 3 * context_layer_output_dim + self.config.span_width_feature_size
self.arg_unary_score_layers = nn.ModuleList([nn.Linear(self.span_emb_size, self.config.ffnn_size) if i == 0
else nn.Linear(self.config.ffnn_size, self.config.ffnn_size) for i
in range(self.config.ffnn_depth)]) # [,150]
self.arg_dropout_layers = nn.ModuleList([nn.Dropout(self.dropout) for _ in range(self.config.ffnn_depth)])
self.arg_unary_score_projection = nn.Linear(self.config.ffnn_size, 1)
# predicate scores
self.pred_unary_score_layers = nn.ModuleList(
[nn.Linear(context_layer_output_dim, self.config.ffnn_size) if i == 0
else nn.Linear(self.config.ffnn_size, self.config.ffnn_size) for i
in range(self.config.ffnn_depth)]) # [,150]
self.pred_dropout_layers = nn.ModuleList([nn.Dropout(self.dropout) for _ in range(self.config.ffnn_depth)])
self.pred_unary_score_projection = nn.Linear(self.config.ffnn_size, 1)
# srl scores
self.srl_unary_score_input_size = self.span_emb_size + context_layer_output_dim
self.srl_unary_score_layers = nn.ModuleList([nn.Linear(self.srl_unary_score_input_size, self.config.ffnn_size)
if i == 0 else nn.Linear(self.config.ffnn_size,
self.config.ffnn_size)
for i in range(self.config.ffnn_depth)])
self.srl_dropout_layers = nn.ModuleList([nn.Dropout(self.dropout) for _ in range(self.config.ffnn_depth)])
self.srl_unary_score_projection = nn.Linear(self.config.ffnn_size, self.label_space_size - 1)
if config.use_biaffine:
self.predicate_scale = TimeDistributed(FeedForward(context_layer_output_dim, 1, self.span_emb_size, 'ReLU'))
self.biaffine = Biaffine(self.span_emb_size, self.label_space_size - 1)
self.loss_reduction = config.loss_reduction
self.reset_parameters()
def reset_parameters(self):
init.xavier_uniform_(self.span_width_embedding.weight)
# init.xavier_uniform_(self.context_projective_layer.weight)
# initializer_1d(self.context_projective_layer.bias, init.xavier_uniform_)
for layer in self.arg_unary_score_layers:
init.xavier_uniform_(layer.weight)
initializer_1d(layer.bias, init.xavier_uniform_)
init.xavier_uniform_(self.arg_unary_score_projection.weight)
initializer_1d(self.arg_unary_score_projection.bias, init.xavier_uniform_)
for layer in self.pred_unary_score_layers:
init.xavier_uniform_(layer.weight)
initializer_1d(layer.bias, init.xavier_uniform_)
init.xavier_uniform_(self.pred_unary_score_projection.weight)
initializer_1d(self.pred_unary_score_projection.bias, init.xavier_uniform_)
for layer in self.srl_unary_score_layers:
init.xavier_uniform_(layer.weight)
initializer_1d(layer.bias, init.xavier_uniform_)
init.xavier_uniform_(self.srl_unary_score_projection.weight)
initializer_1d(self.srl_unary_score_projection.bias, init.xavier_uniform_)
return None
def forward(self, hidden_states, batch, mask=None):
gold_arg_ends, gold_arg_labels, gold_arg_starts, gold_predicates, masks, sent_lengths = SpanRankingSRLModel.unpack(
batch, mask=mask, training=self.training)
return self.decode(hidden_states, sent_lengths, masks, gold_arg_starts, gold_arg_ends, gold_arg_labels,
gold_predicates)
@staticmethod
def get_candidate_spans(sent_lengths: torch.Tensor, max_sent_length, max_arg_width):
num_sentences = len(sent_lengths)
device = sent_lengths.device
candidate_starts = torch.arange(0, max_sent_length, device=device).expand(num_sentences, max_arg_width, -1)
candidate_width = torch.arange(0, max_arg_width, device=device).view(1, -1, 1)
candidate_ends = candidate_starts + candidate_width
candidate_starts = candidate_starts.contiguous().view(num_sentences, max_sent_length * max_arg_width)
candidate_ends = candidate_ends.contiguous().view(num_sentences, max_sent_length * max_arg_width)
actual_sent_lengths = sent_lengths.view(-1, 1).expand(-1, max_sent_length * max_arg_width)
candidate_mask = candidate_ends < actual_sent_lengths
candidate_starts = candidate_starts * candidate_mask
candidate_ends = candidate_ends * candidate_mask
return candidate_starts, candidate_ends, candidate_mask
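    # Illustrative example, not part of the original class: with
    # sent_lengths = tensor([2]), max_sent_length = 2 and max_arg_width = 2, the
    # candidate (start, end) pairs are (0, 0), (1, 1), (0, 1) and (1, 2); the last
    # one runs past the sentence, so its mask entry is False and it is zeroed out.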
@staticmethod
def exclusive_cumsum(input: torch.Tensor, exclusive=True):
"""
Args:
input: input is the sentence lengths tensor.
exclusive: exclude the last sentence length (Default value = True)
input(torch.Tensor :):
input: torch.Tensor:
Returns:
"""
assert exclusive is True
if exclusive is True:
exclusive_sent_lengths = input.new_zeros(1, dtype=torch.long)
result = torch.cumsum(torch.cat([exclusive_sent_lengths, input], 0)[:-1], 0).view(-1, 1)
else:
result = torch.cumsum(input, 0).view(-1, 1)
return result
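    # Illustrative example, not part of the original class:
    #     exclusive_cumsum(torch.tensor([3, 2, 4])) -> tensor([[0], [3], [5]])
    # i.e. the offset of each sentence's first word in the flattened batch.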
def flatten_emb(self, emb):
num_sentences, max_sentence_length = emb.size()[0], emb.size()[1]
assert len(emb.size()) == 3
flatted_emb = emb.contiguous().view(num_sentences * max_sentence_length, -1)
return flatted_emb
def flatten_emb_in_sentence(self, emb, batch_sentences_mask):
num_sentences, max_sentence_length = emb.size()[0], emb.size()[1]
flatted_emb = self.flatten_emb(emb)
return flatted_emb[batch_sentences_mask.reshape(num_sentences * max_sentence_length)]
def get_span_emb(self, flatted_context_emb, flatted_candidate_starts, flatted_candidate_ends,
config, dropout=0.0):
batch_word_num = flatted_context_emb.size()[0]
# gather slices from embeddings according to indices
span_start_emb = flatted_context_emb[flatted_candidate_starts]
span_end_emb = flatted_context_emb[flatted_candidate_ends]
span_emb_feature_list = [span_start_emb, span_end_emb] # store the span vector representations for span rep.
span_width = 1 + flatted_candidate_ends - flatted_candidate_starts # [num_spans], generate the span width
max_arg_width = config.max_arg_width
# get the span width feature emb
span_width_index = span_width - 1
span_width_emb = self.span_width_embedding(span_width_index)
span_width_emb = F.dropout(span_width_emb, dropout, self.training)
span_emb_feature_list.append(span_width_emb)
"""head features"""
        cpu_flatted_candidate_starts = flatted_candidate_starts
        span_indices = torch.arange(0, max_arg_width, device=flatted_context_emb.device).view(1, -1) + \
                       cpu_flatted_candidate_starts.view(-1, 1)  # all positions i in [begin, ..., end] for each span
# reset the position index to the batch_word_num index with index - 1
span_indices = torch.clamp(span_indices, max=batch_word_num - 1)
num_spans, spans_width = span_indices.size()[0], span_indices.size()[1]
        flatted_span_indices = span_indices.view(-1)  # [num_spans * max_arg_width]
span_text_emb = flatted_context_emb.index_select(0, flatted_span_indices).view(num_spans, spans_width, -1)
span_indices_mask = hanlp.utils.torch_util.lengths_to_mask(span_width, max_len=max_arg_width)
# project context output to num head
# head_scores = self.context_projective_layer.forward(flatted_context_emb)
# get span attention
# span_attention = head_scores.index_select(0, flatted_span_indices).view(num_spans, spans_width)
# span_attention = torch.add(span_attention, expanded_span_indices_log_mask).unsqueeze(2) # control the span len
# span_attention = F.softmax(span_attention, dim=1)
span_text_emb = span_text_emb * span_indices_mask.unsqueeze(2).expand(-1, -1, span_text_emb.size()[-1])
span_head_emb = torch.mean(span_text_emb, 1)
span_emb_feature_list.append(span_head_emb)
span_emb = torch.cat(span_emb_feature_list, 1)
return span_emb, None, span_text_emb, span_indices, span_indices_mask
def get_arg_unary_scores(self, span_emb):
"""Compute span score with FFNN(span embedding)
Args:
span_emb: tensor of [num_sentences, num_spans, emb_size]
config: param dropout:
num_labels: param name:
Returns:
"""
input = span_emb
for i, ffnn in enumerate(self.arg_unary_score_layers):
input = F.relu(ffnn.forward(input))
input = self.arg_dropout_layers[i].forward(input)
output = self.arg_unary_score_projection.forward(input)
return output
def get_pred_unary_scores(self, span_emb):
input = span_emb
for i, ffnn in enumerate(self.pred_unary_score_layers):
input = F.relu(ffnn.forward(input))
input = self.pred_dropout_layers[i].forward(input)
output = self.pred_unary_score_projection.forward(input)
return output
def extract_spans(self, candidate_scores, candidate_starts, candidate_ends, topk, max_sentence_length,
sort_spans, enforce_non_crossing):
"""extract the topk span indices
Args:
candidate_scores: param candidate_starts:
candidate_ends: param topk: [num_sentences]
max_sentence_length: param sort_spans:
enforce_non_crossing: return: indices [num_sentences, max_num_predictions]
candidate_starts:
topk:
sort_spans:
Returns:
"""
max_num_output_spans = int(torch.max(topk))
indices = [score.topk(k)[1] for score, k in zip(candidate_scores, topk)]
output_span_indices_tensor = [F.pad(item, [0, max_num_output_spans - item.size()[0]], value=item[-1])
for item in indices]
output_span_indices_tensor = torch.stack(output_span_indices_tensor)
return output_span_indices_tensor
def batch_index_select(self, emb, indices):
num_sentences = emb.size()[0]
max_sent_length = emb.size()[1]
flatten_emb = self.flatten_emb(emb)
offset = (torch.arange(0, num_sentences, device=emb.device) * max_sent_length).unsqueeze(1)
return torch.index_select(flatten_emb, 0, (indices + offset).view(-1)) \
.view(indices.size()[0], indices.size()[1], emb.size(-1))
def get_batch_topk(self, candidate_starts: torch.Tensor, candidate_ends, candidate_scores, topk_ratio, text_len,
max_sentence_length, sort_spans=False, enforce_non_crossing=True):
num_sentences = candidate_starts.size()[0]
max_sentence_length = candidate_starts.size()[1]
topk = torch.floor(text_len.to(torch.float) * topk_ratio).to(torch.long)
topk = torch.max(topk, torch.ones(num_sentences, device=candidate_starts.device, dtype=torch.long))
# this part should be implemented with C++
predicted_indices = self.extract_spans(candidate_scores, candidate_starts, candidate_ends, topk,
max_sentence_length, sort_spans, enforce_non_crossing)
predicted_starts = torch.gather(candidate_starts, 1, predicted_indices)
predicted_ends = torch.gather(candidate_ends, 1, predicted_indices)
predicted_scores = torch.gather(candidate_scores, 1, predicted_indices)
return predicted_starts, predicted_ends, predicted_scores, topk, predicted_indices
def get_dense_span_labels(self, span_starts, span_ends, span_labels, max_sentence_length,
span_parents=None):
num_sentences = span_starts.size()[0]
max_spans_num = span_starts.size()[1]
# span_starts = span_starts + 1 - (span_labels > 0).to(torch.long)
span_starts[(span_labels == 0) & (span_starts < max_sentence_length - 1)] += 1 # make start > end
sentence_indices = torch.arange(0, num_sentences, device=span_starts.device).unsqueeze(1).expand(-1,
max_spans_num)
sparse_indices = torch.cat([sentence_indices.unsqueeze(2), span_starts.unsqueeze(2), span_ends.unsqueeze(2)],
dim=2)
if span_parents is not None: # semantic span predicate offset
sparse_indices = torch.cat([sparse_indices, span_parents.unsqueeze(2)], 2)
rank = 3 if span_parents is None else 4
dense_labels = torch.sparse.LongTensor(sparse_indices.view(num_sentences * max_spans_num, rank).t(),
span_labels.view(-1),
torch.Size([num_sentences] + [max_sentence_length] * (rank - 1))) \
.to_dense()
return dense_labels
@staticmethod
def gather_4d(params, indices):
assert len(params.size()) == 4 and len(indices) == 4
indices_a, indices_b, indices_c, indices_d = indices
result = params[indices_a, indices_b, indices_c, indices_d]
return result
def get_srl_labels(self,
arg_starts,
arg_ends,
predicates,
gold_predicates,
gold_arg_starts,
gold_arg_ends,
gold_arg_labels,
max_sentence_length
):
num_sentences = arg_starts.size()[0]
max_arg_num = arg_starts.size()[1]
max_pred_num = predicates.size()[1]
sentence_indices_2d = torch.arange(0, num_sentences, device=arg_starts.device).unsqueeze(1).unsqueeze(2).expand(
-1, max_arg_num, max_pred_num)
expanded_arg_starts = arg_starts.unsqueeze(2).expand(-1, -1, max_pred_num)
expanded_arg_ends = arg_ends.unsqueeze(2).expand(-1, -1, max_pred_num)
expanded_predicates = predicates.unsqueeze(1).expand(-1, max_arg_num, -1)
dense_srl_labels = self.get_dense_span_labels(gold_arg_starts,
gold_arg_ends,
gold_arg_labels,
max_sentence_length, span_parents=gold_predicates) # ans
srl_labels = self.gather_4d(dense_srl_labels,
[sentence_indices_2d, expanded_arg_starts, expanded_arg_ends, expanded_predicates])
return srl_labels
def get_srl_unary_scores(self, span_emb):
input = span_emb
for i, ffnn in enumerate(self.srl_unary_score_layers):
input = F.relu(ffnn.forward(input))
input = self.srl_dropout_layers[i].forward(input)
output = self.srl_unary_score_projection.forward(input)
return output
def get_srl_scores(self, arg_emb, pred_emb, arg_scores, pred_scores, num_labels, config, dropout):
num_sentences = arg_emb.size()[0]
num_args = arg_emb.size()[1] # [batch_size, max_arg_num, arg_emb_size]
num_preds = pred_emb.size()[1] # [batch_size, max_pred_num, pred_emb_size]
unsqueezed_arg_emb = arg_emb.unsqueeze(2)
unsqueezed_pred_emb = pred_emb.unsqueeze(1)
expanded_arg_emb = unsqueezed_arg_emb.expand(-1, -1, num_preds, -1)
expanded_pred_emb = unsqueezed_pred_emb.expand(-1, num_args, -1, -1)
pair_emb_list = [expanded_arg_emb, expanded_pred_emb]
pair_emb = torch.cat(pair_emb_list, 3) # concatenate the argument emb and pre emb
pair_emb_size = pair_emb.size()[3]
flat_pair_emb = pair_emb.view(num_sentences * num_args * num_preds, pair_emb_size)
# get unary scores
flat_srl_scores = self.get_srl_unary_scores(flat_pair_emb)
srl_scores = flat_srl_scores.view(num_sentences, num_args, num_preds, flat_srl_scores.size(-1))
if self.config.use_biaffine:
srl_scores += self.biaffine(arg_emb, self.predicate_scale(pred_emb)).permute([0, 2, 3, 1])
unsqueezed_arg_scores, unsqueezed_pred_scores = \
arg_scores.unsqueeze(2).unsqueeze(3), pred_scores.unsqueeze(1).unsqueeze(3)
srl_scores = srl_scores + unsqueezed_arg_scores + unsqueezed_pred_scores
dummy_scores = torch.zeros([num_sentences, num_args, num_preds, 1], device=arg_emb.device)
srl_scores = torch.cat([dummy_scores, srl_scores], 3)
return srl_scores
def get_srl_softmax_loss(self, srl_scores, srl_labels, num_predicted_args, num_predicted_preds):
srl_loss_mask = self.get_srl_loss_mask(srl_scores, num_predicted_args, num_predicted_preds)
loss = torch.nn.functional.cross_entropy(srl_scores[srl_loss_mask], srl_labels[srl_loss_mask],
reduction=self.loss_reduction)
return loss, srl_loss_mask
def get_srl_loss_mask(self, srl_scores, num_predicted_args, num_predicted_preds):
max_num_arg = srl_scores.size()[1]
max_num_pred = srl_scores.size()[2]
        # num_predicted_args: 1D tensor; max_num_arg: int, the max number of argument spans
args_mask = hanlp.utils.torch_util.lengths_to_mask(num_predicted_args, max_num_arg)
pred_mask = hanlp.utils.torch_util.lengths_to_mask(num_predicted_preds, max_num_pred)
srl_loss_mask = args_mask.unsqueeze(2) & pred_mask.unsqueeze(1)
return srl_loss_mask
def decode(self, contextualized_embeddings, sent_lengths, masks, gold_arg_starts, gold_arg_ends, gold_arg_labels,
gold_predicates):
num_sentences, max_sent_length = masks.size()
device = sent_lengths.device
"""generate candidate spans with argument pruning"""
# candidate_starts [num_sentences, max_sent_length * max_arg_width]
candidate_starts, candidate_ends, candidate_mask = self.get_candidate_spans(
sent_lengths, max_sent_length, self.config.max_arg_width)
flatted_candidate_mask = candidate_mask.view(-1)
batch_word_offset = self.exclusive_cumsum(sent_lengths) # get the word offset in a batch
# choose the flatted_candidate_starts with the actual existing positions, i.e. exclude the illegal starts
flatted_candidate_starts = candidate_starts + batch_word_offset
flatted_candidate_starts = flatted_candidate_starts.view(-1)[flatted_candidate_mask].to(torch.long)
flatted_candidate_ends = candidate_ends + batch_word_offset
flatted_candidate_ends = flatted_candidate_ends.view(-1)[flatted_candidate_mask].to(torch.long)
# flatten the lstm output according to the sentence mask, i.e. exclude the illegal (padding) lstm output
flatted_context_output = self.flatten_emb_in_sentence(contextualized_embeddings, masks)
"""generate the span embedding"""
candidate_span_emb, head_scores, span_head_emb, head_indices, head_indices_log_mask = self.get_span_emb(
flatted_context_output, flatted_candidate_starts, flatted_candidate_ends,
self.config, dropout=self.dropout)
"""Get the span ids"""
candidate_span_number = candidate_span_emb.size()[0]
max_candidate_spans_num_per_sentence = candidate_mask.size()[1]
sparse_indices = candidate_mask.nonzero(as_tuple=False)
sparse_values = torch.arange(0, candidate_span_number, device=device)
candidate_span_ids = torch.sparse.FloatTensor(sparse_indices.t(), sparse_values,
torch.Size([num_sentences,
max_candidate_spans_num_per_sentence])).to_dense()
spans_log_mask = torch.log(candidate_mask.to(torch.float))
predict_dict = {"candidate_starts": candidate_starts, "candidate_ends": candidate_ends,
"head_scores": head_scores}
"""Get unary scores and topk of candidate argument spans."""
flatted_candidate_arg_scores = self.get_arg_unary_scores(candidate_span_emb)
candidate_arg_scores = flatted_candidate_arg_scores.index_select(0, candidate_span_ids.view(-1)) \
.view(candidate_span_ids.size()[0], candidate_span_ids.size()[1])
candidate_arg_scores = candidate_arg_scores + spans_log_mask
arg_starts, arg_ends, arg_scores, num_args, top_arg_indices = \
self.get_batch_topk(candidate_starts, candidate_ends, candidate_arg_scores,
self.config.argument_ratio, sent_lengths, max_sent_length,
sort_spans=False, enforce_non_crossing=False)
"""Get the candidate predicate"""
candidate_pred_ids = torch.arange(0, max_sent_length, device=device).unsqueeze(0).expand(num_sentences, -1)
candidate_pred_emb = contextualized_embeddings
candidate_pred_scores = self.get_pred_unary_scores(candidate_pred_emb)
candidate_pred_scores = candidate_pred_scores + torch.log(masks.to(torch.float).unsqueeze(2))
candidate_pred_scores = candidate_pred_scores.squeeze(2)
        if self.use_gold_predicates:
predicates = gold_predicates
num_preds = (gold_arg_labels > 0).sum(dim=-1)
pred_scores = torch.zeros_like(predicates)
top_pred_indices = predicates
else:
predicates, _, pred_scores, num_preds, top_pred_indices = self.get_batch_topk(
candidate_pred_ids, candidate_pred_ids, candidate_pred_scores, self.config.predicate_ratio,
sent_lengths, max_sent_length,
sort_spans=False, enforce_non_crossing=False)
"""Get top arg embeddings"""
arg_span_indices = torch.gather(candidate_span_ids, 1, top_arg_indices) # [num_sentences, max_num_args]
arg_emb = candidate_span_emb.index_select(0, arg_span_indices.view(-1)).view(
arg_span_indices.size()[0], arg_span_indices.size()[1], -1
) # [num_sentences, max_num_args, emb]
"""Get top predicate embeddings"""
pred_emb = self.batch_index_select(candidate_pred_emb,
top_pred_indices) # [num_sentences, max_num_preds, emb]
"""Get the srl scores according to the arg emb and pre emb."""
srl_scores = self.get_srl_scores(arg_emb, pred_emb, arg_scores, pred_scores, self.label_space_size, self.config,
self.dropout) # [num_sentences, max_num_args, max_num_preds, num_labels]
if gold_arg_labels is not None:
"""Get the answers according to the labels"""
srl_labels = self.get_srl_labels(arg_starts, arg_ends, predicates, gold_predicates, gold_arg_starts,
gold_arg_ends, gold_arg_labels, max_sent_length)
"""Compute the srl loss"""
srl_loss, srl_mask = self.get_srl_softmax_loss(srl_scores, srl_labels, num_args, num_preds)
predict_dict.update({
'srl_mask': srl_mask,
'loss': srl_loss
})
else:
predict_dict['srl_mask'] = self.get_srl_loss_mask(srl_scores, num_args, num_preds)
predict_dict.update({
"candidate_arg_scores": candidate_arg_scores,
"candidate_pred_scores": candidate_pred_scores,
"predicates": predicates,
"arg_starts": arg_starts,
"arg_ends": arg_ends,
"arg_scores": arg_scores,
"pred_scores": pred_scores,
"num_args": num_args,
"num_preds": num_preds,
# [num_sentences, num_args, num_preds] avoid max on empty tensor
# "arg_labels": torch.max(srl_scores, 1)[1] if srl_scores.numel() else srl_scores[:, :, :, 0],
"srl_scores": srl_scores,
})
return predict_dict
class SpanRankingSRLModel(nn.Module):
def __init__(self, config, embed: torch.nn.Module, context_layer: torch.nn.Module, label_space_size):
super(SpanRankingSRLModel, self).__init__()
self.config = config
self.dropout = float(config.dropout)
self.lexical_dropout = float(self.config.lexical_dropout)
self.label_space_size = label_space_size
# Initialize layers and parameters
self.word_embedding_dim = embed.get_output_dim() # get the embedding dim
self.embed = embed
# Initialize context layer
self.context_layer = context_layer
context_layer_output_dim = context_layer.get_output_dim() if context_layer else self.word_embedding_dim
self.decoder = SpanRankingSRLDecoder(context_layer_output_dim, label_space_size, config)
def forward(self,
batch: Dict[str, torch.Tensor]
):
gold_arg_ends, gold_arg_labels, gold_arg_starts, gold_predicates, masks, sent_lengths = \
self.unpack(batch, training=self.training)
context_embeddings = self.embed(batch)
context_embeddings = F.dropout(context_embeddings, self.lexical_dropout, self.training)
if self.context_layer:
context_embeddings = self.context_layer(context_embeddings, masks)
return self.decoder.decode(context_embeddings, sent_lengths, masks, gold_arg_starts, gold_arg_ends,
gold_arg_labels, gold_predicates)
@staticmethod
def unpack(batch, mask=None, training=False):
keys = 'token_length', 'predicate_offset', 'argument_begin_offset', 'argument_end_offset', 'srl_label_id'
sent_lengths, gold_predicates, gold_arg_starts, gold_arg_ends, gold_arg_labels = [batch.get(k, None) for k in
keys]
if mask is None:
mask = hanlp.utils.torch_util.lengths_to_mask(sent_lengths)
# elif not training:
# sent_lengths = mask.sum(dim=1)
return gold_arg_ends, gold_arg_labels, gold_arg_starts, gold_predicates, mask, sent_lengths
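# A minimal sketch (hypothetical counts) of how the pairwise SRL loss mask is assembled
# from per-sentence argument/predicate counts, mirroring get_srl_loss_mask above:
def _demo_srl_loss_mask():
    num_args = torch.tensor([2, 1])   # predicted argument counts per sentence
    num_preds = torch.tensor([1, 2])  # predicted predicate counts per sentence
    args_mask = hanlp.utils.torch_util.lengths_to_mask(num_args, 2)    # [2, max_args]
    preds_mask = hanlp.utils.torch_util.lengths_to_mask(num_preds, 2)  # [2, max_preds]
    # [2, max_args, max_preds]: True only where both the argument and the predicate slots are real
    return args_mask.unsqueeze(2) & preds_mask.unsqueeze(1)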
================================================
FILE: hanlp/components/srl/span_rank/srl_eval_utils.py
================================================
# Evaluation util functions for PropBank SRL.
import codecs
import collections
import operator
import tempfile
from collections import Counter
from hanlp.metrics.srl.srlconll import official_conll_05_evaluate
_SRL_CONLL_EVAL_SCRIPT = "../run_eval.sh"
def split_example_for_eval(example):
"""Split document-based samples into sentence-based samples for evaluation.
Args:
example:
Returns:
"""
sentences = example["sentences"]
num_words = sum(len(s) for s in sentences)
word_offset = 0
samples = []
# assert len(sentences) == 1
for i, sentence in enumerate(sentences):
        # assert i == 0  # For CoNLL-2005, each document always contains exactly one sentence.
srl_rels = {}
ner_spans = [] # Unused.
for r in example["srl"][i]:
pred_id = r[0] - word_offset
if pred_id not in srl_rels:
srl_rels[pred_id] = []
srl_rels[pred_id].append((r[1] - word_offset, r[2] - word_offset, r[3]))
samples.append((sentence, srl_rels, ner_spans))
word_offset += len(sentence)
return samples
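# A toy walk-through (hypothetical data): a two-sentence document whose SRL tuples
# (predicate, arg_start, arg_end, label) use document-level offsets is split into
# per-sentence samples with sentence-level offsets.
def _demo_split_example_for_eval():
    example = {
        "sentences": [["He", "sleeps"], ["She", "reads", "books"]],
        "srl": [[(1, 0, 0, "ARG0")], [(3, 4, 4, "ARG1")]],
    }
    # [(["He", "sleeps"], {1: [(0, 0, "ARG0")]}, []),
    #  (["She", "reads", "books"], {1: [(2, 2, "ARG1")]}, [])]
    return split_example_for_eval(example)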
def evaluate_retrieval(span_starts, span_ends, span_scores, pred_starts, pred_ends, gold_spans,
text_length, evaluators, debugging=False):
"""Evaluation for unlabeled retrieval.
Args:
gold_spans: Set of tuples of (start, end).
span_starts:
span_ends:
span_scores:
pred_starts:
pred_ends:
text_length:
evaluators:
debugging: (Default value = False)
Returns:
"""
if len(span_starts) > 0:
sorted_starts, sorted_ends, sorted_scores = list(zip(*sorted(
zip(span_starts, span_ends, span_scores),
key=operator.itemgetter(2), reverse=True)))
    else:
        sorted_starts = []
        sorted_ends = []
        sorted_scores = []  # keep defined for the debugging branch below
for k, evaluator in list(evaluators.items()):
if k == -3:
predicted_spans = set(zip(span_starts, span_ends)) & gold_spans
else:
if k == -2:
predicted_starts = pred_starts
predicted_ends = pred_ends
if debugging:
print("Predicted", list(zip(sorted_starts, sorted_ends, sorted_scores))[:len(gold_spans)])
print("Gold", gold_spans)
# FIXME: scalar index error
elif k == 0:
is_predicted = span_scores > 0
predicted_starts = span_starts[is_predicted]
predicted_ends = span_ends[is_predicted]
else:
if k == -1:
num_predictions = len(gold_spans)
else:
                    num_predictions = (k * text_length) // 100  # integer division so the count can be used as a slice bound
predicted_starts = sorted_starts[:num_predictions]
predicted_ends = sorted_ends[:num_predictions]
predicted_spans = set(zip(predicted_starts, predicted_ends))
evaluator.update(gold_set=gold_spans, predicted_set=predicted_spans)
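# A minimal sketch with a stub evaluator (hypothetical class, not part of HanLP):
# at k = -1 the function keeps as many top-scoring spans as there are gold spans.
class _DemoRetrievalEvaluator:
    def __init__(self):
        self.matched = self.gold = self.predicted = 0
    def update(self, gold_set, predicted_set):
        self.matched += len(gold_set & predicted_set)
        self.gold += len(gold_set)
        self.predicted += len(predicted_set)
def _demo_evaluate_retrieval():
    evaluator = _DemoRetrievalEvaluator()
    evaluate_retrieval(span_starts=[0, 2, 3], span_ends=[1, 2, 4],
                       span_scores=[0.9, 0.1, 0.8], pred_starts=[], pred_ends=[],
                       gold_spans={(0, 1), (3, 4)}, text_length=5,
                       evaluators={-1: evaluator})
    return evaluator.matched, evaluator.gold, evaluator.predicted  # (2, 2, 2)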
def _calc_f1(total_gold, total_predicted, total_matched, message=None):
precision = total_matched / total_predicted if total_predicted > 0 else 0
recall = total_matched / total_gold if total_gold > 0 else 0
f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0
if message:
print(("{}: Precision: {:.2%} Recall: {:.2%} F1: {:.2%}".format(message, precision, recall, f1)))
return precision, recall, f1
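# A worked example of the arithmetic above (hypothetical counts):
# precision = 6/8 = 0.75, recall = 6/10 = 0.6, F1 = 2*0.75*0.6/1.35 ≈ 0.667.
def _demo_calc_f1():
    return _calc_f1(total_gold=10, total_predicted=8, total_matched=6)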
def compute_span_f1(gold_data, predictions, task_name):
assert len(gold_data) == len(predictions)
total_gold = 0
total_predicted = 0
total_matched = 0
total_unlabeled_matched = 0
label_confusions = Counter() # Counter of (gold, pred) label pairs.
for i in range(len(gold_data)):
gold = gold_data[i]
pred = predictions[i]
total_gold += len(gold)
total_predicted += len(pred)
for a0 in gold:
for a1 in pred:
if a0[0] == a1[0] and a0[1] == a1[1]:
total_unlabeled_matched += 1
label_confusions.update([(a0[2], a1[2]), ])
if a0[2] == a1[2]:
total_matched += 1
prec, recall, f1 = _calc_f1(total_gold, total_predicted, total_matched, task_name)
ul_prec, ul_recall, ul_f1 = _calc_f1(total_gold, total_predicted, total_unlabeled_matched,
"Unlabeled " + task_name)
return prec, recall, f1, ul_prec, ul_recall, ul_f1, label_confusions
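# A toy call (hypothetical spans): one gold span, two predicted spans, one labeled match,
# giving precision 0.5, recall 1.0 and F1 ≈ 0.667, here for both labeled and unlabeled.
def _demo_compute_span_f1():
    gold = [[(0, 1, "ARG0")]]
    pred = [[(0, 1, "ARG0"), (2, 3, "ARG1")]]
    return compute_span_f1(gold, pred, "demo")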
def compute_unlabeled_span_f1(gold_data, predictions, task_name):
assert len(gold_data) == len(predictions)
total_gold = 0
total_predicted = 0
total_matched = 0
total_unlabeled_matched = 0
label_confusions = Counter() # Counter of (gold, pred) label pairs.
for i in range(len(gold_data)):
gold = gold_data[i]
pred = predictions[i]
total_gold += len(gold)
total_predicted += len(pred)
for a0 in gold:
for a1 in pred:
if a0[0] == a1[0] and a0[1] == a1[1]:
total_unlabeled_matched += 1
label_confusions.update([(a0[2], a1[2]), ])
if a0[2] == a1[2]:
total_matched += 1
prec, recall, f1 = _calc_f1(total_gold, total_predicted, total_matched, task_name)
ul_prec, ul_recall, ul_f1 = _calc_f1(total_gold, total_predicted, total_unlabeled_matched,
"Unlabeled " + task_name)
return prec, recall, f1, ul_prec, ul_recall, ul_f1, label_confusions
SRLScores = collections.namedtuple('SRLScores',
['unlabeled_precision', 'unlabeled_recall', 'unlabeled_f1', 'precision', 'recall',
'f1', 'conll_precision', 'conll_recall', 'conll_f1', 'label_confusions',
'num_sents'])
def compute_srl_f1(sentences, gold_srl, predictions, gold_path=None) -> SRLScores:
assert len(gold_srl) == len(predictions)
total_gold = 0
total_predicted = 0
total_matched = 0
total_unlabeled_matched = 0
num_sents = 0
label_confusions = Counter()
# Compute unofficial F1 of SRL relations.
for gold, prediction in zip(gold_srl, predictions):
gold_rels = 0
pred_rels = 0
matched = 0
for pred_id, gold_args in gold.items():
filtered_gold_args = [a for a in gold_args if a[2] not in ["V", "C-V"]]
total_gold += len(filtered_gold_args)
gold_rels += len(filtered_gold_args)
if pred_id not in prediction:
continue
for a0 in filtered_gold_args:
for a1 in prediction[pred_id]:
if a0[0] == a1[0] and a0[1] == a1[1]:
total_unlabeled_matched += 1
label_confusions.update([(a0[2], a1[2]), ])
if a0[2] == a1[2]:
total_matched += 1
matched += 1
for pred_id, args in prediction.items():
            filtered_args = [a for a in args if a[2] not in ["V"]]  # unlike the gold filter above, "C-V" is kept
total_predicted += len(filtered_args)
pred_rels += len(filtered_args)
if gold_rels == matched and pred_rels == matched:
num_sents += 1
precision, recall, f1 = _calc_f1(total_gold, total_predicted, total_matched,
# "SRL (unofficial)"
)
unlabeled_precision, unlabeled_recall, unlabeled_f1 = _calc_f1(total_gold, total_predicted,
total_unlabeled_matched,
# "Unlabeled SRL (unofficial)"
)
# Prepare to compute official F1.
if not gold_path:
# print("No gold conll_eval data provided. Recreating ...")
gold_path = tempfile.NamedTemporaryFile().name
print_to_conll(sentences, gold_srl, gold_path, None)
gold_predicates = None
else:
gold_predicates = read_gold_predicates(gold_path)
temp_output = tempfile.NamedTemporaryFile().name
# print(("Output temp outoput {}".format(temp_output)))
print_to_conll(sentences, predictions, temp_output, gold_predicates)
# Evaluate twice with official script.
conll_precision, conll_recall, conll_f1 = official_conll_05_evaluate(temp_output, gold_path)
return SRLScores(unlabeled_precision, unlabeled_recall, unlabeled_f1, precision, recall, f1, conll_precision,
conll_recall, conll_f1, label_confusions, num_sents)
def print_sentence_to_conll(fout, tokens, labels):
"""Print a labeled sentence into CoNLL format.
Args:
fout:
tokens:
labels:
Returns:
"""
for label_column in labels:
assert len(label_column) == len(tokens)
for i in range(len(tokens)):
fout.write(tokens[i].ljust(15))
for label_column in labels:
fout.write(label_column[i].rjust(15))
fout.write("\n")
fout.write("\n")
def read_gold_predicates(gold_path):
print("gold path", gold_path)
fin = codecs.open(gold_path, "r", "utf-8")
gold_predicates = [[], ]
for line in fin:
line = line.strip()
if not line:
gold_predicates.append([])
else:
info = line.split()
gold_predicates[-1].append(info[0])
fin.close()
return gold_predicates
def print_to_conll(sentences, srl_labels, output_filename, gold_predicates=None):
fout = codecs.open(output_filename, "w", "utf-8")
for sent_id, words in enumerate(sentences):
if gold_predicates:
assert len(gold_predicates[sent_id]) == len(words)
pred_to_args = srl_labels[sent_id]
props = ["-" for _ in words]
col_labels = [["*" for _ in words] for _ in range(len(pred_to_args))]
for i, pred_id in enumerate(sorted(pred_to_args.keys())):
            # To make sure the CoNLL-eval script counts matching predicates as correct.
if gold_predicates and gold_predicates[sent_id][pred_id] != "-":
props[pred_id] = gold_predicates[sent_id][pred_id]
else:
props[pred_id] = "P" + words[pred_id]
flags = [False for _ in words]
for start, end, label in pred_to_args[pred_id]:
if not max(flags[start:end + 1]):
col_labels[i][start] = "(" + label + col_labels[i][start]
col_labels[i][end] = col_labels[i][end] + ")"
for j in range(start, end + 1):
flags[j] = True
# Add unpredicted verb (for predicted SRL).
            if not flags[pred_id]:  # the predicate position is not covered by any argument span
col_labels[i][pred_id] = "(V*)"
print_sentence_to_conll(fout, props, col_labels)
fout.close()
================================================
FILE: hanlp/components/srl/span_rank/util.py
================================================
# Adapted from https://github.com/KiroSummer/A_Syntax-aware_MTL_Framework_for_Chinese_SRL
import torch
def block_orth_normal_initializer(input_size, output_size):
weight = []
for o in output_size:
for i in input_size:
param = torch.FloatTensor(o, i)
torch.nn.init.orthogonal_(param)
weight.append(param)
return torch.cat(weight)
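# A small sanity sketch (hypothetical sizes): each (o, i) block is orthogonally
# initialized, then the blocks are concatenated along dim 0, e.g. output sizes
# [3, 5] with input size [4] yield a (3 + 5) x 4 weight.
def _demo_block_orth_normal_initializer():
    weight = block_orth_normal_initializer([4], [3, 5])
    assert weight.shape == (8, 4)
    return weight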
================================================
FILE: hanlp/components/sts/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-05-20 17:02
================================================
FILE: hanlp/components/sts/transformer_sts.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-05-20 17:03
import logging
from typing import Union, List
import torch
from torch.utils.data import DataLoader
from hanlp.common.structure import History
from hanlp.layers.transformers.pt_imports import AutoConfig_, AutoTokenizer_
from transformers import AutoModelForSequenceClassification
from transformers.modeling_outputs import SequenceClassifierOutput
from hanlp.common.dataset import SortingSamplerBuilder, PadSequenceDataLoader
from hanlp.common.torch_component import TorchComponent
from hanlp.datasets.sts.stsb import SemanticTextualSimilarityDataset
from hanlp.layers.transformers.utils import build_optimizer_scheduler_with_transformer
from hanlp.metrics.spearman_correlation import SpearmanCorrelation
from hanlp.transform.transformer_tokenizer import TransformerTextTokenizer
from hanlp.utils.time_util import CountdownTimer
from hanlp_common.util import merge_locals_kwargs, reorder
from hanlp_common.constant import IDX
class TransformerSemanticTextualSimilarity(TorchComponent):
def __init__(self, **kwargs) -> None:
"""
A simple Semantic Textual Similarity (STS) baseline which fine-tunes a transformer with a regression layer on
top of it.
Args:
**kwargs: Predefined config.
"""
super().__init__(**kwargs)
self._tokenizer = None
# noinspection PyMethodOverriding
def build_dataloader(self, data, batch_size, sent_a_col=None,
sent_b_col=None,
similarity_col=None,
delimiter='auto',
gradient_accumulation=1,
sampler_builder=None,
shuffle=False, device=None, logger: logging.Logger = None,
split=None,
**kwargs) -> DataLoader:
dataset = SemanticTextualSimilarityDataset(data,
sent_a_col,
sent_b_col,
similarity_col,
delimiter=delimiter,
transform=self._tokenizer,
cache=isinstance(data, str))
if split == 'trn':
scores = [x['similarity'] for x in dataset]
self.config.max_score = max(scores)
self.config.min_score = min(scores)
if not sampler_builder:
sampler_builder = SortingSamplerBuilder(batch_size=batch_size)
lens = [len(x['input_ids']) for x in dataset]
return PadSequenceDataLoader(dataset, batch_sampler=sampler_builder.build(lens, shuffle, gradient_accumulation),
device=device,
pad={'similarity': 0.0, 'input_ids': self._tokenizer.tokenizer.pad_token_id})
def build_optimizer(self, trn, epochs, gradient_accumulation=1, lr=1e-3, transformer_lr=5e-5, adam_epsilon=1e-8,
weight_decay=0.0, warmup_steps=0.1, **kwargs):
num_training_steps = len(trn) * epochs // gradient_accumulation
optimizer, scheduler = build_optimizer_scheduler_with_transformer(self.model,
self.model.base_model,
lr, transformer_lr,
num_training_steps, warmup_steps,
weight_decay, adam_epsilon)
return optimizer, scheduler
def build_criterion(self, **kwargs):
pass
def build_metric(self, **kwargs):
return SpearmanCorrelation()
def execute_training_loop(self, trn: DataLoader, dev: DataLoader, epochs, criterion, optimizer, metric, save_dir,
logger: logging.Logger, devices, ratio_width=None, gradient_accumulation=1, **kwargs):
best_epoch, best_metric = 0, -1
timer = CountdownTimer(epochs)
history = History()
for epoch in range(1, epochs + 1):
logger.info(f"[yellow]Epoch {epoch} / {epochs}:[/yellow]")
self.fit_dataloader(trn, criterion, optimizer, metric, logger, ratio_width=ratio_width,
gradient_accumulation=gradient_accumulation, history=history, save_dir=save_dir)
report = f'{timer.elapsed_human}/{timer.total_time_human}'
self.evaluate_dataloader(dev, logger, ratio_width=ratio_width, save_dir=save_dir, metric=metric)
if metric > best_metric:
self.save_weights(save_dir)
best_metric = float(metric)
best_epoch = epoch
report += ' [red]saved[/red]'
timer.log(report, ratio_percentage=False, newline=True, ratio=False)
if best_epoch and best_epoch != epochs:
logger.info(f'Restored the best model with {best_metric} saved {epochs - best_epoch} epochs ago')
self.load_weights(save_dir)
def fit_dataloader(self, trn: DataLoader, criterion, optimizer, metric: SpearmanCorrelation, logger: logging.Logger,
history=None, gradient_accumulation=1, **kwargs):
self.model.train()
optimizer, scheduler = optimizer
timer = CountdownTimer(history.num_training_steps(len(trn), gradient_accumulation=gradient_accumulation))
total_loss = 0
metric.reset()
for batch in trn:
output = self.feed_batch(batch)
prediction = self.decode(output)
metric(prediction, batch['similarity'])
loss = output['loss']
if gradient_accumulation and gradient_accumulation > 1:
loss /= gradient_accumulation
loss.backward()
total_loss += loss.item()
if history.step(gradient_accumulation):
if self.config.grad_norm:
torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.config.grad_norm)
optimizer.step()
if scheduler:
scheduler.step()
optimizer.zero_grad()
timer.log(self.report_metrics(total_loss / (timer.current + 1), metric), ratio_percentage=None,
logger=logger)
del loss
return total_loss / timer.total
@torch.no_grad()
def evaluate_dataloader(self, data: DataLoader, logger: logging.Logger, metric=None, output=False, **kwargs):
self.model.eval()
timer = CountdownTimer(len(data))
total_loss = 0
metric.reset()
if output:
predictions = []
orders = []
samples = []
for batch in data:
output_dict = self.feed_batch(batch)
prediction = self.decode(output_dict)
metric(prediction, batch['similarity'])
if output:
predictions.extend(prediction.tolist())
orders.extend(batch[IDX])
samples.extend(list(zip(batch['sent_a'], batch['sent_b'])))
loss = output_dict['loss']
total_loss += loss.item()
timer.log(self.report_metrics(total_loss / (timer.current + 1), metric), ratio_percentage=None,
logger=logger)
del loss
if output:
predictions = reorder(predictions, orders)
samples = reorder(samples, orders)
with open(output, 'w') as out:
for s, p in zip(samples, predictions):
out.write('\t'.join(s + (str(p),)))
out.write('\n')
return total_loss / timer.total
# noinspection PyMethodOverriding
def build_model(self, transformer, training=True, **kwargs) -> torch.nn.Module:
config = AutoConfig_.from_pretrained(transformer, num_labels=1)
if training:
model = AutoModelForSequenceClassification.from_pretrained(transformer, config=config)
else:
model = AutoModelForSequenceClassification.from_config(config)
return model
def predict(self, data: Union[List[str], List[List[str]]], batch_size: int = None, **kwargs) -> Union[
float, List[float]]:
""" Predict the similarity between sentence pairs.
Args:
data: Sentence pairs.
batch_size: The number of samples in a batch.
**kwargs: Not used.
Returns:
Similarities between sentences.
"""
if not data:
return []
flat = isinstance(data[0], str)
if flat:
data = [data]
dataloader = self.build_dataloader([{'sent_a': x[0], 'sent_b': x[1]} for x in data],
batch_size=batch_size or self.config.batch_size,
device=self.device)
orders = []
predictions = []
for batch in dataloader:
output_dict = self.feed_batch(batch)
prediction = self.decode(output_dict)
predictions.extend(prediction.tolist())
orders.extend(batch[IDX])
predictions = reorder(predictions, orders)
if flat:
return predictions[0]
return predictions
# noinspection PyMethodOverriding
def fit(self, trn_data, dev_data, save_dir,
transformer,
sent_a_col,
sent_b_col,
similarity_col,
delimiter='auto',
batch_size=32,
max_seq_len=128,
epochs=3,
lr=1e-3,
transformer_lr=5e-5,
adam_epsilon=1e-8,
weight_decay=0.0,
warmup_steps=0.1,
gradient_accumulation=1,
grad_norm=1.0,
sampler_builder=None,
devices=None,
logger=None,
seed=None,
finetune: Union[bool, str] = False, eval_trn=True, _device_placeholder=False, **kwargs):
return super().fit(**merge_locals_kwargs(locals(), kwargs))
def on_config_ready(self, transformer, max_seq_len, **kwargs):
super().on_config_ready(**kwargs)
self._tokenizer = TransformerTextTokenizer(AutoTokenizer_.from_pretrained(transformer),
text_a_key='sent_a',
text_b_key='sent_b',
output_key='',
max_seq_length=max_seq_len)
def feed_batch(self, batch) -> SequenceClassifierOutput:
return self.model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'],
token_type_ids=batch['token_type_ids'], labels=batch.get('similarity', None))
def decode(self, output: SequenceClassifierOutput):
return output.logits.squeeze(-1).detach().clip(self.config.min_score, self.config.max_score)
def report_metrics(self, loss, metric):
return f'loss: {loss:.4f} {metric}'
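# A minimal usage sketch (all paths and column indices below are hypothetical):
# fine-tune on a tab-separated STS corpus, then score a sentence pair.
def _demo_sts_usage():
    sts = TransformerSemanticTextualSimilarity()
    sts.fit('data/sts/train.tsv', 'data/sts/dev.tsv', 'save/sts',
            transformer='bert-base-uncased',
            sent_a_col=0, sent_b_col=1, similarity_col=2)
    return sts.predict(['A man is playing a guitar.', 'A person plays the guitar.'])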
================================================
FILE: hanlp/components/taggers/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-08-28 15:39
================================================
FILE: hanlp/components/taggers/cnn_tagger_tf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-10-31 13:52
from abc import ABC
from typing import Union, Tuple, Any, List, Iterable
import tensorflow as tf
from hanlp.components.taggers.tagger_tf import TaggerComponent
from hanlp.transform.tsv_tf import TSVTaggingTransform
from hanlp.common.vocab_tf import VocabTF
from hanlp.layers.embeddings.util_tf import build_embedding
class WindowTokenTransform(TSVTaggingTransform):
def fit(self, trn_path: str, **kwargs):
self.word_vocab = VocabTF()
self.tag_vocab = VocabTF(pad_token=None, unk_token=None)
for ngrams, tags in self.file_to_samples(trn_path):
for words in ngrams:
self.word_vocab.update(words)
self.tag_vocab.update(tags)
def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]:
window_radius = self.config.window_radius
window_size = 2 * window_radius + 1
types = tf.string, tf.string
shapes = [None, window_size], [None]
values = self.word_vocab.pad_token, self.tag_vocab.first_token
return types, shapes, values
def inputs_to_samples(self, inputs, gold=False):
window_radius = self.config.window_radius
for t in inputs:
if gold:
words, tags = t
else:
words, tags = t, [self.padding_values[-1]] * len(t)
ngrams = []
for i, word in enumerate(words):
features = []
for t in range(-window_radius, window_radius + 1):
index = i + t
if index < 0:
feature = 'bos{}'.format(index)
elif index >= len(words):
feature = 'eos+{}'.format(index - len(words) + 1)
else:
feature = words[index]
features.append(feature)
ngrams.append(features)
yield ngrams, tags
def X_to_inputs(self, X: Union[tf.Tensor, Tuple[tf.Tensor]]) -> Iterable:
for xs in X:
words = []
for x in xs:
words.append(self.word_vocab.idx_to_token[int(x[len(x) // 2])])
yield words
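# A standalone sketch of the windowing performed in inputs_to_samples above: with
# window_radius=1, each token is represented by its [left, center, right] neighbors,
# padded with bos/eos markers at the sentence boundaries.
def _demo_window_features(words=('商', '品', '和'), window_radius=1):
    ngrams = []
    for i in range(len(words)):
        features = []
        for t in range(-window_radius, window_radius + 1):
            index = i + t
            if index < 0:
                features.append('bos{}'.format(index))
            elif index >= len(words):
                features.append('eos+{}'.format(index - len(words) + 1))
            else:
                features.append(words[index])
        ngrams.append(features)
    # [['bos-1', '商', '品'], ['商', '品', '和'], ['品', '和', 'eos+1']]
    return ngrams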
class CNNTaggingModel(tf.keras.models.Model):
def __init__(self, filters, num_tags, embed, dropout, kernels, **kwargs):
super().__init__()
self.embed = embed
self.embed_dropout = tf.keras.layers.Dropout(rate=dropout)
self.conv2d = []
for k in kernels:
self.conv2d.append(
tf.keras.layers.Conv2D(filters=filters, kernel_size=k, data_format='channels_last', padding='same'))
self.conv2d_dropout = tf.keras.layers.Dropout(rate=dropout)
self.concat = tf.keras.layers.Concatenate()
self.dense = tf.keras.layers.Dense(units=num_tags)
def call(self, inputs, **kwargs):
# if inputs.shape_h[0] is None:
# return tf.zeros_like()
# print(inputs)
embeds = self.embed(inputs)
embeds = self.embed_dropout(embeds)
hs = [conv(embeds) for conv in self.conv2d]
h = self.concat(hs)
h = self.conv2d_dropout(h)
shape_h = tf.shape(h)
h = tf.reshape(h, [shape_h[0], shape_h[1], h.shape[2] * h.shape[3]])
o = self.dense(h)
if h.shape[0]:
mask = embeds._keras_mask[:, :, 0]
o._keras_mask = mask
return o
class CNNTaggerTF(TaggerComponent, ABC):
def __init__(self, transform: WindowTokenTransform = None) -> None:
if not transform:
transform = WindowTokenTransform()
super().__init__(transform)
self.model: CNNTaggingModel = self.model # refine the type
self.transform: WindowTokenTransform = self.transform
def build_model(self, embedding, **kwargs) -> tf.keras.Model:
embed = build_embedding(embedding, self.transform.word_vocab, self.transform)
self.transform.map_x = embed.dtype != tf.string
model = CNNTaggingModel(num_tags=len(self.transform.tag_vocab),
embed=embed,
**kwargs)
# model.build((None, None, 3))
return model
# noinspection PyMethodOverriding
def fit(self, trn_data: Any, dev_data: Any, save_dir: str, embedding=200, window_radius=3,
kernels=(1, 2, 3, 4, 5), filters=200, dropout=0.3,
loss: Union[tf.keras.losses.Loss, str] = None,
optimizer: Union[str, tf.keras.optimizers.Optimizer] = 'adam', metrics='accuracy', batch_size=100,
epochs=100,
logger=None, verbose=True, **kwargs):
kwargs.update(locals())
for k in 'self', 'kwargs', '__class__':
kwargs.pop(k)
super().fit(**kwargs)
@property
def input_shape(self) -> List:
return [[None, None, self.config.window_radius * 2 + 1]]
================================================
FILE: hanlp/components/taggers/ngram_conv/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-29 22:18
================================================
FILE: hanlp/components/taggers/ngram_conv/ngram_conv_tagger.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-10-25 00:04
from typing import Union, Optional, Tuple, Any, Iterable, List
import tensorflow as tf
from hanlp_common.structure import SerializableDict
from hanlp.components.taggers.tagger_tf import TaggerComponent
from hanlp.transform.tsv_tf import TSVTaggingTransform
from hanlp.transform.txt_tf import bmes_to_words, extract_ngram_features
from hanlp.common.vocab_tf import VocabTF
from hanlp.layers.embeddings.util_tf import build_embedding
from hanlp.layers.weight_normalization import WeightNormalization
from hanlp_common.util import merge_locals_kwargs
class NgramTransform(TSVTaggingTransform):
def __init__(self, config: SerializableDict = None, map_x=True, map_y=True, **kwargs) -> None:
super().__init__(config, map_x, map_y, **kwargs)
self.ngram_vocab: Optional[VocabTF] = None
self.tag_vocab: Optional[VocabTF] = None
def inputs_to_samples(self, inputs, gold=False):
for data in inputs:
if gold:
words, tags = data
else:
words, tags = data, [self.tag_vocab.safe_pad_token] * len(data)
features = [words]
if not tags:
tags = [self.tag_vocab.first_token] * len(words)
features.extend(extract_ngram_features(words, False, self.config.window_size))
yield tuple(features), tags
def x_to_idx(self, x) -> Union[tf.Tensor, Tuple]:
ids = [self.word_vocab.lookup(x[0]) if self.config.map_word_feature else x[0]]
for ngram in x[1:]:
ids.append(self.ngram_vocab.lookup(ngram))
return tuple(ids)
def y_to_idx(self, y) -> tf.Tensor:
return self.tag_vocab.lookup(y)
def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]:
window_size = self.config.window_size
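        # e.g. window_size=4 yields 4 * 5 // 2 = 10 n-gram feature columns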
ngram_size = window_size * (window_size + 1) // 2
vec_dim = 2 + ngram_size
shapes = tuple([[None]] * (vec_dim - 1)), [None]
types = tuple([tf.string] * (vec_dim - 1)), tf.string
word_vocab, ngram_vocab, tag_vocab = self.word_vocab, self.ngram_vocab, self.tag_vocab
defaults = tuple([word_vocab.pad_token] + [
ngram_vocab.pad_token if ngram_vocab else word_vocab.pad_token] * ngram_size), (
tag_vocab.pad_token if tag_vocab.pad_token else tag_vocab.first_token)
return types, shapes, defaults
def fit(self, trn_path: str, **kwargs):
word_vocab, ngram_vocab, tag_vocab = VocabTF(), VocabTF(), VocabTF(pad_token=None, unk_token=None)
num_samples = 0
for X, Y in self.file_to_samples(trn_path, gold=True):
num_samples += 1
word_vocab.update(X[0])
for ngram in X[1:]:
ngram_vocab.update(filter(lambda x: x, ngram))
tag_vocab.update(Y)
self.word_vocab, self.ngram_vocab, self.tag_vocab = word_vocab, ngram_vocab, tag_vocab
if self.config.window_size:
vocabs = word_vocab, ngram_vocab, tag_vocab
else:
vocabs = word_vocab, None, tag_vocab
self.word_vocab, self.ngram_vocab, self.tag_vocab = vocabs
return num_samples
def X_to_inputs(self, X: Union[tf.Tensor, Tuple[tf.Tensor]]) -> Iterable:
yield from super().X_to_inputs(X[0])
def input_truth_output_to_str(self, input: List[str], truth: List[str], output: List[str]):
words = bmes_to_words(input, output)
return ' '.join(words)
class NgramConvTaggingModel(tf.keras.models.Model):
def __init__(self, word_embed: tf.keras.layers.Embedding, ngram_embed: tf.keras.layers.Embedding, filters,
kernel_size, dropout_embed, dropout_hidden, weight_norm, num_tags, **kwargs):
super().__init__(**kwargs)
if ngram_embed is not None:
self.ngram_embed = ngram_embed
self.word_embed = word_embed
# self.concat = tf.keras.layers.Concatenate(axis=2)
self.dropout_embed = tf.keras.layers.Dropout(rate=dropout_embed)
self.filters_w = []
self.filters_v = []
def create_conv1d(filter, name):
conv = tf.keras.layers.Conv1D(filter, kernel_size, padding="same", name=name)
if weight_norm:
conv_norm = WeightNormalization(conv, name=name + '_norm', data_init=False)
return conv_norm
return conv
for idx, filter in enumerate(filters):
self.filters_w.append(create_conv1d(filter, 'Conv1Dw_{}'.format(idx)))
self.filters_v.append(create_conv1d(filter, 'Conv1Dv_{}'.format(idx)))
self.dropout_hidden = tf.keras.layers.Dropout(rate=dropout_hidden)
self.dense = tf.keras.layers.Dense(num_tags, use_bias=False)
def call(self, inputs, **kwargs):
if hasattr(self, 'ngram_embed'):
chars, ngrams = inputs[0], inputs[1:]
embeds = [self.word_embed(chars)]
mask = embeds[0]._keras_mask
for ngram in ngrams:
embeds.append(self.ngram_embed(ngram))
if len(embeds) > 1:
embed_input = tf.concat(embeds, axis=2)
else:
embed_input = embeds[0]
else:
chars = inputs if isinstance(inputs, tf.Tensor) else inputs[0]
embed_input = self.word_embed(chars)
mask = embed_input._keras_mask
mask_float = tf.dtypes.cast(mask, tf.float32)
embed_input = self.dropout_embed(embed_input)
hidden_output = embed_input
for fw, fv in zip(self.filters_w.layers, self.filters_v.layers):
w = fw(hidden_output)
v = fv(hidden_output)
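            # gated linear unit: the sigmoid of v gates the linear response w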
hidden_output = w * tf.nn.sigmoid(v)
# Mask paddings.
hidden_output = hidden_output * tf.expand_dims(mask_float, -1)
hidden_output = self.dropout_hidden(hidden_output)
# dirty hack
hidden_output._keras_mask = mask
logits = self.dense(hidden_output)
return logits
class NgramConvTaggerTF(TaggerComponent):
def __init__(self, transform: NgramTransform = None) -> None:
if not transform:
transform = NgramTransform()
super().__init__(transform)
self.transform: NgramTransform = transform
def build_model(self, word_embed, ngram_embed, window_size, weight_norm, filters, kernel_size, dropout_embed,
dropout_hidden, **kwargs) -> tf.keras.Model:
word_vocab, ngram_vocab, tag_vocab = self.transform.word_vocab, self.transform.ngram_vocab, \
self.transform.tag_vocab
word_embed = build_embedding(word_embed, word_vocab, self.transform)
if 'map_x' in self.config:
self.config.map_word_feature = self.config.map_x
del self.config.map_x
else:
self.config.map_word_feature = True
if window_size:
ngram_embed = build_embedding(ngram_embed, ngram_vocab, self.transform)
else:
ngram_embed = None
model = NgramConvTaggingModel(word_embed, ngram_embed, filters, kernel_size, dropout_embed, dropout_hidden,
weight_norm, len(tag_vocab))
return model
def fit(self, trn_data: Any, dev_data: Any, save_dir: str, word_embed: Union[str, int, dict] = 200,
            ngram_embed: Union[str, int, dict] = 50, embedding_trainable=True, window_size=4, kernel_size=3,
filters=(200, 200, 200, 200, 200), dropout_embed=0.2, dropout_hidden=0.2, weight_norm=True,
loss: Union[tf.keras.losses.Loss, str] = None,
optimizer: Union[str, tf.keras.optimizers.Optimizer] = 'adam', metrics='accuracy', batch_size=100,
epochs=100,
logger=None, verbose=True, **kwargs):
assert kwargs.get('run_eagerly', True), 'NgramConvTaggingModel can only run eagerly'
kwargs['run_eagerly'] = True
return super().fit(**merge_locals_kwargs(locals(), kwargs))
================================================
FILE: hanlp/components/taggers/pos_tf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-05 23:05
from hanlp.components.taggers.cnn_tagger_tf import CNNTaggerTF
from hanlp.components.taggers.rnn_tagger_tf import RNNTaggerTF
class CNNPartOfSpeechTaggerTF(CNNTaggerTF):
pass
class RNNPartOfSpeechTaggerTF(RNNTaggerTF):
pass
================================================
FILE: hanlp/components/taggers/rnn/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-05-19 15:41
================================================
FILE: hanlp/components/taggers/rnn/rnntaggingmodel.py
================================================
# MIT License
#
# Copyright (c) 2020 Yu Zhang
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
from typing import Union
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_sequence, pad_packed_sequence
from hanlp.layers.crf.crf import CRF
class RNNTaggingModel(nn.Module):
def __init__(self,
embed: Union[nn.Embedding, int],
rnn_input,
rnn_hidden,
n_out,
drop=0.5,
crf=True,
crf_constraints=None):
super(RNNTaggingModel, self).__init__()
# the embedding layer
if isinstance(embed, nn.Module):
self.embed = embed
n_embed = embed.embedding_dim
else:
self.embed = None
n_embed = embed
if rnn_input:
self.embed_to_rnn = nn.Linear(n_embed, rnn_input)
else:
self.embed_to_rnn = None
rnn_input = n_embed
# the word-lstm layer
self.word_lstm = nn.LSTM(input_size=rnn_input,
hidden_size=rnn_hidden,
batch_first=True,
bidirectional=True)
# the output layer
self.out = nn.Linear(rnn_hidden * 2, n_out)
# the CRF layer
self.crf = CRF(n_out, crf_constraints) if crf else None
self.drop = nn.Dropout(drop)
# self.drop = SharedDropout(drop)
# self.drop = LockedDropout(drop)
self.reset_parameters()
def reset_parameters(self):
# init Linear
nn.init.xavier_uniform_(self.out.weight)
def forward(self,
x: torch.Tensor,
batch=None,
**kwargs):
# get the mask and lengths of given batch
mask = x.gt(0)
lens = mask.sum(dim=1)
# get outputs from embedding layers
if isinstance(self.embed, nn.Embedding):
x = self.embed(x[mask])
else:
x = self.embed(batch, mask=mask)
if x.dim() == 3:
x = x[mask]
x = self.drop(x)
if self.embed_to_rnn:
x = self.embed_to_rnn(x)
x = pack_sequence(torch.split(x, lens.tolist()), True)
x, _ = self.word_lstm(x)
x, _ = pad_packed_sequence(x, True)
x = self.drop(x)
return self.out(x), mask
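# A minimal smoke-test sketch (hypothetical sizes; assumes id 0 is the padding id and
# that sequences are already sorted by decreasing length, as pack_sequence expects):
def _demo_rnn_tagging_model():
    embed = nn.Embedding(num_embeddings=100, embedding_dim=16, padding_idx=0)
    model = RNNTaggingModel(embed, rnn_input=None, rnn_hidden=8, n_out=5, crf=False)
    x = torch.tensor([[3, 7, 2, 0], [5, 9, 0, 0]])  # batch of 2, zero-padded
    out, mask = model(x)
    # out: [2, 3, 5] emission scores (3 = longest real length); mask: [2, 4]
    return out.shape, mask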
================================================
FILE: hanlp/components/taggers/rnn_tagger.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-05-20 13:12
import logging
import torch
from torch import nn
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import DataLoader
from hanlp.common.dataset import PadSequenceDataLoader, SortingSampler, TransformableDataset
from hanlp_common.configurable import Configurable
from hanlp.common.transform import EmbeddingNamedTransform
from hanlp.common.vocab import Vocab
from hanlp.components.taggers.rnn.rnntaggingmodel import RNNTaggingModel
from hanlp.components.taggers.tagger import Tagger
from hanlp.datasets.ner.loaders.tsv import TSVTaggingDataset
from hanlp.layers.embeddings.embedding import Embedding
from hanlp.layers.embeddings.util import build_word2vec_with_vocab
from hanlp.utils.time_util import CountdownTimer
from hanlp_common.util import merge_locals_kwargs, merge_dict
class RNNTagger(Tagger):
def __init__(self, **kwargs) -> None:
"""An old-school tagger using non-contextualized embeddings and RNNs as context layer.
Args:
**kwargs: Predefined config.
"""
super().__init__(**kwargs)
self.model: RNNTaggingModel = None
# noinspection PyMethodOverriding
def execute_training_loop(self, trn: DataLoader, dev: DataLoader, epochs, criterion,
optimizer,
metric,
save_dir,
logger,
patience,
**kwargs):
max_e, max_metric = 0, -1
criterion = self.build_criterion()
timer = CountdownTimer(epochs)
ratio_width = len(f'{len(trn)}/{len(trn)}')
scheduler = self.build_scheduler(**merge_dict(self.config, optimizer=optimizer, overwrite=True))
if not patience:
patience = epochs
for epoch in range(1, epochs + 1):
logger.info(f"[yellow]Epoch {epoch} / {epochs}:[/yellow]")
self.fit_dataloader(trn, criterion, optimizer, metric, logger, ratio_width=ratio_width)
loss, dev_metric = self.evaluate_dataloader(dev, criterion, logger)
if scheduler:
if isinstance(scheduler, ReduceLROnPlateau):
scheduler.step(dev_metric.score)
else:
scheduler.step(epoch)
report_patience = f'Patience: {epoch - max_e}/{patience}'
# save the model if it is the best so far
if dev_metric > max_metric:
self.save_weights(save_dir)
max_e, max_metric = epoch, dev_metric
report_patience = '[red]Saved[/red] '
stop = epoch - max_e >= patience
if stop:
timer.stop()
timer.log(f'{report_patience} lr: {optimizer.param_groups[0]["lr"]:.4f}',
ratio_percentage=False, newline=True, ratio=False)
if stop:
break
timer.stop()
if max_e != epoch:
self.load_weights(save_dir)
logger.info(f"Max score of dev is {max_metric.score:.2%} at epoch {max_e}")
logger.info(f"{timer.elapsed_human} elapsed, average time of each epoch is {timer.elapsed_average_human}")
def build_scheduler(self, optimizer, anneal_factor, anneal_patience, **kwargs):
scheduler: ReduceLROnPlateau = ReduceLROnPlateau(optimizer,
factor=anneal_factor,
patience=anneal_patience,
mode='max') if anneal_factor and anneal_patience else None
return scheduler
def fit_dataloader(self, trn: DataLoader, criterion, optimizer, metric, logger: logging.Logger, ratio_width=None,
**kwargs):
self.model.train()
timer = CountdownTimer(len(trn))
total_loss = 0
for idx, batch in enumerate(trn):
optimizer.zero_grad()
out, mask = self.feed_batch(batch)
y = batch['tag_id']
loss = self.compute_loss(criterion, out, y, mask)
loss.backward()
nn.utils.clip_grad_norm_(self.model.parameters(), 5.0)
optimizer.step()
total_loss += loss.item()
prediction = self.decode_output(out, mask, batch)
self.update_metrics(metric, out, y, mask, batch, prediction)
            timer.log(f'loss: {total_loss / (idx + 1):.4f} {metric}', ratio_percentage=False, logger=logger,
                      ratio_width=ratio_width)
del loss
del out
del mask
def feed_batch(self, batch):
x = batch[f'{self.config.token_key}_id']
out, mask = self.model(x, **batch, batch=batch)
return out, mask
# noinspection PyMethodOverriding
def build_model(self, rnn_input, rnn_hidden, drop, crf, **kwargs) -> torch.nn.Module:
vocabs = self.vocabs
token_embed = self._convert_embed()
if isinstance(token_embed, EmbeddingNamedTransform):
token_embed = token_embed.output_dim
elif isinstance(token_embed, Embedding):
token_embed = token_embed.module(vocabs=vocabs)
else:
token_embed = build_word2vec_with_vocab(token_embed, vocabs[self.config.token_key])
model = RNNTaggingModel(token_embed, rnn_input, rnn_hidden, len(vocabs['tag']), drop, crf)
return model
def _convert_embed(self):
embed = self.config['embed']
if isinstance(embed, dict):
self.config['embed'] = embed = Configurable.from_config(embed)
return embed
def build_dataloader(self, data, batch_size, shuffle, device, logger=None, **kwargs) -> DataLoader:
vocabs = self.vocabs
token_embed = self._convert_embed()
dataset = data if isinstance(data, TransformableDataset) else self.build_dataset(data, transform=[vocabs])
if vocabs.mutable:
            # Before building vocabs, let embeddings submit their transforms; some embeddings may opt out
            # because their transforms are not relevant to vocabs.
if isinstance(token_embed, Embedding):
transform = token_embed.transform(vocabs=vocabs)
if transform:
dataset.transform.insert(-1, transform)
self.build_vocabs(dataset, logger)
if isinstance(token_embed, Embedding):
# Vocabs built, now add all transforms to the pipeline. Be careful about redundant ones.
transform = token_embed.transform(vocabs=vocabs)
if transform and transform not in dataset.transform:
dataset.transform.insert(-1, transform)
sampler = SortingSampler([len(sample[self.config.token_key]) for sample in dataset], batch_size,
shuffle=shuffle)
return PadSequenceDataLoader(dataset,
device=device,
batch_sampler=sampler,
vocabs=vocabs)
def build_dataset(self, data, transform):
return TSVTaggingDataset(data, transform)
def build_vocabs(self, dataset, logger):
self.vocabs.tag = Vocab(unk_token=None, pad_token=None)
self.vocabs[self.config.token_key] = Vocab()
        for each in dataset:  # iterating triggers the transforms, which populate the vocabs as a side effect
pass
self.vocabs.lock()
self.vocabs.summary(logger)
def fit(self, trn_data, dev_data, save_dir,
batch_size=50,
epochs=100,
embed=100,
rnn_input=None,
rnn_hidden=256,
drop=0.5,
lr=0.001,
patience=10,
crf=True,
optimizer='adam',
token_key='token',
tagging_scheme=None,
anneal_factor: float = 0.5,
anneal_patience=2,
devices=None, logger=None, verbose=True, **kwargs):
return super().fit(**merge_locals_kwargs(locals(), kwargs))
def _id_to_tags(self, ids):
batch = []
vocab = self.vocabs['tag'].idx_to_token
for b in ids:
batch.append([])
for i in b:
batch[-1].append(vocab[i])
return batch
def write_output(self, yhat, y, mask, batch, prediction, output):
pass
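# A minimal usage sketch (hypothetical TSV paths; each line is "token<TAB>tag",
# with sentences separated by blank lines):
def _demo_rnn_tagger_usage():
    tagger = RNNTagger()
    tagger.fit('data/pos/train.tsv', 'data/pos/dev.tsv', 'save/rnn_pos',
               embed=100, rnn_hidden=256, crf=True)
    return tagger.predict(['商品', '和', '服务'])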
================================================
FILE: hanlp/components/taggers/rnn_tagger_tf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-09-14 20:30
from typing import Union, List
import tensorflow as tf
from hanlp.common.transform_tf import Transform
from hanlp.components.taggers.tagger_tf import TaggerComponent
from hanlp.transform.tsv_tf import TSVTaggingTransform
from hanlp.common.vocab_tf import VocabTF
from hanlp.layers.embeddings.util_tf import build_embedding, embeddings_require_string_input, \
embeddings_require_char_input
from hanlp_common.util import merge_locals_kwargs
class RNNTaggerTF(TaggerComponent):
def __init__(self, transform: Transform = None) -> None:
if not transform:
self.transform = transform = TSVTaggingTransform()
super().__init__(transform)
def fit(self, trn_data: str, dev_data: str = None, save_dir: str = None, embeddings=100, embedding_trainable=False,
rnn_input_dropout=0.2, rnn_units=100, rnn_output_dropout=0.2, epochs=20, lower=False, logger=None,
loss: Union[tf.keras.losses.Loss, str] = None,
optimizer: Union[str, tf.keras.optimizers.Optimizer] = 'adam', metrics='accuracy',
batch_size=32, dev_batch_size=32, lr_decay_per_epoch=None, verbose=True, **kwargs):
return super().fit(**merge_locals_kwargs(locals(), kwargs))
def build_model(self, embeddings, embedding_trainable, rnn_input_dropout, rnn_output_dropout, rnn_units,
loss,
**kwargs) -> tf.keras.Model:
model = tf.keras.Sequential()
embeddings = build_embedding(embeddings, self.transform.word_vocab, self.transform)
model.add(embeddings)
if rnn_input_dropout:
model.add(tf.keras.layers.Dropout(rnn_input_dropout, name='rnn_input_dropout'))
model.add(
tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=rnn_units, return_sequences=True), name='bilstm'))
if rnn_output_dropout:
model.add(tf.keras.layers.Dropout(rnn_output_dropout, name='rnn_output_dropout'))
model.add(tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(len(self.transform.tag_vocab)), name='dense'))
return model
def predict(self, sents: Union[List[str], List[List[str]]], batch_size=32, **kwargs) -> Union[
List[str], List[List[str]]]:
return super().predict(sents, batch_size)
def save_weights(self, save_dir, filename='model.h5'):
# remove the pre-trained embedding
embedding_layer: tf.keras.layers.Embedding = self.model.get_layer(index=0)
if embedding_layer.trainable:
super().save_weights(save_dir, filename)
else:
truncated_model = tf.keras.Sequential(layers=self.model.layers[1:])
truncated_model.build(input_shape=embedding_layer.output_shape)
truncated_model.save_weights(save_dir)
def build_loss(self, loss, **kwargs):
if not loss:
loss = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.SUM,
from_logits=True)
return loss
return super().build_loss(loss, **kwargs)
@property
def tag_vocab(self) -> VocabTF:
return self.transform.tag_vocab
def build_transform(self, embeddings, **kwargs):
if embeddings_require_string_input(embeddings):
self.transform.map_x = False
if embeddings_require_char_input(embeddings):
self.transform.char_vocab = VocabTF()
return super().build_transform(**kwargs)
@property
def sample_data(self):
if self.transform.char_vocab:
# You cannot build your model by calling `build` if your layers do not support float type inputs.
# Instead, in order to instantiate and build your model, `call` your model on real tensor data (of the
# correct dtype).
sample = tf.constant([
['hello', 'world', self.transform.word_vocab.pad_token],
['hello', 'this', 'world'],
])
sample._keras_mask = tf.not_equal(sample, self.transform.word_vocab.pad_token)
return sample
================================================
FILE: hanlp/components/taggers/tagger.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-08-11 12:19
import logging
import warnings
from abc import ABC, abstractmethod
from typing import List, TextIO, Any, Union, Dict, Tuple, Sequence
import torch
from torch import optim, nn
from torch.utils.data import DataLoader
from hanlp_common.constant import IDX
from hanlp.common.structure import History
from hanlp.components.distillation.distillable_component import DistillableComponent
from hanlp.components.taggers.util import guess_tagging_scheme
from hanlp.layers.crf.crf import CRF
from hanlp.metrics.accuracy import CategoricalAccuracy
from hanlp.utils.time_util import CountdownTimer
from hanlp_common.util import reorder
from hanlp_trie import DictInterface, TrieDict
from hanlp_trie.dictionary import TupleTrieDict
class Tagger(DistillableComponent, ABC):
def build_optimizer(self, optimizer, lr, **kwargs):
if optimizer == 'adam':
return optim.Adam(params=self.model.parameters(), lr=lr)
elif optimizer == 'sgd':
return torch.optim.SGD(self.model.parameters(), lr=lr)
def build_criterion(self, model=None, reduction='mean', decoder=None, **kwargs):
if self.config.get('crf', False):
if not model:
model = decoder or self.model
if isinstance(model, nn.DataParallel):
raise ValueError('DataParallel not supported when CRF is used')
return self.model_from_config.module.crf
return model.crf
else:
return nn.CrossEntropyLoss(reduction=reduction)
def build_metric(self, **kwargs):
return CategoricalAccuracy()
@abstractmethod
def feed_batch(self, batch):
pass
def compute_loss(self, criterion, out, y, mask):
if self.config.get('crf', False):
criterion: CRF = criterion
loss = -criterion.forward(out, y, mask)
else:
loss = criterion(out[mask], y[mask])
return loss
def decode_output(self, logits, mask, batch, model=None):
if self.config.get('crf', False):
if model is None:
model = self.model
crf: CRF = model.crf
return crf.decode(logits, mask)
else:
return logits.argmax(-1)
def execute_training_loop(self, trn: DataLoader, dev: DataLoader, epochs, criterion, optimizer, metric, save_dir,
logger: logging.Logger, devices, ratio_width=None, patience=5, teacher=None,
kd_criterion=None, eval_trn=True,
**kwargs):
best_epoch, best_metric = 0, -1
timer = CountdownTimer(epochs)
history = History()
for epoch in range(1, epochs + 1):
logger.info(f"[yellow]Epoch {epoch} / {epochs}:[/yellow]")
self.fit_dataloader(trn, criterion, optimizer, metric, logger, history=history, ratio_width=ratio_width,
eval_trn=eval_trn, **self.config)
loss, dev_metric = self.evaluate_dataloader(dev, criterion, logger=logger, ratio_width=ratio_width)
timer.update()
report = f"{timer.elapsed_human} / {timer.total_time_human} ETA: {timer.eta_human}"
if dev_metric > best_metric:
best_epoch, best_metric = epoch, dev_metric
self.save_weights(save_dir)
report += ' [red](saved)[/red]'
else:
report += f' ({epoch - best_epoch})'
if epoch - best_epoch >= patience:
report += ' early stop'
logger.info(report)
if epoch - best_epoch >= patience:
break
if not best_epoch:
self.save_weights(save_dir)
elif best_epoch != epoch:
self.load_weights(save_dir)
logger.info(f"Max score of dev is {best_metric} at epoch {best_epoch}")
logger.info(f"Average time of each epoch is {timer.elapsed_average_human}")
logger.info(f"{timer.elapsed_human} elapsed")
return best_metric
def id_to_tags(self, ids: torch.LongTensor, lens: List[int]):
batch = []
vocab = self.vocabs['tag'].idx_to_token
for b, l in zip(ids, lens):
batch.append([])
for i in b[:l]:
batch[-1].append(vocab[i])
return batch
def update_metrics(self, metric, logits, y, mask, batch=None, prediction=None):
metric(logits, y, mask)
@torch.no_grad()
def evaluate_dataloader(self, data, criterion, logger=None, ratio_width=None, metric=None, output=None, **kwargs):
self.model.eval()
if isinstance(output, str):
output = open(output, 'w')
loss = 0
if not metric:
metric = self.build_metric()
else:
metric.reset()
timer = CountdownTimer(len(data))
for idx, batch in enumerate(data):
logits, mask = self.feed_batch(batch)
y = batch['tag_id']
loss += self.compute_loss(criterion, logits, y, mask).item()
prediction = self.decode_output(logits, mask, batch)
self.update_metrics(metric, logits, y, mask, batch, prediction)
if output:
self.write_prediction(prediction, batch, output)
timer.log(f'loss: {loss / (idx + 1):.4f} {metric}', ratio_percentage=False, logger=logger,
ratio_width=ratio_width)
loss /= len(data)
if output:
output.close()
return float(loss), metric
def write_prediction(self, prediction, batch, output: TextIO):
for tokens, ps, gs in zip(batch[self.config.token_key], prediction, batch['tag']):
output.write('\n'.join('\t'.join([t, p, g]) for t, p, g in zip(tokens, ps, gs)))
output.write('\n')
def predict(self, tokens: Any, batch_size: int = None, **kwargs):
if not tokens:
return []
flat = self.input_is_flat(tokens)
if flat:
tokens = [tokens]
outputs = self.predict_data(tokens, batch_size, **kwargs)
if flat:
return outputs[0]
return outputs
def input_is_flat(self, tokens):
return isinstance(tokens, list) and isinstance(tokens[0], str)
def predict_data(self, data, batch_size, sampler_builder=None, **kwargs):
samples = self.build_samples(data, **kwargs)
if not batch_size:
batch_size = self.config.get('batch_size', 32)
dataloader = self.build_dataloader(samples, batch_size, False, self.device, sampler_builder=sampler_builder,
**kwargs)
outputs = []
orders = []
vocab = self.vocabs['tag'].idx_to_token
for batch in dataloader:
out, mask = self.feed_batch(batch)
pred = self.decode_output(out, mask, batch)
outputs.extend(self.prediction_to_human(pred, vocab, batch))
orders.extend(batch[IDX])
outputs = reorder(outputs, orders)
return outputs
def build_samples(self, data: List[str], **kwargs):
return [{self.config.token_key: sent} for sent in data]
def prediction_to_human(self, pred_ids, vocab: List[str], batch):
if isinstance(pred_ids, torch.Tensor):
pred_ids = pred_ids.tolist()
sents = batch.get(f'{self.config.token_key}_')
if not sents:
sents = batch[self.config.token_key]
dict_tags: DictInterface = self.dict_tags
for each, sent in zip(pred_ids, sents):
tags = [vocab[id] for id in each[:len(sent)]]
if dict_tags:
for begin, end, label in dict_tags.tokenize(sent):
tags[begin:end] = label
yield tags
@property
def tagging_scheme(self):
tagging_scheme = self.config.tagging_scheme
if not tagging_scheme:
self.config.tagging_scheme = tagging_scheme = guess_tagging_scheme(self.vocabs.tag.idx_to_token)
if tagging_scheme == 'BIO':
warnings.warn(f'The tag scheme for {self.vocabs.tag.idx_to_token} might be IOB1 or IOB2 '
f'but we are using IOB2 by default. Please set tagging_scheme="IOB1" or tagging_scheme="BIO" '
f'to get rid of this warning.')
return tagging_scheme
@property
def dict_tags(self) -> DictInterface:
r""" A custom dictionary to override predicted tags by performing longest-prefix-matching.
Examples:
        >>> tagger.dict_tags = {'HanLP': 'state-of-the-art-tool'}  # Force 'HanLP' to be 'state-of-the-art-tool'
>>> tagger("HanLP为生产环境带来次世代最先进的多语种NLP技术。")
# HanLP/state-of-the-art-tool 为/P 生产/NN 环境/NN 带来/VV 次世代/NN 最/AD 先进/VA 的/DEC 多语种/NN NLP/NR 技术/NN 。/PU
        >>> tagger.dict_tags = {('的', '希望'): ('补语成分', '名词'), '希望': '动词'}  # Conditional matching
>>> tagger("我的希望是希望张晚霞的背影被晚霞映红。")
# 我/PN 的/补语成分 希望/名词 是/VC 希望/动词 张晚霞/NR 的/DEG 背影/NN 被/LB 晚霞/NN 映红/VV 。/PU
"""
return self.config.get('dict_tags', None)
@dict_tags.setter
def dict_tags(self,
dictionary: Union[DictInterface, Union[Dict[Union[str, Sequence[str]], Union[str, Sequence[str]]]]]):
if dictionary is not None and not isinstance(dictionary, DictInterface):
assert isinstance(dictionary, dict), f'Expected dictionary to be `dict` but got {type(dictionary)}.'
_d = dict()
for k, v in dictionary.items():
if isinstance(k, str):
k = (k,)
if isinstance(v, str):
v = (v,) * len(k)
_d[k] = v
dictionary = TupleTrieDict(_d)
self.config.dict_tags = dictionary
================================================
FILE: hanlp/components/taggers/tagger_tf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-10-25 21:49
import logging
from abc import ABC
import tensorflow as tf
from hanlp.common.keras_component import KerasComponent
from hanlp.layers.crf.crf_layer_tf import CRF, CRFLoss, CRFWrapper
from hanlp.metrics.chunking.iobes_tf import IOBES_F1_TF
class TaggerComponent(KerasComponent, ABC):
def build_metrics(self, metrics, logger: logging.Logger, **kwargs):
if metrics == 'f1':
assert hasattr(self.transform, 'tag_vocab'), 'Name your tag vocab tag_vocab in your transform ' \
'or override build_metrics'
if not self.config.get('run_eagerly', None):
logger.debug('ChunkingF1 runs only under eager mode, '
'set run_eagerly=True to remove this warning')
self.config.run_eagerly = True
return IOBES_F1_TF(self.transform.tag_vocab)
return super().build_metrics(metrics, logger, **kwargs)
def build_loss(self, loss, **kwargs):
assert self.model is not None, 'The model must be created before building the loss'
if loss == 'crf':
if isinstance(self.model, tf.keras.models.Sequential):
crf = CRF(len(self.transform.tag_vocab))
self.model.add(crf)
loss = CRFLoss(crf, self.model.dtype)
else:
self.model = CRFWrapper(self.model, len(self.transform.tag_vocab))
loss = CRFLoss(self.model.crf, self.model.dtype)
return loss
return super().build_loss(loss, **kwargs)
================================================
FILE: hanlp/components/taggers/transformers/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-29 13:57
================================================
FILE: hanlp/components/taggers/transformers/metrics_tf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-30 16:33
import tensorflow as tf
class Accuracy(tf.keras.metrics.SparseCategoricalAccuracy):
def __init__(self, name='sparse_categorical_accuracy', dtype=None, mask_value=0):
super().__init__(name, dtype)
self.mask_value = mask_value
def update_state(self, y_true, y_pred, sample_weight=None):
sample_weight = tf.not_equal(y_true, self.mask_value)
return super().update_state(y_true, y_pred, sample_weight)
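# Minimal self-check of the masked accuracy above (values are illustrative):
# label 0 is treated as padding and excluded from the average.
def _demo_masked_accuracy():
    metric = Accuracy(mask_value=0)
    y_true = tf.constant([[1, 2, 0]])  # the last position is padding
    y_pred = tf.one_hot(tf.constant([[1, 1, 0]]), 3)  # the middle position is wrong
    metric.update_state(y_true, y_pred)
    return float(metric.result())  # 0.5: one correct out of two unmasked positions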
================================================
FILE: hanlp/components/taggers/transformers/transformer_tagger.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-06-15 20:55
import logging
from typing import Union, List
import torch
from torch import nn
from torch.utils.data import DataLoader
from hanlp.common.dataset import PadSequenceDataLoader, SamplerBuilder, TransformableDataset
from hanlp.common.structure import History
from hanlp.common.transform import FieldLength, TransformList
from hanlp.common.vocab import Vocab
from hanlp.components.classifiers.transformer_classifier import TransformerComponent
from hanlp.components.taggers.tagger import Tagger
from hanlp.datasets.ner.loaders.tsv import TSVTaggingDataset
from hanlp.layers.crf.crf import CRF
from hanlp.layers.embeddings.embedding import EmbeddingDim, Embedding
from hanlp.layers.transformers.encoder import TransformerEncoder
from hanlp.transform.transformer_tokenizer import TransformerSequenceTokenizer
from hanlp.utils.time_util import CountdownTimer
from hanlp.utils.torch_util import clip_grad_norm, lengths_to_mask, filter_state_dict_safely
from hanlp_common.util import merge_locals_kwargs
# noinspection PyAbstractClass
class TransformerTaggingModel(nn.Module):
def __init__(self,
encoder: TransformerEncoder,
num_labels,
crf=False,
secondary_encoder=None,
extra_embeddings: EmbeddingDim = None) -> None:
"""
A shallow tagging model that uses a transformer as the encoder.
Args:
encoder: A pretrained transformer.
num_labels: Size of tagset.
crf: ``True`` to enable CRF.
secondary_encoder: An optional secondary encoder applied to the transformer outputs.
extra_embeddings: Extra embeddings which will be concatenated to the encoder outputs.
"""
super().__init__()
self.encoder = encoder
self.secondary_encoder = secondary_encoder
self.extra_embeddings = extra_embeddings
# noinspection PyUnresolvedReferences
feature_size = encoder.transformer.config.hidden_size
if extra_embeddings:
feature_size += extra_embeddings.get_output_dim()
self.classifier = nn.Linear(feature_size, num_labels)
self.crf = CRF(num_labels) if crf else None
def forward(self, lens: torch.LongTensor, input_ids, token_span, token_type_ids=None, batch=None):
mask = lengths_to_mask(lens)
x = self.encoder(input_ids, token_span=token_span, token_type_ids=token_type_ids)
if self.secondary_encoder:
x = self.secondary_encoder(x, mask=mask)
if self.extra_embeddings:
# noinspection PyCallingNonCallable
embed = self.extra_embeddings(batch, mask=mask)
x = torch.cat([x, embed], dim=-1)
x = self.classifier(x)
return x, mask
class TransformerTagger(TransformerComponent, Tagger):
def __init__(self, **kwargs) -> None:
"""A simple tagger using a linear layer with an optional CRF (:cite:`lafferty2001conditional`) layer for
any tagging tasks including PoS tagging and many others.
Args:
**kwargs: Not used.
"""
super().__init__(**kwargs)
self._tokenizer_transform = None
self.model: TransformerTaggingModel = None
# noinspection PyMethodOverriding
def fit_dataloader(self,
trn: DataLoader,
criterion,
optimizer,
metric,
logger: logging.Logger,
history: History,
gradient_accumulation=1,
grad_norm=None,
transformer_grad_norm=None,
teacher: Tagger = None,
kd_criterion=None,
temperature_scheduler=None,
ratio_width=None,
eval_trn=True,
**kwargs):
optimizer, scheduler = optimizer
if teacher:
scheduler, lambda_scheduler = scheduler
else:
lambda_scheduler = None
self.model.train()
timer = CountdownTimer(history.num_training_steps(len(trn), gradient_accumulation=gradient_accumulation))
total_loss = 0
for idx, batch in enumerate(trn):
out, mask = self.feed_batch(batch)
y = batch['tag_id']
loss = self.compute_loss(criterion, out, y, mask)
if gradient_accumulation and gradient_accumulation > 1:
loss /= gradient_accumulation
if teacher:
with torch.no_grad():
out_T, _ = teacher.feed_batch(batch)
# noinspection PyNoneFunctionAssignment
kd_loss = self.compute_distill_loss(kd_criterion, out, out_T, mask, temperature_scheduler)
_lambda = float(lambda_scheduler)
loss = _lambda * loss + (1 - _lambda) * kd_loss
loss.backward()
total_loss += loss.item()
if eval_trn:
prediction = self.decode_output(out, mask, batch)
self.update_metrics(metric, out, y, mask, batch, prediction)
if history.step(gradient_accumulation):
self._step(optimizer, scheduler, grad_norm, transformer_grad_norm, lambda_scheduler)
report = f'loss: {total_loss / (idx + 1):.4f} {metric if eval_trn else ""}'
timer.log(report, logger=logger, ratio_percentage=False, ratio_width=ratio_width)
del loss
del out
del mask
def _step(self, optimizer, scheduler, grad_norm, transformer_grad_norm, lambda_scheduler):
clip_grad_norm(self.model, grad_norm, self.model.encoder.transformer, transformer_grad_norm)
optimizer.step()
scheduler.step()
if lambda_scheduler:
lambda_scheduler.step()
optimizer.zero_grad()
def compute_distill_loss(self, kd_criterion, out_S, out_T, mask, temperature_scheduler):
logits_S = out_S[mask]
logits_T = out_T[mask]
temperature = temperature_scheduler(logits_S, logits_T)
return kd_criterion(logits_S, logits_T, temperature)
def build_model(self, training=True, extra_embeddings: Embedding = None, finetune=False, logger=None,
**kwargs) -> torch.nn.Module:
model = TransformerTaggingModel(
self.build_transformer(training=training),
len(self.vocabs.tag),
self.config.crf,
self.config.get('secondary_encoder', None),
extra_embeddings=extra_embeddings.module(self.vocabs) if extra_embeddings else None,
)
if finetune and self.model:
model_state = model.state_dict()
load_state = self.model.state_dict()
safe_state = filter_state_dict_safely(model_state, load_state)
missing_params = model_state.keys() - safe_state.keys()
if missing_params:
logger.info(f'The following parameters were missing from the checkpoint: '
f'{", ".join(sorted(missing_params))}.')
model.load_state_dict(safe_state, strict=False)
n = self.model.classifier.bias.size(0)
if model.classifier.bias.size(0) != n:
model.classifier.weight.data[:n, :] = self.model.classifier.weight.data[:n, :]
model.classifier.bias.data[:n] = self.model.classifier.bias.data[:n]
return model
# noinspection PyMethodOverriding
def build_dataloader(self, data, batch_size, shuffle, device, logger: logging.Logger = None,
sampler_builder: SamplerBuilder = None, gradient_accumulation=1,
extra_embeddings: Embedding = None, transform=None, max_seq_len=None, **kwargs) -> DataLoader:
if isinstance(data, TransformableDataset):
dataset = data
else:
args = dict((k, self.config.get(k, None)) for k in
['delimiter', 'max_seq_len', 'sent_delimiter', 'char_level', 'hard_constraint'])
dataset = self.build_dataset(data, **args)
if self.config.token_key is None:
self.config.token_key = next(iter(dataset[0]))
logger.info(
f'Guess [bold][blue]token_key={self.config.token_key}[/blue][/bold] according to the '
f'training dataset: [blue]{dataset}[/blue]')
if transform:
dataset.append_transform(transform)
if extra_embeddings:
dataset.append_transform(extra_embeddings.transform(self.vocabs))
dataset.append_transform(self.tokenizer_transform)
dataset.append_transform(self.last_transform())
if not isinstance(data, list):
dataset.purge_cache()
if self.vocabs.mutable:
self.build_vocabs(dataset, logger)
if isinstance(data, str) and max_seq_len:
token_key = self.config.token_key
dataset.prune(lambda x: len(x[token_key]) > max_seq_len, logger)
if sampler_builder is not None:
sampler = sampler_builder.build([len(x[f'{self.config.token_key}_input_ids']) for x in dataset], shuffle,
gradient_accumulation=gradient_accumulation if shuffle else 1)
else:
sampler = None
return PadSequenceDataLoader(dataset, batch_size, shuffle, device=device, batch_sampler=sampler)
def build_dataset(self, data, transform=None, **kwargs):
return TSVTaggingDataset(data, transform=transform, **kwargs)
def last_transform(self):
transforms = TransformList(self.vocabs, FieldLength(self.config.token_key))
return transforms
@property
def tokenizer_transform(self) -> TransformerSequenceTokenizer:
if not self._tokenizer_transform:
self._tokenizer_transform = TransformerSequenceTokenizer(self.transformer_tokenizer,
self.config.token_key,
ret_token_span=True)
return self._tokenizer_transform
def build_vocabs(self, trn, logger, **kwargs):
if 'tag' not in self.vocabs:
self.vocabs.tag = Vocab(pad_token=None, unk_token=None)
timer = CountdownTimer(len(trn))
max_seq_len = 0
token_key = self.config.token_key
for each in trn:
max_seq_len = max(max_seq_len, len(each[token_key]))
timer.log(f'Building vocab [blink][yellow]...[/yellow][/blink] (longest sequence: {max_seq_len})')
self.vocabs.tag.set_unk_as_safe_unk()
self.vocabs.lock()
self.vocabs.summary(logger)
# noinspection PyMethodOverriding
def fit(self,
trn_data,
dev_data,
save_dir,
transformer,
average_subwords=False,
word_dropout: float = 0.2,
hidden_dropout=None,
layer_dropout=0,
scalar_mix=None,
mix_embedding: int = 0,
grad_norm=5.0,
transformer_grad_norm=None,
lr=5e-5,
transformer_lr=None,
transformer_layers=None,
gradient_accumulation=1,
adam_epsilon=1e-6,
weight_decay=0,
warmup_steps=0.1,
secondary_encoder=None,
extra_embeddings: Embedding = None,
crf=False,
reduction='sum',
batch_size=32,
sampler_builder: SamplerBuilder = None,
epochs=3,
patience=5,
token_key=None,
max_seq_len=None, sent_delimiter=None, char_level=False, hard_constraint=False,
transform=None,
logger=None,
devices: Union[float, int, List[int]] = None,
**kwargs):
return super().fit(**merge_locals_kwargs(locals(), kwargs))
def feed_batch(self, batch: dict):
features = [batch[k] for k in self.tokenizer_transform.output_key]
if len(features) == 2:
input_ids, token_span = features
else:
input_ids, token_span = features[0], None
lens = batch[f'{self.config.token_key}_length']
x, mask = self.model(lens, input_ids, token_span, batch.get(f'{self.config.token_key}_token_type_ids'),
batch=batch)
return x, mask
# noinspection PyMethodOverriding
def distill(self,
teacher: str,
trn_data,
dev_data,
save_dir,
transformer: str,
batch_size=None,
temperature_scheduler='flsw',
epochs=None,
devices=None,
logger=None,
seed=None,
**kwargs):
return super().distill(**merge_locals_kwargs(locals(), kwargs))
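# A generic stand-in for the kd_criterion consumed by compute_distill_loss
# above (HanLP ships its own criteria; this sketch only illustrates the
# contract): temperature-scaled soft cross-entropy between teacher and
# student logits.
def _demo_kd_criterion(logits_S: torch.Tensor, logits_T: torch.Tensor, temperature: float) -> torch.Tensor:
    p_T = torch.softmax(logits_T / temperature, dim=-1)
    log_p_S = torch.log_softmax(logits_S / temperature, dim=-1)
    return -(p_T * log_p_S).sum(dim=-1).mean()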
================================================
FILE: hanlp/components/taggers/transformers/transformer_tagger_tf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-29 13:55
import math
import tensorflow as tf
from hanlp.common.transform_tf import Transform
from hanlp.components.taggers.tagger_tf import TaggerComponent
from hanlp.components.taggers.transformers.transformer_transform_tf import TransformerTransform
from hanlp.layers.transformers.loader_tf import build_transformer
from hanlp.layers.transformers.utils_tf import build_adamw_optimizer
from hanlp.losses.sparse_categorical_crossentropy import SparseCategoricalCrossentropyOverBatchFirstDim
from hanlp_common.util import merge_locals_kwargs
class TransformerTaggingModel(tf.keras.Model):
def __init__(self, transformer: tf.keras.Model, *args, **kwargs):
super().__init__(*args, **kwargs)
self.transformer = transformer
def call(self, inputs, training=None, mask=None):
return super().call(inputs, training, mask)
class TransformerTaggerTF(TaggerComponent):
def __init__(self, transform: TransformerTransform = None) -> None:
if transform is None:
transform = TransformerTransform()
super().__init__(transform)
self.transform: TransformerTransform = transform
def build_model(self, transformer, max_seq_length, **kwargs) -> tf.keras.Model:
model, tokenizer = build_transformer(transformer, max_seq_length, len(self.transform.tag_vocab), tagging=True)
self.transform.tokenizer = tokenizer
return model
def fit(self, trn_data, dev_data, save_dir,
transformer,
optimizer='adamw',
learning_rate=5e-5,
weight_decay_rate=0,
epsilon=1e-8,
clipnorm=1.0,
warmup_steps_ratio=0,
use_amp=False,
max_seq_length=128,
batch_size=32,
epochs=3,
metrics='accuracy',
run_eagerly=False,
logger=None,
verbose=True,
**kwargs):
return super().fit(**merge_locals_kwargs(locals(), kwargs))
# noinspection PyMethodOverriding
def build_optimizer(self, optimizer, learning_rate, epsilon, weight_decay_rate, clipnorm, use_amp, train_steps,
warmup_steps, **kwargs):
if optimizer == 'adamw':
opt = build_adamw_optimizer(self.config, learning_rate, epsilon, clipnorm, train_steps, use_amp,
warmup_steps, weight_decay_rate)
else:
opt = super().build_optimizer(optimizer)
return opt
def build_vocab(self, trn_data, logger):
train_examples = super().build_vocab(trn_data, logger)
warmup_steps_per_epoch = math.ceil(train_examples * self.config.warmup_steps_ratio / self.config.batch_size)
self.config.warmup_steps = warmup_steps_per_epoch * self.config.epochs
return train_examples
def train_loop(self, trn_data, dev_data, epochs, num_examples, train_steps_per_epoch, dev_steps, model, optimizer,
loss, metrics, callbacks, logger, **kwargs):
history = self.model.fit(trn_data, epochs=epochs, steps_per_epoch=train_steps_per_epoch,
validation_data=dev_data,
callbacks=callbacks,
validation_steps=dev_steps,
# mask out padding labels
# class_weight=dict(
# (i, 0 if i == 0 else 1) for i in range(len(self.transform.tag_vocab)))
) # type:tf.keras.callbacks.History
return history
def build_loss(self, loss, **kwargs):
if not loss:
return SparseCategoricalCrossentropyOverBatchFirstDim()
return super().build_loss(loss, **kwargs)
def load_transform(self, save_dir) -> Transform:
super().load_transform(save_dir)
self.transform.tokenizer = build_transformer(self.config.transformer, self.config.max_seq_length,
len(self.transform.tag_vocab), tagging=True, tokenizer_only=True)
return self.transform
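# Worked example of the warmup arithmetic in build_vocab above: with 10000
# training examples, warmup_steps_ratio=0.1, batch_size=32 and epochs=3, each
# epoch warms up for ceil(10000 * 0.1 / 32) = 32 steps, i.e. 96 in total.
def _demo_warmup_steps(train_examples=10000, warmup_steps_ratio=0.1, batch_size=32, epochs=3):
    warmup_steps_per_epoch = math.ceil(train_examples * warmup_steps_ratio / batch_size)
    return warmup_steps_per_epoch * epochs  # 96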
================================================
FILE: hanlp/components/taggers/transformers/transformer_transform_tf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-29 15:14
from typing import Union, Tuple, List, Iterable
import tensorflow as tf
from hanlp_common.structure import SerializableDict
from hanlp.common.transform_tf import Transform
from hanlp.common.vocab_tf import VocabTF
from hanlp.layers.transformers.utils_tf import convert_examples_to_features
from hanlp.transform.tsv_tf import TsvTaggingFormat
class TransformerTransform(TsvTaggingFormat, Transform):
def __init__(self,
tokenizer=None,
config: SerializableDict = None,
map_x=False, map_y=False, **kwargs) -> None:
super().__init__(config, map_x, map_y, **kwargs)
self._tokenizer = tokenizer
self.tag_vocab: VocabTF = None
self.special_token_ids = None
self.pad = '[PAD]'
self.unk = '[UNK]'
@property
def max_seq_length(self):
# -2 for special tokens [CLS] and [SEP]
return self.config.get('max_seq_length', 128) - 2
@property
def tokenizer(self):
return self._tokenizer
@tokenizer.setter
def tokenizer(self, tokenizer):
self._tokenizer = tokenizer
vocab = tokenizer._vocab if hasattr(tokenizer, '_vocab') else tokenizer.vocab
if self.pad not in vocab:
# English ALBERT uses '<pad>' instead of '[PAD]'
self.pad = '<pad>'
if self.unk not in vocab:
self.unk = '<unk>'
self.special_token_ids = tf.constant([vocab[token] for token in [self.pad, '[CLS]', '[SEP]']],
dtype=tf.int32)
def fit(self, trn_path: str, **kwargs) -> int:
self.tag_vocab = VocabTF(unk_token=None)
num_samples = 0
for words, tags in self.file_to_inputs(trn_path, gold=True):
num_samples += 1
self.tag_vocab.update(tags)
return num_samples
def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]:
max_seq_length = self.config.get('max_seq_length', 128)
types = (tf.int32, tf.int32, tf.int32), tf.int32
# (input_ids, input_mask, segment_ids), label_ids
shapes = ([max_seq_length], [max_seq_length], [max_seq_length]), [None]
values = (0, 0, 0), self.tag_vocab.pad_idx
return types, shapes, values
def lock_vocabs(self):
super().lock_vocabs()
def inputs_to_samples(self, inputs, gold=False):
max_seq_length = self.config.get('max_seq_length', 128)
tokenizer = self._tokenizer
xlnet = False
roberta = False
pad_token = self.pad
cls_token = '[CLS]'
sep_token = '[SEP]'
unk_token = self.unk
pad_label_idx = self.tag_vocab.pad_idx
pad_token = tokenizer.convert_tokens_to_ids([pad_token])[0]
for sample in inputs:
if gold:
words, tags = sample
else:
words, tags = sample, [self.tag_vocab.idx_to_token[1]] * len(sample)
input_ids, input_mask, segment_ids, label_ids = convert_examples_to_features(words,
max_seq_length, tokenizer,
tags,
self.tag_vocab.token_to_idx,
cls_token_at_end=xlnet,
# xlnet has a cls token at the end
cls_token=cls_token,
cls_token_segment_id=2 if xlnet else 0,
sep_token=sep_token,
sep_token_extra=roberta,
# roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
pad_on_left=xlnet,
# pad on the left for xlnet
pad_token_id=pad_token,
pad_token_segment_id=4 if xlnet else 0,
pad_token_label_id=pad_label_idx,
unk_token=unk_token)
if None in input_ids:
print(input_ids)
if None in input_mask:
print(input_mask)
if None in segment_ids:
print(segment_ids)
yield (input_ids, input_mask, segment_ids), label_ids
def x_to_idx(self, x) -> Union[tf.Tensor, Tuple]:
raise NotImplementedError('transformers has its own tagger, no need to convert idx for x')
def y_to_idx(self, y) -> tf.Tensor:
raise NotImplementedError('transformers has its own tagger, no need to convert idx for y')
def input_is_single_sample(self, input: Union[List[str], List[List[str]]]) -> bool:
return isinstance(input[0], str)
def Y_to_outputs(self, Y: Union[tf.Tensor, Tuple[tf.Tensor]], gold=False, X=None, inputs=None, batch=None,
**kwargs) -> Iterable:
assert batch is not None, 'Need the batch to know actual length of Y'
label_mask = batch[1]
if self.tag_vocab.pad_token:
Y[:, :, self.tag_vocab.pad_idx] = float('-inf')
Y = tf.argmax(Y, axis=-1)
Y = Y[label_mask > 0]
tags = [self.tag_vocab.idx_to_token[tid] for tid in Y]
offset = 0
for words in inputs:
yield tags[offset:offset + len(words)]
offset += len(words)
================================================
FILE: hanlp/components/taggers/util.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-06-01 00:31
from typing import List, Tuple
from hanlp.utils.span_util import allowed_transitions
def guess_tagging_scheme(labels: List[str]) -> str:
tagset = set(y.split('-')[0] for y in labels)
for scheme in "BIO", "BIOUL", "BMES", 'IOBES':
if tagset == set(list(scheme)):
return scheme
def guess_allowed_transitions(labels) -> List[Tuple[int, int]]:
scheme = guess_tagging_scheme(labels)
if not scheme:
return None
if scheme == 'IOBES':
scheme = 'BIOUL'
labels = [y.replace('E-', 'L-').replace('S-', 'U-') for y in labels]
return allowed_transitions(scheme, dict(enumerate(labels)))
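# Quick self-check of the guesser above: label prefixes must form exactly one
# of the known schemes, otherwise None is returned.
def _demo_guess_tagging_scheme():
    assert guess_tagging_scheme(['B-NP', 'M-NP', 'E-NP', 'S-NP']) == 'BMES'
    assert guess_tagging_scheme(['B-PER', 'I-PER', 'O']) == 'BIO'
    assert guess_tagging_scheme(['FOO', 'BAR']) is None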
================================================
FILE: hanlp/components/tokenizers/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-08-11 02:48
================================================
FILE: hanlp/components/tokenizers/multi_criteria_cws_transformer.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-10-21 19:55
from typing import List, Union
from hanlp.common.dataset import SamplerBuilder
from hanlp.components.taggers.transformers.transformer_tagger import TransformerTagger
from hanlp.components.tokenizers.transformer import TransformerTaggingTokenizer
from hanlp.datasets.tokenization.loaders.multi_criteria_cws.mcws_dataset import MultiCriteriaTextTokenizingDataset, append_criteria_token
import functools
from hanlp.metrics.f1 import F1
from hanlp.metrics.mtl import MetricDict
from hanlp_common.util import merge_locals_kwargs
class MultiCriteriaTransformerTaggingTokenizer(TransformerTaggingTokenizer):
def __init__(self, **kwargs) -> None:
r"""Transformer based implementation of "Effective Neural Solution for Multi-Criteria Word Segmentation"
(:cite:`he2019effective`). It uses an artificial token ``[unused_i]`` instead of ``[SEP]`` in the input_ids to
mark the i-th segmentation criteria.
Args:
**kwargs: Not used.
"""
super().__init__(**kwargs)
def build_dataset(self, data, **kwargs):
return MultiCriteriaTextTokenizingDataset(data, **kwargs)
def on_config_ready(self, **kwargs):
super().on_config_ready(**kwargs)
# noinspection PyAttributeOutsideInit
if 'criteria_token_map' not in self.config:
unused_tokens = [f'[unused{i}]' for i in range(1, 100)]
ids = self.transformer_tokenizer.convert_tokens_to_ids(unused_tokens)
self.config.unused_tokens = dict((x, ids[i]) for i, x in enumerate(unused_tokens) if
ids[i] != self.transformer_tokenizer.unk_token_id)
self.config.criteria_token_map = dict()
def last_transform(self):
transforms = super().last_transform()
transforms.append(functools.partial(append_criteria_token,
criteria_tokens=self.config.unused_tokens,
criteria_token_map=self.config.criteria_token_map))
return transforms
def build_vocabs(self, trn, logger, **kwargs):
super().build_vocabs(trn, logger, **kwargs)
logger.info(f'criteria[{len(self.config.criteria_token_map)}] = {list(self.config.criteria_token_map)}')
def feed_batch(self, batch: dict):
x, mask = TransformerTagger.feed_batch(self, batch)
# strip [CLS], [SEP] and [unused_i]
return x[:, 1:-2, :], mask
def build_samples(self, data: List[str], criteria=None, **kwargs):
if not criteria:
criteria = next(iter(self.config.criteria_token_map.keys()))
else:
assert criteria in self.config.criteria_token_map, \
f'Unsupported criteria {criteria}. Choose one from {list(self.config.criteria_token_map.keys())}'
samples = super().build_samples(data, **kwargs)
for sample in samples:
sample['criteria'] = criteria
return samples
def build_metric(self, **kwargs):
metrics = MetricDict()
for criteria in self.config.criteria_token_map:
metrics[criteria] = F1()
return metrics
def update_metrics(self, metric, logits, y, mask, batch, prediction):
for p, g, c in zip(prediction, self.tag_to_span(batch['tag'], batch), batch['criteria']):
pred = set(p)
gold = set(g)
metric[c](pred, gold)
def fit(self, trn_data, dev_data, save_dir, transformer, average_subwords=False, word_dropout: float = 0.2,
hidden_dropout=None, layer_dropout=0, scalar_mix=None, mix_embedding: int = 0, grad_norm=5.0,
transformer_grad_norm=None, lr=5e-5,
transformer_lr=None, transformer_layers=None, gradient_accumulation=1,
adam_epsilon=1e-8, weight_decay=0, warmup_steps=0.1, crf=False, reduction='sum',
batch_size=32, sampler_builder: SamplerBuilder = None, epochs=30, patience=5, token_key=None,
tagging_scheme='BMES', delimiter=None,
max_seq_len=None, sent_delimiter=None, char_level=False, hard_constraint=False, transform=None, logger=None,
devices: Union[float, int, List[int]] = None, **kwargs):
return super().fit(**merge_locals_kwargs(locals(), kwargs))
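# Hedged usage sketch: the save path below is a placeholder. A trained model
# exposes its criteria via config.criteria_token_map, and build_samples above
# rejects any criteria outside of it.
def _demo_multi_criteria(save_dir='path/to/a/trained/model'):
    tok = MultiCriteriaTransformerTaggingTokenizer()
    tok.load(save_dir)
    criteria = next(iter(tok.config.criteria_token_map))
    return tok('商品和服务', criteria=criteria)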
================================================
FILE: hanlp/components/tokenizers/tok.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-06-12 13:08
from typing import Any, Callable
from hanlp.components.taggers.rnn_tagger import RNNTagger
from hanlp.datasets.tokenization.loaders.chunking_dataset import ChunkingDataset
from hanlp.metrics.chunking.chunking_f1 import ChunkingF1
from hanlp.utils.span_util import bmes_to_words
from hanlp_common.util import merge_locals_kwargs
class RNNTokenizer(RNNTagger):
def predict(self, sentence: Any, batch_size: int = None, **kwargs):
flat = isinstance(sentence, str)
if flat:
sentence = [sentence]
for i, s in enumerate(sentence):
sentence[i] = list(s)
outputs = RNNTagger.predict(self, sentence, batch_size, **kwargs)
if flat:
return outputs[0]
return outputs
def predict_data(self, data, batch_size, **kwargs):
tags = RNNTagger.predict_data(self, data, batch_size, **kwargs)
words = [bmes_to_words(c, t) for c, t in zip(data, tags)]
return words
def build_dataset(self, data, transform=None):
dataset = ChunkingDataset(data)
if 'transform' in self.config:
dataset.append_transform(self.config.transform)
if transform:
dataset.append_transform(transform)
return dataset
def build_metric(self, **kwargs):
return ChunkingF1()
def update_metrics(self, metric, logits, y, mask, batch):
pred = self.decode_output(logits, mask, batch)
pred = self._id_to_tags(pred)
gold = batch['tag']
metric(pred, gold)
def fit(self, trn_data, dev_data, save_dir, batch_size=50, epochs=100, embed=100, rnn_input=None, rnn_hidden=256,
drop=0.5, lr=0.001, patience=10, crf=True, optimizer='adam', token_key='char', tagging_scheme=None,
anneal_factor: float = 0.5, anneal_patience=2, devices=None, logger=None,
verbose=True, transform: Callable = None, **kwargs):
return super().fit(**merge_locals_kwargs(locals(), kwargs))
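# Minimal illustration of the BMES decoding used by predict_data above: chars
# and their BMES tags are merged back into words.
def _demo_bmes_to_words():
    chars = list('商品和服务')
    tags = ['B', 'E', 'S', 'B', 'E']
    return bmes_to_words(chars, tags)  # ['商品', '和', '服务']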
================================================
FILE: hanlp/components/tokenizers/tok_tf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-10-27 14:30
import logging
from typing import Union, Any, List, Tuple, Iterable
import tensorflow as tf
from hanlp.common.keras_component import KerasComponent
from hanlp.components.taggers.ngram_conv.ngram_conv_tagger import NgramTransform, NgramConvTaggerTF
from hanlp.components.taggers.rnn_tagger_tf import RNNTaggerTF
from hanlp.components.taggers.transformers.transformer_tagger_tf import TransformerTaggerTF
from hanlp.components.taggers.transformers.transformer_transform_tf import TransformerTransform
from hanlp.losses.sparse_categorical_crossentropy import SparseCategoricalCrossentropyOverBatchFirstDim
from hanlp.metrics.chunking.bmes_tf import BMES_F1_TF
from hanlp.transform.tsv_tf import TSVTaggingTransform
from hanlp.transform.txt_tf import TxtFormat, TxtBMESFormat, extract_ngram_features_and_tags, bmes_to_words
from hanlp_common.util import merge_locals_kwargs
class BMESTokenizerTF(KerasComponent):
def build_metrics(self, metrics, logger: logging.Logger, **kwargs):
if metrics == 'f1':
self.config.run_eagerly = True
return BMES_F1_TF(self.transform.tag_vocab)
return super().build_metrics(metrics, logger, **kwargs)
class NgramConvTokenizerTransform(TxtFormat, NgramTransform):
def inputs_to_samples(self, inputs, gold=False):
if self.input_is_single_sample(inputs):
inputs = [inputs]
for sent in inputs:
# bigram_only = false
yield extract_ngram_features_and_tags(sent, False, self.config.window_size, gold)
def input_is_single_sample(self, input: Union[List[str], List[List[str]]]) -> bool:
if not input:
return True
return isinstance(input, str)
def Y_to_outputs(self, Y: Union[tf.Tensor, Tuple[tf.Tensor]], gold=False, inputs=None, X=None,
**kwargs) -> Iterable:
yield from TxtBMESFormat.Y_to_tokens(self, self.tag_vocab, Y, gold, inputs)
class NgramConvTokenizerTF(BMESTokenizerTF, NgramConvTaggerTF):
def __init__(self) -> None:
super().__init__(NgramConvTokenizerTransform())
def fit(self, trn_data: Any, dev_data: Any, save_dir: str, word_embed: Union[str, int, dict] = 200,
ngram_embed: Union[str, int, dict] = 50, embedding_trainable=True, window_size=4, kernel_size=3,
filters=(200, 200, 200, 200, 200), dropout_embed=0.2, dropout_hidden=0.2, weight_norm=True,
loss: Union[tf.keras.losses.Loss, str] = None,
optimizer: Union[str, tf.keras.optimizers.Optimizer] = 'adam', metrics='f1', batch_size=100,
epochs=100, logger=None, verbose=True, **kwargs):
return super().fit(**merge_locals_kwargs(locals(), kwargs))
def evaluate_output_to_file(self, batch, outputs, out):
for x, y_pred in zip(self.transform.X_to_inputs(batch[0]),
self.transform.Y_to_outputs(outputs, gold=False)):
out.write(self.transform.input_truth_output_to_str(x, None, y_pred))
out.write('\n')
def build_loss(self, loss, **kwargs):
if loss is None:
return SparseCategoricalCrossentropyOverBatchFirstDim()
return super().build_loss(loss, **kwargs)
class TransformerTokenizerTransform(TxtBMESFormat, TransformerTransform):
def inputs_to_samples(self, inputs, gold=False):
yield from TransformerTransform.inputs_to_samples(self, TxtBMESFormat.inputs_to_samples(self, inputs, gold),
True)
def Y_to_tokens(self, tag_vocab, Y, gold, inputs):
if not gold:
Y = tf.argmax(Y, axis=2)
for text, ys in zip(inputs, Y):
tags = [tag_vocab.idx_to_token[int(y)] for y in ys[1:len(text) + 1]]
yield bmes_to_words(list(text), tags)
class TransformerTokenizerTF(BMESTokenizerTF, TransformerTaggerTF):
def __init__(self, transform: TransformerTokenizerTransform = None) -> None:
if transform is None:
transform = TransformerTokenizerTransform()
super().__init__(transform)
class RNNTokenizerTransform(TxtBMESFormat, TSVTaggingTransform):
pass
class RNNTokenizerTF(BMESTokenizerTF, RNNTaggerTF):
def __init__(self, transform: RNNTokenizerTransform = None) -> None:
if not transform:
transform = RNNTokenizerTransform()
super().__init__(transform)
def fit(self, trn_data: str, dev_data: str = None, save_dir: str = None, embeddings=100, embedding_trainable=False,
rnn_input_dropout=0.2, rnn_units=100, rnn_output_dropout=0.2, epochs=20, lower=False, max_seq_len=50,
logger=None, loss: Union[tf.keras.losses.Loss, str] = None,
optimizer: Union[str, tf.keras.optimizers.Optimizer] = 'adam', metrics='f1', batch_size=32,
dev_batch_size=32, lr_decay_per_epoch=None, verbose=True, **kwargs):
return super().fit(**merge_locals_kwargs(locals(), kwargs))
================================================
FILE: hanlp/components/tokenizers/transformer.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-08-11 02:48
import functools
from typing import TextIO, Union, List, Dict, Any, Set
import torch
from hanlp.common.dataset import SamplerBuilder
from hanlp.common.transform import TransformList
from hanlp.components.taggers.transformers.transformer_tagger import TransformerTagger
from hanlp.datasets.tokenization.loaders.txt import TextTokenizingDataset, generate_tags_for_subtokens
from hanlp.metrics.f1 import F1
from hanlp.transform.transformer_tokenizer import TransformerSequenceTokenizer
from hanlp.utils.span_util import bmes_to_spans
from hanlp.utils.string_util import possible_tokenization
from hanlp_common.util import merge_locals_kwargs
from hanlp_trie import DictInterface, TrieDict
from hanlp_trie.dictionary import TupleTrieDict
class TransformerTaggingTokenizer(TransformerTagger):
def __init__(self, **kwargs) -> None:
""" A tokenizer using transformer tagger for span prediction. It features with 2 high performance dictionaries
to handle edge cases in real application.
- ``dict_force``: High priority dictionary performs longest-prefix-matching on input text which takes higher
priority over model predictions.
- ``dict_combine``: Low priority dictionary performs longest-prefix-matching on model predictions then
combines them.
.. Note:: For algorithm beginners, longest-prefix-matching is the prerequisite to understand what dictionary can
do and what it can't do. The tutorial in `this book `_ can be very helpful.
It also supports outputting the span of each token by setting ``config.output_spans = True``.
Args:
**kwargs: Predefined config.
"""
super().__init__(**kwargs)
@property
def dict_force(self) -> DictInterface:
r""" The high priority dictionary which perform longest-prefix-matching on inputs to split them into two subsets:
1. spans containing no keywords, which are then fed into tokenizer for further tokenization.
2. keywords, which will be outputed without furthur tokenization.
.. Caution::
Longest-prefix-matching **NEVER** guarantee the presence of any keywords. Abuse of
``dict_force`` can lead to low quality results. For more details, refer to
`this book `_.
Examples:
>>> tok.dict_force = {'和服', '服务行业'} # Force '和服' and '服务行业' by longest-prefix-matching
>>> tok("商品和服务行业")
['商品', '和服', '务行业']
>>> tok.dict_force = {'和服务': ['和', '服务']} # Force '和服务' to be tokenized as ['和', '服务']
>>> tok("商品和服务行业")
['商品', '和', '服务', '行业']
"""
return self.config.get('dict_force', None)
@dict_force.setter
def dict_force(self, dictionary: Union[DictInterface, Union[Dict[str, Any], Set[str]]]):
if dictionary is not None and not isinstance(dictionary, DictInterface):
dictionary = TrieDict(dictionary)
self.config.dict_force = dictionary
self.tokenizer_transform.dict = dictionary
@property
def dict_combine(self) -> DictInterface:
""" The low priority dictionary which perform longest-prefix-matching on model predictions and combing them.
Examples:
>>> tok.dict_combine = {'和服', '服务行业'}
>>> tok("商品和服务行业") # '和服' is not in the original results ['商品', '和', '服务']. '服务', '行业' are combined to '服务行业'
['商品', '和', '服务行业']
"""
return self.config.get('dict_combine', None)
@dict_combine.setter
def dict_combine(self, dictionary: Union[DictInterface, Union[Dict[str, Any], Set[str]]]):
if dictionary is not None and not isinstance(dictionary, DictInterface):
if all(isinstance(k, str) for k in dictionary):
dictionary = TrieDict(dictionary)
else:
_d = set()
for k in dictionary:
if isinstance(k, str):
_d.update(possible_tokenization(k))
else:
_d.add(k)
dictionary = TupleTrieDict(_d)
self.config.dict_combine = dictionary
def build_metric(self, **kwargs):
return F1()
# noinspection PyMethodOverriding
def update_metrics(self, metric, logits, y, mask, batch, prediction):
for p, g in zip(prediction, self.tag_to_span(batch['tag'], batch)):
pred = set(p)
gold = set(g)
metric(pred, gold)
def decode_output(self, logits, mask, batch, model=None):
output = super().decode_output(logits, mask, batch, model)
if isinstance(output, torch.Tensor):
output = output.tolist()
prediction = self.id_to_tags(output, [len(x) for x in batch['token']])
return self.tag_to_span(prediction, batch)
def tag_to_span(self, batch_tags, batch: dict):
spans = []
if 'custom_words' in batch:
if self.config.tagging_scheme == 'BMES':
S = 'S'
M = 'M'
E = 'E'
else:
S = 'B'
M = 'I'
E = 'I'
for tags, custom_words in zip(batch_tags, batch['custom_words']):
# [batch['raw_token'][0][x[0]:x[1]] for x in subwords]
if custom_words:
for start, end, label in custom_words:
if end - start == 1:
tags[start] = S
else:
tags[start] = 'B'
tags[end - 1] = E
for i in range(start + 1, end - 1):
tags[i] = M
if end < len(tags):
tags[end] = 'B'
if 'token_subtoken_offsets_group' not in batch: # only check prediction on raw text for now
# Check cases where a single char gets split into multiple subtokens, e.g., ‥ -> . + .
for tags, subtoken_offsets in zip(batch_tags, batch['token_subtoken_offsets']):
offset = -1 # BERT produces 'ᄒ', '##ᅡ', '##ᆫ' for '한' and they share the same span
prev_tag = None
for i, (tag, (b, e)) in enumerate(zip(tags, subtoken_offsets)):
if b < offset:
if prev_tag == 'S':
tags[i - 1] = 'B'
elif prev_tag == 'E':
tags[i - 1] = 'M'
tags[i] = tag = 'M'
offset = e
prev_tag = tag
for tags in batch_tags:
spans.append(bmes_to_spans(tags))
return spans
def write_prediction(self, prediction, batch, output: TextIO):
batch_tokens = self.spans_to_tokens(prediction, batch)
for tokens in batch_tokens:
output.write(' '.join(tokens))
output.write('\n')
@property
def tokenizer_transform(self):
if not self._tokenizer_transform:
self._tokenizer_transform = TransformerSequenceTokenizer(self.transformer_tokenizer,
self.config.token_key,
ret_subtokens=True,
ret_subtokens_group=True,
ret_token_span=False,
dict_force=self.dict_force)
return self._tokenizer_transform
def spans_to_tokens(self, spans, batch, rebuild_span=False):
batch_tokens = []
dict_combine = self.dict_combine
raw_text = batch.get('token_', None) # Use raw text to rebuild the token according to its offset
for b, (spans_per_sent, sub_tokens) in enumerate(zip(spans, batch[self.config.token_key])):
if raw_text: # This will restore iPhone X as a whole
text = raw_text[b]
offsets = batch['token_subtoken_offsets'][b]
tokens = [text[offsets[b][0]:offsets[e - 1][-1]] for b, e in spans_per_sent]
else: # This will merge iPhone X into iPhoneX
tokens = [''.join(sub_tokens[span[0]:span[1]]) for span in spans_per_sent]
if dict_combine:
buffer = []
offset = 0
delta = 0
for start, end, label in dict_combine.tokenize(tokens):
if offset < start:
buffer.extend(tokens[offset:start])
if raw_text:
# noinspection PyUnboundLocalVariable
combined = text[offsets[spans_per_sent[start - delta][0]][0]:
offsets[spans_per_sent[end - delta - 1][1] - 1][1]]
else:
combined = ''.join(tokens[start:end])
buffer.append(combined)
offset = end
if rebuild_span:
start -= delta
end -= delta
combined_span = (spans_per_sent[start][0], spans_per_sent[end - 1][1])
del spans_per_sent[start:end]
delta += end - start - 1
spans_per_sent.insert(start, combined_span)
if offset < len(tokens):
buffer.extend(tokens[offset:])
tokens = buffer
batch_tokens.append(tokens)
return batch_tokens
def generate_prediction_filename(self, tst_data, save_dir):
return super().generate_prediction_filename(tst_data.replace('.tsv', '.txt'), save_dir)
def prediction_to_human(self, pred, vocab, batch, rebuild_span=False):
output_spans = self.config.get('output_spans', None)
tokens = self.spans_to_tokens(pred, batch, rebuild_span or output_spans)
if output_spans:
subtoken_spans = batch['token_subtoken_offsets']
results = []
for toks, offs, subs in zip(tokens, pred, subtoken_spans):
r = []
results.append(r)
for t, (b, e) in zip(toks, offs):
r.append([t, subs[b][0], subs[e - 1][-1]])
return results
return tokens
def input_is_flat(self, tokens):
return isinstance(tokens, str)
def build_dataset(self, data, **kwargs):
return TextTokenizingDataset(data, **kwargs)
def last_transform(self):
return TransformList(functools.partial(generate_tags_for_subtokens, tagging_scheme=self.config.tagging_scheme),
super().last_transform())
def fit(self, trn_data, dev_data, save_dir, transformer, average_subwords=False, word_dropout: float = 0.2,
hidden_dropout=None, layer_dropout=0, scalar_mix=None, grad_norm=5.0,
transformer_grad_norm=None, lr=5e-5, eval_trn=True,
transformer_lr=None, transformer_layers=None, gradient_accumulation=1,
adam_epsilon=1e-8, weight_decay=0, warmup_steps=0.1, crf=False, reduction='sum',
batch_size=32, sampler_builder: SamplerBuilder = None, epochs=30, patience=5, token_key=None,
tagging_scheme='BMES', delimiter=None,
max_seq_len=None, sent_delimiter=None, char_level=False, hard_constraint=False, transform=None, logger=None,
devices: Union[float, int, List[int]] = None, **kwargs):
"""
Args:
trn_data: Training set.
dev_data: Development set.
save_dir: The directory to save trained component.
transformer: An identifier of a pre-trained transformer.
average_subwords: ``True`` to average subword representations.
word_dropout: Dropout rate to randomly replace a subword with MASK.
hidden_dropout: Dropout rate applied to hidden states.
layer_dropout: Randomly zero out hidden states of a transformer layer.
scalar_mix: Layer attention.
grad_norm: Gradient norm for clipping.
transformer_grad_norm: Gradient norm for clipping transformer gradient.
lr: Learning rate for decoder.
transformer_lr: Learning rate for the encoder.
transformer_layers: The number of bottom layers to use.
gradient_accumulation: Number of batches per update.
adam_epsilon: The epsilon to use in Adam.
weight_decay: The weight decay to use.
warmup_steps: The number of warmup steps.
crf: ``True`` to enable CRF (:cite:`lafferty2001conditional`).
reduction: The loss reduction used in aggregating losses.
batch_size: The number of samples in a batch.
sampler_builder: The builder to build sampler, which will override batch_size.
epochs: The number of epochs to train.
patience: The number of patience epochs before early stopping.
token_key: The key to tokens in dataset.
tagging_scheme: Either ``BMES`` or ``BI``.
delimiter: Delimiter between tokens used to split a line in the corpus.
max_seq_len: Sentences longer than ``max_seq_len`` will be split into shorter ones if possible.
sent_delimiter: Delimiter between sentences, like period or comma, which indicates a long sentence can
be split here.
char_level: Whether the sequence length is measured at char level.
hard_constraint: Whether to enforce hard length constraint on sentences. If there is no ``sent_delimiter``
in a sentence, it will be split at a token anyway.
transform: An optional transform to be applied to samples. Usually a character normalization transform is
passed in.
devices: Devices this component will live on.
logger: Any :class:`logging.Logger` instance.
seed: Random seed to reproduce this training.
**kwargs: Not used.
Returns:
Best metrics on dev set.
"""
return super().fit(**merge_locals_kwargs(locals(), kwargs))
def feed_batch(self, batch: dict):
x, mask = super().feed_batch(batch)
return x[:, 1:-1, :], mask
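# Hedged usage sketch of dict_force and output_spans documented above; the
# pretrained identifier is only an example and the outputs are illustrative.
def _demo_tok_dictionaries():
    import hanlp
    tok = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH)
    tok.dict_force = {'和服务': ['和', '服务']}  # override model predictions
    tok.config.output_spans = True  # each token comes back with char offsets
    return tok('商品和服务行业')  # e.g. [['商品', 0, 2], ['和', 2, 3], ['服务', 3, 5], ['行业', 5, 7]]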
================================================
FILE: hanlp/datasets/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-06-13 18:15
================================================
FILE: hanlp/datasets/classification/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-11-10 11:49
================================================
FILE: hanlp/datasets/classification/sentiment.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-30 21:03
_ERNIE_TASK_DATA = 'https://ernie.bj.bcebos.com/task_data_zh.tgz#'
CHNSENTICORP_ERNIE_TRAIN = _ERNIE_TASK_DATA + 'chnsenticorp/train.tsv'
CHNSENTICORP_ERNIE_DEV = _ERNIE_TASK_DATA + 'chnsenticorp/dev.tsv'
CHNSENTICORP_ERNIE_TEST = _ERNIE_TASK_DATA + 'chnsenticorp/test.tsv'
================================================
FILE: hanlp/datasets/coref/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-07-04 13:39
================================================
FILE: hanlp/datasets/coref/loaders/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-12-28 19:03
================================================
FILE: hanlp/datasets/coref/loaders/conll12coref.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-07-04 15:33
import collections
import os
from typing import Union, List, Callable, DefaultDict, Tuple, Optional, Iterator
from hanlp.datasets.srl.loaders.ontonotes_loader import Ontonotes as _Ontonotes, OntonotesSentence, \
make_coref_instance
from hanlp.common.dataset import TransformableDataset
from hanlp.utils.io_util import TimingFileIterator
class Ontonotes(_Ontonotes):
def dataset_document_iterator(self, file_path: str) -> Iterator[List[OntonotesSentence]]:
"""An iterator over CONLL formatted files which yields documents, regardless
of the number of document annotations in a particular file. This is useful
for conll data which has been preprocessed, such as the preprocessing which
takes place for the 2012 CONLL Coreference Resolution task.
Args:
file_path: str:
Returns:
"""
open_file = TimingFileIterator(file_path)
conll_rows = []
document: List[OntonotesSentence] = []
for line in open_file:
open_file.log(f'Loading {os.path.basename(file_path)}')
line = line.strip()
if line != "" and not line.startswith("#"):
# Non-empty line. Collect the annotation.
conll_rows.append(line)
else:
if conll_rows:
document.append(self._conll_rows_to_sentence(conll_rows))
conll_rows = []
if line.startswith("#end document"):
yield document
document = []
open_file.erase()
if document:
# Collect any stragglers or files which might not
# have the '#end document' format for the end of the file.
yield document
class CONLL12CorefDataset(TransformableDataset):
def __init__(self, data: Union[str, List], transform: Union[Callable, List] = None, cache=None,
max_span_width=10, max_sentences=None, remove_singleton_clusters=False) -> None:
self.remove_singleton_clusters = remove_singleton_clusters
self.max_sentences = max_sentences
self.max_span_width = max_span_width
super().__init__(data, transform, cache)
def load_file(self, filepath: str):
ontonotes_reader = Ontonotes()
for sentences in ontonotes_reader.dataset_document_iterator(filepath):
clusters: DefaultDict[int, List[Tuple[int, int]]] = collections.defaultdict(list)
total_tokens = 0
for sentence in sentences:
for typed_span in sentence.coref_spans:
# Coref annotations are on a _per sentence_
# basis, so we need to adjust them to be relative
# to the length of the document.
span_id, (start, end) = typed_span
clusters[span_id].append((start + total_tokens, end + total_tokens))
total_tokens += len(sentence.words)
yield self.text_to_instance([s.words for s in sentences], list(clusters.values()))
def text_to_instance(
self, # type: ignore
sentences: List[List[str]],
gold_clusters: Optional[List[List[Tuple[int, int]]]] = None,
) -> dict:
return make_coref_instance(
sentences,
self.max_span_width,
gold_clusters,
self.max_sentences,
self.remove_singleton_clusters,
)
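# Tiny standalone illustration of the offset arithmetic in load_file above:
# sentence-local coref spans are shifted by the running token count so that
# they index into the document as a whole.
def _demo_document_offsets():
    sentences = [['我', '来', '了'], ['他', '也', '来', '了']]
    per_sentence_spans = [[(0, 0)], [(0, 0)]]  # (start, end) within each sentence
    total_tokens, doc_spans = 0, []
    for words, spans in zip(sentences, per_sentence_spans):
        doc_spans.extend((s + total_tokens, e + total_tokens) for s, e in spans)
        total_tokens += len(words)
    return doc_spans  # [(0, 0), (3, 3)]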
================================================
FILE: hanlp/datasets/eos/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-07-26 18:11
================================================
FILE: hanlp/datasets/eos/eos.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-07-26 18:12
import itertools
from collections import Counter
from typing import Union, List, Callable
from hanlp.common.dataset import TransformableDataset
from hanlp.utils.io_util import TimingFileIterator
from hanlp.utils.log_util import cprint
from hanlp.utils.string_util import ispunct
class SentenceBoundaryDetectionDataset(TransformableDataset):
def __init__(self,
data: Union[str, List],
transform: Union[Callable, List] = None,
cache=None,
append_after_sentence=None,
eos_chars=None,
eos_char_min_freq=200,
eos_char_is_punct=True,
window_size=5,
**kwargs,
) -> None:
"""Dataset for sentence boundary detection (eos).
Args:
data: The local or remote path to a dataset, or a list of samples where each sample is a dict.
transform: Predefined transform(s).
cache: ``True`` to enable caching, so that transforms won't be called twice.
append_after_sentence: A :class:`str` to insert at the tail of each sentence. For example, English always
has a space between sentences.
eos_chars: Punctuation characters at the tail of sentences. If ``None``, they will be built from training samples.
eos_char_min_freq: Minimal frequency to keep an eos char.
eos_char_is_punct: Limit eos chars to punctuation.
window_size: Window size to extract ngram features.
kwargs: Not used.
"""
self.eos_char_is_punct = eos_char_is_punct
self.append_after_sentence = append_after_sentence
self.window_size = window_size
self.eos_chars = eos_chars
self.eos_char_min_freq = eos_char_min_freq
super().__init__(data, transform, cache)
def load_file(self, filepath: str):
"""Load eos corpus.
Args:
filepath: Path to the corpus.
.. highlight:: bash
.. code-block:: bash
$ head -n 2 ctb8.txt
中国经济简讯
新华社北京十月二十九日电中国经济简讯
"""
f = TimingFileIterator(filepath)
sents = []
eos_offsets = []
offset = 0
for line in f:
if not line.strip():
continue
line = line.rstrip('\n')
eos_offsets.append(offset + len(line.rstrip()) - 1)
offset += len(line)
if self.append_after_sentence:
line += self.append_after_sentence
offset += len(self.append_after_sentence)
f.log(line)
sents.append(line)
f.erase()
corpus = list(itertools.chain.from_iterable(sents))
if self.eos_chars:
if not isinstance(self.eos_chars, set):
self.eos_chars = set(self.eos_chars)
else:
eos_chars = Counter()
for i in eos_offsets:
eos_chars[corpus[i]] += 1
self.eos_chars = set(k for (k, v) in eos_chars.most_common() if
v >= self.eos_char_min_freq and (not self.eos_char_is_punct or ispunct(k)))
cprint(f'eos_chars = [yellow]{self.eos_chars}[/yellow]')
eos_index = 0
eos_offsets = [i for i in eos_offsets if corpus[i] in self.eos_chars]
window_size = self.window_size
for i, c in enumerate(corpus):
if c in self.eos_chars:
window = corpus[i - window_size: i + window_size + 1]
label_id = 1. if eos_offsets[eos_index] == i else 0.
if label_id > 0:
eos_index += 1
yield {'char': window, 'label_id': label_id}
assert eos_index == len(eos_offsets), f'{eos_index} != {len(eos_offsets)}'
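# Standalone sketch of the windowing above (toy text, assumed parameters):
# every candidate eos char yields a window of up to 2 * window_size + 1 chars
# together with a binary label (every '。' here truly ends a sentence).
def _demo_eos_windows(text='你好。再见。', eos_chars=('。',), window_size=2):
    for i, c in enumerate(text):
        if c in eos_chars:
            window = list(text[max(0, i - window_size): i + window_size + 1])
            yield {'char': window, 'label_id': 1.}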
================================================
FILE: hanlp/datasets/eos/loaders/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-12-28 19:03
================================================
FILE: hanlp/datasets/eos/loaders/nn_eos.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-24 22:51
_SETIMES2_EN_HR_SENTENCES_HOME = 'https://schweter.eu/cloud/nn_eos/SETIMES2.en-hr.sentences.tar.xz'
SETIMES2_EN_HR_HR_SENTENCES_TRAIN = _SETIMES2_EN_HR_SENTENCES_HOME + '#SETIMES2.en-hr.hr.sentences.train'
'''Training set of SETimes corpus.'''
SETIMES2_EN_HR_HR_SENTENCES_DEV = _SETIMES2_EN_HR_SENTENCES_HOME + '#SETIMES2.en-hr.hr.sentences.dev'
'''Dev set of SETimes corpus.'''
SETIMES2_EN_HR_HR_SENTENCES_TEST = _SETIMES2_EN_HR_SENTENCES_HOME + '#SETIMES2.en-hr.hr.sentences.test'
'''Test set of SETimes corpus.'''
_EUROPARL_V7_DE_EN_EN_SENTENCES_HOME = 'http://schweter.eu/cloud/nn_eos/europarl-v7.de-en.en.sentences.tar.xz'
EUROPARL_V7_DE_EN_EN_SENTENCES_TRAIN = _EUROPARL_V7_DE_EN_EN_SENTENCES_HOME + '#europarl-v7.de-en.en.sentences.train'
'''Training set of Europarl corpus (:cite:`koehn2005europarl`).'''
EUROPARL_V7_DE_EN_EN_SENTENCES_DEV = _EUROPARL_V7_DE_EN_EN_SENTENCES_HOME + '#europarl-v7.de-en.en.sentences.dev'
'''Dev set of Europarl corpus (:cite:`koehn2005europarl`).'''
EUROPARL_V7_DE_EN_EN_SENTENCES_TEST = _EUROPARL_V7_DE_EN_EN_SENTENCES_HOME + '#europarl-v7.de-en.en.sentences.test'
'''Test set of Europarl corpus (:cite:`koehn2005europarl`).'''
================================================
FILE: hanlp/datasets/lm/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-06-05 21:41
_PTB_HOME = 'http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz#'
PTB_TOKEN_TRAIN = _PTB_HOME + 'data/ptb.train.txt'
PTB_TOKEN_DEV = _PTB_HOME + 'data/ptb.valid.txt'
PTB_TOKEN_TEST = _PTB_HOME + 'data/ptb.test.txt'
PTB_CHAR_TRAIN = _PTB_HOME + 'data/ptb.char.train.txt'
PTB_CHAR_DEV = _PTB_HOME + 'data/ptb.char.valid.txt'
PTB_CHAR_TEST = _PTB_HOME + 'data/ptb.char.test.txt'
================================================
FILE: hanlp/datasets/lm/loaders/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-12-28 19:04
================================================
FILE: hanlp/datasets/lm/loaders/lm_dataset.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-06-05 21:42
import os
from typing import Union, Callable, List
import hanlp_common.io
import torch
from hanlp.common.dataset import TransformSequentialDataset
from hanlp.common.transform import ToChar, WhitespaceTokenizer, AppendEOS, FieldToIndex
from hanlp.common.vocab import Vocab
from hanlp.utils.io_util import file_cache, get_resource, TimingFileIterator
class LanguageModelDataset(TransformSequentialDataset):
def __init__(self,
data: str,
batch_size,
seq_len,
tokenizer='char',
eos='\n',
strip=True,
vocab=None,
cache=False,
transform: Union[Callable, List] = None) -> None:
self.cache = cache
self.eos = eos
self.strip = strip
super().__init__(transform)
if isinstance(tokenizer, str):
available_tokenizers = {
'char': ToChar('text', 'token'),
'whitespace': WhitespaceTokenizer('text', 'token')
}
assert tokenizer in available_tokenizers, f'{tokenizer} not supported, available options: {available_tokenizers.keys()} '
self.append_transform(available_tokenizers[tokenizer])
if vocab is None:
vocab = Vocab()
self.training = True
else:
self.training = vocab.mutable
self.append_transform(AppendEOS('token', eos=eos))
self.append_transform(FieldToIndex('token', vocab))
self.batch_size = batch_size
data = get_resource(data)
self.data = data
self.num_tokens = None
self.load_file(data)
self._fp = None
if isinstance(seq_len, int):
self.seq_len = lambda: seq_len
else:
self.seq_len = seq_len
@property
def vocab(self):
return self.transform[-1].vocab
@property
def vocab_path(self):
return os.path.splitext(self.data)[0] + '.vocab.json'
def load_file(self, filepath):
cache, valid = file_cache(filepath, not self.cache)
if not valid or (self.vocab.mutable and not os.path.isfile(self.vocab_path)):
with open(cache, 'wb') as out:
tokens, lines = 0, 0
f = TimingFileIterator(filepath)
for line in f:
if self.strip:
line = line.strip()
if not line:
continue
sample = {'text': line}
sample = self.transform_sample(sample, inplace=True)
for id in sample['token_id']:
out.write((id).to_bytes(4, 'little'))
tokens += len(sample['token_id'])
lines += 1
f.log(f'{tokens // 1000000}M tokens, {lines // 1000000}M lines\n'
f'{sample["token"][:10]}')
f.erase()
if self.vocab.mutable:
self.vocab.lock()
self.vocab.save_json(self.vocab_path)
self.num_tokens = tokens
else:
self.num_tokens = int(os.path.getsize(self.filecache) / 4)
if self.vocab.mutable:
self.vocab.load_json(self.vocab_path)
def __iter__(self):
batch_size = self.batch_size
max_seq_len = self.max_seq_len
i = 0
safety = 2 if self.training else 1
with open(self.filecache, 'rb') as fp:
while i < max_seq_len - safety:
seq_len = self.seq_len()
seq_len = min(seq_len, max_seq_len - 1 - i)
data = []
for j in range(batch_size):
data.append(self._read_chunk(fp, max_seq_len * j + i, seq_len + 1))
data = torch.LongTensor(data)
data.transpose_(0, 1)
data, targets = data[:seq_len, :], data[1:, :]
yield data, targets.contiguous().view(-1)
i += seq_len
def estimate_num_batches(self, seq_len=None):
if not seq_len:
seq_len = self.seq_len()
return self.max_seq_len // seq_len
@property
def max_seq_len(self):
max_seq_len = self.num_tokens // self.batch_size
return max_seq_len
@staticmethod
def _read_chunk(fp, offset, length):
data = []
fp.seek(offset * 4)
for i in range(length):
id = int.from_bytes(fp.read(4), 'little')
data.append(id)
return data
def _debug_load_cache(self):
with open(self.filecache, 'rb') as src:
ids = []
for i in range(self.num_tokens):
id = int.from_bytes(src.read(4), 'little')
ids.append(id)
return torch.LongTensor(ids)
@property
def filecache(self):
return file_cache(self.data)[0]
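# Illustration of the binary cache format written by load_file and consumed by
# _read_chunk above: token ids are stored as consecutive 4-byte little-endian
# integers, so the i-th id lives at byte offset i * 4.
def _demo_cache_format():
    import io
    fp = io.BytesIO(b''.join(i.to_bytes(4, 'little') for i in (7, 42, 9)))
    fp.seek(1 * 4)
    return int.from_bytes(fp.read(4), 'little')  # 42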
================================================
FILE: hanlp/datasets/lu/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-12-28 19:08
================================================
FILE: hanlp/datasets/lu/glue.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-11-10 11:47
from hanlp.common.dataset import TableDataset
STANFORD_SENTIMENT_TREEBANK_2_TRAIN = 'http://file.hankcs.com/corpus/SST2.zip#train.tsv'
STANFORD_SENTIMENT_TREEBANK_2_DEV = 'http://file.hankcs.com/corpus/SST2.zip#dev.tsv'
STANFORD_SENTIMENT_TREEBANK_2_TEST = 'http://file.hankcs.com/corpus/SST2.zip#test.tsv'
MICROSOFT_RESEARCH_PARAPHRASE_CORPUS_TRAIN = 'http://file.hankcs.com/corpus/mrpc.zip#train.tsv'
MICROSOFT_RESEARCH_PARAPHRASE_CORPUS_DEV = 'http://file.hankcs.com/corpus/mrpc.zip#dev.tsv'
MICROSOFT_RESEARCH_PARAPHRASE_CORPUS_TEST = 'http://file.hankcs.com/corpus/mrpc.zip#test.tsv'
class SST2Dataset(TableDataset):
pass
def main():
dataset = SST2Dataset(STANFORD_SENTIMENT_TREEBANK_2_TEST)
print(dataset)
if __name__ == '__main__':
main()
================================================
FILE: hanlp/datasets/ner/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-06 15:32
================================================
FILE: hanlp/datasets/ner/conll03.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-06 15:31
CONLL03_EN_TRAIN = 'https://file.hankcs.com/corpus/conll03_en_iobes.zip#eng.train.tsv'
'''Training set of CoNLL03 (:cite:`tjong-kim-sang-de-meulder-2003-introduction`)'''
CONLL03_EN_DEV = 'https://file.hankcs.com/corpus/conll03_en_iobes.zip#eng.dev.tsv'
'''Dev set of CoNLL03 (:cite:`tjong-kim-sang-de-meulder-2003-introduction`)'''
CONLL03_EN_TEST = 'https://file.hankcs.com/corpus/conll03_en_iobes.zip#eng.test.tsv'
'''Test set of CoNLL03 (:cite:`tjong-kim-sang-de-meulder-2003-introduction`)'''
================================================
FILE: hanlp/datasets/ner/loaders/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-12-28 19:04
================================================
FILE: hanlp/datasets/ner/loaders/json_ner.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-07-21 16:26
import json
import os
from typing import Union, List, Callable, Dict
from hanlp_common.constant import NULL
import hanlp.utils.span_util
from hanlp.common.dataset import TransformableDataset
from hanlp.utils.io_util import TimingFileIterator, read_tsv_as_sents
class JsonNERDataset(TransformableDataset):
def __init__(self, data: Union[str, List], transform: Union[Callable, List] = None, cache=None,
generate_idx=None, doc_level_offset=True, tagset=None) -> None:
"""A dataset for ``.jsonlines`` format NER corpora.
Args:
data: The local or remote path to a dataset, or a list of samples where each sample is a dict.
transform: Predefined transform(s).
cache: ``True`` to enable caching, so that transforms won't be called twice.
            generate_idx: Create a :const:`~hanlp_common.constants.IDX` field for each sample to store its order in the
                dataset. Useful for prediction when samples are re-ordered by a sampler.
            doc_level_offset: ``True`` to indicate that the offsets in ``jsonlines`` are document-level.
            tagset: An optional tagset; entities whose types fall outside of it are pruned from the dataset.
"""
self.tagset = tagset
self.doc_level_offset = doc_level_offset
super().__init__(data, transform, cache, generate_idx)
def load_file(self, filepath: str):
"""Load ``.jsonlines`` NER corpus. Samples of this corpus can be found using the following scripts.
.. highlight:: python
.. code-block:: python
import json
from hanlp_common.document import Document
from hanlp.datasets.srl.ontonotes5.chinese import ONTONOTES5_CONLL12_CHINESE_DEV
from hanlp.utils.io_util import get_resource
with open(get_resource(ONTONOTES5_CONLL12_CHINESE_DEV)) as src:
for line in src:
doc = json.loads(line)
print(Document(doc))
break
Args:
filepath: ``.jsonlines`` NER corpus.
"""
filename = os.path.basename(filepath)
reader = TimingFileIterator(filepath)
num_docs, num_sentences = 0, 0
for line in reader:
line = line.strip()
if not line:
continue
doc = json.loads(line)
num_docs += 1
num_tokens_in_doc = 0
for sentence, ner in zip(doc['sentences'], doc['ner']):
if self.doc_level_offset:
ner = [(x[0] - num_tokens_in_doc, x[1] - num_tokens_in_doc, x[2]) for x in ner]
else:
ner = [(x[0], x[1], x[2]) for x in ner]
if self.tagset:
ner = [x for x in ner if x[2] in self.tagset]
if isinstance(self.tagset, dict):
ner = [(x[0], x[1], self.tagset[x[2]]) for x in ner]
deduplicated_srl = []
be_set = set()
for b, e, l in ner:
be = (b, e)
if be in be_set:
continue
be_set.add(be)
deduplicated_srl.append((b, e, l))
yield {
'token': sentence,
'ner': deduplicated_srl
}
num_sentences += 1
num_tokens_in_doc += len(sentence)
reader.log(
f'{filename} {num_docs} documents, {num_sentences} sentences [blink][yellow]...[/yellow][/blink]')
reader.erase()
def convert_conll03_to_json(file_path):
dataset = []
num_docs = [0]
def new_doc():
doc_key = num_docs[0]
num_docs[0] += 1
return {
'doc_key': doc_key,
'sentences': [],
'ner': [],
}
doc = new_doc()
offset = 0
for cells in read_tsv_as_sents(file_path):
if cells[0][0] == '-DOCSTART-' and doc['ner']:
dataset.append(doc)
doc = new_doc()
offset = 0
sentence = [x[0] for x in cells]
ner = [x[-1] for x in cells]
ner = hanlp.utils.span_util.iobes_tags_to_spans(ner)
adjusted_ner = []
for label, (span_start, span_end) in ner:
adjusted_ner.append([span_start + offset, span_end + offset, label])
doc['sentences'].append(sentence)
doc['ner'].append(adjusted_ner)
offset += len(sentence)
if doc['ner']:
dataset.append(doc)
output_path = os.path.splitext(file_path)[0] + '.json'
with open(output_path, 'w') as out:
for each in dataset:
json.dump(each, out)
out.write('\n')
def unpack_ner(sample: dict) -> dict:
ner: list = sample.get('ner', None)
if ner is not None:
if ner:
sample['begin_offset'], sample['end_offset'], sample['label'] = zip(*ner)
else:
# It's necessary to create a null label when there is no NER in the sentence for the sake of padding.
sample['begin_offset'], sample['end_offset'], sample['label'] = [0], [0], [NULL]
return sample
def prune_ner_tagset(sample: dict, tagset: Union[set, Dict[str, str]]):
if 'tag' in sample:
pruned_tag = []
for tag in sample['tag']:
cells = tag.split('-', 1)
if len(cells) == 2:
role, ner_type = cells
if ner_type in tagset:
if isinstance(tagset, dict):
tag = role + '-' + tagset[ner_type]
else:
tag = 'O'
pruned_tag.append(tag)
sample['tag'] = pruned_tag
return sample
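# A minimal illustration (not part of the original file) of the two helpers above,
# using hypothetical sample dicts.
def _demo_unpack_and_prune():
    # unpack_ner turns span triples into parallel offset/label fields ...
    sample = unpack_ner({'ner': [(0, 1, 'PER'), (3, 3, 'LOC')]})
    assert sample['begin_offset'] == (0, 3) and sample['label'] == ('PER', 'LOC')
    # ... and pads with a NULL label when a sentence has no entities.
    empty = unpack_ner({'ner': []})
    assert empty['label'] == [NULL]
    # prune_ner_tagset keeps tags whose type is in the tagset and rewrites the rest to 'O'.
    tagged = prune_ner_tagset({'tag': ['B-PER', 'I-PER', 'S-GPE', 'O']}, {'PER'})
    assert tagged['tag'] == ['B-PER', 'I-PER', 'O', 'O']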
================================================
FILE: hanlp/datasets/ner/loaders/tsv.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-24 23:09
from typing import Union, List, Callable
from hanlp.common.dataset import TransformableDataset
from hanlp.utils.io_util import get_resource, generate_words_tags_from_tsv
from hanlp.utils.string_util import split_long_sentence_into
class TSVTaggingDataset(TransformableDataset):
def __init__(self,
data: Union[str, List],
transform: Union[Callable, List] = None,
cache=None,
generate_idx=None,
max_seq_len=None,
sent_delimiter=None,
char_level=False,
hard_constraint=False,
**kwargs
) -> None:
"""
Args:
data: The local or remote path to a dataset, or a list of samples where each sample is a dict.
transform: Predefined transform(s).
cache: ``True`` to enable caching, so that transforms won't be called twice.
            generate_idx: Create a :const:`~hanlp_common.constants.IDX` field for each sample to store its order in the
                dataset. Useful for prediction when samples are re-ordered by a sampler.
max_seq_len: Sentences longer than ``max_seq_len`` will be split into shorter ones if possible.
sent_delimiter: Delimiter between sentences, like period or comma, which indicates a long sentence can
be split here.
char_level: Whether the sequence length is measured at char level, which is never the case for
lemmatization.
hard_constraint: Whether to enforce hard length constraint on sentences. If there is no ``sent_delimiter``
in a sentence, it will be split at a token anyway.
kwargs: Not used.
"""
self.char_level = char_level
self.hard_constraint = hard_constraint
self.sent_delimiter = sent_delimiter
self.max_seq_len = max_seq_len
super().__init__(data, transform, cache, generate_idx)
def load_file(self, filepath):
"""Load a ``.tsv`` file. A ``.tsv`` file for tagging is defined as a tab separated text file, where non-empty
lines have two columns for token and tag respectively, empty lines mark the end of sentences.
Args:
filepath: Path to a ``.tsv`` tagging file.
.. highlight:: bash
.. code-block:: bash
$ head eng.train.tsv
-DOCSTART- O
EU S-ORG
rejects O
German S-MISC
call O
to O
boycott O
British S-MISC
lamb O
"""
filepath = get_resource(filepath)
# idx = 0
for words, tags in generate_words_tags_from_tsv(filepath, lower=False):
# idx += 1
# if idx % 1000 == 0:
# print(f'\rRead instances {idx // 1000}k', end='')
if self.max_seq_len:
start = 0
for short_sents in split_long_sentence_into(words, self.max_seq_len, self.sent_delimiter,
char_level=self.char_level,
hard_constraint=self.hard_constraint):
end = start + len(short_sents)
yield {'token': short_sents, 'tag': tags[start:end]}
start = end
else:
yield {'token': words, 'tag': tags}
# print('\r', end='')
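# A minimal usage sketch (illustration only, not part of the original file). The
# in-memory sample below is a hypothetical stand-in for a real .tsv path such as
# CONLL03_EN_TRAIN declared in hanlp/datasets/ner/conll03.py, which this class
# loads the same way once downloaded.
def _demo_tsv_dataset():
    dataset = TSVTaggingDataset([{'token': ['EU', 'rejects'], 'tag': ['S-ORG', 'O']}])
    print(dataset[0]['token'], dataset[0]['tag'])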
================================================
FILE: hanlp/datasets/ner/msra.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 23:13
_MSRA_NER_HOME = 'http://file.hankcs.com/corpus/msra_ner.zip'
_MSRA_NER_TOKEN_LEVEL_HOME = 'http://file.hankcs.com/corpus/msra_ner_token_level.zip'
MSRA_NER_CHAR_LEVEL_TRAIN = f'{_MSRA_NER_HOME}#train.tsv'
'''Training set of MSRA (:cite:`levow-2006-third`) in character level.'''
MSRA_NER_CHAR_LEVEL_DEV = f'{_MSRA_NER_HOME}#dev.tsv'
'''Dev set of MSRA (:cite:`levow-2006-third`) in character level.'''
MSRA_NER_CHAR_LEVEL_TEST = f'{_MSRA_NER_HOME}#test.tsv'
'''Test set of MSRA (:cite:`levow-2006-third`) in character level.'''
MSRA_NER_TOKEN_LEVEL_IOBES_TRAIN = f'{_MSRA_NER_TOKEN_LEVEL_HOME}#word_level.train.tsv'
'''Training set of MSRA (:cite:`levow-2006-third`) in token level.'''
MSRA_NER_TOKEN_LEVEL_IOBES_DEV = f'{_MSRA_NER_TOKEN_LEVEL_HOME}#word_level.dev.tsv'
'''Dev set of MSRA (:cite:`levow-2006-third`) in token level.'''
MSRA_NER_TOKEN_LEVEL_IOBES_TEST = f'{_MSRA_NER_TOKEN_LEVEL_HOME}#word_level.test.tsv'
'''Test set of MSRA (:cite:`levow-2006-third`) in token level.'''
MSRA_NER_TOKEN_LEVEL_SHORT_IOBES_TRAIN = f'{_MSRA_NER_TOKEN_LEVEL_HOME}#word_level.train.short.tsv'
'''Training set of shortened (<= 128 tokens) MSRA (:cite:`levow-2006-third`) in token level.'''
MSRA_NER_TOKEN_LEVEL_SHORT_IOBES_DEV = f'{_MSRA_NER_TOKEN_LEVEL_HOME}#word_level.dev.short.tsv'
'''Dev set of shortened (<= 128 tokens) MSRA (:cite:`levow-2006-third`) in token level.'''
MSRA_NER_TOKEN_LEVEL_SHORT_IOBES_TEST = f'{_MSRA_NER_TOKEN_LEVEL_HOME}#word_level.test.short.tsv'
'''Test set of shortened (<= 128 tokens) MSRA (:cite:`levow-2006-third`) in token level.'''
MSRA_NER_TOKEN_LEVEL_SHORT_JSON_TRAIN = f'{_MSRA_NER_TOKEN_LEVEL_HOME}#word_level.train.short.jsonlines'
'''Training set of shortened (<= 128 tokens) MSRA (:cite:`levow-2006-third`) in token level and jsonlines format.'''
MSRA_NER_TOKEN_LEVEL_SHORT_JSON_DEV = f'{_MSRA_NER_TOKEN_LEVEL_HOME}#word_level.dev.short.jsonlines'
'''Dev set of shortened (<= 128 tokens) MSRA (:cite:`levow-2006-third`) in token level and jsonlines format.'''
MSRA_NER_TOKEN_LEVEL_SHORT_JSON_TEST = f'{_MSRA_NER_TOKEN_LEVEL_HOME}#word_level.test.short.jsonlines'
'''Test set of shortened (<= 128 tokens) MSRA (:cite:`levow-2006-third`) in token level and jsonlines format.'''
================================================
FILE: hanlp/datasets/ner/resume.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-06-08 12:10
from hanlp.common.dataset import TransformableDataset
from hanlp.utils.io_util import get_resource, generate_words_tags_from_tsv
_RESUME_NER_HOME = 'https://github.com/jiesutd/LatticeLSTM/archive/master.zip#'
RESUME_NER_TRAIN = _RESUME_NER_HOME + 'ResumeNER/train.char.bmes'
'''Training set of Resume in char level.'''
RESUME_NER_DEV = _RESUME_NER_HOME + 'ResumeNER/dev.char.bmes'
'''Dev set of Resume in char level.'''
RESUME_NER_TEST = _RESUME_NER_HOME + 'ResumeNER/test.char.bmes'
'''Test set of Resume in char level.'''
================================================
FILE: hanlp/datasets/ner/weibo.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-06-03 23:33
from hanlp.common.dataset import TransformableDataset
from hanlp.utils.io_util import get_resource, generate_words_tags_from_tsv
_WEIBO_NER_HOME = 'https://github.com/hltcoe/golden-horse/archive/master.zip#data/'
WEIBO_NER_TRAIN = _WEIBO_NER_HOME + 'weiboNER_2nd_conll.train'
'''Training set of Weibo in char level.'''
WEIBO_NER_DEV = _WEIBO_NER_HOME + 'weiboNER_2nd_conll.dev'
'''Dev set of Weibo in char level.'''
WEIBO_NER_TEST = _WEIBO_NER_HOME + 'weiboNER_2nd_conll.test'
'''Test set of Weibo in char level.'''
================================================
FILE: hanlp/datasets/parsing/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 00:51
================================================
FILE: hanlp/datasets/parsing/amr.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-08-18 17:47
from collections import defaultdict
from copy import copy
from typing import List
import numpy as np
import torch
from hanlp_common.constant import CLS
from hanlp.common.dataset import TransformableDataset, PadSequenceDataLoader
from hanlp.common.transform import VocabDict
from hanlp.common.vocab import VocabWithFrequency
from hanlp.components.amr.amr_parser.amrio import AMRIO
from hanlp.components.amr.amr_parser.data import END, DUM, list_to_tensor, lists_of_string_to_tensor, NIL, REL
from hanlp.components.amr.amr_parser.transformer import SelfAttentionMask
from hanlp.transform.transformer_tokenizer import TransformerSequenceTokenizer
from hanlp_common.util import merge_list_of_dict
class AbstractMeaningRepresentationDataset(TransformableDataset):
def load_file(self, filepath: str):
for tok, lem, pos, ner, amr in AMRIO.read(filepath):
yield {'token': tok, 'lemma': lem, 'pos': pos, 'ner': ner, 'amr': amr}
def generate_oracle(sample: dict):
amr = sample.get('amr', None)
if amr:
concept, edge, _ = amr.root_centered_sort()
sample['concept'] = concept
sample['edge'] = edge
return sample
def chars_for_tok(sample: dict, max_string_len=20):
token = sample['token']
chars = []
for each in token:
each = each[:max_string_len]
chars.append([CLS] + list(each) + [END])
sample['word_char'] = chars
return sample
def append_bos(sample: dict):
for key in ['token', 'lemma', 'pos', 'ner']:
if key in sample:
sample[key] = [CLS] + sample[key]
return sample
def get_concepts(sample: dict, vocab: VocabWithFrequency = None, rel_vocab: VocabWithFrequency = None):
lem, tok = sample['lemma'], sample['token']
cp_seq, mp_seq = [], []
new_tokens = set()
for le, to in zip(lem, tok):
cp_seq.append(le + '_')
mp_seq.append(le)
for cp, mp in zip(cp_seq, mp_seq):
if vocab.get_idx(cp) == vocab.unk_idx:
new_tokens.add(cp)
if vocab.get_idx(mp) == vocab.unk_idx:
new_tokens.add(mp)
nxt = len(vocab)
token2idx, idx2token = dict(), dict()
if rel_vocab:
new_tokens = rel_vocab.idx_to_token + sorted(new_tokens)
else:
new_tokens = sorted(new_tokens)
for x in new_tokens:
token2idx[x] = nxt
idx2token[nxt] = x
nxt += 1
for k, v in zip(['cp_seq', 'mp_seq', 'token2idx', 'idx2token'], [cp_seq, mp_seq, token2idx, idx2token]):
sample[k] = v
return sample
def batchify(data, vocabs: VocabDict, unk_rate=0., device=None, squeeze=False,
tokenizer: TransformerSequenceTokenizer = None, shuffle_sibling=True,
levi_graph=False, extra_arc=False, bart=False):
rel_vocab: VocabWithFrequency = vocabs.rel
_tok = list_to_tensor(data['token'], vocabs['token'], unk_rate=unk_rate) if 'token' in vocabs else None
_lem = list_to_tensor(data['lemma'], vocabs['lemma'], unk_rate=unk_rate)
_pos = list_to_tensor(data['pos'], vocabs['pos'], unk_rate=unk_rate) if 'pos' in vocabs else None
_ner = list_to_tensor(data['ner'], vocabs['ner'], unk_rate=unk_rate) if 'ner' in vocabs else None
_word_char = lists_of_string_to_tensor(data['token'], vocabs['word_char']) if 'word_char' in vocabs else None
local_token2idx = data['token2idx']
local_idx2token = data['idx2token']
_cp_seq = list_to_tensor(data['cp_seq'], vocabs['predictable_concept'], local_token2idx)
_mp_seq = list_to_tensor(data['mp_seq'], vocabs['predictable_concept'], local_token2idx)
ret = copy(data)
if 'amr' in data:
concept, edge = [], []
for amr in data['amr']:
if levi_graph == 'kahn':
concept_i, edge_i = amr.to_levi(rel_vocab.get_frequency, shuffle=shuffle_sibling)
else:
concept_i, edge_i, _ = amr.root_centered_sort(rel_vocab.get_frequency, shuffle=shuffle_sibling)
concept.append(concept_i)
edge.append(edge_i)
if levi_graph is True:
concept_with_rel, edge_with_rel = levi_amr(concept, edge, extra_arc=extra_arc)
concept = concept_with_rel
edge = edge_with_rel
augmented_concept = [[DUM] + x + [END] for x in concept]
_concept_in = list_to_tensor(augmented_concept, vocabs.get('concept_and_rel', vocabs['concept']),
unk_rate=unk_rate)[:-1]
_concept_char_in = lists_of_string_to_tensor(augmented_concept, vocabs['concept_char'])[:-1]
_concept_out = list_to_tensor(augmented_concept, vocabs['predictable_concept'], local_token2idx)[1:]
out_conc_len, bsz = _concept_out.shape
_rel = np.full((1 + out_conc_len, bsz, out_conc_len), rel_vocab.pad_idx)
        # v: [DUM, concept_0, ..., concept_l, ..., concept_{n-1}, END]  u: [DUM, concept_0, ..., concept_l, ..., concept_{n-1}]
for bidx, (x, y) in enumerate(zip(edge, concept)):
for l, _ in enumerate(y):
if l > 0:
# l=1 => pos=l+1=2
_rel[l + 1, bidx, 1:l + 1] = rel_vocab.get_idx(NIL)
for v, u, r in x:
if levi_graph:
r = 1
else:
r = rel_vocab.get_idx(r)
                    assert v > u, 'Invalid topological order'
_rel[v + 1, bidx, u + 1] = r
ret.update(
{'concept_in': _concept_in, 'concept_char_in': _concept_char_in, 'concept_out': _concept_out, 'rel': _rel})
else:
augmented_concept = None
token_length = ret.get('token_length', None)
if token_length is not None and not isinstance(token_length, torch.Tensor):
ret['token_length'] = torch.tensor(token_length, dtype=torch.long, device=device if (
isinstance(device, torch.device) or device >= 0) else 'cpu:0')
ret.update({'lem': _lem, 'tok': _tok, 'pos': _pos, 'ner': _ner, 'word_char': _word_char,
'copy_seq': np.stack([_cp_seq, _mp_seq], -1), 'local_token2idx': local_token2idx,
'local_idx2token': local_idx2token})
if squeeze:
token_field = make_batch_for_squeeze(data, augmented_concept, tokenizer, device, ret)
else:
token_field = 'token'
subtoken_to_tensor(token_field, ret)
if bart:
make_batch_for_bart(augmented_concept, ret, tokenizer, device)
move_dict_to_device(ret, device)
return ret
def make_batch_for_bart(augmented_concept, ret, tokenizer, device, training=True):
token_field = 'concept'
tokenizer = TransformerSequenceTokenizer(tokenizer.tokenizer, token_field, cls_is_bos=True, sep_is_eos=None)
encodings = [tokenizer({token_field: x[:-1] if training else x}) for x in augmented_concept]
ret.update(merge_list_of_dict(encodings))
decoder_mask = []
max_seq_len = len(max(ret['concept_input_ids'], key=len))
last_concept_offset = []
for spans, concepts in zip(ret['concept_token_span'], augmented_concept):
mask = ~SelfAttentionMask.get_mask(max_seq_len, device, ret_parameter=False)
for group in spans:
for i in range(len(group)):
for j in range(i + 1, len(group)):
mask[group[i], group[j]] = True
decoder_mask.append(mask)
last_concept_offset.append(len(concepts) - 1)
ret['decoder_mask'] = torch.stack(decoder_mask)
if not training:
ret['last_concept_offset'] = torch.tensor(last_concept_offset, device=device, dtype=torch.long)
subtoken_to_tensor(token_field, ret)
def levi_amr(concept, edge, extra_arc=False):
concept_with_rel = []
edge_with_rel = []
for bidx, (edge_i, concept_i) in enumerate(zip(edge, concept)):
concept_i, edge_i = linearize(concept_i, edge_i, NIL, prefix=REL, extra_arc=extra_arc)
        # This is an undirected graph, so we can safely reverse the edges
edge_i = [tuple(reversed(sorted(x[:2]))) + x[2:] for x in edge_i]
concept_with_rel.append(concept_i)
edge_with_rel.append(edge_i)
return concept_with_rel, edge_with_rel
def move_dict_to_device(ret, device):
if device == -1:
device = 'cpu:0'
for k, v in ret.items():
if isinstance(v, np.ndarray):
ret[k] = torch.tensor(v, device=device).contiguous()
elif isinstance(v, torch.Tensor):
ret[k] = v.to(device).contiguous()
def subtoken_to_tensor(token_field, ret):
token_input_ids = PadSequenceDataLoader.pad_data(ret[f'{token_field}_input_ids'], 0, torch.long)
token_token_span = PadSequenceDataLoader.pad_data(ret[f'{token_field}_token_span'], 0, torch.long)
ret.update({f'{token_field}_token_span': token_token_span, f'{token_field}_input_ids': token_input_ids})
def make_batch_for_squeeze(data, augmented_concept, tokenizer, device, ret):
token_field = 'token_and_concept'
attention_mask = []
token_and_concept = [t + [tokenizer.sep_token] + c for t, c in zip(data['token'], augmented_concept)]
encodings = [tokenizer({token_field: x}) for x in token_and_concept]
ret.update(merge_list_of_dict(encodings))
max_input_len = len(max(ret[f'{token_field}_input_ids'], key=len))
concept_mask = []
token_mask = []
token_type_ids = []
snt_len = []
last_concept_offset = []
for tokens, concepts, input_ids, spans in zip(data['token'], augmented_concept,
ret['token_and_concept_input_ids'],
ret['token_and_concept_token_span']):
raw_sent_len = len(tokens) + 1 # for [SEP]
raw_concept_len = len(concepts)
if concepts[-1] == END:
concept_mask.append([False] * raw_sent_len + [True] * (raw_concept_len - 1) + [False]) # skip END concept
else:
concept_mask.append([False] * raw_sent_len + [True] * raw_concept_len)
token_mask.append([False] + [True] * (raw_sent_len - 2) + [False] * (raw_concept_len + 1))
assert len(concept_mask) == len(token_mask)
snt_len.append(raw_sent_len - 2) # skip [CLS] and [SEP]
sent_len = input_ids.index(tokenizer.tokenizer.sep_token_id) + 1
concept_len = len(input_ids) - sent_len
mask = torch.zeros((max_input_len, max_input_len), dtype=torch.bool)
mask[:sent_len + concept_len, :sent_len] = True
bottom_right = ~SelfAttentionMask.get_mask(concept_len, device, ret_parameter=False)
mask[sent_len:sent_len + concept_len, sent_len:sent_len + concept_len] = bottom_right
for group in spans:
if group[0] >= sent_len:
for i in range(len(group)):
for j in range(i + 1, len(group)):
mask[group[i], group[j]] = True
attention_mask.append(mask)
_token_type_ids = [0] * sent_len + [1] * concept_len
token_type_ids.append(_token_type_ids)
assert len(input_ids) == len(_token_type_ids)
last_concept_offset.append(raw_concept_len - 1)
ret['attention_mask'] = torch.stack(attention_mask)
ret['concept_mask'] = PadSequenceDataLoader.pad_data(concept_mask, 0, torch.bool)
ret['token_mask'] = PadSequenceDataLoader.pad_data(token_mask, 0, torch.bool)
ret['token_type_ids'] = PadSequenceDataLoader.pad_data(token_type_ids, 0, torch.long)
ret['snt_len'] = PadSequenceDataLoader.pad_data(snt_len, 0, torch.long)
ret['last_concept_offset'] = PadSequenceDataLoader.pad_data(last_concept_offset, 0, torch.long)
return token_field
def linearize(concept: List, edge: List, label='', prefix=REL, extra_arc=False):
vur = defaultdict(dict)
for v, u, r in edge:
vur[v][u] = r
concept_with_rel = []
edge_with_rel = []
reorder = dict()
for v, c in enumerate(concept):
reorder[v] = len(concept_with_rel)
concept_with_rel.append(c)
ur = vur[v]
for u, r in ur.items():
if u < v:
concept_with_rel.append(prefix + r)
for k, v in reorder.items():
assert concept[k] == concept_with_rel[v]
for v, c in enumerate(concept):
ur = vur[v]
for i, (u, r) in enumerate(ur.items()):
if u < v:
_v = reorder[v]
_u = reorder[u]
_m = _v + i + 1
edge_with_rel.append((_v, _m, label))
edge_with_rel.append((_m, _u, label))
if extra_arc:
edge_with_rel.append((_v, _u, label))
return concept_with_rel, edge_with_rel
def unlinearize(concept: List, edge: List, prefix=REL, extra_arc=False):
real_concept, reorder = separate_concept_rel(concept, prefix)
if extra_arc:
edge = [x for x in edge if concept[x[0]].startswith(REL) or concept[x[1]].startswith(REL)]
real_edge = []
for f, b in zip(edge[::2], edge[1::2]):
if b[1] not in reorder:
continue
u = reorder[b[1]]
if f[0] not in reorder:
continue
v = reorder[f[0]]
r = concept[f[1]][len(prefix):]
real_edge.append((v, u, r))
return real_concept, real_edge
def separate_concept_rel(concept, prefix=REL):
reorder = dict()
real_concept = []
for i, c in enumerate(concept):
if not c.startswith(prefix):
reorder[i] = len(real_concept)
real_concept.append(c)
return real_concept, reorder
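# A tiny round-trip sketch (illustration only, not part of the original file) of
# linearize/unlinearize on a hypothetical two-concept graph, with the rel prefix
# spelled out explicitly instead of the module-level REL constant.
def _demo_levi_round_trip():
    concept = ['want', 'boy']
    edge = [(1, 0, 'ARG0')]  # (v, u, r) with v > u, per the topological order
    lin_c, lin_e = linearize(concept, edge, label='', prefix='rel=')
    # The relation becomes an extra node spliced in right after concept v:
    assert lin_c == ['want', 'boy', 'rel=ARG0']
    assert lin_e == [(1, 2, ''), (2, 0, '')]
    # unlinearize recovers the original concepts and labeled edges.
    assert unlinearize(lin_c, lin_e, prefix='rel=') == (concept, edge)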
def remove_unconnected_components(concept: List, edge: List):
    from scipy.sparse import csr_matrix
    from scipy.sparse.csgraph import connected_components
    row = np.array([x[0] for x in edge], dtype=np.int64)
    col = np.array([x[1] for x in edge], dtype=np.int64)
    data = np.ones(len(row), dtype=np.int64)
graph = csr_matrix((data, (row, col)), shape=(len(concept), len(concept)))
n_components, labels = connected_components(csgraph=graph, directed=True, return_labels=True)
if n_components > 1:
unique, counts = np.unique(labels, return_counts=True)
largest_component = max(zip(counts, unique))[-1]
connected_nodes = set(np.where(labels == largest_component)[0])
reorder = dict()
good_concept = []
good_edge = []
for i, c in enumerate(concept):
if i in connected_nodes:
reorder[i] = len(good_concept)
good_concept.append(c)
for v, u, r in edge:
if v in connected_nodes and u in connected_nodes:
good_edge.append((reorder[v], reorder[u], r))
concept, edge = good_concept, good_edge
return concept, edge
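# A small sketch (illustration only, not part of the original file): a graph with
# an isolated node is pruned down to its largest (weakly) connected component.
def _demo_remove_unconnected():
    concept = ['a', 'b', 'c']  # 'c' participates in no edge
    edge = [(0, 1, 'r')]
    good_concept, good_edge = remove_unconnected_components(concept, edge)
    assert good_concept == ['a', 'b'] and good_edge == [(0, 1, 'r')]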
def largest_connected_component(triples: List):
node_to_id = dict()
concept = []
edge = []
for u, r, v in triples:
if u not in node_to_id:
node_to_id[u] = len(node_to_id)
concept.append(u)
if v not in node_to_id:
node_to_id[v] = len(node_to_id)
concept.append(v)
edge.append((node_to_id[u], node_to_id[v], r))
concept, edge = remove_unconnected_components(concept, edge)
return concept, edge
def to_triples(concept: List, edge: List):
return [(concept[u], r, concept[v]) for u, v, r in edge]
def reverse_edge_for_levi_bfs(concept, edge):
for v, u, r in edge:
if r == '_reverse_':
for x in v, u:
if concept[x].startswith(REL) and not concept[x].endswith('_reverse_'):
concept[x] += '_reverse_'
def un_kahn(concept, edge):
# (['want', 'rel=ARG1', 'rel=ARG0', 'believe', 'rel=ARG1', 'rel=ARG0', 'boy', 'girl'],
# [(0, 1, 0.9999417066574097), (0, 2, 0.9999995231628418), (1, 3, 0.9999992847442627), (3, 4, 1.0), (3, 5, 0.9999996423721313), (2, 6, 0.9996106624603271), (4, 6, 0.9999767541885376), (5, 7, 0.9999860525131226)])
real_concept, reorder = separate_concept_rel(concept)
tri_edge = dict()
for m, (a, b, p1) in enumerate(edge):
if concept[a].startswith(REL):
continue
for n, (c, d, p2) in enumerate(edge[m + 1:]):
if b == c:
key = (a, d)
_, p = tri_edge.get(key, (None, 0))
if p1 * p2 > p:
tri_edge[key] = (b, p1 * p2)
real_edge = []
for (a, d), (r, p) in tri_edge.items():
u = reorder[a]
r = concept[r][len(REL):]
v = reorder[d]
real_edge.append((v, u, r))
return real_concept, real_edge
================================================
FILE: hanlp/datasets/parsing/ctb5.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 18:44
from hanlp_common.constant import HANLP_URL
_CTB_HOME = HANLP_URL + 'embeddings/SUDA-LA-CIP_20200109_021624.zip#'
_CTB5_DEP_HOME = _CTB_HOME + 'BPNN/data/ctb5/'
CTB5_DEP_TRAIN = _CTB5_DEP_HOME + 'train.conll'
'''Training set for ctb5 dependency parsing.'''
CTB5_DEP_DEV = _CTB5_DEP_HOME + 'dev.conll'
'''Dev set for ctb5 dependency parsing.'''
CTB5_DEP_TEST = _CTB5_DEP_HOME + 'test.conll'
'''Test set for ctb5 dependency parsing.'''
CIP_W2V_100_CN = _CTB_HOME + 'BPNN/data/embed.txt'
================================================
FILE: hanlp/datasets/parsing/ctb7.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 18:44
from hanlp.datasets.parsing.ctb5 import _CTB_HOME
_CTB7_HOME = _CTB_HOME + 'BPNN/data/ctb7/'
CTB7_DEP_TRAIN = _CTB7_HOME + 'train.conll'
'''Training set for ctb7 dependency parsing.'''
CTB7_DEP_DEV = _CTB7_HOME + 'dev.conll'
'''Dev set for ctb7 dependency parsing.'''
CTB7_DEP_TEST = _CTB7_HOME + 'test.conll'
'''Test set for ctb7 dependency parsing.'''
================================================
FILE: hanlp/datasets/parsing/ctb8.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-10-14 20:54
from hanlp.datasets.parsing.loaders._ctb_utils import make_ctb
_CTB8_HOME = 'https://wakespace.lib.wfu.edu/bitstream/handle/10339/39379/LDC2013T21.tgz#data/'
CTB8_CWS_TRAIN = _CTB8_HOME + 'tasks/cws/train.txt'
'''Training set for ctb8 Chinese word segmentation.'''
CTB8_CWS_DEV = _CTB8_HOME + 'tasks/cws/dev.txt'
'''Dev set for ctb8 Chinese word segmentation.'''
CTB8_CWS_TEST = _CTB8_HOME + 'tasks/cws/test.txt'
'''Test set for ctb8 Chinese word segmentation.'''
CTB8_POS_TRAIN = _CTB8_HOME + 'tasks/pos/train.tsv'
'''Training set for ctb8 PoS tagging.'''
CTB8_POS_DEV = _CTB8_HOME + 'tasks/pos/dev.tsv'
'''Dev set for ctb8 PoS tagging.'''
CTB8_POS_TEST = _CTB8_HOME + 'tasks/pos/test.tsv'
'''Test set for ctb8 PoS tagging.'''
CTB8_BRACKET_LINE_TRAIN = _CTB8_HOME + 'tasks/par/train.txt'
'''Training set for ctb8 constituency parsing with empty categories.'''
CTB8_BRACKET_LINE_DEV = _CTB8_HOME + 'tasks/par/dev.txt'
'''Dev set for ctb8 constituency parsing with empty categories.'''
CTB8_BRACKET_LINE_TEST = _CTB8_HOME + 'tasks/par/test.txt'
'''Test set for ctb8 constituency parsing with empty categories.'''
CTB8_BRACKET_LINE_NOEC_TRAIN = _CTB8_HOME + 'tasks/par/train.noempty.txt'
'''Training set for ctb8 constituency parsing without empty categories.'''
CTB8_BRACKET_LINE_NOEC_DEV = _CTB8_HOME + 'tasks/par/dev.noempty.txt'
'''Dev set for ctb8 constituency parsing without empty categories.'''
CTB8_BRACKET_LINE_NOEC_TEST = _CTB8_HOME + 'tasks/par/test.noempty.txt'
'''Test set for ctb8 constituency parsing without empty categories.'''
CTB8_SD330_TRAIN = _CTB8_HOME + 'tasks/dep/train.conllx'
'''Training set for ctb8 in Stanford Dependencies 3.3.0 standard.'''
CTB8_SD330_DEV = _CTB8_HOME + 'tasks/dep/dev.conllx'
'''Dev set for ctb8 in Stanford Dependencies 3.3.0 standard.'''
CTB8_SD330_TEST = _CTB8_HOME + 'tasks/dep/test.conllx'
'''Test set for ctb8 in Stanford Dependencies 3.3.0 standard.'''
make_ctb(_CTB8_HOME)
================================================
FILE: hanlp/datasets/parsing/ctb9.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-10-14 20:54
from urllib.error import HTTPError
from hanlp.datasets.parsing.loaders._ctb_utils import make_ctb
from hanlp.utils.io_util import get_resource, path_from_url
_CTB9_HOME = 'https://catalog.ldc.upenn.edu/LDC2016T13/ctb9.0_LDC2016T13.tgz#data/'
CTB9_CWS_TRAIN = _CTB9_HOME + 'tasks/cws/train.txt'
'''Training set for ctb9 Chinese word segmentation.'''
CTB9_CWS_DEV = _CTB9_HOME + 'tasks/cws/dev.txt'
'''Dev set for ctb9 Chinese word segmentation.'''
CTB9_CWS_TEST = _CTB9_HOME + 'tasks/cws/test.txt'
'''Test set for ctb9 Chinese word segmentation.'''
CTB9_POS_TRAIN = _CTB9_HOME + 'tasks/pos/train.tsv'
'''Training set for ctb9 PoS tagging.'''
CTB9_POS_DEV = _CTB9_HOME + 'tasks/pos/dev.tsv'
'''Dev set for ctb9 PoS tagging.'''
CTB9_POS_TEST = _CTB9_HOME + 'tasks/pos/test.tsv'
'''Test set for ctb9 PoS tagging.'''
CTB9_BRACKET_LINE_TRAIN = _CTB9_HOME + 'tasks/par/train.txt'
'''Training set for ctb9 constituency parsing with empty categories.'''
CTB9_BRACKET_LINE_DEV = _CTB9_HOME + 'tasks/par/dev.txt'
'''Dev set for ctb9 constituency parsing with empty categories.'''
CTB9_BRACKET_LINE_TEST = _CTB9_HOME + 'tasks/par/test.txt'
'''Test set for ctb9 constituency parsing with empty categories.'''
CTB9_BRACKET_LINE_NOEC_TRAIN = _CTB9_HOME + 'tasks/par/train.noempty.txt'
'''Training set for ctb9 constituency parsing without empty categories.'''
CTB9_BRACKET_LINE_NOEC_DEV = _CTB9_HOME + 'tasks/par/dev.noempty.txt'
'''Dev set for ctb9 constituency parsing without empty categories.'''
CTB9_BRACKET_LINE_NOEC_TEST = _CTB9_HOME + 'tasks/par/test.noempty.txt'
'''Test set for ctb9 constituency parsing without empty categories.'''
CTB9_SD330_TRAIN = _CTB9_HOME + 'tasks/dep/train.conllx'
'''Training set for ctb9 in Stanford Dependencies 3.3.0 standard.'''
CTB9_SD330_DEV = _CTB9_HOME + 'tasks/dep/dev.conllx'
'''Dev set for ctb9 in Stanford Dependencies 3.3.0 standard.'''
CTB9_SD330_TEST = _CTB9_HOME + 'tasks/dep/test.conllx'
'''Test set for ctb9 in Stanford Dependencies 3.3.0 standard.'''
try:
get_resource(_CTB9_HOME)
except HTTPError:
raise FileNotFoundError(
'Chinese Treebank 9.0 is a copyright dataset owned by LDC which we cannot re-distribute. '
f'Please apply for a licence from LDC (https://catalog.ldc.upenn.edu/LDC2016T13) '
f'then download it to {path_from_url(_CTB9_HOME)}'
)
make_ctb(_CTB9_HOME)
================================================
FILE: hanlp/datasets/parsing/loaders/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-12-28 19:04
================================================
FILE: hanlp/datasets/parsing/loaders/_ctb_utils.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-11-25 16:14
import os
import shutil
import sys
from collections import defaultdict
from os import listdir
from os.path import join, isfile
from typing import List
from phrasetree.tree import Tree
from hanlp.components.parsers.conll import read_conll
from hanlp.utils.io_util import get_resource, get_exitcode_stdout_stderr, read_tsv_as_sents, run_cmd, pushd
from hanlp.utils.log_util import cprint
from hanlp.utils.time_util import CountdownTimer
# See Shao et al., 2017
# CTB9_ACADEMIA_SPLITS = {
# 'train': '''
# 0044-0143, 0170-0270, 0400-0899,
# 1001-1017, 1019, 1021-1035, 1037-
# 1043, 1045-1059, 1062-1071, 1073-
# 1117, 1120-1131, 1133-1140, 1143-
# 1147, 1149-1151, 2000-2915, 4051-
# 4099, 4112-4180, 4198-4368, 5000-
# 5446, 6000-6560, 7000-7013
# ''',
# 'dev': '''
# 0301-0326, 2916-3030, 4100-4106,
# 4181-4189, 4369-4390, 5447-5492,
# 6561-6630, 7013-7014
# ''',
# 'test': '''
# 0001-0043, 0144-0169, 0271-0301,
# 0900-0931, 1018, 1020, 1036, 1044,
# 1060, 1061, 1072, 1118, 1119, 1132,
# 1141, 1142, 1148, 3031-3145, 4107-
# 4111, 4190-4197, 4391-4411, 5493-
# 5558, 6631-6700, 7015-7017
# '''
# }
#
#
# def _make_splits(splits: Dict[str, str]):
# total = set()
# for part, text in list(splits.items()):
# if not isinstance(text, str):
# continue
# lines = text.replace('\n', '').split()
# cids = set()
# for line in lines:
# for each in line.split(','):
# each = each.strip()
# if not each:
# continue
# if '-' in each:
# start, end = each.split('-')
# start, end = map(lambda x: int(x), [start, end])
# cids.update(range(start, end + 1))
# # cids.update(map(lambda x: f'{x:04d}', range(start, end)))
# else:
# cids.add(int(each))
# cids = set(f'{x:04d}' for x in cids)
# assert len(cids & total) == 0, f'Overlap found in {part}'
# splits[part] = cids
#
# return splits
#
#
# _make_splits(CTB9_ACADEMIA_SPLITS)
def convert_to_dependency(src, dst, language='zh', version='3.3.0', conllx=True, ud=False):
cprint(f'Converting {os.path.basename(src)} to {os.path.basename(dst)} using Stanford Parser Version {version}. '
f'It might take a while [blink][yellow]...[/yellow][/blink]')
if version == '3.3.0':
sp_home = 'https://nlp.stanford.edu/software/stanford-parser-full-2013-11-12.zip'
elif version == '4.2.0':
sp_home = 'https://nlp.stanford.edu/software/stanford-parser-4.2.0.zip'
else:
raise ValueError(f'Unsupported version {version}')
sp_home = get_resource(sp_home)
# jar_path = get_resource(f'{sp_home}#stanford-parser.jar')
if ud:
jclass = 'edu.stanford.nlp.trees.international.pennchinese.UniversalChineseGrammaticalStructure' if language == 'zh' \
else 'edu.stanford.nlp.trees.ud.UniversalDependenciesConverter'
else:
jclass = 'edu.stanford.nlp.trees.international.pennchinese.ChineseGrammaticalStructure' if language == 'zh' \
else 'edu.stanford.nlp.trees.EnglishGrammaticalStructure'
cmd = f'java -cp {sp_home}/* {jclass} ' \
f'-treeFile {src}'
if conllx:
cmd += ' -conllx'
if not ud:
cmd += f' -basic -keepPunct'
code, out, err = get_exitcode_stdout_stderr(cmd)
with open(dst, 'w') as f:
f.write(out)
if code:
raise RuntimeError(f'Conversion failed with code {code} for {src}. The err message is:\n {err}\n'
f'Do you have java installed? Do you have enough memory?')
def clean_ctb_bracketed(ctb_root, out_root):
os.makedirs(out_root, exist_ok=True)
ctb_root = join(ctb_root, 'bracketed')
chtbs = _list_treebank_root(ctb_root)
timer = CountdownTimer(len(chtbs))
for f in chtbs:
with open(join(ctb_root, f), encoding='utf-8') as src, open(join(out_root, f + '.txt'), 'w',
encoding='utf-8') as out:
for line in src:
if not line.strip().startswith('<'):
out.write(line)
timer.log('Cleaning up CTB [blink][yellow]...[/yellow][/blink]', erase=False)
def _list_treebank_root(ctb_root):
chtbs = [f for f in listdir(ctb_root) if isfile(join(ctb_root, f)) and f.startswith('chtb')]
return sorted(chtbs)
def list_treebank(ctb_home):
ctb_home = get_resource(ctb_home)
cleaned_root = join(ctb_home, 'cleaned_bracket')
return _list_treebank_root(cleaned_root)
def load_bracketed_trees(chtbs) -> List[Tree]:
    trees = []
    for f in chtbs:
        with open(f, encoding='utf-8') as src:
            content = src.read()
        # Parse each blank-line-separated bracketed block without clobbering the accumulator
        for tree_str in content.split('\n\n'):
            if tree_str.strip():
                trees.append(Tree.fromstring(tree_str))
    return trees
def split_str_to_trees(text: str):
trees = []
buffer = []
for line in text.split('\n'):
if not line.strip():
continue
if line.startswith('('):
if buffer:
trees.append('\n'.join(buffer).strip())
buffer = []
buffer.append(line)
if buffer:
trees.append('\n'.join(buffer).strip())
return trees
def make_ctb_tasks(chtbs, out_root, part):
for task in ['cws', 'pos', 'par', 'dep']:
os.makedirs(join(out_root, task), exist_ok=True)
timer = CountdownTimer(len(chtbs))
par_path = join(out_root, 'par', f'{part}.txt')
with open(join(out_root, 'cws', f'{part}.txt'), 'w', encoding='utf-8') as cws, \
open(join(out_root, 'pos', f'{part}.tsv'), 'w', encoding='utf-8') as pos, \
open(par_path, 'w', encoding='utf-8') as par:
for f in chtbs:
with open(f, encoding='utf-8') as src:
content = src.read()
trees = split_str_to_trees(content)
for tree in trees:
try:
tree = Tree.fromstring(tree)
except ValueError:
print(tree)
exit(1)
words = []
for word, tag in tree.pos():
if tag == '-NONE-' or not tag:
continue
tag = tag.split('-')[0]
if tag == 'X': # 铜_NN 30_CD x_X 25_CD x_X 14_CD cm_NT 1999_NT
tag = 'FW'
pos.write('{}\t{}\n'.format(word, tag))
words.append(word)
cws.write(' '.join(words))
par.write(tree.pformat(margin=sys.maxsize))
for fp in cws, pos, par:
fp.write('\n')
timer.log(f'Preprocesing the [blue]{part}[/blue] set of CTB [blink][yellow]...[/yellow][/blink]',
erase=False)
remove_all_ec(par_path)
dep_path = join(out_root, 'dep', f'{part}.conllx')
convert_to_dependency(par_path, dep_path)
sents = list(read_conll(dep_path))
with open(dep_path, 'w') as out:
for sent in sents:
for i, cells in enumerate(sent):
tag = cells[3]
tag = tag.split('-')[0] # NT-SHORT ---> NT
if tag == 'X': # 铜_NN 30_CD x_X 25_CD x_X 14_CD cm_NT 1999_NT
tag = 'FW'
cells[3] = cells[4] = tag
out.write('\t'.join(str(x) for x in cells))
out.write('\n')
out.write('\n')
def reverse_splits(splits):
cid_domain = dict()
for domain, cids in splits.items():
for each in cids:
cid_domain[each] = domain
return cid_domain
def split_chtb(chtbs: List[str], splits=None):
train, dev, test = [], [], []
unused = []
for each in chtbs:
name, domain, ext = each.split('.', 2)
_, cid = name.split('_')
if splits:
if cid in splits['train']:
bin = train
elif cid in splits['dev']:
bin = dev
elif cid in splits['test']:
bin = test
else:
bin = unused
# raise IOError(f'{name} not in any splits')
else:
bin = train
if name.endswith('8'):
bin = dev
elif name.endswith('9'):
bin = test
bin.append(each)
return train, dev, test
def id_of_chtb(each: str):
return int(each.split('.')[0].split('_')[-1])
def make_ctb(ctb_home):
ctb_home = get_resource(ctb_home)
cleaned_root = join(ctb_home, 'cleaned_bracket')
if not os.path.isdir(cleaned_root):
clean_ctb_bracketed(ctb_home, cleaned_root)
tasks_root = join(ctb_home, 'tasks')
if not os.path.isdir(tasks_root):
try:
chtbs = _list_treebank_root(cleaned_root)
print(f'For the {len(chtbs)} files in CTB, we apply the following splits:')
train, dev, test = split_chtb(chtbs)
for part, name in zip([train, dev, test], ['train', 'dev', 'test']):
print(f'{name} = {[id_of_chtb(x) for x in part]}')
cprint('[yellow]Each file id ending with 8/9 is put into '
'dev/test respectively, the rest are put into train. '
'Our splits ensure files are evenly split across each genre, which is recommended '
'for production systems.[/yellow]')
for part, name in zip([train, dev, test], ['train', 'dev', 'test']):
make_ctb_tasks([join(cleaned_root, x) for x in part], tasks_root, name)
cprint('Done pre-processing CTB. Enjoy your research with [blue]HanLP[/blue]!')
except Exception as e:
shutil.rmtree(tasks_root, ignore_errors=True)
raise e
def load_domains(ctb_home):
"""
Load file ids from a Chinese treebank grouped by domains.
Args:
ctb_home: Root path to CTB.
Returns:
        A dict of sets, each of which represents a domain.
"""
ctb_home = get_resource(ctb_home)
ctb_root = join(ctb_home, 'bracketed')
chtbs = _list_treebank_root(ctb_root)
domains = defaultdict(set)
for each in chtbs:
name, domain = each.split('.')
_, fid = name.split('_')
domains[domain].add(fid)
return domains
def ctb_pos_to_text_format(path, delimiter='_'):
"""
Convert ctb pos tagging corpus from tsv format to text format, where each word is followed by
its pos tag.
Args:
path: File to be converted.
delimiter: Delimiter between word and tag.
"""
path = get_resource(path)
name, ext = os.path.splitext(path)
with open(f'{name}.txt', 'w', encoding='utf-8') as out:
for sent in read_tsv_as_sents(path):
out.write(' '.join([delimiter.join(x) for x in sent]))
out.write('\n')
def remove_all_ec(path):
"""
Remove empty categories for all trees in this file and save them into a "noempty" file.
Args:
path: File path.
"""
script = get_resource('https://file.hankcs.com/bin/remove_ec.zip')
with pushd(script):
run_cmd(f'java -cp elit-ddr-0.0.5-SNAPSHOT.jar:elit-sdk-0.0.5-SNAPSHOT.jar:hanlp-1.7.8.jar:'
f'fastutil-8.1.1.jar:. demo.RemoveEmptyCategoriesTreebank {path}')
================================================
FILE: hanlp/datasets/parsing/loaders/conll_dataset.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-05-08 16:10
from typing import Union, List, Callable, Dict
from hanlp_common.constant import ROOT, EOS, BOS
from hanlp.common.dataset import TransformableDataset
from hanlp.components.parsers.conll import read_conll
from hanlp.utils.io_util import TimingFileIterator
class CoNLLParsingDataset(TransformableDataset):
def __init__(self,
data: Union[str, List],
transform: Union[Callable, List] = None,
cache=None,
generate_idx=None,
prune: Callable[[Dict[str, List[str]]], bool] = None) -> None:
"""General class for CoNLL style dependency parsing datasets.
Args:
data: The local or remote path to a dataset, or a list of samples where each sample is a dict.
transform: Predefined transform(s).
cache: ``True`` to enable caching, so that transforms won't be called twice.
            generate_idx: Create a :const:`~hanlp_common.constants.IDX` field for each sample to store its order in the
                dataset. Useful for prediction when samples are re-ordered by a sampler.
prune: A filter to prune unwanted samples.
"""
self._prune = prune
super().__init__(data, transform, cache, generate_idx)
def load_file(self, filepath):
"""Both ``.conllx`` and ``.conllu`` are supported. Their descriptions can be found in
:class:`hanlp_common.conll.CoNLLWord` and :class:`hanlp_common.conll.CoNLLUWord` respectively.
Args:
filepath: ``.conllx`` or ``.conllu`` file path.
"""
if filepath.endswith('.conllu'):
# See https://universaldependencies.org/format.html
field_names = ['ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS',
'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC']
else:
field_names = ['ID', 'FORM', 'LEMMA', 'CPOS', 'POS',
'FEATS', 'HEAD', 'DEPREL', 'PHEAD', 'PDEPREL']
fp = TimingFileIterator(filepath)
for idx, sent in enumerate(read_conll(fp)):
sample = {}
for i, field in enumerate(field_names):
sample[field] = [cell[i] for cell in sent]
if not self._prune or not self._prune(sample):
yield sample
fp.log(f'{idx + 1} samples [blink][yellow]...[/yellow][/blink]')
def __len__(self) -> int:
return len(self.data)
def append_bos(sample: dict, pos_key='CPOS', bos=ROOT) -> dict:
"""
Args:
        sample: A sample dict to transform in place.
        pos_key: Key of the PoS tag field in ``sample``.
        bos: A special token inserted at the head of tokens.
Returns:
"""
sample['token'] = [bos] + sample['FORM']
if pos_key in sample:
sample['pos'] = [ROOT] + sample[pos_key]
if 'HEAD' in sample:
sample['arc'] = [0] + sample['HEAD']
sample['rel'] = sample['DEPREL'][:1] + sample['DEPREL']
return sample
def append_bos_eos(sample: dict) -> dict:
sample['token'] = [BOS] + sample['FORM'] + [EOS]
if 'CPOS' in sample:
sample['pos'] = [BOS] + sample['CPOS'] + [EOS]
if 'HEAD' in sample:
sample['arc'] = [0] + sample['HEAD'] + [0]
sample['rel'] = sample['DEPREL'][:1] + sample['DEPREL'] + sample['DEPREL'][:1]
return sample
def get_sibs(sample: dict) -> dict:
heads = sample.get('arc', None)
if heads:
sibs = [-1] * len(heads)
for i in range(1, len(heads)):
hi = heads[i]
for j in range(i + 1, len(heads)):
hj = heads[j]
di, dj = hi - i, hj - j
if hi >= 0 and hj >= 0 and hi == hj and di * dj > 0:
if abs(di) > abs(dj):
sibs[i] = j
else:
sibs[j] = i
break
sample['sib_id'] = [0] + sibs[1:]
return sample
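# A minimal illustration (not part of the original file) of the transforms above,
# applied to a hypothetical CoNLL-style sample.
def _demo_conll_transforms():
    sample = {'FORM': ['John', 'gave', 'Mary', 'books'],
              'CPOS': ['NNP', 'VBD', 'NNP', 'NNS'],
              'HEAD': [2, 0, 2, 2],
              'DEPREL': ['nsubj', 'root', 'iobj', 'dobj']}
    sample = append_bos(sample)  # prepend the pseudo ROOT token
    assert sample['arc'] == [0, 2, 0, 2, 2]
    sample = get_sibs(sample)
    # 'books' (position 4, counting ROOT) gets 'Mary' (position 3) as its nearest sibling.
    assert sample['sib_id'] == [0, -1, -1, -1, 3]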
================================================
FILE: hanlp/datasets/parsing/loaders/constituency_dataset.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-11-28 19:27
from typing import List
from phrasetree.tree import Tree
from hanlp_common.constant import EOS, BOS
from hanlp.common.dataset import TransformableDataset
class ConstituencyDataset(TransformableDataset):
def load_file(self, filepath: str):
with open(filepath) as src:
for line in src:
line = line.strip()
if not line:
continue
yield {'constituency': Tree.fromstring(line)}
def unpack_tree_to_features(sample: dict):
tree = sample.get('constituency', None)
if tree:
words, tags = zip(*tree.pos())
chart = [[None] * (len(words) + 1) for _ in range(len(words) + 1)]
for i, j, label in factorize(binarize(tree)[0]):
# if no_subcategory:
# label = label.split('-')[0]
chart[i][j] = label
sample['token'] = [BOS] + list(words) + [EOS]
sample['chart'] = chart
return sample
def append_bos_eos(sample: dict):
if '_con_token' not in sample:
sample['_con_token'] = sample['token']
sample['token'] = [BOS] + sample['token'] + [EOS]
return sample
def remove_subcategory(sample: dict):
tree: Tree = sample.get('constituency', None)
if tree:
for subtree in tree.subtrees():
label = subtree.label()
subtree.set_label(label.split('-')[0])
return sample
def binarize(tree: Tree):
r"""
Conducts binarization over the tree.
First, the tree is transformed to satisfy `Chomsky Normal Form (CNF)`_.
Here we call :meth:`~tree.Tree.chomsky_normal_form` to conduct left-binarization.
Second, all unary productions in the tree are collapsed.
Args:
tree (tree.Tree):
The tree to be binarized.
Returns:
The binarized tree.
Examples:
>>> tree = Tree.fromstring('''
(TOP
(S
(NP (_ She))
(VP (_ enjoys) (S (VP (_ playing) (NP (_ tennis)))))
(_ .)))
''')
>>> print(Tree.binarize(tree))
(TOP
(S
(S|<>
(NP (_ She))
(VP
(VP|<> (_ enjoys))
(S+VP (VP|<> (_ playing)) (NP (_ tennis)))))
(S|<> (_ .))))
.. _Chomsky Normal Form (CNF):
https://en.wikipedia.org/wiki/Chomsky_normal_form
"""
tree: Tree = tree.copy(True)
nodes = [tree]
while nodes:
node = nodes.pop()
if isinstance(node, Tree):
nodes.extend([child for child in node])
if len(node) > 1:
for i, child in enumerate(node):
if not isinstance(child[0], Tree):
node[i] = Tree(f"{node.label()}|<>", [child])
tree.chomsky_normal_form('left', 0, 0)
tree.collapse_unary()
return tree
def factorize(tree, delete_labels=None, equal_labels=None):
r"""
Factorizes the tree into a sequence.
The tree is traversed in pre-order.
Args:
tree (tree.Tree):
The tree to be factorized.
delete_labels (set[str]):
A set of labels to be ignored. This is used for evaluation.
If it is a pre-terminal label, delete the word along with the brackets.
            If it is a non-terminal label, just delete the brackets (don't delete the children).
In `EVALB`_, the default set is:
{'TOP', 'S1', '-NONE-', ',', ':', '``', "''", '.', '?', '!', ''}
Default: ``None``.
equal_labels (dict[str, str]):
The key-val pairs in the dict are considered equivalent (non-directional). This is used for evaluation.
The default dict defined in `EVALB`_ is: {'ADVP': 'PRT'}
Default: ``None``.
Returns:
The sequence of the factorized tree.
Examples:
        >>> tree = Tree.fromstring('''(TOP
(S
(NP (_ She))
(VP (_ enjoys) (S (VP (_ playing) (NP (_ tennis)))))
(_ .)))
        ''')
>>> Tree.factorize(tree)
[(0, 5, 'TOP'), (0, 5, 'S'), (0, 1, 'NP'), (1, 4, 'VP'), (2, 4, 'S'), (2, 4, 'VP'), (3, 4, 'NP')]
>>> Tree.factorize(tree, delete_labels={'TOP', 'S1', '-NONE-', ',', ':', '``', "''", '.', '?', '!', ''})
[(0, 5, 'S'), (0, 1, 'NP'), (1, 4, 'VP'), (2, 4, 'S'), (2, 4, 'VP'), (3, 4, 'NP')]
.. _EVALB:
https://nlp.cs.nyu.edu/evalb/
"""
def track(tree, i):
label = tree.label()
if delete_labels is not None and label in delete_labels:
label = None
if equal_labels is not None:
label = equal_labels.get(label, label)
if len(tree) == 1 and not isinstance(tree[0], Tree):
return (i + 1 if label is not None else i), []
j, spans = i, []
for child in tree:
if isinstance(child, Tree):
j, s = track(child, j)
spans += s
if label is not None and j > i:
spans = [(i, j, label)] + spans
return j, spans
return track(tree, 0)[1]
def build_tree(tokens: List[str], sequence):
r"""
Builds a constituency tree from the sequence. The sequence is generated in pre-order.
During building the tree, the sequence is de-binarized to the original format (i.e.,
the suffixes ``|<>`` are ignored, the collapsed labels are recovered).
Args:
tokens :
All tokens in a sentence.
sequence (list[tuple]):
A list of tuples used for generating a tree.
            Each tuple consists of the indices of the left/right span boundaries and the label of the span.
Returns:
A result constituency tree.
Examples:
        >>> tokens = ['She', 'enjoys', 'playing', 'tennis', '.']
>>> sequence = [(0, 5, 'S'), (0, 4, 'S|<>'), (0, 1, 'NP'), (1, 4, 'VP'), (1, 2, 'VP|<>'),
(2, 4, 'S+VP'), (2, 3, 'VP|<>'), (3, 4, 'NP'), (4, 5, 'S|<>')]
        >>> print(build_tree(tokens, sequence))
(TOP
(S
(NP (_ She))
(VP (_ enjoys) (S (VP (_ playing) (NP (_ tennis)))))
(_ .)))
"""
if not tokens: # User passed in [], which is the tokenized result of ''
return Tree('TOP', [])
tree = Tree('TOP', [Tree('_', [t]) for t in tokens])
root = tree.label()
leaves = [subtree for subtree in tree.subtrees() if not isinstance(subtree[0], Tree)]
def track(node):
i, j, label = next(node)
if j == i + 1:
children = [leaves[i]]
else:
children = track(node) + track(node)
if label.endswith('|<>'):
return children
labels = label.split('+')
tree = Tree(labels[-1], children)
for label in reversed(labels[:-1]):
tree = Tree(label, [tree])
return [tree]
return Tree(root, track(iter(sequence)))
================================================
FILE: hanlp/datasets/parsing/pmt1.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2022-02-15 04:14
import os.path
from hanlp.utils.io_util import get_resource
from hanlp.utils.log_util import cprint
from hanlp_common.conll import CoNLLSentence, CoNLLWord
_HOME = 'https://github.com/qiulikun/PKUMultiviewTreebank/archive/refs/heads/master.zip'
PTM_V1_RAW = _HOME + '#199801_dependency_treebank_2014pos.txt'
PTM_V1_TRAIN = _HOME + '#train.conllx'
'The training set of PKU Multi-view Chinese Treebank (PMT) 1.0 (:cite:`qiu-etal-2014-multi`).'
PTM_V1_DEV = _HOME + '#dev.conllx'
'The dev set of PKU Multi-view Chinese Treebank (PMT) 1.0 (:cite:`qiu-etal-2014-multi`).'
PTM_V1_TEST = _HOME + '#test.conllx'
'The test set of PKU Multi-view Chinese Treebank (PMT) 1.0 (:cite:`qiu-etal-2014-multi`).'
def _make_ptm():
raw = get_resource(PTM_V1_RAW)
home = os.path.dirname(raw)
done = True
for part in ['train', 'dev', 'test']:
if not os.path.isfile(os.path.join(home, f'{part}.conllx')):
done = False
break
if done:
return
sents = []
with open(raw) as src:
buffer = []
for line in src:
line = line.strip()
if line:
buffer.append(line)
else:
if buffer:
tok, pos, rel, arc = [x.split() for x in buffer]
sent = CoNLLSentence()
for i, (t, p, r, a) in enumerate(zip(tok, pos, rel, arc)):
sent.append(CoNLLWord(i + 1, form=t, cpos=p, head=a, deprel=r))
sents.append(sent)
buffer.clear()
prev_offset = 0
# Sentences 12001-13000 and 13001-14463 are used as the development and test set, respectively. The remaining
# sentences are used as training data.
for part, offset in zip(['train', 'dev', 'test'], [12000, 13000, 14463]):
with open(os.path.join(home, f'{part}.conllx'), 'w') as out:
portion = sents[prev_offset:offset]
cprint(f'[yellow]{len(portion)}[/yellow] sentences [cyan][{prev_offset + 1}:{offset})[/cyan] in {part}')
for sent in portion:
out.write(str(sent) + '\n\n')
prev_offset = offset
_make_ptm()
================================================
FILE: hanlp/datasets/parsing/ptb.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-02-17 15:46
_PTB_HOME = 'https://github.com/KhalilMrini/LAL-Parser/archive/master.zip#data/'
PTB_TRAIN = _PTB_HOME + '02-21.10way.clean'
'''Training set of PTB without empty categories. PoS tags are automatically predicted using 10-fold
jackknifing (:cite:`collins-koo-2005-discriminative`).'''
PTB_DEV = _PTB_HOME + '22.auto.clean'
'''Dev set of PTB without empty categories. PoS tags are automatically predicted using 10-fold
jackknifing (:cite:`collins-koo-2005-discriminative`).'''
PTB_TEST = _PTB_HOME + '23.auto.clean'
'''Test set of PTB without empty categories. PoS tags are automatically predicted using 10-fold
jackknifing (:cite:`collins-koo-2005-discriminative`).'''
PTB_SD330_TRAIN = _PTB_HOME + 'ptb_train_3.3.0.sd.clean'
'''Training set of PTB in Stanford Dependencies 3.3.0 format. PoS tags are automatically predicted using 10-fold
jackknifing (:cite:`collins-koo-2005-discriminative`).'''
PTB_SD330_DEV = _PTB_HOME + 'ptb_dev_3.3.0.sd.clean'
'''Dev set of PTB in Stanford Dependencies 3.3.0 format. PoS tags are automatically predicted using 10-fold
jackknifing (:cite:`collins-koo-2005-discriminative`).'''
PTB_SD330_TEST = _PTB_HOME + 'ptb_test_3.3.0.sd.clean'
'''Test set of PTB in Stanford Dependencies 3.3.0 format. PoS tags are automatically predicted using 10-fold
jackknifing (:cite:`collins-koo-2005-discriminative`).'''
PTB_TOKEN_MAPPING = {
"-LRB-": "(",
"-RRB-": ")",
"-LCB-": "{",
"-RCB-": "}",
"-LSB-": "[",
"-RSB-": "]",
"``": '"',
"''": '"',
"`": "'",
'«': '"',
'»': '"',
'‘': "'",
'’': "'",
'“': '"',
'”': '"',
'„': '"',
'‹': "'",
'›': "'",
"\u2013": "--", # en dash
"\u2014": "--", # em dash
}
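# A small illustration (not part of the original file): escaped PTB tokens map back
# to their surface forms, while unknown tokens pass through unchanged.
def _demo_ptb_token_mapping():
    tokens = ['-LRB-', 'Hello', '-RRB-', '``', 'quoted', "''"]
    assert [PTB_TOKEN_MAPPING.get(t, t) for t in tokens] == ['(', 'Hello', ')', '"', 'quoted', '"']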
================================================
FILE: hanlp/datasets/parsing/semeval15.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-07-28 14:40
# from hanlp.datasets.parsing.conll_dataset import CoNLLParsingDataset
#
#
# class SemEval15Dataset(CoNLLParsingDataset):
# def load_file(self, filepath: str):
# pass
import warnings
from hanlp_common.constant import ROOT, PAD
from hanlp_common.conll import CoNLLSentence
def unpack_deps_to_head_deprel(sample: dict, pad_rel=None, arc_key='arc', rel_key='rel'):
if 'DEPS' in sample:
deps = ['_'] + sample['DEPS']
sample[arc_key] = arc = []
sample[rel_key] = rel = []
for each in deps:
arc_per_token = [False] * len(deps)
rel_per_token = [None] * len(deps)
if each != '_':
for ar in each.split('|'):
a, r = ar.split(':')
a = int(a)
arc_per_token[a] = True
rel_per_token[a] = r
if not pad_rel:
pad_rel = r
arc.append(arc_per_token)
rel.append(rel_per_token)
if not pad_rel:
pad_rel = PAD
for i in range(len(rel)):
rel[i] = [r if r else pad_rel for r in rel[i]]
return sample
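# A minimal illustration (not part of the original file) of how DEPS strings such
# as '2:nsubj' unfold into per-token arc/rel matrices; the sample is hypothetical.
def _demo_unpack_deps():
    sample = unpack_deps_to_head_deprel({'DEPS': ['2:nsubj', '0:root']})
    # Token 1 attaches to token 2 with relation nsubj; slot 0 is the virtual root.
    assert sample['arc'][1] == [False, False, True]
    assert sample['rel'][1][2] == 'nsubj'
    # Unattached slots are padded with the first relation seen ('nsubj' here).
    assert sample['rel'][0] == ['nsubj', 'nsubj', 'nsubj']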
def append_bos_to_form_pos(sample, pos_key='CPOS'):
sample['token'] = [ROOT] + sample['FORM']
if pos_key in sample:
sample['pos'] = [ROOT] + sample[pos_key]
return sample
def merge_head_deprel_with_2nd(sample: dict):
if 'arc' in sample:
arc_2nd = sample['arc_2nd']
rel_2nd = sample['rel_2nd']
for i, (arc, rel) in enumerate(zip(sample['arc'], sample['rel'])):
if i:
if arc_2nd[i][arc] and rel_2nd[i][arc] != rel:
sample_str = CoNLLSentence.from_dict(sample, conllu=True).to_markdown()
warnings.warn(f'The main dependency conflicts with 2nd dependency at ID={i}, ' \
'which means joint mode might not be suitable. ' \
f'The sample is\n{sample_str}')
arc_2nd[i][arc] = True
rel_2nd[i][arc] = rel
return sample
================================================
FILE: hanlp/datasets/parsing/semeval16.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 00:51
from hanlp_common.conll import CoNLLSentence
import os
from hanlp.utils.io_util import get_resource, merge_files
from hanlp_common.io import eprint
_SEMEVAL2016_HOME = 'https://github.com/HIT-SCIR/SemEval-2016/archive/master.zip'
SEMEVAL2016_NEWS_TRAIN = _SEMEVAL2016_HOME + '#train/news.train.conll'
SEMEVAL2016_NEWS_DEV = _SEMEVAL2016_HOME + '#validation/news.valid.conll'
SEMEVAL2016_NEWS_TEST = _SEMEVAL2016_HOME + '#test/news.test.conll'
SEMEVAL2016_NEWS_TRAIN_CONLLU = _SEMEVAL2016_HOME + '#train/news.train.conllu'
SEMEVAL2016_NEWS_DEV_CONLLU = _SEMEVAL2016_HOME + '#validation/news.valid.conllu'
SEMEVAL2016_NEWS_TEST_CONLLU = _SEMEVAL2016_HOME + '#test/news.test.conllu'
SEMEVAL2016_TEXT_TRAIN = _SEMEVAL2016_HOME + '#train/text.train.conll'
SEMEVAL2016_TEXT_DEV = _SEMEVAL2016_HOME + '#validation/text.valid.conll'
SEMEVAL2016_TEXT_TEST = _SEMEVAL2016_HOME + '#test/text.test.conll'
SEMEVAL2016_TEXT_TRAIN_CONLLU = _SEMEVAL2016_HOME + '#train/text.train.conllu'
SEMEVAL2016_TEXT_DEV_CONLLU = _SEMEVAL2016_HOME + '#validation/text.valid.conllu'
SEMEVAL2016_TEXT_TEST_CONLLU = _SEMEVAL2016_HOME + '#test/text.test.conllu'
SEMEVAL2016_FULL_TRAIN_CONLLU = _SEMEVAL2016_HOME + '#train/full.train.conllu'
SEMEVAL2016_FULL_DEV_CONLLU = _SEMEVAL2016_HOME + '#validation/full.valid.conllu'
SEMEVAL2016_FULL_TEST_CONLLU = _SEMEVAL2016_HOME + '#test/full.test.conllu'
def convert_conll_to_conllu(path):
    """Convert a SemEval-2016 ``.conll`` file to CoNLL-U by moving each word's
    HEAD/DEPREL pair into the DEPS column."""
    sents = CoNLLSentence.from_file(path, conllu=True)
with open(os.path.splitext(path)[0] + '.conllu', 'w') as out:
for sent in sents:
for word in sent:
if not word.deps:
word.deps = [(word.head, word.deprel)]
word.head = None
word.deprel = None
out.write(str(sent))
out.write('\n\n')
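# On first import, convert every .conll split to CoNLL-U, then merge the news
# and text domains into the combined 'full' splits declared above.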
for file in [SEMEVAL2016_NEWS_TRAIN, SEMEVAL2016_NEWS_DEV, SEMEVAL2016_NEWS_TEST,
SEMEVAL2016_TEXT_TRAIN, SEMEVAL2016_TEXT_DEV, SEMEVAL2016_TEXT_TEST]:
file = get_resource(file)
conllu = os.path.splitext(file)[0] + '.conllu'
if not os.path.isfile(conllu):
eprint(f'Converting {os.path.basename(file)} to {os.path.basename(conllu)} ...')
convert_conll_to_conllu(file)
for group, part in zip([[SEMEVAL2016_NEWS_TRAIN_CONLLU, SEMEVAL2016_TEXT_TRAIN_CONLLU],
[SEMEVAL2016_NEWS_DEV_CONLLU, SEMEVAL2016_TEXT_DEV_CONLLU],
[SEMEVAL2016_NEWS_TEST_CONLLU, SEMEVAL2016_TEXT_TEST_CONLLU]],
['train', 'valid', 'test']):
root = get_resource(_SEMEVAL2016_HOME)
dst = f'{root}/train/full.{part}.conllu'
if not os.path.isfile(dst):
group = [get_resource(x) for x in group]
eprint(f'Concatenating {os.path.basename(group[0])} and {os.path.basename(group[1])} '
f'into full dataset {os.path.basename(dst)} ...')
merge_files(group, dst)
================================================
FILE: hanlp/datasets/parsing/ud/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-07 21:45
import os
import shutil
from hanlp.components.parsers.ud.udify_util import get_ud_treebank_files
from hanlp.utils.io_util import get_resource
from hanlp.utils.log_util import flash
def concat_treebanks(home, version):
    """Concatenate the train/dev/test splits of every UD treebank under
    ``home`` into a single multilingual ``ud-multilingual-v{version}`` folder,
    reusing that folder if it already exists."""
    ud_home = get_resource(home)
treebanks = get_ud_treebank_files(ud_home)
output_dir = os.path.abspath(os.path.join(ud_home, os.path.pardir, os.path.pardir, f'ud-multilingual-v{version}'))
if os.path.isdir(output_dir):
return output_dir
os.makedirs(output_dir)
train, dev, test = list(zip(*[treebanks[k] for k in treebanks]))
for treebank, name in zip([train, dev, test], ["train.conllu", "dev.conllu", "test.conllu"]):
flash(f'Concatenating {len(train)} treebanks into {name} [blink][yellow]...[/yellow][/blink]')
with open(os.path.join(output_dir, name), 'w') as write:
for t in treebank:
if not t:
continue
with open(t, 'r') as read:
shutil.copyfileobj(read, write)
flash('')
return output_dir
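# Hypothetical usage sketch (assuming the _UD_210_HOME constant defined in
# ud210.py in this package):
#
#     output_dir = concat_treebanks(_UD_210_HOME, '2.10')
#     # -> .../ud-multilingual-v2.10/{train,dev,test}.conllu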
================================================
FILE: hanlp/datasets/parsing/ud/ud210.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-07 21:03
import glob
import os
from hanlp.utils.io_util import uncompress, get_resource
_UD_210_URL = "https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-4758/allzip"
_UD_210_HOME = _UD_210_URL + '#ud-treebanks-v2.10/'
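# Fetch the UD 2.10 release and unpack it (the outer zip, then the inner .tgz)
# the first time this module is imported.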
_path = get_resource(_UD_210_URL)
if os.path.isfile(_path):
os.rename(_path, _path + '.zip')
uncompress(_path + '.zip')
uncompress(os.path.join(_path, 'ud-treebanks-v2.10.tgz'))
# noinspection PyShadowingNames
def _list_dir(path, home):
    """Scan the extracted ``ud-treebanks-v2.10`` folder and append one constant
    (plus a one-line docstring) per treebank split to this very file; used to
    generate the list of constants below."""
    prefix = home.lstrip('_').replace('_HOME', '')
path = get_resource(path)
with open('ud210.py', 'a') as out:
for f in sorted(glob.glob(path + '/ud-treebanks-v2.10/UD_*')):
basename = os.path.basename(f)
name = basename[len('UD_'):]
name = name.upper().replace('-', '_')
for split in 'train', 'dev', 'test':
sp = glob.glob(f + f'/*{split}.conllu')
if not sp:
continue
sp = os.path.basename(sp[0])
out.write(f'{prefix}_{name}_{split.upper()} = {home} + "{basename}/{sp}"\n')
out.write(f'"{prefix} {split} set of {name}."\n')
def main():
    _list_dir(_UD_210_URL, '_UD_210_HOME')
if __name__ == '__main__':
main()
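# The constants below (one per treebank split) were auto-generated by
# _list_dir() above; regenerate them by running this module as a script.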
UD_210_AFRIKAANS_AFRIBOOMS_TRAIN = _UD_210_HOME + "UD_Afrikaans-AfriBooms/af_afribooms-ud-train.conllu"
"UD_210 train set of AFRIKAANS_AFRIBOOMS."
UD_210_AFRIKAANS_AFRIBOOMS_DEV = _UD_210_HOME + "UD_Afrikaans-AfriBooms/af_afribooms-ud-dev.conllu"
"UD_210 dev set of AFRIKAANS_AFRIBOOMS."
UD_210_AFRIKAANS_AFRIBOOMS_TEST = _UD_210_HOME + "UD_Afrikaans-AfriBooms/af_afribooms-ud-test.conllu"
"UD_210 test set of AFRIKAANS_AFRIBOOMS."
UD_210_AKKADIAN_PISANDUB_TEST = _UD_210_HOME + "UD_Akkadian-PISANDUB/akk_pisandub-ud-test.conllu"
"UD_210 test set of AKKADIAN_PISANDUB."
UD_210_AKKADIAN_RIAO_TEST = _UD_210_HOME + "UD_Akkadian-RIAO/akk_riao-ud-test.conllu"
"UD_210 test set of AKKADIAN_RIAO."
UD_210_AKUNTSU_TUDET_TEST = _UD_210_HOME + "UD_Akuntsu-TuDeT/aqz_tudet-ud-test.conllu"
"UD_210 test set of AKUNTSU_TUDET."
UD_210_ALBANIAN_TSA_TEST = _UD_210_HOME + "UD_Albanian-TSA/sq_tsa-ud-test.conllu"
"UD_210 test set of ALBANIAN_TSA."
UD_210_AMHARIC_ATT_TEST = _UD_210_HOME + "UD_Amharic-ATT/am_att-ud-test.conllu"
"UD_210 test set of AMHARIC_ATT."
UD_210_ANCIENT_GREEK_PROIEL_TRAIN = _UD_210_HOME + "UD_Ancient_Greek-PROIEL/grc_proiel-ud-train.conllu"
"UD_210 train set of ANCIENT_GREEK_PROIEL."
UD_210_ANCIENT_GREEK_PROIEL_DEV = _UD_210_HOME + "UD_Ancient_Greek-PROIEL/grc_proiel-ud-dev.conllu"
"UD_210 dev set of ANCIENT_GREEK_PROIEL."
UD_210_ANCIENT_GREEK_PROIEL_TEST = _UD_210_HOME + "UD_Ancient_Greek-PROIEL/grc_proiel-ud-test.conllu"
"UD_210 test set of ANCIENT_GREEK_PROIEL."
UD_210_ANCIENT_GREEK_PERSEUS_TRAIN = _UD_210_HOME + "UD_Ancient_Greek-Perseus/grc_perseus-ud-train.conllu"
"UD_210 train set of ANCIENT_GREEK_PERSEUS."
UD_210_ANCIENT_GREEK_PERSEUS_DEV = _UD_210_HOME + "UD_Ancient_Greek-Perseus/grc_perseus-ud-dev.conllu"
"UD_210 dev set of ANCIENT_GREEK_PERSEUS."
UD_210_ANCIENT_GREEK_PERSEUS_TEST = _UD_210_HOME + "UD_Ancient_Greek-Perseus/grc_perseus-ud-test.conllu"
"UD_210 test set of ANCIENT_GREEK_PERSEUS."
UD_210_ANCIENT_HEBREW_PTNK_TRAIN = _UD_210_HOME + "UD_Ancient_Hebrew-PTNK/hbo_ptnk-ud-train.conllu"
"UD_210 train set of ANCIENT_HEBREW_PTNK."
UD_210_ANCIENT_HEBREW_PTNK_DEV = _UD_210_HOME + "UD_Ancient_Hebrew-PTNK/hbo_ptnk-ud-dev.conllu"
"UD_210 dev set of ANCIENT_HEBREW_PTNK."
UD_210_ANCIENT_HEBREW_PTNK_TEST = _UD_210_HOME + "UD_Ancient_Hebrew-PTNK/hbo_ptnk-ud-test.conllu"
"UD_210 test set of ANCIENT_HEBREW_PTNK."
UD_210_APURINA_UFPA_TEST = _UD_210_HOME + "UD_Apurina-UFPA/apu_ufpa-ud-test.conllu"
"UD_210 test set of APURINA_UFPA."
UD_210_ARABIC_NYUAD_TRAIN = _UD_210_HOME + "UD_Arabic-NYUAD/ar_nyuad-ud-train.conllu"
"UD_210 train set of ARABIC_NYUAD."
UD_210_ARABIC_NYUAD_DEV = _UD_210_HOME + "UD_Arabic-NYUAD/ar_nyuad-ud-dev.conllu"
"UD_210 dev set of ARABIC_NYUAD."
UD_210_ARABIC_NYUAD_TEST = _UD_210_HOME + "UD_Arabic-NYUAD/ar_nyuad-ud-test.conllu"
"UD_210 test set of ARABIC_NYUAD."
UD_210_ARABIC_PADT_TRAIN = _UD_210_HOME + "UD_Arabic-PADT/ar_padt-ud-train.conllu"
"UD_210 train set of ARABIC_PADT."
UD_210_ARABIC_PADT_DEV = _UD_210_HOME + "UD_Arabic-PADT/ar_padt-ud-dev.conllu"
"UD_210 dev set of ARABIC_PADT."
UD_210_ARABIC_PADT_TEST = _UD_210_HOME + "UD_Arabic-PADT/ar_padt-ud-test.conllu"
"UD_210 test set of ARABIC_PADT."
UD_210_ARABIC_PUD_TEST = _UD_210_HOME + "UD_Arabic-PUD/ar_pud-ud-test.conllu"
"UD_210 test set of ARABIC_PUD."
UD_210_ARMENIAN_ARMTDP_TRAIN = _UD_210_HOME + "UD_Armenian-ArmTDP/hy_armtdp-ud-train.conllu"
"UD_210 train set of ARMENIAN_ARMTDP."
UD_210_ARMENIAN_ARMTDP_DEV = _UD_210_HOME + "UD_Armenian-ArmTDP/hy_armtdp-ud-dev.conllu"
"UD_210 dev set of ARMENIAN_ARMTDP."
UD_210_ARMENIAN_ARMTDP_TEST = _UD_210_HOME + "UD_Armenian-ArmTDP/hy_armtdp-ud-test.conllu"
"UD_210 test set of ARMENIAN_ARMTDP."
UD_210_ARMENIAN_BSUT_TRAIN = _UD_210_HOME + "UD_Armenian-BSUT/hy_bsut-ud-train.conllu"
"UD_210 train set of ARMENIAN_BSUT."
UD_210_ARMENIAN_BSUT_DEV = _UD_210_HOME + "UD_Armenian-BSUT/hy_bsut-ud-dev.conllu"
"UD_210 dev set of ARMENIAN_BSUT."
UD_210_ARMENIAN_BSUT_TEST = _UD_210_HOME + "UD_Armenian-BSUT/hy_bsut-ud-test.conllu"
"UD_210 test set of ARMENIAN_BSUT."
UD_210_ASSYRIAN_AS_TEST = _UD_210_HOME + "UD_Assyrian-AS/aii_as-ud-test.conllu"
"UD_210 test set of ASSYRIAN_AS."
UD_210_BAMBARA_CRB_TEST = _UD_210_HOME + "UD_Bambara-CRB/bm_crb-ud-test.conllu"
"UD_210 test set of BAMBARA_CRB."
UD_210_BASQUE_BDT_TRAIN = _UD_210_HOME + "UD_Basque-BDT/eu_bdt-ud-train.conllu"
"UD_210 train set of BASQUE_BDT."
UD_210_BASQUE_BDT_DEV = _UD_210_HOME + "UD_Basque-BDT/eu_bdt-ud-dev.conllu"
"UD_210 dev set of BASQUE_BDT."
UD_210_BASQUE_BDT_TEST = _UD_210_HOME + "UD_Basque-BDT/eu_bdt-ud-test.conllu"
"UD_210 test set of BASQUE_BDT."
UD_210_BEJA_NSC_TEST = _UD_210_HOME + "UD_Beja-NSC/bej_nsc-ud-test.conllu"
"UD_210 test set of BEJA_NSC."
UD_210_BELARUSIAN_HSE_TRAIN = _UD_210_HOME + "UD_Belarusian-HSE/be_hse-ud-train.conllu"
"UD_210 train set of BELARUSIAN_HSE."
UD_210_BELARUSIAN_HSE_DEV = _UD_210_HOME + "UD_Belarusian-HSE/be_hse-ud-dev.conllu"
"UD_210 dev set of BELARUSIAN_HSE."
UD_210_BELARUSIAN_HSE_TEST = _UD_210_HOME + "UD_Belarusian-HSE/be_hse-ud-test.conllu"
"UD_210 test set of BELARUSIAN_HSE."
UD_210_BENGALI_BRU_TEST = _UD_210_HOME + "UD_Bengali-BRU/bn_bru-ud-test.conllu"
"UD_210 test set of BENGALI_BRU."
UD_210_BHOJPURI_BHTB_TEST = _UD_210_HOME + "UD_Bhojpuri-BHTB/bho_bhtb-ud-test.conllu"
"UD_210 test set of BHOJPURI_BHTB."
UD_210_BRETON_KEB_TEST = _UD_210_HOME + "UD_Breton-KEB/br_keb-ud-test.conllu"
"UD_210 test set of BRETON_KEB."
UD_210_BULGARIAN_BTB_TRAIN = _UD_210_HOME + "UD_Bulgarian-BTB/bg_btb-ud-train.conllu"
"UD_210 train set of BULGARIAN_BTB."
UD_210_BULGARIAN_BTB_DEV = _UD_210_HOME + "UD_Bulgarian-BTB/bg_btb-ud-dev.conllu"
"UD_210 dev set of BULGARIAN_BTB."
UD_210_BULGARIAN_BTB_TEST = _UD_210_HOME + "UD_Bulgarian-BTB/bg_btb-ud-test.conllu"
"UD_210 test set of BULGARIAN_BTB."
UD_210_BURYAT_BDT_TRAIN = _UD_210_HOME + "UD_Buryat-BDT/bxr_bdt-ud-train.conllu"
"UD_210 train set of BURYAT_BDT."
UD_210_BURYAT_BDT_TEST = _UD_210_HOME + "UD_Buryat-BDT/bxr_bdt-ud-test.conllu"
"UD_210 test set of BURYAT_BDT."
UD_210_CANTONESE_HK_TEST = _UD_210_HOME + "UD_Cantonese-HK/yue_hk-ud-test.conllu"
"UD_210 test set of CANTONESE_HK."
UD_210_CATALAN_ANCORA_TRAIN = _UD_210_HOME + "UD_Catalan-AnCora/ca_ancora-ud-train.conllu"
"UD_210 train set of CATALAN_ANCORA."
UD_210_CATALAN_ANCORA_DEV = _UD_210_HOME + "UD_Catalan-AnCora/ca_ancora-ud-dev.conllu"
"UD_210 dev set of CATALAN_ANCORA."
UD_210_CATALAN_ANCORA_TEST = _UD_210_HOME + "UD_Catalan-AnCora/ca_ancora-ud-test.conllu"
"UD_210 test set of CATALAN_ANCORA."
UD_210_CEBUANO_GJA_TEST = _UD_210_HOME + "UD_Cebuano-GJA/ceb_gja-ud-test.conllu"
"UD_210 test set of CEBUANO_GJA."
UD_210_CHINESE_CFL_TEST = _UD_210_HOME + "UD_Chinese-CFL/zh_cfl-ud-test.conllu"
"UD_210 test set of CHINESE_CFL."
UD_210_CHINESE_GSD_TRAIN = _UD_210_HOME + "UD_Chinese-GSD/zh_gsd-ud-train.conllu"
"UD_210 train set of CHINESE_GSD."
UD_210_CHINESE_GSD_DEV = _UD_210_HOME + "UD_Chinese-GSD/zh_gsd-ud-dev.conllu"
"UD_210 dev set of CHINESE_GSD."
UD_210_CHINESE_GSD_TEST = _UD_210_HOME + "UD_Chinese-GSD/zh_gsd-ud-test.conllu"
"UD_210 test set of CHINESE_GSD."
UD_210_CHINESE_GSDSIMP_TRAIN = _UD_210_HOME + "UD_Chinese-GSDSimp/zh_gsdsimp-ud-train.conllu"
"UD_210 train set of CHINESE_GSDSIMP."
UD_210_CHINESE_GSDSIMP_DEV = _UD_210_HOME + "UD_Chinese-GSDSimp/zh_gsdsimp-ud-dev.conllu"
"UD_210 dev set of CHINESE_GSDSIMP."
UD_210_CHINESE_GSDSIMP_TEST = _UD_210_HOME + "UD_Chinese-GSDSimp/zh_gsdsimp-ud-test.conllu"
"UD_210 test set of CHINESE_GSDSIMP."
UD_210_CHINESE_HK_TEST = _UD_210_HOME + "UD_Chinese-HK/zh_hk-ud-test.conllu"
"UD_210 test set of CHINESE_HK."
UD_210_CHINESE_PUD_TEST = _UD_210_HOME + "UD_Chinese-PUD/zh_pud-ud-test.conllu"
"UD_210 test set of CHINESE_PUD."
UD_210_CHUKCHI_HSE_TEST = _UD_210_HOME + "UD_Chukchi-HSE/ckt_hse-ud-test.conllu"
"UD_210 test set of CHUKCHI_HSE."
UD_210_CLASSICAL_CHINESE_KYOTO_TRAIN = _UD_210_HOME + "UD_Classical_Chinese-Kyoto/lzh_kyoto-ud-train.conllu"
"UD_210 train set of CLASSICAL_CHINESE_KYOTO."
UD_210_CLASSICAL_CHINESE_KYOTO_DEV = _UD_210_HOME + "UD_Classical_Chinese-Kyoto/lzh_kyoto-ud-dev.conllu"
"UD_210 dev set of CLASSICAL_CHINESE_KYOTO."
UD_210_CLASSICAL_CHINESE_KYOTO_TEST = _UD_210_HOME + "UD_Classical_Chinese-Kyoto/lzh_kyoto-ud-test.conllu"
"UD_210 test set of CLASSICAL_CHINESE_KYOTO."
UD_210_COPTIC_SCRIPTORIUM_TRAIN = _UD_210_HOME + "UD_Coptic-Scriptorium/cop_scriptorium-ud-train.conllu"
"UD_210 train set of COPTIC_SCRIPTORIUM."
UD_210_COPTIC_SCRIPTORIUM_DEV = _UD_210_HOME + "UD_Coptic-Scriptorium/cop_scriptorium-ud-dev.conllu"
"UD_210 dev set of COPTIC_SCRIPTORIUM."
UD_210_COPTIC_SCRIPTORIUM_TEST = _UD_210_HOME + "UD_Coptic-Scriptorium/cop_scriptorium-ud-test.conllu"
"UD_210 test set of COPTIC_SCRIPTORIUM."
UD_210_CROATIAN_SET_TRAIN = _UD_210_HOME + "UD_Croatian-SET/hr_set-ud-train.conllu"
"UD_210 train set of CROATIAN_SET."
UD_210_CROATIAN_SET_DEV = _UD_210_HOME + "UD_Croatian-SET/hr_set-ud-dev.conllu"
"UD_210 dev set of CROATIAN_SET."
UD_210_CROATIAN_SET_TEST = _UD_210_HOME + "UD_Croatian-SET/hr_set-ud-test.conllu"
"UD_210 test set of CROATIAN_SET."
UD_210_CZECH_CAC_TRAIN = _UD_210_HOME + "UD_Czech-CAC/cs_cac-ud-train.conllu"
"UD_210 train set of CZECH_CAC."
UD_210_CZECH_CAC_DEV = _UD_210_HOME + "UD_Czech-CAC/cs_cac-ud-dev.conllu"
"UD_210 dev set of CZECH_CAC."
UD_210_CZECH_CAC_TEST = _UD_210_HOME + "UD_Czech-CAC/cs_cac-ud-test.conllu"
"UD_210 test set of CZECH_CAC."
UD_210_CZECH_CLTT_TRAIN = _UD_210_HOME + "UD_Czech-CLTT/cs_cltt-ud-train.conllu"
"UD_210 train set of CZECH_CLTT."
UD_210_CZECH_CLTT_DEV = _UD_210_HOME + "UD_Czech-CLTT/cs_cltt-ud-dev.conllu"
"UD_210 dev set of CZECH_CLTT."
UD_210_CZECH_CLTT_TEST = _UD_210_HOME + "UD_Czech-CLTT/cs_cltt-ud-test.conllu"
"UD_210 test set of CZECH_CLTT."
UD_210_CZECH_FICTREE_TRAIN = _UD_210_HOME + "UD_Czech-FicTree/cs_fictree-ud-train.conllu"
"UD_210 train set of CZECH_FICTREE."
UD_210_CZECH_FICTREE_DEV = _UD_210_HOME + "UD_Czech-FicTree/cs_fictree-ud-dev.conllu"
"UD_210 dev set of CZECH_FICTREE."
UD_210_CZECH_FICTREE_TEST = _UD_210_HOME + "UD_Czech-FicTree/cs_fictree-ud-test.conllu"
"UD_210 test set of CZECH_FICTREE."
UD_210_CZECH_PDT_TRAIN = _UD_210_HOME + "UD_Czech-PDT/cs_pdt-ud-train.conllu"
"UD_210 train set of CZECH_PDT."
UD_210_CZECH_PDT_DEV = _UD_210_HOME + "UD_Czech-PDT/cs_pdt-ud-dev.conllu"
"UD_210 dev set of CZECH_PDT."
UD_210_CZECH_PDT_TEST = _UD_210_HOME + "UD_Czech-PDT/cs_pdt-ud-test.conllu"
"UD_210 test set of CZECH_PDT."
UD_210_CZECH_PUD_TEST = _UD_210_HOME + "UD_Czech-PUD/cs_pud-ud-test.conllu"
"UD_210 test set of CZECH_PUD."
UD_210_DANISH_DDT_TRAIN = _UD_210_HOME + "UD_Danish-DDT/da_ddt-ud-train.conllu"
"UD_210 train set of DANISH_DDT."
UD_210_DANISH_DDT_DEV = _UD_210_HOME + "UD_Danish-DDT/da_ddt-ud-dev.conllu"
"UD_210 dev set of DANISH_DDT."
UD_210_DANISH_DDT_TEST = _UD_210_HOME + "UD_Danish-DDT/da_ddt-ud-test.conllu"
"UD_210 test set of DANISH_DDT."
UD_210_DUTCH_ALPINO_TRAIN = _UD_210_HOME + "UD_Dutch-Alpino/nl_alpino-ud-train.conllu"
"UD_210 train set of DUTCH_ALPINO."
UD_210_DUTCH_ALPINO_DEV = _UD_210_HOME + "UD_Dutch-Alpino/nl_alpino-ud-dev.conllu"
"UD_210 dev set of DUTCH_ALPINO."
UD_210_DUTCH_ALPINO_TEST = _UD_210_HOME + "UD_Dutch-Alpino/nl_alpino-ud-test.conllu"
"UD_210 test set of DUTCH_ALPINO."
UD_210_DUTCH_LASSYSMALL_TRAIN = _UD_210_HOME + "UD_Dutch-LassySmall/nl_lassysmall-ud-train.conllu"
"UD_210 train set of DUTCH_LASSYSMALL."
UD_210_DUTCH_LASSYSMALL_DEV = _UD_210_HOME + "UD_Dutch-LassySmall/nl_lassysmall-ud-dev.conllu"
"UD_210 dev set of DUTCH_LASSYSMALL."
UD_210_DUTCH_LASSYSMALL_TEST = _UD_210_HOME + "UD_Dutch-LassySmall/nl_lassysmall-ud-test.conllu"
"UD_210 test set of DUTCH_LASSYSMALL."
UD_210_ENGLISH_ATIS_TRAIN = _UD_210_HOME + "UD_English-Atis/en_atis-ud-train.conllu"
"UD_210 train set of ENGLISH_ATIS."
UD_210_ENGLISH_ATIS_DEV = _UD_210_HOME + "UD_English-Atis/en_atis-ud-dev.conllu"
"UD_210 dev set of ENGLISH_ATIS."
UD_210_ENGLISH_ATIS_TEST = _UD_210_HOME + "UD_English-Atis/en_atis-ud-test.conllu"
"UD_210 test set of ENGLISH_ATIS."
UD_210_ENGLISH_ESL_TRAIN = _UD_210_HOME + "UD_English-ESL/en_esl-ud-train.conllu"
"UD_210 train set of ENGLISH_ESL."
UD_210_ENGLISH_ESL_DEV = _UD_210_HOME + "UD_English-ESL/en_esl-ud-dev.conllu"
"UD_210 dev set of ENGLISH_ESL."
UD_210_ENGLISH_ESL_TEST = _UD_210_HOME + "UD_English-ESL/en_esl-ud-test.conllu"
"UD_210 test set of ENGLISH_ESL."
UD_210_ENGLISH_EWT_TRAIN = _UD_210_HOME + "UD_English-EWT/en_ewt-ud-train.conllu"
"UD_210 train set of ENGLISH_EWT."
UD_210_ENGLISH_EWT_DEV = _UD_210_HOME + "UD_English-EWT/en_ewt-ud-dev.conllu"
"UD_210 dev set of ENGLISH_EWT."
UD_210_ENGLISH_EWT_TEST = _UD_210_HOME + "UD_English-EWT/en_ewt-ud-test.conllu"
"UD_210 test set of ENGLISH_EWT."
UD_210_ENGLISH_GUM_TRAIN = _UD_210_HOME + "UD_English-GUM/en_gum-ud-train.conllu"
"UD_210 train set of ENGLISH_GUM."
UD_210_ENGLISH_GUM_DEV = _UD_210_HOME + "UD_English-GUM/en_gum-ud-dev.conllu"
"UD_210 dev set of ENGLISH_GUM."
UD_210_ENGLISH_GUM_TEST = _UD_210_HOME + "UD_English-GUM/en_gum-ud-test.conllu"
"UD_210 test set of ENGLISH_GUM."
UD_210_ENGLISH_GUMREDDIT_TRAIN = _UD_210_HOME + "UD_English-GUMReddit/en_gumreddit-ud-train.conllu"
"UD_210 train set of ENGLISH_GUMREDDIT."
UD_210_ENGLISH_GUMREDDIT_DEV = _UD_210_HOME + "UD_English-GUMReddit/en_gumreddit-ud-dev.conllu"
"UD_210 dev set of ENGLISH_GUMREDDIT."
UD_210_ENGLISH_GUMREDDIT_TEST = _UD_210_HOME + "UD_English-GUMReddit/en_gumreddit-ud-test.conllu"
"UD_210 test set of ENGLISH_GUMREDDIT."
UD_210_ENGLISH_LINES_TRAIN = _UD_210_HOME + "UD_English-LinES/en_lines-ud-train.conllu"
"UD_210 train set of ENGLISH_LINES."
UD_210_ENGLISH_LINES_DEV = _UD_210_HOME + "UD_English-LinES/en_lines-ud-dev.conllu"
"UD_210 dev set of ENGLISH_LINES."
UD_210_ENGLISH_LINES_TEST = _UD_210_HOME + "UD_English-LinES/en_lines-ud-test.conllu"
"UD_210 test set of ENGLISH_LINES."
UD_210_ENGLISH_PUD_TEST = _UD_210_HOME + "UD_English-PUD/en_pud-ud-test.conllu"
"UD_210 test set of ENGLISH_PUD."
UD_210_ENGLISH_PARTUT_TRAIN = _UD_210_HOME + "UD_English-ParTUT/en_partut-ud-train.conllu"
"UD_210 train set of ENGLISH_PARTUT."
UD_210_ENGLISH_PARTUT_DEV = _UD_210_HOME + "UD_English-ParTUT/en_partut-ud-dev.conllu"
"UD_210 dev set of ENGLISH_PARTUT."
UD_210_ENGLISH_PARTUT_TEST = _UD_210_HOME + "UD_English-ParTUT/en_partut-ud-test.conllu"
"UD_210 test set of ENGLISH_PARTUT."
UD_210_ENGLISH_PRONOUNS_TEST = _UD_210_HOME + "UD_English-Pronouns/en_pronouns-ud-test.conllu"
"UD_210 test set of ENGLISH_PRONOUNS."
UD_210_ERZYA_JR_TEST = _UD_210_HOME + "UD_Erzya-JR/myv_jr-ud-test.conllu"
"UD_210 test set of ERZYA_JR."
UD_210_ESTONIAN_EDT_TRAIN = _UD_210_HOME + "UD_Estonian-EDT/et_edt-ud-train.conllu"
"UD_210 train set of ESTONIAN_EDT."
UD_210_ESTONIAN_EDT_DEV = _UD_210_HOME + "UD_Estonian-EDT/et_edt-ud-dev.conllu"
"UD_210 dev set of ESTONIAN_EDT."
UD_210_ESTONIAN_EDT_TEST = _UD_210_HOME + "UD_Estonian-EDT/et_edt-ud-test.conllu"
"UD_210 test set of ESTONIAN_EDT."
UD_210_ESTONIAN_EWT_TRAIN = _UD_210_HOME + "UD_Estonian-EWT/et_ewt-ud-train.conllu"
"UD_210 train set of ESTONIAN_EWT."
UD_210_ESTONIAN_EWT_DEV = _UD_210_HOME + "UD_Estonian-EWT/et_ewt-ud-dev.conllu"
"UD_210 dev set of ESTONIAN_EWT."
UD_210_ESTONIAN_EWT_TEST = _UD_210_HOME + "UD_Estonian-EWT/et_ewt-ud-test.conllu"
"UD_210 test set of ESTONIAN_EWT."
UD_210_FAROESE_FARPAHC_TRAIN = _UD_210_HOME + "UD_Faroese-FarPaHC/fo_farpahc-ud-train.conllu"
"UD_210 train set of FAROESE_FARPAHC."
UD_210_FAROESE_FARPAHC_DEV = _UD_210_HOME + "UD_Faroese-FarPaHC/fo_farpahc-ud-dev.conllu"
"UD_210 dev set of FAROESE_FARPAHC."
UD_210_FAROESE_FARPAHC_TEST = _UD_210_HOME + "UD_Faroese-FarPaHC/fo_farpahc-ud-test.conllu"
"UD_210 test set of FAROESE_FARPAHC."
UD_210_FAROESE_OFT_TEST = _UD_210_HOME + "UD_Faroese-OFT/fo_oft-ud-test.conllu"
"UD_210 test set of FAROESE_OFT."
UD_210_FINNISH_FTB_TRAIN = _UD_210_HOME + "UD_Finnish-FTB/fi_ftb-ud-train.conllu"
"UD_210 train set of FINNISH_FTB."
UD_210_FINNISH_FTB_DEV = _UD_210_HOME + "UD_Finnish-FTB/fi_ftb-ud-dev.conllu"
"UD_210 dev set of FINNISH_FTB."
UD_210_FINNISH_FTB_TEST = _UD_210_HOME + "UD_Finnish-FTB/fi_ftb-ud-test.conllu"
"UD_210 test set of FINNISH_FTB."
UD_210_FINNISH_OOD_TEST = _UD_210_HOME + "UD_Finnish-OOD/fi_ood-ud-test.conllu"
"UD_210 test set of FINNISH_OOD."
UD_210_FINNISH_PUD_TEST = _UD_210_HOME + "UD_Finnish-PUD/fi_pud-ud-test.conllu"
"UD_210 test set of FINNISH_PUD."
UD_210_FINNISH_TDT_TRAIN = _UD_210_HOME + "UD_Finnish-TDT/fi_tdt-ud-train.conllu"
"UD_210 train set of FINNISH_TDT."
UD_210_FINNISH_TDT_DEV = _UD_210_HOME + "UD_Finnish-TDT/fi_tdt-ud-dev.conllu"
"UD_210 dev set of FINNISH_TDT."
UD_210_FINNISH_TDT_TEST = _UD_210_HOME + "UD_Finnish-TDT/fi_tdt-ud-test.conllu"
"UD_210 test set of FINNISH_TDT."
UD_210_FRENCH_FQB_TEST = _UD_210_HOME + "UD_French-FQB/fr_fqb-ud-test.conllu"
"UD_210 test set of FRENCH_FQB."
UD_210_FRENCH_FTB_TRAIN = _UD_210_HOME + "UD_French-FTB/fr_ftb-ud-train.conllu"
"UD_210 train set of FRENCH_FTB."
UD_210_FRENCH_FTB_DEV = _UD_210_HOME + "UD_French-FTB/fr_ftb-ud-dev.conllu"
"UD_210 dev set of FRENCH_FTB."
UD_210_FRENCH_FTB_TEST = _UD_210_HOME + "UD_French-FTB/fr_ftb-ud-test.conllu"
"UD_210 test set of FRENCH_FTB."
UD_210_FRENCH_GSD_TRAIN = _UD_210_HOME + "UD_French-GSD/fr_gsd-ud-train.conllu"
"UD_210 train set of FRENCH_GSD."
UD_210_FRENCH_GSD_DEV = _UD_210_HOME + "UD_French-GSD/fr_gsd-ud-dev.conllu"
"UD_210 dev set of FRENCH_GSD."
UD_210_FRENCH_GSD_TEST = _UD_210_HOME + "UD_French-GSD/fr_gsd-ud-test.conllu"
"UD_210 test set of FRENCH_GSD."
UD_210_FRENCH_PUD_TEST = _UD_210_HOME + "UD_French-PUD/fr_pud-ud-test.conllu"
"UD_210 test set of FRENCH_PUD."
UD_210_FRENCH_PARTUT_TRAIN = _UD_210_HOME + "UD_French-ParTUT/fr_partut-ud-train.conllu"
"UD_210 train set of FRENCH_PARTUT."
UD_210_FRENCH_PARTUT_DEV = _UD_210_HOME + "UD_French-ParTUT/fr_partut-ud-dev.conllu"
"UD_210 dev set of FRENCH_PARTUT."
UD_210_FRENCH_PARTUT_TEST = _UD_210_HOME + "UD_French-ParTUT/fr_partut-ud-test.conllu"
"UD_210 test set of FRENCH_PARTUT."
UD_210_FRENCH_PARISSTORIES_TRAIN = _UD_210_HOME + "UD_French-ParisStories/fr_parisstories-ud-train.conllu"
"UD_210 train set of FRENCH_PARISSTORIES."
UD_210_FRENCH_PARISSTORIES_TEST = _UD_210_HOME + "UD_French-ParisStories/fr_parisstories-ud-test.conllu"
"UD_210 test set of FRENCH_PARISSTORIES."
UD_210_FRENCH_RHAPSODIE_TRAIN = _UD_210_HOME + "UD_French-Rhapsodie/fr_rhapsodie-ud-train.conllu"
"UD_210 train set of FRENCH_RHAPSODIE."
UD_210_FRENCH_RHAPSODIE_DEV = _UD_210_HOME + "UD_French-Rhapsodie/fr_rhapsodie-ud-dev.conllu"
"UD_210 dev set of FRENCH_RHAPSODIE."
UD_210_FRENCH_RHAPSODIE_TEST = _UD_210_HOME + "UD_French-Rhapsodie/fr_rhapsodie-ud-test.conllu"
"UD_210 test set of FRENCH_RHAPSODIE."
UD_210_FRENCH_SEQUOIA_TRAIN = _UD_210_HOME + "UD_French-Sequoia/fr_sequoia-ud-train.conllu"
"UD_210 train set of FRENCH_SEQUOIA."
UD_210_FRENCH_SEQUOIA_DEV = _UD_210_HOME + "UD_French-Sequoia/fr_sequoia-ud-dev.conllu"
"UD_210 dev set of FRENCH_SEQUOIA."
UD_210_FRENCH_SEQUOIA_TEST = _UD_210_HOME + "UD_French-Sequoia/fr_sequoia-ud-test.conllu"
"UD_210 test set of FRENCH_SEQUOIA."
UD_210_FRISIAN_DUTCH_FAME_TEST = _UD_210_HOME + "UD_Frisian_Dutch-Fame/qfn_fame-ud-test.conllu"
"UD_210 test set of FRISIAN_DUTCH_FAME."
UD_210_GALICIAN_CTG_TRAIN = _UD_210_HOME + "UD_Galician-CTG/gl_ctg-ud-train.conllu"
"UD_210 train set of GALICIAN_CTG."
UD_210_GALICIAN_CTG_DEV = _UD_210_HOME + "UD_Galician-CTG/gl_ctg-ud-dev.conllu"
"UD_210 dev set of GALICIAN_CTG."
UD_210_GALICIAN_CTG_TEST = _UD_210_HOME + "UD_Galician-CTG/gl_ctg-ud-test.conllu"
"UD_210 test set of GALICIAN_CTG."
UD_210_GALICIAN_TREEGAL_TRAIN = _UD_210_HOME + "UD_Galician-TreeGal/gl_treegal-ud-train.conllu"
"UD_210 train set of GALICIAN_TREEGAL."
UD_210_GALICIAN_TREEGAL_TEST = _UD_210_HOME + "UD_Galician-TreeGal/gl_treegal-ud-test.conllu"
"UD_210 test set of GALICIAN_TREEGAL."
UD_210_GERMAN_GSD_TRAIN = _UD_210_HOME + "UD_German-GSD/de_gsd-ud-train.conllu"
"UD_210 train set of GERMAN_GSD."
UD_210_GERMAN_GSD_DEV = _UD_210_HOME + "UD_German-GSD/de_gsd-ud-dev.conllu"
"UD_210 dev set of GERMAN_GSD."
UD_210_GERMAN_GSD_TEST = _UD_210_HOME + "UD_German-GSD/de_gsd-ud-test.conllu"
"UD_210 test set of GERMAN_GSD."
UD_210_GERMAN_HDT_TRAIN = _UD_210_HOME + "UD_German-HDT/de_hdt-ud-train.conllu"
"UD_210 train set of GERMAN_HDT."
UD_210_GERMAN_HDT_DEV = _UD_210_HOME + "UD_German-HDT/de_hdt-ud-dev.conllu"
"UD_210 dev set of GERMAN_HDT."
UD_210_GERMAN_HDT_TEST = _UD_210_HOME + "UD_German-HDT/de_hdt-ud-test.conllu"
"UD_210 test set of GERMAN_HDT."
UD_210_GERMAN_LIT_TEST = _UD_210_HOME + "UD_German-LIT/de_lit-ud-test.conllu"
"UD_210 test set of GERMAN_LIT."
UD_210_GERMAN_PUD_TEST = _UD_210_HOME + "UD_German-PUD/de_pud-ud-test.conllu"
"UD_210 test set of GERMAN_PUD."
UD_210_GOTHIC_PROIEL_TRAIN = _UD_210_HOME + "UD_Gothic-PROIEL/got_proiel-ud-train.conllu"
"UD_210 train set of GOTHIC_PROIEL."
UD_210_GOTHIC_PROIEL_DEV = _UD_210_HOME + "UD_Gothic-PROIEL/got_proiel-ud-dev.conllu"
"UD_210 dev set of GOTHIC_PROIEL."
UD_210_GOTHIC_PROIEL_TEST = _UD_210_HOME + "UD_Gothic-PROIEL/got_proiel-ud-test.conllu"
"UD_210 test set of GOTHIC_PROIEL."
UD_210_GREEK_GDT_TRAIN = _UD_210_HOME + "UD_Greek-GDT/el_gdt-ud-train.conllu"
"UD_210 train set of GREEK_GDT."
UD_210_GREEK_GDT_DEV = _UD_210_HOME + "UD_Greek-GDT/el_gdt-ud-dev.conllu"
"UD_210 dev set of GREEK_GDT."
UD_210_GREEK_GDT_TEST = _UD_210_HOME + "UD_Greek-GDT/el_gdt-ud-test.conllu"
"UD_210 test set of GREEK_GDT."
UD_210_GUAJAJARA_TUDET_TEST = _UD_210_HOME + "UD_Guajajara-TuDeT/gub_tudet-ud-test.conllu"
"UD_210 test set of GUAJAJARA_TUDET."
UD_210_GUARANI_OLDTUDET_TEST = _UD_210_HOME + "UD_Guarani-OldTuDeT/gn_oldtudet-ud-test.conllu"
"UD_210 test set of GUARANI_OLDTUDET."
UD_210_HEBREW_HTB_TRAIN = _UD_210_HOME + "UD_Hebrew-HTB/he_htb-ud-train.conllu"
"UD_210 train set of HEBREW_HTB."
UD_210_HEBREW_HTB_DEV = _UD_210_HOME + "UD_Hebrew-HTB/he_htb-ud-dev.conllu"
"UD_210 dev set of HEBREW_HTB."
UD_210_HEBREW_HTB_TEST = _UD_210_HOME + "UD_Hebrew-HTB/he_htb-ud-test.conllu"
"UD_210 test set of HEBREW_HTB."
UD_210_HEBREW_IAHLTWIKI_TRAIN = _UD_210_HOME + "UD_Hebrew-IAHLTwiki/he_iahltwiki-ud-train.conllu"
"UD_210 train set of HEBREW_IAHLTWIKI."
UD_210_HEBREW_IAHLTWIKI_DEV = _UD_210_HOME + "UD_Hebrew-IAHLTwiki/he_iahltwiki-ud-dev.conllu"
"UD_210 dev set of HEBREW_IAHLTWIKI."
UD_210_HEBREW_IAHLTWIKI_TEST = _UD_210_HOME + "UD_Hebrew-IAHLTwiki/he_iahltwiki-ud-test.conllu"
"UD_210 test set of HEBREW_IAHLTWIKI."
UD_210_HINDI_HDTB_TRAIN = _UD_210_HOME + "UD_Hindi-HDTB/hi_hdtb-ud-train.conllu"
"UD_210 train set of HINDI_HDTB."
UD_210_HINDI_HDTB_DEV = _UD_210_HOME + "UD_Hindi-HDTB/hi_hdtb-ud-dev.conllu"
"UD_210 dev set of HINDI_HDTB."
UD_210_HINDI_HDTB_TEST = _UD_210_HOME + "UD_Hindi-HDTB/hi_hdtb-ud-test.conllu"
"UD_210 test set of HINDI_HDTB."
UD_210_HINDI_PUD_TEST = _UD_210_HOME + "UD_Hindi-PUD/hi_pud-ud-test.conllu"
"UD_210 test set of HINDI_PUD."
UD_210_HINDI_ENGLISH_HIENCS_TRAIN = _UD_210_HOME + "UD_Hindi_English-HIENCS/qhe_hiencs-ud-train.conllu"
"UD_210 train set of HINDI_ENGLISH_HIENCS."
UD_210_HINDI_ENGLISH_HIENCS_DEV = _UD_210_HOME + "UD_Hindi_English-HIENCS/qhe_hiencs-ud-dev.conllu"
"UD_210 dev set of HINDI_ENGLISH_HIENCS."
UD_210_HINDI_ENGLISH_HIENCS_TEST = _UD_210_HOME + "UD_Hindi_English-HIENCS/qhe_hiencs-ud-test.conllu"
"UD_210 test set of HINDI_ENGLISH_HIENCS."
UD_210_HITTITE_HITTB_TEST = _UD_210_HOME + "UD_Hittite-HitTB/hit_hittb-ud-test.conllu"
"UD_210 test set of HITTITE_HITTB."
UD_210_HUNGARIAN_SZEGED_TRAIN = _UD_210_HOME + "UD_Hungarian-Szeged/hu_szeged-ud-train.conllu"
"UD_210 train set of HUNGARIAN_SZEGED."
UD_210_HUNGARIAN_SZEGED_DEV = _UD_210_HOME + "UD_Hungarian-Szeged/hu_szeged-ud-dev.conllu"
"UD_210 dev set of HUNGARIAN_SZEGED."
UD_210_HUNGARIAN_SZEGED_TEST = _UD_210_HOME + "UD_Hungarian-Szeged/hu_szeged-ud-test.conllu"
"UD_210 test set of HUNGARIAN_SZEGED."
UD_210_ICELANDIC_ICEPAHC_TRAIN = _UD_210_HOME + "UD_Icelandic-IcePaHC/is_icepahc-ud-train.conllu"
"UD_210 train set of ICELANDIC_ICEPAHC."
UD_210_ICELANDIC_ICEPAHC_DEV = _UD_210_HOME + "UD_Icelandic-IcePaHC/is_icepahc-ud-dev.conllu"
"UD_210 dev set of ICELANDIC_ICEPAHC."
UD_210_ICELANDIC_ICEPAHC_TEST = _UD_210_HOME + "UD_Icelandic-IcePaHC/is_icepahc-ud-test.conllu"
"UD_210 test set of ICELANDIC_ICEPAHC."
UD_210_ICELANDIC_MODERN_TRAIN = _UD_210_HOME + "UD_Icelandic-Modern/is_modern-ud-train.conllu"
"UD_210 train set of ICELANDIC_MODERN."
UD_210_ICELANDIC_MODERN_DEV = _UD_210_HOME + "UD_Icelandic-Modern/is_modern-ud-dev.conllu"
"UD_210 dev set of ICELANDIC_MODERN."
UD_210_ICELANDIC_MODERN_TEST = _UD_210_HOME + "UD_Icelandic-Modern/is_modern-ud-test.conllu"
"UD_210 test set of ICELANDIC_MODERN."
UD_210_ICELANDIC_PUD_TEST = _UD_210_HOME + "UD_Icelandic-PUD/is_pud-ud-test.conllu"
"UD_210 test set of ICELANDIC_PUD."
UD_210_INDONESIAN_CSUI_TRAIN = _UD_210_HOME + "UD_Indonesian-CSUI/id_csui-ud-train.conllu"
"UD_210 train set of INDONESIAN_CSUI."
UD_210_INDONESIAN_CSUI_TEST = _UD_210_HOME + "UD_Indonesian-CSUI/id_csui-ud-test.conllu"
"UD_210 test set of INDONESIAN_CSUI."
UD_210_INDONESIAN_GSD_TRAIN = _UD_210_HOME + "UD_Indonesian-GSD/id_gsd-ud-train.conllu"
"UD_210 train set of INDONESIAN_GSD."
UD_210_INDONESIAN_GSD_DEV = _UD_210_HOME + "UD_Indonesian-GSD/id_gsd-ud-dev.conllu"
"UD_210 dev set of INDONESIAN_GSD."
UD_210_INDONESIAN_GSD_TEST = _UD_210_HOME + "UD_Indonesian-GSD/id_gsd-ud-test.conllu"
"UD_210 test set of INDONESIAN_GSD."
UD_210_INDONESIAN_PUD_TEST = _UD_210_HOME + "UD_Indonesian-PUD/id_pud-ud-test.conllu"
"UD_210 test set of INDONESIAN_PUD."
UD_210_IRISH_IDT_TRAIN = _UD_210_HOME + "UD_Irish-IDT/ga_idt-ud-train.conllu"
"UD_210 train set of IRISH_IDT."
UD_210_IRISH_IDT_DEV = _UD_210_HOME + "UD_Irish-IDT/ga_idt-ud-dev.conllu"
"UD_210 dev set of IRISH_IDT."
UD_210_IRISH_IDT_TEST = _UD_210_HOME + "UD_Irish-IDT/ga_idt-ud-test.conllu"
"UD_210 test set of IRISH_IDT."
UD_210_IRISH_TWITTIRISH_TEST = _UD_210_HOME + "UD_Irish-TwittIrish/ga_twittirish-ud-test.conllu"
"UD_210 test set of IRISH_TWITTIRISH."
UD_210_ITALIAN_ISDT_TRAIN = _UD_210_HOME + "UD_Italian-ISDT/it_isdt-ud-train.conllu"
"UD_210 train set of ITALIAN_ISDT."
UD_210_ITALIAN_ISDT_DEV = _UD_210_HOME + "UD_Italian-ISDT/it_isdt-ud-dev.conllu"
"UD_210 dev set of ITALIAN_ISDT."
UD_210_ITALIAN_ISDT_TEST = _UD_210_HOME + "UD_Italian-ISDT/it_isdt-ud-test.conllu"
"UD_210 test set of ITALIAN_ISDT."
UD_210_ITALIAN_MARKIT_TRAIN = _UD_210_HOME + "UD_Italian-MarkIT/it_markit-ud-train.conllu"
"UD_210 train set of ITALIAN_MARKIT."
UD_210_ITALIAN_MARKIT_DEV = _UD_210_HOME + "UD_Italian-MarkIT/it_markit-ud-dev.conllu"
"UD_210 dev set of ITALIAN_MARKIT."
UD_210_ITALIAN_MARKIT_TEST = _UD_210_HOME + "UD_Italian-MarkIT/it_markit-ud-test.conllu"
"UD_210 test set of ITALIAN_MARKIT."
UD_210_ITALIAN_PUD_TEST = _UD_210_HOME + "UD_Italian-PUD/it_pud-ud-test.conllu"
"UD_210 test set of ITALIAN_PUD."
UD_210_ITALIAN_PARTUT_TRAIN = _UD_210_HOME + "UD_Italian-ParTUT/it_partut-ud-train.conllu"
"UD_210 train set of ITALIAN_PARTUT."
UD_210_ITALIAN_PARTUT_DEV = _UD_210_HOME + "UD_Italian-ParTUT/it_partut-ud-dev.conllu"
"UD_210 dev set of ITALIAN_PARTUT."
UD_210_ITALIAN_PARTUT_TEST = _UD_210_HOME + "UD_Italian-ParTUT/it_partut-ud-test.conllu"
"UD_210 test set of ITALIAN_PARTUT."
UD_210_ITALIAN_POSTWITA_TRAIN = _UD_210_HOME + "UD_Italian-PoSTWITA/it_postwita-ud-train.conllu"
"UD_210 train set of ITALIAN_POSTWITA."
UD_210_ITALIAN_POSTWITA_DEV = _UD_210_HOME + "UD_Italian-PoSTWITA/it_postwita-ud-dev.conllu"
"UD_210 dev set of ITALIAN_POSTWITA."
UD_210_ITALIAN_POSTWITA_TEST = _UD_210_HOME + "UD_Italian-PoSTWITA/it_postwita-ud-test.conllu"
"UD_210 test set of ITALIAN_POSTWITA."
UD_210_ITALIAN_TWITTIRO_TRAIN = _UD_210_HOME + "UD_Italian-TWITTIRO/it_twittiro-ud-train.conllu"
"UD_210 train set of ITALIAN_TWITTIRO."
UD_210_ITALIAN_TWITTIRO_DEV = _UD_210_HOME + "UD_Italian-TWITTIRO/it_twittiro-ud-dev.conllu"
"UD_210 dev set of ITALIAN_TWITTIRO."
UD_210_ITALIAN_TWITTIRO_TEST = _UD_210_HOME + "UD_Italian-TWITTIRO/it_twittiro-ud-test.conllu"
"UD_210 test set of ITALIAN_TWITTIRO."
UD_210_ITALIAN_VIT_TRAIN = _UD_210_HOME + "UD_Italian-VIT/it_vit-ud-train.conllu"
"UD_210 train set of ITALIAN_VIT."
UD_210_ITALIAN_VIT_DEV = _UD_210_HOME + "UD_Italian-VIT/it_vit-ud-dev.conllu"
"UD_210 dev set of ITALIAN_VIT."
UD_210_ITALIAN_VIT_TEST = _UD_210_HOME + "UD_Italian-VIT/it_vit-ud-test.conllu"
"UD_210 test set of ITALIAN_VIT."
UD_210_ITALIAN_VALICO_TEST = _UD_210_HOME + "UD_Italian-Valico/it_valico-ud-test.conllu"
"UD_210 test set of ITALIAN_VALICO."
UD_210_JAPANESE_BCCWJ_TRAIN = _UD_210_HOME + "UD_Japanese-BCCWJ/ja_bccwj-ud-train.conllu"
"UD_210 train set of JAPANESE_BCCWJ."
UD_210_JAPANESE_BCCWJ_DEV = _UD_210_HOME + "UD_Japanese-BCCWJ/ja_bccwj-ud-dev.conllu"
"UD_210 dev set of JAPANESE_BCCWJ."
UD_210_JAPANESE_BCCWJ_TEST = _UD_210_HOME + "UD_Japanese-BCCWJ/ja_bccwj-ud-test.conllu"
"UD_210 test set of JAPANESE_BCCWJ."
UD_210_JAPANESE_BCCWJLUW_TRAIN = _UD_210_HOME + "UD_Japanese-BCCWJLUW/ja_bccwjluw-ud-train.conllu"
"UD_210 train set of JAPANESE_BCCWJLUW."
UD_210_JAPANESE_BCCWJLUW_DEV = _UD_210_HOME + "UD_Japanese-BCCWJLUW/ja_bccwjluw-ud-dev.conllu"
"UD_210 dev set of JAPANESE_BCCWJLUW."
UD_210_JAPANESE_BCCWJLUW_TEST = _UD_210_HOME + "UD_Japanese-BCCWJLUW/ja_bccwjluw-ud-test.conllu"
"UD_210 test set of JAPANESE_BCCWJLUW."
UD_210_JAPANESE_GSD_TRAIN = _UD_210_HOME + "UD_Japanese-GSD/ja_gsd-ud-train.conllu"
"UD_210 train set of JAPANESE_GSD."
UD_210_JAPANESE_GSD_DEV = _UD_210_HOME + "UD_Japanese-GSD/ja_gsd-ud-dev.conllu"
"UD_210 dev set of JAPANESE_GSD."
UD_210_JAPANESE_GSD_TEST = _UD_210_HOME + "UD_Japanese-GSD/ja_gsd-ud-test.conllu"
"UD_210 test set of JAPANESE_GSD."
UD_210_JAPANESE_GSDLUW_TRAIN = _UD_210_HOME + "UD_Japanese-GSDLUW/ja_gsdluw-ud-train.conllu"
"UD_210 train set of JAPANESE_GSDLUW."
UD_210_JAPANESE_GSDLUW_DEV = _UD_210_HOME + "UD_Japanese-GSDLUW/ja_gsdluw-ud-dev.conllu"
"UD_210 dev set of JAPANESE_GSDLUW."
UD_210_JAPANESE_GSDLUW_TEST = _UD_210_HOME + "UD_Japanese-GSDLUW/ja_gsdluw-ud-test.conllu"
"UD_210 test set of JAPANESE_GSDLUW."
UD_210_JAPANESE_MODERN_TEST = _UD_210_HOME + "UD_Japanese-Modern/ja_modern-ud-test.conllu"
"UD_210 test set of JAPANESE_MODERN."
UD_210_JAPANESE_PUD_TEST = _UD_210_HOME + "UD_Japanese-PUD/ja_pud-ud-test.conllu"
"UD_210 test set of JAPANESE_PUD."
UD_210_JAPANESE_PUDLUW_TEST = _UD_210_HOME + "UD_Japanese-PUDLUW/ja_pudluw-ud-test.conllu"
"UD_210 test set of JAPANESE_PUDLUW."
UD_210_JAVANESE_CSUI_TEST = _UD_210_HOME + "UD_Javanese-CSUI/jv_csui-ud-test.conllu"
"UD_210 test set of JAVANESE_CSUI."
UD_210_KAAPOR_TUDET_TEST = _UD_210_HOME + "UD_Kaapor-TuDeT/urb_tudet-ud-test.conllu"
"UD_210 test set of KAAPOR_TUDET."
UD_210_KANGRI_KDTB_TEST = _UD_210_HOME + "UD_Kangri-KDTB/xnr_kdtb-ud-test.conllu"
"UD_210 test set of KANGRI_KDTB."
UD_210_KARELIAN_KKPP_TEST = _UD_210_HOME + "UD_Karelian-KKPP/krl_kkpp-ud-test.conllu"
"UD_210 test set of KARELIAN_KKPP."
UD_210_KARO_TUDET_TEST = _UD_210_HOME + "UD_Karo-TuDeT/arr_tudet-ud-test.conllu"
"UD_210 test set of KARO_TUDET."
UD_210_KAZAKH_KTB_TRAIN = _UD_210_HOME + "UD_Kazakh-KTB/kk_ktb-ud-train.conllu"
"UD_210 train set of KAZAKH_KTB."
UD_210_KAZAKH_KTB_TEST = _UD_210_HOME + "UD_Kazakh-KTB/kk_ktb-ud-test.conllu"
"UD_210 test set of KAZAKH_KTB."
UD_210_KHUNSARI_AHA_TEST = _UD_210_HOME + "UD_Khunsari-AHA/kfm_aha-ud-test.conllu"
"UD_210 test set of KHUNSARI_AHA."
UD_210_KICHE_IU_TEST = _UD_210_HOME + "UD_Kiche-IU/quc_iu-ud-test.conllu"
"UD_210 test set of KICHE_IU."
UD_210_KOMI_PERMYAK_UH_TEST = _UD_210_HOME + "UD_Komi_Permyak-UH/koi_uh-ud-test.conllu"
"UD_210 test set of KOMI_PERMYAK_UH."
UD_210_KOMI_ZYRIAN_IKDP_TEST = _UD_210_HOME + "UD_Komi_Zyrian-IKDP/kpv_ikdp-ud-test.conllu"
"UD_210 test set of KOMI_ZYRIAN_IKDP."
UD_210_KOMI_ZYRIAN_LATTICE_TEST = _UD_210_HOME + "UD_Komi_Zyrian-Lattice/kpv_lattice-ud-test.conllu"
"UD_210 test set of KOMI_ZYRIAN_LATTICE."
UD_210_KOREAN_GSD_TRAIN = _UD_210_HOME + "UD_Korean-GSD/ko_gsd-ud-train.conllu"
"UD_210 train set of KOREAN_GSD."
UD_210_KOREAN_GSD_DEV = _UD_210_HOME + "UD_Korean-GSD/ko_gsd-ud-dev.conllu"
"UD_210 dev set of KOREAN_GSD."
UD_210_KOREAN_GSD_TEST = _UD_210_HOME + "UD_Korean-GSD/ko_gsd-ud-test.conllu"
"UD_210 test set of KOREAN_GSD."
UD_210_KOREAN_KAIST_TRAIN = _UD_210_HOME + "UD_Korean-Kaist/ko_kaist-ud-train.conllu"
"UD_210 train set of KOREAN_KAIST."
UD_210_KOREAN_KAIST_DEV = _UD_210_HOME + "UD_Korean-Kaist/ko_kaist-ud-dev.conllu"
"UD_210 dev set of KOREAN_KAIST."
UD_210_KOREAN_KAIST_TEST = _UD_210_HOME + "UD_Korean-Kaist/ko_kaist-ud-test.conllu"
"UD_210 test set of KOREAN_KAIST."
UD_210_KOREAN_PUD_TEST = _UD_210_HOME + "UD_Korean-PUD/ko_pud-ud-test.conllu"
"UD_210 test set of KOREAN_PUD."
UD_210_KURMANJI_MG_TRAIN = _UD_210_HOME + "UD_Kurmanji-MG/kmr_mg-ud-train.conllu"
"UD_210 train set of KURMANJI_MG."
UD_210_KURMANJI_MG_TEST = _UD_210_HOME + "UD_Kurmanji-MG/kmr_mg-ud-test.conllu"
"UD_210 test set of KURMANJI_MG."
UD_210_LATIN_ITTB_TRAIN = _UD_210_HOME + "UD_Latin-ITTB/la_ittb-ud-train.conllu"
"UD_210 train set of LATIN_ITTB."
UD_210_LATIN_ITTB_DEV = _UD_210_HOME + "UD_Latin-ITTB/la_ittb-ud-dev.conllu"
"UD_210 dev set of LATIN_ITTB."
UD_210_LATIN_ITTB_TEST = _UD_210_HOME + "UD_Latin-ITTB/la_ittb-ud-test.conllu"
"UD_210 test set of LATIN_ITTB."
UD_210_LATIN_LLCT_TRAIN = _UD_210_HOME + "UD_Latin-LLCT/la_llct-ud-train.conllu"
"UD_210 train set of LATIN_LLCT."
UD_210_LATIN_LLCT_DEV = _UD_210_HOME + "UD_Latin-LLCT/la_llct-ud-dev.conllu"
"UD_210 dev set of LATIN_LLCT."
UD_210_LATIN_LLCT_TEST = _UD_210_HOME + "UD_Latin-LLCT/la_llct-ud-test.conllu"
"UD_210 test set of LATIN_LLCT."
UD_210_LATIN_PROIEL_TRAIN = _UD_210_HOME + "UD_Latin-PROIEL/la_proiel-ud-train.conllu"
"UD_210 train set of LATIN_PROIEL."
UD_210_LATIN_PROIEL_DEV = _UD_210_HOME + "UD_Latin-PROIEL/la_proiel-ud-dev.conllu"
"UD_210 dev set of LATIN_PROIEL."
UD_210_LATIN_PROIEL_TEST = _UD_210_HOME + "UD_Latin-PROIEL/la_proiel-ud-test.conllu"
"UD_210 test set of LATIN_PROIEL."
UD_210_LATIN_PERSEUS_TRAIN = _UD_210_HOME + "UD_Latin-Perseus/la_perseus-ud-train.conllu"
"UD_210 train set of LATIN_PERSEUS."
UD_210_LATIN_PERSEUS_TEST = _UD_210_HOME + "UD_Latin-Perseus/la_perseus-ud-test.conllu"
"UD_210 test set of LATIN_PERSEUS."
UD_210_LATIN_UDANTE_TRAIN = _UD_210_HOME + "UD_Latin-UDante/la_udante-ud-train.conllu"
"UD_210 train set of LATIN_UDANTE."
UD_210_LATIN_UDANTE_DEV = _UD_210_HOME + "UD_Latin-UDante/la_udante-ud-dev.conllu"
"UD_210 dev set of LATIN_UDANTE."
UD_210_LATIN_UDANTE_TEST = _UD_210_HOME + "UD_Latin-UDante/la_udante-ud-test.conllu"
"UD_210 test set of LATIN_UDANTE."
UD_210_LATVIAN_LVTB_TRAIN = _UD_210_HOME + "UD_Latvian-LVTB/lv_lvtb-ud-train.conllu"
"UD_210 train set of LATVIAN_LVTB."
UD_210_LATVIAN_LVTB_DEV = _UD_210_HOME + "UD_Latvian-LVTB/lv_lvtb-ud-dev.conllu"
"UD_210 dev set of LATVIAN_LVTB."
UD_210_LATVIAN_LVTB_TEST = _UD_210_HOME + "UD_Latvian-LVTB/lv_lvtb-ud-test.conllu"
"UD_210 test set of LATVIAN_LVTB."
UD_210_LIGURIAN_GLT_TRAIN = _UD_210_HOME + "UD_Ligurian-GLT/lij_glt-ud-train.conllu"
"UD_210 train set of LIGURIAN_GLT."
UD_210_LIGURIAN_GLT_TEST = _UD_210_HOME + "UD_Ligurian-GLT/lij_glt-ud-test.conllu"
"UD_210 test set of LIGURIAN_GLT."
UD_210_LITHUANIAN_ALKSNIS_TRAIN = _UD_210_HOME + "UD_Lithuanian-ALKSNIS/lt_alksnis-ud-train.conllu"
"UD_210 train set of LITHUANIAN_ALKSNIS."
UD_210_LITHUANIAN_ALKSNIS_DEV = _UD_210_HOME + "UD_Lithuanian-ALKSNIS/lt_alksnis-ud-dev.conllu"
"UD_210 dev set of LITHUANIAN_ALKSNIS."
UD_210_LITHUANIAN_ALKSNIS_TEST = _UD_210_HOME + "UD_Lithuanian-ALKSNIS/lt_alksnis-ud-test.conllu"
"UD_210 test set of LITHUANIAN_ALKSNIS."
UD_210_LITHUANIAN_HSE_TRAIN = _UD_210_HOME + "UD_Lithuanian-HSE/lt_hse-ud-train.conllu"
"UD_210 train set of LITHUANIAN_HSE."
UD_210_LITHUANIAN_HSE_DEV = _UD_210_HOME + "UD_Lithuanian-HSE/lt_hse-ud-dev.conllu"
"UD_210 dev set of LITHUANIAN_HSE."
UD_210_LITHUANIAN_HSE_TEST = _UD_210_HOME + "UD_Lithuanian-HSE/lt_hse-ud-test.conllu"
"UD_210 test set of LITHUANIAN_HSE."
UD_210_LIVVI_KKPP_TRAIN = _UD_210_HOME + "UD_Livvi-KKPP/olo_kkpp-ud-train.conllu"
"UD_210 train set of LIVVI_KKPP."
UD_210_LIVVI_KKPP_TEST = _UD_210_HOME + "UD_Livvi-KKPP/olo_kkpp-ud-test.conllu"
"UD_210 test set of LIVVI_KKPP."
UD_210_LOW_SAXON_LSDC_TEST = _UD_210_HOME + "UD_Low_Saxon-LSDC/nds_lsdc-ud-test.conllu"
"UD_210 test set of LOW_SAXON_LSDC."
UD_210_MADI_JARAWARA_TEST = _UD_210_HOME + "UD_Madi-Jarawara/jaa_jarawara-ud-test.conllu"
"UD_210 test set of MADI_JARAWARA."
UD_210_MAKURAP_TUDET_TEST = _UD_210_HOME + "UD_Makurap-TuDeT/mpu_tudet-ud-test.conllu"
"UD_210 test set of MAKURAP_TUDET."
UD_210_MALTESE_MUDT_TRAIN = _UD_210_HOME + "UD_Maltese-MUDT/mt_mudt-ud-train.conllu"
"UD_210 train set of MALTESE_MUDT."
UD_210_MALTESE_MUDT_DEV = _UD_210_HOME + "UD_Maltese-MUDT/mt_mudt-ud-dev.conllu"
"UD_210 dev set of MALTESE_MUDT."
UD_210_MALTESE_MUDT_TEST = _UD_210_HOME + "UD_Maltese-MUDT/mt_mudt-ud-test.conllu"
"UD_210 test set of MALTESE_MUDT."
UD_210_MANX_CADHAN_TEST = _UD_210_HOME + "UD_Manx-Cadhan/gv_cadhan-ud-test.conllu"
"UD_210 test set of MANX_CADHAN."
UD_210_MARATHI_UFAL_TRAIN = _UD_210_HOME + "UD_Marathi-UFAL/mr_ufal-ud-train.conllu"
"UD_210 train set of MARATHI_UFAL."
UD_210_MARATHI_UFAL_DEV = _UD_210_HOME + "UD_Marathi-UFAL/mr_ufal-ud-dev.conllu"
"UD_210 dev set of MARATHI_UFAL."
UD_210_MARATHI_UFAL_TEST = _UD_210_HOME + "UD_Marathi-UFAL/mr_ufal-ud-test.conllu"
"UD_210 test set of MARATHI_UFAL."
UD_210_MBYA_GUARANI_DOOLEY_TEST = _UD_210_HOME + "UD_Mbya_Guarani-Dooley/gun_dooley-ud-test.conllu"
"UD_210 test set of MBYA_GUARANI_DOOLEY."
UD_210_MBYA_GUARANI_THOMAS_TEST = _UD_210_HOME + "UD_Mbya_Guarani-Thomas/gun_thomas-ud-test.conllu"
"UD_210 test set of MBYA_GUARANI_THOMAS."
UD_210_MOKSHA_JR_TEST = _UD_210_HOME + "UD_Moksha-JR/mdf_jr-ud-test.conllu"
"UD_210 test set of MOKSHA_JR."
UD_210_MUNDURUKU_TUDET_TEST = _UD_210_HOME + "UD_Munduruku-TuDeT/myu_tudet-ud-test.conllu"
"UD_210 test set of MUNDURUKU_TUDET."
UD_210_NAIJA_NSC_TRAIN = _UD_210_HOME + "UD_Naija-NSC/pcm_nsc-ud-train.conllu"
"UD_210 train set of NAIJA_NSC."
UD_210_NAIJA_NSC_DEV = _UD_210_HOME + "UD_Naija-NSC/pcm_nsc-ud-dev.conllu"
"UD_210 dev set of NAIJA_NSC."
UD_210_NAIJA_NSC_TEST = _UD_210_HOME + "UD_Naija-NSC/pcm_nsc-ud-test.conllu"
"UD_210 test set of NAIJA_NSC."
UD_210_NAYINI_AHA_TEST = _UD_210_HOME + "UD_Nayini-AHA/nyq_aha-ud-test.conllu"
"UD_210 test set of NAYINI_AHA."
UD_210_NEAPOLITAN_RB_TEST = _UD_210_HOME + "UD_Neapolitan-RB/nap_rb-ud-test.conllu"
"UD_210 test set of NEAPOLITAN_RB."
UD_210_NORTH_SAMI_GIELLA_TRAIN = _UD_210_HOME + "UD_North_Sami-Giella/sme_giella-ud-train.conllu"
"UD_210 train set of NORTH_SAMI_GIELLA."
UD_210_NORTH_SAMI_GIELLA_TEST = _UD_210_HOME + "UD_North_Sami-Giella/sme_giella-ud-test.conllu"
"UD_210 test set of NORTH_SAMI_GIELLA."
UD_210_NORWEGIAN_BOKMAAL_TRAIN = _UD_210_HOME + "UD_Norwegian-Bokmaal/no_bokmaal-ud-train.conllu"
"UD_210 train set of NORWEGIAN_BOKMAAL."
UD_210_NORWEGIAN_BOKMAAL_DEV = _UD_210_HOME + "UD_Norwegian-Bokmaal/no_bokmaal-ud-dev.conllu"
"UD_210 dev set of NORWEGIAN_BOKMAAL."
UD_210_NORWEGIAN_BOKMAAL_TEST = _UD_210_HOME + "UD_Norwegian-Bokmaal/no_bokmaal-ud-test.conllu"
"UD_210 test set of NORWEGIAN_BOKMAAL."
UD_210_NORWEGIAN_NYNORSK_TRAIN = _UD_210_HOME + "UD_Norwegian-Nynorsk/no_nynorsk-ud-train.conllu"
"UD_210 train set of NORWEGIAN_NYNORSK."
UD_210_NORWEGIAN_NYNORSK_DEV = _UD_210_HOME + "UD_Norwegian-Nynorsk/no_nynorsk-ud-dev.conllu"
"UD_210 dev set of NORWEGIAN_NYNORSK."
UD_210_NORWEGIAN_NYNORSK_TEST = _UD_210_HOME + "UD_Norwegian-Nynorsk/no_nynorsk-ud-test.conllu"
"UD_210 test set of NORWEGIAN_NYNORSK."
UD_210_NORWEGIAN_NYNORSKLIA_TRAIN = _UD_210_HOME + "UD_Norwegian-NynorskLIA/no_nynorsklia-ud-train.conllu"
"UD_210 train set of NORWEGIAN_NYNORSKLIA."
UD_210_NORWEGIAN_NYNORSKLIA_DEV = _UD_210_HOME + "UD_Norwegian-NynorskLIA/no_nynorsklia-ud-dev.conllu"
"UD_210 dev set of NORWEGIAN_NYNORSKLIA."
UD_210_NORWEGIAN_NYNORSKLIA_TEST = _UD_210_HOME + "UD_Norwegian-NynorskLIA/no_nynorsklia-ud-test.conllu"
"UD_210 test set of NORWEGIAN_NYNORSKLIA."
UD_210_OLD_CHURCH_SLAVONIC_PROIEL_TRAIN = _UD_210_HOME + "UD_Old_Church_Slavonic-PROIEL/cu_proiel-ud-train.conllu"
"UD_210 train set of OLD_CHURCH_SLAVONIC_PROIEL."
UD_210_OLD_CHURCH_SLAVONIC_PROIEL_DEV = _UD_210_HOME + "UD_Old_Church_Slavonic-PROIEL/cu_proiel-ud-dev.conllu"
"UD_210 dev set of OLD_CHURCH_SLAVONIC_PROIEL."
UD_210_OLD_CHURCH_SLAVONIC_PROIEL_TEST = _UD_210_HOME + "UD_Old_Church_Slavonic-PROIEL/cu_proiel-ud-test.conllu"
"UD_210 test set of OLD_CHURCH_SLAVONIC_PROIEL."
UD_210_OLD_EAST_SLAVIC_BIRCHBARK_TRAIN = _UD_210_HOME + "UD_Old_East_Slavic-Birchbark/orv_birchbark-ud-train.conllu"
"UD_210 train set of OLD_EAST_SLAVIC_BIRCHBARK."
UD_210_OLD_EAST_SLAVIC_BIRCHBARK_DEV = _UD_210_HOME + "UD_Old_East_Slavic-Birchbark/orv_birchbark-ud-dev.conllu"
"UD_210 dev set of OLD_EAST_SLAVIC_BIRCHBARK."
UD_210_OLD_EAST_SLAVIC_BIRCHBARK_TEST = _UD_210_HOME + "UD_Old_East_Slavic-Birchbark/orv_birchbark-ud-test.conllu"
"UD_210 test set of OLD_EAST_SLAVIC_BIRCHBARK."
UD_210_OLD_EAST_SLAVIC_RNC_TRAIN = _UD_210_HOME + "UD_Old_East_Slavic-RNC/orv_rnc-ud-train.conllu"
"UD_210 train set of OLD_EAST_SLAVIC_RNC."
UD_210_OLD_EAST_SLAVIC_RNC_TEST = _UD_210_HOME + "UD_Old_East_Slavic-RNC/orv_rnc-ud-test.conllu"
"UD_210 test set of OLD_EAST_SLAVIC_RNC."
UD_210_OLD_EAST_SLAVIC_TOROT_TRAIN = _UD_210_HOME + "UD_Old_East_Slavic-TOROT/orv_torot-ud-train.conllu"
"UD_210 train set of OLD_EAST_SLAVIC_TOROT."
UD_210_OLD_EAST_SLAVIC_TOROT_DEV = _UD_210_HOME + "UD_Old_East_Slavic-TOROT/orv_torot-ud-dev.conllu"
"UD_210 dev set of OLD_EAST_SLAVIC_TOROT."
UD_210_OLD_EAST_SLAVIC_TOROT_TEST = _UD_210_HOME + "UD_Old_East_Slavic-TOROT/orv_torot-ud-test.conllu"
"UD_210 test set of OLD_EAST_SLAVIC_TOROT."
UD_210_OLD_FRENCH_SRCMF_TRAIN = _UD_210_HOME + "UD_Old_French-SRCMF/fro_srcmf-ud-train.conllu"
"UD_210 train set of OLD_FRENCH_SRCMF."
UD_210_OLD_FRENCH_SRCMF_DEV = _UD_210_HOME + "UD_Old_French-SRCMF/fro_srcmf-ud-dev.conllu"
"UD_210 dev set of OLD_FRENCH_SRCMF."
UD_210_OLD_FRENCH_SRCMF_TEST = _UD_210_HOME + "UD_Old_French-SRCMF/fro_srcmf-ud-test.conllu"
"UD_210 test set of OLD_FRENCH_SRCMF."
UD_210_OLD_TURKISH_TONQQ_TEST = _UD_210_HOME + "UD_Old_Turkish-Tonqq/otk_tonqq-ud-test.conllu"
"UD_210 test set of OLD_TURKISH_TONQQ."
UD_210_PERSIAN_PERDT_TRAIN = _UD_210_HOME + "UD_Persian-PerDT/fa_perdt-ud-train.conllu"
"UD_210 train set of PERSIAN_PERDT."
UD_210_PERSIAN_PERDT_DEV = _UD_210_HOME + "UD_Persian-PerDT/fa_perdt-ud-dev.conllu"
"UD_210 dev set of PERSIAN_PERDT."
UD_210_PERSIAN_PERDT_TEST = _UD_210_HOME + "UD_Persian-PerDT/fa_perdt-ud-test.conllu"
"UD_210 test set of PERSIAN_PERDT."
UD_210_PERSIAN_SERAJI_TRAIN = _UD_210_HOME + "UD_Persian-Seraji/fa_seraji-ud-train.conllu"
"UD_210 train set of PERSIAN_SERAJI."
UD_210_PERSIAN_SERAJI_DEV = _UD_210_HOME + "UD_Persian-Seraji/fa_seraji-ud-dev.conllu"
"UD_210 dev set of PERSIAN_SERAJI."
UD_210_PERSIAN_SERAJI_TEST = _UD_210_HOME + "UD_Persian-Seraji/fa_seraji-ud-test.conllu"
"UD_210 test set of PERSIAN_SERAJI."
UD_210_POLISH_LFG_TRAIN = _UD_210_HOME + "UD_Polish-LFG/pl_lfg-ud-train.conllu"
"UD_210 train set of POLISH_LFG."
UD_210_POLISH_LFG_DEV = _UD_210_HOME + "UD_Polish-LFG/pl_lfg-ud-dev.conllu"
"UD_210 dev set of POLISH_LFG."
UD_210_POLISH_LFG_TEST = _UD_210_HOME + "UD_Polish-LFG/pl_lfg-ud-test.conllu"
"UD_210 test set of POLISH_LFG."
UD_210_POLISH_PDB_TRAIN = _UD_210_HOME + "UD_Polish-PDB/pl_pdb-ud-train.conllu"
"UD_210 train set of POLISH_PDB."
UD_210_POLISH_PDB_DEV = _UD_210_HOME + "UD_Polish-PDB/pl_pdb-ud-dev.conllu"
"UD_210 dev set of POLISH_PDB."
UD_210_POLISH_PDB_TEST = _UD_210_HOME + "UD_Polish-PDB/pl_pdb-ud-test.conllu"
"UD_210 test set of POLISH_PDB."
UD_210_POLISH_PUD_TEST = _UD_210_HOME + "UD_Polish-PUD/pl_pud-ud-test.conllu"
"UD_210 test set of POLISH_PUD."
UD_210_POMAK_PHILOTIS_TRAIN = _UD_210_HOME + "UD_Pomak-Philotis/qpm_philotis-ud-train.conllu"
"UD_210 train set of POMAK_PHILOTIS."
UD_210_POMAK_PHILOTIS_DEV = _UD_210_HOME + "UD_Pomak-Philotis/qpm_philotis-ud-dev.conllu"
"UD_210 dev set of POMAK_PHILOTIS."
UD_210_POMAK_PHILOTIS_TEST = _UD_210_HOME + "UD_Pomak-Philotis/qpm_philotis-ud-test.conllu"
"UD_210 test set of POMAK_PHILOTIS."
UD_210_PORTUGUESE_BOSQUE_TRAIN = _UD_210_HOME + "UD_Portuguese-Bosque/pt_bosque-ud-train.conllu"
"UD_210 train set of PORTUGUESE_BOSQUE."
UD_210_PORTUGUESE_BOSQUE_DEV = _UD_210_HOME + "UD_Portuguese-Bosque/pt_bosque-ud-dev.conllu"
"UD_210 dev set of PORTUGUESE_BOSQUE."
UD_210_PORTUGUESE_BOSQUE_TEST = _UD_210_HOME + "UD_Portuguese-Bosque/pt_bosque-ud-test.conllu"
"UD_210 test set of PORTUGUESE_BOSQUE."
UD_210_PORTUGUESE_GSD_TRAIN = _UD_210_HOME + "UD_Portuguese-GSD/pt_gsd-ud-train.conllu"
"UD_210 train set of PORTUGUESE_GSD."
UD_210_PORTUGUESE_GSD_DEV = _UD_210_HOME + "UD_Portuguese-GSD/pt_gsd-ud-dev.conllu"
"UD_210 dev set of PORTUGUESE_GSD."
UD_210_PORTUGUESE_GSD_TEST = _UD_210_HOME + "UD_Portuguese-GSD/pt_gsd-ud-test.conllu"
"UD_210 test set of PORTUGUESE_GSD."
UD_210_PORTUGUESE_PUD_TEST = _UD_210_HOME + "UD_Portuguese-PUD/pt_pud-ud-test.conllu"
"UD_210 test set of PORTUGUESE_PUD."
UD_210_ROMANIAN_ART_TEST = _UD_210_HOME + "UD_Romanian-ArT/ro_art-ud-test.conllu"
"UD_210 test set of ROMANIAN_ART."
UD_210_ROMANIAN_NONSTANDARD_TRAIN = _UD_210_HOME + "UD_Romanian-Nonstandard/ro_nonstandard-ud-train.conllu"
"UD_210 train set of ROMANIAN_NONSTANDARD."
UD_210_ROMANIAN_NONSTANDARD_DEV = _UD_210_HOME + "UD_Romanian-Nonstandard/ro_nonstandard-ud-dev.conllu"
"UD_210 dev set of ROMANIAN_NONSTANDARD."
UD_210_ROMANIAN_NONSTANDARD_TEST = _UD_210_HOME + "UD_Romanian-Nonstandard/ro_nonstandard-ud-test.conllu"
"UD_210 test set of ROMANIAN_NONSTANDARD."
UD_210_ROMANIAN_RRT_TRAIN = _UD_210_HOME + "UD_Romanian-RRT/ro_rrt-ud-train.conllu"
"UD_210 train set of ROMANIAN_RRT."
UD_210_ROMANIAN_RRT_DEV = _UD_210_HOME + "UD_Romanian-RRT/ro_rrt-ud-dev.conllu"
"UD_210 dev set of ROMANIAN_RRT."
UD_210_ROMANIAN_RRT_TEST = _UD_210_HOME + "UD_Romanian-RRT/ro_rrt-ud-test.conllu"
"UD_210 test set of ROMANIAN_RRT."
UD_210_ROMANIAN_SIMONERO_TRAIN = _UD_210_HOME + "UD_Romanian-SiMoNERo/ro_simonero-ud-train.conllu"
"UD_210 train set of ROMANIAN_SIMONERO."
UD_210_ROMANIAN_SIMONERO_DEV = _UD_210_HOME + "UD_Romanian-SiMoNERo/ro_simonero-ud-dev.conllu"
"UD_210 dev set of ROMANIAN_SIMONERO."
UD_210_ROMANIAN_SIMONERO_TEST = _UD_210_HOME + "UD_Romanian-SiMoNERo/ro_simonero-ud-test.conllu"
"UD_210 test set of ROMANIAN_SIMONERO."
UD_210_RUSSIAN_GSD_TRAIN = _UD_210_HOME + "UD_Russian-GSD/ru_gsd-ud-train.conllu"
"UD_210 train set of RUSSIAN_GSD."
UD_210_RUSSIAN_GSD_DEV = _UD_210_HOME + "UD_Russian-GSD/ru_gsd-ud-dev.conllu"
"UD_210 dev set of RUSSIAN_GSD."
UD_210_RUSSIAN_GSD_TEST = _UD_210_HOME + "UD_Russian-GSD/ru_gsd-ud-test.conllu"
"UD_210 test set of RUSSIAN_GSD."
UD_210_RUSSIAN_PUD_TEST = _UD_210_HOME + "UD_Russian-PUD/ru_pud-ud-test.conllu"
"UD_210 test set of RUSSIAN_PUD."
UD_210_RUSSIAN_SYNTAGRUS_TRAIN = _UD_210_HOME + "UD_Russian-SynTagRus/ru_syntagrus-ud-train.conllu"
"UD_210 train set of RUSSIAN_SYNTAGRUS."
UD_210_RUSSIAN_SYNTAGRUS_DEV = _UD_210_HOME + "UD_Russian-SynTagRus/ru_syntagrus-ud-dev.conllu"
"UD_210 dev set of RUSSIAN_SYNTAGRUS."
UD_210_RUSSIAN_SYNTAGRUS_TEST = _UD_210_HOME + "UD_Russian-SynTagRus/ru_syntagrus-ud-test.conllu"
"UD_210 test set of RUSSIAN_SYNTAGRUS."
UD_210_RUSSIAN_TAIGA_TRAIN = _UD_210_HOME + "UD_Russian-Taiga/ru_taiga-ud-train.conllu"
"UD_210 train set of RUSSIAN_TAIGA."
UD_210_RUSSIAN_TAIGA_DEV = _UD_210_HOME + "UD_Russian-Taiga/ru_taiga-ud-dev.conllu"
"UD_210 dev set of RUSSIAN_TAIGA."
UD_210_RUSSIAN_TAIGA_TEST = _UD_210_HOME + "UD_Russian-Taiga/ru_taiga-ud-test.conllu"
"UD_210 test set of RUSSIAN_TAIGA."
UD_210_SANSKRIT_UFAL_TEST = _UD_210_HOME + "UD_Sanskrit-UFAL/sa_ufal-ud-test.conllu"
"UD_210 test set of SANSKRIT_UFAL."
UD_210_SANSKRIT_VEDIC_TRAIN = _UD_210_HOME + "UD_Sanskrit-Vedic/sa_vedic-ud-train.conllu"
"UD_210 train set of SANSKRIT_VEDIC."
UD_210_SANSKRIT_VEDIC_TEST = _UD_210_HOME + "UD_Sanskrit-Vedic/sa_vedic-ud-test.conllu"
"UD_210 test set of SANSKRIT_VEDIC."
UD_210_SCOTTISH_GAELIC_ARCOSG_TRAIN = _UD_210_HOME + "UD_Scottish_Gaelic-ARCOSG/gd_arcosg-ud-train.conllu"
"UD_210 train set of SCOTTISH_GAELIC_ARCOSG."
UD_210_SCOTTISH_GAELIC_ARCOSG_DEV = _UD_210_HOME + "UD_Scottish_Gaelic-ARCOSG/gd_arcosg-ud-dev.conllu"
"UD_210 dev set of SCOTTISH_GAELIC_ARCOSG."
UD_210_SCOTTISH_GAELIC_ARCOSG_TEST = _UD_210_HOME + "UD_Scottish_Gaelic-ARCOSG/gd_arcosg-ud-test.conllu"
"UD_210 test set of SCOTTISH_GAELIC_ARCOSG."
UD_210_SERBIAN_SET_TRAIN = _UD_210_HOME + "UD_Serbian-SET/sr_set-ud-train.conllu"
"UD_210 train set of SERBIAN_SET."
UD_210_SERBIAN_SET_DEV = _UD_210_HOME + "UD_Serbian-SET/sr_set-ud-dev.conllu"
"UD_210 dev set of SERBIAN_SET."
UD_210_SERBIAN_SET_TEST = _UD_210_HOME + "UD_Serbian-SET/sr_set-ud-test.conllu"
"UD_210 test set of SERBIAN_SET."
UD_210_SKOLT_SAMI_GIELLAGAS_TEST = _UD_210_HOME + "UD_Skolt_Sami-Giellagas/sms_giellagas-ud-test.conllu"
"UD_210 test set of SKOLT_SAMI_GIELLAGAS."
UD_210_SLOVAK_SNK_TRAIN = _UD_210_HOME + "UD_Slovak-SNK/sk_snk-ud-train.conllu"
"UD_210 train set of SLOVAK_SNK."
UD_210_SLOVAK_SNK_DEV = _UD_210_HOME + "UD_Slovak-SNK/sk_snk-ud-dev.conllu"
"UD_210 dev set of SLOVAK_SNK."
UD_210_SLOVAK_SNK_TEST = _UD_210_HOME + "UD_Slovak-SNK/sk_snk-ud-test.conllu"
"UD_210 test set of SLOVAK_SNK."
UD_210_SLOVENIAN_SSJ_TRAIN = _UD_210_HOME + "UD_Slovenian-SSJ/sl_ssj-ud-train.conllu"
"UD_210 train set of SLOVENIAN_SSJ."
UD_210_SLOVENIAN_SSJ_DEV = _UD_210_HOME + "UD_Slovenian-SSJ/sl_ssj-ud-dev.conllu"
"UD_210 dev set of SLOVENIAN_SSJ."
UD_210_SLOVENIAN_SSJ_TEST = _UD_210_HOME + "UD_Slovenian-SSJ/sl_ssj-ud-test.conllu"
"UD_210 test set of SLOVENIAN_SSJ."
UD_210_SLOVENIAN_SST_TRAIN = _UD_210_HOME + "UD_Slovenian-SST/sl_sst-ud-train.conllu"
"UD_210 train set of SLOVENIAN_SST."
UD_210_SLOVENIAN_SST_TEST = _UD_210_HOME + "UD_Slovenian-SST/sl_sst-ud-test.conllu"
"UD_210 test set of SLOVENIAN_SST."
UD_210_SOI_AHA_TEST = _UD_210_HOME + "UD_Soi-AHA/soj_aha-ud-test.conllu"
"UD_210 test set of SOI_AHA."
UD_210_SOUTH_LEVANTINE_ARABIC_MADAR_TEST = _UD_210_HOME + "UD_South_Levantine_Arabic-MADAR/ajp_madar-ud-test.conllu"
"UD_210 test set of SOUTH_LEVANTINE_ARABIC_MADAR."
UD_210_SPANISH_ANCORA_TRAIN = _UD_210_HOME + "UD_Spanish-AnCora/es_ancora-ud-train.conllu"
"UD_210 train set of SPANISH_ANCORA."
UD_210_SPANISH_ANCORA_DEV = _UD_210_HOME + "UD_Spanish-AnCora/es_ancora-ud-dev.conllu"
"UD_210 dev set of SPANISH_ANCORA."
UD_210_SPANISH_ANCORA_TEST = _UD_210_HOME + "UD_Spanish-AnCora/es_ancora-ud-test.conllu"
"UD_210 test set of SPANISH_ANCORA."
UD_210_SPANISH_GSD_TRAIN = _UD_210_HOME + "UD_Spanish-GSD/es_gsd-ud-train.conllu"
"UD_210 train set of SPANISH_GSD."
UD_210_SPANISH_GSD_DEV = _UD_210_HOME + "UD_Spanish-GSD/es_gsd-ud-dev.conllu"
"UD_210 dev set of SPANISH_GSD."
UD_210_SPANISH_GSD_TEST = _UD_210_HOME + "UD_Spanish-GSD/es_gsd-ud-test.conllu"
"UD_210 test set of SPANISH_GSD."
UD_210_SPANISH_PUD_TEST = _UD_210_HOME + "UD_Spanish-PUD/es_pud-ud-test.conllu"
"UD_210 test set of SPANISH_PUD."
UD_210_SWEDISH_LINES_TRAIN = _UD_210_HOME + "UD_Swedish-LinES/sv_lines-ud-train.conllu"
"UD_210 train set of SWEDISH_LINES."
UD_210_SWEDISH_LINES_DEV = _UD_210_HOME + "UD_Swedish-LinES/sv_lines-ud-dev.conllu"
"UD_210 dev set of SWEDISH_LINES."
UD_210_SWEDISH_LINES_TEST = _UD_210_HOME + "UD_Swedish-LinES/sv_lines-ud-test.conllu"
"UD_210 test set of SWEDISH_LINES."
UD_210_SWEDISH_PUD_TEST = _UD_210_HOME + "UD_Swedish-PUD/sv_pud-ud-test.conllu"
"UD_210 test set of SWEDISH_PUD."
UD_210_SWEDISH_TALBANKEN_TRAIN = _UD_210_HOME + "UD_Swedish-Talbanken/sv_talbanken-ud-train.conllu"
"UD_210 train set of SWEDISH_TALBANKEN."
UD_210_SWEDISH_TALBANKEN_DEV = _UD_210_HOME + "UD_Swedish-Talbanken/sv_talbanken-ud-dev.conllu"
"UD_210 dev set of SWEDISH_TALBANKEN."
UD_210_SWEDISH_TALBANKEN_TEST = _UD_210_HOME + "UD_Swedish-Talbanken/sv_talbanken-ud-test.conllu"
"UD_210 test set of SWEDISH_TALBANKEN."
UD_210_SWEDISH_SIGN_LANGUAGE_SSLC_TRAIN = _UD_210_HOME + "UD_Swedish_Sign_Language-SSLC/swl_sslc-ud-train.conllu"
"UD_210 train set of SWEDISH_SIGN_LANGUAGE_SSLC."
UD_210_SWEDISH_SIGN_LANGUAGE_SSLC_DEV = _UD_210_HOME + "UD_Swedish_Sign_Language-SSLC/swl_sslc-ud-dev.conllu"
"UD_210 dev set of SWEDISH_SIGN_LANGUAGE_SSLC."
UD_210_SWEDISH_SIGN_LANGUAGE_SSLC_TEST = _UD_210_HOME + "UD_Swedish_Sign_Language-SSLC/swl_sslc-ud-test.conllu"
"UD_210 test set of SWEDISH_SIGN_LANGUAGE_SSLC."
UD_210_SWISS_GERMAN_UZH_TEST = _UD_210_HOME + "UD_Swiss_German-UZH/gsw_uzh-ud-test.conllu"
"UD_210 test set of SWISS_GERMAN_UZH."
UD_210_TAGALOG_TRG_TEST = _UD_210_HOME + "UD_Tagalog-TRG/tl_trg-ud-test.conllu"
"UD_210 test set of TAGALOG_TRG."
UD_210_TAGALOG_UGNAYAN_TEST = _UD_210_HOME + "UD_Tagalog-Ugnayan/tl_ugnayan-ud-test.conllu"
"UD_210 test set of TAGALOG_UGNAYAN."
UD_210_TAMIL_MWTT_TEST = _UD_210_HOME + "UD_Tamil-MWTT/ta_mwtt-ud-test.conllu"
"UD_210 test set of TAMIL_MWTT."
UD_210_TAMIL_TTB_TRAIN = _UD_210_HOME + "UD_Tamil-TTB/ta_ttb-ud-train.conllu"
"UD_210 train set of TAMIL_TTB."
UD_210_TAMIL_TTB_DEV = _UD_210_HOME + "UD_Tamil-TTB/ta_ttb-ud-dev.conllu"
"UD_210 dev set of TAMIL_TTB."
UD_210_TAMIL_TTB_TEST = _UD_210_HOME + "UD_Tamil-TTB/ta_ttb-ud-test.conllu"
"UD_210 test set of TAMIL_TTB."
UD_210_TATAR_NMCTT_TEST = _UD_210_HOME + "UD_Tatar-NMCTT/tt_nmctt-ud-test.conllu"
"UD_210 test set of TATAR_NMCTT."
UD_210_TEKO_TUDET_TEST = _UD_210_HOME + "UD_Teko-TuDeT/eme_tudet-ud-test.conllu"
"UD_210 test set of TEKO_TUDET."
UD_210_TELUGU_MTG_TRAIN = _UD_210_HOME + "UD_Telugu-MTG/te_mtg-ud-train.conllu"
"UD_210 train set of TELUGU_MTG."
UD_210_TELUGU_MTG_DEV = _UD_210_HOME + "UD_Telugu-MTG/te_mtg-ud-dev.conllu"
"UD_210 dev set of TELUGU_MTG."
UD_210_TELUGU_MTG_TEST = _UD_210_HOME + "UD_Telugu-MTG/te_mtg-ud-test.conllu"
"UD_210 test set of TELUGU_MTG."
UD_210_THAI_PUD_TEST = _UD_210_HOME + "UD_Thai-PUD/th_pud-ud-test.conllu"
"UD_210 test set of THAI_PUD."
UD_210_TUPINAMBA_TUDET_TEST = _UD_210_HOME + "UD_Tupinamba-TuDeT/tpn_tudet-ud-test.conllu"
"UD_210 test set of TUPINAMBA_TUDET."
UD_210_TURKISH_ATIS_TRAIN = _UD_210_HOME + "UD_Turkish-Atis/tr_atis-ud-train.conllu"
"UD_210 train set of TURKISH_ATIS."
UD_210_TURKISH_ATIS_DEV = _UD_210_HOME + "UD_Turkish-Atis/tr_atis-ud-dev.conllu"
"UD_210 dev set of TURKISH_ATIS."
UD_210_TURKISH_ATIS_TEST = _UD_210_HOME + "UD_Turkish-Atis/tr_atis-ud-test.conllu"
"UD_210 test set of TURKISH_ATIS."
UD_210_TURKISH_BOUN_TRAIN = _UD_210_HOME + "UD_Turkish-BOUN/tr_boun-ud-train.conllu"
"UD_210 train set of TURKISH_BOUN."
UD_210_TURKISH_BOUN_DEV = _UD_210_HOME + "UD_Turkish-BOUN/tr_boun-ud-dev.conllu"
"UD_210 dev set of TURKISH_BOUN."
UD_210_TURKISH_BOUN_TEST = _UD_210_HOME + "UD_Turkish-BOUN/tr_boun-ud-test.conllu"
"UD_210 test set of TURKISH_BOUN."
UD_210_TURKISH_FRAMENET_TRAIN = _UD_210_HOME + "UD_Turkish-FrameNet/tr_framenet-ud-train.conllu"
"UD_210 train set of TURKISH_FRAMENET."
UD_210_TURKISH_FRAMENET_DEV = _UD_210_HOME + "UD_Turkish-FrameNet/tr_framenet-ud-dev.conllu"
"UD_210 dev set of TURKISH_FRAMENET."
UD_210_TURKISH_FRAMENET_TEST = _UD_210_HOME + "UD_Turkish-FrameNet/tr_framenet-ud-test.conllu"
"UD_210 test set of TURKISH_FRAMENET."
UD_210_TURKISH_GB_TEST = _UD_210_HOME + "UD_Turkish-GB/tr_gb-ud-test.conllu"
"UD_210 test set of TURKISH_GB."
UD_210_TURKISH_IMST_TRAIN = _UD_210_HOME + "UD_Turkish-IMST/tr_imst-ud-train.conllu"
"UD_210 train set of TURKISH_IMST."
UD_210_TURKISH_IMST_DEV = _UD_210_HOME + "UD_Turkish-IMST/tr_imst-ud-dev.conllu"
"UD_210 dev set of TURKISH_IMST."
UD_210_TURKISH_IMST_TEST = _UD_210_HOME + "UD_Turkish-IMST/tr_imst-ud-test.conllu"
"UD_210 test set of TURKISH_IMST."
UD_210_TURKISH_KENET_TRAIN = _UD_210_HOME + "UD_Turkish-Kenet/tr_kenet-ud-train.conllu"
"UD_210 train set of TURKISH_KENET."
UD_210_TURKISH_KENET_DEV = _UD_210_HOME + "UD_Turkish-Kenet/tr_kenet-ud-dev.conllu"
"UD_210 dev set of TURKISH_KENET."
UD_210_TURKISH_KENET_TEST = _UD_210_HOME + "UD_Turkish-Kenet/tr_kenet-ud-test.conllu"
"UD_210 test set of TURKISH_KENET."
UD_210_TURKISH_PUD_TEST = _UD_210_HOME + "UD_Turkish-PUD/tr_pud-ud-test.conllu"
"UD_210 test set of TURKISH_PUD."
UD_210_TURKISH_PENN_TRAIN = _UD_210_HOME + "UD_Turkish-Penn/tr_penn-ud-train.conllu"
"UD_210 train set of TURKISH_PENN."
UD_210_TURKISH_PENN_DEV = _UD_210_HOME + "UD_Turkish-Penn/tr_penn-ud-dev.conllu"
"UD_210 dev set of TURKISH_PENN."
UD_210_TURKISH_PENN_TEST = _UD_210_HOME + "UD_Turkish-Penn/tr_penn-ud-test.conllu"
"UD_210 test set of TURKISH_PENN."
UD_210_TURKISH_TOURISM_TRAIN = _UD_210_HOME + "UD_Turkish-Tourism/tr_tourism-ud-train.conllu"
"UD_210 train set of TURKISH_TOURISM."
UD_210_TURKISH_TOURISM_DEV = _UD_210_HOME + "UD_Turkish-Tourism/tr_tourism-ud-dev.conllu"
"UD_210 dev set of TURKISH_TOURISM."
UD_210_TURKISH_TOURISM_TEST = _UD_210_HOME + "UD_Turkish-Tourism/tr_tourism-ud-test.conllu"
"UD_210 test set of TURKISH_TOURISM."
UD_210_TURKISH_GERMAN_SAGT_TRAIN = _UD_210_HOME + "UD_Turkish_German-SAGT/qtd_sagt-ud-train.conllu"
"UD_210 train set of TURKISH_GERMAN_SAGT."
UD_210_TURKISH_GERMAN_SAGT_DEV = _UD_210_HOME + "UD_Turkish_German-SAGT/qtd_sagt-ud-dev.conllu"
"UD_210 dev set of TURKISH_GERMAN_SAGT."
UD_210_TURKISH_GERMAN_SAGT_TEST = _UD_210_HOME + "UD_Turkish_German-SAGT/qtd_sagt-ud-test.conllu"
"UD_210 test set of TURKISH_GERMAN_SAGT."
UD_210_UKRAINIAN_IU_TRAIN = _UD_210_HOME + "UD_Ukrainian-IU/uk_iu-ud-train.conllu"
"UD_210 train set of UKRAINIAN_IU."
UD_210_UKRAINIAN_IU_DEV = _UD_210_HOME + "UD_Ukrainian-IU/uk_iu-ud-dev.conllu"
"UD_210 dev set of UKRAINIAN_IU."
UD_210_UKRAINIAN_IU_TEST = _UD_210_HOME + "UD_Ukrainian-IU/uk_iu-ud-test.conllu"
"UD_210 test set of UKRAINIAN_IU."
UD_210_UMBRIAN_IKUVINA_TEST = _UD_210_HOME + "UD_Umbrian-IKUVINA/xum_ikuvina-ud-test.conllu"
"UD_210 test set of UMBRIAN_IKUVINA."
UD_210_UPPER_SORBIAN_UFAL_TRAIN = _UD_210_HOME + "UD_Upper_Sorbian-UFAL/hsb_ufal-ud-train.conllu"
"UD_210 train set of UPPER_SORBIAN_UFAL."
UD_210_UPPER_SORBIAN_UFAL_TEST = _UD_210_HOME + "UD_Upper_Sorbian-UFAL/hsb_ufal-ud-test.conllu"
"UD_210 test set of UPPER_SORBIAN_UFAL."
UD_210_URDU_UDTB_TRAIN = _UD_210_HOME + "UD_Urdu-UDTB/ur_udtb-ud-train.conllu"
"UD_210 train set of URDU_UDTB."
UD_210_URDU_UDTB_DEV = _UD_210_HOME + "UD_Urdu-UDTB/ur_udtb-ud-dev.conllu"
"UD_210 dev set of URDU_UDTB."
UD_210_URDU_UDTB_TEST = _UD_210_HOME + "UD_Urdu-UDTB/ur_udtb-ud-test.conllu"
"UD_210 test set of URDU_UDTB."
UD_210_UYGHUR_UDT_TRAIN = _UD_210_HOME + "UD_Uyghur-UDT/ug_udt-ud-train.conllu"
"UD_210 train set of UYGHUR_UDT."
UD_210_UYGHUR_UDT_DEV = _UD_210_HOME + "UD_Uyghur-UDT/ug_udt-ud-dev.conllu"
"UD_210 dev set of UYGHUR_UDT."
UD_210_UYGHUR_UDT_TEST = _UD_210_HOME + "UD_Uyghur-UDT/ug_udt-ud-test.conllu"
"UD_210 test set of UYGHUR_UDT."
UD_210_VIETNAMESE_VTB_TRAIN = _UD_210_HOME + "UD_Vietnamese-VTB/vi_vtb-ud-train.conllu"
"UD_210 train set of VIETNAMESE_VTB."
UD_210_VIETNAMESE_VTB_DEV = _UD_210_HOME + "UD_Vietnamese-VTB/vi_vtb-ud-dev.conllu"
"UD_210 dev set of VIETNAMESE_VTB."
UD_210_VIETNAMESE_VTB_TEST = _UD_210_HOME + "UD_Vietnamese-VTB/vi_vtb-ud-test.conllu"
"UD_210 test set of VIETNAMESE_VTB."
UD_210_WARLPIRI_UFAL_TEST = _UD_210_HOME + "UD_Warlpiri-UFAL/wbp_ufal-ud-test.conllu"
"UD_210 test set of WARLPIRI_UFAL."
UD_210_WELSH_CCG_TRAIN = _UD_210_HOME + "UD_Welsh-CCG/cy_ccg-ud-train.conllu"
"UD_210 train set of WELSH_CCG."
UD_210_WELSH_CCG_DEV = _UD_210_HOME + "UD_Welsh-CCG/cy_ccg-ud-dev.conllu"
"UD_210 dev set of WELSH_CCG."
UD_210_WELSH_CCG_TEST = _UD_210_HOME + "UD_Welsh-CCG/cy_ccg-ud-test.conllu"
"UD_210 test set of WELSH_CCG."
UD_210_WESTERN_ARMENIAN_ARMTDP_TRAIN = _UD_210_HOME + "UD_Western_Armenian-ArmTDP/hyw_armtdp-ud-train.conllu"
"UD_210 train set of WESTERN_ARMENIAN_ARMTDP."
UD_210_WESTERN_ARMENIAN_ARMTDP_DEV = _UD_210_HOME + "UD_Western_Armenian-ArmTDP/hyw_armtdp-ud-dev.conllu"
"UD_210 dev set of WESTERN_ARMENIAN_ARMTDP."
UD_210_WESTERN_ARMENIAN_ARMTDP_TEST = _UD_210_HOME + "UD_Western_Armenian-ArmTDP/hyw_armtdp-ud-test.conllu"
"UD_210 test set of WESTERN_ARMENIAN_ARMTDP."
UD_210_WOLOF_WTB_TRAIN = _UD_210_HOME + "UD_Wolof-WTB/wo_wtb-ud-train.conllu"
"UD_210 train set of WOLOF_WTB."
UD_210_WOLOF_WTB_DEV = _UD_210_HOME + "UD_Wolof-WTB/wo_wtb-ud-dev.conllu"
"UD_210 dev set of WOLOF_WTB."
UD_210_WOLOF_WTB_TEST = _UD_210_HOME + "UD_Wolof-WTB/wo_wtb-ud-test.conllu"
"UD_210 test set of WOLOF_WTB."
UD_210_XIBE_XDT_TEST = _UD_210_HOME + "UD_Xibe-XDT/sjo_xdt-ud-test.conllu"
"UD_210 test set of XIBE_XDT."
UD_210_YAKUT_YKTDT_TEST = _UD_210_HOME + "UD_Yakut-YKTDT/sah_yktdt-ud-test.conllu"
"UD_210 test set of YAKUT_YKTDT."
UD_210_YORUBA_YTB_TEST = _UD_210_HOME + "UD_Yoruba-YTB/yo_ytb-ud-test.conllu"
"UD_210 test set of YORUBA_YTB."
UD_210_YUPIK_SLI_TEST = _UD_210_HOME + "UD_Yupik-SLI/ess_sli-ud-test.conllu"
"UD_210 test set of YUPIK_SLI."
================================================
FILE: hanlp/datasets/parsing/ud/ud210m.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-05-21 20:39
import os
from hanlp.datasets.parsing.ud import concat_treebanks
from hanlp.datasets.parsing.ud.ud210 import _UD_210_HOME
_UD_210_MULTILINGUAL_HOME = concat_treebanks(_UD_210_HOME, '2.10')
UD_210_MULTILINGUAL_TRAIN = os.path.join(_UD_210_MULTILINGUAL_HOME, 'train.conllu')
"Training set of multilingual UD_210 obtained by concatenating all training sets."
UD_210_MULTILINGUAL_DEV = os.path.join(_UD_210_MULTILINGUAL_HOME, 'dev.conllu')
"Dev set of multilingual UD_210 obtained by concatenating all dev sets."
UD_210_MULTILINGUAL_TEST = os.path.join(_UD_210_MULTILINGUAL_HOME, 'test.conllu')
"Test set of multilingual UD_210 obtained by concatenating all test sets."
================================================
FILE: hanlp/datasets/parsing/ud/ud23.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-05-21 20:26
_UD_23_HOME = "https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-2895/ud-treebanks-v2.3.tgz?sequence=1&isAllowed=y"
_UD_24_HOME = "https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-2988/ud-treebanks-v2.4.tgz?sequence=4&isAllowed=y"
def _list_dir(path, home):
    # Regenerates the UD_23_* constants below: scan every UD_* treebank folder
    # and append one '<URL>#<treebank>/<split file>' assignment per split.
    prefix = home.lstrip('_').replace('_HOME', '')
    from hanlp.utils.io_util import get_resource
    import glob
    import os
    path = get_resource(path)
    with open('ud23.py', 'a') as out:
        for f in sorted(glob.glob(path + '/UD_*')):
            basename = os.path.basename(f)
            name = basename[len('UD_'):]
            name = name.upper().replace('-', '_')
            for split in 'train', 'dev', 'test':
                sp = glob.glob(f + f'/*{split}.conllu')
                if not sp:
                    continue
                sp = os.path.basename(sp[0])
                out.write(f'{prefix}_{name}_{split.upper()} = {home} + "#{basename}/{sp}"\n')


def main():
    _list_dir(_UD_23_HOME, '_UD_23_HOME')


if __name__ == '__main__':
    main()
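
# Note on the generated constants below (added comment, not original code):
# _list_dir emits '<archive URL>#<treebank dir>/<split file>' strings, so the
# download link and the in-archive member path can be recovered by splitting
# on the first '#', e.g. for UD_23_CHINESE_GSD_TRAIN the member path is
# "UD_Chinese-GSD/zh_gsd-ud-train.conllu".
def _split_resource(resource):
    url, member = resource.split('#', 1)
    return url, member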
UD_23_AFRIKAANS_AFRIBOOMS_TRAIN = _UD_23_HOME + "#UD_Afrikaans-AfriBooms/af_afribooms-ud-train.conllu"
UD_23_AFRIKAANS_AFRIBOOMS_DEV = _UD_23_HOME + "#UD_Afrikaans-AfriBooms/af_afribooms-ud-dev.conllu"
UD_23_AFRIKAANS_AFRIBOOMS_TEST = _UD_23_HOME + "#UD_Afrikaans-AfriBooms/af_afribooms-ud-test.conllu"
UD_23_AKKADIAN_PISANDUB_TEST = _UD_23_HOME + "#UD_Akkadian-PISANDUB/akk_pisandub-ud-test.conllu"
UD_23_AMHARIC_ATT_TEST = _UD_23_HOME + "#UD_Amharic-ATT/am_att-ud-test.conllu"
UD_23_ANCIENT_GREEK_PROIEL_TRAIN = _UD_23_HOME + "#UD_Ancient_Greek-PROIEL/grc_proiel-ud-train.conllu"
UD_23_ANCIENT_GREEK_PROIEL_DEV = _UD_23_HOME + "#UD_Ancient_Greek-PROIEL/grc_proiel-ud-dev.conllu"
UD_23_ANCIENT_GREEK_PROIEL_TEST = _UD_23_HOME + "#UD_Ancient_Greek-PROIEL/grc_proiel-ud-test.conllu"
UD_23_ANCIENT_GREEK_PERSEUS_TRAIN = _UD_23_HOME + "#UD_Ancient_Greek-Perseus/grc_perseus-ud-train.conllu"
UD_23_ANCIENT_GREEK_PERSEUS_DEV = _UD_23_HOME + "#UD_Ancient_Greek-Perseus/grc_perseus-ud-dev.conllu"
UD_23_ANCIENT_GREEK_PERSEUS_TEST = _UD_23_HOME + "#UD_Ancient_Greek-Perseus/grc_perseus-ud-test.conllu"
UD_23_ARABIC_NYUAD_TRAIN = _UD_23_HOME + "#UD_Arabic-NYUAD/ar_nyuad-ud-train.conllu"
UD_23_ARABIC_NYUAD_DEV = _UD_23_HOME + "#UD_Arabic-NYUAD/ar_nyuad-ud-dev.conllu"
UD_23_ARABIC_NYUAD_TEST = _UD_23_HOME + "#UD_Arabic-NYUAD/ar_nyuad-ud-test.conllu"
UD_23_ARABIC_PADT_TRAIN = _UD_23_HOME + "#UD_Arabic-PADT/ar_padt-ud-train.conllu"
UD_23_ARABIC_PADT_DEV = _UD_23_HOME + "#UD_Arabic-PADT/ar_padt-ud-dev.conllu"
UD_23_ARABIC_PADT_TEST = _UD_23_HOME + "#UD_Arabic-PADT/ar_padt-ud-test.conllu"
UD_23_ARABIC_PUD_TEST = _UD_23_HOME + "#UD_Arabic-PUD/ar_pud-ud-test.conllu"
UD_23_ARMENIAN_ARMTDP_TRAIN = _UD_23_HOME + "#UD_Armenian-ArmTDP/hy_armtdp-ud-train.conllu"
UD_23_ARMENIAN_ARMTDP_TEST = _UD_23_HOME + "#UD_Armenian-ArmTDP/hy_armtdp-ud-test.conllu"
UD_23_BAMBARA_CRB_TEST = _UD_23_HOME + "#UD_Bambara-CRB/bm_crb-ud-test.conllu"
UD_23_BASQUE_BDT_TRAIN = _UD_23_HOME + "#UD_Basque-BDT/eu_bdt-ud-train.conllu"
UD_23_BASQUE_BDT_DEV = _UD_23_HOME + "#UD_Basque-BDT/eu_bdt-ud-dev.conllu"
UD_23_BASQUE_BDT_TEST = _UD_23_HOME + "#UD_Basque-BDT/eu_bdt-ud-test.conllu"
UD_23_BELARUSIAN_HSE_TRAIN = _UD_23_HOME + "#UD_Belarusian-HSE/be_hse-ud-train.conllu"
UD_23_BELARUSIAN_HSE_DEV = _UD_23_HOME + "#UD_Belarusian-HSE/be_hse-ud-dev.conllu"
UD_23_BELARUSIAN_HSE_TEST = _UD_23_HOME + "#UD_Belarusian-HSE/be_hse-ud-test.conllu"
UD_23_BRETON_KEB_TEST = _UD_23_HOME + "#UD_Breton-KEB/br_keb-ud-test.conllu"
UD_23_BULGARIAN_BTB_TRAIN = _UD_23_HOME + "#UD_Bulgarian-BTB/bg_btb-ud-train.conllu"
UD_23_BULGARIAN_BTB_DEV = _UD_23_HOME + "#UD_Bulgarian-BTB/bg_btb-ud-dev.conllu"
UD_23_BULGARIAN_BTB_TEST = _UD_23_HOME + "#UD_Bulgarian-BTB/bg_btb-ud-test.conllu"
UD_23_BURYAT_BDT_TRAIN = _UD_23_HOME + "#UD_Buryat-BDT/bxr_bdt-ud-train.conllu"
UD_23_BURYAT_BDT_TEST = _UD_23_HOME + "#UD_Buryat-BDT/bxr_bdt-ud-test.conllu"
UD_23_CANTONESE_HK_TEST = _UD_23_HOME + "#UD_Cantonese-HK/yue_hk-ud-test.conllu"
UD_23_CATALAN_ANCORA_TRAIN = _UD_23_HOME + "#UD_Catalan-AnCora/ca_ancora-ud-train.conllu"
UD_23_CATALAN_ANCORA_DEV = _UD_23_HOME + "#UD_Catalan-AnCora/ca_ancora-ud-dev.conllu"
UD_23_CATALAN_ANCORA_TEST = _UD_23_HOME + "#UD_Catalan-AnCora/ca_ancora-ud-test.conllu"
UD_23_CHINESE_CFL_TEST = _UD_23_HOME + "#UD_Chinese-CFL/zh_cfl-ud-test.conllu"
UD_23_CHINESE_GSD_TRAIN = _UD_23_HOME + "#UD_Chinese-GSD/zh_gsd-ud-train.conllu"
UD_23_CHINESE_GSD_DEV = _UD_23_HOME + "#UD_Chinese-GSD/zh_gsd-ud-dev.conllu"
UD_23_CHINESE_GSD_TEST = _UD_23_HOME + "#UD_Chinese-GSD/zh_gsd-ud-test.conllu"
UD_23_CHINESE_HK_TEST = _UD_23_HOME + "#UD_Chinese-HK/zh_hk-ud-test.conllu"
UD_23_CHINESE_PUD_TEST = _UD_23_HOME + "#UD_Chinese-PUD/zh_pud-ud-test.conllu"
UD_23_COPTIC_SCRIPTORIUM_TRAIN = _UD_23_HOME + "#UD_Coptic-Scriptorium/cop_scriptorium-ud-train.conllu"
UD_23_COPTIC_SCRIPTORIUM_DEV = _UD_23_HOME + "#UD_Coptic-Scriptorium/cop_scriptorium-ud-dev.conllu"
UD_23_COPTIC_SCRIPTORIUM_TEST = _UD_23_HOME + "#UD_Coptic-Scriptorium/cop_scriptorium-ud-test.conllu"
UD_23_CROATIAN_SET_TRAIN = _UD_23_HOME + "#UD_Croatian-SET/hr_set-ud-train.conllu"
UD_23_CROATIAN_SET_DEV = _UD_23_HOME + "#UD_Croatian-SET/hr_set-ud-dev.conllu"
UD_23_CROATIAN_SET_TEST = _UD_23_HOME + "#UD_Croatian-SET/hr_set-ud-test.conllu"
UD_23_CZECH_CAC_TRAIN = _UD_23_HOME + "#UD_Czech-CAC/cs_cac-ud-train.conllu"
UD_23_CZECH_CAC_DEV = _UD_23_HOME + "#UD_Czech-CAC/cs_cac-ud-dev.conllu"
UD_23_CZECH_CAC_TEST = _UD_23_HOME + "#UD_Czech-CAC/cs_cac-ud-test.conllu"
UD_23_CZECH_CLTT_TRAIN = _UD_23_HOME + "#UD_Czech-CLTT/cs_cltt-ud-train.conllu"
UD_23_CZECH_CLTT_DEV = _UD_23_HOME + "#UD_Czech-CLTT/cs_cltt-ud-dev.conllu"
UD_23_CZECH_CLTT_TEST = _UD_23_HOME + "#UD_Czech-CLTT/cs_cltt-ud-test.conllu"
UD_23_CZECH_FICTREE_TRAIN = _UD_23_HOME + "#UD_Czech-FicTree/cs_fictree-ud-train.conllu"
UD_23_CZECH_FICTREE_DEV = _UD_23_HOME + "#UD_Czech-FicTree/cs_fictree-ud-dev.conllu"
UD_23_CZECH_FICTREE_TEST = _UD_23_HOME + "#UD_Czech-FicTree/cs_fictree-ud-test.conllu"
UD_23_CZECH_PDT_TRAIN = _UD_23_HOME + "#UD_Czech-PDT/cs_pdt-ud-train.conllu"
UD_23_CZECH_PDT_DEV = _UD_23_HOME + "#UD_Czech-PDT/cs_pdt-ud-dev.conllu"
UD_23_CZECH_PDT_TEST = _UD_23_HOME + "#UD_Czech-PDT/cs_pdt-ud-test.conllu"
UD_23_CZECH_PUD_TEST = _UD_23_HOME + "#UD_Czech-PUD/cs_pud-ud-test.conllu"
UD_23_DANISH_DDT_TRAIN = _UD_23_HOME + "#UD_Danish-DDT/da_ddt-ud-train.conllu"
UD_23_DANISH_DDT_DEV = _UD_23_HOME + "#UD_Danish-DDT/da_ddt-ud-dev.conllu"
UD_23_DANISH_DDT_TEST = _UD_23_HOME + "#UD_Danish-DDT/da_ddt-ud-test.conllu"
UD_23_DUTCH_ALPINO_TRAIN = _UD_23_HOME + "#UD_Dutch-Alpino/nl_alpino-ud-train.conllu"
UD_23_DUTCH_ALPINO_DEV = _UD_23_HOME + "#UD_Dutch-Alpino/nl_alpino-ud-dev.conllu"
UD_23_DUTCH_ALPINO_TEST = _UD_23_HOME + "#UD_Dutch-Alpino/nl_alpino-ud-test.conllu"
UD_23_DUTCH_LASSYSMALL_TRAIN = _UD_23_HOME + "#UD_Dutch-LassySmall/nl_lassysmall-ud-train.conllu"
UD_23_DUTCH_LASSYSMALL_DEV = _UD_23_HOME + "#UD_Dutch-LassySmall/nl_lassysmall-ud-dev.conllu"
UD_23_DUTCH_LASSYSMALL_TEST = _UD_23_HOME + "#UD_Dutch-LassySmall/nl_lassysmall-ud-test.conllu"
UD_23_ENGLISH_ESL_TRAIN = _UD_23_HOME + "#UD_English-ESL/en_esl-ud-train.conllu"
UD_23_ENGLISH_ESL_DEV = _UD_23_HOME + "#UD_English-ESL/en_esl-ud-dev.conllu"
UD_23_ENGLISH_ESL_TEST = _UD_23_HOME + "#UD_English-ESL/en_esl-ud-test.conllu"
UD_23_ENGLISH_EWT_TRAIN = _UD_23_HOME + "#UD_English-EWT/en_ewt-ud-train.conllu"
UD_23_ENGLISH_EWT_DEV = _UD_23_HOME + "#UD_English-EWT/en_ewt-ud-dev.conllu"
UD_23_ENGLISH_EWT_TEST = _UD_23_HOME + "#UD_English-EWT/en_ewt-ud-test.conllu"
UD_23_ENGLISH_GUM_TRAIN = _UD_23_HOME + "#UD_English-GUM/en_gum-ud-train.conllu"
UD_23_ENGLISH_GUM_DEV = _UD_23_HOME + "#UD_English-GUM/en_gum-ud-dev.conllu"
UD_23_ENGLISH_GUM_TEST = _UD_23_HOME + "#UD_English-GUM/en_gum-ud-test.conllu"
UD_23_ENGLISH_LINES_TRAIN = _UD_23_HOME + "#UD_English-LinES/en_lines-ud-train.conllu"
UD_23_ENGLISH_LINES_DEV = _UD_23_HOME + "#UD_English-LinES/en_lines-ud-dev.conllu"
UD_23_ENGLISH_LINES_TEST = _UD_23_HOME + "#UD_English-LinES/en_lines-ud-test.conllu"
UD_23_ENGLISH_PUD_TEST = _UD_23_HOME + "#UD_English-PUD/en_pud-ud-test.conllu"
UD_23_ENGLISH_PARTUT_TRAIN = _UD_23_HOME + "#UD_English-ParTUT/en_partut-ud-train.conllu"
UD_23_ENGLISH_PARTUT_DEV = _UD_23_HOME + "#UD_English-ParTUT/en_partut-ud-dev.conllu"
UD_23_ENGLISH_PARTUT_TEST = _UD_23_HOME + "#UD_English-ParTUT/en_partut-ud-test.conllu"
UD_23_ERZYA_JR_TEST = _UD_23_HOME + "#UD_Erzya-JR/myv_jr-ud-test.conllu"
UD_23_ESTONIAN_EDT_TRAIN = _UD_23_HOME + "#UD_Estonian-EDT/et_edt-ud-train.conllu"
UD_23_ESTONIAN_EDT_DEV = _UD_23_HOME + "#UD_Estonian-EDT/et_edt-ud-dev.conllu"
UD_23_ESTONIAN_EDT_TEST = _UD_23_HOME + "#UD_Estonian-EDT/et_edt-ud-test.conllu"
UD_23_FAROESE_OFT_TEST = _UD_23_HOME + "#UD_Faroese-OFT/fo_oft-ud-test.conllu"
UD_23_FINNISH_FTB_TRAIN = _UD_23_HOME + "#UD_Finnish-FTB/fi_ftb-ud-train.conllu"
UD_23_FINNISH_FTB_DEV = _UD_23_HOME + "#UD_Finnish-FTB/fi_ftb-ud-dev.conllu"
UD_23_FINNISH_FTB_TEST = _UD_23_HOME + "#UD_Finnish-FTB/fi_ftb-ud-test.conllu"
UD_23_FINNISH_PUD_TEST = _UD_23_HOME + "#UD_Finnish-PUD/fi_pud-ud-test.conllu"
UD_23_FINNISH_TDT_TRAIN = _UD_23_HOME + "#UD_Finnish-TDT/fi_tdt-ud-train.conllu"
UD_23_FINNISH_TDT_DEV = _UD_23_HOME + "#UD_Finnish-TDT/fi_tdt-ud-dev.conllu"
UD_23_FINNISH_TDT_TEST = _UD_23_HOME + "#UD_Finnish-TDT/fi_tdt-ud-test.conllu"
UD_23_FRENCH_FTB_TRAIN = _UD_23_HOME + "#UD_French-FTB/fr_ftb-ud-train.conllu"
UD_23_FRENCH_FTB_DEV = _UD_23_HOME + "#UD_French-FTB/fr_ftb-ud-dev.conllu"
UD_23_FRENCH_FTB_TEST = _UD_23_HOME + "#UD_French-FTB/fr_ftb-ud-test.conllu"
UD_23_FRENCH_GSD_TRAIN = _UD_23_HOME + "#UD_French-GSD/fr_gsd-ud-train.conllu"
UD_23_FRENCH_GSD_DEV = _UD_23_HOME + "#UD_French-GSD/fr_gsd-ud-dev.conllu"
UD_23_FRENCH_GSD_TEST = _UD_23_HOME + "#UD_French-GSD/fr_gsd-ud-test.conllu"
UD_23_FRENCH_PUD_TEST = _UD_23_HOME + "#UD_French-PUD/fr_pud-ud-test.conllu"
UD_23_FRENCH_PARTUT_TRAIN = _UD_23_HOME + "#UD_French-ParTUT/fr_partut-ud-train.conllu"
UD_23_FRENCH_PARTUT_DEV = _UD_23_HOME + "#UD_French-ParTUT/fr_partut-ud-dev.conllu"
UD_23_FRENCH_PARTUT_TEST = _UD_23_HOME + "#UD_French-ParTUT/fr_partut-ud-test.conllu"
UD_23_FRENCH_SEQUOIA_TRAIN = _UD_23_HOME + "#UD_French-Sequoia/fr_sequoia-ud-train.conllu"
UD_23_FRENCH_SEQUOIA_DEV = _UD_23_HOME + "#UD_French-Sequoia/fr_sequoia-ud-dev.conllu"
UD_23_FRENCH_SEQUOIA_TEST = _UD_23_HOME + "#UD_French-Sequoia/fr_sequoia-ud-test.conllu"
UD_23_FRENCH_SPOKEN_TRAIN = _UD_23_HOME + "#UD_French-Spoken/fr_spoken-ud-train.conllu"
UD_23_FRENCH_SPOKEN_DEV = _UD_23_HOME + "#UD_French-Spoken/fr_spoken-ud-dev.conllu"
UD_23_FRENCH_SPOKEN_TEST = _UD_23_HOME + "#UD_French-Spoken/fr_spoken-ud-test.conllu"
UD_23_GALICIAN_CTG_TRAIN = _UD_23_HOME + "#UD_Galician-CTG/gl_ctg-ud-train.conllu"
UD_23_GALICIAN_CTG_DEV = _UD_23_HOME + "#UD_Galician-CTG/gl_ctg-ud-dev.conllu"
UD_23_GALICIAN_CTG_TEST = _UD_23_HOME + "#UD_Galician-CTG/gl_ctg-ud-test.conllu"
UD_23_GALICIAN_TREEGAL_TRAIN = _UD_23_HOME + "#UD_Galician-TreeGal/gl_treegal-ud-train.conllu"
UD_23_GALICIAN_TREEGAL_TEST = _UD_23_HOME + "#UD_Galician-TreeGal/gl_treegal-ud-test.conllu"
UD_23_GERMAN_GSD_TRAIN = _UD_23_HOME + "#UD_German-GSD/de_gsd-ud-train.conllu"
UD_23_GERMAN_GSD_DEV = _UD_23_HOME + "#UD_German-GSD/de_gsd-ud-dev.conllu"
UD_23_GERMAN_GSD_TEST = _UD_23_HOME + "#UD_German-GSD/de_gsd-ud-test.conllu"
UD_23_GERMAN_PUD_TEST = _UD_23_HOME + "#UD_German-PUD/de_pud-ud-test.conllu"
UD_23_GOTHIC_PROIEL_TRAIN = _UD_23_HOME + "#UD_Gothic-PROIEL/got_proiel-ud-train.conllu"
UD_23_GOTHIC_PROIEL_DEV = _UD_23_HOME + "#UD_Gothic-PROIEL/got_proiel-ud-dev.conllu"
UD_23_GOTHIC_PROIEL_TEST = _UD_23_HOME + "#UD_Gothic-PROIEL/got_proiel-ud-test.conllu"
UD_23_GREEK_GDT_TRAIN = _UD_23_HOME + "#UD_Greek-GDT/el_gdt-ud-train.conllu"
UD_23_GREEK_GDT_DEV = _UD_23_HOME + "#UD_Greek-GDT/el_gdt-ud-dev.conllu"
UD_23_GREEK_GDT_TEST = _UD_23_HOME + "#UD_Greek-GDT/el_gdt-ud-test.conllu"
UD_23_HEBREW_HTB_TRAIN = _UD_23_HOME + "#UD_Hebrew-HTB/he_htb-ud-train.conllu"
UD_23_HEBREW_HTB_DEV = _UD_23_HOME + "#UD_Hebrew-HTB/he_htb-ud-dev.conllu"
UD_23_HEBREW_HTB_TEST = _UD_23_HOME + "#UD_Hebrew-HTB/he_htb-ud-test.conllu"
UD_23_HINDI_HDTB_TRAIN = _UD_23_HOME + "#UD_Hindi-HDTB/hi_hdtb-ud-train.conllu"
UD_23_HINDI_HDTB_DEV = _UD_23_HOME + "#UD_Hindi-HDTB/hi_hdtb-ud-dev.conllu"
UD_23_HINDI_HDTB_TEST = _UD_23_HOME + "#UD_Hindi-HDTB/hi_hdtb-ud-test.conllu"
UD_23_HINDI_PUD_TEST = _UD_23_HOME + "#UD_Hindi-PUD/hi_pud-ud-test.conllu"
UD_23_HINDI_ENGLISH_HIENCS_TRAIN = _UD_23_HOME + "#UD_Hindi_English-HIENCS/qhe_hiencs-ud-train.conllu"
UD_23_HINDI_ENGLISH_HIENCS_DEV = _UD_23_HOME + "#UD_Hindi_English-HIENCS/qhe_hiencs-ud-dev.conllu"
UD_23_HINDI_ENGLISH_HIENCS_TEST = _UD_23_HOME + "#UD_Hindi_English-HIENCS/qhe_hiencs-ud-test.conllu"
UD_23_HUNGARIAN_SZEGED_TRAIN = _UD_23_HOME + "#UD_Hungarian-Szeged/hu_szeged-ud-train.conllu"
UD_23_HUNGARIAN_SZEGED_DEV = _UD_23_HOME + "#UD_Hungarian-Szeged/hu_szeged-ud-dev.conllu"
UD_23_HUNGARIAN_SZEGED_TEST = _UD_23_HOME + "#UD_Hungarian-Szeged/hu_szeged-ud-test.conllu"
UD_23_INDONESIAN_GSD_TRAIN = _UD_23_HOME + "#UD_Indonesian-GSD/id_gsd-ud-train.conllu"
UD_23_INDONESIAN_GSD_DEV = _UD_23_HOME + "#UD_Indonesian-GSD/id_gsd-ud-dev.conllu"
UD_23_INDONESIAN_GSD_TEST = _UD_23_HOME + "#UD_Indonesian-GSD/id_gsd-ud-test.conllu"
UD_23_INDONESIAN_PUD_TEST = _UD_23_HOME + "#UD_Indonesian-PUD/id_pud-ud-test.conllu"
UD_23_IRISH_IDT_TRAIN = _UD_23_HOME + "#UD_Irish-IDT/ga_idt-ud-train.conllu"
UD_23_IRISH_IDT_TEST = _UD_23_HOME + "#UD_Irish-IDT/ga_idt-ud-test.conllu"
UD_23_ITALIAN_ISDT_TRAIN = _UD_23_HOME + "#UD_Italian-ISDT/it_isdt-ud-train.conllu"
UD_23_ITALIAN_ISDT_DEV = _UD_23_HOME + "#UD_Italian-ISDT/it_isdt-ud-dev.conllu"
UD_23_ITALIAN_ISDT_TEST = _UD_23_HOME + "#UD_Italian-ISDT/it_isdt-ud-test.conllu"
UD_23_ITALIAN_PUD_TEST = _UD_23_HOME + "#UD_Italian-PUD/it_pud-ud-test.conllu"
UD_23_ITALIAN_PARTUT_TRAIN = _UD_23_HOME + "#UD_Italian-ParTUT/it_partut-ud-train.conllu"
UD_23_ITALIAN_PARTUT_DEV = _UD_23_HOME + "#UD_Italian-ParTUT/it_partut-ud-dev.conllu"
UD_23_ITALIAN_PARTUT_TEST = _UD_23_HOME + "#UD_Italian-ParTUT/it_partut-ud-test.conllu"
UD_23_ITALIAN_POSTWITA_TRAIN = _UD_23_HOME + "#UD_Italian-PoSTWITA/it_postwita-ud-train.conllu"
UD_23_ITALIAN_POSTWITA_DEV = _UD_23_HOME + "#UD_Italian-PoSTWITA/it_postwita-ud-dev.conllu"
UD_23_ITALIAN_POSTWITA_TEST = _UD_23_HOME + "#UD_Italian-PoSTWITA/it_postwita-ud-test.conllu"
UD_23_JAPANESE_BCCWJ_TRAIN = _UD_23_HOME + "#UD_Japanese-BCCWJ/ja_bccwj-ud-train.conllu"
UD_23_JAPANESE_BCCWJ_DEV = _UD_23_HOME + "#UD_Japanese-BCCWJ/ja_bccwj-ud-dev.conllu"
UD_23_JAPANESE_BCCWJ_TEST = _UD_23_HOME + "#UD_Japanese-BCCWJ/ja_bccwj-ud-test.conllu"
UD_23_JAPANESE_GSD_TRAIN = _UD_23_HOME + "#UD_Japanese-GSD/ja_gsd-ud-train.conllu"
UD_23_JAPANESE_GSD_DEV = _UD_23_HOME + "#UD_Japanese-GSD/ja_gsd-ud-dev.conllu"
UD_23_JAPANESE_GSD_TEST = _UD_23_HOME + "#UD_Japanese-GSD/ja_gsd-ud-test.conllu"
UD_23_JAPANESE_MODERN_TEST = _UD_23_HOME + "#UD_Japanese-Modern/ja_modern-ud-test.conllu"
UD_23_JAPANESE_PUD_TEST = _UD_23_HOME + "#UD_Japanese-PUD/ja_pud-ud-test.conllu"
UD_23_KAZAKH_KTB_TRAIN = _UD_23_HOME + "#UD_Kazakh-KTB/kk_ktb-ud-train.conllu"
UD_23_KAZAKH_KTB_TEST = _UD_23_HOME + "#UD_Kazakh-KTB/kk_ktb-ud-test.conllu"
UD_23_KOMI_ZYRIAN_IKDP_TEST = _UD_23_HOME + "#UD_Komi_Zyrian-IKDP/kpv_ikdp-ud-test.conllu"
UD_23_KOMI_ZYRIAN_LATTICE_TEST = _UD_23_HOME + "#UD_Komi_Zyrian-Lattice/kpv_lattice-ud-test.conllu"
UD_23_KOREAN_GSD_TRAIN = _UD_23_HOME + "#UD_Korean-GSD/ko_gsd-ud-train.conllu"
UD_23_KOREAN_GSD_DEV = _UD_23_HOME + "#UD_Korean-GSD/ko_gsd-ud-dev.conllu"
UD_23_KOREAN_GSD_TEST = _UD_23_HOME + "#UD_Korean-GSD/ko_gsd-ud-test.conllu"
UD_23_KOREAN_KAIST_TRAIN = _UD_23_HOME + "#UD_Korean-Kaist/ko_kaist-ud-train.conllu"
UD_23_KOREAN_KAIST_DEV = _UD_23_HOME + "#UD_Korean-Kaist/ko_kaist-ud-dev.conllu"
UD_23_KOREAN_KAIST_TEST = _UD_23_HOME + "#UD_Korean-Kaist/ko_kaist-ud-test.conllu"
UD_23_KOREAN_PUD_TEST = _UD_23_HOME + "#UD_Korean-PUD/ko_pud-ud-test.conllu"
UD_23_KURMANJI_MG_TRAIN = _UD_23_HOME + "#UD_Kurmanji-MG/kmr_mg-ud-train.conllu"
UD_23_KURMANJI_MG_TEST = _UD_23_HOME + "#UD_Kurmanji-MG/kmr_mg-ud-test.conllu"
UD_23_LATIN_ITTB_TRAIN = _UD_23_HOME + "#UD_Latin-ITTB/la_ittb-ud-train.conllu"
UD_23_LATIN_ITTB_DEV = _UD_23_HOME + "#UD_Latin-ITTB/la_ittb-ud-dev.conllu"
UD_23_LATIN_ITTB_TEST = _UD_23_HOME + "#UD_Latin-ITTB/la_ittb-ud-test.conllu"
UD_23_LATIN_PROIEL_TRAIN = _UD_23_HOME + "#UD_Latin-PROIEL/la_proiel-ud-train.conllu"
UD_23_LATIN_PROIEL_DEV = _UD_23_HOME + "#UD_Latin-PROIEL/la_proiel-ud-dev.conllu"
UD_23_LATIN_PROIEL_TEST = _UD_23_HOME + "#UD_Latin-PROIEL/la_proiel-ud-test.conllu"
UD_23_LATIN_PERSEUS_TRAIN = _UD_23_HOME + "#UD_Latin-Perseus/la_perseus-ud-train.conllu"
UD_23_LATIN_PERSEUS_TEST = _UD_23_HOME + "#UD_Latin-Perseus/la_perseus-ud-test.conllu"
UD_23_LATVIAN_LVTB_TRAIN = _UD_23_HOME + "#UD_Latvian-LVTB/lv_lvtb-ud-train.conllu"
UD_23_LATVIAN_LVTB_DEV = _UD_23_HOME + "#UD_Latvian-LVTB/lv_lvtb-ud-dev.conllu"
UD_23_LATVIAN_LVTB_TEST = _UD_23_HOME + "#UD_Latvian-LVTB/lv_lvtb-ud-test.conllu"
UD_23_LITHUANIAN_HSE_TRAIN = _UD_23_HOME + "#UD_Lithuanian-HSE/lt_hse-ud-train.conllu"
UD_23_LITHUANIAN_HSE_DEV = _UD_23_HOME + "#UD_Lithuanian-HSE/lt_hse-ud-dev.conllu"
UD_23_LITHUANIAN_HSE_TEST = _UD_23_HOME + "#UD_Lithuanian-HSE/lt_hse-ud-test.conllu"
UD_23_MALTESE_MUDT_TRAIN = _UD_23_HOME + "#UD_Maltese-MUDT/mt_mudt-ud-train.conllu"
UD_23_MALTESE_MUDT_DEV = _UD_23_HOME + "#UD_Maltese-MUDT/mt_mudt-ud-dev.conllu"
UD_23_MALTESE_MUDT_TEST = _UD_23_HOME + "#UD_Maltese-MUDT/mt_mudt-ud-test.conllu"
UD_23_MARATHI_UFAL_TRAIN = _UD_23_HOME + "#UD_Marathi-UFAL/mr_ufal-ud-train.conllu"
UD_23_MARATHI_UFAL_DEV = _UD_23_HOME + "#UD_Marathi-UFAL/mr_ufal-ud-dev.conllu"
UD_23_MARATHI_UFAL_TEST = _UD_23_HOME + "#UD_Marathi-UFAL/mr_ufal-ud-test.conllu"
UD_23_NAIJA_NSC_TEST = _UD_23_HOME + "#UD_Naija-NSC/pcm_nsc-ud-test.conllu"
UD_23_NORTH_SAMI_GIELLA_TRAIN = _UD_23_HOME + "#UD_North_Sami-Giella/sme_giella-ud-train.conllu"
UD_23_NORTH_SAMI_GIELLA_TEST = _UD_23_HOME + "#UD_North_Sami-Giella/sme_giella-ud-test.conllu"
UD_23_NORWEGIAN_BOKMAAL_TRAIN = _UD_23_HOME + "#UD_Norwegian-Bokmaal/no_bokmaal-ud-train.conllu"
UD_23_NORWEGIAN_BOKMAAL_DEV = _UD_23_HOME + "#UD_Norwegian-Bokmaal/no_bokmaal-ud-dev.conllu"
UD_23_NORWEGIAN_BOKMAAL_TEST = _UD_23_HOME + "#UD_Norwegian-Bokmaal/no_bokmaal-ud-test.conllu"
UD_23_NORWEGIAN_NYNORSK_TRAIN = _UD_23_HOME + "#UD_Norwegian-Nynorsk/no_nynorsk-ud-train.conllu"
UD_23_NORWEGIAN_NYNORSK_DEV = _UD_23_HOME + "#UD_Norwegian-Nynorsk/no_nynorsk-ud-dev.conllu"
UD_23_NORWEGIAN_NYNORSK_TEST = _UD_23_HOME + "#UD_Norwegian-Nynorsk/no_nynorsk-ud-test.conllu"
UD_23_NORWEGIAN_NYNORSKLIA_TRAIN = _UD_23_HOME + "#UD_Norwegian-NynorskLIA/no_nynorsklia-ud-train.conllu"
UD_23_NORWEGIAN_NYNORSKLIA_TEST = _UD_23_HOME + "#UD_Norwegian-NynorskLIA/no_nynorsklia-ud-test.conllu"
UD_23_OLD_CHURCH_SLAVONIC_PROIEL_TRAIN = _UD_23_HOME + "#UD_Old_Church_Slavonic-PROIEL/cu_proiel-ud-train.conllu"
UD_23_OLD_CHURCH_SLAVONIC_PROIEL_DEV = _UD_23_HOME + "#UD_Old_Church_Slavonic-PROIEL/cu_proiel-ud-dev.conllu"
UD_23_OLD_CHURCH_SLAVONIC_PROIEL_TEST = _UD_23_HOME + "#UD_Old_Church_Slavonic-PROIEL/cu_proiel-ud-test.conllu"
UD_23_OLD_FRENCH_SRCMF_TRAIN = _UD_23_HOME + "#UD_Old_French-SRCMF/fro_srcmf-ud-train.conllu"
UD_23_OLD_FRENCH_SRCMF_DEV = _UD_23_HOME + "#UD_Old_French-SRCMF/fro_srcmf-ud-dev.conllu"
UD_23_OLD_FRENCH_SRCMF_TEST = _UD_23_HOME + "#UD_Old_French-SRCMF/fro_srcmf-ud-test.conllu"
UD_23_PERSIAN_SERAJI_TRAIN = _UD_23_HOME + "#UD_Persian-Seraji/fa_seraji-ud-train.conllu"
UD_23_PERSIAN_SERAJI_DEV = _UD_23_HOME + "#UD_Persian-Seraji/fa_seraji-ud-dev.conllu"
UD_23_PERSIAN_SERAJI_TEST = _UD_23_HOME + "#UD_Persian-Seraji/fa_seraji-ud-test.conllu"
UD_23_POLISH_LFG_TRAIN = _UD_23_HOME + "#UD_Polish-LFG/pl_lfg-ud-train.conllu"
UD_23_POLISH_LFG_DEV = _UD_23_HOME + "#UD_Polish-LFG/pl_lfg-ud-dev.conllu"
UD_23_POLISH_LFG_TEST = _UD_23_HOME + "#UD_Polish-LFG/pl_lfg-ud-test.conllu"
UD_23_POLISH_SZ_TRAIN = _UD_23_HOME + "#UD_Polish-SZ/pl_sz-ud-train.conllu"
UD_23_POLISH_SZ_DEV = _UD_23_HOME + "#UD_Polish-SZ/pl_sz-ud-dev.conllu"
UD_23_POLISH_SZ_TEST = _UD_23_HOME + "#UD_Polish-SZ/pl_sz-ud-test.conllu"
UD_23_PORTUGUESE_BOSQUE_TRAIN = _UD_23_HOME + "#UD_Portuguese-Bosque/pt_bosque-ud-train.conllu"
UD_23_PORTUGUESE_BOSQUE_DEV = _UD_23_HOME + "#UD_Portuguese-Bosque/pt_bosque-ud-dev.conllu"
UD_23_PORTUGUESE_BOSQUE_TEST = _UD_23_HOME + "#UD_Portuguese-Bosque/pt_bosque-ud-test.conllu"
UD_23_PORTUGUESE_GSD_TRAIN = _UD_23_HOME + "#UD_Portuguese-GSD/pt_gsd-ud-train.conllu"
UD_23_PORTUGUESE_GSD_DEV = _UD_23_HOME + "#UD_Portuguese-GSD/pt_gsd-ud-dev.conllu"
UD_23_PORTUGUESE_GSD_TEST = _UD_23_HOME + "#UD_Portuguese-GSD/pt_gsd-ud-test.conllu"
UD_23_PORTUGUESE_PUD_TEST = _UD_23_HOME + "#UD_Portuguese-PUD/pt_pud-ud-test.conllu"
UD_23_ROMANIAN_NONSTANDARD_TRAIN = _UD_23_HOME + "#UD_Romanian-Nonstandard/ro_nonstandard-ud-train.conllu"
UD_23_ROMANIAN_NONSTANDARD_DEV = _UD_23_HOME + "#UD_Romanian-Nonstandard/ro_nonstandard-ud-dev.conllu"
UD_23_ROMANIAN_NONSTANDARD_TEST = _UD_23_HOME + "#UD_Romanian-Nonstandard/ro_nonstandard-ud-test.conllu"
UD_23_ROMANIAN_RRT_TRAIN = _UD_23_HOME + "#UD_Romanian-RRT/ro_rrt-ud-train.conllu"
UD_23_ROMANIAN_RRT_DEV = _UD_23_HOME + "#UD_Romanian-RRT/ro_rrt-ud-dev.conllu"
UD_23_ROMANIAN_RRT_TEST = _UD_23_HOME + "#UD_Romanian-RRT/ro_rrt-ud-test.conllu"
UD_23_RUSSIAN_GSD_TRAIN = _UD_23_HOME + "#UD_Russian-GSD/ru_gsd-ud-train.conllu"
UD_23_RUSSIAN_GSD_DEV = _UD_23_HOME + "#UD_Russian-GSD/ru_gsd-ud-dev.conllu"
UD_23_RUSSIAN_GSD_TEST = _UD_23_HOME + "#UD_Russian-GSD/ru_gsd-ud-test.conllu"
UD_23_RUSSIAN_PUD_TEST = _UD_23_HOME + "#UD_Russian-PUD/ru_pud-ud-test.conllu"
UD_23_RUSSIAN_SYNTAGRUS_TRAIN = _UD_23_HOME + "#UD_Russian-SynTagRus/ru_syntagrus-ud-train.conllu"
UD_23_RUSSIAN_SYNTAGRUS_DEV = _UD_23_HOME + "#UD_Russian-SynTagRus/ru_syntagrus-ud-dev.conllu"
UD_23_RUSSIAN_SYNTAGRUS_TEST = _UD_23_HOME + "#UD_Russian-SynTagRus/ru_syntagrus-ud-test.conllu"
UD_23_RUSSIAN_TAIGA_TRAIN = _UD_23_HOME + "#UD_Russian-Taiga/ru_taiga-ud-train.conllu"
UD_23_RUSSIAN_TAIGA_TEST = _UD_23_HOME + "#UD_Russian-Taiga/ru_taiga-ud-test.conllu"
UD_23_SANSKRIT_UFAL_TEST = _UD_23_HOME + "#UD_Sanskrit-UFAL/sa_ufal-ud-test.conllu"
UD_23_SERBIAN_SET_TRAIN = _UD_23_HOME + "#UD_Serbian-SET/sr_set-ud-train.conllu"
UD_23_SERBIAN_SET_DEV = _UD_23_HOME + "#UD_Serbian-SET/sr_set-ud-dev.conllu"
UD_23_SERBIAN_SET_TEST = _UD_23_HOME + "#UD_Serbian-SET/sr_set-ud-test.conllu"
UD_23_SLOVAK_SNK_TRAIN = _UD_23_HOME + "#UD_Slovak-SNK/sk_snk-ud-train.conllu"
UD_23_SLOVAK_SNK_DEV = _UD_23_HOME + "#UD_Slovak-SNK/sk_snk-ud-dev.conllu"
UD_23_SLOVAK_SNK_TEST = _UD_23_HOME + "#UD_Slovak-SNK/sk_snk-ud-test.conllu"
UD_23_SLOVENIAN_SSJ_TRAIN = _UD_23_HOME + "#UD_Slovenian-SSJ/sl_ssj-ud-train.conllu"
UD_23_SLOVENIAN_SSJ_DEV = _UD_23_HOME + "#UD_Slovenian-SSJ/sl_ssj-ud-dev.conllu"
UD_23_SLOVENIAN_SSJ_TEST = _UD_23_HOME + "#UD_Slovenian-SSJ/sl_ssj-ud-test.conllu"
UD_23_SLOVENIAN_SST_TRAIN = _UD_23_HOME + "#UD_Slovenian-SST/sl_sst-ud-train.conllu"
UD_23_SLOVENIAN_SST_TEST = _UD_23_HOME + "#UD_Slovenian-SST/sl_sst-ud-test.conllu"
UD_23_SPANISH_ANCORA_TRAIN = _UD_23_HOME + "#UD_Spanish-AnCora/es_ancora-ud-train.conllu"
UD_23_SPANISH_ANCORA_DEV = _UD_23_HOME + "#UD_Spanish-AnCora/es_ancora-ud-dev.conllu"
UD_23_SPANISH_ANCORA_TEST = _UD_23_HOME + "#UD_Spanish-AnCora/es_ancora-ud-test.conllu"
UD_23_SPANISH_GSD_TRAIN = _UD_23_HOME + "#UD_Spanish-GSD/es_gsd-ud-train.conllu"
UD_23_SPANISH_GSD_DEV = _UD_23_HOME + "#UD_Spanish-GSD/es_gsd-ud-dev.conllu"
UD_23_SPANISH_GSD_TEST = _UD_23_HOME + "#UD_Spanish-GSD/es_gsd-ud-test.conllu"
UD_23_SPANISH_PUD_TEST = _UD_23_HOME + "#UD_Spanish-PUD/es_pud-ud-test.conllu"
UD_23_SWEDISH_LINES_TRAIN = _UD_23_HOME + "#UD_Swedish-LinES/sv_lines-ud-train.conllu"
UD_23_SWEDISH_LINES_DEV = _UD_23_HOME + "#UD_Swedish-LinES/sv_lines-ud-dev.conllu"
UD_23_SWEDISH_LINES_TEST = _UD_23_HOME + "#UD_Swedish-LinES/sv_lines-ud-test.conllu"
UD_23_SWEDISH_PUD_TEST = _UD_23_HOME + "#UD_Swedish-PUD/sv_pud-ud-test.conllu"
UD_23_SWEDISH_TALBANKEN_TRAIN = _UD_23_HOME + "#UD_Swedish-Talbanken/sv_talbanken-ud-train.conllu"
UD_23_SWEDISH_TALBANKEN_DEV = _UD_23_HOME + "#UD_Swedish-Talbanken/sv_talbanken-ud-dev.conllu"
UD_23_SWEDISH_TALBANKEN_TEST = _UD_23_HOME + "#UD_Swedish-Talbanken/sv_talbanken-ud-test.conllu"
UD_23_SWEDISH_SIGN_LANGUAGE_SSLC_TRAIN = _UD_23_HOME + "#UD_Swedish_Sign_Language-SSLC/swl_sslc-ud-train.conllu"
UD_23_SWEDISH_SIGN_LANGUAGE_SSLC_DEV = _UD_23_HOME + "#UD_Swedish_Sign_Language-SSLC/swl_sslc-ud-dev.conllu"
UD_23_SWEDISH_SIGN_LANGUAGE_SSLC_TEST = _UD_23_HOME + "#UD_Swedish_Sign_Language-SSLC/swl_sslc-ud-test.conllu"
UD_23_TAGALOG_TRG_TEST = _UD_23_HOME + "#UD_Tagalog-TRG/tl_trg-ud-test.conllu"
UD_23_TAMIL_TTB_TRAIN = _UD_23_HOME + "#UD_Tamil-TTB/ta_ttb-ud-train.conllu"
UD_23_TAMIL_TTB_DEV = _UD_23_HOME + "#UD_Tamil-TTB/ta_ttb-ud-dev.conllu"
UD_23_TAMIL_TTB_TEST = _UD_23_HOME + "#UD_Tamil-TTB/ta_ttb-ud-test.conllu"
UD_23_TELUGU_MTG_TRAIN = _UD_23_HOME + "#UD_Telugu-MTG/te_mtg-ud-train.conllu"
UD_23_TELUGU_MTG_DEV = _UD_23_HOME + "#UD_Telugu-MTG/te_mtg-ud-dev.conllu"
UD_23_TELUGU_MTG_TEST = _UD_23_HOME + "#UD_Telugu-MTG/te_mtg-ud-test.conllu"
UD_23_THAI_PUD_TEST = _UD_23_HOME + "#UD_Thai-PUD/th_pud-ud-test.conllu"
UD_23_TURKISH_IMST_TRAIN = _UD_23_HOME + "#UD_Turkish-IMST/tr_imst-ud-train.conllu"
UD_23_TURKISH_IMST_DEV = _UD_23_HOME + "#UD_Turkish-IMST/tr_imst-ud-dev.conllu"
UD_23_TURKISH_IMST_TEST = _UD_23_HOME + "#UD_Turkish-IMST/tr_imst-ud-test.conllu"
UD_23_TURKISH_PUD_TEST = _UD_23_HOME + "#UD_Turkish-PUD/tr_pud-ud-test.conllu"
UD_23_UKRAINIAN_IU_TRAIN = _UD_23_HOME + "#UD_Ukrainian-IU/uk_iu-ud-train.conllu"
UD_23_UKRAINIAN_IU_DEV = _UD_23_HOME + "#UD_Ukrainian-IU/uk_iu-ud-dev.conllu"
UD_23_UKRAINIAN_IU_TEST = _UD_23_HOME + "#UD_Ukrainian-IU/uk_iu-ud-test.conllu"
UD_23_UPPER_SORBIAN_UFAL_TRAIN = _UD_23_HOME + "#UD_Upper_Sorbian-UFAL/hsb_ufal-ud-train.conllu"
UD_23_UPPER_SORBIAN_UFAL_TEST = _UD_23_HOME + "#UD_Upper_Sorbian-UFAL/hsb_ufal-ud-test.conllu"
UD_23_URDU_UDTB_TRAIN = _UD_23_HOME + "#UD_Urdu-UDTB/ur_udtb-ud-train.conllu"
UD_23_URDU_UDTB_DEV = _UD_23_HOME + "#UD_Urdu-UDTB/ur_udtb-ud-dev.conllu"
UD_23_URDU_UDTB_TEST = _UD_23_HOME + "#UD_Urdu-UDTB/ur_udtb-ud-test.conllu"
UD_23_UYGHUR_UDT_TRAIN = _UD_23_HOME + "#UD_Uyghur-UDT/ug_udt-ud-train.conllu"
UD_23_UYGHUR_UDT_DEV = _UD_23_HOME + "#UD_Uyghur-UDT/ug_udt-ud-dev.conllu"
UD_23_UYGHUR_UDT_TEST = _UD_23_HOME + "#UD_Uyghur-UDT/ug_udt-ud-test.conllu"
UD_23_VIETNAMESE_VTB_TRAIN = _UD_23_HOME + "#UD_Vietnamese-VTB/vi_vtb-ud-train.conllu"
UD_23_VIETNAMESE_VTB_DEV = _UD_23_HOME + "#UD_Vietnamese-VTB/vi_vtb-ud-dev.conllu"
UD_23_VIETNAMESE_VTB_TEST = _UD_23_HOME + "#UD_Vietnamese-VTB/vi_vtb-ud-test.conllu"
UD_23_WARLPIRI_UFAL_TEST = _UD_23_HOME + "#UD_Warlpiri-UFAL/wbp_ufal-ud-test.conllu"
UD_23_YORUBA_YTB_TEST = _UD_23_HOME + "#UD_Yoruba-YTB/yo_ytb-ud-test.conllu"
================================================
FILE: hanlp/datasets/parsing/ud/ud23m.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-05-21 20:39
import os
from hanlp.datasets.parsing.ud import concat_treebanks
from .ud23 import _UD_23_HOME
_UD_23_MULTILINGUAL_HOME = concat_treebanks(_UD_23_HOME, '2.3')
UD_23_MULTILINGUAL_TRAIN = os.path.join(_UD_23_MULTILINGUAL_HOME, 'train.conllu')
"Training set of multilingual UD_23 obtained by concatenating all training sets."
UD_23_MULTILINGUAL_DEV = os.path.join(_UD_23_MULTILINGUAL_HOME, 'dev.conllu')
"Dev set of multilingual UD_23 obtained by concatenating all dev sets."
UD_23_MULTILINGUAL_TEST = os.path.join(_UD_23_MULTILINGUAL_HOME, 'test.conllu')
"Test set of multilingual UD_23 obtained by concatenating all test sets."
================================================
FILE: hanlp/datasets/parsing/ud/ud27.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-07 21:03
import glob
import os
from hanlp.utils.io_util import uncompress, get_resource
_UD_27_URL = "https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-3424/allzip"
_UD_27_HOME = _UD_27_URL + '#ud-treebanks-v2.7/'
_path = get_resource(_UD_27_URL)
if os.path.isfile(_path):
    # The "allzip" endpoint serves a zip that wraps the actual .tgz release, so
    # rename the download to .zip, then extract both layers in place.
    os.rename(_path, _path + '.zip')
    uncompress(_path + '.zip')
    uncompress(os.path.join(_path, 'ud-treebanks-v2.7.tgz'))


# noinspection PyShadowingNames
def _list_dir(path, home):
    # Regenerates the UD_27_* constants below; unlike the ud23 generator, this
    # one also emits a docstring line after each assignment.
    prefix = home.lstrip('_').replace('_HOME', '')
    path = get_resource(path)
    with open('ud27.py', 'a') as out:
        for f in sorted(glob.glob(path + '/ud-treebanks-v2.7/UD_*')):
            basename = os.path.basename(f)
            name = basename[len('UD_'):]
            name = name.upper().replace('-', '_')
            for split in 'train', 'dev', 'test':
                sp = glob.glob(f + f'/*{split}.conllu')
                if not sp:
                    continue
                sp = os.path.basename(sp[0])
                out.write(f'{prefix}_{name}_{split.upper()} = {home} + "{basename}/{sp}"\n')
                out.write(f'"{prefix} {split} set of {name}."\n')


def main():
    _list_dir(_UD_27_URL, '_UD_27_HOME')


if __name__ == '__main__':
    main()
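
# Stdlib sketch of the nested extraction above (added for illustration; assumes
# the allzip download is a zip wrapping ud-treebanks-v2.7.tgz, and the file
# names here are placeholders):
def _extract_allzip(zip_path='allzip.zip', out_dir='out'):
    import tarfile
    import zipfile
    with zipfile.ZipFile(zip_path) as z:
        z.extract('ud-treebanks-v2.7.tgz', out_dir)
    with tarfile.open(os.path.join(out_dir, 'ud-treebanks-v2.7.tgz')) as t:
        t.extractall(out_dir)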
UD_27_AFRIKAANS_AFRIBOOMS_TRAIN = _UD_27_HOME + "UD_Afrikaans-AfriBooms/af_afribooms-ud-train.conllu"
"UD_27 train set of AFRIKAANS_AFRIBOOMS."
UD_27_AFRIKAANS_AFRIBOOMS_DEV = _UD_27_HOME + "UD_Afrikaans-AfriBooms/af_afribooms-ud-dev.conllu"
"UD_27 dev set of AFRIKAANS_AFRIBOOMS."
UD_27_AFRIKAANS_AFRIBOOMS_TEST = _UD_27_HOME + "UD_Afrikaans-AfriBooms/af_afribooms-ud-test.conllu"
"UD_27 test set of AFRIKAANS_AFRIBOOMS."
UD_27_AKKADIAN_PISANDUB_TEST = _UD_27_HOME + "UD_Akkadian-PISANDUB/akk_pisandub-ud-test.conllu"
"UD_27 test set of AKKADIAN_PISANDUB."
UD_27_AKKADIAN_RIAO_TEST = _UD_27_HOME + "UD_Akkadian-RIAO/akk_riao-ud-test.conllu"
"UD_27 test set of AKKADIAN_RIAO."
UD_27_AKUNTSU_TUDET_TEST = _UD_27_HOME + "UD_Akuntsu-TuDeT/aqz_tudet-ud-test.conllu"
"UD_27 test set of AKUNTSU_TUDET."
UD_27_ALBANIAN_TSA_TEST = _UD_27_HOME + "UD_Albanian-TSA/sq_tsa-ud-test.conllu"
"UD_27 test set of ALBANIAN_TSA."
UD_27_AMHARIC_ATT_TEST = _UD_27_HOME + "UD_Amharic-ATT/am_att-ud-test.conllu"
"UD_27 test set of AMHARIC_ATT."
UD_27_ANCIENT_GREEK_PROIEL_TRAIN = _UD_27_HOME + "UD_Ancient_Greek-PROIEL/grc_proiel-ud-train.conllu"
"UD_27 train set of ANCIENT_GREEK_PROIEL."
UD_27_ANCIENT_GREEK_PROIEL_DEV = _UD_27_HOME + "UD_Ancient_Greek-PROIEL/grc_proiel-ud-dev.conllu"
"UD_27 dev set of ANCIENT_GREEK_PROIEL."
UD_27_ANCIENT_GREEK_PROIEL_TEST = _UD_27_HOME + "UD_Ancient_Greek-PROIEL/grc_proiel-ud-test.conllu"
"UD_27 test set of ANCIENT_GREEK_PROIEL."
UD_27_ANCIENT_GREEK_PERSEUS_TRAIN = _UD_27_HOME + "UD_Ancient_Greek-Perseus/grc_perseus-ud-train.conllu"
"UD_27 train set of ANCIENT_GREEK_PERSEUS."
UD_27_ANCIENT_GREEK_PERSEUS_DEV = _UD_27_HOME + "UD_Ancient_Greek-Perseus/grc_perseus-ud-dev.conllu"
"UD_27 dev set of ANCIENT_GREEK_PERSEUS."
UD_27_ANCIENT_GREEK_PERSEUS_TEST = _UD_27_HOME + "UD_Ancient_Greek-Perseus/grc_perseus-ud-test.conllu"
"UD_27 test set of ANCIENT_GREEK_PERSEUS."
UD_27_APURINA_UFPA_TEST = _UD_27_HOME + "UD_Apurina-UFPA/apu_ufpa-ud-test.conllu"
"UD_27 test set of APURINA_UFPA."
UD_27_ARABIC_NYUAD_TRAIN = _UD_27_HOME + "UD_Arabic-NYUAD/ar_nyuad-ud-train.conllu"
"UD_27 train set of ARABIC_NYUAD."
UD_27_ARABIC_NYUAD_DEV = _UD_27_HOME + "UD_Arabic-NYUAD/ar_nyuad-ud-dev.conllu"
"UD_27 dev set of ARABIC_NYUAD."
UD_27_ARABIC_NYUAD_TEST = _UD_27_HOME + "UD_Arabic-NYUAD/ar_nyuad-ud-test.conllu"
"UD_27 test set of ARABIC_NYUAD."
UD_27_ARABIC_PADT_TRAIN = _UD_27_HOME + "UD_Arabic-PADT/ar_padt-ud-train.conllu"
"UD_27 train set of ARABIC_PADT."
UD_27_ARABIC_PADT_DEV = _UD_27_HOME + "UD_Arabic-PADT/ar_padt-ud-dev.conllu"
"UD_27 dev set of ARABIC_PADT."
UD_27_ARABIC_PADT_TEST = _UD_27_HOME + "UD_Arabic-PADT/ar_padt-ud-test.conllu"
"UD_27 test set of ARABIC_PADT."
UD_27_ARABIC_PUD_TEST = _UD_27_HOME + "UD_Arabic-PUD/ar_pud-ud-test.conllu"
"UD_27 test set of ARABIC_PUD."
UD_27_ARMENIAN_ARMTDP_TRAIN = _UD_27_HOME + "UD_Armenian-ArmTDP/hy_armtdp-ud-train.conllu"
"UD_27 train set of ARMENIAN_ARMTDP."
UD_27_ARMENIAN_ARMTDP_DEV = _UD_27_HOME + "UD_Armenian-ArmTDP/hy_armtdp-ud-dev.conllu"
"UD_27 dev set of ARMENIAN_ARMTDP."
UD_27_ARMENIAN_ARMTDP_TEST = _UD_27_HOME + "UD_Armenian-ArmTDP/hy_armtdp-ud-test.conllu"
"UD_27 test set of ARMENIAN_ARMTDP."
UD_27_ASSYRIAN_AS_TEST = _UD_27_HOME + "UD_Assyrian-AS/aii_as-ud-test.conllu"
"UD_27 test set of ASSYRIAN_AS."
UD_27_BAMBARA_CRB_TEST = _UD_27_HOME + "UD_Bambara-CRB/bm_crb-ud-test.conllu"
"UD_27 test set of BAMBARA_CRB."
UD_27_BASQUE_BDT_TRAIN = _UD_27_HOME + "UD_Basque-BDT/eu_bdt-ud-train.conllu"
"UD_27 train set of BASQUE_BDT."
UD_27_BASQUE_BDT_DEV = _UD_27_HOME + "UD_Basque-BDT/eu_bdt-ud-dev.conllu"
"UD_27 dev set of BASQUE_BDT."
UD_27_BASQUE_BDT_TEST = _UD_27_HOME + "UD_Basque-BDT/eu_bdt-ud-test.conllu"
"UD_27 test set of BASQUE_BDT."
UD_27_BELARUSIAN_HSE_TRAIN = _UD_27_HOME + "UD_Belarusian-HSE/be_hse-ud-train.conllu"
"UD_27 train set of BELARUSIAN_HSE."
UD_27_BELARUSIAN_HSE_DEV = _UD_27_HOME + "UD_Belarusian-HSE/be_hse-ud-dev.conllu"
"UD_27 dev set of BELARUSIAN_HSE."
UD_27_BELARUSIAN_HSE_TEST = _UD_27_HOME + "UD_Belarusian-HSE/be_hse-ud-test.conllu"
"UD_27 test set of BELARUSIAN_HSE."
UD_27_BHOJPURI_BHTB_TEST = _UD_27_HOME + "UD_Bhojpuri-BHTB/bho_bhtb-ud-test.conllu"
"UD_27 test set of BHOJPURI_BHTB."
UD_27_BRETON_KEB_TEST = _UD_27_HOME + "UD_Breton-KEB/br_keb-ud-test.conllu"
"UD_27 test set of BRETON_KEB."
UD_27_BULGARIAN_BTB_TRAIN = _UD_27_HOME + "UD_Bulgarian-BTB/bg_btb-ud-train.conllu"
"UD_27 train set of BULGARIAN_BTB."
UD_27_BULGARIAN_BTB_DEV = _UD_27_HOME + "UD_Bulgarian-BTB/bg_btb-ud-dev.conllu"
"UD_27 dev set of BULGARIAN_BTB."
UD_27_BULGARIAN_BTB_TEST = _UD_27_HOME + "UD_Bulgarian-BTB/bg_btb-ud-test.conllu"
"UD_27 test set of BULGARIAN_BTB."
UD_27_BURYAT_BDT_TRAIN = _UD_27_HOME + "UD_Buryat-BDT/bxr_bdt-ud-train.conllu"
"UD_27 train set of BURYAT_BDT."
UD_27_BURYAT_BDT_TEST = _UD_27_HOME + "UD_Buryat-BDT/bxr_bdt-ud-test.conllu"
"UD_27 test set of BURYAT_BDT."
UD_27_CANTONESE_HK_TEST = _UD_27_HOME + "UD_Cantonese-HK/yue_hk-ud-test.conllu"
"UD_27 test set of CANTONESE_HK."
UD_27_CATALAN_ANCORA_TRAIN = _UD_27_HOME + "UD_Catalan-AnCora/ca_ancora-ud-train.conllu"
"UD_27 train set of CATALAN_ANCORA."
UD_27_CATALAN_ANCORA_DEV = _UD_27_HOME + "UD_Catalan-AnCora/ca_ancora-ud-dev.conllu"
"UD_27 dev set of CATALAN_ANCORA."
UD_27_CATALAN_ANCORA_TEST = _UD_27_HOME + "UD_Catalan-AnCora/ca_ancora-ud-test.conllu"
"UD_27 test set of CATALAN_ANCORA."
UD_27_CHINESE_CFL_TEST = _UD_27_HOME + "UD_Chinese-CFL/zh_cfl-ud-test.conllu"
"UD_27 test set of CHINESE_CFL."
UD_27_CHINESE_GSD_TRAIN = _UD_27_HOME + "UD_Chinese-GSD/zh_gsd-ud-train.conllu"
"UD_27 train set of CHINESE_GSD."
UD_27_CHINESE_GSD_DEV = _UD_27_HOME + "UD_Chinese-GSD/zh_gsd-ud-dev.conllu"
"UD_27 dev set of CHINESE_GSD."
UD_27_CHINESE_GSD_TEST = _UD_27_HOME + "UD_Chinese-GSD/zh_gsd-ud-test.conllu"
"UD_27 test set of CHINESE_GSD."
UD_27_CHINESE_GSDSIMP_TRAIN = _UD_27_HOME + "UD_Chinese-GSDSimp/zh_gsdsimp-ud-train.conllu"
"UD_27 train set of CHINESE_GSDSIMP."
UD_27_CHINESE_GSDSIMP_DEV = _UD_27_HOME + "UD_Chinese-GSDSimp/zh_gsdsimp-ud-dev.conllu"
"UD_27 dev set of CHINESE_GSDSIMP."
UD_27_CHINESE_GSDSIMP_TEST = _UD_27_HOME + "UD_Chinese-GSDSimp/zh_gsdsimp-ud-test.conllu"
"UD_27 test set of CHINESE_GSDSIMP."
UD_27_CHINESE_HK_TEST = _UD_27_HOME + "UD_Chinese-HK/zh_hk-ud-test.conllu"
"UD_27 test set of CHINESE_HK."
UD_27_CHINESE_PUD_TEST = _UD_27_HOME + "UD_Chinese-PUD/zh_pud-ud-test.conllu"
"UD_27 test set of CHINESE_PUD."
UD_27_CHUKCHI_HSE_TEST = _UD_27_HOME + "UD_Chukchi-HSE/ckt_hse-ud-test.conllu"
"UD_27 test set of CHUKCHI_HSE."
UD_27_CLASSICAL_CHINESE_KYOTO_TRAIN = _UD_27_HOME + "UD_Classical_Chinese-Kyoto/lzh_kyoto-ud-train.conllu"
"UD_27 train set of CLASSICAL_CHINESE_KYOTO."
UD_27_CLASSICAL_CHINESE_KYOTO_DEV = _UD_27_HOME + "UD_Classical_Chinese-Kyoto/lzh_kyoto-ud-dev.conllu"
"UD_27 dev set of CLASSICAL_CHINESE_KYOTO."
UD_27_CLASSICAL_CHINESE_KYOTO_TEST = _UD_27_HOME + "UD_Classical_Chinese-Kyoto/lzh_kyoto-ud-test.conllu"
"UD_27 test set of CLASSICAL_CHINESE_KYOTO."
UD_27_COPTIC_SCRIPTORIUM_TRAIN = _UD_27_HOME + "UD_Coptic-Scriptorium/cop_scriptorium-ud-train.conllu"
"UD_27 train set of COPTIC_SCRIPTORIUM."
UD_27_COPTIC_SCRIPTORIUM_DEV = _UD_27_HOME + "UD_Coptic-Scriptorium/cop_scriptorium-ud-dev.conllu"
"UD_27 dev set of COPTIC_SCRIPTORIUM."
UD_27_COPTIC_SCRIPTORIUM_TEST = _UD_27_HOME + "UD_Coptic-Scriptorium/cop_scriptorium-ud-test.conllu"
"UD_27 test set of COPTIC_SCRIPTORIUM."
UD_27_CROATIAN_SET_TRAIN = _UD_27_HOME + "UD_Croatian-SET/hr_set-ud-train.conllu"
"UD_27 train set of CROATIAN_SET."
UD_27_CROATIAN_SET_DEV = _UD_27_HOME + "UD_Croatian-SET/hr_set-ud-dev.conllu"
"UD_27 dev set of CROATIAN_SET."
UD_27_CROATIAN_SET_TEST = _UD_27_HOME + "UD_Croatian-SET/hr_set-ud-test.conllu"
"UD_27 test set of CROATIAN_SET."
UD_27_CZECH_CAC_TRAIN = _UD_27_HOME + "UD_Czech-CAC/cs_cac-ud-train.conllu"
"UD_27 train set of CZECH_CAC."
UD_27_CZECH_CAC_DEV = _UD_27_HOME + "UD_Czech-CAC/cs_cac-ud-dev.conllu"
"UD_27 dev set of CZECH_CAC."
UD_27_CZECH_CAC_TEST = _UD_27_HOME + "UD_Czech-CAC/cs_cac-ud-test.conllu"
"UD_27 test set of CZECH_CAC."
UD_27_CZECH_CLTT_TRAIN = _UD_27_HOME + "UD_Czech-CLTT/cs_cltt-ud-train.conllu"
"UD_27 train set of CZECH_CLTT."
UD_27_CZECH_CLTT_DEV = _UD_27_HOME + "UD_Czech-CLTT/cs_cltt-ud-dev.conllu"
"UD_27 dev set of CZECH_CLTT."
UD_27_CZECH_CLTT_TEST = _UD_27_HOME + "UD_Czech-CLTT/cs_cltt-ud-test.conllu"
"UD_27 test set of CZECH_CLTT."
UD_27_CZECH_FICTREE_TRAIN = _UD_27_HOME + "UD_Czech-FicTree/cs_fictree-ud-train.conllu"
"UD_27 train set of CZECH_FICTREE."
UD_27_CZECH_FICTREE_DEV = _UD_27_HOME + "UD_Czech-FicTree/cs_fictree-ud-dev.conllu"
"UD_27 dev set of CZECH_FICTREE."
UD_27_CZECH_FICTREE_TEST = _UD_27_HOME + "UD_Czech-FicTree/cs_fictree-ud-test.conllu"
"UD_27 test set of CZECH_FICTREE."
UD_27_CZECH_PDT_TRAIN = _UD_27_HOME + "UD_Czech-PDT/cs_pdt-ud-train.conllu"
"UD_27 train set of CZECH_PDT."
UD_27_CZECH_PDT_DEV = _UD_27_HOME + "UD_Czech-PDT/cs_pdt-ud-dev.conllu"
"UD_27 dev set of CZECH_PDT."
UD_27_CZECH_PDT_TEST = _UD_27_HOME + "UD_Czech-PDT/cs_pdt-ud-test.conllu"
"UD_27 test set of CZECH_PDT."
UD_27_CZECH_PUD_TEST = _UD_27_HOME + "UD_Czech-PUD/cs_pud-ud-test.conllu"
"UD_27 test set of CZECH_PUD."
UD_27_DANISH_DDT_TRAIN = _UD_27_HOME + "UD_Danish-DDT/da_ddt-ud-train.conllu"
"UD_27 train set of DANISH_DDT."
UD_27_DANISH_DDT_DEV = _UD_27_HOME + "UD_Danish-DDT/da_ddt-ud-dev.conllu"
"UD_27 dev set of DANISH_DDT."
UD_27_DANISH_DDT_TEST = _UD_27_HOME + "UD_Danish-DDT/da_ddt-ud-test.conllu"
"UD_27 test set of DANISH_DDT."
UD_27_DUTCH_ALPINO_TRAIN = _UD_27_HOME + "UD_Dutch-Alpino/nl_alpino-ud-train.conllu"
"UD_27 train set of DUTCH_ALPINO."
UD_27_DUTCH_ALPINO_DEV = _UD_27_HOME + "UD_Dutch-Alpino/nl_alpino-ud-dev.conllu"
"UD_27 dev set of DUTCH_ALPINO."
UD_27_DUTCH_ALPINO_TEST = _UD_27_HOME + "UD_Dutch-Alpino/nl_alpino-ud-test.conllu"
"UD_27 test set of DUTCH_ALPINO."
UD_27_DUTCH_LASSYSMALL_TRAIN = _UD_27_HOME + "UD_Dutch-LassySmall/nl_lassysmall-ud-train.conllu"
"UD_27 train set of DUTCH_LASSYSMALL."
UD_27_DUTCH_LASSYSMALL_DEV = _UD_27_HOME + "UD_Dutch-LassySmall/nl_lassysmall-ud-dev.conllu"
"UD_27 dev set of DUTCH_LASSYSMALL."
UD_27_DUTCH_LASSYSMALL_TEST = _UD_27_HOME + "UD_Dutch-LassySmall/nl_lassysmall-ud-test.conllu"
"UD_27 test set of DUTCH_LASSYSMALL."
UD_27_ENGLISH_ESL_TRAIN = _UD_27_HOME + "UD_English-ESL/en_esl-ud-train.conllu"
"UD_27 train set of ENGLISH_ESL."
UD_27_ENGLISH_ESL_DEV = _UD_27_HOME + "UD_English-ESL/en_esl-ud-dev.conllu"
"UD_27 dev set of ENGLISH_ESL."
UD_27_ENGLISH_ESL_TEST = _UD_27_HOME + "UD_English-ESL/en_esl-ud-test.conllu"
"UD_27 test set of ENGLISH_ESL."
UD_27_ENGLISH_EWT_TRAIN = _UD_27_HOME + "UD_English-EWT/en_ewt-ud-train.conllu"
"UD_27 train set of ENGLISH_EWT."
UD_27_ENGLISH_EWT_DEV = _UD_27_HOME + "UD_English-EWT/en_ewt-ud-dev.conllu"
"UD_27 dev set of ENGLISH_EWT."
UD_27_ENGLISH_EWT_TEST = _UD_27_HOME + "UD_English-EWT/en_ewt-ud-test.conllu"
"UD_27 test set of ENGLISH_EWT."
UD_27_ENGLISH_GUM_TRAIN = _UD_27_HOME + "UD_English-GUM/en_gum-ud-train.conllu"
"UD_27 train set of ENGLISH_GUM."
UD_27_ENGLISH_GUM_DEV = _UD_27_HOME + "UD_English-GUM/en_gum-ud-dev.conllu"
"UD_27 dev set of ENGLISH_GUM."
UD_27_ENGLISH_GUM_TEST = _UD_27_HOME + "UD_English-GUM/en_gum-ud-test.conllu"
"UD_27 test set of ENGLISH_GUM."
UD_27_ENGLISH_GUMREDDIT_TRAIN = _UD_27_HOME + "UD_English-GUMReddit/en_gumreddit-ud-train.conllu"
"UD_27 train set of ENGLISH_GUMREDDIT."
UD_27_ENGLISH_GUMREDDIT_DEV = _UD_27_HOME + "UD_English-GUMReddit/en_gumreddit-ud-dev.conllu"
"UD_27 dev set of ENGLISH_GUMREDDIT."
UD_27_ENGLISH_GUMREDDIT_TEST = _UD_27_HOME + "UD_English-GUMReddit/en_gumreddit-ud-test.conllu"
"UD_27 test set of ENGLISH_GUMREDDIT."
UD_27_ENGLISH_LINES_TRAIN = _UD_27_HOME + "UD_English-LinES/en_lines-ud-train.conllu"
"UD_27 train set of ENGLISH_LINES."
UD_27_ENGLISH_LINES_DEV = _UD_27_HOME + "UD_English-LinES/en_lines-ud-dev.conllu"
"UD_27 dev set of ENGLISH_LINES."
UD_27_ENGLISH_LINES_TEST = _UD_27_HOME + "UD_English-LinES/en_lines-ud-test.conllu"
"UD_27 test set of ENGLISH_LINES."
UD_27_ENGLISH_PUD_TEST = _UD_27_HOME + "UD_English-PUD/en_pud-ud-test.conllu"
"UD_27 test set of ENGLISH_PUD."
UD_27_ENGLISH_PARTUT_TRAIN = _UD_27_HOME + "UD_English-ParTUT/en_partut-ud-train.conllu"
"UD_27 train set of ENGLISH_PARTUT."
UD_27_ENGLISH_PARTUT_DEV = _UD_27_HOME + "UD_English-ParTUT/en_partut-ud-dev.conllu"
"UD_27 dev set of ENGLISH_PARTUT."
UD_27_ENGLISH_PARTUT_TEST = _UD_27_HOME + "UD_English-ParTUT/en_partut-ud-test.conllu"
"UD_27 test set of ENGLISH_PARTUT."
UD_27_ENGLISH_PRONOUNS_TEST = _UD_27_HOME + "UD_English-Pronouns/en_pronouns-ud-test.conllu"
"UD_27 test set of ENGLISH_PRONOUNS."
UD_27_ERZYA_JR_TEST = _UD_27_HOME + "UD_Erzya-JR/myv_jr-ud-test.conllu"
"UD_27 test set of ERZYA_JR."
UD_27_ESTONIAN_EDT_TRAIN = _UD_27_HOME + "UD_Estonian-EDT/et_edt-ud-train.conllu"
"UD_27 train set of ESTONIAN_EDT."
UD_27_ESTONIAN_EDT_DEV = _UD_27_HOME + "UD_Estonian-EDT/et_edt-ud-dev.conllu"
"UD_27 dev set of ESTONIAN_EDT."
UD_27_ESTONIAN_EDT_TEST = _UD_27_HOME + "UD_Estonian-EDT/et_edt-ud-test.conllu"
"UD_27 test set of ESTONIAN_EDT."
UD_27_ESTONIAN_EWT_TRAIN = _UD_27_HOME + "UD_Estonian-EWT/et_ewt-ud-train.conllu"
"UD_27 train set of ESTONIAN_EWT."
UD_27_ESTONIAN_EWT_DEV = _UD_27_HOME + "UD_Estonian-EWT/et_ewt-ud-dev.conllu"
"UD_27 dev set of ESTONIAN_EWT."
UD_27_ESTONIAN_EWT_TEST = _UD_27_HOME + "UD_Estonian-EWT/et_ewt-ud-test.conllu"
"UD_27 test set of ESTONIAN_EWT."
UD_27_FAROESE_FARPAHC_TRAIN = _UD_27_HOME + "UD_Faroese-FarPaHC/fo_farpahc-ud-train.conllu"
"UD_27 train set of FAROESE_FARPAHC."
UD_27_FAROESE_FARPAHC_DEV = _UD_27_HOME + "UD_Faroese-FarPaHC/fo_farpahc-ud-dev.conllu"
"UD_27 dev set of FAROESE_FARPAHC."
UD_27_FAROESE_FARPAHC_TEST = _UD_27_HOME + "UD_Faroese-FarPaHC/fo_farpahc-ud-test.conllu"
"UD_27 test set of FAROESE_FARPAHC."
UD_27_FAROESE_OFT_TEST = _UD_27_HOME + "UD_Faroese-OFT/fo_oft-ud-test.conllu"
"UD_27 test set of FAROESE_OFT."
UD_27_FINNISH_FTB_TRAIN = _UD_27_HOME + "UD_Finnish-FTB/fi_ftb-ud-train.conllu"
"UD_27 train set of FINNISH_FTB."
UD_27_FINNISH_FTB_DEV = _UD_27_HOME + "UD_Finnish-FTB/fi_ftb-ud-dev.conllu"
"UD_27 dev set of FINNISH_FTB."
UD_27_FINNISH_FTB_TEST = _UD_27_HOME + "UD_Finnish-FTB/fi_ftb-ud-test.conllu"
"UD_27 test set of FINNISH_FTB."
UD_27_FINNISH_OOD_TEST = _UD_27_HOME + "UD_Finnish-OOD/fi_ood-ud-test.conllu"
"UD_27 test set of FINNISH_OOD."
UD_27_FINNISH_PUD_TEST = _UD_27_HOME + "UD_Finnish-PUD/fi_pud-ud-test.conllu"
"UD_27 test set of FINNISH_PUD."
UD_27_FINNISH_TDT_TRAIN = _UD_27_HOME + "UD_Finnish-TDT/fi_tdt-ud-train.conllu"
"UD_27 train set of FINNISH_TDT."
UD_27_FINNISH_TDT_DEV = _UD_27_HOME + "UD_Finnish-TDT/fi_tdt-ud-dev.conllu"
"UD_27 dev set of FINNISH_TDT."
UD_27_FINNISH_TDT_TEST = _UD_27_HOME + "UD_Finnish-TDT/fi_tdt-ud-test.conllu"
"UD_27 test set of FINNISH_TDT."
UD_27_FRENCH_FQB_TEST = _UD_27_HOME + "UD_French-FQB/fr_fqb-ud-test.conllu"
"UD_27 test set of FRENCH_FQB."
UD_27_FRENCH_FTB_TRAIN = _UD_27_HOME + "UD_French-FTB/fr_ftb-ud-train.conllu"
"UD_27 train set of FRENCH_FTB."
UD_27_FRENCH_FTB_DEV = _UD_27_HOME + "UD_French-FTB/fr_ftb-ud-dev.conllu"
"UD_27 dev set of FRENCH_FTB."
UD_27_FRENCH_FTB_TEST = _UD_27_HOME + "UD_French-FTB/fr_ftb-ud-test.conllu"
"UD_27 test set of FRENCH_FTB."
UD_27_FRENCH_GSD_TRAIN = _UD_27_HOME + "UD_French-GSD/fr_gsd-ud-train.conllu"
"UD_27 train set of FRENCH_GSD."
UD_27_FRENCH_GSD_DEV = _UD_27_HOME + "UD_French-GSD/fr_gsd-ud-dev.conllu"
"UD_27 dev set of FRENCH_GSD."
UD_27_FRENCH_GSD_TEST = _UD_27_HOME + "UD_French-GSD/fr_gsd-ud-test.conllu"
"UD_27 test set of FRENCH_GSD."
UD_27_FRENCH_PUD_TEST = _UD_27_HOME + "UD_French-PUD/fr_pud-ud-test.conllu"
"UD_27 test set of FRENCH_PUD."
UD_27_FRENCH_PARTUT_TRAIN = _UD_27_HOME + "UD_French-ParTUT/fr_partut-ud-train.conllu"
"UD_27 train set of FRENCH_PARTUT."
UD_27_FRENCH_PARTUT_DEV = _UD_27_HOME + "UD_French-ParTUT/fr_partut-ud-dev.conllu"
"UD_27 dev set of FRENCH_PARTUT."
UD_27_FRENCH_PARTUT_TEST = _UD_27_HOME + "UD_French-ParTUT/fr_partut-ud-test.conllu"
"UD_27 test set of FRENCH_PARTUT."
UD_27_FRENCH_SEQUOIA_TRAIN = _UD_27_HOME + "UD_French-Sequoia/fr_sequoia-ud-train.conllu"
"UD_27 train set of FRENCH_SEQUOIA."
UD_27_FRENCH_SEQUOIA_DEV = _UD_27_HOME + "UD_French-Sequoia/fr_sequoia-ud-dev.conllu"
"UD_27 dev set of FRENCH_SEQUOIA."
UD_27_FRENCH_SEQUOIA_TEST = _UD_27_HOME + "UD_French-Sequoia/fr_sequoia-ud-test.conllu"
"UD_27 test set of FRENCH_SEQUOIA."
UD_27_FRENCH_SPOKEN_TRAIN = _UD_27_HOME + "UD_French-Spoken/fr_spoken-ud-train.conllu"
"UD_27 train set of FRENCH_SPOKEN."
UD_27_FRENCH_SPOKEN_DEV = _UD_27_HOME + "UD_French-Spoken/fr_spoken-ud-dev.conllu"
"UD_27 dev set of FRENCH_SPOKEN."
UD_27_FRENCH_SPOKEN_TEST = _UD_27_HOME + "UD_French-Spoken/fr_spoken-ud-test.conllu"
"UD_27 test set of FRENCH_SPOKEN."
UD_27_GALICIAN_CTG_TRAIN = _UD_27_HOME + "UD_Galician-CTG/gl_ctg-ud-train.conllu"
"UD_27 train set of GALICIAN_CTG."
UD_27_GALICIAN_CTG_DEV = _UD_27_HOME + "UD_Galician-CTG/gl_ctg-ud-dev.conllu"
"UD_27 dev set of GALICIAN_CTG."
UD_27_GALICIAN_CTG_TEST = _UD_27_HOME + "UD_Galician-CTG/gl_ctg-ud-test.conllu"
"UD_27 test set of GALICIAN_CTG."
UD_27_GALICIAN_TREEGAL_TRAIN = _UD_27_HOME + "UD_Galician-TreeGal/gl_treegal-ud-train.conllu"
"UD_27 train set of GALICIAN_TREEGAL."
UD_27_GALICIAN_TREEGAL_TEST = _UD_27_HOME + "UD_Galician-TreeGal/gl_treegal-ud-test.conllu"
"UD_27 test set of GALICIAN_TREEGAL."
UD_27_GERMAN_GSD_TRAIN = _UD_27_HOME + "UD_German-GSD/de_gsd-ud-train.conllu"
"UD_27 train set of GERMAN_GSD."
UD_27_GERMAN_GSD_DEV = _UD_27_HOME + "UD_German-GSD/de_gsd-ud-dev.conllu"
"UD_27 dev set of GERMAN_GSD."
UD_27_GERMAN_GSD_TEST = _UD_27_HOME + "UD_German-GSD/de_gsd-ud-test.conllu"
"UD_27 test set of GERMAN_GSD."
UD_27_GERMAN_HDT_TRAIN = _UD_27_HOME + "UD_German-HDT/de_hdt-ud-train.conllu"
"UD_27 train set of GERMAN_HDT."
UD_27_GERMAN_HDT_DEV = _UD_27_HOME + "UD_German-HDT/de_hdt-ud-dev.conllu"
"UD_27 dev set of GERMAN_HDT."
UD_27_GERMAN_HDT_TEST = _UD_27_HOME + "UD_German-HDT/de_hdt-ud-test.conllu"
"UD_27 test set of GERMAN_HDT."
UD_27_GERMAN_LIT_TEST = _UD_27_HOME + "UD_German-LIT/de_lit-ud-test.conllu"
"UD_27 test set of GERMAN_LIT."
UD_27_GERMAN_PUD_TEST = _UD_27_HOME + "UD_German-PUD/de_pud-ud-test.conllu"
"UD_27 test set of GERMAN_PUD."
UD_27_GOTHIC_PROIEL_TRAIN = _UD_27_HOME + "UD_Gothic-PROIEL/got_proiel-ud-train.conllu"
"UD_27 train set of GOTHIC_PROIEL."
UD_27_GOTHIC_PROIEL_DEV = _UD_27_HOME + "UD_Gothic-PROIEL/got_proiel-ud-dev.conllu"
"UD_27 dev set of GOTHIC_PROIEL."
UD_27_GOTHIC_PROIEL_TEST = _UD_27_HOME + "UD_Gothic-PROIEL/got_proiel-ud-test.conllu"
"UD_27 test set of GOTHIC_PROIEL."
UD_27_GREEK_GDT_TRAIN = _UD_27_HOME + "UD_Greek-GDT/el_gdt-ud-train.conllu"
"UD_27 train set of GREEK_GDT."
UD_27_GREEK_GDT_DEV = _UD_27_HOME + "UD_Greek-GDT/el_gdt-ud-dev.conllu"
"UD_27 dev set of GREEK_GDT."
UD_27_GREEK_GDT_TEST = _UD_27_HOME + "UD_Greek-GDT/el_gdt-ud-test.conllu"
"UD_27 test set of GREEK_GDT."
UD_27_HEBREW_HTB_TRAIN = _UD_27_HOME + "UD_Hebrew-HTB/he_htb-ud-train.conllu"
"UD_27 train set of HEBREW_HTB."
UD_27_HEBREW_HTB_DEV = _UD_27_HOME + "UD_Hebrew-HTB/he_htb-ud-dev.conllu"
"UD_27 dev set of HEBREW_HTB."
UD_27_HEBREW_HTB_TEST = _UD_27_HOME + "UD_Hebrew-HTB/he_htb-ud-test.conllu"
"UD_27 test set of HEBREW_HTB."
UD_27_HINDI_HDTB_TRAIN = _UD_27_HOME + "UD_Hindi-HDTB/hi_hdtb-ud-train.conllu"
"UD_27 train set of HINDI_HDTB."
UD_27_HINDI_HDTB_DEV = _UD_27_HOME + "UD_Hindi-HDTB/hi_hdtb-ud-dev.conllu"
"UD_27 dev set of HINDI_HDTB."
UD_27_HINDI_HDTB_TEST = _UD_27_HOME + "UD_Hindi-HDTB/hi_hdtb-ud-test.conllu"
"UD_27 test set of HINDI_HDTB."
UD_27_HINDI_PUD_TEST = _UD_27_HOME + "UD_Hindi-PUD/hi_pud-ud-test.conllu"
"UD_27 test set of HINDI_PUD."
UD_27_HINDI_ENGLISH_HIENCS_TRAIN = _UD_27_HOME + "UD_Hindi_English-HIENCS/qhe_hiencs-ud-train.conllu"
"UD_27 train set of HINDI_ENGLISH_HIENCS."
UD_27_HINDI_ENGLISH_HIENCS_DEV = _UD_27_HOME + "UD_Hindi_English-HIENCS/qhe_hiencs-ud-dev.conllu"
"UD_27 dev set of HINDI_ENGLISH_HIENCS."
UD_27_HINDI_ENGLISH_HIENCS_TEST = _UD_27_HOME + "UD_Hindi_English-HIENCS/qhe_hiencs-ud-test.conllu"
"UD_27 test set of HINDI_ENGLISH_HIENCS."
UD_27_HUNGARIAN_SZEGED_TRAIN = _UD_27_HOME + "UD_Hungarian-Szeged/hu_szeged-ud-train.conllu"
"UD_27 train set of HUNGARIAN_SZEGED."
UD_27_HUNGARIAN_SZEGED_DEV = _UD_27_HOME + "UD_Hungarian-Szeged/hu_szeged-ud-dev.conllu"
"UD_27 dev set of HUNGARIAN_SZEGED."
UD_27_HUNGARIAN_SZEGED_TEST = _UD_27_HOME + "UD_Hungarian-Szeged/hu_szeged-ud-test.conllu"
"UD_27 test set of HUNGARIAN_SZEGED."
UD_27_ICELANDIC_ICEPAHC_TRAIN = _UD_27_HOME + "UD_Icelandic-IcePaHC/is_icepahc-ud-train.conllu"
"UD_27 train set of ICELANDIC_ICEPAHC."
UD_27_ICELANDIC_ICEPAHC_DEV = _UD_27_HOME + "UD_Icelandic-IcePaHC/is_icepahc-ud-dev.conllu"
"UD_27 dev set of ICELANDIC_ICEPAHC."
UD_27_ICELANDIC_ICEPAHC_TEST = _UD_27_HOME + "UD_Icelandic-IcePaHC/is_icepahc-ud-test.conllu"
"UD_27 test set of ICELANDIC_ICEPAHC."
UD_27_ICELANDIC_PUD_TEST = _UD_27_HOME + "UD_Icelandic-PUD/is_pud-ud-test.conllu"
"UD_27 test set of ICELANDIC_PUD."
UD_27_INDONESIAN_CSUI_TRAIN = _UD_27_HOME + "UD_Indonesian-CSUI/id_csui-ud-train.conllu"
"UD_27 train set of INDONESIAN_CSUI."
UD_27_INDONESIAN_CSUI_TEST = _UD_27_HOME + "UD_Indonesian-CSUI/id_csui-ud-test.conllu"
"UD_27 test set of INDONESIAN_CSUI."
UD_27_INDONESIAN_GSD_TRAIN = _UD_27_HOME + "UD_Indonesian-GSD/id_gsd-ud-train.conllu"
"UD_27 train set of INDONESIAN_GSD."
UD_27_INDONESIAN_GSD_DEV = _UD_27_HOME + "UD_Indonesian-GSD/id_gsd-ud-dev.conllu"
"UD_27 dev set of INDONESIAN_GSD."
UD_27_INDONESIAN_GSD_TEST = _UD_27_HOME + "UD_Indonesian-GSD/id_gsd-ud-test.conllu"
"UD_27 test set of INDONESIAN_GSD."
UD_27_INDONESIAN_PUD_TEST = _UD_27_HOME + "UD_Indonesian-PUD/id_pud-ud-test.conllu"
"UD_27 test set of INDONESIAN_PUD."
UD_27_IRISH_IDT_TRAIN = _UD_27_HOME + "UD_Irish-IDT/ga_idt-ud-train.conllu"
"UD_27 train set of IRISH_IDT."
UD_27_IRISH_IDT_DEV = _UD_27_HOME + "UD_Irish-IDT/ga_idt-ud-dev.conllu"
"UD_27 dev set of IRISH_IDT."
UD_27_IRISH_IDT_TEST = _UD_27_HOME + "UD_Irish-IDT/ga_idt-ud-test.conllu"
"UD_27 test set of IRISH_IDT."
UD_27_ITALIAN_ISDT_TRAIN = _UD_27_HOME + "UD_Italian-ISDT/it_isdt-ud-train.conllu"
"UD_27 train set of ITALIAN_ISDT."
UD_27_ITALIAN_ISDT_DEV = _UD_27_HOME + "UD_Italian-ISDT/it_isdt-ud-dev.conllu"
"UD_27 dev set of ITALIAN_ISDT."
UD_27_ITALIAN_ISDT_TEST = _UD_27_HOME + "UD_Italian-ISDT/it_isdt-ud-test.conllu"
"UD_27 test set of ITALIAN_ISDT."
UD_27_ITALIAN_PUD_TEST = _UD_27_HOME + "UD_Italian-PUD/it_pud-ud-test.conllu"
"UD_27 test set of ITALIAN_PUD."
UD_27_ITALIAN_PARTUT_TRAIN = _UD_27_HOME + "UD_Italian-ParTUT/it_partut-ud-train.conllu"
"UD_27 train set of ITALIAN_PARTUT."
UD_27_ITALIAN_PARTUT_DEV = _UD_27_HOME + "UD_Italian-ParTUT/it_partut-ud-dev.conllu"
"UD_27 dev set of ITALIAN_PARTUT."
UD_27_ITALIAN_PARTUT_TEST = _UD_27_HOME + "UD_Italian-ParTUT/it_partut-ud-test.conllu"
"UD_27 test set of ITALIAN_PARTUT."
UD_27_ITALIAN_POSTWITA_TRAIN = _UD_27_HOME + "UD_Italian-PoSTWITA/it_postwita-ud-train.conllu"
"UD_27 train set of ITALIAN_POSTWITA."
UD_27_ITALIAN_POSTWITA_DEV = _UD_27_HOME + "UD_Italian-PoSTWITA/it_postwita-ud-dev.conllu"
"UD_27 dev set of ITALIAN_POSTWITA."
UD_27_ITALIAN_POSTWITA_TEST = _UD_27_HOME + "UD_Italian-PoSTWITA/it_postwita-ud-test.conllu"
"UD_27 test set of ITALIAN_POSTWITA."
UD_27_ITALIAN_TWITTIRO_TRAIN = _UD_27_HOME + "UD_Italian-TWITTIRO/it_twittiro-ud-train.conllu"
"UD_27 train set of ITALIAN_TWITTIRO."
UD_27_ITALIAN_TWITTIRO_DEV = _UD_27_HOME + "UD_Italian-TWITTIRO/it_twittiro-ud-dev.conllu"
"UD_27 dev set of ITALIAN_TWITTIRO."
UD_27_ITALIAN_TWITTIRO_TEST = _UD_27_HOME + "UD_Italian-TWITTIRO/it_twittiro-ud-test.conllu"
"UD_27 test set of ITALIAN_TWITTIRO."
UD_27_ITALIAN_VIT_TRAIN = _UD_27_HOME + "UD_Italian-VIT/it_vit-ud-train.conllu"
"UD_27 train set of ITALIAN_VIT."
UD_27_ITALIAN_VIT_DEV = _UD_27_HOME + "UD_Italian-VIT/it_vit-ud-dev.conllu"
"UD_27 dev set of ITALIAN_VIT."
UD_27_ITALIAN_VIT_TEST = _UD_27_HOME + "UD_Italian-VIT/it_vit-ud-test.conllu"
"UD_27 test set of ITALIAN_VIT."
UD_27_JAPANESE_BCCWJ_TRAIN = _UD_27_HOME + "UD_Japanese-BCCWJ/ja_bccwj-ud-train.conllu"
"UD_27 train set of JAPANESE_BCCWJ."
UD_27_JAPANESE_BCCWJ_DEV = _UD_27_HOME + "UD_Japanese-BCCWJ/ja_bccwj-ud-dev.conllu"
"UD_27 dev set of JAPANESE_BCCWJ."
UD_27_JAPANESE_BCCWJ_TEST = _UD_27_HOME + "UD_Japanese-BCCWJ/ja_bccwj-ud-test.conllu"
"UD_27 test set of JAPANESE_BCCWJ."
UD_27_JAPANESE_GSD_TRAIN = _UD_27_HOME + "UD_Japanese-GSD/ja_gsd-ud-train.conllu"
"UD_27 train set of JAPANESE_GSD."
UD_27_JAPANESE_GSD_DEV = _UD_27_HOME + "UD_Japanese-GSD/ja_gsd-ud-dev.conllu"
"UD_27 dev set of JAPANESE_GSD."
UD_27_JAPANESE_GSD_TEST = _UD_27_HOME + "UD_Japanese-GSD/ja_gsd-ud-test.conllu"
"UD_27 test set of JAPANESE_GSD."
UD_27_JAPANESE_MODERN_TEST = _UD_27_HOME + "UD_Japanese-Modern/ja_modern-ud-test.conllu"
"UD_27 test set of JAPANESE_MODERN."
UD_27_JAPANESE_PUD_TEST = _UD_27_HOME + "UD_Japanese-PUD/ja_pud-ud-test.conllu"
"UD_27 test set of JAPANESE_PUD."
UD_27_KARELIAN_KKPP_TEST = _UD_27_HOME + "UD_Karelian-KKPP/krl_kkpp-ud-test.conllu"
"UD_27 test set of KARELIAN_KKPP."
UD_27_KAZAKH_KTB_TRAIN = _UD_27_HOME + "UD_Kazakh-KTB/kk_ktb-ud-train.conllu"
"UD_27 train set of KAZAKH_KTB."
UD_27_KAZAKH_KTB_TEST = _UD_27_HOME + "UD_Kazakh-KTB/kk_ktb-ud-test.conllu"
"UD_27 test set of KAZAKH_KTB."
UD_27_KHUNSARI_AHA_TEST = _UD_27_HOME + "UD_Khunsari-AHA/kfm_aha-ud-test.conllu"
"UD_27 test set of KHUNSARI_AHA."
UD_27_KOMI_PERMYAK_UH_TEST = _UD_27_HOME + "UD_Komi_Permyak-UH/koi_uh-ud-test.conllu"
"UD_27 test set of KOMI_PERMYAK_UH."
UD_27_KOMI_ZYRIAN_IKDP_TEST = _UD_27_HOME + "UD_Komi_Zyrian-IKDP/kpv_ikdp-ud-test.conllu"
"UD_27 test set of KOMI_ZYRIAN_IKDP."
UD_27_KOMI_ZYRIAN_LATTICE_TEST = _UD_27_HOME + "UD_Komi_Zyrian-Lattice/kpv_lattice-ud-test.conllu"
"UD_27 test set of KOMI_ZYRIAN_LATTICE."
UD_27_KOREAN_GSD_TRAIN = _UD_27_HOME + "UD_Korean-GSD/ko_gsd-ud-train.conllu"
"UD_27 train set of KOREAN_GSD."
UD_27_KOREAN_GSD_DEV = _UD_27_HOME + "UD_Korean-GSD/ko_gsd-ud-dev.conllu"
"UD_27 dev set of KOREAN_GSD."
UD_27_KOREAN_GSD_TEST = _UD_27_HOME + "UD_Korean-GSD/ko_gsd-ud-test.conllu"
"UD_27 test set of KOREAN_GSD."
UD_27_KOREAN_KAIST_TRAIN = _UD_27_HOME + "UD_Korean-Kaist/ko_kaist-ud-train.conllu"
"UD_27 train set of KOREAN_KAIST."
UD_27_KOREAN_KAIST_DEV = _UD_27_HOME + "UD_Korean-Kaist/ko_kaist-ud-dev.conllu"
"UD_27 dev set of KOREAN_KAIST."
UD_27_KOREAN_KAIST_TEST = _UD_27_HOME + "UD_Korean-Kaist/ko_kaist-ud-test.conllu"
"UD_27 test set of KOREAN_KAIST."
UD_27_KOREAN_PUD_TEST = _UD_27_HOME + "UD_Korean-PUD/ko_pud-ud-test.conllu"
"UD_27 test set of KOREAN_PUD."
UD_27_KURMANJI_MG_TRAIN = _UD_27_HOME + "UD_Kurmanji-MG/kmr_mg-ud-train.conllu"
"UD_27 train set of KURMANJI_MG."
UD_27_KURMANJI_MG_TEST = _UD_27_HOME + "UD_Kurmanji-MG/kmr_mg-ud-test.conllu"
"UD_27 test set of KURMANJI_MG."
UD_27_LATIN_ITTB_TRAIN = _UD_27_HOME + "UD_Latin-ITTB/la_ittb-ud-train.conllu"
"UD_27 train set of LATIN_ITTB."
UD_27_LATIN_ITTB_DEV = _UD_27_HOME + "UD_Latin-ITTB/la_ittb-ud-dev.conllu"
"UD_27 dev set of LATIN_ITTB."
UD_27_LATIN_ITTB_TEST = _UD_27_HOME + "UD_Latin-ITTB/la_ittb-ud-test.conllu"
"UD_27 test set of LATIN_ITTB."
UD_27_LATIN_LLCT_TRAIN = _UD_27_HOME + "UD_Latin-LLCT/la_llct-ud-train.conllu"
"UD_27 train set of LATIN_LLCT."
UD_27_LATIN_LLCT_DEV = _UD_27_HOME + "UD_Latin-LLCT/la_llct-ud-dev.conllu"
"UD_27 dev set of LATIN_LLCT."
UD_27_LATIN_LLCT_TEST = _UD_27_HOME + "UD_Latin-LLCT/la_llct-ud-test.conllu"
"UD_27 test set of LATIN_LLCT."
UD_27_LATIN_PROIEL_TRAIN = _UD_27_HOME + "UD_Latin-PROIEL/la_proiel-ud-train.conllu"
"UD_27 train set of LATIN_PROIEL."
UD_27_LATIN_PROIEL_DEV = _UD_27_HOME + "UD_Latin-PROIEL/la_proiel-ud-dev.conllu"
"UD_27 dev set of LATIN_PROIEL."
UD_27_LATIN_PROIEL_TEST = _UD_27_HOME + "UD_Latin-PROIEL/la_proiel-ud-test.conllu"
"UD_27 test set of LATIN_PROIEL."
UD_27_LATIN_PERSEUS_TRAIN = _UD_27_HOME + "UD_Latin-Perseus/la_perseus-ud-train.conllu"
"UD_27 train set of LATIN_PERSEUS."
UD_27_LATIN_PERSEUS_TEST = _UD_27_HOME + "UD_Latin-Perseus/la_perseus-ud-test.conllu"
"UD_27 test set of LATIN_PERSEUS."
UD_27_LATVIAN_LVTB_TRAIN = _UD_27_HOME + "UD_Latvian-LVTB/lv_lvtb-ud-train.conllu"
"UD_27 train set of LATVIAN_LVTB."
UD_27_LATVIAN_LVTB_DEV = _UD_27_HOME + "UD_Latvian-LVTB/lv_lvtb-ud-dev.conllu"
"UD_27 dev set of LATVIAN_LVTB."
UD_27_LATVIAN_LVTB_TEST = _UD_27_HOME + "UD_Latvian-LVTB/lv_lvtb-ud-test.conllu"
"UD_27 test set of LATVIAN_LVTB."
UD_27_LITHUANIAN_ALKSNIS_TRAIN = _UD_27_HOME + "UD_Lithuanian-ALKSNIS/lt_alksnis-ud-train.conllu"
"UD_27 train set of LITHUANIAN_ALKSNIS."
UD_27_LITHUANIAN_ALKSNIS_DEV = _UD_27_HOME + "UD_Lithuanian-ALKSNIS/lt_alksnis-ud-dev.conllu"
"UD_27 dev set of LITHUANIAN_ALKSNIS."
UD_27_LITHUANIAN_ALKSNIS_TEST = _UD_27_HOME + "UD_Lithuanian-ALKSNIS/lt_alksnis-ud-test.conllu"
"UD_27 test set of LITHUANIAN_ALKSNIS."
UD_27_LITHUANIAN_HSE_TRAIN = _UD_27_HOME + "UD_Lithuanian-HSE/lt_hse-ud-train.conllu"
"UD_27 train set of LITHUANIAN_HSE."
UD_27_LITHUANIAN_HSE_DEV = _UD_27_HOME + "UD_Lithuanian-HSE/lt_hse-ud-dev.conllu"
"UD_27 dev set of LITHUANIAN_HSE."
UD_27_LITHUANIAN_HSE_TEST = _UD_27_HOME + "UD_Lithuanian-HSE/lt_hse-ud-test.conllu"
"UD_27 test set of LITHUANIAN_HSE."
UD_27_LIVVI_KKPP_TRAIN = _UD_27_HOME + "UD_Livvi-KKPP/olo_kkpp-ud-train.conllu"
"UD_27 train set of LIVVI_KKPP."
UD_27_LIVVI_KKPP_TEST = _UD_27_HOME + "UD_Livvi-KKPP/olo_kkpp-ud-test.conllu"
"UD_27 test set of LIVVI_KKPP."
UD_27_MALTESE_MUDT_TRAIN = _UD_27_HOME + "UD_Maltese-MUDT/mt_mudt-ud-train.conllu"
"UD_27 train set of MALTESE_MUDT."
UD_27_MALTESE_MUDT_DEV = _UD_27_HOME + "UD_Maltese-MUDT/mt_mudt-ud-dev.conllu"
"UD_27 dev set of MALTESE_MUDT."
UD_27_MALTESE_MUDT_TEST = _UD_27_HOME + "UD_Maltese-MUDT/mt_mudt-ud-test.conllu"
"UD_27 test set of MALTESE_MUDT."
UD_27_MANX_CADHAN_TEST = _UD_27_HOME + "UD_Manx-Cadhan/gv_cadhan-ud-test.conllu"
"UD_27 test set of MANX_CADHAN."
UD_27_MARATHI_UFAL_TRAIN = _UD_27_HOME + "UD_Marathi-UFAL/mr_ufal-ud-train.conllu"
"UD_27 train set of MARATHI_UFAL."
UD_27_MARATHI_UFAL_DEV = _UD_27_HOME + "UD_Marathi-UFAL/mr_ufal-ud-dev.conllu"
"UD_27 dev set of MARATHI_UFAL."
UD_27_MARATHI_UFAL_TEST = _UD_27_HOME + "UD_Marathi-UFAL/mr_ufal-ud-test.conllu"
"UD_27 test set of MARATHI_UFAL."
UD_27_MBYA_GUARANI_DOOLEY_TEST = _UD_27_HOME + "UD_Mbya_Guarani-Dooley/gun_dooley-ud-test.conllu"
"UD_27 test set of MBYA_GUARANI_DOOLEY."
UD_27_MBYA_GUARANI_THOMAS_TEST = _UD_27_HOME + "UD_Mbya_Guarani-Thomas/gun_thomas-ud-test.conllu"
"UD_27 test set of MBYA_GUARANI_THOMAS."
UD_27_MOKSHA_JR_TEST = _UD_27_HOME + "UD_Moksha-JR/mdf_jr-ud-test.conllu"
"UD_27 test set of MOKSHA_JR."
UD_27_MUNDURUKU_TUDET_TEST = _UD_27_HOME + "UD_Munduruku-TuDeT/myu_tudet-ud-test.conllu"
"UD_27 test set of MUNDURUKU_TUDET."
UD_27_NAIJA_NSC_TRAIN = _UD_27_HOME + "UD_Naija-NSC/pcm_nsc-ud-train.conllu"
"UD_27 train set of NAIJA_NSC."
UD_27_NAIJA_NSC_DEV = _UD_27_HOME + "UD_Naija-NSC/pcm_nsc-ud-dev.conllu"
"UD_27 dev set of NAIJA_NSC."
UD_27_NAIJA_NSC_TEST = _UD_27_HOME + "UD_Naija-NSC/pcm_nsc-ud-test.conllu"
"UD_27 test set of NAIJA_NSC."
UD_27_NAYINI_AHA_TEST = _UD_27_HOME + "UD_Nayini-AHA/nyq_aha-ud-test.conllu"
"UD_27 test set of NAYINI_AHA."
UD_27_NORTH_SAMI_GIELLA_TRAIN = _UD_27_HOME + "UD_North_Sami-Giella/sme_giella-ud-train.conllu"
"UD_27 train set of NORTH_SAMI_GIELLA."
UD_27_NORTH_SAMI_GIELLA_TEST = _UD_27_HOME + "UD_North_Sami-Giella/sme_giella-ud-test.conllu"
"UD_27 test set of NORTH_SAMI_GIELLA."
UD_27_NORWEGIAN_BOKMAAL_TRAIN = _UD_27_HOME + "UD_Norwegian-Bokmaal/no_bokmaal-ud-train.conllu"
"UD_27 train set of NORWEGIAN_BOKMAAL."
UD_27_NORWEGIAN_BOKMAAL_DEV = _UD_27_HOME + "UD_Norwegian-Bokmaal/no_bokmaal-ud-dev.conllu"
"UD_27 dev set of NORWEGIAN_BOKMAAL."
UD_27_NORWEGIAN_BOKMAAL_TEST = _UD_27_HOME + "UD_Norwegian-Bokmaal/no_bokmaal-ud-test.conllu"
"UD_27 test set of NORWEGIAN_BOKMAAL."
UD_27_NORWEGIAN_NYNORSK_TRAIN = _UD_27_HOME + "UD_Norwegian-Nynorsk/no_nynorsk-ud-train.conllu"
"UD_27 train set of NORWEGIAN_NYNORSK."
UD_27_NORWEGIAN_NYNORSK_DEV = _UD_27_HOME + "UD_Norwegian-Nynorsk/no_nynorsk-ud-dev.conllu"
"UD_27 dev set of NORWEGIAN_NYNORSK."
UD_27_NORWEGIAN_NYNORSK_TEST = _UD_27_HOME + "UD_Norwegian-Nynorsk/no_nynorsk-ud-test.conllu"
"UD_27 test set of NORWEGIAN_NYNORSK."
UD_27_NORWEGIAN_NYNORSKLIA_TRAIN = _UD_27_HOME + "UD_Norwegian-NynorskLIA/no_nynorsklia-ud-train.conllu"
"UD_27 train set of NORWEGIAN_NYNORSKLIA."
UD_27_NORWEGIAN_NYNORSKLIA_DEV = _UD_27_HOME + "UD_Norwegian-NynorskLIA/no_nynorsklia-ud-dev.conllu"
"UD_27 dev set of NORWEGIAN_NYNORSKLIA."
UD_27_NORWEGIAN_NYNORSKLIA_TEST = _UD_27_HOME + "UD_Norwegian-NynorskLIA/no_nynorsklia-ud-test.conllu"
"UD_27 test set of NORWEGIAN_NYNORSKLIA."
UD_27_OLD_CHURCH_SLAVONIC_PROIEL_TRAIN = _UD_27_HOME + "UD_Old_Church_Slavonic-PROIEL/cu_proiel-ud-train.conllu"
"UD_27 train set of OLD_CHURCH_SLAVONIC_PROIEL."
UD_27_OLD_CHURCH_SLAVONIC_PROIEL_DEV = _UD_27_HOME + "UD_Old_Church_Slavonic-PROIEL/cu_proiel-ud-dev.conllu"
"UD_27 dev set of OLD_CHURCH_SLAVONIC_PROIEL."
UD_27_OLD_CHURCH_SLAVONIC_PROIEL_TEST = _UD_27_HOME + "UD_Old_Church_Slavonic-PROIEL/cu_proiel-ud-test.conllu"
"UD_27 test set of OLD_CHURCH_SLAVONIC_PROIEL."
UD_27_OLD_FRENCH_SRCMF_TRAIN = _UD_27_HOME + "UD_Old_French-SRCMF/fro_srcmf-ud-train.conllu"
"UD_27 train set of OLD_FRENCH_SRCMF."
UD_27_OLD_FRENCH_SRCMF_DEV = _UD_27_HOME + "UD_Old_French-SRCMF/fro_srcmf-ud-dev.conllu"
"UD_27 dev set of OLD_FRENCH_SRCMF."
UD_27_OLD_FRENCH_SRCMF_TEST = _UD_27_HOME + "UD_Old_French-SRCMF/fro_srcmf-ud-test.conllu"
"UD_27 test set of OLD_FRENCH_SRCMF."
UD_27_OLD_RUSSIAN_RNC_TRAIN = _UD_27_HOME + "UD_Old_Russian-RNC/orv_rnc-ud-train.conllu"
"UD_27 train set of OLD_RUSSIAN_RNC."
UD_27_OLD_RUSSIAN_RNC_TEST = _UD_27_HOME + "UD_Old_Russian-RNC/orv_rnc-ud-test.conllu"
"UD_27 test set of OLD_RUSSIAN_RNC."
UD_27_OLD_RUSSIAN_TOROT_TRAIN = _UD_27_HOME + "UD_Old_Russian-TOROT/orv_torot-ud-train.conllu"
"UD_27 train set of OLD_RUSSIAN_TOROT."
UD_27_OLD_RUSSIAN_TOROT_DEV = _UD_27_HOME + "UD_Old_Russian-TOROT/orv_torot-ud-dev.conllu"
"UD_27 dev set of OLD_RUSSIAN_TOROT."
UD_27_OLD_RUSSIAN_TOROT_TEST = _UD_27_HOME + "UD_Old_Russian-TOROT/orv_torot-ud-test.conllu"
"UD_27 test set of OLD_RUSSIAN_TOROT."
UD_27_OLD_TURKISH_TONQQ_TEST = _UD_27_HOME + "UD_Old_Turkish-Tonqq/otk_tonqq-ud-test.conllu"
"UD_27 test set of OLD_TURKISH_TONQQ."
UD_27_PERSIAN_PERDT_TRAIN = _UD_27_HOME + "UD_Persian-PerDT/fa_perdt-ud-train.conllu"
"UD_27 train set of PERSIAN_PERDT."
UD_27_PERSIAN_PERDT_DEV = _UD_27_HOME + "UD_Persian-PerDT/fa_perdt-ud-dev.conllu"
"UD_27 dev set of PERSIAN_PERDT."
UD_27_PERSIAN_PERDT_TEST = _UD_27_HOME + "UD_Persian-PerDT/fa_perdt-ud-test.conllu"
"UD_27 test set of PERSIAN_PERDT."
UD_27_PERSIAN_SERAJI_TRAIN = _UD_27_HOME + "UD_Persian-Seraji/fa_seraji-ud-train.conllu"
"UD_27 train set of PERSIAN_SERAJI."
UD_27_PERSIAN_SERAJI_DEV = _UD_27_HOME + "UD_Persian-Seraji/fa_seraji-ud-dev.conllu"
"UD_27 dev set of PERSIAN_SERAJI."
UD_27_PERSIAN_SERAJI_TEST = _UD_27_HOME + "UD_Persian-Seraji/fa_seraji-ud-test.conllu"
"UD_27 test set of PERSIAN_SERAJI."
UD_27_POLISH_LFG_TRAIN = _UD_27_HOME + "UD_Polish-LFG/pl_lfg-ud-train.conllu"
"UD_27 train set of POLISH_LFG."
UD_27_POLISH_LFG_DEV = _UD_27_HOME + "UD_Polish-LFG/pl_lfg-ud-dev.conllu"
"UD_27 dev set of POLISH_LFG."
UD_27_POLISH_LFG_TEST = _UD_27_HOME + "UD_Polish-LFG/pl_lfg-ud-test.conllu"
"UD_27 test set of POLISH_LFG."
UD_27_POLISH_PDB_TRAIN = _UD_27_HOME + "UD_Polish-PDB/pl_pdb-ud-train.conllu"
"UD_27 train set of POLISH_PDB."
UD_27_POLISH_PDB_DEV = _UD_27_HOME + "UD_Polish-PDB/pl_pdb-ud-dev.conllu"
"UD_27 dev set of POLISH_PDB."
UD_27_POLISH_PDB_TEST = _UD_27_HOME + "UD_Polish-PDB/pl_pdb-ud-test.conllu"
"UD_27 test set of POLISH_PDB."
UD_27_POLISH_PUD_TEST = _UD_27_HOME + "UD_Polish-PUD/pl_pud-ud-test.conllu"
"UD_27 test set of POLISH_PUD."
UD_27_PORTUGUESE_BOSQUE_TRAIN = _UD_27_HOME + "UD_Portuguese-Bosque/pt_bosque-ud-train.conllu"
"UD_27 train set of PORTUGUESE_BOSQUE."
UD_27_PORTUGUESE_BOSQUE_DEV = _UD_27_HOME + "UD_Portuguese-Bosque/pt_bosque-ud-dev.conllu"
"UD_27 dev set of PORTUGUESE_BOSQUE."
UD_27_PORTUGUESE_BOSQUE_TEST = _UD_27_HOME + "UD_Portuguese-Bosque/pt_bosque-ud-test.conllu"
"UD_27 test set of PORTUGUESE_BOSQUE."
UD_27_PORTUGUESE_GSD_TRAIN = _UD_27_HOME + "UD_Portuguese-GSD/pt_gsd-ud-train.conllu"
"UD_27 train set of PORTUGUESE_GSD."
UD_27_PORTUGUESE_GSD_DEV = _UD_27_HOME + "UD_Portuguese-GSD/pt_gsd-ud-dev.conllu"
"UD_27 dev set of PORTUGUESE_GSD."
UD_27_PORTUGUESE_GSD_TEST = _UD_27_HOME + "UD_Portuguese-GSD/pt_gsd-ud-test.conllu"
"UD_27 test set of PORTUGUESE_GSD."
UD_27_PORTUGUESE_PUD_TEST = _UD_27_HOME + "UD_Portuguese-PUD/pt_pud-ud-test.conllu"
"UD_27 test set of PORTUGUESE_PUD."
UD_27_ROMANIAN_NONSTANDARD_TRAIN = _UD_27_HOME + "UD_Romanian-Nonstandard/ro_nonstandard-ud-train.conllu"
"UD_27 train set of ROMANIAN_NONSTANDARD."
UD_27_ROMANIAN_NONSTANDARD_DEV = _UD_27_HOME + "UD_Romanian-Nonstandard/ro_nonstandard-ud-dev.conllu"
"UD_27 dev set of ROMANIAN_NONSTANDARD."
UD_27_ROMANIAN_NONSTANDARD_TEST = _UD_27_HOME + "UD_Romanian-Nonstandard/ro_nonstandard-ud-test.conllu"
"UD_27 test set of ROMANIAN_NONSTANDARD."
UD_27_ROMANIAN_RRT_TRAIN = _UD_27_HOME + "UD_Romanian-RRT/ro_rrt-ud-train.conllu"
"UD_27 train set of ROMANIAN_RRT."
UD_27_ROMANIAN_RRT_DEV = _UD_27_HOME + "UD_Romanian-RRT/ro_rrt-ud-dev.conllu"
"UD_27 dev set of ROMANIAN_RRT."
UD_27_ROMANIAN_RRT_TEST = _UD_27_HOME + "UD_Romanian-RRT/ro_rrt-ud-test.conllu"
"UD_27 test set of ROMANIAN_RRT."
UD_27_ROMANIAN_SIMONERO_TRAIN = _UD_27_HOME + "UD_Romanian-SiMoNERo/ro_simonero-ud-train.conllu"
"UD_27 train set of ROMANIAN_SIMONERO."
UD_27_ROMANIAN_SIMONERO_DEV = _UD_27_HOME + "UD_Romanian-SiMoNERo/ro_simonero-ud-dev.conllu"
"UD_27 dev set of ROMANIAN_SIMONERO."
UD_27_ROMANIAN_SIMONERO_TEST = _UD_27_HOME + "UD_Romanian-SiMoNERo/ro_simonero-ud-test.conllu"
"UD_27 test set of ROMANIAN_SIMONERO."
UD_27_RUSSIAN_GSD_TRAIN = _UD_27_HOME + "UD_Russian-GSD/ru_gsd-ud-train.conllu"
"UD_27 train set of RUSSIAN_GSD."
UD_27_RUSSIAN_GSD_DEV = _UD_27_HOME + "UD_Russian-GSD/ru_gsd-ud-dev.conllu"
"UD_27 dev set of RUSSIAN_GSD."
UD_27_RUSSIAN_GSD_TEST = _UD_27_HOME + "UD_Russian-GSD/ru_gsd-ud-test.conllu"
"UD_27 test set of RUSSIAN_GSD."
UD_27_RUSSIAN_PUD_TEST = _UD_27_HOME + "UD_Russian-PUD/ru_pud-ud-test.conllu"
"UD_27 test set of RUSSIAN_PUD."
UD_27_RUSSIAN_SYNTAGRUS_TRAIN = _UD_27_HOME + "UD_Russian-SynTagRus/ru_syntagrus-ud-train.conllu"
"UD_27 train set of RUSSIAN_SYNTAGRUS."
UD_27_RUSSIAN_SYNTAGRUS_DEV = _UD_27_HOME + "UD_Russian-SynTagRus/ru_syntagrus-ud-dev.conllu"
"UD_27 dev set of RUSSIAN_SYNTAGRUS."
UD_27_RUSSIAN_SYNTAGRUS_TEST = _UD_27_HOME + "UD_Russian-SynTagRus/ru_syntagrus-ud-test.conllu"
"UD_27 test set of RUSSIAN_SYNTAGRUS."
UD_27_RUSSIAN_TAIGA_TRAIN = _UD_27_HOME + "UD_Russian-Taiga/ru_taiga-ud-train.conllu"
"UD_27 train set of RUSSIAN_TAIGA."
UD_27_RUSSIAN_TAIGA_DEV = _UD_27_HOME + "UD_Russian-Taiga/ru_taiga-ud-dev.conllu"
"UD_27 dev set of RUSSIAN_TAIGA."
UD_27_RUSSIAN_TAIGA_TEST = _UD_27_HOME + "UD_Russian-Taiga/ru_taiga-ud-test.conllu"
"UD_27 test set of RUSSIAN_TAIGA."
UD_27_SANSKRIT_UFAL_TEST = _UD_27_HOME + "UD_Sanskrit-UFAL/sa_ufal-ud-test.conllu"
"UD_27 test set of SANSKRIT_UFAL."
UD_27_SANSKRIT_VEDIC_TRAIN = _UD_27_HOME + "UD_Sanskrit-Vedic/sa_vedic-ud-train.conllu"
"UD_27 train set of SANSKRIT_VEDIC."
UD_27_SANSKRIT_VEDIC_TEST = _UD_27_HOME + "UD_Sanskrit-Vedic/sa_vedic-ud-test.conllu"
"UD_27 test set of SANSKRIT_VEDIC."
UD_27_SCOTTISH_GAELIC_ARCOSG_TRAIN = _UD_27_HOME + "UD_Scottish_Gaelic-ARCOSG/gd_arcosg-ud-train.conllu"
"UD_27 train set of SCOTTISH_GAELIC_ARCOSG."
UD_27_SCOTTISH_GAELIC_ARCOSG_DEV = _UD_27_HOME + "UD_Scottish_Gaelic-ARCOSG/gd_arcosg-ud-dev.conllu"
"UD_27 dev set of SCOTTISH_GAELIC_ARCOSG."
UD_27_SCOTTISH_GAELIC_ARCOSG_TEST = _UD_27_HOME + "UD_Scottish_Gaelic-ARCOSG/gd_arcosg-ud-test.conllu"
"UD_27 test set of SCOTTISH_GAELIC_ARCOSG."
UD_27_SERBIAN_SET_TRAIN = _UD_27_HOME + "UD_Serbian-SET/sr_set-ud-train.conllu"
"UD_27 train set of SERBIAN_SET."
UD_27_SERBIAN_SET_DEV = _UD_27_HOME + "UD_Serbian-SET/sr_set-ud-dev.conllu"
"UD_27 dev set of SERBIAN_SET."
UD_27_SERBIAN_SET_TEST = _UD_27_HOME + "UD_Serbian-SET/sr_set-ud-test.conllu"
"UD_27 test set of SERBIAN_SET."
UD_27_SKOLT_SAMI_GIELLAGAS_TEST = _UD_27_HOME + "UD_Skolt_Sami-Giellagas/sms_giellagas-ud-test.conllu"
"UD_27 test set of SKOLT_SAMI_GIELLAGAS."
UD_27_SLOVAK_SNK_TRAIN = _UD_27_HOME + "UD_Slovak-SNK/sk_snk-ud-train.conllu"
"UD_27 train set of SLOVAK_SNK."
UD_27_SLOVAK_SNK_DEV = _UD_27_HOME + "UD_Slovak-SNK/sk_snk-ud-dev.conllu"
"UD_27 dev set of SLOVAK_SNK."
UD_27_SLOVAK_SNK_TEST = _UD_27_HOME + "UD_Slovak-SNK/sk_snk-ud-test.conllu"
"UD_27 test set of SLOVAK_SNK."
UD_27_SLOVENIAN_SSJ_TRAIN = _UD_27_HOME + "UD_Slovenian-SSJ/sl_ssj-ud-train.conllu"
"UD_27 train set of SLOVENIAN_SSJ."
UD_27_SLOVENIAN_SSJ_DEV = _UD_27_HOME + "UD_Slovenian-SSJ/sl_ssj-ud-dev.conllu"
"UD_27 dev set of SLOVENIAN_SSJ."
UD_27_SLOVENIAN_SSJ_TEST = _UD_27_HOME + "UD_Slovenian-SSJ/sl_ssj-ud-test.conllu"
"UD_27 test set of SLOVENIAN_SSJ."
UD_27_SLOVENIAN_SST_TRAIN = _UD_27_HOME + "UD_Slovenian-SST/sl_sst-ud-train.conllu"
"UD_27 train set of SLOVENIAN_SST."
UD_27_SLOVENIAN_SST_TEST = _UD_27_HOME + "UD_Slovenian-SST/sl_sst-ud-test.conllu"
"UD_27 test set of SLOVENIAN_SST."
UD_27_SOI_AHA_TEST = _UD_27_HOME + "UD_Soi-AHA/soj_aha-ud-test.conllu"
"UD_27 test set of SOI_AHA."
UD_27_SOUTH_LEVANTINE_ARABIC_MADAR_TEST = _UD_27_HOME + "UD_South_Levantine_Arabic-MADAR/ajp_madar-ud-test.conllu"
"UD_27 test set of SOUTH_LEVANTINE_ARABIC_MADAR."
UD_27_SPANISH_ANCORA_TRAIN = _UD_27_HOME + "UD_Spanish-AnCora/es_ancora-ud-train.conllu"
"UD_27 train set of SPANISH_ANCORA."
UD_27_SPANISH_ANCORA_DEV = _UD_27_HOME + "UD_Spanish-AnCora/es_ancora-ud-dev.conllu"
"UD_27 dev set of SPANISH_ANCORA."
UD_27_SPANISH_ANCORA_TEST = _UD_27_HOME + "UD_Spanish-AnCora/es_ancora-ud-test.conllu"
"UD_27 test set of SPANISH_ANCORA."
UD_27_SPANISH_GSD_TRAIN = _UD_27_HOME + "UD_Spanish-GSD/es_gsd-ud-train.conllu"
"UD_27 train set of SPANISH_GSD."
UD_27_SPANISH_GSD_DEV = _UD_27_HOME + "UD_Spanish-GSD/es_gsd-ud-dev.conllu"
"UD_27 dev set of SPANISH_GSD."
UD_27_SPANISH_GSD_TEST = _UD_27_HOME + "UD_Spanish-GSD/es_gsd-ud-test.conllu"
"UD_27 test set of SPANISH_GSD."
UD_27_SPANISH_PUD_TEST = _UD_27_HOME + "UD_Spanish-PUD/es_pud-ud-test.conllu"
"UD_27 test set of SPANISH_PUD."
UD_27_SWEDISH_LINES_TRAIN = _UD_27_HOME + "UD_Swedish-LinES/sv_lines-ud-train.conllu"
"UD_27 train set of SWEDISH_LINES."
UD_27_SWEDISH_LINES_DEV = _UD_27_HOME + "UD_Swedish-LinES/sv_lines-ud-dev.conllu"
"UD_27 dev set of SWEDISH_LINES."
UD_27_SWEDISH_LINES_TEST = _UD_27_HOME + "UD_Swedish-LinES/sv_lines-ud-test.conllu"
"UD_27 test set of SWEDISH_LINES."
UD_27_SWEDISH_PUD_TEST = _UD_27_HOME + "UD_Swedish-PUD/sv_pud-ud-test.conllu"
"UD_27 test set of SWEDISH_PUD."
UD_27_SWEDISH_TALBANKEN_TRAIN = _UD_27_HOME + "UD_Swedish-Talbanken/sv_talbanken-ud-train.conllu"
"UD_27 train set of SWEDISH_TALBANKEN."
UD_27_SWEDISH_TALBANKEN_DEV = _UD_27_HOME + "UD_Swedish-Talbanken/sv_talbanken-ud-dev.conllu"
"UD_27 dev set of SWEDISH_TALBANKEN."
UD_27_SWEDISH_TALBANKEN_TEST = _UD_27_HOME + "UD_Swedish-Talbanken/sv_talbanken-ud-test.conllu"
"UD_27 test set of SWEDISH_TALBANKEN."
UD_27_SWEDISH_SIGN_LANGUAGE_SSLC_TRAIN = _UD_27_HOME + "UD_Swedish_Sign_Language-SSLC/swl_sslc-ud-train.conllu"
"UD_27 train set of SWEDISH_SIGN_LANGUAGE_SSLC."
UD_27_SWEDISH_SIGN_LANGUAGE_SSLC_DEV = _UD_27_HOME + "UD_Swedish_Sign_Language-SSLC/swl_sslc-ud-dev.conllu"
"UD_27 dev set of SWEDISH_SIGN_LANGUAGE_SSLC."
UD_27_SWEDISH_SIGN_LANGUAGE_SSLC_TEST = _UD_27_HOME + "UD_Swedish_Sign_Language-SSLC/swl_sslc-ud-test.conllu"
"UD_27 test set of SWEDISH_SIGN_LANGUAGE_SSLC."
UD_27_SWISS_GERMAN_UZH_TEST = _UD_27_HOME + "UD_Swiss_German-UZH/gsw_uzh-ud-test.conllu"
"UD_27 test set of SWISS_GERMAN_UZH."
UD_27_TAGALOG_TRG_TEST = _UD_27_HOME + "UD_Tagalog-TRG/tl_trg-ud-test.conllu"
"UD_27 test set of TAGALOG_TRG."
UD_27_TAGALOG_UGNAYAN_TEST = _UD_27_HOME + "UD_Tagalog-Ugnayan/tl_ugnayan-ud-test.conllu"
"UD_27 test set of TAGALOG_UGNAYAN."
UD_27_TAMIL_MWTT_TEST = _UD_27_HOME + "UD_Tamil-MWTT/ta_mwtt-ud-test.conllu"
"UD_27 test set of TAMIL_MWTT."
UD_27_TAMIL_TTB_TRAIN = _UD_27_HOME + "UD_Tamil-TTB/ta_ttb-ud-train.conllu"
"UD_27 train set of TAMIL_TTB."
UD_27_TAMIL_TTB_DEV = _UD_27_HOME + "UD_Tamil-TTB/ta_ttb-ud-dev.conllu"
"UD_27 dev set of TAMIL_TTB."
UD_27_TAMIL_TTB_TEST = _UD_27_HOME + "UD_Tamil-TTB/ta_ttb-ud-test.conllu"
"UD_27 test set of TAMIL_TTB."
UD_27_TELUGU_MTG_TRAIN = _UD_27_HOME + "UD_Telugu-MTG/te_mtg-ud-train.conllu"
"UD_27 train set of TELUGU_MTG."
UD_27_TELUGU_MTG_DEV = _UD_27_HOME + "UD_Telugu-MTG/te_mtg-ud-dev.conllu"
"UD_27 dev set of TELUGU_MTG."
UD_27_TELUGU_MTG_TEST = _UD_27_HOME + "UD_Telugu-MTG/te_mtg-ud-test.conllu"
"UD_27 test set of TELUGU_MTG."
UD_27_THAI_PUD_TEST = _UD_27_HOME + "UD_Thai-PUD/th_pud-ud-test.conllu"
"UD_27 test set of THAI_PUD."
UD_27_TUPINAMBA_TUDET_TEST = _UD_27_HOME + "UD_Tupinamba-TuDeT/tpn_tudet-ud-test.conllu"
"UD_27 test set of TUPINAMBA_TUDET."
UD_27_TURKISH_BOUN_TRAIN = _UD_27_HOME + "UD_Turkish-BOUN/tr_boun-ud-train.conllu"
"UD_27 train set of TURKISH_BOUN."
UD_27_TURKISH_BOUN_DEV = _UD_27_HOME + "UD_Turkish-BOUN/tr_boun-ud-dev.conllu"
"UD_27 dev set of TURKISH_BOUN."
UD_27_TURKISH_BOUN_TEST = _UD_27_HOME + "UD_Turkish-BOUN/tr_boun-ud-test.conllu"
"UD_27 test set of TURKISH_BOUN."
UD_27_TURKISH_GB_TEST = _UD_27_HOME + "UD_Turkish-GB/tr_gb-ud-test.conllu"
"UD_27 test set of TURKISH_GB."
UD_27_TURKISH_IMST_TRAIN = _UD_27_HOME + "UD_Turkish-IMST/tr_imst-ud-train.conllu"
"UD_27 train set of TURKISH_IMST."
UD_27_TURKISH_IMST_DEV = _UD_27_HOME + "UD_Turkish-IMST/tr_imst-ud-dev.conllu"
"UD_27 dev set of TURKISH_IMST."
UD_27_TURKISH_IMST_TEST = _UD_27_HOME + "UD_Turkish-IMST/tr_imst-ud-test.conllu"
"UD_27 test set of TURKISH_IMST."
UD_27_TURKISH_PUD_TEST = _UD_27_HOME + "UD_Turkish-PUD/tr_pud-ud-test.conllu"
"UD_27 test set of TURKISH_PUD."
UD_27_TURKISH_GERMAN_SAGT_TRAIN = _UD_27_HOME + "UD_Turkish_German-SAGT/qtd_sagt-ud-train.conllu"
"UD_27 train set of TURKISH_GERMAN_SAGT."
UD_27_TURKISH_GERMAN_SAGT_DEV = _UD_27_HOME + "UD_Turkish_German-SAGT/qtd_sagt-ud-dev.conllu"
"UD_27 dev set of TURKISH_GERMAN_SAGT."
UD_27_TURKISH_GERMAN_SAGT_TEST = _UD_27_HOME + "UD_Turkish_German-SAGT/qtd_sagt-ud-test.conllu"
"UD_27 test set of TURKISH_GERMAN_SAGT."
UD_27_UKRAINIAN_IU_TRAIN = _UD_27_HOME + "UD_Ukrainian-IU/uk_iu-ud-train.conllu"
"UD_27 train set of UKRAINIAN_IU."
UD_27_UKRAINIAN_IU_DEV = _UD_27_HOME + "UD_Ukrainian-IU/uk_iu-ud-dev.conllu"
"UD_27 dev set of UKRAINIAN_IU."
UD_27_UKRAINIAN_IU_TEST = _UD_27_HOME + "UD_Ukrainian-IU/uk_iu-ud-test.conllu"
"UD_27 test set of UKRAINIAN_IU."
UD_27_UPPER_SORBIAN_UFAL_TRAIN = _UD_27_HOME + "UD_Upper_Sorbian-UFAL/hsb_ufal-ud-train.conllu"
"UD_27 train set of UPPER_SORBIAN_UFAL."
UD_27_UPPER_SORBIAN_UFAL_TEST = _UD_27_HOME + "UD_Upper_Sorbian-UFAL/hsb_ufal-ud-test.conllu"
"UD_27 test set of UPPER_SORBIAN_UFAL."
UD_27_URDU_UDTB_TRAIN = _UD_27_HOME + "UD_Urdu-UDTB/ur_udtb-ud-train.conllu"
"UD_27 train set of URDU_UDTB."
UD_27_URDU_UDTB_DEV = _UD_27_HOME + "UD_Urdu-UDTB/ur_udtb-ud-dev.conllu"
"UD_27 dev set of URDU_UDTB."
UD_27_URDU_UDTB_TEST = _UD_27_HOME + "UD_Urdu-UDTB/ur_udtb-ud-test.conllu"
"UD_27 test set of URDU_UDTB."
UD_27_UYGHUR_UDT_TRAIN = _UD_27_HOME + "UD_Uyghur-UDT/ug_udt-ud-train.conllu"
"UD_27 train set of UYGHUR_UDT."
UD_27_UYGHUR_UDT_DEV = _UD_27_HOME + "UD_Uyghur-UDT/ug_udt-ud-dev.conllu"
"UD_27 dev set of UYGHUR_UDT."
UD_27_UYGHUR_UDT_TEST = _UD_27_HOME + "UD_Uyghur-UDT/ug_udt-ud-test.conllu"
"UD_27 test set of UYGHUR_UDT."
UD_27_VIETNAMESE_VTB_TRAIN = _UD_27_HOME + "UD_Vietnamese-VTB/vi_vtb-ud-train.conllu"
"UD_27 train set of VIETNAMESE_VTB."
UD_27_VIETNAMESE_VTB_DEV = _UD_27_HOME + "UD_Vietnamese-VTB/vi_vtb-ud-dev.conllu"
"UD_27 dev set of VIETNAMESE_VTB."
UD_27_VIETNAMESE_VTB_TEST = _UD_27_HOME + "UD_Vietnamese-VTB/vi_vtb-ud-test.conllu"
"UD_27 test set of VIETNAMESE_VTB."
UD_27_WARLPIRI_UFAL_TEST = _UD_27_HOME + "UD_Warlpiri-UFAL/wbp_ufal-ud-test.conllu"
"UD_27 test set of WARLPIRI_UFAL."
UD_27_WELSH_CCG_TRAIN = _UD_27_HOME + "UD_Welsh-CCG/cy_ccg-ud-train.conllu"
"UD_27 train set of WELSH_CCG."
UD_27_WELSH_CCG_TEST = _UD_27_HOME + "UD_Welsh-CCG/cy_ccg-ud-test.conllu"
"UD_27 test set of WELSH_CCG."
UD_27_WOLOF_WTB_TRAIN = _UD_27_HOME + "UD_Wolof-WTB/wo_wtb-ud-train.conllu"
"UD_27 train set of WOLOF_WTB."
UD_27_WOLOF_WTB_DEV = _UD_27_HOME + "UD_Wolof-WTB/wo_wtb-ud-dev.conllu"
"UD_27 dev set of WOLOF_WTB."
UD_27_WOLOF_WTB_TEST = _UD_27_HOME + "UD_Wolof-WTB/wo_wtb-ud-test.conllu"
"UD_27 test set of WOLOF_WTB."
UD_27_YORUBA_YTB_TEST = _UD_27_HOME + "UD_Yoruba-YTB/yo_ytb-ud-test.conllu"
"UD_27 test set of YORUBA_YTB."
================================================
FILE: hanlp/datasets/parsing/ud/ud27m.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-05-21 20:39
import os
from hanlp.datasets.parsing.ud import concat_treebanks
from hanlp.datasets.parsing.ud.ud27 import _UD_27_HOME
_UD_27_MULTILINGUAL_HOME = concat_treebanks(_UD_27_HOME, '2.7')
UD_27_MULTILINGUAL_TRAIN = os.path.join(_UD_27_MULTILINGUAL_HOME, 'train.conllu')
"Training set of multilingual UD_27 obtained by concatenating all training sets."
UD_27_MULTILINGUAL_DEV = os.path.join(_UD_27_MULTILINGUAL_HOME, 'dev.conllu')
"Dev set of multilingual UD_27 obtained by concatenating all dev sets."
UD_27_MULTILINGUAL_TEST = os.path.join(_UD_27_MULTILINGUAL_HOME, 'test.conllu')
"Test set of multilingual UD_27 obtained by concatenating all test sets."
================================================
FILE: hanlp/datasets/pos/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 22:50
================================================
FILE: hanlp/datasets/pos/ctb5.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 22:51
_CTB5_POS_HOME = 'http://file.hankcs.com/corpus/ctb5.1-pos.zip'
CTB5_POS_TRAIN = f'{_CTB5_POS_HOME}#train.tsv'
'''PoS training set for CTB5.'''
CTB5_POS_DEV = f'{_CTB5_POS_HOME}#dev.tsv'
'''PoS dev set for CTB5.'''
CTB5_POS_TEST = f'{_CTB5_POS_HOME}#test.tsv'
'''PoS test set for CTB5.'''
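# --- Illustrative note (not part of the original module) ---
# The '#train.tsv' fragment selects a member inside the downloaded archive:
# the resource resolver is expected to fetch ctb5.1-pos.zip once and return
# the path of the named file. A hedged sketch, assuming
# hanlp.utils.io_util.get_resource handles the fragment:
#
#     from hanlp.utils.io_util import get_resource
#     train_tsv = get_resource(CTB5_POS_TRAIN)  # local path to train.tsv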
================================================
FILE: hanlp/datasets/qa/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-03-20 19:17
================================================
FILE: hanlp/datasets/qa/hotpotqa.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-03-20 19:46
from enum import Enum, auto
import torch
import ujson
from torch.nn.utils.rnn import pad_sequence
from hanlp.common.dataset import TransformableDataset
from hanlp_common.util import merge_list_of_dict
HOTPOT_QA_TRAIN = 'http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_train_v1.1.json'
HOTPOT_QA_DISTRACTOR_DEV = 'http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_distractor_v1.json'
HOTPOT_QA_FULLWIKI_DEV = 'http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_fullwiki_v1.json'
class HotpotQADataset(TransformableDataset):
def load_file(self, filepath):
with open(filepath) as fd:
return ujson.load(fd)
class BuildGraph(object):
def __init__(self, dst='graph') -> None:
super().__init__()
self.dst = dst
def __call__(self, sample: dict):
sample[self.dst] = build_graph(sample)
return sample
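# Collate a list of graph-augmented samples into dense batch tensors: `adj`
# holds a symmetric adjacency matrix over each sample's vertices, `token_offset`
# records a running per-vertex offset used to align vertices with the padded
# token tensor, and the sp_* tensors mark which vertices are supporting-fact
# root candidates and which carry gold supporting-fact labels.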
def hotpotqa_collate_fn(samples):
batch = merge_list_of_dict(samples)
max_seq_len = len(max([x['graph'] for x in samples], key=len))
arc = torch.zeros([len(samples), max_seq_len, max_seq_len])
token_offset = torch.zeros([len(samples), max_seq_len], dtype=torch.long)
src_mask = torch.zeros([len(samples), max_seq_len], dtype=torch.bool)
sp_candidate_mask = torch.zeros([len(samples), max_seq_len], dtype=torch.bool)
sp_label = torch.zeros([len(samples), max_seq_len], dtype=torch.float)
# sp = torch.zeros([len(samples), max_seq_len], dtype=torch.bool)
tokens = []
offset = 0
for i, sample in enumerate(samples):
graph = sample['graph']
for j, u in enumerate(graph):
u: Vertex = u
for v in u.to:
v: Vertex = v
arc[i, v.id, u.id] = 1
arc[i, u.id, v.id] = 1
# record each vertex's token offset
token_offset[i, u.id] = offset
src_mask[i, u.id] = True
sp_candidate_mask[i, u.id] = u.is_sp_root_candidate()
sp_label[i, u.id] = u.is_sp_root()
offset += 1
tokens.extend(sample['token_id'])
seq_lengths = torch.LongTensor(list(map(len, tokens)))
tokens = [torch.LongTensor(x) for x in tokens]
tokens = pad_sequence(tokens, batch_first=True)
batch['adj'] = arc
batch['tokens'] = tokens
batch['src_mask'] = src_mask
batch['seq_lengths'] = seq_lengths
batch['token_offset'] = token_offset
batch['sp_candidate_mask'] = sp_candidate_mask
batch['sp_label'] = sp_label
return batch
def flat_sentence(sample: dict) -> dict:
sample['token'] = token = []
for sent in sample['parsed_sentences']:
token.append(['bos'] + [x.lower() for x in sent[0]])
return sample
def create_sp_label(sample: dict) -> dict:
sample['sp_label'] = sp_label = []
def label(title_, index_):
for t, i in sample['supporting_facts']:
if t == title_ and i == index_:
return 1
return 0
for context in sample['context']:
title, sents = context
for idx, sent in enumerate(sents):
sp_label.append(label(title, idx))
assert len(sample['supporting_facts']) == sum(sp_label)
return sample
class Type(Enum):
Q_ROOT = auto()
Q_WORD = auto()
SP_ROOT = auto()
SP_WORD = auto()
NON_SP_ROOT = auto()
NON_SP_WORD = auto()
DOCUMENT_TITLE = auto()
class Vertex(object):
def __init__(self, id, type: Type, text=None) -> None:
super().__init__()
self.id = id
self.type = type
if not text:
text = str(type).split('.')[-1]
self.text = text
self.to = []
self.rel = []
def connect(self, to, rel):
self.to.append(to)
self.rel.append(rel)
def __str__(self) -> str:
return f'{self.text} {self.id}'
def __hash__(self) -> int:
return self.id
def is_word(self):
return self.type in {Type.SP_WORD, Type.Q_WORD, Type.NON_SP_WORD}
def is_question(self):
return self.type in {Type.Q_ROOT, Type.Q_WORD}
def is_sp(self):
return self.type in {Type.SP_ROOT, Type.SP_WORD}
def is_sp_root(self):
return self.type in {Type.SP_ROOT}
def is_sp_root_candidate(self):
return self.type in {Type.SP_ROOT, Type.NON_SP_ROOT}
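# build_graph turns one HotpotQA sample into a list of vertices: vertex 0 is
# the question root, each context sentence contributes an SP_ROOT or
# NON_SP_ROOT vertex (depending on whether it is a supporting fact) followed
# by one word vertex per token, word vertices are wired by their dependency
# arcs, and the question root is finally connected to every sentence root with
# a 'supporting fact?' relation.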
def build_graph(each: dict, debug=False):
raw_sents = []
raw_sents.append(each['question'])
sp_idx = set()
sp_sents = {}
for sp in each['supporting_facts']:
title, offset = sp
ids = sp_sents.get(title, None)
if ids is None:
sp_sents[title] = ids = set()
ids.add(offset)
idx = 1
for document in each['context']:
title, sents = document
raw_sents += sents
for i, s in enumerate(sents):
if title in sp_sents and i in sp_sents[title]:
sp_idx.add(idx)
idx += 1
assert idx == len(raw_sents)
parsed_sents = each['parsed_sentences']
assert len(raw_sents) == len(parsed_sents)
graph = []
for idx, (raw, sent) in enumerate(zip(raw_sents, parsed_sents)):
if debug:
if idx > 1 and idx not in sp_idx:
continue
offset = len(graph)
if idx == 0:
if debug:
print(f'Question: {raw}')
graph.append(Vertex(len(graph), Type.Q_ROOT))
else:
if debug:
if idx in sp_idx:
print(f'Supporting Fact: {raw}')
graph.append(Vertex(len(graph), Type.SP_ROOT if idx in sp_idx else Type.NON_SP_ROOT))
tokens, heads, deprels = sent
for t, h, d in zip(tokens, heads, deprels):
graph.append(
Vertex(len(graph), (Type.SP_WORD if idx in sp_idx else Type.NON_SP_WORD) if idx else Type.Q_WORD, t))
for i, (h, d) in enumerate(zip(heads, deprels)):
graph[offset + h].connect(graph[offset + i + 1], d)
q_root = graph[0]
for u in graph:
if u.type == Type.SP_ROOT or u.type == Type.NON_SP_ROOT:
q_root.connect(u, 'supporting fact?')
return graph
================================================
FILE: hanlp/datasets/srl/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-06-22 19:15
================================================
FILE: hanlp/datasets/srl/loaders/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-12-28 19:05
================================================
FILE: hanlp/datasets/srl/loaders/conll2012.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-06-22 19:15
import glob
import json
import os
from typing import Union, List, Callable
from hanlp.utils.span_util import enumerate_spans
from hanlp.common.dataset import TransformableDataset
from hanlp.common.transform import NamedTransform
from hanlp.utils.io_util import read_tsv_as_sents, get_resource, TimingFileIterator
from hanlp.utils.time_util import CountdownTimer
class CoNLL2012BIOSRLDataset(TransformableDataset):
def load_file(self, filepath: str):
filepath = get_resource(filepath)
if os.path.isfile(filepath):
files = [filepath]
else:
assert os.path.isdir(filepath), f'{filepath} has to be a directory of CoNLL 2012'
files = sorted(glob.glob(f'{filepath}/**/*gold_conll', recursive=True))
timer = CountdownTimer(len(files))
for fid, f in enumerate(files):
timer.log(f'files loading[blink][yellow]...[/yellow][/blink]')
# 0:DOCUMENT 1:PART 2:INDEX 3:WORD 4:POS 5:PARSE 6:LEMMA 7:FRAME 8:SENSE 9:SPEAKER 10:NE 11-N:ARGS N:COREF
for sent in read_tsv_as_sents(f, ignore_prefix='#'):
sense = [cell[7] for cell in sent]
props = [cell[11:-1] for cell in sent]
props = zip(*props)  # transpose: one tuple of per-token labels per predicate column
prd_bio_labels = [self._make_bio_labels(prop) for prop in props]
prd_bio_labels = [self._remove_B_V(x) for x in prd_bio_labels]
prd_indices = [i for i, x in enumerate(sense) if x != '-']
token = [x[3] for x in sent]
srl = [None for x in token]
for idx, labels in zip(prd_indices, prd_bio_labels):
srl[idx] = labels
srl = [x if x else ['O'] * len(token) for x in srl]
yield {'token': token, 'srl': srl}
@staticmethod
def _make_bio_labels(prop):
"""Copied from https://github.com/hiroki13/span-based-srl/blob/2c8b677c4e00b6c607e09ef4f9fe3d54961e4f2e/src/utils/sent.py#L42
Args:
prop: 1D: n_words; elem=bracket label
Returns:
1D: n_words; elem=BIO label
"""
labels = []
prev = None
for arg in prop:
if arg.startswith('('):
if arg.endswith(')'):
prev = arg.split("*")[0][1:]
label = 'B-' + prev
prev = None
else:
prev = arg[1:-1]
label = 'B-' + prev
else:
if prev:
label = 'I-' + prev
if arg.endswith(')'):
prev = None
else:
label = 'O'
labels.append(label)
return labels
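# Worked example (illustrative): the bracketed column
# ['(A0*', '*)', '(V*)'] yields ['B-A0', 'I-A0', 'B-V'], i.e. '(A0*' opens an
# A0 span, '*)' continues and closes it, and '(V*)' is a single-token V span.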
@staticmethod
def _remove_B_V(labels):
return ['O' if x == 'B-V' else x for x in labels]
class CoNLL2012SRLDataset(TransformableDataset):
def __init__(self,
data: Union[str, List],
transform: Union[Callable, List] = None,
cache=None,
doc_level_offset=True,
generate_idx=None) -> None:
self.doc_level_offset = doc_level_offset
super().__init__(data, transform, cache, generate_idx=generate_idx)
def load_file(self, filepath: str):
"""Load ``.jsonlines`` CoNLL12-style corpus. Samples of this corpus can be found using the following scripts.
.. highlight:: python
.. code-block:: python
import json
from hanlp_common.document import Document
from hanlp.datasets.srl.ontonotes5.chinese import ONTONOTES5_CONLL12_CHINESE_DEV
from hanlp.utils.io_util import get_resource
with open(get_resource(ONTONOTES5_CONLL12_CHINESE_DEV)) as src:
for line in src:
doc = json.loads(line)
print(Document(doc))
break
Args:
filepath: ``.jsonlines`` CoNLL12 corpus.
"""
filename = os.path.basename(filepath)
reader = TimingFileIterator(filepath)
num_docs, num_sentences = 0, 0
for line in reader:
doc = json.loads(line)
num_docs += 1
num_tokens_in_doc = 0
for sid, (sentence, srl) in enumerate(zip(doc['sentences'], doc['srl'])):
if self.doc_level_offset:
srl = [(x[0] - num_tokens_in_doc, x[1] - num_tokens_in_doc, x[2] - num_tokens_in_doc, x[3]) for x in
srl]
else:
srl = [(x[0], x[1], x[2], x[3]) for x in srl]
for x in srl:
if any([o < 0 for o in x[:3]]):
raise ValueError('Negative offset occurred; maybe set doc_level_offset=False')
if any([o >= len(sentence) for o in x[:3]]):
raise ValueError('Offset exceeds sentence length; maybe set doc_level_offset=True')
deduplicated_srl = set()
pa_set = set()
for p, b, e, l in srl:
pa = (p, b, e)
if pa in pa_set:
continue
pa_set.add(pa)
deduplicated_srl.add((p, b, e, l))
yield self.build_sample(sentence, deduplicated_srl, doc, sid)
num_sentences += 1
num_tokens_in_doc += len(sentence)
reader.log(
f'{filename} {num_docs} documents, {num_sentences} sentences [blink][yellow]...[/yellow][/blink]')
reader.erase()
# noinspection PyMethodMayBeStatic
def build_sample(self, sentence, deduplicated_srl, doc, sid):
return {
'token': sentence,
'srl': deduplicated_srl
}
def group_pa_by_p(sample: dict) -> dict:
if 'srl' in sample:
srl: list = sample['srl']
grouped_srl = group_pa_by_p_(srl)
sample['srl'] = grouped_srl
return sample
def group_pa_by_p_(srl):
grouped_srl = {}
for p, b, e, l in srl:
bel = grouped_srl.get(p, None)
if not bel:
bel = grouped_srl[p] = set()
bel.add((b, e, l))
return grouped_srl
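# Worked example (illustrative): group_pa_by_p_([(3, 0, 1, 'ARG0'), (3, 5, 6, 'ARG1')])
# returns {3: {(0, 1, 'ARG0'), (5, 6, 'ARG1')}}, i.e. both arguments end up
# grouped under predicate offset 3.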
def filter_v_args(sample: dict) -> dict:
if 'srl' in sample:
sample['srl'] = [t for t in sample['srl'] if t[-1] not in ["V", "C-V"]]
return sample
def unpack_srl(sample: dict) -> dict:
if 'srl' in sample:
srl = sample['srl']
predicate_offset = [x[0] for x in srl]
argument_begin_offset = [x[1] for x in srl]
argument_end_offset = [x[2] for x in srl]
srl_label = [x[-1] for x in srl]
sample.update({
'predicate_offset': predicate_offset,
'argument_begin_offset': argument_begin_offset,
'argument_end_offset': argument_end_offset,
'srl_label': srl_label, # We can obtain mask by srl_label > 0
# 'srl_mask': len(srl_label),
})
return sample
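# Worked example (illustrative): with sample['srl'] == [(3, 0, 1, 'ARG0')],
# unpack_srl adds the parallel lists predicate_offset=[3],
# argument_begin_offset=[0], argument_end_offset=[1] and srl_label=['ARG0'].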
class SpanCandidatesGenerator(NamedTransform):
def __init__(self, src: str, dst: str = None, max_span_width=None) -> None:
if not dst:
dst = f'{src}_span'
super().__init__(src, dst)
self.max_span_width = max_span_width
def __call__(self, sample: dict) -> dict:
sample[self.dst] = list(enumerate_spans(sample[self.src], max_span_width=self.max_span_width))
return sample
class CoNLL2012SRLBIODataset(CoNLL2012SRLDataset):
def build_sample(self, tokens, deduplicated_srl, doc, sid):
# Convert srl to exclusive format
deduplicated_srl = set((x[0], x[1], x[2] + 1, x[3]) for x in deduplicated_srl if x[3] != 'V')
labels = [['O'] * len(tokens) for _ in range(len(tokens))]
srl = group_pa_by_p_(deduplicated_srl)
for p, args in sorted(srl.items()):
labels_per_p = labels[p]
for start, end, label in args:
assert end > start
assert label != 'V' # We don't predict predicate
labels_per_p[start] = 'B-' + label
for j in range(start + 1, end):
labels_per_p[j] = 'I-' + label
sample = {
'token': tokens,
'srl': labels,
'srl_set': deduplicated_srl,
}
if 'pos' in doc:
sample['pos'] = doc['pos'][sid]
return sample
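# Worked example (illustrative): for 4 tokens and
# deduplicated_srl == {(1, 0, 0, 'ARG0')}, the inclusive span is first made
# exclusive, giving (1, 0, 1, 'ARG0'), and labels[1] becomes
# ['B-ARG0', 'O', 'O', 'O']: one BIO sequence per candidate predicate position.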
================================================
FILE: hanlp/datasets/srl/loaders/ontonotes_loader.py
================================================
from typing import DefaultDict, List, Optional, Iterator, Set, Tuple, Dict
from collections import defaultdict
import codecs
import os
import logging
from hanlp.utils.span_util import TypedSpan, enumerate_spans
from phrasetree.tree import Tree
logger = logging.getLogger(__name__)
class OntonotesSentence:
"""
A class representing the annotations available for a single CONLL formatted sentence.
# Parameters
document_id : `str`
This is a variation on the document filename
sentence_id : `int`
The integer ID of the sentence within a document.
words : `List[str]`
This is the tokens as segmented/tokenized in the Treebank.
pos_tags : `List[str]`
These are the Penn-Treebank-style parts of speech. When parse information is missing,
all parts of speech except the one for which there is some sense or proposition
annotation are marked with an XX tag. The verb is marked with just a VERB tag.
parse_tree : `nltk.Tree`
An nltk Tree representing the parse. It includes POS tags as pre-terminal nodes.
When the parse information is missing, the parse will be `None`.
predicate_lemmas : `List[Optional[str]]`
The predicate lemma of the words for which we have semantic role
information or word sense information. All other indices are `None`.
predicate_framenet_ids : `List[Optional[int]]`
The PropBank frameset ID of the lemmas in `predicate_lemmas`, or `None`.
word_senses : `List[Optional[float]]`
The word senses for the words in the sentence, or `None`. These are floats
because the word sense can have values after the decimal, like `1.1`.
speakers : `List[Optional[str]]`
The speaker information for the words in the sentence, if present, or `None`.
This is the speaker or author name where available, mostly in Broadcast Conversation
and Web Log data. When not available, the rows are marked with a "-".
named_entities : `List[str]`
The BIO tags for named entities in the sentence.
srl_frames : `List[Tuple[str, List[str]]]`
A list of tuples, one per verbal predicate in the sentence, pairing the
predicate with its PropBank frame labels in BIO format.
coref_spans : `Set[TypedSpan]`
The spans for entity mentions involved in coreference resolution within the sentence.
Each element is a tuple composed of (cluster_id, (start_index, end_index)). Indices
are `inclusive`.
"""
def __init__(
self,
document_id: str,
sentence_id: int,
words: List[str],
pos_tags: List[str],
parse_tree: Optional[Tree],
predicate_lemmas: List[Optional[str]],
predicate_framenet_ids: List[Optional[str]],
word_senses: List[Optional[float]],
speakers: List[Optional[str]],
named_entities: List[str],
srl_frames: List[Tuple[str, List[str]]],
coref_spans: Set[TypedSpan],
) -> None:
self.document_id = document_id
self.sentence_id = sentence_id
self.words = words
self.pos_tags = pos_tags
self.parse_tree = parse_tree
self.predicate_lemmas = predicate_lemmas
self.predicate_framenet_ids = predicate_framenet_ids
self.word_senses = word_senses
self.speakers = speakers
self.named_entities = named_entities
self.srl_frames = srl_frames
self.coref_spans = coref_spans
class Ontonotes:
"""
This reader is designed to read in the English OntoNotes v5.0 data
in the format used by the CoNLL 2011/2012 shared tasks. In order to use this
Reader, you must follow the instructions provided [here (v12 release):]
(https://cemantix.org/data/ontonotes.html), which will allow you to download
the CoNLL style annotations for the OntoNotes v5.0 release -- LDC2013T19.tgz
obtained from LDC.
Once you have run the scripts on the extracted data, you will have a folder
structured as follows:
```
conll-formatted-ontonotes-5.0/
── data
├── development
└── data
└── english
└── annotations
├── bc
├── bn
├── mz
├── nw
├── pt
├── tc
└── wb
├── test
└── data
└── english
└── annotations
├── bc
├── bn
├── mz
├── nw
├── pt
├── tc
└── wb
└── train
└── data
└── english
└── annotations
├── bc
├── bn
├── mz
├── nw
├── pt
├── tc
└── wb
```
The file path provided to this class can then be any of the train, test or development
directories (or the top-level data directory, if you are not utilizing the splits).
The data has the following format, ordered by column.
1. Document ID : `str`
This is a variation on the document filename
2. Part number : `int`
Some files are divided into multiple parts, numbered as 000, 001, 002, and so on.
3. Word number : `int`
This is the word index of the word in that sentence.
4. Word : `str`
This is the token as segmented/tokenized in the Treebank. Initially the `*_skel` files
contain the placeholder [WORD], which gets replaced by the actual token from the
Treebank that is part of the OntoNotes release.
5. POS Tag : `str`
This is the Penn Treebank-style part of speech. When parse information is missing,
all parts of speech except the one for which there is some sense or proposition
annotation are marked with an XX tag. The verb is marked with just a VERB tag.
6. Parse bit : `str`
This is the bracketed structure broken before the first open parenthesis in the parse,
and the word/part-of-speech leaf replaced with a `*`. When the parse information is
missing, the first word of a sentence is tagged as `(TOP*` and the last word is tagged
as `*)` and all intermediate words are tagged with a `*`.
7. Predicate lemma : `str`
The predicate lemma is mentioned for the rows for which we have semantic role
information or word sense information. All other rows are marked with a "-".
8. Predicate Frameset ID : `int`
The PropBank frameset ID of the predicate in Column 7.
9. Word sense : `float`
This is the word sense of the word in Column 3.
10. Speaker/Author : `str`
This is the speaker or author name where available, mostly in Broadcast Conversation
and Web Log data. When not available, the rows are marked with a "-".
11. Named Entities : `str`
This column identifies the spans representing various named entities. For documents
which do not have named entity annotation, each line is represented with an `*`.
12. Predicate Arguments : `str`
There is one column each of predicate argument structure information for the predicate
mentioned in Column 7. If there are no predicates tagged in a sentence this is a
single column with all rows marked with an `*`.
-1. Co-reference : `str`
Co-reference chain information encoded in a parenthesis structure. For documents that do
not have co-reference annotations, each line is represented with a "-".
"""
def dataset_iterator(self, file_path: str) -> Iterator[OntonotesSentence]:
"""
An iterator over the entire dataset, yielding all sentences processed.
"""
for conll_file in self.dataset_path_iterator(file_path):
yield from self.sentence_iterator(conll_file)
@staticmethod
def dataset_path_iterator(file_path: str) -> Iterator[str]:
"""
An iterator returning file_paths in a directory
containing CONLL-formatted files.
"""
logger.info("Reading CONLL sentences from dataset files at: %s", file_path)
for root, _, files in list(os.walk(file_path)):
for data_file in files:
# These are a relic of the dataset pre-processing. Every
# file will be duplicated - one file called filename.gold_skel
# and one generated from the preprocessing called filename.gold_conll.
if not data_file.endswith("gold_conll"):
continue
yield os.path.join(root, data_file)
def dataset_document_iterator(self, file_path: str) -> Iterator[List[OntonotesSentence]]:
"""
An iterator over CONLL formatted files which yields documents, regardless
of the number of document annotations in a particular file. This is useful
for conll data which has been preprocessed, such as the preprocessing which
takes place for the 2012 CONLL Coreference Resolution task.
"""
with codecs.open(file_path, "r", encoding="utf8") as open_file:
conll_rows = []
document: List[OntonotesSentence] = []
for line in open_file:
line = line.strip()
if line != "" and not line.startswith("#"):
# Non-empty line. Collect the annotation.
conll_rows.append(line)
else:
if conll_rows:
document.append(self._conll_rows_to_sentence(conll_rows))
conll_rows = []
if line.startswith("#end document"):
yield document
document = []
if document:
# Collect any stragglers or files which might not
# have the '#end document' format for the end of the file.
yield document
def sentence_iterator(self, file_path: str) -> Iterator[OntonotesSentence]:
"""
An iterator over the sentences in an individual CONLL formatted file.
"""
for document in self.dataset_document_iterator(file_path):
for sentence in document:
yield sentence
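# Illustrative usage sketch (not part of the original module), assuming a
# CoNLL-formatted OntoNotes directory laid out as in the class docstring:
#
#     reader = Ontonotes()
#     for sentence in reader.dataset_iterator('conll-formatted-ontonotes-5.0/data/train'):
#         print(sentence.words, sentence.srl_frames)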
def _conll_rows_to_sentence(self, conll_rows: List[str]) -> OntonotesSentence:
document_id: str = None
sentence_id: int = None
# The words in the sentence.
sentence: List[str] = []
# The pos tags of the words in the sentence.
pos_tags: List[str] = []
# the pieces of the parse tree.
parse_pieces: List[str] = []
# The lemmatised form of the words in the sentence which
# have SRL or word sense information.
predicate_lemmas: List[str] = []
# The PropBank frameset ID of the predicate.
predicate_framenet_ids: List[str] = []
# The sense of the word, if available.
word_senses: List[float] = []
# The current speaker, if available.
speakers: List[str] = []
verbal_predicates: List[str] = []
span_labels: List[List[str]] = []
current_span_labels: List[str] = []
# Cluster id -> List of (start_index, end_index) spans.
clusters: DefaultDict[int, List[Tuple[int, int]]] = defaultdict(list)
# Cluster id -> List of start_indices which are open for this id.
coref_stacks: DefaultDict[int, List[int]] = defaultdict(list)
for index, row in enumerate(conll_rows):
conll_components = row.split()
document_id = conll_components[0]
sentence_id = int(conll_components[1])
word = conll_components[3]
pos_tag = conll_components[4]
parse_piece = conll_components[5]
# Replace brackets in text and pos tags
# with a different token for parse trees.
if pos_tag != "XX" and word != "XX":
if word == "(":
parse_word = "-LRB-"
elif word == ")":
parse_word = "-RRB-"
else:
parse_word = word
if pos_tag == "(":
pos_tag = "-LRB-"
if pos_tag == ")":
pos_tag = "-RRB-"
(left_brackets, right_hand_side) = parse_piece.split("*")
# only keep ')' if there are nested brackets with nothing in them.
right_brackets = right_hand_side.count(")") * ")"
parse_piece = f"{left_brackets} ({pos_tag} {parse_word}) {right_brackets}"
else:
# There are some bad annotations in the CONLL data.
# They contain no information, so to make this explicit,
# we just set the parse piece to be None which will result
# in the overall parse tree being None.
parse_piece = None
lemmatised_word = conll_components[6]
framenet_id = conll_components[7]
word_sense = conll_components[8]
speaker = conll_components[9]
if not span_labels:
# If this is the first word in the sentence, create
# empty lists to collect the NER and SRL BIO labels.
# We can't do this upfront, because we don't know how many
# components we are collecting, as a sentence can have
# variable numbers of SRL frames.
span_labels = [[] for _ in conll_components[10:-1]]
# Create variables representing the current label for each label
# sequence we are collecting.
current_span_labels = [None for _ in conll_components[10:-1]]
self._process_span_annotations_for_word(
conll_components[10:-1], span_labels, current_span_labels
)
# If any annotation marks this word as a verb predicate,
# we need to record its index. This also has the side effect
# of ordering the verbal predicates by their location in the
# sentence, automatically aligning them with the annotations.
word_is_verbal_predicate = any("(V" in x for x in conll_components[11:-1])
if word_is_verbal_predicate:
verbal_predicates.append(word)
self._process_coref_span_annotations_for_word(
conll_components[-1], index, clusters, coref_stacks
)
sentence.append(word)
pos_tags.append(pos_tag)
parse_pieces.append(parse_piece)
predicate_lemmas.append(lemmatised_word if lemmatised_word != "-" else None)
predicate_framenet_ids.append(framenet_id if framenet_id != "-" else None)
word_senses.append(float(word_sense) if word_sense != "-" else None)
speakers.append(speaker if speaker != "-" else None)
named_entities = span_labels[0]
srl_frames = [
(predicate, labels) for predicate, labels in zip(verbal_predicates, span_labels[1:])
]
if all(parse_pieces):
parse_tree = Tree.fromstring("".join(parse_pieces))
else:
parse_tree = None
coref_span_tuples: Set[TypedSpan] = {
(cluster_id, span) for cluster_id, span_list in clusters.items() for span in span_list
}
return OntonotesSentence(
document_id,
sentence_id,
sentence,
pos_tags,
parse_tree,
predicate_lemmas,
predicate_framenet_ids,
word_senses,
speakers,
named_entities,
srl_frames,
coref_span_tuples,
)
@staticmethod
def _process_coref_span_annotations_for_word(
label: str,
word_index: int,
clusters: DefaultDict[int, List[Tuple[int, int]]],
coref_stacks: DefaultDict[int, List[int]],
) -> None:
"""
For a given coref label, add it to a currently open span(s), complete a span(s) or
ignore it, if it is outside of all spans. This method mutates the clusters and coref_stacks
dictionaries.
# Parameters
label : `str`
The coref label for this word.
word_index : `int`
The word index into the sentence.
clusters : `DefaultDict[int, List[Tuple[int, int]]]`
A dictionary mapping cluster ids to lists of inclusive spans into the
sentence.
coref_stacks : `DefaultDict[int, List[int]]`
Stacks for each cluster id to hold the start indices of active spans (spans
which we are inside of when processing a given word). Spans with the same id
can be nested, which is why we collect these opening spans on a stack, e.g.:
[Greg, the baker who referred to [himself]_ID1 as 'the bread man']_ID1
"""
if label != "-":
for segment in label.split("|"):
# The conll representation of coref spans allows spans to
# overlap. If spans end or begin at the same word, they are
# separated by a "|".
if segment[0] == "(":
# The span begins at this word.
if segment[-1] == ")":
# The span begins and ends at this word (single word span).
cluster_id = int(segment[1:-1])
clusters[cluster_id].append((word_index, word_index))
else:
# The span is starting, so we record the index of the word.
cluster_id = int(segment[1:])
coref_stacks[cluster_id].append(word_index)
else:
# The span for this id is ending, but didn't start at this word.
# Retrieve the start index from the document state and
# add the span to the clusters for this id.
cluster_id = int(segment[:-1])
start = coref_stacks[cluster_id].pop()
clusters[cluster_id].append((start, word_index))
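    # --- Editor's note: a standalone re-implementation (not part of the
    # original source) of the coref-label parsing above. Labels like "(0", "0)"
    # and "(0)" open, close, or wrap single-word spans for cluster 0, and "|"
    # separates segments that begin or end at the same word (toy input below).
    @staticmethod
    def _demo_coref_labels():
        from collections import defaultdict
        clusters = defaultdict(list)      # cluster id -> inclusive (start, end) spans
        coref_stacks = defaultdict(list)  # cluster id -> start indices of open spans
        for word_index, label in enumerate(["(0", "-", "0)|(1)", "-"]):
            if label == "-":
                continue
            for segment in label.split("|"):
                if segment[0] == "(":
                    if segment[-1] == ")":  # single-word span
                        clusters[int(segment[1:-1])].append((word_index, word_index))
                    else:  # a span opens at this word
                        coref_stacks[int(segment[1:])].append(word_index)
                else:  # a span closes at this word
                    cluster_id = int(segment[:-1])
                    clusters[cluster_id].append((coref_stacks[cluster_id].pop(), word_index))
        print(dict(clusters))  # {0: [(0, 2)], 1: [(2, 2)]}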
@staticmethod
def _process_span_annotations_for_word(
annotations: List[str],
span_labels: List[List[str]],
current_span_labels: List[Optional[str]],
) -> None:
"""
Given a sequence of different label types for a single word and the current
span label we are inside, compute the BIO tag for each label and append to a list.
# Parameters
annotations : `List[str]`
A list of labels to compute BIO tags for.
span_labels : `List[List[str]]`
A list of lists, one for each annotation, to incrementally collect
the BIO tags for a sequence.
current_span_labels : `List[Optional[str]]`
The currently open span per annotation type, or `None` if there is no open span.
"""
for annotation_index, annotation in enumerate(annotations):
# strip all bracketing information to
# get the actual propbank label.
label = annotation.strip("()*")
if "(" in annotation:
# Entering into a span for a particular semantic role label.
# We append the label and set the current span for this annotation.
bio_label = "B-" + label
span_labels[annotation_index].append(bio_label)
current_span_labels[annotation_index] = label
elif current_span_labels[annotation_index] is not None:
# If there's no '(' token, but the current_span_label is not None,
# then we are inside a span.
bio_label = "I-" + current_span_labels[annotation_index]
span_labels[annotation_index].append(bio_label)
else:
# We're outside a span.
span_labels[annotation_index].append("O")
# Exiting a span, so we reset the current span label for this annotation.
if ")" in annotation:
current_span_labels[annotation_index] = None
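# --- Editor's note: the same star-bracket to BIO conversion as above (not part
# of the original source), reduced to a single annotation column of a 4-token
# sentence with hypothetical annotation strings.
def _demo_bio_column():
    tags, current = [], None  # current = label of the span we are inside
    for bit in ["(ARG0*", "*)", "(V*)", "*"]:
        label = bit.strip("()*")
        if "(" in bit:  # a span opens at this token
            tags.append("B-" + label)
            current = label
        elif current is not None:  # still inside an open span
            tags.append("I-" + current)
        else:  # outside any span
            tags.append("O")
        if ")" in bit:  # the span closes at this token
            current = None
    print(tags)  # ['B-ARG0', 'I-ARG0', 'B-V', 'O']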
def make_coref_instance(
sentences: List[List[str]],
max_span_width: int,
gold_clusters: Optional[List[List[Tuple[int, int]]]] = None,
max_sentences: Optional[int] = None,
remove_singleton_clusters: bool = True,
) -> dict:
"""
# Parameters
sentences : `List[List[str]]`, required.
A list of lists representing the tokenised words and sentences in the document.
token_indexers : `Dict[str, TokenIndexer]`
This is used to index the words in the document. See :class:`TokenIndexer`.
max_span_width : `int`, required.
The maximum width of candidate spans to consider.
gold_clusters : `Optional[List[List[Tuple[int, int]]]]`, optional (default = None)
A list of all clusters in the document, represented as word spans with absolute indices
in the entire document. Each cluster contains some number of spans, which can be nested
and overlap. If there are exact matches between clusters, they will be resolved
using `_canonicalize_clusters`.
wordpiece_modeling_tokenizer: `PretrainedTransformerTokenizer`, optional (default = None)
If not None, this dataset reader does subword tokenization using the supplied tokenizer
and distribute the labels to the resulting wordpieces. All the modeling will be based on
wordpieces. If this is set to `False` (default), the user is expected to use
`PretrainedTransformerMismatchedIndexer` and `PretrainedTransformerMismatchedEmbedder`,
and the modeling will be on the word-level.
max_sentences: int, optional (default = None)
The maximum number of sentences in each document to keep. By default keeps all sentences.
remove_singleton_clusters : `bool`, optional (default = True)
Some datasets contain clusters that are singletons (i.e. no coreferents). This option allows
the removal of them.
# Returns
An `Instance` containing the following `Fields`:
text : `TextField`
The text of the full document.
spans : `ListField[SpanField]`
A ListField containing the spans represented as `SpanFields`
with respect to the document text.
span_labels : `SequenceLabelField`, optional
The id of the cluster which each possible span belongs to, or -1 if it does
not belong to a cluster. As these labels have variable length (it depends on
how many spans we are considering), we represent this a as a `SequenceLabelField`
with respect to the spans `ListField`.
"""
if max_sentences is not None and len(sentences) > max_sentences:
sentences = sentences[:max_sentences]
total_length = sum(len(sentence) for sentence in sentences)
if gold_clusters is not None:
new_gold_clusters = []
for cluster in gold_clusters:
new_cluster = []
for mention in cluster:
if mention[1] < total_length:
new_cluster.append(mention)
if new_cluster:
new_gold_clusters.append(new_cluster)
gold_clusters = new_gold_clusters
flattened_sentences = [_normalize_word(word) for sentence in sentences for word in sentence]
flat_sentences_tokens = [word for word in flattened_sentences]
text_field = flat_sentences_tokens
cluster_dict = {}
if gold_clusters is not None:
gold_clusters = _canonicalize_clusters(gold_clusters)
if remove_singleton_clusters:
gold_clusters = [cluster for cluster in gold_clusters if len(cluster) > 1]
for cluster_id, cluster in enumerate(gold_clusters):
for mention in cluster:
cluster_dict[tuple(mention)] = cluster_id
spans: List = []
span_labels: Optional[List[int]] = [] if gold_clusters is not None else None
sentence_offset = 0
for sentence in sentences:
for start, end in enumerate_spans(
sentence, offset=sentence_offset, max_span_width=max_span_width
):
if span_labels is not None:
if (start, end) in cluster_dict:
span_labels.append(cluster_dict[(start, end)])
else:
span_labels.append(-1)
spans.append((start, end))
sentence_offset += len(sentence)
span_field = spans
# metadata: Dict[str, Any] = {"original_text": flattened_sentences}
# if gold_clusters is not None:
# metadata["clusters"] = gold_clusters
# metadata_field = MetadataField(metadata)
fields: Dict[str, List] = {
"text": text_field,
"spans": span_field,
'clusters': gold_clusters,
# "metadata": metadata_field,
}
if span_labels is not None:
fields["span_labels"] = span_labels
return fields
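# --- Editor's note: a self-contained sketch (not part of the original source)
# of the candidate-span enumeration make_coref_instance relies on; the real
# enumerate_spans is assumed to yield all inclusive spans of width up to
# max_span_width, shifted by the running sentence offset so that spans never
# cross sentence boundaries.
def _demo_enumerate_spans():
    def enumerate_spans(sentence, offset=0, max_span_width=2):
        for start in range(len(sentence)):
            for end in range(start, min(start + max_span_width, len(sentence))):
                yield start + offset, end + offset

    spans, sentence_offset = [], 0
    for sentence in [["I", "like", "tea"], ["Me", "too"]]:
        spans.extend(enumerate_spans(sentence, offset=sentence_offset))
        sentence_offset += len(sentence)
    print(spans)  # [(0, 0), (0, 1), (1, 1), (1, 2), (2, 2), (3, 3), (3, 4), (4, 4)]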
def _normalize_word(word):
if word in ("/.", "/?"):
return word[1:]
else:
return word
def _canonicalize_clusters(clusters: List[List[Tuple[int, int]]]) -> List[List[Tuple[int, int]]]:
"""
The data might include 2 annotated spans which are identical,
but have different ids. This checks all clusters for spans which are
identical, and if it finds any, merges the clusters containing the
identical spans.
"""
merged_clusters: List[Set[Tuple[int, int]]] = []
for cluster in clusters:
cluster_with_overlapping_mention = None
for mention in cluster:
# Look at clusters we have already processed to
# see if they contain a mention in the current
# cluster for comparison.
for cluster2 in merged_clusters:
if mention in cluster2:
# first cluster in merged clusters
# which contains this mention.
cluster_with_overlapping_mention = cluster2
break
# Already encountered overlap - no need to keep looking.
if cluster_with_overlapping_mention is not None:
break
if cluster_with_overlapping_mention is not None:
# Merge cluster we are currently processing into
# the cluster in the processed list.
cluster_with_overlapping_mention.update(cluster)
else:
merged_clusters.append(set(cluster))
return [list(c) for c in merged_clusters]
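# --- Editor's note: a toy run (not part of the original source) of the merge
# performed by _canonicalize_clusters: the first two clusters share the
# identical span (3, 4), so they collapse into one.
def _demo_canonicalize_clusters():
    clusters = [[(0, 1), (3, 4)], [(3, 4), (7, 9)], [(12, 12), (15, 16)]]
    print([sorted(c) for c in _canonicalize_clusters(clusters)])
    # [[(0, 1), (3, 4), (7, 9)], [(12, 12), (15, 16)]]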
================================================
FILE: hanlp/datasets/srl/ontonotes5/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-11-26 16:07
ONTONOTES5_HOME = 'https://catalog.ldc.upenn.edu/LDC2013T19/LDC2013T19.tgz#/ontonotes-release-5.0/data/'
CONLL12_HOME = ONTONOTES5_HOME + '../conll-2012/'
================================================
FILE: hanlp/datasets/srl/ontonotes5/_utils.py
================================================
#!/usr/bin/env python
import codecs
import collections
import glob
import json
import os
import re
import sys
from pprint import pprint
from typing import List, Dict, Union
from hanlp_common.io import eprint, save_json
from hanlp.common.transform import NormalizeToken
from hanlp.datasets.parsing.loaders._ctb_utils import remove_all_ec, convert_to_dependency
from hanlp.datasets.parsing.ptb import PTB_TOKEN_MAPPING
from hanlp.utils.io_util import merge_files, get_resource, pushd, run_cmd, read_tsv_as_sents, replace_ext, \
get_exitcode_stdout_stderr
from hanlp.utils.log_util import flash
BEGIN_DOCUMENT_REGEX = re.compile(r"#begin document \((.*)\); part (\d+)")
def flatten(l):
return [item for sublist in l for item in sublist]
def get_doc_key(doc_id, part):
return "{}_{}".format(doc_id, int(part))
class DocumentState(object):
def __init__(self):
self.doc_key = None
self.text = []
self.text_speakers = []
self.speakers = []
self.sentences = []
self.pos = []
self.lemma = []
self.pos_buffer = []
self.lemma_buffer = []
self.constituents = [] # {}
self.const_stack = []
self.const_buffer = []
self.ner = []
self.ner_stack = []
self.ner_buffer = []
self.srl = []
self.argument_stacks = []
self.argument_buffers = []
self.predicate_buffer = []
self.clusters = collections.defaultdict(list)
self.coref_stacks = collections.defaultdict(list)
def assert_empty(self):
assert self.doc_key is None
assert len(self.text) == 0
assert len(self.text_speakers) == 0
assert len(self.speakers) == 0
assert len(self.sentences) == 0
assert len(self.srl) == 0
assert len(self.predicate_buffer) == 0
assert len(self.argument_buffers) == 0
assert len(self.argument_stacks) == 0
assert len(self.constituents) == 0
assert len(self.const_stack) == 0
assert len(self.const_buffer) == 0
assert len(self.ner) == 0
assert len(self.lemma_buffer) == 0
assert len(self.pos_buffer) == 0
assert len(self.ner_stack) == 0
assert len(self.ner_buffer) == 0
assert len(self.coref_stacks) == 0
assert len(self.clusters) == 0
def assert_finalizable(self):
assert self.doc_key is not None
assert len(self.text) == 0
assert len(self.text_speakers) == 0
assert len(self.speakers) > 0
assert len(self.sentences) > 0
assert len(self.constituents) > 0
assert len(self.const_stack) == 0
assert len(self.ner_stack) == 0
assert len(self.predicate_buffer) == 0
assert all(len(s) == 0 for s in list(self.coref_stacks.values()))
def finalize_sentence(self):
self.sentences.append(tuple(self.text))
del self.text[:]
self.lemma.append(tuple(self.lemma_buffer))
del self.lemma_buffer[:]
self.pos.append(tuple(self.pos_buffer))
del self.pos_buffer[:]
self.speakers.append(tuple(self.text_speakers))
del self.text_speakers[:]
assert len(self.predicate_buffer) == len(self.argument_buffers)
self.srl.append([])
for pred, args in zip(self.predicate_buffer, self.argument_buffers):
for start, end, label in args:
self.srl[-1].append((pred, start, end, label))
self.predicate_buffer = []
self.argument_buffers = []
self.argument_stacks = []
self.constituents.append([c for c in self.const_buffer])
self.const_buffer = []
self.ner.append([c for c in self.ner_buffer])
self.ner_buffer = []
def finalize(self):
merged_clusters = []
for c1 in list(self.clusters.values()):
existing = None
for m in c1:
for c2 in merged_clusters:
if m in c2:
existing = c2
break
if existing is not None:
break
if existing is not None:
print("Merging clusters (shouldn't happen very often.)")
existing.update(c1)
else:
merged_clusters.append(set(c1))
merged_clusters = [list(c) for c in merged_clusters]
all_mentions = flatten(merged_clusters)
assert len(all_mentions) == len(set(all_mentions))
assert len(self.sentences) == len(self.srl)
assert len(self.sentences) == len(self.constituents)
assert len(self.sentences) == len(self.ner)
return {
"doc_key": self.doc_key,
"sentences": self.sentences,
"lemma": self.lemma,
"pos": self.pos,
"speakers": self.speakers,
"srl": self.srl,
"constituents": self.constituents,
"ner": self.ner,
"clusters": merged_clusters
}
def filter_data(input_json_file, output_json_file, doc_ids_file=None, annotation=None):
"""Filter OntoNotes5 data based on CoNLL2012 (coref) doc ids.
https://github.com/bcmi220/unisrl/blob/master/scripts/filter_conll2012_data.py
Args:
input_json_file: All documents.
output_json_file:
doc_ids_file:
Returns:
"""
assert doc_ids_file or annotation
doc_count = 0
sentence_count = 0
srl_count = 0
ner_count = 0
cluster_count = 0
word_count = 0
missing_count = 0
doc_ids = []
doc_ids_to_keys = collections.defaultdict(list)
filtered_examples = {}
ontonotes_root = os.path.abspath(os.path.join(os.path.dirname(input_json_file), *['..'] * 2))
language = os.path.basename(input_json_file).split('.')[1]
if doc_ids_file:
with open(doc_ids_file, "r") as f:
for line in f:
doc_id = line.strip().split("annotations/")[1]
doc_ids.append(doc_id)
doc_ids_to_keys[doc_id] = []
with codecs.open(input_json_file, "r", "utf8") as f:
for jsonline in f:
example = json.loads(jsonline)
doc_key = example["doc_key"]
dk_prefix = "_".join(doc_key.split("_")[:-1])
if doc_ids_file and dk_prefix not in doc_ids_to_keys:
continue
if annotation and not os.path.isfile(
os.path.join(ontonotes_root, 'data/files/data', language, 'annotations', dk_prefix) + annotation):
print(os.path.join(ontonotes_root, 'data/files/data', language, 'annotations', dk_prefix) + annotation)
missing_count += 1
continue
doc_ids_to_keys[dk_prefix].append(doc_key)
filtered_examples[doc_key] = example
sentences = example["sentences"]
word_count += sum([len(s) for s in sentences])
sentence_count += len(sentences)
srl_count += sum([len(srl) for srl in example["srl"]])
ner_count += sum([len(ner) for ner in example["ner"]])
coref = example["clusters"]
cluster_count += len(coref)
doc_count += 1
print(("Documents: {}\nSentences: {}\nWords: {}\nNER: {}, PAS: {}, Clusters: {}, No annotations: {}".format(
doc_count, sentence_count, word_count, ner_count, srl_count, cluster_count, missing_count)))
if doc_ids_file:
with codecs.open(output_json_file, "w", "utf8") as f:
for doc_id in doc_ids: # Arrange the files in order of id files
for key in doc_ids_to_keys[doc_id]:
f.write(json.dumps(filtered_examples[key], ensure_ascii=False))
f.write("\n")
else:
with codecs.open(output_json_file, "w", "utf8") as f:
for doc in filtered_examples.values():
f.write(json.dumps(doc, ensure_ascii=False))
f.write("\n")
def normalize_word(word, language):
if language == "arabic":
word = word[:word.find("#")]
if word == "/." or word == "/?":
return word[1:]
else:
return word
def handle_bit(word_index, bit, stack, spans, label_set):
asterisk_idx = bit.find("*")
if asterisk_idx >= 0:
open_parens = bit[:asterisk_idx]
close_parens = bit[asterisk_idx + 1:]
else:
open_parens = bit[:-1]
close_parens = bit[-1]
current_idx = open_parens.find("(")
while current_idx >= 0:
next_idx = open_parens.find("(", current_idx + 1)
if next_idx >= 0:
label = open_parens[current_idx + 1:next_idx]
else:
label = open_parens[current_idx + 1:]
label_set.add(label)
stack.append((word_index, label))
current_idx = next_idx
for c in close_parens:
try:
assert c == ")"
except AssertionError:
print(word_index, bit, spans, stack)
continue
open_index, label = stack.pop()
spans.append((open_index, word_index, label))
''' current_span = (open_index, word_index)
if current_span in spans:
spans[current_span] += "_" + label
else:
spans[current_span] = label
spans[current_span] = label '''
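# --- Editor's note: a toy run (not part of the original source) of handle_bit
# on the constituent column of a 3-token sentence. Each "(" pushes
# (word_index, label) onto the stack and each ")" pops one entry into an
# inclusive (start, end, label) span.
def _demo_handle_bit():
    stack, spans, labels = [], [], set()
    for word_index, bit in enumerate(["(TOP(S(NP*", "*)", "(VP*)))"]):
        handle_bit(word_index, bit, stack, spans, labels)
    print(spans)  # [(0, 1, 'NP'), (2, 2, 'VP'), (0, 2, 'S'), (0, 2, 'TOP')]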
def handle_line(line, document_state: DocumentState, language, labels, stats):
begin_document_match = re.match(BEGIN_DOCUMENT_REGEX, line)
if begin_document_match:
document_state.assert_empty()
document_state.doc_key = get_doc_key(begin_document_match.group(1), begin_document_match.group(2))
return None
elif line.startswith("#end document"):
document_state.assert_finalizable()
finalized_state = document_state.finalize()
stats["num_clusters"] += len(finalized_state["clusters"])
stats["num_mentions"] += sum(len(c) for c in finalized_state["clusters"])
# labels["{}_const_labels".format(language)].update(l for _, _, l in finalized_state["constituents"])
# labels["ner"].update(l for _, _, l in finalized_state["ner"])
return finalized_state
else:
row = line.split()
# Starting a new sentence.
if len(row) == 0:
stats["max_sent_len_{}".format(language)] = max(len(document_state.text),
stats["max_sent_len_{}".format(language)])
stats["num_sents_{}".format(language)] += 1
document_state.finalize_sentence()
return None
assert len(row) >= 12
doc_key = get_doc_key(row[0], row[1])
word = normalize_word(row[3], language)
pos = row[4]
parse = row[5]
lemma = row[6]
predicate_sense = row[7]
speaker = row[9]
ner = row[10]
args = row[11:-1]
coref = row[-1]
word_index = len(document_state.text) + sum(len(s) for s in document_state.sentences)
document_state.text.append(word)
document_state.text_speakers.append(speaker)
document_state.pos_buffer.append(pos)
document_state.lemma_buffer.append(lemma)
handle_bit(word_index, parse, document_state.const_stack, document_state.const_buffer, labels["categories"])
handle_bit(word_index, ner, document_state.ner_stack, document_state.ner_buffer, labels["ner"])
if len(document_state.argument_stacks) < len(args):
document_state.argument_stacks = [[] for _ in args]
document_state.argument_buffers = [[] for _ in args]
for i, arg in enumerate(args):
handle_bit(word_index, arg, document_state.argument_stacks[i], document_state.argument_buffers[i],
labels["srl"])
if predicate_sense != "-":
document_state.predicate_buffer.append(word_index)
if coref != "-":
for segment in coref.split("|"):
if segment[0] == "(":
if segment[-1] == ")":
cluster_id = int(segment[1:-1])
document_state.clusters[cluster_id].append((word_index, word_index))
else:
cluster_id = int(segment[1:])
document_state.coref_stacks[cluster_id].append(word_index)
else:
cluster_id = int(segment[:-1])
start = document_state.coref_stacks[cluster_id].pop()
document_state.clusters[cluster_id].append((start, word_index))
return None
def ontonotes_document_generator(input_path, language, labels, stats):
with open(input_path, "r") as input_file:
document_state = DocumentState()
for line in input_file.readlines():
document = handle_line(line, document_state, language, labels, stats)
if document is not None:
yield document
document_state = DocumentState()
def convert_to_jsonlines(input_path, output_path, language, labels=None, stats=None):
if labels is None:
labels = collections.defaultdict(set)
if stats is None:
stats = collections.defaultdict(int)
count = 0
with open(output_path, "w") as output_file:
for document in ontonotes_document_generator(input_path, language, labels, stats):
output_file.write(json.dumps(document, ensure_ascii=False))
output_file.write("\n")
count += 1
return labels, stats
def make_ontonotes_jsonlines(conll12_ontonotes_path, output_path, languages=None):
if languages is None:
languages = ['english', 'chinese', 'arabic']
for language in languages:
make_ontonotes_language_jsonlines(conll12_ontonotes_path, output_path, language)
def make_ontonotes_language_jsonlines(conll12_ontonotes_path, output_path=None, language='english'):
conll12_ontonotes_path = get_resource(conll12_ontonotes_path)
if output_path is None:
output_path = os.path.dirname(conll12_ontonotes_path)
for split in ['train', 'development', 'test']:
pattern = f'{conll12_ontonotes_path}/data/{split}/data/{language}/annotations/*/*/*/*gold_conll'
files = sorted(glob.glob(pattern, recursive=True))
assert files, f'No gold_conll files found in {pattern}'
version = os.path.basename(files[0]).split('.')[-1].split('_')[0]
if version.startswith('v'):
assert all([version in os.path.basename(f) for f in files])
else:
version = 'v5'
lang_dir = f'{output_path}/{language}'
if split == 'conll-2012-test':
split = 'test'
full_file = f'{lang_dir}/{split}.{language}.{version}_gold_conll'
os.makedirs(lang_dir, exist_ok=True)
print(f'Merging {len(files)} files to {full_file}')
merge_files(files, full_file)
v5_json_file = full_file.replace(f'.{version}_gold_conll', f'.{version}.jsonlines')
print(f'Converting CoNLL file {full_file} to json file {v5_json_file}')
labels, stats = convert_to_jsonlines(full_file, v5_json_file, language)
print('Labels:')
pprint(labels)
print('Statistics:')
pprint(stats)
conll12_json_file = f'{lang_dir}/{split}.{language}.conll12.jsonlines'
print(f'Applying CoNLL 12 official splits on {v5_json_file} to {conll12_json_file}')
id_file = get_resource(f'https://file.hankcs.com/research/emnlp2021/conll.cemantix.org.zip#2012/download/ids/'
f'{language}/coref/{split}.id')
filter_data(v5_json_file, conll12_json_file, id_file)
def ensure_python_points_to_python2():
exitcode, out, version = get_exitcode_stdout_stderr('python --version')
if not version:
version = out
if not version.startswith('Python 2'):
raise EnvironmentError(f'Your python command needs to be Python2, not {version.strip()}. Try:\n\n\t'
'ln -sf "$(which python2)" "$(which python)"')
def make_gold_conll(ontonotes_path, language):
ensure_python_points_to_python2()
ontonotes_path = os.path.abspath(get_resource(ontonotes_path))
to_conll = get_resource(
'https://gist.githubusercontent.com/hankcs/46b9137016c769e4b6137104daf43a92/raw/66369de6c24b5ec47696ae307591f0d72c6f3f02/ontonotes_to_conll.sh')
to_conll = os.path.abspath(to_conll)
# shutil.rmtree(os.path.join(ontonotes_path, 'conll-2012'), ignore_errors=True)
with pushd(ontonotes_path):
try:
flash(f'Converting [blue]{language}[/blue] to CoNLL format, '
f'this might take half an hour [blink][yellow]...[/yellow][/blink]')
run_cmd(f'bash {to_conll} {ontonotes_path} {language}')
flash('')
except RuntimeError as e:
flash(f'[red]Failed[/red] to convert {language} of {ontonotes_path} to CoNLL. See exceptions for detail')
raise e
def convert_jsonlines_to_IOBES(json_file, output_file=None, doc_level_offset=True, normalize_token=False):
json_file = get_resource(json_file)
if not output_file:
output_file = os.path.splitext(json_file)[0] + '.ner.tsv'
if normalize_token:
transform = NormalizeToken(PTB_TOKEN_MAPPING, 'token')
with open(json_file) as src, open(output_file, 'w', encoding='utf-8') as out:
for line in src:
doc = json.loads(line)
offset = 0
for sent, ner in zip(doc['sentences'], doc['ner']):
if normalize_token:
sent = transform({'token': sent})['token']
tags = ['O'] * len(sent)
for start, end, label in ner:
if doc_level_offset:
start -= offset
end -= offset
if start == end:
tags[start] = 'S-' + label
else:
tags[start] = 'B-' + label
for i in range(start + 1, end + 1):
tags[i] = 'I-' + label
tags[end] = 'E-' + label
offset += len(sent)
for token, tag in zip(sent, tags):
out.write(f'{token}\t{tag}\n')
out.write('\n')
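# --- Editor's note: a standalone sketch (not part of the original source) of
# the IOBES tagging performed above, for a toy sentence with inclusive
# (start, end, label) NER spans and no document-level offset.
def _demo_iobes_tags():
    sent = ["Barack", "Obama", "visited", "Paris"]
    tags = ["O"] * len(sent)
    for start, end, label in [(0, 1, "PERSON"), (3, 3, "GPE")]:
        if start == end:
            tags[start] = "S-" + label
        else:
            tags[start] = "B-" + label
            for i in range(start + 1, end + 1):
                tags[i] = "I-" + label
            tags[end] = "E-" + label
    print(tags)  # ['B-PERSON', 'E-PERSON', 'O', 'S-GPE']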
def make_ner_tsv_if_necessary(json_file):
json_file = get_resource(json_file)
output_file = os.path.splitext(json_file)[0] + '.ner.tsv'
if not os.path.isfile(output_file):
convert_jsonlines_to_IOBES(json_file, output_file)
return output_file
def batch_make_ner_tsv_if_necessary(json_files):
for each in json_files:
make_ner_tsv_if_necessary(each)
def make_pos_tsv_if_necessary(json_file):
json_file = get_resource(json_file)
output_file = os.path.splitext(json_file)[0] + '.pos.tsv'
if not os.path.isfile(output_file):
make_pos_tsv(json_file, output_file)
return output_file
def make_pos_tsv(json_file, output_file):
with open(json_file) as src, open(output_file, 'w', encoding='utf-8') as out:
for line in src:
doc = json.loads(line)
for sent, pos in zip(doc['sentences'], doc['pos']):
for token, tag in zip(sent, pos):
out.write(f'{token}\t{tag}\n')
out.write('\n')
def batch_make_pos_tsv_if_necessary(json_files):
for each in json_files:
make_pos_tsv_if_necessary(each)
def make_con_txt(conll_file, output_file):
with open(output_file, 'w') as out:
for sent in read_tsv_as_sents(conll_file):
tree = []
pos_per_sent = []
for cell in sent:
if cell[0] == '#begin' or cell[0] == '#end':
continue
if len(cell) < 8:
print(cell)
filename, sentence_id, token_id, word, POS, parse, framefile, roleset, *_ = cell
parse = parse.replace('*', f'({POS} {word})')
tree.append(parse)
pos_per_sent.append(POS)
bracketed = ' '.join(tree)
out.write(bracketed)
out.write('\n')
def make_con_txt_if_necessary(json_file):
json_file = get_resource(json_file)
output_file = os.path.splitext(json_file)[0] + '.con.txt'
if not os.path.isfile(output_file):
make_con_txt(json_file, output_file)
return output_file
def batch_make_con_txt_if_necessary(json_files):
for each in json_files:
make_con_txt_if_necessary(each)
def batch_remove_empty_category_if_necessary(json_files):
for each in json_files:
src = get_resource(each)
dst = replace_ext(src, '.noempty.txt')
if not os.path.isfile(dst):
remove_all_ec(src)
def make_dep_conllx(con_txt_file, output_file, language='en'):
con_txt_file = get_resource(con_txt_file)
convert_to_dependency(con_txt_file, output_file, language=language)
def make_dep_conllx_if_necessary(con_txt_file: str, language='en'):
con_txt_file = get_resource(con_txt_file)
output_file = con_txt_file.replace('.con.txt', '.dep.conllx', 1)
if os.path.isfile(output_file):
return
make_dep_conllx(con_txt_file, output_file, language)
def batch_make_dep_conllx_if_necessary(con_txt_files, language='en'):
for each in con_txt_files:
make_dep_conllx_if_necessary(each, language)
def make_ner_json_if_necessary(json_file):
json_file = get_resource(json_file)
output_file = os.path.splitext(json_file)[0] + '.ner.jsonlines'
if not os.path.isfile(output_file):
make_ner_json(json_file, output_file)
return output_file
def batch_make_ner_json_if_necessary(json_files):
for each in json_files:
make_ner_json_if_necessary(each)
def make_ner_json(json_file, output_file):
filter_data(json_file, output_file, doc_ids_file=None, annotation='.name')
def make_srl_json_if_necessary(json_file):
json_file = get_resource(json_file)
output_file = os.path.splitext(json_file)[0] + '.srl.jsonlines'
if not os.path.isfile(output_file):
make_srl_json(json_file, output_file)
return output_file
def make_coref_json_if_necessary(json_file):
json_file = get_resource(json_file)
output_file = os.path.splitext(json_file)[0] + '.coref.jsonlines'
if not os.path.isfile(output_file):
make_coref_json(json_file, output_file)
return output_file
def batch_make_srl_json_if_necessary(json_files):
for each in json_files:
make_srl_json_if_necessary(each)
def make_srl_json(json_file, output_file):
filter_data(json_file, output_file, doc_ids_file=None, annotation='.prop')
def batch_make_coref_json_if_necessary(json_files):
for each in json_files:
make_coref_json_if_necessary(each)
def make_coref_json(json_file, output_file):
filter_data(json_file, output_file, doc_ids_file=None, annotation='.coref')
def load_raw_text(onf_file) -> List[str]:
with open(onf_file) as src:
sents = []
expect_sent = False
expect_sent_line = False
sent_parts = []
for line in src:
line = line.strip()
if line == 'Plain sentence:':
expect_sent_line = True
elif expect_sent_line:
expect_sent_line = False
expect_sent = True
continue
elif expect_sent:
if not line:
sents.append(' '.join(sent_parts))
expect_sent = False
sent_parts = []
else:
sent_parts.append(line)
return sents
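# --- Editor's note: a toy run (not part of the original source) of the state
# machine in load_raw_text, on a hypothetical .onf excerpt: the line right
# after 'Plain sentence:' (the dashed underline) is skipped, then lines are
# joined until the first blank line.
def _demo_load_raw_text():
    onf_lines = ["Plain sentence:", "---------------",
                 "The cat", "sat .", "", "Treebanked sentence:"]
    sents, sent_parts, expect_sent, expect_sent_line = [], [], False, False
    for line in (each.strip() for each in onf_lines):
        if line == 'Plain sentence:':
            expect_sent_line = True
        elif expect_sent_line:
            expect_sent_line, expect_sent = False, True
        elif expect_sent:
            if not line:
                sents.append(' '.join(sent_parts))
                expect_sent, sent_parts = False, []
            else:
                sent_parts.append(line)
    print(sents)  # ['The cat sat .']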
def batch_load_raw_text(root: str) -> Dict[str, List[str]]:
onf_files = sorted(glob.glob(os.path.join(root, '**/*.onf'), recursive=True))
sents = dict()
for path in onf_files:
filename = path.split('annotations/')[1][:-len('.onf')]
sents[filename] = load_raw_text(path)
return sents
def make_raw_text_if_necessary(home: str):
home = get_resource(home)
jsonpath = os.path.join(home, 'text.jsonlines')
if os.path.isfile(jsonpath):
return
sents = batch_load_raw_text(home)
save_json(sents, jsonpath)
class RestoreToken(NormalizeToken):
def __init__(self, src: str, mapper: Union[str, dict] = None, dst: str = None) -> None:
if not mapper:
mapper = {
'/-': '-',
'/.': '.',
}
super().__init__(mapper, src, dst)
def __call__(self, sample: dict) -> dict:
src = sample[self.src]
src = [[self.convert(y) for y in x] for x in src]
sample[self.dst] = src
return sample
def main():
if len(sys.argv) != 3:
eprint('2 arguments required: ontonotes_path output_path')
exit(1)
ontonotes_path = sys.argv[1]
output_path = sys.argv[2]
make_ontonotes_jsonlines(ontonotes_path, output_path)
if __name__ == "__main__":
main()
================================================
FILE: hanlp/datasets/srl/ontonotes5/chinese.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-11-26 16:07
import os
from urllib.error import HTTPError
import shutil
from hanlp.datasets.srl.ontonotes5 import ONTONOTES5_HOME, CONLL12_HOME
from hanlp.datasets.srl.ontonotes5._utils import make_gold_conll, make_ontonotes_language_jsonlines, \
batch_make_ner_tsv_if_necessary, batch_make_pos_tsv_if_necessary, batch_make_con_txt_if_necessary, \
batch_make_dep_conllx_if_necessary
from hanlp.utils.io_util import get_resource, path_from_url
from hanlp.utils.log_util import cprint, flash
_ONTONOTES5_CHINESE_HOME = ONTONOTES5_HOME + 'files/data/chinese/'
_ONTONOTES5_CONLL12_CHINESE_HOME = CONLL12_HOME + 'chinese/'
ONTONOTES5_CONLL12_CHINESE_TRAIN = _ONTONOTES5_CONLL12_CHINESE_HOME + 'train.chinese.conll12.jsonlines'
'''Training set of OntoNotes5 used in CoNLL12 (:cite:`pradhan-etal-2012-conll`).'''
ONTONOTES5_CONLL12_CHINESE_DEV = _ONTONOTES5_CONLL12_CHINESE_HOME + 'development.chinese.conll12.jsonlines'
'''Dev set of OntoNotes5 used in CoNLL12 (:cite:`pradhan-etal-2012-conll`).'''
ONTONOTES5_CONLL12_CHINESE_TEST = _ONTONOTES5_CONLL12_CHINESE_HOME + 'test.chinese.conll12.jsonlines'
'''Test set of OntoNotes5 used in CoNLL12 (:cite:`pradhan-etal-2012-conll`).'''
ONTONOTES5_CONLL12_NER_CHINESE_TRAIN = _ONTONOTES5_CONLL12_CHINESE_HOME + 'train.chinese.conll12.ner.tsv'
'''Training set of OntoNotes5 used in CoNLL12 (:cite:`pradhan-etal-2012-conll`).'''
ONTONOTES5_CONLL12_NER_CHINESE_DEV = _ONTONOTES5_CONLL12_CHINESE_HOME + 'development.chinese.conll12.ner.tsv'
'''Dev set of OntoNotes5 used in CoNLL12 (:cite:`pradhan-etal-2012-conll`).'''
ONTONOTES5_CONLL12_NER_CHINESE_TEST = _ONTONOTES5_CONLL12_CHINESE_HOME + 'test.chinese.conll12.ner.tsv'
'''Test set of OntoNotes5 used in CoNLL12 (:cite:`pradhan-etal-2012-conll`).'''
ONTONOTES5_CHINESE_TRAIN = _ONTONOTES5_CONLL12_CHINESE_HOME + 'train.chinese.v4.jsonlines'
ONTONOTES5_CHINESE_DEV = _ONTONOTES5_CONLL12_CHINESE_HOME + 'development.chinese.v4.jsonlines'
ONTONOTES5_CHINESE_TEST = _ONTONOTES5_CONLL12_CHINESE_HOME + 'test.chinese.v4.jsonlines'
ONTONOTES5_CONLL_CHINESE_TRAIN = _ONTONOTES5_CONLL12_CHINESE_HOME + 'train.chinese.v4_gold_conll'
ONTONOTES5_CONLL_CHINESE_DEV = _ONTONOTES5_CONLL12_CHINESE_HOME + 'development.chinese.v4_gold_conll'
ONTONOTES5_CONLL_CHINESE_TEST = _ONTONOTES5_CONLL12_CHINESE_HOME + 'test.chinese.v4_gold_conll'
ONTONOTES5_POS_CHINESE_TRAIN = _ONTONOTES5_CONLL12_CHINESE_HOME + 'train.chinese.v4.pos.tsv'
ONTONOTES5_POS_CHINESE_DEV = _ONTONOTES5_CONLL12_CHINESE_HOME + 'development.chinese.v4.pos.tsv'
ONTONOTES5_POS_CHINESE_TEST = _ONTONOTES5_CONLL12_CHINESE_HOME + 'test.chinese.v4.pos.tsv'
ONTONOTES5_CON_CHINESE_TRAIN = _ONTONOTES5_CONLL12_CHINESE_HOME + 'train.chinese.con.txt'
ONTONOTES5_CON_CHINESE_DEV = _ONTONOTES5_CONLL12_CHINESE_HOME + 'development.chinese.con.txt'
ONTONOTES5_CON_CHINESE_TEST = _ONTONOTES5_CONLL12_CHINESE_HOME + 'test.chinese.con.txt'
ONTONOTES5_DEP_CHINESE_TRAIN = _ONTONOTES5_CONLL12_CHINESE_HOME + 'train.chinese.dep.conllx'
ONTONOTES5_DEP_CHINESE_DEV = _ONTONOTES5_CONLL12_CHINESE_HOME + 'development.chinese.dep.conllx'
ONTONOTES5_DEP_CHINESE_TEST = _ONTONOTES5_CONLL12_CHINESE_HOME + 'test.chinese.dep.conllx'
# ONTONOTES5_CON_CHINESE_NOEC_TRAIN = _ONTONOTES5_CONLL12_CHINESE_HOME + 'train.chinese.con.noempty.txt'
# ONTONOTES5_CON_CHINESE_NOEC_DEV = _ONTONOTES5_CONLL12_CHINESE_HOME + 'development.chinese.con.noempty.txt'
# ONTONOTES5_CON_CHINESE_NOEC_TEST = _ONTONOTES5_CONLL12_CHINESE_HOME + 'test.chinese.con.noempty.txt'
ONTONOTES5_NER_CHINESE_TRAIN = _ONTONOTES5_CONLL12_CHINESE_HOME + 'train.chinese.v4.ner.tsv'
ONTONOTES5_NER_CHINESE_DEV = _ONTONOTES5_CONLL12_CHINESE_HOME + 'development.chinese.v4.ner.tsv'
ONTONOTES5_NER_CHINESE_TEST = _ONTONOTES5_CONLL12_CHINESE_HOME + 'test.chinese.v4.ner.tsv'
try:
get_resource(ONTONOTES5_HOME, verbose=False)
except HTTPError:
intended_file_path = path_from_url(ONTONOTES5_HOME)
cprint('Ontonotes 5.0 is a [red][bold]copyrighted[/bold][/red] dataset owned by LDC which we cannot re-distribute. '
f'Please apply for a licence from LDC (https://catalog.ldc.upenn.edu/LDC2016T13) '
f'then download it to {intended_file_path}')
cprint('Luckily, an [red]unofficial[/red] Chinese version is provided on GitHub '
'which will be used for demonstration purposes.')
unofficial_chinese = get_resource('https://github.com/GuocaiL/Coref_Resolution/archive/master.zip#data/')
intended_home, _ = os.path.splitext(intended_file_path)
intended_home = os.path.join(os.path.dirname(intended_home), 'ontonotes-release-5.0')
intended_chinese = f'{intended_home}/data/files/data/chinese/'
# print(os.path.dirname(intended_chinese))
# print(unofficial_chinese)
# print(intended_chinese)
for folder in ['annotations', 'metadata']:
flash(f'Copying {unofficial_chinese}{folder} to {intended_chinese}{folder} [blink][yellow]...[/yellow][/blink]')
shutil.copytree(f'{unofficial_chinese}{folder}', f'{intended_chinese}{folder}')
flash('')
try:
get_resource(ONTONOTES5_CONLL12_CHINESE_TRAIN, verbose=False)
except HTTPError:
make_gold_conll(ONTONOTES5_HOME + '..', 'chinese')
make_ontonotes_language_jsonlines(CONLL12_HOME + 'v4', language='chinese')
batch_make_ner_tsv_if_necessary(
[ONTONOTES5_CONLL12_CHINESE_TRAIN, ONTONOTES5_CONLL12_CHINESE_DEV, ONTONOTES5_CONLL12_CHINESE_TEST])
batch_make_ner_tsv_if_necessary(
[ONTONOTES5_CHINESE_TRAIN, ONTONOTES5_CHINESE_DEV, ONTONOTES5_CHINESE_TEST])
batch_make_pos_tsv_if_necessary(
[ONTONOTES5_CHINESE_TRAIN, ONTONOTES5_CHINESE_DEV, ONTONOTES5_CHINESE_TEST])
batch_make_con_txt_if_necessary(
[ONTONOTES5_CONLL_CHINESE_TRAIN, ONTONOTES5_CONLL_CHINESE_DEV, ONTONOTES5_CONLL_CHINESE_TEST])
batch_make_dep_conllx_if_necessary(
[ONTONOTES5_CON_CHINESE_TRAIN, ONTONOTES5_CON_CHINESE_DEV, ONTONOTES5_CON_CHINESE_TEST], language='zh')
# batch_remove_empty_category_if_necessary(
# [ONTONOTES5_CON_CHINESE_TRAIN, ONTONOTES5_CON_CHINESE_DEV, ONTONOTES5_CON_CHINESE_TEST])
================================================
FILE: hanlp/datasets/srl/ontonotes5/english.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-25 18:48
from urllib.error import HTTPError
from hanlp.datasets.srl.ontonotes5 import ONTONOTES5_HOME, CONLL12_HOME
from hanlp.datasets.srl.ontonotes5._utils import make_gold_conll, make_ontonotes_language_jsonlines, \
batch_make_ner_tsv_if_necessary, batch_make_pos_tsv_if_necessary, batch_make_con_txt_if_necessary, \
batch_make_dep_conllx_if_necessary
from hanlp.utils.io_util import get_resource, path_from_url
from hanlp.utils.log_util import cprint
_ONTONOTES5_ENGLISH_HOME = ONTONOTES5_HOME + 'files/data/english/'
_ONTONOTES5_CONLL12_ENGLISH_HOME = CONLL12_HOME + 'english/'
ONTONOTES5_CONLL12_ENGLISH_TRAIN = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'train.english.conll12.jsonlines'
'''Training set of English OntoNotes5 used in CoNLL12 (:cite:`pradhan-etal-2012-conll`).'''
ONTONOTES5_CONLL12_ENGLISH_DEV = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'development.english.conll12.jsonlines'
'''Dev set of English OntoNotes5 used in CoNLL12 (:cite:`pradhan-etal-2012-conll`).'''
ONTONOTES5_CONLL12_ENGLISH_TEST = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'test.english.conll12.jsonlines'
'''Test set of English OntoNotes5 used in CoNLL12 (:cite:`pradhan-etal-2012-conll`).'''
ONTONOTES5_ENGLISH_TRAIN = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'train.english.v4.jsonlines'
ONTONOTES5_ENGLISH_DEV = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'development.english.v4.jsonlines'
ONTONOTES5_ENGLISH_TEST = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'test.english.v4.jsonlines'
ONTONOTES5_CONLL_ENGLISH_TRAIN = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'train.english.v4_gold_conll'
ONTONOTES5_CONLL_ENGLISH_DEV = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'development.english.v4_gold_conll'
ONTONOTES5_CONLL_ENGLISH_TEST = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'test.english.v4_gold_conll'
ONTONOTES5_POS_ENGLISH_TRAIN = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'train.english.v4.pos.tsv'
ONTONOTES5_POS_ENGLISH_DEV = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'development.english.v4.pos.tsv'
ONTONOTES5_POS_ENGLISH_TEST = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'test.english.v4.pos.tsv'
ONTONOTES5_CON_ENGLISH_TRAIN = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'train.english.con.txt'
ONTONOTES5_CON_ENGLISH_DEV = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'development.english.con.txt'
ONTONOTES5_CON_ENGLISH_TEST = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'test.english.con.txt'
ONTONOTES5_DEP_ENGLISH_TRAIN = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'train.english.dep.conllx'
ONTONOTES5_DEP_ENGLISH_DEV = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'development.english.dep.conllx'
ONTONOTES5_DEP_ENGLISH_TEST = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'test.english.dep.conllx'
# ONTONOTES5_CON_ENGLISH_NOEC_TRAIN = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'train.english.con.noempty.txt'
# ONTONOTES5_CON_ENGLISH_NOEC_DEV = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'development.english.con.noempty.txt'
# ONTONOTES5_CON_ENGLISH_NOEC_TEST = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'test.english.con.noempty.txt'
ONTONOTES5_CONLL12_NER_ENGLISH_TRAIN = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'train.english.conll12.ner.tsv'
'''Training set of English OntoNotes5 used in CoNLL12 (:cite:`pradhan-etal-2012-conll`).'''
ONTONOTES5_CONLL12_NER_ENGLISH_DEV = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'development.english.conll12.ner.tsv'
'''Dev set of English OntoNotes5 used in CoNLL12 (:cite:`pradhan-etal-2012-conll`).'''
ONTONOTES5_CONLL12_NER_ENGLISH_TEST = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'test.english.conll12.ner.tsv'
'''Test set of English OntoNotes5 used in CoNLL12 (:cite:`pradhan-etal-2012-conll`).'''
ONTONOTES5_NER_ENGLISH_TRAIN = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'train.english.v4.ner.tsv'
ONTONOTES5_NER_ENGLISH_DEV = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'development.english.v4.ner.tsv'
ONTONOTES5_NER_ENGLISH_TEST = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'test.english.v4.ner.tsv'
try:
get_resource(ONTONOTES5_HOME, verbose=False)
except HTTPError:
intended_file_path = path_from_url(ONTONOTES5_HOME)
cprint('Ontonotes 5.0 is a [red][bold]copyrighted[/bold][/red] dataset owned by LDC which we cannot re-distribute. '
f'Please apply for a licence from LDC (https://catalog.ldc.upenn.edu/LDC2016T13) '
f'then download it to {intended_file_path}')
exit(1)
try:
get_resource(ONTONOTES5_CONLL12_ENGLISH_TRAIN, verbose=False)
except HTTPError:
make_gold_conll(ONTONOTES5_HOME + '..', 'english')
make_ontonotes_language_jsonlines(CONLL12_HOME + 'v4', language='english')
batch_make_ner_tsv_if_necessary(
[ONTONOTES5_CONLL12_ENGLISH_TRAIN, ONTONOTES5_CONLL12_ENGLISH_DEV, ONTONOTES5_CONLL12_ENGLISH_TEST])
batch_make_ner_tsv_if_necessary(
[ONTONOTES5_ENGLISH_TRAIN, ONTONOTES5_ENGLISH_DEV, ONTONOTES5_ENGLISH_TEST])
batch_make_pos_tsv_if_necessary(
[ONTONOTES5_ENGLISH_TRAIN, ONTONOTES5_ENGLISH_DEV, ONTONOTES5_ENGLISH_TEST])
batch_make_con_txt_if_necessary(
[ONTONOTES5_CONLL_ENGLISH_TRAIN, ONTONOTES5_CONLL_ENGLISH_DEV, ONTONOTES5_CONLL_ENGLISH_TEST])
batch_make_dep_conllx_if_necessary(
[ONTONOTES5_CON_ENGLISH_TRAIN, ONTONOTES5_CON_ENGLISH_DEV, ONTONOTES5_CON_ENGLISH_TEST])
# batch_remove_empty_category_if_necessary(
# [ONTONOTES5_CON_ENGLISH_TRAIN, ONTONOTES5_CON_ENGLISH_DEV, ONTONOTES5_CON_ENGLISH_TEST])
================================================
FILE: hanlp/datasets/sts/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-05-20 16:25
================================================
FILE: hanlp/datasets/sts/stsb.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-05-20 16:25
from typing import Union, List, Callable
from hanlp.common.dataset import TransformableDataset
from hanlp.utils.io_util import read_cells
STS_B_TRAIN = 'http://ixa2.si.ehu.es/stswiki/images/4/48/Stsbenchmark.tar.gz#sts-train.csv'
STS_B_DEV = 'http://ixa2.si.ehu.es/stswiki/images/4/48/Stsbenchmark.tar.gz#sts-dev.csv'
STS_B_TEST = 'http://ixa2.si.ehu.es/stswiki/images/4/48/Stsbenchmark.tar.gz#sts-test.csv'
class SemanticTextualSimilarityDataset(TransformableDataset):
def __init__(self,
data: Union[str, List],
sent_a_col,
sent_b_col,
similarity_col,
delimiter='auto',
transform: Union[Callable, List] = None,
cache=None,
generate_idx=None) -> None:
self.delimiter = delimiter
self.similarity_col = similarity_col
self.sent_b_col = sent_b_col
self.sent_a_col = sent_a_col
super().__init__(data, transform, cache, generate_idx)
def load_file(self, filepath: str):
for i, cells in enumerate(read_cells(filepath, strip=True, delimiter=self.delimiter)):
yield {
'sent_a': cells[self.sent_a_col],
'sent_b': cells[self.sent_b_col],
'similarity': float(cells[self.similarity_col])
}
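# --- Editor's note: hypothetical usage (not part of the original source). In
# the standard STS-B csv layout, the tab-separated columns are
# genre/file/year/id/score/sentence1/sentence2, so the score sits in column 4
# and the sentence pair in columns 5 and 6 (0-indexed).
def _demo_stsb():
    dataset = SemanticTextualSimilarityDataset(STS_B_DEV, sent_a_col=5, sent_b_col=6, similarity_col=4)
    print(dataset[0])  # e.g. {'sent_a': '...', 'sent_b': '...', 'similarity': 5.0}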
================================================
FILE: hanlp/datasets/tokenization/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-08-01 12:33
================================================
FILE: hanlp/datasets/tokenization/ctb6.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 22:19
_CTB6_CWS_HOME = 'http://file.hankcs.com/corpus/ctb6_cws.zip'
CTB6_CWS_TRAIN = _CTB6_CWS_HOME + '#train.txt'
'''CTB6 training set.'''
CTB6_CWS_DEV = _CTB6_CWS_HOME + '#dev.txt'
'''CTB6 dev set.'''
CTB6_CWS_TEST = _CTB6_CWS_HOME + '#test.txt'
'''CTB6 test set.'''
================================================
FILE: hanlp/datasets/tokenization/loaders/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-12-28 19:06
================================================
FILE: hanlp/datasets/tokenization/loaders/chunking_dataset.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-06-03 18:50
from typing import Union, List, Callable
from hanlp.common.dataset import TransformableDataset
from hanlp.utils.io_util import get_resource
from hanlp.utils.span_util import bmes_of
from hanlp.utils.string_util import ispunct
class ChunkingDataset(TransformableDataset):
def __init__(self, data: Union[str, List], transform: Union[Callable, List] = None, cache=None,
generate_idx=None, max_seq_len=None, sent_delimiter=None) -> None:
if not sent_delimiter:
sent_delimiter = lambda x: ispunct(x)
elif isinstance(sent_delimiter, str):
delimiters = set(sent_delimiter)  # bind a fresh name: the lambda below must not capture the variable it is assigned to
sent_delimiter = lambda x: x in delimiters
self.sent_delimiter = sent_delimiter
self.max_seq_len = max_seq_len
super().__init__(data, transform, cache, generate_idx)
def load_file(self, filepath):
max_seq_len = self.max_seq_len
delimiter = self.sent_delimiter
for chars, tags in self._generate_chars_tags(filepath, delimiter, max_seq_len):
yield {'char': chars, 'tag': tags}
@staticmethod
def _generate_chars_tags(filepath, delimiter, max_seq_len):
filepath = get_resource(filepath)
with open(filepath, encoding='utf8') as src:
for text in src:
chars, tags = bmes_of(text, True)
if max_seq_len and delimiter and len(chars) > max_seq_len:
short_chars, short_tags = [], []
for idx, (char, tag) in enumerate(zip(chars, tags)):
short_chars.append(char)
short_tags.append(tag)
if len(short_chars) >= max_seq_len and delimiter(char):
yield short_chars, short_tags
short_chars, short_tags = [], []
if short_chars:
yield short_chars, short_tags
else:
yield chars, tags
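# --- Editor's note: a standalone sketch (not part of the original source) of
# the BMES encoding that bmes_of is assumed to produce for one segmented line:
# single-character words are tagged S, longer words B, M..., E.
def _demo_bmes():
    chars, tags = [], []
    for word in '商品 和 服务'.split():
        chars.extend(word)
        tags.extend('S' if len(word) == 1 else 'B' + 'M' * (len(word) - 2) + 'E')
    print(chars, tags)  # ['商', '品', '和', '服', '务'] ['B', 'E', 'S', 'B', 'E']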
================================================
FILE: hanlp/datasets/tokenization/loaders/multi_criteria_cws/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-08-11 20:35
_HOME = 'https://github.com/hankcs/multi-criteria-cws/archive/naive-mix.zip#data/raw/'
CNC_TRAIN_ALL = _HOME + 'cnc/train-all.txt'
CNC_TRAIN = _HOME + 'cnc/train.txt'
CNC_DEV = _HOME + 'cnc/dev.txt'
CNC_TEST = _HOME + 'cnc/test.txt'
CTB_TRAIN_ALL = _HOME + 'ctb/train-all.txt'
CTB_TRAIN = _HOME + 'ctb/train.txt'
CTB_DEV = _HOME + 'ctb/dev.txt'
CTB_TEST = _HOME + 'ctb/test.txt'
SXU_TRAIN_ALL = _HOME + 'sxu/train-all.txt'
SXU_TRAIN = _HOME + 'sxu/train.txt'
SXU_DEV = _HOME + 'sxu/dev.txt'
SXU_TEST = _HOME + 'sxu/test.txt'
UDC_TRAIN_ALL = _HOME + 'udc/train-all.txt'
UDC_TRAIN = _HOME + 'udc/train.txt'
UDC_DEV = _HOME + 'udc/dev.txt'
UDC_TEST = _HOME + 'udc/test.txt'
WTB_TRAIN_ALL = _HOME + 'wtb/train-all.txt'
WTB_TRAIN = _HOME + 'wtb/train.txt'
WTB_DEV = _HOME + 'wtb/dev.txt'
WTB_TEST = _HOME + 'wtb/test.txt'
ZX_TRAIN_ALL = _HOME + 'zx/train-all.txt'
ZX_TRAIN = _HOME + 'zx/train.txt'
ZX_DEV = _HOME + 'zx/dev.txt'
ZX_TEST = _HOME + 'zx/test.txt'
================================================
FILE: hanlp/datasets/tokenization/loaders/multi_criteria_cws/mcws_dataset.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-10-21 19:11
import os
from typing import Union, List, Callable, Dict, Iterable
from hanlp.datasets.tokenization.loaders.txt import TextTokenizingDataset
from hanlp.utils.io_util import get_resource
class MultiCriteriaTextTokenizingDataset(TextTokenizingDataset):
def __init__(self,
data: Union[str, List],
transform: Union[Callable, List] = None,
cache=None,
generate_idx=None,
delimiter=None,
max_seq_len=None,
sent_delimiter=None,
char_level=False,
hard_constraint=False) -> None:
super().__init__(data, transform, cache, generate_idx, delimiter, max_seq_len, sent_delimiter, char_level,
hard_constraint)
def should_load_file(self, data) -> bool:
return isinstance(data, (tuple, dict))
def load_file(self, filepath: Union[Iterable[str], Dict[str, str]]):
"""Load multi-criteria corpora specified in filepath.
Args:
filepath: A list of files where filename is its criterion. Or a dict of filename-criterion pairs.
.. highlight:: bash
.. code-block:: bash
$ tree -L 2 .
.
├── cnc
│ ├── dev.txt
│ ├── test.txt
│ ├── train-all.txt
│ └── train.txt
├── ctb
│ ├── dev.txt
│ ├── test.txt
│ ├── train-all.txt
│ └── train.txt
├── sxu
│ ├── dev.txt
│ ├── test.txt
│ ├── train-all.txt
│ └── train.txt
├── udc
│ ├── dev.txt
│ ├── test.txt
│ ├── train-all.txt
│ └── train.txt
├── wtb
│ ├── dev.txt
│ ├── test.txt
│ ├── train-all.txt
│ └── train.txt
└── zx
├── dev.txt
├── test.txt
├── train-all.txt
└── train.txt
$ head -n 2 ctb/dev.txt
上海 浦东 开发 与 法制 建设 同步
新华社 上海 二月 十日 电 ( 记者 谢金虎 、 张持坚 )
"""
for eachpath in (filepath.items() if isinstance(filepath, dict) else filepath):
if isinstance(eachpath, tuple):
criteria, eachpath = eachpath
eachpath = get_resource(eachpath)
else:
eachpath = get_resource(eachpath)
criteria = os.path.basename(os.path.dirname(eachpath))
for sample in super().load_file(eachpath):
sample['criteria'] = criteria
yield sample
def append_criteria_token(sample: dict, criteria_tokens: Dict[str, int], criteria_token_map: dict) -> dict:
criteria = sample['criteria']
token = criteria_token_map.get(criteria, None)
if not token:
unused_tokens = list(criteria_tokens.keys())
size = len(criteria_token_map)
assert size + 1 < len(unused_tokens), f'No unused token available for criteria {criteria}. ' \
f'Current criteria_token_map = {criteria_token_map}'
token = criteria_token_map[criteria] = unused_tokens[size]
sample['token_token_type_ids'] = [0] * len(sample['token_input_ids']) + [1]
sample['token_input_ids'] = sample['token_input_ids'] + [criteria_tokens[token]]
return sample
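# --- Editor's note: a toy run (not part of the original source) of
# append_criteria_token with hypothetical reserved-token ids: the first time a
# criterion appears it claims the next unused token, which is then appended to
# token_input_ids with token_type_id 1.
def _demo_append_criteria_token():
    criteria_tokens = {'[unused1]': 1, '[unused2]': 2, '[unused3]': 3}
    criteria_token_map = {}
    sample = {'criteria': 'ctb', 'token_input_ids': [101, 800, 102]}
    sample = append_criteria_token(sample, criteria_tokens, criteria_token_map)
    print(sample['token_input_ids'])       # [101, 800, 102, 1]
    print(sample['token_token_type_ids'])  # [0, 0, 0, 1]
    print(criteria_token_map)              # {'ctb': '[unused1]'}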
================================================
FILE: hanlp/datasets/tokenization/loaders/txt.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-08-01 12:35
from typing import Union, List, Callable
from hanlp.common.dataset import TransformableDataset
from hanlp.utils.io_util import TimingFileIterator
from hanlp.utils.span_util import words_to_bmes, words_to_bi
from hanlp.utils.string_util import split_long_sentence_into
class TextTokenizingDataset(TransformableDataset):
def __init__(self,
data: Union[str, List],
transform: Union[Callable, List] = None,
cache=None,
generate_idx=None,
delimiter=None,
max_seq_len=None,
sent_delimiter=None,
char_level=False,
hard_constraint=False,
) -> None:
"""A dataset for tagging tokenization tasks.
Args:
data: The local or remote path to a dataset, or a list of samples where each sample is a dict.
transform: Predefined transform(s).
cache: ``True`` to enable caching, so that transforms won't be called twice.
generate_idx: Create a :const:`~hanlp_common.constants.IDX` field for each sample to store its order in dataset. Useful for prediction when
samples are re-ordered by a sampler.
delimiter: Delimiter between tokens used to split a line in the corpus.
max_seq_len: Sentences longer than ``max_seq_len`` will be split into shorter ones if possible.
sent_delimiter: Delimiter between sentences, like period or comma, which indicates a long sentence can
be split here.
char_level: Whether the sequence length is measured at char level.
hard_constraint: Whether to enforce hard length constraint on sentences. If there is no ``sent_delimiter``
in a sentence, it will be split at a token anyway.
"""
self.hard_constraint = hard_constraint
self.char_level = char_level
self.sent_delimiter = sent_delimiter
self.max_seq_len = max_seq_len
self.delimiter = delimiter
super().__init__(data, transform, cache, generate_idx)
def load_file(self, filepath: str):
"""Load tokenized corpus. The format is one sentence per line, where each line consisits of tokens seperated
by a delimiter (usually space).
.. highlight:: bash
.. code-block:: bash
$ head train.txt
上海 浦东 开发 与 法制 建设 同步
新华社 上海 二月 十日 电 ( 记者 谢金虎 、 张持坚 )
Args:
filepath: The path to the corpus.
"""
f = TimingFileIterator(filepath)
# longest_sent = 0
for line in f:
line = line.rstrip('\n')
tokens = line.split(self.delimiter)
if not tokens:
continue
if self.max_seq_len and sum(len(t) for t in tokens) > self.max_seq_len:
# debug = []
for short_sents in split_long_sentence_into(tokens, self.max_seq_len, self.sent_delimiter,
char_level=self.char_level,
hard_constraint=self.hard_constraint):
# debug.extend(short_sents)
# longest_sent = max(longest_sent, len(''.join(short_sents)))
yield {'token': short_sents}
# assert debug == tokens
else:
# longest_sent = max(longest_sent, len(''.join(tokens)))
yield {'token': tokens}
f.log(line[:20])
f.erase()
# print(f'Longest sent: {longest_sent} in {filepath}')
def generate_tags_for_subtokens(sample: dict, tagging_scheme='BMES'):
"""
Create the subtoken sequence for the tokenization task, where each atomic subtoken will be tagged with BMES or BI tags.
Args:
sample: During prediction, a dict with 'token' being the input text and 'token_subtoken_offsets' being
incremental offsets per subtoken. During training, a dict with 'token' being a sequence of tokens,
'token_subtoken_offsets' being non-incremental offsets per subtoken and 'token_subtoken_offsets_group'
being subtoken offsets grouped by token.
tagging_scheme: Either 'BMES' or 'BI'.
Returns:
The sample with 'token' replaced by subtokens and, during training, a 'tag' sequence added.
"""
# We could use token_token_span but we don't want token_token_span in the batch
subtokens_group = sample.get('token_subtoken_offsets_group', None)
sample['raw_token'] = sample['token']
tokens = sample.get('token_') or sample['token']
if subtokens_group:
sample['token'] = subtokens_group_to_subtokens(tokens, subtokens_group)
if tagging_scheme == 'BMES':
sample['tag'] = words_to_bmes(subtokens_group)
elif tagging_scheme == 'BI':
sample['tag'] = words_to_bi(subtokens_group)
else:
raise NotImplementedError(f'Unsupported tagging scheme {tagging_scheme}.')
else:
sample['token'] = subtoken_offsets_to_subtokens(tokens, sample['token_subtoken_offsets'])
return sample
def subtoken_offsets_to_subtokens(text, token_subtoken_offsets):
results = []
for b, e in token_subtoken_offsets:
results.append(text[b:e])
return results
def subtokens_group_to_subtokens(tokens, subtoken_offsets_group):
results = []
for subtoken_offsets, token in zip(subtoken_offsets_group, tokens):
for b, e in subtoken_offsets:
results.append(token[b:e])
return results
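# --- Editor's note: recovering subtokens from hypothetical transformer offsets
# with the helper above (not part of the original source).
def _demo_subtoken_offsets():
    print(subtoken_offsets_to_subtokens('HanLP tokenizes', [(0, 3), (3, 5), (6, 11), (11, 15)]))
    # ['Han', 'LP', 'token', 'izes']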
================================================
FILE: hanlp/datasets/tokenization/sighan2005/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-21 15:42
import os
from hanlp.utils.io_util import get_resource, split_file
from hanlp.utils.log_util import logger
SIGHAN2005 = 'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip'
def make(train):
root = get_resource(SIGHAN2005)
train = os.path.join(root, train.split('#')[-1])
if not os.path.isfile(train):
full = train.replace('_90.txt', '.utf8')
logger.info(f'Splitting {full} into training set and valid set with 9:1 proportion')
valid = train.replace('90.txt', '10.txt')
split_file(full, train=0.9, dev=0.1, test=0, names={'train': train, 'dev': valid})
assert os.path.isfile(train), f'Failed to make {train}'
assert os.path.isfile(valid), f'Failed to make {valid}'
logger.info(f'Successfully made {train} {valid}')
================================================
FILE: hanlp/datasets/tokenization/sighan2005/as_.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-21 15:42
from hanlp.datasets.tokenization.sighan2005 import SIGHAN2005, make
SIGHAN2005_AS_DICT = SIGHAN2005 + "#" + "gold/as_training_words.utf8"
'''Dictionary built on the training set.'''
SIGHAN2005_AS_TRAIN_ALL = SIGHAN2005 + "#" + "training/as_training.utf8"
'''Full training set.'''
SIGHAN2005_AS_TRAIN = SIGHAN2005 + "#" + "training/as_training_90.txt"
'''Training set (first 90% of the full official training set).'''
SIGHAN2005_AS_DEV = SIGHAN2005 + "#" + "training/as_training_10.txt"
'''Dev set (last 10% of full official training set).'''
SIGHAN2005_AS_TEST_INPUT = SIGHAN2005 + "#" + "testing/as_testing.utf8"
'''Test input.'''
SIGHAN2005_AS_TEST = SIGHAN2005 + "#" + "gold/as_testing_gold.utf8"
'''Test set.'''
make(SIGHAN2005_AS_TRAIN)
================================================
FILE: hanlp/datasets/tokenization/sighan2005/cityu.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-21 15:42
from hanlp.datasets.tokenization.sighan2005 import SIGHAN2005, make
SIGHAN2005_CITYU_DICT = SIGHAN2005 + "#" + "gold/cityu_training_words.utf8"
'''Dictionary built on the training set.'''
SIGHAN2005_CITYU_TRAIN_ALL = SIGHAN2005 + "#" + "training/cityu_training.utf8"
'''Full training set.'''
SIGHAN2005_CITYU_TRAIN = SIGHAN2005 + "#" + "training/cityu_training_90.txt"
'''Training set (first 90% of the full official training set).'''
SIGHAN2005_CITYU_DEV = SIGHAN2005 + "#" + "training/cityu_training_10.txt"
'''Dev set (last 10% of full official training set).'''
SIGHAN2005_CITYU_TEST_INPUT = SIGHAN2005 + "#" + "testing/cityu_test.utf8"
'''Test input.'''
SIGHAN2005_CITYU_TEST = SIGHAN2005 + "#" + "gold/cityu_test_gold.utf8"
'''Test set.'''
make(SIGHAN2005_CITYU_TRAIN)
================================================
FILE: hanlp/datasets/tokenization/sighan2005/msr.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-21 15:42
from hanlp.datasets.tokenization.sighan2005 import SIGHAN2005, make
SIGHAN2005_MSR_DICT = SIGHAN2005 + "#" + "gold/msr_training_words.utf8"
'''Dictionary built on the training set.'''
SIGHAN2005_MSR_TRAIN_ALL = SIGHAN2005 + "#" + "training/msr_training.utf8"
'''Full training set.'''
SIGHAN2005_MSR_TRAIN = SIGHAN2005 + "#" + "training/msr_training_90.txt"
'''Training set (first 90% of the full official training set).'''
SIGHAN2005_MSR_DEV = SIGHAN2005 + "#" + "training/msr_training_10.txt"
'''Dev set (last 10% of full official training set).'''
SIGHAN2005_MSR_TEST_INPUT = SIGHAN2005 + "#" + "testing/msr_test.utf8"
'''Test input.'''
SIGHAN2005_MSR_TEST = SIGHAN2005 + "#" + "gold/msr_test_gold.utf8"
'''Test set.'''
make(SIGHAN2005_MSR_TRAIN)
================================================
FILE: hanlp/datasets/tokenization/sighan2005/pku.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-21 15:42
from hanlp.datasets.tokenization.sighan2005 import SIGHAN2005, make
SIGHAN2005_PKU_DICT = SIGHAN2005 + "#" + "gold/pku_training_words.utf8"
'''Dictionary built on the training set.'''
SIGHAN2005_PKU_TRAIN_ALL = SIGHAN2005 + "#" + "training/pku_training.utf8"
'''Full training set.'''
SIGHAN2005_PKU_TRAIN = SIGHAN2005 + "#" + "training/pku_training_90.txt"
'''Training set (first 90% of the full official training set).'''
SIGHAN2005_PKU_DEV = SIGHAN2005 + "#" + "training/pku_training_10.txt"
'''Dev set (last 10% of the full official training set).'''
SIGHAN2005_PKU_TEST_INPUT = SIGHAN2005 + "#" + "testing/pku_test.utf8"
'''Test input.'''
SIGHAN2005_PKU_TEST = SIGHAN2005 + "#" + "gold/pku_test_gold.utf8"
'''Test set.'''
make(SIGHAN2005_PKU_TRAIN)
================================================
FILE: hanlp/layers/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-10-26 00:50
================================================
FILE: hanlp/layers/cnn_encoder.py
================================================
from typing import Optional, Tuple
import torch
from torch.nn import Conv1d, Linear
class CnnEncoder(torch.nn.Module):
"""
A `CnnEncoder` is a combination of multiple convolution layers and max pooling layers. As a
[`Seq2VecEncoder`](./seq2vec_encoder.md), the input to this module is of shape `(batch_size, num_tokens,
input_dim)`, and the output is of shape `(batch_size, output_dim)`.
The CNN has one convolution layer for each ngram filter size. Each convolution operation gives
out a vector of size num_filters. The number of times a convolution layer will be used
is `num_tokens - ngram_size + 1`. The corresponding maxpooling layer aggregates all these
outputs from the convolution layer and outputs the max.
This operation is repeated for every ngram size passed, and consequently the dimensionality of
the output after maxpooling is `len(ngram_filter_sizes) * num_filters`. This then gets
(optionally) projected down to a lower dimensional output, specified by `output_dim`.
We then use a fully connected layer to project it back to the desired output_dim. For more
details, refer to "A Sensitivity Analysis of (and Practitioners’ Guide to) Convolutional Neural
Networks for Sentence Classification", Zhang and Wallace 2016, particularly Figure 1.
Registered as a `Seq2VecEncoder` with name "cnn".
# Parameters
embedding_dim : `int`, required
This is the input dimension to the encoder. We need this because we can't do shape
inference in pytorch, and we need to know what size filters to construct in the CNN.
num_filters : `int`, required
This is the output dim for each convolutional layer, which is the number of "filters"
learned by that layer.
ngram_filter_sizes : `Tuple[int]`, optional (default=`(2, 3, 4, 5)`)
This specifies both the number of convolutional layers we will create and their sizes. The
default of `(2, 3, 4, 5)` will have four convolutional layers, corresponding to encoding
ngrams of size 2 to 5 with some number of filters.
conv_layer_activation : `str`, optional (default=`'ReLU'`)
Name of the `torch.nn` activation class to apply after the convolution layers.
output_dim : `Optional[int]`, optional (default=`None`)
After doing convolutions and pooling, we'll project the collected features into a vector of
this size. If this value is `None`, we will just return the result of the max pooling,
giving an output of shape `len(ngram_filter_sizes) * num_filters`.
"""
def __init__(
self,
embedding_dim: int,
num_filters: int,
ngram_filter_sizes: Tuple[int, ...] = (2, 3, 4, 5),
conv_layer_activation: str = 'ReLU',
output_dim: Optional[int] = None,
) -> None:
super().__init__()
self._embedding_dim = embedding_dim
self._num_filters = num_filters
self._ngram_filter_sizes = ngram_filter_sizes
self._activation = getattr(torch.nn, conv_layer_activation)()
self._output_dim = output_dim
self._convolution_layers = [
Conv1d(
in_channels=self._embedding_dim,
out_channels=self._num_filters,
kernel_size=ngram_size,
)
for ngram_size in self._ngram_filter_sizes
]
for i, conv_layer in enumerate(self._convolution_layers):
self.add_module("conv_layer_%d" % i, conv_layer)
maxpool_output_dim = self._num_filters * len(self._ngram_filter_sizes)
if self._output_dim:
self.projection_layer = Linear(maxpool_output_dim, self._output_dim)
else:
self.projection_layer = None
self._output_dim = maxpool_output_dim
def get_input_dim(self) -> int:
return self._embedding_dim
def get_output_dim(self) -> int:
return self._output_dim
def forward(self, tokens: torch.Tensor, mask: torch.BoolTensor):
if mask is not None:
tokens = tokens * mask.unsqueeze(-1)
# Our input is expected to have shape `(batch_size, num_tokens, embedding_dim)`. The
# convolution layers expect input of shape `(batch_size, in_channels, sequence_length)`,
# where the conv layer `in_channels` is our `embedding_dim`. We thus need to transpose the
# tensor first.
tokens = torch.transpose(tokens, 1, 2)
# Each convolution layer returns output of size `(batch_size, num_filters, pool_length)`,
# where `pool_length = num_tokens - ngram_size + 1`. We then do an activation function,
# then do max pooling over each filter for the whole input sequence. Because our max
# pooling is simple, we just use `torch.max`. The resultant tensor of has shape
# `(batch_size, num_conv_layers * num_filters)`, which then gets projected using the
# projection layer, if requested.
filter_outputs = []
for i in range(len(self._convolution_layers)):
convolution_layer = getattr(self, "conv_layer_{}".format(i))
filter_outputs.append(self._activation(convolution_layer(tokens)).max(dim=2)[0])
# Now we have a list of `num_conv_layers` tensors of shape `(batch_size, num_filters)`.
# Concatenating them gives us a tensor of shape `(batch_size, num_filters * num_conv_layers)`.
maxpool_output = (
torch.cat(filter_outputs, dim=1) if len(filter_outputs) > 1 else filter_outputs[0]
)
if self.projection_layer:
result = self.projection_layer(maxpool_output)
else:
result = maxpool_output
return result
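# Minimal usage sketch (editor's addition): encode a batch of 4 sentences of
# 7 tokens with 16-dim embeddings into one vector per sentence. With filter
# sizes 2 and 3 and num_filters=8, pooling yields 16 features, which the
# projection layer maps down to output_dim=10.
if __name__ == '__main__':
    encoder = CnnEncoder(embedding_dim=16, num_filters=8,
                         ngram_filter_sizes=(2, 3), output_dim=10)
    tokens = torch.randn(4, 7, 16)
    mask = torch.ones(4, 7, dtype=torch.bool)
    print(encoder(tokens, mask).shape)  # torch.Size([4, 10])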
================================================
FILE: hanlp/layers/crf/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-18 22:55
================================================
FILE: hanlp/layers/crf/crf.py
================================================
# Copied from https://github.com/kmkurn/pytorch-crf
# Copyright 2017 Kemal Kurniawan
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of
# this software and associated documentation files (the "Software"), to deal in
# the Software without restriction, including without limitation the rights to
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
# of the Software, and to permit persons to whom the Software is furnished to do
# so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
# PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
__version__ = '0.7.2'
from typing import List, Optional
import torch
import torch.nn as nn
class CRF(nn.Module):
"""Conditional random field.
This module implements a conditional random field [LMP01]_. The forward computation
of this class computes the log likelihood of the given sequence of tags and
emission score tensor. This class also has `~CRF.decode` method which finds
the best tag sequence given an emission score tensor using `Viterbi algorithm`_.
Args:
num_tags: Number of tags.
batch_first: Whether the first dimension corresponds to the size of a minibatch.
Attributes:
start_transitions (`~torch.nn.Parameter`): Start transition score tensor of size
``(num_tags,)``.
end_transitions (`~torch.nn.Parameter`): End transition score tensor of size
``(num_tags,)``.
transitions (`~torch.nn.Parameter`): Transition score tensor of size
``(num_tags, num_tags)``.
.. [LMP01] Lafferty, J., McCallum, A., Pereira, F. (2001).
"Conditional random fields: Probabilistic models for segmenting and
labeling sequence data". *Proc. 18th International Conf. on Machine
Learning*. Morgan Kaufmann. pp. 282–289.
.. _Viterbi algorithm: https://en.wikipedia.org/wiki/Viterbi_algorithm
"""
def __init__(self, num_tags: int, batch_first: bool = True) -> None:
if num_tags <= 0:
raise ValueError(f'invalid number of tags: {num_tags}')
super().__init__()
self.num_tags = num_tags
self.batch_first = batch_first
self.start_transitions = nn.Parameter(torch.empty(num_tags))
self.end_transitions = nn.Parameter(torch.empty(num_tags))
self.transitions = nn.Parameter(torch.empty(num_tags, num_tags))
self.reset_parameters()
def reset_parameters(self) -> None:
"""Initialize the transition parameters.
The parameters will be initialized randomly from a uniform distribution
between -0.1 and 0.1.
"""
nn.init.uniform_(self.start_transitions, -0.1, 0.1)
nn.init.uniform_(self.end_transitions, -0.1, 0.1)
nn.init.uniform_(self.transitions, -0.1, 0.1)
def __repr__(self) -> str:
return f'{self.__class__.__name__}(num_tags={self.num_tags})'
def forward(
self,
emissions: torch.Tensor,
tags: torch.LongTensor,
mask: Optional[torch.ByteTensor] = None,
reduction: str = 'sum',
) -> torch.Tensor:
"""Compute the conditional log likelihood of a sequence of tags given emission scores.
Args:
emissions (`~torch.Tensor`): Emission score tensor of size
``(seq_length, batch_size, num_tags)`` if ``batch_first`` is ``False``,
``(batch_size, seq_length, num_tags)`` otherwise.
tags (`~torch.LongTensor`): Sequence of tags tensor of size
``(seq_length, batch_size)`` if ``batch_first`` is ``False``,
``(batch_size, seq_length)`` otherwise.
mask (`~torch.ByteTensor`): Mask tensor of size ``(seq_length, batch_size)``
if ``batch_first`` is ``False``, ``(batch_size, seq_length)`` otherwise.
reduction: Specifies the reduction to apply to the output:
``none|sum|mean|token_mean``. ``none``: no reduction will be applied.
``sum``: the output will be summed over batches. ``mean``: the output will be
averaged over batches. ``token_mean``: the output will be averaged over tokens.
Returns:
`~torch.Tensor`: The log likelihood. This will have size ``(batch_size,)`` if
reduction is ``none``, ``()`` otherwise.
"""
self._validate(emissions, tags=tags, mask=mask)
if reduction not in ('none', 'sum', 'mean', 'token_mean'):
raise ValueError(f'invalid reduction: {reduction}')
if mask is None:
mask = torch.ones_like(tags, dtype=torch.uint8)
if self.batch_first:
emissions = emissions.transpose(0, 1)
tags = tags.transpose(0, 1)
mask = mask.transpose(0, 1)
# shape: (batch_size,)
numerator = self._compute_score(emissions, tags, mask)
# shape: (batch_size,)
denominator = self._compute_normalizer(emissions, mask)
# shape: (batch_size,)
llh = numerator - denominator
if reduction == 'none':
return llh
if reduction == 'sum':
return llh.sum()
if reduction == 'mean':
return llh.mean()
assert reduction == 'token_mean'
return llh.sum() / mask.type_as(emissions).sum()
def decode(self, emissions: torch.Tensor,
mask: Optional[torch.ByteTensor] = None) -> List[List[int]]:
"""Find the most likely tag sequence using Viterbi algorithm.
Args:
emissions (`~torch.Tensor`): Emission score tensor of size
``(seq_length, batch_size, num_tags)`` if ``batch_first`` is ``False``,
``(batch_size, seq_length, num_tags)`` otherwise.
mask (`~torch.ByteTensor`): Mask tensor of size ``(seq_length, batch_size)``
if ``batch_first`` is ``False``, ``(batch_size, seq_length)`` otherwise.
Returns:
List of list containing the best tag sequence for each batch.
"""
self._validate(emissions, mask=mask)
if mask is None:
mask = emissions.new_ones(emissions.shape[:2], dtype=torch.uint8)
if self.batch_first:
emissions = emissions.transpose(0, 1)
mask = mask.transpose(0, 1)
return self._viterbi_decode(emissions, mask)
def _validate(
self,
emissions: torch.Tensor,
tags: Optional[torch.LongTensor] = None,
mask: Optional[torch.ByteTensor] = None) -> None:
if emissions.dim() != 3:
raise ValueError(f'emissions must have dimension of 3, got {emissions.dim()}')
if emissions.size(2) != self.num_tags:
raise ValueError(
f'expected last dimension of emissions is {self.num_tags}, '
f'got {emissions.size(2)}')
if tags is not None:
if emissions.shape[:2] != tags.shape:
raise ValueError(
'the first two dimensions of emissions and tags must match, '
f'got {tuple(emissions.shape[:2])} and {tuple(tags.shape)}')
if mask is not None:
if emissions.shape[:2] != mask.shape:
raise ValueError(
'the first two dimensions of emissions and mask must match, '
f'got {tuple(emissions.shape[:2])} and {tuple(mask.shape)}')
no_empty_seq = not self.batch_first and mask[0].all()
no_empty_seq_bf = self.batch_first and mask[:, 0].all()
if not no_empty_seq and not no_empty_seq_bf:
raise ValueError('mask of the first timestep must all be on')
def _compute_score(
self, emissions: torch.Tensor, tags: torch.LongTensor,
mask: torch.ByteTensor) -> torch.Tensor:
# emissions: (seq_length, batch_size, num_tags)
# tags: (seq_length, batch_size)
# mask: (seq_length, batch_size)
assert emissions.dim() == 3 and tags.dim() == 2
assert emissions.shape[:2] == tags.shape
assert emissions.size(2) == self.num_tags
assert mask.shape == tags.shape
assert mask[0].all()
seq_length, batch_size = tags.shape
mask = mask.type_as(emissions)
# Start transition score and first emission
# shape: (batch_size,)
score = self.start_transitions[tags[0]]
score += emissions[0, torch.arange(batch_size), tags[0]]
for i in range(1, seq_length):
# Transition score to next tag, only added if next timestep is valid (mask == 1)
# shape: (batch_size,)
score += self.transitions[tags[i - 1], tags[i]] * mask[i]
# Emission score for next tag, only added if next timestep is valid (mask == 1)
# shape: (batch_size,)
score += emissions[i, torch.arange(batch_size), tags[i]] * mask[i]
# End transition score
# shape: (batch_size,)
seq_ends = mask.long().sum(dim=0) - 1
# shape: (batch_size,)
last_tags = tags[seq_ends, torch.arange(batch_size)]
# shape: (batch_size,)
score += self.end_transitions[last_tags]
return score
def _compute_normalizer(
self, emissions: torch.Tensor, mask: torch.ByteTensor) -> torch.Tensor:
# emissions: (seq_length, batch_size, num_tags)
# mask: (seq_length, batch_size)
assert emissions.dim() == 3 and mask.dim() == 2
assert emissions.shape[:2] == mask.shape
assert emissions.size(2) == self.num_tags
assert mask[0].all()
seq_length = emissions.size(0)
# Start transition score and first emission; score has size of
# (batch_size, num_tags) where for each batch, the j-th column stores
# the score that the first timestep has tag j
# shape: (batch_size, num_tags)
score = self.start_transitions + emissions[0]
for i in range(1, seq_length):
# Broadcast score for every possible next tag
# shape: (batch_size, num_tags, 1)
broadcast_score = score.unsqueeze(2)
# Broadcast emission score for every possible current tag
# shape: (batch_size, 1, num_tags)
broadcast_emissions = emissions[i].unsqueeze(1)
# Compute the score tensor of size (batch_size, num_tags, num_tags) where
# for each sample, entry at row i and column j stores the sum of scores of all
# possible tag sequences so far that end with transitioning from tag i to tag j
# and emitting
# shape: (batch_size, num_tags, num_tags)
next_score = broadcast_score + self.transitions + broadcast_emissions
# Sum over all possible current tags, but we're in score space, so a sum
# becomes a log-sum-exp: for each sample, entry i stores the sum of scores of
# all possible tag sequences so far, that end in tag i
# shape: (batch_size, num_tags)
next_score = torch.logsumexp(next_score, dim=1)
# Set score to the next score if this timestep is valid (mask == 1)
# shape: (batch_size, num_tags)
score = torch.where(mask[i].unsqueeze(1), next_score, score)
# End transition score
# shape: (batch_size, num_tags)
score += self.end_transitions
# Sum (log-sum-exp) over all possible tags
# shape: (batch_size,)
return torch.logsumexp(score, dim=1)
def _viterbi_decode(self, emissions: torch.FloatTensor,
mask: torch.ByteTensor) -> List[List[int]]:
# emissions: (seq_length, batch_size, num_tags)
# mask: (seq_length, batch_size)
assert emissions.dim() == 3 and mask.dim() == 2
assert emissions.shape[:2] == mask.shape
assert emissions.size(2) == self.num_tags
assert mask[0].all()
seq_length, batch_size = mask.shape
# Start transition and first emission
# shape: (batch_size, num_tags)
score = self.start_transitions + emissions[0]
history = []
# score is a tensor of size (batch_size, num_tags) where for every batch,
# value at column j stores the score of the best tag sequence so far that ends
# with tag j
# history saves where the best tags candidate transitioned from; this is used
# when we trace back the best tag sequence
# Viterbi algorithm recursive case: we compute the score of the best tag sequence
# for every possible next tag
for i in range(1, seq_length):
# Broadcast viterbi score for every possible next tag
# shape: (batch_size, num_tags, 1)
broadcast_score = score.unsqueeze(2)
# Broadcast emission score for every possible current tag
# shape: (batch_size, 1, num_tags)
broadcast_emission = emissions[i].unsqueeze(1)
# Compute the score tensor of size (batch_size, num_tags, num_tags) where
# for each sample, entry at row i and column j stores the score of the best
# tag sequence so far that ends with transitioning from tag i to tag j and emitting
# shape: (batch_size, num_tags, num_tags)
next_score = broadcast_score + self.transitions + broadcast_emission
# Find the maximum score over all possible current tag
# shape: (batch_size, num_tags)
next_score, indices = next_score.max(dim=1)
# Set score to the next score if this timestep is valid (mask == 1)
# and save the index that produces the next score
# shape: (batch_size, num_tags)
score = torch.where(mask[i].unsqueeze(1), next_score, score)
history.append(indices)
# End transition score
# shape: (batch_size, num_tags)
score += self.end_transitions
# Now, compute the best path for each sample
# shape: (batch_size,)
seq_ends = mask.long().sum(dim=0) - 1
best_tags_list = []
for idx in range(batch_size):
# Find the tag which maximizes the score at the last timestep; this is our best tag
# for the last timestep
_, best_last_tag = score[idx].max(dim=0)
best_tags = [best_last_tag.item()]
# We trace back where the best last tag comes from, append that to our best tag
# sequence, and trace it back again, and so on
for hist in reversed(history[:seq_ends[idx]]):
best_last_tag = hist[idx][best_tags[-1]]
best_tags.append(best_last_tag.item())
# Reverse the order because we start from the last timestep
best_tags.reverse()
best_tags_list.append(best_tags)
return best_tags_list
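# Minimal usage sketch (editor's addition): training-time negative log
# likelihood and test-time Viterbi decoding for 2 sequences of length 3 over
# 5 tags, in batch_first layout.
if __name__ == '__main__':
    crf = CRF(num_tags=5, batch_first=True)
    emissions = torch.randn(2, 3, 5)
    tags = torch.randint(0, 5, (2, 3))
    mask = torch.ones(2, 3, dtype=torch.bool)  # a bool mask keeps torch.where happy
    nll = -crf(emissions, tags, mask=mask)  # scalar under the default 'sum' reduction
    print(nll.item(), crf.decode(emissions, mask=mask))  # decode returns a list of tag lists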
================================================
FILE: hanlp/layers/crf/crf_layer_tf.py
================================================
# ******************************************************************************
# Copyright 2017-2018 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************
import tensorflow as tf
from hanlp.layers.crf.crf_tf import crf_decode, crf_log_likelihood
class CRF(tf.keras.layers.Layer):
"""Conditional Random Field layer (tf.keras)
`CRF` can be used as the last layer in a network (as a classifier). Input shape (features)
must be equal to the number of classes the CRF can predict (a linear layer is recommended).
Note: networks using `CRF` must use the provided loss and accuracy functions
(denoted as loss and viterbi_accuracy), because the classification of
sequences depends on the layer's internal transition weights.
Copyright: this is a modified version of
https://github.com/NervanaSystems/nlp-architect/blob/master/nlp_architect/nn/tensorflow/python/keras/layers/crf.py
Args:
num_classes(int): the number of labels to tag each temporal input.
Input shape:
3D tensor with shape `(batch_size, sentence length, num_classes)`.
Output shape:
3D tensor with shape: `(batch_size, sentence length, num_classes)`.
"""
def __init__(self, num_classes, **kwargs):
self.transitions = None
super(CRF, self).__init__(**kwargs)
# num of output labels
self.output_dim = int(num_classes)
self.input_spec = tf.keras.layers.InputSpec(min_ndim=3)
self.supports_masking = False
def get_config(self):
config = {
'output_dim': self.output_dim,
'supports_masking': self.supports_masking,
'transitions': tf.keras.backend.eval(self.transitions)
}
base_config = super(CRF, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
def build(self, input_shape):
assert len(input_shape) == 3
f_shape = tf.TensorShape(input_shape)
input_spec = tf.keras.layers.InputSpec(min_ndim=3, axes={-1: f_shape[-1]})
if f_shape[-1] is None:
raise ValueError('The last dimension of the inputs to `CRF` '
'should be defined. Found `None`.')
if f_shape[-1] != self.output_dim:
raise ValueError('The last dimension of the input shape must be equal to output'
' shape. Use a linear layer if needed.')
self.input_spec = input_spec
self.transitions = self.add_weight(name='transitions',
shape=[self.output_dim, self.output_dim],
initializer='glorot_uniform',
trainable=True)
self.built = True
def compute_mask(self, inputs, mask=None):
# Just pass the received mask from previous layer, to the next layer or
# manipulate it if this layer changes the shape of the input
return mask
# pylint: disable=arguments-differ
def call(self, inputs, sequence_lengths=None, mask=None, training=None, **kwargs):
sequences = tf.convert_to_tensor(inputs, dtype=self.dtype)
if sequence_lengths is not None:
assert len(sequence_lengths.shape) == 2
assert tf.convert_to_tensor(sequence_lengths).dtype == 'int32'
seq_len_shape = tf.convert_to_tensor(sequence_lengths).get_shape().as_list()
assert seq_len_shape[1] == 1
sequence_lengths = tf.keras.backend.flatten(sequence_lengths)
else:
sequence_lengths = tf.math.count_nonzero(mask, axis=1)
viterbi_sequence, _ = crf_decode(sequences, self.transitions,
sequence_lengths)
output = tf.keras.backend.one_hot(viterbi_sequence, self.output_dim)
return tf.keras.backend.in_train_phase(sequences, output)
# def loss(self, y_true, y_pred):
# y_pred = tf.convert_to_tensor(y_pred, dtype=self.dtype)
# log_likelihood, self.transitions = \
# crf_log_likelihood(y_pred,
# tf.cast(y_true, dtype=tf.int32),
# sequence_lengths,
# transition_params=self.transitions)
# return tf.reduce_mean(-log_likelihood)
def compute_output_shape(self, input_shape):
tf.TensorShape(input_shape).assert_has_rank(3)
return input_shape[:2] + (self.output_dim,)
@property
def viterbi_accuracy(self):
def accuracy(y_true, y_pred):
shape = tf.shape(y_pred)
sequence_lengths = tf.ones(shape[0], dtype=tf.int32) * (shape[1])
viterbi_sequence, _ = crf_decode(y_pred, self.transitions,
sequence_lengths)
output = tf.keras.backend.one_hot(viterbi_sequence, self.output_dim)
return tf.keras.metrics.categorical_accuracy(y_true, output)
accuracy.func_name = 'viterbi_accuracy'
return accuracy
class CRFLoss(object):
def __init__(self, crf: CRF, dtype) -> None:
super().__init__()
self.crf = crf
self.dtype = dtype
self.__name__ = type(self).__name__
def __call__(self, y_true, y_pred, sample_weight=None, **kwargs):
assert sample_weight is not None, 'your model has to support masking'
if len(y_true.shape) == 3:
y_true = tf.argmax(y_true, axis=-1)
sequence_lengths = tf.math.count_nonzero(sample_weight, axis=1)
y_pred = tf.convert_to_tensor(y_pred, dtype=self.dtype)
log_likelihood, self.crf.transitions = \
crf_log_likelihood(y_pred,
tf.cast(y_true, dtype=tf.int32),
sequence_lengths,
transition_params=self.crf.transitions)
return tf.reduce_mean(-log_likelihood)
class CRFWrapper(tf.keras.Model):
def __init__(self, model: tf.keras.Model, num_classes=None, *args, **kwargs):
super().__init__(*args, **kwargs)
self.model = model
self.crf = CRF(model.output.shape[-1] if not num_classes else num_classes)
def call(self, inputs, training=None, mask=None):
output = self.model(inputs, training=training, mask=mask)
viterbi_output = self.crf(output)
return viterbi_output
def compute_output_shape(self, input_shape):
return self.model.compute_output_shape(input_shape)
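# Minimal inference sketch (editor's addition, hedged): build the layer on
# per-token logits and decode with an explicit mask. At inference time the
# layer returns one-hot Viterbi tags; training integration goes through
# CRFLoss as defined above.
if __name__ == '__main__':
    crf = CRF(num_classes=4)
    logits = tf.random.normal([2, 5, 4])
    mask = tf.ones([2, 5], dtype=tf.int32)
    tags = crf(logits, mask=mask)  # (2, 5, 4) one-hot decoded tags at test time
    print(tags.shape)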
================================================
FILE: hanlp/layers/crf/crf_tf.py
================================================
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import tensorflow as tf
# TODO: Wrap functions in @tf.function once
# https://github.com/tensorflow/tensorflow/issues/29075 is resolved
def crf_sequence_score(inputs, tag_indices, sequence_lengths,
transition_params):
"""Computes the unnormalized score for a tag sequence.
Args:
inputs: A [batch_size, max_seq_len, num_tags] tensor of unary potentials
to use as input to the CRF layer.
tag_indices: A [batch_size, max_seq_len] matrix of tag indices for which
we compute the unnormalized score.
sequence_lengths: A [batch_size] vector of true sequence lengths.
transition_params: A [num_tags, num_tags] transition matrix of binary potentials.
Returns:
sequence_scores: A [batch_size] vector of unnormalized sequence scores.
"""
tag_indices = tf.cast(tag_indices, dtype=tf.int32)
sequence_lengths = tf.cast(sequence_lengths, dtype=tf.int32)
# If max_seq_len is 1, we skip the score calculation and simply gather the
# unary potentials of the single tag.
def _single_seq_fn():
batch_size = tf.shape(inputs, out_type=tag_indices.dtype)[0]
example_inds = tf.reshape(
tf.range(batch_size, dtype=tag_indices.dtype), [-1, 1])
sequence_scores = tf.gather_nd(
tf.squeeze(inputs, [1]),
tf.concat([example_inds, tag_indices], axis=1))
sequence_scores = tf.where(
tf.less_equal(sequence_lengths, 0), tf.zeros_like(sequence_scores),
sequence_scores)
return sequence_scores
def _multi_seq_fn():
# Compute the scores of the given tag sequence.
unary_scores = crf_unary_score(tag_indices, sequence_lengths, inputs)
binary_scores = crf_binary_score(tag_indices, sequence_lengths,
transition_params)
sequence_scores = unary_scores + binary_scores
return sequence_scores
if inputs.shape[1] == 1:
return _single_seq_fn()
else:
return _multi_seq_fn()
def crf_multitag_sequence_score(inputs, tag_bitmap, sequence_lengths,
transition_params):
"""Computes the unnormalized score of all tag sequences matching
tag_bitmap.
tag_bitmap enables more than one tag to be considered correct at each time
step. This is useful when an observed output at a given time step is
consistent with more than one tag, and thus the log likelihood of that
observation must take into account all possible consistent tags.
Using one-hot vectors in tag_bitmap gives results identical to
crf_sequence_score.
Args:
inputs: A [batch_size, max_seq_len, num_tags] tensor of unary potentials
to use as input to the CRF layer.
tag_bitmap: A [batch_size, max_seq_len, num_tags] boolean tensor
representing all active tags at each index for which to calculate the
unnormalized score.
sequence_lengths: A [batch_size] vector of true sequence lengths.
transition_params: A [num_tags, num_tags] transition matrix of binary potentials.
Returns:
sequence_scores: A [batch_size] vector of unnormalized sequence scores.
"""
tag_bitmap = tf.cast(tag_bitmap, dtype=tf.bool)
sequence_lengths = tf.cast(sequence_lengths, dtype=tf.int32)
filtered_inputs = tf.where(tag_bitmap, inputs,
tf.fill(tf.shape(inputs), float("-inf")))
# If max_seq_len is 1, we skip the score calculation and simply gather the
# unary potentials of all active tags.
def _single_seq_fn():
return tf.reduce_logsumexp(
filtered_inputs, axis=[1, 2], keepdims=False)
def _multi_seq_fn():
# Compute the logsumexp of all scores of sequences matching the given tags.
return crf_log_norm(
inputs=filtered_inputs,
sequence_lengths=sequence_lengths,
transition_params=transition_params)
if inputs.shape[1] == 1:
return _single_seq_fn()
else:
return _multi_seq_fn()
def crf_log_norm(inputs, sequence_lengths, transition_params):
"""Computes the normalization for a CRF.
Args:
inputs: A [batch_size, max_seq_len, num_tags] tensor of unary potentials
to use as input to the CRF layer.
sequence_lengths: A [batch_size] vector of true sequence lengths.
transition_params: A [num_tags, num_tags] transition matrix of binary potentials.
Returns:
log_norm: A [batch_size] vector of normalizers for a CRF.
"""
sequence_lengths = tf.cast(sequence_lengths, dtype=tf.int32)
# Split up the first and rest of the inputs in preparation for the forward
# algorithm.
first_input = tf.slice(inputs, [0, 0, 0], [-1, 1, -1])
first_input = tf.squeeze(first_input, [1])
# If max_seq_len is 1, we skip the algorithm and simply reduce_logsumexp over
# the "initial state" (the unary potentials).
def _single_seq_fn():
log_norm = tf.reduce_logsumexp(first_input, [1])
# Mask `log_norm` of the sequences with length <= zero.
log_norm = tf.where(
tf.less_equal(sequence_lengths, 0), tf.zeros_like(log_norm),
log_norm)
return log_norm
def _multi_seq_fn():
"""Forward computation of alpha values."""
rest_of_input = tf.slice(inputs, [0, 1, 0], [-1, -1, -1])
# Compute the alpha values in the forward algorithm in order to get the
# partition function.
alphas = crf_forward(rest_of_input, first_input, transition_params,
sequence_lengths)
log_norm = tf.reduce_logsumexp(alphas, [1])
# Mask `log_norm` of the sequences with length <= zero.
log_norm = tf.where(
tf.less_equal(sequence_lengths, 0), tf.zeros_like(log_norm),
log_norm)
return log_norm
if inputs.shape[1] == 1:
return _single_seq_fn()
else:
return _multi_seq_fn()
def crf_log_likelihood(inputs,
tag_indices,
sequence_lengths,
transition_params=None):
"""Computes the log-likelihood of tag sequences in a CRF.
Args:
inputs: A [batch_size, max_seq_len, num_tags] tensor of unary potentials
to use as input to the CRF layer.
tag_indices: A [batch_size, max_seq_len] matrix of tag indices for which
we compute the log-likelihood.
sequence_lengths: A [batch_size] vector of true sequence lengths.
transition_params: A [num_tags, num_tags] transition matrix, (Default value = None)
Returns:
log_likelihood: A [batch_size] `Tensor` containing the log-likelihood of
each example, given the sequence of tag indices.
transition_params: A [num_tags, num_tags] transition matrix. This is
either provided by the caller or created in this function.
"""
num_tags = inputs.shape[2]
# cast type to handle different types
tag_indices = tf.cast(tag_indices, dtype=tf.int32)
sequence_lengths = tf.cast(sequence_lengths, dtype=tf.int32)
if transition_params is None:
initializer = tf.keras.initializers.GlorotUniform()
transition_params = tf.Variable(
initializer([num_tags, num_tags]), name="transitions")
sequence_scores = crf_sequence_score(inputs, tag_indices, sequence_lengths,
transition_params)
log_norm = crf_log_norm(inputs, sequence_lengths, transition_params)
# Normalize the scores to get the log-likelihood per example.
log_likelihood = sequence_scores - log_norm
return log_likelihood, transition_params
def crf_unary_score(tag_indices, sequence_lengths, inputs):
"""Computes the unary scores of tag sequences.
Args:
tag_indices: A [batch_size, max_seq_len] matrix of tag indices.
sequence_lengths: A [batch_size] vector of true sequence lengths.
inputs: A [batch_size, max_seq_len, num_tags] tensor of unary potentials.
Returns:
unary_scores: A [batch_size] vector of unary scores.
"""
assert len(tag_indices.shape) == 2, 'tag_indices: A [batch_size, max_seq_len] matrix of tag indices.'
tag_indices = tf.cast(tag_indices, dtype=tf.int32)
sequence_lengths = tf.cast(sequence_lengths, dtype=tf.int32)
batch_size = tf.shape(inputs)[0]
max_seq_len = tf.shape(inputs)[1]
num_tags = tf.shape(inputs)[2]
flattened_inputs = tf.reshape(inputs, [-1])
offsets = tf.expand_dims(tf.range(batch_size) * max_seq_len * num_tags, 1)
offsets += tf.expand_dims(tf.range(max_seq_len) * num_tags, 0)
# Use int32 or int64 based on tag_indices' dtype.
if tag_indices.dtype == tf.int64:
offsets = tf.cast(offsets, tf.int64)
flattened_tag_indices = tf.reshape(offsets + tag_indices, [-1])
unary_scores = tf.reshape(
tf.gather(flattened_inputs, flattened_tag_indices),
[batch_size, max_seq_len])
masks = tf.sequence_mask(
sequence_lengths, maxlen=tf.shape(tag_indices)[1], dtype=tf.float32)
unary_scores = tf.reduce_sum(unary_scores * masks, 1)
return unary_scores
def crf_binary_score(tag_indices, sequence_lengths, transition_params):
"""Computes the binary scores of tag sequences.
Args:
tag_indices: A [batch_size, max_seq_len] matrix of tag indices.
sequence_lengths: A [batch_size] vector of true sequence lengths.
transition_params: A [num_tags, num_tags] transition matrix of binary potentials.
Returns:
binary_scores: A [batch_size] vector of binary scores.
"""
tag_indices = tf.cast(tag_indices, dtype=tf.int32)
sequence_lengths = tf.cast(sequence_lengths, dtype=tf.int32)
num_tags = tf.shape(transition_params)[0]
num_transitions = tf.shape(tag_indices)[1] - 1
# Truncate by one on each side of the sequence to get the start and end
# indices of each transition.
start_tag_indices = tf.slice(tag_indices, [0, 0], [-1, num_transitions])
end_tag_indices = tf.slice(tag_indices, [0, 1], [-1, num_transitions])
# Encode the indices in a flattened representation.
flattened_transition_indices = start_tag_indices * \
num_tags + end_tag_indices
flattened_transition_params = tf.reshape(transition_params, [-1])
# Get the binary scores based on the flattened representation.
binary_scores = tf.gather(flattened_transition_params,
flattened_transition_indices)
masks = tf.sequence_mask(
sequence_lengths, maxlen=tf.shape(tag_indices)[1], dtype=tf.float32)
truncated_masks = tf.slice(masks, [0, 1], [-1, -1])
binary_scores = tf.reduce_sum(binary_scores * truncated_masks, 1)
return binary_scores
def crf_forward(inputs, state, transition_params, sequence_lengths):
"""Computes the alpha values in a linear-chain CRF.
See http://www.cs.columbia.edu/~mcollins/fb.pdf for reference.
Args:
inputs: A [batch_size, num_tags] matrix of unary potentials.
state: A [batch_size, num_tags] matrix containing the previous alpha
values.
transition_params: A [num_tags, num_tags] matrix of binary potentials.
This matrix is expanded into a [1, num_tags, num_tags] in preparation
for the broadcast summation occurring within the cell.
sequence_lengths: A [batch_size] vector of true sequence lengths.
Returns:
new_alphas: A [batch_size, num_tags] matrix containing the
new alpha values.
"""
sequence_lengths = tf.cast(sequence_lengths, dtype=tf.int32)
sequence_lengths = tf.maximum(
tf.constant(0, dtype=sequence_lengths.dtype), sequence_lengths - 2)
inputs = tf.transpose(inputs, [1, 0, 2])
transition_params = tf.expand_dims(transition_params, 0)
def _scan_fn(state, inputs):
state = tf.expand_dims(state, 2)
transition_scores = state + transition_params
new_alphas = inputs + tf.reduce_logsumexp(transition_scores, [1])
return new_alphas
all_alphas = tf.transpose(tf.scan(_scan_fn, inputs, state), [1, 0, 2])
idxs = tf.stack(
[tf.range(tf.shape(sequence_lengths)[0]), sequence_lengths], axis=1)
return tf.gather_nd(all_alphas, idxs)
def viterbi_decode(score, transition_params):
"""Decode the highest scoring sequence of tags outside of TensorFlow.
This should only be used at test time.
Args:
score: A [seq_len, num_tags] matrix of unary potentials.
transition_params: A [num_tags, num_tags] matrix of binary potentials.
Returns:
viterbi: A [seq_len] list of integers containing the highest scoring tag
indices.
viterbi_score: A float containing the score for the Viterbi sequence.
"""
trellis = np.zeros_like(score)
backpointers = np.zeros_like(score, dtype=np.int32)
trellis[0] = score[0]
for t in range(1, score.shape[0]):
v = np.expand_dims(trellis[t - 1], 1) + transition_params
trellis[t] = score[t] + np.max(v, 0)
backpointers[t] = np.argmax(v, 0)
viterbi = [np.argmax(trellis[-1])]
for bp in reversed(backpointers[1:]):
viterbi.append(bp[viterbi[-1]])
viterbi.reverse()
viterbi_score = np.max(trellis[-1])
return viterbi, viterbi_score
class CrfDecodeForwardRnnCell(tf.keras.layers.AbstractRNNCell):
"""Computes the forward decoding in a linear-chain CRF."""
def __init__(self, transition_params, **kwargs):
"""Initialize the CrfDecodeForwardRnnCell.
Args:
transition_params: A [num_tags, num_tags] matrix of binary
potentials. This matrix is expanded into a
[1, num_tags, num_tags] in preparation for the broadcast
summation occurring within the cell.
"""
super(CrfDecodeForwardRnnCell, self).__init__(**kwargs)
self._transition_params = tf.expand_dims(transition_params, 0)
self._num_tags = transition_params.shape[0]
@property
def state_size(self):
return self._num_tags
@property
def output_size(self):
return self._num_tags
def build(self, input_shape):
super(CrfDecodeForwardRnnCell, self).build(input_shape)
def call(self, inputs, state):
"""Build the CrfDecodeForwardRnnCell.
Args:
inputs: A [batch_size, num_tags] matrix of unary potentials.
state: A [batch_size, num_tags] matrix containing the previous step's
score values.
Returns:
backpointers: A [batch_size, num_tags] matrix of backpointers.
new_state: A [batch_size, num_tags] matrix of new score values.
"""
state = tf.expand_dims(state[0], 2)
transition_scores = state + self._transition_params
new_state = inputs + tf.reduce_max(transition_scores, [1])
backpointers = tf.argmax(transition_scores, 1)
backpointers = tf.cast(backpointers, dtype=tf.int32)
return backpointers, new_state
def crf_decode_forward(inputs, state, transition_params, sequence_lengths):
"""Computes forward decoding in a linear-chain CRF.
Args:
inputs: A [batch_size, num_tags] matrix of unary potentials.
state: A [batch_size, num_tags] matrix containing the previous step's
score values.
transition_params: A [num_tags, num_tags] matrix of binary potentials.
sequence_lengths: A [batch_size] vector of true sequence lengths.
Returns:
backpointers: A [batch_size, num_tags] matrix of backpointers.
new_state: A [batch_size, num_tags] matrix of new score values.
"""
sequence_lengths = tf.cast(sequence_lengths, dtype=tf.int32)
mask = tf.sequence_mask(sequence_lengths, tf.shape(inputs)[1])
crf_fwd_cell = CrfDecodeForwardRnnCell(transition_params)
crf_fwd_layer = tf.keras.layers.RNN(
crf_fwd_cell, return_sequences=True, return_state=True)
return crf_fwd_layer(inputs, state, mask=mask)
def crf_decode_backward(inputs, state):
"""Computes backward decoding in a linear-chain CRF.
Args:
inputs: A [batch_size, num_tags] matrix of
backpointer of next step (in time order).
state: A [batch_size, 1] matrix of tag index of next step.
Returns:
new_tags: A [batch_size, num_tags]
tensor containing the new tag indices.
"""
inputs = tf.transpose(inputs, [1, 0, 2])
def _scan_fn(state, inputs):
state = tf.squeeze(state, axis=[1])
idxs = tf.stack([tf.range(tf.shape(inputs)[0]), state], axis=1)
new_tags = tf.expand_dims(tf.gather_nd(inputs, idxs), axis=-1)
return new_tags
return tf.transpose(tf.scan(_scan_fn, inputs, state), [1, 0, 2])
def crf_decode(potentials, transition_params, sequence_length):
"""Decode the highest scoring sequence of tags in TensorFlow.
This operates on batched tensors inside TensorFlow, unlike `viterbi_decode` above.
Args:
potentials: A [batch_size, max_seq_len, num_tags] tensor of
unary potentials.
transition_params: A [num_tags, num_tags] matrix of
binary potentials.
sequence_length: A [batch_size] vector of true sequence lengths.
Returns:
decode_tags: A [batch_size, max_seq_len] matrix, with dtype `tf.int32`.
Contains the highest scoring tag indices.
best_score: A [batch_size] vector, containing the score of `decode_tags`.
"""
sequence_length = tf.cast(sequence_length, dtype=tf.int32)
# If max_seq_len is 1, we skip the algorithm and simply return the argmax tag
# and the max activation.
def _single_seq_fn():
squeezed_potentials = tf.squeeze(potentials, [1])
decode_tags = tf.expand_dims(tf.argmax(squeezed_potentials, axis=1), 1)
best_score = tf.reduce_max(squeezed_potentials, axis=1)
return tf.cast(decode_tags, dtype=tf.int32), best_score
def _multi_seq_fn():
"""Decoding of highest scoring sequence."""
# Computes forward decoding. Get last score and backpointers.
initial_state = tf.slice(potentials, [0, 0, 0], [-1, 1, -1])
initial_state = tf.squeeze(initial_state, axis=[1])
inputs = tf.slice(potentials, [0, 1, 0], [-1, -1, -1])
sequence_length_less_one = tf.maximum(
tf.constant(0, dtype=sequence_length.dtype), sequence_length - 1)
backpointers, last_score = crf_decode_forward(
inputs, initial_state, transition_params, sequence_length_less_one)
backpointers = tf.reverse_sequence(
backpointers, sequence_length_less_one, seq_axis=1)
initial_state = tf.cast(tf.argmax(last_score, axis=1), dtype=tf.int32)
initial_state = tf.expand_dims(initial_state, axis=-1)
decode_tags = crf_decode_backward(backpointers, initial_state)
decode_tags = tf.squeeze(decode_tags, axis=[2])
decode_tags = tf.concat([initial_state, decode_tags], axis=1)
decode_tags = tf.reverse_sequence(
decode_tags, sequence_length, seq_axis=1)
best_score = tf.reduce_max(last_score, axis=1)
return decode_tags, best_score
if potentials.shape[1] == 1:
return _single_seq_fn()
else:
return _multi_seq_fn()
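# Minimal usage sketch (editor's addition): score and decode a toy batch with
# the functional API above; batch=2, max_len=4, num_tags=3, true lengths 4 and 2.
if __name__ == '__main__':
    potentials = tf.random.normal([2, 4, 3])
    tags = tf.constant([[0, 1, 2, 1], [2, 0, 0, 0]])
    lengths = tf.constant([4, 2])
    ll, transitions = crf_log_likelihood(potentials, tags, lengths)
    decoded, best_score = crf_decode(potentials, transitions, lengths)
    print(ll.numpy(), decoded.numpy())  # per-example log-likelihoods and tag ids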
================================================
FILE: hanlp/layers/dropout.py
================================================
# -*- coding:utf-8 -*-
# Date: 2020-06-05 17:47
from typing import List
import torch
import torch.nn as nn
class WordDropout(nn.Module):
def __init__(self, p: float, oov_token: int, exclude_tokens: List[int] = None) -> None:
super().__init__()
self.oov_token = oov_token
self.p = p
if not exclude_tokens:
exclude_tokens = [0]
self.exclude = exclude_tokens
@staticmethod
def token_dropout(tokens: torch.LongTensor,
oov_token: int,
exclude_tokens: List[int],
p: float = 0.2,
training: bool = True) -> torch.LongTensor:
"""During training, randomly replaces some of the non-padding tokens to a mask token with probability ``p``
Adopted from https://github.com/Hyperparticle/udify
Args:
tokens: The current batch of padded sentences with word ids
oov_token: The mask token
exclude_tokens: The tokens for padding the input batch
p: The probability a word gets mapped to the unknown token
training: Applies the dropout if set to ``True``
Returns:
A copy of the input batch with token dropout applied
"""
if training and p > 0:
# This creates a mask that only considers unpadded tokens for mapping to oov
padding_mask = tokens.new_ones(tokens.size(), dtype=torch.bool)
for pad in exclude_tokens:
padding_mask &= (tokens != pad)
# Create a uniformly random mask selecting either the original words or OOV tokens
dropout_mask = (tokens.new_empty(tokens.size(), dtype=torch.float).uniform_() < p)
oov_mask = dropout_mask & padding_mask
oov_fill = tokens.new_empty(tokens.size(), dtype=torch.long).fill_(oov_token)
result = torch.where(oov_mask, oov_fill, tokens)
return result
else:
return tokens
def forward(self, tokens: torch.LongTensor) -> torch.LongTensor:
return self.token_dropout(tokens, self.oov_token, self.exclude, self.p, self.training)
class SharedDropout(nn.Module):
def __init__(self, p=0.5, batch_first=True):
super(SharedDropout, self).__init__()
self.p = p
self.batch_first = batch_first
def extra_repr(self):
s = f"p={self.p}"
if self.batch_first:
s += f", batch_first={self.batch_first}"
return s
def forward(self, x):
if self.training:
if self.batch_first:
mask = self.get_mask(x[:, 0], self.p)
else:
mask = self.get_mask(x[0], self.p)
x *= mask.unsqueeze(1) if self.batch_first else mask
return x
@staticmethod
def get_mask(x, p):
mask = x.new_empty(x.shape).bernoulli_(1 - p)
mask = mask / (1 - p)
return mask
class IndependentDropout(nn.Module):
def __init__(self, p=0.5):
r"""
For :math:`N` tensors, a different dropout mask is sampled for each.
When :math:`N-M` of them are dropped, the remaining :math:`M` ones are scaled by a factor of :math:`N/M` to compensate,
and when all of them are dropped together, zeros are returned.
Copied from https://github.com/yzhangcs/parser/master/supar/modules/dropout.py.
Args:
p (float):
The probability of an element to be zeroed. Default: 0.5.
Examples:
>>> x, y = torch.ones(1, 3, 5), torch.ones(1, 3, 5)
>>> x, y = IndependentDropout()(x, y)
>>> x
tensor([[[1., 1., 1., 1., 1.],
[0., 0., 0., 0., 0.],
[2., 2., 2., 2., 2.]]])
>>> y
tensor([[[1., 1., 1., 1., 1.],
[2., 2., 2., 2., 2.],
[0., 0., 0., 0., 0.]]])
"""
super(IndependentDropout, self).__init__()
self.p = p
def extra_repr(self):
return f"p={self.p}"
def forward(self, *items):
if self.training:
masks = [x.new_empty(x.shape[:2]).bernoulli_(1 - self.p)
for x in items]
total = sum(masks)
scale = len(items) / total.max(torch.ones_like(total))
masks = [mask * scale for mask in masks]
items = [item * mask.unsqueeze(dim=-1)
for item, mask in zip(items, masks)]
return items
class LockedDropout(nn.Module):
def __init__(self, dropout_rate=0.5):
super(LockedDropout, self).__init__()
self.dropout_rate = dropout_rate
def forward(self, x):
if not self.training or not self.dropout_rate:
return x
if x.dim() == 3:
mask = x.new(x.size(0), 1, x.size(2)).bernoulli_(1 - self.dropout_rate) / (1 - self.dropout_rate)
mask = mask.expand_as(x)
elif x.dim() == 2:
mask = torch.empty_like(x).bernoulli_(1 - self.dropout_rate) / (1 - self.dropout_rate)
else:
raise ValueError(f'Unsupported dim: {x.dim()}. Only 2d (T,C) or 3d (B,T,C) is supported')
return mask * x
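# Minimal usage sketch (editor's addition): WordDropout remaps ~20% of the
# non-excluded token ids to the oov id during training; LockedDropout reuses
# one mask across all timesteps of a (B, T, C) tensor.
if __name__ == '__main__':
    torch.manual_seed(0)
    word_drop = WordDropout(p=0.2, oov_token=1, exclude_tokens=[0])
    word_drop.train()
    ids = torch.tensor([[5, 6, 7, 0, 0], [8, 9, 10, 11, 0]])
    print(word_drop(ids))  # some ids in 5..11 become 1; padding 0 is kept
    locked = LockedDropout(dropout_rate=0.5)
    locked.train()
    x = torch.ones(2, 4, 3)
    print(locked(x))  # each (batch, channel) slot is zeroed or scaled for all 4 steps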
================================================
FILE: hanlp/layers/embeddings/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-08-24 21:48
================================================
FILE: hanlp/layers/embeddings/char_cnn.py
================================================
# Adopted from https://github.com/allenai/allennlp under Apache Licence 2.0.
# Changed the packaging and created a subclass CharCNNEmbedding
from typing import Union, Tuple, Optional, Callable
import torch
from torch import nn
from hanlp.layers.cnn_encoder import CnnEncoder
from hanlp.layers.time_distributed import TimeDistributed
from hanlp_common.configurable import AutoConfigurable
from hanlp.common.transform import VocabDict, ToChar
from hanlp.common.vocab import Vocab
from hanlp.layers.embeddings.embedding import EmbeddingDim, Embedding
class CharCNN(nn.Module):
def __init__(self,
field: str,
embed: Union[int, Embedding], num_filters: int,
ngram_filter_sizes: Tuple[int, ...] = (2, 3, 4, 5),
conv_layer_activation: str = 'ReLU',
output_dim: Optional[int] = None,
vocab_size=None) -> None:
"""A `CnnEncoder` is a combination of multiple convolution layers and max pooling layers.
The input to this module is of shape `(batch_size, num_tokens,
input_dim)`, and the output is of shape `(batch_size, output_dim)`.
The CNN has one convolution layer for each ngram filter size. Each convolution operation gives
out a vector of size num_filters. The number of times a convolution layer will be used
is `num_tokens - ngram_size + 1`. The corresponding maxpooling layer aggregates all these
outputs from the convolution layer and outputs the max.
This operation is repeated for every ngram size passed, and consequently the dimensionality of
the output after maxpooling is `len(ngram_filter_sizes) * num_filters`. This then gets
(optionally) projected down to a lower dimensional output, specified by `output_dim`.
We then use a fully connected layer to project it back to the desired output_dim. For more
details, refer to "A Sensitivity Analysis of (and Practitioners’ Guide to) Convolutional Neural
Networks for Sentence Classification", Zhang and Wallace 2016, particularly Figure 1.
See allennlp.modules.seq2vec_encoders.cnn_encoder.CnnEncoder, Apache 2.0
Args:
field: The field in samples this encoder will work on.
embed: An ``Embedding`` object or the feature size to create an ``Embedding`` object.
num_filters: This is the output dim for each convolutional layer, which is the number of "filters"
learned by that layer.
ngram_filter_sizes: This specifies both the number of convolutional layers we will create and their sizes. The
default of `(2, 3, 4, 5)` will have four convolutional layers, corresponding to encoding
ngrams of size 2 to 5 with some number of filters.
conv_layer_activation: Name of the `torch.nn` activation class to apply after the
convolution layers (default=`'ReLU'`).
output_dim: After doing convolutions and pooling, we'll project the collected features into a vector of
this size. If this value is `None`, we will just return the result of the max pooling,
giving an output of shape `len(ngram_filter_sizes) * num_filters`.
vocab_size: The size of character vocab.
Returns:
A tensor of shape `(batch_size, output_dim)`.
"""
super().__init__()
EmbeddingDim.__init__(self)
# the embedding layer
if isinstance(embed, int):
embed = nn.Embedding(num_embeddings=vocab_size,
embedding_dim=embed)
elif not isinstance(embed, nn.Module):
raise ValueError(f'Unrecognized type for {embed}')
self.field = field
self.embed = TimeDistributed(embed)
self.encoder = TimeDistributed(
CnnEncoder(embed.embedding_dim, num_filters, ngram_filter_sizes, conv_layer_activation, output_dim))
self.embedding_dim = output_dim or num_filters * len(ngram_filter_sizes)
def forward(self, batch: dict, **kwargs):
tokens: torch.Tensor = batch[f'{self.field}_char_id']
mask = tokens.ge(0)
x = self.embed(tokens)
return self.encoder(x, mask)
def get_output_dim(self) -> int:
return self.embedding_dim
class CharCNNEmbedding(Embedding, AutoConfigurable):
def __init__(self,
field,
embed: Union[int, Embedding],
num_filters: int,
ngram_filter_sizes: Tuple[int, ...] = (2, 3, 4, 5),
conv_layer_activation: str = 'ReLU',
output_dim: Optional[int] = None,
min_word_length=None
) -> None:
"""
Args:
field: The character field in samples this encoder will work on.
embed: An ``Embedding`` object or the feature size to create an ``Embedding`` object.
num_filters: This is the output dim for each convolutional layer, which is the number of "filters"
learned by that layer.
ngram_filter_sizes: This specifies both the number of convolutional layers we will create and their sizes. The
default of `(2, 3, 4, 5)` will have four convolutional layers, corresponding to encoding
ngrams of size 2 to 5 with some number of filters.
conv_layer_activation: Name of the `torch.nn` activation class to apply after the
convolution layers (default=`'ReLU'`).
output_dim: After doing convolutions and pooling, we'll project the collected features into a vector of
this size. If this value is `None`, we will just return the result of the max pooling,
giving an output of shape `len(ngram_filter_sizes) * num_filters`.
min_word_length: Words are padded to at least this many characters so that the largest
ngram filter can be applied; defaults to ``max(ngram_filter_sizes)``.
"""
super().__init__()
if min_word_length is None:
min_word_length = max(ngram_filter_sizes)
self.min_word_length = min_word_length
self.output_dim = output_dim
self.conv_layer_activation = conv_layer_activation
self.ngram_filter_sizes = ngram_filter_sizes
self.num_filters = num_filters
self.embed = embed
self.field = field
def transform(self, vocabs: VocabDict, **kwargs) -> Optional[Callable]:
if isinstance(self.embed, Embedding):
self.embed.transform(vocabs=vocabs)
vocab_name = self.vocab_name
if vocab_name not in vocabs:
vocabs[vocab_name] = Vocab()
return ToChar(self.field, vocab_name, min_word_length=self.min_word_length,
pad=vocabs[vocab_name].safe_pad_token)
@property
def vocab_name(self):
vocab_name = f'{self.field}_char'
return vocab_name
def module(self, vocabs: VocabDict, **kwargs) -> Optional[nn.Module]:
embed = self.embed
if isinstance(embed, Embedding):
embed = embed.module(vocabs=vocabs)
return CharCNN(self.field,
embed,
self.num_filters,
self.ngram_filter_sizes,
self.conv_layer_activation,
self.output_dim,
vocab_size=len(vocabs[self.vocab_name]))
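# Minimal usage sketch (editor's addition): run CharCNN over a fake batch of
# 2 sentences, 5 tokens each, at most 6 characters per token. This assumes
# TimeDistributed reshapes the extra batch dimension as in AllenNLP, so each
# token's character sequence is encoded independently.
if __name__ == '__main__':
    char_cnn = CharCNN(field='token', embed=32, num_filters=8,
                       ngram_filter_sizes=(2, 3), vocab_size=100)
    batch = {'token_char_id': torch.randint(0, 100, (2, 5, 6))}
    print(char_cnn(batch).shape)  # torch.Size([2, 5, 16]) since output_dim is None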
================================================
FILE: hanlp/layers/embeddings/char_cnn_tf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-20 21:15
from functools import reduce
import tensorflow as tf
from hanlp.common.vocab_tf import VocabTF
from hanlp.utils.tf_util import hanlp_register
@hanlp_register
class CharCNNEmbeddingTF(tf.keras.layers.Layer):
def __init__(self, word_vocab: VocabTF, char_vocab: VocabTF,
char_embedding=100,
kernel_size=3,
filters=50,
dropout=0.5,
trainable=True, name=None, dtype=None, dynamic=False,
**kwargs):
super().__init__(trainable, name, dtype, dynamic, **kwargs)
self.char_embedding = char_embedding
self.filters = filters
self.kernel_size = kernel_size
self.char_vocab = char_vocab
self.word_vocab = word_vocab
self.embedding = tf.keras.layers.Embedding(input_dim=len(self.char_vocab), output_dim=char_embedding,
trainable=True, mask_zero=True)
self.dropout = tf.keras.layers.Dropout(dropout)
self.cnn = tf.keras.layers.Conv1D(filters, kernel_size, padding='same')
def call(self, inputs: tf.Tensor, **kwargs):
mask = tf.not_equal(inputs, self.word_vocab.pad_token)
inputs = tf.ragged.boolean_mask(inputs, mask)
chars = tf.strings.unicode_split(inputs, input_encoding='UTF-8')
chars = chars.to_tensor(default_value=self.char_vocab.pad_token)
chars = self.char_vocab.lookup(chars)
embed = self.embedding(chars)
weights = embed._keras_mask
embed = self.dropout(embed)
features = masked_conv1d_and_max(embed, weights, self.cnn)
features._keras_mask = mask
return features
def compute_output_shape(self, input_shape):
return super().compute_output_shape(input_shape)
def get_config(self):
config = {
'char_embedding': self.char_embedding,
'kernel_size': self.kernel_size,
'filters': self.filters,
'dropout': self.dropout.rate,
}
base_config = super(CharCNNEmbeddingTF, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
def masked_conv1d_and_max(t, weights, conv1d):
"""Applies 1d convolution and a masked max-pooling
https://github.com/guillaumegenthial/tf_ner/blob/master/models/chars_conv_lstm_crf/masked_conv.py
Args:
t(tf.Tensor): A tensor with at least 3 dimensions [d1, d2, ..., dn-1, dn]
weights(tf.Tensor of tf.bool): A Tensor of shape [d1, d2, ..., dn-1]
conv1d(tf.keras.layers.Conv1D): the convolution layer applied along the dn-1 axis;
its ``filters`` attribute determines the output feature size
Returns:
A tensor of shape [d1, d2, ..., dn-2, filters]
"""
# Get shape and parameters
shape = tf.shape(t)
ndims = t.shape.ndims
dim1 = reduce(lambda x, y: x * y, [shape[i] for i in range(ndims - 2)])
dim2 = shape[-2]
dim3 = t.shape[-1]
# Reshape weights
weights = tf.reshape(weights, shape=[dim1, dim2, 1])
weights = tf.cast(weights, tf.float32)
# Reshape input and apply weights
flat_shape = [dim1, dim2, dim3]
t = tf.reshape(t, shape=flat_shape)
t *= weights
# Apply convolution
t_conv = conv1d(t)
t_conv *= weights
# Reduce max -- set to zero if all padded
t_conv += (1. - weights) * tf.reduce_min(t_conv, axis=-2, keepdims=True)
t_max = tf.reduce_max(t_conv, axis=-2)
# Reshape the output
final_shape = [shape[i] for i in range(ndims - 2)] + [conv1d.filters]
t_max = tf.reshape(t_max, shape=final_shape)
return t_max
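# Minimal usage sketch (editor's addition): apply the masked convolution and
# max-pooling helper to a [batch, tokens, chars, embed] tensor; padded char
# positions are zeroed before and after the convolution so they never win the max.
if __name__ == '__main__':
    t = tf.random.normal([2, 5, 6, 16])
    weights = tf.sequence_mask([[6, 3, 1, 0, 0], [4, 6, 6, 1, 0]], maxlen=6)
    conv = tf.keras.layers.Conv1D(filters=8, kernel_size=3, padding='same')
    print(masked_conv1d_and_max(t, weights, conv).shape)  # (2, 5, 8)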
================================================
FILE: hanlp/layers/embeddings/char_rnn.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-06-02 23:49
from typing import Optional, Callable, Union
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence
from hanlp_common.configurable import AutoConfigurable
from hanlp.common.transform import VocabDict, ToChar
from hanlp.common.vocab import Vocab
from hanlp.layers.embeddings.embedding import Embedding, EmbeddingDim
class CharRNN(nn.Module, EmbeddingDim):
def __init__(self,
field,
vocab_size,
embed: Union[int, nn.Embedding],
hidden_size):
"""Character level RNN embedding module.
Args:
field: The field in samples this encoder will work on.
vocab_size: The size of character vocab.
embed: An ``Embedding`` object or the feature size to create an ``Embedding`` object.
hidden_size: The hidden size of RNNs.
"""
super(CharRNN, self).__init__()
self.field = field
# the embedding layer
if isinstance(embed, int):
self.embed = nn.Embedding(num_embeddings=vocab_size,
embedding_dim=embed)
elif isinstance(embed, nn.Module):
self.embed = embed
embed = embed.embedding_dim
        else:
            raise ValueError(f'Unrecognized type of embed: {type(embed)}')
# the lstm layer
self.lstm = nn.LSTM(input_size=embed,
hidden_size=hidden_size,
batch_first=True,
bidirectional=True)
def forward(self, batch, mask, **kwargs):
x = batch[f'{self.field}_char_id']
# [batch_size, seq_len, fix_len]
mask = x.ne(0)
# [batch_size, seq_len]
lens = mask.sum(-1)
char_mask = lens.gt(0)
# [n, fix_len, n_embed]
        # Look up char embeddings of non-empty words; ``char_mask`` is applied exactly once
        x = self.embed(batch)[char_mask] if isinstance(self.embed, EmbeddingDim) else self.embed(x[char_mask])
        x = pack_padded_sequence(x, lens[char_mask].cpu(), True, False)
x, (h, _) = self.lstm(x)
# [n, fix_len, n_out]
h = torch.cat(torch.unbind(h), -1)
# [batch_size, seq_len, n_out]
embed = h.new_zeros(*lens.shape, h.size(-1))
embed = embed.masked_scatter_(char_mask.unsqueeze(-1), h)
return embed
@property
def embedding_dim(self) -> int:
return self.lstm.hidden_size * 2
class CharRNNEmbedding(Embedding, AutoConfigurable):
def __init__(self,
field,
embed,
hidden_size,
max_word_length=None) -> None:
"""Character level RNN embedding module builder.
Args:
field: The field in samples this encoder will work on.
embed: An ``Embedding`` object or the feature size to create an ``Embedding`` object.
hidden_size: The hidden size of RNNs.
max_word_length: Character sequence longer than ``max_word_length`` will be truncated.
"""
super().__init__()
self.field = field
self.hidden_size = hidden_size
self.embed = embed
self.max_word_length = max_word_length
def transform(self, vocabs: VocabDict, **kwargs) -> Optional[Callable]:
if isinstance(self.embed, Embedding):
self.embed.transform(vocabs=vocabs)
vocab_name = self.vocab_name
if vocab_name not in vocabs:
vocabs[vocab_name] = Vocab()
return ToChar(self.field, vocab_name, max_word_length=self.max_word_length)
@property
def vocab_name(self):
vocab_name = f'{self.field}_char'
return vocab_name
def module(self, vocabs: VocabDict, **kwargs) -> Optional[nn.Module]:
embed = self.embed
if isinstance(self.embed, Embedding):
embed = self.embed.module(vocabs=vocabs)
return CharRNN(self.field, len(vocabs[self.vocab_name]), embed, self.hidden_size)
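# A minimal usage sketch (not part of the original API) of the shape contract of ``CharRNN``:
# char ids of shape [batch, seq_len, fix_len] come in, word-level features of shape
# [batch, seq_len, 2 * hidden_size] come out. Sizes and the 'token' field are assumptions.
def _demo_char_rnn():
    module = CharRNN('token', vocab_size=10, embed=8, hidden_size=6)
    batch = {'token_char_id': torch.tensor([[[1, 2, 0], [3, 0, 0]],
                                            [[4, 5, 6], [0, 0, 0]]])}  # 0 is char padding
    out = module(batch, mask=None)
    assert out.shape == (2, 2, 12)  # 12 == 2 directions * hidden_size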
================================================
FILE: hanlp/layers/embeddings/char_rnn_tf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-20 17:02
import tensorflow as tf
from hanlp.common.vocab_tf import VocabTF
from hanlp.utils.tf_util import hanlp_register
@hanlp_register
class CharRNNEmbeddingTF(tf.keras.layers.Layer):
def __init__(self, word_vocab: VocabTF, char_vocab: VocabTF,
char_embedding=100,
char_rnn_units=25,
dropout=0.5,
trainable=True, name=None, dtype=None, dynamic=False,
**kwargs):
super().__init__(trainable, name, dtype, dynamic, **kwargs)
self.char_embedding = char_embedding
self.char_rnn_units = char_rnn_units
self.char_vocab = char_vocab
self.word_vocab = word_vocab
self.embedding = tf.keras.layers.Embedding(input_dim=len(self.char_vocab), output_dim=char_embedding,
trainable=True, mask_zero=True)
self.dropout = tf.keras.layers.Dropout(dropout)
self.rnn = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=char_rnn_units,
return_state=True), name='bilstm')
def call(self, inputs: tf.Tensor, **kwargs):
mask = tf.not_equal(inputs, self.word_vocab.pad_token)
inputs = tf.ragged.boolean_mask(inputs, mask)
chars = tf.strings.unicode_split(inputs, input_encoding='UTF-8')
chars = chars.to_tensor(default_value=self.char_vocab.pad_token)
chars = self.char_vocab.lookup(chars)
embed = self.embedding(chars)
char_mask = embed._keras_mask
embed = self.dropout(embed)
embed_shape = tf.shape(embed)
embed = tf.reshape(embed, [-1, embed_shape[2], embed_shape[3]])
char_mask = tf.reshape(char_mask, [-1, embed_shape[2]])
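        # A word made entirely of padding would hand the LSTM an all-False mask, which is
        # illegal. The lines below force the first timestep of such rows to True so they run
        # as length-1 sequences; their outputs are discarded later by the word-level `mask`.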
all_zeros = tf.reduce_sum(tf.cast(char_mask, tf.int32), axis=1) == 0
char_mask_shape = tf.shape(char_mask)
hole = tf.zeros(shape=(char_mask_shape[0], char_mask_shape[1] - 1), dtype=tf.bool)
all_zeros = tf.expand_dims(all_zeros, -1)
non_all_zeros = tf.concat([all_zeros, hole], axis=1)
char_mask = tf.logical_or(char_mask, non_all_zeros)
output, h_fw, c_fw, h_bw, c_bw = self.rnn(embed, mask=char_mask)
hidden = tf.concat([h_fw, h_bw], axis=-1)
# hidden = output
hidden = tf.reshape(hidden, [embed_shape[0], embed_shape[1], -1])
hidden._keras_mask = mask
return hidden
def get_config(self):
config = {
'char_embedding': self.char_embedding,
'char_rnn_units': self.char_rnn_units,
'dropout': self.dropout.rate,
}
base_config = super(CharRNNEmbeddingTF, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
================================================
FILE: hanlp/layers/embeddings/concat_embedding.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-20 17:08
import tensorflow as tf
from hanlp.utils.tf_util import hanlp_register, copy_mask
@hanlp_register
class ConcatEmbedding(tf.keras.layers.Layer):
def __init__(self, *embeddings, trainable=True, name=None, dtype=None, dynamic=False, **kwargs):
self.embeddings = []
for embed in embeddings:
embed: tf.keras.layers.Layer = tf.keras.utils.deserialize_keras_object(embed) if isinstance(embed,
dict) else embed
self.embeddings.append(embed)
if embed.trainable:
trainable = True
if embed.dynamic:
dynamic = True
if embed.supports_masking:
self.supports_masking = True
super().__init__(trainable, name, dtype, dynamic, **kwargs)
def build(self, input_shape):
for embed in self.embeddings:
embed.build(input_shape)
super().build(input_shape)
def compute_mask(self, inputs, mask=None):
for embed in self.embeddings:
mask = embed.compute_mask(inputs, mask)
if mask is not None:
return mask
return mask
def call(self, inputs, **kwargs):
embeds = [embed.call(inputs) for embed in self.embeddings]
feature = tf.concat(embeds, axis=-1)
for embed in embeds:
mask = copy_mask(embed, feature)
if mask is not None:
break
return feature
def get_config(self):
config = {
'embeddings': [embed.get_config() for embed in self.embeddings],
}
base_config = super(ConcatEmbedding, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
def compute_output_shape(self, input_shape):
dim = 0
for embed in self.embeddings:
dim += embed.compute_output_shape(input_shape)[-1]
return input_shape + dim
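# A minimal usage sketch (not part of the original API): two toy keras embeddings over the
# same integer inputs are bundled, so the output feature size is the sum of their dims.
# All sizes below are illustrative assumptions.
def _demo_concat_embedding():
    e1 = tf.keras.layers.Embedding(input_dim=10, output_dim=4, mask_zero=True)
    e2 = tf.keras.layers.Embedding(input_dim=10, output_dim=6, mask_zero=True)
    concat = ConcatEmbedding(e1, e2)
    ids = tf.constant([[1, 2, 0]])
    out = concat(ids)
    assert out.shape == (1, 3, 10)  # 10 == 4 + 6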
================================================
FILE: hanlp/layers/embeddings/contextual_string_embedding.py
================================================
# Most code in this file is adapted from flair, which is licensed under:
#
# The MIT License (MIT)
#
# Flair is licensed under the following MIT License (MIT) Copyright © 2018 Zalando SE, https://tech.zalando.com
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
import os
from typing import List, Dict, Callable
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from hanlp_common.configurable import Configurable
from hanlp.common.transform import TransformList, FieldToIndex
from hanlp.common.vocab import Vocab
from hanlp.layers.embeddings.embedding import Embedding, EmbeddingDim
from hanlp.utils.io_util import get_resource
from hanlp.utils.torch_util import pad_lists, batched_index_select
from tests import cdroot
class RNNLanguageModel(nn.Module):
"""Container module with an encoder, a recurrent module, and a decoder."""
def __init__(self,
n_tokens,
is_forward_lm: bool,
hidden_size: int,
embedding_size: int = 100):
super(RNNLanguageModel, self).__init__()
self.is_forward_lm: bool = is_forward_lm
self.n_tokens = n_tokens
self.hidden_size = hidden_size
self.embedding_size = embedding_size
self.encoder = nn.Embedding(n_tokens, embedding_size)
self.rnn = nn.LSTM(embedding_size, hidden_size, batch_first=True)
def forward(self, ids: torch.LongTensor, lens: torch.LongTensor):
emb = self.encoder(ids)
x = pack_padded_sequence(emb, lens, True, False)
x, _ = self.rnn(x)
x, _ = pad_packed_sequence(x, True)
return x
@classmethod
def load_language_model(cls, model_file):
model_file = get_resource(model_file)
state = torch.load(model_file)
model = RNNLanguageModel(state['n_tokens'],
state['is_forward_lm'],
state['hidden_size'],
state['embedding_size'])
model.load_state_dict(state['state_dict'], strict=False)
return model
def save(self, file):
model_state = {
'state_dict': self.state_dict(),
'n_tokens': self.n_tokens,
'is_forward_lm': self.is_forward_lm,
'hidden_size': self.hidden_size,
'embedding_size': self.embedding_size,
}
torch.save(model_state, file, pickle_protocol=4)
class ContextualStringEmbeddingModule(nn.Module, EmbeddingDim):
def __init__(self, field: str, path: str, trainable=False) -> None:
super().__init__()
self.field = field
path = get_resource(path)
f = os.path.join(path, 'forward.pt')
b = os.path.join(path, 'backward.pt')
self.f: RNNLanguageModel = RNNLanguageModel.load_language_model(f)
self.b: RNNLanguageModel = RNNLanguageModel.load_language_model(b)
if not trainable:
for p in self.parameters():
p.requires_grad_(False)
def __call__(self, batch: dict, **kwargs):
args = ['f_char_id', 'f_offset', 'b_char_id', 'b_offset']
keys = [f'{self.field}_{key}' for key in args]
args = [batch[key] for key in keys]
return super().__call__(*args, **kwargs)
@property
def embedding_dim(self):
return self.f.rnn.hidden_size + self.b.rnn.hidden_size
def run_lm(self, lm, ids: torch.Tensor, offsets: torch.LongTensor):
lens = offsets.max(-1)[0] + 1
rnn_output = lm(ids, lens)
return batched_index_select(rnn_output, offsets)
def forward(self,
f_chars_id: torch.Tensor,
f_offset: torch.LongTensor,
b_chars_id: torch.Tensor,
b_offset: torch.LongTensor, **kwargs):
f = self.run_lm(self.f, f_chars_id, f_offset)
b = self.run_lm(self.b, b_chars_id, b_offset)
return torch.cat([f, b], dim=-1)
def embed(self, sents: List[List[str]], vocab: Dict[str, int]):
f_chars, f_offsets = [], []
b_chars, b_offsets = [], []
transform = ContextualStringEmbeddingTransform('token')
for tokens in sents:
sample = transform({'token': tokens})
for each, name in zip([f_chars, b_chars, f_offsets, b_offsets],
                                  'f_char, b_char, f_offset, b_offset'.split(', ')):
each.append(sample[f'token_{name}'])
f_ids = []
for cs in f_chars:
f_ids.append([vocab[c] for c in cs])
f_ids = pad_lists(f_ids)
f_offsets = pad_lists(f_offsets)
b_ids = []
for cs in b_chars:
b_ids.append([vocab[c] for c in cs])
b_ids = pad_lists(b_ids)
b_offsets = pad_lists(b_offsets)
return self.forward(f_ids, f_offsets, b_ids, b_offsets)
class ContextualStringEmbeddingTransform(Configurable):
def __init__(self, src: str) -> None:
self.src = src
def __call__(self, sample: dict):
tokens = sample[self.src]
f_o = []
b_o = []
sentence_text = ' '.join(tokens)
end_marker = ' '
extra_offset = 1
# f
input_text = '\n' + sentence_text + end_marker
f_chars = list(input_text)
# b
sentence_text = sentence_text[::-1]
input_text = '\n' + sentence_text + end_marker
b_chars = list(input_text)
offset_forward: int = extra_offset
offset_backward: int = len(sentence_text) + extra_offset
for token in tokens:
offset_forward += len(token)
f_o.append(offset_forward)
b_o.append(offset_backward)
            # +1 for the whitespace after each token, since the LM reads space-joined text
offset_forward += 1
offset_backward -= 1
offset_backward -= len(token)
sample[f'{self.src}_f_char'] = f_chars
sample[f'{self.src}_b_char'] = b_chars
sample[f'{self.src}_f_offset'] = f_o
sample[f'{self.src}_b_offset'] = b_o
return sample
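# A minimal sketch (not part of the original API) of what the transform above produces.
# The forward LM reads '\n' + space-joined text + ' '; each forward offset points at the
# character right after its token, each backward offset at the mirrored position.
def _demo_contextual_string_transform():
    sample = ContextualStringEmbeddingTransform('token')({'token': ['I', 'love']})
    assert ''.join(sample['token_f_char']) == '\nI love '
    assert sample['token_f_offset'] == [2, 7]
    assert sample['token_b_offset'] == [7, 5]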
class ContextualStringEmbedding(Embedding):
def __init__(self, field, path, trainable=False) -> None:
super().__init__()
self.trainable = trainable
self.path = path
self.field = field
def transform(self, **kwargs) -> Callable:
vocab = Vocab()
vocab.load(os.path.join(get_resource(self.path), 'vocab.json'))
return TransformList(ContextualStringEmbeddingTransform(self.field),
FieldToIndex(f'{self.field}_f_char', vocab),
FieldToIndex(f'{self.field}_b_char', vocab))
def module(self, **kwargs) -> nn.Module:
return ContextualStringEmbeddingModule(self.field, self.path, self.trainable)
def main():
# _validate()
flair = ContextualStringEmbedding('token', 'FASTTEXT_DEBUG_EMBEDDING_EN')
print(flair.config)
def _validate():
cdroot()
flair = ContextualStringEmbeddingModule('token', 'FLAIR_LM_WMT11_EN')
vocab = torch.load('/home/hhe43/flair/item2idx.pt')
vocab = dict((x.decode(), y) for x, y in vocab.items())
# vocab = Vocab(token_to_idx=vocab, pad_token='')
# vocab.lock()
# vocab.summary()
# vocab.save('vocab.json')
tokens = 'I love Berlin .'.split()
sent = ' '.join(tokens)
embed = flair.embed([tokens, tokens], vocab)
gold = torch.load('/home/hhe43/flair/gold.pt')
print(torch.allclose(embed[1, :, :2048], gold, atol=1e-6))
# print(torch.all(torch.eq(embed[1, :, :], gold)))
if __name__ == '__main__':
main()
================================================
FILE: hanlp/layers/embeddings/contextual_string_embedding_tf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-19 03:24
from typing import List
import tensorflow as tf
import numpy as np
from hanlp.components.rnn_language_model_tf import RNNLanguageModel
from hanlp_common.constant import PAD
from hanlp.utils.io_util import get_resource
from hanlp.utils.tf_util import copy_mask, hanlp_register, str_tensor_2d_to_list
from hanlp_common.util import infer_space_after
@hanlp_register
class ContextualStringEmbeddingTF(tf.keras.layers.Layer):
def __init__(self, forward_model_path=None, backward_model_path=None, max_word_len=10,
trainable=False, name=None, dtype=None,
dynamic=True, **kwargs):
        assert dynamic, 'ContextualStringEmbeddingTF works only in eager mode'
super().__init__(trainable, name, dtype, dynamic, **kwargs)
assert any([forward_model_path, backward_model_path]), 'At least one model is required'
self.forward_model_path = forward_model_path
self.backward_model_path = backward_model_path
self.forward_model = self._load_lm(forward_model_path) if forward_model_path else None
self.backward_model = self._load_lm(backward_model_path) if backward_model_path else None
if trainable:
self._fw = self.forward_model.model
self._bw = self.backward_model.model
for m in self._fw, self._bw:
m.trainable = True
self.supports_masking = True
self.max_word_len = max_word_len
def call(self, inputs, **kwargs):
str_inputs = str_tensor_2d_to_list(inputs)
outputs = self.embed(str_inputs)
copy_mask(inputs, outputs)
return outputs
def _load_lm(self, filepath):
filepath = get_resource(filepath)
lm = RNNLanguageModel()
lm.load(filepath)
model: tf.keras.Sequential = lm.model
for idx, layer in enumerate(model.layers):
if isinstance(layer, tf.keras.layers.LSTM):
lm.model = tf.keras.Sequential(model.layers[:idx + 1]) # discard dense layer
return lm
def embed(self, texts: List[List[str]]):
"""Embedding sentences (list of words) with contextualized string embedding
Args:
texts: List of words, not chars
texts: List[List[str]]:
Returns:
"""
fw = None
if self.forward_model:
fw = self._run_rnn(texts, model=self.forward_model)
bw = None
if self.backward_model:
bw = self._run_rnn(texts, model=self.backward_model)
        if fw is not None and bw is not None:
            return tf.concat([fw, bw], axis=-1)
        return fw if fw is not None else bw
def _run_rnn(self, texts, model):
embeddings = []
inputs = []
offsets = []
tokenizer = model.transform.tokenize_func()
backward = not model.config['forward']
for sent in texts:
raw, off = self._get_raw_string(sent, tokenizer)
inputs.append(raw)
offsets.append(off)
outputs = model.model_from_config.predict(model.transform.inputs_to_dataset(inputs))
if backward:
outputs = tf.reverse(outputs, axis=[1])
maxlen = len(max(texts, key=len))
for hidden, off, sent in zip(outputs, offsets, texts):
embed = []
for (start, end), word in zip(off, sent):
embed.append(hidden[end - 1, :])
if len(embed) < maxlen:
embed += [np.zeros_like(embed[-1])] * (maxlen - len(embed))
embeddings.append(np.stack(embed))
return tf.stack(embeddings)
def _get_raw_string(self, sent: List[str], tokenizer):
raw_string = []
offsets = []
whitespace_after = infer_space_after(sent)
start = 0
for word, space in zip(sent, whitespace_after):
chars = tokenizer(word)
chars = chars[:self.max_word_len]
if space:
chars += [' ']
end = start + len(chars)
offsets.append((start, end))
start = end
raw_string += chars
return raw_string, offsets
def get_config(self):
config = {
'forward_model_path': self.forward_model_path,
'backward_model_path': self.backward_model_path,
'max_word_len': self.max_word_len,
}
base_config = super(ContextualStringEmbeddingTF, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
@property
def output_dim(self):
dim = 0
for model in self.forward_model, self.backward_model:
if model:
dim += model.config['rnn_units']
return dim
def compute_output_shape(self, input_shape):
return input_shape + self.output_dim
def compute_mask(self, inputs, mask=None):
return tf.not_equal(inputs, PAD)
================================================
FILE: hanlp/layers/embeddings/contextual_word_embedding.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-07-05 13:50
from typing import Optional, Union, List, Any, Dict, Tuple
import torch
from torch import nn
from hanlp_common.configurable import AutoConfigurable
from hanlp.layers.embeddings.embedding import Embedding
from hanlp.layers.scalar_mix import ScalarMixWithDropoutBuilder
from hanlp.layers.transformers.encoder import TransformerEncoder
from hanlp.layers.transformers.pt_imports import PreTrainedTokenizer, AutoConfig_, AutoTokenizer_
from hanlp.transform.transformer_tokenizer import TransformerSequenceTokenizer
class ContextualWordEmbeddingModule(TransformerEncoder):
def __init__(self,
field: str,
transformer: str,
transformer_tokenizer: PreTrainedTokenizer,
average_subwords=False,
scalar_mix: Union[ScalarMixWithDropoutBuilder, int] = None,
word_dropout=None,
max_sequence_length=None,
ret_raw_hidden_states=False,
transformer_args: Dict[str, Any] = None,
trainable=True,
training=True) -> None:
"""A contextualized word embedding module.
Args:
field: The field to work on. Usually some token fields.
transformer: An identifier of a ``PreTrainedModel``.
            transformer_tokenizer: A ``PreTrainedTokenizer`` used to tokenize the input.
average_subwords: ``True`` to average subword representations.
scalar_mix: Layer attention.
word_dropout: Dropout rate of randomly replacing a subword with MASK.
            max_sequence_length: The maximum sequence length. Sequences longer than this will be handled by a
                sliding window.
ret_raw_hidden_states: ``True`` to return hidden states of each layer.
transformer_args: Extra arguments passed to the transformer.
trainable: ``False`` to use static embeddings.
training: ``False`` to skip loading weights from pre-trained transformers.
"""
super().__init__(transformer, transformer_tokenizer, average_subwords, scalar_mix, word_dropout,
max_sequence_length, ret_raw_hidden_states, transformer_args, trainable,
training)
self.field = field
# noinspection PyMethodOverriding
# noinspection PyTypeChecker
def forward(self, batch: dict, mask=None, **kwargs):
input_ids: torch.LongTensor = batch[f'{self.field}_input_ids']
token_span: torch.LongTensor = batch.get(f'{self.field}_token_span', None)
# input_device = input_ids.device
# this_device = self.get_device()
# if input_device != this_device:
# input_ids = input_ids.to(this_device)
# token_span = token_span.to(this_device)
# We might want to apply mask here
output: Union[torch.Tensor, List[torch.Tensor]] = super().forward(input_ids, token_span=token_span, **kwargs)
# if input_device != this_device:
# if isinstance(output, torch.Tensor):
# output = output.to(input_device)
# else:
# output = [x.to(input_device) for x in output]
return output
def get_output_dim(self):
return self.transformer.config.hidden_size
def get_device(self):
device: torch.device = next(self.parameters()).device
return device
class ContextualWordEmbedding(Embedding, AutoConfigurable):
def __init__(self, field: str,
transformer: str,
average_subwords=False,
scalar_mix: Union[ScalarMixWithDropoutBuilder, int] = None,
word_dropout: Optional[Union[float, Tuple[float, str]]] = None,
max_sequence_length=None,
truncate_long_sequences=False,
cls_is_bos=False,
sep_is_eos=False,
ret_token_span=True,
ret_subtokens=False,
ret_subtokens_group=False,
ret_prefix_mask=False,
ret_raw_hidden_states=False,
transformer_args: Dict[str, Any] = None,
use_fast=True,
do_basic_tokenize=True,
trainable=True) -> None:
"""A contextual word embedding builder which builds a
:class:`~hanlp.layers.embeddings.contextual_word_embedding.ContextualWordEmbeddingModule` and a
:class:`~hanlp.transform.transformer_tokenizer.TransformerSequenceTokenizer`.
Args:
field: The field to work on. Usually some token fields.
transformer: An identifier of a ``PreTrainedModel``.
average_subwords: ``True`` to average subword representations.
scalar_mix: Layer attention.
word_dropout: Dropout rate of randomly replacing a subword with MASK.
            max_sequence_length: The maximum sequence length. Sequences longer than this will be handled by a
                sliding window.
            truncate_long_sequences: ``True`` to truncate sequences longer than ``max_sequence_length`` instead of
                splitting them with a sliding window.
cls_is_bos: ``True`` means the first token of input is treated as [CLS] no matter what its surface form is.
``False`` (default) means the first token is not [CLS], it will have its own embedding other than
the embedding of [CLS].
sep_is_eos: ``True`` means the last token of input is [SEP].
``False`` means it's not but [SEP] will be appended,
                ``None`` means it depends on ``input[-1] == [EOS]``.
ret_token_span: ``True`` to return span of each token measured by subtoken offsets.
ret_subtokens: ``True`` to return list of subtokens belonging to each token.
ret_subtokens_group: ``True`` to return list of offsets of subtokens belonging to each token.
ret_prefix_mask: ``True`` to generate a mask where each non-zero element corresponds to a prefix of a token.
ret_raw_hidden_states: ``True`` to return hidden states of each layer.
transformer_args: Extra arguments passed to the transformer.
use_fast: Whether or not to try to load the fast version of the tokenizer.
do_basic_tokenize: Whether to do basic tokenization before wordpiece.
trainable: ``False`` to use static embeddings.
"""
super().__init__()
self.truncate_long_sequences = truncate_long_sequences
self.transformer_args = transformer_args
self.trainable = trainable
self.ret_subtokens_group = ret_subtokens_group
self.ret_subtokens = ret_subtokens
self.ret_raw_hidden_states = ret_raw_hidden_states
self.sep_is_eos = sep_is_eos
self.cls_is_bos = cls_is_bos
self.max_sequence_length = max_sequence_length
self.word_dropout = word_dropout
self.scalar_mix = scalar_mix
self.average_subwords = average_subwords
self.transformer = transformer
self.field = field
self._transformer_tokenizer = AutoTokenizer_.from_pretrained(self.transformer,
use_fast=use_fast,
do_basic_tokenize=do_basic_tokenize)
self._tokenizer_transform = TransformerSequenceTokenizer(self._transformer_tokenizer,
field,
truncate_long_sequences=truncate_long_sequences,
ret_prefix_mask=ret_prefix_mask,
ret_token_span=ret_token_span,
cls_is_bos=cls_is_bos,
sep_is_eos=sep_is_eos,
ret_subtokens=ret_subtokens,
ret_subtokens_group=ret_subtokens_group,
max_seq_length=self.max_sequence_length
)
def transform(self, **kwargs) -> TransformerSequenceTokenizer:
return self._tokenizer_transform
def module(self, training=True, **kwargs) -> Optional[nn.Module]:
return ContextualWordEmbeddingModule(self.field,
self.transformer,
self._transformer_tokenizer,
self.average_subwords,
self.scalar_mix,
self.word_dropout,
self.max_sequence_length,
self.ret_raw_hidden_states,
self.transformer_args,
self.trainable,
training=training)
def get_output_dim(self):
config = AutoConfig_.from_pretrained(self.transformer)
return config.hidden_size
def get_tokenizer(self):
return self._transformer_tokenizer
def find_transformer(embed: nn.Module):
if isinstance(embed, ContextualWordEmbeddingModule):
return embed
if isinstance(embed, nn.ModuleList):
for child in embed:
found = find_transformer(child)
if found:
return found
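# A minimal usage sketch (not part of the original API). 'bert-base-chinese' is an illustrative
# identifier and loading it requires network access; the field name 'token' is an assumption.
def _demo_contextual_word_embedding():
    embed = ContextualWordEmbedding('token', 'bert-base-chinese', average_subwords=True)
    sample = embed.transform()({'token': ['商品', '和', '服务']})
    # The tokenizer transform adds subword ids and per-token subword spans for the module
    assert 'token_input_ids' in sample and 'token_token_span' in sample
    assert embed.get_output_dim() == 768  # hidden size of the BERT base config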
================================================
FILE: hanlp/layers/embeddings/embedding.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-06-02 13:04
from abc import ABC, abstractmethod
from typing import Callable, List, Optional, Iterable
import torch
from torch import nn
from torch.nn import Module
from hanlp_common.configurable import AutoConfigurable
from hanlp.common.transform import TransformList
from hanlp.layers.dropout import IndependentDropout
class EmbeddingDim(ABC):
@property
@abstractmethod
def embedding_dim(self) -> int:
return -1
def get_output_dim(self) -> int:
return self.embedding_dim
class Embedding(AutoConfigurable, ABC):
def __init__(self) -> None:
"""
Base class for embedding builders.
"""
super().__init__()
def transform(self, **kwargs) -> Optional[Callable]:
"""Build a transform function for this embedding.
Args:
**kwargs: Containing vocabs, training etc. Not finalized for now.
Returns:
A transform function.
"""
return None
def module(self, **kwargs) -> Optional[nn.Module]:
"""Build a module for this embedding.
Args:
**kwargs: Containing vocabs, training etc. Not finalized for now.
Returns:
A module.
"""
return None
class ConcatModuleList(nn.ModuleList, EmbeddingDim):
def __init__(self, *modules: Optional[Iterable[Module]], dropout=None) -> None:
"""A ``nn.ModuleList`` to bundle several embeddings modules.
Args:
*modules: Embedding layers.
dropout: Dropout applied on the concatenated embedding.
"""
super().__init__(*modules)
if dropout:
dropout = IndependentDropout(p=dropout)
self.dropout = dropout
@property
def embedding_dim(self) -> int:
return sum(embed.embedding_dim for embed in self)
def get_output_dim(self) -> int:
return sum(embed.get_output_dim() for embed in self)
# noinspection PyMethodOverriding
def forward(self, batch: dict, **kwargs):
embeds = [embed(batch, **kwargs) for embed in self.embeddings]
if self.dropout:
embeds = self.dropout(*embeds)
return torch.cat(embeds, -1)
@property
def embeddings(self):
embeddings = [x for x in self]
if self.dropout:
embeddings.remove(self.dropout)
return embeddings
class EmbeddingList(Embedding):
def __init__(self, *embeddings_, embeddings: dict = None, dropout=None) -> None:
"""An embedding builder to bundle several embedding builders.
Args:
*embeddings_: A list of embedding builders.
embeddings: Deserialization for a dict of embedding builders.
dropout: Dropout applied on the concatenated embedding.
"""
# noinspection PyTypeChecker
self.dropout = dropout
self._embeddings: List[Embedding] = list(embeddings_)
if embeddings:
for each in embeddings:
if isinstance(each, dict):
each = AutoConfigurable.from_config(each)
self._embeddings.append(each)
self.embeddings = [e.config for e in self._embeddings]
def transform(self, **kwargs):
transforms = [e.transform(**kwargs) for e in self._embeddings]
transforms = [t for t in transforms if t]
return TransformList(*transforms)
def module(self, **kwargs):
modules = [e.module(**kwargs) for e in self._embeddings]
modules = [m for m in modules if m]
return ConcatModuleList(modules, dropout=self.dropout)
def to_list(self):
return self._embeddings
def find_embedding_by_class(embed: Embedding, cls):
if isinstance(embed, cls):
return embed
if isinstance(embed, EmbeddingList):
for child in embed.to_list():
found = find_embedding_by_class(child, cls)
if found:
return found
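# A minimal sketch (not part of the original API): a toy ``EmbeddingDim`` module showing how
# ``ConcatModuleList`` concatenates per-field embeddings along the feature axis. Field names
# and sizes below are illustrative assumptions.
class _ToyEmbed(nn.Module, EmbeddingDim):
    def __init__(self, field: str, dim: int) -> None:
        super().__init__()
        self.field = field
        self.embed = nn.Embedding(10, dim)
    def forward(self, batch: dict, **kwargs):
        return self.embed(batch[self.field])
    @property
    def embedding_dim(self) -> int:
        return self.embed.embedding_dim
def _demo_concat_module_list():
    bundle = ConcatModuleList([_ToyEmbed('a_id', 4), _ToyEmbed('b_id', 6)])
    batch = {'a_id': torch.zeros(2, 3, dtype=torch.long),
             'b_id': torch.ones(2, 3, dtype=torch.long)}
    assert bundle(batch).shape == (2, 3, 10)  # 10 == 4 + 6
    assert bundle.embedding_dim == 10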
================================================
FILE: hanlp/layers/embeddings/fast_text.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-05-27 15:06
import logging
import os
import sys
from typing import Optional, Callable
import fasttext
import torch
from torch import nn
from torch.nn.utils.rnn import pad_sequence
from hanlp_common.configurable import AutoConfigurable
from torch.utils.data import DataLoader
from hanlp.common.dataset import PadSequenceDataLoader, TransformableDataset
from hanlp.common.torch_component import TorchComponent
from hanlp.common.transform import EmbeddingNamedTransform
from hanlp.common.vocab import Vocab
from hanlp.layers.embeddings.embedding import Embedding
from hanlp.utils.io_util import get_resource, stdout_redirected
from hanlp.utils.log_util import flash
class FastTextTransform(EmbeddingNamedTransform):
def __init__(self, filepath: str, src, dst=None, **kwargs) -> None:
if not dst:
dst = src + '_fasttext'
self.filepath = filepath
flash(f'Loading fasttext model {filepath} [blink][yellow]...[/yellow][/blink]')
filepath = get_resource(filepath)
with stdout_redirected(to=os.devnull, stdout=sys.stderr):
self._model = fasttext.load_model(filepath)
flash('')
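        # fastText composes vectors from subwords, so probing with an arbitrary word
        # yields a full-size vector to read the dimensionality from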
output_dim = self._model['king'].size
super().__init__(output_dim, src, dst)
def __call__(self, sample: dict):
word = sample[self.src]
if isinstance(word, str):
vector = self.embed(word)
else:
vector = torch.stack([self.embed(each) for each in word])
sample[self.dst] = vector
return sample
def embed(self, word: str):
return torch.tensor(self._model[word])
class SelectFromBatchModule(torch.nn.Module):
def __init__(self, key) -> None:
super().__init__()
self.key = key
def __call__(self, batch: dict, mask=None, **kwargs):
return batch[self.key]
class FastTextEmbeddingModule(SelectFromBatchModule):
def __init__(self, key, embedding_dim: int) -> None:
"""An embedding layer for fastText (:cite:`bojanowski2017enriching`).
Args:
key: Field name.
embedding_dim: Size of this embedding layer
"""
super().__init__(key)
self.embedding_dim = embedding_dim
def __call__(self, batch: dict, mask=None, **kwargs):
outputs = super().__call__(batch, **kwargs)
outputs = pad_sequence(outputs, True, 0)
if mask is not None:
outputs = outputs.to(mask.device)
return outputs
def __repr__(self):
s = self.__class__.__name__ + '('
s += f'key={self.key}, embedding_dim={self.embedding_dim}'
s += ')'
return s
def get_output_dim(self):
return self.embedding_dim
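# A minimal sketch (not part of the original API): the module just pads the pre-computed
# vectors found in the batch, so no fastText model file is needed here. The key and sizes
# below are illustrative assumptions.
def _demo_fasttext_module():
    module = FastTextEmbeddingModule('token_fasttext', embedding_dim=4)
    batch = {'token_fasttext': [torch.randn(3, 4), torch.randn(2, 4)]}
    out = module(batch)
    assert out.shape == (2, 3, 4)  # padded to the longest sequence in the batch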
class FastTextEmbedding(Embedding, AutoConfigurable):
def __init__(self, src: str, filepath: str) -> None:
"""An embedding layer builder for fastText (:cite:`bojanowski2017enriching`).
Args:
src: Field name.
filepath: Filepath to pretrained fastText embeddings.
"""
super().__init__()
self.src = src
self.filepath = filepath
self._fasttext = FastTextTransform(self.filepath, self.src)
def transform(self, **kwargs) -> Optional[Callable]:
return self._fasttext
def module(self, **kwargs) -> Optional[nn.Module]:
return FastTextEmbeddingModule(self._fasttext.dst, self._fasttext.output_dim)
class FastTextDataset(TransformableDataset):
def load_file(self, filepath: str):
raise NotImplementedError('Not supported.')
class FastTextEmbeddingComponent(TorchComponent):
def __init__(self, **kwargs) -> None:
""" Toy example of Word2VecEmbedding. It simply returns the embedding of a given word
Args:
**kwargs:
"""
super().__init__(**kwargs)
def build_dataloader(self, data, shuffle=False, device=None, logger: logging.Logger = None,
**kwargs) -> DataLoader:
embed: FastTextEmbedding = self.config.embed
dataset = FastTextDataset([{'token': data}], transform=embed.transform())
return PadSequenceDataLoader(dataset, device=device)
def build_optimizer(self, **kwargs):
raise NotImplementedError('Not supported.')
def build_criterion(self, **kwargs):
raise NotImplementedError('Not supported.')
def build_metric(self, **kwargs):
raise NotImplementedError('Not supported.')
def execute_training_loop(self, trn: DataLoader, dev: DataLoader, epochs, criterion, optimizer, metric, save_dir,
logger: logging.Logger, devices, ratio_width=None, **kwargs):
raise NotImplementedError('Not supported.')
def fit_dataloader(self, trn: DataLoader, criterion, optimizer, metric, logger: logging.Logger, **kwargs):
raise NotImplementedError('Not supported.')
def evaluate_dataloader(self, data: DataLoader, criterion: Callable, metric=None, output=False, **kwargs):
raise NotImplementedError('Not supported.')
def load_vocabs(self, save_dir, filename='vocabs.json'):
pass
def load_weights(self, save_dir, filename='model.pt', **kwargs):
pass
def build_model(self, training=True, **kwargs) -> torch.nn.Module:
embed: FastTextEmbedding = self.config.embed
return embed.module()
def predict(self, data: str, **kwargs):
dataloader = self.build_dataloader(data, device=self.device)
for batch in dataloader: # It's a toy so doesn't really do batching
return self.model(batch)[0]
@property
def devices(self):
return [torch.device('cpu')]
================================================
FILE: hanlp/layers/embeddings/fast_text_tf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-10-29 13:14
import os
import sys
import numpy as np
import tensorflow as tf
from tensorflow.python.keras.utils import tf_utils
from hanlp_common.constant import PAD
from hanlp.utils.io_util import get_resource, stdout_redirected
from hanlp.utils.log_util import logger
from hanlp.utils.tf_util import hanlp_register
@hanlp_register
class FastTextEmbeddingTF(tf.keras.layers.Embedding):
def __init__(self, filepath: str, padding=PAD, name=None, **kwargs):
import fasttext
self.padding = padding.encode('utf-8')
self.filepath = filepath
filepath = get_resource(filepath)
assert os.path.isfile(filepath), f'Resolved path {filepath} is not a file'
logger.debug('Loading fasttext model from [{}].'.format(filepath))
# fasttext print a blank line here
with stdout_redirected(to=os.devnull, stdout=sys.stderr):
self.model = fasttext.load_model(filepath)
kwargs.pop('input_dim', None)
kwargs.pop('output_dim', None)
kwargs.pop('mask_zero', None)
if not name:
name = os.path.splitext(os.path.basename(filepath))[0]
super().__init__(input_dim=len(self.model.words), output_dim=self.model['king'].size,
mask_zero=padding is not None, trainable=False, dtype=tf.string, name=name, **kwargs)
embed_fn = np.frompyfunc(self.embed, 1, 1)
# vf = np.vectorize(self.embed, otypes=[np.ndarray])
self._embed_np = embed_fn
def embed(self, word):
return self.model[word]
def embed_np(self, words: np.ndarray):
output = self._embed_np(words)
if self.mask_zero:
mask = words != self.padding
output *= mask
output = np.stack(output.reshape(-1)).reshape(list(words.shape) + [self.output_dim])
return output, tf.constant(mask)
else:
output = np.stack(output.reshape(-1)).reshape(list(words.shape) + [self.output_dim])
return output
@tf_utils.shape_type_conversion
def build(self, input_shape):
self.built = True
@tf_utils.shape_type_conversion
def compute_output_shape(self, input_shape):
return input_shape + (self.output_dim,)
def call(self, inputs: tf.Tensor):
if isinstance(inputs, list):
inputs = inputs[0]
if not hasattr(inputs, 'numpy'): # placeholder tensor
inputs = tf.expand_dims(inputs, axis=-1)
inputs = tf.tile(inputs, [1] * (len(inputs.shape) - 1) + [self.output_dim])
inputs = tf.zeros_like(inputs, dtype=tf.float32)
return inputs
# seq_len = inputs.shape[-1]
# if not seq_len:
# seq_len = 1
# return tf.zeros([1, seq_len, self.output_dim])
if self.mask_zero:
outputs, masks = self.embed_np(inputs.numpy())
outputs = tf.constant(outputs)
outputs._keras_mask = masks
else:
outputs = self.embed_np(inputs.numpy())
outputs = tf.constant(outputs)
return outputs
def compute_mask(self, inputs, mask=None):
if not self.mask_zero:
return None
return tf.not_equal(inputs, self.padding)
def get_config(self):
config = {
'filepath': self.filepath,
'padding': self.padding.decode('utf-8')
}
base_config = super(FastTextEmbeddingTF, self).get_config()
        for junk in ('embeddings_initializer', 'batch_input_shape', 'embeddings_regularizer',
                     'embeddings_constraint', 'activity_regularizer', 'trainable', 'input_length'):
            base_config.pop(junk)
        return dict(list(base_config.items()) + list(config.items()))
================================================
FILE: hanlp/layers/embeddings/util.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-05-09 15:45
from typing import Union
import torch
from torch import nn
from hanlp.common.vocab import Vocab
from hanlp.utils.init_util import embedding_uniform
from hanlp.utils.torch_util import load_word2vec, load_word2vec_as_vocab_tensor
def index_word2vec_with_vocab(filepath: str,
vocab: Vocab,
extend_vocab=True,
unk=None,
lowercase=False,
init='uniform',
normalize=None) -> torch.Tensor:
"""
Args:
filepath: The path to pretrained embedding.
vocab: The vocabulary from training set.
extend_vocab: Unlock vocabulary of training set to add those tokens in pretrained embedding file.
unk: UNK token.
lowercase: Convert words in pretrained embeddings into lowercase.
init: Indicate which initialization to use for oov tokens.
normalize: ``True`` or a method to normalize the embedding matrix.
Returns:
An embedding matrix.
"""
pret_vocab, pret_matrix = load_word2vec_as_vocab_tensor(filepath)
if unk and unk in pret_vocab:
pret_vocab[vocab.safe_unk_token] = pret_vocab.pop(unk)
if extend_vocab:
vocab.unlock()
for word in pret_vocab:
vocab.get_idx(word.lower() if lowercase else word)
vocab.lock()
ids = []
unk_id_offset = 0
for word, idx in vocab.token_to_idx.items():
word_id = pret_vocab.get(word, None)
# Retry lower case
if word_id is None:
word_id = pret_vocab.get(word.lower(), None)
if word_id is None:
word_id = len(pret_vocab) + unk_id_offset
unk_id_offset += 1
ids.append(word_id)
if unk_id_offset:
unk_embeds = torch.zeros(unk_id_offset, pret_matrix.size(1))
if init and init != 'zeros':
if init == 'uniform':
init = embedding_uniform
else:
raise ValueError(f'Unsupported init {init}')
unk_embeds = init(unk_embeds)
pret_matrix = torch.cat([pret_matrix, unk_embeds])
ids = torch.LongTensor(ids)
embedding = pret_matrix.index_select(0, ids)
if normalize == 'norm':
embedding /= (torch.norm(embedding, dim=1, keepdim=True) + 1e-12)
elif normalize == 'l2':
embedding = torch.nn.functional.normalize(embedding, p=2, dim=1)
elif normalize == 'std':
embedding /= torch.std(embedding)
    elif normalize:
        raise ValueError(f'Unsupported normalization method {normalize}')
return embedding
def build_word2vec_with_vocab(embed: Union[str, int],
vocab: Vocab,
extend_vocab=True,
unk=None,
lowercase=False,
trainable=False,
init='zeros',
normalize=None) -> nn.Embedding:
"""Build word2vec embedding and a vocab.
Args:
embed:
vocab: The vocabulary from training set.
extend_vocab: Unlock vocabulary of training set to add those tokens in pretrained embedding file.
unk: UNK token.
lowercase: Convert words in pretrained embeddings into lowercase.
trainable: ``False`` to use static embeddings.
init: Indicate which initialization to use for oov tokens.
normalize: ``True`` or a method to normalize the embedding matrix.
Returns:
An embedding matrix.
"""
if isinstance(embed, str):
embed = index_word2vec_with_vocab(embed, vocab, extend_vocab, unk, lowercase, init, normalize)
embed = nn.Embedding.from_pretrained(embed, freeze=not trainable, padding_idx=vocab.pad_idx)
return embed
elif isinstance(embed, int):
embed = nn.Embedding(len(vocab), embed, padding_idx=vocab.pad_idx)
return embed
else:
raise ValueError(f'Unsupported parameter type: {embed}')
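# A minimal sketch (not part of the original API): passing an int builds a randomly
# initialized embedding sized to the vocab; no pretrained file is needed. The tokens
# below are illustrative.
def _demo_build_word2vec_with_vocab():
    vocab = Vocab()  # comes with pad/unk tokens by default
    vocab.get_idx('hello')
    vocab.get_idx('world')
    vocab.lock()
    embed = build_word2vec_with_vocab(8, vocab)
    assert embed.num_embeddings == len(vocab) and embed.embedding_dim == 8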
================================================
FILE: hanlp/layers/embeddings/util_tf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-05-09 15:46
from typing import Union
import tensorflow as tf
from hanlp.common.transform_tf import Transform
from hanlp.common.vocab_tf import VocabTF
from hanlp.layers.embeddings.char_cnn_tf import CharCNNEmbeddingTF
from hanlp.layers.embeddings.char_rnn_tf import CharRNNEmbeddingTF
from hanlp.layers.embeddings.concat_embedding import ConcatEmbedding
from hanlp.layers.embeddings.contextual_string_embedding_tf import ContextualStringEmbeddingTF
from hanlp.layers.embeddings.fast_text_tf import FastTextEmbeddingTF
from hanlp.layers.embeddings.word2vec_tf import Word2VecEmbeddingTF, StringWord2VecEmbeddingTF, Word2VecEmbeddingV1
_upgrade = tf.keras.utils.get_custom_objects()
for k, v in list(_upgrade.items()):
if k.startswith('HanLP>') and k.endswith('TF'):
_upgrade[k[:-2]] = v
def build_embedding(embeddings: Union[str, int, dict], word_vocab: VocabTF, transform: Transform):
if not embeddings:
return None
config = transform.config
if isinstance(embeddings, int):
embeddings = tf.keras.layers.Embedding(input_dim=len(word_vocab), output_dim=embeddings,
trainable=True, mask_zero=True)
config.embedding_trainable = True
elif isinstance(embeddings, dict):
# Upgrade to 2.1
embed_name = embeddings['class_name'].split('>')[-1]
if embeddings['class_name'].startswith('HanLP>') and not embeddings['class_name'].endswith('TF'):
embed_name += 'TF'
# Embeddings need vocab
if embed_name in (Word2VecEmbeddingTF.__name__, StringWord2VecEmbeddingTF.__name__):
            # Vocab won't be present in the dict
embeddings['config']['vocab'] = word_vocab
elif embed_name in (CharRNNEmbeddingTF.__name__, CharCNNEmbeddingTF.__name__):
embeddings['config']['word_vocab'] = word_vocab
embeddings['config']['char_vocab'] = transform.char_vocab
transform.map_x = False
layer: tf.keras.layers.Embedding = tf.keras.utils.deserialize_keras_object(embeddings)
# Embedding specific configuration
if layer.__class__.__name__ in ('FastTextEmbedding', 'FastTextEmbeddingTF'):
config.run_eagerly = True # fasttext can only run in eager mode
config.embedding_trainable = False
transform.map_x = False # fasttext accept string instead of int
return layer
elif isinstance(embeddings, list):
if embeddings_require_string_input(embeddings):
# those embeddings require string as input
transform.map_x = False
# use the string version of Word2VecEmbedding instead
for embed in embeddings:
if embed['class_name'].split('>')[-1] == Word2VecEmbeddingTF.__name__:
embed['class_name'] = 'HanLP>' + StringWord2VecEmbeddingTF.__name__
return ConcatEmbedding(*[build_embedding(embed, word_vocab, transform) for embed in embeddings])
else:
assert isinstance(embeddings, str), 'embedding should be str or int or dict'
# word_vocab.unlock()
embeddings = Word2VecEmbeddingV1(path=embeddings, vocab=word_vocab,
trainable=config.get('embedding_trainable', False))
embeddings = embeddings.array_ks
return embeddings
def any_embedding_in(embeddings, *cls):
names = set(x.__name__ for x in cls)
names.update(list(x[:-2] for x in names if x.endswith('TF')))
for embed in embeddings:
if isinstance(embed, dict) and embed['class_name'].split('>')[-1] in names:
return True
return False
def embeddings_require_string_input(embeddings):
if not isinstance(embeddings, list):
embeddings = [embeddings]
return any_embedding_in(embeddings, CharRNNEmbeddingTF, CharCNNEmbeddingTF, FastTextEmbeddingTF,
ContextualStringEmbeddingTF)
def embeddings_require_char_input(embeddings):
if not isinstance(embeddings, list):
embeddings = [embeddings]
return any_embedding_in(embeddings, CharRNNEmbeddingTF, CharCNNEmbeddingTF, ContextualStringEmbeddingTF)
================================================
FILE: hanlp/layers/embeddings/word2vec.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-05-09 13:38
import logging
import math
import os.path
from typing import Optional, Callable, Union, List, Dict
import torch
from hanlp_common.configurable import AutoConfigurable
from hanlp_common.constant import HANLP_VERBOSE
from hanlp_trie.trie import Trie
from torch import nn
from torch.utils.data import DataLoader
from hanlp.common.dataset import TransformableDataset, PadSequenceDataLoader
from hanlp.common.torch_component import TorchComponent
from hanlp.common.transform import VocabDict
from hanlp.common.vocab import Vocab
from hanlp.layers.dropout import WordDropout
from hanlp.layers.embeddings.embedding import Embedding, EmbeddingDim
from hanlp.layers.embeddings.util import build_word2vec_with_vocab
from hanlp.utils.log_util import flash
from hanlp.utils.torch_util import load_word2vec_as_vocab_tensor
class Word2VecEmbeddingModule(nn.Module, EmbeddingDim):
def __init__(self, field: str, embed: nn.Embedding, word_dropout: WordDropout = None, cpu=False,
second_channel=False, num_tokens_in_trn=None, unk_idx=1) -> None:
"""A word2vec style embedding module which maps a token to its embedding through looking up a pre-defined table.
Args:
field: The field to work on. Usually some token fields.
embed: An ``Embedding`` layer.
word_dropout: The probability of randomly replacing a token with ``UNK``.
cpu: Reside on CPU instead of GPU.
second_channel: A trainable second channel for each token, which will be added to pretrained embeddings.
num_tokens_in_trn: The number of tokens in training set.
unk_idx: The index of ``UNK``.
"""
super().__init__()
self.cpu = cpu
self.field = field
self.embed = embed
self.word_dropout = word_dropout
self.num_tokens_in_trn = num_tokens_in_trn
self.unk_idx = unk_idx
if second_channel:
n_words, n_embed = embed.weight.size()
if num_tokens_in_trn:
n_words = num_tokens_in_trn
second_channel = nn.Embedding(num_embeddings=n_words,
embedding_dim=n_embed)
nn.init.zeros_(second_channel.weight)
self.second_channel = second_channel
def forward(self, batch: dict, **kwargs):
x: torch.Tensor = batch[f'{self.field}_id']
if self.cpu:
device = x.device
x = x.cpu()
if self.word_dropout:
x = self.word_dropout(x)
if self.second_channel:
ext_mask = x.ge(self.second_channel.num_embeddings)
ext_words = x.masked_fill(ext_mask, self.unk_idx)
x = self.embed(x) + self.second_channel(ext_words)
else:
x = self.embed(x)
if self.cpu:
# noinspection PyUnboundLocalVariable
x = x.to(device)
return x
@property
def embedding_dim(self) -> int:
return self.embed.embedding_dim
# noinspection PyMethodOverriding
# def to(self, device, **kwargs):
# print(self.cpu)
# exit(1)
# if self.cpu:
# return super(Word2VecEmbeddingModule, self).to(-1, **kwargs)
# return super(Word2VecEmbeddingModule, self).to(device, **kwargs)
def _apply(self, fn):
if not self.cpu: # This might block all fn not limiting to moving between devices.
return super(Word2VecEmbeddingModule, self)._apply(fn)
class Word2VecEmbedding(Embedding, AutoConfigurable):
def __init__(self,
field,
embed: Union[int, str],
extend_vocab=True,
pad=None,
unk=None,
lowercase=False,
trainable=False,
second_channel=False,
word_dropout: float = 0,
normalize=False,
cpu=False,
init='zeros') -> None:
"""A word2vec style embedding builder which maps a token to its embedding through looking up a pre-defined
table.
Args:
field: The field to work on. Usually some token fields.
embed: A path to pre-trained embedding file or an integer defining the size of randomly initialized
embedding.
extend_vocab: Unlock vocabulary of training set to add those tokens in pre-trained embedding file.
pad: The padding token.
unk: The unknown token.
lowercase: Convert words in pretrained embeddings into lowercase.
trainable: ``False`` to use static embeddings.
second_channel: A trainable second channel for each token, which will be added to pretrained embeddings.
word_dropout: The probability of randomly replacing a token with ``UNK``.
normalize: ``l2`` or ``std`` to normalize the embedding matrix.
cpu: Reside on CPU instead of GPU.
init: Indicate which initialization to use for oov tokens.
"""
super().__init__()
self.pad = pad
self.second_channel = second_channel
self.cpu = cpu
self.normalize = normalize
self.word_dropout = word_dropout
self.init = init
self.lowercase = lowercase
self.unk = unk
self.extend_vocab = extend_vocab
self.trainable = trainable
self.embed = embed
self.field = field
def module(self, vocabs: VocabDict, **kwargs) -> Optional[nn.Module]:
vocab = vocabs[self.field]
num_tokens_in_trn = len(vocab)
embed = build_word2vec_with_vocab(self.embed,
vocab,
self.extend_vocab,
self.unk,
self.lowercase,
self.trainable,
normalize=self.normalize)
if self.word_dropout:
assert vocab.unk_token, f'unk_token of vocab {self.field} has to be set in order to ' \
f'make use of word_dropout'
padding = []
if vocab.pad_token:
padding.append(vocab.pad_idx)
word_dropout = WordDropout(self.word_dropout, vocab.unk_idx, exclude_tokens=padding)
else:
word_dropout = None
return Word2VecEmbeddingModule(self.field, embed, word_dropout=word_dropout, cpu=self.cpu,
second_channel=self.second_channel, num_tokens_in_trn=num_tokens_in_trn,
unk_idx=vocab.unk_idx)
def transform(self, vocabs: VocabDict = None, **kwargs) -> Optional[Callable]:
assert vocabs is not None
if self.field not in vocabs:
vocabs[self.field] = Vocab(pad_token=self.pad, unk_token=self.unk)
return super().transform(**kwargs)
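# A minimal sketch (not part of the original API): with an int ``embed`` no pretrained file is
# needed; ``transform`` registers the vocab and ``module`` builds the lookup layer. The field
# name, token and sizes below are illustrative assumptions.
def _demo_word2vec_embedding():
    vocabs = VocabDict()
    builder = Word2VecEmbedding('token', 64, pad='<pad>', unk='<unk>')
    builder.transform(vocabs=vocabs)  # creates vocabs['token']
    vocabs['token'].get_idx('hello')
    vocabs['token'].lock()
    module = builder.module(vocabs)
    batch = {'token_id': torch.tensor([[vocabs['token'].get_idx('hello')]])}
    assert module(batch).shape == (1, 1, 64)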
class Word2VecDataset(TransformableDataset):
def load_file(self, filepath: str):
raise NotImplementedError('Not supported.')
class Word2VecEmbeddingComponent(TorchComponent):
def __init__(self, **kwargs) -> None:
""" Toy example of Word2VecEmbedding. It simply returns the embedding of a given word
Args:
**kwargs:
"""
super().__init__(**kwargs)
self._tokenizer: Trie = None
def build_dataloader(self, data: List[str], shuffle=False, device=None, logger: logging.Logger = None,
doc2vec=False, batch_size=32, **kwargs) -> DataLoader:
dataset = Word2VecDataset([{'token': x} for x in data], transform=self._tokenize if doc2vec else self.vocabs)
return PadSequenceDataLoader(dataset, device=device, batch_size=batch_size)
def build_optimizer(self, **kwargs):
raise NotImplementedError('Not supported.')
def build_criterion(self, **kwargs):
raise NotImplementedError('Not supported.')
def build_metric(self, **kwargs):
raise NotImplementedError('Not supported.')
def execute_training_loop(self, trn: DataLoader, dev: DataLoader, epochs, criterion, optimizer, metric, save_dir,
logger: logging.Logger, devices, ratio_width=None, **kwargs):
raise NotImplementedError('Not supported.')
def fit_dataloader(self, trn: DataLoader, criterion, optimizer, metric, logger: logging.Logger, **kwargs):
raise NotImplementedError('Not supported.')
def evaluate_dataloader(self, data: DataLoader, criterion: Callable, metric=None, output=False, **kwargs):
raise NotImplementedError('Not supported.')
def load_vocabs(self, save_dir, filename='vocabs.json'):
self.vocabs['token'] = Vocab()
def load_weights(self, save_dir, filename='model.pt', **kwargs):
pass
def build_model(self, training=True, **kwargs) -> torch.nn.Module:
self._tokenizer = None
embed: Word2VecEmbedding = self.config.embed
model = embed.module(self.vocabs)
return model
def predict(self, word: str, doc2vec=False, **kwargs):
dataloader = self.build_dataloader([word], device=self.device, doc2vec=doc2vec)
for batch in dataloader: # It's a toy so doesn't really do batching
embeddings = self.model(batch)[0]
if doc2vec:
embeddings = embeddings[0].mean(dim=0)
return embeddings
@torch.no_grad()
def most_similar(self, words: Union[str, List[str]], topk=10, doc2vec=False, similarity_less_than=None,
batch_size=32) -> Union[Dict[str, float], List[Dict[str, float]]]:
"""Find the `topk` most similar words of a given word or phrase.
Args:
words: A word or phrase or multiple words/phrases.
topk: Number of top similar words.
doc2vec: Enable doc2vec model for processing OOV and phrases.
similarity_less_than: Only return words with a similarity less than this value.
batch_size: Number of words or phrases per batch.
Returns:
Similar words and similarities stored in a dict.
"""
flat = isinstance(words, str)
if flat:
words = [words]
dataloader = self.build_dataloader(words, device=self.device, doc2vec=doc2vec, batch_size=batch_size)
results = []
vocab = self.vocabs['token']
for batch in dataloader:
embeddings = self.model(batch)
token_id = batch['token_id']
if doc2vec:
lens = token_id.count_nonzero(dim=1)
embeddings = embeddings.sum(1)
embeddings = embeddings / lens.unsqueeze(1)
block_word_id = batch['block_word_id']
token_is_unk = (lens == 1) & (token_id[:, 0] == vocab.unk_idx)
else:
block_word_id = token_id
token_is_unk = token_id == vocab.unk_idx
similarities = torch.nn.functional.cosine_similarity(embeddings.unsqueeze(1), self.model.embed.weight,
dim=-1)
if similarity_less_than is not None:
similarities[similarities > similarity_less_than] = -math.inf
similarities[torch.arange(similarities.size(0), device=self.device), block_word_id] = -math.inf
scores, indices = similarities.topk(topk)
for sc, idx, unk in zip(scores.tolist(), indices.tolist(), token_is_unk.tolist()):
results.append(dict() if unk else dict(zip([vocab.idx_to_token[i] for i in idx], sc)))
if flat:
results = results[0]
return results
def _tokenize(self, sample: dict) -> dict:
tokens = sample['token']
ids = [idx for b, e, idx in self.tokenizer.parse_longest(tokens)]
vocab = self.vocabs['token']
if not ids:
ids = [vocab.unk_idx]
sample['token_id'] = ids
sample['block_word_id'] = ids[0] if len(ids) == 1 else vocab.pad_idx
return sample
@property
def tokenizer(self):
if not self._tokenizer:
if HANLP_VERBOSE:
flash('Building Trie-based tokenizer for Doc2Vec [blink][yellow]...[/yellow][/blink]')
self._tokenizer = Trie(self.vocabs['token'].token_to_idx)
if HANLP_VERBOSE:
flash('')
return self._tokenizer
def load_config(self, save_dir, filename='config.json', **kwargs):
if os.path.isfile(save_dir):
self.config.update({'classpath': 'hanlp.layers.embeddings.word2vec.Word2VecEmbeddingComponent',
'embed': Word2VecEmbedding(field='token', embed=save_dir, normalize='l2')})
return
super().load_config(save_dir, filename, **kwargs)
class GazetterTransform(object):
def __init__(self, field, words: dict) -> None:
super().__init__()
self.field = field
self.trie = Trie()
for word, idx in words.items():
self.trie[word] = idx
def __call__(self, sample: dict) -> dict:
tokens = sample[self.field]
lexicons = self.trie.parse(tokens)
skips_l2r = [[] for _ in range(len(tokens))]
skips_r2l = [[] for _ in range(len(tokens))]
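        # Record each matched lexicon word twice: under its last char for left-to-right
        # lattice edges (skips_l2r) and under its first char for right-to-left ones (skips_r2l)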
for w, i, s, e in lexicons:
e = e - 1
skips_l2r[e].append((s, w, i))
skips_r2l[s].append((e, w, i))
for direction, value in zip(['skips_l2r', 'skips_r2l'], [skips_l2r, skips_r2l]):
sample[f'{self.field}_{direction}_offset'] = [list(map(lambda x: x[0], p)) for p in value]
sample[f'{self.field}_{direction}_id'] = [list(map(lambda x: x[-1], p)) for p in value]
sample[f'{self.field}_{direction}_count'] = list(map(len, value))
return sample
class GazetteerEmbedding(Embedding, AutoConfigurable):
def __init__(self, embed: str, field='char', trainable=False) -> None:
self.trainable = trainable
self.embed = embed
self.field = field
vocab, matrix = load_word2vec_as_vocab_tensor(self.embed)
ids = []
_vocab = {}
for word, idx in vocab.items():
if len(word) > 1:
ids.append(idx)
_vocab[word] = len(_vocab)
ids = torch.tensor(ids)
_matrix = matrix.index_select(0, ids)
self._vocab = _vocab
self._matrix = _matrix
def transform(self, **kwargs) -> Optional[Callable]:
return GazetterTransform(self.field, self._vocab)
def module(self, **kwargs) -> Optional[nn.Module]:
embed = nn.Embedding.from_pretrained(self._matrix, freeze=not self.trainable)
return embed
@staticmethod
def _remove_short_tokens(word2vec):
word2vec = dict((w, v) for w, v in word2vec.items() if len(w) > 1)
return word2vec
================================================
FILE: hanlp/layers/embeddings/word2vec_tf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-08-24 21:49
import os
from typing import Tuple, Union, List
import numpy as np
import tensorflow as tf
from tensorflow.python.ops import math_ops
from hanlp.common.vocab_tf import VocabTF
from hanlp.utils.io_util import get_resource
from hanlp.utils.torch_util import load_word2vec
from hanlp.utils.tf_util import hanlp_register
from hanlp_common.util import DummyContext
class Word2VecEmbeddingV1(tf.keras.layers.Layer):
def __init__(self, path: str = None, vocab: VocabTF = None, normalize: bool = False, load_all=True, mask_zero=True,
trainable=False, name=None, dtype=None, dynamic=False, **kwargs):
super().__init__(trainable, name, dtype, dynamic, **kwargs)
if load_all and vocab and vocab.locked:
vocab.unlock()
self.vocab, self.array_np = self._load(path, vocab, normalize)
self.vocab.lock()
self.array_ks = tf.keras.layers.Embedding(input_dim=len(self.vocab), output_dim=self.dim, trainable=trainable,
embeddings_initializer=tf.keras.initializers.Constant(self.array_np),
mask_zero=mask_zero)
self.mask_zero = mask_zero
self.supports_masking = mask_zero
def compute_mask(self, inputs, mask=None):
if not self.mask_zero:
return None
return math_ops.not_equal(inputs, self.vocab.pad_idx)
def call(self, inputs, **kwargs):
return self.array_ks(inputs, **kwargs)
def compute_output_shape(self, input_shape):
return input_shape[0], self.dim
@staticmethod
def _load(path, vocab, normalize=False) -> Tuple[VocabTF, Union[np.ndarray, None]]:
if not vocab:
vocab = VocabTF()
if not path:
return vocab, None
assert vocab.unk_idx is not None
word2vec, dim = load_word2vec(path)
for word in word2vec:
vocab.get_idx(word)
pret_embs = np.zeros(shape=(len(vocab), dim), dtype=np.float32)
state = np.random.get_state()
np.random.seed(0)
bias = np.random.uniform(low=-0.001, high=0.001, size=dim).astype(dtype=np.float32)
scale = np.sqrt(3.0 / dim)
for word, idx in vocab.token_to_idx.items():
vec = word2vec.get(word, None)
if vec is None:
vec = word2vec.get(word.lower(), None)
# if vec is not None:
# vec += bias
if vec is None:
# vec = np.random.uniform(-scale, scale, [dim])
vec = np.zeros([dim], dtype=np.float32)
pret_embs[idx] = vec
# noinspection PyTypeChecker
np.random.set_state(state)
return vocab, pret_embs
@property
def size(self):
if self.array_np is not None:
return self.array_np.shape[0]
@property
def dim(self):
if self.array_np is not None:
return self.array_np.shape[1]
@property
def shape(self):
if self.array_np is None:
return None
return self.array_np.shape
def get_vector(self, word: str) -> np.ndarray:
assert self.array_np is not None
return self.array_np[self.vocab.get_idx_without_add(word)]
def __getitem__(self, word: Union[str, List, tf.Tensor]) -> np.ndarray:
if isinstance(word, str):
return self.get_vector(word)
elif isinstance(word, list):
vectors = np.zeros(shape=(len(word), self.dim))
for idx, token in enumerate(word):
vectors[idx] = self.get_vector(token)
return vectors
        elif isinstance(word, tf.Tensor):
            if word.dtype == tf.string:
                word_ids = self.vocab.token_to_idx_table.lookup(word)
                # self.array_tf was never assigned anywhere in this class; look up in the
                # numpy matrix instead, which tf.nn.embedding_lookup converts to a tensor
                return tf.nn.embedding_lookup(self.array_np, word_ids)
            elif word.dtype == tf.int32 or word.dtype == tf.int64:
                return tf.nn.embedding_lookup(self.array_np, word)
@hanlp_register
class Word2VecEmbeddingTF(tf.keras.layers.Embedding):
def __init__(self, filepath: str = None, vocab: VocabTF = None, expand_vocab=True, lowercase=True,
input_dim=None, output_dim=None, unk=None, normalize=False,
embeddings_initializer='VarianceScaling',
embeddings_regularizer=None,
activity_regularizer=None, embeddings_constraint=None, mask_zero=True, input_length=None,
name=None, cpu=True, **kwargs):
filepath = get_resource(filepath)
word2vec, _output_dim = load_word2vec(filepath)
if output_dim:
assert output_dim == _output_dim, f'output_dim = {output_dim} does not match {filepath}'
output_dim = _output_dim
        if vocab is None:
            vocab = VocabTF()
            vocab.update(word2vec.keys())
        # If the `unk` token exists in the pretrained embeddings, replace it with a
        # self-defined one, usually the one in the word vocab. The vocab is created
        # first so that `vocab.safe_unk_token` is guaranteed to be available here.
        if unk and unk in word2vec:
            word2vec[vocab.safe_unk_token] = word2vec.pop(unk)
if expand_vocab and vocab.mutable:
for word in word2vec:
vocab.get_idx(word.lower() if lowercase else word)
if input_dim:
assert input_dim == len(vocab), f'input_dim = {input_dim} does not match {filepath}'
input_dim = len(vocab)
# init matrix
self._embeddings_initializer = embeddings_initializer
embeddings_initializer = tf.keras.initializers.get(embeddings_initializer)
with tf.device('cpu:0') if cpu else DummyContext():
pret_embs = embeddings_initializer(shape=[input_dim, output_dim]).numpy()
# insert to pret_embs
for word, idx in vocab.token_to_idx.items():
vec = word2vec.get(word, None)
# Retry lower case
if vec is None and lowercase:
vec = word2vec.get(word.lower(), None)
if vec is not None:
pret_embs[idx] = vec
if normalize:
pret_embs /= np.std(pret_embs)
if not name:
name = os.path.splitext(os.path.basename(filepath))[0]
super().__init__(input_dim, output_dim, tf.keras.initializers.Constant(pret_embs), embeddings_regularizer,
activity_regularizer, embeddings_constraint, mask_zero, input_length, name=name, **kwargs)
self.filepath = filepath
self.expand_vocab = expand_vocab
self.lowercase = lowercase
def get_config(self):
config = {
'filepath': self.filepath,
'expand_vocab': self.expand_vocab,
'lowercase': self.lowercase,
}
base_config = super(Word2VecEmbeddingTF, self).get_config()
base_config['embeddings_initializer'] = self._embeddings_initializer
return dict(list(base_config.items()) + list(config.items()))
@hanlp_register
class StringWord2VecEmbeddingTF(Word2VecEmbeddingTF):
def __init__(self, filepath: str = None, vocab: VocabTF = None, expand_vocab=True, lowercase=False, input_dim=None,
output_dim=None, unk=None, normalize=False, embeddings_initializer='VarianceScaling',
embeddings_regularizer=None, activity_regularizer=None, embeddings_constraint=None, mask_zero=True,
input_length=None, name=None, **kwargs):
if vocab is None:
vocab = VocabTF()
self.vocab = vocab
super().__init__(filepath, vocab, expand_vocab, lowercase, input_dim, output_dim, unk, normalize,
embeddings_initializer, embeddings_regularizer, activity_regularizer, embeddings_constraint,
mask_zero, input_length, name, **kwargs)
def call(self, inputs):
        assert inputs.dtype == tf.string, \
            f'Expected tf.string but got tf.{inputs.dtype.name}: {inputs}. ' \
            f'Please pass tf.string in.'
inputs = self.vocab.lookup(inputs)
# inputs._keras_mask = tf.not_equal(inputs, self.vocab.pad_idx)
return super().call(inputs)
def compute_mask(self, inputs, mask=None):
if not self.mask_zero:
return None
return tf.not_equal(inputs, self.vocab.pad_token)
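# A minimal usage sketch, assuming a word2vec-format text file exists at the
# hypothetical path below; any whitespace-separated "token vec..." file works:
if __name__ == '__main__':
    embed = Word2VecEmbeddingTF(filepath='data/embeddings/demo.vec')  # hypothetical path
    ids = tf.constant([[1, 2, 0]])  # assuming 0 is the padding index
    print(embed(ids).shape)         # -> (1, 3, output_dim)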
================================================
FILE: hanlp/layers/feed_forward.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-07-06 14:37
from typing import Union, List
from hanlp.layers import feedforward
from hanlp.common.structure import ConfigTracker
class FeedForward(feedforward.FeedForward, ConfigTracker):
def __init__(self, input_dim: int, num_layers: int, hidden_dims: Union[int, List[int]],
activations: Union[str, List[str]], dropout: Union[float, List[float]] = 0.0) -> None:
super().__init__(input_dim, num_layers, hidden_dims, activations, dropout)
ConfigTracker.__init__(self, locals())
================================================
FILE: hanlp/layers/feedforward.py
================================================
"""
A feed-forward neural network.
"""
from typing import List, Union
import torch
from hanlp.utils.torch_util import activation_from_name
class FeedForward(torch.nn.Module):
"""
This `Module` is a feed-forward neural network, just a sequence of `Linear` layers with
activation functions in between.
# Parameters
input_dim : `int`, required
The dimensionality of the input. We assume the input has shape `(batch_size, input_dim)`.
num_layers : `int`, required
The number of `Linear` layers to apply to the input.
hidden_dims : `Union[int, List[int]]`, required
The output dimension of each of the `Linear` layers. If this is a single `int`, we use
it for all `Linear` layers. If it is a `List[int]`, `len(hidden_dims)` must be
`num_layers`.
    activations : `Union[str, List[str]]`, required
        The name of the activation function to use after each `Linear` layer, resolved
        via `activation_from_name`. If this is a single `str`, we use it after all
        `Linear` layers. If it is a `List[str]`, `len(activations)` must be `num_layers`.
dropout : `Union[float, List[float]]`, optional (default = `0.0`)
If given, we will apply this amount of dropout after each layer. Semantics of `float`
versus `List[float]` is the same as with other parameters.
# Examples
```python
    FeedForward(124, 2, [64, 32], 'relu', 0.2)
#> FeedForward(
#> (_activations): ModuleList(
#> (0): ReLU()
#> (1): ReLU()
#> )
#> (_linear_layers): ModuleList(
#> (0): Linear(in_features=124, out_features=64, bias=True)
#> (1): Linear(in_features=64, out_features=32, bias=True)
#> )
#> (_dropout): ModuleList(
#> (0): Dropout(p=0.2, inplace=False)
#> (1): Dropout(p=0.2, inplace=False)
#> )
#> )
```
"""
def __init__(
self,
input_dim: int,
num_layers: int,
hidden_dims: Union[int, List[int]],
activations: Union[str, List[str]],
dropout: Union[float, List[float]] = 0.0,
) -> None:
super().__init__()
if not isinstance(hidden_dims, list):
hidden_dims = [hidden_dims] * num_layers # type: ignore
if not isinstance(activations, list):
activations = [activations] * num_layers # type: ignore
activations = [activation_from_name(a)() for a in activations]
if not isinstance(dropout, list):
dropout = [dropout] * num_layers # type: ignore
if len(hidden_dims) != num_layers:
raise ValueError(
"len(hidden_dims) (%d) != num_layers (%d)" % (len(hidden_dims), num_layers)
)
if len(activations) != num_layers:
raise ValueError(
"len(activations) (%d) != num_layers (%d)" % (len(activations), num_layers)
)
if len(dropout) != num_layers:
raise ValueError(
"len(dropout) (%d) != num_layers (%d)" % (len(dropout), num_layers)
)
self._activations = torch.nn.ModuleList(activations)
input_dims = [input_dim] + hidden_dims[:-1]
linear_layers = []
for layer_input_dim, layer_output_dim in zip(input_dims, hidden_dims):
linear_layers.append(torch.nn.Linear(layer_input_dim, layer_output_dim))
self._linear_layers = torch.nn.ModuleList(linear_layers)
dropout_layers = [torch.nn.Dropout(p=value) for value in dropout]
self._dropout = torch.nn.ModuleList(dropout_layers)
self._output_dim = hidden_dims[-1]
self.input_dim = input_dim
def get_output_dim(self):
return self._output_dim
def get_input_dim(self):
return self.input_dim
def forward(self, inputs: torch.Tensor) -> torch.Tensor:
output = inputs
for layer, activation, dropout in zip(
self._linear_layers, self._activations, self._dropout
):
output = dropout(activation(layer(output)))
return output
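# A minimal usage sketch matching the docstring example above: two Linear layers
# with ReLU and dropout, applied to a random batch.
if __name__ == '__main__':
    ff = FeedForward(124, 2, [64, 32], 'relu', 0.2)
    x = torch.randn(8, 124)
    print(ff(x).shape)  # -> torch.Size([8, 32])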
================================================
FILE: hanlp/layers/scalar_mix.py
================================================
# This file is modified from udify, which is licensed under the MIT license:
# MIT License
#
# Copyright (c) 2019 Dan Kondratyuk
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
"""
The dot-product "Layer Attention" that is applied to the layers of BERT, along with layer dropout to reduce overfitting
"""
from typing import List, Tuple
import torch
from torch.nn import ParameterList, Parameter
from hanlp.common.structure import ConfigTracker
class ScalarMixWithDropout(torch.nn.Module):
"""Computes a parameterised scalar mixture of N tensors, ``mixture = gamma * sum(s_k * tensor_k)``
where ``s = softmax(w)``, with ``w`` and ``gamma`` scalar parameters.
If ``do_layer_norm=True`` then apply layer normalization to each tensor before weighting.
If ``dropout > 0``, then for each scalar weight, adjust its softmax weight mass to 0 with
the dropout probability (i.e., setting the unnormalized weight to -inf). This effectively
should redistribute dropped probability mass to all other weights.
    Args:
        mixture_range: A ``(begin, end)`` range of the layers to mix; ``end - begin``
            is the mixture size.
        do_layer_norm: Apply layer normalization to each tensor before weighting.
        initial_scalar_parameters: Initial values for the unnormalized scalar weights.
        trainable: ``False`` to freeze the mixing parameters.
        dropout: The probability of dropping each scalar weight before the softmax.
        dropout_value: The value a dropped (unnormalized) weight is set to, effectively ``-inf``.
    """
def __init__(self,
mixture_range: Tuple[int, int],
do_layer_norm: bool = False,
initial_scalar_parameters: List[float] = None,
trainable: bool = True,
dropout: float = None,
dropout_value: float = -1e20,
**kwargs) -> None:
super(ScalarMixWithDropout, self).__init__()
self.mixture_range = mixture_range
mixture_size = mixture_range[1] - mixture_range[0]
self.mixture_size = mixture_size
self.do_layer_norm = do_layer_norm
self.dropout = dropout
if initial_scalar_parameters is None:
initial_scalar_parameters = [0.0] * mixture_size
elif len(initial_scalar_parameters) != mixture_size:
raise ValueError("Length of initial_scalar_parameters {} differs "
"from mixture_size {}".format(
initial_scalar_parameters, mixture_size))
# self.scalar_parameters = ParameterList(
# [Parameter(torch.FloatTensor([initial_scalar_parameters[i]]),
# requires_grad=trainable) for i
# in range(mixture_size)])
        self.scalar_parameters = Parameter(torch.FloatTensor(initial_scalar_parameters), requires_grad=trainable)
self.gamma = Parameter(torch.FloatTensor([1.0]), requires_grad=trainable)
if self.dropout:
dropout_mask = torch.zeros(len(self.scalar_parameters))
dropout_fill = torch.empty(len(self.scalar_parameters)).fill_(dropout_value)
self.register_buffer("dropout_mask", dropout_mask)
self.register_buffer("dropout_fill", dropout_fill)
def forward(self, tensors: List[torch.Tensor], # pylint: disable=arguments-differ
mask: torch.Tensor = None) -> torch.Tensor:
"""Compute a weighted average of the ``tensors``. The input tensors an be any shape
with at least two dimensions, but must all be the same shape.
When ``do_layer_norm=True``, the ``mask`` is required input. If the ``tensors`` are
dimensioned ``(dim_0, ..., dim_{n-1}, dim_n)``, then the ``mask`` is dimensioned
``(dim_0, ..., dim_{n-1})``, as in the typical case with ``tensors`` of shape
``(batch_size, timesteps, dim)`` and ``mask`` of shape ``(batch_size, timesteps)``.
When ``do_layer_norm=False`` the ``mask`` is ignored.
Args:
tensors: List[torch.Tensor]:
# pylint: disable: (Default value = arguments-differmask: torch.Tensor = None)
Returns:
"""
if len(tensors) != self.mixture_size:
tensors = tensors[self.mixture_range[0]:self.mixture_range[1]]
if len(tensors) != self.mixture_size:
raise ValueError("{} tensors were passed, but the module was initialized to "
"mix {} tensors.".format(len(tensors), self.mixture_size))
def _do_layer_norm(tensor, broadcast_mask, num_elements_not_masked):
tensor_masked = tensor * broadcast_mask
mean = torch.sum(tensor_masked) / num_elements_not_masked
variance = torch.sum(((tensor_masked - mean) * broadcast_mask) ** 2) / num_elements_not_masked
return (tensor - mean) / torch.sqrt(variance + 1E-12)
weights = self.scalar_parameters
if self.dropout:
weights = torch.where(self.dropout_mask.uniform_() > self.dropout, weights, self.dropout_fill)
normed_weights = torch.nn.functional.softmax(weights, dim=0)
if not self.do_layer_norm:
return self.gamma * torch.einsum('i,ijkl->jkl', normed_weights, tensors)
# pieces = []
# for weight, tensor in zip(normed_weights, tensors):
# pieces.append(weight * tensor)
# return self.gamma * sum(pieces)
else:
normed_weights = torch.split(normed_weights, split_size_or_sections=1)
mask_float = mask.float()
broadcast_mask = mask_float.unsqueeze(-1)
input_dim = tensors[0].size(-1)
num_elements_not_masked = torch.sum(mask_float) * input_dim
pieces = []
for weight, tensor in zip(normed_weights, tensors):
pieces.append(weight * _do_layer_norm(tensor,
broadcast_mask, num_elements_not_masked))
return self.gamma * sum(pieces)
class ScalarMixWithDropoutBuilder(ConfigTracker, ScalarMixWithDropout):
def __init__(self,
mixture_range: Tuple[int, int],
do_layer_norm: bool = False,
initial_scalar_parameters: List[float] = None,
trainable: bool = True,
dropout: float = None,
dropout_value: float = -1e20) -> None:
super().__init__(locals())
def build(self):
return ScalarMixWithDropout(**self.config)
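# A minimal usage sketch: mix the last 3 of 13 BERT-style layer outputs. forward()
# accepts the layers stacked into a single (num_layers, batch, time, hidden) tensor,
# which is the shape transformer_encode() in this repo produces.
if __name__ == '__main__':
    mix = ScalarMixWithDropout(mixture_range=(10, 13))
    layers = torch.randn(13, 2, 5, 8)
    print(mix(layers).shape)  # -> torch.Size([2, 5, 8])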
================================================
FILE: hanlp/layers/time_distributed.py
================================================
"""
A wrapper that unrolls the second (time) dimension of a tensor
into the first (batch) dimension, applies some other `Module`,
and then rolls the time dimension back up.
"""
from typing import List
import torch
class TimeDistributed(torch.nn.Module):
"""
Given an input shaped like `(batch_size, time_steps, [rest])` and a `Module` that takes
inputs like `(batch_size, [rest])`, `TimeDistributed` reshapes the input to be
`(batch_size * time_steps, [rest])`, applies the contained `Module`, then reshapes it back.
Note that while the above gives shapes with `batch_size` first, this `Module` also works if
`batch_size` is second - we always just combine the first two dimensions, then split them.
It also reshapes keyword arguments unless they are not tensors or their name is specified in
the optional `pass_through` iterable.
"""
def __init__(self, module):
super().__init__()
self._module = module
def forward(self, *inputs, pass_through: List[str] = None, **kwargs):
pass_through = pass_through or []
reshaped_inputs = [self._reshape_tensor(input_tensor) for input_tensor in inputs]
# Need some input to then get the batch_size and time_steps.
some_input = None
if inputs:
some_input = inputs[-1]
reshaped_kwargs = {}
for key, value in kwargs.items():
if isinstance(value, torch.Tensor) and key not in pass_through:
if some_input is None:
some_input = value
value = self._reshape_tensor(value)
reshaped_kwargs[key] = value
reshaped_outputs = self._module(*reshaped_inputs, **reshaped_kwargs)
if some_input is None:
raise RuntimeError("No input tensor to time-distribute")
# Now get the output back into the right shape.
# (batch_size, time_steps, **output_size)
new_size = some_input.size()[:2] + reshaped_outputs.size()[1:]
outputs = reshaped_outputs.contiguous().view(new_size)
return outputs
@staticmethod
def _reshape_tensor(input_tensor):
input_size = input_tensor.size()
if len(input_size) <= 2:
raise RuntimeError(f"No dimension to distribute: {input_size}")
# Squash batch_size and time_steps into a single axis; result has shape
# (batch_size * time_steps, **input_size).
squashed_shape = [-1] + list(input_size[2:])
return input_tensor.contiguous().view(*squashed_shape)
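# A minimal usage sketch: apply a Linear to every time step by folding the time
# dimension into the batch dimension and back.
if __name__ == '__main__':
    td = TimeDistributed(torch.nn.Linear(4, 2))
    x = torch.randn(3, 7, 4)  # (batch, time, features)
    print(td(x).shape)        # -> torch.Size([3, 7, 2])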
================================================
FILE: hanlp/layers/transformers/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-29 15:17
# mute transformers
import logging
logging.getLogger('transformers.file_utils').setLevel(logging.ERROR)
logging.getLogger('transformers.filelock').setLevel(logging.ERROR)
logging.getLogger('transformers.tokenization_utils').setLevel(logging.ERROR)
logging.getLogger('transformers.configuration_utils').setLevel(logging.ERROR)
logging.getLogger('transformers.modeling_tf_utils').setLevel(logging.ERROR)
logging.getLogger('transformers.modeling_utils').setLevel(logging.ERROR)
logging.getLogger('transformers.tokenization_utils_base').setLevel(logging.ERROR)
================================================
FILE: hanlp/layers/transformers/encoder.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-06-22 21:06
from typing import Union, Dict, Any, Sequence, Tuple, Optional
import torch
from torch import nn
from hanlp.layers.dropout import WordDropout
from hanlp.layers.scalar_mix import ScalarMixWithDropout, ScalarMixWithDropoutBuilder
from hanlp.layers.transformers.pt_imports import PreTrainedModel, PreTrainedTokenizer, AutoModel_, AutoTokenizer_
from hanlp.layers.transformers.utils import transformer_encode
# noinspection PyAbstractClass
class TransformerEncoder(nn.Module):
def __init__(self,
transformer: Union[PreTrainedModel, str],
transformer_tokenizer: PreTrainedTokenizer,
average_subwords=False,
scalar_mix: Union[ScalarMixWithDropoutBuilder, int] = None,
word_dropout=None,
max_sequence_length=None,
ret_raw_hidden_states=False,
transformer_args: Dict[str, Any] = None,
                 trainable: Union[bool, Tuple[int, int]] = True,
training=True) -> None:
"""A pre-trained transformer encoder.
Args:
transformer: A ``PreTrainedModel`` or an identifier of a ``PreTrainedModel``.
transformer_tokenizer: A ``PreTrainedTokenizer``.
average_subwords: ``True`` to average subword representations.
scalar_mix: Layer attention.
word_dropout: Dropout rate of randomly replacing a subword with MASK.
max_sequence_length: The maximum sequence length. Sequence longer than this will be handled by sliding
window. If ``None``, then the ``max_position_embeddings`` of the transformer will be used.
ret_raw_hidden_states: ``True`` to return hidden states of each layer.
transformer_args: Extra arguments passed to the transformer.
            trainable: ``False`` to use static embeddings, ``True`` to fine-tune the whole transformer,
                or a ``(begin, end)`` range to fine-tune only the layers inside it.
training: ``False`` to skip loading weights from pre-trained transformers.
"""
super().__init__()
self.ret_raw_hidden_states = ret_raw_hidden_states
self.average_subwords = average_subwords
if word_dropout:
oov = transformer_tokenizer.mask_token_id
if isinstance(word_dropout, Sequence):
word_dropout, replacement = word_dropout
if replacement == 'unk':
# Electra English has to use unk
oov = transformer_tokenizer.unk_token_id
elif replacement == 'mask':
# UDify uses [MASK]
oov = transformer_tokenizer.mask_token_id
else:
oov = replacement
pad = transformer_tokenizer.pad_token_id
cls = transformer_tokenizer.cls_token_id
sep = transformer_tokenizer.sep_token_id
excludes = [pad, cls, sep]
self.word_dropout = WordDropout(p=word_dropout, oov_token=oov, exclude_tokens=excludes)
else:
self.word_dropout = None
if isinstance(transformer, str):
output_hidden_states = scalar_mix is not None
if transformer_args is None:
transformer_args = dict()
transformer_args['output_hidden_states'] = output_hidden_states
transformer = AutoModel_.from_pretrained(transformer, training=training or not trainable,
**transformer_args)
if max_sequence_length is None:
max_sequence_length = transformer.config.max_position_embeddings
self.max_sequence_length = max_sequence_length
if hasattr(transformer, 'encoder') and hasattr(transformer, 'decoder'):
# For seq2seq model, use its encoder
transformer = transformer.encoder
self.transformer = transformer
if not trainable:
transformer.requires_grad_(False)
elif isinstance(trainable, tuple):
layers = []
if hasattr(transformer, 'embeddings'):
layers.append(transformer.embeddings)
layers.extend(transformer.encoder.layer)
for i, layer in enumerate(layers):
if i < trainable[0] or i >= trainable[1]:
layer.requires_grad_(False)
if isinstance(scalar_mix, ScalarMixWithDropoutBuilder):
self.scalar_mix: ScalarMixWithDropout = scalar_mix.build()
else:
self.scalar_mix = None
def forward(self, input_ids: torch.LongTensor, attention_mask=None, token_type_ids=None, token_span=None, **kwargs):
if self.word_dropout:
input_ids = self.word_dropout(input_ids)
x = transformer_encode(self.transformer,
input_ids,
attention_mask,
token_type_ids,
token_span,
layer_range=self.scalar_mix.mixture_range if self.scalar_mix else 0,
max_sequence_length=self.max_sequence_length,
average_subwords=self.average_subwords,
ret_raw_hidden_states=self.ret_raw_hidden_states)
if self.ret_raw_hidden_states:
x, raw_hidden_states = x
if self.scalar_mix:
x = self.scalar_mix(x)
if self.ret_raw_hidden_states:
# noinspection PyUnboundLocalVariable
return x, raw_hidden_states
return x
@staticmethod
def build_transformer(config, training=True) -> PreTrainedModel:
kwargs = {}
if config.scalar_mix and config.scalar_mix > 0:
kwargs['output_hidden_states'] = True
transformer = AutoModel_.from_pretrained(config.transformer, training=training, **kwargs)
return transformer
@staticmethod
def build_transformer_tokenizer(config_or_str, use_fast=True, do_basic_tokenize=True) -> PreTrainedTokenizer:
return AutoTokenizer_.from_pretrained(config_or_str, use_fast, do_basic_tokenize)
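# A minimal usage sketch, assuming network access to download the pre-trained
# weights (bert-base-chinese here; any HuggingFace identifier should work):
if __name__ == '__main__':
    tokenizer = TransformerEncoder.build_transformer_tokenizer('bert-base-chinese')
    encoder = TransformerEncoder('bert-base-chinese', tokenizer)
    batch = tokenizer(['商品和服务'], return_tensors='pt')
    h = encoder(batch['input_ids'], attention_mask=batch['attention_mask'])
    print(h.shape)  # -> (1, num_subwords, hidden_size)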
================================================
FILE: hanlp/layers/transformers/loader_tf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-01-04 06:05
import tensorflow as tf
from transformers import TFAutoModel
from hanlp.layers.transformers.pt_imports import AutoTokenizer_
def build_transformer(transformer, max_seq_length, num_labels, tagging=True, tokenizer_only=False):
tokenizer = AutoTokenizer_.from_pretrained(transformer)
if tokenizer_only:
return tokenizer
l_bert = TFAutoModel.from_pretrained(transformer)
l_input_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype='int32', name="input_ids")
l_mask_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype='int32', name="mask_ids")
l_token_type_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype='int32', name="token_type_ids")
output = l_bert(input_ids=l_input_ids, token_type_ids=l_token_type_ids, attention_mask=l_mask_ids).last_hidden_state
if not tagging:
output = tf.keras.layers.Lambda(lambda seq: seq[:, 0, :])(output)
logits = tf.keras.layers.Dense(num_labels)(output)
model = tf.keras.Model(inputs=[l_input_ids, l_mask_ids, l_token_type_ids], outputs=logits)
model.build(input_shape=(None, max_seq_length))
return model, tokenizer
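# A minimal usage sketch, assuming TensorFlow weights for the identifier are
# available on the HuggingFace hub:
if __name__ == '__main__':
    model, tokenizer = build_transformer('bert-base-chinese', max_seq_length=128, num_labels=4)
    model.summary()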
================================================
FILE: hanlp/layers/transformers/pt_imports.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-05-09 11:25
import os
import warnings
from hanlp.layers.transformers.resource import get_tokenizer_mirror, get_model_mirror
if os.environ.get('TOKENIZERS_PARALLELISM', None) is None:
os.environ["TOKENIZERS_PARALLELISM"] = "false"
from transformers import BertTokenizer, BertConfig, PretrainedConfig, AutoConfig, AutoTokenizer, PreTrainedTokenizer, \
BertTokenizerFast, AlbertConfig, BertModel, AutoModel, PreTrainedModel, AutoModelForSequenceClassification, \
AutoModelForTokenClassification, BartModel
class AutoModel_(AutoModel):
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, *model_args, training=True, **kwargs):
pretrained_model_name_or_path = get_model_mirror(pretrained_model_name_or_path)
if training:
return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
else:
if isinstance(pretrained_model_name_or_path, str):
pretrained_model_name_or_path = get_tokenizer_mirror(pretrained_model_name_or_path)
return super().from_config(AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs))
else:
assert not kwargs
return super().from_config(pretrained_model_name_or_path)
class AutoConfig_(AutoConfig):
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
pretrained_model_name_or_path = get_tokenizer_mirror(pretrained_model_name_or_path)
return super().from_pretrained(pretrained_model_name_or_path, **kwargs)
class AutoTokenizer_(AutoTokenizer):
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, use_fast=True,
do_basic_tokenize=True) -> PreTrainedTokenizer:
if isinstance(pretrained_model_name_or_path, str):
transformer = pretrained_model_name_or_path
else:
transformer = pretrained_model_name_or_path.transformer
additional_config = dict()
if transformer.startswith('voidful/albert_chinese_') or transformer.startswith('uer/albert'):
cls = BertTokenizer
elif transformer == 'cl-tohoku/bert-base-japanese-char':
# Since it's char level model, it's OK to use char level tok instead of fugashi
# from hanlp.utils.lang.ja.bert_tok import BertJapaneseTokenizerFast
# cls = BertJapaneseTokenizerFast
from transformers import BertJapaneseTokenizer
cls = BertJapaneseTokenizer
# from transformers import BertTokenizerFast
# cls = BertTokenizerFast
additional_config['word_tokenizer_type'] = 'basic'
elif transformer == "Langboat/mengzi-bert-base":
cls = BertTokenizerFast if use_fast else BertTokenizer
else:
cls = AutoTokenizer
if use_fast and not do_basic_tokenize:
warnings.warn('`do_basic_tokenize=False` might not work when `use_fast=True`')
tokenizer = cls.from_pretrained(get_tokenizer_mirror(transformer), use_fast=use_fast,
do_basic_tokenize=do_basic_tokenize,
**additional_config)
tokenizer.name_or_path = transformer
return tokenizer
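# A minimal usage sketch: identifiers with a HanLP mirror are routed through it,
# everything else falls back to HuggingFace. training=False builds the architecture
# from the config only, with randomly initialized weights.
if __name__ == '__main__':
    tokenizer = AutoTokenizer_.from_pretrained('bert-base-chinese')
    model = AutoModel_.from_pretrained('bert-base-chinese', training=False)
    print(type(tokenizer).__name__, type(model).__name__)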
================================================
FILE: hanlp/layers/transformers/relative_transformer.py
================================================
# A modified version of the implementation from the following paper:
# TENER: Adapting Transformer Encoder for Named Entity Recognition
# Hang Yan, Bocao Deng, Xiaonan Li, Xipeng Qiu
import math
import torch
import torch.nn.functional as F
from torch import Tensor, nn
from hanlp.common.structure import ConfigTracker
class RelativeSinusoidalPositionalEmbedding(nn.Module):
"""This module produces sinusoidal positional embeddings of any length.
Padding symbols are ignored.
    Args:
        embedding_dim: embedding size of each position
        padding_idx: index of the padding symbol
        init_size: initial number of positions to precompute embeddings for
    """
def __init__(self, embedding_dim, padding_idx, init_size=1024):
super().__init__()
self.embedding_dim = embedding_dim
self.padding_idx = padding_idx
assert init_size % 2 == 0
weights = self.get_embedding(
init_size + 1,
embedding_dim,
padding_idx,
)
self.register_buffer('weights', weights)
def get_embedding(self, num_embeddings, embedding_dim, padding_idx=None):
"""Build sinusoidal embeddings.
This matches the implementation in tensor2tensor, but differs slightly
from the description in Section 3.5 of "Attention Is All You Need".
        Args:
            num_embeddings: number of relative positions to embed
            embedding_dim: size of each embedding vector
            padding_idx: (Default value = None)
        Returns:
            A ``num_embeddings x embedding_dim`` embedding matrix.
        """
half_dim = embedding_dim // 2
emb = math.log(10000) / (half_dim - 1)
emb = torch.exp(torch.arange(half_dim, dtype=torch.float) * -emb)
emb = torch.arange(-num_embeddings // 2, num_embeddings // 2, dtype=torch.float).unsqueeze(1) * emb.unsqueeze(0)
emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1)
if embedding_dim % 2 == 1:
# zero pad
emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
if padding_idx is not None:
emb[padding_idx, :] = 0
self.origin_shift = num_embeddings // 2 + 1
return emb
def forward(self, inputs: Tensor):
"""Input is expected to be of size [bsz x seqlen].
        Args:
            inputs: a ``bsz x seqlen`` tensor; only its shape is used
        Returns:
            Relative position embeddings of shape ``2 * seqlen x embedding_dim``.
        """
bsz, seq_len = inputs.size()
max_pos = self.padding_idx + seq_len
if max_pos >= self.origin_shift:
# recompute/expand embeddings if needed
weights = self.get_embedding(
max_pos * 2,
self.embedding_dim,
self.padding_idx,
)
weights = weights.to(self.weights.device)
del self.weights
self.origin_shift = weights.size(0) // 2
self.register_buffer('weights', weights)
positions = torch.arange(-seq_len, seq_len).to(inputs.device).long() + self.origin_shift # 2*seq_len
embed = self.weights.index_select(0, positions.long()).detach()
return embed
class RelativeMultiHeadAttn(nn.Module):
def __init__(self, in_features, num_heads, dropout, r_w_bias=None, r_r_bias=None, init_seq_length=1024,
k_as_x=True):
"""
Args:
in_features:
num_heads:
dropout:
r_w_bias: n_head x head_dim or None
r_r_bias: n_head x head_dim or None
init_seq_length:
k_as_x:
"""
super().__init__()
self.k_as_x = k_as_x
if k_as_x:
self.qv_linear = nn.Linear(in_features, in_features * 2, bias=False)
else:
self.qkv_linear = nn.Linear(in_features, in_features * 3, bias=False)
self.n_head = num_heads
self.head_dim = in_features // num_heads
self.dropout_layer = nn.Dropout(dropout)
self.pos_embed = RelativeSinusoidalPositionalEmbedding(self.head_dim, 0, init_seq_length)
if r_r_bias is None or r_w_bias is None: # Biases are not shared
self.r_r_bias = nn.Parameter(nn.init.xavier_normal_(torch.zeros(num_heads, in_features // num_heads)))
self.r_w_bias = nn.Parameter(nn.init.xavier_normal_(torch.zeros(num_heads, in_features // num_heads)))
else:
            self.r_r_bias = r_r_bias  # r_r_bias is v
            self.r_w_bias = r_w_bias  # r_w_bias is u
def forward(self, x, mask):
"""
Args:
x: batch_size x max_len x d_model
mask: batch_size x max_len
Returns:
"""
batch_size, max_len, d_model = x.size()
pos_embed = self.pos_embed(mask) # l x head_dim
if self.k_as_x:
qv = self.qv_linear(x) # batch_size x max_len x d_model2
q, v = torch.chunk(qv, chunks=2, dim=-1)
k = x.view(batch_size, max_len, self.n_head, -1).transpose(1, 2)
else:
qkv = self.qkv_linear(x) # batch_size x max_len x d_model3
q, k, v = torch.chunk(qkv, chunks=3, dim=-1)
k = k.view(batch_size, max_len, self.n_head, -1).transpose(1, 2)
q = q.view(batch_size, max_len, self.n_head, -1).transpose(1, 2)
v = v.view(batch_size, max_len, self.n_head, -1).transpose(1, 2) # b x n x l x d
rw_head_q = q + self.r_r_bias[:, None]
        AC = torch.einsum('bnqd,bnkd->bnqk', [rw_head_q, k])  # b x n x l x l, where n is the number of heads
        D_ = torch.einsum('nd,ld->nl', self.r_w_bias, pos_embed)[None, :, None]  # n_head x 2max_len: each head's bias for each relative position
        B_ = torch.einsum('bnqd,ld->bnql', q, pos_embed)  # bsz x n_head x max_len x 2max_len: each query's bias for each relative shift
        E_ = torch.einsum('bnqd,ld->bnql', k, pos_embed)  # bsz x n_head x max_len x 2max_len: each key's bias for relative positions
        BD = B_ + D_  # bsz x n_head x max_len x 2max_len, to be converted to bsz x n_head x max_len x max_len
if self.k_as_x:
BD = self._shift(BD)
attn = AC + BD
else:
BDE = self._shift(BD) + self._transpose_shift(E_)
attn = AC + BDE
attn = attn.masked_fill(mask[:, None, None, :].eq(0), float('-inf'))
attn = F.softmax(attn, dim=-1)
attn = self.dropout_layer(attn)
        v = torch.matmul(attn, v).transpose(1, 2).reshape(batch_size, max_len, d_model)  # b x l x d_model
return v
def _shift(self, BD):
"""类似
-3 -2 -1 0 1 2
-3 -2 -1 0 1 2
-3 -2 -1 0 1 2
转换为
0 1 2
-1 0 1
-2 -1 0
Args:
BD: batch_size x n_head x max_len x 2max_len
Returns:
batch_size x n_head x max_len x max_len
"""
bsz, n_head, max_len, _ = BD.size()
zero_pad = BD.new_zeros(bsz, n_head, max_len, 1)
BD = torch.cat([BD, zero_pad], dim=-1).view(bsz, n_head, -1, max_len) # bsz x n_head x (2max_len+1) x max_len
BD = BD.narrow(dim=2, start=0, length=2 * max_len) \
.view(bsz, n_head, max_len, -1) # bsz x n_head x 2max_len x max_len
BD = BD.narrow(dim=-1, start=max_len, length=max_len)
return BD
def _transpose_shift(self, E):
"""类似
-3 -2 -1 0 1 2
-30 -20 -10 00 10 20
-300 -200 -100 000 100 200
转换为
0 -10 -200
1 00 -100
2 10 000
Args:
E: batch_size x n_head x max_len x 2max_len
Returns:
batch_size x n_head x max_len x max_len
"""
bsz, n_head, max_len, _ = E.size()
zero_pad = E.new_zeros(bsz, n_head, max_len, 1)
# bsz x n_head x -1 x (max_len+1)
E = torch.cat([E, zero_pad], dim=-1).view(bsz, n_head, -1, max_len)
indice = (torch.arange(max_len) * 2 + 1).to(E.device)
E = E.index_select(index=indice, dim=-2).transpose(-1, -2) # bsz x n_head x max_len x max_len
return E
class RelativeTransformerLayer(nn.Module):
def __init__(self,
in_features,
num_heads=4,
feedforward_dim=256,
dropout=0.2,
dropout_attn=None,
after_norm=True,
k_as_x=True,
init_seq_length=1024):
super().__init__()
if dropout_attn is None:
dropout_attn = dropout
self.after_norm = after_norm
self.norm1 = nn.LayerNorm(in_features)
self.norm2 = nn.LayerNorm(in_features)
self.self_attn = RelativeMultiHeadAttn(in_features,
num_heads,
dropout=dropout_attn,
init_seq_length=init_seq_length,
k_as_x=k_as_x)
self.ffn = nn.Sequential(nn.Linear(in_features, feedforward_dim),
nn.LeakyReLU(),
nn.Dropout(dropout, inplace=True),
nn.Linear(feedforward_dim, in_features),
nn.Dropout(dropout, inplace=True))
def forward(self, x, mask):
"""
Args:
x: batch_size x max_len x hidden_size
            mask: batch_size x max_len, where 0 marks padding
Returns:
batch_size x max_len x hidden_size
"""
residual = x
if not self.after_norm:
x = self.norm1(x)
x = self.self_attn(x, mask)
x = x + residual
if self.after_norm:
x = self.norm1(x)
residual = x
if not self.after_norm:
x = self.norm2(x)
x = self.ffn(x)
x = residual + x
if self.after_norm:
x = self.norm2(x)
return x
class RelativeTransformer(nn.Module):
def __init__(self,
in_features,
num_layers,
feedforward_dim,
num_heads,
dropout,
dropout_attn=None,
after_norm=True,
init_seq_length=1024,
k_as_x=True):
super().__init__()
        # Pass by keyword: the layer's signature orders num_heads before feedforward_dim
        self.layers = nn.ModuleList([
            RelativeTransformerLayer(in_features, num_heads=num_heads, feedforward_dim=feedforward_dim,
                                     dropout=dropout, dropout_attn=dropout_attn, after_norm=after_norm,
                                     init_seq_length=init_seq_length, k_as_x=k_as_x)
            for _ in range(num_layers)
        ])
def forward(self, x: Tensor, mask: Tensor):
"""
Args:
x: batch_size x max_len
mask: batch_size x max_len. 有value的地方为1
x: Tensor:
mask: Tensor:
Returns:
"""
if not x.numel():
return x
for layer in self.layers:
x = layer(x, mask)
return x
class RelativeTransformerEncoder(RelativeTransformer, ConfigTracker):
def __init__(self,
in_features,
num_layers=2,
num_heads=4,
feedforward_dim=256,
dropout=0.1,
dropout_attn=0.1,
after_norm=True,
k_as_x=True,
):
        # Match RelativeTransformer's signature (feedforward_dim before num_heads) and forward k_as_x
        super().__init__(in_features, num_layers, feedforward_dim, num_heads, dropout, dropout_attn, after_norm,
                         k_as_x=k_as_x)
ConfigTracker.__init__(self, locals())
def get_output_dim(self):
return self.config['in_features']
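# A minimal usage sketch: encode a padded batch with relative-position attention.
if __name__ == '__main__':
    encoder = RelativeTransformerEncoder(in_features=16)
    x = torch.randn(2, 5, 16)
    mask = torch.tensor([[1, 1, 1, 1, 1],
                         [1, 1, 1, 0, 0]])
    print(encoder(x, mask).shape)  # -> torch.Size([2, 5, 16])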
================================================
FILE: hanlp/layers/transformers/resource.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-05-20 12:43
from hanlp.utils.io_util import get_resource
from hanlp_common.constant import HANLP_URL
tokenizer_mirrors = {
'hfl/chinese-electra-180g-base-discriminator': HANLP_URL + 'transformers/electra_zh_base_20210706_125233.zip',
'hfl/chinese-electra-180g-small-discriminator': HANLP_URL + 'transformers/electra_zh_small_20210706_125427.zip',
'xlm-roberta-base': HANLP_URL + 'transformers/xlm-roberta-base_20210706_125502.zip',
'cl-tohoku/bert-base-japanese-char': HANLP_URL + 'transformers/bert-base-japanese-char_20210602_215445.zip',
'bart5-chinese-small': HANLP_URL + 'transformers/bart5-chinese-small_tok_20210723_180743.zip',
'ernie-gram': HANLP_URL + 'transformers/ernie-gram_20220207_103518.zip',
'xlm-roberta-base-no-space': HANLP_URL + 'transformers/xlm-roberta-base-no-space-tokenizer_20220610_204241.zip',
'mMiniLMv2L6-no-space': HANLP_URL + 'transformers/mMiniLMv2L6-no-space-tokenizer_20220616_094859.zip',
'mMiniLMv2L12-no-space': HANLP_URL + 'transformers/mMiniLMv2L12-no-space-tokenizer_20220616_095900.zip',
}
model_mirrors = {
'bart5-chinese-small': HANLP_URL + 'transformers/bart5-chinese-small_20210723_203923.zip',
'xlm-roberta-base-no-space': HANLP_URL + 'transformers/xlm-roberta-base-no-space_20220610_203944.zip',
'mMiniLMv2L6-no-space': HANLP_URL + 'transformers/mMiniLMv2L6-no-space_20220616_094949.zip',
'mMiniLMv2L12-no-space': HANLP_URL + 'transformers/mMiniLMv2L12-no-space_20220616_095924.zip',
}
def get_tokenizer_mirror(transformer: str) -> str:
m = tokenizer_mirrors.get(transformer, None)
if m:
return get_resource(m)
return transformer
def get_model_mirror(transformer: str) -> str:
m = model_mirrors.get(transformer, None)
if m:
return get_resource(m)
return transformer
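# A minimal usage sketch: mirrored identifiers resolve to a downloaded local copy,
# anything else passes through unchanged.
if __name__ == '__main__':
    print(get_model_mirror('bert-base-chinese'))  # not mirrored -> returned as-is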
================================================
FILE: hanlp/layers/transformers/tf_imports.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-05-08 21:57
from transformers import BertTokenizer, BertConfig, PretrainedConfig, TFAutoModel, \
AutoConfig, AutoTokenizer, PreTrainedTokenizer, TFPreTrainedModel, TFAlbertModel, TFAutoModelWithLMHead, \
BertTokenizerFast, TFAlbertForMaskedLM, AlbertConfig, TFBertModel
================================================
FILE: hanlp/layers/transformers/utils.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-06-15 21:22
from collections import defaultdict
from typing import Tuple, Union
import torch
from torch.nn import functional as F
from hanlp.components.parsers.ud import udify_util as util
from hanlp.layers.transformers.pt_imports import PreTrainedModel
def transformer_encode(transformer: PreTrainedModel,
input_ids,
attention_mask=None,
token_type_ids=None,
token_span=None,
layer_range: Union[int, Tuple[int, int]] = 0,
max_sequence_length=None,
average_subwords=False,
ret_raw_hidden_states=False):
"""Run transformer and pool its outputs.
Args:
transformer: A transformer model.
input_ids: Indices of subwords.
attention_mask: Mask for these subwords.
token_type_ids: Type ids for each subword.
token_span: The spans of tokens.
layer_range: The range of layers to use. Note that the 0-th layer means embedding layer, so the last 3 layers
of a 12-layer BERT will be (10, 13).
max_sequence_length: The maximum sequence length. Sequence longer than this will be handled by sliding
window.
average_subwords: ``True`` to average subword representations.
ret_raw_hidden_states: ``True`` to return hidden states of each layer.
Returns:
Pooled outputs.
"""
if max_sequence_length and input_ids.size(-1) > max_sequence_length:
# TODO: split token type ids in transformer_sliding_window if token type ids are not always 1
outputs = transformer_sliding_window(transformer, input_ids, max_pieces=max_sequence_length)
else:
if attention_mask is None:
attention_mask = input_ids.ne(0)
if transformer.config.output_hidden_states:
outputs = transformer(input_ids, attention_mask, token_type_ids)[-1]
else:
outputs = transformer(input_ids, attention_mask, token_type_ids)[0]
if transformer.config.output_hidden_states:
if isinstance(layer_range, int):
outputs = outputs[layer_range:]
else:
                outputs = outputs[layer_range[0]:layer_range[1]]
# Slow pick
# hs = []
# for h in outputs:
# hs.append(pick_tensor_for_each_token(h, token_span, average_subwords))
# Fast pick
if not isinstance(outputs, torch.Tensor):
x = torch.stack(outputs)
else:
x = outputs
L, B, T, F = x.size()
x = x.flatten(end_dim=1)
# tile token_span as x
if token_span is not None:
token_span = token_span.repeat(L, 1, 1)
hs = pick_tensor_for_each_token(x, token_span, average_subwords).view(L, B, -1, F)
if ret_raw_hidden_states:
return hs, outputs
return hs
else:
if ret_raw_hidden_states:
return pick_tensor_for_each_token(outputs, token_span, average_subwords), outputs
return pick_tensor_for_each_token(outputs, token_span, average_subwords)
def pick_tensor_for_each_token(h, token_span, average_subwords):
if token_span is None:
return h
if average_subwords and token_span.size(-1) > 1:
batch_size = h.size(0)
h_span = h.gather(1, token_span.view(batch_size, -1).unsqueeze(-1).expand(-1, -1, h.shape[-1]))
h_span = h_span.view(batch_size, *token_span.shape[1:], -1)
n_sub_tokens = token_span.ne(0)
n_sub_tokens[:, 0, 0] = True
h_span = (h_span * n_sub_tokens.unsqueeze(-1)).sum(2)
n_sub_tokens = n_sub_tokens.sum(-1).unsqueeze(-1)
zero_mask = n_sub_tokens == 0
if torch.any(zero_mask):
n_sub_tokens[zero_mask] = 1 # avoid dividing by zero
embed = h_span / n_sub_tokens
else:
embed = h.gather(1, token_span[:, :, 0].unsqueeze(-1).expand(-1, -1, h.size(-1)))
return embed
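# A minimal sketch of the pooling above, with hypothetical toy values:
#     h = torch.arange(24, dtype=torch.float).view(1, 6, 4)  # (batch, subwords, hidden)
#     token_span = torch.tensor([[[1, 2], [3, 0]]])          # word 0 = subwords 1 & 2, word 1 = subword 3
#     pick_tensor_for_each_token(h, token_span, True).shape  # -> torch.Size([1, 2, 4])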
def transformer_sliding_window(transformer: PreTrainedModel,
input_ids: torch.LongTensor,
input_mask=None,
offsets: torch.LongTensor = None,
token_type_ids: torch.LongTensor = None,
max_pieces=512,
start_tokens: int = 1,
end_tokens: int = 1,
ret_cls=None,
) -> torch.Tensor:
"""
Args:
transformer:
input_ids: torch.LongTensor:
input_mask: (Default value = None)
offsets: torch.LongTensor: (Default value = None)
token_type_ids: torch.LongTensor: (Default value = None)
max_pieces: (Default value = 512)
start_tokens: int: (Default value = 1)
end_tokens: int: (Default value = 1)
ret_cls: (Default value = None)
Returns:
"""
# pylint: disable=arguments-differ
batch_size, full_seq_len = input_ids.size(0), input_ids.size(-1)
initial_dims = list(input_ids.shape[:-1])
# The embedder may receive an input tensor that has a sequence length longer than can
# be fit. In that case, we should expect the wordpiece indexer to create padded windows
# of length `max_pieces` for us, and have them concatenated into one long sequence.
# E.g., "[CLS] I went to the [SEP] [CLS] to the store to [SEP] ..."
# We can then split the sequence into sub-sequences of that length, and concatenate them
# along the batch dimension so we effectively have one huge batch of partial sentences.
# This can then be fed into BERT without any sentence length issues. Keep in mind
# that the memory consumption can dramatically increase for large batches with extremely
# long sentences.
needs_split = full_seq_len > max_pieces
if needs_split:
input_ids = split_to_sliding_window(input_ids, max_pieces)
# if token_type_ids is None:
# token_type_ids = torch.zeros_like(input_ids)
if input_mask is None:
input_mask = (input_ids != 0).long()
# input_ids may have extra dimensions, so we reshape down to 2-d
# before calling the BERT model and then reshape back at the end.
outputs = transformer(input_ids=util.combine_initial_dims_to_1d_or_2d(input_ids),
# token_type_ids=util.combine_initial_dims_to_1d_or_2d(token_type_ids),
attention_mask=util.combine_initial_dims_to_1d_or_2d(input_mask)).to_tuple()
    if len(outputs) == 3:
        # After to_tuple(): (last_hidden_state, pooler_output, hidden_states)
        all_encoder_layers = outputs[-1]
        all_encoder_layers = torch.stack(all_encoder_layers)
elif len(outputs) == 2:
all_encoder_layers, _ = outputs[:2]
else:
all_encoder_layers = outputs[0]
if needs_split:
if ret_cls is not None:
cls_mask = input_ids[:, 0] == input_ids[0][0]
cls_hidden = all_encoder_layers[:, 0, :]
if ret_cls == 'max':
cls_hidden[~cls_mask] = -1e20
else:
cls_hidden[~cls_mask] = 0
cls_mask = cls_mask.view(-1, batch_size).transpose(0, 1)
cls_hidden = cls_hidden.reshape(cls_mask.size(1), batch_size, -1).transpose(0, 1)
if ret_cls == 'max':
cls_hidden = cls_hidden.max(1)[0]
elif ret_cls == 'raw':
return cls_hidden, cls_mask
else:
cls_hidden = torch.sum(cls_hidden, dim=1)
cls_hidden /= torch.sum(cls_mask, dim=1, keepdim=True)
return cls_hidden
else:
recombined_embeddings, select_indices = restore_from_sliding_window(all_encoder_layers, batch_size,
max_pieces, full_seq_len, start_tokens,
end_tokens)
initial_dims.append(len(select_indices))
else:
recombined_embeddings = all_encoder_layers
# Recombine the outputs of all layers
# (layers, batch_size * d1 * ... * dn, sequence_length, embedding_dim)
# recombined = torch.cat(combined, dim=2)
# input_mask = (recombined_embeddings != 0).long()
# At this point, mix is (batch_size * d1 * ... * dn, sequence_length, embedding_dim)
if offsets is None:
# Resize to (batch_size, d1, ..., dn, sequence_length, embedding_dim)
dims = initial_dims if needs_split else input_ids.size()
layers = util.uncombine_initial_dims(recombined_embeddings, dims)
else:
# offsets is (batch_size, d1, ..., dn, orig_sequence_length)
offsets2d = util.combine_initial_dims_to_1d_or_2d(offsets)
# now offsets is (batch_size * d1 * ... * dn, orig_sequence_length)
range_vector = util.get_range_vector(offsets2d.size(0),
device=util.get_device_of(recombined_embeddings)).unsqueeze(1)
# selected embeddings is also (batch_size * d1 * ... * dn, orig_sequence_length)
selected_embeddings = recombined_embeddings[:, range_vector, offsets2d]
layers = util.uncombine_initial_dims(selected_embeddings, offsets.size())
return layers
def split_to_sliding_window(input_ids, max_pieces):
# Split the flattened list by the window size, `max_pieces`
split_input_ids = list(input_ids.split(max_pieces, dim=-1))
# We want all sequences to be the same length, so pad the last sequence
last_window_size = split_input_ids[-1].size(-1)
padding_amount = max_pieces - last_window_size
split_input_ids[-1] = F.pad(split_input_ids[-1], pad=[0, padding_amount], value=0)
# Now combine the sequences along the batch dimension
input_ids = torch.cat(split_input_ids, dim=0)
return input_ids
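# A minimal sketch: a 10-piece sequence split into windows of 4, with the last
# window zero-padded and all windows stacked along the batch dimension:
#     split_to_sliding_window(torch.arange(1, 11).view(1, 10), max_pieces=4)
#     # -> tensor([[ 1,  2,  3,  4],
#     #            [ 5,  6,  7,  8],
#     #            [ 9, 10,  0,  0]])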
def restore_from_sliding_window(all_encoder_layers, batch_size, max_pieces, full_seq_len, start_tokens, end_tokens):
# First, unpack the output embeddings into one long sequence again
unpacked_embeddings = torch.split(all_encoder_layers, batch_size, dim=-3)
unpacked_embeddings = torch.cat(unpacked_embeddings, dim=-2)
# Next, select indices of the sequence such that it will result in embeddings representing the original
# sentence. To capture maximal context, the indices will be the middle part of each embedded window
# sub-sequence (plus any leftover start and final edge windows), e.g.,
# 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
# "[CLS] I went to the very fine [SEP] [CLS] the very fine store to eat [SEP]"
# with max_pieces = 8 should produce max context indices [2, 3, 4, 10, 11, 12] with additional start
# and final windows with indices [0, 1] and [14, 15] respectively.
# Find the stride as half the max pieces, ignoring the special start and end tokens
# Calculate an offset to extract the centermost embeddings of each window
stride = (max_pieces - start_tokens - end_tokens) // 2
stride_offset = stride // 2 + start_tokens
first_window = list(range(stride_offset))
max_context_windows = [i for i in range(full_seq_len)
if stride_offset - 1 < i % max_pieces < stride_offset + stride]
final_window_start = max_context_windows[-1] + 1
final_window = list(range(final_window_start, full_seq_len))
select_indices = first_window + max_context_windows + final_window
select_indices = torch.LongTensor(select_indices).to(unpacked_embeddings.device)
recombined_embeddings = unpacked_embeddings.index_select(-2, select_indices)
return recombined_embeddings, select_indices
def build_optimizer_for_pretrained(model: torch.nn.Module,
pretrained: torch.nn.Module,
lr=1e-5,
weight_decay=0.01,
eps=1e-8,
transformer_lr=None,
transformer_weight_decay=None,
no_decay=('bias', 'LayerNorm.bias', 'LayerNorm.weight'),
**kwargs):
if transformer_lr is None:
transformer_lr = lr
if transformer_weight_decay is None:
transformer_weight_decay = weight_decay
params = defaultdict(lambda: defaultdict(list))
pretrained = set(pretrained.parameters())
if isinstance(no_decay, tuple):
def no_decay_fn(name):
return any(nd in name for nd in no_decay)
else:
assert callable(no_decay), 'no_decay has to be callable or a tuple of str'
no_decay_fn = no_decay
for n, p in model.named_parameters():
is_pretrained = 'pretrained' if p in pretrained else 'non_pretrained'
is_no_decay = 'no_decay' if no_decay_fn(n) else 'decay'
params[is_pretrained][is_no_decay].append(p)
grouped_parameters = [
{'params': params['pretrained']['decay'], 'weight_decay': transformer_weight_decay, 'lr': transformer_lr},
{'params': params['pretrained']['no_decay'], 'weight_decay': 0.0, 'lr': transformer_lr},
{'params': params['non_pretrained']['decay'], 'weight_decay': weight_decay, 'lr': lr},
{'params': params['non_pretrained']['no_decay'], 'weight_decay': 0.0, 'lr': lr},
]
from transformers import optimization
return optimization.AdamW(
grouped_parameters,
lr=lr,
weight_decay=weight_decay,
eps=eps,
        no_deprecation_warning=True,  # For backwards compatibility
**kwargs)
def build_optimizer_scheduler_with_transformer(model: torch.nn.Module,
transformer: torch.nn.Module,
lr: float,
transformer_lr: float,
num_training_steps: int,
warmup_steps: Union[float, int],
weight_decay: float,
adam_epsilon: float,
no_decay=('bias', 'LayerNorm.bias', 'LayerNorm.weight')):
optimizer = build_optimizer_for_pretrained(model,
transformer,
lr,
weight_decay,
eps=adam_epsilon,
transformer_lr=transformer_lr,
no_decay=no_decay)
if isinstance(warmup_steps, float):
assert 0 < warmup_steps < 1, 'warmup_steps has to fall in range (0, 1) when it is float.'
warmup_steps = num_training_steps * warmup_steps
from transformers import optimization
scheduler = optimization.get_linear_schedule_with_warmup(optimizer, warmup_steps, num_training_steps)
return optimizer, scheduler
def get_optimizers(
model: torch.nn.Module,
num_training_steps: int,
learning_rate=5e-5,
adam_epsilon=1e-8,
weight_decay=0.0,
warmup_steps=0.1,
) -> Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]:
"""
Modified from https://github.com/huggingface/transformers/blob/7b75aa9fa55bee577e2c7403301ed31103125a35/src/transformers/trainer.py#L232
Setup the optimizer and the learning rate scheduler.
We provide a reasonable default that works well.
"""
if isinstance(warmup_steps, float):
assert 0 < warmup_steps < 1
warmup_steps = int(num_training_steps * warmup_steps)
# Prepare optimizer and schedule (linear warmup and decay)
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
{
"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
"weight_decay": weight_decay,
},
{
"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
"weight_decay": 0.0,
},
]
from transformers import AdamW, get_linear_schedule_with_warmup
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon)
scheduler = get_linear_schedule_with_warmup(
optimizer, num_warmup_steps=warmup_steps, num_training_steps=num_training_steps
)
return optimizer, scheduler
def collect_decay_params(model, weight_decay):
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
{
"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
"weight_decay": weight_decay,
},
{
"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
"weight_decay": 0.0,
},
]
return optimizer_grouped_parameters
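# A minimal usage sketch: biases (and LayerNorm weights) go into the no-decay group.
if __name__ == '__main__':
    groups = collect_decay_params(torch.nn.Linear(4, 2), weight_decay=0.01)
    print([len(g['params']) for g in groups])  # -> [1, 1]: weight decays, bias does not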
================================================
FILE: hanlp/layers/transformers/utils_tf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-29 15:32
import tensorflow as tf
from hanlp.optimizers.adamw import create_optimizer
from hanlp.utils.log_util import logger
def config_is(config, model='bert'):
return model in type(config).__name__.lower()
def convert_examples_to_features(
words,
max_seq_length,
tokenizer,
labels=None,
label_map=None,
cls_token_at_end=False,
cls_token="[CLS]",
cls_token_segment_id=1,
sep_token="[SEP]",
sep_token_extra=False,
pad_on_left=False,
pad_token_id=0,
pad_token_segment_id=0,
pad_token_label_id=0,
sequence_a_segment_id=0,
mask_padding_with_zero=True,
unk_token='[UNK]',
do_padding=True
):
"""Loads a data file into a list of `InputBatch`s
`cls_token_at_end` define the location of the CLS token:
- False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
- True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
`cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet)
Args:
words:
max_seq_length:
tokenizer:
labels: (Default value = None)
label_map: (Default value = None)
cls_token_at_end: (Default value = False)
cls_token: (Default value = "[CLS]")
cls_token_segment_id: (Default value = 1)
sep_token: (Default value = "[SEP]")
sep_token_extra: (Default value = False)
pad_on_left: (Default value = False)
pad_token_id: (Default value = 0)
pad_token_segment_id: (Default value = 0)
pad_token_label_id: (Default value = 0)
sequence_a_segment_id: (Default value = 0)
mask_padding_with_zero: (Default value = True)
unk_token: (Default value = '[UNK]')
do_padding: (Default value = True)
Returns:
"""
args = locals()
if not labels:
labels = words
pad_token_label_id = False
tokens = []
label_ids = []
for word, label in zip(words, labels):
word_tokens = tokenizer.tokenize(word)
if not word_tokens:
            # some weird chars cause the tokenizer to return an empty list
word_tokens = [unk_token] * len(word)
tokens.extend(word_tokens)
# Use the real label id for the first token of the word, and padding ids for the remaining tokens
label_ids.extend([label_map[label] if label_map else True] + [pad_token_label_id] * (len(word_tokens) - 1))
# Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa.
special_tokens_count = 3 if sep_token_extra else 2
if len(tokens) > max_seq_length - special_tokens_count:
logger.warning(
f'Input tokens {words} exceed the max sequence length of {max_seq_length - special_tokens_count}. '
f'The exceeded part will be truncated and ignored. '
f'You are recommended to split your long text into several sentences within '
f'{max_seq_length - special_tokens_count} tokens beforehand.')
tokens = tokens[: (max_seq_length - special_tokens_count)]
label_ids = label_ids[: (max_seq_length - special_tokens_count)]
# The convention in BERT is:
# (a) For sequence pairs:
# tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
# token_type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
# (b) For single sequences:
# tokens: [CLS] the dog is hairy . [SEP]
# token_type_ids: 0 0 0 0 0 0 0
#
# Where "token_type_ids" are used to indicate whether this is the first
# sequence or the second sequence. The embedding vectors for `type=0` and
# `type=1` were learned during pre-training and are added to the wordpiece
# embedding vector (and position vector). This is not *strictly* necessary
# since the [SEP] token unambiguously separates the sequences, but it makes
# it easier for the model to learn the concept of sequences.
#
# For classification tasks, the first vector (corresponding to [CLS]) is
# used as as the "sentence vector". Note that this only makes sense because
# the entire model is fine-tuned.
tokens += [sep_token]
label_ids += [pad_token_label_id]
if sep_token_extra:
        # RoBERTa uses an extra separator between pairs of sentences
tokens += [sep_token]
label_ids += [pad_token_label_id]
segment_ids = [sequence_a_segment_id] * len(tokens)
if cls_token_at_end:
tokens += [cls_token]
label_ids += [pad_token_label_id]
segment_ids += [cls_token_segment_id]
else:
tokens = [cls_token] + tokens
label_ids = [pad_token_label_id] + label_ids
segment_ids = [cls_token_segment_id] + segment_ids
input_ids = tokenizer.convert_tokens_to_ids(tokens)
# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
if do_padding:
# Zero-pad up to the sequence length.
padding_length = max_seq_length - len(input_ids)
if pad_on_left:
input_ids = ([pad_token_id] * padding_length) + input_ids
input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask
segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids
label_ids = ([pad_token_label_id] * padding_length) + label_ids
else:
input_ids += [pad_token_id] * padding_length
input_mask += [0 if mask_padding_with_zero else 1] * padding_length
segment_ids += [pad_token_segment_id] * padding_length
label_ids += [pad_token_label_id] * padding_length
assert len(input_ids) == max_seq_length
assert len(input_mask) == max_seq_length
assert len(segment_ids) == max_seq_length
assert len(label_ids) == max_seq_length, f'failed for:\n {args}'
else:
assert len(set(len(x) for x in [input_ids, input_mask, segment_ids, label_ids])) == 1
return input_ids, input_mask, segment_ids, label_ids
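# A minimal usage sketch (illustrative only; the toy vocabulary and _ToyTokenizer below
# are hypothetical stand-ins for a real wordpiece tokenizer -- any object offering
# `tokenize` and `convert_tokens_to_ids` works):
#
#     class _ToyTokenizer:
#         vocab = {'[PAD]': 0, '[UNK]': 1, '[CLS]': 2, '[SEP]': 3, 'hello': 4, 'world': 5}
#
#         def tokenize(self, word):
#             return [word] if word in self.vocab else []
#
#         def convert_tokens_to_ids(self, tokens):
#             return [self.vocab.get(t, 1) for t in tokens]
#
#     ids, mask, seg, label_ids = convert_examples_to_features(
#         ['hello', 'world'], 8, _ToyTokenizer(), labels=['B', 'O'], label_map={'B': 1, 'O': 2})
#     # ids       == [2, 4, 5, 3, 0, 0, 0, 0]  i.e. [CLS] hello world [SEP] + padding
#     # mask      == [1, 1, 1, 1, 0, 0, 0, 0]
#     # label_ids == [0, 1, 2, 0, 0, 0, 0, 0]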
def build_adamw_optimizer(config, learning_rate, epsilon, clipnorm, train_steps, use_amp, warmup_steps,
weight_decay_rate):
opt = create_optimizer(init_lr=learning_rate,
epsilon=epsilon,
weight_decay_rate=weight_decay_rate,
clipnorm=clipnorm,
num_train_steps=train_steps, num_warmup_steps=warmup_steps)
# opt = tfa.optimizers.AdamW(learning_rate=3e-5, epsilon=1e-08, weight_decay=0.01)
# opt = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08)
config.optimizer = tf.keras.utils.serialize_keras_object(opt)
lr_config = config.optimizer['config']['learning_rate']['config']
if 'decay_schedule_fn' in lr_config:
lr_config['decay_schedule_fn'] = dict(
(k, v) for k, v in lr_config['decay_schedule_fn'].items() if not k.startswith('_'))
if use_amp:
# loss scaling is currently required when using mixed precision
opt = tf.keras.mixed_precision.experimental.LossScaleOptimizer(opt, 'dynamic')
return opt
def adjust_tokens_for_transformers(sentence):
"""Adjust tokens for BERT
See https://github.com/DoodleJZ/HPSG-Neural-Parser/blob/master/src_joint/Zparser.py#L1204
Args:
sentence:
Returns:
"""
cleaned_words = []
for word in sentence:
# word = BERT_TOKEN_MAPPING.get(word, word)
if word == "n't" and cleaned_words:
cleaned_words[-1] = cleaned_words[-1] + "n"
word = "'t"
cleaned_words.append(word)
return cleaned_words
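# A quick sanity check of the contraction handling above (the sentence is made up):
if __name__ == '__main__':
    assert adjust_tokens_for_transformers(['I', 'do', "n't", 'know']) == ['I', 'don', "'t", 'know']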
================================================
FILE: hanlp/layers/weight_normalization.py
================================================
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =============================================================================
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from hanlp.utils.tf_util import hanlp_register
@hanlp_register
class WeightNormalization(tf.keras.layers.Wrapper):
"""This wrapper reparameterizes a layer by decoupling the weight's
magnitude and direction.
This speeds up convergence by improving the
conditioning of the optimization problem.
Weight Normalization: A Simple Reparameterization to Accelerate
Training of Deep Neural Networks: https://arxiv.org/abs/1602.07868
Tim Salimans, Diederik P. Kingma (2016)
WeightNormalization wrapper works for keras and tf layers.
```python
net = WeightNormalization(
tf.keras.layers.Conv2D(2, 2, activation='relu'),
input_shape=(32, 32, 3),
data_init=True)(x)
net = WeightNormalization(
tf.keras.layers.Conv2D(16, 5, activation='relu'),
data_init=True)(net)
net = WeightNormalization(
tf.keras.layers.Dense(120, activation='relu'),
data_init=True)(net)
net = WeightNormalization(
tf.keras.layers.Dense(n_classes),
data_init=True)(net)
```
    Args:
        layer: a layer instance.
        data_init: If `True` use data dependent variable initialization.
    Raises:
        ValueError: If not initialized with a `Layer` instance.
        ValueError: If `Layer` does not contain a `kernel` of weights.
        NotImplementedError: If `data_init` is True and running graph execution.
    """
def __init__(self, layer, data_init=True, **kwargs):
super(WeightNormalization, self).__init__(layer, **kwargs)
self.data_init = data_init
self._track_trackable(layer, name='layer')
self._init_critical_section = tf.CriticalSection(name='init_mutex')
self.is_rnn = isinstance(self.layer, tf.keras.layers.RNN)
def build(self, input_shape):
"""Build `Layer`
Args:
input_shape:
Returns:
"""
input_shape = tf.TensorShape(input_shape)
self.input_spec = tf.keras.layers.InputSpec(
shape=[None] + input_shape[1:])
if not self.layer.built:
self.layer.build(input_shape)
kernel_layer = self.layer.cell if self.is_rnn else self.layer
if not hasattr(kernel_layer, 'kernel'):
raise ValueError('`WeightNormalization` must wrap a layer that'
' contains a `kernel` for weights')
# The kernel's filter or unit dimension is -1
self.layer_depth = int(kernel_layer.kernel.shape[-1])
self.kernel_norm_axes = list(range(kernel_layer.kernel.shape.rank - 1))
self.g = self.add_weight(
name='g',
shape=(self.layer_depth,),
initializer='ones',
dtype=kernel_layer.kernel.dtype,
trainable=True)
self.v = kernel_layer.kernel
self._initialized = self.add_weight(
name='initialized',
shape=None,
initializer='zeros',
dtype=tf.dtypes.bool,
trainable=False)
if self.data_init:
# Used for data initialization in self._data_dep_init.
with tf.name_scope('data_dep_init'):
layer_config = tf.keras.layers.serialize(self.layer)
layer_config['config']['trainable'] = False
self._naked_clone_layer = tf.keras.layers.deserialize(
layer_config)
self._naked_clone_layer.build(input_shape)
self._naked_clone_layer.set_weights(self.layer.get_weights())
if self.is_rnn:
self._naked_clone_layer.cell.activation = None
else:
self._naked_clone_layer.activation = None
self.built = True
def call(self, inputs):
"""Call `Layer`
Args:
inputs:
Returns:
"""
def _do_nothing():
return tf.identity(self.g)
def _update_weights():
# Ensure we read `self.g` after _update_weights.
with tf.control_dependencies(self._initialize_weights(inputs)):
return tf.identity(self.g)
g = self._init_critical_section.execute(lambda: tf.cond(
self._initialized, _do_nothing, _update_weights))
with tf.name_scope('compute_weights'):
# Replace kernel by normalized weight variable.
self.layer.kernel = tf.nn.l2_normalize(
self.v, axis=self.kernel_norm_axes) * g
# Ensure we calculate result after updating kernel.
update_kernel = tf.identity(self.layer.kernel)
with tf.control_dependencies([update_kernel]):
outputs = self.layer(inputs)
return outputs
def compute_output_shape(self, input_shape):
return tf.TensorShape(
self.layer.compute_output_shape(input_shape).as_list())
def _initialize_weights(self, inputs):
"""Initialize weight g.
        The initial value of g comes either from the norm of the initial value of v,
        or from the input data when self.data_init is True.
Args:
inputs:
Returns:
"""
with tf.control_dependencies([
tf.debugging.assert_equal( # pylint: disable=bad-continuation
self._initialized,
False,
message='The layer has been initialized.')
]):
if self.data_init:
assign_tensors = self._data_dep_init(inputs)
else:
assign_tensors = self._init_norm()
assign_tensors.append(self._initialized.assign(True))
return assign_tensors
def _init_norm(self):
"""Set the weight g with the norm of the weight vector."""
with tf.name_scope('init_norm'):
v_flat = tf.reshape(self.v, [-1, self.layer_depth])
v_norm = tf.linalg.norm(v_flat, axis=0)
g_tensor = self.g.assign(tf.reshape(v_norm, (self.layer_depth,)))
return [g_tensor]
def _data_dep_init(self, inputs):
"""Data dependent initialization.
Args:
inputs:
Returns:
"""
with tf.name_scope('data_dep_init'):
# Generate data dependent init values
x_init = self._naked_clone_layer(inputs)
data_norm_axes = list(range(x_init.shape.rank - 1))
m_init, v_init = tf.nn.moments(x_init, data_norm_axes)
scale_init = 1. / tf.math.sqrt(v_init + 1e-10)
# Assign data dependent init values
g_tensor = self.g.assign(self.g * scale_init)
if hasattr(self.layer, 'bias') and self.layer.bias is not None:
bias_tensor = self.layer.bias.assign(-m_init * scale_init)
return [g_tensor, bias_tensor]
else:
return [g_tensor]
def get_config(self):
config = {'data_init': self.data_init}
base_config = super(WeightNormalization, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
================================================
FILE: hanlp/losses/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-20 01:28
================================================
FILE: hanlp/losses/sparse_categorical_crossentropy.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-20 01:29
import tensorflow as tf
from hanlp.utils.tf_util import hanlp_register
@hanlp_register
class SparseCategoricalCrossentropyOverNonzeroWeights(object):
def __init__(self) -> None:
super().__init__()
self.__name__ = type(self).__name__
def __call__(self, y_true, y_pred, sample_weight=None, **kwargs):
loss = tf.keras.losses.sparse_categorical_crossentropy(y_true, y_pred, from_logits=True)
if sample_weight is not None:
loss = loss * sample_weight
loss = tf.reduce_sum(loss)
if sample_weight is not None:
# This is equivalent to SUM_OVER_BATCH_SIZE
# loss /= tf.reduce_sum(tf.ones_like(sample_weight, dtype=loss.dtype))
# This one is SUM_BY_NONZERO_WEIGHTS
loss /= tf.reduce_sum(sample_weight)
return loss
@hanlp_register
class SparseCategoricalCrossentropyOverBatchFirstDim(object):
def __init__(self) -> None:
super().__init__()
self.__name__ = type(self).__name__
def __call__(self, y_true, y_pred, sample_weight=None, **kwargs):
loss = tf.keras.losses.sparse_categorical_crossentropy(y_true, y_pred, from_logits=True)
if sample_weight is not None:
loss = loss * sample_weight
# could use sum of sample_weight[:,0] too
loss = tf.reduce_sum(loss) / tf.cast(tf.shape(y_true)[0], tf.float32)
return loss
def get_config(self):
return {}
@hanlp_register
class MaskedSparseCategoricalCrossentropyOverBatchFirstDim(object):
def __init__(self, mask_value=0) -> None:
super().__init__()
self.mask_value = mask_value
self.__name__ = type(self).__name__
def __call__(self, y_true, y_pred, sample_weight=None, **kwargs):
assert sample_weight is None, 'the mask will be computed via y_true != mask_value, ' \
'it might conflict with sample_weight'
active_loss = tf.not_equal(y_true, self.mask_value)
active_labels = tf.boolean_mask(y_true, active_loss)
active_logits = tf.boolean_mask(y_pred, active_loss)
loss = tf.keras.losses.sparse_categorical_crossentropy(active_labels, active_logits, from_logits=True)
loss = tf.reduce_sum(loss) / tf.cast(tf.shape(y_true)[0], tf.float32)
return loss
================================================
FILE: hanlp/metrics/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-09-14 21:55
================================================
FILE: hanlp/metrics/accuracy.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-06-12 17:56
from typing import Optional, Iterable
import torch
from hanlp.metrics.metric import Metric
class CategoricalAccuracy(Metric):
"""
Categorical Top-K accuracy. Assumes integer labels, with
each item to be classified having a single correct class.
Tie break enables equal distribution of scores among the
classes with same maximum predicted scores.
Copied from AllenNLP and added several methods.
"""
def __init__(self, top_k: int = 1, tie_break: bool = False) -> None:
if top_k > 1 and tie_break:
raise ValueError(
"Tie break in Categorical Accuracy can be done only for maximum (top_k = 1)"
)
if top_k <= 0:
raise ValueError("top_k passed to Categorical Accuracy must be > 0")
self._top_k = top_k
self._tie_break = tie_break
self.correct_count = 0.0
self.total_count = 0.0
def __call__(
self,
predictions: torch.Tensor,
gold_labels: torch.Tensor,
mask: Optional[torch.BoolTensor] = None,
):
"""
# Parameters
predictions : `torch.Tensor`, required.
A tensor of predictions of shape (batch_size, ..., num_classes).
gold_labels : `torch.Tensor`, required.
A tensor of integer class label of shape (batch_size, ...). It must be the same
shape as the `predictions` tensor without the `num_classes` dimension.
mask : `torch.BoolTensor`, optional (default = `None`).
A masking tensor the same size as `gold_labels`.
"""
predictions, gold_labels, mask = self.detach_tensors(predictions, gold_labels, mask)
# Some sanity checks.
num_classes = predictions.size(-1)
if gold_labels.dim() != predictions.dim() - 1:
raise ValueError(
"gold_labels must have dimension == predictions.size() - 1 but "
"found tensor of shape: {}".format(predictions.size())
)
if (gold_labels >= num_classes).any():
raise ValueError(
"A gold label passed to Categorical Accuracy contains an id >= {}, "
"the number of classes.".format(num_classes)
)
predictions = predictions.view((-1, num_classes))
gold_labels = gold_labels.view(-1).long()
if not self._tie_break:
# Top K indexes of the predictions (or fewer, if there aren't K of them).
# Special case topk == 1, because it's common and .max() is much faster than .topk().
if self._top_k == 1:
top_k = predictions.max(-1)[1].unsqueeze(-1)
else:
top_k = predictions.topk(min(self._top_k, predictions.shape[-1]), -1)[1]
# This is of shape (batch_size, ..., top_k).
correct = top_k.eq(gold_labels.unsqueeze(-1)).float()
else:
# prediction is correct if gold label falls on any of the max scores. distribute score by tie_counts
max_predictions = predictions.max(-1)[0]
max_predictions_mask = predictions.eq(max_predictions.unsqueeze(-1))
# max_predictions_mask is (rows X num_classes) and gold_labels is (batch_size)
# ith entry in gold_labels points to index (0-num_classes) for ith row in max_predictions
            # For each row, check whether the index pointed to by gold_label was among the max-scored classes
correct = max_predictions_mask[
torch.arange(gold_labels.numel(), device=gold_labels.device).long(), gold_labels
].float()
tie_counts = max_predictions_mask.sum(-1)
correct /= tie_counts.float()
correct.unsqueeze_(-1)
if mask is not None:
correct *= mask.view(-1, 1)
self.total_count += mask.sum()
else:
self.total_count += gold_labels.numel()
self.correct_count += correct.sum()
@property
def score(self):
if self.total_count > 1e-12:
accuracy = float(self.correct_count) / float(self.total_count)
else:
accuracy = 0.0
return accuracy
def __repr__(self) -> str:
return f'Accuracy:{self.score:.2%}'
@staticmethod
def detach_tensors(*tensors: torch.Tensor) -> Iterable[torch.Tensor]:
"""
If you actually passed gradient-tracking Tensors to a Metric, there will be
a huge memory leak, because it will prevent garbage collection for the computation
graph. This method ensures the tensors are detached.
"""
# Check if it's actually a tensor in case something else was passed.
return (x.detach() if isinstance(x, torch.Tensor) else x for x in tensors)
def reset(self):
self.correct_count = 0.0
self.total_count = 0.0
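# A toy example of CategoricalAccuracy (the values are made up):
#
#     acc = CategoricalAccuracy()
#     acc(torch.tensor([[0.2, 0.8], [0.9, 0.1]]), torch.tensor([1, 1]))
#     acc.score  # 0.5: only the first prediction picks class 1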
class BooleanAccuracy(Metric):
"""
Just checks batch-equality of two tensors and computes an accuracy metric based on that.
That is, if your prediction has shape (batch_size, dim_1, ..., dim_n), this metric considers that
as a set of `batch_size` predictions and checks that each is *entirely* correct across the remaining dims.
This means the denominator in the accuracy computation is `batch_size`, with the caveat that predictions
that are totally masked are ignored (in which case the denominator is the number of predictions that have
at least one unmasked element).
    This is similar to `CategoricalAccuracy`, if you've already done a `.max()`
on your predictions. If you have categorical output, though, you should typically just use
`CategoricalAccuracy`. The reason you might want to use this instead is if you've done
some kind of constrained inference and don't have a prediction tensor that matches the API of
`CategoricalAccuracy`, which assumes a final dimension of size `num_classes`.
"""
def __init__(self) -> None:
self._correct_count = 0.0
self._total_count = 0.0
def __call__(
self,
predictions: torch.Tensor,
gold_labels: torch.Tensor,
mask: Optional[torch.BoolTensor] = None,
):
"""
# Parameters
predictions : `torch.Tensor`, required.
A tensor of predictions of shape (batch_size, ...).
gold_labels : `torch.Tensor`, required.
A tensor of the same shape as `predictions`.
mask : `torch.BoolTensor`, optional (default = `None`).
A tensor of the same shape as `predictions`.
"""
predictions, gold_labels, mask = self.detach_tensors(predictions, gold_labels, mask)
# Some sanity checks.
if gold_labels.size() != predictions.size():
raise ValueError(
f"gold_labels must have shape == predictions.size() but "
f"found tensor of shape: {gold_labels.size()}"
)
if mask is not None and mask.size() != predictions.size():
raise ValueError(
f"mask must have shape == predictions.size() but "
f"found tensor of shape: {mask.size()}"
)
batch_size = predictions.size(0)
if mask is not None:
# We can multiply by the mask up front, because we're just checking equality below, and
# this way everything that's masked will be equal.
predictions = predictions * mask
gold_labels = gold_labels * mask
# We want to skip predictions that are completely masked;
# so we'll keep predictions that aren't.
keep = mask.view(batch_size, -1).max(dim=1)[0]
else:
keep = torch.ones(batch_size, device=predictions.device).bool()
predictions = predictions.view(batch_size, -1)
gold_labels = gold_labels.view(batch_size, -1)
# At this point, predictions is (batch_size, rest_of_dims_combined),
# so .eq -> .prod will be 1 if every element of the instance prediction is correct
# and 0 if at least one element of the instance prediction is wrong.
# Because of how we're handling masking, masked positions are automatically "correct".
correct = predictions.eq(gold_labels).prod(dim=1).float()
# Since masked positions are correct, we need to explicitly exclude instance predictions
# where the entire prediction is masked (because they look "correct").
self._correct_count += (correct * keep).sum()
self._total_count += keep.sum()
def get_metric(self, reset: bool = False):
"""
# Returns
The accumulated accuracy.
"""
if self._total_count > 0:
accuracy = float(self._correct_count) / float(self._total_count)
else:
accuracy = 0.0
if reset:
self.reset()
return accuracy
def reset(self):
self._correct_count = 0.0
self._total_count = 0.0
@staticmethod
def detach_tensors(*tensors: torch.Tensor) -> Iterable[torch.Tensor]:
"""
If you actually passed gradient-tracking Tensors to a Metric, there will be
a huge memory leak, because it will prevent garbage collection for the computation
graph. This method ensures the tensors are detached.
"""
# Check if it's actually a tensor in case something else was passed.
return (x.detach() if isinstance(x, torch.Tensor) else x for x in tensors)
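# A minimal sketch of BooleanAccuracy on made-up tensors:
if __name__ == '__main__':
    acc = BooleanAccuracy()
    acc(torch.tensor([[1, 2], [3, 4]]), torch.tensor([[1, 2], [3, 0]]))
    assert acc.get_metric() == 0.5  # only the first instance matches on every element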
================================================
FILE: hanlp/metrics/amr/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-08-24 12:47
================================================
FILE: hanlp/metrics/amr/smatch_eval.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-08-24 12:47
import os
import warnings
from typing import Union
from hanlp.metrics.f1 import F1_
from hanlp.metrics.mtl import MetricDict
from hanlp.utils.io_util import get_resource, run_cmd, pushd
from hanlp.utils.log_util import flash
_SMATCH_SCRIPT = 'https://github.com/ChunchuanLv/amr-evaluation-tool-enhanced/archive/master.zip#evaluation.sh'
_FAST_SMATCH_SCRIPT = 'https://github.com/jcyk/AMR-gs/archive/master.zip#tools/fast_smatch/compute_smatch.sh'
class SmatchScores(MetricDict):
@property
def score(self):
return self['Smatch'].score
def smatch_eval(pred, gold, use_fast=False) -> Union[SmatchScores, F1_]:
script = get_resource(_FAST_SMATCH_SCRIPT if use_fast else _SMATCH_SCRIPT)
home = os.path.dirname(script)
pred = os.path.realpath(pred)
gold = os.path.realpath(gold)
with pushd(home):
flash('Running evaluation script [blink][yellow]...[/yellow][/blink]')
cmd = f'bash {script} {pred} {gold}'
text = run_cmd(cmd)
flash('')
return format_fast_scores(text) if use_fast else format_official_scores(text)
def post_process(pred, amr_version):
pred = os.path.realpath(pred)
utils_tar_gz = get_amr_utils(amr_version)
util_dir = get_resource(utils_tar_gz)
stog_home = get_resource('https://github.com/jcyk/AMR-gs/archive/master.zip')
with pushd(stog_home):
run_cmd(
f'python3 -u -m stog.data.dataset_readers.amr_parsing.postprocess.postprocess '
f'--amr_path {pred} --util_dir {util_dir} --v 2')
return pred + '.post'
def get_amr_utils(amr_version):
if amr_version == '1.0':
utils_tar_gz = 'https://www.cs.jhu.edu/~s.zhang/data/AMR/amr_1.0_utils.tar.gz'
elif amr_version == '2.0':
utils_tar_gz = 'https://www.cs.jhu.edu/~s.zhang/data/AMR/amr_2.0_utils.tar.gz'
elif amr_version == '3.0':
utils_tar_gz = 'https://file.hankcs.com/research/amr2020/amr_3.0_utils.tgz'
else:
raise ValueError(f'Unsupported AMR version {amr_version}')
return utils_tar_gz
def format_official_scores(text: str):
# Smatch -> P: 0.136, R: 0.107, F: 0.120
# Unlabeled -> P: 0.229, R: 0.180, F: 0.202
# No WSD -> P: 0.137, R: 0.108, F: 0.120
# Non_sense_frames -> P: 0.008, R: 0.008, F: 0.008
# Wikification -> P: 0.000, R: 0.000, F: 0.000
# Named Ent. -> P: 0.222, R: 0.092, F: 0.130
# Negations -> P: 0.000, R: 0.000, F: 0.000
# IgnoreVars -> P: 0.005, R: 0.003, F: 0.003
# Concepts -> P: 0.075, R: 0.036, F: 0.049
# Frames -> P: 0.007, R: 0.007, F: 0.007
# Reentrancies -> P: 0.113, R: 0.060, F: 0.079
# SRL -> P: 0.145, R: 0.104, F: 0.121
scores = SmatchScores()
for line in text.split('\n'):
line = line.strip()
if not line:
continue
name, vs = line.split(' -> ')
try:
p, r, f = [float(x.split(': ')[-1]) for x in vs.split(', ')]
except ValueError:
warnings.warn(f'Failed to parse results from smatch: {line}')
p, r, f = float("nan"), float("nan"), float("nan")
scores[name] = F1_(p, r, f)
return scores
def format_fast_scores(text: str):
# using fast smatch
# Precision: 0.137
# Recall: 0.108
# Document F-score: 0.121
scores = []
for line in text.split('\n'):
line = line.strip()
if not line or ':' not in line:
continue
name, score = line.split(': ')
scores.append(float(score))
assert len(scores) == 3
return F1_(*scores)
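# A minimal sketch of parsing the official script output (the numbers are made up):
if __name__ == '__main__':
    scores = format_official_scores('Smatch -> P: 0.136, R: 0.107, F: 0.120\n'
                                    'Unlabeled -> P: 0.229, R: 0.180, F: 0.202')
    assert abs(scores.score - 0.120) < 1e-9  # SmatchScores.score is the Smatch F1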
================================================
FILE: hanlp/metrics/chunking/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-21 03:49
================================================
FILE: hanlp/metrics/chunking/binary_chunking_f1.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-08-02 14:27
from collections import defaultdict
from typing import List, Union
import torch
from hanlp.metrics.f1 import F1
class BinaryChunkingF1(F1):
def __call__(self, pred_tags: torch.LongTensor, gold_tags: torch.LongTensor, lens: List[int] = None):
if lens is None:
lens = [gold_tags.size(1)] * gold_tags.size(0)
self.update(self.decode_spans(pred_tags, lens), self.decode_spans(gold_tags, lens))
def update(self, pred_tags, gold_tags):
for pred, gold in zip(pred_tags, gold_tags):
super().__call__(set(pred), set(gold))
@staticmethod
def decode_spans(pred_tags: torch.LongTensor, lens: Union[List[int], torch.LongTensor]):
if isinstance(lens, torch.Tensor):
lens = lens.tolist()
batch_pred = defaultdict(list)
for batch, offset in pred_tags.nonzero(as_tuple=False).tolist():
batch_pred[batch].append(offset)
batch_pred_spans = [[(0, l)] for l in lens]
for batch, offsets in batch_pred.items():
l = lens[batch]
batch_pred_spans[batch] = list(zip(offsets, offsets[1:] + [l]))
return batch_pred_spans
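# A toy example of decode_spans: nonzero tags mark span boundaries (values are made up):
if __name__ == '__main__':
    tags = torch.tensor([[0, 1, 0, 1, 0]])
    assert BinaryChunkingF1.decode_spans(tags, [5]) == [[(1, 3), (3, 5)]]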
================================================
FILE: hanlp/metrics/chunking/bmes_tf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-09-14 21:55
from hanlp.common.vocab_tf import VocabTF
from hanlp.metrics.chunking.chunking_f1_tf import ChunkingF1_TF
from hanlp.metrics.chunking.sequence_labeling import get_entities
class BMES_F1_TF(ChunkingF1_TF):
def __init__(self, tag_vocab: VocabTF, from_logits=True, suffix=False, name='f1', dtype=None, **kwargs):
super().__init__(tag_vocab, from_logits, name, dtype, **kwargs)
self.nb_correct = 0
self.nb_pred = 0
self.nb_true = 0
self.suffix = suffix
def update_tags(self, true_tags, pred_tags):
for t, p in zip(true_tags, pred_tags):
self.update_entities(get_entities(t, self.suffix), get_entities(p, self.suffix))
return self.result()
def update_entities(self, true_entities, pred_entities):
true_entities = set(true_entities)
pred_entities = set(pred_entities)
nb_correct = len(true_entities & pred_entities)
nb_pred = len(pred_entities)
nb_true = len(true_entities)
self.nb_correct += nb_correct
self.nb_pred += nb_pred
self.nb_true += nb_true
def result(self):
nb_correct = self.nb_correct
nb_pred = self.nb_pred
nb_true = self.nb_true
p = nb_correct / nb_pred if nb_pred > 0 else 0
r = nb_correct / nb_true if nb_true > 0 else 0
score = 2 * p * r / (p + r) if p + r > 0 else 0
return score
def reset_states(self):
self.nb_correct = 0
self.nb_pred = 0
self.nb_true = 0
================================================
FILE: hanlp/metrics/chunking/chunking_f1.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-06-11 22:14
import io
from collections import defaultdict
from typing import List, Set, Tuple, Dict
from hanlp.metrics.chunking.conlleval import calculate_metrics, DetailedF1, metrics
from hanlp.metrics.chunking.sequence_labeling import get_entities
from hanlp.metrics.f1 import F1
from hanlp.metrics.metric import Metric
class ChunkingF1(F1):
def __call__(self, pred_tags: List[List[str]], gold_tags: List[List[str]]):
for p, g in zip(pred_tags, gold_tags):
pred = set(get_entities(p))
gold = set(get_entities(g))
self.nb_pred += len(pred)
self.nb_true += len(gold)
self.nb_correct += len(pred & gold)
class DetailedSpanF1(Metric):
def __init__(self, do_confusion_matrix=False):
self.correct_chunk = 0 # number of correctly identified chunks
self.correct_unlabeled = 0
self.total_gold = 0 # number of chunks in corpus
self.total_pred = 0 # number of identified chunks
self.token_counter = 0 # token counter (ignores sentence breaks)
# counts by type
self.t_correct_chunk = defaultdict(int)
self.t_total_gold = defaultdict(int)
self.t_total_pred = defaultdict(int)
self.do_confusion_matrix = do_confusion_matrix
if do_confusion_matrix:
self.pred_labels = []
self.gold_labels = []
@property
def states(self):
return (self.t_correct_chunk, self.t_total_gold, self.t_total_pred)
def reset_state(self):
self.correct_chunk = 0 # number of correctly identified chunks
self.total_gold = 0 # number of chunks in corpus
self.total_pred = 0 # number of identified chunks
self.token_counter = 0 # token counter (ignores sentence breaks)
for state in self.states:
state.clear()
if self.do_confusion_matrix:
self.pred_labels = []
self.gold_labels = []
@property
def score(self):
overall = calculate_metrics(
self.correct_chunk, self.total_pred, self.total_gold
)
return overall.fscore
def __call__(self, pred: Set[Tuple[int, int, str]], gold: Set[Tuple[int, int, str]], num_tokens=None):
pred_chunks_unlabeled = set((b, e) for b, e, l in pred)
gold_chunks_unlabeled = set((b, e) for b, e, l in gold)
self.correct_unlabeled += len(pred_chunks_unlabeled & gold_chunks_unlabeled)
self.correct_chunk += len(pred & gold)
self.total_gold += len(gold)
self.total_pred += len(pred)
if num_tokens:
self.token_counter += num_tokens
def group_by_tag(collection: Set[Tuple[int, int, str]]):
group = defaultdict(set)
for b, e, l in collection:
group[l].add((b, e))
return group
pred_tags = group_by_tag(pred)
gold_tags = group_by_tag(gold)
for l in pred_tags.keys() | gold_tags.keys():
self.t_correct_chunk[l] += len(pred_tags[l] & gold_tags[l])
self.t_total_gold[l] += len(gold_tags[l])
self.t_total_pred[l] += len(pred_tags[l])
if self.do_confusion_matrix:
def group_by_span(collection: Set[Tuple[int, int, str]]):
group = dict()
for b, e, l in collection:
group[(b, e)] = l
return group
pred_spans = group_by_span(pred)
gold_spans = group_by_span(gold)
for span in pred_spans.keys() & gold_spans.keys():
self.pred_labels.append(pred_spans[span])
self.gold_labels.append(gold_spans[span])
def reset(self):
self.reset_state()
def report(self) -> Tuple[DetailedF1, Dict[str, DetailedF1], str]:
out = io.StringIO()
c = self
out.write('processed %d tokens with %d phrases; ' % (c.token_counter, c.total_gold))
out.write('found: %d phrases; correct: %d.\n' % (c.total_pred, c.correct_chunk))
overall = calculate_metrics(c.correct_unlabeled, c.total_pred, c.total_gold)
out.write('%17s: ' % 'unlabeled overall')
out.write('precision: %6.2f%%; ' % (100. * overall.prec))
out.write('recall: %6.2f%%; ' % (100. * overall.rec))
out.write('FB1: %6.2f\n' % (100. * overall.fscore))
overall, by_type = metrics(self)
out.write('%17s: ' % 'labeled overall')
out.write('precision: %6.2f%%; ' % (100. * overall.prec))
out.write('recall: %6.2f%%; ' % (100. * overall.rec))
out.write('FB1: %6.2f\n' % (100. * overall.fscore))
for i, m in sorted(by_type.items()):
out.write('%17s: ' % i)
out.write('precision: %6.2f%%; ' % (100. * m.prec))
out.write('recall: %6.2f%%; ' % (100. * m.rec))
out.write('FB1: %6.2f %d\n' % (100. * m.fscore, c.t_total_pred[i]))
text = out.getvalue()
out.close()
return overall, by_type, text
def __str__(self) -> str:
return self.report()[-1]
def confusion_matrix(self):
from sklearn.metrics import confusion_matrix
        labels = sorted(set(self.gold_labels) | set(self.pred_labels))
return confusion_matrix(self.gold_labels, self.pred_labels, labels=labels), labels
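# A minimal sketch of DetailedSpanF1 on made-up (begin, end, label) spans:
if __name__ == '__main__':
    m = DetailedSpanF1()
    m({(0, 2, 'PER')}, {(0, 2, 'PER'), (3, 4, 'LOC')}, num_tokens=5)
    print(m.score)  # 2/3, since P=1.0 and R=0.5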
================================================
FILE: hanlp/metrics/chunking/chunking_f1_tf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-29 23:09
from abc import ABC, abstractmethod
import tensorflow as tf
from hanlp.common.vocab_tf import VocabTF
class ChunkingF1_TF(tf.keras.metrics.Metric, ABC):
def __init__(self, tag_vocab: VocabTF, from_logits=True, name='f1', dtype=None, **kwargs):
super().__init__(name, dtype, dynamic=True, **kwargs)
self.tag_vocab = tag_vocab
self.from_logits = from_logits
def update_the_state(self, y_true: tf.Tensor, y_pred: tf.Tensor, sample_weight: tf.Tensor = None, **kwargs):
if sample_weight is None:
if hasattr(y_pred, '_keras_mask'):
mask = y_pred._keras_mask
else:
mask = None
else:
mask = sample_weight
if self.tag_vocab.pad_idx is not None and mask is None:
            # The model doesn't compute a mask but the vocab provides a padding index,
            # so it's ok to derive the mask from the padding index.
mask = y_true != self.tag_vocab.pad_idx
assert mask is not None, 'ChunkingF1 requires masking, check your _keras_mask or compute_mask'
if self.from_logits:
y_pred = tf.argmax(y_pred, axis=-1)
y_true = self.to_tags(y_true, mask)
y_pred = self.to_tags(y_pred, mask)
return self.update_tags(y_true, y_pred)
def __call__(self, y_true: tf.Tensor, y_pred: tf.Tensor, sample_weight: tf.Tensor = None, **kwargs):
return self.update_the_state(y_true, y_pred, sample_weight)
def update_state(self, y_true: tf.Tensor, y_pred: tf.Tensor, sample_weight: tf.Tensor = None, **kwargs):
return self.update_the_state(y_true, y_pred, sample_weight)
def to_tags(self, y: tf.Tensor, sample_weight: tf.Tensor):
batch = []
y = y.numpy()
sample_weight = sample_weight.numpy()
for sent, mask in zip(y, sample_weight):
tags = []
for tag, m in zip(sent, mask):
if not m:
continue
tag = int(tag)
if self.tag_vocab.pad_idx is not None and tag == self.tag_vocab.pad_idx:
                    # If the model predicts the padding tag, it will fail most metrics, so replace it with a valid tag
tag = 1
tags.append(self.tag_vocab.get_token(tag))
batch.append(tags)
return batch
@abstractmethod
def update_tags(self, true_tags, pred_tags):
pass
@abstractmethod
def result(self):
pass
================================================
FILE: hanlp/metrics/chunking/conlleval.py
================================================
#!/usr/bin/env python
# Python version of the evaluation script from the CoNLL'00 shared task
# Intentional differences:
# - accept any space as delimiter by default
# - optional file argument (default STDIN)
# - option to set boundary (-b argument)
# - LaTeX output (-l argument) not supported
# - raw tags (-r argument) not supported
import io
import sys
from collections import defaultdict, namedtuple
from typing import Tuple, Union, List
from hanlp.utils.span_util import bio_tags_to_spans
from hanlp.metrics.metric import Metric
ANY_SPACE = ''
class FormatError(Exception):
pass
DetailedF1 = namedtuple('DetailedF1', 'tp fp fn prec rec fscore')
class EvalCounts(object):
def __init__(self):
self.correct_chunk = 0 # number of correctly identified chunks
self.correct_tags = 0 # number of correct chunk tags
self.total_gold = 0 # number of chunks in corpus
self.total_pred = 0 # number of identified chunks
self.token_counter = 0 # token counter (ignores sentence breaks)
# counts by type
self.t_correct_chunk = defaultdict(int)
self.t_total_gold = defaultdict(int)
self.t_total_pred = defaultdict(int)
@property
def states(self):
return (self.t_correct_chunk, self.t_total_gold, self.t_total_pred)
def reset_state(self):
self.correct_chunk = 0 # number of correctly identified chunks
self.correct_tags = 0 # number of correct chunk tags
self.total_gold = 0 # number of chunks in corpus
self.total_pred = 0 # number of identified chunks
self.token_counter = 0 # token counter (ignores sentence breaks)
for state in self.states:
state.clear()
class SpanF1(Metric):
def __init__(self, label_encoding='IOBES') -> None:
super().__init__()
self.label_encoding = label_encoding
self.count = EvalCounts()
def reset(self):
self.count = EvalCounts()
@property
def score(self):
return self.result(False, False).fscore
def reset_state(self):
self.count.reset_state()
def update_state(self, true_seqs: List[str], pred_seqs: List[str]):
if self.label_encoding == 'IOBES':
count = evaluate_iobes(true_seqs, pred_seqs)
elif self.label_encoding in ['IOB2', 'BIO']:
count = evaluate_iob2(true_seqs, pred_seqs)
else:
raise ValueError(f'Unrecognized label encoding {self.label_encoding}')
self.count.correct_chunk += count.correct_chunk
self.count.correct_tags += count.correct_tags
self.count.total_gold += count.total_gold
self.count.total_pred += count.total_pred
self.count.token_counter += count.token_counter
for s, n in zip(self.count.states, count.states):
for k, v in n.items():
s[k] = s.get(k, 0) + v
def batch_update_state(self, true_seqs: List[List[str]], pred_seqs: List[List[str]]):
for t, p in zip(true_seqs, pred_seqs):
self.update_state(t, p)
def result(self, full=True, verbose=True) -> Union[Tuple[DetailedF1, dict, str], DetailedF1]:
if full:
out = io.StringIO()
overall, by_type = report(self.count, out)
text = out.getvalue()
if verbose:
print(text)
out.close()
return overall, by_type, text
else:
overall, _ = metrics(self.count)
return overall
# torch convention: put pred before gold
def __call__(self, pred_seqs: List[List[str]], true_seqs: List[List[str]]):
return self.batch_update_state(true_seqs, pred_seqs)
def __repr__(self) -> str:
result = self.result(False, False)
return f"P: {result.prec:.2%} R: {result.rec:.2%} F: {result.fscore:.2%}"
def parse_args(argv):
import argparse
parser = argparse.ArgumentParser(
description='evaluate tagging results using CoNLL criteria',
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
arg = parser.add_argument
arg('-b', '--boundary', metavar='STR', default='-X-',
help='sentence boundary')
arg('-d', '--delimiter', metavar='CHAR', default=ANY_SPACE,
help='character delimiting items in input')
arg('-o', '--otag', metavar='CHAR', default='O',
help='alternative outside tag')
arg('file', nargs='?', default=None)
return parser.parse_args(argv)
def split_tag(chunk_tag):
"""split chunk tag into IOBES prefix and chunk_type
e.g.
B-PER -> (B, PER)
O -> (O, None)
Args:
chunk_tag:
Returns:
"""
if chunk_tag == 'O':
return ('O', None)
return chunk_tag.split('-', maxsplit=1)
def evaluate_iobes(true_seqs, pred_seqs):
counts = EvalCounts()
    in_correct = False  # whether the currently processed chunk is correct so far
    last_correct = 'O'  # previous chunk tag in corpus
    last_correct_type = ''  # type of previous chunk tag in corpus
    last_guessed = 'O'  # previously identified chunk tag
    last_guessed_type = ''  # type of previously identified chunk tag
for true_tag, pred_tag in zip(true_seqs, pred_seqs):
guessed, guessed_type = split_tag(pred_tag)
correct, correct_type = split_tag(true_tag)
end_correct = end_of_chunk(last_correct, correct,
last_correct_type, correct_type)
end_guessed = end_of_chunk(last_guessed, guessed,
last_guessed_type, guessed_type)
start_correct = start_of_chunk(last_correct, correct,
last_correct_type, correct_type)
start_guessed = start_of_chunk(last_guessed, guessed,
last_guessed_type, guessed_type)
if in_correct:
if (end_correct and end_guessed and
last_guessed_type == last_correct_type):
in_correct = False
counts.correct_chunk += 1
counts.t_correct_chunk[last_correct_type] += 1
elif (end_correct != end_guessed or guessed_type != correct_type):
in_correct = False
if start_correct and start_guessed and guessed_type == correct_type:
in_correct = True
if start_correct:
counts.total_gold += 1
counts.t_total_gold[correct_type] += 1
if start_guessed:
counts.total_pred += 1
counts.t_total_pred[guessed_type] += 1
if correct == guessed and guessed_type == correct_type:
counts.correct_tags += 1
counts.token_counter += 1
last_guessed = guessed
last_correct = correct
last_guessed_type = guessed_type
last_correct_type = correct_type
if in_correct:
counts.correct_chunk += 1
counts.t_correct_chunk[last_correct_type] += 1
return counts
def evaluate_iob2(true_seqs, pred_seqs):
counts = EvalCounts()
gold = set(bio_tags_to_spans(true_seqs))
pred = set(bio_tags_to_spans(pred_seqs))
counts.correct_chunk = len(gold & pred)
counts.total_pred = len(pred)
counts.total_gold = len(gold)
return counts
def uniq(iterable):
seen = set()
return [i for i in iterable if not (i in seen or seen.add(i))]
def calculate_metrics(correct, guessed, total):
tp, fp, fn = correct, guessed - correct, total - correct
p = 0. if tp + fp == 0 else 1. * tp / (tp + fp)
r = 0. if tp + fn == 0 else 1. * tp / (tp + fn)
f = 0. if p + r == 0 else 2 * p * r / (p + r)
return DetailedF1(tp, fp, fn, p, r, f)
def calc_metrics(tp, p, t, percent=True):
"""compute overall precision, recall and FB1 (default values are 0.0)
if percent is True, return 100 * original decimal value
Args:
tp:
p:
t:
percent: (Default value = True)
Returns:
"""
precision = tp / p if p else 0
recall = tp / t if t else 0
fb1 = 2 * precision * recall / (precision + recall) if precision + recall else 0
if percent:
return 100 * precision, 100 * recall, 100 * fb1
else:
return precision, recall, fb1
def metrics(counts):
c = counts
overall = calculate_metrics(
c.correct_chunk, c.total_pred, c.total_gold
)
by_type = {}
for t in uniq(list(c.t_total_gold.keys()) + list(c.t_total_pred.keys())):
by_type[t] = calculate_metrics(
c.t_correct_chunk[t], c.t_total_pred[t], c.t_total_gold[t]
)
return overall, by_type
def report(counts, out=None):
if out is None:
out = sys.stdout
overall, by_type = metrics(counts)
c = counts
out.write('processed %d tokens with %d phrases; ' %
(c.token_counter, c.total_gold))
out.write('found: %d phrases; correct: %d.\n' %
(c.total_pred, c.correct_chunk))
if c.token_counter > 0:
out.write('accuracy: %6.2f%%; ' %
(100. * c.correct_tags / c.token_counter))
out.write('precision: %6.2f%%; ' % (100. * overall.prec))
out.write('recall: %6.2f%%; ' % (100. * overall.rec))
out.write('FB1: %6.2f\n' % (100. * overall.fscore))
for i, m in sorted(by_type.items()):
out.write('%17s: ' % i)
out.write('precision: %6.2f%%; ' % (100. * m.prec))
out.write('recall: %6.2f%%; ' % (100. * m.rec))
out.write('FB1: %6.2f %d\n' % (100. * m.fscore, c.t_total_pred[i]))
return overall, by_type
def end_of_chunk(prev_tag, tag, prev_type, type_):
    # check if a chunk ended between the previous and current word
    # arguments: previous and current chunk tags, previous and current types
    return ((prev_tag == "B" and tag == "B") or
            (prev_tag == "B" and tag == "O") or
            (prev_tag == "I" and tag == "B") or
            (prev_tag == "I" and tag == "O") or
            (prev_tag == "E" and tag == "E") or
            (prev_tag == "E" and tag == "I") or
            (prev_tag == "E" and tag == "O") or
            (prev_tag != "O" and prev_tag != "." and prev_type != type_) or
            (prev_tag == "]" or prev_tag == "["))
def start_of_chunk(prev_tag, tag, prev_type, type_):
    # check if a chunk started between the previous and current word
    # arguments: previous and current chunk tags, previous and current types
    chunk_start = ((prev_tag == "B" and tag == "B") or
                   (prev_tag == "I" and tag == "B") or
                   (prev_tag == "O" and tag == "B") or
                   (prev_tag == "O" and tag == "I") or
                   (prev_tag == "E" and tag == "E") or
                   (prev_tag == "E" and tag == "I") or
                   (prev_tag == "O" and tag == "E") or
                   (tag != "O" and tag != "." and prev_type != type_) or
                   (tag == "]" or tag == "["))
    # corrected 1998-12-22: these chunks are assumed to have length 1
    return chunk_start
def main(argv):
    args = parse_args(argv[1:])
    lines = sys.stdin if args.file is None else open(args.file, encoding='utf-8')
    # Each non-boundary line is expected to end with the gold tag followed by the
    # predicted tag (sentence boundaries are skipped).
    true_seqs, pred_seqs = [], []
    for line in lines:
        fields = line.split() if args.delimiter == ANY_SPACE else line.split(args.delimiter)
        if len(fields) >= 2 and fields[0] != args.boundary:
            true_seqs.append(fields[-2])
            pred_seqs.append(fields[-1])
    counts = evaluate_iobes(true_seqs, pred_seqs)
    report(counts)
if __name__ == '__main__':
sys.exit(main(sys.argv))
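# A toy example of SpanF1 (note the torch convention: predictions first, then gold):
#
#     f1 = SpanF1(label_encoding='BIO')
#     f1([['B-PER', 'I-PER', 'O', 'B-LOC']], [['B-PER', 'I-PER', 'O', 'O']])
#     repr(f1)  # 'P: 50.00% R: 100.00% F: 66.67%'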
================================================
FILE: hanlp/metrics/chunking/iobes_tf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-09-14 21:55
from hanlp.common.vocab_tf import VocabTF
from hanlp.metrics.chunking.conlleval import SpanF1
from hanlp.metrics.chunking.chunking_f1_tf import ChunkingF1_TF
class IOBES_F1_TF(ChunkingF1_TF):
def __init__(self, tag_vocab: VocabTF, from_logits=True, name='f1', dtype=None, **kwargs):
super().__init__(tag_vocab, from_logits, name, dtype, **kwargs)
self.state = SpanF1()
def update_tags(self, true_tags, pred_tags):
# true_tags = list(itertools.chain.from_iterable(true_tags))
# pred_tags = list(itertools.chain.from_iterable(pred_tags))
# self.state.update_state(true_tags, pred_tags)
for gold, pred in zip(true_tags, pred_tags):
self.state.update_state(gold, pred)
return self.result()
def result(self):
return self.state.result(full=False, verbose=False).fscore
def reset_states(self):
self.state.reset_state()
================================================
FILE: hanlp/metrics/chunking/sequence_labeling.py
================================================
# MIT License
#
# Copyright (c) 2018 chakki
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
"""Metrics to assess performance on sequence labeling task given prediction
Functions named as ``*_score`` return a scalar value to maximize: the higher
the better
"""
from collections import defaultdict
import numpy as np
def iobes_to_span(words, tags):
delimiter = ' '
if all([len(w) == 1 for w in words]):
delimiter = '' # might be Chinese
entities = []
for tag, start, end in get_entities(tags):
entities.append((delimiter.join(words[start:end]), tag, start, end))
yield entities
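# e.g. next(iobes_to_span(['北', '京'], ['B-LOC', 'E-LOC'])) yields [('北京', 'LOC', 0, 2)],
# since single-character words are joined without a delimiter.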
def get_entities(seq, suffix=False):
"""Gets entities from sequence.
Args:
seq(list): sequence of labels.
suffix: (Default value = False)
Returns:
list: list of (chunk_type, chunk_start, chunk_end).
Example:
>>> from seqeval.metrics.sequence_labeling import get_entities
>>> seq = ['B-PER', 'I-PER', 'O', 'B-LOC']
>>> get_entities(seq)
[('PER', 0, 2), ('LOC', 3, 4)]
"""
# for nested list
if any(isinstance(s, list) for s in seq):
seq = [item for sublist in seq for item in sublist + ['O']]
prev_tag = 'O'
prev_type = ''
begin_offset = 0
chunks = []
for i, chunk in enumerate(seq + ['O']):
if suffix:
tag = chunk[-1]
type_ = chunk[:-2]
else:
tag = chunk[0]
type_ = chunk[2:]
if end_of_chunk(prev_tag, tag, prev_type, type_):
chunks.append((prev_type, begin_offset, i))
if start_of_chunk(prev_tag, tag, prev_type, type_):
begin_offset = i
prev_tag = tag
prev_type = type_
return chunks
def end_of_chunk(prev_tag, tag, prev_type, type_):
"""Checks if a chunk ended between the previous and current word.
Args:
prev_tag: previous chunk tag.
tag: current chunk tag.
prev_type: previous type.
type_: current type.
Returns:
chunk_end: boolean.
"""
chunk_end = False
if prev_tag == 'E': chunk_end = True
if prev_tag == 'S': chunk_end = True
if prev_tag == 'B' and tag == 'B': chunk_end = True
if prev_tag == 'B' and tag == 'S': chunk_end = True
if prev_tag == 'B' and tag == 'O': chunk_end = True
if prev_tag == 'I' and tag == 'B': chunk_end = True
if prev_tag == 'I' and tag == 'S': chunk_end = True
if prev_tag == 'I' and tag == 'O': chunk_end = True
if prev_tag != 'O' and prev_tag != '.' and prev_type != type_:
chunk_end = True
return chunk_end
def start_of_chunk(prev_tag, tag, prev_type, type_):
"""Checks if a chunk started between the previous and current word.
Args:
prev_tag: previous chunk tag.
tag: current chunk tag.
prev_type: previous type.
type_: current type.
Returns:
chunk_start: boolean.
"""
chunk_start = False
if tag == 'B': chunk_start = True
if tag == 'S': chunk_start = True
if prev_tag == 'E' and tag == 'E': chunk_start = True
if prev_tag == 'E' and tag == 'I': chunk_start = True
if prev_tag == 'S' and tag == 'E': chunk_start = True
if prev_tag == 'S' and tag == 'I': chunk_start = True
if prev_tag == 'O' and tag == 'E': chunk_start = True
if prev_tag == 'O' and tag == 'I': chunk_start = True
if tag != 'O' and tag != '.' and prev_type != type_:
chunk_start = True
return chunk_start
def f1_score(y_true, y_pred, average='micro', suffix=False):
"""Compute the F1 score.
The F1 score can be interpreted as a weighted average of the precision and
recall, where an F1 score reaches its best value at 1 and worst score at 0.
The relative contribution of precision and recall to the F1 score are
equal. The formula for the F1 score is::
F1 = 2 * (precision * recall) / (precision + recall)
Args:
y_true: 2d array. Ground truth (correct) target values.
y_pred: 2d array. Estimated targets as returned by a tagger.
average: (Default value = 'micro')
suffix: (Default value = False)
Returns:
score: float.
Example:
>>> from seqeval.metrics import f1_score
>>> y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
>>> y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
>>> f1_score(y_true, y_pred)
0.50
"""
true_entities = set(get_entities(y_true, suffix))
pred_entities = set(get_entities(y_pred, suffix))
nb_correct = len(true_entities & pred_entities)
nb_pred = len(pred_entities)
nb_true = len(true_entities)
p = nb_correct / nb_pred if nb_pred > 0 else 0
r = nb_correct / nb_true if nb_true > 0 else 0
score = 2 * p * r / (p + r) if p + r > 0 else 0
return score
def accuracy_score(y_true, y_pred):
"""Accuracy classification score.
In multilabel classification, this function computes subset accuracy:
the set of labels predicted for a sample must *exactly* match the
corresponding set of labels in y_true.
Args:
y_true: 2d array. Ground truth (correct) target values.
y_pred: 2d array. Estimated targets as returned by a tagger.
Returns:
score: float.
Example:
>>> from seqeval.metrics import accuracy_score
>>> y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
>>> y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
>>> accuracy_score(y_true, y_pred)
0.80
"""
if any(isinstance(s, list) for s in y_true):
y_true = [item for sublist in y_true for item in sublist]
y_pred = [item for sublist in y_pred for item in sublist]
nb_correct = sum(y_t == y_p for y_t, y_p in zip(y_true, y_pred))
nb_true = len(y_true)
score = nb_correct / nb_true
return score
def precision_score(y_true, y_pred, average='micro', suffix=False):
"""Compute the precision.
The precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of
true positives and ``fp`` the number of false positives. The precision is
    intuitively the ability of the classifier not to label as positive a sample that is negative.
The best value is 1 and the worst value is 0.
Args:
y_true: 2d array. Ground truth (correct) target values.
y_pred: 2d array. Estimated targets as returned by a tagger.
average: (Default value = 'micro')
suffix: (Default value = False)
Returns:
score: float.
Example:
>>> from seqeval.metrics import precision_score
>>> y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
>>> y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
>>> precision_score(y_true, y_pred)
0.50
"""
true_entities = set(get_entities(y_true, suffix))
pred_entities = set(get_entities(y_pred, suffix))
nb_correct = len(true_entities & pred_entities)
nb_pred = len(pred_entities)
score = nb_correct / nb_pred if nb_pred > 0 else 0
return score
def recall_score(y_true, y_pred, average='micro', suffix=False):
"""Compute the recall.
The recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of
true positives and ``fn`` the number of false negatives. The recall is
intuitively the ability of the classifier to find all the positive samples.
The best value is 1 and the worst value is 0.
Args:
y_true: 2d array. Ground truth (correct) target values.
y_pred: 2d array. Estimated targets as returned by a tagger.
average: (Default value = 'micro')
suffix: (Default value = False)
Returns:
score: float.
Example:
>>> from seqeval.metrics import recall_score
>>> y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
>>> y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
>>> recall_score(y_true, y_pred)
0.50
"""
true_entities = set(get_entities(y_true, suffix))
pred_entities = set(get_entities(y_pred, suffix))
nb_correct = len(true_entities & pred_entities)
nb_true = len(true_entities)
score = nb_correct / nb_true if nb_true > 0 else 0
return score
def performance_measure(y_true, y_pred):
"""Compute the performance metrics: TP, FP, FN, TN
Args:
y_true: 2d array. Ground truth (correct) target values.
y_pred: 2d array. Estimated targets as returned by a tagger.
Returns:
performance_dict: dict
Example:
>>> from seqeval.metrics import performance_measure
>>> y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'O', 'B-ORG'], ['B-PER', 'I-PER', 'O']]
>>> y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O', 'O'], ['B-PER', 'I-PER', 'O']]
>>> performance_measure(y_true, y_pred)
    {'TP': 3, 'FP': 3, 'FN': 1, 'TN': 4}
    """
    performance_dict = dict()
    if any(isinstance(s, list) for s in y_true):
        y_true = [item for sublist in y_true for item in sublist]
        y_pred = [item for sublist in y_pred for item in sublist]
    performance_dict['TP'] = sum(y_t == y_p for y_t, y_p in zip(y_true, y_pred)
                                 if ((y_t != 'O') or (y_p != 'O')))
    performance_dict['FP'] = sum(y_t != y_p for y_t, y_p in zip(y_true, y_pred))
    performance_dict['FN'] = sum(((y_t != 'O') and (y_p == 'O'))
                                 for y_t, y_p in zip(y_true, y_pred))
    performance_dict['TN'] = sum((y_t == y_p == 'O')
                                 for y_t, y_p in zip(y_true, y_pred))
    return performance_dict
def classification_report(y_true, y_pred, digits=2, suffix=False):
"""Build a text report showing the main classification metrics.
Args:
y_true: 2d array. Ground truth (correct) target values.
y_pred: 2d array. Estimated targets as returned by a classifier.
digits: int. Number of digits for formatting output floating point values. (Default value = 2)
suffix: (Default value = False)
Returns:
report: string. Text summary of the precision, recall, F1 score for each class.
Examples:
>>> from seqeval.metrics import classification_report
>>> y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
>>> y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
>>> print(classification_report(y_true, y_pred))
precision recall f1-score support
MISC 0.00 0.00 0.00 1
PER 1.00 1.00 1.00 1
micro avg 0.50 0.50 0.50 2
macro avg 0.50 0.50 0.50 2
"""
true_entities = set(get_entities(y_true, suffix))
pred_entities = set(get_entities(y_pred, suffix))
name_width = 0
d1 = defaultdict(set)
d2 = defaultdict(set)
for e in true_entities:
d1[e[0]].add((e[1], e[2]))
name_width = max(name_width, len(e[0]))
for e in pred_entities:
d2[e[0]].add((e[1], e[2]))
last_line_heading = 'macro avg'
width = max(name_width, len(last_line_heading), digits)
headers = ["precision", "recall", "f1-score", "support"]
head_fmt = u'{:>{width}s} ' + u' {:>9}' * len(headers)
report = head_fmt.format(u'', *headers, width=width)
report += u'\n\n'
row_fmt = u'{:>{width}s} ' + u' {:>9.{digits}f}' * 3 + u' {:>9}\n'
ps, rs, f1s, s = [], [], [], []
for type_name, true_entities in d1.items():
pred_entities = d2[type_name]
nb_correct = len(true_entities & pred_entities)
nb_pred = len(pred_entities)
nb_true = len(true_entities)
p = nb_correct / nb_pred if nb_pred > 0 else 0
r = nb_correct / nb_true if nb_true > 0 else 0
f1 = 2 * p * r / (p + r) if p + r > 0 else 0
report += row_fmt.format(*[type_name, p, r, f1, nb_true], width=width, digits=digits)
ps.append(p)
rs.append(r)
f1s.append(f1)
s.append(nb_true)
report += u'\n'
# compute averages
report += row_fmt.format('micro avg',
precision_score(y_true, y_pred, suffix=suffix),
recall_score(y_true, y_pred, suffix=suffix),
f1_score(y_true, y_pred, suffix=suffix),
np.sum(s),
width=width, digits=digits)
report += row_fmt.format(last_line_heading,
np.average(ps, weights=s),
np.average(rs, weights=s),
np.average(f1s, weights=s),
np.sum(s),
width=width, digits=digits)
return report
================================================
FILE: hanlp/metrics/f1.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-07-10 14:55
from abc import ABC
from hanlp.metrics.metric import Metric
class F1(Metric, ABC):
def __init__(self, nb_pred=0, nb_true=0, nb_correct=0) -> None:
super().__init__()
self.nb_correct = nb_correct
self.nb_pred = nb_pred
self.nb_true = nb_true
def __repr__(self) -> str:
p, r, f = self.prf
return f"P: {p:.2%} R: {r:.2%} F1: {f:.2%}"
@property
def prf(self):
nb_correct = self.nb_correct
nb_pred = self.nb_pred
nb_true = self.nb_true
p = nb_correct / nb_pred if nb_pred > 0 else .0
r = nb_correct / nb_true if nb_true > 0 else .0
f = 2 * p * r / (p + r) if p + r > 0 else .0
return p, r, f
@property
def score(self):
return self.prf[-1]
def reset(self):
self.nb_correct = 0
self.nb_pred = 0
self.nb_true = 0
def __call__(self, pred: set, gold: set):
self.nb_correct += len(pred & gold)
self.nb_pred += len(pred)
self.nb_true += len(gold)
class F1_(Metric):
def __init__(self, p, r, f) -> None:
super().__init__()
self.f = f
self.r = r
self.p = p
@property
def score(self):
return self.f
def __call__(self, pred, gold):
raise NotImplementedError()
def reset(self):
self.f = self.r = self.p = 0
def __repr__(self) -> str:
p, r, f = self.p, self.r, self.f
return f"P: {p:.2%} R: {r:.2%} F1: {f:.2%}"
================================================
FILE: hanlp/metrics/metric.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-06-03 11:35
from abc import ABC, abstractmethod
class Metric(ABC):
def __lt__(self, other):
return self.score < other
def __le__(self, other):
return self.score <= other
def __eq__(self, other):
return self.score == other
def __ge__(self, other):
return self.score >= other
def __gt__(self, other):
return self.score > other
def __ne__(self, other):
return self.score != other
@property
@abstractmethod
def score(self):
pass
@abstractmethod
def __call__(self, pred, gold, mask=None):
pass
def __repr__(self) -> str:
        return f'{self.score:.4f}'
def __float__(self):
return self.score
@abstractmethod
def reset(self):
pass
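# --- Usage sketch (added for illustration; not part of the original file) ---
# The rich comparisons above delegate to `score`, so a concrete Metric can be
# compared directly against a plain float when tracking the best checkpoint:
#     if dev_metric > best_score:
#         best_score = float(dev_metric)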
================================================
FILE: hanlp/metrics/mtl.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-08-03 00:16
from hanlp.metrics.metric import Metric
class MetricDict(Metric, dict):
_COLORS = ["magenta", "cyan", "green", "yellow"]
@property
def score(self):
return sum(float(x) for x in self.values()) / len(self)
def __call__(self, pred, gold):
for metric in self.values():
metric(pred, gold)
def reset(self):
for metric in self.values():
metric.reset()
def __repr__(self) -> str:
return ' '.join(f'({k} {v})' for k, v in self.items())
def cstr(self, idx=None, level=0) -> str:
if idx is None:
idx = [0]
prefix = ''
for _, (k, v) in enumerate(self.items()):
color = self._COLORS[idx[0] % len(self._COLORS)]
idx[0] += 1
child_is_dict = isinstance(v, MetricDict)
_level = min(level, 2)
# if level != 0 and not child_is_dict:
# _level = 2
lb = '{[('
rb = '}])'
k = f'[bold][underline]{k}[/underline][/bold]'
prefix += f'[{color}]{lb[_level]}{k} [/{color}]'
if child_is_dict:
prefix += v.cstr(idx, level + 1)
else:
prefix += f'[{color}]{v}[/{color}]'
prefix += f'[{color}]{rb[_level]}[/{color}]'
return prefix
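# --- Usage sketch (added for illustration; not part of the original file) ---
# MetricDict averages the scores of its children and broadcasts calls to them.
if __name__ == '__main__':
    from hanlp.metrics.f1 import F1_
    _metrics = MetricDict(tok=F1_(.98, .97, .975), pos=F1_(.96, .96, .96))
    print(_metrics.score)  # (0.975 + 0.96) / 2 = 0.9675
    print(_metrics)  # (tok P: 98.00% R: 97.00% F1: 97.50%) (pos ...)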
================================================
FILE: hanlp/metrics/parsing/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-27 00:48
================================================
FILE: hanlp/metrics/parsing/attachmentscore.py
================================================
# MIT License
#
# Copyright (c) 2020 Yu Zhang
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
from hanlp.metrics.metric import Metric
class AttachmentScore(Metric):
def __init__(self, eps=1e-12):
super(AttachmentScore, self).__init__()
self.eps = eps
self.total = 0.0
self.correct_arcs = 0.0
self.correct_rels = 0.0
def __repr__(self):
return f"UAS: {self.uas:.2%} LAS: {self.las:.2%}"
# noinspection PyMethodOverriding
def __call__(self, arc_preds, rel_preds, arc_golds, rel_golds, mask):
arc_mask = arc_preds.eq(arc_golds)[mask]
rel_mask = rel_preds.eq(rel_golds)[mask] & arc_mask
self.total += len(arc_mask)
self.correct_arcs += arc_mask.sum().item()
self.correct_rels += rel_mask.sum().item()
def __lt__(self, other):
return self.score < other
def __le__(self, other):
return self.score <= other
def __ge__(self, other):
return self.score >= other
def __gt__(self, other):
return self.score > other
@property
def score(self):
return self.las
@property
def uas(self):
return self.correct_arcs / (self.total + self.eps)
@property
def las(self):
return self.correct_rels / (self.total + self.eps)
def reset(self):
self.total = 0.0
self.correct_arcs = 0.0
self.correct_rels = 0.0
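# --- Usage sketch (added for illustration; not part of the original file) ---
# Heads and relations are compared element-wise under a token mask: UAS counts
# correct heads, LAS additionally requires the relation label to match.
if __name__ == '__main__':
    import torch
    _metric = AttachmentScore()
    _arc_preds, _arc_golds = torch.tensor([1, 0, 2]), torch.tensor([1, 0, 3])  # last head wrong
    _rel_preds, _rel_golds = torch.tensor([5, 2, 7]), torch.tensor([5, 2, 7])
    _mask = torch.tensor([True, True, True])
    _metric(_arc_preds, _rel_preds, _arc_golds, _rel_golds, _mask)
    print(_metric)  # UAS: 66.67% LAS: 66.67%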
================================================
FILE: hanlp/metrics/parsing/conllx_eval.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-03-08 22:35
import tempfile
from hanlp.utils.io_util import get_resource, get_exitcode_stdout_stderr
CONLLX_EVAL = get_resource(
'https://github.com/elikip/bist-parser/archive/master.zip' + '#bmstparser/src/utils/eval.pl')
def evaluate(gold_file, pred_file):
"""Evaluate using official CoNLL-X evaluation script (Yuval Krymolowski)
Args:
gold_file(str): The gold conllx file
pred_file(str): The pred conllx file
    Returns:
        A tuple of (uas, las) scores.
"""
gold_file = get_resource(gold_file)
fixed_pred_file = tempfile.NamedTemporaryFile().name
copy_cols(gold_file, pred_file, fixed_pred_file, keep_comments=False)
if gold_file.endswith('.conllu'):
fixed_gold_file = tempfile.NamedTemporaryFile().name
copy_cols(gold_file, gold_file, fixed_gold_file, keep_comments=False)
gold_file = fixed_gold_file
exitcode, out, err = get_exitcode_stdout_stderr(f'perl {CONLLX_EVAL} -q -b -g {gold_file} -s {fixed_pred_file}')
if exitcode:
raise RuntimeError(f'eval.pl exited with error code {exitcode} and error message {err} and output {out}.')
lines = out.split('\n')[-4:]
las = int(lines[0].split()[3]) / int(lines[0].split()[5])
uas = int(lines[1].split()[3]) / int(lines[1].split()[5])
return uas, las
def copy_cols(gold_file, pred_file, copied_pred_file, keep_comments=True):
"""Copy the first 6 columns from gold file to pred file
Args:
        gold_file: The file providing the first 6 columns.
        pred_file: The file providing the remaining columns.
        copied_pred_file: The output path for the merged file.
        keep_comments: Whether to copy comment lines over. (Default value = True)
"""
with open(copied_pred_file, 'w') as to_out, open(pred_file) as pred_file, open(gold_file) as gold_file:
for idx, (p, g) in enumerate(zip(pred_file, gold_file)):
while p.startswith('#'):
p = next(pred_file)
if not g.strip():
if p.strip():
raise ValueError(
f'Prediction file {pred_file.name} does not end a sentence at line {idx + 1}\n{p.strip()}')
to_out.write('\n')
continue
while g.startswith('#') or '-' in g.split('\t')[0]:
if keep_comments or g.startswith('-'):
to_out.write(g)
g = next(gold_file)
to_out.write('\t'.join(str(x) for x in g.split('\t')[:6] + p.split('\t')[6:]))
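# --- CLI sketch (added for illustration; not part of the original file) ---
# usage: python conllx_eval.py <gold.conllx> <pred.conllx>
# Requires Perl on PATH; eval.pl is fetched on first use via get_resource.
if __name__ == '__main__':
    import sys
    uas, las = evaluate(sys.argv[1], sys.argv[2])
    print(f'UAS={uas:.2%} LAS={las:.2%}')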
================================================
FILE: hanlp/metrics/parsing/labeled_f1.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-27 21:42
from hanlp.metrics.metric import Metric
class LabeledF1(Metric):
def __init__(self):
super(LabeledF1, self).__init__()
self.sum_gold_arcs_wo_punc = 0.0
self.sum_pred_arcs_wo_punc = 0.0
self.correct_arcs_wo_punc = 0.0
self.correct_rels_wo_punc = 0.0
def __repr__(self):
return f"UF: {self.uf:4.2%} LF: {self.lf:4.2%}"
def __call__(self, arc_preds, rel_preds, arc_golds, rel_golds, mask):
mask_gold = mask & arc_golds
mask_pred = mask & arc_preds
correct_mask = mask_gold & mask_pred
correct_arcs_wo_punc = (arc_preds == arc_golds)[correct_mask]
correct_rels_wo_punc = (rel_preds == rel_golds)[correct_mask] & correct_arcs_wo_punc
self.sum_gold_arcs_wo_punc += float(mask_gold.sum())
self.sum_pred_arcs_wo_punc += float(mask_pred.sum())
self.correct_arcs_wo_punc += float(correct_arcs_wo_punc.sum())
self.correct_rels_wo_punc += float(correct_rels_wo_punc.sum())
def __lt__(self, other):
return self.score < other
def __le__(self, other):
return self.score <= other
def __ge__(self, other):
return self.score >= other
def __gt__(self, other):
return self.score > other
@property
def score(self):
return self.las
@property
def uas(self):
return self.uf
@property
def las(self):
return self.lf
@property
def ur(self):
if not self.sum_gold_arcs_wo_punc:
return .0
return self.correct_arcs_wo_punc / self.sum_gold_arcs_wo_punc
@property
def up(self):
if not self.sum_pred_arcs_wo_punc:
return .0
return self.correct_arcs_wo_punc / self.sum_pred_arcs_wo_punc
@property
def lr(self):
if not self.sum_gold_arcs_wo_punc:
return .0
return self.correct_rels_wo_punc / self.sum_gold_arcs_wo_punc
@property
def lp(self):
if not self.sum_pred_arcs_wo_punc:
return .0
return self.correct_rels_wo_punc / self.sum_pred_arcs_wo_punc
@property
def uf(self):
rp = self.ur + self.up
if not rp:
return .0
return 2 * self.ur * self.up / rp
@property
def lf(self):
rp = self.lr + self.lp
if not rp:
return .0
return 2 * self.lr * self.lp / rp
def reset(self):
self.sum_gold_arcs_wo_punc = 0.0
self.sum_pred_arcs_wo_punc = 0.0
self.correct_arcs_wo_punc = 0.0
self.correct_rels_wo_punc = 0.0
def to_dict(self) -> dict:
return {'UF': self.uf, 'LF': self.lf}
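# --- Usage sketch (added for illustration; not part of the original file) ---
# Inputs are boolean adjacency matrices (arcs) and label matrices (rels); the
# mask excludes padding and punctuation positions.
if __name__ == '__main__':
    import torch
    _metric = LabeledF1()
    _arc_golds = torch.tensor([[False, True], [True, False]])
    _arc_preds = torch.tensor([[False, True], [False, False]])  # finds 1 of 2 gold arcs
    _rel_golds = torch.tensor([[0, 3], [2, 0]])
    _rel_preds = torch.tensor([[0, 3], [0, 0]])
    _metric(_arc_preds, _rel_preds, _arc_golds, _rel_golds, torch.ones_like(_arc_golds))
    print(_metric)  # UF: 66.67% LF: 66.67% (P = 1/1, R = 1/2)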
================================================
FILE: hanlp/metrics/parsing/labeled_f1_tf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-27 21:42
import tensorflow as tf
class LabeledF1TF(object):
def __init__(self):
super(LabeledF1TF, self).__init__()
self.sum_gold_arcs_wo_punc = 0.0
self.sum_pred_arcs_wo_punc = 0.0
self.correct_arcs_wo_punc = 0.0
self.correct_rels_wo_punc = 0.0
def __repr__(self):
return f"UF: {self.uf:6.2%} LF: {self.lf:6.2%}"
def __call__(self, arc_preds, rel_preds, arc_golds, rel_golds, mask):
        # Broadcast the token mask to an adjacency-shaped mask. The original used
        # torch-style unsqueeze/expand_as/transpose on TF tensors; these are the
        # TF equivalents.
        mask = tf.broadcast_to(tf.expand_dims(mask, -1), tf.shape(arc_preds))
        mask = mask & tf.transpose(mask, perm=[0, 2, 1])
mask_gold = mask & arc_golds
mask_pred = mask & arc_preds
correct_arcs_wo_punc = (arc_preds == arc_golds)[mask_gold & mask_pred]
correct_rels_wo_punc = (rel_preds == rel_golds)[mask_gold & mask_pred] & correct_arcs_wo_punc
self.sum_gold_arcs_wo_punc += float(tf.math.count_nonzero(mask_gold))
self.sum_pred_arcs_wo_punc += float(tf.math.count_nonzero(mask_pred))
self.correct_arcs_wo_punc += float(tf.math.count_nonzero(correct_arcs_wo_punc))
self.correct_rels_wo_punc += float(tf.math.count_nonzero(correct_rels_wo_punc))
def __lt__(self, other):
return self.score < other
def __le__(self, other):
return self.score <= other
def __ge__(self, other):
return self.score >= other
def __gt__(self, other):
return self.score > other
@property
def score(self):
return self.las
@property
def uas(self):
return self.uf
@property
def las(self):
return self.lf
@property
def ur(self):
if not self.sum_gold_arcs_wo_punc:
return 0
return self.correct_arcs_wo_punc / self.sum_gold_arcs_wo_punc
@property
def up(self):
if not self.sum_pred_arcs_wo_punc:
return 0
return self.correct_arcs_wo_punc / self.sum_pred_arcs_wo_punc
@property
def lr(self):
if not self.sum_gold_arcs_wo_punc:
return 0
return self.correct_rels_wo_punc / self.sum_gold_arcs_wo_punc
@property
def lp(self):
if not self.sum_pred_arcs_wo_punc:
return 0
return self.correct_rels_wo_punc / self.sum_pred_arcs_wo_punc
@property
def uf(self):
rp = self.ur + self.up
if not rp:
return 0
return 2 * self.ur * self.up / rp
@property
def lf(self):
rp = self.lr + self.lp
if not rp:
return 0
return 2 * self.lr * self.lp / rp
def reset_states(self):
self.sum_gold_arcs_wo_punc = 0.0
self.sum_pred_arcs_wo_punc = 0.0
self.correct_arcs_wo_punc = 0.0
self.correct_rels_wo_punc = 0.0
def to_dict(self) -> dict:
return {'UF': self.uf, 'LF': self.lf}
================================================
FILE: hanlp/metrics/parsing/labeled_score.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-27 00:49
import tensorflow as tf
class LabeledScore(object):
def __init__(self, eps=1e-5):
super(LabeledScore, self).__init__()
self.eps = eps
self.total = 0.0
self.correct_arcs = 0.0
self.correct_rels = 0.0
def __repr__(self):
return f"UAS: {self.uas:6.2%} LAS: {self.las:6.2%}"
def __call__(self, arc_preds, rel_preds, arc_golds, rel_golds, mask):
arc_mask = (arc_preds == arc_golds)[mask]
rel_mask = (rel_preds == rel_golds)[mask] & arc_mask
self.total += len(arc_mask)
self.correct_arcs += int(tf.math.count_nonzero(arc_mask))
self.correct_rels += int(tf.math.count_nonzero(rel_mask))
def __lt__(self, other):
return self.score < other
def __le__(self, other):
return self.score <= other
def __ge__(self, other):
return self.score >= other
def __gt__(self, other):
return self.score > other
@property
def score(self):
return self.las
@property
def uas(self):
return self.correct_arcs / (self.total + self.eps)
@property
def las(self):
return self.correct_rels / (self.total + self.eps)
def reset_states(self):
self.total = 0.0
self.correct_arcs = 0.0
self.correct_rels = 0.0
def to_dict(self) -> dict:
return {'UAS': self.uas, 'LAS': self.las}
================================================
FILE: hanlp/metrics/parsing/semdep_eval.py
================================================
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright 2017 Timothy Dozat
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import codecs
import sys
from collections import namedtuple
# ===============================================================
def sdp_eval(gold_files, sys_files, labeled=False):
"""Modified from https://github.com/tdozat/Parser-v3/blob/2ff4061373e8aac8c962537a6220e1d5b196abf6/scripts/semdep_eval.py
Dozat claimed "I tested it against the official eval script and it reported identical LF1".
Args:
        gold_files: Gold file path(s).
        sys_files: System output file path(s).
        labeled: Whether edge labels must match too. (Default value = False)
    Returns:
        An ``Accuracy`` namedtuple of (precision, recall, F1, seq_acc).
"""
correct = 0
predicted = 0
actual = 0
n_tokens = 0
n_sequences = 0
current_seq_correct = False
n_correct_sequences = 0
current_sent = 0
if isinstance(gold_files, str):
gold_files = [gold_files]
if isinstance(sys_files, str):
sys_files = [sys_files]
for gold_file, sys_file in zip(gold_files, sys_files):
with codecs.open(gold_file, encoding='utf-8') as gf, \
codecs.open(sys_file, encoding='utf-8') as sf:
gold_line = gf.readline()
gold_i = 1
sys_i = 0
while gold_line:
while gold_line.startswith('#'):
current_sent += 1
gold_i += 1
n_sequences += 1
n_correct_sequences += current_seq_correct
current_seq_correct = True
gold_line = gf.readline()
if gold_line.rstrip() != '':
sys_line = sf.readline()
sys_i += 1
while sys_line.startswith('#') or sys_line.rstrip() == '' or sys_line.split('\t')[0] == '0':
sys_line = sf.readline()
sys_i += 1
gold_line = gold_line.rstrip().split('\t')
sys_line = sys_line.rstrip().split('\t')
# assert sys_line[1] == gold_line[1], 'Files are misaligned at lines {}, {}'.format(gold_i, sys_i)
# Compute the gold edges
gold_node = gold_line[8]
if gold_node != '_':
gold_node = gold_node.split('|')
if labeled:
gold_edges = set(tuple(gold_edge.split(':', 1)) for gold_edge in gold_node)
else:
gold_edges = set(gold_edge.split(':', 1)[0] for gold_edge in gold_node)
else:
gold_edges = set()
# Compute the sys edges
sys_node = sys_line[8]
if sys_node != '_':
sys_node = sys_node.split('|')
if labeled:
sys_edges = set(tuple(sys_edge.split(':', 1)) for sys_edge in sys_node)
else:
sys_edges = set(sys_edge.split(':', 1)[0] for sys_edge in sys_node)
else:
sys_edges = set()
correct_edges = gold_edges & sys_edges
if len(correct_edges) != len(gold_edges):
current_seq_correct = False
correct += len(correct_edges)
predicted += len(sys_edges)
actual += len(gold_edges)
n_tokens += 1
# current_fp += len(sys_edges) - len(gold_edges & sys_edges)
gold_line = gf.readline()
gold_i += 1
# print(correct, predicted - correct, actual - correct)
Accuracy = namedtuple('Accuracy', ['precision', 'recall', 'F1', 'seq_acc'])
precision = correct / (predicted + 1e-12)
recall = correct / (actual + 1e-12)
F1 = 2 * precision * recall / (precision + recall + 1e-12)
seq_acc = n_correct_sequences / n_sequences
return Accuracy(precision, recall, F1, seq_acc)
# ===============================================================
def main():
""" """
files = sys.argv[1:]
n_files = len(files)
assert (n_files % 2) == 0
gold_files, sys_files = files[:n_files // 2], files[n_files // 2:]
UAS = sdp_eval(gold_files, sys_files, labeled=False)
LAS = sdp_eval(gold_files, sys_files, labeled=True)
# print(UAS.F1, UAS.seq_acc)
print('UAS={:0.1f}'.format(UAS.F1 * 100))
print('LAS={:0.1f}'.format(LAS.F1 * 100))
if __name__ == '__main__':
main()
================================================
FILE: hanlp/metrics/parsing/span.py
================================================
# MIT License
#
# Copyright (c) 2020 Yu Zhang
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
from collections import Counter
from hanlp.metrics.metric import Metric
class SpanMetric(Metric):
def __init__(self, eps=1e-12):
super().__init__()
self.reset(eps)
# noinspection PyAttributeOutsideInit
def reset(self, eps=1e-12):
self.n = 0.0
self.n_ucm = 0.0
self.n_lcm = 0.0
self.utp = 0.0
self.ltp = 0.0
self.pred = 0.0
self.gold = 0.0
self.eps = eps
def __call__(self, preds, golds):
for pred, gold in zip(preds, golds):
upred = Counter([(i, j) for i, j, label in pred])
ugold = Counter([(i, j) for i, j, label in gold])
utp = list((upred & ugold).elements())
lpred = Counter(pred)
lgold = Counter(gold)
ltp = list((lpred & lgold).elements())
self.n += 1
self.n_ucm += len(utp) == len(pred) == len(gold)
self.n_lcm += len(ltp) == len(pred) == len(gold)
self.utp += len(utp)
self.ltp += len(ltp)
self.pred += len(pred)
self.gold += len(gold)
return self
def __repr__(self):
s = f"UCM: {self.ucm:.2%} LCM: {self.lcm:.2%} "
s += f"UP: {self.up:.2%} UR: {self.ur:.2%} UF: {self.uf:.2%} "
s += f"LP: {self.lp:.2%} LR: {self.lr:.2%} LF: {self.lf:.2%}"
return s
@property
def score(self):
return self.lf
@property
def ucm(self):
return self.n_ucm / (self.n + self.eps)
@property
def lcm(self):
return self.n_lcm / (self.n + self.eps)
@property
def up(self):
return self.utp / (self.pred + self.eps)
@property
def ur(self):
return self.utp / (self.gold + self.eps)
@property
def uf(self):
return 2 * self.utp / (self.pred + self.gold + self.eps)
@property
def lp(self):
return self.ltp / (self.pred + self.eps)
@property
def lr(self):
return self.ltp / (self.gold + self.eps)
@property
def lf(self):
return 2 * self.ltp / (self.pred + self.gold + self.eps)
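# --- Usage sketch (added for illustration; not part of the original file) ---
# Each sentence is a list of (start, end, label) spans; unlabeled scores ignore
# the label while labeled scores require an exact match.
if __name__ == '__main__':
    _metric = SpanMetric()
    _gold = [[(0, 2, 'NP'), (2, 5, 'VP')]]
    _pred = [[(0, 2, 'NP'), (2, 5, 'PP')]]  # one span mislabeled
    print(_metric(_pred, _gold))  # UF: 100.00% ... LF: 50.00%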
================================================
FILE: hanlp/metrics/spearman_correlation.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-05-23 16:12
import torch
from hanlp.metrics.metric import Metric
def _get_ranks(x: torch.Tensor) -> torch.Tensor:
argsort = x.argsort()
ranks = torch.zeros_like(argsort, device=x.device)
ranks[argsort] = torch.arange(len(x), device=x.device)
return ranks
def spearman_correlation(x: torch.Tensor, y: torch.Tensor):
"""Compute correlation between 2 1-D vectors. Adopted from
https://discuss.pytorch.org/t/spearmans-correlation/91931/5
Args:
x: Shape (N, )
y: Shape (N, )
"""
x_rank = _get_ranks(x)
y_rank = _get_ranks(y)
n = x.size(0)
upper = 6 * torch.sum((x_rank - y_rank).pow(2))
down = n * (n ** 2 - 1.0)
return 1.0 - (upper / down)
class SpearmanCorrelation(Metric):
"""
This `Metric` calculates the sample Spearman correlation coefficient (r)
between two tensors. Each element in the two tensors is assumed to be
a different observation of the variable (i.e., the input tensors are
implicitly flattened into vectors and the correlation is calculated
between the vectors).
"""
@property
def score(self):
return spearman_correlation(self.total_predictions, self.total_gold_labels).item()
def __init__(self) -> None:
super().__init__()
self.total_predictions = torch.zeros(0)
self.total_gold_labels = torch.zeros(0)
def __call__(
self,
predictions: torch.Tensor,
gold_labels: torch.Tensor,
mask=None
):
"""
# Parameters
predictions : `torch.Tensor`, required.
A tensor of predictions of shape (batch_size, ...).
gold_labels : `torch.Tensor`, required.
A tensor of the same shape as `predictions`.
"""
if mask is not None:
            raise NotImplementedError('mask not supported in SpearmanCorrelation for now.')
# Flatten predictions, gold_labels, and mask. We calculate the Spearman correlation between
# the vectors, since each element in the predictions and gold_labels tensor is assumed
# to be a separate observation.
predictions = predictions.reshape(-1)
gold_labels = gold_labels.reshape(-1)
self.total_predictions = self.total_predictions.to(predictions.device)
self.total_gold_labels = self.total_gold_labels.to(gold_labels.device)
self.total_predictions = torch.cat((self.total_predictions, predictions), 0)
self.total_gold_labels = torch.cat((self.total_gold_labels, gold_labels), 0)
def reset(self):
self.total_predictions = torch.zeros(0)
self.total_gold_labels = torch.zeros(0)
def __str__(self) -> str:
return f'spearman: {self.score * 100:.2f}'
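# --- Usage sketch (added for illustration; not part of the original file) ---
if __name__ == '__main__':
    _x = torch.tensor([1., 2., 3., 4.])
    _y = torch.tensor([2., 1., 4., 3.])  # two adjacent swaps
    print(spearman_correlation(_x, _y))  # tensor(0.6000)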
================================================
FILE: hanlp/metrics/srl/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-07-16 18:44
================================================
FILE: hanlp/metrics/srl/srlconll.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-07-16 18:44
import os
from hanlp.utils.io_util import get_resource, get_exitcode_stdout_stderr, run_cmd
def official_conll_05_evaluate(pred_path, gold_path):
script_root = get_resource('http://www.lsi.upc.edu/~srlconll/srlconll-1.1.tgz')
lib_path = f'{script_root}/lib'
if lib_path not in os.environ.get("PERL5LIB", ""):
os.environ['PERL5LIB'] = f'{lib_path}:{os.environ.get("PERL5LIB", "")}'
bin_path = f'{script_root}/bin'
if bin_path not in os.environ.get('PATH', ''):
os.environ['PATH'] = f'{bin_path}:{os.environ.get("PATH", "")}'
eval_info_gold_pred = run_cmd(f'perl {script_root}/bin/srl-eval.pl {gold_path} {pred_path}')
eval_info_pred_gold = run_cmd(f'perl {script_root}/bin/srl-eval.pl {pred_path} {gold_path}')
conll_recall = float(eval_info_gold_pred.strip().split("\n")[6].strip().split()[5]) / 100
conll_precision = float(eval_info_pred_gold.strip().split("\n")[6].strip().split()[5]) / 100
if conll_recall + conll_precision > 0:
conll_f1 = 2 * conll_recall * conll_precision / (conll_recall + conll_precision)
else:
conll_f1 = 0
return conll_precision, conll_recall, conll_f1
def run_perl(script, src, dst=None):
    os.environ['PERL5LIB'] = ''
exitcode, out, err = get_exitcode_stdout_stderr(
f'perl -I{os.path.expanduser("~/.local/lib/perl5")} {script} {src}')
if exitcode:
# cpanm -l ~/.local namespace::autoclean
# cpanm -l ~/.local Moose
# cpanm -l ~/.local MooseX::SemiAffordanceAccessor module
raise RuntimeError(err)
with open(dst, 'w') as ofile:
ofile.write(out)
return dst
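# --- CLI sketch (added for illustration; not part of the original file) ---
# usage: python srlconll.py <pred_path> <gold_path>
# Requires Perl; srlconll-1.1 is fetched on first use via get_resource.
if __name__ == '__main__':
    import sys
    p, r, f = official_conll_05_evaluate(sys.argv[1], sys.argv[2])
    print(f'P={p:.2%} R={r:.2%} F1={f:.2%}')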
================================================
FILE: hanlp/optimizers/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-11-11 18:44
================================================
FILE: hanlp/optimizers/adamw/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-11-11 18:44
import tensorflow as tf
from hanlp.optimizers.adamw.optimization import WarmUp, AdamWeightDecay
# from hanlp.optimization.adamw.optimizers_v2 import AdamW
# from hanlp.optimization.adamw.utils import get_weight_decays
# def create_optimizer(model, init_lr, num_train_steps, num_warmup_steps):
# """Creates an optimizer with learning rate schedule."""
# wd_dict = get_weight_decays(model)
#
# # Implements linear decay of the learning rate.
# learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
# initial_learning_rate=init_lr,
# decay_steps=num_train_steps,
# end_learning_rate=0.0)
# if num_warmup_steps:
# learning_rate_fn = WarmUp(initial_learning_rate=init_lr,
# decay_schedule_fn=learning_rate_fn,
# warmup_steps=num_warmup_steps)
# optimizer = AdamW(
# learning_rate=learning_rate_fn,
# weight_decay_rate=0.01,
# beta_1=0.9,
# beta_2=0.999,
# epsilon=1e-6,
# exclude_from_weight_decay=['layer_norm', 'bias'])
# return optimizer
def create_optimizer(init_lr, num_train_steps, num_warmup_steps, weight_decay_rate=0.01, epsilon=1e-6, clipnorm=None):
"""Creates an optimizer with learning rate schedule.
Args:
        init_lr: Peak learning rate reached at the end of warmup.
        num_train_steps: Total number of training steps the decay spans.
        num_warmup_steps: Number of linear warmup steps.
        weight_decay_rate: (Default value = 0.01)
        epsilon: (Default value = 1e-6)
        clipnorm: (Default value = None)
    Returns:
        An ``AdamWeightDecay`` optimizer driven by the warmup + decay schedule.
"""
# Implements linear decay of the learning rate.
learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
initial_learning_rate=init_lr,
decay_steps=num_train_steps,
end_learning_rate=0.0)
if num_warmup_steps:
learning_rate_fn = WarmUp(initial_learning_rate=init_lr,
decay_schedule_fn=learning_rate_fn,
warmup_steps=num_warmup_steps)
additional_args = {}
if clipnorm:
additional_args['clipnorm'] = clipnorm
optimizer = AdamWeightDecay(
learning_rate=learning_rate_fn,
weight_decay_rate=weight_decay_rate,
beta_1=0.9,
beta_2=0.999,
epsilon=epsilon,
exclude_from_weight_decay=['LayerNorm', 'bias'],
**additional_args
)
# {'LayerNorm/gamma:0', 'LayerNorm/beta:0'}
return optimizer
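# --- Usage sketch (added for illustration; not part of the original file) ---
# A typical fine-tuning setup: 10% linear warmup followed by linear decay to 0.
if __name__ == '__main__':
    _optimizer = create_optimizer(init_lr=5e-5, num_train_steps=1000, num_warmup_steps=100)
    print(_optimizer.get_config()['weight_decay_rate'])  # 0.01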
================================================
FILE: hanlp/optimizers/adamw/optimization.py
================================================
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Functions and classes related to optimization (weight updates)."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import re
import tensorflow as tf
class WarmUp(tf.keras.optimizers.schedules.LearningRateSchedule):
"""Applys a warmup schedule on a given learning rate decay schedule."""
def __init__(
self,
initial_learning_rate,
decay_schedule_fn,
warmup_steps,
power=1.0,
name=None):
super(WarmUp, self).__init__()
self.initial_learning_rate = initial_learning_rate
self.warmup_steps = warmup_steps
self.power = power
self.decay_schedule_fn = decay_schedule_fn
self.name = name
def __call__(self, step):
with tf.name_scope(self.name or 'WarmUp') as name:
# Implements polynomial warmup. i.e., if global_step < warmup_steps, the
# learning rate will be `global_step/num_warmup_steps * init_lr`.
global_step_float = tf.cast(step, tf.float32)
warmup_steps_float = tf.cast(self.warmup_steps, tf.float32)
warmup_percent_done = global_step_float / warmup_steps_float
warmup_learning_rate = (
self.initial_learning_rate *
tf.math.pow(warmup_percent_done, self.power))
return tf.cond(global_step_float < warmup_steps_float,
lambda: warmup_learning_rate,
lambda: self.decay_schedule_fn(step),
name=name)
def get_config(self):
return {
'initial_learning_rate': self.initial_learning_rate,
'decay_schedule_fn': self.decay_schedule_fn,
'warmup_steps': self.warmup_steps,
'power': self.power,
'name': self.name
}
def create_optimizer(init_lr, num_train_steps, num_warmup_steps):
"""Creates an optimizer with learning rate schedule.
Args:
        init_lr: Peak learning rate reached at the end of warmup.
        num_train_steps: Total number of training steps the decay spans.
        num_warmup_steps: Number of linear warmup steps.
    Returns:
        An ``AdamWeightDecay`` optimizer driven by the warmup + decay schedule.
"""
# Implements linear decay of the learning rate.
learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
initial_learning_rate=init_lr,
decay_steps=num_train_steps,
end_learning_rate=0.0)
if num_warmup_steps:
learning_rate_fn = WarmUp(initial_learning_rate=init_lr,
decay_schedule_fn=learning_rate_fn,
warmup_steps=num_warmup_steps)
optimizer = AdamWeightDecay(
learning_rate=learning_rate_fn,
weight_decay_rate=0.01,
beta_1=0.9,
beta_2=0.999,
epsilon=1e-6,
exclude_from_weight_decay=['layer_norm', 'bias'])
return optimizer
try:
AdamTF = tf.keras.optimizers.legacy.Adam # avoid slowdown when using v2.11+ Keras optimizers on M1/M2 Macs
except AttributeError:  # the legacy namespace does not exist before TF 2.11
AdamTF = tf.keras.optimizers.Adam
class AdamWeightDecay(AdamTF):
"""Adam enables L2 weight decay and clip_by_global_norm on gradients.
Just adding the square of the weights to the loss function is *not* the
correct way of using L2 regularization/weight decay with Adam, since that will
interact with the m and v parameters in strange ways.
Instead we want to decay the weights in a manner that doesn't interact with
the m/v parameters. This is equivalent to adding the square of the weights to
the loss with plain (non-momentum) SGD.
Args:
Returns:
"""
def __init__(self,
learning_rate=0.001,
beta_1=0.9,
beta_2=0.999,
epsilon=1e-7,
amsgrad=False,
weight_decay_rate=0.0,
include_in_weight_decay=None,
exclude_from_weight_decay=None,
name='AdamWeightDecay',
**kwargs):
super(AdamWeightDecay, self).__init__(
learning_rate, beta_1, beta_2, epsilon, amsgrad, name, **kwargs)
self.weight_decay_rate = weight_decay_rate
self._include_in_weight_decay = include_in_weight_decay
self._exclude_from_weight_decay = exclude_from_weight_decay
@classmethod
def from_config(cls, config):
"""Creates an optimizer from its config with WarmUp custom object.
Args:
            config: An optimizer config dict as produced by ``get_config``.
        Returns:
            An ``AdamWeightDecay`` instance.
"""
custom_objects = {'WarmUp': WarmUp}
return super(AdamWeightDecay, cls).from_config(
config, custom_objects=custom_objects)
def _prepare_local(self, var_device, var_dtype, apply_state):
super(AdamWeightDecay, self)._prepare_local(var_device, var_dtype,
apply_state)
apply_state['weight_decay_rate'] = tf.constant(
self.weight_decay_rate, name='adam_weight_decay_rate')
def _decay_weights_op(self, var, learning_rate, apply_state):
do_decay = self._do_use_weight_decay(var.name)
if do_decay:
return var.assign_sub(
learning_rate * var *
apply_state['weight_decay_rate'],
use_locking=self._use_locking)
return tf.no_op()
def apply_gradients(self, grads_and_vars, name=None):
grads, tvars = list(zip(*grads_and_vars))
(grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
return super(AdamWeightDecay, self).apply_gradients(zip(grads, tvars))
def _get_lr(self, var_device, var_dtype, apply_state):
"""Retrieves the learning rate with the given state.
Args:
            var_device: Device of the variable being updated.
            var_dtype: Dtype of the variable being updated.
            apply_state: Per-(device, dtype) coefficient cache.
        Returns:
            The learning rate tensor and kwargs forwarded to the parent apply op.
"""
if apply_state is None:
return self._decayed_lr_t[var_dtype], {}
apply_state = apply_state or {}
coefficients = apply_state.get((var_device, var_dtype))
if coefficients is None:
coefficients = self._fallback_apply_state(var_device, var_dtype)
apply_state[(var_device, var_dtype)] = coefficients
return coefficients['lr_t'], dict(apply_state=apply_state)
def _resource_apply_dense(self, grad, var, apply_state=None):
lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
decay = self._decay_weights_op(var, lr_t, apply_state)
with tf.control_dependencies([decay]):
return super(AdamWeightDecay, self)._resource_apply_dense(
grad, var, **kwargs)
def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
decay = self._decay_weights_op(var, lr_t, apply_state)
with tf.control_dependencies([decay]):
return super(AdamWeightDecay, self)._resource_apply_sparse(
grad, var, indices, **kwargs)
def get_config(self):
config = super(AdamWeightDecay, self).get_config()
config.update({
'weight_decay_rate': self.weight_decay_rate,
})
return config
def _do_use_weight_decay(self, param_name):
"""Whether to use L2 weight decay for `param_name`.
Args:
            param_name: Name of the variable.
        Returns:
            True if weight decay should be applied to `param_name`.
"""
if self.weight_decay_rate == 0:
return False
if self._include_in_weight_decay:
for r in self._include_in_weight_decay:
if re.search(r, param_name) is not None:
return True
if self._exclude_from_weight_decay:
for r in self._exclude_from_weight_decay:
if re.search(r, param_name) is not None:
return False
return True
def apply_gradients(self, grads_and_vars, name=None, **kwargs):
grads, tvars = list(zip(*grads_and_vars))
return super(AdamWeightDecay, self).apply_gradients(zip(grads, tvars), name=name, **kwargs)
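# --- Usage sketch (added for illustration; not part of the original file;
# it pokes the private helper purely to demonstrate the exclude patterns) ---
if __name__ == '__main__':
    _opt = AdamWeightDecay(weight_decay_rate=0.01,
                           exclude_from_weight_decay=['LayerNorm', 'bias'])
    print(_opt._do_use_weight_decay('encoder/dense/kernel:0'))     # True
    print(_opt._do_use_weight_decay('encoder/LayerNorm/gamma:0'))  # False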
================================================
FILE: hanlp/pretrained/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 19:10
from hanlp.pretrained import tok
from hanlp.pretrained import dep
from hanlp.pretrained import sdp
from hanlp.pretrained import glove
from hanlp.pretrained import pos
from hanlp.pretrained import rnnlm
from hanlp.pretrained import word2vec
from hanlp.pretrained import ner
from hanlp.pretrained import classifiers
from hanlp.pretrained import fasttext
from hanlp.pretrained import mtl
from hanlp.pretrained import eos
from hanlp.pretrained import sts
from hanlp.pretrained import constituency
from hanlp.pretrained import amr
from hanlp.pretrained import amr2text
from hanlp.pretrained import srl
# Will be filled up during runtime
ALL = {}
================================================
FILE: hanlp/pretrained/amr.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2022-01-25 11:47
from hanlp_common.constant import HANLP_URL
AMR3_SEQ2SEQ_BART_LARGE = HANLP_URL + 'amr/amr3_seq2seq_bart_large_83.30_20220125_114450.zip'
'''A seq2seq (:cite:`bevilacqua-etal-2021-one`) BART (:cite:`lewis-etal-2020-bart`) large parser trained on Abstract
Meaning Representation 3.0 (:cite:`knight2014abstract`). Its performance is
=================== ========= ========= =========
Metric P R F1
=================== ========= ========= =========
Smatch 84.00 82.60 83.30
Unlabeled 86.40 84.90 85.70
No WSD 84.50 83.10 83.80
Non_sense_frames 91.90 91.30 91.60
Wikification 81.70 80.80 81.20
Named Ent. 89.20 87.00 88.10
Negations 71.70 70.90 71.30
IgnoreVars 73.80 73.10 73.50
Concepts 90.70 89.60 90.10
Frames 88.50 87.90 88.20
Reentrancies 70.40 71.80 71.10
SRL 79.00 79.60 79.30
=================== ========= ========= =========
Note this parser does NOT perform wikification.
'''
AMR3_GRAPH_PRETRAIN_PARSER = HANLP_URL + 'amr/amr3_graph_pretrain_parser_20221207_153759.zip'
'''A seq2seq (:cite:`bevilacqua-etal-2021-one`) BART (:cite:`lewis-etal-2020-bart`) large parser trained on Abstract
Meaning Representation 3.0 (:cite:`knight2014abstract`) with graph pre-training (:cite:`bai-etal-2022-graph`).
Its Smatch F1 is ``84.3`` according to their official repository. Measured with ``amr-evaluation-enhanced``, the scores are
slightly lower:
=================== ========= ========= =========
Metric P R F1
=================== ========= ========= =========
Smatch 84.4 83.6 84.0
Unlabeled 86.7 85.8 86.2
No WSD 84.9 84.1 84.5
Non_sense_frames 91.8 91.6 91.7
Wikification 83.6 81.7 82.6
Named Ent. 89.3 87.4 88.4
Negations 71.6 72.2 71.9
IgnoreVars 74.6 74.2 74.4
Concepts 90.7 90.0 90.3
Frames 88.8 88.5 88.7
Reentrancies 72.1 72.9 72.5
SRL 80.1 80.7 80.4
=================== ========= ========= =========
Note this parser does NOT perform wikification.
'''
MRP2020_AMR_ENG_ZHO_XLM_BASE = 'http://download.hanlp.com/amr/extra/amr-eng-zho-xlm-roberta-base_20220412_223756.zip'
'''A wrapper for the Permutation-invariant Semantic Parser (:cite:`samuel-straka-2020-ufal`) trained on MRP2020 English
and Chinese AMR corpora. It was ranked first in the MRP2020 competition; this release is a base version.
See the original paper for the detailed performance. Note this model requires tokens and lemmas (for English) to be
provided as inputs.
'''
MRP2020_AMR_ZHO_MENGZI_BASE = 'http://download.hanlp.com/amr/extra/amr-zho-mengzi-base_20220415_101941.zip'
'''A Chinese Permutation-invariant Semantic Parser (:cite:`samuel-straka-2020-ufal`) trained on MRP2020
Chinese AMR corpus using Mengzi BERT base (:cite:`zhang2021mengzi`). Its performance on dev set is
``{amr-zho [tops F1: 85.43%][anchors F1: 93.41%][labels F1: 87.68%][properties F1: 82.02%][edges F1: 73.17%]
[attributes F1: 0.00%][all F1: 84.11%]}``. Test set performance is unknown since the test set is not released to the
public.
'''
# Will be filled up during runtime
ALL = {}
================================================
FILE: hanlp/pretrained/amr2text.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2022-12-07 15:19
from hanlp_common.constant import HANLP_URL
AMR3_GRAPH_PRETRAIN_GENERATION = HANLP_URL + 'amr2text/amr3_graph_pretrain_generation_20221207_153535.zip'
'''A seq2seq (:cite:`bevilacqua-etal-2021-one`) BART (:cite:`lewis-etal-2020-bart`) large AMR2Text generator trained on
Abstract Meaning Representation 3.0 (:cite:`knight2014abstract`) with graph pre-training (:cite:`bai-etal-2022-graph`).
Its Sacre-BLEU is ``50.38`` according to their official repository.
'''
# Will be filled up during runtime
ALL = {}
================================================
FILE: hanlp/pretrained/classifiers.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-01-01 03:51
from hanlp_common.constant import HANLP_URL
CHNSENTICORP_BERT_BASE_ZH = HANLP_URL + 'classification/chnsenticorp_bert_base_20211228_163210.zip'
SST2_ALBERT_BASE_EN = HANLP_URL + 'classification/sst2_albert_base_20211228_164917.zip'
LID_176_FASTTEXT_BASE = 'https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin'
'''
126MB FastText model for language identification trained on data from Wikipedia, Tatoeba and SETimes.
'''
LID_176_FASTTEXT_SMALL = 'https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz'
'''
917kB FastText model for language identification trained on data from Wikipedia, Tatoeba and SETimes.
'''
ALL = {}
================================================
FILE: hanlp/pretrained/constituency.py
================================================
# -*- coding:utf-8 -*-
# Author=hankcs
# Date=2022-01-18 10:34
from hanlp_common.constant import HANLP_URL
CTB9_CON_ELECTRA_SMALL = HANLP_URL + 'constituency/ctb9_con_electra_small_20220215_230116.zip'
'Electra (:cite:`clark2020electra`) small tree CRF model (:cite:`ijcai2020-560`) trained on CTB9 with major categories. ' \
'Its performance is UCM=39.06% LCM=34.99% UP=90.05% UR=90.01% UF=90.03% LP=87.02% LR=86.98% LF=87.00%.'
CTB9_CON_FULL_TAG_ELECTRA_SMALL = HANLP_URL + 'constituency/ctb9_full_tag_con_electra_small_20220118_103119.zip'
'Electra (:cite:`clark2020electra`) small tree CRF model (:cite:`ijcai2020-560`) trained on CTB9 with full subcategories. ' \
'Its performance is UCM=38.29% LCM=28.95% UP=90.16% UR=90.13% UF=90.15% LP=83.46% LR=83.43% LF=83.45%.'
CTB9_CON_FULL_TAG_ERNIE_GRAM = 'http://download.hanlp.com/constituency/extra/ctb9_full_tag_con_ernie_20220331_121430.zip'
'ERNIE-GRAM (:cite:`xiao-etal-2021-ernie`) base tree CRF model (:cite:`ijcai2020-560`) trained on CTB9 with full subcategories. ' \
'Its performance is UCM=42.04% LCM=31.72% UP=91.33% UR=91.53% UF=91.43% LP=85.31% LR=85.49% LF=85.40%.'
# Will be filled up during runtime
ALL = {}
================================================
FILE: hanlp/pretrained/dep.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-29 02:55
from hanlp_common.constant import HANLP_URL
CTB5_BIAFFINE_DEP_ZH = HANLP_URL + 'dep/biaffine_ctb5_20191229_025833.zip'
'Biaffine LSTM model (:cite:`dozat:17a`) trained on CTB5.'
CTB7_BIAFFINE_DEP_ZH = HANLP_URL + 'dep/biaffine_ctb7_20200109_022431.zip'
'Biaffine LSTM model (:cite:`dozat:17a`) trained on CTB7.'
CTB9_DEP_ELECTRA_SMALL = HANLP_URL + 'dep/ctb9_dep_electra_small_20220216_100306.zip'
'Electra small encoder (:cite:`clark2020electra`) with Biaffine decoder (:cite:`dozat:17a`) trained on CTB9-SD330. ' \
'Performance is UAS=87.68% LAS=83.54%.'
PMT1_DEP_ELECTRA_SMALL = HANLP_URL + 'dep/pmt_dep_electra_small_20220218_134518.zip'
'Electra small encoder (:cite:`clark2020electra`) with Biaffine decoder (:cite:`dozat:17a`) trained on PKU ' \
'Multi-view Chinese Treebank (PMT) 1.0 (:cite:`qiu-etal-2014-multi`). Performance is UAS=91.21% LAS=88.65%.'
CTB9_UDC_ELECTRA_SMALL = HANLP_URL + 'dep/udc_dep_electra_small_20220218_095452.zip'
'Electra small encoder (:cite:`clark2020electra`) with Biaffine decoder (:cite:`dozat:17a`) trained on CTB9-UD420. ' \
'Performance is UAS=85.92% LAS=81.13%.'
PTB_BIAFFINE_DEP_EN = HANLP_URL + 'dep/ptb_dep_biaffine_20200101_174624.zip'
'Biaffine LSTM model (:cite:`dozat:17a`) trained on PTB.'
ALL = {}
================================================
FILE: hanlp/pretrained/eos.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-22 13:22
from hanlp_common.constant import HANLP_URL
UD_CTB_EOS_MUL = HANLP_URL + 'eos/eos_ud_ctb_mul_20201222_133543.zip'
'EOS model (:cite:`Schweter:Ahmed:2019`) trained on concatenated UD2.3 and CTB9.'
# Will be filled up during runtime
ALL = {}
================================================
FILE: hanlp/pretrained/fasttext.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-30 18:57
FASTTEXT_DEBUG_EMBEDDING_EN = 'https://elit-models.s3-us-west-2.amazonaws.com/fasttext.debug.bin.zip'
FASTTEXT_CC_300_EN = 'https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz'
'FastText (:cite:`bojanowski2017enriching`) embeddings trained on Common Crawl.'
FASTTEXT_WIKI_NYT_AMAZON_FRIENDS_200_EN \
= 'https://elit-models.s3-us-west-2.amazonaws.com/fasttext-200-wikipedia-nytimes-amazon-friends-20191107.bin'
'FastText (:cite:`bojanowski2017enriching`) embeddings trained on Wikipedia, NYTimes and Friends.'
FASTTEXT_WIKI_300_ZH = 'https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.zh.zip#wiki.zh.bin'
'FastText (:cite:`bojanowski2017enriching`) embeddings trained on Chinese Wikipedia.'
FASTTEXT_WIKI_300_ZH_CLASSICAL = 'https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.zh_classical.zip#wiki.zh_classical.bin'
'FastText (:cite:`bojanowski2017enriching`) embeddings trained on Classical Chinese Wikipedia.'
ALL = {}
================================================
FILE: hanlp/pretrained/glove.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-08-27 20:42
_GLOVE_6B_ROOT = 'http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip'
GLOVE_6B_50D = _GLOVE_6B_ROOT + '#' + 'glove.6B.50d.txt'
'Global Vectors for Word Representation (:cite:`pennington-etal-2014-glove`) 50d trained on 6B tokens.'
GLOVE_6B_100D = _GLOVE_6B_ROOT + '#' + 'glove.6B.100d.txt'
'Global Vectors for Word Representation (:cite:`pennington-etal-2014-glove`) 100d trained on 6B tokens.'
GLOVE_6B_200D = _GLOVE_6B_ROOT + '#' + 'glove.6B.200d.txt'
'Global Vectors for Word Representation (:cite:`pennington-etal-2014-glove`) 200d trained on 6B tokens.'
GLOVE_6B_300D = _GLOVE_6B_ROOT + '#' + 'glove.6B.300d.txt'
'Global Vectors for Word Representation (:cite:`pennington-etal-2014-glove`) 300d trained on 6B tokens.'
GLOVE_840B_300D = 'http://nlp.stanford.edu/data/glove.840B.300d.zip'
'Global Vectors for Word Representation (:cite:`pennington-etal-2014-glove`) 300d trained on 840B tokens.'
ALL = {}
================================================
FILE: hanlp/pretrained/mtl.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-22 13:16
from hanlp_common.constant import HANLP_URL
OPEN_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH = HANLP_URL + 'mtl/open_tok_pos_ner_srl_dep_sdp_con_electra_small_20201223_035557.zip'
"Electra (:cite:`clark2020electra`) small version of joint tok, pos, ner, srl, dep, sdp and con model trained on open-source Chinese corpus."
OPEN_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH = HANLP_URL + 'mtl/open_tok_pos_ner_srl_dep_sdp_con_electra_base_20201223_201906.zip'
"Electra (:cite:`clark2020electra`) base version of joint tok, pos, ner, srl, dep, sdp and con model trained on open-source Chinese corpus."
CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH = HANLP_URL + 'mtl/close_tok_pos_ner_srl_dep_sdp_con_electra_small_20210111_124159.zip'
"Electra (:cite:`clark2020electra`) small version of joint tok, pos, ner, srl, dep (SD Standard), sdp and con model trained on close-source Chinese corpus."
CLOSE_TOK_POS_NER_SRL_UDEP_SDP_CON_ELECTRA_SMALL_ZH = HANLP_URL + 'mtl/close_tok_pos_ner_srl_dep_sdp_con_electra_small_20220626_175100.zip'
'''
Electra (:cite:`clark2020electra`) small version of joint tok, pos, ner, srl, dep (UD Standard), sdp and con model trained on closed-source Chinese corpus.
Performance: ``{con UCM: 39.33% LCM: 35.69% UP: 90.24% UR: 90.28% UF: 90.26% LP: 87.55% LR: 87.59% LF: 87.57%}{dep UAS: 86.80% LAS: 82.82%}{ner/msra P: 95.45% R: 96.65% F1: 96.05%}{ner/ontonotes P: 75.98% R: 79.09% F1: 77.50%}{ner/pku P: 95.77% R: 96.75% F1: 96.26%}{pos/863 Accuracy:94.83%}{pos/ctb Accuracy:96.57%}{pos/pku Accuracy:97.54%}{sdp UF: 85.55% LF: 73.67%}{srl P: 75.71% R: 74.25% F1: 74.97%}{tok/coarse P: 97.77% R: 97.70% F1: 97.74%}{tok/fine P: 97.44% R: 97.32% F1: 97.38%}``.
'''
CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH = HANLP_URL + 'mtl/close_tok_pos_ner_srl_dep_sdp_con_electra_base_20210111_124519.zip'
"Electra (:cite:`clark2020electra`) base version of joint tok, pos, ner, srl, dep, sdp and con model trained on close-source Chinese corpus."
CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ERNIE_GRAM_ZH = HANLP_URL + 'mtl/close_tok_pos_ner_srl_dep_sdp_con_ernie_gram_base_aug_20210904_145403.zip'
"ERNIE (:cite:`xiao-etal-2021-ernie`) base version of joint tok, pos, ner, srl, dep, sdp and con model trained on close-source Chinese corpus."
UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_MMINILMV2L6 = HANLP_URL + 'mtl/ud_ontonotes_tok_pos_lem_fea_ner_srl_dep_sdp_con_mMiniLMv2L6_no_space_20220731_161526.zip'
'''
mMiniLMv2 (:cite:`wang-etal-2021-minilmv2`) L6xH384 small version of joint tok, pos, lem, fea, ner, srl, dep, sdp and con model trained on UD 2.10 and OntoNotes5 corpora.
The following 130 languages are supported: ``Afrikaans, Akkadian, Akuntsu, Albanian, Amharic, AncientGreek (to 1453), Ancient Hebrew, Apurinã, Arabic, Armenian, AssyrianNeo-Aramaic, Bambara, Basque, Beja, Belarusian, Bengali, Bhojpuri, Breton, Bulgarian, Catalan, Cebuano, Central Siberian Yupik, Chinese, Chukot, ChurchSlavic, Coptic, Croatian, Czech, Danish, Dutch, Emerillon, English, Erzya, Estonian, Faroese, Finnish, French, Galician, German, Gothic, Guajajára, Guarani, Hebrew, Hindi, Hittite, Hungarian, Icelandic, Indonesian, Irish, Italian, Japanese, Javanese, K\'iche\', Kangri, Karelian, Karo(Brazil), Kazakh, Khunsari, Komi-Permyak, Komi-Zyrian, Korean, Latin, Latvian, Ligurian, LiteraryChinese, Lithuanian, Livvi, LowGerman, Madi, Makuráp, Maltese, Manx, Marathi, MbyáGuaraní, Modern Greek (1453-), Moksha, Mundurukú, Nayini, Neapolitan, Nigerian Pidgin, NorthernKurdish, Northern Sami, Norwegian, OldFrench (842-ca. 1400), OldRussian, Old Turkish, Persian, Polish, Portuguese, Romanian, Russia Buriat, Russian, Sanskrit, ScottishGaelic, Serbian, SkoltSami, Slovak, Slovenian, Soi, South Levantine Arabic, Spanish, Swedish, SwedishSign Language, SwissGerman, Tagalog, Tamil, Tatar, Telugu, Thai, Tupinambá, Turkish, Uighur, Ukrainian, Umbrian, UpperSorbian, Urdu, Urubú-Kaapor, Vietnamese, Warlpiri, Welsh, Western Armenian, WesternFrisian, Wolof, Xibe, Yakut, Yoruba, YueChinese``.
Performance: ``{con UCM: 15.48% LCM: 11.45% UP: 68.92% UR: 66.88% UF: 67.88% LP: 61.19% LR: 59.38% LF: 60.27%}{ner P: 76.06% R: 77.83% F1: 76.93%}{sdp/dm UF: 91.84% LF: 91.00%}{sdp/pas UF: 95.46% LF: 93.90%}{sdp/psd UF: 91.94% LF: 81.26%}{srl [predicate P: 91.71% R: 74.51% F1: 82.22%][e2e P: 77.48% R: 55.28% F1: 64.52%]}{tok P: 93.17% R: 93.53% F1: 93.35%}{ud [lemmas Accuracy:81.74%][upos Accuracy:85.94%][deps UAS: 80.60% LAS: 71.21%][feats Accuracy:77.17%]}``.
'''
UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_MMINILMV2L12 = HANLP_URL + 'mtl/ud_ontonotes_tok_pos_lem_fea_ner_srl_dep_sdp_con_mMiniLMv2L12_no_space_20220807_133143.zip'
'''
mMiniLMv2 (:cite:`wang-etal-2021-minilmv2`) L12xH384 base version of joint tok, pos, lem, fea, ner, srl, dep, sdp and con model trained on UD 2.10 and OntoNotes5 corpora.
The following 130 languages are supported: ``Afrikaans, Akkadian, Akuntsu, Albanian, Amharic, AncientGreek (to 1453), Ancient Hebrew, Apurinã, Arabic, Armenian, AssyrianNeo-Aramaic, Bambara, Basque, Beja, Belarusian, Bengali, Bhojpuri, Breton, Bulgarian, Catalan, Cebuano, Central Siberian Yupik, Chinese, Chukot, ChurchSlavic, Coptic, Croatian, Czech, Danish, Dutch, Emerillon, English, Erzya, Estonian, Faroese, Finnish, French, Galician, German, Gothic, Guajajára, Guarani, Hebrew, Hindi, Hittite, Hungarian, Icelandic, Indonesian, Irish, Italian, Japanese, Javanese, K\'iche\', Kangri, Karelian, Karo(Brazil), Kazakh, Khunsari, Komi-Permyak, Komi-Zyrian, Korean, Latin, Latvian, Ligurian, LiteraryChinese, Lithuanian, Livvi, LowGerman, Madi, Makuráp, Maltese, Manx, Marathi, MbyáGuaraní, Modern Greek (1453-), Moksha, Mundurukú, Nayini, Neapolitan, Nigerian Pidgin, NorthernKurdish, Northern Sami, Norwegian, OldFrench (842-ca. 1400), OldRussian, Old Turkish, Persian, Polish, Portuguese, Romanian, Russia Buriat, Russian, Sanskrit, ScottishGaelic, Serbian, SkoltSami, Slovak, Slovenian, Soi, South Levantine Arabic, Spanish, Swedish, SwedishSign Language, SwissGerman, Tagalog, Tamil, Tatar, Telugu, Thai, Tupinambá, Turkish, Uighur, Ukrainian, Umbrian, UpperSorbian, Urdu, Urubú-Kaapor, Vietnamese, Warlpiri, Welsh, Western Armenian, WesternFrisian, Wolof, Xibe, Yakut, Yoruba, YueChinese``.
Performance: ``{con UCM: 17.32% LCM: 13.28% UP: 70.53% UR: 68.73% UF: 69.62% LP: 63.03% LR: 61.42% LF: 62.22%}{ner P: 76.91% R: 78.72% F1: 77.80%}{sdp/dm UF: 92.78% LF: 92.02%}{sdp/pas UF: 96.43% LF: 95.02%}{sdp/psd UF: 92.75% LF: 81.86%}{srl [predicate P: 91.82% R: 77.57% F1: 84.10%][e2e P: 78.33% R: 59.14% F1: 67.40%]}{tok P: 93.69% R: 94.34% F1: 94.02%}{ud [lemmas Accuracy:82.48%][upos Accuracy:87.09%][deps UAS: 82.41% LAS: 73.69%][feats Accuracy:78.58%]}``.
'''
UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_XLMR_BASE = HANLP_URL + 'mtl/ud_ontonotes_tok_pos_lem_fea_ner_srl_dep_sdp_con_xlm_base_20220608_003435.zip'
'''
XLM-R (:cite:`conneau-etal-2020-unsupervised`) base version of joint tok, pos, lem, fea, ner, srl, dep, sdp and con model trained on UD 2.10 and OntoNotes5 corpora.
The following 130 languages are supported: ``Afrikaans, Akkadian, Akuntsu, Albanian, Amharic, AncientGreek (to 1453), Ancient Hebrew, Apurinã, Arabic, Armenian, AssyrianNeo-Aramaic, Bambara, Basque, Beja, Belarusian, Bengali, Bhojpuri, Breton, Bulgarian, Catalan, Cebuano, Central Siberian Yupik, Chinese, Chukot, ChurchSlavic, Coptic, Croatian, Czech, Danish, Dutch, Emerillon, English, Erzya, Estonian, Faroese, Finnish, French, Galician, German, Gothic, Guajajára, Guarani, Hebrew, Hindi, Hittite, Hungarian, Icelandic, Indonesian, Irish, Italian, Japanese, Javanese, K\'iche\', Kangri, Karelian, Karo(Brazil), Kazakh, Khunsari, Komi-Permyak, Komi-Zyrian, Korean, Latin, Latvian, Ligurian, LiteraryChinese, Lithuanian, Livvi, LowGerman, Madi, Makuráp, Maltese, Manx, Marathi, MbyáGuaraní, Modern Greek (1453-), Moksha, Mundurukú, Nayini, Neapolitan, Nigerian Pidgin, NorthernKurdish, Northern Sami, Norwegian, OldFrench (842-ca. 1400), OldRussian, Old Turkish, Persian, Polish, Portuguese, Romanian, Russia Buriat, Russian, Sanskrit, ScottishGaelic, Serbian, SkoltSami, Slovak, Slovenian, Soi, South Levantine Arabic, Spanish, Swedish, SwedishSign Language, SwissGerman, Tagalog, Tamil, Tatar, Telugu, Thai, Tupinambá, Turkish, Uighur, Ukrainian, Umbrian, UpperSorbian, Urdu, Urubú-Kaapor, Vietnamese, Warlpiri, Welsh, Western Armenian, WesternFrisian, Wolof, Xibe, Yakut, Yoruba, YueChinese``.
Performance: ``{con UCM: 20.31% LCM: 16.82% UP: 77.50% UR: 76.63% UF: 77.06% LP: 71.25% LR: 70.46% LF: 70.85%}{ner P: 79.93% R: 80.76% F1: 80.34%}{sdp/dm UF: 93.71% LF: 93.00%}{sdp/pas UF: 97.63% LF: 96.37%}{sdp/psd UF: 93.08% LF: 80.95%}{srl [predicate P: 90.95% R: 84.25% F1: 87.47%][e2e P: 78.89% R: 67.32% F1: 72.65%]}{tok P: 98.50% R: 98.70% F1: 98.60%}{ud [lemmas Accuracy:85.95%][upos Accuracy:89.95%][deps UAS: 85.78% LAS: 78.51%][feats Accuracy:82.18%]}``.
'''
NPCMJ_UD_KYOTO_TOK_POS_CON_BERT_BASE_CHAR_JA = HANLP_URL + 'mtl/npcmj_ud_kyoto_tok_pos_ner_dep_con_srl_bert_base_char_ja_20210914_133742.zip'
'BERT (:cite:`devlin-etal-2019-bert`) base char encoder trained on NPCMJ/UD/Kyoto corpora with decoders including tok, pos, ner, dep, con, srl.'
# Will be filled up during runtime
ALL = {}
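# --- Usage sketch (added for illustration; not part of the original file) ---
# The identifiers above are plain URLs accepted by hanlp.load, e.g.:
#     import hanlp
#     mtl = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH)
#     doc = mtl(['商品和服务'])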
================================================
FILE: hanlp/pretrained/ner.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-30 20:07
from hanlp_common.constant import HANLP_URL
MSRA_NER_BERT_BASE_ZH = HANLP_URL + 'ner/ner_bert_base_msra_20211227_114712.zip'
'BERT model (:cite:`devlin-etal-2019-bert`) trained on MSRA with 3 entity types.'
MSRA_NER_ALBERT_BASE_ZH = HANLP_URL + 'ner/msra_ner_albert_base_20211228_173323.zip'
'ALBERT model (:cite:`Lan2020ALBERT:`) trained on MSRA with 3 entity types.'
MSRA_NER_ELECTRA_SMALL_ZH = HANLP_URL + 'ner/msra_ner_electra_small_20220215_205503.zip'
'Electra small model (:cite:`clark2020electra`) trained on MSRA with 26 entity types. F1 = `95.16`'
CONLL03_NER_BERT_BASE_CASED_EN = HANLP_URL + 'ner/ner_conll03_bert_base_cased_en_20211227_121443.zip'
'BERT model (:cite:`devlin-etal-2019-bert`) trained on CoNLL03.'
ALL = {}
================================================
FILE: hanlp/pretrained/pos.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-29 01:57
from hanlp_common.constant import HANLP_URL
CTB5_POS_RNN = HANLP_URL + 'pos/ctb5_pos_rnn_20200113_235925.zip'
'An old-school BiLSTM tagging model trained on CTB5.'
CTB5_POS_RNN_FASTTEXT_ZH = HANLP_URL + 'pos/ctb5_pos_rnn_fasttext_20191230_202639.zip'
'An old-school BiLSTM tagging model with FastText (:cite:`bojanowski2017enriching`) embeddings trained on CTB5.'
CTB9_POS_ALBERT_BASE = HANLP_URL + 'pos/ctb9_albert_base_20211228_163935.zip'
'ALBERT model (:cite:`Lan2020ALBERT:`) trained on CTB9 (:cite:`https://doi.org/10.35111/gvd0-xk91`). This is a TF component.'
CTB9_POS_ELECTRA_SMALL_TF = HANLP_URL + 'pos/pos_ctb_electra_small_20211227_121341.zip'
'Electra small model (:cite:`clark2020electra`) trained on CTB9 (:cite:`https://doi.org/10.35111/gvd0-xk91`). Accuracy = `96.75`. This is a TF component.'
CTB9_POS_ELECTRA_SMALL = HANLP_URL + 'pos/pos_ctb_electra_small_20220215_111944.zip'
'Electra small model (:cite:`clark2020electra`) trained on CTB9 (:cite:`https://doi.org/10.35111/gvd0-xk91`). Accuracy = `96.26`.'
CTB9_POS_RADICAL_ELECTRA_SMALL = HANLP_URL + 'pos/pos_ctb_radical_electra_small_20220215_111932.zip'
'Electra small model (:cite:`clark2020electra`) with radical embeddings (:cite:`he2018dual`) trained on CTB9 (:cite:`https://doi.org/10.35111/gvd0-xk91`). Accuracy = `96.14`.'
C863_POS_ELECTRA_SMALL = HANLP_URL + 'pos/pos_863_electra_small_20220217_101958.zip'
'Electra small model (:cite:`clark2020electra`) trained on Chinese 863 corpus. Accuracy = `95.19`.'
PKU_POS_ELECTRA_SMALL = HANLP_URL + 'pos/pos_pku_electra_small_20220217_142436.zip'
'Electra small model (:cite:`clark2020electra`) trained on Chinese PKU corpus. Accuracy = `97.55`.'
PTB_POS_RNN_FASTTEXT_EN = HANLP_URL + 'pos/ptb_pos_rnn_fasttext_20220418_101708.zip'
'An old-school BiLSTM tagging model with FastText (:cite:`bojanowski2017enriching`) embeddings trained on PTB.'
ALL = {}
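# A minimal usage sketch, assuming the standard ``hanlp.load`` API and that the
# tagger takes pre-tokenized words; the sample input is illustrative only.
if __name__ == '__main__':
    import hanlp
    pos = hanlp.load(CTB9_POS_ELECTRA_SMALL)
    print(pos(['我', '的', '希望', '是', '希望', '和平']))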
================================================
FILE: hanlp/pretrained/rnnlm.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-19 03:47
from hanlp_common.constant import HANLP_URL
FLAIR_LM_FW_WMT11_EN_TF = HANLP_URL + 'lm/flair_lm_wmt11_en_20200211_091932.zip#flair_lm_fw_wmt11_en'
'The forward LSTM of Contextual String Embedding (:cite:`akbik-etal-2018-contextual`).'
FLAIR_LM_BW_WMT11_EN_TF = HANLP_URL + 'lm/flair_lm_wmt11_en_20200211_091932.zip#flair_lm_bw_wmt11_en'
'The backward LSTM of Contextual String Embedding (:cite:`akbik-etal-2018-contextual`).'
FLAIR_LM_WMT11_EN = HANLP_URL + 'lm/flair_lm_wmt11_en_20200601_205350.zip'
'The BiLSTM of Contextual String Embedding (:cite:`akbik-etal-2018-contextual`).'
ALL = {}
================================================
FILE: hanlp/pretrained/sdp.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-31 23:54
from hanlp_common.constant import HANLP_URL
SEMEVAL16_NEWS_BIAFFINE_ZH = HANLP_URL + 'sdp/semeval16-news-biaffine_20191231_235407.zip'
'Biaffine SDP (:cite:`he-choi-2019`) trained on SemEval16 news data.'
SEMEVAL16_TEXT_BIAFFINE_ZH = HANLP_URL + 'sdp/semeval16-text-biaffine_20200101_002257.zip'
'Biaffine SDP (:cite:`he-choi-2019`) trained on SemEval16 text data.'
SEMEVAL16_ALL_ELECTRA_SMALL_ZH = HANLP_URL + 'sdp/semeval16_sdp_electra_small_20220719_171433.zip'
'Biaffine SDP (:cite:`he-choi-2019`) trained on SemEval16 text and news data. Performance: ``UF: 83.03% LF: 72.58%``'
SEMEVAL15_PAS_BIAFFINE_EN = HANLP_URL + 'sdp/semeval15_biaffine_pas_20200103_152405.zip'
'Biaffine SDP (:cite:`he-choi-2019`) trained on SemEval15 PAS data.'
SEMEVAL15_PSD_BIAFFINE_EN = HANLP_URL + 'sdp/semeval15_biaffine_psd_20200106_123009.zip'
'Biaffine SDP (:cite:`he-choi-2019`) trained on SemEval15 PSD data.'
SEMEVAL15_DM_BIAFFINE_EN = HANLP_URL + 'sdp/semeval15_biaffine_dm_20200106_122808.zip'
'Biaffine SDP (:cite:`he-choi-2019`) trained on SemEval15 DM data.'
ALL = {}
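# A minimal usage sketch, assuming the standard ``hanlp.load`` API and that the
# parser takes pre-tokenized sentences; the sample input is illustrative only.
if __name__ == '__main__':
    import hanlp
    sdp = hanlp.load(SEMEVAL16_ALL_ELECTRA_SMALL_ZH)
    print(sdp(['蜡烛', '两', '头', '烧']))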
================================================
FILE: hanlp/pretrained/srl.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-08-07 19:07
from hanlp_common.constant import HANLP_URL
CPB3_SRL_ELECTRA_SMALL = HANLP_URL + 'srl/cpb3_electra_small_crf_has_transform_20220218_135910.zip'
'Electra small model (:cite:`clark2020electra`) trained on CPB3. Its performance is ``P: 75.87% R: 76.24% F1: 76.05%``.'
ALL = {}
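# A minimal usage sketch, assuming the standard ``hanlp.load`` API and that the
# labeler takes pre-tokenized sentences; the sample input is illustrative only.
if __name__ == '__main__':
    import hanlp
    srl = hanlp.load(CPB3_SRL_ELECTRA_SMALL)
    print(srl(['男孩', '希望', '女孩', '相信', '他']))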
================================================
FILE: hanlp/pretrained/sts.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-05-24 12:51
from hanlp_common.constant import HANLP_URL
STS_ELECTRA_BASE_ZH = HANLP_URL + 'sts/sts_electra_base_zh_20210530_200109.zip'
'A naive regression model trained on concatenated STS corpora.'
# Will be filled up during runtime
ALL = {}
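# A minimal usage sketch, assuming the standard ``hanlp.load`` API and that the
# model scores pairs of sentences; the sample pair is illustrative only.
if __name__ == '__main__':
    import hanlp
    sts = hanlp.load(STS_ELECTRA_BASE_ZH)
    print(sts([('看图猜一电影名', '看图猜电影')]))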
================================================
FILE: hanlp/pretrained/tok.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 21:12
from hanlp_common.constant import HANLP_URL
SIGHAN2005_PKU_CONVSEG = HANLP_URL + 'tok/sighan2005-pku-convseg_20200110_153722.zip'
'Conv model (:cite:`wang-xu-2017-convolutional`) trained on sighan2005 pku dataset.'
SIGHAN2005_MSR_CONVSEG = HANLP_URL + 'tok/convseg-msr-nocrf-noembed_20200110_153524.zip'
'Conv model (:cite:`wang-xu-2017-convolutional`) trained on sighan2005 msr dataset.'
CTB6_CONVSEG = HANLP_URL + 'tok/ctb6_convseg_nowe_nocrf_20200110_004046.zip'
'Conv model (:cite:`wang-xu-2017-convolutional`) trained on CTB6 dataset.'
PKU_NAME_MERGED_SIX_MONTHS_CONVSEG = HANLP_URL + 'tok/pku98_6m_conv_ngram_20200110_134736.zip'
'Conv model (:cite:`wang-xu-2017-convolutional`) trained on pku98 six months dataset with family name and given name merged into one unit.'
LARGE_ALBERT_BASE = HANLP_URL + 'tok/large_corpus_cws_albert_base_20211228_160926.zip'
'ALBERT model (:cite:`Lan2020ALBERT:`) trained on the largest CWS dataset in the world.'
SIGHAN2005_PKU_BERT_BASE_ZH = HANLP_URL + 'tok/sighan2005_pku_bert_base_zh_20201231_141130.zip'
'BERT model (:cite:`devlin-etal-2019-bert`) trained on sighan2005 pku dataset.'
COARSE_ELECTRA_SMALL_ZH = HANLP_URL + 'tok/coarse_electra_small_20220616_012050.zip'
'Electra (:cite:`clark2020electra`) small model trained on coarse-grained CWS corpora. Its performance is ``P: 98.34% R: 98.38% F1: 98.36%``, much higher than that of the MTL model.'
FINE_ELECTRA_SMALL_ZH = HANLP_URL + 'tok/fine_electra_small_20220615_231803.zip'
'Electra (:cite:`clark2020electra`) small model trained on fine-grained CWS corpora. Its performance is ``P: 98.14% R: 98.07% F1: 98.11%``, much higher than that of the MTL model.'
CTB9_TOK_ELECTRA_SMALL = HANLP_URL + 'tok/ctb9_electra_small_20220215_205427.zip'
'Electra (:cite:`clark2020electra`) small model trained on CTB9. Its performance is ``P: 97.15% R: 97.36% F1: 97.26%``, much higher than that of the MTL model.'
CTB9_TOK_ELECTRA_BASE = 'http://download.hanlp.com/tok/extra/ctb9_tok_electra_base_20220426_111949.zip'
'Electra (:cite:`clark2020electra`) base model trained on CTB9. Its performance is ``P: 97.62% R: 97.67% F1: 97.65%``, much higher than that of the MTL model.'
CTB9_TOK_ELECTRA_BASE_CRF = 'http://download.hanlp.com/tok/extra/ctb9_tok_electra_base_crf_20220426_161255.zip'
'Electra (:cite:`clark2020electra`) base model trained on CTB9. Its performance is ``P: 97.68% R: 97.71% F1: 97.69%``, much higher than that of the MTL model.'
MSR_TOK_ELECTRA_BASE_CRF = 'http://download.hanlp.com/tok/extra/msra_crf_electra_base_20220507_113936.zip'
'Electra (:cite:`clark2020electra`) base model trained on the MSR CWS dataset. Its performance is ``P: 98.71% R: 98.64% F1: 98.68%``, much higher than that of the MTL model.'
UD_TOK_MMINILMV2L6 = HANLP_URL + 'tok/ud_tok_mMiniLMv2L6_no_space_mul_20220619_091824.zip'
'''
mMiniLMv2 (:cite:`wang-etal-2021-minilmv2`) L6xH384 based tokenizer trained on UD 2.10.
The following 130 languages are supported: ``Afrikaans, Akkadian, Akuntsu, Albanian, Amharic, Ancient Greek (to 1453), Ancient Hebrew, Apurinã, Arabic, Armenian, Assyrian Neo-Aramaic, Bambara, Basque, Beja, Belarusian, Bengali, Bhojpuri, Breton, Bulgarian, Catalan, Cebuano, Central Siberian Yupik, Chinese, Chukot, Church Slavic, Coptic, Croatian, Czech, Danish, Dutch, Emerillon, English, Erzya, Estonian, Faroese, Finnish, French, Galician, German, Gothic, Guajajára, Guarani, Hebrew, Hindi, Hittite, Hungarian, Icelandic, Indonesian, Irish, Italian, Japanese, Javanese, K\'iche\', Kangri, Karelian, Karo (Brazil), Kazakh, Khunsari, Komi-Permyak, Komi-Zyrian, Korean, Latin, Latvian, Ligurian, Literary Chinese, Lithuanian, Livvi, Low German, Madi, Makuráp, Maltese, Manx, Marathi, Mbyá Guaraní, Modern Greek (1453-), Moksha, Mundurukú, Nayini, Neapolitan, Nigerian Pidgin, Northern Kurdish, Northern Sami, Norwegian, Old French (842-ca. 1400), Old Russian, Old Turkish, Persian, Polish, Portuguese, Romanian, Russia Buriat, Russian, Sanskrit, Scottish Gaelic, Serbian, Skolt Sami, Slovak, Slovenian, Soi, South Levantine Arabic, Spanish, Swedish, Swedish Sign Language, Swiss German, Tagalog, Tamil, Tatar, Telugu, Thai, Tupinambá, Turkish, Uighur, Ukrainian, Umbrian, Upper Sorbian, Urdu, Urubú-Kaapor, Vietnamese, Warlpiri, Welsh, Western Armenian, Western Frisian, Wolof, Xibe, Yakut, Yoruba, Yue Chinese``.
Performance: ``P: 94.99% R: 94.74% F1: 94.86%``.
'''
UD_TOK_MMINILMV2L12 = HANLP_URL + 'tok/ud_tok_mMiniLMv2L12_no_space_mul_20220619_091159.zip'
'''
mMiniLMv2 (:cite:`wang-etal-2021-minilmv2`) L12xH384 based tokenizer trained on UD 2.10.
The following 130 languages are supported: ``Afrikaans, Akkadian, Akuntsu, Albanian, Amharic, Ancient Greek (to 1453), Ancient Hebrew, Apurinã, Arabic, Armenian, Assyrian Neo-Aramaic, Bambara, Basque, Beja, Belarusian, Bengali, Bhojpuri, Breton, Bulgarian, Catalan, Cebuano, Central Siberian Yupik, Chinese, Chukot, Church Slavic, Coptic, Croatian, Czech, Danish, Dutch, Emerillon, English, Erzya, Estonian, Faroese, Finnish, French, Galician, German, Gothic, Guajajára, Guarani, Hebrew, Hindi, Hittite, Hungarian, Icelandic, Indonesian, Irish, Italian, Japanese, Javanese, K\'iche\', Kangri, Karelian, Karo (Brazil), Kazakh, Khunsari, Komi-Permyak, Komi-Zyrian, Korean, Latin, Latvian, Ligurian, Literary Chinese, Lithuanian, Livvi, Low German, Madi, Makuráp, Maltese, Manx, Marathi, Mbyá Guaraní, Modern Greek (1453-), Moksha, Mundurukú, Nayini, Neapolitan, Nigerian Pidgin, Northern Kurdish, Northern Sami, Norwegian, Old French (842-ca. 1400), Old Russian, Old Turkish, Persian, Polish, Portuguese, Romanian, Russia Buriat, Russian, Sanskrit, Scottish Gaelic, Serbian, Skolt Sami, Slovak, Slovenian, Soi, South Levantine Arabic, Spanish, Swedish, Swedish Sign Language, Swiss German, Tagalog, Tamil, Tatar, Telugu, Thai, Tupinambá, Turkish, Uighur, Ukrainian, Umbrian, Upper Sorbian, Urdu, Urubú-Kaapor, Vietnamese, Warlpiri, Welsh, Western Armenian, Western Frisian, Wolof, Xibe, Yakut, Yoruba, Yue Chinese``.
Performance: ``P: 95.41% R: 95.25% F1: 95.33%``.
'''
# Will be filled up during runtime
ALL = {}
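# A minimal usage sketch, assuming the standard ``hanlp.load`` API; tokenizers
# take raw text, and the sample sentence is illustrative only.
if __name__ == '__main__':
    import hanlp
    tok = hanlp.load(COARSE_ELECTRA_SMALL_ZH)
    print(tok('商品和服务'))  # expected to yield a word list such as ['商品', '和', '服务']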
================================================
FILE: hanlp/pretrained/word2vec.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-21 18:25
from hanlp_common.constant import HANLP_URL
CONVSEG_W2V_NEWS_TENSITE = HANLP_URL + 'embeddings/convseg_embeddings.zip'
CONVSEG_W2V_NEWS_TENSITE_WORD_PKU = CONVSEG_W2V_NEWS_TENSITE + '#news_tensite.pku.words.w2v50'
CONVSEG_W2V_NEWS_TENSITE_WORD_MSR = CONVSEG_W2V_NEWS_TENSITE + '#news_tensite.msr.words.w2v50'
CONVSEG_W2V_NEWS_TENSITE_CHAR = CONVSEG_W2V_NEWS_TENSITE + '#news_tensite.w2v200'
SEMEVAL16_EMBEDDINGS_CN = HANLP_URL + 'embeddings/semeval16_embeddings.zip'
SEMEVAL16_EMBEDDINGS_300_NEWS_CN = SEMEVAL16_EMBEDDINGS_CN + '#news.fasttext.300.txt'
SEMEVAL16_EMBEDDINGS_300_TEXT_CN = SEMEVAL16_EMBEDDINGS_CN + '#text.fasttext.300.txt'
CTB5_FASTTEXT_300_CN = HANLP_URL + 'embeddings/ctb.fasttext.300.txt.zip'
TENCENT_AILAB_EMBEDDING_SMALL_200 = 'https://ai.tencent.com/ailab/nlp/en/data/tencent-ailab-embedding-zh-d200-v0.2.0-s.tar.gz#tencent-ailab-embedding-zh-d200-v0.2.0-s.txt'
'Chinese word embeddings (:cite:`NIPS2013_9aa42b31`) with a small vocabulary and 200 dimensions, provided by Tencent AI Lab.'
TENCENT_AILAB_EMBEDDING_LARGE_200 = 'https://ai.tencent.com/ailab/nlp/en/data/tencent-ailab-embedding-zh-d200-v0.2.0.tar.gz#tencent-ailab-embedding-zh-d200-v0.2.0.txt'
'Chinese word embeddings (:cite:`NIPS2013_9aa42b31`) with a large vocabulary and 200 dimensions, provided by Tencent AI Lab.'
TENCENT_AILAB_EMBEDDING_SMALL_100 = 'https://ai.tencent.com/ailab/nlp/en/data/tencent-ailab-embedding-zh-d100-v0.2.0-s.tar.gz#tencent-ailab-embedding-zh-d100-v0.2.0-s.txt'
'Chinese word embeddings (:cite:`NIPS2013_9aa42b31`) with a small vocabulary and 100 dimensions, provided by Tencent AI Lab.'
TENCENT_AILAB_EMBEDDING_LARGE_100 = 'https://ai.tencent.com/ailab/nlp/en/data/tencent-ailab-embedding-zh-d100-v0.2.0.tar.gz#tencent-ailab-embedding-zh-d100-v0.2.0.txt'
'Chinese word embeddings (:cite:`NIPS2013_9aa42b31`) with a large vocabulary and 100 dimensions, provided by Tencent AI Lab.'
MERGE_SGNS_BIGRAM_CHAR_300_ZH = 'http://download.hanlp.com/embeddings/extra/merge_sgns_bigram_char300_20220130_214613.txt.zip'
'Chinese word embeddings trained with context features (word, ngram, character, and more) using Skip-Gram with Negative Sampling (SGNS) (:cite:`li-etal-2018-analogical`).'
RADICAL_CHAR_EMBEDDING_100 = HANLP_URL + 'embeddings/radical_char_vec_20191229_013849.zip#character.vec.txt'
'Chinese character embedding enhanced with rich radical information (:cite:`he2018dual`).'
_SUBWORD_ENCODING_CWS = 'http://download.hanlp.com/embeddings/extra/subword_encoding_cws_20200524_190636.zip'
SUBWORD_ENCODING_CWS_ZH_WIKI_BPE_50 = _SUBWORD_ENCODING_CWS + '#zh.wiki.bpe.vs200000.d50.w2v.txt'
SUBWORD_ENCODING_CWS_GIGAWORD_UNI = _SUBWORD_ENCODING_CWS + '#gigaword_chn.all.a2b.uni.ite50.vec'
SUBWORD_ENCODING_CWS_GIGAWORD_BI = _SUBWORD_ENCODING_CWS + '#gigaword_chn.all.a2b.bi.ite50.vec'
SUBWORD_ENCODING_CWS_CTB_GAZETTEER_50 = _SUBWORD_ENCODING_CWS + '#ctb.50d.vec'
ALL = {}
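# A minimal usage sketch, assuming the standard ``hanlp.load`` API and that the
# loaded component maps a word to its vector; the query word is illustrative.
if __name__ == '__main__':
    import hanlp
    word2vec = hanlp.load(CONVSEG_W2V_NEWS_TENSITE_WORD_PKU)
    print(word2vec('先进'))  # a 50-dimensional vector, judging by the ``w2v50`` suffix above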
================================================
FILE: hanlp/transform/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-29 22:24
================================================
FILE: hanlp/transform/conll_tf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-05-08 15:30
from abc import abstractmethod
from collections import Counter
from typing import Union, Tuple, Iterable, Any, Generator
import numpy as np
import tensorflow as tf
from transformers import PreTrainedTokenizer, PretrainedConfig
from hanlp_common.constant import ROOT
from hanlp_common.structure import SerializableDict
from hanlp.common.transform_tf import Transform
from hanlp.common.vocab_tf import VocabTF
from hanlp.components.parsers.alg_tf import tolist, kmeans, randperm, arange
from hanlp.components.parsers.conll import read_conll
from hanlp_common.conll import CoNLLWord, CoNLLUWord, CoNLLSentence
from hanlp.layers.transformers.utils_tf import config_is, adjust_tokens_for_transformers, convert_examples_to_features
from hanlp.utils.log_util import logger
from hanlp.utils.string_util import ispunct
from hanlp_common.util import merge_locals_kwargs
class CoNLLTransform(Transform):
def __init__(self, config: SerializableDict = None, map_x=True, map_y=True, lower=True, n_buckets=32, min_freq=2,
use_pos=True, **kwargs) -> None:
super().__init__(**merge_locals_kwargs(locals(), kwargs))
self.form_vocab: VocabTF = None
if use_pos:
self.cpos_vocab: VocabTF = None
self.rel_vocab: VocabTF = None
self.puncts: tf.Tensor = None
@property
def use_pos(self):
return self.config.get('use_pos', True)
def x_to_idx(self, x) -> Union[tf.Tensor, Tuple]:
form, cpos = x
return self.form_vocab.token_to_idx_table.lookup(form), self.cpos_vocab.token_to_idx_table.lookup(cpos)
def y_to_idx(self, y):
head, rel = y
return head, self.rel_vocab.token_to_idx_table.lookup(rel)
def X_to_inputs(self, X: Union[tf.Tensor, Tuple[tf.Tensor]]) -> Iterable:
if len(X) == 2:
form_batch, cposes_batch = X
mask = tf.not_equal(form_batch, 0)
elif len(X) == 3:
form_batch, cposes_batch, mask = X
else:
raise ValueError(f'Expected X to have 2 or 3 elements but got {repr(X)}')
sents = []
for form_sent, cposes_sent, length in zip(form_batch, cposes_batch,
tf.math.count_nonzero(mask, axis=-1)):
forms = tolist(form_sent)[1:length + 1]
cposes = tolist(cposes_sent)[1:length + 1]
sents.append([(self.form_vocab.idx_to_token[f],
self.cpos_vocab.idx_to_token[c]) for f, c in zip(forms, cposes)])
return sents
def lock_vocabs(self):
super().lock_vocabs()
self.puncts = tf.constant([i for s, i in self.form_vocab.token_to_idx.items()
if ispunct(s)], dtype=tf.int64)
def file_to_inputs(self, filepath: str, gold=True):
assert gold, 'only gold files are supported for now'
use_pos = self.use_pos
conllu = filepath.endswith('.conllu')
for sent in read_conll(filepath):
for i, cell in enumerate(sent):
form = cell[1]
cpos = cell[3]
head = cell[6]
deprel = cell[7]
# if conllu:
# deps = cell[8]
# deps = [x.split(':', 1) for x in deps.split('|')]
# heads = [int(x[0]) for x in deps if '_' not in x[0] and '.' not in x[0]]
# rels = [x[1] for x in deps if '_' not in x[0] and '.' not in x[0]]
# if head in heads:
# offset = heads.index(head)
# if not self.rel_vocab or rels[offset] in self.rel_vocab:
# deprel = rels[offset]
sent[i] = [form, cpos, head, deprel] if use_pos else [form, head, deprel]
yield sent
@property
def bos(self):
if self.form_vocab.idx_to_token is None:
return ROOT
return self.form_vocab.idx_to_token[2]
def input_is_single_sample(self, input: Any) -> bool:
if self.use_pos:
return isinstance(input[0][0], str) if len(input[0]) else False
else:
return isinstance(input[0], str) if len(input[0]) else False
@abstractmethod
def batched_inputs_to_batches(self, corpus, indices, shuffle):
pass
def len_of_sent(self, sent):
return 1 + len(sent) # take ROOT into account
def samples_to_dataset(self, samples: Generator, map_x=None, map_y=None, batch_size=5000, shuffle=None, repeat=None,
drop_remainder=False, prefetch=1, cache=True) -> tf.data.Dataset:
if shuffle:
def generator():
# custom bucketing, load corpus into memory
corpus = list(x for x in (samples() if callable(samples) else samples))
lengths = [self.len_of_sent(i) for i in corpus]
if len(corpus) < 32:
n_buckets = 1
else:
n_buckets = min(self.config.n_buckets, len(corpus))
buckets = dict(zip(*kmeans(lengths, n_buckets)))
sizes, buckets = zip(*[
(size, bucket) for size, bucket in buckets.items()
])
# the number of chunks in each bucket, which is clipped by
# range [1, len(bucket)]
chunks = [min(len(bucket), max(round(size * len(bucket) / batch_size), 1)) for size, bucket in
zip(sizes, buckets)]
range_fn = randperm if shuffle else arange
max_samples_per_batch = self.config.get('max_samples_per_batch', None)
for i in tolist(range_fn(len(buckets))):
split_sizes = [(len(buckets[i]) - j - 1) // chunks[i] + 1
for j in range(chunks[i])] # how many sentences in each batch
for batch_indices in tf.split(range_fn(len(buckets[i])), split_sizes):
indices = [buckets[i][j] for j in tolist(batch_indices)]
if max_samples_per_batch:
for j in range(0, len(indices), max_samples_per_batch):
yield from self.batched_inputs_to_batches(corpus, indices[j:j + max_samples_per_batch],
shuffle)
else:
yield from self.batched_inputs_to_batches(corpus, indices, shuffle)
else:
def generator():
# custom bucketing, load corpus into memory
corpus = list(x for x in (samples() if callable(samples) else samples))
n_tokens = 0
batch = []
for idx, sent in enumerate(corpus):
sent_len = self.len_of_sent(sent)
if n_tokens + sent_len > batch_size and batch:
yield from self.batched_inputs_to_batches(corpus, batch, shuffle)
n_tokens = 0
batch = []
n_tokens += sent_len
batch.append(idx)
if batch:
yield from self.batched_inputs_to_batches(corpus, batch, shuffle)
# next(generator())
return Transform.samples_to_dataset(self, generator, False, False, 0, False, repeat, drop_remainder, prefetch,
cache)
class CoNLL_DEP_Transform(CoNLLTransform):
def __init__(self, config: SerializableDict = None, map_x=True, map_y=True, lower=True, n_buckets=32,
min_freq=2, **kwargs) -> None:
super().__init__(config, map_x, map_y, lower, n_buckets, min_freq, **kwargs)
def batched_inputs_to_batches(self, corpus, indices, shuffle):
"""Convert batched inputs to batches of samples
Args:
corpus(list): A list of inputs
indices(list): A list of indices, each list belongs to a batch
shuffle:
Returns:
"""
raw_batch = [[], [], [], []]
for idx in indices:
for b in raw_batch:
b.append([])
for cells in corpus[idx]:
for b, c, v in zip(raw_batch, cells,
[self.form_vocab, self.cpos_vocab, None, self.rel_vocab]):
b[-1].append(v.get_idx_without_add(c) if v else c)
batch = []
for b, v in zip(raw_batch, [self.form_vocab, self.cpos_vocab, None, self.rel_vocab]):
b = tf.keras.preprocessing.sequence.pad_sequences(b, padding='post',
value=v.safe_pad_token_idx if v else 0,
dtype='int64')
batch.append(b)
assert len(batch) == 4
yield (batch[0], batch[1]), (batch[2], batch[3])
def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]:
types = (tf.int64, tf.int64), (tf.int64, tf.int64)
shapes = ([None, None], [None, None]), ([None, None], [None, None])
values = (self.form_vocab.safe_pad_token_idx, self.cpos_vocab.safe_pad_token_idx), (
0, self.rel_vocab.safe_pad_token_idx)
return types, shapes, values
def inputs_to_samples(self, inputs, gold=False):
token_mapping: dict = self.config.get('token_mapping', None)
use_pos = self.config.get('use_pos', True)
for sent in inputs:
sample = []
for i, cell in enumerate(sent):
if isinstance(cell, tuple):
cell = list(cell)
elif isinstance(cell, str):
cell = [cell]
if token_mapping:
cell[0] = token_mapping.get(cell[0], cell[0])
if self.config['lower']:
cell[0] = cell[0].lower()
if not gold:
cell += [0, self.rel_vocab.safe_pad_token]
sample.append(cell)
# insert a root word with arbitrary fields; it will be masked anyway
# form, cpos, head, deprel = sample[0]
sample.insert(0, [self.bos, self.bos, 0, self.bos] if use_pos else [self.bos, 0, self.bos])
yield sample
def XY_to_inputs_outputs(self, X: Union[tf.Tensor, Tuple[tf.Tensor]], Y: Union[tf.Tensor, Tuple[tf.Tensor]],
gold=False, inputs=None, conll=True, arc_scores=None, rel_scores=None) -> Iterable:
(words, feats, mask), (arc_preds, rel_preds) = X, Y
if inputs is None:
inputs = self.X_to_inputs(X)
ys = self.Y_to_outputs((arc_preds, rel_preds, mask), inputs=inputs)
sents = []
for x, y in zip(inputs, ys):
sent = CoNLLSentence()
for idx, (cell, (head, deprel)) in enumerate(zip(x, y)):
if self.use_pos and not self.config.get('joint_pos', None):
form, cpos = cell
else:
form, cpos = cell, None
if conll:
sent.append(
CoNLLWord(id=idx + 1, form=form, cpos=cpos, head=head, deprel=deprel) if conll == '.conll'
else CoNLLUWord(id=idx + 1, form=form, upos=cpos, head=head, deprel=deprel))
else:
sent.append([head, deprel])
sents.append(sent)
return sents
def fit(self, trn_path: str, **kwargs) -> int:
use_pos = self.config.use_pos
self.form_vocab = VocabTF()
self.form_vocab.add(ROOT)  # make root the 2nd element while the 0th is pad and the 1st is unk
if self.use_pos:
self.cpos_vocab = VocabTF(pad_token=None, unk_token=None)
self.rel_vocab = VocabTF(pad_token=None, unk_token=None)
num_samples = 0
counter = Counter()
for sent in self.file_to_samples(trn_path, gold=True):
num_samples += 1
for idx, cell in enumerate(sent):
if use_pos:
form, cpos, head, deprel = cell
else:
form, head, deprel = cell
if idx == 0:
root = form
else:
counter[form] += 1
if use_pos:
self.cpos_vocab.add(cpos)
self.rel_vocab.add(deprel)
for token in [token for token, freq in counter.items() if freq >= self.config.min_freq]:
self.form_vocab.add(token)
return num_samples
@property
def root_rel_idx(self):
root_rel_idx = self.config.get('root_rel_idx', None)
if root_rel_idx is None:
for idx, rel in enumerate(self.rel_vocab.idx_to_token):
if 'root' in rel.lower() and rel != self.bos:
self.config['root_rel_idx'] = root_rel_idx = idx
break
return root_rel_idx
def Y_to_outputs(self, Y: Union[tf.Tensor, Tuple[tf.Tensor]], gold=False, inputs=None, X=None) -> Iterable:
arc_preds, rel_preds, mask = Y
sents = []
for arc_sent, rel_sent, length in zip(arc_preds, rel_preds,
tf.math.count_nonzero(mask, axis=-1)):
arcs = tolist(arc_sent)[1:length + 1]
rels = tolist(rel_sent)[1:length + 1]
sents.append([(a, self.rel_vocab.idx_to_token[r]) for a, r in zip(arcs, rels)])
return sents
class CoNLL_Transformer_Transform(CoNLL_DEP_Transform):
def __init__(self, config: SerializableDict = None, map_x=True, map_y=True,
lower=True, n_buckets=32, min_freq=0, max_seq_length=256, use_pos=False,
mask_p=None, graph=False, topk=None,
**kwargs) -> None:
super().__init__(**merge_locals_kwargs(locals(), kwargs))
self.tokenizer: PreTrainedTokenizer = None
self.transformer_config: PretrainedConfig = None
if graph:
self.orphan_relation = ROOT
def lock_vocabs(self):
super().lock_vocabs()
if self.graph:
CoNLL_SDP_Transform._find_orphan_relation(self)
def fit(self, trn_path: str, **kwargs) -> int:
if self.config.get('joint_pos', None):
self.config.use_pos = True
if self.graph:
# noinspection PyCallByClass
num = CoNLL_SDP_Transform.fit(self, trn_path, **kwargs)
else:
num = super().fit(trn_path, **kwargs)
if self.config.get('topk', None):
counter = Counter()
for sent in self.file_to_samples(trn_path, gold=True):
for idx, cell in enumerate(sent):
form, head, deprel = cell
counter[form] += 1
self.topk_vocab = VocabTF()
for k, v in counter.most_common(self.config.topk):
self.topk_vocab.add(k)
return num
def inputs_to_samples(self, inputs, gold=False):
if self.graph:
yield from CoNLL_SDP_Transform.inputs_to_samples(self, inputs, gold)
else:
yield from super().inputs_to_samples(inputs, gold)
def file_to_inputs(self, filepath: str, gold=True):
if self.graph:
yield from CoNLL_SDP_Transform.file_to_inputs(self, filepath, gold)
else:
yield from super().file_to_inputs(filepath, gold)
@property
def mask_p(self) -> float:
return self.config.get('mask_p', None)
@property
def graph(self):
return self.config.get('graph', None)
def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]:
mask_p = self.mask_p
types = (tf.int64, (tf.int64, tf.int64, tf.int64)), (tf.bool if self.graph else tf.int64, tf.int64, tf.int64) if mask_p else (
tf.bool if self.graph else tf.int64, tf.int64)
if self.graph:
shapes = ([None, None], ([None, None], [None, None], [None, None])), (
[None, None, None], [None, None, None], [None, None]) if mask_p else (
[None, None, None], [None, None, None])
else:
shapes = ([None, None], ([None, None], [None, None], [None, None])), (
[None, None], [None, None], [None, None]) if mask_p else ([None, None], [None, None])
values = (self.form_vocab.safe_pad_token_idx, (0, 0, 0)), \
(0, self.rel_vocab.safe_pad_token_idx, 0) if mask_p else (0, self.rel_vocab.safe_pad_token_idx)
types_shapes_values = types, shapes, values
if self.use_pos:
types_shapes_values = [((shapes[0][0], shapes[0][1] + (shapes[0][0],)), shapes[1]) for shapes in
types_shapes_values]
return types_shapes_values
def X_to_inputs(self, X: Union[tf.Tensor, Tuple[tf.Tensor]]) -> Iterable:
form_batch, feat, prefix_mask = X
sents = []
for form_sent, length in zip(form_batch, tf.math.count_nonzero(prefix_mask, axis=-1)):
forms = tolist(form_sent)[1:length + 1]
sents.append([self.form_vocab.idx_to_token[f] for f in forms])
return sents
def batched_inputs_to_batches(self, corpus, indices, shuffle):
use_pos = self.use_pos
if use_pos:
raw_batch = [[], [], [], []]
else:
raw_batch = [[], [], []]
if self.graph:
max_len = len(max([corpus[i] for i in indices], key=len))
for idx in indices:
arc = np.zeros((max_len, max_len), dtype=bool)  # use builtin bool; np.bool was removed in NumPy 1.24
rel = np.zeros((max_len, max_len), dtype=np.int64)
for b in raw_batch[:2 if use_pos else 1]:
b.append([])
for m, cells in enumerate(corpus[idx]):
if use_pos:
for b, c, v in zip(raw_batch, cells, [None, self.cpos_vocab]):
b[-1].append(v.get_idx_without_add(c) if v else c)
else:
for b, c, v in zip(raw_batch, cells, [None]):
b[-1].append(c)
for n, r in zip(cells[-2], cells[-1]):
arc[m, n] = True
rid = self.rel_vocab.get_idx_without_add(r)
if rid is None:
logger.warning(f'Relation OOV: {r} does not exist in the training set')
continue
rel[m, n] = rid
raw_batch[-2].append(arc)
raw_batch[-1].append(rel)
else:
for idx in indices:
for s in raw_batch:
s.append([])
for cells in corpus[idx]:
if use_pos:
for s, c, v in zip(raw_batch, cells, [None, self.cpos_vocab, None, self.rel_vocab]):
s[-1].append(v.get_idx_without_add(c) if v else c)
else:
for s, c, v in zip(raw_batch, cells, [None, None, self.rel_vocab]):
s[-1].append(v.get_idx_without_add(c) if v else c)
# Transformer tokenizing
config = self.transformer_config
tokenizer = self.tokenizer
xlnet = config_is(config, 'xlnet')
roberta = config_is(config, 'roberta')
pad_token = tokenizer.pad_token
pad_token_id = tokenizer.convert_tokens_to_ids([pad_token])[0]
cls_token = tokenizer.cls_token
sep_token = tokenizer.sep_token
max_seq_length = self.config.max_seq_length
batch_forms = []
batch_input_ids = []
batch_input_mask = []
batch_prefix_offset = []
mask_p = self.mask_p
if mask_p:
batch_masked_offsets = []
mask_token_id = tokenizer.mask_token_id
for sent_idx, sent in enumerate(raw_batch[0]):
batch_forms.append([self.form_vocab.get_idx_without_add(token) for token in sent])
sent = adjust_tokens_for_transformers(sent)
sent = sent[1:]  # drop the root token; [CLS] takes its place
pad_label_idx = self.form_vocab.pad_idx
input_ids, input_mask, segment_ids, prefix_mask = \
convert_examples_to_features(sent,
max_seq_length,
tokenizer,
cls_token_at_end=xlnet,
# xlnet has a cls token at the end
cls_token=cls_token,
cls_token_segment_id=2 if xlnet else 0,
sep_token=sep_token,
sep_token_extra=roberta,
# roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
pad_on_left=xlnet,
# pad on the left for xlnet
pad_token_id=pad_token_id,
pad_token_segment_id=4 if xlnet else 0,
pad_token_label_id=pad_label_idx,
do_padding=False)
num_masks = sum(prefix_mask)
# assert len(sent) == num_masks # each token has a True subtoken
if num_masks < len(sent): # long sent gets truncated, +1 for root
batch_forms[-1] = batch_forms[-1][:num_masks + 1] # form
raw_batch[-1][sent_idx] = raw_batch[-1][sent_idx][:num_masks + 1] # head
raw_batch[-2][sent_idx] = raw_batch[-2][sent_idx][:num_masks + 1] # rel
raw_batch[-3][sent_idx] = raw_batch[-3][sent_idx][:num_masks + 1] # pos
prefix_mask[0] = True # is now [CLS]
prefix_offset = [idx for idx, m in enumerate(prefix_mask) if m]
batch_input_ids.append(input_ids)
batch_input_mask.append(input_mask)
batch_prefix_offset.append(prefix_offset)
if mask_p:
if shuffle:
size = int(np.ceil(mask_p * len(prefix_offset[1:]))) # never mask [CLS]
mask_offsets = np.random.choice(np.arange(1, len(prefix_offset)), size, replace=False)
for offset in sorted(mask_offsets):
assert 0 < offset < len(input_ids)
# mask_word = raw_batch[0][sent_idx][offset]
# mask_prefix = tokenizer.convert_ids_to_tokens([input_ids[prefix_offset[offset]]])[0]
# assert mask_word.startswith(mask_prefix) or mask_prefix.startswith(
# mask_word) or mask_prefix == "'", \
# f'word {mask_word} prefix {mask_prefix} not match' # could vs couldn
# mask_offsets.append(input_ids[offset]) # subword token
# mask_offsets.append(offset) # form token
input_ids[prefix_offset[offset]] = mask_token_id # mask prefix
# whole word masking, mask the rest of the word
for i in range(prefix_offset[offset] + 1, len(input_ids) - 1):
if prefix_mask[i]:
break
input_ids[i] = mask_token_id
batch_masked_offsets.append(sorted(mask_offsets))
else:
batch_masked_offsets.append([0]) # No masking in prediction
batch_forms = tf.keras.preprocessing.sequence.pad_sequences(batch_forms, padding='post',
value=self.form_vocab.safe_pad_token_idx,
dtype='int64')
batch_input_ids = tf.keras.preprocessing.sequence.pad_sequences(batch_input_ids, padding='post',
value=pad_token_id,
dtype='int64')
batch_input_mask = tf.keras.preprocessing.sequence.pad_sequences(batch_input_mask, padding='post',
value=0,
dtype='int64')
batch_prefix_offset = tf.keras.preprocessing.sequence.pad_sequences(batch_prefix_offset, padding='post',
value=0,
dtype='int64')
batch_heads = tf.keras.preprocessing.sequence.pad_sequences(raw_batch[-2], padding='post',
value=0,
dtype='int64')
batch_rels = tf.keras.preprocessing.sequence.pad_sequences(raw_batch[-1], padding='post',
value=self.rel_vocab.safe_pad_token_idx,
dtype='int64')
if mask_p:
batch_masked_offsets = tf.keras.preprocessing.sequence.pad_sequences(batch_masked_offsets, padding='post',
value=pad_token_id,
dtype='int64')
feats = (tf.constant(batch_input_ids, dtype='int64'), tf.constant(batch_input_mask, dtype='int64'),
tf.constant(batch_prefix_offset))
if use_pos:
batch_pos = tf.keras.preprocessing.sequence.pad_sequences(raw_batch[1], padding='post',
value=self.cpos_vocab.safe_pad_token_idx,
dtype='int64')
feats += (batch_pos,)
yield (batch_forms, feats), \
(batch_heads, batch_rels, batch_masked_offsets) if mask_p else (batch_heads, batch_rels)
def len_of_sent(self, sent):
# Transformer tokenizing
config = self.transformer_config
tokenizer = self.tokenizer
xlnet = config_is(config, 'xlnet')
roberta = config_is(config, 'roberta')
pad_token = tokenizer.pad_token
pad_token_id = tokenizer.convert_tokens_to_ids([pad_token])[0]
cls_token = tokenizer.cls_token
sep_token = tokenizer.sep_token
max_seq_length = self.config.max_seq_length
sent = sent[1:]  # drop the root token; [CLS] takes its place
pad_label_idx = self.form_vocab.pad_idx
sent = [x[0] for x in sent]
sent = adjust_tokens_for_transformers(sent)
input_ids, input_mask, segment_ids, prefix_mask = \
convert_examples_to_features(sent,
max_seq_length,
tokenizer,
cls_token_at_end=xlnet,
# xlnet has a cls token at the end
cls_token=cls_token,
cls_token_segment_id=2 if xlnet else 0,
sep_token=sep_token,
sep_token_extra=roberta,
# roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
pad_on_left=xlnet,
# pad on the left for xlnet
pad_token_id=pad_token_id,
pad_token_segment_id=4 if xlnet else 0,
pad_token_label_id=pad_label_idx,
do_padding=False)
return len(input_ids)
def samples_to_dataset(self, samples: Generator, map_x=None, map_y=None, batch_size=5000, shuffle=None, repeat=None,
drop_remainder=False, prefetch=1, cache=True) -> tf.data.Dataset:
if shuffle:
return CoNLL_DEP_Transform.samples_to_dataset(self, samples, map_x, map_y, batch_size, shuffle, repeat,
drop_remainder, prefetch, cache)
def generator():
# custom bucketing, load corpus into memory
corpus = list(x for x in (samples() if callable(samples) else samples))
n_tokens = 0
batch = []
for idx, sent in enumerate(corpus):
sent_len = self.len_of_sent(sent)
if n_tokens + sent_len > batch_size and batch:
yield from self.batched_inputs_to_batches(corpus, batch, shuffle)
n_tokens = 0
batch = []
n_tokens += sent_len
batch.append(idx)
if batch:
yield from self.batched_inputs_to_batches(corpus, batch, shuffle)
# debug for transformer
# next(generator())
return Transform.samples_to_dataset(self, generator, False, False, 0, False, repeat, drop_remainder, prefetch,
cache)
def Y_to_outputs(self, Y: Union[tf.Tensor, Tuple[tf.Tensor]], gold=False, inputs=None, X=None) -> Iterable:
if self.graph:
ys = CoNLL_SDP_Transform.Y_to_outputs(self, Y, gold, inputs, X)
ys = [[([t[0] for t in l], [t[1] for t in l]) for l in y] for y in ys]
return ys
return super().Y_to_outputs(Y, gold, inputs, X)
class CoNLL_SDP_Transform(CoNLLTransform):
def __init__(self, config: SerializableDict = None, map_x=True, map_y=True, lower=True, n_buckets=32, min_freq=2,
use_pos=True, **kwargs) -> None:
super().__init__(config, map_x, map_y, lower, n_buckets, min_freq, use_pos, **kwargs)
self.orphan_relation = ROOT
def lock_vocabs(self):
super().lock_vocabs()
# heuristic to find the orphan relation
self._find_orphan_relation()
def _find_orphan_relation(self):
for rel in self.rel_vocab.idx_to_token:
if 'root' in rel.lower():
self.orphan_relation = rel
break
def file_to_inputs(self, filepath: str, gold=True):
assert gold, 'only gold files are supported for now'
use_pos = self.use_pos
conllu = filepath.endswith('.conllu')
enhanced_only = self.config.get('enhanced_only', None)
for i, sent in enumerate(read_conll(filepath)):
parsed_sent = []
if conllu:
for cell in sent:
ID = cell[0]
form = cell[1]
cpos = cell[3]
head = cell[6]
deprel = cell[7]
deps = cell[8]
deps = [x.split(':', 1) for x in deps.split('|')]
heads = [int(x[0]) for x in deps if x[0].isdigit()]
rels = [x[1] for x in deps if x[0].isdigit()]
if enhanced_only:
if head in heads:
offset = heads.index(head)
heads.pop(offset)
rels.pop(offset)
else:
if head not in heads:
heads.append(head)
rels.append(deprel)
parsed_sent.append([form, cpos, heads, rels] if use_pos else [form, heads, rels])
else:
prev_cells = None
heads = []
rels = []
for j, cell in enumerate(sent):
ID = cell[0]
form = cell[1]
cpos = cell[3]
head = cell[6]
deprel = cell[7]
if prev_cells and ID != prev_cells[0]: # found end of token
parsed_sent.append(
[prev_cells[1], prev_cells[2], heads, rels] if use_pos else [prev_cells[1], heads, rels])
heads = []
rels = []
heads.append(head)
rels.append(deprel)
prev_cells = [ID, form, cpos, head, deprel] if use_pos else [ID, form, head, deprel]
parsed_sent.append(
[prev_cells[1], prev_cells[2], heads, rels] if use_pos else [prev_cells[1], heads, rels])
yield parsed_sent
def fit(self, trn_path: str, **kwargs) -> int:
self.form_vocab = VocabTF()
self.form_vocab.add(ROOT)  # make root the 2nd element while the 0th is pad and the 1st is unk
if self.use_pos:
self.cpos_vocab = VocabTF(pad_token=None, unk_token=None)
self.rel_vocab = VocabTF(pad_token=None, unk_token=None)
num_samples = 0
counter = Counter()
for sent in self.file_to_samples(trn_path, gold=True):
num_samples += 1
for idx, cell in enumerate(sent):
if len(cell) == 4:
form, cpos, head, deprel = cell
elif len(cell) == 3:
if self.use_pos:
form, cpos = cell[0]
else:
form = cell[0]
head, deprel = cell[1:]
else:
raise ValueError('Unknown data arrangement')
if idx == 0:
root = form
else:
counter[form] += 1
if self.use_pos:
self.cpos_vocab.add(cpos)
self.rel_vocab.update(deprel)
for token in [token for token, freq in counter.items() if freq >= self.config.min_freq]:
self.form_vocab.add(token)
return num_samples
def inputs_to_samples(self, inputs, gold=False):
use_pos = self.use_pos
for sent in inputs:
sample = []
for i, cell in enumerate(sent):
if isinstance(cell, tuple):
cell = list(cell)
elif isinstance(cell, str):
cell = [cell]
if self.config['lower']:
cell[0] = cell[0].lower()
if not gold:
cell += [[0], [self.rel_vocab.safe_pad_token]]
sample.append(cell)
# insert a root word with arbitrary fields; it will be masked anyway
if use_pos:
form, cpos, head, deprel = sample[0]
sample.insert(0, [self.bos, self.bos, [0], deprel])
else:
form, head, deprel = sample[0]
sample.insert(0, [self.bos, [0], deprel])
yield sample
def batched_inputs_to_batches(self, corpus, indices, shuffle):
use_pos = self.use_pos
raw_batch = [[], [], [], []] if use_pos else [[], [], []]
max_len = len(max([corpus[i] for i in indices], key=len))
for idx in indices:
arc = np.zeros((max_len, max_len), dtype=bool)
rel = np.zeros((max_len, max_len), dtype=np.int64)
for b in raw_batch[:2]:
b.append([])
for m, cells in enumerate(corpus[idx]):
if use_pos:
for b, c, v in zip(raw_batch, cells,
[self.form_vocab, self.cpos_vocab]):
b[-1].append(v.get_idx_without_add(c))
else:
for b, c, v in zip(raw_batch, cells,
[self.form_vocab]):
b[-1].append(v.get_idx_without_add(c))
for n, r in zip(cells[-2], cells[-1]):
arc[m, n] = True
rid = self.rel_vocab.get_idx_without_add(r)
if rid is None:
logger.warning(f'Relation OOV: {r} does not exist in the training set')
continue
rel[m, n] = rid
raw_batch[-2].append(arc)
raw_batch[-1].append(rel)
batch = []
for b, v in zip(raw_batch, [self.form_vocab, self.cpos_vocab]):
b = tf.keras.preprocessing.sequence.pad_sequences(b, padding='post',
value=v.safe_pad_token_idx,
dtype='int64')
batch.append(b)
batch += raw_batch[2:]
assert len(batch) == 4
yield (batch[0], batch[1]), (batch[2], batch[3])
def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]:
types = (tf.int64, tf.int64), (tf.bool, tf.int64)
shapes = ([None, None], [None, None]), ([None, None, None], [None, None, None])
values = (self.form_vocab.safe_pad_token_idx, self.cpos_vocab.safe_pad_token_idx), (
False, self.rel_vocab.safe_pad_token_idx)
return types, shapes, values
def Y_to_outputs(self, Y: Union[tf.Tensor, Tuple[tf.Tensor]], gold=False, inputs=None, X=None) -> Iterable:
arc_preds, rel_preds, mask = Y
sents = []
for arc_sent, rel_sent, length in zip(arc_preds, rel_preds,
tf.math.count_nonzero(mask, axis=-1)):
sent = []
for arc, rel in zip(tolist(arc_sent[1:, 1:]), tolist(rel_sent[1:, 1:])):
ar = []
for idx, (a, r) in enumerate(zip(arc, rel)):
if a:
ar.append((idx + 1, self.rel_vocab.idx_to_token[r]))
if not ar:
# orphan
ar.append((0, self.orphan_relation))
sent.append(ar)
sents.append(sent)
return sents
def XY_to_inputs_outputs(self, X: Union[tf.Tensor, Tuple[tf.Tensor]], Y: Union[tf.Tensor, Tuple[tf.Tensor]],
gold=False, inputs=None, conll=True) -> Iterable:
(words, feats, mask), (arc_preds, rel_preds) = X, Y
xs = inputs
ys = self.Y_to_outputs((arc_preds, rel_preds, mask))
sents = []
for x, y in zip(xs, ys):
sent = CoNLLSentence()
for idx, ((form, cpos), pred) in enumerate(zip(x, y)):
head = [p[0] for p in pred]
deprel = [p[1] for p in pred]
if conll:
sent.append(CoNLLWord(id=idx + 1, form=form, cpos=cpos, head=head, deprel=deprel))
else:
sent.append([head, deprel])
sents.append(sent)
return sents
================================================
FILE: hanlp/transform/glue_tf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-05-08 16:34
from hanlp_common.structure import SerializableDict
from hanlp.datasets.glu.glue import STANFORD_SENTIMENT_TREEBANK_2_TRAIN, MICROSOFT_RESEARCH_PARAPHRASE_CORPUS_DEV
from hanlp.transform.table_tf import TableTransform
class StanfordSentimentTreebank2Transorm(TableTransform):
pass
class MicrosoftResearchParaphraseCorpus(TableTransform):
def __init__(self, config: SerializableDict = None, map_x=False, map_y=True, x_columns=(3, 4),
y_column=0, skip_header=True, delimiter='auto', **kwargs) -> None:
super().__init__(config, map_x, map_y, x_columns, y_column, skip_header, delimiter, **kwargs)
def main():
# _test_sst2()
_test_mrpc()
def _test_sst2():
transform = StanfordSentimentTreebank2Transorm()
transform.fit(STANFORD_SENTIMENT_TREEBANK_2_TRAIN)
transform.lock_vocabs()
transform.label_vocab.summary()
transform.build_config()
dataset = transform.file_to_dataset(STANFORD_SENTIMENT_TREEBANK_2_TRAIN)
for batch in dataset.take(1):
print(batch)
def _test_mrpc():
transform = MicrosoftResearchParaphraseCorpus()
transform.fit(MICROSOFT_RESEARCH_PARAPHRASE_CORPUS_DEV)
transform.lock_vocabs()
transform.label_vocab.summary()
transform.build_config()
dataset = transform.file_to_dataset(MICROSOFT_RESEARCH_PARAPHRASE_CORPUS_DEV)
for batch in dataset.take(1):
print(batch)
================================================
FILE: hanlp/transform/table_tf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-11-10 21:00
from abc import ABC
from typing import Tuple, Union
import numpy as np
import tensorflow as tf
from hanlp_common.structure import SerializableDict
from hanlp.common.transform_tf import Transform
from hanlp_common.constant import PAD
from hanlp.common.vocab_tf import create_label_vocab
from hanlp.utils.io_util import read_cells
from hanlp.utils.log_util import logger
class TableTransform(Transform, ABC):
def __init__(self, config: SerializableDict = None, map_x=False, map_y=True, x_columns=None,
y_column=-1, multi_label=False,
skip_header=True, delimiter='auto', **kwargs) -> None:
super().__init__(config, map_x, map_y, x_columns=x_columns, y_column=y_column, multi_label=multi_label,
skip_header=skip_header,
delimiter=delimiter, **kwargs)
self.label_vocab = create_label_vocab()
def file_to_inputs(self, filepath: str, gold=True):
x_columns = self.config.x_columns
y_column = self.config.y_column
num_features = self.config.get('num_features', None)
for cells in read_cells(filepath, skip_header=self.config.skip_header, delimiter=self.config.delimiter):
# multi-label: dataset in .tsv format. x_columns: at most 2 columns forming a sentence pair, while in most
# cases just one column holds the doc content. y_column holds the single label, which shall be modified
# to load a list of labels.
if x_columns:
inputs = tuple(c for i, c in enumerate(cells) if i in x_columns), cells[y_column]
else:
if y_column != -1:
cells[-1], cells[y_column] = cells[y_column], cells[-1]
inputs = tuple(cells[:-1]), cells[-1]
if num_features is None:
num_features = len(inputs[0])
self.config.num_features = num_features
# multi-label support
if self.config.get('multi_label', None):
assert type(inputs[1]) is str, 'Y value has to be string'
if inputs[1][0] == '[':
# multi-label is in literal form of a list
labels = eval(inputs[1])
else:
labels = inputs[1].strip().split(',')
inputs = inputs[0], labels
else:
assert num_features == len(inputs[0]), f'Number of columns {num_features} ' \
f'inconsistent with current {len(inputs[0])}'
yield inputs
def inputs_to_samples(self, inputs, gold=False):
pad = self.label_vocab.safe_pad_token
for cells in inputs:
if gold:
yield cells
else:
yield cells, pad
def y_to_idx(self, y) -> tf.Tensor:
return self.label_vocab.lookup(y)
def fit(self, trn_path: str, **kwargs):
samples = 0
for t in self.file_to_samples(trn_path, gold=True):
if self.config.get('multi_label', None):
for l in t[1]:
self.label_vocab.add(l)
else:
self.label_vocab.add(t[1])  # the second element, regardless of whether t is a pair or a triple
samples += 1
return samples
def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]:
num_features = self.config.num_features
# It's crucial to use tuple instead of list for all three
types = tuple([tf.string] * num_features), tf.string
shapes = tuple([[]] * num_features), []
values = tuple([PAD] * num_features), self.label_vocab.safe_pad_token
return types, shapes, values
def x_to_idx(self, x) -> Union[tf.Tensor, Tuple]:
logger.warning('TableTransform can not map x to idx. Please override x_to_idx')
return x
================================================
FILE: hanlp/transform/tacred_tf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-03-14 17:06
from typing import Union, Tuple
import tensorflow as tf
from hanlp_common.structure import SerializableDict
from hanlp.common.transform_tf import Transform
from hanlp.common.vocab_tf import VocabTF
from hanlp_common.io import load_json
from hanlp_common.util import merge_locals_kwargs
def get_positions(start_idx, end_idx, length):
"""Get subj/obj position sequence.
Args:
start_idx:
end_idx:
length:
Returns:
"""
return list(range(-start_idx, 0)) + [0] * (end_idx - start_idx + 1) + \
list(range(1, length - end_idx))
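# Worked example (distances are relative to the span; 0 marks the span itself):
#     >>> get_positions(2, 3, 6)
#     [-2, -1, 0, 0, 1, 2]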
class TACREDTransform(Transform):
def __init__(self, config: SerializableDict = None, map_x=True, map_y=True, lower=False, **kwargs) -> None:
super().__init__(**merge_locals_kwargs(locals(), kwargs))
self.token_vocab = VocabTF()
self.pos_vocab = VocabTF(pad_token=None, unk_token=None)
self.ner_vocab = VocabTF(pad_token=None)
self.deprel_vocab = VocabTF(pad_token=None, unk_token=None)
self.rel_vocab = VocabTF(pad_token=None, unk_token=None)
def fit(self, trn_path: str, **kwargs) -> int:
count = 0
for (tokens, pos, ner, head, deprel, subj_positions, obj_positions, subj_type,
obj_type), relation in self.file_to_samples(
trn_path, gold=True):
count += 1
self.token_vocab.update(tokens)
self.pos_vocab.update(pos)
self.ner_vocab.update(ner)
self.deprel_vocab.update(deprel)
self.rel_vocab.add(relation)
return count
def file_to_inputs(self, filepath: str, gold=True):
data = load_json(filepath)
for d in data:
tokens = list(d['token'])
ss, se = d['subj_start'], d['subj_end']
os, oe = d['obj_start'], d['obj_end']
pos = d['stanford_pos']
ner = d['stanford_ner']
deprel = d['stanford_deprel']
head = [int(x) for x in d['stanford_head']]
assert any([x == 0 for x in head])
relation = d['relation']
yield (tokens, pos, ner, head, deprel, ss, se, os, oe), relation
def inputs_to_samples(self, inputs, gold=False):
for input in inputs:
if gold:
(tokens, pos, ner, head, deprel, ss, se, os, oe), relation = input
else:
tokens, pos, ner, head, deprel, ss, se, os, oe = input
relation = self.rel_vocab.safe_pad_token
l = len(tokens)
subj_positions = get_positions(ss, se, l)
obj_positions = get_positions(os, oe, l)
subj_type = ner[ss]
obj_type = ner[os]
# anonymize tokens
tokens[ss:se + 1] = ['SUBJ-' + subj_type] * (se - ss + 1)
tokens[os:oe + 1] = ['OBJ-' + obj_type] * (oe - os + 1)
# min head is 0, but root is not included in tokens, so take 1 off from each head
head = [h - 1 for h in head]
yield (tokens, pos, ner, head, deprel, subj_positions, obj_positions, subj_type, obj_type), relation
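# Worked example (illustrative): with tokens ['Bill', 'Gates', 'founded', 'Microsoft'],
# subj span (0, 1) typed PERSON and obj span (3, 3) typed ORGANIZATION, the anonymized
# tokens become ['SUBJ-PERSON', 'SUBJ-PERSON', 'founded', 'OBJ-ORGANIZATION'], with
# subj_positions == [0, 0, 1, 2] and obj_positions == [-3, -2, -1, 0].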
def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]:
# (tokens, pos, ner, head, deprel, subj_positions, obj_positions, subj_type, obj_type), relation
types = (tf.string, tf.string, tf.string, tf.int32, tf.string, tf.int32, tf.int32, tf.string,
tf.string), tf.string
shapes = ([None], [None], [None], [None], [None], [None], [None], [], []), []
pads = (self.token_vocab.safe_pad_token, self.pos_vocab.safe_pad_token, self.ner_vocab.safe_pad_token, 0,
self.deprel_vocab.safe_pad_token,
0, 0, self.ner_vocab.safe_pad_token, self.ner_vocab.safe_pad_token), self.rel_vocab.safe_pad_token
return types, shapes, pads
def x_to_idx(self, x) -> Union[tf.Tensor, Tuple]:
tokens, pos, ner, head, deprel, subj_positions, obj_positions, subj_type, obj_type = x
tokens = self.token_vocab.lookup(tokens)
pos = self.pos_vocab.lookup(pos)
ner = self.ner_vocab.lookup(ner)
deprel = self.deprel_vocab.lookup(deprel)
subj_type = self.ner_vocab.lookup(subj_type)
obj_type = self.ner_vocab.lookup(obj_type)
return tokens, pos, ner, head, deprel, subj_positions, obj_positions, subj_type, obj_type
def y_to_idx(self, y) -> tf.Tensor:
return self.rel_vocab.lookup(y)
================================================
FILE: hanlp/transform/text_tf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-04 11:46
from typing import Union, Tuple, Iterable, Any
import tensorflow as tf
from hanlp_common.structure import SerializableDict
from hanlp.common.transform_tf import Transform
from hanlp.common.vocab_tf import VocabTF
from hanlp.metrics.chunking.sequence_labeling import get_entities
from hanlp.utils.file_read_backwards import FileReadBackwards
from hanlp.utils.io_util import read_tsv_as_sents
class TextTransform(Transform):
def __init__(self,
forward=True,
seq_len=10,
tokenizer='char',
config: SerializableDict = None, map_x=True, map_y=True, **kwargs) -> None:
super().__init__(config, map_x, map_y, seq_len=seq_len, tokenizer=tokenizer, forward=forward, **kwargs)
self.vocab: VocabTF = None
def tokenize_func(self):
if self.config.tokenizer == 'char':
return list
elif self.config.tokenizer == 'whitespace':
return lambda x: x.split()
else:
return lambda x: x.split(self.config.tokenizer)
def fit(self, trn_path: str, **kwargs) -> int:
self.vocab = VocabTF()
num_samples = 0
for x, y in self.file_to_inputs(trn_path):
self.vocab.update(x)
num_samples += 1
return num_samples
def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]:
types = tf.string, tf.string
shapes = [None], [None]
defaults = self.vocab.pad_token, self.vocab.pad_token
return types, shapes, defaults
def file_to_inputs(self, filepath: str, gold=True):
forward = self.config.forward
seq_len = self.config.seq_len
buffer = []
tokenizer = self.tokenize_func()
with open(filepath, encoding='utf-8') if forward else FileReadBackwards(filepath, encoding="utf-8") as src:
for line in src:
tokens = tokenizer(line)
buffer += tokens
while len(buffer) > seq_len:
yield buffer[:seq_len], buffer[1:1 + seq_len]
buffer.pop(0)
def inputs_to_samples(self, inputs, gold=False):
forward = self.config.forward
for t in inputs:
if gold:
x, y = t
else:
x, y = t, t
if not forward:
x = list(reversed(x))
y = list(reversed(y))
yield x, y
def x_to_idx(self, x) -> Union[tf.Tensor, Tuple]:
return self.vocab.lookup(x)
def y_to_idx(self, y) -> tf.Tensor:
return self.x_to_idx(y)
def Y_to_outputs(self, Y: Union[tf.Tensor, Tuple[tf.Tensor]], gold=False, inputs=None, **kwargs) -> Iterable:
pred = tf.argmax(Y, axis=-1)
for ys, ms in zip(pred, inputs):
ret = []
for y in ys:
ret.append(self.vocab.idx_to_token[int(y)])
yield ret
def input_is_single_sample(self, input: Any) -> bool:
return isinstance(input[0], str)
def bmes_to_flat(inpath, outpath):
with open(outpath, 'w', encoding='utf-8') as out:
for sent in read_tsv_as_sents(inpath):
chunks = get_entities([cells[1] for cells in sent])
chars = [cells[0] for cells in sent]
words = []
for tag, start, end in chunks:
word = ''.join(chars[start: end])
words.append(word)
out.write(' '.join(f'{word}/{tag}' for word, (tag, _, _) in zip(words, chunks)))
out.write('\n')
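# Worked example (assuming BMES-style tags with type suffixes): a TSV input whose
# rows are ('商', 'B-NN'), ('品', 'E-NN'), ('好', 'S-VA') would be written out as
# the flat line '商品/NN 好/VA'.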
================================================
FILE: hanlp/transform/transformer_tokenizer.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-05-03 16:23
import warnings
from typing import Union, Optional
from hanlp_common.constant import BOS, EOS
from hanlp_common.structure import SerializableDict
from hanlp.layers.transformers.pt_imports import PreTrainedTokenizer, PretrainedConfig, AutoTokenizer_
from hanlp_trie import DictInterface
class TransformerTokenizer(object):
def __init__(self, max_seq_length=512, truncate_long_sequences=True) -> None:
self.truncate_long_sequences = truncate_long_sequences
self.max_seq_length = max_seq_length
def sliding_window(self, flat_wordpiece_ids, same_tail=True):
if same_tail:
start_piece_ids, flat_wordpiece_ids, end_piece_ids = flat_wordpiece_ids[:1], \
flat_wordpiece_ids[1:-1], flat_wordpiece_ids[-1:]
else:
start_piece_ids, flat_wordpiece_ids, end_piece_ids = flat_wordpiece_ids[:1], \
flat_wordpiece_ids[1:], []
window_length = self.max_seq_length - len(start_piece_ids) - len(end_piece_ids)
stride = window_length // 2
wordpiece_windows = [start_piece_ids + flat_wordpiece_ids[i:i + window_length] + end_piece_ids
for i in range(0, len(flat_wordpiece_ids), stride)]
# Check for overlap in the last window. Throw it away if it is redundant.
last_window = wordpiece_windows[-1][1:]
penultimate_window = wordpiece_windows[-2]
if last_window == penultimate_window[-len(last_window):]:
wordpiece_windows = wordpiece_windows[:-1]
wordpiece_ids = [wordpiece for sequence in wordpiece_windows for wordpiece in sequence]
return wordpiece_ids
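# Worked example (assuming max_seq_length=6 and same_tail=True): for the ids
# [CLS, a, b, c, d, e, f, g, h, SEP], window_length = 6 - 1 - 1 = 4 and stride = 2,
# yielding windows [CLS a b c d SEP], [CLS c d e f SEP], [CLS e f g h SEP] and
# [CLS g h SEP]; the last window duplicates the tail of the penultimate one, so it
# is discarded before the windows are flattened.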
class TransformerTextTokenizer(TransformerTokenizer):
_KEY = ['input_ids', 'attention_mask', 'token_type_ids']
def __init__(self,
tokenizer: Union[PreTrainedTokenizer, str],
text_a_key: str,
text_b_key: str = None,
output_key=None,
max_seq_length=512, truncate_long_sequences=True) -> None:
super().__init__(max_seq_length, truncate_long_sequences)
self.text_b = text_b_key
self.text_a = text_a_key
if output_key is None:
output_key = self.text_a
if text_b_key:
output_key += '_' + text_b_key
if output_key == '':
output_key = self._KEY
else:
output_key = [f'{output_key}_{key}' for key in self._KEY]
self.output_key = output_key
if isinstance(tokenizer, str):
tokenizer = AutoTokenizer_.from_pretrained(tokenizer)
self.tokenizer = tokenizer
def __call__(self, sample: dict):
text_a = sample[self.text_a]
text_b = sample[self.text_b] if self.text_b else None
max_seq_length = self.max_seq_length if self.truncate_long_sequences else None
encoding = self.tokenizer.encode_plus(text_a, text_b, max_length=max_seq_length)
results = dict((k, encoding.data.get(k, None)) for k in self._KEY)
if not self.truncate_long_sequences and len(results['input_ids']) > self.max_seq_length:
# TODO: other fields should be properly handled too
results['input_ids'] = self.sliding_window(results['input_ids'])
if not results['token_type_ids']:
results['token_type_ids'] = encoding[0].type_ids
for k, v in zip(self.output_key, [results[_] for _ in self._KEY]):
sample[k] = v
return sample
class TransformerSequenceTokenizer(TransformerTokenizer):
def __init__(self,
tokenizer: Union[PreTrainedTokenizer, str],
input_key,
output_key=None,
max_seq_length=512,
truncate_long_sequences=False,
config: PretrainedConfig = None,
cls_token_at_end=False,
cls_token_segment_id=0,
pad_token_segment_id=0,
pad_on_left=False,
do_padding=False,
sep_token_extra=False,
ret_mask_and_type=False,
ret_prefix_mask=False,
ret_token_span=True,
ret_subtokens=False,
ret_subtokens_group=False,
cls_is_bos=False,
sep_is_eos=False,
do_basic_tokenize=True,
use_fast=True,
dict_force=None,
strip_cls_sep=True,
check_space_before=None,
) -> None:
"""A transformer tokenizer for token-level tasks. It honors the boundary of tokens and tokenize each token into
several subtokens then merge them. The information about each subtoken belongs to which token are kept and
returned as a new field in the sample. It also provides out-of-box sliding window trick on long sequences.
Args:
tokenizer: The identifier of a pre-trained tokenizer or a ``PreTrainedTokenizer``.
input_key: The token key in samples.
output_key: The output keys to store results.
max_seq_length: Sentences longer than ``max_seq_len`` will be split into shorter ones if possible.
truncate_long_sequences: ``True`` to truncate exceeded parts of long sequences. ``False`` to enable
sliding window.
config: The ``PretrainedConfig`` to determine the model structure of the transformer, so that special
tokenization can be applied.
cls_token_at_end: ``True`` to put ``[CLS]`` at the end of input tokens.
cls_token_segment_id: The segment id of ``[CLS]``.
pad_token_segment_id: The segment id used for padding tokens.
pad_on_left: ``True`` to put ``[PAD]`` at the left side of input tokens.
do_padding: ``True`` to pad sequences to ``max_seq_length``.
sep_token_extra: ``True`` to have two ``[SEP]``.
ret_mask_and_type: ``True`` to return masks and type ids.
ret_prefix_mask: ``True`` to generate a mask where each non-zero element corresponds to a prefix of a token.
ret_token_span: ``True`` to return span of each token measured by subtoken offsets.
ret_subtokens: ``True`` to return the list of subtokens belonging to each token, for tokenization purposes.
When enabled, the prefix mask for each subtoken is set to True, as each subtoken is a token unit in
a tokenization task. Similarly, the token span for each token will be a continuous integer sequence.
ret_subtokens_group: ``True`` to return list of offsets of subtokens belonging to each token.
cls_is_bos: ``True`` means the first token of the input is treated as [CLS] no matter what its surface form is.
``False`` (default) means the first token is not [CLS]; it will have its own embedding rather than
the embedding of [CLS].
sep_is_eos: ``True`` means the last token of input is [SEP].
``False`` means it's not but [SEP] will be appended,
``None`` means it depends on ``input[-1] == [EOS]``.
do_basic_tokenize: Whether to do basic tokenization before wordpiece.
use_fast: Whether or not to try to load the fast version of the tokenizer.
dict_force: A dictionary doing longest-prefix-match on input text so that the head and tail of each keyword
won't be concatenated to other tokens by transformer tokenizers.
strip_cls_sep: ``True`` to strip [CLS] and [SEP] off the input tokens.
check_space_before: ``True`` to detect the space before each token in order to handle the underline prefix
in SentencePiece tokenization.
Examples:
.. highlight:: python
.. code-block:: python
transform = TransformerSequenceTokenizer('bert-base-uncased', 'token')
sample = {'token': 'HanLP good'.split()}
print(transform(sample))
"""
super().__init__(max_seq_length, truncate_long_sequences)
tokenizer_name = tokenizer if isinstance(tokenizer, str) else tokenizer.name_or_path
if check_space_before is None:
# These tokenizers are BPE-based; they prepend a space to each token, tokenizing 'loving' into
# ['▁lo', 'ving'] and 商品 into ['▁', '商品']. For the latter case, the prefix '▁' has to be removed
# as there are no spaces between tokens in some languages like Chinese
check_space_before = tokenizer_name in ('xlm-roberta-base', 'xlm-roberta-large', 'google/mt5-small',
'google/mt5-base', 'xlm-roberta-base-no-space',
'mMiniLMv2L6-no-space', 'mMiniLMv2L12-no-space')
self.check_space_before = check_space_before
self.ret_subtokens_group = ret_subtokens_group
self.ret_subtokens = ret_subtokens
self.sep_is_eos = sep_is_eos
self.ret_prefix_mask = ret_prefix_mask
self.ret_mask_and_type = ret_mask_and_type
self.cls_is_bos = cls_is_bos
self.ret_token_span = ret_token_span
if not output_key or isinstance(output_key, str):
suffixes = ['input_ids']
if ret_mask_and_type:
suffixes += 'attention_mask', 'token_type_ids'
if ret_prefix_mask:
suffixes += ['prefix_mask']
if ret_token_span:
suffixes.append('token_span')
if output_key is None:
output_key = [f'{input_key}_{key}' for key in suffixes]
elif output_key == '':
output_key = suffixes
else:
output_key = [f'{output_key}_{key}' for key in suffixes]
self.input_key = input_key
self.output_key = output_key
if config:
xlnet = config_is(config, 'xlnet')
pad_token_segment_id = 4 if xlnet else 0
cls_token_segment_id = 2 if xlnet else 0
cls_token_at_end = xlnet
pad_on_left = xlnet
if isinstance(tokenizer, str):
tokenizer = AutoTokenizer_.from_pretrained(tokenizer, use_fast=use_fast,
do_basic_tokenize=do_basic_tokenize)
if use_fast:
# Dirty fix upstream bug: https://github.com/hankcs/HanLP/issues/1602
if hasattr(tokenizer, '_tokenizer') and hasattr(tokenizer._tokenizer, 'no_truncation'):
_t = tokenizer._tokenizer
_t.no_truncation()
_t.no_padding()
_t.no_truncation = _t.no_padding = lambda: None
pad_token = tokenizer.pad_token
self.pad_token_id = tokenizer.convert_tokens_to_ids([pad_token])[0]
self.pad_token_segment_id = pad_token_segment_id
if tokenizer_name in ('google/mt5-small', 'google/mt5-base'):
# mt5 doesn't have cls or sep, but we can use something similar
self.has_cls = False
self.cls_token = '▁'
self.cls_token_id = tokenizer.convert_tokens_to_ids(self.cls_token)
self.sep_token = tokenizer.eos_token
self.sep_token_id = tokenizer.eos_token_id
else:
self.has_cls = True
self.cls_token = tokenizer.cls_token
self.sep_token = tokenizer.sep_token
self.cls_token_segment_id = cls_token_segment_id
self.cls_token_id = tokenizer.cls_token_id
self.sep_token_id = tokenizer.sep_token_id
self.sep_token_extra = sep_token_extra
self.cls_token_at_end = cls_token_at_end
self.tokenizer = tokenizer
self.pad_on_left = pad_on_left
self.do_padding = do_padding
if self.ret_token_span or not self.truncate_long_sequences:
assert not self.cls_token_at_end
assert not self.pad_on_left
# if self.ret_subtokens:
# if not use_fast:
# raise NotImplementedError(
# 'ret_subtokens is not available when using Python tokenizers. '
# 'To use this feature, set use_fast = True.')
self.dict: Optional[DictInterface] = dict_force # For tokenization of raw text
self.strip_cls_sep = strip_cls_sep
def __call__(self, sample: dict):
input_tokens = sample[self.input_key]
input_is_str = isinstance(input_tokens, str)
tokenizer = self.tokenizer
ret_token_span = self.ret_token_span
if input_is_str: # This happens in a tokenizer component where the raw sentence is fed.
# noinspection PyShadowingNames
def tokenize_str(input_str, add_special_tokens=True):
if tokenizer.is_fast:
encoding = tokenizer.encode_plus(input_str,
return_offsets_mapping=True,
add_special_tokens=add_special_tokens).encodings[0]
subtoken_offsets = encoding.offsets
input_tokens = encoding.tokens
input_ids = encoding.ids
# Fill up missing non-blank characters swallowed by HF tokenizer
offset = 0
fixed_offsets = []
fixed_tokens = []
fixed_ids = []
for token, id, (b, e) in zip(input_tokens, input_ids, subtoken_offsets):
if b > offset:
missing_token = input_str[offset: b]
if not missing_token.isspace(): # In the future, we may want space back
fixed_tokens.append(missing_token)
fixed_ids.append(tokenizer.unk_token_id)
fixed_offsets.append((offset, b))
if e == offset: # LI™ -> LIT + M
if fixed_offsets and fixed_offsets[-1][0] < b:
fixed_offsets[-1] = (fixed_offsets[-1][0], b)
fixed_tokens.append(token)
fixed_ids.append(id)
fixed_offsets.append((b, e))
offset = e
subtoken_offsets = fixed_offsets
input_tokens = fixed_tokens
input_ids = fixed_ids
if add_special_tokens:
subtoken_offsets = subtoken_offsets[1 if self.has_cls else 0:-1]
# Edge case where the input_str is swallowed whole
if input_str and not subtoken_offsets and not input_str.isspace():
__index = 1 if add_special_tokens and self.has_cls else 0
input_tokens.insert(__index, input_str)
input_ids.insert(__index, tokenizer.unk_token_id)
subtoken_offsets.append((0, len(input_str)))
if not self.has_cls:
input_tokens = [self.cls_token] + input_tokens
input_ids = [self.cls_token_id] + input_ids
else:
input_tokens = tokenizer.tokenize(input_str)
subtoken_offsets = []
_o = 0
for each in input_tokens:
subtoken_offsets.append((_o, _o + len(each)))
_o += len(each)
if add_special_tokens:
input_tokens = [self.cls_token] + input_tokens + [self.sep_token]
input_ids = tokenizer.convert_tokens_to_ids(input_tokens)
if self.check_space_before:
non_blank_offsets = [i for i in range(len(input_tokens)) if input_tokens[i] != '▁']
if add_special_tokens and not self.has_cls:
non_blank_offsets.insert(0, 0)
input_tokens = [input_tokens[i] for i in non_blank_offsets]
input_ids = [input_ids[i] for i in non_blank_offsets]
if add_special_tokens:
non_blank_offsets = non_blank_offsets[1:-1]
subtoken_offsets = [subtoken_offsets[i - 1] for i in non_blank_offsets]
else:
subtoken_offsets = [subtoken_offsets[i] for i in non_blank_offsets]
# MT5 generates tokens like ▁of, which is bad for the tokenizer. So we want to remove the prefix.
for i, token in enumerate(input_tokens[1:-1] if add_special_tokens else input_tokens):
if input_str[subtoken_offsets[i][0]] == ' ':
subtoken_offsets[i] = (subtoken_offsets[i][0] + 1, subtoken_offsets[i][1])
# The following block will tokenize each empty string (space) into an unk token
# if add_special_tokens:
# if len(input_tokens) == 2: # bos and eos, meaning that the text contains only some spaces
# input_tokens.insert(1, input_str)
# input_ids.insert(1, tokenizer.unk_token_id)
# subtoken_offsets.append((0, len(input_str)))
# else:
# if not input_ids: # This chunk might be some control chars getting removed by tokenizer
# input_tokens = [input_str]
# input_ids = [tokenizer.unk_token_id]
# subtoken_offsets = [(0, len(input_str))]
return input_tokens, input_ids, subtoken_offsets
if self.dict:
chunks = self.dict.split(sample.get(f'{self.input_key}_', input_tokens)) # Match original text directly
_input_tokens, _input_ids, _subtoken_offsets = [self.cls_token], [self.cls_token_id], []
_offset = 0
custom_words = sample['custom_words'] = []
char_offset = 0
for chunk in chunks:
if isinstance(chunk, str): # Use transformed text as it's what models are trained on
chunk = input_tokens[char_offset:char_offset + len(chunk)]
tokens, ids, offsets = tokenize_str(chunk, add_special_tokens=False)
char_offset += len(chunk)
else:
begin, end, label = chunk
_offset = begin
# chunk offset is on char level, at this moment, there is no concept of tokens, just subtokens
if isinstance(label, list):
tokens, ids, offsets, delta = [], [], [], 0
for token in label:
_tokens, _ids, _offsets = tokenize_str(token, add_special_tokens=False)
tokens.extend(_tokens)
# track the subword offset of this chunk, -1 for [CLS]
custom_words.append(
(len(_input_ids) + len(ids) - 1, len(_input_ids) + len(ids) - 1 + len(_ids), token))
ids.extend(_ids)
offsets.extend((x[0] + delta, x[1] + delta) for x in _offsets)
delta = offsets[-1][-1]
else:
tokens, ids, offsets = tokenize_str(input_tokens[begin:end], add_special_tokens=False)
# offsets = [(offsets[0][0], offsets[-1][-1])]
custom_words.append((len(_input_ids) - 1, len(_input_ids) + len(ids) - 1, label))
char_offset = end
_input_tokens.extend(tokens)
_input_ids.extend(ids)
_subtoken_offsets.extend((x[0] + _offset, x[1] + _offset) for x in offsets)
_offset = _subtoken_offsets[-1][-1]
subtoken_offsets = _subtoken_offsets
input_tokens = _input_tokens + [self.sep_token]
input_ids = _input_ids + [self.sep_token_id]
else:
input_tokens, input_ids, subtoken_offsets = tokenize_str(input_tokens, add_special_tokens=True)
if self.ret_subtokens:
sample[f'{self.input_key}_subtoken_offsets'] = subtoken_offsets
cls_is_bos = self.cls_is_bos
if cls_is_bos is None:
cls_is_bos = input_tokens[0] == BOS
sep_is_eos = self.sep_is_eos
if sep_is_eos is None:
sep_is_eos = input_tokens[-1] == EOS
if self.strip_cls_sep:
if cls_is_bos:
input_tokens = input_tokens[1:]
if sep_is_eos:
input_tokens = input_tokens[:-1]
if not self.ret_mask_and_type: # only need input_ids and token_span, use a light version
if input_is_str:
prefix_mask = self._init_prefix_mask(input_ids)
else:
if input_tokens:
return_offsets_mapping = tokenizer.is_fast and self.ret_subtokens
encodings = tokenizer.batch_encode_plus(
input_tokens,
return_offsets_mapping=return_offsets_mapping, # Many tokenizers do not offer fast version
add_special_tokens=False
)
subtoken_ids_per_token = encodings.data['input_ids']
if return_offsets_mapping:
offsets_mapping = [encoding.offsets for encoding in encodings.encodings]
else:
offsets_mapping = []
for token, subtoken_ids in zip(input_tokens, subtoken_ids_per_token):
if len(subtoken_ids) > len(token): # … --> ...
del subtoken_ids[len(token):]
if not subtoken_ids:
subtoken_ids = [tokenizer.unk_token_id]
# Since non-fast tok generates no mapping, we have to guess
char_per_subtoken = max(len(token) // len(subtoken_ids), 1)
bes = [(b, b + char_per_subtoken) for b in range(0, len(token), char_per_subtoken)]
if not bes: # the token is an empty string
bes = [(0, 0)]
if len(bes) != len(subtoken_ids):
bes[len(subtoken_ids) - 1] = (bes[len(subtoken_ids) - 1][0], len(token))
del bes[len(subtoken_ids):]
offsets_mapping.append(bes)
else:
encodings = SerializableDict()
subtoken_ids_per_token = []
encodings.data = {'input_ids': subtoken_ids_per_token}
if self.check_space_before:
# noinspection PyUnboundLocalVariable
for token, subtokens, mapping, encoding in zip(input_tokens, subtoken_ids_per_token,
offsets_mapping, encodings.encodings):
# Remove ▁ generated by spm for 2 reasons:
# 1. During decoding, mostly no ▁ will be created unless blanks are placed between tokens (which
# is the case for English, where ▁ will likely be concatenated to the following token)
# 2. For T5, '▁' is used as CLS
if len(subtokens) > 1 and encoding.tokens[0] == '▁':
subtokens.pop(0)
if mapping:
mapping.pop(0)
# Some tokens get stripped out
subtoken_ids_per_token = [ids if ids else [tokenizer.unk_token_id] for ids in subtoken_ids_per_token]
input_ids = sum(subtoken_ids_per_token, [self.cls_token_id])
if self.sep_is_eos is None:
# None means to check whether sep is at the tail or between tokens
if sep_is_eos:
input_ids += [self.sep_token_id]
elif self.sep_token_id not in input_ids:
input_ids += [self.sep_token_id]
else:
input_ids += [self.sep_token_id]
# else self.sep_is_eos == False means sep is between tokens and don't bother to check
if self.ret_subtokens:
prefix_mask = self._init_prefix_mask(input_ids)
# if self.check_space_before:
# if offsets_mapping[0] and not input_tokens[0].startswith(' '):
# prefix_mask[1] = False
else:
prefix_mask = [False] * len(input_ids)
offset = 1
for _subtokens in subtoken_ids_per_token:
prefix_mask[offset] = True
offset += len(_subtokens)
if self.ret_subtokens:
subtoken_offsets = []
for token, offsets in zip(input_tokens, offsets_mapping):
if offsets:
subtoken_offsets.append(offsets)
else:
subtoken_offsets.append([(0, len(token))])
if self.ret_subtokens_group:
sample[f'{self.input_key}_subtoken_offsets_group'] = subtoken_offsets
else:
sample[f'{self.input_key}_subtoken_offsets'] = sum(subtoken_offsets, [])
else:
input_ids, attention_mask, token_type_ids, prefix_mask = \
convert_examples_to_features(input_tokens,
None,
tokenizer,
cls_token_at_end=self.cls_token_at_end,
# xlnet has a cls token at the end
cls_token=tokenizer.cls_token,
cls_token_segment_id=self.cls_token_segment_id,
sep_token=self.sep_token,
sep_token_extra=self.sep_token_extra,
# roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
pad_on_left=self.pad_on_left,
# pad on the left for xlnet
pad_token_id=self.pad_token_id,
pad_token_segment_id=self.pad_token_segment_id,
pad_token_label_id=0,
do_padding=self.do_padding)
if len(input_ids) > self.max_seq_length:
if self.truncate_long_sequences:
# raise SequenceTooLong(
# f'Input tokens {input_tokens} exceed the max sequence length of {self.max_seq_length - 2}. '
# f'For sequence tasks, truncate_long_sequences = True is not supported.'
# f'You are recommended to split your long text into several sentences within '
# f'{self.max_seq_length - 2} tokens beforehand. '
# f'Or simply set truncate_long_sequences = False to enable sliding window.')
input_ids = input_ids[:self.max_seq_length]
prefix_mask = prefix_mask[:self.max_seq_length]
warnings.warn(
f'Input tokens {input_tokens} exceed the max sequence length of {self.max_seq_length - 2}. '
f'The exceeded part will be truncated and ignored. '
f'You are recommended to split your long text into several sentences within '
f'{self.max_seq_length - 2} tokens beforehand. '
f'Or simply set truncate_long_sequences = False to enable sliding window.'
)
else:
input_ids = self.sliding_window(input_ids, input_ids[-1] == self.sep_token_id)
if prefix_mask:
if cls_is_bos:
prefix_mask[0] = True
if sep_is_eos:
prefix_mask[-1] = True
outputs = [input_ids]
if self.ret_mask_and_type:
# noinspection PyUnboundLocalVariable
outputs += [attention_mask, token_type_ids]
if self.ret_prefix_mask:
outputs += [prefix_mask]
if ret_token_span and prefix_mask:
if cls_is_bos:
token_span = [[0]]
else:
token_span = []
offset = 1
span = []
for mask in prefix_mask[1:len(prefix_mask) if sep_is_eos is None else -1]: # skip [CLS] and [SEP]
if mask and span:
token_span.append(span)
span = []
span.append(offset)
offset += 1
if span:
token_span.append(span)
if sep_is_eos:
assert offset == len(prefix_mask) - 1
token_span.append([offset])
outputs.append(token_span)
for k, v in zip(self.output_key, outputs):
sample[k] = v
return sample
def _init_prefix_mask(self, input_ids):
prefix_mask = [True] * len(input_ids)
if not self.cls_is_bos:
prefix_mask[0] = False
if not self.sep_is_eos:
prefix_mask[-1] = False
return prefix_mask
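# A pure-Python sketch, not part of the original module, of the prefix_mask/token_span
# convention used above. Assuming tokens ['HanLP', 'good'] split into subtokens
# [['han', '##lp'], ['good']], the flattened ids (with [CLS]/[SEP]) occupy offsets
# 0..4; prefix_mask marks the first subtoken of each token, and token_span groups
# subtoken offsets per token.
def _example_prefix_mask_and_token_span():
    subtokens_per_token = [['han', '##lp'], ['good']]
    prefix_mask = [False]  # [CLS]
    token_span = []
    offset = 1
    for subs in subtokens_per_token:
        token_span.append(list(range(offset, offset + len(subs))))
        prefix_mask += [True] + [False] * (len(subs) - 1)
        offset += len(subs)
    prefix_mask.append(False)  # [SEP]
    assert prefix_mask == [False, True, False, True, False]
    assert token_span == [[1, 2], [3]]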
def config_is(config, model='bert'):
return model in type(config).__name__.lower()
def convert_examples_to_features(
words,
max_seq_length: Optional[int],
tokenizer,
labels=None,
label_map=None,
cls_token_at_end=False,
cls_token="[CLS]",
cls_token_segment_id=1,
sep_token="[SEP]",
sep_token_extra=False,
pad_on_left=False,
pad_token_id=0,
pad_token_segment_id=0,
pad_token_label_id=0,
sequence_a_segment_id=0,
mask_padding_with_zero=True,
unk_token='[UNK]',
do_padding=True
):
"""Loads a data file into a list of `InputBatch`s
`cls_token_at_end` define the location of the CLS token:
- False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
- True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
`cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet)
Args:
words:
max_seq_length:
tokenizer:
labels: (Default value = None)
label_map: (Default value = None)
cls_token_at_end: (Default value = False)
cls_token: (Default value = "[CLS]")
cls_token_segment_id: (Default value = 1)
sep_token: (Default value = "[SEP]")
sep_token_extra: (Default value = False)
pad_on_left: (Default value = False)
pad_token_id: (Default value = 0)
pad_token_segment_id: (Default value = 0)
pad_token_label_id: (Default value = 0)
sequence_a_segment_id: (Default value = 0)
mask_padding_with_zero: (Default value = True)
unk_token: (Default value = '[UNK]')
do_padding: (Default value = True)
Returns:
"""
args = locals()
if not labels:
labels = words
pad_token_label_id = False
tokens = []
label_ids = []
for word, label in zip(words, labels):
word_tokens = tokenizer.tokenize(word)
if not word_tokens:
# some weird chars cause the tokenizer to return an empty list
word_tokens = [unk_token] * len(word)
tokens.extend(word_tokens)
# Use the real label id for the first token of the word, and padding ids for the remaining tokens
label_ids.extend([label_map[label] if label_map else True] + [pad_token_label_id] * (len(word_tokens) - 1))
# Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa.
special_tokens_count = 3 if sep_token_extra else 2
if max_seq_length and len(tokens) > max_seq_length - special_tokens_count:
warnings.warn(
f'Input tokens {words} exceed the max sequence length of {max_seq_length - special_tokens_count}. '
f'The exceeded part will be truncated and ignored. '
f'You are recommended to split your long text into several sentences within '
f'{max_seq_length - special_tokens_count} tokens beforehand.')
tokens = tokens[: (max_seq_length - special_tokens_count)]
label_ids = label_ids[: (max_seq_length - special_tokens_count)]
# The convention in BERT is:
# (a) For sequence pairs:
# tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
# token_type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
# (b) For single sequences:
# tokens: [CLS] the dog is hairy . [SEP]
# token_type_ids: 0 0 0 0 0 0 0
#
# Where "token_type_ids" are used to indicate whether this is the first
# sequence or the second sequence. The embedding vectors for `type=0` and
# `type=1` were learned during pre-training and are added to the wordpiece
# embedding vector (and position vector). This is not *strictly* necessary
# since the [SEP] token unambiguously separates the sequences, but it makes
# it easier for the model to learn the concept of sequences.
#
# For classification tasks, the first vector (corresponding to [CLS]) is
# used as the "sentence vector". Note that this only makes sense because
# the entire model is fine-tuned.
tokens += [sep_token]
label_ids += [pad_token_label_id]
if sep_token_extra:
# roberta uses an extra separator b/w pairs of sentences
tokens += [sep_token]
label_ids += [pad_token_label_id]
segment_ids = [sequence_a_segment_id] * len(tokens)
if cls_token_at_end:
tokens += [cls_token]
label_ids += [pad_token_label_id]
segment_ids += [cls_token_segment_id]
else:
tokens = [cls_token] + tokens
label_ids = [pad_token_label_id] + label_ids
segment_ids = [cls_token_segment_id] + segment_ids
input_ids = tokenizer.convert_tokens_to_ids(tokens)
# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
if do_padding:
# Zero-pad up to the sequence length.
padding_length = max_seq_length - len(input_ids)
if pad_on_left:
input_ids = ([pad_token_id] * padding_length) + input_ids
input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask
segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids
label_ids = ([pad_token_label_id] * padding_length) + label_ids
else:
input_ids += [pad_token_id] * padding_length
input_mask += [0 if mask_padding_with_zero else 1] * padding_length
segment_ids += [pad_token_segment_id] * padding_length
label_ids += [pad_token_label_id] * padding_length
assert len(input_ids) == max_seq_length
assert len(input_mask) == max_seq_length
assert len(segment_ids) == max_seq_length
assert len(label_ids) == max_seq_length, f'failed for:\n {args}'
else:
assert len(set(len(x) for x in [input_ids, input_mask, segment_ids, label_ids])) == 1
return input_ids, input_mask, segment_ids, label_ids
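# A minimal sketch, not part of the original module: calling convert_examples_to_features
# without padding to observe the default BERT layout [CLS] + tokens + [SEP].
# 'bert-base-uncased' is assumed to be available locally or downloadable.
def _example_convert_examples_to_features():
    tokenizer = AutoTokenizer_.from_pretrained('bert-base-uncased')
    input_ids, input_mask, segment_ids, label_ids = convert_examples_to_features(
        'HanLP good'.split(), None, tokenizer, do_padding=False)
    # First id is [CLS], last is [SEP]; label_ids marks the first subtoken of each word.
    assert input_ids[0] == tokenizer.cls_token_id and input_ids[-1] == tokenizer.sep_token_id
    assert len(input_ids) == len(input_mask) == len(segment_ids) == len(label_ids)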
def main():
transformer = 'bert-base-uncased'
tokenizer: PreTrainedTokenizer = AutoTokenizer_.from_pretrained(transformer)
# _test_text_transform(tokenizer)
_test_sequence_transform(tokenizer)
def _test_text_transform(tokenizer):
transform = TransformerTextTokenizer(tokenizer, 'text')
sample = {'text': 'HanLP good'}
print(transform(sample))
def _test_sequence_transform(tokenizer):
transform = TransformerSequenceTokenizer(tokenizer, 'token')
sample = {'token': 'HanLP good'.split()}
print(transform(sample))
if __name__ == '__main__':
main()
================================================
FILE: hanlp/transform/tsv_tf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-06-13 21:15
import functools
from abc import ABC
from typing import Tuple, Union, Optional, Iterable, List
import tensorflow as tf
from hanlp_common.structure import SerializableDict
from hanlp.common.transform_tf import Transform
from hanlp.common.vocab_tf import VocabTF
from hanlp.utils.io_util import generate_words_tags_from_tsv
from hanlp.utils.tf_util import str_tensor_to_str
from hanlp_common.util import merge_locals_kwargs
def dataset_from_tsv(tsv_file_path, word_vocab: VocabTF, char_vocab: VocabTF, tag_vocab: VocabTF, batch_size=32,
shuffle=None, repeat=None, prefetch=1, lower=False, **kwargs):
generator = functools.partial(generate_words_tags_from_tsv, tsv_file_path, word_vocab, char_vocab, tag_vocab, lower)
return dataset_from_generator(generator, word_vocab, tag_vocab, batch_size, shuffle, repeat, prefetch,
**kwargs)
def dataset_from_generator(generator, word_vocab, tag_vocab, batch_size=32, shuffle=None, repeat=None, prefetch=1,
**kwargs):
shapes = [None], [None]
types = tf.string, tf.string
defaults = word_vocab.pad_token, tag_vocab.pad_token if tag_vocab.pad_token else tag_vocab.first_token
dataset = tf.data.Dataset.from_generator(generator, output_shapes=shapes, output_types=types)
if shuffle:
if isinstance(shuffle, bool):
shuffle = 1024
dataset = dataset.shuffle(shuffle)
if repeat:
dataset = dataset.repeat(repeat)
dataset = dataset.padded_batch(batch_size, shapes, defaults).prefetch(prefetch)
return dataset
def vocab_from_tsv(tsv_file_path, lower=False, lock_word_vocab=False, lock_char_vocab=True, lock_tag_vocab=True) \
-> Tuple[VocabTF, VocabTF, VocabTF]:
word_vocab = VocabTF()
char_vocab = VocabTF()
tag_vocab = VocabTF(unk_token=None)
with open(tsv_file_path, encoding='utf-8') as tsv_file:
for line in tsv_file:
cells = line.strip().split()
if cells:
word, tag = cells
if lower:
word_vocab.add(word.lower())
else:
word_vocab.add(word)
char_vocab.update(list(word))
tag_vocab.add(tag)
if lock_word_vocab:
word_vocab.lock()
if lock_char_vocab:
char_vocab.lock()
if lock_tag_vocab:
tag_vocab.lock()
return word_vocab, char_vocab, tag_vocab
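# A minimal sketch, not part of the original module: building word/char/tag
# vocabularies from a tiny two-column file written on the fly purely for illustration.
def _example_vocab_from_tsv():
    import os
    import tempfile
    path = os.path.join(tempfile.gettempdir(), 'toy_tagging.tsv')
    with open(path, 'w', encoding='utf-8') as f:
        f.write('HanLP\tNN\ngood\tJJ\n')
    word_vocab, char_vocab, tag_vocab = vocab_from_tsv(path)
    assert 'HanLP' in word_vocab.token_to_idx and 'NN' in tag_vocab.token_to_idx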
class TsvTaggingFormat(Transform, ABC):
def file_to_inputs(self, filepath: str, gold=True):
assert gold, 'TsvTaggingFormat does not support reading non-gold files'
yield from generate_words_tags_from_tsv(filepath, gold=gold, lower=self.config.get('lower', False),
max_seq_length=self.max_seq_length)
@property
def max_seq_length(self):
return self.config.get('max_seq_length', None)
class TSVTaggingTransform(TsvTaggingFormat, Transform):
def __init__(self, config: SerializableDict = None, map_x=True, map_y=True, use_char=False, **kwargs) -> None:
super().__init__(**merge_locals_kwargs(locals(), kwargs))
self.word_vocab: Optional[VocabTF] = None
self.tag_vocab: Optional[VocabTF] = None
self.char_vocab: Optional[VocabTF] = None
def fit(self, trn_path: str, **kwargs) -> int:
self.word_vocab = VocabTF()
self.tag_vocab = VocabTF(pad_token=None, unk_token=None)
num_samples = 0
for words, tags in self.file_to_inputs(trn_path, True):
self.word_vocab.update(words)
self.tag_vocab.update(tags)
num_samples += 1
if self.char_vocab:
self.char_vocab = VocabTF()
for word in self.word_vocab.token_to_idx.keys():
if word in (self.word_vocab.pad_token, self.word_vocab.unk_token):
continue
self.char_vocab.update(list(word))
return num_samples
def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]:
types = tf.string, tf.string
shapes = [None], [None]
values = self.word_vocab.pad_token, self.tag_vocab.first_token
return types, shapes, values
def inputs_to_samples(self, inputs, gold=False):
lower = self.config.get('lower', False)
if gold:
if lower:
for x, y in inputs:
yield x.lower(), y
else:
yield from inputs
else:
for x in inputs:
yield x.lower() if lower else x, [self.padding_values[-1]] * len(x)
def x_to_idx(self, x) -> Union[tf.Tensor, Tuple]:
return self.word_vocab.lookup(x)
def y_to_idx(self, y) -> tf.Tensor:
return self.tag_vocab.lookup(y)
def X_to_inputs(self, X: Union[tf.Tensor, Tuple[tf.Tensor]]) -> Iterable:
for xs in X:
words = []
for x in xs:
words.append(str_tensor_to_str(x) if self.char_vocab else self.word_vocab.idx_to_token[int(x)])
yield words
def Y_to_outputs(self, Y: Union[tf.Tensor, Tuple[tf.Tensor]], gold=False,
inputs=None, X=None, **kwargs) -> Iterable:
if not gold:
Y = tf.argmax(Y, axis=2)
for ys, xs in zip(Y, inputs):
tags = []
for y, x in zip(ys, xs):
tags.append(self.tag_vocab.idx_to_token[int(y)])
yield tags
def input_is_single_sample(self, input: Union[List[str], List[List[str]]]) -> bool:
return isinstance(input[0], str)
def input_truth_output_to_str(self, input: List[str], truth: List[str], output: List[str]):
text = ''
for word, gold_tag, pred_tag in zip(input, truth, output):
text += ' '.join([word, gold_tag, pred_tag]) + '\n'
text += '\n'
return text
================================================
FILE: hanlp/transform/txt_tf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-10-24 15:07
import functools
from abc import ABC
from typing import Tuple, Union, List, Iterable
import tensorflow as tf
from hanlp.common.transform_tf import Transform
from hanlp.common.vocab_tf import VocabTF
from hanlp.utils.io_util import get_resource
from hanlp.utils.lang.zh.char_table import CharTable
from hanlp.utils.span_util import bmes_of, bmes_to_words
from hanlp.utils.string_util import split_long_sent
def generate_words_per_line(file_path):
with open(file_path, encoding='utf-8') as src:
for line in src:
cells = line.strip().split()
if not cells:
continue
yield cells
def words_to_bmes(words):
tags = []
for w in words:
if not w:
raise ValueError('{} contains None or zero-length word {}'.format(str(words), w))
if len(w) == 1:
tags.append('S')
else:
tags.extend(['B'] + ['M'] * (len(w) - 2) + ['E'])
return tags
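# A tiny illustration, not part of the original module, of the BMES scheme produced
# above: single-character words are tagged S, while multi-character words are tagged
# B ... E with M in between.
def _example_words_to_bmes():
    assert words_to_bmes(['商品', '和', '服务']) == ['B', 'E', 'S', 'B', 'E']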
def extract_ngram_features_and_tags(sentence, bigram_only=False, window_size=4, segmented=True):
"""
Feature extraction for windowed approaches
See Also https://github.com/chqiwang/convseg/
Parameters
----------
sentence
bigram_only
window_size
segmented
Returns
-------
"""
chars, tags = bmes_of(sentence, segmented)
chars = CharTable.normalize_chars(chars)
ret = []
ret.append(chars)
# TODO: optimize ngram generation using https://www.tensorflow.org/api_docs/python/tf/strings/ngrams
ret.extend(extract_ngram_features(chars, bigram_only, window_size))
ret.append(tags)
return tuple(ret[:-1]), ret[-1] # x, y
def extract_ngram_features(chars, bigram_only, window_size):
ret = []
if bigram_only:
chars = ['', ''] + chars + ['', '']
ret.append([a + b if a and b else '' for a, b in zip(chars[:-4], chars[1:])])
ret.append([a + b if a and b else '' for a, b in zip(chars[1:-3], chars[2:])])
ret.append([a + b if a and b else '' for a, b in zip(chars[2:-2], chars[3:])])
ret.append([a + b if a and b else '' for a, b in zip(chars[3:-1], chars[4:])])
elif window_size > 0:
chars = ['', '', ''] + chars + ['', '', '']
# single char
if window_size >= 1:
ret.append(chars[3:-3])
if window_size >= 2:
# bi chars
ret.append([a + b if a and b else '' for a, b in zip(chars[2:], chars[3:-3])])
ret.append([a + b if a and b else '' for a, b in zip(chars[3:-3], chars[4:])])
if window_size >= 3:
# tri chars
ret.append(
[a + b + c if a and b and c else '' for a, b, c in zip(chars[1:], chars[2:], chars[3:-3])])
ret.append(
[a + b + c if a and b and c else '' for a, b, c in zip(chars[2:], chars[3:-3], chars[4:])])
ret.append(
[a + b + c if a and b and c else '' for a, b, c in zip(chars[3:-3], chars[4:], chars[5:])])
if window_size >= 4:
# four chars
ret.append([a + b + c + d if a and b and c and d else '' for a, b, c, d in
zip(chars[0:], chars[1:], chars[2:], chars[3:-3])])
ret.append([a + b + c + d if a and b and c and d else '' for a, b, c, d in
zip(chars[1:], chars[2:], chars[3:-3], chars[4:])])
ret.append([a + b + c + d if a and b and c and d else '' for a, b, c, d in
zip(chars[2:], chars[3:-3], chars[4:], chars[5:])])
ret.append([a + b + c + d if a and b and c and d else '' for a, b, c, d in
zip(chars[3:-3], chars[4:], chars[5:], chars[6:])])
return ret
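# A minimal sketch, not part of the original module: with window_size=1 and
# bigram_only=False, only the unigram feature list is produced, which equals the
# input characters themselves.
def _example_extract_ngram_features():
    assert extract_ngram_features(list('中国'), bigram_only=False, window_size=1) == [['中', '国']]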
def generate_ngram_bmes(file_path, bigram_only=False, window_size=4, gold=True):
with open(file_path, encoding='utf-8') as src:
for line in src:
sentence = line.strip()
if not sentence:
continue
yield extract_ngram_features_and_tags(sentence, bigram_only, window_size, gold)
def vocab_from_txt(txt_file_path, bigram_only=False, window_size=4, **kwargs) -> Tuple[VocabTF, VocabTF, VocabTF]:
char_vocab, ngram_vocab, tag_vocab = VocabTF(), VocabTF(), VocabTF(pad_token=None, unk_token=None)
for X, Y in generate_ngram_bmes(txt_file_path, bigram_only, window_size, gold=True):
char_vocab.update(X[0])
for ngram in X[1:]:
ngram_vocab.update(filter(lambda x: x, ngram))
tag_vocab.update(Y)
return char_vocab, ngram_vocab, tag_vocab
def dataset_from_txt(txt_file_path: str, char_vocab: VocabTF, ngram_vocab: VocabTF, tag_vocab: VocabTF,
bigram_only=False,
window_size=4, segmented=True, batch_size=32, shuffle=None, repeat=None, prefetch=1):
generator = functools.partial(generate_ngram_bmes, txt_file_path, bigram_only, window_size, segmented)
return dataset_from_generator(generator, char_vocab, ngram_vocab, tag_vocab, bigram_only, window_size, batch_size,
shuffle, repeat, prefetch)
def dataset_from_generator(generator, char_vocab, ngram_vocab, tag_vocab, bigram_only=False, window_size=4,
batch_size=32, shuffle=None, repeat=None, prefetch=1):
if bigram_only:
ngram_size = 4
else:
ngram_size = window_size * (window_size + 1) // 2
vec_dim = 2 + ngram_size
shapes = tuple([[None]] * (vec_dim - 1)), [None]
types = tuple([tf.string] * (vec_dim - 1)), tf.string
defaults = tuple([char_vocab.pad_token] + [
ngram_vocab.pad_token if ngram_vocab else char_vocab.pad_token] * ngram_size), (
tag_vocab.pad_token if tag_vocab.pad_token else tag_vocab.first_token)
dataset = tf.data.Dataset.from_generator(generator, output_shapes=shapes, output_types=types)
if shuffle:
if isinstance(shuffle, bool):
shuffle = 1024
dataset = dataset.shuffle(shuffle)
if repeat:
dataset = dataset.repeat(repeat)
dataset = dataset.padded_batch(batch_size, shapes, defaults).prefetch(prefetch)
return dataset
class TxtFormat(Transform, ABC):
def file_to_inputs(self, filepath: str, gold=True):
filepath = get_resource(filepath)
with open(filepath, encoding='utf-8') as src:
for line in src:
sentence = line.strip()
if not sentence:
continue
yield sentence
class TxtBMESFormat(TxtFormat, ABC):
def file_to_inputs(self, filepath: str, gold=True):
max_seq_length = self.config.get('max_seq_length', False)
if max_seq_length:
if 'transformer' in self.config:
max_seq_length -= 2 # allow for [CLS] and [SEP]
delimiter = set()
delimiter.update('。!?:;、,,;!?、,')
for text in super().file_to_inputs(filepath, gold):
chars, tags = bmes_of(text, gold)
if max_seq_length:
start = 0
for short_chars in split_long_sent(chars, delimiter, max_seq_length):
end = start + len(short_chars)
yield short_chars, tags[start:end]
start = end
else:
yield chars, tags
def input_is_single_sample(self, input: Union[List[str], List[List[str]]]) -> bool:
return isinstance(input, str)
def inputs_to_samples(self, inputs, gold=False):
for chars, tags in (inputs if gold else zip(inputs, [None] * len(inputs))):
if not gold:
tags = [self.tag_vocab.safe_pad_token] * len(chars)
chars = CharTable.normalize_chars(chars)
yield chars, tags
def Y_to_outputs(self, Y: Union[tf.Tensor, Tuple[tf.Tensor]], gold=False, inputs=None, X=None,
batch=None) -> Iterable:
yield from self.Y_to_tokens(self.tag_vocab, Y, gold, inputs)
def Y_to_tokens(self, tag_vocab, Y, gold, inputs):
if not gold:
Y = tf.argmax(Y, axis=2)
for text, ys in zip(inputs, Y):
tags = [tag_vocab.idx_to_token[int(y)] for y in ys[:len(text)]]
yield bmes_to_words(list(text), tags)
================================================
FILE: hanlp/utils/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-08-24 22:12
from . import rules
def ls_resource_in_module(root) -> dict:
res = dict()
for k, v in root.__dict__.items():
if k.startswith('_') or v == root:
continue
if isinstance(v, str):
if v.startswith('http') and not v.endswith('/') and not v.endswith('#') and not v.startswith('_'):
res[k] = v
elif type(v).__name__ == 'module':
res.update(ls_resource_in_module(v))
if 'ALL' in root.__dict__ and isinstance(root.__dict__['ALL'], dict):
root.__dict__['ALL'].update(res)
return res
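# A minimal sketch, not part of the original module: listing the resource URLs
# exposed by a pretrained module. hanlp.pretrained.tok is assumed here purely for
# illustration; any module whose string attributes are URLs works the same way.
def _example_ls_resource_in_module():
    from hanlp.pretrained import tok
    res = ls_resource_in_module(tok)
    assert all(url.startswith('http') for url in res.values())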
================================================
FILE: hanlp/utils/component_util.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-31 19:24
import os
from hanlp_common.constant import HANLP_VERBOSE
from hanlp_common.io import load_json, eprint, save_json
from hanlp_common.reflection import object_from_classpath, str_to_type
from hanlp import pretrained
from hanlp import version
from hanlp.common.component import Component
from hanlp.utils.io_util import get_resource, get_latest_info_from_pypi, check_version_conflicts
from hanlp_common.util import isdebugging
def load_from_meta_file(save_dir: str, meta_filename='meta.json', transform_only=False, verbose=HANLP_VERBOSE,
**kwargs) -> Component:
"""
Load a component from a ``meta.json`` (legacy TensorFlow component) or a ``config.json`` file.
Args:
save_dir: The identifier.
meta_filename (str): The meta file of that saved component, which stores the classpath and version.
transform_only: Load and return only the transform.
**kwargs: Extra parameters passed to ``component.load()``.
Returns:
A component.
"""
identifier = save_dir
load_path = save_dir
save_dir = get_resource(save_dir)
if save_dir.endswith('.json'):
meta_filename = os.path.basename(save_dir)
save_dir = os.path.dirname(save_dir)
metapath = os.path.join(save_dir, meta_filename)
if not os.path.isfile(metapath):
tf_model = False
metapath = os.path.join(save_dir, 'config.json')
else:
tf_model = True
cls = None
if not os.path.isfile(metapath):
tips = ''
if save_dir.isupper():
from difflib import SequenceMatcher
similar_keys = sorted(pretrained.ALL.keys(),
key=lambda k: SequenceMatcher(None, k, identifier).ratio(),
reverse=True)[:5]
tips = f'Check its spelling based on the available keys:\n' + \
f'{sorted(pretrained.ALL.keys())}\n' + \
f'Tips: it might be one of {similar_keys}'
# These components are not intended to be loaded in this way, but I'm tired of explaining it again and again
if identifier in pretrained.word2vec.ALL.values():
save_dir = os.path.dirname(save_dir)
metapath = os.path.join(save_dir, 'config.json')
save_json({'classpath': 'hanlp.layers.embeddings.word2vec.Word2VecEmbeddingComponent',
'embed': {'classpath': 'hanlp.layers.embeddings.word2vec.Word2VecEmbedding',
'embed': identifier, 'field': 'token', 'normalize': 'l2'},
'hanlp_version': version.__version__}, metapath)
elif identifier in pretrained.fasttext.ALL.values():
save_dir = os.path.dirname(save_dir)
metapath = os.path.join(save_dir, 'config.json')
save_json({'classpath': 'hanlp.layers.embeddings.fast_text.FastTextEmbeddingComponent',
'embed': {'classpath': 'hanlp.layers.embeddings.fast_text.FastTextEmbedding',
'filepath': identifier, 'src': 'token'},
'hanlp_version': version.__version__}, metapath)
elif identifier in {pretrained.classifiers.LID_176_FASTTEXT_SMALL,
pretrained.classifiers.LID_176_FASTTEXT_BASE}:
save_dir = os.path.dirname(save_dir)
metapath = os.path.join(save_dir, 'config.json')
save_json({'classpath': 'hanlp.components.classifiers.fasttext_classifier.FastTextClassifier',
'model_path': identifier,
'hanlp_version': version.__version__}, metapath)
else:
raise FileNotFoundError(f'The identifier {save_dir} resolves to a nonexistent meta file {metapath}. {tips}')
meta: dict = load_json(metapath)
cls = meta.get('classpath', cls)
if not cls:
cls = meta.get('class_path', None) # For older version
if tf_model:
# tf models are trained with version < 2.1. To migrate them to 2.1, map their classpath to new locations
upgrade = {
'hanlp.components.tok_tf.TransformerTokenizerTF': 'hanlp.components.tokenizers.tok_tf.TransformerTokenizerTF',
'hanlp.components.pos.RNNPartOfSpeechTagger': 'hanlp.components.taggers.pos_tf.RNNPartOfSpeechTaggerTF',
'hanlp.components.pos_tf.RNNPartOfSpeechTaggerTF': 'hanlp.components.taggers.pos_tf.RNNPartOfSpeechTaggerTF',
'hanlp.components.pos_tf.CNNPartOfSpeechTaggerTF': 'hanlp.components.taggers.pos_tf.CNNPartOfSpeechTaggerTF',
'hanlp.components.ner_tf.TransformerNamedEntityRecognizerTF': 'hanlp.components.ner.ner_tf.TransformerNamedEntityRecognizerTF',
'hanlp.components.parsers.biaffine_parser.BiaffineDependencyParser': 'hanlp.components.parsers.biaffine_parser_tf.BiaffineDependencyParserTF',
'hanlp.components.parsers.biaffine_parser.BiaffineSemanticDependencyParser': 'hanlp.components.parsers.biaffine_parser_tf.BiaffineSemanticDependencyParserTF',
'hanlp.components.tok_tf.NgramConvTokenizerTF': 'hanlp.components.tokenizers.tok_tf.NgramConvTokenizerTF',
'hanlp.components.classifiers.transformer_classifier.TransformerClassifier': 'hanlp.components.classifiers.transformer_classifier_tf.TransformerClassifierTF',
'hanlp.components.taggers.transformers.transformer_tagger.TransformerTagger': 'hanlp.components.taggers.transformers.transformer_tagger_tf.TransformerTaggerTF',
'hanlp.components.tok.NgramConvTokenizer': 'hanlp.components.tokenizers.tok_tf.NgramConvTokenizerTF',
}
cls = upgrade.get(cls, cls)
assert cls, f'{meta_filename} doesn\'t contain classpath field'
try:
obj: Component = object_from_classpath(cls)
if hasattr(obj, 'load'):
if transform_only:
# noinspection PyUnresolvedReferences
obj.load_transform(save_dir)
else:
if os.path.isfile(os.path.join(save_dir, 'config.json')):
obj.load(save_dir, verbose=verbose, **kwargs)
else:
obj.load(metapath, **kwargs)
obj.config['load_path'] = load_path
return obj
except ModuleNotFoundError as e:
if isdebugging():
raise e from None
else:
raise ModuleNotFoundError(
f'Some modules ({e.name} etc.) required by this model are missing. Please install the full version:'
'\n\n\tpip install hanlp[full] -U') from None
except ValueError as e:
if e.args and isinstance(e.args[0], str) and 'Internet connection' in e.args[0]:
raise ConnectionError(
'Hugging Face 🤗 Transformers failed to download because your Internet connection is either off or bad.\n'
'See https://hanlp.hankcs.com/docs/install.html#server-without-internet for solutions.') \
from None
raise e from None
except Exception as e:
# Some users often install an incompatible tf and put the blame on HanLP. Teach them the basics.
try:
you_installed_wrong_versions, extras = check_version_conflicts(extras=('full',) if tf_model else None)
except Exception as check_e:
you_installed_wrong_versions, extras = None, None
if you_installed_wrong_versions:
raise version.NotCompatible(you_installed_wrong_versions + '\nPlease reinstall HanLP in the proper way:' +
'\n\n\tpip install --upgrade hanlp' + (
f'[{",".join(extras)}]' if extras else '')) from None
eprint(f'Failed to load {identifier}')
from pkg_resources import parse_version
model_version = meta.get("hanlp_version", '2.0.0-alpha.0')
if model_version == '2.0.0': # Quick fix: the first version used a wrong string
model_version = '2.0.0-alpha.0'
model_version = parse_version(model_version)
installed_version = parse_version(version.__version__)
try:
latest_version = get_latest_info_from_pypi()
except:
latest_version = None
if model_version > installed_version:
eprint(f'{identifier} was created with hanlp-{model_version}, '
f'while you are running a lower version: {installed_version}. ')
if installed_version != latest_version:
eprint(
f'Please upgrade HanLP with:\n'
f'\n\tpip install --upgrade hanlp\n')
eprint(
'If the problem still persists, please submit an issue to https://github.com/hankcs/HanLP/issues\n'
'When reporting an issue, make sure to paste the FULL ERROR LOG below.')
eprint(f'{"ERROR LOG BEGINS":=^80}')
import platform
eprint(f'OS: {platform.platform()}')
eprint(f'Python: {platform.python_version()}')
import torch
eprint(f'PyTorch: {torch.__version__}')
if tf_model:
try:
import tensorflow
tf_version = tensorflow.__version__
eprint(f'TensorFlow: {tf_version}')
except ModuleNotFoundError:
tf_version = 'not installed'
eprint(f'TensorFlow: {tf_version}')
except Exception as tf_e:
eprint(f'TensorFlow cannot be imported due to {tf_e.__class__.__name__}: {tf_e}. '
f'Note this is not a bug of HanLP, but rather a compatibility issue caused by TensorFlow.')
eprint(f'HanLP: {version.__version__}')
import sys
sys.stderr.flush()
try:
if e.args and isinstance(e.args, tuple):
for i in range(len(e.args)):
if isinstance(e.args[i], str):
from hanlp_common.util import set_tuple_with
e.args = set_tuple_with(e.args, e.args[i] + f'\n{"ERROR LOG ENDS":=^80}', i)
break
except:
pass
raise e from None
def load_from_meta(meta: dict) -> Component:
if 'load_path' in meta:
return load_from_meta_file(meta['load_path'])
cls = meta.get('class_path', None) or meta.get('classpath', None)
assert cls, f'{meta} doesn\'t contain classpath field'
cls = str_to_type(cls)
return cls.from_config(meta)
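# A minimal sketch, not part of the original module: resolving a pretrained
# identifier to a URL and loading it via load_from_meta_file. COARSE_ELECTRA_SMALL_ZH
# is assumed to be a key in hanlp.pretrained.tok.ALL; substitute any key listed there.
def _example_load_from_meta_file():
    url = pretrained.tok.ALL['COARSE_ELECTRA_SMALL_ZH']  # downloads on first use
    component = load_from_meta_file(url)
    print(type(component).__name__)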
================================================
FILE: hanlp/utils/file_read_backwards/__init__.py
================================================
# -*- coding: utf-8 -*-
from .file_read_backwards import FileReadBackwards # noqa: F401
__author__ = """Robin Robin"""
__email__ = 'robinsquare42@gmail.com'
__version__ = '2.0.0'
================================================
FILE: hanlp/utils/file_read_backwards/buffer_work_space.py
================================================
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""BufferWorkSpace module."""
import os
new_lines = ["\r\n", "\n", "\r"]
new_lines_bytes = [n.encode("ascii") for n in new_lines]  # we only support encodings that are backward compatible with ASCII
class BufferWorkSpace:
"""It is a helper module for FileReadBackwards."""
def __init__(self, fp, chunk_size):
"""Convention for the data.
When read_buffer is not None, it represents the contents of the file from `read_position` onwards
that have not been processed/returned.
read_position represents the file pointer position that has been read into read_buffer;
it is initialized to just past the end of the file.
"""
self.fp = fp
self.read_position = _get_file_size(self.fp)  # initialize the read position to the end of the file
self.read_buffer = None
self.chunk_size = chunk_size
def add_to_buffer(self, content, read_position):
"""Add additional bytes content as read from the read_position.
Args:
content(bytes): data to be added to the BufferWorkSpace's buffer.
read_position(int): where in the file pointer the data was read from.
Returns:
"""
self.read_position = read_position
if self.read_buffer is None:
self.read_buffer = content
else:
self.read_buffer = content + self.read_buffer
def yieldable(self):
""" """
if self.read_buffer is None:
return False
t = _remove_trailing_new_line(self.read_buffer)
n = _find_furthest_new_line(t)
if n >= 0:
return True
# we have read in the entire file and have some unprocessed lines
if self.read_position == 0 and self.read_buffer is not None:
return True
return False
def return_line(self):
"""
Args:
Returns:
Precondition: self.yieldable() must be True
"""
assert(self.yieldable())
t = _remove_trailing_new_line(self.read_buffer)
i = _find_furthest_new_line(t)
if i >= 0:
l = i + 1
after_new_line = slice(l, None)
up_to_include_new_line = slice(0, l)
r = t[after_new_line]
self.read_buffer = t[up_to_include_new_line]
else: # the case where we have read in the entire file and are at the "last" line
r = t
self.read_buffer = None
return r
def read_until_yieldable(self):
"""Read in additional chunks until it is yieldable."""
while not self.yieldable():
read_content, read_position = _get_next_chunk(self.fp, self.read_position, self.chunk_size)
self.add_to_buffer(read_content, read_position)
def has_returned_every_line(self):
""" """
if self.read_position == 0 and self.read_buffer is None:
return True
return False
def _get_file_size(fp):
return os.fstat(fp.fileno()).st_size
def _get_next_chunk(fp, previously_read_position, chunk_size):
"""Return next chunk of data that we would from the file pointer.
Args:
fp: file
previously_read_position: file pointer position that we have read from
chunk_size: desired read chunk_size
Returns:
(bytestring, int): data that has been read in, the file pointer position where the data has been read from
"""
seek_position, read_size = _get_what_to_read_next(fp, previously_read_position, chunk_size)
fp.seek(seek_position)
read_content = fp.read(read_size)
read_position = seek_position
return read_content, read_position
def _get_what_to_read_next(fp, previously_read_position, chunk_size):
"""Return information on which file pointer position to read from and how many bytes.
Args:
fp: file
previously_read_position: int
chunk_size: int
Returns:
(int, int): The next seek position, how many bytes to read next
"""
seek_position = max(previously_read_position - chunk_size, 0)
read_size = chunk_size
# examples: say, our new_lines are potentially "\r\n", "\n", "\r"
# find a reading point where it is not "\n", rewind further if necessary
# if we have "\r\n" and we read in "\n",
# the next iteration would treat "\r" as a different new line.
# Q: why don't I just check if it is b"\n", but use a function ?
# A: so that we can potentially expand this into generic sets of separators, later on.
while seek_position > 0:
fp.seek(seek_position)
if _is_partially_read_new_line(fp.read(1)):
seek_position -= 1
read_size += 1 # as we rewind further, let's make sure we read more to compensate
else:
break
# take care of the special case when we are back to the beginning of the file
read_size = min(previously_read_position - seek_position, read_size)
return seek_position, read_size
def _remove_trailing_new_line(l):
"""Remove a single instance of new line at the end of l if it exists.
Args:
l:
Returns:
: bytestring
"""
# replace only 1 instance of newline
# match longest line first (hence the reverse=True), we want to match "\r\n" rather than "\n" if we can
for n in sorted(new_lines_bytes, key=lambda x: len(x), reverse=True):
if l.endswith(n):
remove_new_line = slice(None, -len(n))
return l[remove_new_line]
return l
def _find_furthest_new_line(read_buffer):
"""Return -1 if read_buffer does not contain new line otherwise the position of the rightmost newline.
Args:
read_buffer: bytestring
Returns:
int: The right most position of new line character in read_buffer if found, else -1
"""
new_line_positions = [read_buffer.rfind(n) for n in new_lines_bytes]
return max(new_line_positions)
def _is_partially_read_new_line(b):
"""Return True when b is part of a new line separator found at index >= 1, False otherwise.
Args:
b: bytestring
Returns:
bool
"""
for n in new_lines_bytes:
if n.find(b) >= 1:
return True
return False
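# A tiny illustration, not part of the original module: _find_furthest_new_line
# returns the rightmost newline position across all supported separators, or -1
# when none is present.
def _example_find_furthest_new_line():
    assert _find_furthest_new_line(b'a\nb\r\nc') == 4  # the '\n' of '\r\n'
    assert _find_furthest_new_line(b'abc') == -1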
================================================
FILE: hanlp/utils/file_read_backwards/file_read_backwards.py
================================================
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""FileReadBackwards module."""
import io
import os
from .buffer_work_space import BufferWorkSpace
supported_encodings = ["utf-8", "ascii", "latin-1"] # any encodings that are backward compatible with ascii should work
class FileReadBackwards:
"""Class definition for `FileReadBackwards`.
A `FileReadBackwards` will spawn a `FileReadBackwardsIterator` and keep an opened file handler.
It can be used as a Context Manager. If done so, when exited, it will close its file handler.
In any mode, `close()` can be called to close the file handler.
Args:
Returns:
"""
def __init__(self, path, encoding="utf-8", chunk_size=io.DEFAULT_BUFFER_SIZE):
"""Constructor for FileReadBackwards.
Args:
path: Path to the file to be read
encoding (str): Encoding
chunk_size (int): How many bytes to read at a time
"""
if encoding.lower() not in supported_encodings:
error_message = "{0} encoding was not supported/tested.".format(encoding)
error_message += "Supported encodings are '{0}'".format(",".join(supported_encodings))
raise NotImplementedError(error_message)
self.path = path
self.encoding = encoding.lower()
self.chunk_size = chunk_size
self.iterator = FileReadBackwardsIterator(io.open(self.path, mode="rb"), self.encoding, self.chunk_size)
def __iter__(self):
"""Return its iterator."""
return self.iterator
def __enter__(self):
return self
def __exit__(self, exc_type, exc_val, exc_tb):
"""Closes all opened its file handler and propagates all exceptions on exit."""
self.close()
return False
def close(self):
"""Closes all opened it s file handler."""
self.iterator.close()
def readline(self):
""" """
try:
r = next(self.iterator) + os.linesep
return r
except StopIteration:
return ""
class FileReadBackwardsIterator:
"""Iterator for `FileReadBackwards`.
This will read backwards line by line a file. It holds an opened file handler.
Args:
Returns:
"""
def __init__(self, fp, encoding, chunk_size):
"""Constructor for FileReadBackwardsIterator
Args:
fp (File): A file that we wish to start reading backwards from
encoding (str): Encoding of the file
chunk_size (int): How many bytes to read at a time
"""
self.path = fp.name
self.encoding = encoding
self.chunk_size = chunk_size
self.__fp = fp
self.__buf = BufferWorkSpace(self.__fp, self.chunk_size)
def __iter__(self):
return self
def next(self):
"""Returns unicode string from the last line until the beginning of file.
Gets exhausted if::
* already reached the beginning of the file on previous iteration
* the file got closed
When it gets exhausted, it closes the file handler.
Args:
Returns:
"""
# Using binary mode, because some encodings such as "utf-8" use a variable number of
# bytes to encode different Unicode code points.
# Without using binary mode, we would probably need to understand each encoding more
# and do the seek operations to find the proper boundary before issuing read
if self.closed:
raise StopIteration
if self.__buf.has_returned_every_line():
self.close()
raise StopIteration
self.__buf.read_until_yieldable()
r = self.__buf.return_line()
return r.decode(self.encoding)
__next__ = next
@property
def closed(self):
"""The status of the file handler.
:return: True if the file handler is still opened. False otherwise.
Args:
Returns:
"""
return self.__fp.closed
def close(self):
"""Closes the file handler."""
self.__fp.close()
================================================
FILE: hanlp/utils/init_util.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-05-27 13:25
import math
import torch
from torch import nn
import functools
def embedding_uniform(tensor:torch.Tensor, seed=233):
gen = torch.Generator().manual_seed(seed)
with torch.no_grad():
fan_out = tensor.size(-1)
bound = math.sqrt(3.0 / fan_out)
return tensor.uniform_(-bound, bound, generator=gen)
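# A minimal sketch, not part of the original module: initializing an embedding
# weight in place. The fixed seed makes the initialization reproducible, and all
# values stay within ±sqrt(3 / fan_out).
def _example_embedding_uniform():
    embed = nn.Embedding(10, 8)
    embedding_uniform(embed.weight)
    assert embed.weight.abs().max().item() <= math.sqrt(3.0 / 8)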
================================================
FILE: hanlp/utils/io_util.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-08-26 15:02
import contextlib
import glob
import gzip
import json
import logging
import os
import platform
import random
import shlex
import shutil
import sys
import tarfile
import tempfile
import urllib
import zipfile
from contextlib import contextmanager
from pathlib import Path
from subprocess import Popen, PIPE
from typing import Tuple, Optional, Union, List
from urllib.parse import urlparse
from urllib.request import urlretrieve
from hanlp_downloader import Downloader
from hanlp_downloader.log import DownloadCallback
from packaging.version import Version
import hanlp
from hanlp_common.constant import HANLP_URL, HANLP_VERBOSE
from hanlp.utils.log_util import logger, cprint, remove_color_tag
from hanlp.utils.string_util import split_long_sentence_into
from hanlp.utils.time_util import now_filename, CountdownTimer
from hanlp.version import __version__
from hanlp_common.io import eprint
def load_jsonl(path, verbose=False):
if verbose:
src = TimingFileIterator(path)
else:
src = open(path, encoding='utf-8')
for line in src:
yield json.loads(line)
if not verbose:
src.close()
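# A minimal sketch, not part of the original module: iterating over a .jsonl file
# written on the fly purely for illustration; each line is decoded into a dict.
def _example_load_jsonl():
    path = os.path.join(tempfile.gettempdir(), 'demo.jsonl')
    with open(path, 'w', encoding='utf-8') as f:
        f.write('{"token": ["HanLP", "good"]}\n')
    for record in load_jsonl(path):
        assert record['token'] == ['HanLP', 'good']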
def make_debug_corpus(path, delimiter=None, percentage=0.1, max_samples=100):
files = []
if os.path.isfile(path):
files.append(path)
elif os.path.isdir(path):
files += [os.path.join(path, f) for f in os.listdir(path) if
os.path.isfile(os.path.join(path, f)) and '.debug' not in f and not f.startswith('.')]
else:
raise FileNotFoundError(path)
for filepath in files:
filename, file_extension = os.path.splitext(filepath)
if not delimiter:
if file_extension in {'.tsv', '.conll', '.conllx', '.conllu'}:
delimiter = '\n\n'
else:
delimiter = '\n'
with open(filepath, encoding='utf-8') as src, open(filename + '.debug' + file_extension, 'w',
encoding='utf-8') as out:
samples = src.read().strip().split(delimiter)
max_samples = min(max_samples, int(len(samples) * percentage))
out.write(delimiter.join(samples[:max_samples]))
def path_join(path, *paths):
return os.path.join(path, *paths)
def makedirs(path):
os.makedirs(path, exist_ok=True)
return path
def tempdir(name=None):
path = tempfile.gettempdir()
if name:
path = makedirs(path_join(path, name))
return path
def tempdir_human():
return tempdir(now_filename())
def temp_lock(path):
from filelock import FileLock
import hashlib
lock = FileLock(f"{tempdir()}/.{hashlib.md5(path.encode('utf8')).hexdigest()}.lock")
return lock
def hanlp_home_default():
"""Default data directory depending on the platform and environment variables"""
if windows():
return os.path.join(os.environ.get('APPDATA'), 'hanlp')
else:
return os.path.join(os.path.expanduser("~"), '.hanlp')
def windows():
system = platform.system()
return system == 'Windows'
def hanlp_home():
""" Home directory for HanLP resources.
Returns:
Data directory in the filesystem for storage, for example when downloading models.
This home directory can be customized with the following shell command or equivalent environment variable on Windows
systems.
.. highlight:: bash
.. code-block:: bash
$ export HANLP_HOME=/data/hanlp
"""
return os.getenv('HANLP_HOME', hanlp_home_default())
def file_exist(filename) -> bool:
return os.path.isfile(filename)
def remove_file(filename):
if file_exist(filename):
os.remove(filename)
def parent_dir(path):
return os.path.normpath(os.path.join(path, os.pardir))
def download(url, save_path=None, save_dir=hanlp_home(), prefix=HANLP_URL, append_location=True, verbose=HANLP_VERBOSE):
if not save_path:
save_path = path_from_url(url, save_dir, prefix, append_location)
if os.path.isfile(save_path):
if verbose:
eprint('Using local {}, ignore {}'.format(save_path, url))
return save_path
else:
makedirs(parent_dir(save_path))
if verbose:
eprint('Downloading {} to {}'.format(url, save_path))
tmp_path = '{}.downloading'.format(save_path)
remove_file(tmp_path)
try:
downloader = Downloader(url, tmp_path, 4, headers={
'User-agent': f'HanLP/{__version__} ({platform.platform()})'})
if verbose:
downloader.subscribe(DownloadCallback(show_header=False))
downloader.start_sync()
except BaseException as e:
remove_file(tmp_path)
url = url.split('#')[0]
try:
installed_version, latest_version = check_outdated()
        except Exception:
            installed_version, latest_version = None, None  # no Internet
if installed_version != latest_version:
# Always prompt user to upgrade whenever a new version is available
hints = f'[green]Please upgrade to the latest version ({latest_version}) with:[/green]' \
f'\n\n\t[yellow]pip install -U hanlp[/yellow]\n'
else: # Otherwise, prompt user to re-try
hints = f'[green]Please re-try or download it to {save_path} by yourself '
if not windows():
hints += f'with:[/green]\n\n\t[yellow]wget {url} -O {save_path}[/yellow]\n\n'
else:
hints += 'using some decent downloading tools.[/green]\n'
if not url.startswith(HANLP_URL):
hints += 'For third party data, unrestricted connectivity to the global network may be required.'
else:
hints += 'See also https://hanlp.hankcs.com/docs/install.html#install-models for instructions.'
message = f'Download failed due to [red]{repr(e)}[/red].\n' \
f'{hints}'
if verbose:
cprint(message)
if hasattr(e, 'msg'):
e.msg += '\n' + remove_color_tag(message)
elif hasattr(e, 'args') and e.args and isinstance(e.args, tuple) and isinstance(e.args[0], str):
e.args = (e.args[0] + '\n' + remove_color_tag(message),) + e.args[1:]
raise e from None
remove_file(save_path)
os.rename(tmp_path, save_path)
return save_path
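# Illustrative usage sketch: fetch a file into HANLP_HOME and get its local path
# (the char-table URL below is the one used in hanlp/utils/lang/zh/char_table.py):
#
#     local_zip = download('https://file.hankcs.com/corpus/char_table.zip')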
def parse_url_path(url):
parsed: urllib.parse.ParseResult = urlparse(url)
path = parsed.path.strip('/')
return parsed.netloc, path
def uncompress(path, dest=None, remove=True, verbose=HANLP_VERBOSE):
"""Uncompress a file and clean up uncompressed files once an error is triggered.
Args:
path: The path to a compressed file
dest: The dest folder.
remove: Remove archive file after decompression.
verbose: ``True`` to print log message.
Returns:
Destination path.
"""
# assert path.endswith('.zip')
prefix, ext = split_if_compressed(path)
folder_name = os.path.basename(prefix)
file_is_zip = ext == '.zip'
    root_of_folder = None
    namelist = []  # populated below when dest needs to be inferred
if ext == '.gz':
try:
with gzip.open(path, 'rb') as f_in, open(prefix, 'wb') as f_out:
shutil.copyfileobj(f_in, f_out)
except Exception as e:
remove_file(prefix)
remove_file(path)
raise e
else:
try:
with zipfile.ZipFile(path, "r") if ext == '.zip' else tarfile.open(path, 'r:*') as archive:
if not dest:
namelist = sorted(archive.namelist() if file_is_zip else archive.getnames())
if namelist[0] == '.':
namelist = namelist[1:]
namelist = [p[len('./'):] if p.startswith('./') else p for p in namelist]
if ext == '.tgz':
roots = set(x.split('/')[0] for x in namelist)
if len(roots) == 1:
root_of_folder = next(iter(roots))
else:
# only one file, root_of_folder = ''
root_of_folder = namelist[0].strip('/') if len(namelist) > 1 else ''
if all(f.split('/')[0] == root_of_folder for f in namelist[1:]) or not root_of_folder:
dest = os.path.dirname(path) # only one folder, unzip to the same dir
else:
root_of_folder = None
dest = prefix # assume zip contains more than one file or folder
if verbose:
eprint('Decompressing {} to {}'.format(path, dest))
archive.extractall(dest)
if root_of_folder:
if root_of_folder != folder_name:
# move root to match folder name
os.rename(path_join(dest, root_of_folder), path_join(dest, folder_name))
dest = path_join(dest, folder_name)
elif len(namelist) == 1:
dest = path_join(dest, namelist[0])
except Exception as e:
remove_file(path)
if os.path.exists(prefix):
if os.path.isfile(prefix):
os.remove(prefix)
elif os.path.isdir(prefix):
shutil.rmtree(prefix)
raise e
if remove:
remove_file(path)
return dest
def split_if_compressed(path: str, compressed_ext=('.zip', '.tgz', '.gz', '.bz2', '.xz')) -> Tuple[str, Optional[str]]:
tar_gz = '.tar.gz'
if path.endswith(tar_gz):
root, ext = path[:-len(tar_gz)], tar_gz
else:
root, ext = os.path.splitext(path)
if ext in compressed_ext or ext == tar_gz:
return root, ext
return path, None
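# Illustrative examples:
#
#     >>> split_if_compressed('model.tar.gz')
#     ('model', '.tar.gz')
#     >>> split_if_compressed('model.zip')
#     ('model', '.zip')
#     >>> split_if_compressed('model.bin')
#     ('model.bin', None)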
def get_resource(path: str, save_dir=hanlp_home(), extract=True, prefix=HANLP_URL, append_location=True,
verbose=HANLP_VERBOSE):
"""Fetch real (local) path for a resource (model, corpus, whatever) to ``save_dir``.
Args:
path: A local path (which will returned as is) or a remote URL (which will be downloaded, decompressed then
returned).
save_dir: Where to store the resource (Default value = :meth:`hanlp.utils.io_util.hanlp_home`)
extract: Whether to unzip it if it's a zip file (Default value = True)
        prefix: A prefix which, when it matches a URL (path), marks that URL as official. Official
            resources are stored directly under :const:`~hanlp_common.constants.HANLP_HOME` rather than in its ``thirdparty`` folder.
append_location: Whether to put unofficial files in a ``thirdparty`` folder.
verbose: Whether to print log messages.
Returns:
The real path to the resource.
"""
_path = path
path = hanlp.pretrained.ALL.get(path, path)
    anchor: Optional[str] = None
compressed = None
if os.path.isdir(path):
return path
elif os.path.isfile(path):
pass
elif path.startswith('http:') or path.startswith('https:'):
url = path
if '#' in url:
url, anchor = url.split('#', maxsplit=1)
realpath = path_from_url(path, save_dir, prefix, append_location)
realpath, compressed = split_if_compressed(realpath)
# check if resource is there
if anchor:
if anchor.startswith('/'):
# indicates the folder name has to be polished
anchor = anchor.lstrip('/')
parts = anchor.split('/')
renamed_realpath = str(Path(realpath).parent.joinpath(parts[0]))
if os.path.isfile(realpath + compressed):
os.rename(realpath + compressed, renamed_realpath + compressed)
realpath = renamed_realpath
anchor = '/'.join(parts[1:])
child = path_join(realpath, anchor)
if os.path.exists(child):
return child
elif os.path.isdir(realpath) or (os.path.isfile(realpath) and (compressed and extract)):
return realpath
else:
if compressed:
pattern = realpath + '.*'
files = glob.glob(pattern)
files = list(filter(lambda x: not x.endswith('.downloading') and not x.endswith(compressed), files))
if files:
if len(files) > 1:
logger.debug(f'Found multiple files with {pattern}, will use the first one.')
return files[0]
        # realpath is the path after extraction
if compressed:
realpath += compressed
with temp_lock(path):
if not os.path.isfile(realpath):
path = download(url=path, save_path=realpath, verbose=verbose)
else:
path = realpath
if extract and compressed:
with temp_lock(path):
if os.path.isfile(path):
path = uncompress(path, verbose=verbose)
else: # other process must have already decompressed it and deleted it
return get_resource(_path, save_dir, extract, prefix, append_location, verbose)
if anchor:
path = path_join(path, anchor)
return path
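# Illustrative usage sketch: a URL with a ``#`` anchor downloads the archive,
# extracts it, then returns the local path of the named member (the same pattern
# is used by hanlp/utils/lang/zh/char_table.py):
#
#     txt = get_resource('https://file.hankcs.com/corpus/char_table.zip#CharTable.txt')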
def path_from_url(url, save_dir=hanlp_home(), prefix=HANLP_URL, append_location=True):
"""Map a URL to a local path.
Args:
url: Remote URL.
save_dir: The root folder to save this file.
prefix: The prefix of official website. Any URLs starting with this prefix will be considered official.
append_location: Whether to put unofficial files in a ``thirdparty`` folder.
Returns:
The real path that this URL is mapped to.
"""
if not save_dir:
save_dir = hanlp_home()
domain, relative_path = parse_url_path(url)
if append_location:
if not url.startswith(prefix):
save_dir = os.path.join(save_dir, 'thirdparty', domain)
else:
# remove the relative path in prefix
middle = prefix.split(domain)[-1].lstrip('/')
if relative_path.startswith(middle):
relative_path = relative_path[len(middle):]
realpath = os.path.join(save_dir, relative_path)
else:
realpath = os.path.join(save_dir, os.path.basename(relative_path))
return realpath
def human_bytes(file_size: int) -> str:
file_size /= 1024 # KB
if file_size > 1024:
file_size /= 1024 # MB
if file_size > 1024:
file_size /= 1024 # GB
return '%.1f GB' % file_size
return '%.1f MB' % file_size
return '%d KB' % file_size
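# Illustrative examples:
#
#     >>> human_bytes(2048)
#     '2 KB'
#     >>> human_bytes(5 * 1024 ** 2)
#     '5.0 MB'
#     >>> human_bytes(3 * 1024 ** 3)
#     '3.0 GB'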
def read_cells(filepath: str, delimiter='auto', strip=True, skip_header=False):
filepath = get_resource(filepath)
if delimiter == 'auto':
if filepath.endswith('.tsv'):
delimiter = '\t'
elif filepath.endswith('.csv'):
delimiter = ','
else:
delimiter = None
with open(filepath, encoding='utf-8') as src:
if skip_header:
next(src)
for line in src:
line = line.strip()
if not line:
continue
cells = line.split(delimiter)
if strip:
cells = [c.strip() for c in cells]
yield cells
def replace_ext(filepath, ext) -> str:
""" Replace the extension of filepath to ext.
Args:
filepath: Filepath to be replaced.
ext: Extension to replace.
Returns:
A new path.
"""
file_prefix, _ = os.path.splitext(filepath)
return file_prefix + ext
def read_tsv_as_sents(tsv_file_path, ignore_prefix=None, delimiter=None):
sent = []
tsv_file_path = get_resource(tsv_file_path)
with open(tsv_file_path, encoding='utf-8') as tsv_file:
for line in tsv_file:
if ignore_prefix and line.startswith(ignore_prefix):
continue
line = line.strip()
cells = line.split(delimiter)
if line and cells:
sent.append(cells)
elif sent:
yield sent
sent = []
if sent:
yield sent
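# Illustrative sketch: for a CoNLL-style file with one token per line and a blank
# line between sentences, each yielded sentence is a list of row cells; e.g. a
# hypothetical two-sentence 'toy.tsv' would yield
# [['商品', 'B'], ['和', 'S']] and then [['服务', 'B']].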
def generate_words_tags_from_tsv(tsv_file_path, lower=False, gold=True, max_seq_length=None, sent_delimiter=None,
char_level=False, hard_constraint=False):
for sent in read_tsv_as_sents(tsv_file_path):
words = [cells[0] for cells in sent]
if max_seq_length:
offset = 0
# try to split the sequence to make it fit into max_seq_length
for shorter_words in split_long_sentence_into(words, max_seq_length, sent_delimiter, char_level,
hard_constraint):
if gold:
shorter_tags = [cells[1] for cells in sent[offset:offset + len(shorter_words)]]
offset += len(shorter_words)
else:
shorter_tags = None
if lower:
shorter_words = [word.lower() for word in shorter_words]
yield shorter_words, shorter_tags
else:
if gold:
try:
tags = [cells[1] for cells in sent]
                except IndexError:
raise ValueError(f'Failed to load {tsv_file_path}: {sent}')
else:
tags = None
if lower:
words = [word.lower() for word in words]
yield words, tags
def split_file(filepath, train=0.8, dev=0.1, test=0.1, names=None, shuffle=False):
num_samples = 0
if filepath.endswith('.tsv'):
for sent in read_tsv_as_sents(filepath):
num_samples += 1
else:
with open(filepath, encoding='utf-8') as src:
for sample in src:
num_samples += 1
splits = {'train': train, 'dev': dev, 'test': test}
splits = dict((k, v) for k, v in splits.items() if v)
splits = dict((k, v / sum(splits.values())) for k, v in splits.items())
accumulated = 0
r = []
for k, v in splits.items():
r.append(accumulated)
accumulated += v
r.append(accumulated)
splits[k] = accumulated
if names is None:
names = {}
name, ext = os.path.splitext(filepath)
filenames = [names.get(split, name + '.' + split + ext) for split in splits.keys()]
outs = [open(f, 'w', encoding='utf-8') for f in filenames]
if shuffle:
shuffle = list(range(num_samples))
random.shuffle(shuffle)
if filepath.endswith('.tsv'):
src = read_tsv_as_sents(filepath)
else:
src = open(filepath, encoding='utf-8')
for idx, sample in enumerate(src):
if shuffle:
idx = shuffle[idx]
ratio = idx / num_samples
for sid, out in enumerate(outs):
if r[2 * sid] <= ratio < r[2 * sid + 1]:
if isinstance(sample, list):
sample = '\n'.join('\t'.join(x) for x in sample) + '\n\n'
out.write(sample)
break
if not filepath.endswith('.tsv'):
src.close()
for out in outs:
out.close()
return filenames
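# Illustrative usage sketch ('corpus.tsv' is a hypothetical file):
#
#     filenames = split_file('corpus.tsv', train=0.8, dev=0.1, test=0.1)
#     # -> ['corpus.train.tsv', 'corpus.dev.tsv', 'corpus.test.tsv']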
def fileno(file_or_fd):
try:
fd = getattr(file_or_fd, 'fileno', lambda: file_or_fd)()
    except Exception:
return None
if not isinstance(fd, int):
raise ValueError("Expected a file (`.fileno()`) or a file descriptor")
return fd
@contextmanager
def stdout_redirected(to=os.devnull, stdout=None):
"""Redirect stdout to else where.
Copied from https://stackoverflow.com/questions/4675728/redirect-stdout-to-a-file-in-python/22434262#22434262
Args:
to: Target device.
stdout: Source device.
"""
if windows(): # This doesn't play well with windows
yield None
return
if stdout is None:
stdout = sys.stdout
stdout_fd = fileno(stdout)
if not stdout_fd:
yield None
return
# copy stdout_fd before it is overwritten
# NOTE: `copied` is inheritable on Windows when duplicating a standard stream
with os.fdopen(os.dup(stdout_fd), 'wb') as copied:
stdout.flush() # flush library buffers that dup2 knows nothing about
try:
os.dup2(fileno(to), stdout_fd) # $ exec >&to
except ValueError: # filename
with open(to, 'wb') as to_file:
os.dup2(to_file.fileno(), stdout_fd) # $ exec > to
try:
yield stdout # allow code to be run with the redirected stdout
finally:
# restore stdout to its previous value
# NOTE: dup2 makes stdout_fd inheritable unconditionally
try:
stdout.flush()
os.dup2(copied.fileno(), stdout_fd) # $ exec >&copied
            except Exception:
# This is the best we can do
pass
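# Illustrative usage sketch: silence a noisy native library for one call
# (a no-op on Windows, as guarded above; ``noisy_call`` is hypothetical):
#
#     with stdout_redirected(to=os.devnull):
#         noisy_call()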
def get_exitcode_stdout_stderr(cmd):
"""Execute the external command and get its exitcode, stdout and stderr.
See https://stackoverflow.com/a/21000308/3730690
Args:
cmd: Command.
Returns:
Exit code, stdout, stderr.
"""
args = shlex.split(cmd)
proc = Popen(args, stdout=PIPE, stderr=PIPE)
out, err = proc.communicate()
exitcode = proc.returncode
return exitcode, out.decode('utf-8'), err.decode('utf-8')
def run_cmd(cmd: str) -> str:
exitcode, out, err = get_exitcode_stdout_stderr(cmd)
if exitcode:
raise RuntimeError(err + '\nThe command is:\n' + cmd)
return out
@contextmanager
def pushd(new_dir):
previous_dir = os.getcwd()
os.chdir(new_dir)
try:
yield
finally:
os.chdir(previous_dir)
def basename_no_ext(path):
basename = os.path.basename(path)
no_ext, ext = os.path.splitext(basename)
return no_ext
def file_cache(path: str, purge=False):
cache_name = path + '.cache'
cache_time = os.path.getmtime(cache_name) if os.path.isfile(cache_name) and not purge else 0
file_time = os.path.getmtime(path)
cache_valid = cache_time > file_time
return cache_name, cache_valid
def merge_files(files: List[str], dst: str):
with open(dst, 'wb') as write:
for f in files:
with open(f, 'rb') as read:
shutil.copyfileobj(read, write)
class TimingFileIterator(CountdownTimer):
def __init__(self, filepath) -> None:
super().__init__(os.path.getsize(filepath))
self.filepath = filepath
def __iter__(self):
if not os.path.isfile(self.filepath):
raise FileNotFoundError(self.filepath)
fp = open(self.filepath, encoding='utf-8', errors='ignore')
line = fp.readline()
while line:
yield line
self.current = fp.tell()
line = fp.readline()
fp.close()
def log(self, info=None, ratio_percentage=True, ratio=True, step=0, interval=0.5, erase=True,
logger: Union[logging.Logger, bool] = None, newline=False, ratio_width=None):
assert step == 0
super().log(info, ratio_percentage, ratio, step, interval, erase, logger, newline, ratio_width)
@property
def ratio(self) -> str:
return f'{human_bytes(self.current)}/{human_bytes(self.total)}'
@property
def ratio_width(self) -> int:
return len(f'{human_bytes(self.total)}') * 2 + 1
def close(self):
pass
def check_outdated(package='hanlp', version=__version__, repository_url='https://pypi.python.org/pypi/%s/json'):
"""Given the name of a package on PyPI and a version (both strings), checks
if the given version is the latest version of the package available.
    Returns a 2-tuple (installed_version, latest_version).
    `repository_url` is a `%` style format string that allows using a different PyPI
    repository URL, e.g. test.pypi.org or a private repository.
    The string is formatted with the package name.
Adopted from https://github.com/alexmojaki/outdated/blob/master/outdated/__init__.py
Args:
package: Package name.
version: Installed version string.
repository_url: URL on pypi.
Returns:
Parsed installed version and latest version.
"""
installed_version = Version(version)
latest_version = get_latest_info_from_pypi(package, repository_url)
return installed_version, latest_version
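# Illustrative usage sketch (performs a live request to PyPI):
#
#     installed, latest = check_outdated()
#     if installed < latest:
#         print(f'pip install -U hanlp=={latest}')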
def get_latest_info_from_pypi(package='hanlp', repository_url='https://pypi.python.org/pypi/%s/json'):
url = repository_url % package
response = urllib.request.urlopen(url).read()
return Version(json.loads(response)['info']['version'])
def check_version_conflicts(extras=None):
from pkg_resources import get_distribution, Requirement, WorkingSet, VersionConflict, DistributionNotFound
pkg = get_distribution('hanlp')
if not extras:
extras = pkg.extras
if isinstance(extras, list):
extras = tuple(extras)
requirements: List[Requirement] = pkg.requires(extras=extras)
error = None
try:
WorkingSet().resolve(
requirements, extras=extras
)
except VersionConflict as e:
error = e.with_context('hanlp').report()
except DistributionNotFound as e:
error = str(e)
return error, extras
================================================
FILE: hanlp/utils/lang/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-01-09 18:46
__doc__ = '''
This package holds misc utils for specific languages.
'''
================================================
FILE: hanlp/utils/lang/en/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-12-28 19:28
================================================
FILE: hanlp/utils/lang/en/english_tokenizer.py
================================================
#!/usr/bin/env python
"""
Regex-based word tokenizers.
Note that small/full/half-width character variants are *not* covered.
If a text contains such characters, normalize it first.
A modified version of https://github.com/fnl/segtok
- dropped dependency on regex
- dropped web_tokenize
- added support for concatenated words
"""
__author__ = 'Florian Leitner '
from re import compile, UNICODE, VERBOSE
SENTENCE_TERMINALS = '.!?\u203C\u203D\u2047\u2048\u2049\u3002' \
'\uFE52\uFE57\uFF01\uFF0E\uFF1F\uFF61'
"The list of valid Unicode sentence terminal characters."
# Note that the Unicode category Pd is NOT a good set of valid word-breaking hyphens,
# because it contains many dashes that should not be considered part of a word.
HYPHENS = '\u00AD\u058A\u05BE\u0F0C\u1400\u1806\u2010-\u2012\u2e17\u30A0-'
"Any valid word-breaking hyphen, including ASCII hyphen minus."
APOSTROPHES = '\'\u00B4\u02B9\u02BC\u2019\u2032'
"""All apostrophe-like marks, including the ASCII "single quote"."""
APOSTROPHE = r"[\u00B4\u02B9\u02BC\u2019\u2032]"
"""Any apostrophe-like marks, including "prime" but not the ASCII "single quote"."""
LINEBREAK = r'(?:\r\n|\n|\r|\u2028)'
"""Any valid linebreak sequence (Windows, Unix, Mac, or U+2028)."""
LETTER = r'[^\W\d_]'
"""Any Unicode letter character that can form part of a word: Ll, Lm, Lt, Lu."""
NUMBER = r'\d'
"""Any Unicode decimal digit character (category Nd)."""
POWER = r'\u207B?[\u00B9\u00B2\u00B3]'
"""Superscript 1, 2, and 3, optionally prefixed with a minus sign."""
SUBDIGIT = r'[\u2080-\u2089]'
"""Subscript digits."""
# NB: composing this as ``LETTER[:-1] + NUMBER + ']'`` only repeats ``\d`` inside the
# negated class and therefore still excludes digits; union letters and digits instead:
ALNUM = r'(?:%s|%s)' % (LETTER, NUMBER)
"""Any alphanumeric Unicode character: letter or number."""
HYPHEN = r'[%s]' % HYPHENS
SPACE = r'\s'
"""Any unicode space character plus the (horizontal) tab."""
APO_MATCHER = compile(APOSTROPHE, UNICODE)
"""Matcher for any apostrophe."""
HYPHENATED_LINEBREAK = compile(
r'({alnum}{hyphen}){space}*?{linebreak}{space}*?({alnum})'.format(
alnum=ALNUM, hyphen=HYPHEN, linebreak=LINEBREAK, space=SPACE
), UNICODE
)
"""
The pattern matches any alphanumeric Unicode character, followed by a hyphen,
a single line-break surrounded by optional (non-breaking) spaces,
and terminates with an alphanumeric character on the next line.
The opening char and hyphen as well as the terminating char are captured in two groups.
"""
IS_POSSESSIVE = compile(r"{alnum}+(?:{hyphen}{alnum}+)*(?:{apo}[sS]|[sS]{apo})$".format(
alnum=ALNUM, hyphen=HYPHEN, apo="['" + APOSTROPHE[1:]
), UNICODE
)
"""A pattern that matches English words with a possessive s terminal form."""
IS_CONTRACTION = compile(r"{alnum}+(?:{hyphen}{alnum}+)*{apo}(?:d|ll|m|re|s|t|ve)$".format(
alnum=ALNUM, hyphen=HYPHEN, apo="['" + APOSTROPHE[1:]
), UNICODE
)
"""A pattern that matches tokens with valid English contractions ``'(d|ll|m|re|s|t|ve)``."""
MAP_CONCAT_WORD = {'aint': [2, 4], 'arent': [3, 5], 'cant': [2, 4], 'cannot': [3, 6], 'coulda': [5, 6],
'couldnt': [5, 7], 'didnt': [3, 5], 'doncha': [2, 3, 6], 'dont': [2, 4],
'doesnt': [4, 6], 'dunno': [2, 3, 5], 'finna': [3, 5], 'gimme': [3, 5], 'gonna': [3, 5],
'gotta': [3, 5], 'hadnt': [3, 5], 'hasnt': [3, 5], 'havent': [4, 6], 'isnt': [2, 4],
'itd': [2, 3], 'itll': [2, 4], 'lemme': [3, 5], 'lets': [3, 4], 'mightnt': [5, 7],
'mustnt': [4, 6], 'shant': [3, 5], 'shoulda': [6, 7], 'shouldnt': [6, 8],
'thatd': [4, 5], 'thatll': [4, 6], 'thats': [4, 5], 'theyd': [4, 5], 'theyre': [4, 6],
'theyve': [4, 6], 'wanna': [3, 5], 'wasnt': [3, 5], 'weve': [2, 4], 'werent': [4, 6],
'whadya': [3, 4, 6], 'whatcha': [4, 7], 'whatre': [4, 6], 'whats': [4, 5],
'whatve': [4, 6], 'whatz': [4, 5], 'whod': [3, 4], 'wholl': [3, 5], 'woncha': [2, 3, 6],
'wont': [2, 4], 'woulda': [5, 6], 'wouldnt': [5, 7], 'youd': [3, 4], 'youll': [3, 5],
'youve': [3, 5], "'tis": [2, 4], "'twas": [2, 5], "d'ye": [2, 4], "don'cha": [2, 4, 7],
"i'mma": [1, 3, 5], "i'mmm": [1, 5], "more'n": [4, 6], '’tis': [2, 4], '’twas': [2, 5],
'd’ye': [2, 4], 'don’cha': [2, 4, 7], 'i’mma': [1, 3, 5], 'i’mmm': [1, 5],
'more’n': [4, 6]}
RE_APOSTROPHE = compile(r'(?i)[a-z](n[\'\u2019]t|[\'\u2019](ll|nt|re|ve|[dmstz]))(\W|$)')
def split_possessive_markers(tokens):
"""
    A function to split possessive markers at the end of alphanumeric (and hyphenated) tokens.
    Takes the output of any of the tokenizer functions and produces an updated list.
    To use it, simply wrap the tokenizer function, for example::
>>> my_sentence = "This is Fred's latest book."
>>> split_possessive_markers(tokenize_english(my_sentence))
['This', 'is', 'Fred', "'s", 'latest', 'book', '.']
:param tokens: a list of tokens
:returns: an updated list if a split was made or the original list otherwise
"""
idx = -1
for token in list(tokens):
idx += 1
if IS_POSSESSIVE.match(token) is not None:
if token[-1].lower() == 's' and token[-2] in APOSTROPHES:
tokens.insert(idx, token[:-2])
idx += 1
tokens[idx] = token[-2:]
elif token[-2].lower() == 's' and token[-1] in APOSTROPHES:
tokens.insert(idx, token[:-1])
idx += 1
tokens[idx] = token[-1:]
return tokens
def split_contractions(tokens):
"""
    A function to split apostrophe contractions at the end of alphanumeric (and hyphenated) tokens.
    Takes the output of any of the tokenizer functions and produces an updated list.
:param tokens: a list of tokens
:returns: an updated list if a split was made or the original list otherwise
"""
idx = -1
for token in list(tokens):
idx += 1
if IS_CONTRACTION.match(token) is not None:
length = len(token)
if length > 1:
for pos in range(length - 1, -1, -1):
if token[pos] in APOSTROPHES:
if 2 < length and pos + 2 == length and token[-1] == 't' and token[pos - 1] == 'n':
pos -= 1
tokens.insert(idx, token[:pos])
idx += 1
                        tokens[idx] = token[pos:]
                        break  # IS_CONTRACTION permits only one apostrophe per token
return tokens
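# Illustrative example:
#
#     >>> split_contractions(["We've"])
#     ['We', "'ve"]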
def _matches(regex):
"""Regular expression compiling function decorator."""
def match_decorator(fn):
automaton = compile(regex, UNICODE | VERBOSE)
fn.split = automaton.split
fn.match = automaton.match
return fn
return match_decorator
@_matches(r'\s+')
def space_tokenizer(sentence):
"""
For a given input `sentence`, return a list of its tokens.
Split on Unicode spaces ``\\s+`` (i.e., any kind of **Unicode** space character).
The separating space characters are not included in the resulting token list.
"""
return [token for token in space_tokenizer.split(sentence) if token]
@_matches(r'(%s+)' % ALNUM)
def symbol_tokenizer(sentence):
"""
    The symbol tokenizer extends the :func:`space_tokenizer` by separating alphanumerics.
Separates alphanumeric Unicode character sequences in already space-split tokens.
"""
return [token for span in space_tokenizer(sentence) for
token in symbol_tokenizer.split(span) if token]
@_matches(r"""((?:
# Dots, except ellipsis
{alnum} \. (?!\.\.)
| # Comma, surrounded by digits (e.g., chemicals) or letters
{alnum} , (?={alnum})
| # Colon, surrounded by digits (e.g., time, references)
{number} : (?={number})
| # Hyphen, surrounded by digits (e.g., DNA endings: "5'-ACGT-3'") or letters
{alnum} {apo}? {hyphen} (?={alnum}) # incl. optional apostrophe for DNA segments
| # Apostophes, non-consecutive
{apo} (?!{apo})
| # ASCII single quote, surrounded by digits or letters (no dangling allowed)
{alnum} ' (?={alnum})
| # ASCII single quote after an s and at the token's end
s ' $
| # Terminal dimensions (superscript minus, 1, 2, and 3) attached to physical units
# size-prefix unit-acronym dimension
\b [yzafpn\u00B5mcdhkMGTPEZY]? {letter}{{1,3}} {power} $
| # Atom counts (subscript numbers) and ionization states (optional superscript
# 2 or 3 followed by a + or -) are attached to valid fragments of a chemical formula
\b (?:[A-Z][a-z]?|[\)\]])+ {subdigit}+ (?:[\u00B2\u00B3]?[\u207A\u207B])?
| # Any (Unicode) letter, digit, or the underscore
{alnum}
)+)""".format(alnum=ALNUM, apo=APOSTROPHE, power=POWER, subdigit=SUBDIGIT,
hyphen=HYPHEN, letter=LETTER, number=NUMBER))
def tokenize_english(sentence):
"""
    A modified version of the segtok tokenizer: https://github.com/fnl/segtok
    This tokenizer extends the alphanumeric :func:`symbol_tokenizer`, splitting tokens in fewer cases:
1. Dots appearing after a letter are maintained as part of the word, except for the last word
in a sentence if that dot is the sentence terminal. Therefore, abbreviation marks (words
containing or ending in a ``.``, like "i.e.") remain intact and URL or ID segments remain
complete ("www.ex-ample.com", "EC1.2.3.4.5", etc.). The only dots that never are attached
are triple dots (``...``; ellipsis).
2. Commas surrounded by alphanumeric characters are maintained in the word, too, e.g. ``a,b``.
Colons surrounded by digits are maintained, e.g., 'at 12:30pm' or 'Isaiah 12:3'.
Commas, semi-colons, and colons dangling at the end of a token are always spliced off.
    3. Any two alphanumeric characters that are separated by a single hyphen are joined together;
Those "inner" hyphens may optionally be followed by a linebreak surrounded by spaces;
The spaces will be removed, however. For example, ``Hel- \\r\\n \t lo`` contains a (Windows)
linebreak and will be returned as ``Hel-lo``.
4. Apostrophes are always allowed in words as long as they are not repeated; The single quote
ASCII letter ``'`` is only allowed as a terminal apostrophe after the letter ``s``,
       otherwise it must be surrounded by letters. To support DNA and chemicals, an apostrophe
       (prime) may be located before the hyphen, as in the single token "5'-ACGT-3'" (provided a
       non-ASCII apostrophe is used instead of the shown single quote).
5. Superscript 1, 2, and 3, optionally prefixed with a superscript minus, are attached to a
word if it is no longer than 3 letters (optionally 4 if the first letter is a power prefix
in the range from yocto, y (10^-24) to yotta, Y (10^+24)).
6. Subscript digits are attached if prefixed with letters that look like a chemical formula.
"""
if not sentence:
return []
flat = not isinstance(sentence, list)
if flat:
sents = [sentence]
else:
sents = sentence
results = []
for sentence in sents:
pruned = HYPHENATED_LINEBREAK.sub(r'\1\2', sentence)
tokens = [token for span in space_tokenizer(pruned) for
token in tokenize_english.split(span) if token]
# splice the sentence terminal off the last word/token if it has any at its borders
# only look for the sentence terminal in the last three tokens
for idx, word in enumerate(reversed(tokens[-3:]), 1):
if (tokenize_english.match(word) and not APO_MATCHER.match(word)) or \
any(t in word for t in SENTENCE_TERMINALS):
last = len(word) - 1
if 0 == last or u'...' == word:
# any case of "..." or any single char (last == 0)
pass # leave the token as it is
elif any(word.rfind(t) == last for t in SENTENCE_TERMINALS):
# "stuff."
tokens[-idx] = word[:-1]
tokens.insert(len(tokens) - idx + 1, word[-1])
elif any(word.find(t) == 0 for t in SENTENCE_TERMINALS):
# ".stuff"
tokens[-idx] = word[0]
tokens.insert(len(tokens) - idx + 1, word[1:])
break
# keep splicing off any dangling commas and (semi-) colons
dirty = True
while dirty:
dirty = False
for idx, word in enumerate(reversed(tokens), 1):
while len(word) > 1 and word[-1] in u',;:':
char = word[-1] # the dangling comma/colon
word = word[:-1]
tokens[-idx] = word
tokens.insert(len(tokens) - idx + 1, char)
idx += 1
dirty = True
if dirty:
break # restart check to avoid index errors
# split concat words
chunks = []
for token in tokens:
t = MAP_CONCAT_WORD.get(token.lower(), None)
if t:
i = 0
for j in t:
chunks.append(token[i:j])
i = j
else:
chunks.append(token)
tokens = chunks
# split APOSTROPHE
chunks = []
for token in tokens:
m = RE_APOSTROPHE.search(token)
if m:
chunks.append(token[:m.start(1)])
chunks.append(token[m.start(1):m.end(1)])
if m.end(1) < len(token):
chunks.append(token[m.end(1):])
else:
chunks.append(token)
tokens = chunks
results.append(tokens)
return results[0] if flat else results
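# Illustrative examples:
#
#     >>> tokenize_english('Hello world.')
#     ['Hello', 'world', '.']
#     >>> tokenize_english("don't")
#     ['do', "n't"]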
================================================
FILE: hanlp/utils/lang/ja/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-05-13 13:24
================================================
FILE: hanlp/utils/lang/ja/bert_tok.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-05-13 13:24
from typing import Union, Optional
from transformers import BertTokenizerFast, TensorType, BatchEncoding, BertJapaneseTokenizer as _BertJapaneseTokenizer
from transformers.file_utils import PaddingStrategy
from transformers.tokenization_utils_base import TextInput, PreTokenizedInput, EncodedInput, TruncationStrategy
class BertJapaneseTokenizer(_BertJapaneseTokenizer):
# We may need to customize character level tokenization to handle English words and URLs
pass
class BertJapaneseTokenizerFast(BertTokenizerFast):
def encode_plus(
self,
text: Union[TextInput, PreTokenizedInput, EncodedInput],
text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = False,
max_length: Optional[int] = None,
stride: int = 0,
is_split_into_words: bool = False,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs
) -> BatchEncoding:
"""
Tokenize and prepare for the model a sequence or a pair of sequences.
.. warning::
This method is deprecated, ``__call__`` should be used instead.
Args:
text (:obj:`str`, :obj:`List[str]` or :obj:`List[int]` (the latter only for not-fast tokenizers)):
The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the
``tokenize`` method) or a list of integers (tokenized string ids using the ``convert_tokens_to_ids``
method).
text_pair (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`, `optional`):
Optional second sequence to be encoded. This can be a string, a list of strings (tokenized string using
the ``tokenize`` method) or a list of integers (tokenized string ids using the
``convert_tokens_to_ids`` method).
"""
text = list(text)
is_split_into_words = True
encoding = BertJapaneseTokenizer.encode_plus(self,
text,
text_pair,
add_special_tokens,
padding,
truncation,
max_length,
stride,
is_split_into_words,
pad_to_multiple_of,
return_tensors,
return_token_type_ids,
return_attention_mask,
return_overflowing_tokens,
return_special_tokens_mask,
return_offsets_mapping,
return_length,
verbose,
**kwargs
)
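        # With one character per "word", the returned offsets are word-relative
        # (0, 1) pairs; shifting the i-th pair by i restores character offsets
        # for the character tokens in the original text.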
offsets = encoding.encodings[0].offsets
fixed_offsets = [(b + i, e + i) for i, (b, e) in enumerate(offsets)]
# TODO: This doesn't work with rust tokenizers
encoding.encodings[0].offsets.clear()
encoding.encodings[0].offsets.extend(fixed_offsets)
return encoding
================================================
FILE: hanlp/utils/lang/zh/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-01-09 18:47
================================================
FILE: hanlp/utils/lang/zh/char_table.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-01-09 19:07
from typing import List
from hanlp.utils.io_util import get_resource
from hanlp_common.io import load_json
HANLP_CHAR_TABLE_TXT = 'https://file.hankcs.com/corpus/char_table.zip#CharTable.txt'
HANLP_CHAR_TABLE_JSON = 'https://file.hankcs.com/corpus/char_table.json.zip'
class CharTable:
convert = {}
@staticmethod
def convert_char(c):
if not CharTable.convert:
CharTable._init()
return CharTable.convert.get(c, c)
@staticmethod
def normalize_text(text: str) -> str:
return ''.join(CharTable.convert_char(c) for c in text)
@staticmethod
def normalize_chars(chars: List[str]) -> List[str]:
return [CharTable.convert_char(c) for c in chars]
@staticmethod
def _init():
CharTable.convert = CharTable.load()
@staticmethod
def load():
mapper = {}
with open(get_resource(HANLP_CHAR_TABLE_TXT), encoding='utf-8') as src:
for line in src:
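                # a valid line holds exactly 3 characters: <variant><separator><normalized>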
cells = line.rstrip('\n')
if len(cells) != 3:
continue
a, _, b = cells
mapper[a] = b
return mapper
class JsonCharTable(CharTable):
@staticmethod
def load():
return load_json(get_resource(HANLP_CHAR_TABLE_JSON))
================================================
FILE: hanlp/utils/lang/zh/localization.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-05 02:09
task = {
'dep': '依存句法树',
'token': '单词',
'pos': '词性',
'ner': '命名实体',
'srl': '语义角色'
}
pos = {
'VA': '表语形容词', 'VC': '系动词', 'VE': '动词有无', 'VV': '其他动词', 'NR': '专有名词', 'NT': '时间名词', 'NN': '其他名词',
'LC': '方位词', 'PN': '代词', 'DT': '限定词', 'CD': '概数词', 'OD': '序数词', 'M': '量词', 'AD': '副词', 'P': '介词',
'CC': '并列连接词', 'CS': '从属连词', 'DEC': '补语成分“的”', 'DEG': '属格“的”', 'DER': '表结果的“得”', 'DEV': '表方式的“地”',
'AS': '动态助词', 'SP': '句末助词', 'ETC': '表示省略', 'MSP': '其他小品词', 'IJ': '句首感叹词', 'ON': '象声词',
'LB': '长句式表被动', 'SB': '短句式表被动', 'BA': '把字句', 'JJ': '其他名词修饰语', 'FW': '外来语', 'PU': '标点符号',
'NOI': '噪声', 'URL': '网址'
}
ner = {
'NT': '机构团体', 'NS': '地名', 'NR': '人名'
}
dep = {
'nn': '复合名词修饰', 'punct': '标点符号', 'nsubj': '名词性主语', 'conj': '连接性状语', 'dobj': '直接宾语', 'advmod': '名词性状语',
'prep': '介词性修饰语', 'nummod': '数词修饰语', 'amod': '形容词修饰语', 'pobj': '介词性宾语', 'rcmod': '相关关系', 'cpm': '补语',
'assm': '关联标记', 'assmod': '关联修饰', 'cc': '并列关系', 'elf': '类别修饰', 'ccomp': '从句补充', 'det': '限定语', 'lobj': '时间介词',
'range': '数量词间接宾语', 'asp': '时态标记', 'tmod': '时间修饰语', 'plmod': '介词性地点修饰', 'attr': '属性', 'mmod': '情态动词',
'loc': '位置补语', 'top': '主题', 'pccomp': '介词补语', 'etc': '省略关系', 'lccomp': '位置补语', 'ordmod': '量词修饰',
'xsubj': '控制主语', 'neg': '否定修饰', 'rcomp': '结果补语', 'comod': '并列联合动词', 'vmod': '动词修饰', 'prtmod': '小品词',
'ba': '把字关系', 'dvpm': '地字修饰', 'dvpmod': '地字动词短语', 'prnmod': '插入词修饰', 'cop': '系动词', 'pass': '被动标记',
'nsubjpass': '被动名词主语', 'clf': '类别修饰', 'dep': '依赖关系', 'root': '核心关系'
}
================================================
FILE: hanlp/utils/log_util.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-08-24 22:12
import datetime
import io
import logging
import os
import sys
from logging import LogRecord
import termcolor
from hanlp_common.constant import IPYTHON
class ColoredFormatter(logging.Formatter):
def __init__(self, fmt=None, datefmt=None, style='%', enable=True):
super().__init__(fmt, datefmt, style)
self.enable = enable
def formatMessage(self, record: LogRecord) -> str:
message = super().formatMessage(record)
if self.enable:
return color_format(message)
else:
return remove_color_tag(message)
def init_logger(name=None, root_dir=None, level=logging.INFO, mode='w',
fmt="%(asctime)s %(levelname)s %(message)s",
datefmt='%Y-%m-%d %H:%M:%S') -> logging.Logger:
if not name:
name = datetime.datetime.now().strftime("%Y-%m-%d_%H.%M.%S")
rootLogger = logging.getLogger(os.path.join(root_dir, name) if root_dir else name)
rootLogger.propagate = False
consoleHandler = logging.StreamHandler(sys.stdout) # stderr will be rendered as red which is bad
consoleHandler.setFormatter(ColoredFormatter(fmt, datefmt=datefmt))
attached_to_std = False
for handler in rootLogger.handlers:
if isinstance(handler, logging.StreamHandler):
if handler.stream == sys.stderr or handler.stream == sys.stdout:
attached_to_std = True
break
if not attached_to_std:
rootLogger.addHandler(consoleHandler)
rootLogger.setLevel(level)
consoleHandler.setLevel(level)
if root_dir:
os.makedirs(root_dir, exist_ok=True)
log_path = "{0}/{1}.log".format(root_dir, name)
fileHandler = logging.FileHandler(log_path, mode=mode)
fileHandler.setFormatter(ColoredFormatter(fmt, datefmt=datefmt, enable=False))
rootLogger.addHandler(fileHandler)
fileHandler.setLevel(level)
return rootLogger
logger = init_logger(name='hanlp', level=os.environ.get('HANLP_LOG_LEVEL', 'INFO'))
def enable_debug(debug=True):
logger.setLevel(logging.DEBUG if debug else logging.ERROR)
class ErasablePrinter(object):
def __init__(self, out=sys.stderr):
self._last_print_width = 0
self.out = out
def erase(self):
if self._last_print_width:
if IPYTHON:
self.out.write("\r")
self.out.write(" " * self._last_print_width)
else:
self.out.write("\b" * self._last_print_width)
self.out.write(" " * self._last_print_width)
self.out.write("\b" * self._last_print_width)
self.out.write("\r") # \r is essential when multi-lines were printed
self._last_print_width = 0
def print(self, msg: str, color=True):
self.erase()
if color:
if IPYTHON:
msg, _len = color_format_len(msg)
_len = len(msg)
else:
msg, _len = color_format_len(msg)
self._last_print_width = _len
else:
self._last_print_width = len(msg)
self.out.write(msg)
self.out.flush()
_printer = ErasablePrinter()
def flash(line: str, color=True):
_printer.print(line, color)
def color_format(msg: str):
for tag in termcolor.COLORS, termcolor.HIGHLIGHTS, termcolor.ATTRIBUTES:
for c, v in tag.items():
start, end = f'[{c}]', f'[/{c}]'
msg = msg.replace(start, '\033[%dm' % v).replace(end, termcolor.RESET)
return msg
def remove_color_tag(msg: str):
for tag in termcolor.COLORS, termcolor.HIGHLIGHTS, termcolor.ATTRIBUTES:
for c, v in tag.items():
start, end = f'[{c}]', f'[/{c}]'
msg = msg.replace(start, '').replace(end, '')
return msg
def color_format_len(msg: str):
_len = len(msg)
for tag in termcolor.COLORS, termcolor.HIGHLIGHTS, termcolor.ATTRIBUTES:
for c, v in tag.items():
start, end = f'[{c}]', f'[/{c}]'
msg, delta = _replace_color_offset(msg, start, '\033[%dm' % v)
_len -= delta
msg, delta = _replace_color_offset(msg, end, termcolor.RESET)
_len -= delta
return msg, _len
def _replace_color_offset(msg: str, color: str, ctrl: str):
chunks = msg.split(color)
delta = (len(chunks) - 1) * len(color)
return ctrl.join(chunks), delta
def cprint(*args, file=None, **kwargs):
out = io.StringIO()
print(*args, file=out, **kwargs)
text = out.getvalue()
out.close()
c_text = color_format(text)
print(c_text, end='', file=file)
def main():
# cprint('[blink][yellow]...[/yellow][/blink]')
# show_colors_and_formats()
show_colors()
# print('previous', end='')
# for i in range(10):
# flash(f'[red]{i}[/red]')
def show_colors_and_formats():
msg = ''
for c in termcolor.COLORS.keys():
for h in termcolor.HIGHLIGHTS.keys():
for a in termcolor.ATTRIBUTES.keys():
msg += f'[{c}][{h}][{a}] {c}+{h}+{a} [/{a}][/{h}][/{c}]'
logger.info(msg)
def show_colors():
msg = ''
for c in termcolor.COLORS.keys():
cprint(f'[{c}]"{c}",[/{c}]')
if __name__ == '__main__':
main()
================================================
FILE: hanlp/utils/rules.py
================================================
import re
_SEPARATOR = r'@'
_RE_SENTENCE = re.compile(r'(\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$)', re.UNICODE)
_AB_SENIOR = re.compile(r'([A-Z][a-z]{1,2}\.)\s(\w)', re.UNICODE)
_AB_ACRONYM = re.compile(r'(\.[a-zA-Z]\.)\s(\w)', re.UNICODE)
_UNDO_AB_SENIOR = re.compile(r'([A-Z][a-z]{1,2}\.)' + _SEPARATOR + r'(\w)', re.UNICODE)
_UNDO_AB_ACRONYM = re.compile(r'(\.[a-zA-Z]\.)' + _SEPARATOR + r'(\w)', re.UNICODE)
def _replace_with_separator(text, separator, regexs):
replacement = r"\1" + separator + r"\2"
result = text
for regex in regexs:
result = regex.sub(replacement, result)
return result
def split_sentence(text, best=True):
text = re.sub(r'([。!??])([^”’])', r"\1\n\2", text)
text = re.sub(r'(\.{6})([^”’])', r"\1\n\2", text)
text = re.sub(r'(…{2})([^”’])', r"\1\n\2", text)
text = re.sub(r'([。!??][”’])([^,。!??])', r'\1\n\2', text)
for chunk in text.split("\n"):
chunk = chunk.strip()
if not chunk:
continue
if not best:
yield chunk
continue
processed = _replace_with_separator(chunk, _SEPARATOR, [_AB_SENIOR, _AB_ACRONYM])
sents = list(_RE_SENTENCE.finditer(processed))
if not sents:
yield chunk
continue
for sentence in sents:
sentence = _replace_with_separator(sentence.group(), r" ", [_UNDO_AB_SENIOR, _UNDO_AB_ACRONYM])
yield sentence
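# Illustrative example:
#
#     >>> list(split_sentence('第一句。第二句。'))
#     ['第一句。', '第二句。']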
================================================
FILE: hanlp/utils/span_util.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-06-12 20:34
import warnings
from typing import Dict, List, Tuple, Callable, Set, Optional
def generate_words_per_line(file_path):
with open(file_path, encoding='utf-8') as src:
for line in src:
cells = line.strip().split()
if not cells:
continue
yield cells
def words_to_bmes(words):
tags = []
for w in words:
if not w:
raise ValueError('{} contains None or zero-length word {}'.format(str(words), w))
if len(w) == 1:
tags.append('S')
else:
tags.extend(['B'] + ['M'] * (len(w) - 2) + ['E'])
return tags
def words_to_bi(words):
tags = []
for w in words:
if not w:
raise ValueError('{} contains None or zero-length word {}'.format(str(words), w))
tags.extend(['B'] + ['I'] * (len(w) - 1))
return tags
def bmes_to_words(chars, tags):
result = []
if len(chars) == 0:
return result
word = chars[0]
for c, t in zip(chars[1:], tags[1:]):
if t == 'B' or t == 'S':
result.append(word)
word = ''
word += c
if len(word) != 0:
result.append(word)
return result
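# Illustrative round trip:
#
#     >>> words_to_bmes(['商品', '和', '服务'])
#     ['B', 'E', 'S', 'B', 'E']
#     >>> bmes_to_words(list('商品和服务'), ['B', 'E', 'S', 'B', 'E'])
#     ['商品', '和', '服务']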
def bmes_to_spans(tags):
result = []
offset = 0
pre_offset = 0
for t in tags[1:]:
offset += 1
if t == 'B' or t == 'S':
result.append((pre_offset, offset))
pre_offset = offset
if offset != len(tags):
result.append((pre_offset, len(tags)))
return result
def bmes_of(sentence, segmented):
if segmented:
chars = []
tags = []
words = sentence.split()
for w in words:
chars.extend(list(w))
if len(w) == 1:
tags.append('S')
else:
tags.extend(['B'] + ['M'] * (len(w) - 2) + ['E'])
else:
chars = list(sentence)
tags = ['S'] * len(chars)
return chars, tags
def iobes_to_bilou(src, dst):
    with open(src, encoding='utf-8') as src, open(dst, 'w', encoding='utf-8') as out:
for line in src:
line = line.strip()
if not line:
out.write('\n')
continue
word, tag = line.split('\t')
if tag.startswith('E-'):
tag = 'L-' + tag[2:]
elif tag.startswith('S-'):
tag = 'U-' + tag[2:]
out.write(f'{word}\t{tag}\n')
def allowed_transitions(constraint_type: str, labels: Dict[int, str]) -> List[Tuple[int, int]]:
"""
Given labels and a constraint type, returns the allowed transitions. It will
additionally include transitions for the start and end states, which are used
by the conditional random field.
# Parameters
constraint_type : `str`, required
Indicates which constraint to apply. Current choices are
"BIO", "IOB1", "BIOUL", and "BMES".
labels : `Dict[int, str]`, required
A mapping {label_id -> label}. Most commonly this would be the value from
Vocabulary.get_index_to_token_vocabulary()
# Returns
`List[Tuple[int, int]]`
The allowed transitions (from_label_id, to_label_id).
"""
num_labels = len(labels)
start_tag = num_labels
end_tag = num_labels + 1
labels_with_boundaries = list(labels.items()) + [(start_tag, "START"), (end_tag, "END")]
allowed = []
for from_label_index, from_label in labels_with_boundaries:
if from_label in ("START", "END"):
from_tag = from_label
from_entity = ""
else:
from_tag = from_label[0]
from_entity = from_label[1:]
for to_label_index, to_label in labels_with_boundaries:
if to_label in ("START", "END"):
to_tag = to_label
to_entity = ""
else:
to_tag = to_label[0]
to_entity = to_label[1:]
if is_transition_allowed(constraint_type, from_tag, from_entity, to_tag, to_entity):
allowed.append((from_label_index, to_label_index))
return allowed
def is_transition_allowed(
constraint_type: str, from_tag: str, from_entity: str, to_tag: str, to_entity: str
):
"""
Given a constraint type and strings `from_tag` and `to_tag` that
represent the origin and destination of the transition, return whether
the transition is allowed under the given constraint type.
# Parameters
constraint_type : `str`, required
Indicates which constraint to apply. Current choices are
"BIO", "IOB1", "BIOUL", and "BMES".
from_tag : `str`, required
The tag that the transition originates from. For example, if the
label is `I-PER`, the `from_tag` is `I`.
from_entity : `str`, required
The entity corresponding to the `from_tag`. For example, if the
label is `I-PER`, the `from_entity` is `PER`.
to_tag : `str`, required
The tag that the transition leads to. For example, if the
label is `I-PER`, the `to_tag` is `I`.
to_entity : `str`, required
The entity corresponding to the `to_tag`. For example, if the
label is `I-PER`, the `to_entity` is `PER`.
# Returns
`bool`
Whether the transition is allowed under the given `constraint_type`.
"""
if to_tag == "START" or from_tag == "END":
# Cannot transition into START or from END
return False
if constraint_type == "BIOUL":
if from_tag == "START":
return to_tag in ("O", "B", "U")
if to_tag == "END":
return from_tag in ("O", "L", "U")
return any(
[
# O can transition to O, B-* or U-*
# L-x can transition to O, B-*, or U-*
# U-x can transition to O, B-*, or U-*
from_tag in ("O", "L", "U") and to_tag in ("O", "B", "U"),
# B-x can only transition to I-x or L-x
# I-x can only transition to I-x or L-x
from_tag in ("B", "I") and to_tag in ("I", "L") and from_entity == to_entity,
]
)
elif constraint_type == "BIO":
if from_tag == "START":
return to_tag in ("O", "B")
if to_tag == "END":
return from_tag in ("O", "B", "I")
return any(
[
# Can always transition to O or B-x
to_tag in ("O", "B"),
# Can only transition to I-x from B-x or I-x
to_tag == "I" and from_tag in ("B", "I") and from_entity == to_entity,
]
)
elif constraint_type == "IOB1":
if from_tag == "START":
return to_tag in ("O", "I")
if to_tag == "END":
return from_tag in ("O", "B", "I")
return any(
[
# Can always transition to O or I-x
to_tag in ("O", "I"),
# Can only transition to B-x from B-x or I-x, where
# x is the same tag.
to_tag == "B" and from_tag in ("B", "I") and from_entity == to_entity,
]
)
elif constraint_type == "BMES":
if from_tag == "START":
return to_tag in ("B", "S")
if to_tag == "END":
return from_tag in ("E", "S")
return any(
[
# Can only transition to B or S from E or S.
to_tag in ("B", "S") and from_tag in ("E", "S"),
# Can only transition to M-x from B-x, where
# x is the same tag.
to_tag == "M" and from_tag in ("B", "M") and from_entity == to_entity,
# Can only transition to E-x from B-x or M-x, where
# x is the same tag.
to_tag == "E" and from_tag in ("B", "M") and from_entity == to_entity,
]
)
else:
raise ValueError(f"Unknown constraint type: {constraint_type}")
TypedSpan = Tuple[int, Tuple[int, int]]
TypedStringSpan = Tuple[str, Tuple[int, int]]
class InvalidTagSequence(Exception):
def __init__(self, tag_sequence=None):
super().__init__()
self.tag_sequence = tag_sequence
def __str__(self):
return " ".join(self.tag_sequence)
T = str
def enumerate_spans(
sentence: List[T],
offset: int = 0,
max_span_width: int = None,
min_span_width: int = 1,
filter_function: Callable[[List[T]], bool] = None,
) -> List[Tuple[int, int]]:
"""
Given a sentence, return all token spans within the sentence. Spans are `inclusive`.
Additionally, you can provide a maximum and minimum span width, which will be used
to exclude spans outside of this range.
Finally, you can provide a function mapping `List[T] -> bool`, which will
be applied to every span to decide whether that span should be included. This
allows filtering by length, regex matches, pos tags or any Spacy `Token`
attributes, for example.
# Parameters
sentence : `List[T]`, required.
The sentence to generate spans for. The type is generic, as this function
can be used with strings, or Spacy `Tokens` or other sequences.
offset : `int`, optional (default = `0`)
A numeric offset to add to all span start and end indices. This is helpful
if the sentence is part of a larger structure, such as a document, which
the indices need to respect.
max_span_width : `int`, optional (default = `None`)
The maximum length of spans which should be included. Defaults to len(sentence).
min_span_width : `int`, optional (default = `1`)
The minimum length of spans which should be included. Defaults to 1.
filter_function : `Callable[[List[T]], bool]`, optional (default = `None`)
A function mapping sequences of the passed type T to a boolean value.
If `True`, the span is included in the returned spans from the
        sentence, otherwise it is excluded.
"""
max_span_width = max_span_width or len(sentence)
filter_function = filter_function or (lambda x: True)
spans: List[Tuple[int, int]] = []
for start_index in range(len(sentence)):
last_end_index = min(start_index + max_span_width, len(sentence))
first_end_index = min(start_index + min_span_width - 1, len(sentence))
for end_index in range(first_end_index, last_end_index):
start = offset + start_index
end = offset + end_index
# add 1 to end index because span indices are inclusive.
if filter_function(sentence[slice(start_index, end_index + 1)]):
spans.append((start, end))
return spans
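# Illustrative example (spans are inclusive):
#
#     >>> enumerate_spans(['dogs', 'bark', 'loudly'], max_span_width=2)
#     [(0, 0), (0, 1), (1, 1), (1, 2), (2, 2)]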
def bio_tags_to_spans(
tag_sequence: List[str], classes_to_ignore: List[str] = None
) -> List[TypedStringSpan]:
"""
Given a sequence corresponding to BIO tags, extracts spans.
Spans are inclusive and can be of zero length, representing a single word span.
Ill-formed spans are also included (i.e those which do not start with a "B-LABEL"),
as otherwise it is possible to get a perfect precision score whilst still predicting
ill-formed spans in addition to the correct spans. This function works properly when
the spans are unlabeled (i.e., your labels are simply "B", "I", and "O").
# Parameters
tag_sequence : `List[str]`, required.
The integer class labels for a sequence.
classes_to_ignore : `List[str]`, optional (default = `None`).
A list of string class labels `excluding` the bio tag
which should be ignored when extracting spans.
# Returns
spans : `List[TypedStringSpan]`
The typed, extracted spans from the sequence, in the format (label, (span_start, span_end)).
Note that the label `does not` contain any BIO tag prefixes.
"""
classes_to_ignore = classes_to_ignore or []
spans: Set[Tuple[str, Tuple[int, int]]] = set()
span_start = 0
span_end = 0
active_conll_tag = None
for index, string_tag in enumerate(tag_sequence):
# Actual BIO tag.
bio_tag = string_tag[0]
if bio_tag not in ["B", "I", "O"]:
raise InvalidTagSequence(tag_sequence)
conll_tag = string_tag[2:]
if bio_tag == "O" or conll_tag in classes_to_ignore:
# The span has ended.
if active_conll_tag is not None:
spans.add((active_conll_tag, (span_start, span_end)))
active_conll_tag = None
# We don't care about tags we are
# told to ignore, so we do nothing.
continue
elif bio_tag == "B":
# We are entering a new span; reset indices
# and active tag to new span.
if active_conll_tag is not None:
spans.add((active_conll_tag, (span_start, span_end)))
active_conll_tag = conll_tag
span_start = index
span_end = index
elif bio_tag == "I" and conll_tag == active_conll_tag:
# We're inside a span.
span_end += 1
else:
            # This is the case where the bio label is an "I", but either:
# 1) the span hasn't started - i.e. an ill formed span.
# 2) The span is an I tag for a different conll annotation.
# We'll process the previous span if it exists, but also
# include this span. This is important, because otherwise,
# a model may get a perfect F1 score whilst still including
# false positive ill-formed spans.
if active_conll_tag is not None:
spans.add((active_conll_tag, (span_start, span_end)))
active_conll_tag = conll_tag
span_start = index
span_end = index
# Last token might have been a part of a valid span.
if active_conll_tag is not None:
spans.add((active_conll_tag, (span_start, span_end)))
return list(spans)
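# Illustrative example (the result comes from a set, hence the sort):
#
#     >>> sorted(bio_tags_to_spans(['B-PER', 'I-PER', 'O', 'B-LOC']))
#     [('LOC', (3, 3)), ('PER', (0, 1))]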
def iob1_tags_to_spans(
tag_sequence: List[str], classes_to_ignore: List[str] = None
) -> List[TypedStringSpan]:
"""
Given a sequence corresponding to IOB1 tags, extracts spans.
Spans are inclusive and can be of zero length, representing a single word span.
Ill-formed spans are also included (i.e., those where "B-LABEL" is not preceded
by "I-LABEL" or "B-LABEL").
# Parameters
tag_sequence : `List[str]`, required.
The integer class labels for a sequence.
classes_to_ignore : `List[str]`, optional (default = `None`).
A list of string class labels `excluding` the bio tag
which should be ignored when extracting spans.
# Returns
spans : `List[TypedStringSpan]`
The typed, extracted spans from the sequence, in the format (label, (span_start, span_end)).
Note that the label `does not` contain any BIO tag prefixes.
"""
classes_to_ignore = classes_to_ignore or []
spans: Set[Tuple[str, Tuple[int, int]]] = set()
span_start = 0
span_end = 0
active_conll_tag = None
prev_bio_tag = None
prev_conll_tag = None
for index, string_tag in enumerate(tag_sequence):
curr_bio_tag = string_tag[0]
curr_conll_tag = string_tag[2:]
if curr_bio_tag not in ["B", "I", "O"]:
raise InvalidTagSequence(tag_sequence)
if curr_bio_tag == "O" or curr_conll_tag in classes_to_ignore:
# The span has ended.
if active_conll_tag is not None:
spans.add((active_conll_tag, (span_start, span_end)))
active_conll_tag = None
elif _iob1_start_of_chunk(prev_bio_tag, prev_conll_tag, curr_bio_tag, curr_conll_tag):
# We are entering a new span; reset indices
# and active tag to new span.
if active_conll_tag is not None:
spans.add((active_conll_tag, (span_start, span_end)))
active_conll_tag = curr_conll_tag
span_start = index
span_end = index
else:
# bio_tag == "I" and curr_conll_tag == active_conll_tag
# We're continuing a span.
span_end += 1
prev_bio_tag = string_tag[0]
prev_conll_tag = string_tag[2:]
# Last token might have been a part of a valid span.
if active_conll_tag is not None:
spans.add((active_conll_tag, (span_start, span_end)))
return list(spans)
def _iob1_start_of_chunk(
prev_bio_tag: Optional[str],
prev_conll_tag: Optional[str],
curr_bio_tag: str,
curr_conll_tag: str,
) -> bool:
if curr_bio_tag == "B":
return True
if curr_bio_tag == "I" and prev_bio_tag == "O":
return True
if curr_bio_tag != "O" and prev_conll_tag != curr_conll_tag:
return True
return False
def bioul_tags_to_spans(
tag_sequence: List[str], classes_to_ignore: List[str] = None
) -> List[TypedStringSpan]:
"""
Given a sequence corresponding to BIOUL tags, extracts spans.
Spans are inclusive and can be of zero length, representing a single word span.
Ill-formed spans are not allowed and will raise `InvalidTagSequence`.
This function works properly when the spans are unlabeled (i.e., your labels are
simply "B", "I", "O", "U", and "L").
# Parameters
tag_sequence : `List[str]`, required.
The tag sequence encoded in BIOUL, e.g. ["B-PER", "L-PER", "O"].
classes_to_ignore : `List[str]`, optional (default = `None`).
A list of string class labels `excluding` the bio tag
which should be ignored when extracting spans.
# Returns
spans : `List[TypedStringSpan]`
The typed, extracted spans from the sequence, in the format (label, (span_start, span_end)).
"""
spans = []
classes_to_ignore = classes_to_ignore or []
index = 0
while index < len(tag_sequence):
label = tag_sequence[index]
if label[0] == "U":
spans.append((label.partition("-")[2], (index, index)))
elif label[0] == "B":
start = index
while label[0] != "L":
index += 1
if index >= len(tag_sequence):
raise InvalidTagSequence(tag_sequence)
label = tag_sequence[index]
if not (label[0] == "I" or label[0] == "L"):
raise InvalidTagSequence(tag_sequence)
spans.append((label.partition("-")[2], (start, index)))
else:
if label != "O":
raise InvalidTagSequence(tag_sequence)
index += 1
return [span for span in spans if span[0] not in classes_to_ignore]
def iobes_tags_to_spans(
tag_sequence: List[str], classes_to_ignore: List[str] = None
) -> List[TypedStringSpan]:
"""
    Given a sequence corresponding to IOBES tags, extracts spans.
    Spans are inclusive and can be of zero length, representing a single word span.
    Ill-formed spans are not allowed and will raise `InvalidTagSequence`.
    This function works properly when the spans are unlabeled (i.e., your labels are
    simply "B", "I", "O", "E", and "S").
    # Parameters
    tag_sequence : `List[str]`, required.
        The tag sequence encoded in IOBES, e.g. ["B-PER", "E-PER", "O"].
classes_to_ignore : `List[str]`, optional (default = `None`).
A list of string class labels `excluding` the bio tag
which should be ignored when extracting spans.
# Returns
spans : `List[TypedStringSpan]`
The typed, extracted spans from the sequence, in the format (label, (span_start, span_end)).
"""
spans = []
classes_to_ignore = classes_to_ignore or []
index = 0
while index < len(tag_sequence):
label = tag_sequence[index]
if label[0] == "S":
spans.append((label.partition("-")[2], (index, index)))
elif label[0] == "B":
start = index
while label[0] != "E":
index += 1
if index >= len(tag_sequence):
raise InvalidTagSequence(tag_sequence)
label = tag_sequence[index]
if not (label[0] == "I" or label[0] == "E"):
raise InvalidTagSequence(tag_sequence)
spans.append((label.partition("-")[2], (start, index)))
else:
if label != "O":
raise InvalidTagSequence(tag_sequence)
index += 1
return [span for span in spans if span[0] not in classes_to_ignore]
def iob1_to_bioul(tag_sequence: List[str]) -> List[str]:
warnings.warn(
"iob1_to_bioul has been replaced with 'to_bioul' to allow more encoding options.",
FutureWarning,
)
return to_bioul(tag_sequence)
def to_bioul(tag_sequence: List[str], encoding: str = "IOB1") -> List[str]:
"""
Given a tag sequence encoded with IOB1 labels, recode to BIOUL.
In the IOB1 scheme, I is a token inside a span, O is a token outside
a span and B is the beginning of span immediately following another
span of the same type.
In the BIO scheme, I is a token inside a span, O is a token outside
a span and B is the beginning of a span.
# Parameters
tag_sequence : `List[str]`, required.
The tag sequence encoded in IOB1, e.g. ["I-PER", "I-PER", "O"].
encoding : `str`, optional, (default = `"IOB1"`).
The encoding type to convert from. Must be either "IOB1" or "BIO".
# Returns
bioul_sequence : `List[str]`
The tag sequence encoded in BIOUL, e.g. ["B-PER", "L-PER", "O"].
"""
if encoding not in {"IOB1", "BIO"}:
raise ValueError(f"Invalid encoding {encoding} passed to 'to_bioul'.")
def replace_label(full_label, new_label):
# example: full_label = 'I-PER', new_label = 'U', returns 'U-PER'
parts = list(full_label.partition("-"))
parts[0] = new_label
return "".join(parts)
def pop_replace_append(in_stack, out_stack, new_label):
# pop the last element from in_stack, replace the label, append
# to out_stack
tag = in_stack.pop()
new_tag = replace_label(tag, new_label)
out_stack.append(new_tag)
def process_stack(stack, out_stack):
# process a stack of labels, add them to out_stack
if len(stack) == 1:
# just a U token
pop_replace_append(stack, out_stack, "U")
else:
# need to code as BIL
recoded_stack = []
pop_replace_append(stack, recoded_stack, "L")
while len(stack) >= 2:
pop_replace_append(stack, recoded_stack, "I")
pop_replace_append(stack, recoded_stack, "B")
recoded_stack.reverse()
out_stack.extend(recoded_stack)
# Process the tag_sequence one tag at a time, adding spans to a stack,
# then recode them.
bioul_sequence = []
stack: List[str] = []
for label in tag_sequence:
# need to make a dict like
# token = {'token': 'Matt', "labels": {'conll2003': "B-PER"}
# 'gold': 'I-PER'}
# where 'gold' is the raw value from the CoNLL data set
if label == "O" and len(stack) == 0:
bioul_sequence.append(label)
elif label == "O" and len(stack) > 0:
# need to process the entries on the stack plus this one
process_stack(stack, bioul_sequence)
bioul_sequence.append(label)
elif label[0] == "I":
# check if the previous type is the same as this one
# if it is then append to stack
# otherwise this starts a new entity if the type
# is different
if len(stack) == 0:
if encoding == "BIO":
raise InvalidTagSequence(tag_sequence)
stack.append(label)
else:
# check if the previous type is the same as this one
this_type = label.partition("-")[2]
prev_type = stack[-1].partition("-")[2]
if this_type == prev_type:
stack.append(label)
else:
if encoding == "BIO":
raise InvalidTagSequence(tag_sequence)
# a new entity
process_stack(stack, bioul_sequence)
stack.append(label)
elif label[0] == "B":
if len(stack) > 0:
process_stack(stack, bioul_sequence)
stack.append(label)
else:
raise InvalidTagSequence(tag_sequence)
# process the stack
if len(stack) > 0:
process_stack(stack, bioul_sequence)
return bioul_sequence
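# Illustrative examples for both supported encodings:
#   to_bioul(['I-PER', 'I-PER', 'O'])                  # IOB1 -> ['B-PER', 'L-PER', 'O']
#   to_bioul(['B-PER', 'I-PER', 'O'], encoding='BIO')  # BIO  -> ['B-PER', 'L-PER', 'O']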
def bmes_tags_to_spans(
tag_sequence: List[str], classes_to_ignore: List[str] = None
) -> List[TypedStringSpan]:
"""
Given a sequence corresponding to BMES tags, extracts spans.
Spans are inclusive and can be of zero length, representing a single word span.
Ill-formed spans are also included (i.e. those which do not start with a "B-LABEL"),
as otherwise it is possible to get a perfect precision score whilst still predicting
ill-formed spans in addition to the correct spans.
This function works properly when the spans are unlabeled (i.e., your labels are
simply "B", "M", "E" and "S").
# Parameters
tag_sequence : `List[str]`, required.
The tag sequence encoded in BMES, e.g. ["B-NN", "E-NN", "S-VV"].
classes_to_ignore : `List[str]`, optional (default = `None`).
A list of string class labels `excluding` the BMES tag
which should be ignored when extracting spans.
# Returns
spans : `List[TypedStringSpan]`
The typed, extracted spans from the sequence, in the format (label, (span_start, span_end)).
Note that the label `does not` contain any BMES tag prefixes.
"""
def extract_bmes_tag_label(text):
bmes_tag = text[0]
label = text[2:]
return bmes_tag, label
spans: List[Tuple[str, List[int]]] = []
prev_bmes_tag: Optional[str] = None
for index, tag in enumerate(tag_sequence):
bmes_tag, label = extract_bmes_tag_label(tag)
if bmes_tag in ("B", "S"):
# Regardless of tag, we start a new span when reaching B & S.
spans.append((label, [index, index]))
elif bmes_tag in ("M", "E") and prev_bmes_tag in ("B", "M") and spans[-1][0] == label:
# Only expand the span if
# 1. Valid transition: B/M -> M/E.
# 2. Matched label.
spans[-1][1][1] = index
else:
# Best effort split for invalid span.
spans.append((label, [index, index]))
# update previous BMES tag.
prev_bmes_tag = bmes_tag
classes_to_ignore = classes_to_ignore or []
return [
# to tuple.
(span[0], (span[1][0], span[1][1]))
for span in spans
if span[0] not in classes_to_ignore
]
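# Illustrative example (derived from the docstring above):
#   bmes_tags_to_spans(['B-NN', 'E-NN', 'S-VV'])
#   -> [('NN', (0, 1)), ('VV', (2, 2))]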
================================================
FILE: hanlp/utils/string_util.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-08-25 00:19
import unicodedata
from typing import List, Dict, Tuple
def format_scores(results: Dict[str, float]) -> str:
return ' - '.join(f'{k}: {v:.4f}' for (k, v) in results.items())
def ispunct(token):
return all(unicodedata.category(char).startswith('P')
for char in token)
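# Illustrative examples:
#   format_scores({'P': 0.5, 'R': 0.25})  # -> 'P: 0.5000 - R: 0.2500'
#   ispunct('。')   # -> True: every character falls in a Unicode punctuation category
#   ispunct('a.')  # -> False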
def split_long_sentence_into(tokens: List[str], max_seq_length, sent_delimiter=None, char_level=False,
hard_constraint=False):
punct_offset = [i for i, x in enumerate(tokens) if
((sent_delimiter and x in sent_delimiter) or (not sent_delimiter and ispunct(x)))]
if not punct_offset:
# treat every token as punct
punct_offset = [i for i in range(len(tokens))]
punct_offset += [len(tokens)]
token_to_char_offset = []
if char_level:
offset = 0
for token in tokens:
token_to_char_offset.append(offset)
offset += len(token)
token_to_char_offset.append(offset)
start = 0
for i, offset in enumerate(punct_offset[:-1]):
end = punct_offset[i + 1]
length_at_next_punct = _len(start, end, token_to_char_offset, char_level)
if length_at_next_punct >= max_seq_length:
if hard_constraint:
yield from _gen_short_sent(tokens, start, offset, max_seq_length, token_to_char_offset, char_level)
else:
yield tokens[start: offset + 1]
start = offset + 1
offset = punct_offset[-1]
if start < offset:
offset -= 1
length_at_next_punct = _len(start, offset, token_to_char_offset, char_level)
if length_at_next_punct >= max_seq_length and hard_constraint:
yield from _gen_short_sent(tokens, start, offset, max_seq_length, token_to_char_offset, char_level)
else:
yield tokens[start:]
def _gen_short_sent(tokens, start, offset, max_seq_length, token_to_char_offset, char_level):
while start <= offset:
for j in range(offset + 1, start, -1):
if _len(start, j, token_to_char_offset, char_level) <= max_seq_length or j == start + 1:
yield tokens[start: j]
start = j
break
def _len(start, end, token_to_char_offset, char_level):
if char_level:
length_at_next_punct = token_to_char_offset[end] - token_to_char_offset[start]
else:
length_at_next_punct = end - start
return length_at_next_punct
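# Illustrative example: split on punctuation so that each piece stays within max_seq_length:
#   list(split_long_sentence_into(['I', 'like', 'it', '.', 'Me', 'too', '.'], max_seq_length=4))
#   -> [['I', 'like', 'it', '.'], ['Me', 'too', '.']]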
def guess_delimiter(tokens):
if all(ord(c) < 128 for c in ''.join(tokens)):
delimiter_in_entity = ' '
else:
delimiter_in_entity = ''
return delimiter_in_entity
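# Illustrative example: ASCII-only tokens are joined with spaces, CJK tokens with the empty string:
#   guess_delimiter(['New', 'York'])  # -> ' '
#   guess_delimiter(['你', '好'])      # -> ''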
def split_long_sent(sent, delimiters, max_seq_length):
parts = []
offset = 0
for idx, char in enumerate(sent):
if char in delimiters:
parts.append(sent[offset:idx + 1])
offset = idx + 1
if not parts:
yield sent
return
short = []
for idx, part in enumerate(parts):
short += part
if idx == len(parts) - 1:
yield short
else:
if len(short) + len(parts[idx + 1]) > max_seq_length:
yield short
short = []
def possible_tokenization(text: str) -> List[Tuple[str]]:
"""Enumerate all possible tokenizations of a text.
Args:
text: A text.
Returns: All possible tokenizations.
"""
states = [((), ())]
for c in text:
new_states = []
for t, b in states:
# to split
new_states.append((t + (''.join(b + (c,)),), ()))
# not to split
new_states.append((t, b + (c,)))
states = new_states
return [t for t, b in states if not b]
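# Illustrative example (2 ** (len(text) - 1) tokenizations in total):
#   possible_tokenization('abc')
#   -> [('a', 'b', 'c'), ('a', 'bc'), ('ab', 'c'), ('abc',)]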
================================================
FILE: hanlp/utils/tf_util.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-08-27 01:27
import json
import logging
import os
import random
from typing import List
import numpy as np
from hanlp_common.constant import PAD
def set_gpu(idx=0):
"""Restrict TensorFlow to only use the GPU of idx
Args:
idx: (Default value = 0)
Returns:
"""
gpus = get_visible_gpus()
if gpus:
try:
tf.config.experimental.set_visible_devices(gpus[idx], 'GPU')
logical_devices = tf.config.experimental.list_logical_devices('GPU')
assert len(logical_devices) == 1
except RuntimeError as e:
# Virtual devices must be set before GPUs have been initialized
# print(e)
raise e
def get_visible_gpus():
gpus = tf.config.experimental.list_physical_devices('GPU')
return gpus
def set_gpu_memory_growth(growth=True):
gpus = get_visible_gpus()
if gpus:
try:
# Currently, memory growth needs to be the same across GPUs
for gpu in gpus:
tf.config.experimental.set_memory_growth(gpu, growth)
except RuntimeError as e:
# Memory growth must be set before GPUs have been initialized
# print(e)
raise e
def nice_gpu():
"""Use GPU nicely."""
set_gpu_memory_growth()
set_gpu()
def shut_up_python_logging():
logging.getLogger('tensorflow').setLevel(logging.ERROR)
import absl.logging
logging.root.removeHandler(absl.logging._absl_handler)
absl.logging._warn_preinit_stderr = False
def set_tf_loglevel(level=logging.ERROR):
if level >= logging.FATAL:
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ['TF_CPP_MIN_VLOG_LEVEL'] = '3'
elif level >= logging.ERROR:
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
os.environ['TF_CPP_MIN_VLOG_LEVEL'] = '2'
elif level >= logging.WARNING:
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'
os.environ['TF_CPP_MIN_VLOG_LEVEL'] = '1'
else:
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '0'
os.environ['TF_CPP_MIN_VLOG_LEVEL'] = '0'
shut_up_python_logging()
logging.getLogger('tensorflow').setLevel(level)
set_tf_loglevel()
shut_up_python_logging()
import tensorflow as tf
nice_gpu()
def size_of_dataset(dataset: tf.data.Dataset) -> int:
count = 0
for element in dataset.unbatch().batch(1):
count += 1
return count
def summary_of_model(model: tf.keras.Model):
"""https://stackoverflow.com/a/53668338/3730690
Args:
model: tf.keras.Model:
Returns:
"""
if not model.built:
return 'model structure unknown until calling fit() with some data'
line_list = []
model.summary(print_fn=lambda x: line_list.append(x))
summary = "\n".join(line_list)
return summary
def register_custom_cls(custom_cls, name=None):
if not name:
name = custom_cls.__name__
tf.keras.utils.get_custom_objects()[name] = custom_cls
def set_seed_tf(seed=233):
tf.random.set_seed(seed)
np.random.seed(seed)
random.seed(seed)
def nice():
nice_gpu()
set_seed_tf()
def hanlp_register(arg):
"""Registers a class with the Keras serialization framework.
Args:
arg:
Returns:
"""
class_name = arg.__name__
registered_name = 'HanLP' + '>' + class_name
# if tf_inspect.isclass(arg) and not hasattr(arg, 'get_config'):
# raise ValueError(
# 'Cannot register a class that does not have a get_config() method.')
tf.keras.utils.get_custom_objects()[registered_name] = arg
return arg
def tensor_is_eager(tensor: tf.Tensor):
return hasattr(tensor, 'numpy')
def copy_mask(src: tf.Tensor, dst: tf.Tensor):
mask = getattr(src, '_keras_mask', None)
if mask is not None:
dst._keras_mask = mask
return mask
def get_callback_by_class(callbacks: List[tf.keras.callbacks.Callback], cls) -> tf.keras.callbacks.Callback:
for callback in callbacks:
if isinstance(callback, cls):
return callback
def tf_bernoulli(shape, p, dtype=None):
return tf.keras.backend.random_binomial(shape, p, dtype)
def str_tensor_to_str(str_tensor: tf.Tensor) -> str:
return str_tensor.numpy().decode('utf-8')
def str_tensor_2d_to_list(str_tensor: tf.Tensor, pad=PAD) -> List[List[str]]:
l = []
for i in str_tensor:
sent = []
for j in i:
j = str_tensor_to_str(j)
if j == pad:
break
sent.append(j)
l.append(sent)
return l
def str_tensor_to_list(pred):
return [str_tensor_to_str(tag) for tag in pred]
def format_metrics(metrics: List[tf.keras.metrics.Metric]):
return ' - '.join(f'{m.name}: {m.result():.4f}' for m in metrics)
class NumpyEncoder(json.JSONEncoder):
def default(self, obj):
"""Special json encoder for numpy types
See https://interviewbubble.com/typeerror-object-of-type-float32-is-not-json-serializable/
Args:
obj: Object to be json encoded.
Returns:
Json string.
"""
if isinstance(obj, (np.int_, np.intc, np.intp, np.int8,
np.int16, np.int32, np.int64, np.uint8,
np.uint16, np.uint32, np.uint64)):
return int(obj)
elif isinstance(obj, (np.float_, np.float16, np.float32,
np.float64)):
return float(obj)
elif isinstance(obj, (np.ndarray,)):  # convert arrays to (nested) lists
return obj.tolist()
return json.JSONEncoder.default(self, obj)
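# Usage sketch: plain json.dumps() raises TypeError on numpy scalars and arrays;
# pass this encoder instead:
#   json.dumps({'f1': np.float32(0.5), 'support': np.arange(3)}, cls=NumpyEncoder)
#   -> '{"f1": 0.5, "support": [0, 1, 2]}'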
================================================
FILE: hanlp/utils/time_util.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-08-27 00:01
import datetime
import logging
import sys
import time
from typing import Union
from hanlp.utils.log_util import ErasablePrinter, color_format, color_format_len
def human_time_delta(days, hours, minutes, seconds, delimiter=' ') -> str:
units = locals().copy()
units.pop('delimiter')
non_zero = False
result = []
for key, val in sorted(units.items()):
append = False
if non_zero:
append = True
elif val:
non_zero = True
append = True
if append:
result.append('{} {}'.format(val, key[0]))
if not non_zero:
return '0 s'
return delimiter.join(result)
def seconds_to_time_delta(seconds):
seconds = round(seconds)
days = seconds // 86400
hours = seconds // 3600 % 24
minutes = seconds // 60 % 60
seconds = seconds % 60
return days, hours, minutes, seconds
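# Illustrative examples:
#   seconds_to_time_delta(90061)   # -> (1, 1, 1, 1)
#   human_time_delta(1, 1, 1, 1)   # -> '1 d 1 h 1 m 1 s'
#   human_time_delta(0, 0, 0, 30)  # -> '30 s'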
def report_time_delta(seconds, human=True):
days, hours, minutes, seconds = seconds_to_time_delta(seconds)
if human:
return human_time_delta(days, hours, minutes, seconds)
return days, hours, minutes, seconds
class HumanTimeDelta(object):
def __init__(self, delta_seconds) -> None:
super().__init__()
self.delta_seconds = delta_seconds
def report(self, human=True):
return report_time_delta(self.delta_seconds, human)
def __str__(self) -> str:
return self.report(human=True)
def __truediv__(self, scalar):
return HumanTimeDelta(self.delta_seconds / scalar)
class CountdownTimer(ErasablePrinter):
def __init__(self, total: int, out=sys.stdout) -> None:
super().__init__(out=out)
self.total = total
self.current = 0
self.start = time.time()
self.finished_in = None
self.last_log_time = 0
def update(self, n=1):
self.current += n
self.current = min(self.total, self.current)
if self.current == self.total:
self.finished_in = time.time() - self.start
@property
def ratio(self) -> str:
return f'{self.current}/{self.total}'
@property
def ratio_percentage(self) -> str:
return f'{self.current / self.total:.2%}'
@property
def eta(self) -> float:
elapsed = self.elapsed
if self.finished_in:
eta = 0
else:
eta = elapsed / max(self.current, 0.1) * (self.total - self.current)
return eta
@property
def elapsed(self) -> float:
if self.finished_in:
elapsed = self.finished_in
else:
elapsed = time.time() - self.start
return elapsed
@property
def elapsed_human(self) -> str:
return human_time_delta(*seconds_to_time_delta(self.elapsed))
@property
def elapsed_average(self) -> float:
return self.elapsed / self.current
@property
def elapsed_average_human(self) -> str:
return human_time_delta(*seconds_to_time_delta(self.elapsed_average))
@property
def eta_human(self) -> str:
return human_time_delta(*seconds_to_time_delta(self.eta))
@property
def total_time(self) -> float:
elapsed = self.elapsed
if self.finished_in:
t = self.finished_in
else:
t = elapsed / max(self.current, 1) * self.total
return t
@property
def total_time_human(self) -> str:
return human_time_delta(*seconds_to_time_delta(self.total_time))
def stop(self, total=None):
if not self.finished_in or total:
self.finished_in = time.time() - self.start
if not total:
self.total = self.current
else:
self.current = total
self.total = total
@property
def et_eta(self):
_ = self.elapsed
if self.finished_in:
return self.elapsed
else:
return self.eta
@property
def et_eta_human(self):
text = human_time_delta(*seconds_to_time_delta(self.et_eta))
if self.finished_in:
return f'ET: {text}'
else:
return f'ETA: {text}'
@property
def finished(self):
return self.total == self.current
def log(self, info=None, ratio_percentage=True, ratio=True, step=1, interval=0.5, erase=True,
logger: Union[logging.Logger, bool] = None, newline=False, ratio_width=None):
self.update(step)
now = time.time()
if now - self.last_log_time > interval or self.finished:
cells = []
if ratio_percentage:
cells.append(self.ratio_percentage)
if ratio:
ratio = self.ratio
if not ratio_width:
ratio_width = self.ratio_width
ratio = ratio.rjust(ratio_width)
cells.append(ratio)
cells += [info, self.et_eta_human]
cells = [x for x in cells if x]
msg = f'{" ".join(cells)}'
self.last_log_time = now
self.print(msg, newline, erase, logger)
@property
def ratio_width(self) -> int:
return len(f'{self.total}') * 2 + 1
def print(self, msg, newline=False, erase=True, logger=None):
self.erase()
msg_len = 0 if newline else len(msg)
if self.finished and logger:
sys.stdout.flush()
if isinstance(logger, logging.Logger):
logger.info(msg)
else:
msg, msg_len = color_format_len(msg)
sys.stdout.write(msg)
if newline:
sys.stdout.write('\n')
msg_len = 0
self._last_print_width = msg_len
if self.finished and not logger:
if erase:
self.erase()
else:
sys.stdout.write("\n")
self._last_print_width = 0
sys.stdout.flush()
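# A minimal usage sketch (batches and train_step are hypothetical):
#   timer = CountdownTimer(total=len(batches))
#   for batch in batches:
#       loss = train_step(batch)
#       timer.log(f'loss: {loss:.4f}')  # throttled printing of ratio, info and ETA
#   print(timer.elapsed_human)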
class Timer(object):
def __init__(self) -> None:
self.last = time.time()
def start(self):
self.last = time.time()
def stop(self) -> HumanTimeDelta:
now = time.time()
seconds = now - self.last
self.last = now
return HumanTimeDelta(seconds)
def now_human(year='y'):
now = datetime.datetime.now()
return now.strftime(f"%{year}-%m-%d %H:%M:%S")
def now_datetime():
return now_human('Y')
def now_filename(fmt="%y%m%d_%H%M%S"):
"""Generate filename using current datetime, in 20180102_030405 format
Args:
fmt: (Default value = "%y%m%d_%H%M%S")
Returns:
"""
now = datetime.datetime.now()
return now.strftime(fmt)
================================================
FILE: hanlp/utils/torch_util.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-05-09 15:52
import os
import random
import time
from typing import List, Union, Dict, Tuple
import numpy as np
import torch
from pynvml import nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo, nvmlInit, nvmlShutdown, nvmlDeviceGetCount
from torch import nn
from torch.nn.utils.rnn import pad_sequence
from hanlp.utils.io_util import get_resource, replace_ext, TimingFileIterator
from hanlp.utils.log_util import logger, flash
from hanlp_common.constant import HANLP_VERBOSE
from hanlp_common.io import load_pickle, save_pickle
def gpus_available() -> Dict[int, float]:
if not torch.cuda.is_available():
return dict()
try:
nvmlInit()
gpus = {}
visible_devices = os.environ.get('CUDA_VISIBLE_DEVICES', None)
if visible_devices is None:
visible_devices = list(range(nvmlDeviceGetCount()))
else:
visible_devices = {int(x.strip()) for x in visible_devices.split(',')}
for i, real_id in enumerate(visible_devices):
h = nvmlDeviceGetHandleByIndex(real_id)
info = nvmlDeviceGetMemoryInfo(h)
total = info.total
free = info.free
ratio = free / total
gpus[i] = ratio
# print(f'total : {info.total}')
# print(f'free : {info.free}')
# print(f'used : {info.used}')
# t = torch.cuda.get_device_properties(0).total_memory
# c = torch.cuda.memory_cached(0)
# a = torch.cuda.memory_allocated(0)
# print(t, c, a)
nvmlShutdown()
return dict(sorted(gpus.items(), key=lambda x: x[1], reverse=True))
except Exception as e:
logger.debug(f'Failed to get gpu info due to {e}')
return dict((i, 1.0) for i in range(torch.cuda.device_count()))
def cuda_devices(query=None) -> List[int]:
"""Decide which GPUs to use
Args:
query: (Default value = None)
Returns:
"""
if isinstance(query, list):
if len(query) == 0:
return [-1]
return query
if query is None:
query = gpus_available()
if not query:
return []
size, idx = max((v, k) for k, v in query.items())
# When multiple GPUs have the same amount of free memory, randomly pick one to avoid conflicts
gpus_with_same_size = [k for k, v in query.items() if v == size]
query = random.choice(gpus_with_same_size)
if isinstance(query, float):
gpus = gpus_available()
if not gpus:
return []
query = [k for k, v in gpus.items() if v > query]
elif isinstance(query, int):
query = [query]
return query
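# Query semantics, mirroring the branches above:
#   cuda_devices()       # None: the GPU with the most free memory, e.g. [0]
#   cuda_devices(0.5)    # float: every GPU with more than 50% free memory
#   cuda_devices(1)      # int: exactly GPU 1 -> [1]
#   cuda_devices([])     # empty list: fall back to CPU -> [-1]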
def pad_lists(sequences: List[List], dtype=torch.long, padding_value=0):
return pad_sequence([torch.tensor(x, dtype=dtype) for x in sequences], True, padding_value)
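# Illustrative example:
#   pad_lists([[1, 2, 3], [4]])
#   -> tensor([[1, 2, 3], [4, 0, 0]])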
def set_seed(seed=233, dont_care_speed=False):
"""Copied from https://github.com/huggingface/transformers/blob/7b75aa9fa55bee577e2c7403301ed31103125a35/src/transformers/trainer.py#L76
Args:
seed: (Default value = 233)
dont_care_speed: If True, ensures deterministic (reproducible) behavior at the possible cost of single-run speed.
Returns:
"""
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
# ^^ safe to call this function even if cuda is not available
torch.cuda.manual_seed_all(seed)
if dont_care_speed:
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
def batched_index_select(input, index, dim=1):
"""
Args:
input: B x * x ... x *
index: B x M
dim: (Default value = 1)
Returns:
"""
views = [input.shape[0]] + [1 if i != dim else -1 for i in range(1, len(input.shape))]
expanse = list(input.shape)
expanse[0] = -1
expanse[dim] = -1
index = index.view(views).expand(expanse)
return torch.gather(input, dim, index)
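# Shape sketch: input (B, T, H), index (B, M), dim=1 -> output (B, M, H),
# where output[b, m] == input[b, index[b, m]]:
#   x = torch.arange(24).view(2, 3, 4)
#   idx = torch.tensor([[0, 2], [1, 1]])
#   batched_index_select(x, idx).shape  # -> torch.Size([2, 2, 4])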
def truncated_normal_(tensor, mean=0, std=1):
size = tensor.shape
tmp = tensor.new_empty(size + (4,)).normal_()
valid = (tmp < 2) & (tmp > -2)
ind = valid.max(-1, keepdim=True)[1]
tensor.data.copy_(tmp.gather(-1, ind).squeeze(-1))
tensor.data.mul_(std).add_(mean)
return tensor
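# Illustrative example: in-place init with a truncated normal. Each element is drawn
# from N(0, 1) restricted to (-2, 2), then scaled by std and shifted by mean:
#   w = truncated_normal_(torch.empty(256, 128), std=0.02)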
def dtype_of(e: Union[int, bool, float]):
if isinstance(e, bool):
return torch.bool
if isinstance(e, int):
return torch.long
if isinstance(e, float):
return torch.float
raise ValueError(f'Unsupported type of {repr(e)}')
def mean_model(model: torch.nn.Module):
return float(torch.mean(torch.stack([torch.sum(p) for p in model.parameters() if p.requires_grad])))
def main():
start = time.time()
print(gpus_available())
print(time.time() - start)
# print(gpus_available())
# print(cuda_devices())
# print(cuda_devices(0.1))
if __name__ == '__main__':
main()
def clip_grad_norm(model: nn.Module, grad_norm, transformer: nn.Module = None, transformer_grad_norm=None):
if transformer_grad_norm is None:
if grad_norm is not None:
nn.utils.clip_grad_norm_(filter(lambda p: p.requires_grad, model.parameters()), grad_norm)
else:
is_transformer = []
non_transformer = []
transformer = set(transformer.parameters())
for p in model.parameters():
if not p.requires_grad:
continue
if p in transformer:
is_transformer.append(p)
else:
non_transformer.append(p)
nn.utils.clip_grad_norm_(non_transformer, grad_norm)
nn.utils.clip_grad_norm_(is_transformer, transformer_grad_norm)
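# Usage sketch (model.encoder standing in for the transformer module is hypothetical):
#   clip_grad_norm(model, grad_norm=5.0)  # clip all trainable parameters to one norm
#   clip_grad_norm(model, grad_norm=5.0, transformer=model.encoder, transformer_grad_norm=1.0)
#   # clips transformer and non-transformer parameters to separate norms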
def load_word2vec(path, delimiter=' ', cache=True) -> Tuple[Dict[str, np.ndarray], int]:
realpath = get_resource(path)
binpath = replace_ext(realpath, '.pkl')
if cache:
try:
flash('Loading word2vec from cache [blink][yellow]...[/yellow][/blink]')
word2vec, dim = load_pickle(binpath)
flash('')
return word2vec, dim
except IOError:
pass
dim = None
word2vec = dict()
f = TimingFileIterator(realpath)
for idx, line in enumerate(f):
f.log('Loading word2vec from text file [blink][yellow]...[/yellow][/blink]')
line = line.rstrip().split(delimiter)
if len(line) > 2:
if dim is None:
dim = len(line)
else:
if len(line) != dim:
logger.warning('{}#{} length mismatches with {}'.format(path, idx + 1, dim))
continue
word, vec = line[0], line[1:]
word2vec[word] = np.array(vec, dtype=np.float32)
dim -= 1
if cache:
flash('Caching word2vec [blink][yellow]...[/yellow][/blink]')
save_pickle((word2vec, dim), binpath)
flash('')
return word2vec, dim
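# Expected text format (illustrative): one embedding per line, the word first,
# then the vector components; an optional "count dim" header line is skipped
# because it has fewer than three fields:
#   the 0.418 0.24968 -0.41242
#   cat 0.70853 0.57088 -0.4716
# The returned dim counts vector components only (hence the dim -= 1 above).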
def load_word2vec_as_vocab_tensor(path, delimiter=' ', cache=True) -> Tuple[Dict[str, int], torch.Tensor]:
realpath = get_resource(path)
vocab_path = replace_ext(realpath, '.vocab')
matrix_path = replace_ext(realpath, '.pt')
if cache:
try:
if HANLP_VERBOSE:
flash('Loading vocab and matrix from cache [blink][yellow]...[/yellow][/blink]')
vocab = load_pickle(vocab_path)
matrix = torch.load(matrix_path, map_location='cpu')
if HANLP_VERBOSE:
flash('')
return vocab, matrix
except IOError:
pass
word2vec, dim = load_word2vec(path, delimiter, cache)
vocab = dict((k, i) for i, k in enumerate(word2vec.keys()))
matrix = torch.Tensor(np.stack(list(word2vec.values())))
if cache:
flash('Caching vocab and matrix [blink][yellow]...[/yellow][/blink]')
save_pickle(vocab, vocab_path)
torch.save(matrix, matrix_path)
flash('')
return vocab, matrix
def save_word2vec(word2vec: dict, filepath, delimiter=' '):
with open(filepath, 'w', encoding='utf-8') as out:
for w, v in word2vec.items():
out.write(f'{w}{delimiter}')
out.write(f'{delimiter.join(str(x) for x in v)}\n')
def lengths_to_mask(seq_len, max_len=None):
r"""
.. code-block::
>>> seq_len = torch.arange(2, 16)
>>> mask = lengths_to_mask(seq_len)
>>> print(mask.size())
torch.Size([14, 15])
>>> seq_len = np.arange(2, 16)
>>> mask = lengths_to_mask(seq_len)
>>> print(mask.shape)
(14, 15)
>>> seq_len = torch.arange(2, 16)
>>> mask = lengths_to_mask(seq_len, max_len=100)
>>> print(mask.size())
torch.Size([14, 100])
:param torch.LongTensor seq_len: (B,)
:param int max_len: max sequence length.
:return: torch.Tensor (B, max_len)
"""
assert seq_len.dim() == 1, f"seq_len can only have one dimension, got {seq_len.dim()}."
batch_size = seq_len.size(0)
max_len = int(max_len) if max_len else seq_len.max().long()
broad_cast_seq_len = torch.arange(max_len).expand(batch_size, -1).to(seq_len)
mask = broad_cast_seq_len.lt(seq_len.unsqueeze(1))
return mask
def activation_from_name(name: str):
return getattr(torch.nn, name)
def filter_state_dict_safely(model_state: dict, load_state: dict):
safe_state = dict()
for k, v in load_state.items():
model_v = model_state.get(k, None)
if model_v is not None and model_v.shape == v.shape:
safe_state[k] = v
return safe_state
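# Usage sketch for partial/lenient checkpoint loading ('ckpt.pt' is hypothetical):
#   load_state = torch.load('ckpt.pt', map_location='cpu')
#   model.load_state_dict(filter_state_dict_safely(model.state_dict(), load_state), strict=False)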
================================================
FILE: hanlp/version.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 19:26
__version__ = '2.1.0-beta.64'
"""HanLP version"""
class NotCompatible(Exception):
pass
================================================
FILE: plugins/README.md
================================================
# Plugins for HanLP
This directory contains modules shared across several individual packages, as well as non-core APIs.
If you plan to submit any plugins, please put them here too.
For developers, run the following setup:
```bash
pip install -e hanlp_trie
pip install -e hanlp_common
pip install -e hanlp_restful
```
================================================
FILE: plugins/hanlp_common/README.md
================================================
# Common utilities and structures for HanLP
[中文](https://github.com/hankcs/HanLP/tree/doc-zh) | [1.x](https://github.com/hankcs/HanLP/tree/1.x) | [forum](https://bbs.hankcs.com/) | [docker](https://github.com/WalterInSH/hanlp-jupyter-docker)
The multilingual NLP library for researchers and companies, built on PyTorch and TensorFlow 2.x, for advancing state-of-the-art deep learning techniques in both academia and industry. HanLP was designed from day one to be efficient, user-friendly and extendable. It comes with pretrained models for various human languages including English, Chinese and many others. Currently, HanLP 2.0 is in alpha stage with more killer features on the roadmap. Discussions are welcome on our [forum](https://bbs.hankcs.com/), while bug reports and feature requests are reserved for GitHub issues. For Java users, please check out the [1.x](https://github.com/hankcs/HanLP/tree/1.x) branch.
## Installation
```bash
pip install hanlp
```
## License
HanLP is licensed under **Apache License 2.0**. You can use HanLP in your commercial products for free. We would appreciate it if you add a link to HanLP on your website.
================================================
FILE: plugins/hanlp_common/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-16 22:20
================================================
FILE: plugins/hanlp_common/hanlp_common/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-16 22:21
================================================
FILE: plugins/hanlp_common/hanlp_common/amr.py
================================================
# MIT License
#
# Copyright (c) 2019 Sheng Zhang
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import json
import logging
import re
import traceback
from collections import Counter, defaultdict
from hanlp_common.io import eprint
try:
import networkx as nx
import penman
from penman import Triple
except ModuleNotFoundError:
traceback.print_exc()
eprint('AMR support requires the full version which can be installed via:\n'
'pip install hanlp_common[full]')
exit(1)
DEFAULT_PADDING_TOKEN = "@@PADDING@@"
DEFAULT_OOV_TOKEN = "@@UNKNOWN@@"
logger = logging.getLogger('amr')
# Disable inverting ':mod' relation.
penman.AMRCodec._inversions.pop('domain')
penman.AMRCodec._deinversions.pop('mod')
amr_codec = penman.AMRCodec(indent=6)
WORDSENSE_RE = re.compile(r'-\d\d$')
QUOTED_RE = re.compile(r'^".*"$')
def is_abstract_token(token):
return re.search(r'^([A-Z]+_)+\d+$', token) or re.search(r'^\d0*$', token)
def is_english_punct(c):
return re.search(r'^[,.?!:;"\'-(){}\[\]]$', c)
def find_similar_token(token, tokens):
token = re.sub(r'-\d\d$', '', token) # .lower())
for i, t in enumerate(tokens):
if token == t:
return tokens[i]
# t = t.lower()
# if (token == t or
# (t.startswith(token) and len(token) > 3) or
# token + 'd' == t or
# token + 'ed' == t or
# re.sub('ly$', 'le', t) == token or
# re.sub('tive$', 'te', t) == token or
# re.sub('tion$', 'te', t) == token or
# re.sub('ied$', 'y', t) == token or
# re.sub('ly$', '', t) == token
# ):
# return tokens[i]
return None
class AMR:
def __init__(self,
id=None,
sentence=None,
graph=None,
tokens=None,
lemmas=None,
pos_tags=None,
ner_tags=None,
abstract_map=None,
misc=None):
self.id = id
self.sentence = sentence
self.graph = graph
self.tokens = tokens
self.lemmas = lemmas
self.pos_tags = pos_tags
self.ner_tags = ner_tags
self.abstract_map = abstract_map
self.misc = misc
def is_named_entity(self, index):
return self.ner_tags[index] not in ('0', 'O')
def get_named_entity_span(self, index):
if self.ner_tags is None or not self.is_named_entity(index):
return []
span = [index]
tag = self.ner_tags[index]
prev = index - 1
while prev > 0 and self.ner_tags[prev] == tag:
span.append(prev)
prev -= 1
next = index + 1
while next < len(self.ner_tags) and self.ner_tags[next] == tag:
span.append(next)
next += 1
return span
def find_span_indexes(self, span):
for i, token in enumerate(self.tokens):
if token == span[0]:
_span = self.tokens[i: i + len(span)]
if len(_span) == len(span) and all(x == y for x, y in zip(span, _span)):
return list(range(i, i + len(span)))
return None
def replace_span(self, indexes, new, pos=None, ner=None):
self.tokens = self.tokens[:indexes[0]] + new + self.tokens[indexes[-1] + 1:]
self.lemmas = self.lemmas[:indexes[0]] + new + self.lemmas[indexes[-1] + 1:]
if pos is None:
pos = [self.pos_tags[indexes[0]]]
self.pos_tags = self.pos_tags[:indexes[0]] + pos + self.pos_tags[indexes[-1] + 1:]
if ner is None:
ner = [self.ner_tags[indexes[0]]]
self.ner_tags = self.ner_tags[:indexes[0]] + ner + self.ner_tags[indexes[-1] + 1:]
def remove_span(self, indexes):
self.replace_span(indexes, [], [], [])
def __repr__(self):
fields = []
for k, v in dict(
id=self.id,
snt=self.sentence,
tokens=self.tokens,
lemmas=self.lemmas,
pos_tags=self.pos_tags,
ner_tags=self.ner_tags,
abstract_map=self.abstract_map,
misc=self.misc,
graph=self.graph
).items():
if v is None:
continue
if k == 'misc':
fields += v
elif k == 'graph':
fields.append(str(v))
else:
if not isinstance(v, str):
v = json.dumps(v)
fields.append('# ::{} {}'.format(k, v))
return '\n'.join(fields)
def get_src_tokens(self):
return self.lemmas if self.lemmas else self.sentence.split()
class AMRNode:
attribute_priority = [
'instance', 'quant', 'mode', 'value', 'name', 'li', 'mod', 'frequency',
'month', 'day', 'year', 'time', 'unit', 'decade', 'poss'
]
def __init__(self, identifier, attributes=None, copy_of=None):
self.identifier = identifier
if attributes is None:
self.attributes = []
else:
self.attributes = attributes
# self._sort_attributes()
self._num_copies = 0
self.copy_of = copy_of
def _sort_attributes(self):
def get_attr_priority(attr):
if attr in self.attribute_priority:
return self.attribute_priority.index(attr), attr
if not re.search(r'^(ARG|op|snt)', attr):
return len(self.attribute_priority), attr
else:
return len(self.attribute_priority) + 1, attr
self.attributes.sort(key=lambda x: get_attr_priority(x[0]))
def __hash__(self):
return hash(self.identifier)
def __eq__(self, other):
if not isinstance(other, AMRNode):
return False
return self.identifier == other.identifier
def __repr__(self):
ret = str(self.identifier)
for k, v in self.attributes:
if k == 'instance':
ret += ' / ' + v
break
return ret
def __str__(self):
ret = repr(self)
for key, value in self.attributes:
if key == 'instance':
continue
ret += '\n\t:{} {}'.format(key, value)
return ret
@property
def instance(self):
for key, value in self.attributes:
if key == 'instance':
return value
else:
return None
@property
def ops(self):
ops = []
for key, value in self.attributes:
if re.search(r'op\d+', key):
ops.append((int(key[2:]), value))
if len(ops):
ops.sort(key=lambda x: x[0])
return [v for k, v in ops]
def copy(self):
attributes = None
if self.attributes is not None:
attributes = self.attributes[:]
self._num_copies += 1
copy = AMRNode(self.identifier + '_copy_{}'.format(self._num_copies), attributes, self)
return copy
def remove_attribute(self, attr, value):
self.attributes.remove((attr, value))
def add_attribute(self, attr, value):
self.attributes.append((attr, value))
def replace_attribute(self, attr, old, new):
index = self.attributes.index((attr, old))
self.attributes[index] = (attr, new)
def get_frame_attributes(self):
for k, v in self.attributes:
if isinstance(v, str) and re.search(r'-\d\d$', v):
yield k, v
def get_senseless_attributes(self):
for k, v in self.attributes:
if isinstance(v, str) and not re.search(r'-\d\d$', v):
yield k, v
class AMRGraph(penman.Graph):
edge_label_priority = (
'mod name time location degree poss domain quant manner unit purpose topic condition part-of compared-to '
'duration source ord beneficiary concession direction frequency consist-of example medium location-of '
'manner-of quant-of time-of instrument prep-in destination accompanier prep-with extent instrument-of age '
'path concession-of subevent-of prep-as prep-to prep-against prep-on prep-for degree-of prep-under part '
'condition-of prep-without topic-of season duration-of poss-of prep-from prep-at range purpose-of source-of '
'subevent example-of value path-of scale conj-as-if prep-into prep-by prep-on-behalf-of medium-of prep-among '
'calendar beneficiary-of prep-along-with extent-of age-of frequency-of dayperiod accompanier-of '
'destination-of prep-amid prep-toward prep-in-addition-to ord-of name-of weekday direction-of prep-out-of '
'timezone subset-of'.split())
def __init__(self, penman_graph):
super(AMRGraph, self).__init__()
self._triples = penman_graph._triples
self._top = penman_graph._top
self._build_extras()
self._src_tokens = []
def __str__(self):
self._triples = penman.alphanum_order(self._triples)
return amr_codec.encode(self)
def _build_extras(self):
G = nx.DiGraph()
self.variable_to_node = {}
for v in self.variables():
if type(v) is not str:
continue
attributes = [(t.relation, t.target) for t in self.attributes(source=v)]
node = AMRNode(v, attributes)
G.add_node(node)
self.variable_to_node[v] = node
edge_set = set()
for edge in self.edges():
if type(edge.source) is not str:
continue
source = self.variable_to_node[edge.source]
target = self.variable_to_node[edge.target]
relation = edge.relation
if relation == 'instance':
continue
if source == target:
continue
if edge.inverted:
source, target, relation = target, source, amr_codec.invert_relation(edge.relation)
if (source, target) in edge_set:
target = target.copy()
edge_set.add((source, target))
G.add_edge(source, target, label=relation)
self._G = G
def attributes(self, source=None, relation=None, target=None):
# Refine attributes because there's a bug in penman.attributes()
# See https://github.com/goodmami/penman/issues/29
attrmatch = lambda a: (
(source is None or source == a.source) and
(relation is None or relation == a.relation) and
(target is None or target == a.target)
)
variables = self.variables()
attrs = [t for t in self.triples() if t.target not in variables or t.relation == 'instance']
return list(filter(attrmatch, attrs))
def _update_penman_graph(self, triples):
self._triples = triples
if self._top not in self.variables():
self._top = None
def is_name_node(self, node):
edges = list(self._G.in_edges(node))
return any(self._G[source][target].get('label', None) == 'name' for source, target in edges)
def get_name_node_type(self, node):
edges = list(self._G.in_edges(node))
for source, target in edges:
if self._G[source][target].get('label', None) == 'name':
return source.instance
raise KeyError
def get_name_node_wiki(self, node):
edges = list(self._G.in_edges(node))
for source, target in edges:
if self._G[source][target].get('label', None) == 'name':
for attr, value in source.attributes:
if attr == 'wiki':
if value != '-':
value = value[1:-1] # remove quotes
return value
return None
def set_name_node_wiki(self, node, wiki):
edges = list(self._G.in_edges(node))
parent = None
for source, target in edges:
if self._G[source][target].get('label', None) == 'name':
parent = source
break
if parent:
if wiki != '-':
wiki = '"{}"'.format(wiki)
self.add_node_attribute(parent, 'wiki', wiki)
def is_date_node(self, node):
return node.instance == 'date-entity'
def add_edge(self, source, target, label):
self._G.add_edge(source, target, label=label)
t = penman.Triple(source=source.identifier, relation=label, target=target.identifier)
triples = self._triples + [t]
triples = penman.alphanum_order(triples)
self._update_penman_graph(triples)
def remove_edge(self, x, y):
if isinstance(x, AMRNode) and isinstance(y, AMRNode):
self._G.remove_edge(x, y)
if isinstance(x, AMRNode):
x = x.identifier
if isinstance(y, AMRNode):
y = y.identifier
triples = [t for t in self._triples if not (t.source == x and t.target == y)]
self._update_penman_graph(triples)
def update_edge_label(self, x, y, old, new):
self._G[x][y]['label'] = new
triples = []
for t in self._triples:
if t.source == x.identifier and t.target == y.identifier and t.relation == old:
t = Triple(x.identifier, new, y.identifier)
triples.append(t)
self._update_penman_graph(triples)
def add_node(self, instance):
identifier = instance[0]
assert identifier.isalpha()
if identifier in self.variables():
i = 2
while identifier + str(i) in self.variables():
i += 1
identifier += str(i)
triples = self._triples + [Triple(identifier, 'instance', instance)]
self._triples = penman.alphanum_order(triples)
node = AMRNode(identifier, [('instance', instance)])
self._G.add_node(node)
return node
def remove_node(self, node):
self._G.remove_node(node)
triples = [t for t in self._triples if t.source != node.identifier]
self._update_penman_graph(triples)
def replace_node_attribute(self, node, attr, old, new):
node.replace_attribute(attr, old, new)
triples = []
found = False
for t in self._triples:
if t.source == node.identifier and t.relation == attr and t.target == old:
found = True
t = penman.Triple(source=node.identifier, relation=attr, target=new)
triples.append(t)
if not found:
raise KeyError
self._triples = penman.alphanum_order(triples)
def remove_node_attribute(self, node, attr, value):
node.remove_attribute(attr, value)
triples = [t for t in self._triples if
not (t.source == node.identifier and t.relation == attr and t.target == value)]
self._update_penman_graph(triples)
def add_node_attribute(self, node, attr, value):
node.add_attribute(attr, value)
t = penman.Triple(source=node.identifier, relation=attr, target=value)
self._triples = penman.alphanum_order(self._triples + [t])
def remove_node_ops(self, node):
ops = []
for attr, value in node.attributes:
if re.search(r'^op\d+$', attr):
ops.append((attr, value))
for attr, value in ops:
self.remove_node_attribute(node, attr, value)
def remove_subtree(self, root):
children = []
removed_nodes = set()
for _, child in list(self._G.edges(root)):
self.remove_edge(root, child)
children.append(child)
for child in children:
if len(list(self._G.in_edges(child))) == 0:
removed_nodes.update(self.remove_subtree(child))
if len(list(self._G.in_edges(root))) == 0:
self.remove_node(root)
removed_nodes.add(root)
return removed_nodes
def get_subtree(self, root, max_depth):
if max_depth == 0:
return []
nodes = [root]
children = [child for _, child in self._G.edges(root)]
nodes += children
for child in children:
if len(list(self._G.in_edges(child))) == 1:
nodes = nodes + self.get_subtree(child, max_depth - 1)
return nodes
def get_nodes(self):
return self._G.nodes
def get_edges(self):
return self._G.edges
def set_src_tokens(self, sentence):
if type(sentence) is not list:
sentence = sentence.split(" ")
self._src_tokens = sentence
def get_src_tokens(self):
return self._src_tokens
def get_list_node(self, replace_copy=True):
visited = defaultdict(int)
node_list = []
def dfs(node, relation, parent):
node_list.append((
node if node.copy_of is None or not replace_copy else node.copy_of,
relation,
parent if parent.copy_of is None or not replace_copy else parent.copy_of))
if len(self._G[node]) > 0 and visited[node] == 0:
visited[node] = 1
for child_node, child_relation in self.sort_edges(self._G[node].items()):
dfs(child_node, child_relation["label"], node)
dfs(
self.variable_to_node[self._top],
'root',
self.variable_to_node[self._top]
)
return node_list
def sort_edges(self, edges):
return edges
def get_tgt_tokens(self):
node_list = self.get_list_node()
tgt_token = []
visited = defaultdict(int)
for node, relation, parent_node in node_list:
instance = [attr[1] for attr in node.attributes if attr[0] == "instance"]
assert len(instance) == 1
tgt_token.append(str(instance[0]))
if len(node.attributes) > 1 and visited[node] == 0:
for attr in node.attributes:
if attr[0] != "instance":
tgt_token.append(str(attr[1]))
visited[node] = 1
return tgt_token
def get_list_data(self, amr, bos=None, eos=None, bert_tokenizer=None, max_tgt_length=None):
node_list = self.get_list_node()
tgt_tokens = []
head_tags = []
head_indices = []
node_to_idx = defaultdict(list)
visited = defaultdict(int)
def update_info(node, relation, parent, token):
head_indices.append(1 + node_to_idx[parent][-1])
head_tags.append(relation)
tgt_tokens.append(str(token))
for node, relation, parent_node in node_list:
node_to_idx[node].append(len(tgt_tokens))
instance = [attr[1] for attr in node.attributes if attr[0] == "instance"]
assert len(instance) == 1
instance = instance[0]
update_info(node, relation, parent_node, instance)
if len(node.attributes) > 1 and visited[node] == 0:
for attr in node.attributes:
if attr[0] != "instance":
update_info(node, attr[0], node, attr[1])
visited[node] = 1
def trim_very_long_tgt_tokens(tgt_tokens, head_tags, head_indices, node_to_idx):
tgt_tokens = tgt_tokens[:max_tgt_length]
head_tags = head_tags[:max_tgt_length]
head_indices = head_indices[:max_tgt_length]
for node, indices in node_to_idx.items():
invalid_indices = [index for index in indices if index >= max_tgt_length]
for index in invalid_indices:
indices.remove(index)
return tgt_tokens, head_tags, head_indices, node_to_idx
if max_tgt_length is not None:
tgt_tokens, head_tags, head_indices, node_to_idx = trim_very_long_tgt_tokens(
tgt_tokens, head_tags, head_indices, node_to_idx)
copy_offset = 0
if bos:
tgt_tokens = [bos] + tgt_tokens
copy_offset += 1
if eos:
tgt_tokens = tgt_tokens + [eos]
head_indices[node_to_idx[self.variable_to_node[self.top]][0]] = 0
# Target side Coreference
tgt_copy_indices = [i for i in range(len(tgt_tokens))]
for node, indices in node_to_idx.items():
if len(indices) > 1:
copy_idx = indices[0] + copy_offset
for token_idx in indices[1:]:
tgt_copy_indices[token_idx + copy_offset] = copy_idx
tgt_copy_map = [(token_idx, copy_idx) for token_idx, copy_idx in enumerate(tgt_copy_indices)]
for i, copy_index in enumerate(tgt_copy_indices):
# Set the coreferred target to 0 if no coref is available.
if i == copy_index:
tgt_copy_indices[i] = 0
tgt_token_counter = Counter(tgt_tokens)
tgt_copy_mask = [0] * len(tgt_tokens)
for i, token in enumerate(tgt_tokens):
if tgt_token_counter[token] > 1:
tgt_copy_mask[i] = 1
def add_source_side_tags_to_target_side(_src_tokens, _src_tags):
assert len(_src_tags) == len(_src_tokens)
tag_counter = defaultdict(lambda: defaultdict(int))
for src_token, src_tag in zip(_src_tokens, _src_tags):
tag_counter[src_token][src_tag] += 1
tag_lut = {DEFAULT_OOV_TOKEN: DEFAULT_OOV_TOKEN,
DEFAULT_PADDING_TOKEN: DEFAULT_OOV_TOKEN}
for src_token in set(_src_tokens):
tag = max(tag_counter[src_token].keys(), key=lambda x: tag_counter[src_token][x])
tag_lut[src_token] = tag
tgt_tags = []
for tgt_token in tgt_tokens:
sim_token = find_similar_token(tgt_token, _src_tokens)
if sim_token is not None:
index = _src_tokens.index(sim_token)
tag = _src_tags[index]
else:
tag = DEFAULT_OOV_TOKEN
tgt_tags.append(tag)
return tgt_tags, tag_lut
# Source Copy
src_tokens = self.get_src_tokens()
src_token_ids = None
src_token_subword_index = None
src_pos_tags = amr.pos_tags
src_copy_vocab = SourceCopyVocabulary(src_tokens)
src_copy_indices = src_copy_vocab.index_sequence(tgt_tokens)
src_copy_map = src_copy_vocab.get_copy_map(src_tokens)
tgt_pos_tags, pos_tag_lut = add_source_side_tags_to_target_side(src_tokens, src_pos_tags)
if bert_tokenizer is not None:
src_token_ids, src_token_subword_index = bert_tokenizer.tokenize(src_tokens, True)
src_must_copy_tags = [1 if is_abstract_token(t) else 0 for t in src_tokens]
src_copy_invalid_ids = set(src_copy_vocab.index_sequence(
[t for t in src_tokens if is_english_punct(t)]))
return {
"tgt_tokens": tgt_tokens,
"tgt_pos_tags": tgt_pos_tags,
"tgt_copy_indices": tgt_copy_indices,
"tgt_copy_map": tgt_copy_map,
"tgt_copy_mask": tgt_copy_mask,
"src_tokens": src_tokens,
"src_token_ids": src_token_ids,
"src_token_subword_index": src_token_subword_index,
"src_must_copy_tags": src_must_copy_tags,
"src_pos_tags": src_pos_tags,
"src_copy_vocab": src_copy_vocab,
"src_copy_indices": src_copy_indices,
"src_copy_map": src_copy_map,
"pos_tag_lut": pos_tag_lut,
"head_tags": head_tags,
"head_indices": head_indices,
"src_copy_invalid_ids": src_copy_invalid_ids
}
@classmethod
def decode(cls, raw_graph_string):
_graph = amr_codec.decode(raw_graph_string)
return cls(_graph)
@classmethod
def from_lists(cls, all_list):
head_tags = all_list['head_tags']
head_indices = all_list['head_indices']
tgt_tokens = all_list['tokens']
tgt_copy_indices = all_list['coref']
variables = []
variables_count = defaultdict(int)
for i, token in enumerate(tgt_tokens):
if tgt_copy_indices[i] != i:
variables.append(variables[tgt_copy_indices[i]])
else:
if token[0] in variables_count:
variables.append(token[0] + str(variables_count[token[0]]))
else:
variables.append(token[0])
variables_count[token[0]] += 1
Triples = []
for variable, token in zip(variables, tgt_tokens):
Triples.append(Triple(variable, "instance", token))
Triples.append(
Triple(
head_indices[variable],
head_tags[variable],
variable
)
)
@classmethod
def from_prediction(cls, prediction):
def is_attribute_value(value):
return re.search(r'(^".*"$|^[^a-zA-Z]+$)', value) is not None
def is_attribute_edge(label):
return label in ('instance', 'mode', 'li', 'value', 'month', 'year', 'day', 'decade', 'ARG6')
def normalize_number(text):
if re.search(r'^\d+,\d+$', text):
text = text.replace(',', '')
return text
def abstract_node(value):
return re.search(r'^([A-Z]+|DATE_ATTRS|SCORE_ENTITY|ORDINAL_ENTITY)_\d+$', value)
def abstract_attribute(value):
return re.search(r'^_QUANTITY_\d+$', value)
def correct_multiroot(heads):
for i in range(1, len(heads)):
if heads[i] == 0:
heads[i] = 1
return heads
nodes = [normalize_number(n) for n in prediction['nodes']]
heads = correct_multiroot(prediction['heads'])
corefs = [int(x) for x in prediction['corefs']]
head_labels = prediction['head_labels']
triples = []
top = None
# Build the variable map from variable to instance.
variable_map = {}
for coref_index in corefs:
node = nodes[coref_index - 1]
head_label = head_labels[coref_index - 1]
if (re.search(r'[/:\\()]', node) or is_attribute_value(node) or
is_attribute_edge(head_label) or abstract_attribute(node)):
continue
variable_map['vv{}'.format(coref_index)] = node
for head_index in heads:
if head_index == 0:
continue
node = nodes[head_index - 1]
coref_index = corefs[head_index - 1]
variable_map['vv{}'.format(coref_index)] = node
# Build edge triples and other attribute triples.
for i, head_index in enumerate(heads):
if head_index == 0:
top_variable = 'vv{}'.format(corefs[i])
if top_variable not in variable_map:
variable_map[top_variable] = nodes[i]
top = top_variable
continue
head_variable = 'vv{}'.format(corefs[head_index - 1])
modifier = nodes[i]
modifier_variable = 'vv{}'.format(corefs[i])
label = head_labels[i]
assert head_variable in variable_map
if modifier_variable in variable_map:
triples.append((head_variable, label, modifier_variable))
else:
# Add quotes if there's a backslash.
if re.search(r'[/:\\()]', modifier) and not re.search(r'^".*"$', modifier):
modifier = '"{}"'.format(modifier)
triples.append((head_variable, label, modifier))
for var, node in variable_map.items():
if re.search(r'^".*"$', node):
node = node[1:-1]
if re.search(r'[/:\\()]', node):
parts = re.split(r'[/:\\()]', node)
for part in parts[::-1]:
if len(part):
node = part
break
else:
node = re.sub(r'[/:\\()]', '_', node)
triples.append((var, 'instance', node))
if len(triples) == 0:
triples.append(('vv1', 'instance', 'string-entity'))
top = 'vv1'
triples.sort(key=lambda x: int(x[0].replace('vv', '')))
graph = penman.Graph()
graph._top = top
graph._triples = [penman.Triple(*t) for t in triples]
graph = cls(graph)
try:
GraphRepair.do(graph, nodes)
amr_codec.encode(graph)
except Exception as e:
graph._top = top
graph._triples = [penman.Triple(*t) for t in triples]
graph = cls(graph)
return graph
class SourceCopyVocabulary:
def __init__(self, sentence, pad_token=DEFAULT_PADDING_TOKEN, unk_token=DEFAULT_OOV_TOKEN):
if type(sentence) is not list:
sentence = sentence.split(" ")
self.src_tokens = sentence
self.pad_token = pad_token
self.unk_token = unk_token
self.token_to_idx = {self.pad_token: 0, self.unk_token: 1}
self.idx_to_token = {0: self.pad_token, 1: self.unk_token}
self.vocab_size = 2
for token in sentence:
if token not in self.token_to_idx:
self.token_to_idx[token] = self.vocab_size
self.idx_to_token[self.vocab_size] = token
self.vocab_size += 1
def get_token_from_idx(self, idx):
return self.idx_to_token[idx]
def get_token_idx(self, token):
return self.token_to_idx.get(token, self.token_to_idx[self.unk_token])
def index_sequence(self, list_tokens):
return [self.get_token_idx(token) for token in list_tokens]
def get_copy_map(self, list_tokens):
src_indices = [self.get_token_idx(self.unk_token)] + self.index_sequence(list_tokens)
return [
(src_idx, src_token_idx) for src_idx, src_token_idx in enumerate(src_indices)
]
def get_special_tok_list(self):
return [self.pad_token, self.unk_token]
def __repr__(self):
return json.dumps(self.idx_to_token)
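# Illustrative example:
#   vocab = SourceCopyVocabulary('the cat sat')
#   vocab.index_sequence(['cat', 'dog'])  # -> [3, 1]; unseen 'dog' maps to the unk index
#   vocab.get_token_from_idx(3)           # -> 'cat'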
def is_similar(instances1, instances2):
if len(instances1) < len(instances2):
small = instances1
large = instances2
else:
small = instances2
large = instances1
coverage1 = sum(1 for x in small if x in large) / len(small)
coverage2 = sum(1 for x in large if x in small) / len(large)
return coverage1 > .8 and coverage2 > .8
class GraphRepair:
def __init__(self, graph, nodes):
self.graph = graph
self.nodes = nodes
self.repaired_items = set()
@staticmethod
def do(graph, nodes):
gr = GraphRepair(graph, nodes)
gr.remove_redundant_edges()
gr.remove_unknown_nodes()
def remove_unknown_nodes(self):
graph = self.graph
nodes = [node for node in graph.get_nodes()]
for node in nodes:
for attr, value in node.attributes:
if value == '@@UNKNOWN@@' and attr != 'instance':
graph.remove_node_attribute(node, attr, value)
if node.instance == '@@UNKNOWN@@':
if len(list(graph._G.edges(node))) == 0:
for source, target in list(graph._G.in_edges(node)):
graph.remove_edge(source, target)
graph.remove_node(node)
self.repaired_items.add('remove-unknown-node')
def remove_redundant_edges(self):
"""
Edge labels such as ARGx, ARGx-of, and 'opx' should appear at most once
among each node's outgoing edges.
"""
graph = self.graph
nodes = [node for node in graph.get_nodes()]
removed_nodes = set()
for node in nodes:
if node in removed_nodes:
continue
edges = list(graph._G.edges(node))
edge_counter = defaultdict(list)
for source, target in edges:
label = graph._G[source][target]['label']
# `name`, `ARGx`, and `ARGx-of` should only appear once.
if label == 'name': # or label.startswith('ARG'):
edge_counter[label].append(target)
# the target of `opx' should only appear once.
elif label.startswith('op') or label.startswith('snt'):
edge_counter[str(target.instance)].append(target)
else:
edge_counter[label + str(target.instance)].append(target)
for label, children in edge_counter.items():
if len(children) == 1:
continue
if label == 'name':
# remove redundant edges.
for target in children[1:]:
if len(list(graph._G.in_edges(target))) == 1 and len(list(graph._G.edges(target))) == 0:
graph.remove_edge(node, target)
graph.remove_node(target)
removed_nodes.add(target)
self.repaired_items.add('remove-redundant-edge')
continue
visited_children = set()
groups = []
for i, target in enumerate(children):
if target in visited_children:
continue
subtree_instances1 = [n.instance for n in graph.get_subtree(target, 5)]
group = [(target, subtree_instances1)]
visited_children.add(target)
for _t in children[i + 1:]:
if _t in visited_children or target.instance != _t.instance:
continue
subtree_instances2 = [n.instance for n in graph.get_subtree(_t, 5)]
if is_similar(subtree_instances1, subtree_instances2):
group.append((_t, subtree_instances2))
visited_children.add(_t)
groups.append(group)
for group in groups:
if len(group) == 1:
continue
kept_target, _ = max(group, key=lambda x: len(x[1]))
for target, _ in group:
if target == kept_target:
continue
graph.remove_edge(node, target)
removed_nodes.update(graph.remove_subtree(target))
================================================
FILE: plugins/hanlp_common/hanlp_common/configurable.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-16 22:24
from hanlp_common.reflection import str_to_type, classpath_of
class Configurable(object):
@staticmethod
def from_config(config: dict, **kwargs):
"""Build an object from config.
Args:
config: A ``dict`` holding parameters for its constructor. It has to contain a `classpath` key,
which has a classpath str as its value. ``classpath`` will determine the type of object
being deserialized.
kwargs: Arguments not used.
Returns: A deserialized object.
"""
cls = config.get('classpath', None)
assert cls, f'{config} doesn\'t contain classpath field'
cls = str_to_type(cls)
deserialized_config = dict(config)
for k, v in config.items():
if isinstance(v, dict) and 'classpath' in v:
deserialized_config[k] = Configurable.from_config(v)
if cls.from_config == Configurable.from_config:
deserialized_config.pop('classpath')
return cls(**deserialized_config)
else:
return cls.from_config(deserialized_config)
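    # Usage sketch (``mypackage.MyEncoder`` and ``mypackage.MyEmbedding`` are hypothetical
    # names, not part of HanLP): nested dicts carrying a ``classpath`` key are deserialized
    # first, so sub-components are built bottom-up before the outer constructor is called.
    #     config = {'classpath': 'mypackage.MyEncoder', 'dim': 128,
    #               'embed': {'classpath': 'mypackage.MyEmbedding', 'vocab_size': 10000}}
    #     encoder = Configurable.from_config(config)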
class AutoConfigurable(Configurable):
@property
def config(self) -> dict:
"""
        The config of this object, which consists of its public properties. If any property needs to be
        excluded from this config, simply declare it with the prefix ``_``.
"""
return dict([('classpath', classpath_of(self))] +
[(k, v.config if hasattr(v, 'config') else v)
for k, v in self.__dict__.items() if
not k.startswith('_')])
def __repr__(self) -> str:
return repr(self.config)
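# Round-trip sketch: a subclass of AutoConfigurable exposes its public attributes as
# ``config``, which ``Configurable.from_config`` can feed back into the constructor.
# ``Window`` below is a made-up example, not part of HanLP.
#     class Window(AutoConfigurable):
#         def __init__(self, size=3):
#             self.size = size    # public attribute -> serialized
#             self._cache = None  # underscore prefix -> excluded
#     Window(5).config  # {'classpath': '...Window', 'size': 5}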
================================================
FILE: plugins/hanlp_common/hanlp_common/conll.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-19 20:50
from typing import Union, List
from hanlp_common.structure import SerializableDict
from hanlp_common.visualization import pretty_tree_horizontal, make_table, markdown_table
class CoNLLWord(SerializableDict):
def __init__(self, id, form, lemma=None, cpos=None, pos=None, feats=None, head=None, deprel=None, phead=None,
pdeprel=None):
"""CoNLL (:cite:`buchholz-marsi-2006-conll`) format template, see http://anthology.aclweb.org/W/W06/W06-2920.pdf
Args:
id (int):
Token counter, starting at 1 for each new sentence.
form (str):
Word form or punctuation symbol.
lemma (str):
Lemma or stem (depending on the particular treebank) of word form, or an underscore if not available.
cpos (str):
Coarse-grained part-of-speech tag, where the tagset depends on the treebank.
pos (str):
Fine-grained part-of-speech tag, where the tagset depends on the treebank.
feats (str):
Unordered set of syntactic and/or morphological features (depending on the particular treebank),
or an underscore if not available.
head (Union[int, List[int]]):
Head of the current token, which is either a value of ID,
or zero (’0’) if the token links to the virtual root node of the sentence.
deprel (Union[str, List[str]]):
Dependency relation to the HEAD.
phead (int):
Projective head of current token, which is either a value of ID or zero (’0’),
or an underscore if not available.
pdeprel (str):
Dependency relation to the PHEAD, or an underscore if not available.
"""
self.id = sanitize_conll_int_value(id)
self.form = form
self.cpos = cpos
self.pos = pos
self.head = sanitize_conll_int_value(head)
self.deprel = deprel
self.lemma = lemma
self.feats = feats
self.phead = phead
self.pdeprel = pdeprel
def __str__(self):
if isinstance(self.head, list):
return '\n'.join('\t'.join(['_' if v is None else v for v in values]) for values in [
[str(self.id), self.form, self.lemma, self.cpos, self.pos, self.feats,
None if head is None else str(head), deprel, self.phead, self.pdeprel] for head, deprel in
zip(self.head, self.deprel)
])
values = [str(self.id), self.form, self.lemma, self.cpos, self.pos, self.feats,
None if self.head is None else str(self.head), self.deprel, self.phead, self.pdeprel]
return '\t'.join(['_' if v is None else v for v in values])
@property
def nonempty_fields(self):
"""
Get the values of nonempty fields as a list.
"""
return list(f for f in
[self.form, self.lemma, self.cpos, self.pos, self.feats, self.head, self.deprel, self.phead,
self.pdeprel] if f)
def get_pos(self):
"""
        Get the most precise POS tag for this word.
Returns: ``self.pos`` or ``self.cpos``.
"""
return self.pos or self.cpos
class CoNLLUWord(SerializableDict):
def __init__(self, id: Union[int, str], form, lemma=None, upos=None, xpos=None, feats=None, head=None, deprel=None,
deps=None,
misc=None):
"""CoNLL-U format template, see https://universaldependencies.org/format.html
Args:
id (Union[int, str]):
Token counter, starting at 1 for each new sentence.
form (Union[str, None]):
Word form or punctuation symbol.
lemma (str):
Lemma or stem (depending on the particular treebank) of word form, or an underscore if not available.
upos (str):
Universal part-of-speech tag.
xpos (str):
Language-specific part-of-speech tag; underscore if not available.
feats (str):
List of morphological features from the universal feature inventory or from a defined language-specific extension; underscore if not available.
head (int):
Head of the current token, which is either a value of ID,
or zero (’0’) if the token links to the virtual root node of the sentence.
deprel (str):
Dependency relation to the HEAD.
            deps (Union[List[Tuple[int, str]], str]):
                Enhanced dependency graph: a list of ``(head, deprel)`` pairs, or their ``|``-joined
                string form, or an underscore if not available.
            misc (str):
                Any other annotation, or an underscore if not available.
"""
self.id = sanitize_conll_int_value(id)
self.form = form
self.upos = upos
self.xpos = xpos
if isinstance(head, list):
assert deps is None, 'When head is a list, deps has to be None'
assert isinstance(deprel, list), 'When head is a list, deprel has to be a list'
assert len(deprel) == len(head), 'When head is a list, deprel has to match its length'
deps = list(zip(head, deprel))
head = None
deprel = None
self.head = sanitize_conll_int_value(head)
self.deprel = deprel
self.lemma = lemma
self.feats = feats
if deps == '_':
deps = None
if isinstance(deps, str):
self.deps = []
for pair in deps.split('|'):
h, r = pair.split(':')
h = int(h)
self.deps.append((h, r))
else:
self.deps = deps
self.misc = misc
def __str__(self):
deps = self.deps
if not deps:
deps = None
else:
deps = '|'.join(f'{h}:{r}' for h, r in deps)
values = [str(self.id), self.form, self.lemma, self.upos, self.xpos, self.feats,
str(self.head) if self.head is not None else None, self.deprel, deps, self.misc]
return '\t'.join(['_' if v is None else v for v in values])
@property
def nonempty_fields(self):
"""
Get the values of nonempty fields as a list.
"""
return list(f for f in
[self.form, self.lemma, self.upos, self.xpos, self.feats, self.head, self.deprel, self.deps,
self.misc] if f)
def get_pos(self):
"""
        Get the most precise POS tag for this word.
Returns: ``self.xpos`` or ``self.upos``
"""
return self.xpos or self.upos
class CoNLLSentence(list):
def __init__(self, words=None):
"""
A list of :class:`~hanlp_common.conll.CoNLLWord` or :class:`~hanlp_common.conll.CoNLLUWord`. It is a sub-class
of :class:`list` and its words can be accessed in the same way as accessing list elements.
Args:
words (list[Union[CoNLLWord, CoNLLUWord]]): A list of words.
"""
super().__init__()
if words:
self.extend(words)
def __str__(self):
return '\n'.join([word.__str__() for word in self])
@staticmethod
def from_str(conll: str, conllu=False):
"""Build a CoNLLSentence from CoNLL-X format str
Args:
conll (str): CoNLL-X or CoNLL-U format string
conllu: ``True`` to build :class:`~hanlp_common.conll.CoNLLUWord` for each token.
Returns:
A :class:`~hanlp_common.conll.CoNLLSentence`.
"""
words: List[CoNLLWord] = []
prev_id = None
for line in conll.strip().split('\n'):
if line.startswith('#'):
continue
cells = line.split('\t')
cells = [None if c == '_' else c for c in cells]
if '-' in cells[0]:
continue
cells[0] = int(cells[0])
cells[6] = int(cells[6])
if cells[0] != prev_id:
words.append(CoNLLUWord(*cells) if conllu else CoNLLWord(*cells))
else:
if isinstance(words[-1].head, list):
words[-1].head.append(cells[6])
words[-1].deprel.append(cells[7])
else:
words[-1].head = [words[-1].head] + [cells[6]]
words[-1].deprel = [words[-1].deprel] + [cells[7]]
prev_id = cells[0]
if conllu:
for word in words: # type: CoNLLUWord
if isinstance(word.head, list):
assert not word.deps
word.deps = list(zip(word.head, word.deprel))
word.head = None
word.deprel = None
return CoNLLSentence(words)
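    # Example sketch: parsing a two-token CoNLL-X fragment (``_`` marks unavailable fields).
    #     text = '1\tHello\t_\t_\t_\t_\t2\tdiscourse\t_\t_\n' \
    #            '2\tworld\t_\t_\t_\t_\t0\troot\t_\t_'
    #     sent = CoNLLSentence.from_str(text)
    #     print(sent)  # prints the same tab-separated lines back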
@staticmethod
def from_file(path: str, conllu=False):
"""Build a CoNLLSentence from ``.conllx`` or ``.conllu`` file
Args:
path: Path to the file.
conllu: ``True`` to build :class:`~hanlp_common.conll.CoNLLUWord` for each token.
Returns:
A :class:`~hanlp_common.conll.CoNLLSentence`.
"""
with open(path) as src:
return [CoNLLSentence.from_str(x, conllu) for x in src.read().split('\n\n') if x.strip()]
@staticmethod
def from_dict(d: dict, conllu=False):
"""Build a CoNLLSentence from a dict.
Args:
d: A dict storing a list for each field, where each index corresponds to a token.
conllu: ``True`` to build :class:`~hanlp_common.conll.CoNLLUWord` for each token.
Returns:
A :class:`~hanlp_common.conll.CoNLLSentence`.
"""
if conllu:
headings = ['ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC']
else:
headings = ['ID', 'FORM', 'LEMMA', 'CPOS', 'POS', 'FEATS', 'HEAD', 'DEPREL', 'PHEAD', 'PDEPREL']
words: List[Union[CoNLLWord, CoNLLUWord]] = []
for cells in zip(*list(d[f] for f in headings)):
words.append(CoNLLUWord(*cells) if conllu else CoNLLWord(*cells))
return CoNLLSentence(words)
def to_markdown(self, headings: Union[str, List[str]] = 'auto') -> str:
r"""Convert into markdown string.
Args:
            headings: ``auto`` to automatically detect the word type. When passed a list of strings, they are
                treated as the headings for each field.
Returns:
A markdown representation of this sentence.
"""
cells = [str(word).split('\t') for word in self]
if headings == 'auto':
if isinstance(self[0], CoNLLWord):
headings = ['ID', 'FORM', 'LEMMA', 'CPOS', 'POS', 'FEATS', 'HEAD', 'DEPREL', 'PHEAD', 'PDEPREL']
else: # conllu
headings = ['ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC']
for each in cells:
# if '|' in each[8]:
# each[8] = f'`{each[8]}`'
each[8] = each[8].replace('|', '⎮')
alignment = [('^', '>'), ('^', '<'), ('^', '<'), ('^', '<'), ('^', '<'), ('^', '<'), ('^', '>'), ('^', '<'),
('^', '<'), ('^', '<')]
text = markdown_table(headings, cells, alignment=alignment)
return text
def to_tree(self, extras: List[str] = None) -> str:
"""Convert into a pretty tree string which can be printed to show the tree structure.
Args:
extras: Extra table to be aligned to this tree.
Returns:
            A pretty tree string, along with the extra table if any is passed.
"""
arrows = []
for word in self: # type: Union[CoNLLWord, CoNLLUWord]
if word.head:
arrows.append({'from': word.head - 1, 'to': word.id - 1})
tree = pretty_tree_horizontal(arrows)
rows = [['Dep Tree', 'Token', 'Relation']]
has_lem = all(x.lemma for x in self)
has_pos = all(x.get_pos() for x in self)
if has_lem:
rows[0].append('Lemma')
if has_pos:
rows[0].append('PoS')
if extras:
rows[0].extend(extras[0])
for i, (word, arc) in enumerate(zip(self, tree)):
cell_per_word = [arc]
cell_per_word.append(word.form)
cell_per_word.append(word.deprel)
if has_lem:
cell_per_word.append(word.lemma)
if has_pos:
cell_per_word.append(word.get_pos())
if extras:
cell_per_word.extend(extras[i + 1])
rows.append(cell_per_word)
return make_table(rows, insert_header=True)
@property
def projective(self):
"""
``True`` if this tree is projective.
"""
return isprojective([x.head for x in self])
class CoNLLSentenceList(list):
def __str__(self) -> str:
return '\n\n'.join(str(x) for x in self)
def sanitize_conll_int_value(value: Union[str, int]):
if value is None or isinstance(value, int):
return value
if value == '_':
return None
if isinstance(value, str):
return int(value)
return value
def isprojective(sequence):
r"""
Checks if a dependency tree is projective.
This also works for partial annotation.
Besides the obvious crossing arcs, the examples below illustrate two non-projective cases
which are hard to detect in the scenario of partial annotation.
Args:
sequence (list[int]):
A list of head indices.
Returns:
``True`` if the tree is projective, ``False`` otherwise.
Examples:
>>> isprojective([2, -1, 1]) # -1 denotes un-annotated cases
False
>>> isprojective([3, -1, 2])
False
"""
pairs = [(h, d) for d, h in enumerate(sequence, 1) if h >= 0]
for i, (hi, di) in enumerate(pairs):
for hj, dj in pairs[i + 1:]:
(li, ri), (lj, rj) = sorted([hi, di]), sorted([hj, dj])
if li <= hj <= ri and hi == dj:
return False
if lj <= hi <= rj and hj == di:
return False
if (li < lj < ri or li < rj < ri) and (li - lj) * (ri - rj) > 0:
return False
return True
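# For contrast with the doctests above, a fully projective tree returns True:
#     isprojective([2, 0, 2])  # a three-token tree rooted at the second token -> True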
================================================
FILE: plugins/hanlp_common/hanlp_common/constant.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-06-13 22:41
import os
PAD = '<pad>'
'''Padding token.'''
UNK = '<unk>'
'''Unknown token.'''
CLS = '[CLS]'
BOS = '<bos>'
EOS = '<eos>'
ROOT = BOS
IDX = '_idx_'
'''Key for index.'''
HANLP_URL = os.getenv('HANLP_URL', 'https://file.hankcs.com/hanlp/')
'''Resource URL.'''
HANLP_VERBOSE = os.environ.get('HANLP_VERBOSE', '1').lower() in ('1', 'true', 'yes')
'''Enable verbose or not.'''
NULL = '<null>'
PRED = 'PRED'
IPYTHON = os.environ.get('HANLP_IPYTHON', '1').lower() in ('1', 'true', 'yes') # Allow the user to disable IPYTHON
if IPYTHON:
try:
# noinspection PyUnresolvedReferences,PyStatementEffect
get_ipython
except NameError:
IPYTHON = False
================================================
FILE: plugins/hanlp_common/hanlp_common/document.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-31 04:16
import json
import re
import warnings
from typing import List, Tuple, Union
from phrasetree.tree import Tree
from hanlp_common.conll import CoNLLUWord, CoNLLSentence, CoNLLSentenceList
from hanlp_common.constant import PRED, IPYTHON
from hanlp_common.util import collapse_json, prefix_match
from hanlp_common.visualization import tree_to_list, list_to_tree, render_labeled_span, make_table
class Document(dict):
def __init__(self, *args, **kwargs) -> None:
r"""A dict structure holding parsed annotations. A document is a subclass of ``dict`` and it supports every
interface of ``dict``\. Additionally, it supports interfaces to deal with various linguistic structures. Its
``str`` and ``dict`` representations are made to be compatible with JSON serialization.
Args:
*args: An iterator of key-value pairs.
**kwargs: Arguments from ``**`` operator.
Examples::
# Create a document
doc = Document(
tok=[["晓美焰", "来到", "北京", "立方庭", "参观", "自然", "语义", "科技", "公司"]],
pos=[["NR", "VV", "NR", "NR", "VV", "NN", "NN", "NN", "NN"]],
ner=[[["晓美焰", "PERSON", 0, 1], ["北京立方庭", "LOCATION", 2, 4],
["自然语义科技公司", "ORGANIZATION", 5, 9]]],
dep=[[[2, "nsubj"], [0, "root"], [4, "name"], [2, "dobj"], [2, "conj"],
[9, "compound"], [9, "compound"], [9, "compound"], [5, "dobj"]]]
)
# print(doc) or str(doc) to get its JSON representation
print(doc)
# Access an annotation by its task name
print(doc['tok'])
# Get number of sentences
print(f'It has {doc.count_sentences()} sentence(s)')
# Access the n-th sentence
print(doc.squeeze(0)['tok'])
# Pretty print it right in your console or notebook
doc.pretty_print()
# To save the pretty prints in a str
pretty_text: str = '\n\n'.join(doc.to_pretty())
"""
super().__init__(*args, **kwargs)
for k, v in list(self.items()):
if not v:
continue
if k == 'con':
if isinstance(v, Tree) or isinstance(v[0], Tree):
continue
flat = isinstance(v[0], str)
if flat:
v = [v]
ls = []
for each in v:
if not isinstance(each, Tree):
ls.append(list_to_tree(each))
if flat:
ls = ls[0]
self[k] = ls
elif k == 'amr':
from hanlp_common.amr import AMRGraph
import penman
if isinstance(v, AMRGraph) or isinstance(v[0], AMRGraph):
continue
flat = isinstance(v[0][0], str)
if flat:
v = [v]
graphs = [AMRGraph(penman.Graph(triples)) for triples in v]
if flat:
graphs = graphs[0]
self[k] = graphs
def to_json(self, ensure_ascii=False, indent=2) -> str:
"""Convert to json string.
Args:
ensure_ascii: ``False`` to allow for non-ascii text.
indent: Indent per nested structure.
Returns:
A text representation in ``str``.
"""
d = self.to_dict()
text = json.dumps(d, ensure_ascii=ensure_ascii, indent=indent, default=lambda o: repr(o))
text = collapse_json(text, 4)
return text
def to_dict(self):
"""Convert to a json compatible dict.
Returns:
A dict representation.
"""
d = dict(self)
for k, v in self.items():
if v == [] or v is None:
continue
if k == 'con':
if not isinstance(v, Tree) and not isinstance(v[0], Tree):
continue
flat = isinstance(v, Tree)
if flat:
v = [v]
ls = []
for each in v:
if isinstance(each, Tree):
ls.append(tree_to_list(each))
if flat:
ls = ls[0]
d[k] = ls
return d
def __str__(self) -> str:
return self.to_json()
def to_conll(self, tok='tok', lem='lem', pos='pos', fea='fea', dep='dep', sdp='sdp') -> Union[
CoNLLSentence, List[CoNLLSentence]]:
"""
Convert to :class:`~hanlp_common.conll.CoNLLSentence`.
Args:
tok (str): Field name for tok.
lem (str): Field name for lem.
pos (str): Field name for upos.
fea (str): Field name for feats.
dep (str): Field name for dependency parsing.
sdp (str): Field name for semantic dependency parsing.
Returns:
A :class:`~hanlp_common.conll.CoNLLSentence` representation.
"""
tok = prefix_match(tok, self)
lem = prefix_match(lem, self)
pos = prefix_match(pos, self)
fea = prefix_match(fea, self)
dep = prefix_match(dep, self)
sdp = prefix_match(sdp, self)
results = CoNLLSentenceList()
if not tok or not self[tok]:
return results
self = self._to_doc_without_spans(tok)
flat = isinstance(self[tok][0], str)
if flat:
d = Document((k, [v]) for k, v in self.items())
else:
d = self
for sample in [dict(zip(d, t)) for t in zip(*d.values())]:
def get(_k, _i):
_v = sample.get(_k, None)
if not _v:
return None
return _v[_i]
sent = CoNLLSentence()
for i, _tok in enumerate(sample[tok]):
_dep = get(dep, i)
if not _dep:
_dep = (None, None)
sent.append(
CoNLLUWord(i + 1, form=_tok, lemma=get(lem, i), upos=get(pos, i), feats=get(fea, i), head=_dep[0],
deprel=_dep[1],
deps=None if not get(sdp, i) else '|'.join(f'{x[0]}:{x[1]}' for x in get(sdp, i))))
results.append(sent)
if flat:
return results[0]
return results
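    # Sketch: for a flat document (one sentence), ``to_conll`` returns a single
    # CoNLLSentence; for a batched document it returns one CoNLLSentence per sentence.
    #     doc = Document(tok=[['Hello', 'world']], pos=[['UH', 'NN']])
    #     print(doc.to_conll()[0])  # the first (and only) sentence in CoNLL-U layout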
def to_pretty(self, tok='tok', lem='lem', pos='pos', dep='dep', sdp='sdp', ner='ner', srl='srl', con='con',
show_header=True, html=False) -> Union[str, List[str]]:
"""
Convert to a pretty text representation which can be printed to visualize linguistic structures.
Args:
tok: Token key.
lem: Lemma key.
pos: Part-of-speech key.
dep: Dependency parse tree key.
sdp: Semantic dependency tree/graph key. SDP visualization has not been implemented yet.
ner: Named entity key.
srl: Semantic role labeling key.
con: Constituency parsing key.
show_header: ``True`` to include a header which indicates each field with its name.
html: ``True`` to output HTML format so that non-ASCII characters can align correctly.
Returns:
A pretty string.
"""
results = []
tok = prefix_match(tok, self)
pos = prefix_match(pos, self)
ner = prefix_match(ner, self)
conlls = self.to_conll(tok=tok, lem=lem, pos=pos, dep=dep, sdp=sdp)
flat = isinstance(conlls, CoNLLSentence)
if flat:
conlls: List[CoNLLSentence] = [conlls]
def condense(block_, extras_=None):
text_ = make_table(block_, insert_header=False)
text_ = [x.split('\t', 1) for x in text_.split('\n')]
text_ = [[x[0], x[1].replace('\t', '')] for x in text_]
if extras_:
for r, s in zip(extras_, text_):
r.extend(s)
return text_
for i, conll in enumerate(conlls):
conll: CoNLLSentence = conll
tokens = [x.form for x in conll]
length = len(conll)
extras = [[] for j in range(length + 1)]
if ner in self:
ner_samples = self[ner]
if flat:
ner_samples = [ner_samples]
ner_per_sample = ner_samples[i]
# For nested NER, use the longest span
start_offsets = [None for i in range(length)]
for ent, label, b, e in ner_per_sample:
if not start_offsets[b] or e > start_offsets[b][-1]:
start_offsets[b] = (ent, label, b, e)
ner_per_sample = [y for y in start_offsets if y]
header = ['Token', 'NER', 'Type']
block = [[] for _ in range(length + 1)]
_ner = []
_type = []
offset = 0
for ent, label, b, e in ner_per_sample:
render_labeled_span(b, e, _ner, _type, label, offset)
offset = e
if offset != length:
_ner.extend([''] * (length - offset))
_type.extend([''] * (length - offset))
if any(_type):
block[0].extend(header)
for j, (_s, _t) in enumerate(zip(_ner, _type)):
block[j + 1].extend((tokens[j], _s, _t))
text = condense(block, extras)
if srl in self:
srl_samples = self[srl]
if flat:
srl_samples = [srl_samples]
srl_per_sample = srl_samples[i]
for k, pas in enumerate(srl_per_sample):
if not pas:
continue
block = [[] for _ in range(length + 1)]
header = ['Token', 'SRL', f'PA{k + 1}']
_srl = []
_type = []
offset = 0
p_index = None
for _, label, b, e in pas:
render_labeled_span(b, e, _srl, _type, label, offset)
offset = e
if label == PRED:
p_index = b
if len(_srl) != length:
_srl.extend([''] * (length - offset))
_type.extend([''] * (length - offset))
if p_index is not None:
_srl[p_index] = '╟──►'
# _type[j] = 'V'
if len(block) != len(_srl) + 1:
# warnings.warn(f'Unable to visualize overlapped spans: {pas}')
continue
block[0].extend(header)
while len(_srl) < length:
_srl.append('')
while len(_type) < length:
_type.append('')
for j, (_s, _t) in enumerate(zip(_srl, _type)):
block[j + 1].extend((tokens[j], _s, _t))
text = condense(block, extras)
if con in self:
con_samples: Tree = self[con]
if flat:
con_samples: List[Tree] = [con_samples]
tree = con_samples[i]
block = [[] for _ in range(length + 1)]
block[0].extend(('Token', 'PoS'))
for j, t in enumerate(tree.pos()):
block[j + 1].extend(t)
for height in range(2, tree.height() + (0 if len(tree) == 1 else 1)):
offset = 0
spans = []
labels = []
for k, subtree in enumerate(tree.subtrees(lambda x: x.height() == height)):
subtree: Tree = subtree
b, e = offset, offset + len(subtree.leaves())
if height >= 3:
b, e = subtree[0].center, subtree[-1].center + 1
subtree.center = b + (e - b) // 2
render_labeled_span(b, e, spans, labels, subtree.label(), offset, unidirectional=True)
offset = e
if len(spans) != length:
spans.extend([''] * (length - len(spans)))
if len(labels) != length:
labels.extend([''] * (length - len(labels)))
if height < 3:
continue
block[0].extend(['', f'{height}'])
for j, (_s, _t) in enumerate(zip(spans, labels)):
block[j + 1].extend((_s, _t))
# check short arrows and increase their length
for j, arrow in enumerate(spans):
if not arrow:
# -1 current tag ; -2 arrow to current tag ; -3 = prev tag ; -4 = arrow to prev tag
if block[j + 1][-3] or block[j + 1][-4] == '───►':
if height > 3:
if block[j + 1][-3]:
block[j + 1][-1] = block[j + 1][-3]
block[j + 1][-2] = '───►'
else:
block[j + 1][-1] = '────'
block[j + 1][-2] = '────'
block[j + 1][-3] = '────'
if block[j + 1][-4] == '───►':
block[j + 1][-4] = '────'
else:
block[j + 1][-1] = '────'
if block[j + 1][-1] == '────':
block[j + 1][-2] = '────'
if not block[j + 1][-4]:
block[j + 1][-4] = '────'
# If the root label is shorter than the level number, extend it to the same length
level_len = len(block[0][-1])
for row in block[1:]:
if row[-1] and len(row[-1]) < level_len:
row[-1] = row[-1] + ' ' * (level_len - len(row[-1]))
text = condense(block)
# Cosmetic issues
for row in text[1:]:
while ' ─' in row[1]:
row[1] = row[1].replace(' ─', ' ──')
row[1] = row[1].replace('─ ─', '───')
row[1] = re.sub(r'([►─])([\w-]*)(\s+)([│├])', lambda
m: f'{m.group(1)}{m.group(2)}{"─" * len(m.group(3))}{"┤" if m.group(4) == "│" else "┼"}',
row[1])
row[1] = re.sub(r'►(─+)►', r'─\1►', row[1])
for r, s in zip(extras, text):
r.extend(s)
# warnings.warn('Unable to visualize non-projective trees.')
if dep in self and conll.projective:
text = conll.to_tree(extras)
if not show_header:
text = text.split('\n')
text = '\n'.join(text[2:])
results.append(text)
elif any(extras):
results.append(make_table(extras, insert_header=True))
else:
results.append(' '.join(['/'.join(str(f) for f in x.nonempty_fields) for x in conll]))
if html:
def to_html(pretty_text: str) -> str:
lines = [x for x in pretty_text.split('\n') if x]
cells = []
for line in lines:
cells.append(line.split('\t'))
num_cols = len(cells[0])
cols = []
for i in range(num_cols):
cols.append([])
for row in cells:
cols[-1].append(row[i])
            html = '<div style="display: table; padding-bottom: 1rem;">'
            for i, each in enumerate(cols):
                html += '<pre style="display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,monospace; white-space: nowrap;">'
                if i != len(cols) - 1:
                    each = [x + ' ' for x in each]
                html += '<br>'.join([x.replace(' ', '&nbsp;') for x in each])
                html += '</pre>'
            html += '</div>'
return html
results = [to_html(x) for x in results]
if flat:
return results[0]
return results
def pretty_print(self, tok='tok', lem='lem', pos='pos', dep='dep', sdp='sdp', ner='ner', srl='srl', con='con',
show_header=True, html=IPYTHON):
"""
Print a pretty text representation which visualizes linguistic structures.
Args:
tok: Token key.
lem: Lemma key.
pos: Part-of-speech key.
dep: Dependency parse tree key.
sdp: Semantic dependency tree/graph key. SDP visualization has not been implemented yet.
ner: Named entity key.
srl: Semantic role labeling key.
con: Constituency parsing key.
show_header: ``True`` to print a header which indicates each field with its name.
html: ``True`` to output HTML format so that non-ASCII characters can align correctly.
"""
results = self.to_pretty(tok, lem, pos, dep, sdp, ner, srl, con, show_header, html=html)
if isinstance(results, str):
results = [results]
if html and IPYTHON:
from IPython.core.display import display, HTML
            display(HTML('<br>'.join(results)))
else:
sent_new_line = '\n\n' if any('\n' in x for x in results) else '\n'
print(sent_new_line.join(results))
def translate(self, lang, tok='tok', pos='pos', dep='dep', sdp='sdp', ner='ner', srl='srl'):
"""
Translate tags for each annotation. This is an inplace operation.
.. Attention:: Note that the translated document might not print well in terminal due to non-ASCII characters.
Args:
lang: Target language to be translated to.
tok: Token key.
pos: Part-of-speech key.
dep: Dependency parse tree key.
sdp: Semantic dependency tree/graph key. SDP visualization has not been implemented yet.
ner: Named entity key.
srl: Semantic role labeling key.
Returns:
The translated document.
"""
if lang == 'zh':
from hanlp.utils.lang.zh import localization
else:
raise NotImplementedError(f'No translation for {lang}. '
f'Please contribute to our translation at https://github.com/hankcs/HanLP')
flat = isinstance(self[tok][0], str)
for task, name in zip(['pos', 'ner', 'dep', 'sdp', 'srl'], [pos, ner, dep, sdp, srl]):
annotations = self.get(name, None)
if not annotations:
continue
if flat:
annotations = [annotations]
translate: dict = getattr(localization, name, None)
if not translate:
continue
for anno_per_sent in annotations:
for i, v in enumerate(anno_per_sent):
if task == 'ner' or task == 'dep':
v[1] = translate.get(v[1], v[1])
else:
anno_per_sent[i] = translate.get(v, v)
return self
def squeeze(self, i=0):
r"""
Squeeze the dimension of each field into one. It's intended to convert a nested document like ``[[sent_i]]``
to ``[sent_i]``. When there are multiple sentences, only the ``i-th`` one will be returned. Note this is not an
inplace operation.
Args:
            i: Keep the element at index ``i`` of every ``list``.
Returns:
A squeezed document with only one sentence.
"""
sq = Document()
for k, v in self.items():
sq[k] = v[i] if isinstance(v, list) else v
return sq
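    # Example sketch: squeeze turns batch-shaped annotations into a single sentence.
    #     doc = Document(tok=[['a', 'b'], ['c']])
    #     doc.squeeze(1)['tok']  # ['c']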
def _to_doc_without_spans(self, tok: str):
"""
Remove the spans attached to tokens and return a new document.
Args:
tok: The key to tokens.
Returns:
A new document or itself.
"""
        tokens: Union[List[str], List[List[str]], List[Tuple[str, int, int]],
                      List[List[Tuple[str, int, int]]]] = self[tok]
if isinstance(tokens[0], str):
return self
elif isinstance(tokens[0][-1], int):
tokens = [x[0] for x in tokens]
elif isinstance(tokens[0][-1], str):
return self
else:
tokens = [[t[0] for t in x] for x in tokens]
d = Document(**self)
d[tok] = tokens
return d
def get_by_prefix(self, prefix: str):
"""
Get value by the prefix of a key.
Args:
prefix: The prefix of a key. If multiple keys are matched, only the first one will be used.
Returns:
The value assigned with the matched key.
"""
key = prefix_match(prefix, self)
if not key:
return None
return self[key]
def count_sentences(self) -> int:
"""
Count number of sentences in this document.
Returns:
Number of sentences.
"""
tok = self.get_by_prefix('tok')
if isinstance(tok[0], str):
return 1
return len(tok)
================================================
FILE: plugins/hanlp_common/hanlp_common/io.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-16 22:38
import json
import os
import pickle
import sys
from typing import Union
def save_pickle(item, path):
with open(path, 'wb') as f:
pickle.dump(item, f)
def load_pickle(path):
with open(path, 'rb') as f:
return pickle.load(f)
def save_json(item: Union[dict, list, str, int, float], path: str, ensure_ascii=False, cls=None,
default=lambda o: repr(o), indent=2):
dirname = os.path.dirname(path)
if dirname:
os.makedirs(dirname, exist_ok=True)
with open(path, 'w', encoding='utf-8') as out:
json.dump(item, out, ensure_ascii=ensure_ascii, indent=indent, cls=cls, default=default)
def load_json(path):
with open(path, encoding='utf-8') as src:
return json.load(src)
def filename_is_json(filename):
filename, file_extension = os.path.splitext(filename)
return file_extension in ['.json', '.jsonl']
def eprint(*args, **kwargs):
print(*args, file=sys.stderr, **kwargs)
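# Round-trip sketch for the helpers above (the path is hypothetical):
#     save_json({'k': 1}, '/tmp/demo.json')
#     load_json('/tmp/demo.json')  # {'k': 1}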
================================================
FILE: plugins/hanlp_common/hanlp_common/reflection.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 16:41
import importlib
import inspect
def classpath_of(obj) -> str:
"""get the full class path of object
Args:
obj: return:
Returns:
"""
if inspect.isfunction(obj):
return module_path_of(obj)
return "{0}.{1}".format(obj.__class__.__module__, obj.__class__.__name__)
def module_path_of(func) -> str:
return inspect.getmodule(func).__name__ + '.' + func.__name__
def object_from_classpath(classpath, **kwargs):
classpath = str_to_type(classpath)
if inspect.isfunction(classpath):
return classpath
return classpath(**kwargs)
def str_to_type(classpath):
"""convert class path in str format to a type
Args:
classpath: class path
Returns:
type
"""
module_name, class_name = classpath.rsplit(".", 1)
cls = getattr(importlib.import_module(module_name), class_name)
return cls
def type_to_str(type_object) -> str:
"""convert a type object to class path in str format
Args:
type_object: type
Returns:
class path
"""
cls_name = str(type_object)
assert cls_name.startswith(""), 'illegal input'
cls_name = cls_name[:-len("'>")]
return cls_name
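# ``str_to_type`` and ``type_to_str`` are inverses of each other, e.g.:
#     str_to_type('collections.OrderedDict')               # <class 'collections.OrderedDict'>
#     type_to_str(str_to_type('collections.OrderedDict'))  # 'collections.OrderedDict'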
================================================
FILE: plugins/hanlp_common/hanlp_common/structure.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-19 20:56
import json
from collections import OrderedDict
from hanlp_common.io import filename_is_json, save_pickle, load_pickle, save_json, load_json
class Serializable(object):
"""A super class for save/load operations."""
def save(self, path, fmt=None):
if not fmt:
if filename_is_json(path):
self.save_json(path)
else:
self.save_pickle(path)
elif fmt in ['json', 'jsonl']:
self.save_json(path)
else:
self.save_pickle(path)
def load(self, path, fmt=None):
if not fmt:
if filename_is_json(path):
self.load_json(path)
else:
self.load_pickle(path)
elif fmt in ['json', 'jsonl']:
self.load_json(path)
else:
self.load_pickle(path)
def save_pickle(self, path):
"""Save to path
Args:
path:
Returns:
"""
save_pickle(self, path)
def load_pickle(self, path):
"""Load from path
Args:
path(str): file path
Returns:
"""
item = load_pickle(path)
return self.copy_from(item)
def save_json(self, path):
save_json(self.to_dict(), path)
def load_json(self, path):
item = load_json(path)
return self.copy_from(item)
# @abstractmethod
def copy_from(self, item):
self.__dict__ = item.__dict__
# raise NotImplementedError('%s.%s()' % (self.__class__.__name__, inspect.stack()[0][3]))
def to_json(self, ensure_ascii=False, indent=2, sort=False) -> str:
d = self.to_dict()
if sort:
d = OrderedDict(sorted(d.items()))
return json.dumps(d, ensure_ascii=ensure_ascii, indent=indent, default=lambda o: repr(o))
def to_dict(self) -> dict:
return self.__dict__
class SerializableDict(Serializable, dict):
def save_json(self, path):
save_json(self, path)
def copy_from(self, item):
if isinstance(item, dict):
self.clear()
self.update(item)
def __getattr__(self, key):
if key.startswith('__'):
return dict.__getattr__(key)
return self.__getitem__(key)
def __setattr__(self, key, value):
return self.__setitem__(key, value)
def to_dict(self) -> dict:
return self
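# SerializableDict supports attribute-style access on top of the dict interface:
#     d = SerializableDict()
#     d.score = 0.9  # equivalent to d['score'] = 0.9
#     d.score        # 0.9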
================================================
FILE: plugins/hanlp_common/hanlp_common/util.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-10-27 19:09
import math
from typing import Union, Any, List, Optional, Tuple, Iterable, Dict
import inspect
from itertools import chain, combinations
def powerset(iterable, descending=False):
"""
powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)
    Args:
        iterable: An iterable of items.
        descending: ``True`` to yield larger subsets first.
    Returns:
        An iterator over all subsets.
"""
s = list(iterable)
sizes = range(len(s), -1, -1) if descending else range(len(s) + 1)
return chain.from_iterable(combinations(s, r) for r in sizes)
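# With descending=True the larger subsets come first:
#     list(powerset([1, 2], descending=True))  # [(1, 2), (1,), (2,), ()]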
def isdebugging():
"""See Also https://stackoverflow.com/questions/333995/how-to-detect-that-python-code-is-being-executed-through-the-debugger"""
for frame in inspect.stack():
if frame[1].endswith("pydevd.py"):
return True
return False
def list_is_list_of_lists(sent: Union[Any, List[Any]]) -> Optional[bool]:
if not sent:
return None
return isinstance(sent[0], list)
def set_tuple_with(t: Tuple, v, at=0) -> Tuple:
t = list(t)
t[at] = v
return tuple(t)
def consume_keys_from_dict(keys: Iterable, d: dict) -> dict:
consumed = {}
for k in keys:
if k in d:
consumed[k] = d.pop(k)
return consumed
def merge_dict(d: dict, overwrite=False, inplace=False, **kwargs):
"""Merging the provided dict with other kvs
Args:
d:
kwargs:
d: dict:
overwrite: (Default value = False)
inplace: (Default value = False)
**kwargs:
Returns:
"""
nd = dict([(k, v) for k, v in d.items()] + [(k, v) for k, v in kwargs.items() if overwrite or k not in d])
if inplace:
d.update(nd)
return d
return nd
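# Example: existing keys in ``d`` win unless ``overwrite=True``.
#     merge_dict({'a': 1}, a=2, b=3)                  # {'a': 1, 'b': 3}
#     merge_dict({'a': 1}, a=2, b=3, overwrite=True)  # {'a': 2, 'b': 3}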
def merge_locals_kwargs(locals: dict, kwargs: dict = None, excludes=('self', 'kwargs', '__class__')):
if not kwargs:
kwargs = dict()
return merge_dict(dict((k, v) for k, v in list(locals.items())
if k not in excludes), **kwargs)
def infer_space_after(sent: List[str]):
last_token = None
quote_count: int = 0
# infer whitespace after field
whitespace_after = [True] * len(sent)
for token in range(len(sent)):
if sent[token] == '"':
quote_count += 1
if quote_count % 2 != 0:
whitespace_after[token] = False
elif last_token is not None:
whitespace_after[last_token] = False
if last_token is not None:
if sent[token] in [".", ":", ",", ";", ")", "n't", "!", "?"]:
whitespace_after[last_token] = False
if sent[token].startswith("'"):
whitespace_after[last_token] = False
if sent[token] in ["("]:
whitespace_after[token] = False
last_token = token
return whitespace_after
def collapse_json(text, indent=12):
"""Compacts a string of json data by collapsing whitespace after the
specified indent level
NOTE: will not produce correct results when indent level is not a multiple
of the json indent level
Args:
text:
indent: (Default value = 12)
Returns:
"""
initial = " " * indent
out = [] # final json output
sublevel = [] # accumulation list for sublevel entries
pending = None # holder for consecutive entries at exact indent level
for line in text.splitlines():
if line.startswith(initial):
if line[indent] == " ":
# found a line indented further than the indent level, so add
# it to the sublevel list
if pending:
# the first item in the sublevel will be the pending item
# that was the previous line in the json
sublevel.append(pending)
pending = None
item = line.strip()
sublevel.append(item)
if item.endswith(","):
sublevel.append(" ")
elif sublevel:
# found a line at the exact indent level *and* we have sublevel
# items. This means the sublevel items have come to an end
sublevel.append(line.strip())
out.append("".join(sublevel))
sublevel = []
else:
# found a line at the exact indent level but no items indented
# further, so possibly start a new sub-level
if pending:
# if there is already a pending item, it means that
# consecutive entries in the json had the exact same
# indentation and that last pending item was not the start
# of a new sublevel.
out.append(pending)
pending = line.rstrip()
else:
if pending:
# it's possible that an item will be pending but not added to
# the output yet, so make sure it's not forgotten.
out.append(pending)
pending = None
if sublevel:
out.append("".join(sublevel))
out.append(line)
return "\n".join(out)
class DummyContext(object):
def __enter__(self):
pass
def __exit__(self, exc_type, exc_val, exc_tb):
pass
def merge_list_of_dict(samples: List[Dict]) -> dict:
batch = {}
for each in samples:
for k, v in each.items():
vs = batch.get(k, None)
if vs is None:
vs = []
batch[k] = vs
vs.append(v)
return batch
def split_dict(batch: Dict[str, Any]) -> List[Dict[str, Any]]:
samples = []
batch = dict((k, v) for k, v in batch.items() if isinstance(v, list))
num_samples = len(max(batch.values(), key=len))
for i in range(num_samples):
samples.append(dict((k, v[i]) for k, v in batch.items()))
return samples
def reorder(samples: List, order: List[int]) -> List:
return [samples[i] for i in sorted(range(len(order)), key=lambda k: order[k])]
def k_fold(k, total, i):
trn = math.ceil(i / k * total)
tst = math.ceil((i + 1) / k * total)
return list(range(0, trn)) + list(range(tst, total)), list(range(trn, tst))
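# k_fold splits the indices [0, total) into (train, test) for fold i, e.g. with k=2, total=4:
#     k_fold(2, 4, 0)  # ([2, 3], [0, 1])
#     k_fold(2, 4, 1)  # ([0, 1], [2, 3])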
def dfs(graph, start):
seen = set()
path = []
q = [start]
while q:
v = q.pop()
if v not in seen:
seen.add(v)
path.append(v)
q.extend(graph[v])
return path
def topological_sort(graph, start):
seen = set()
stack = []
order = []
q = [start]
while q:
v = q.pop()
if v not in seen:
seen.add(v)
q.extend(graph[v])
while stack and v not in graph[stack[-1]]:
order.append(stack.pop())
stack.append(v)
return stack + order[::-1]
def prefix_match(target, sources: Iterable[str]):
if target is None:
return None
if target in sources:
return target
for each in sources:
if each.startswith(target):
return each
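# prefix_match returns the target itself if present, otherwise the first source
# starting with it:
#     prefix_match('tok', {'tok/fine': [], 'pos': []})  # 'tok/fine'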
================================================
FILE: plugins/hanlp_common/hanlp_common/visualization.py
================================================
# -*- coding:utf-8 -*-
# Modified from https://github.com/tylerneylon/explacy
import io
from collections import defaultdict
from pprint import pprint
from phrasetree.tree import Tree
def make_table(rows, insert_header=False):
col_widths = [max(len(s) for s in col) for col in zip(*rows[1:])]
rows[0] = [x[:l] for x, l in zip(rows[0], col_widths)]
fmt = '\t'.join('%%-%ds' % width for width in col_widths)
if insert_header:
rows.insert(1, ['─' * width for width in col_widths])
return '\n'.join(fmt % tuple(row) for row in rows)
def _start_end(arrow):
start, end = arrow['from'], arrow['to']
mn = min(start, end)
mx = max(start, end)
return start, end, mn, mx
def pretty_tree_horizontal(arrows, _do_print_debug_info=False):
"""Print the dependency tree horizontally
Args:
arrows:
_do_print_debug_info: (Default value = False)
Returns:
"""
# Set the base height; these may increase to allow room for arrowheads after this.
arrows_with_deps = defaultdict(set)
for i, arrow in enumerate(arrows):
arrow['underset'] = set()
if _do_print_debug_info:
print('Arrow %d: "%s" -> "%s"' % (i, arrow['from'], arrow['to']))
num_deps = 0
start, end, mn, mx = _start_end(arrow)
for j, other in enumerate(arrows):
if arrow is other:
continue
o_start, o_end, o_mn, o_mx = _start_end(other)
if ((start == o_start and mn <= o_end <= mx) or
(start != o_start and mn <= o_start <= mx)):
num_deps += 1
if _do_print_debug_info:
print('%d is over %d' % (i, j))
arrow['underset'].add(j)
arrow['num_deps_left'] = arrow['num_deps'] = num_deps
arrows_with_deps[num_deps].add(i)
if _do_print_debug_info:
print('')
print('arrows:')
pprint(arrows)
print('')
print('arrows_with_deps:')
pprint(arrows_with_deps)
# Render the arrows in characters. Some heights will be raised to make room for arrowheads.
sent_len = (max([max(arrow['from'], arrow['to']) for arrow in arrows]) if arrows else 0) + 1
lines = [[] for i in range(sent_len)]
num_arrows_left = len(arrows)
while num_arrows_left > 0:
assert len(arrows_with_deps[0])
arrow_index = arrows_with_deps[0].pop()
arrow = arrows[arrow_index]
src, dst, mn, mx = _start_end(arrow)
# Check the height needed.
height = 3
if arrow['underset']:
height = max(arrows[i]['height'] for i in arrow['underset']) + 1
height = max(height, 3, len(lines[dst]) + 3)
arrow['height'] = height
if _do_print_debug_info:
print('')
print('Rendering arrow %d: "%s" -> "%s"' % (arrow_index,
arrow['from'],
arrow['to']))
print(' height = %d' % height)
goes_up = src > dst
# Draw the outgoing src line.
if lines[src] and len(lines[src]) < height:
lines[src][-1].add('w')
while len(lines[src]) < height - 1:
lines[src].append(set(['e', 'w']))
if len(lines[src]) < height:
lines[src].append({'e'})
lines[src][height - 1].add('n' if goes_up else 's')
# Draw the incoming dst line.
lines[dst].append(u'►')
while len(lines[dst]) < height:
lines[dst].append(set(['e', 'w']))
lines[dst][-1] = set(['e', 's']) if goes_up else set(['e', 'n'])
# Draw the adjoining vertical line.
for i in range(mn + 1, mx):
while len(lines[i]) < height - 1:
lines[i].append(' ')
lines[i].append(set(['n', 's']))
# Update arrows_with_deps.
for arr_i, arr in enumerate(arrows):
if arrow_index in arr['underset']:
arrows_with_deps[arr['num_deps_left']].remove(arr_i)
arr['num_deps_left'] -= 1
arrows_with_deps[arr['num_deps_left']].add(arr_i)
num_arrows_left -= 1
return render_arrows(lines)
def render_arrows(lines):
arr_chars = {'ew': u'─',
'ns': u'│',
'en': u'└',
'es': u'┌',
'enw': u'┴',
'ensw': u'┼',
'ens': u'├',
'esw': u'┬'}
# Convert the character lists into strings.
max_len = max(len(line) for line in lines)
for i in range(len(lines)):
lines[i] = [arr_chars[''.join(sorted(ch))] if type(ch) is set else ch for ch in lines[i]]
lines[i] = ''.join(reversed(lines[i]))
lines[i] = ' ' * (max_len - len(lines[i])) + lines[i]
return lines
def render_span(begin, end, unidirectional=False):
if end - begin == 1:
return ['───►']
elif end - begin == 2:
return [
'──┐',
'──┴►',
] if unidirectional else [
'◄─┐',
'◄─┴►',
]
rows = []
for i in range(begin, end):
if i == (end - begin) // 2 + begin:
rows.append(' ├►')
elif i == begin:
rows.append('──┐' if unidirectional else '◄─┐')
elif i == end - 1:
rows.append('──┘' if unidirectional else '◄─┘')
else:
rows.append(' │')
return rows
def tree_to_list(T):
return [T.label(), [tree_to_list(t) if isinstance(t, Tree) else t for t in T]]
def list_to_tree(L):
if isinstance(L, str):
return L
return Tree(L[0], [list_to_tree(child) for child in L[1]])
def render_labeled_span(b, e, spans, labels, label, offset, unidirectional=False):
spans.extend([''] * (b - offset))
spans.extend(render_span(b, e, unidirectional))
center = b + (e - b) // 2
labels.extend([''] * (center - offset))
labels.append(label)
labels.extend([''] * (e - center - 1))
def main():
# arrows = [{'from': 1, 'to': 0}, {'from': 2, 'to': 1}, {'from': 2, 'to': 4}, {'from': 2, 'to': 5},
# {'from': 4, 'to': 3}]
# lines = pretty_tree_horizontal(arrows)
# print('\n'.join(lines))
# print('\n'.join([
# '◄─┐',
# ' │',
# ' ├►',
# ' │',
# '◄─┘',
# ]))
print('\n'.join(render_span(7, 12)))
if __name__ == '__main__':
main()
left_rule = {'<': ':', '^': ':', '>': '-'}
right_rule = {'<': '-', '^': ':', '>': ':'}
def evalute_field(record, field_spec):
"""Evalute a field of a record using the type of the field_spec as a guide.
Args:
record:
field_spec:
Returns:
"""
if type(field_spec) is int:
return str(record[field_spec])
elif type(field_spec) is str:
return str(getattr(record, field_spec))
else:
return str(field_spec(record))
def markdown_table(headings, records, fields=None, alignment=None, file=None):
"""Generate a Doxygen-flavor Markdown table from records.
See https://stackoverflow.com/questions/13394140/generate-markdown-tables
file -- Any object with a 'write' method that takes a single string
parameter.
records -- Iterable. Rows will be generated from this.
fields -- List of fields for each row. Each entry may be an integer,
string or a function. If the entry is an integer, it is assumed to be
an index of each record. If the entry is a string, it is assumed to be
a field of each record. If the entry is a function, it is called with
the record and its return value is taken as the value of the field.
headings -- List of column headings.
    alignment -- List of pairs of alignment characters. The first of the pair
    specifies the alignment of the header (Doxygen won't respect this, but
    it might look good), the second specifies the alignment of the cells in
    the column.
Possible alignment characters are:
'<' = Left align
'>' = Right align (default for cells)
'^' = Center (default for column headings)
Args:
headings:
records:
fields: (Default value = None)
alignment: (Default value = None)
file: (Default value = None)
Returns:
"""
if not file:
file = io.StringIO()
num_columns = len(headings)
if not fields:
fields = list(range(num_columns))
assert len(headings) == num_columns
# Compute the table cell data
columns = [[] for i in range(num_columns)]
for record in records:
for i, field in enumerate(fields):
columns[i].append(evalute_field(record, field))
# Fill out any missing alignment characters.
extended_align = alignment if alignment is not None else [('^', '<')]
if len(extended_align) > num_columns:
extended_align = extended_align[0:num_columns]
elif len(extended_align) < num_columns:
extended_align += [('^', '>') for i in range(num_columns - len(extended_align))]
heading_align, cell_align = [x for x in zip(*extended_align)]
field_widths = [len(max(column, key=len)) if len(column) > 0 else 0
for column in columns]
heading_widths = [max(len(head), 2) for head in headings]
column_widths = [max(x) for x in zip(field_widths, heading_widths)]
_ = ' | '.join(['{:' + a + str(w) + '}'
for a, w in zip(heading_align, column_widths)])
heading_template = '| ' + _ + ' |'
_ = ' | '.join(['{:' + a + str(w) + '}'
for a, w in zip(cell_align, column_widths)])
row_template = '| ' + _ + ' |'
_ = ' | '.join([left_rule[a] + '-' * (w - 2) + right_rule[a]
for a, w in zip(cell_align, column_widths)])
ruling = '| ' + _ + ' |'
file.write(heading_template.format(*headings).rstrip() + '\n')
file.write(ruling.rstrip() + '\n')
for row in zip(*columns):
file.write(row_template.format(*row).rstrip() + '\n')
if isinstance(file, io.StringIO):
text = file.getvalue()
file.close()
return text
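# Minimal sketch: integer field specs index into each record (here, tuples):
#     print(markdown_table(['ID', 'FORM'], [(1, 'Hello'), (2, 'world')]))
# which emits a Doxygen-flavor Markdown table with a ruling row under the headings.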
================================================
FILE: plugins/hanlp_common/setup.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 19:26
from os.path import abspath, join, dirname
from setuptools import find_packages, setup
this_dir = abspath(dirname(__file__))
with open(join(this_dir, 'README.md'), encoding='utf-8') as file:
long_description = file.read()
setup(
name='hanlp_common',
version='0.0.22',
description='HanLP: Han Language Processing',
long_description=long_description,
long_description_content_type="text/markdown",
url='https://github.com/hankcs/HanLP',
author='hankcs',
author_email='hankcshe@gmail.com',
license='Apache License 2.0',
classifiers=[
'Intended Audience :: Science/Research',
'Intended Audience :: Developers',
"Development Status :: 3 - Alpha",
'Operating System :: OS Independent',
"License :: OSI Approved :: Apache Software License",
'Programming Language :: Python :: 3 :: Only',
'Topic :: Scientific/Engineering :: Artificial Intelligence',
"Topic :: Text Processing :: Linguistic"
],
keywords='corpus,machine-learning,NLU,NLP',
packages=find_packages(exclude=['docs', 'tests*']),
include_package_data=True,
install_requires=[
'phrasetree>=0.0.9',
],
extras_require={
# These AMR dependencies might not be necessary for most people.
'full': [
'networkx',
'penman==0.6.2',
],
},
python_requires='>=3.6',
)
================================================
FILE: plugins/hanlp_demo/README.md
================================================
# Demos and examples for HanLP
This package is intended for demonstration purposes and won't be released to PyPI. **Training requires a fair understanding of Linux and Python, which might not be the case for everybody.**
You need a Linux/macOS system with Internet access because some corpora and bash scripts will be downloaded during training. Training on Windows might work if you are an expert, but such setups are rare.
The training scripts invoke `python3`, so your `python3` command must point to a Python 3 interpreter even if `python` points to Python 2.
You need to install this package and run it from the **root** folder of HanLP.
```bash
pip install -e plugins/hanlp_demo
python3 plugins/hanlp_demo/hanlp_demo/zh/train/open_small.py
```
================================================
FILE: plugins/hanlp_demo/hanlp_demo/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-11-29 17:48
================================================
FILE: plugins/hanlp_demo/hanlp_demo/block_windows.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-07-28 21:38
from hanlp.utils.io_util import windows
assert not windows(), 'Windows is not supported for this script. Please run it on Linux systems.'
================================================
FILE: plugins/hanlp_demo/hanlp_demo/en/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-01-01 17:55
================================================
FILE: plugins/hanlp_demo/hanlp_demo/en/demo_amr.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2022-01-25 19:09
import hanlp
amr_parser = hanlp.load(hanlp.pretrained.amr.AMR3_SEQ2SEQ_BART_LARGE)
amr = amr_parser('The boy wants the girl to believe him.')
print(amr)
================================================
FILE: plugins/hanlp_demo/hanlp_demo/en/demo_dep.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-01-01 17:55
import hanlp
syntactic_parser = hanlp.load(hanlp.pretrained.dep.PTB_BIAFFINE_DEP_EN)
sent = [('Is', 'VBZ'),
('this', 'DT'),
('the', 'DT'),
('future', 'NN'),
('of', 'IN'),
('chamber', 'NN'),
('music', 'NN'),
('?', '.')]
tree = syntactic_parser(sent)
print(tree)
================================================
FILE: plugins/hanlp_demo/hanlp_demo/en/demo_lm.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-02-11 09:14
import hanlp
lm = hanlp.load(hanlp.pretrained.rnnlm.FLAIR_LM_FW_WMT11_EN_TF)
print(''.join(lm.generate_text(list('hello'))))
================================================
FILE: plugins/hanlp_demo/hanlp_demo/en/demo_ner.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-01-03 22:50
import hanlp
recognizer = hanlp.load(hanlp.pretrained.ner.CONLL03_NER_BERT_BASE_CASED_EN)
print(recognizer(["President", "Obama", "is", "speaking", "at", "the", "White", "House", "."]))
================================================
FILE: plugins/hanlp_demo/hanlp_demo/en/demo_pipeline.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-01-04 21:05
import hanlp
from hanlp.utils.lang.en.english_tokenizer import tokenize_english
tokenizer = tokenize_english
tagger = hanlp.load(hanlp.pretrained.pos.PTB_POS_RNN_FASTTEXT_EN)
syntactic_parser = hanlp.load(hanlp.pretrained.dep.PTB_BIAFFINE_DEP_EN)
semantic_parser = hanlp.load(hanlp.pretrained.sdp.SEMEVAL15_PAS_BIAFFINE_EN)
pipeline = hanlp.pipeline() \
.append(hanlp.utils.rules.split_sentence, output_key='sentences') \
.append(tokenizer, output_key='tokens') \
.append(tagger, output_key='part_of_speech_tags') \
.append(syntactic_parser, input_key=('tokens', 'part_of_speech_tags'), output_key='syntactic_dependencies',
conll=False) \
.append(semantic_parser, input_key=('tokens', 'part_of_speech_tags'), output_key='semantic_dependencies',
conll=False)
print(pipeline)
text = '''Jobs and Wozniak co-founded Apple in 1976 to sell Wozniak's Apple I personal computer.
Together the duo gained fame and wealth a year later with the Apple II.
'''
doc = pipeline(text)
print(doc)
# You can save the config to disk for deploying or sharing.
pipeline.save('en.json')
# Then load it smoothly.
deployed = hanlp.load('en.json')
print(deployed)
print(deployed(text))
================================================
FILE: plugins/hanlp_demo/hanlp_demo/en/demo_pos.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-01-03 22:16
import hanlp
tagger = hanlp.load(hanlp.pretrained.pos.PTB_POS_RNN_FASTTEXT_EN)
print(tagger([['I', 'banked', '2', 'dollars', 'in', 'a', 'bank', '.'],
['Is', 'this', 'the', 'future', 'of', 'chamber', 'music', '?']]))
================================================
FILE: plugins/hanlp_demo/hanlp_demo/en/demo_sdp.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-01-03 15:26
import hanlp
from hanlp_common.conll import CoNLLSentence
# semeval15 offers three independent annotations over the Penn Treebank (PTB)
semantic_parser = hanlp.load(hanlp.pretrained.sdp.SEMEVAL15_PAS_BIAFFINE_EN)
# semantic_parser = hanlp.load(hanlp.pretrained.sdp.SEMEVAL15_DM_BIAFFINE_EN)
# semantic_parser = hanlp.load(hanlp.pretrained.sdp.SEMEVAL15_PSD_BIAFFINE_EN)
sent = [('Is', 'VBZ'),
('this', 'DT'),
('the', 'DT'),
('future', 'NN'),
('of', 'IN'),
('chamber', 'NN'),
('music', 'NN'),
('?', '.')]
tree = semantic_parser(sent) # type:CoNLLSentence
print(tree)
================================================
FILE: plugins/hanlp_demo/hanlp_demo/en/demo_sentiment_analysis.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-01-01 03:52
import hanlp
classifier = hanlp.load('SST2_ALBERT_BASE_EN')
print(classifier.predict('I feel lucky'))
================================================
FILE: plugins/hanlp_demo/hanlp_demo/en/demo_tok.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-01-02 19:41
from hanlp.utils.lang.en.english_tokenizer import tokenize_english
text = """\
Don't go gentle into that good night.
"""
print(tokenize_english(text))
================================================
FILE: plugins/hanlp_demo/hanlp_demo/en/train_sst2_albert_base.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-11-10 17:41
import os
from hanlp.components.classifiers.transformer_classifier_tf import TransformerClassifierTF
from tests import cdroot
from hanlp.datasets.glu.glue import STANFORD_SENTIMENT_TREEBANK_2_DEV, STANFORD_SENTIMENT_TREEBANK_2_TRAIN, \
STANFORD_SENTIMENT_TREEBANK_2_TEST
cdroot()
save_dir = os.path.join('data', 'model', 'sst', 'sst2_albert_base')
classifier = TransformerClassifierTF()
classifier.fit(STANFORD_SENTIMENT_TREEBANK_2_TRAIN, STANFORD_SENTIMENT_TREEBANK_2_DEV, save_dir,
transformer='albert-base-v2')
classifier.load(save_dir)
print(classifier('it\' s a charming and often affecting journey'))
classifier.evaluate(STANFORD_SENTIMENT_TREEBANK_2_TEST, save_dir=save_dir)
print(f'Model saved in {save_dir}')
================================================
FILE: plugins/hanlp_demo/hanlp_demo/ja/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-05-17 22:30
================================================
FILE: plugins/hanlp_demo/hanlp_demo/ja/demo_mtl.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-05-17 22:30
import hanlp
from hanlp_common.document import Document
HanLP = hanlp.load(hanlp.pretrained.mtl.NPCMJ_UD_KYOTO_TOK_POS_CON_BERT_BASE_CHAR_JA)
doc: Document = HanLP([
'2021年、HanLPv2.1は次世代の最先端多言語NLP技術を本番環境に導入します。',
'奈須きのこは1973年11月28日に千葉県円空山で生まれ、ゲーム制作会社「ノーツ」の設立者だ。',
])
print(doc)
doc.pretty_print()
================================================
FILE: plugins/hanlp_demo/hanlp_demo/mul/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-31 22:25
================================================
FILE: plugins/hanlp_demo/hanlp_demo/mul/demo_lid.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2022-09-28 16:49
import hanlp
lid = hanlp.load(hanlp.pretrained.classifiers.LID_176_FASTTEXT_BASE)
print(lid('In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environments.'))
lang, prob = lid('2021年、HanLPv2.1は次世代の最先端多言語NLP技術を本番環境に導入します。', prob=True)
print(f'{lang} language identified with probability {prob:.3%}')
print(lid('2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', topk=2))
# For a combination of languages, predict top-k languages with probabilities:
text = '''
2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。
In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environments.
'''
print(lid(text, topk=3, prob=True))
================================================
FILE: plugins/hanlp_demo/hanlp_demo/mul/demo_lid_restful.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2022-09-28 16:49
from hanlp_restful import HanLPClient
HanLP = HanLPClient('https://hanlp.hankcs.com/api', auth=None, language='mul')
print(HanLP.language_identification([
'In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environment.',
'2021年、HanLPv2.1は次世代の最先端多言語NLP技術を本番環境に導入します。',
'2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。',
]))
================================================
FILE: plugins/hanlp_demo/hanlp_demo/mul/demo_mtl.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-31 13:51
import hanlp
from hanlp_common.document import Document
HanLP = hanlp.load(hanlp.pretrained.mtl.UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_XLMR_BASE)
doc: Document = HanLP([
'In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environment.',
'2021年、HanLPv2.1は次世代の最先端多言語NLP技術を本番環境に導入します。',
'2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。',
])
print(doc)
doc.pretty_print()
================================================
FILE: plugins/hanlp_demo/hanlp_demo/mul/train/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2023-02-21 19:40
================================================
FILE: plugins/hanlp_demo/hanlp_demo/mul/train/mul_base.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-03 14:24
from hanlp.common.dataset import SortingSamplerBuilder
from hanlp.common.transform import NormalizeToken
from hanlp.components.mtl.multi_task_learning import MultiTaskLearning
from hanlp.components.mtl.tasks.tok.tag_tok import TaggingTokenization
from hanlp.components.mtl.tasks.ud import UniversalDependenciesParsing
from hanlp.datasets.parsing.ptb import PTB_TOKEN_MAPPING
from hanlp.datasets.parsing.ud.ud210m import UD_210_MULTILINGUAL_TRAIN, UD_210_MULTILINGUAL_DEV, \
UD_210_MULTILINGUAL_TEST
from hanlp.layers.embeddings.contextual_word_embedding import ContextualWordEmbedding
from hanlp.utils.log_util import cprint
from tests import cdroot
def main():
    cdroot()
    transformer = "nreimers/mMiniLMv2-L12-H384-distilled-from-XLMR-Large"
    tasks = {
        'tok': TaggingTokenization(
            'data/mtl/mul/tok/train.tsv',
            'data/mtl/mul/tok/dev.tsv',
            'data/mtl/mul/tok/test.tsv',
            SortingSamplerBuilder(batch_size=128, batch_max_tokens=12800),
            hard_constraint=True,
            tagging_scheme='BMES',
            delimiter='\t',
            max_seq_len=256,
            char_level=True,
            lr=1e-3,
        ),
        'ud': UniversalDependenciesParsing(
            UD_210_MULTILINGUAL_TRAIN,
            UD_210_MULTILINGUAL_DEV,
            UD_210_MULTILINGUAL_TEST,
            SortingSamplerBuilder(batch_size=128, batch_max_tokens=12800),
            lr=1e-3,
            dependencies='tok',
            max_seq_len=256,
        ),
    }
    mtl = MultiTaskLearning()
    save_dir = 'data/model/mtl/ud_ontonotes_tok_pos_lem_fea_ner_srl_dep_sdp_con_mMiniLMv2L12'
    cprint(f'Model will be saved in [cyan]{save_dir}[/cyan]')
    mtl.fit(
        ContextualWordEmbedding(
            'token',
            transformer,
            average_subwords=True,
            max_sequence_length=512,
            word_dropout=.2,
        ),
        tasks,
        save_dir,
        30,  # epochs
        lr=1e-3,
        encoder_lr=5e-5,  # smaller learning rate for the pretrained encoder
        grad_norm=1,
        gradient_accumulation=8,
        eval_trn=False,
        transform=NormalizeToken(PTB_TOKEN_MAPPING, 'token'),
        tau=0.5,  # temperature smoothing the sampling distribution across tasks
        cache='data/cache/ud/mtl',
    )
    cprint(f'Model saved in [cyan]{save_dir}[/cyan]')
    mtl.load(save_dir)
    mtl['tok'].dict_force = {"'s", "n't", "'ll", "'m", "'d", "'ve", "'re"}  # force-split English clitics
    mtl['ud'].config.tree = True  # guarantee well-formed trees at prediction time
    mtl.save_config(save_dir)
    for k, v in mtl.tasks.items():
        v.trn = tasks[k].trn
        v.dev = tasks[k].dev
        v.tst = tasks[k].tst
    mtl.evaluate(save_dir)
    doc = mtl(['In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environments.',
               '2021年、HanLPv2.1は次世代の最先端多言語NLP技術を本番環境に導入します。',
               '2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。'])
    doc.pretty_print()


if __name__ == '__main__':
    main()
================================================
FILE: plugins/hanlp_demo/hanlp_demo/sent_split.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-31 14:23
import hanlp
split_sent = hanlp.load(hanlp.pretrained.eos.UD_CTB_EOS_MUL)
output = split_sent('3.14 is pi. “你好!!!”——他说。劇場版「Fate/stay night [HF]」最終章公開カウントダウン!')
print('\n'.join(output))
# See also https://hanlp.hankcs.com/docs/api/hanlp/components/eos.html
================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-31 13:51
================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/abstractive_summarization_restful.ipynb
================================================
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "WfGpInivS0fG"
},
"source": [
"点击下列图标在线运行HanLP \n",
"\n",
"\t
\n",
"\t
\n",
"
\n",
"\n",
"## 安装"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "IYwV-UkNNzFp"
},
"source": [
"无论是Windows、Linux还是macOS,HanLP的安装只需一句话搞定:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "1Uf_u7ddMhUt"
},
"outputs": [],
"source": [
"!pip install hanlp_restful -U"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "pp-1KqEOOJ4t"
},
"source": [
"## 创建客户端"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "4M7ka0K5OMWU",
"outputId": "d74f0749-0587-454a-d7c9-7418d45ce534"
},
"outputs": [],
"source": [
"from hanlp_restful import HanLPClient\n",
"HanLP = HanLPClient('https://www.hanlp.com/api', auth=None, language='zh') # auth不填则匿名,zh中文,mul多语种"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "BMW528wGNulM"
},
"source": [
"#### 申请秘钥\n",
"由于服务器算力有限,匿名用户每分钟限2次调用。如果你需要更多调用次数,[建议申请免费公益API秘钥auth](https://bbs.hanlp.com/t/hanlp2-1-restful-api/53)。"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "elA_UyssOut_"
},
"source": [
"## 生成式自动摘要\n",
"生成式自动摘要(Abstractive Summarization)任务的目标是为文章生成一段简短的概括性摘要。 生成的摘要有可能出现原文中不存在的新短语或新句子,并且整体流畅性较高。\n",
"### 中文\n",
"生成式自动摘要任务的输入为一段文本:"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "BqEmDMGGOtk3",
"outputId": "936d439a-e1ff-4308-d2aa-775955558594"
},
"outputs": [
{
"data": {
"text/plain": [
"'长江证券:看好大金属品种中的铜铝钢'"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"HanLP.abstractive_summarization('''\n",
"每经AI快讯,2月4日,长江证券研究所金属行业首席分析师王鹤涛表示,2023年海外经济衰退,美债现处于历史高位,\n",
"黄金的趋势是值得关注的;在国内需求修复的过程中,看好大金属品种中的铜铝钢。\n",
"此外,在细分的小品种里,建议关注两条主线,一是新能源,比如锂、钴、镍、稀土,二是专精特新主线。(央视财经)\n",
"''')"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "jj1Jk-2sPHYx"
},
"source": [
"返回值为一段摘要。"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 英文\n",
"按照HanLP一贯的多语种设计,任何语言都支持。由于服务器GPU资源限制,目前英文接口暂未上线。如果你有相应需求,欢迎前往论坛发起请愿。"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"collapsed_sections": [],
"name": "absum_restful.ipynb",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/amr_restful.ipynb
================================================
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "WfGpInivS0fG"
},
"source": [
"点击下列图标在线运行HanLP \n",
"\n",
"\t
\n",
"\t
\n",
"
\n",
"\n",
"## 安装"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "IYwV-UkNNzFp"
},
"source": [
"无论是Windows、Linux还是macOS,HanLP的安装只需一句话搞定:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "1Uf_u7ddMhUt"
},
"outputs": [],
"source": [
"!pip install hanlp_restful -U"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "pp-1KqEOOJ4t"
},
"source": [
"## 创建客户端"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "4M7ka0K5OMWU",
"outputId": "d74f0749-0587-454a-d7c9-7418d45ce534"
},
"outputs": [],
"source": [
"from hanlp_restful import HanLPClient\n",
"HanLP = HanLPClient('https://www.hanlp.com/api', auth=None, language='zh') # auth不填则匿名,zh中文,mul多语种"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "BMW528wGNulM"
},
"source": [
"#### 申请秘钥\n",
"由于服务器算力有限,匿名用户每分钟限2次调用。如果你需要更多调用次数,[建议申请免费公益API秘钥auth](https://bbs.hanlp.com/t/hanlp2-1-restful-api/53)。"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "elA_UyssOut_"
},
"source": [
"## 抽象意义表示\n",
"### 中文\n",
"抽象意义表示任务的输入为一段文本或已分词完毕的句子:"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "BqEmDMGGOtk3",
"outputId": "936d439a-e1ff-4308-d2aa-775955558594"
},
"outputs": [
{
"data": {
"text/plain": [
"1"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"graphs = HanLP.abstract_meaning_representation('男孩希望女孩相信他。')\n",
"len(graphs)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "jj1Jk-2sPHYx"
},
"source": [
"返回值为每个句子相应的AMR图的Meaning Representation格式:"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'id': '0',\n",
" 'input': '男孩 希望 女孩 相信 他 。',\n",
" 'nodes': [{'id': 0,\n",
" 'label': '男孩',\n",
" 'anchors': [{'from': 0, 'to': 2}, {'from': 12, 'to': 13}]},\n",
" {'id': 1, 'label': '希望-01', 'anchors': [{'from': 3, 'to': 5}]},\n",
" {'id': 2, 'label': '女孩', 'anchors': [{'from': 6, 'to': 8}]},\n",
" {'id': 3, 'label': '相信-01', 'anchors': [{'from': 9, 'to': 11}]}],\n",
" 'edges': [{'source': 1, 'target': 3, 'label': 'arg1'},\n",
" {'source': 1, 'target': 0, 'label': 'arg0'},\n",
" {'source': 3, 'target': 2, 'label': 'arg0'},\n",
" {'source': 3, 'target': 0, 'label': 'arg1'}],\n",
" 'tops': [1],\n",
" 'framework': 'amr'}"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"graph = graphs[0]\n",
"graph"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"注意上面“男孩”有2个anchor,分别对应“男孩”和“他”。也就是说,MR格式其实包含了指代消解的结果。"
]
},
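{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a minimal sketch (reusing the MR dict bound to `graph` above), the anchors can be mapped back onto the input string to verify this:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Look up the node labeled 男孩 and print the text span of each of its anchors.\n",
"boy = next(n for n in graph['nodes'] if n['label'] == '男孩')\n",
"print([graph['input'][a['from']:a['to']] for a in boy['anchors']])  # ['男孩', '他']"
]
},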
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 可视化\n",
"指定`visualization='svg'`即可得到矢量图可视化。"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"0 \n",
" \n",
"\n",
"\n",
"\n",
"1 \n",
"\n",
"希望-01 \n",
" \n",
"\n",
"\n",
"top->1 \n",
" \n",
" \n",
" \n",
"\n",
"\n",
"0 \n",
"\n",
"男孩 \n",
" \n",
"\n",
"\n",
"1->0 \n",
" \n",
" \n",
"arg0 \n",
" \n",
"\n",
"\n",
"3 \n",
"\n",
"相信-01 \n",
" \n",
"\n",
"\n",
"1->3 \n",
" \n",
" \n",
"arg1 \n",
" \n",
"\n",
"\n",
"3->0 \n",
" \n",
" \n",
"arg1 \n",
" \n",
"\n",
"\n",
"2 \n",
"\n",
"女孩 \n",
" \n",
"\n",
"\n",
"3->2 \n",
" \n",
" \n",
"arg0 \n",
" \n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from IPython.display import SVG, display\n",
"\n",
"def show_svg(g):\n",
" display(SVG(data=g['svg']))\n",
" \n",
"graph = HanLP.abstract_meaning_representation('男孩希望女孩相信他。', visualization='svg')[0]\n",
"show_svg(graph)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 多语种支持\n",
"除了中文外,支持的语言列表:"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 英文\n",
"目前,HanLP服务器还支持英文AMR:"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"0 \n",
" \n",
"\n",
"\n",
"\n",
"1 \n",
"\n",
"want-01 \n",
" \n",
"\n",
"\n",
"top->1 \n",
" \n",
" \n",
" \n",
"\n",
"\n",
"0 \n",
"\n",
"boy \n",
" \n",
"\n",
"\n",
"1->0 \n",
" \n",
" \n",
"arg0 \n",
" \n",
"\n",
"\n",
"3 \n",
"\n",
"believe-01 \n",
" \n",
"\n",
"\n",
"1->3 \n",
" \n",
" \n",
"arg1 \n",
" \n",
"\n",
"\n",
"3->0 \n",
" \n",
" \n",
"arg1 \n",
" \n",
"\n",
"\n",
"2 \n",
"\n",
"girl \n",
" \n",
"\n",
"\n",
"3->2 \n",
" \n",
" \n",
"arg0 \n",
" \n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"graph = HanLP.abstract_meaning_representation('The boy wants the girl to believe him.',\n",
" language='en', visualization='svg')[0]\n",
"show_svg(graph)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"用户可以通过指定`language`参数来实现英文抽象意义表示的分析:"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'id': '0',\n",
" 'input': 'The boy wants the girl to believe him .',\n",
" 'nodes': [{'id': 0, 'label': 'boy'},\n",
" {'id': 1, 'label': 'wants-01'},\n",
" {'id': 2, 'label': 'girl'},\n",
" {'id': 3, 'label': 'believe-01'}],\n",
" 'edges': [{'source': 3, 'target': 0, 'label': 'arg1'},\n",
" {'source': 1, 'target': 3, 'label': 'arg1'},\n",
" {'source': 3, 'target': 2, 'label': 'arg0'},\n",
" {'source': 1, 'target': 0, 'label': 'arg0'}],\n",
" 'tops': [1],\n",
" 'framework': 'amr'}"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"HanLP.abstract_meaning_representation(tokens=[['The', 'boy', 'wants', 'the', 'girl', 'to', 'believe', 'him', '.']], \n",
" language='en')[0]"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"collapsed_sections": [],
"name": "amr_stl.ipynb",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/amr_stl.ipynb
================================================
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "WfGpInivS0fG"
},
"source": [
"点击下列图标在线运行HanLP \n",
"\n",
"\t
\n",
"\t
\n",
"
\n",
"\n",
"## 安装"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "IYwV-UkNNzFp"
},
"source": [
"无论是Windows、Linux还是macOS,HanLP的安装只需一句话搞定:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "1Uf_u7ddMhUt"
},
"outputs": [],
"source": [
"!pip install hanlp[amr] -U"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "pp-1KqEOOJ4t"
},
"source": [
"## 加载模型\n",
"HanLP的工作流程是先加载模型,模型的标示符存储在`hanlp.pretrained`这个包中,按照NLP任务归类。"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "4M7ka0K5OMWU",
"outputId": "d74f0749-0587-454a-d7c9-7418d45ce534"
},
"outputs": [
{
"data": {
"text/plain": [
"{'AMR3_SEQ2SEQ_BART_LARGE': 'https://file.hankcs.com/hanlp/amr/amr3_seq2seq_bart_large_83.30_20220125_114450.zip',\n",
" 'MRP2020_AMR_ENG_ZHO_XLM_BASE': 'http://download.hanlp.com/amr/extra/amr-eng-zho-xlm-roberta-base_20220412_223756.zip',\n",
" 'MRP2020_AMR_ZHO_MENGZI_BASE': 'http://download.hanlp.com/amr/extra/amr-zho-mengzi-base_20220415_101941.zip'}"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import hanlp\n",
"hanlp.pretrained.amr.ALL # 语种见名称最后一个字段或相应语料库"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "BMW528wGNulM"
},
"source": [
"调用`hanlp.load`进行加载,模型会自动下载到本地缓存。"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "0tmKBu7sNAXX",
"outputId": "df2de87b-27f5-4c72-8eb2-25ceefdd8270"
},
"outputs": [],
"source": [
"amr = hanlp.load('MRP2020_AMR_ENG_ZHO_XLM_BASE')"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "elA_UyssOut_"
},
"source": [
"## 抽象意义表示\n",
"抽象意义表示任务的输入为一个或多个句子,`MRP2020_AMR_ENG_ZHO_XLM_BASE`要求提供分词完毕的句子:"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "BqEmDMGGOtk3",
"outputId": "936d439a-e1ff-4308-d2aa-775955558594"
},
"outputs": [],
"source": [
"graph = amr([\"男孩\", \"希望\", \"女孩\", \"相信\", \"他\", \"。\"])"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "jj1Jk-2sPHYx"
},
"source": [
"返回对象为[penman.Graph](https://penman.readthedocs.io/en/latest/api/penman.graph.html)类型:"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
""
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"graph"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"打印时为友好格式:"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(x2 / 希望-01\n",
" :arg1 (x4 / 相信-01\n",
" :arg0 (x3 / 女孩)\n",
" :arg1 x1)\n",
" :arg0 (x1 / 男孩))\n"
]
}
],
"source": [
"print(graph)"
]
},
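{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a minimal sketch (assuming the standard `penman` API), the graph can also be traversed programmatically:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Each triple is (source, role, target); instances() yields the concept nodes.\n",
"print(graph.triples)\n",
"print([i.target for i in graph.instances()])  # concepts such as 希望-01"
]
},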
{
"cell_type": "markdown",
"metadata": {},
"source": [
"该AMR的可视化结果为:\n",
"\n",
""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"`MRP2020_AMR_ENG_ZHO_XLM_BASE`其实是一个Meaning Representation Parsing模型,支持输出Meaning Representation(MR)格式,该格式比AMR的表达力更强:"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'id': '0',\n",
" 'input': '男孩 希望 女孩 相信 他 。',\n",
" 'nodes': [{'id': 0,\n",
" 'label': '男孩',\n",
" 'anchors': [{'from': 0, 'to': 2}, {'from': 12, 'to': 13}]},\n",
" {'id': 1, 'label': '希望-01', 'anchors': [{'from': 3, 'to': 5}]},\n",
" {'id': 2, 'label': '女孩', 'anchors': [{'from': 6, 'to': 8}]},\n",
" {'id': 3, 'label': '相信-01', 'anchors': [{'from': 9, 'to': 11}]}],\n",
" 'edges': [{'source': 1, 'target': 3, 'label': 'arg1'},\n",
" {'source': 1, 'target': 0, 'label': 'arg0'},\n",
" {'source': 3, 'target': 2, 'label': 'arg0'},\n",
" {'source': 3, 'target': 0, 'label': 'arg1'}],\n",
" 'tops': [1],\n",
" 'framework': 'amr'}"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"amr([\"男孩\", \"希望\", \"女孩\", \"相信\", \"他\", \"。\"], output_amr=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"注意上面“男孩”有2个anchor,分别对应“男孩”和“他”。也就是说,MR格式其实包含了指代消解的结果。"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 多语种支持\n",
"`MRP2020_AMR_ENG_ZHO_XLM_BASE`同时还是一个Cross-Lingual模型,支持的语言列表:"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[['amr', 'eng'], ['amr', 'zho']]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"amr.config.frameworks"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"用户可以通过指定language参数来实现英文抽象意义表示的分析:"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(w1 / wants-01\n",
" :arg1 (b2 / believe-01\n",
" :arg0 (g1 / girl)\n",
" :arg1 b1)\n",
" :arg0 (b1 / boy))\n"
]
}
],
"source": [
"print(amr(['The', 'boy', 'wants', 'the', 'girl', 'to', 'believe', 'him', '.'], language='eng'))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"为了达到最佳效果,建议同时提供每个词的词干:"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(w1 / want-01\n",
" :arg1 (b2 / believe-01\n",
" :arg0 (g1 / girl)\n",
" :arg1 b1)\n",
" :arg0 (b1 / boy))\n"
]
}
],
"source": [
"print(amr([('The', 'the'), ('boy', 'boy'), ('wants', 'want'), ('the', 'the'), ('girl', 'girl'), ('to', 'to'),\n",
" ('believe', 'believe'), ('him', 'he'), ('.', '.')], language='eng'))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"该AMR的可视化结果为:\n",
"\n",
""
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"collapsed_sections": [],
"name": "amr_stl.ipynb",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/classification_restful.ipynb
================================================
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "WfGpInivS0fG"
},
"source": [
"点击下列图标在线运行HanLP \n",
"\n",
"\t
\n",
"\t
\n",
"
\n",
"\n",
"## 安装"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "nf9TgeCTC0OT"
},
"source": [
"无论是Windows、Linux还是macOS,HanLP的安装只需一句话搞定:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "jaW4eu6kC0OU",
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"!pip install hanlp_restful -U"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "_xI_bLAaC0OU"
},
"source": [
"## 创建客户端"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "IYwV-UkNNzFp",
"outputId": "54065443-9b0a-444c-f6c0-c701bc86400b",
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"from hanlp_restful import HanLPClient\n",
"HanLP = HanLPClient('https://www.hanlp.com/api', auth=None, language='zh') # auth不填则匿名,zh中文,mul多语种"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "1Uf_u7ddMhUt",
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"#### 申请秘钥\n",
"由于服务器算力有限,匿名用户每分钟限2次调用。如果你需要更多调用次数,[建议申请免费公益API秘钥auth](https://bbs.hanlp.com/t/hanlp2-1-restful-api/53)。"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "elA_UyssOut_"
},
"source": [
"## 文本分类\n",
"文本分类任务的输入为文档以及分类模型,以新闻领域的`news_zh`为例:"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"id": "BqEmDMGGOtk3"
},
"outputs": [
{
"data": {
"text/plain": [
"'科技'"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"HanLP.text_classification('2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', model='news_zh')"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "SwaPn1hjC0OW"
},
"source": [
"返回值为文档最可能的类目。HanLP支持返回类目对应的概率(置信度):"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "egpWwHKxC0OX",
"outputId": "f7c77687-dd75-4fa2-dbd2-be6bda8a3fff"
},
"outputs": [
{
"data": {
"text/plain": [
"['科技', 0.999642014503479]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"HanLP.text_classification('2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', model='news_zh', prob=True)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "kq_j5TLFC0OX"
},
"source": [
"HanLP也支持返回概率最高的`topk`个类目:"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "isJhzYyIC0OX",
"outputId": "683c8489-dffc-426e-f95b-e91dfb373260"
},
"outputs": [
{
"data": {
"text/plain": [
"['科技', '家居']"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"HanLP.text_classification('2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', model='news_zh', topk=2)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"该功能对于混合了多个主题的文档而言特别实用:"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"{'时尚': 0.6342714428901672,\n",
" '家居': 0.359315425157547,\n",
" '科技': 0.0013340614968910813,\n",
" '体育': 0.001275017624720931,\n",
" '房产': 0.0010209722677245736,\n",
" '娱乐': 0.0006360886618494987,\n",
" '财经': 0.0005668793455697596,\n",
" '游戏': 0.00037119409535080194,\n",
" '教育': 0.00029694309341721237,\n",
" '股票': 0.0002858955995179713,\n",
" '星座': 0.0002288677787873894,\n",
" '彩票': 0.00022682634880766273,\n",
" '时政': 0.0001005345256999135,\n",
" '社会': 6.985480285948142e-05}"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"text = '''\n",
"改了好几次,感觉终于可以确定了。\n",
"这次的真丝是做了古董感的米金色染色,法蕾也做了同样的颜色。\n",
"真丝软糯的手感和温柔的光泽感,在即将结束的冬天,显得格外的美好。\n",
"'''\n",
"\n",
"HanLP.text_classification(text, model='news_zh', topk=True, prob=True)"
]
},
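{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a minimal sketch (reusing `text` and the probability dict returned above), the result can be filtered down to just the confident categories:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Keep only the categories whose probability exceeds a threshold.\n",
"scores = HanLP.text_classification(text, model='news_zh', topk=True, prob=True)\n",
"print([label for label, p in scores.items() if p > 0.1])  # ['时尚', '家居']"
]
}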
],
"metadata": {
"colab": {
"collapsed_sections": [],
"name": "classification_restful.ipynb",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/con_mtl.ipynb
================================================
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "WfGpInivS0fG"
},
"source": [
"点击下列图标在线运行HanLP \n",
"\n",
"\t
\n",
"\t
\n",
"
\n",
"\n",
"## 安装"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "IYwV-UkNNzFp"
},
"source": [
"无论是Windows、Linux还是macOS,HanLP的安装只需一句话搞定:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "1Uf_u7ddMhUt"
},
"outputs": [],
"source": [
"!pip install hanlp -U"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "pp-1KqEOOJ4t"
},
"source": [
"## 加载模型\n",
"HanLP的工作流程是先加载模型,模型的标示符存储在`hanlp.pretrained`这个包中,按照NLP任务归类。"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"id": "0tmKBu7sNAXX"
},
"outputs": [
{
"data": {
"text/plain": [
"{'OPEN_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH': 'https://file.hankcs.com/hanlp/mtl/open_tok_pos_ner_srl_dep_sdp_con_electra_small_20201223_035557.zip',\n",
" 'OPEN_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH': 'https://file.hankcs.com/hanlp/mtl/open_tok_pos_ner_srl_dep_sdp_con_electra_base_20201223_201906.zip',\n",
" 'CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH': 'https://file.hankcs.com/hanlp/mtl/close_tok_pos_ner_srl_dep_sdp_con_electra_small_20210111_124159.zip',\n",
" 'CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH': 'https://file.hankcs.com/hanlp/mtl/close_tok_pos_ner_srl_dep_sdp_con_electra_base_20210111_124519.zip',\n",
" 'CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ERNIE_GRAM_ZH': 'https://file.hankcs.com/hanlp/mtl/close_tok_pos_ner_srl_dep_sdp_con_ernie_gram_base_aug_20210904_145403.zip',\n",
" 'UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_MT5_SMALL': 'https://file.hankcs.com/hanlp/mtl/ud_ontonotes_tok_pos_lem_fea_ner_srl_dep_sdp_con_mt5_small_20210228_123458.zip',\n",
" 'UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_XLMR_BASE': 'https://file.hankcs.com/hanlp/mtl/ud_ontonotes_tok_pos_lem_fea_ner_srl_dep_sdp_con_xlm_base_20210602_211620.zip',\n",
" 'NPCMJ_UD_KYOTO_TOK_POS_CON_BERT_BASE_CHAR_JA': 'https://file.hankcs.com/hanlp/mtl/npcmj_ud_kyoto_tok_pos_ner_dep_con_srl_bert_base_char_ja_20210914_133742.zip'}"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import hanlp\n",
"hanlp.pretrained.mtl.ALL # MTL多任务,具体任务见模型名称,语种见名称最后一个字段或相应语料库"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "EmZDmLn9aGxG"
},
"source": [
"调用`hanlp.load`进行加载,模型会自动下载到本地缓存。自然语言处理分为许多任务,分词只是最初级的一个。与其每个任务单独创建一个模型,不如利用HanLP的联合模型一次性完成多个任务:"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"HanLP = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "elA_UyssOut_"
},
"source": [
"## 短语句法分析\n",
"任务越少,速度越快。如指定仅执行短语句法分析:"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 70
},
"id": "BqEmDMGGOtk3",
"outputId": "2a0d392f-b99a-4a18-fc7f-754e2abe2e34"
},
"outputs": [],
"source": [
"doc = HanLP(['2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', '阿婆主来到北京立方庭参观自然语义科技公司。'], tasks='con')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"返回值为一个[Document](https://hanlp.hankcs.com/docs/api/common/document.html):"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{\n",
" \"tok/fine\": [\n",
" [\"2021年\", \"HanLPv2.1\", \"为\", \"生产\", \"环境\", \"带来\", \"次\", \"世代\", \"最\", \"先进\", \"的\", \"多\", \"语种\", \"NLP\", \"技术\", \"。\"],\n",
" [\"阿婆主\", \"来到\", \"北京\", \"立方庭\", \"参观\", \"自然\", \"语义\", \"科技\", \"公司\", \"。\"]\n",
" ],\n",
" \"con\": [\n",
" [\"TOP\", [[\"IP\", [[\"NP\", [[\"_\", [\"2021年\"]]]], [\"NP\", [[\"_\", [\"HanLPv2.1\"]]]], [\"VP\", [[\"PP\", [[\"_\", [\"为\"]], [\"NP\", [[\"_\", [\"生产\"]], [\"_\", [\"环境\"]]]]]], [\"VP\", [[\"_\", [\"带来\"]], [\"NP\", [[\"ADJP\", [[\"NP\", [[\"ADJP\", [[\"_\", [\"次\"]]]], [\"NP\", [[\"_\", [\"世代\"]]]]]], [\"ADVP\", [[\"_\", [\"最\"]]]], [\"VP\", [[\"_\", [\"先进\"]]]]]], [\"_\", [\"的\"]], [\"NP\", [[\"QP\", [[\"_\", [\"多\"]]]], [\"NP\", [[\"_\", [\"语种\"]]]]]], [\"NP\", [[\"_\", [\"NLP\"]], [\"_\", [\"技术\"]]]]]]]]]], [\"_\", [\"。\"]]]]]],\n",
" [\"TOP\", [[\"IP\", [[\"NP\", [[\"_\", [\"阿婆主\"]]]], [\"VP\", [[\"VP\", [[\"_\", [\"来到\"]], [\"NP\", [[\"_\", [\"北京\"]], [\"_\", [\"立方庭\"]]]]]], [\"VP\", [[\"_\", [\"参观\"]], [\"NP\", [[\"_\", [\"自然\"]], [\"_\", [\"语义\"]], [\"_\", [\"科技\"]], [\"_\", [\"公司\"]]]]]]]], [\"_\", [\"。\"]]]]]]\n",
" ]\n",
"}\n"
]
}
],
"source": [
"print(doc)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"`doc['con']`为Tree类型,是list的子类。"
]
},
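{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a minimal sketch (using Tree methods also demonstrated in con_stl.ipynb), the tree can be inspected programmatically:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# label() gives the constituent label, height() the tree depth, leaves() the tokens.\n",
"tree = doc['con'][0]\n",
"print(tree.label(), tree.height(), tree.leaves())"
]
},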
{
"cell_type": "markdown",
"metadata": {
"id": "wxctCigrTKu-"
},
"source": [
"可视化短语句法树:"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Zo08uquCTFSk",
"outputId": "c6077f2d-7084-4f4b-a3bc-9aa9951704ea"
},
"outputs": [
{
"data": {
"text/html": [
"Token ───────── 2021年 HanLPv2.1 为 生产 环境 带来 次 世代 最 先进 的 多 语种 NLP 技术 。 P 3 4 5 6 7 8 9 ─────────────────────────────────────────────────────── _───────────────────────────────────────────►NP ───┐ _───────────────────────────────────────────►NP────┤ _──────────┐ │ _──┐ ├────────────────────────►PP ───┐ │ _──┴►NP ───┘ │ │ _──────────────────────────────────┐ │ │ _───►ADJP──┐ │ ├►VP────┤ _───►NP ───┴►NP ───┐ │ │ │ _───────────►ADVP──┼►ADJP──┐ ├►VP ───┘ ├►IP _───────────►VP ───┘ │ │ │ _──────────────────────────┤ │ │ _───►QP ───┐ ├►NP ───┘ │ _───►NP ───┴────────►NP────┤ │ _──┐ │ │ _──┴────────────────►NP ───┘ │ _──────────────────────────────────────────────────┘ Tok ─── 阿婆主 来到 北京 立方庭 参观 自然 语义 科技 公司 。 P 3 4 5 6 ─────────────────────────────── _───────────────────►NP ───┐ _──────────┐ │ _──┐ ├►VP ───┐ │ _──┴►NP ───┘ │ │ _──────────┐ ├►VP────┤ _──┐ │ │ ├►IP _ │ ├►VP ───┘ │ _ ├►NP ───┘ │ _──┘ │ _──────────────────────────┘ "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"doc.pretty_print()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"将第一个短语树转换为bracketed格式:"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(TOP\n",
" (IP\n",
" (NP (_ 2021年))\n",
" (NP (_ HanLPv2.1))\n",
" (VP\n",
" (PP (_ 为) (NP (_ 生产) (_ 环境)))\n",
" (VP\n",
" (_ 带来)\n",
" (NP\n",
" (ADJP\n",
" (NP (ADJP (_ 次)) (NP (_ 世代)))\n",
" (ADVP (_ 最))\n",
" (VP (_ 先进)))\n",
" (_ 的)\n",
" (NP (QP (_ 多)) (NP (_ 语种)))\n",
" (NP (_ NLP) (_ 技术)))))\n",
" (_ 。)))\n"
]
}
],
"source": [
"print(doc['con'][0])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"将第一个短语树转换为list格式:"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['TOP',\n",
" [['IP',\n",
" [['NP', [['_', ['2021年']]]],\n",
" ['NP', [['_', ['HanLPv2.1']]]],\n",
" ['VP',\n",
" [['PP', [['_', ['为']], ['NP', [['_', ['生产']], ['_', ['环境']]]]]],\n",
" ['VP',\n",
" [['_', ['带来']],\n",
" ['NP',\n",
" [['ADJP',\n",
" [['NP', [['ADJP', [['_', ['次']]]], ['NP', [['_', ['世代']]]]]],\n",
" ['ADVP', [['_', ['最']]]],\n",
" ['VP', [['_', ['先进']]]]]],\n",
" ['_', ['的']],\n",
" ['NP', [['QP', [['_', ['多']]]], ['NP', [['_', ['语种']]]]]],\n",
" ['NP', [['_', ['NLP']], ['_', ['技术']]]]]]]]]],\n",
" ['_', ['。']]]]]]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"doc['con'][0].to_list()"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "XOsWkOqQfzlr"
},
"source": [
"为已分词的句子执行短语句法分析:"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 70
},
"id": "bLZSTbv_f3OA",
"outputId": "111c0be9-bac6-4eee-d5bd-a972ffc34844"
},
"outputs": [
{
"data": {
"text/html": [
"Token ───── hanlp 为 生产 环境 带来 次世代 最 先进 的 多语种 nlp 技术 。 P 3 4 5 6 7 8 9 ─────────────────────────────────────────────────────── _───────────────────────────────────────────►NP ───┐ _──────────┐ │ _──┐ ├────────────────────────►PP ───┐ │ _──┴►NP ───┘ │ │ _──────────────────────────────────┐ │ │ _───►NP ───┐ │ ├►VP────┤ _───►ADVP──┼►VP ────►IP ───┐ │ │ ├►IP _───►VP ───┘ │ ├►VP ───┘ │ _──────────────────────────┤ │ │ _───────────────────►NP────┼►NP ───┘ │ _───────────────────►NP────┤ │ _───────────────────►NP ───┘ │ _──────────────────────────────────────────────────┘ Tok ─── 我 的 希望 是 希望 张晚霞 的 背影 被 晚霞 映红 。 P 3 4 5 6 7 8 9 10 11 ─────────────────────────────────────────────────────────────────────── _───►NP ───┐ _──────────┴►DNP ──┐ _───────────►NP ───┴────────────────────────────────────────►NP ───┐ _──────────────────────────────────────────────────────────┐ │ _──────────────────────────────────────────┐ │ │ _───►NP ───┐ │ ├►VP────┤ _──────────┴►DNP ──┐ ├►VP ────►IP ───┘ │ _───────────►NP ───┴────────►NP ───┐ │ ├►IP _──────────────────────────┐ ├►IP ───┘ │ _───►NP ───┐ ├►VP ───┘ │ _───►VP ───┴►IP ────►CP ───┘ │ _──────────────────────────────────────────────────────────────────┘ "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"HanLP([\n",
" [\"HanLP\", \"为\", \"生产\", \"环境\", \"带来\", \"次世代\", \"最\", \"先进\", \"的\", \"多语种\", \"NLP\", \"技术\", \"。\"],\n",
" [\"我\", \"的\", \"希望\", \"是\", \"希望\", \"张晚霞\", \"的\", \"背影\", \"被\", \"晚霞\", \"映红\", \"。\"]\n",
" ], tasks='con', skip_tasks='tok*').pretty_print()"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"collapsed_sections": [],
"name": "con_mtl.ipynb",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/con_restful.ipynb
================================================
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "WfGpInivS0fG"
},
"source": [
"点击下列图标在线运行HanLP \n",
"\n",
"\t
\n",
"\t
\n",
"
\n",
"\n",
"## 安装"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "IYwV-UkNNzFp"
},
"source": [
"无论是Windows、Linux还是macOS,HanLP的安装只需一句话搞定:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "1Uf_u7ddMhUt"
},
"outputs": [],
"source": [
"!pip install hanlp_restful -U"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "pp-1KqEOOJ4t"
},
"source": [
"## 创建客户端"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"id": "0tmKBu7sNAXX"
},
"outputs": [],
"source": [
"from hanlp_restful import HanLPClient\n",
"HanLP = HanLPClient('https://www.hanlp.com/api', auth=None, language='zh') # auth不填则匿名,zh中文,mul多语种"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "EmZDmLn9aGxG"
},
"source": [
"#### 申请秘钥\n",
"由于服务器算力有限,匿名用户每分钟限2次调用。如果你需要更多调用次数,[建议申请免费公益API秘钥auth](https://bbs.hanlp.com/t/hanlp2-1-restful-api/53)。"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "elA_UyssOut_"
},
"source": [
"## 短语句法分析\n",
"任务越少,速度越快。如指定仅执行短语句法分析:"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 70
},
"id": "BqEmDMGGOtk3",
"outputId": "2a0d392f-b99a-4a18-fc7f-754e2abe2e34"
},
"outputs": [],
"source": [
"doc = HanLP('2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', tasks='con')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"返回值为一个[Document](https://hanlp.hankcs.com/docs/api/common/document.html):"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{\n",
" \"tok/fine\": [\n",
" [\"2021年\", \"HanLPv2.1\", \"为\", \"生产\", \"环境\", \"带来\", \"次\", \"世代\", \"最\", \"先进\", \"的\", \"多\", \"语种\", \"NLP\", \"技术\", \"。\"]\n",
" ],\n",
" \"con\": [\n",
" [\"TOP\", [[\"IP\", [[\"NP\", [[\"_\", [\"2021年\"]]]], [\"NP\", [[\"_\", [\"HanLPv2.1\"]]]], [\"VP\", [[\"PP\", [[\"_\", [\"为\"]], [\"NP\", [[\"_\", [\"生产\"]], [\"_\", [\"环境\"]]]]]], [\"VP\", [[\"_\", [\"带来\"]], [\"NP\", [[\"IP\", [[\"VP\", [[\"NP\", [[\"QP\", [[\"CLP\", [[\"_\", [\"次\"]]]]]], [\"NP\", [[\"_\", [\"世代\"]]]]]], [\"ADVP\", [[\"_\", [\"最\"]]]], [\"VP\", [[\"_\", [\"先进\"]]]]]]]], [\"_\", [\"的\"]], [\"NP\", [[\"QP\", [[\"_\", [\"多\"]]]], [\"NP\", [[\"_\", [\"语种\"]]]]]], [\"NP\", [[\"_\", [\"NLP\"]], [\"_\", [\"技术\"]]]]]]]]]], [\"_\", [\"。\"]]]]]]\n",
" ]\n",
"}\n"
]
}
],
"source": [
"print(doc)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"`doc['con']`为Tree类型,是list的子类。"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "wxctCigrTKu-"
},
"source": [
"可视化短语句法树:"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Zo08uquCTFSk",
"outputId": "c6077f2d-7084-4f4b-a3bc-9aa9951704ea"
},
"outputs": [
{
"data": {
"text/html": [
"Token ───────── 2021年 HanLPv2.1 为 生产 环境 带来 次 世代 最 先进 的 多 语种 NLP 技术 。 P 3 4 5 6 7 8 9 10 11 ─────────────────────────────────────────────────────────────────────── _───────────────────────────────────────────────────────────►NP ───┐ _───────────────────────────────────────────────────────────►NP────┤ _──────────┐ │ _──┐ ├────────────────────────────────────────►PP ───┐ │ _──┴►NP ───┘ │ │ _──────────────────────────────────────────────────┐ │ │ _───►CLP ───►QP ───┐ │ ├►VP────┤ _───────────►NP ───┴►NP ───┐ │ │ │ _───────────────────►ADVP──┼►VP ────►IP ───┐ ├►VP ───┘ ├►IP _───────────────────►VP ───┘ │ │ │ _──────────────────────────────────────────┤ │ │ _───►QP ───┐ ├►NP ───┘ │ _───►NP ───┴────────────────────────►NP────┤ │ _──┐ │ │ _──┴────────────────────────────────►NP ───┘ │ _──────────────────────────────────────────────────────────────────┘ "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"doc.pretty_print()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"转换为bracketed格式:"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(TOP\n",
" (IP\n",
" (NP (_ 2021年))\n",
" (NP (_ HanLPv2.1))\n",
" (VP\n",
" (PP (_ 为) (NP (_ 生产) (_ 环境)))\n",
" (VP\n",
" (_ 带来)\n",
" (NP\n",
" (IP\n",
" (VP\n",
" (NP (QP (CLP (_ 次))) (NP (_ 世代)))\n",
" (ADVP (_ 最))\n",
" (VP (_ 先进))))\n",
" (_ 的)\n",
" (NP (QP (_ 多)) (NP (_ 语种)))\n",
" (NP (_ NLP) (_ 技术)))))\n",
" (_ 。)))\n"
]
}
],
"source": [
"print(doc['con'][0])"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "XOsWkOqQfzlr"
},
"source": [
"为已分词的句子执行短语句法分析:"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 70
},
"id": "bLZSTbv_f3OA",
"outputId": "111c0be9-bac6-4eee-d5bd-a972ffc34844"
},
"outputs": [
{
"data": {
"text/html": [
"Token ───── hanlp 为 生产 环境 带来 次世代 最 先进 的 多语种 nlp 技术 。 P 3 4 5 6 7 8 9 10 11 12 ─────────────────────────────────────────────────────────────────────────────── _───────────────────────────────────────────────────────────────────►NP ───┐ _──────────┐ │ _──┐ ├────────────────────────────────────────────────►PP ───┐ │ _──┴►NP ───┘ │ │ _──────────────────────────────────────────────────────────┐ │ │ _───────────►NP ───┐ │ ├►VP────┤ _───►ADVP──┐ ├►VP ────►IP ───┐ │ │ ├►IP _───►VP ───┴►VP ───┘ ├►CP ────►CP ───┐ ├►VP ───┘ │ _──────────────────────────────────┘ │ │ │ _──────────────────────────────────────────────────┼►NP ───┘ │ _───►NP ───┐ │ │ _───►NP ───┴────────────────────────────────►NP ───┘ │ _──────────────────────────────────────────────────────────────────────────┘ Tok ─── 我 的 希望 是 希望 张晚霞 的 背影 被 晚霞 映红 。 P 3 4 5 6 7 8 9 10 11 ─────────────────────────────────────────────────────────────────────── _───►NP ───┐ _──────────┴►DNP ──┐ _───────────►NP ───┴────────────────────────────────────────►NP ───┐ _──────────────────────────────────────────────────────────┐ │ _──────────────────────────────────────────┐ │ │ _───►NP ───┐ │ ├►VP────┤ _──────────┴►DNP ──┐ ├►VP ────►IP ───┘ │ _───────────►NP ───┴────────►NP ───┐ │ ├►IP _──────────────────────────┐ ├►IP ───┘ │ _───►NP ───┐ ├►VP ───┘ │ _───►VP ───┴►IP ────►CP ───┘ │ _──────────────────────────────────────────────────────────────────┘ "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"HanLP(tokens=[\n",
" [\"HanLP\", \"为\", \"生产\", \"环境\", \"带来\", \"次世代\", \"最\", \"先进\", \"的\", \"多语种\", \"NLP\", \"技术\", \"。\"],\n",
" [\"我\", \"的\", \"希望\", \"是\", \"希望\", \"张晚霞\", \"的\", \"背影\", \"被\", \"晚霞\", \"映红\", \"。\"]\n",
" ], tasks='con').pretty_print()"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"collapsed_sections": [],
"name": "con_restful.ipynb",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/con_stl.ipynb
================================================
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "WfGpInivS0fG"
},
"source": [
"点击下列图标在线运行HanLP \n",
"\n",
"\t
\n",
"\t
\n",
"
\n",
"\n",
"## 安装"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "IYwV-UkNNzFp"
},
"source": [
"无论是Windows、Linux还是macOS,HanLP的安装只需一句话搞定:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "1Uf_u7ddMhUt"
},
"outputs": [],
"source": [
"!pip install hanlp -U"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "pp-1KqEOOJ4t"
},
"source": [
"## 加载模型\n",
"HanLP的工作流程是先加载模型,模型的标示符存储在`hanlp.pretrained`这个包中,按照NLP任务归类。"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"id": "0tmKBu7sNAXX"
},
"outputs": [
{
"data": {
"text/plain": [
"{'CTB9_CON_ELECTRA_SMALL': 'https://file.hankcs.com/hanlp/constituency/ctb9_con_electra_small_20220215_230116.zip',\n",
" 'CTB9_CON_FULL_TAG_ELECTRA_SMALL': 'https://file.hankcs.com/hanlp/constituency/ctb9_full_tag_con_electra_small_20220118_103119.zip'}"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import hanlp\n",
"hanlp.pretrained.constituency.ALL # 语种见名称最后一个字段或相应语料库"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "EmZDmLn9aGxG"
},
"source": [
"调用`hanlp.load`进行加载,模型会自动下载到本地缓存。"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"con = hanlp.load('CTB9_CON_FULL_TAG_ELECTRA_SMALL')"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "elA_UyssOut_"
},
"source": [
"## 短语句法分析\n",
"输入为已分词的一个或多个句子:"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 70
},
"id": "BqEmDMGGOtk3",
"outputId": "2a0d392f-b99a-4a18-fc7f-754e2abe2e34"
},
"outputs": [],
"source": [
"trees = con([[\"2021年\", \"HanLPv2.1\", \"为\", \"生产\", \"环境\", \"带来\", \"次\", \"世代\", \"最\", \"先进\", \"的\", \"多\", \"语种\", \"NLP\", \"技术\", \"。\"], [\"阿婆主\", \"来到\", \"北京\", \"立方庭\", \"参观\", \"自然\", \"语义\", \"科技\", \"公司\", \"。\"]], tasks='con')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"返回值为一个`Tree`的数组:"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[['TOP', [['IP', [['NP-TMP', [['_', ['2021年']]]], ['NP-PN-SBJ', [['_', ['HanLPv2.1']]]], ['VP', [['PP-BNF', [['_', ['为']], ['NP', [['_', ['生产']], ['_', ['环境']]]]]], ['VP', [['_', ['带来']], ['NP-OBJ', [['CP', [['CP', [['IP', [['VP', [['NP', [['DP', [['_', ['次']]]], ['NP', [['_', ['世代']]]]]], ['ADVP', [['_', ['最']]]], ['VP', [['_', ['先进']]]]]]]], ['_', ['的']]]]]], ['NP', [['QP', [['_', ['多']]]], ['NP', [['_', ['语种']]]]]], ['NP', [['_', ['NLP']], ['_', ['技术']]]]]]]]]], ['_', ['。']]]]]], ['TOP', [['IP', [['NP-SBJ', [['_', ['阿婆主']]]], ['VP', [['VP', [['_', ['来到']], ['NP-OBJ', [['_', ['北京']], ['NP-PN', [['_', ['立方庭']]]]]]]], ['VP', [['_', ['参观']], ['NP-OBJ', [['_', ['自然']], ['_', ['语义']], ['_', ['科技']], ['_', ['公司']]]]]]]], ['_', ['。']]]]]]]\n"
]
}
],
"source": [
"print(trees)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"转换为bracketed格式:"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(TOP\n",
" (IP\n",
" (NP-TMP (_ 2021年))\n",
" (NP-PN-SBJ (_ HanLPv2.1))\n",
" (VP\n",
" (PP-BNF (_ 为) (NP (_ 生产) (_ 环境)))\n",
" (VP\n",
" (_ 带来)\n",
" (NP-OBJ\n",
" (CP\n",
" (CP\n",
" (IP\n",
" (VP\n",
" (NP (DP (_ 次)) (NP (_ 世代)))\n",
" (ADVP (_ 最))\n",
" (VP (_ 先进))))\n",
" (_ 的)))\n",
" (NP (QP (_ 多)) (NP (_ 语种)))\n",
" (NP (_ NLP) (_ 技术)))))\n",
" (_ 。)))\n"
]
}
],
"source": [
"print(trees[0])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 组装流水线"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"短语成分树的第一层non-terminal一般是词性标签,所以经常与词性标注一起使用。为此,先加载一个词性标注器:"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"pos = hanlp.load(hanlp.pretrained.pos.CTB9_POS_ELECTRA_SMALL)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"然后创建一个函数将词性标签和句法树组装起来:"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"from hanlp_common.document import Document\n",
"def merge_pos_into_con(doc:Document):\n",
" flat = isinstance(doc['pos'][0], str)\n",
" if flat:\n",
" doc = Document((k, [v]) for k, v in doc.items())\n",
" for tree, tags in zip(doc['con'], doc['pos']):\n",
" offset = 0\n",
" for subtree in tree.subtrees(lambda t: t.height() == 2):\n",
" tag = subtree.label()\n",
" if tag == '_':\n",
" subtree.set_label(tags[offset])\n",
" offset += 1\n",
" if flat:\n",
" doc = doc.squeeze()\n",
" return doc"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"之后就可以用一个流水线将三者组装起来了:"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"nlp = hanlp.pipeline() \\\n",
" .append(pos, input_key='tok', output_key='pos') \\\n",
" .append(con, input_key='tok', output_key='con') \\\n",
" .append(merge_pos_into_con, input_key='*')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"该流水线的结构如下:"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[tok->TransformerTagger->pos, tok->CRFConstituencyParser->con, None->merge_pos_into_con->None]\n"
]
}
],
"source": [
"print(nlp)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"传入一个已分词的句子试试:"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{\n",
" \"tok\": [\n",
" \"2021年\",\n",
" \"HanLPv2.1\",\n",
" \"带来\",\n",
" \"最\",\n",
" \"先进\",\n",
" \"的\",\n",
" \"多\",\n",
" \"语种\",\n",
" \"NLP\",\n",
" \"技术\",\n",
" \"。\"\n",
" ],\n",
" \"pos\": [\n",
" \"NT\",\n",
" \"NR\",\n",
" \"VV\",\n",
" \"AD\",\n",
" \"VA\",\n",
" \"DEC\",\n",
" \"CD\",\n",
" \"NN\",\n",
" \"NR\",\n",
" \"NN\",\n",
" \"PU\"\n",
" ],\n",
" \"con\": [\n",
" \"TOP\",\n",
" [[\"IP\", [[\"NP-TMP\", [[\"NT\", [\"2021年\"]]]], [\"NP-PN-SBJ\", [[\"NR\", [\"HanLPv2.1\"]]]], [\"VP\", [[\"VV\", [\"带来\"]], [\"NP-OBJ\", [[\"CP\", [[\"CP\", [[\"IP\", [[\"VP\", [[\"ADVP\", [[\"AD\", [\"最\"]]]], [\"VP\", [[\"VA\", [\"先进\"]]]]]]]], [\"DEC\", [\"的\"]]]]]], [\"NP\", [[\"QP\", [[\"CD\", [\"多\"]]]], [\"NP\", [[\"NN\", [\"语种\"]]]]]], [\"NP\", [[\"NR\", [\"NLP\"]], [\"NN\", [\"技术\"]]]]]]]], [\"PU\", [\"。\"]]]]]\n",
" ]\n",
"}\n"
]
}
],
"source": [
"doc = nlp(tok=[\"2021年\", \"HanLPv2.1\", \"带来\", \"最\", \"先进\", \"的\", \"多\", \"语种\", \"NLP\", \"技术\", \"。\"])\n",
"print(doc)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"流水线的输出也是一个Document,所以支持可视化:"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"Token ───────── 2021年 HanLPv2.1 带来 最 先进 的 多 语种 NLP 技术 。 PoS 3 4 5 6 7 8 9 10 ──────────────────────────────────────────────────────────────────────── NT ─────────────────────────────────────────────────────►NP-TMP ────┐ NR ─────────────────────────────────────────────────────►NP-PN-SBJ──┤ VV ────────────────────────────────────────────────────┐ │ AD ───►ADVP──┐ │ │ VA ───►VP ───┴►VP ────►IP ───┐ │ │ DEC──────────────────────────┴►CP ────►CP ───┐ ├►VP─────────┼►IP CD ───►QP ───┐ │ │ │ NN ───►NP ───┴────────────────────────►NP────┼►NP-OBJ──┘ │ NR ──┐ │ │ NN ──┴────────────────────────────────►NP ───┘ │ PU ─────────────────────────────────────────────────────────────────┘ "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"doc.pretty_print()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"如果要分析原始文本的话,分词是第一步,所以先加载一个分词器:"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"tok = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"然后将分词器插入到流水线的第一级:"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[None->TransformerTaggingTokenizer->tok,\n",
" tok->TransformerTagger->pos,\n",
" tok->CRFConstituencyParser->con,\n",
" None->merge_pos_into_con->None]"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"nlp.insert(0, tok, output_key='tok')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"然后就可以直接分析原始文本了:"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(TOP\n",
" (IP\n",
" (NT 2021)\n",
" (M 年)\n",
" (NP-PN-SBJ (NR HanLPv2.1))\n",
" (VP\n",
" (VV 带来)\n",
" (NP-OBJ\n",
" (CP (CP (IP (VP (ADVP (AD 最)) (VP (VA 先进)))) (DEC 的)))\n",
" (NP (QP (CD 多)) (NP (NN 语种)))\n",
" (NP (NR NLP) (NN 技术))))\n",
" (PU 。)))\n"
]
}
],
"source": [
"print(nlp('2021年HanLPv2.1带来最先进的多语种NLP技术。')['con'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"你明白吗?HanLP是为聪明人设计的,只要你足够聪明,你就可以优雅地实现各种功能。"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 操作短语树的技巧"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"短语结构树的类型为`phrasetree.tree.Tree`,提供了许多接口,此处列举其中一些常用的接口。"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(TOP\n",
" (IP\n",
" (NP-TMP (NT 2021年))\n",
" (NP-PN-SBJ (NR HanLPv2.1))\n",
" (VP\n",
" (VV 带来)\n",
" (NP-OBJ\n",
" (CP (CP (IP (VP (ADVP (AD 最)) (VP (VA 先进)))) (DEC 的)))\n",
" (NP (QP (CD 多)) (NP (NN 语种)))\n",
" (NP (NR NLP) (NN 技术))))\n",
" (PU 。)))\n"
]
}
],
"source": [
"tree = doc['con'] # tree数组的话则需要doc['con'][0]\n",
"print(tree)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 按高度枚举子树"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"子树:(VP (ADVP (AD 最)) (VP (VA 先进)))\t标签:VP\t短语:['最', '先进']\n",
"子树:(NP (QP (CD 多)) (NP (NN 语种)))\t标签:NP\t短语:['多', '语种']\n"
]
}
],
"source": [
"for subtree in tree.subtrees(lambda t: t.height() == 4):\n",
" print(f'子树:{subtree}\\t标签:{subtree.label()}\\t短语:{subtree.leaves()}')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 按标签枚举子树"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(NP (QP (CD 多)) (NP (NN 语种)))\n",
"(NP (NN 语种))\n",
"(NP (NR NLP) (NN 技术))\n"
]
}
],
"source": [
"for subtree in tree.subtrees(lambda t: t.label() == 'NP'):\n",
" print(subtree)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 遍历子节点"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"父节点(NP (NR NLP) (NN 技术))的子节点有:\n",
"(NR NLP)\n",
"(NN 技术)\n"
]
}
],
"source": [
"print(f'父节点{subtree}的子节点有:')\n",
"for child in subtree:\n",
" print(child)"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"collapsed_sections": [],
"name": "con_stl.ipynb",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/cor_restful.ipynb
================================================
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "WfGpInivS0fG"
},
"source": [
"点击下列图标在线运行HanLP \n",
"\n",
"\t
\n",
"\t
\n",
"
\n",
"\n",
"## 安装"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "IYwV-UkNNzFp"
},
"source": [
"无论是Windows、Linux还是macOS,HanLP的安装只需一句话搞定:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "1Uf_u7ddMhUt"
},
"outputs": [],
"source": [
"!pip install hanlp_restful -U"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "pp-1KqEOOJ4t"
},
"source": [
"## 创建客户端"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"id": "0tmKBu7sNAXX"
},
"outputs": [],
"source": [
"from hanlp_restful import HanLPClient\n",
"HanLP = HanLPClient('https://www.hanlp.com/api', auth=None, language='zh') # auth不填则匿名,zh中文,mul多语种"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "EmZDmLn9aGxG"
},
"source": [
"#### 申请秘钥\n",
"由于服务器算力有限,匿名用户每分钟限2次调用。如果你需要更多调用次数,[建议申请免费公益API秘钥auth](https://bbs.hanlp.com/t/hanlp2-1-restful-api/53)。"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "elA_UyssOut_"
},
"source": [
"## 指代消解\n",
"任务越少,速度越快。如指定仅执行指代消解:"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 70
},
"id": "BqEmDMGGOtk3",
"outputId": "2a0d392f-b99a-4a18-fc7f-754e2abe2e34"
},
"outputs": [],
"source": [
"ret = HanLP.coreference_resolution('我姐送我她的猫。我很喜欢它。')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"返回值为一个包含分词结果与簇的dict:"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ret == {'clusters': [\n",
" [['我', 0, 1], ['我', 3, 4], ['我', 8, 9]], # 指代说话人\n",
" [['我姐', 0, 2], ['她', 4, 5]], # 指代说话人的姐姐\n",
" [['她的猫', 4, 7], ['它', 11, 12]]], # 指代说话人的姐姐的猫\n",
" 'tokens': ['我', '姐', '送', '我', '她', '的', '猫', '。', '我', '很', '喜欢', '它', '。']}"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "wxctCigrTKu-"
},
"source": [
"对应如下结构:\n",
""
]
},
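{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a minimal sketch (reusing the `ret` dict from above), each mention `[text, start, end]` can be mapped back onto the token list:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Recover every mention from its token offsets; the joined span equals the mention text.\n",
"tokens = ret['tokens']\n",
"for cluster in ret['clusters']:\n",
"    print([''.join(tokens[start:end]) for text, start, end in cluster])"
]
},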
{
"cell_type": "markdown",
"metadata": {
"id": "XOsWkOqQfzlr"
},
"source": [
"为已分词的句子执行指代消解:"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 70
},
"id": "bLZSTbv_f3OA",
"outputId": "111c0be9-bac6-4eee-d5bd-a972ffc34844"
},
"outputs": [],
"source": [
"clusters = HanLP.coreference_resolution(tokens=[['我', '姐', '送', '我', '她', '的', '猫', '。'],\n",
" ['我', '很', '喜欢', '它', '。']])\n",
" "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"返回值为簇的list:"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"clusters == [\n",
" [['我', 0, 1], ['我', 3, 4], ['我', 8, 9]], # 指代说话人\n",
" [['我姐', 0, 2], ['她', 4, 5]], # 指代说话人的姐姐\n",
" [['她的猫', 4, 7], ['它', 11, 12]]] # 指代说话人的姐姐的猫"
]
}
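,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"注意:传入`tokens`时,返回的下标基于所有句子拼接后的词表,例如第二句的`我`下标为8。以下代码据此还原提及文本(仅为示意):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sents = [['我', '姐', '送', '我', '她', '的', '猫', '。'], ['我', '很', '喜欢', '它', '。']]\n",
"flat = [word for sent in sents for word in sent]  # 将两句拼接成一个词表\n",
"for cluster in clusters:\n",
"    print([''.join(flat[b:e]) for _, b, e in cluster])"
]
}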
],
"metadata": {
"accelerator": "GPU",
"colab": {
"collapsed_sections": [],
"name": "cor_restful.ipynb",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/demo_amr.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2022-04-12 22:19
import hanlp
parser = hanlp.load(hanlp.pretrained.amr.MRP2020_AMR_ENG_ZHO_XLM_BASE)
# For Chinese:
print(parser(["男孩", "希望", "女孩", "相信", "他", "。"]))
print(parser(["男孩", "希望", "女孩", "相信", "他", "。"], output_amr=False))
# For English:
print(parser(['The', 'boy', 'wants', 'the', 'girl', 'to', 'believe', 'him', '.'], language='eng'))
# It's suggested to also feed the lemma for stabler performance.
print(parser([('The', 'the'), ('boy', 'boy'), ('wants', 'want'), ('the', 'the'), ('girl', 'girl'), ('to', 'to'),
('believe', 'believe'), ('him', 'he'), ('.', '.')], language='eng'))
================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/demo_custom_dict.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-15 22:26
import hanlp
from hanlp.components.mtl.multi_task_learning import MultiTaskLearning
from hanlp.components.mtl.tasks.tok.tag_tok import TaggingTokenization
# 加载多任务模型
HanLP: MultiTaskLearning = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH)
# 获取分词任务(以tok开头的任务都是分词任务,以细分标准为例)
tok: TaggingTokenization = HanLP['tok/fine']
tok.dict_force = tok.dict_combine = None
print(f'不挂词典:\n{HanLP("商品和服务项目")["tok/fine"]}')
tok.dict_force = {'和服', '服务项目'}
print(f'强制模式:\n{HanLP("商品和服务项目")["tok/fine"]}') # 慎用,详见《自然语言处理入门》第二章
tok.dict_force = {'和服务': ['和', '服务']}
print(f'强制校正:\n{HanLP("正向匹配商品和服务、任何和服务必按上述切分")["tok/fine"]}')
tok.dict_force = None
tok.dict_combine = {'和服', '服务项目'}
print(f'合并模式:\n{HanLP("商品和服务项目")["tok/fine"]}')
# 需要算法基础才能理解,初学者可参考 http://nlp.hankcs.com/book.php
# See also https://hanlp.hankcs.com/docs/api/hanlp/components/tokenizers/transformer.html
# 含有空格、制表符等(Transformer tokenizer去掉的字符)的词语需要用tuple的形式提供
tok.dict_combine = {('iPad', 'Pro'), '2个空格'}
print(f'空格匹配:\n{HanLP("如何评价iPad Pro ?iPad Pro有2个空格", tasks="tok/fine")["tok/fine"]}')
# 聪明的用户请继续阅读:tuple词典中的字符串其实等价于该字符串的所有可能的切分方式
print(f'词典内容:\n{dict(tok.dict_combine.config["dictionary"]).keys()}')
================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/demo_custom_dict_stl.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-15 22:26
import hanlp
from hanlp.components.tokenizers.transformer import TransformerTaggingTokenizer
# 加载一个旧版本单任务模型演示分词错误(最新版已经修复):
tok: TransformerTaggingTokenizer = hanlp.load('https://file.hankcs.com/hanlp/tok/coarse_electra_small_20220220_013548.zip')
tok.dict_force = tok.dict_combine = None
print(f'不挂词典:\n{tok("首相和川普通电话")}')
tok.dict_force = {'川普'}
print(f'强制模式:\n{tok(["首相和川普通电话", "银川普通人与川普通电话讲四川普通话"])}') # 慎用,详见《自然语言处理入门》第二章
tok.dict_force = {'川普通电话': ['川普', '通', '电话']}
print(f'强制校正:\n{tok(["首相和川普通电话", "银川普通人与川普通电话讲四川普通话"])}')
tok.dict_force = None
tok.dict_combine = {'美国总统'}
print(f'合并模式:\n{tok("首相和川普通电话,川普是美国总统。")}')
# 需要算法基础才能理解,初学者可参考 http://nlp.hankcs.com/book.php
# See also https://hanlp.hankcs.com/docs/api/hanlp/components/tokenizers/transformer.html
================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/demo_del_tasks.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-02-03 13:28
import hanlp
from hanlp.components.mtl.multi_task_learning import MultiTaskLearning
from hanlp_common.document import Document
HanLP: MultiTaskLearning = hanlp.load(hanlp.pretrained.mtl.OPEN_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH)
tasks = list(HanLP.tasks.keys())
print(tasks) # Pick what you need from what we have
for task in tasks:
if task not in ('tok', 'pos'):
del HanLP[task]
# You can save it as a new component
# HanLP.save('path/to/new/component')
# HanLP.load('path/to/new/component')
print(HanLP.tasks.keys())
doc: Document = HanLP(['2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', 'up主来到北京立方庭参观自然语义科技公司。'])
print(doc)
doc.pretty_print()
================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/demo_document.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2022-10-26 23:40
from hanlp_common.document import Document
# Create a document or get a document from HanLP.parse
doc = Document(
tok=[["晓美焰", "来到", "北京", "立方庭", "参观", "自然", "语义", "科技", "公司"]],
pos=[["NR", "VV", "NR", "NR", "VV", "NN", "NN", "NN", "NN"]],
ner=[[["晓美焰", "PERSON", 0, 1], ["北京立方庭", "LOCATION", 2, 4],
["自然语义科技公司", "ORGANIZATION", 5, 9]]],
dep=[[[2, "nsubj"], [0, "root"], [4, "name"], [2, "dobj"], [2, "conj"],
[9, "compound"], [9, "compound"], [9, "compound"], [5, "dobj"]]]
)
# print(doc) or str(doc) to get its JSON representation
print(doc)
# Access an annotation by its task name
print(doc['tok'])
# Get number of sentences
print(f'It has {doc.count_sentences()} sentence(s)')
# Access the n-th sentence
print(doc.squeeze(0)['tok'])
# Pretty print it right in your console or notebook
doc.pretty_print()
# To save the pretty prints in a str
pretty_text: str = '\n\n'.join(doc.to_pretty())
# Create a document from a dict
doc = Document({
"tok/fine": [
["晓美焰", "来到", "北京", "立方庭", "参观", "自然", "语义", "科技", "公司", "。"]
],
"tok/coarse": [
["晓美焰", "来到", "北京立方庭", "参观", "自然语义科技公司", "。"]
],
"pos/ctb": [
["NR", "VV", "NR", "NR", "VV", "NN", "NN", "NN", "NN", "PU"]
],
"pos/pku": [
["nr", "v", "ns", "nz", "v", "n", "n", "n", "n", "w"]
],
"ner/msra": [
[["晓美焰", "PERSON", 0, 1], ["北京立方庭", "LOCATION", 2, 4], ["自然语义科技公司", "ORGANIZATION", 5, 9]]
],
"ner/ontonotes": [
[["晓美焰", "PERSON", 0, 1], ["北京", "GPE", 2, 3], ["立方庭", "FAC", 3, 4], ["自然语义科技公司", "ORG", 5, 9]]
],
"srl": [
[[["晓美焰", "ARG0", 0, 1], ["来到", "PRED", 1, 2], ["北京立方庭", "ARG1", 2, 4]],
[["晓美焰", "ARG0", 0, 1], ["参观", "PRED", 4, 5], ["自然语义科技公司", "ARG1", 5, 9]]]
],
"dep": [
[[2, "nsubj"], [0, "root"], [4, "name"], [2, "dobj"], [2, "conj"], [9, "compound"], [9, "compound"],
[9, "compound"], [5, "dobj"], [2, "punct"]]
]
})
# Pretty print using a different NER annotation
doc.pretty_print(ner='ner/ontonotes')
# Get the first annotation for NER
print(doc.get_by_prefix('ner'))
================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/demo_mlm.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2022-01-29 21:11
from hanlp.components.lm.mlm import MaskedLanguageModel
mlm = MaskedLanguageModel()
mlm.load('bert-base-chinese')
print(mlm('生活的真谛是[MASK]。'))
# Batching is always faster
print(mlm(['生活的真谛是[MASK]。', '巴黎是[MASK][MASK]的首都。']))
================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/demo_mtl.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-31 13:51
import hanlp
from hanlp_common.document import Document
# CLOSE是自然语义标注的闭源语料库,BASE是中号模型,ZH中文
HanLP = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH)
# 默认执行全部任务
doc: Document = HanLP(['2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', '阿婆主来到北京立方庭参观自然语义科技公司。'])
# 返回类型Document是dict的子类,打印出来兼容JSON
print(doc)
# 即时可视化,防止换行请最大化窗口,推荐在Jupyter Notebook里调用
doc.pretty_print()
# 指定可视化OntoNotes标准的NER
# doc.pretty_print(ner='ner/ontonotes', pos='pku')
================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/demo_ner_dict.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-04-29 11:06
import hanlp
from hanlp.components.mtl.tasks.ner.tag_ner import TaggingNamedEntityRecognition
from hanlp.utils.io_util import get_resource
HanLP = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ERNIE_GRAM_ZH)
ner: TaggingNamedEntityRecognition = HanLP['ner/msra']
ner.dict_whitelist = {'午饭后': 'TIME'}
doc = HanLP('2021年测试高血压是138,时间是午饭后2点45,低血压是44', tasks='ner/msra')
doc.pretty_print()
print(doc['ner/msra'])
ner.dict_tags = {('名字', '叫', '金华'): ('O', 'O', 'S-PERSON')}
HanLP('他在浙江金华出生,他的名字叫金华。', tasks='ner/msra').pretty_print()
# HanLP.save(get_resource(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ERNIE_GRAM_ZH))
# 需要算法基础才能理解,初学者可参考 http://nlp.hankcs.com/book.php
# See https://hanlp.hankcs.com/docs/api/hanlp/components/mtl/tasks/ner/tag_ner.html
================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/demo_parse_constituency.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2022-01-18 11:09
from hanlp_common.document import Document
import hanlp
con = hanlp.load(hanlp.pretrained.constituency.CTB9_CON_FULL_TAG_ELECTRA_SMALL)
# To speed up, parse multiple sentences at once, and use a GPU.
print(con(["2021年", "HanLPv2.1", "带来", "最", "先进", "的", "多", "语种", "NLP", "技术", "。"]))
# The rest of this tutorial is written for clever users.
# The first level of non-terminals are PoS tags. So usually a PoS model is piped.
def merge_pos_into_con(doc: Document):
flat = isinstance(doc['pos'][0], str)
if flat:
doc = Document((k, [v]) for k, v in doc.items())
for tree, tags in zip(doc['con'], doc['pos']):
offset = 0
for subtree in tree.subtrees(lambda t: t.height() == 2):
tag = subtree.label()
if tag == '_':
subtree.set_label(tags[offset])
offset += 1
if flat:
doc = doc.squeeze()
return doc
pos = hanlp.load(hanlp.pretrained.pos.CTB9_POS_ELECTRA_SMALL)
nlp = hanlp.pipeline() \
.append(pos, input_key='tok', output_key='pos') \
.append(con, input_key='tok', output_key='con') \
.append(merge_pos_into_con, input_key='*')
print(f'The pipeline looks like this: {nlp}')
doc = nlp(tok=["2021年", "HanLPv2.1", "带来", "最", "先进", "的", "多", "语种", "NLP", "技术", "。"])
print(doc)
doc.pretty_print()
# If you need to parse raw text, simply add a tokenizer into this pipeline.
tok = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH)
nlp.insert(0, tok, output_key='tok')
print(f'The pipeline looks like this: {nlp}')
doc = nlp('2021年HanLPv2.1带来最先进的多语种NLP技术。')
print(doc)
doc.pretty_print()
# ATTENTION: Pipelines are usually slower than MTL but they are more flexible.
================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/demo_pipeline.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-12-28 20:47
import hanlp
# Pipeline allows blending multiple callable functions regardless of whether each one is a rule, a TensorFlow
# component or a PyTorch one. However, it's slower than the MTL framework.
# pos = hanlp.load(hanlp.pretrained.pos.CTB9_POS_ALBERT_BASE) # In case both tf and torch are used, load tf first.
HanLP = hanlp.pipeline() \
.append(hanlp.utils.rules.split_sentence, output_key='sentences') \
.append(hanlp.load('CTB9_TOK_ELECTRA_SMALL'), output_key='tok') \
.append(hanlp.load('CTB9_POS_ELECTRA_SMALL'), output_key='pos') \
.append(hanlp.load('MSRA_NER_ELECTRA_SMALL_ZH'), output_key='ner', input_key='tok') \
.append(hanlp.load('CTB9_DEP_ELECTRA_SMALL', conll=False), output_key='dep', input_key='tok') \
.append(hanlp.load('CTB9_CON_ELECTRA_SMALL'), output_key='con', input_key='tok')
doc = HanLP('2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。阿婆主来到北京立方庭参观自然语义科技公司。')
print(doc)
doc.pretty_print()
================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/demo_pos_dict.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-15 22:26
import hanlp
from hanlp.components.mtl.multi_task_learning import MultiTaskLearning
from hanlp.components.mtl.tasks.pos import TransformerTagging
from hanlp.components.mtl.tasks.tok.tag_tok import TaggingTokenization
from tests import cdroot
cdroot()
HanLP: MultiTaskLearning = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH)
# Demonstrates custom dict in part-of-speech tagging
pos: TransformerTagging = HanLP['pos/ctb']
print('自定义单个词性:')
pos.dict_tags = {'HanLP': 'state-of-the-art-tool'}
HanLP("HanLP为生产环境带来次世代最先进的多语种NLP技术。", tasks='pos/ctb').pretty_print()
print('根据上下文自定义词性:')
pos.dict_tags = {('的', '希望'): ('补语成分', '名词'), '希望': '动词'}
HanLP("我的希望是希望张晚霞的背影被晚霞映红。", tasks='pos/ctb').pretty_print()
# 需要算法基础才能理解,初学者可参考 http://nlp.hankcs.com/book.php
# See also https://hanlp.hankcs.com/docs/api/hanlp/components/taggers/transformer_tagger.html
================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/demo_sts.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-05-24 13:15
import hanlp
sim = hanlp.load(hanlp.pretrained.sts.STS_ELECTRA_BASE_ZH)
print(sim([
['看图猜一电影名', '看图猜电影'],
['无线路由器怎么无线上网', '无线上网卡和无线路由器怎么用'],
['北京到上海的动车票', '上海到北京的动车票'],
]))
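# 返回值为每对句子的相似度得分。实际应用中常用一个阈值粗略判断两句话是否语义重复。
# 以下仅为示意,阈值0.5是演示用的假设值,需根据业务数据调节:
pairs = [['看图猜一电影名', '看图猜电影'], ['北京到上海的动车票', '上海到北京的动车票']]
for pair, score in zip(pairs, sim(pairs)):
    print(pair, f'相似度={score:.2f}', '疑似重复' if score > 0.5 else '非重复')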
================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/demo_word2vec.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-12-12 18:33
import hanlp
import torch
word2vec = hanlp.load(hanlp.pretrained.word2vec.CONVSEG_W2V_NEWS_TENSITE_WORD_PKU)
vec = word2vec('先进')
print(vec)
print(torch.nn.functional.cosine_similarity(word2vec('先进'), word2vec('优秀'), dim=0))
print(torch.nn.functional.cosine_similarity(word2vec('先进'), word2vec('水果'), dim=0))
print('获取语义最相似的词语:')
print(word2vec.most_similar('上海'))
# print(word2vec.most_similar(['上海', '寒冷'])) # batching更快
print('非常寒冷是OOV所以无法获取:')
print(word2vec.most_similar('非常寒冷'))
print('但是在doc2vec模式下OOV也可以进行相似度计算:')
print(word2vec.most_similar('非常寒冷', doc2vec=True))
print('甚至可以处理短文本:')
print(word2vec.most_similar('国家图书馆推出2022年春节主题活动', doc2vec=True))
================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/dep_mtl.ipynb
================================================
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "WfGpInivS0fG"
},
"source": [
"点击下列图标在线运行HanLP \n",
"\n",
"\t
\n",
"\t
\n",
"
\n",
"\n",
"## 安装"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "IYwV-UkNNzFp"
},
"source": [
"无论是Windows、Linux还是macOS,HanLP的安装只需一句话搞定:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "1Uf_u7ddMhUt"
},
"outputs": [],
"source": [
"!pip install hanlp -U"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "pp-1KqEOOJ4t"
},
"source": [
"## 加载模型\n",
"HanLP的工作流程是先加载模型,模型的标示符存储在`hanlp.pretrained`这个包中,按照NLP任务归类。"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"id": "0tmKBu7sNAXX"
},
"outputs": [
{
"data": {
"text/plain": [
"{'OPEN_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH': 'https://file.hankcs.com/hanlp/mtl/open_tok_pos_ner_srl_dep_sdp_con_electra_small_20201223_035557.zip',\n",
" 'OPEN_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH': 'https://file.hankcs.com/hanlp/mtl/open_tok_pos_ner_srl_dep_sdp_con_electra_base_20201223_201906.zip',\n",
" 'CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH': 'https://file.hankcs.com/hanlp/mtl/close_tok_pos_ner_srl_dep_sdp_con_electra_small_20210111_124159.zip',\n",
" 'CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH': 'https://file.hankcs.com/hanlp/mtl/close_tok_pos_ner_srl_dep_sdp_con_electra_base_20210111_124519.zip',\n",
" 'CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ERNIE_GRAM_ZH': 'https://file.hankcs.com/hanlp/mtl/close_tok_pos_ner_srl_dep_sdp_con_ernie_gram_base_aug_20210904_145403.zip',\n",
" 'UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_MT5_SMALL': 'https://file.hankcs.com/hanlp/mtl/ud_ontonotes_tok_pos_lem_fea_ner_srl_dep_sdp_con_mt5_small_20210228_123458.zip',\n",
" 'UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_XLMR_BASE': 'https://file.hankcs.com/hanlp/mtl/ud_ontonotes_tok_pos_lem_fea_ner_srl_dep_sdp_con_xlm_base_20210602_211620.zip',\n",
" 'NPCMJ_UD_KYOTO_TOK_POS_CON_BERT_BASE_CHAR_JA': 'https://file.hankcs.com/hanlp/mtl/npcmj_ud_kyoto_tok_pos_ner_dep_con_srl_bert_base_char_ja_20210914_133742.zip'}"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import hanlp\n",
"hanlp.pretrained.mtl.ALL # MTL多任务,具体任务见模型名称,语种见名称最后一个字段或相应语料库"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "EmZDmLn9aGxG"
},
"source": [
"调用`hanlp.load`进行加载,模型会自动下载到本地缓存。自然语言处理分为许多任务,分词只是最初级的一个。与其每个任务单独创建一个模型,不如利用HanLP的联合模型一次性完成多个任务:"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"HanLP = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "elA_UyssOut_"
},
"source": [
"## 依存句法分析\n",
"任务越少,速度越快。如指定仅执行依存句法分析:"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 70
},
"id": "BqEmDMGGOtk3",
"outputId": "2a0d392f-b99a-4a18-fc7f-754e2abe2e34"
},
"outputs": [],
"source": [
"doc = HanLP(['2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', '阿婆主来到北京立方庭参观自然语义科技公司。'], tasks='dep')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"返回值为一个[Document](https://hanlp.hankcs.com/docs/api/common/document.html):"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{\n",
" \"tok/fine\": [\n",
" [\"2021年\", \"HanLPv2.1\", \"为\", \"生产\", \"环境\", \"带来\", \"次\", \"世代\", \"最\", \"先进\", \"的\", \"多\", \"语种\", \"NLP\", \"技术\", \"。\"],\n",
" [\"阿婆主\", \"来到\", \"北京\", \"立方庭\", \"参观\", \"自然\", \"语义\", \"科技\", \"公司\", \"。\"]\n",
" ],\n",
" \"dep\": [\n",
" [[6, \"tmod\"], [6, \"nsubj\"], [6, \"prep\"], [5, \"nn\"], [3, \"pobj\"], [0, \"root\"], [8, \"amod\"], [15, \"nn\"], [10, \"advmod\"], [15, \"rcmod\"], [10, \"assm\"], [13, \"nummod\"], [15, \"nn\"], [15, \"nn\"], [6, \"dobj\"], [6, \"punct\"]],\n",
" [[2, \"nsubj\"], [0, \"root\"], [4, \"nn\"], [2, \"dobj\"], [2, \"conj\"], [9, \"nn\"], [9, \"nn\"], [9, \"nn\"], [5, \"dobj\"], [2, \"punct\"]]\n",
" ]\n",
"}\n"
]
}
],
"source": [
"print(doc)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"`doc['dep']`为句子们的依存句法树列表,第`i`个二元组表示第`i`个单词的`[中心词的下标, 与中心词的依存关系]`。"
]
},
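{
"cell_type": "markdown",
"metadata": {},
"source": [
"例如,可以这样遍历每句的依存弧,打印每个单词到其中心词的关系(以下代码基于上文的`doc`,仅为示意):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for words, arcs in zip(doc['tok/fine'], doc['dep']):\n",
"    for word, (head, rel) in zip(words, arcs):\n",
"        print(f'{word} --{rel}--> {words[head - 1] if head else \"ROOT\"}')"
]
},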
{
"cell_type": "markdown",
"metadata": {
"id": "wxctCigrTKu-"
},
"source": [
"可视化依存句法树:"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Zo08uquCTFSk",
"outputId": "c6077f2d-7084-4f4b-a3bc-9aa9951704ea"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dep Tree \tToken \tRelati\n",
"────────────\t─────────\t──────\n",
" ┌─────────►\t2021年 \ttmod \n",
" │┌────────►\tHanLPv2.1\tnsubj \n",
" ││┌─►┌─────\t为 \tprep \n",
" │││ │ ┌─►\t生产 \tnn \n",
" │││ └─►└──\t环境 \tpobj \n",
"┌┼┴┴────────\t带来 \troot \n",
"││ ┌─►\t次 \tamod \n",
"││ ┌───►└──\t世代 \tnn \n",
"││ │ ┌─►\t最 \tadvmod\n",
"││ │┌──►├──\t先进 \trcmod \n",
"││ ││ └─►\t的 \tassm \n",
"││ ││ ┌─►\t多 \tnummod\n",
"││ ││┌─►└──\t语种 \tnn \n",
"││ │││ ┌─►\tNLP \tnn \n",
"│└─►└┴┴──┴──\t技术 \tdobj \n",
"└──────────►\t。 \tpunct \n",
"\n",
"Dep Tree \tTok\tRelat\n",
"────────────\t───\t─────\n",
" ┌─►\t阿婆主\tnsubj\n",
"┌┬────┬──┴──\t来到 \troot \n",
"││ │ ┌─►\t北京 \tnn \n",
"││ └─►└──\t立方庭\tdobj \n",
"│└─►┌───────\t参观 \tconj \n",
"│ │ ┌───►\t自然 \tnn \n",
"│ │ │┌──►\t语义 \tnn \n",
"│ │ ││┌─►\t科技 \tnn \n",
"│ └─►└┴┴──\t公司 \tdobj \n",
"└──────────►\t。 \tpunct\n"
]
}
],
"source": [
"doc.pretty_print()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"转换为CoNLL格式:"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1\t2021年\t_\t_\t_\t_\t6\ttmod\t_\t_\n",
"2\tHanLPv2.1\t_\t_\t_\t_\t6\tnsubj\t_\t_\n",
"3\t为\t_\t_\t_\t_\t6\tprep\t_\t_\n",
"4\t生产\t_\t_\t_\t_\t5\tnn\t_\t_\n",
"5\t环境\t_\t_\t_\t_\t3\tpobj\t_\t_\n",
"6\t带来\t_\t_\t_\t_\t0\troot\t_\t_\n",
"7\t次\t_\t_\t_\t_\t8\tamod\t_\t_\n",
"8\t世代\t_\t_\t_\t_\t15\tnn\t_\t_\n",
"9\t最\t_\t_\t_\t_\t10\tadvmod\t_\t_\n",
"10\t先进\t_\t_\t_\t_\t15\trcmod\t_\t_\n",
"11\t的\t_\t_\t_\t_\t10\tassm\t_\t_\n",
"12\t多\t_\t_\t_\t_\t13\tnummod\t_\t_\n",
"13\t语种\t_\t_\t_\t_\t15\tnn\t_\t_\n",
"14\tNLP\t_\t_\t_\t_\t15\tnn\t_\t_\n",
"15\t技术\t_\t_\t_\t_\t6\tdobj\t_\t_\n",
"16\t。\t_\t_\t_\t_\t6\tpunct\t_\t_\n",
"\n",
"1\t阿婆主\t_\t_\t_\t_\t2\tnsubj\t_\t_\n",
"2\t来到\t_\t_\t_\t_\t0\troot\t_\t_\n",
"3\t北京\t_\t_\t_\t_\t4\tnn\t_\t_\n",
"4\t立方庭\t_\t_\t_\t_\t2\tdobj\t_\t_\n",
"5\t参观\t_\t_\t_\t_\t2\tconj\t_\t_\n",
"6\t自然\t_\t_\t_\t_\t9\tnn\t_\t_\n",
"7\t语义\t_\t_\t_\t_\t9\tnn\t_\t_\n",
"8\t科技\t_\t_\t_\t_\t9\tnn\t_\t_\n",
"9\t公司\t_\t_\t_\t_\t5\tdobj\t_\t_\n",
"10\t。\t_\t_\t_\t_\t2\tpunct\t_\t_\n"
]
}
],
"source": [
"print(doc.to_conll())"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "XOsWkOqQfzlr"
},
"source": [
"为已分词的句子执行依存句法分析:"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 70
},
"id": "bLZSTbv_f3OA",
"outputId": "111c0be9-bac6-4eee-d5bd-a972ffc34844"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dep Tree \tToken\tRelati\n",
"───────────\t─────\t──────\n",
" ┌────────►\tHanLP\tnsubj \n",
" │┌─►┌─────\t为 \tprep \n",
" ││ │ ┌─►\t生产 \tnn \n",
" ││ └─►└──\t环境 \tpobj \n",
"┌┼┴────────\t带来 \troot \n",
"││ ┌─────►\t次世代 \tnn \n",
"││ │ ┌─►\t最 \tadvmod\n",
"││ │┌─►├──\t先进 \trcmod \n",
"││ ││ └─►\t的 \tassm \n",
"││ ││ ┌──►\t多语种 \tnn \n",
"││ ││ │┌─►\tNLP \tnn \n",
"│└─►└┴─┴┴──\t技术 \tdobj \n",
"└─────────►\t。 \tpunct \n",
"\n",
"Dep Tree \tTok\tRelation \n",
"────────────────\t───\t─────────\n",
" ┌─►┌──\t我 \tassmod \n",
" │ └─►\t的 \tassm \n",
" ┌─►└─────\t希望 \ttop \n",
"┌┬─────┴────────\t是 \troot \n",
"│└─►┌───────────\t希望 \tccomp \n",
"│ │ ┌─►┌──\t张晚霞\tassmod \n",
"│ │ │ └─►\t的 \tassm \n",
"│ │ ┌─►└─────\t背影 \tnsubjpass\n",
"│ └─►└──┬─────\t被 \tccomp \n",
"│ │ ┌─►\t晚霞 \tnsubj \n",
"│ └─►└──\t映红 \tdep \n",
"└──────────────►\t。 \tpunct \n"
]
}
],
"source": [
"HanLP([\n",
" [\"HanLP\", \"为\", \"生产\", \"环境\", \"带来\", \"次世代\", \"最\", \"先进\", \"的\", \"多语种\", \"NLP\", \"技术\", \"。\"],\n",
" [\"我\", \"的\", \"希望\", \"是\", \"希望\", \"张晚霞\", \"的\", \"背影\", \"被\", \"晚霞\", \"映红\", \"。\"]\n",
" ], tasks='dep', skip_tasks='tok*').pretty_print()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 注意\n",
"Native API的输入单位限定为句子,需使用[多语种分句模型](https://github.com/hankcs/HanLP/blob/master/plugins/hanlp_demo/hanlp_demo/sent_split.py)或[基于规则的分句函数](https://github.com/hankcs/HanLP/blob/master/hanlp/utils/rules.py#L19)先行分句。RESTful同时支持全文、句子、已分词的句子。除此之外,RESTful和native两种API的语义设计完全一致,用户可以无缝互换。"
]
}
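,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"例如,可以先用HanLP自带的规则分句函数`split_sentence`切分长文本,再将句子列表交给native API(仅为示意):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from hanlp.utils.rules import split_sentence\n",
"text = '2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。阿婆主来到北京立方庭参观自然语义科技公司。'\n",
"sents = list(split_sentence(text))\n",
"print(sents)\n",
"HanLP(sents, tasks='dep').pretty_print()"
]
}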
],
"metadata": {
"accelerator": "GPU",
"colab": {
"collapsed_sections": [],
"name": "dep_mtl.ipynb",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/dep_restful.ipynb
================================================
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "WfGpInivS0fG"
},
"source": [
"点击下列图标在线运行HanLP \n",
"\n",
"\t
\n",
"\t
\n",
"
\n",
"\n",
"## 安装"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "IYwV-UkNNzFp"
},
"source": [
"无论是Windows、Linux还是macOS,HanLP的安装只需一句话搞定:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "1Uf_u7ddMhUt"
},
"outputs": [],
"source": [
"!pip install hanlp_restful -U"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "pp-1KqEOOJ4t"
},
"source": [
"## 创建客户端"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"id": "0tmKBu7sNAXX"
},
"outputs": [],
"source": [
"from hanlp_restful import HanLPClient\n",
"HanLP = HanLPClient('https://www.hanlp.com/api', auth=None, language='zh') # auth不填则匿名,zh中文,mul多语种"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "EmZDmLn9aGxG"
},
"source": [
"#### 申请秘钥\n",
"由于服务器算力有限,匿名用户每分钟限2次调用。如果你需要更多调用次数,[建议申请免费公益API秘钥auth](https://bbs.hanlp.com/t/hanlp2-1-restful-api/53)。"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "elA_UyssOut_"
},
"source": [
"## 依存句法分析\n",
"任务越少,速度越快。如指定仅执行依存句法分析:"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 70
},
"id": "BqEmDMGGOtk3",
"outputId": "2a0d392f-b99a-4a18-fc7f-754e2abe2e34"
},
"outputs": [],
"source": [
"doc = HanLP('2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', tasks='dep')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"返回值为一个[Document](https://hanlp.hankcs.com/docs/api/common/document.html):"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{\n",
" \"tok/fine\": [\n",
" [\"2021年\", \"HanLPv2.1\", \"为\", \"生产\", \"环境\", \"带来\", \"次\", \"世代\", \"最\", \"先进\", \"的\", \"多\", \"语种\", \"NLP\", \"技术\", \"。\"]\n",
" ],\n",
" \"dep\": [\n",
" [[6, \"tmod\"], [6, \"nsubj\"], [6, \"prep\"], [5, \"nn\"], [3, \"pobj\"], [0, \"root\"], [8, \"clf\"], [10, \"dep\"], [10, \"advmod\"], [15, \"rcmod\"], [10, \"cpm\"], [13, \"nummod\"], [15, \"nn\"], [15, \"nn\"], [6, \"dobj\"], [6, \"punct\"]]\n",
" ]\n",
"}\n"
]
}
],
"source": [
"print(doc)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"`doc['dep']`为句子们的依存句法树列表,第`i`个二元组表示第`i`个单词的`[中心词的下标, 与中心词的依存关系]`。"
]
},
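{
"cell_type": "markdown",
"metadata": {},
"source": [
"据此可以很容易地找到谓语中心词(依附于虚根`0`的单词)及其直接子节点(以下代码基于上文的`doc`,仅为示意):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"words, arcs = doc['tok/fine'][0], doc['dep'][0]\n",
"root = next(i for i, (head, rel) in enumerate(arcs) if head == 0)\n",
"print('谓语中心词:', words[root])\n",
"print('直接子节点:', [words[i] for i, (head, rel) in enumerate(arcs) if head == root + 1])"
]
},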
{
"cell_type": "markdown",
"metadata": {
"id": "wxctCigrTKu-"
},
"source": [
"可视化依存句法树:"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Zo08uquCTFSk",
"outputId": "c6077f2d-7084-4f4b-a3bc-9aa9951704ea"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dep Tree \tToken \tRelati\n",
"─────────────\t─────────\t──────\n",
" ┌─────────►\t2021年 \ttmod \n",
" │┌────────►\tHanLPv2.1\tnsubj \n",
" ││┌─►┌─────\t为 \tprep \n",
" │││ │ ┌─►\t生产 \tnn \n",
" │││ └─►└──\t环境 \tpobj \n",
"┌┬┴┴┴────────\t带来 \troot \n",
"││ ┌─►\t次 \tclf \n",
"││ ┌─►└──\t世代 \tdep \n",
"││ │ ┌─►\t最 \tadvmod\n",
"││ ┌─►└──┼──\t先进 \trcmod \n",
"││ │ └─►\t的 \tcpm \n",
"││ │ ┌─►\t多 \tnummod\n",
"││ │ ┌─►└──\t语种 \tnn \n",
"││ │ │ ┌─►\tNLP \tnn \n",
"│└─►└──┴──┴──\t技术 \tdobj \n",
"└───────────►\t。 \tpunct \n"
]
}
],
"source": [
"doc.pretty_print()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"转换为CoNLL格式:"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1\t2021年\t_\t_\t_\t_\t6\ttmod\t_\t_\n",
"2\tHanLPv2.1\t_\t_\t_\t_\t6\tnsubj\t_\t_\n",
"3\t为\t_\t_\t_\t_\t6\tprep\t_\t_\n",
"4\t生产\t_\t_\t_\t_\t5\tnn\t_\t_\n",
"5\t环境\t_\t_\t_\t_\t3\tpobj\t_\t_\n",
"6\t带来\t_\t_\t_\t_\t0\troot\t_\t_\n",
"7\t次\t_\t_\t_\t_\t8\tclf\t_\t_\n",
"8\t世代\t_\t_\t_\t_\t10\tdep\t_\t_\n",
"9\t最\t_\t_\t_\t_\t10\tadvmod\t_\t_\n",
"10\t先进\t_\t_\t_\t_\t15\trcmod\t_\t_\n",
"11\t的\t_\t_\t_\t_\t10\tcpm\t_\t_\n",
"12\t多\t_\t_\t_\t_\t13\tnummod\t_\t_\n",
"13\t语种\t_\t_\t_\t_\t15\tnn\t_\t_\n",
"14\tNLP\t_\t_\t_\t_\t15\tnn\t_\t_\n",
"15\t技术\t_\t_\t_\t_\t6\tdobj\t_\t_\n",
"16\t。\t_\t_\t_\t_\t6\tpunct\t_\t_\n"
]
}
],
"source": [
"print(doc.to_conll())"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "XOsWkOqQfzlr"
},
"source": [
"为已分词的句子执行依存句法分析:"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 70
},
"id": "bLZSTbv_f3OA",
"outputId": "111c0be9-bac6-4eee-d5bd-a972ffc34844"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dep Tree \tToken\tRelati\n",
"───────────\t─────\t──────\n",
" ┌────────►\tHanLP\tnsubj \n",
" │┌─►┌─────\t为 \tprep \n",
" ││ │ ┌─►\t生产 \tnn \n",
" ││ └─►└──\t环境 \tpobj \n",
"┌┼┴────────\t带来 \troot \n",
"││ ┌──►\t次世代 \tdep \n",
"││ │┌─►\t最 \tadvmod\n",
"││ ┌─►└┼──\t先进 \trcmod \n",
"││ │ └─►\t的 \tcpm \n",
"││ │ ┌──►\t多语种 \tnn \n",
"││ │ │┌─►\tNLP \tnn \n",
"│└─►└──┴┴──\t技术 \tdobj \n",
"└─────────►\t。 \tpunct \n",
"\n",
"Dep Tree \tTok\tRelation \n",
"────────────────\t───\t─────────\n",
" ┌─►┌──\t我 \tassmod \n",
" │ └─►\t的 \tassm \n",
" ┌─►└─────\t希望 \ttop \n",
"┌┬─────┴────────\t是 \troot \n",
"│└─►┌───────────\t希望 \tccomp \n",
"│ │ ┌─►┌──\t张晚霞\tassmod \n",
"│ │ │ └─►\t的 \tassm \n",
"│ │ ┌─►└─────\t背影 \tnsubjpass\n",
"│ └─►└──┬─────\t被 \tccomp \n",
"│ │ ┌─►\t晚霞 \tnsubj \n",
"│ └─►└──\t映红 \tdep \n",
"└──────────────►\t。 \tpunct \n"
]
}
],
"source": [
"HanLP(tokens=[\n",
" [\"HanLP\", \"为\", \"生产\", \"环境\", \"带来\", \"次世代\", \"最\", \"先进\", \"的\", \"多语种\", \"NLP\", \"技术\", \"。\"],\n",
" [\"我\", \"的\", \"希望\", \"是\", \"希望\", \"张晚霞\", \"的\", \"背影\", \"被\", \"晚霞\", \"映红\", \"。\"]\n",
" ], tasks='dep').pretty_print()"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"collapsed_sections": [],
"name": "dep_restful.ipynb",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/dep_stl.ipynb
================================================
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "WfGpInivS0fG"
},
"source": [
"点击下列图标在线运行HanLP \n",
"\n",
"\t
\n",
"\t
\n",
"
\n",
"\n",
"## 安装"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "IYwV-UkNNzFp"
},
"source": [
"无论是Windows、Linux还是macOS,HanLP的安装只需一句话搞定:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "1Uf_u7ddMhUt"
},
"outputs": [],
"source": [
"!pip install hanlp -U"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "pp-1KqEOOJ4t"
},
"source": [
"## 加载模型\n",
"HanLP的工作流程是先加载模型,模型的标示符存储在`hanlp.pretrained`这个包中,按照NLP任务归类。"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "4M7ka0K5OMWU",
"outputId": "69cdad22-d94d-41fb-9591-1c29515a3da9"
},
"outputs": [
{
"data": {
"text/plain": [
"{'CTB5_BIAFFINE_DEP_ZH': 'https://file.hankcs.com/hanlp/dep/biaffine_ctb5_20191229_025833.zip',\n",
" 'CTB7_BIAFFINE_DEP_ZH': 'https://file.hankcs.com/hanlp/dep/biaffine_ctb7_20200109_022431.zip',\n",
" 'CTB9_DEP_ELECTRA_SMALL': 'https://file.hankcs.com/hanlp/dep/ctb9_dep_electra_small_20220216_100306.zip',\n",
" 'PMT1_DEP_ELECTRA_SMALL': 'https://file.hankcs.com/hanlp/dep/pmt_dep_electra_small_20220218_134518.zip',\n",
" 'CTB9_UDC_ELECTRA_SMALL': 'https://file.hankcs.com/hanlp/dep/udc_dep_electra_small_20220218_095452.zip',\n",
" 'PTB_BIAFFINE_DEP_EN': 'https://file.hankcs.com/hanlp/dep/ptb_dep_biaffine_20200101_174624.zip'}"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import hanlp\n",
"hanlp.pretrained.dep.ALL # 语种见名称最后一个字段或相应语料库"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "BMW528wGNulM"
},
"source": [
"调用`hanlp.load`进行加载,模型会自动下载到本地缓存:"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"id": "0tmKBu7sNAXX"
},
"outputs": [],
"source": [
"dep = hanlp.load(hanlp.pretrained.dep.CTB9_DEP_ELECTRA_SMALL)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "elA_UyssOut_"
},
"source": [
"## 依存句法分析\n",
"依存句法分析任务的输入为已分词的一个或多个句子:"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"id": "BqEmDMGGOtk3"
},
"outputs": [],
"source": [
"tree = dep([\"2021年\", \"HanLPv2.1\", \"带来\", \"次\", \"世代\", \"最\", \"先进\", \"的\", \"多\", \"语种\", \"NLP\", \"技术\", \"。\"])"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "jj1Jk-2sPHYx"
},
"source": [
"返回对象为[CoNLLSentence](https://hanlp.hankcs.com/docs/api/common/conll.html#hanlp_common.conll.CoNLLSentence)类型:"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "U_PGm06m6K20",
"outputId": "a25c6452-5032-42b3-d501-99158380c487"
},
"outputs": [
{
"data": {
"text/plain": [
"[{'id': 1,\n",
" 'form': '2021年',\n",
" 'cpos': None,\n",
" 'pos': None,\n",
" 'head': 3,\n",
" 'deprel': 'tmod',\n",
" 'lemma': None,\n",
" 'feats': None,\n",
" 'phead': None,\n",
" 'pdeprel': None},\n",
" {'id': 2,\n",
" 'form': 'HanLPv2.1',\n",
" 'cpos': None,\n",
" 'pos': None,\n",
" 'head': 3,\n",
" 'deprel': 'nsubj',\n",
" 'lemma': None,\n",
" 'feats': None,\n",
" 'phead': None,\n",
" 'pdeprel': None},\n",
" {'id': 3,\n",
" 'form': '带来',\n",
" 'cpos': None,\n",
" 'pos': None,\n",
" 'head': 0,\n",
" 'deprel': 'root',\n",
" 'lemma': None,\n",
" 'feats': None,\n",
" 'phead': None,\n",
" 'pdeprel': None},\n",
" {'id': 4,\n",
" 'form': '次',\n",
" 'cpos': None,\n",
" 'pos': None,\n",
" 'head': 5,\n",
" 'deprel': 'det',\n",
" 'lemma': None,\n",
" 'feats': None,\n",
" 'phead': None,\n",
" 'pdeprel': None},\n",
" {'id': 5,\n",
" 'form': '世代',\n",
" 'cpos': None,\n",
" 'pos': None,\n",
" 'head': 7,\n",
" 'deprel': 'dep',\n",
" 'lemma': None,\n",
" 'feats': None,\n",
" 'phead': None,\n",
" 'pdeprel': None},\n",
" {'id': 6,\n",
" 'form': '最',\n",
" 'cpos': None,\n",
" 'pos': None,\n",
" 'head': 7,\n",
" 'deprel': 'advmod',\n",
" 'lemma': None,\n",
" 'feats': None,\n",
" 'phead': None,\n",
" 'pdeprel': None},\n",
" {'id': 7,\n",
" 'form': '先进',\n",
" 'cpos': None,\n",
" 'pos': None,\n",
" 'head': 12,\n",
" 'deprel': 'rcmod',\n",
" 'lemma': None,\n",
" 'feats': None,\n",
" 'phead': None,\n",
" 'pdeprel': None},\n",
" {'id': 8,\n",
" 'form': '的',\n",
" 'cpos': None,\n",
" 'pos': None,\n",
" 'head': 7,\n",
" 'deprel': 'cpm',\n",
" 'lemma': None,\n",
" 'feats': None,\n",
" 'phead': None,\n",
" 'pdeprel': None},\n",
" {'id': 9,\n",
" 'form': '多',\n",
" 'cpos': None,\n",
" 'pos': None,\n",
" 'head': 10,\n",
" 'deprel': 'nummod',\n",
" 'lemma': None,\n",
" 'feats': None,\n",
" 'phead': None,\n",
" 'pdeprel': None},\n",
" {'id': 10,\n",
" 'form': '语种',\n",
" 'cpos': None,\n",
" 'pos': None,\n",
" 'head': 12,\n",
" 'deprel': 'nn',\n",
" 'lemma': None,\n",
" 'feats': None,\n",
" 'phead': None,\n",
" 'pdeprel': None},\n",
" {'id': 11,\n",
" 'form': 'NLP',\n",
" 'cpos': None,\n",
" 'pos': None,\n",
" 'head': 12,\n",
" 'deprel': 'nn',\n",
" 'lemma': None,\n",
" 'feats': None,\n",
" 'phead': None,\n",
" 'pdeprel': None},\n",
" {'id': 12,\n",
" 'form': '技术',\n",
" 'cpos': None,\n",
" 'pos': None,\n",
" 'head': 3,\n",
" 'deprel': 'dobj',\n",
" 'lemma': None,\n",
" 'feats': None,\n",
" 'phead': None,\n",
" 'pdeprel': None},\n",
" {'id': 13,\n",
" 'form': '。',\n",
" 'cpos': None,\n",
" 'pos': None,\n",
" 'head': 3,\n",
" 'deprel': 'punct',\n",
" 'lemma': None,\n",
" 'feats': None,\n",
" 'phead': None,\n",
" 'pdeprel': None}]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tree"
]
},
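{
"cell_type": "markdown",
"metadata": {},
"source": [
"`CoNLLSentence`是`CoNLLWord`的列表,每个词的字段均可按属性访问(此处仅演示`id`、`form`、`head`、`deprel`四个字段):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for word in tree:\n",
"    print(word.id, word.form, word.head, word.deprel)"
]
},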
{
"cell_type": "markdown",
"metadata": {
"id": "Gn_RQa_Z6K20"
},
"source": [
"打印时为CoNLL格式:"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "26P1LGzv6K20",
"outputId": "c78ffdb0-3cd7-492d-f55e-0d50120faffb"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1\t2021年\t_\t_\t_\t_\t3\ttmod\t_\t_\n",
"2\tHanLPv2.1\t_\t_\t_\t_\t3\tnsubj\t_\t_\n",
"3\t带来\t_\t_\t_\t_\t0\troot\t_\t_\n",
"4\t次\t_\t_\t_\t_\t5\tdet\t_\t_\n",
"5\t世代\t_\t_\t_\t_\t7\tdep\t_\t_\n",
"6\t最\t_\t_\t_\t_\t7\tadvmod\t_\t_\n",
"7\t先进\t_\t_\t_\t_\t12\trcmod\t_\t_\n",
"8\t的\t_\t_\t_\t_\t7\tcpm\t_\t_\n",
"9\t多\t_\t_\t_\t_\t10\tnummod\t_\t_\n",
"10\t语种\t_\t_\t_\t_\t12\tnn\t_\t_\n",
"11\tNLP\t_\t_\t_\t_\t12\tnn\t_\t_\n",
"12\t技术\t_\t_\t_\t_\t3\tdobj\t_\t_\n",
"13\t。\t_\t_\t_\t_\t3\tpunct\t_\t_\n"
]
}
],
"source": [
"print(tree)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"如果不需要CoNLL格式的话,也许`conll=False`时的输出更加简洁:"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[(3, 'tmod'),\n",
" (3, 'nsubj'),\n",
" (0, 'root'),\n",
" (5, 'det'),\n",
" (7, 'dep'),\n",
" (7, 'advmod'),\n",
" (12, 'rcmod'),\n",
" (7, 'cpm'),\n",
" (10, 'nummod'),\n",
" (12, 'nn'),\n",
" (12, 'nn'),\n",
" (3, 'dobj'),\n",
" (3, 'punct')]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dep([\"2021年\", \"HanLPv2.1\", \"带来\", \"次\", \"世代\", \"最\", \"先进\", \"的\", \"多\", \"语种\", \"NLP\", \"技术\", \"。\"], conll=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 可视化\n",
"你可以构造一个`Document`实现漂亮的可视化:"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"Dep Tree ───────────── ┌──► │┌─► ┌┬───────┴┴── ││ ┌─► ││ ┌─►└── ││ │ ┌─► ││ ┌─►└──┼── ││ │ └─► ││ │ ┌─► ││ │ ┌─►└── ││ │ │ ┌─► │└─►└──┴──┴── └───────────► Token ───────── 2021年 HanLPv2.1 带来 次 世代 最 先进 的 多 语种 NLP 技术 。 Relati ────── tmod nsubj root det dep advmod rcmod cpm nummod nn nn dobj punct "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from hanlp_common.document import Document\n",
"doc = Document(\n",
" tok=[\"2021年\", \"HanLPv2.1\", \"带来\", \"次\", \"世代\", \"最\", \"先进\", \"的\", \"多\", \"语种\", \"NLP\", \"技术\", \"。\"],\n",
" dep=[(3, 'tmod'), (3, 'nsubj'), (0, 'root'), (5, 'det'), (7, 'dep'), (7, 'advmod'), (12, 'rcmod'), (7, 'cpm'), (10, 'nummod'), (12, 'nn'), (12, 'nn'), (3, 'dobj'), (3, 'punct')]\n",
")\n",
"doc.pretty_print()"
]
}
],
"metadata": {
"colab": {
"collapsed_sections": [],
"name": "dep_stl.ipynb",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/extractive_summarization_restful.ipynb
================================================
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "WfGpInivS0fG"
},
"source": [
"点击下列图标在线运行HanLP \n",
"\n",
"\t
\n",
"\t
\n",
"
\n",
"\n",
"## 安装"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "IYwV-UkNNzFp"
},
"source": [
"无论是Windows、Linux还是macOS,HanLP的安装只需一句话搞定:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "1Uf_u7ddMhUt"
},
"outputs": [],
"source": [
"!pip install hanlp_restful -U"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "pp-1KqEOOJ4t"
},
"source": [
"## 创建客户端"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "4M7ka0K5OMWU",
"outputId": "d74f0749-0587-454a-d7c9-7418d45ce534"
},
"outputs": [],
"source": [
"from hanlp_restful import HanLPClient\n",
"HanLP = HanLPClient('https://www.hanlp.com/api', auth=None, language='zh') # auth不填则匿名,zh中文,mul多语种"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "BMW528wGNulM"
},
"source": [
"#### 申请秘钥\n",
"由于服务器算力有限,匿名用户每分钟限2次调用。如果你需要更多调用次数,[建议申请免费公益API秘钥auth](https://bbs.hanlp.com/t/hanlp2-1-restful-api/53)。"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "elA_UyssOut_"
},
"source": [
"## 抽取式自动摘要\n",
"抽取式自动摘要的目标是从文章中筛选出一些作为摘要的中心句子:既要紧扣要点,又要避免赘语。\n",
"### 中文\n",
"抽取式自动摘要任务的输入为一段文本和所需的摘要句子数量的最大值`topk`:"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "BqEmDMGGOtk3",
"outputId": "936d439a-e1ff-4308-d2aa-775955558594"
},
"outputs": [
{
"data": {
"text/plain": [
"{'据DigiTimes报道,在上海疫情趋缓,防疫管控开始放松后,苹果供应商广达正在逐步恢复其中国工厂的MacBook产品生产。': 0.9999685883522034,\n",
" '仍有许多苹果笔记本用户在等待3月和4月订购的MacBook Pro机型到货,由于苹果的供应问题,他们的发货时间被大大推迟了。': 0.5798477530479431,\n",
" '尽管MacBook Pro的生产逐渐恢复,但供应问题预计依然影响2022年第三季度的产品销售。': 0.5435440540313721}"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"text = '''\n",
"据DigiTimes报道,在上海疫情趋缓,防疫管控开始放松后,苹果供应商广达正在逐步恢复其中国工厂的MacBook产品生产。\n",
"据供应链消息人士称,生产厂的订单拉动情况正在慢慢转强,这会提高MacBook Pro机型的供应量,并缩短苹果客户在过去几周所经历的延长交货时间。\n",
"仍有许多苹果笔记本用户在等待3月和4月订购的MacBook Pro机型到货,由于苹果的供应问题,他们的发货时间被大大推迟了。\n",
"据分析师郭明錤表示,广达是高端MacBook Pro的唯一供应商,自防疫封控依赖,MacBook Pro大部分型号交货时间增加了三到五周,\n",
"一些高端定制型号的MacBook Pro配置要到6月底到7月初才能交货。\n",
"尽管MacBook Pro的生产逐渐恢复,但供应问题预计依然影响2022年第三季度的产品销售。\n",
"苹果上周表示,防疫措施和元部件短缺将继续使其难以生产足够的产品来满足消费者的强劲需求,这最终将影响苹果6月份的收入。\n",
"'''\n",
"HanLP.extractive_summarization(text, topk=3)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "jj1Jk-2sPHYx"
},
"source": [
"返回值为最多`topk`个摘要句子以及相应的权重,权重取值区间为$[0, 1]$。由于Trigram Blocking技巧,实际返回的摘要句数量可能小于`topk`。"
]
},
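{
"cell_type": "markdown",
"metadata": {},
"source": [
"Trigram Blocking的思想是:按权重从高到低挑选句子,若候选句与已选句子共享任何一个trigram则视为冗余而跳过。下面是一个极简示意(以字符三元组为例,仅演示思想,并非HanLP服务端的实际实现):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def trigrams(sent):\n",
"    return {sent[i:i + 3] for i in range(len(sent) - 2)}\n",
"\n",
"def trigram_blocking(scores, topk=3):\n",
"    # 仅为思想示意:与已选句子存在trigram重叠的候选句被跳过\n",
"    selected, seen = [], set()\n",
"    for sent, _ in sorted(scores.items(), key=lambda x: -x[1]):\n",
"        if trigrams(sent) & seen:\n",
"            continue\n",
"        selected.append(sent)\n",
"        seen |= trigrams(sent)\n",
"        if len(selected) == topk:\n",
"            break\n",
"    return selected\n",
"\n",
"trigram_blocking(HanLP.extractive_summarization(text, topk=100))"
]
},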
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 可视化"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"据DigiTimes报道,在上海疫情趋缓,防疫管控开始放松后,苹果供应商广达正在逐步恢复其中国工厂的MacBook产品生产。 \n",
"据供应链消息人士称,生产厂的订单拉动情况正在慢慢转强,这会提高MacBook Pro机型的供应量,并缩短苹果客户在过去几周所经历的延长交货时间。\n",
"仍有许多苹果笔记本用户在等待3月和4月订购的MacBook Pro机型到货,由于苹果的供应问题,他们的发货时间被大大推迟了。 \n",
"据分析师郭明錤表示,广达是高端MacBook Pro的唯一供应商,自防疫封控依赖,MacBook Pro大部分型号交货时间增加了三到五周,\n",
"一些高端定制型号的MacBook Pro配置要到6月底到7月初才能交货。\n",
"尽管MacBook Pro的生产逐渐恢复,但供应问题预计依然影响2022年第三季度的产品销售。 \n",
"苹果上周表示,防疫措施和元部件短缺将继续使其难以生产足够的产品来满足消费者的强劲需求,这最终将影响苹果6月份的收入。 \n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"def highlight(text, scores):\n",
" for k, v in scores.items():\n",
" text = text.replace(k, f'{k} ')\n",
" from IPython.display import display, HTML\n",
" display(HTML(text))\n",
"\n",
"scores = HanLP.extractive_summarization(text, topk=100)\n",
"highlight(text, scores)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 繁体中文\n",
"HanLP的抽取式自动摘要接口支持繁体中文:"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'華爾街日報周二(3日)報導,根據知情人透露,日前已宣布將以440億美元買下推特(Twitter)並下市的馬斯克,曾經跟一些潛在投資人說,他可以在短短幾年後,再將這家社群媒體公司重新上市。': 0.9999818205833435,\n",
" '消息來源說,特斯拉創辦人兼執行長馬斯克表示,他計劃在買下推特後最短三年內,就展開推特的首次公開發行股票。': 0.503434419631958,\n",
" '根據之前華爾街日報的報導,馬斯克為購買推特籌現金時,與私募股權公司等投資人討論出資事宜,Apollo Global Management有興趣參與。': 0.2688594460487366}"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"text = '''\n",
"華爾街日報周二(3日)報導,根據知情人透露,日前已宣布將以440億美元買下推特(Twitter)並下市的馬斯克,曾經跟一些潛在投資人說,他可以在短短幾年後,再將這家社群媒體公司重新上市。\n",
"消息來源說,特斯拉創辦人兼執行長馬斯克表示,他計劃在買下推特後最短三年內,就展開推特的首次公開發行股票。\n",
"馬斯克買推特的交易案預期在今年稍後走完程序,包括獲得股東同意以及監管機關核准等步驟。\n",
"根據之前華爾街日報的報導,馬斯克為購買推特籌現金時,與私募股權公司等投資人討論出資事宜,Apollo Global Management有興趣參與。\n",
"私募股權公司通常都先買下公司將之私有化,把公司移出眾人注目的焦點之外以後,整頓公司,接著再把公司上市,時間常是五年左右。\n",
"華爾街日報指出,馬斯克暗示他對推特有類似的規劃的話,有助說服潛在投資人,他會很快行動,改善推特的營運和獲利。\n",
"'''\n",
"scores = HanLP.extractive_summarization(text)\n",
"scores"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"華爾街日報周二(3日)報導,根據知情人透露,日前已宣布將以440億美元買下推特(Twitter)並下市的馬斯克,曾經跟一些潛在投資人說,他可以在短短幾年後,再將這家社群媒體公司重新上市。 \n",
"消息來源說,特斯拉創辦人兼執行長馬斯克表示,他計劃在買下推特後最短三年內,就展開推特的首次公開發行股票。 \n",
"馬斯克買推特的交易案預期在今年稍後走完程序,包括獲得股東同意以及監管機關核准等步驟。\n",
"根據之前華爾街日報的報導,馬斯克為購買推特籌現金時,與私募股權公司等投資人討論出資事宜,Apollo Global Management有興趣參與。 \n",
"私募股權公司通常都先買下公司將之私有化,把公司移出眾人注目的焦點之外以後,整頓公司,接著再把公司上市,時間常是五年左右。\n",
"華爾街日報指出,馬斯克暗示他對推特有類似的規劃的話,有助說服潛在投資人,他會很快行動,改善推特的營運和獲利。\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"highlight(text, scores)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 英文\n",
"按照HanLP一贯的多语种设计,任何语言都支持。由于服务器GPU资源限制,目前英文接口暂未上线。如果你有相应需求,欢迎前往论坛发起请愿。"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"collapsed_sections": [],
"name": "extractive_summarization_restful.ipynb",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/gec_restful.ipynb
================================================
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "WfGpInivS0fG"
},
"source": [
"点击下列图标在线运行HanLP \n",
"\n",
"\t
\n",
"\t
\n",
"
\n",
"\n",
"## 安装"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "IYwV-UkNNzFp"
},
"source": [
"无论是Windows、Linux还是macOS,HanLP的安装只需一句话搞定:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "1Uf_u7ddMhUt"
},
"outputs": [],
"source": [
"!pip install hanlp_restful -U"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "pp-1KqEOOJ4t"
},
"source": [
"## 创建客户端"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"id": "0tmKBu7sNAXX"
},
"outputs": [],
"source": [
"from hanlp_restful import HanLPClient\n",
"HanLP = HanLPClient('https://www.hanlp.com/api', auth=None, language='zh') # auth不填则匿名,zh中文,mul多语种"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "EmZDmLn9aGxG"
},
"source": [
"#### 申请秘钥\n",
"由于服务器算力有限,匿名用户每分钟限2次调用。如果你需要更多调用次数,[建议申请免费公益API秘钥auth](https://bbs.hanlp.com/t/hanlp2-1-restful-api/53)。"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "elA_UyssOut_"
},
"source": [
"## 语法纠错\n",
"输入短文本,执行语法纠错:"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 70
},
"id": "BqEmDMGGOtk3",
"outputId": "2a0d392f-b99a-4a18-fc7f-754e2abe2e34"
},
"outputs": [
{
"data": {
"text/plain": [
"['每个青年都应当有远大的抱负。', '有的同学对语言很有兴趣。', '我市本地居民约占全市人口的70%。']"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"HanLP.grammatical_error_correction(['每个青年都应当有远大的报复。', '有的同学对语言很兴趣。', '我市本地居民约占全市人口的70%多。'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"返回值是每段短文本的修改结果列表。"
]
},
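{
"cell_type": "markdown",
"metadata": {},
"source": [
"可以借助标准库`difflib`直观地对比纠错前后的差异(仅为示意):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import difflib\n",
"src = '每个青年都应当有远大的报复。'\n",
"tgt = HanLP.grammatical_error_correction([src])[0]\n",
"for op, i1, i2, j1, j2 in difflib.SequenceMatcher(None, src, tgt).get_opcodes():\n",
"    if op != 'equal':\n",
"        print(f'{op}: {src[i1:i2]!r} -> {tgt[j1:j2]!r}')"
]
},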
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 测试版\n",
"当前版本为测试版,暂时仅支持拼写、标点和简单的语法错误,HanLP的线上模型和语料库仍然在迭代发展中。欢迎广大用户将测试版的问题反馈到[论坛](https://bbs.hankcs.com/c/text-generation/gec/30),我们将在下一个版本中,将HanLP的文本纠错能力提升到高考语文水平。"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"collapsed_sections": [],
"name": "gec_restful.ipynb",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/keyphrase_restful.ipynb
================================================
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "WfGpInivS0fG"
},
"source": [
"点击下列图标在线运行HanLP \n",
"\n",
"\t
\n",
"\t
\n",
"
\n",
"\n",
"## 安装"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "IYwV-UkNNzFp"
},
"source": [
"无论是Windows、Linux还是macOS,HanLP的安装只需一句话搞定:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "1Uf_u7ddMhUt"
},
"outputs": [],
"source": [
"!pip install hanlp_restful -U"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "pp-1KqEOOJ4t"
},
"source": [
"## 创建客户端"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "4M7ka0K5OMWU",
"outputId": "d74f0749-0587-454a-d7c9-7418d45ce534"
},
"outputs": [],
"source": [
"from hanlp_restful import HanLPClient\n",
"HanLP = HanLPClient('https://www.hanlp.com/api', auth=None, language='zh') # auth不填则匿名,zh中文,mul多语种"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "BMW528wGNulM"
},
"source": [
"#### 申请秘钥\n",
"由于服务器算力有限,匿名用户每分钟限2次调用。如果你需要更多调用次数,[建议申请免费公益API秘钥auth](https://bbs.hanlp.com/t/hanlp2-1-restful-api/53)。"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "elA_UyssOut_"
},
"source": [
"## 关键词提取\n",
"关键词(短语)提取的目标是文本中最具有代表性的关键词以及短语。\n",
"### 中文\n",
"关键词提取任务的输入为一段文本和所需的关键词数量`topk`:"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "BqEmDMGGOtk3",
"outputId": "936d439a-e1ff-4308-d2aa-775955558594"
},
"outputs": [
{
"data": {
"text/plain": [
"{'自然语言处理': 0.800000011920929,\n",
" 'HanLP的全部性能': 0.5256577134132385,\n",
" '一门博大精深的学科': 0.42154020071029663}"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"HanLP.keyphrase_extraction('自然语言处理是一门博大精深的学科,掌握理论才能发挥出HanLP的全部性能。 '\n",
" '《自然语言处理入门》是一本配套HanLP的NLP入门书,助你零起点上手自然语言处理。', topk=3)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "jj1Jk-2sPHYx"
},
"source": [
"返回值为`topk`个关键词以及相应的权重,权重取值区间为$[0, 1]$。"
]
},
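{
"cell_type": "markdown",
"metadata": {},
"source": [
"实际应用中往往只保留权重高于某个阈值的关键词。以下示意如何过滤(阈值0.5为演示用的假设值,需按业务调节):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"scores = HanLP.keyphrase_extraction('自然语言处理是一门博大精深的学科,掌握理论才能发挥出HanLP的全部性能。 '\n",
"                                    '《自然语言处理入门》是一本配套HanLP的NLP入门书,助你零起点上手自然语言处理。', topk=3)\n",
"print({k: v for k, v in scores.items() if v > 0.5})  # 阈值0.5为假设值"
]
},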
{
"cell_type": "markdown",
"metadata": {},
"source": [
"关键词提取并不仅限于短文本,长文章也一样支持:"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'新冠病毒核酸阳性感染': 0.888239324092865,\n",
" '确诊病例': 0.8868124485015869,\n",
" '本土无症状感染者': 0.8557102680206299,\n",
" '属地社区(村屯)': 0.8164600133895874,\n",
" '疫情防控工作': 0.7749382853507996,\n",
" '我市疫情防控要求': 0.7502512335777283,\n",
" '症状': 0.669366180896759,\n",
" '我市疫情形势': 0.6673010587692261,\n",
" '感染': 0.6663177013397217,\n",
" '本土确诊病例': 0.6464788317680359}"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"doc = '''\n",
"4月15日0-24时,长春市新增本土确诊病例157例(含57例无症状感染者转为确诊病例),新增本土无症状感染者407例。\n",
"以上人员均为隔离管控期间筛查新冠病毒核酸阳性感染者。\n",
"当前我市疫情形势严峻,为做好全市疫情防控工作,尽快恢复正常社会秩序和经济社会发展,长春市新冠肺炎疫情防控工作领导小组办公室提醒广大市民,\n",
"请严格遵守我市疫情防控要求,配合各部门落实好防控措施,进一步提高防范意识,坚持规范戴口罩、勤洗手、常通风、保持社交距离、不聚餐、不聚集,\n",
"减少疾病感染风险。一旦出现发热、干咳、乏力、咽痛、嗅味觉减退或丧失等不适症状,应及时向属地社区(村屯)或疾控机构报告。\n",
"'''\n",
"HanLP.keyphrase_extraction(doc)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 可视化"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"4月15日0-24时,长春市新增本土确诊病例 157例(含57例无症状 感染 者转为确诊病例 ),新增本土无症状 感染 者 407例。\n",
"以上人员均为隔离管控期间筛查新冠病毒核酸阳性感染 者。\n",
"当前我市疫情形势 严峻,为做好全市疫情防控工作 ,尽快恢复正常社会秩序和经济社会发展,长春市新冠肺炎疫情防控工作 领导小组办公室提醒广大市民,\n",
"请严格遵守我市疫情防控要求 ,配合各部门落实好防控措施,进一步提高防范意识,坚持规范戴口罩、勤洗手、常通风、保持社交距离、不聚餐、不聚集,\n",
"减少疾病感染 风险。一旦出现发热、干咳、乏力、咽痛、嗅味觉减退或丧失等不适症状 ,应及时向属地社区(村屯) 或疾控机构报告。\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"def highlight(text, scores):\n",
" for k, v in scores.items():\n",
" text = text.replace(k, f'{k} ')\n",
" from IPython.display import display, HTML\n",
" display(HTML(text))\n",
"\n",
"scores = HanLP.keyphrase_extraction(doc)\n",
"highlight(doc, scores)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 英文\n",
"按照HanLP一贯的多语种设计,任何语言都支持。由于服务器GPU资源限制,目前英文接口暂未上线。如果你有相应需求,欢迎前往论坛发起请愿。"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"collapsed_sections": [],
"name": "keyphrase_restful.ipynb",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/lid_restful.ipynb
================================================
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "WfGpInivS0fG"
},
"source": [
"点击下列图标在线运行HanLP \n",
"\n",
"\t
\n",
"\t
\n",
"
\n",
"\n",
"## 安装"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "nf9TgeCTC0OT"
},
"source": [
"无论是Windows、Linux还是macOS,HanLP的安装只需一句话搞定:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "jaW4eu6kC0OU",
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"!pip install hanlp_restful -U"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "_xI_bLAaC0OU"
},
"source": [
"## 创建客户端"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "IYwV-UkNNzFp",
"outputId": "54065443-9b0a-444c-f6c0-c701bc86400b",
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"from hanlp_restful import HanLPClient\n",
"HanLP = HanLPClient('https://www.hanlp.com/api', auth=None, language='zh') # auth不填则匿名,zh中文,mul多语种"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "1Uf_u7ddMhUt",
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"#### 申请秘钥\n",
"由于服务器算力有限,匿名用户每分钟限2次调用。如果你需要更多调用次数,[建议申请免费公益API秘钥auth](https://bbs.hanlp.com/t/hanlp2-1-restful-api/53)。"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "elA_UyssOut_"
},
"source": [
"## 语种识别\n",
"语种识别任务的输入为一个或多个文档:"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"id": "BqEmDMGGOtk3"
},
"outputs": [
{
"data": {
"text/plain": [
"'en'"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"HanLP.language_identification('In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environments.')"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "SwaPn1hjC0OW"
},
"source": [
"返回对象为[ISO 639-1编码](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes)。HanLP支持返回语种对应的概率(置信度):"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "egpWwHKxC0OX",
"outputId": "f7c77687-dd75-4fa2-dbd2-be6bda8a3fff"
},
"outputs": [
{
"data": {
"text/plain": [
"['ja', 0.9976244568824768]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"HanLP.language_identification('2021年、HanLPv2.1は次世代の最先端多言語NLP技術を本番環境に導入します。', prob=True)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "kq_j5TLFC0OX"
},
"source": [
"HanLP也支持返回概率最高的`topk`个语种:"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "isJhzYyIC0OX",
"outputId": "683c8489-dffc-426e-f95b-e91dfb373260"
},
"outputs": [
{
"data": {
"text/plain": [
"['zh', 'ja']"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"HanLP.language_identification('2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', topk=2)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"该功能对于混合了多个语种的文档而言特别实用:"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'zh': 0.3952908217906952,\n",
" 'en': 0.37189167737960815,\n",
" 'ja': 0.056213412433862686}"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"text = '''\n",
"2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。\n",
"In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environments.\n",
"'''\n",
"\n",
"HanLP.language_identification(text, topk=3, prob=True)"
]
}
],
"metadata": {
"colab": {
"collapsed_sections": [],
"name": "lid_restful.ipynb",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/lid_stl.ipynb
================================================
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "WfGpInivS0fG"
},
"source": [
"点击下列图标在线运行HanLP \n",
"\n",
"\t
\n",
"\t
\n",
"
\n",
"\n",
"## 安装"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "nf9TgeCTC0OT"
},
"source": [
"无论是Windows、Linux还是macOS,HanLP的安装只需一句话搞定:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "jaW4eu6kC0OU",
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"!pip install hanlp[fasttext] -U"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "_xI_bLAaC0OU"
},
"source": [
"## 加载模型\n",
"HanLP的工作流程是先加载模型,模型的标示符存储在`hanlp.pretrained`这个包中,按照NLP任务归类。"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "IYwV-UkNNzFp",
"outputId": "54065443-9b0a-444c-f6c0-c701bc86400b",
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/plain": [
"{'CHNSENTICORP_BERT_BASE_ZH': 'https://file.hankcs.com/hanlp/classification/chnsenticorp_bert_base_20211228_163210.zip',\n",
" 'SST2_ALBERT_BASE_EN': 'https://file.hankcs.com/hanlp/classification/sst2_albert_base_20211228_164917.zip',\n",
" 'LID_176_FASTTEXT_BASE': 'https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin',\n",
" 'LID_176_FASTTEXT_SMALL': 'https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz'}"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import hanlp\n",
"hanlp.pretrained.classifiers.ALL # 任务见第一个字段"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "1Uf_u7ddMhUt",
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"调用`hanlp.load`进行加载,模型会自动下载到本地缓存。"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"id": "pp-1KqEOOJ4t",
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.\n"
]
}
],
"source": [
"lid = hanlp.load('LID_176_FASTTEXT_BASE')"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "elA_UyssOut_"
},
"source": [
"## 语种识别\n",
"语种识别任务的输入为一个或多个文档:"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"id": "BqEmDMGGOtk3"
},
"outputs": [
{
"data": {
"text/plain": [
"'en'"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lid('In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environments.')"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "SwaPn1hjC0OW"
},
"source": [
"返回对象为[ISO 639-1编码](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes)。HanLP支持返回语种对应的概率(置信度):"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "egpWwHKxC0OX",
"outputId": "f7c77687-dd75-4fa2-dbd2-be6bda8a3fff"
},
"outputs": [
{
"data": {
"text/plain": [
"('ja', 0.9976244568824768)"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lid('2021年、HanLPv2.1は次世代の最先端多言語NLP技術を本番環境に導入します。', prob=True)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "kq_j5TLFC0OX"
},
"source": [
"HanLP也支持返回概率最高的`topk`个语种:"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "isJhzYyIC0OX",
"outputId": "683c8489-dffc-426e-f95b-e91dfb373260"
},
"outputs": [
{
"data": {
"text/plain": [
"['zh', 'ja']"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lid('2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', topk=2)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"该功能对于混合了多个语种的文档而言特别实用:"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'zh': 0.3952908217906952,\n",
" 'en': 0.37189167737960815,\n",
" 'ja': 0.056213412433862686}"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"text = '''\n",
"2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。\n",
"In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environments.\n",
"'''\n",
"\n",
"lid(text, topk=3, prob=True)"
]
}
],
"metadata": {
"colab": {
"collapsed_sections": [],
"name": "lid_stl.ipynb",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/ner_mtl.ipynb
================================================
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "WfGpInivS0fG"
},
"source": [
"点击下列图标在线运行HanLP \n",
"\n",
"\t
\n",
"\t
\n",
"
"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "IYwV-UkNNzFp"
},
"source": [
"## 安装"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "1Uf_u7ddMhUt",
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"无论是Windows、Linux还是macOS,HanLP的安装只需一句话搞定:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "pp-1KqEOOJ4t",
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"!pip install hanlp -U"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "0tmKBu7sNAXX",
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"## 加载模型\n",
"HanLP的工作流程是先加载模型,模型的标示符存储在`hanlp.pretrained`这个包中,按照NLP任务归类。"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "EmZDmLn9aGxG",
"outputId": "38469cbe-d56c-4648-b103-b67e6d22aeff",
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/plain": [
"{'OPEN_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH': 'https://file.hankcs.com/hanlp/mtl/open_tok_pos_ner_srl_dep_sdp_con_electra_small_20201223_035557.zip',\n",
" 'OPEN_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH': 'https://file.hankcs.com/hanlp/mtl/open_tok_pos_ner_srl_dep_sdp_con_electra_base_20201223_201906.zip',\n",
" 'CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH': 'https://file.hankcs.com/hanlp/mtl/close_tok_pos_ner_srl_dep_sdp_con_electra_small_20210111_124159.zip',\n",
" 'CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH': 'https://file.hankcs.com/hanlp/mtl/close_tok_pos_ner_srl_dep_sdp_con_electra_base_20210111_124519.zip',\n",
" 'CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ERNIE_GRAM_ZH': 'https://file.hankcs.com/hanlp/mtl/close_tok_pos_ner_srl_dep_sdp_con_ernie_gram_base_aug_20210904_145403.zip',\n",
" 'UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_MT5_SMALL': 'https://file.hankcs.com/hanlp/mtl/ud_ontonotes_tok_pos_lem_fea_ner_srl_dep_sdp_con_mt5_small_20210228_123458.zip',\n",
" 'UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_XLMR_BASE': 'https://file.hankcs.com/hanlp/mtl/ud_ontonotes_tok_pos_lem_fea_ner_srl_dep_sdp_con_xlm_base_20210602_211620.zip',\n",
" 'NPCMJ_UD_KYOTO_TOK_POS_CON_BERT_BASE_CHAR_JA': 'https://file.hankcs.com/hanlp/mtl/npcmj_ud_kyoto_tok_pos_ner_dep_con_srl_bert_base_char_ja_20210914_133742.zip'}"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import hanlp\n",
"hanlp.pretrained.mtl.ALL # MTL多任务,具体任务见模型名称,语种见名称最后一个字段或相应语料库"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "w0lm87NUsMwW"
},
"source": [
"调用`hanlp.load`进行加载,模型会自动下载到本地缓存。自然语言处理分为许多任务,分词只是最初级的一个。与其每个任务单独创建一个模型,不如利用HanLP的联合模型一次性完成多个任务:"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"id": "6Evnxsa0sMwW",
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"HanLP = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "bPUHdNJ-sMwW"
},
"source": [
"## 命名实体识别"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "wxctCigrTKu-"
},
"source": [
"同时执行所有标准的命名实体识别:"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Zo08uquCTFSk",
"outputId": "21be671b-ead0-43c9-cc3a-32c305d8be29"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{\n",
" \"tok/fine\": [\n",
" [\"2021年\", \"HanLPv2.1\", \"为\", \"生产\", \"环境\", \"带来\", \"次\", \"世代\", \"最\", \"先进\", \"的\", \"多\", \"语种\", \"NLP\", \"技术\", \"。\"],\n",
" [\"阿婆主\", \"来到\", \"北京\", \"立方庭\", \"参观\", \"自然\", \"语义\", \"科技\", \"公司\", \"。\"]\n",
" ],\n",
" \"ner/msra\": [\n",
" [[\"2021年\", \"DATE\", 0, 1], [\"HanLPv2.1\", \"WWW\", 1, 2]],\n",
" [[\"北京\", \"LOCATION\", 2, 3], [\"立方庭\", \"LOCATION\", 3, 4], [\"自然语义科技公司\", \"ORGANIZATION\", 5, 9]]\n",
" ],\n",
" \"ner/pku\": [\n",
" [],\n",
" [[\"北京立方庭\", \"ns\", 2, 4], [\"自然语义科技公司\", \"nt\", 5, 9]]\n",
" ],\n",
" \"ner/ontonotes\": [\n",
" [[\"2021年\", \"DATE\", 0, 1], [\"HanLPv2.1\", \"ORG\", 1, 2]],\n",
" [[\"北京立方庭\", \"FAC\", 2, 4], [\"自然语义科技公司\", \"ORG\", 5, 9]]\n",
" ]\n",
"}\n"
]
}
],
"source": [
"print(HanLP(['2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', '阿婆主来到北京立方庭参观自然语义科技公司。'], tasks='ner*'))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"每个四元组表示`[命名实体, 类型标签, 起始下标, 终止下标]`,下标指的是命名实体在单词数组中的下标,单词数组默认为第一个以`tok`开头的数组。"
]
},
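{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a minimal sketch (assuming the `doc` structure shown above), each span can be checked against the token array:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Iterate the quadruples and reconstruct each entity from its token span.\n",
"doc = HanLP(['阿婆主来到北京立方庭参观自然语义科技公司。'], tasks='ner/msra')\n",
"for tokens, entities in zip(doc['tok/fine'], doc['ner/msra']):\n",
"    for entity, label, start, end in entities:\n",
"        assert ''.join(tokens[start:end]) == entity  # [start, end) indexes the token array\n",
"        print(entity, label, (start, end))"
]
},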
{
"cell_type": "markdown",
"metadata": {
"id": "cqEWnj_7p2Lf"
},
"source": [
"任务越少,速度越快。如指定仅执行命名实体识别,默认MSRA标准:"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 572
},
"id": "BqEmDMGGOtk3",
"outputId": "33790ca9-7013-456f-c1cb-e5ddce90a457"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Token \tNER Type \n",
"─────────\t────────────────\n",
"2021年 \t───►DATE \n",
"HanLPv2.1\t───►WWW \n",
"为 \t \n",
"生产 \t \n",
"环境 \t \n",
"带来 \t \n",
"次世代 \t───►DATE \n",
"最 \t \n",
"先进 \t \n",
"的 \t \n",
"多 \t \n",
"语种 \t \n",
"NLP \t \n",
"技术 \t \n",
"。 \t \n",
"阿婆主 \t \n",
"来到 \t \n",
"北京 \t◄─┐ \n",
"立方庭 \t◄─┴►ORGANIZATION\n",
"参观 \t \n",
"自然 \t◄─┐ \n",
"语义 \t │ \n",
"科技 \t ├►ORGANIZATION\n",
"公司 \t◄─┘ \n",
"。 \t \n"
]
}
],
"source": [
"HanLP('2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。阿婆主来到北京立方庭参观自然语义科技公司。', tasks='ner').pretty_print()"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "jj1Jk-2sPHYx"
},
"source": [
"执行OntoNotes命名实体识别:"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 572
},
"id": "1goEC7znPNkI",
"outputId": "2a97331c-a5fb-4d3c-ccf2-ce2186616c57",
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Token \tNER Type\n",
"─────────\t────────\n",
"2021年 \t───►DATE\n",
"HanLPv2.1\t───►ORG \n",
"为 \t \n",
"生产 \t \n",
"环境 \t \n",
"带来 \t \n",
"次世代 \t \n",
"最 \t \n",
"先进 \t \n",
"的 \t \n",
"多 \t \n",
"语种 \t \n",
"NLP \t \n",
"技术 \t \n",
"。 \t \n",
"阿婆主 \t \n",
"来到 \t \n",
"北京 \t◄─┐ \n",
"立方庭 \t◄─┴►ORG \n",
"参观 \t \n",
"自然 \t◄─┐ \n",
"语义 \t │ \n",
"科技 \t ├►ORG \n",
"公司 \t◄─┘ \n",
"。 \t \n"
]
}
],
"source": [
"HanLP('2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。阿婆主来到北京立方庭参观自然语义科技公司。', tasks='ner/ontonotes').pretty_print()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 注意\n",
"Native API的输入单位限定为句子,需使用[多语种分句模型](https://github.com/hankcs/HanLP/blob/master/plugins/hanlp_demo/hanlp_demo/sent_split.py)或[基于规则的分句函数](https://github.com/hankcs/HanLP/blob/master/hanlp/utils/rules.py#L19)先行分句。RESTful同时支持全文、句子、已分词的句子。除此之外,RESTful和native两种API的语义设计完全一致,用户可以无缝互换。"
]
},
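{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch of pre-splitting a document with the rule-based helper, assuming `split_sentence` is importable from `hanlp.utils.rules` (the module linked above):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from hanlp.utils.rules import split_sentence  # assumption: the rule-based splitter linked above\n",
"\n",
"text = '2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。阿婆主来到北京立方庭参观自然语义科技公司。'\n",
"sents = list(split_sentence(text))\n",
"HanLP(sents, tasks='ner/msra')"
]
},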
{
"cell_type": "markdown",
"metadata": {
"id": "P7CNTDBRsiYa"
},
"source": [
"## 自定义词典"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "ZXtRTXlBsmtw"
},
"source": [
"自定义词典是NER任务的成员变量,要操作自定义词典,先获取一个NER任务。以MSRA为例:"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"id": "QgY22h0AszsA"
},
"outputs": [],
"source": [
"ner = HanLP['ner/msra']"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "_6fPzuyps98H"
},
"source": [
"### 白名单词典\n",
"白名单词典中的词语会尽量被输出。当然,HanLP以统计为主,词典的优先级很低。"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 321
},
"id": "plNDyWhws5qg",
"outputId": "7120d400-022c-42e9-fca9-febe3745d2c9"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Token\tNER Type \n",
"─────\t───────────\n",
"2021年\t───►DATE \n",
"测试 \t \n",
"高血压 \t \n",
"是 \t \n",
"138 \t───►INTEGER\n",
", \t \n",
"时间 \t \n",
"是 \t \n",
"午饭 \t◄─┐ \n",
"后 \t◄─┴►TIME \n",
"2点45 \t───►TIME \n",
", \t \n",
"低血压 \t \n",
"是 \t \n",
"44 \t───►INTEGER\n"
]
}
],
"source": [
"ner.dict_whitelist = {'午饭后': 'TIME'}\n",
"doc = HanLP('2021年测试高血压是138,时间是午饭后2点45,低血压是44', tasks='ner/msra')\n",
"doc.pretty_print()"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "aR_8TICmtw_E"
},
"source": [
"### 强制词典\n",
"如果你读过[《自然语言处理入门》](http://nlp.hankcs.com/book.php),你就会理解BMESO标注集,于是你可以直接干预统计模型预测的标签,拿到最高优先级的权限。"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 268
},
"id": "sWPljj3stsEA",
"outputId": "99c4c281-a5b6-46bb-dffd-c1722fee7aee"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"To\tNER Type \n",
"──\t────────────\n",
"他 \t \n",
"在 \t \n",
"浙江\t───►LOCATION\n",
"金华\t───►LOCATION\n",
"出生\t \n",
", \t \n",
"他 \t \n",
"的 \t \n",
"名字\t \n",
"叫 \t \n",
"金华\t───►PERSON \n",
"。 \t \n"
]
}
],
"source": [
"ner.dict_tags = {('名字', '叫', '金华'): ('O', 'O', 'S-PERSON')}\n",
"HanLP('他在浙江金华出生,他的名字叫金华。', tasks='ner/msra').pretty_print()"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "fkTC0GFxtinZ"
},
"source": [
"### 黑名单词典\n",
"黑名单中的词语绝对不会被当做命名实体。"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 268
},
"id": "bIJpgdGauLJK",
"outputId": "e74ec7ba-00fd-4958-d772-a1d1c40d1033"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"To\tNER Type \n",
"──\t────────────\n",
"他 \t \n",
"在 \t \n",
"浙江\t───►LOCATION\n",
"金华\t \n",
"出生\t \n",
", \t \n",
"他 \t \n",
"的 \t \n",
"名字\t \n",
"叫 \t \n",
"金华\t \n",
"。 \t \n"
]
}
],
"source": [
"ner.dict_blacklist = {'金华'}\n",
"HanLP('他在浙江金华出生,他的名字叫金华。', tasks='ner/msra').pretty_print()"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"collapsed_sections": [],
"name": "ner_mtl.ipynb",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/ner_restful.ipynb
================================================
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "WfGpInivS0fG"
},
"source": [
"点击下列图标在线运行HanLP \n",
"\n",
"\t
\n",
"\t
\n",
"
\n",
"\n",
"## 安装"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "IYwV-UkNNzFp"
},
"source": [
"无论是Windows、Linux还是macOS,HanLP的安装只需一句话搞定:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "1Uf_u7ddMhUt"
},
"outputs": [],
"source": [
"pip install hanlp_restful -U"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "pp-1KqEOOJ4t"
},
"source": [
"## 创建客户端"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"id": "0tmKBu7sNAXX"
},
"outputs": [],
"source": [
"from hanlp_restful import HanLPClient\n",
"HanLP = HanLPClient('https://www.hanlp.com/api', auth=None, language='zh') # auth不填则匿名,zh中文,mul多语种"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "EmZDmLn9aGxG"
},
"source": [
"#### 申请秘钥\n",
"由于服务器算力有限,匿名用户每分钟限2次调用。如果你需要更多调用次数,[建议申请免费公益API秘钥auth](https://bbs.hanlp.com/t/hanlp2-1-restful-api/53)。"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "elA_UyssOut_"
},
"source": [
"## 命名实体识别"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "wxctCigrTKu-"
},
"source": [
"同时执行所有标准的命名实体识别:"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Zo08uquCTFSk",
"outputId": "21be671b-ead0-43c9-cc3a-32c305d8be29"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{\n",
" \"tok/fine\": [\n",
" [\"2021年\", \"HanLPv2.1\", \"为\", \"生产\", \"环境\", \"带来\", \"次\", \"世代\", \"最\", \"先进\", \"的\", \"多\", \"语种\", \"NLP\", \"技术\", \"。\"],\n",
" [\"阿婆主\", \"来到\", \"北京\", \"立方庭\", \"参观\", \"自然\", \"语义\", \"科技\", \"公司\", \"。\"]\n",
" ],\n",
" \"ner/msra\": [\n",
" [[\"2021年\", \"DATE\", 0, 1], [\"HanLPv2.1\", \"ORGANIZATION\", 1, 2]],\n",
" [[\"北京立方庭\", \"LOCATION\", 2, 4], [\"自然语义科技公司\", \"ORGANIZATION\", 5, 9]]\n",
" ],\n",
" \"ner/pku\": [\n",
" [],\n",
" [[\"北京\", \"ns\", 2, 3], [\"立方庭\", \"ns\", 3, 4], [\"自然语义科技公司\", \"nt\", 5, 9]]\n",
" ],\n",
" \"ner/ontonotes\": [\n",
" [[\"2021年\", \"DATE\", 0, 1], [\"次世代\", \"DATE\", 6, 8]],\n",
" [[\"北京\", \"FAC\", 2, 3], [\"立方庭\", \"LOC\", 3, 4], [\"自然语义科技公司\", \"ORG\", 5, 9]]\n",
" ]\n",
"}\n"
]
}
],
"source": [
"print(HanLP('2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。阿婆主来到北京立方庭参观自然语义科技公司。', tasks='ner*'))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"每个四元组表示`[命名实体, 类型标签, 起始下标, 终止下标]`,下标指的是命名实体在单词数组中的下标,单词数组默认为第一个以`tok`开头的数组。"
]
},
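{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick sketch, the quadruples can be flattened into `(entity, label)` pairs per sentence:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Keep only the surface form and label of each recognized entity.\n",
"doc = HanLP('阿婆主来到北京立方庭参观自然语义科技公司。', tasks='ner/msra')\n",
"print([[(entity, label) for entity, label, start, end in sent] for sent in doc['ner/msra']])"
]
},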
{
"cell_type": "markdown",
"metadata": {
"id": "cqEWnj_7p2Lf"
},
"source": [
"任务越少,速度越快。如指定仅执行命名实体识别,默认MSRA标准:"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 572
},
"id": "BqEmDMGGOtk3",
"outputId": "33790ca9-7013-456f-c1cb-e5ddce90a457"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Token \tNER Type \n",
"─────────\t────────────────\n",
"2021年 \t───►DATE \n",
"HanLPv2.1\t───►ORGANIZATION\n",
"为 \t \n",
"生产 \t \n",
"环境 \t \n",
"带来 \t \n",
"次 \t \n",
"世代 \t \n",
"最 \t \n",
"先进 \t \n",
"的 \t \n",
"多 \t \n",
"语种 \t \n",
"NLP \t \n",
"技术 \t \n",
"。 \t \n",
"\n",
"Tok\tNER Type \n",
"───\t────────────────\n",
"阿婆主\t \n",
"来到 \t \n",
"北京 \t◄─┐ \n",
"立方庭\t◄─┴►LOCATION \n",
"参观 \t \n",
"自然 \t◄─┐ \n",
"语义 \t │ \n",
"科技 \t ├►ORGANIZATION\n",
"公司 \t◄─┘ \n",
"。 \t \n"
]
}
],
"source": [
"HanLP('2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。阿婆主来到北京立方庭参观自然语义科技公司。', tasks='ner').pretty_print()"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "jj1Jk-2sPHYx"
},
"source": [
"执行OntoNotes命名实体识别:"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 572
},
"id": "1goEC7znPNkI",
"outputId": "2a97331c-a5fb-4d3c-ccf2-ce2186616c57"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Token \tNER Type\n",
"─────────\t────────\n",
"2021年 \t───►DATE\n",
"HanLPv2.1\t \n",
"为 \t \n",
"生产 \t \n",
"环境 \t \n",
"带来 \t \n",
"次 \t◄─┐ \n",
"世代 \t◄─┴►DATE\n",
"最 \t \n",
"先进 \t \n",
"的 \t \n",
"多 \t \n",
"语种 \t \n",
"NLP \t \n",
"技术 \t \n",
"。 \t \n",
"\n",
"Tok\tNER Typ\n",
"───\t───────\n",
"阿婆主\t \n",
"来到 \t \n",
"北京 \t───►FAC\n",
"立方庭\t───►LOC\n",
"参观 \t \n",
"自然 \t◄─┐ \n",
"语义 \t │ \n",
"科技 \t ├►ORG\n",
"公司 \t◄─┘ \n",
"。 \t \n"
]
}
],
"source": [
"HanLP('2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。阿婆主来到北京立方庭参观自然语义科技公司。', tasks='ner/ontonotes').pretty_print()"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "XOsWkOqQfzlr"
},
"source": [
"为已分词的句子执行命名实体识别:"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 161
},
"id": "bLZSTbv_f3OA",
"outputId": "6a0e1e76-f581-4fd1-8a78-ef97d9429e87"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Token \tNER Type \n",
"────────\t────────────────\n",
"阿婆主 \t \n",
"来到 \t \n",
"北京立方庭 \t───►LOCATION \n",
"参观 \t \n",
"自然语义科技公司\t───►ORGANIZATION\n",
"。 \t \n"
]
}
],
"source": [
"HanLP(tokens=[[\"阿婆主\", \"来到\", \"北京立方庭\", \"参观\", \"自然语义科技公司\", \"。\"]], tasks='ner').pretty_print()"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"collapsed_sections": [],
"name": "ner_restful.ipynb",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/ner_stl.ipynb
================================================
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "WfGpInivS0fG"
},
"source": [
"点击下列图标在线运行HanLP \n",
"\n",
"\t
\n",
"\t
\n",
"
"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "IYwV-UkNNzFp"
},
"source": [
"## 安装"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "1Uf_u7ddMhUt",
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"无论是Windows、Linux还是macOS,HanLP的安装只需一句话搞定:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "pp-1KqEOOJ4t",
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"!pip install hanlp -U"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "0tmKBu7sNAXX",
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"## 加载模型\n",
"HanLP的工作流程是先加载模型,模型的标示符存储在`hanlp.pretrained`这个包中,按照NLP任务归类。"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "EmZDmLn9aGxG",
"outputId": "0d55f7a1-3a4c-4170-e60f-da7473208e3f",
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/plain": [
"{'MSRA_NER_BERT_BASE_ZH': 'https://file.hankcs.com/hanlp/ner/ner_bert_base_msra_20211227_114712.zip',\n",
" 'MSRA_NER_ALBERT_BASE_ZH': 'https://file.hankcs.com/hanlp/ner/msra_ner_albert_base_20211228_173323.zip',\n",
" 'MSRA_NER_ELECTRA_SMALL_ZH': 'https://file.hankcs.com/hanlp/ner/msra_ner_electra_small_20210807_154832.zip',\n",
" 'CONLL03_NER_BERT_BASE_CASED_EN': 'https://file.hankcs.com/hanlp/ner/ner_conll03_bert_base_cased_en_20211227_121443.zip'}"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import hanlp\n",
"hanlp.pretrained.ner.ALL # 语种见名称最后一个字段或相应语料库"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "VDT-qmLyvDST"
},
"source": [
"调用`hanlp.load`进行加载,模型会自动下载到本地缓存。"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"id": "Tzu5Qi-xvDST",
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"ner = hanlp.load(hanlp.pretrained.ner.MSRA_NER_ELECTRA_SMALL_ZH)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "elA_UyssOut_"
},
"source": [
"## 命名实体识别"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "wxctCigrTKu-"
},
"source": [
"命名实体识别任务的输入为已分词的句子:"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Zo08uquCTFSk",
"outputId": "864da076-7113-4685-e27a-1856e69bdd2a"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[('2021年', 'DATE', 0, 1)], [('北京', 'LOCATION', 2, 3), ('立方庭', 'LOCATION', 3, 4), ('自然语义科技公司', 'ORGANIZATION', 5, 9)]]\n"
]
}
],
"source": [
"print(ner([[\"2021年\", \"HanLPv2.1\", \"为\", \"生产\", \"环境\", \"带来\", \"次\", \"世代\", \"最\", \"先进\", \"的\", \"多\", \"语种\", \"NLP\", \"技术\", \"。\"], [\"阿婆主\", \"来到\", \"北京\", \"立方庭\", \"参观\", \"自然\", \"语义\", \"科技\", \"公司\", \"。\"]], tasks='ner*'))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"每个四元组表示`[命名实体, 类型标签, 起始下标, 终止下标]`,下标指的是命名实体在单词数组中的下标。"
]
},
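{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch of recovering each entity's surface form from the input tokens via its span:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"tokens = ['阿婆主', '来到', '北京', '立方庭', '参观', '自然', '语义', '科技', '公司', '。']\n",
"for entity, label, start, end in ner([tokens])[0]:\n",
"    assert ''.join(tokens[start:end]) == entity  # the span indexes the input token list\n",
"    print(entity, label)"
]
},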
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 自定义词典"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"自定义词典是NER任务的成员变量:"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"None\n"
]
}
],
"source": [
"print(ner.dict_whitelist)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 白名单词典\n",
"白名单词典中的词语会尽量被输出。当然,HanLP以统计为主,词典的优先级很低。"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('2021年', 'DATE', 0, 1),\n",
" ('138', 'INTEGER', 4, 5),\n",
" ('午饭后', 'TIME', 8, 10),\n",
" ('2点45', 'TIME', 10, 11),\n",
" ('44', 'INTEGER', 14, 15)]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ner.dict_whitelist = {'午饭后': 'TIME'}\n",
"ner(['2021年', '测试', '高血压', '是', '138', ',', '时间', '是', '午饭', '后', '2点45', ',', '低血压', '是', '44'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 强制词典\n",
"如果你读过[《自然语言处理入门》](http://nlp.hankcs.com/book.php),你就会理解BMESO标注集,于是你可以直接干预统计模型预测的标签,拿到最高优先级的权限。"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('浙江', 'LOCATION', 2, 3), ('金华', 'LOCATION', 3, 4), ('金华', 'PERSON', 10, 11)]"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ner.dict_tags = {('名字', '叫', '金华'): ('O', 'O', 'S-PERSON')}\n",
"ner(['他', '在', '浙江', '金华', '出生', ',', '他', '的', '名字', '叫', '金华', '。'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 黑名单词典\n",
"黑名单中的词语绝对不会被当做命名实体。"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('浙江', 'LOCATION', 2, 3)]"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ner.dict_blacklist = {'金华'}\n",
"ner(['他', '在', '浙江', '金华', '出生', ',', '他', '的', '名字', '叫', '金华', '。'])"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"collapsed_sections": [],
"name": "ner_stl.ipynb",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/pos_mtl.ipynb
================================================
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "view-in-github"
},
"source": [
"点击下列图标在线运行HanLP \n",
"\n",
"\t
\n",
"\t
\n",
"
"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "WfGpInivS0fG"
},
"source": [
"## 安装"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "IYwV-UkNNzFp"
},
"source": [
"无论是Windows、Linux还是macOS,HanLP的安装只需一句话搞定:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "1Uf_u7ddMhUt"
},
"outputs": [],
"source": [
"!pip install hanlp -U"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "pp-1KqEOOJ4t"
},
"source": [
"## 加载模型\n",
"HanLP的工作流程是先加载模型,模型的标示符存储在`hanlp.pretrained`这个包中,按照NLP任务归类。"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "4M7ka0K5OMWU",
"outputId": "50ad002e-4363-46cd-8f5d-b6d6aad3e957"
},
"outputs": [
{
"data": {
"text/plain": [
"{'OPEN_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH': 'https://file.hankcs.com/hanlp/mtl/open_tok_pos_ner_srl_dep_sdp_con_electra_small_20201223_035557.zip',\n",
" 'OPEN_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH': 'https://file.hankcs.com/hanlp/mtl/open_tok_pos_ner_srl_dep_sdp_con_electra_base_20201223_201906.zip',\n",
" 'CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH': 'https://file.hankcs.com/hanlp/mtl/close_tok_pos_ner_srl_dep_sdp_con_electra_small_20210111_124159.zip',\n",
" 'CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH': 'https://file.hankcs.com/hanlp/mtl/close_tok_pos_ner_srl_dep_sdp_con_electra_base_20210111_124519.zip',\n",
" 'CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ERNIE_GRAM_ZH': 'https://file.hankcs.com/hanlp/mtl/close_tok_pos_ner_srl_dep_sdp_con_ernie_gram_base_aug_20210904_145403.zip',\n",
" 'UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_MT5_SMALL': 'https://file.hankcs.com/hanlp/mtl/ud_ontonotes_tok_pos_lem_fea_ner_srl_dep_sdp_con_mt5_small_20210228_123458.zip',\n",
" 'UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_XLMR_BASE': 'https://file.hankcs.com/hanlp/mtl/ud_ontonotes_tok_pos_lem_fea_ner_srl_dep_sdp_con_xlm_base_20210602_211620.zip',\n",
" 'NPCMJ_UD_KYOTO_TOK_POS_CON_BERT_BASE_CHAR_JA': 'https://file.hankcs.com/hanlp/mtl/npcmj_ud_kyoto_tok_pos_ner_dep_con_srl_bert_base_char_ja_20210914_133742.zip'}"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import hanlp\n",
"hanlp.pretrained.mtl.ALL # MTL多任务,具体任务见模型名称,语种见名称最后一个字段或相应语料库"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "BMW528wGNulM"
},
"source": [
"调用`hanlp.load`进行加载,模型会自动下载到本地缓存。自然语言处理分为许多任务,分词只是最初级的一个。与其每个任务单独创建一个模型,不如利用HanLP的联合模型一次性完成多个任务:"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"id": "0tmKBu7sNAXX"
},
"outputs": [],
"source": [
"HanLP = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "elA_UyssOut_"
},
"source": [
"## 词性标注\n",
"任务越少,速度越快。如指定仅执行词性标注,默认CTB标准:"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 70
},
"id": "BqEmDMGGOtk3",
"outputId": "5ad7fd22-651a-4403-d897-a9492eb15854"
},
"outputs": [
{
"data": {
"text/html": [
"HanLP/NR 为/P 生产/NN 环境/NN 带来/VV 次/JJ 世代/NN 最/AD 先进/JJ 的/DEG 多语种/NN NLP/NR 技术/NN 。/PU 我/PN 的/DEG 希望/NN 是/VC 希望/VV 张晚霞/NR 的/DEG 背影/NN 被/LB 晚霞/NN 映红/VV 。/PU "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"HanLP(['HanLP为生产环境带来次世代最先进的多语种NLP技术。', '我的希望是希望张晚霞的背影被晚霞映红。'], tasks='pos').pretty_print()"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "jj1Jk-2sPHYx"
},
"source": [
"注意上面两个“希望”的词性各不相同,一个是名词另一个是动词。\n",
"执行PKU词性标注:"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 70
},
"id": "1goEC7znPNkI",
"outputId": "586afd5d-db0d-41bd-f7de-411f37062a8c"
},
"outputs": [
{
"data": {
"text/html": [
"HanLP/nx 为/p 生产/vn 环境/n 带来/v 次/b 世代/n 最/d 先进/a 的/u 多语种/n NLP/nx 技术/n 。/w 我/r 的/u 希望/n 是/v 希望/v 张晚霞/nr 的/u 背影/n 被/p 晚霞/n 映红/v 。/w "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"HanLP(['HanLP为生产环境带来次世代最先进的多语种NLP技术。', '我的希望是希望张晚霞的背影被晚霞映红。'], tasks='pos/pku').pretty_print()\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "wxctCigrTKu-"
},
"source": [
"同时执行所有标准的词性标注:"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Zo08uquCTFSk",
"outputId": "d2b3eb65-06e6-47a6-d954-04cae27d6c51"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{\n",
" \"tok/fine\": [\n",
" [\"HanLP\", \"为\", \"生产\", \"环境\", \"带来\", \"次\", \"世代\", \"最\", \"先进\", \"的\", \"多语种\", \"NLP\", \"技术\", \"。\"],\n",
" [\"我\", \"的\", \"希望\", \"是\", \"希望\", \"张晚霞\", \"的\", \"背影\", \"被\", \"晚霞\", \"映红\", \"。\"]\n",
" ],\n",
" \"pos/ctb\": [\n",
" [\"NR\", \"P\", \"NN\", \"NN\", \"VV\", \"JJ\", \"NN\", \"AD\", \"JJ\", \"DEG\", \"NN\", \"NR\", \"NN\", \"PU\"],\n",
" [\"PN\", \"DEG\", \"NN\", \"VC\", \"VV\", \"NR\", \"DEG\", \"NN\", \"LB\", \"NN\", \"VV\", \"PU\"]\n",
" ],\n",
" \"pos/pku\": [\n",
" [\"nx\", \"p\", \"vn\", \"n\", \"v\", \"b\", \"n\", \"d\", \"a\", \"u\", \"n\", \"nx\", \"n\", \"w\"],\n",
" [\"r\", \"u\", \"n\", \"v\", \"v\", \"nr\", \"u\", \"n\", \"p\", \"n\", \"v\", \"w\"]\n",
" ],\n",
" \"pos/863\": [\n",
" [\"w\", \"p\", \"v\", \"n\", \"v\", \"a\", \"nt\", \"d\", \"a\", \"u\", \"n\", \"ws\", \"n\", \"w\"],\n",
" [\"r\", \"u\", \"n\", \"vl\", \"v\", \"nh\", \"u\", \"n\", \"p\", \"n\", \"v\", \"w\"]\n",
" ]\n",
"}\n"
]
}
],
"source": [
"print(HanLP(['HanLP为生产环境带来次世代最先进的多语种NLP技术。', '我的希望是希望张晚霞的背影被晚霞映红。'], tasks='pos*'))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"以`pos`开头的字段为词性,以`tok`开头的第一个数组为单词,两者按下标一一对应。"
]
},
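{
"cell_type": "markdown",
"metadata": {},
"source": [
"For example, a minimal sketch that pairs each token with its CTB tag by index:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"doc = HanLP(['我的希望是希望张晚霞的背影被晚霞映红。'], tasks='pos/ctb')\n",
"for tokens, tags in zip(doc['tok/fine'], doc['pos/ctb']):\n",
"    print(list(zip(tokens, tags)))"
]
},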
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 注意\n",
"Native API的输入单位限定为句子,需使用[多语种分句模型](https://github.com/hankcs/HanLP/blob/master/plugins/hanlp_demo/hanlp_demo/sent_split.py)或[基于规则的分句函数](https://github.com/hankcs/HanLP/blob/master/hanlp/utils/rules.py#L19)先行分句。RESTful同时支持全文、句子、已分词的句子。除此之外,RESTful和native两种API的语义设计完全一致,用户可以无缝互换。"
]
},
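{
"cell_type": "markdown",
"metadata": {},
"source": [
"A sketch of pre-splitting a document for the native API, assuming the rule-based `split_sentence` helper linked above:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from hanlp.utils.rules import split_sentence  # assumption: the rule-based splitter linked above\n",
"\n",
"HanLP(list(split_sentence('HanLP为生产环境带来次世代最先进的多语种NLP技术。我的希望是希望张晚霞的背影被晚霞映红。')), tasks='pos').pretty_print()"
]
},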
{
"cell_type": "markdown",
"metadata": {
"id": "suUL042zPpLj"
},
"source": [
"## 自定义词典\n",
"自定义词典为词性标注任务的成员变量,要操作自定义词典,先获取一个词性标注任务,以CTB标准为例:"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "AzYShIssP6kq",
"outputId": "640cefa5-1d6d-464b-81d2-83c66e2081f2"
},
"outputs": [
{
"data": {
"text/plain": [
""
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pos = HanLP['pos/ctb']\n",
"pos"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "1q4MUpgVQNlu"
},
"source": [
"自定义单个词性:"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 35
},
"id": "2zZkH9tRQOoi",
"outputId": "ed0bb8fe-2e68-4c58-e11e-ff6a0cc69ae4"
},
"outputs": [
{
"data": {
"text/html": [
"HanLP/state-of-the-art-tool 为/P 生产/NN 环境/NN 带来/VV 次/JJ 世代/NN 最/AD 先进/JJ 的/DEG 多语种/NN NLP/NR 技术/NN 。/PU "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"pos.dict_tags = {'HanLP': 'state-of-the-art-tool'}\n",
"HanLP(\"HanLP为生产环境带来次世代最先进的多语种NLP技术。\", tasks='pos/ctb').pretty_print()"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "F-9gAeIVQUFG"
},
"source": [
"根据上下文自定义词性:"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 35
},
"id": "F8M8cyBrQduw",
"outputId": "16ef7f82-50ff-478f-c3ea-8e768b0cea31"
},
"outputs": [
{
"data": {
"text/html": [
"我/PN 的/补语成分 希望/名词 是/VC 希望/动词 张晚霞/NR 的/DEG 背影/NN 被/LB 晚霞/NN 映红/VV 。/PU "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"pos.dict_tags = {('的', '希望'): ('补语成分', '名词'), '希望': '动词'}\n",
"HanLP(\"我的希望是希望张晚霞的背影被晚霞映红。\", tasks='pos/ctb').pretty_print()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"需要算法基础才能理解,初学者可参考[《自然语言处理入门》](http://nlp.hankcs.com/book.php)。"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"collapsed_sections": [],
"include_colab_link": true,
"name": "pos_mtl.ipynb",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/pos_restful.ipynb
================================================
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "WfGpInivS0fG"
},
"source": [
"点击下列图标在线运行HanLP \n",
"\n",
"\t
\n",
"\t
\n",
"
\n",
"\n",
"## 安装"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "IYwV-UkNNzFp"
},
"source": [
"无论是Windows、Linux还是macOS,HanLP的安装只需一句话搞定:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "1Uf_u7ddMhUt"
},
"outputs": [],
"source": [
"pip install hanlp_restful -U"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "pp-1KqEOOJ4t"
},
"source": [
"## 创建客户端"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"id": "0tmKBu7sNAXX"
},
"outputs": [],
"source": [
"from hanlp_restful import HanLPClient\n",
"HanLP = HanLPClient('https://www.hanlp.com/api', auth=None, language='zh') # auth不填则匿名,zh中文,mul多语种"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "EmZDmLn9aGxG"
},
"source": [
"#### 申请秘钥\n",
"由于服务器算力有限,匿名用户每分钟限2次调用。如果你需要更多调用次数,[建议申请免费公益API秘钥auth](https://bbs.hanlp.com/t/hanlp2-1-restful-api/53)。"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "elA_UyssOut_"
},
"source": [
"## 词性标注\n",
"任务越少,速度越快。如指定仅执行词性标注,默认CTB标准:"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 70
},
"id": "BqEmDMGGOtk3",
"outputId": "2a0d392f-b99a-4a18-fc7f-754e2abe2e34"
},
"outputs": [
{
"data": {
"text/html": [
"HanLP/NR 为/P 生产/NN 环境/NN 带来/VV 次世代/NN 最/AD 先进/JJ 的/DEG 多/CD 语种/NN NLP/NN 技术/NN 。/PU 我/PN 的/DEG 希望/NN 是/VC 希望/VV 张晚霞/NR 的/DEG 背影/NN 被/LB 晚霞/NN 映红/VV 。/PU "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"HanLP('HanLP为生产环境带来次世代最先进的多语种NLP技术。我的希望是希望张晚霞的背影被晚霞映红。', tasks='pos').pretty_print()"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "jj1Jk-2sPHYx"
},
"source": [
"注意上面两个“希望”的词性各不相同,一个是名词另一个是动词。\n",
"\n",
"### 执行PKU词性标注"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 70
},
"id": "1goEC7znPNkI",
"outputId": "7a3fde55-7577-49eb-92c8-48146aaa89d3"
},
"outputs": [
{
"data": {
"text/html": [
"HanLP/nx 为/p 生产/vn 环境/n 带来/v 次世代/n 最/d 先进/a 的/u 多/a 语种/n NLP/nx 技术/n 。/w 我/r 的/u 希望/n 是/v 希望/v 张晚霞/nr 的/u 背影/n 被/p 晚霞/n 映红/v 。/w "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"HanLP('HanLP为生产环境带来次世代最先进的多语种NLP技术。我的希望是希望张晚霞的背影被晚霞映红。', tasks='pos/pku').pretty_print()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 执行粗颗粒度分词和PKU词性标注"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"阿婆主/n 来到/v 北京立方庭/ns 参观/v 自然语义科技公司/n 。/w "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"HanLP('阿婆主来到北京立方庭参观自然语义科技公司。', tasks=['tok/coarse', 'pos/pku'], skip_tasks='tok/fine').pretty_print()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"举一反三,你可以指定其他pos标注集(ctb、863等)。用户有多聪明,HanLP就有多强大。"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "wxctCigrTKu-"
},
"source": [
"### 同时执行所有标准的词性标注"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Zo08uquCTFSk",
"outputId": "c6077f2d-7084-4f4b-a3bc-9aa9951704ea"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{\n",
" \"tok/fine\": [\n",
" [\"HanLP\", \"为\", \"生产\", \"环境\", \"带来\", \"次世代\", \"最\", \"先进\", \"的\", \"多\", \"语种\", \"NLP\", \"技术\", \"。\"],\n",
" [\"我\", \"的\", \"希望\", \"是\", \"希望\", \"张晚霞\", \"的\", \"背影\", \"被\", \"晚霞\", \"映红\", \"。\"]\n",
" ],\n",
" \"pos/ctb\": [\n",
" [\"NR\", \"P\", \"NN\", \"NN\", \"VV\", \"NN\", \"AD\", \"JJ\", \"DEG\", \"CD\", \"NN\", \"NN\", \"NN\", \"PU\"],\n",
" [\"PN\", \"DEG\", \"NN\", \"VC\", \"VV\", \"NR\", \"DEG\", \"NN\", \"LB\", \"NN\", \"VV\", \"PU\"]\n",
" ],\n",
" \"pos/pku\": [\n",
" [\"nx\", \"p\", \"vn\", \"n\", \"v\", \"n\", \"d\", \"a\", \"u\", \"a\", \"n\", \"nx\", \"n\", \"w\"],\n",
" [\"r\", \"u\", \"n\", \"v\", \"v\", \"nr\", \"u\", \"n\", \"p\", \"n\", \"v\", \"w\"]\n",
" ],\n",
" \"pos/863\": [\n",
" [\"w\", \"p\", \"v\", \"n\", \"v\", \"n\", \"d\", \"a\", \"u\", \"a\", \"n\", \"w\", \"n\", \"w\"],\n",
" [\"r\", \"u\", \"v\", \"vl\", \"v\", \"nh\", \"u\", \"n\", \"p\", \"n\", \"v\", \"w\"]\n",
" ]\n",
"}\n"
]
}
],
"source": [
"print(HanLP('HanLP为生产环境带来次世代最先进的多语种NLP技术。我的希望是希望张晚霞的背影被晚霞映红。', tasks='pos*'))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"以`pos`开头的字段为词性,以`tok`开头的第一个数组为单词,两者按下标一一对应。"
]
},
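{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sketch pairing tokens with their tags by index, assuming the default POS task is exposed under the `pos/ctb` key as in the `pos*` output above:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"doc = HanLP('我的希望是希望张晚霞的背影被晚霞映红。', tasks='pos')\n",
"print([list(zip(toks, tags)) for toks, tags in zip(doc['tok/fine'], doc['pos/ctb'])])"
]
},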
{
"cell_type": "markdown",
"metadata": {
"id": "XOsWkOqQfzlr"
},
"source": [
"### 为已分词的句子执行词性标注"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 70
},
"id": "bLZSTbv_f3OA",
"outputId": "111c0be9-bac6-4eee-d5bd-a972ffc34844"
},
"outputs": [
{
"data": {
"text/html": [
"HanLP/NR 为/P 生产环境/NN 带来/VV 次世代/NN 最/AD 先进/JJ 的/DEG 多语种/NN NLP/NN 技术/NN 。/PU 我/PN 的/DEG 希望/NN 是/VC 希望/VV 张晚霞/NR 的/DEG 背影/NN 被/LB 晚霞/NN 映红/VV 。/PU "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"HanLP(tokens=[\n",
" [\"HanLP\", \"为\", \"生产环境\", \"带来\", \"次世代\", \"最\", \"先进\", \"的\", \"多语种\", \"NLP\", \"技术\", \"。\"],\n",
" [\"我\", \"的\", \"希望\", \"是\", \"希望\", \"张晚霞\", \"的\", \"背影\", \"被\", \"晚霞\", \"映红\", \"。\"]\n",
" ], tasks='pos').pretty_print()"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"collapsed_sections": [],
"name": "pos_restful.ipynb",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.13"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/pos_stl.ipynb
================================================
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "WfGpInivS0fG"
},
"source": [
"点击下列图标在线运行HanLP \n",
"\n",
"\t