Repository: hankcs/HanLP Branch: doc-zh Commit: ddb1299bddff Files: 697 Total size: 3.2 MB Directory structure: gitextract_p7um9exn/ ├── .github/ │ ├── ISSUE_TEMPLATE/ │ │ ├── bug_report.md │ │ ├── config.yml │ │ └── feature_request.md │ ├── pull_request_template.md │ └── workflows/ │ └── unit-tests.yml ├── .gitignore ├── CITATION.cff ├── LICENSE ├── README.md ├── docs/ │ ├── Makefile │ ├── annotations/ │ │ ├── constituency/ │ │ │ ├── ctb.md │ │ │ ├── index.md │ │ │ ├── npcmj.md │ │ │ └── ptb.md │ │ ├── dep/ │ │ │ ├── index.md │ │ │ ├── pmt.md │ │ │ ├── sd_en.md │ │ │ ├── sd_zh.md │ │ │ └── ud.md │ │ ├── index.md │ │ ├── ner/ │ │ │ ├── index.md │ │ │ ├── msra.md │ │ │ ├── ontonotes.md │ │ │ └── pku.md │ │ ├── pos/ │ │ │ ├── 863.md │ │ │ ├── ctb.md │ │ │ ├── index.md │ │ │ ├── npcmj.md │ │ │ ├── pku.md │ │ │ └── ud.md │ │ ├── sdp/ │ │ │ ├── dm.md │ │ │ ├── index.md │ │ │ ├── pas.md │ │ │ ├── psd.md │ │ │ └── semeval16.md │ │ ├── srl/ │ │ │ ├── cpb.md │ │ │ ├── index.md │ │ │ └── propbank.md │ │ └── tok/ │ │ ├── ctb.md │ │ ├── index.md │ │ └── msr.md │ ├── api/ │ │ ├── common/ │ │ │ ├── configurable.rst │ │ │ ├── conll.rst │ │ │ ├── constant.rst │ │ │ ├── document.rst │ │ │ └── index.md │ │ ├── hanlp/ │ │ │ ├── common/ │ │ │ │ ├── component.rst │ │ │ │ ├── dataset.md │ │ │ │ ├── index.md │ │ │ │ ├── structure.md │ │ │ │ ├── torch_component.md │ │ │ │ ├── transform.md │ │ │ │ └── vocab.md │ │ │ ├── components/ │ │ │ │ ├── classifiers.md │ │ │ │ ├── eos.md │ │ │ │ ├── index.md │ │ │ │ ├── lemmatizer.md │ │ │ │ ├── mtl/ │ │ │ │ │ ├── index.md │ │ │ │ │ ├── mtl.md │ │ │ │ │ └── tasks/ │ │ │ │ │ ├── constituency.md │ │ │ │ │ ├── dep.md │ │ │ │ │ ├── index.md │ │ │ │ │ ├── lem.md │ │ │ │ │ ├── ner/ │ │ │ │ │ │ ├── biaffine_ner.md │ │ │ │ │ │ ├── index.md │ │ │ │ │ │ └── tag_ner.md │ │ │ │ │ ├── pos.md │ │ │ │ │ ├── sdp.md │ │ │ │ │ ├── srl/ │ │ │ │ │ │ ├── bio_srl.md │ │ │ │ │ │ ├── index.md │ │ │ │ │ │ └── rank_srl.md │ │ │ │ │ ├── task.md │ │ │ │ │ ├── tok.md │ │ │ │ │ 
└── ud.md │ │ │ │ ├── ner/ │ │ │ │ │ ├── biaffine_ner.md │ │ │ │ │ ├── index.md │ │ │ │ │ ├── rnn_ner.md │ │ │ │ │ └── transformer_ner.md │ │ │ │ ├── parsers/ │ │ │ │ │ ├── biaffine_dep.md │ │ │ │ │ ├── biaffine_sdp.md │ │ │ │ │ ├── crf_constituency_parser.md │ │ │ │ │ ├── index.md │ │ │ │ │ └── ud_parser.md │ │ │ │ ├── pipeline.md │ │ │ │ ├── srl/ │ │ │ │ │ ├── index.md │ │ │ │ │ ├── span_bio.md │ │ │ │ │ └── span_rank.md │ │ │ │ ├── sts.md │ │ │ │ ├── taggers/ │ │ │ │ │ ├── index.md │ │ │ │ │ ├── rnn_tagger.md │ │ │ │ │ └── transformer_tagger.md │ │ │ │ └── tokenizers/ │ │ │ │ ├── index.md │ │ │ │ ├── multi_criteria.md │ │ │ │ └── transformer.md │ │ │ ├── datasets/ │ │ │ │ ├── constituency/ │ │ │ │ │ ├── constituency_dataset.md │ │ │ │ │ ├── index.md │ │ │ │ │ └── resources.md │ │ │ │ ├── dep/ │ │ │ │ │ ├── conll_dataset.md │ │ │ │ │ ├── index.md │ │ │ │ │ └── resources.md │ │ │ │ ├── eos/ │ │ │ │ │ ├── eos.md │ │ │ │ │ ├── index.md │ │ │ │ │ └── resources.md │ │ │ │ ├── index.md │ │ │ │ ├── ner/ │ │ │ │ │ ├── index.md │ │ │ │ │ ├── json.md │ │ │ │ │ ├── resources.md │ │ │ │ │ └── tsv.md │ │ │ │ ├── pos/ │ │ │ │ │ ├── index.md │ │ │ │ │ └── resources.md │ │ │ │ ├── srl/ │ │ │ │ │ ├── conll2012_dataset.md │ │ │ │ │ ├── index.md │ │ │ │ │ └── resources.md │ │ │ │ └── tok/ │ │ │ │ ├── index.md │ │ │ │ ├── mcws_dataset.md │ │ │ │ ├── resources.md │ │ │ │ └── txt.md │ │ │ ├── hanlp.rst │ │ │ ├── index.md │ │ │ ├── layers/ │ │ │ │ ├── decoders/ │ │ │ │ │ ├── biaffine_ner.md │ │ │ │ │ ├── index.md │ │ │ │ │ └── linear_crf.md │ │ │ │ ├── embeddings/ │ │ │ │ │ ├── char_cnn.md │ │ │ │ │ ├── char_rnn.md │ │ │ │ │ ├── embedding.md │ │ │ │ │ ├── fasttext.md │ │ │ │ │ ├── index.md │ │ │ │ │ ├── transformer.md │ │ │ │ │ └── word2vec.md │ │ │ │ ├── index.md │ │ │ │ └── transformers/ │ │ │ │ ├── encoder.md │ │ │ │ ├── index.md │ │ │ │ └── tokenizer.md │ │ │ ├── pretrained/ │ │ │ │ ├── amr.md │ │ │ │ ├── amr2text.md │ │ │ │ ├── constituency.md │ │ │ │ ├── dep.md │ │ │ │ ├── eos.md 
│ │ │ │ ├── fasttext.md │ │ │ │ ├── glove.md │ │ │ │ ├── index.md │ │ │ │ ├── mlm.md │ │ │ │ ├── mtl.md │ │ │ │ ├── ner.md │ │ │ │ ├── pos.md │ │ │ │ ├── sdp.md │ │ │ │ ├── srl.md │ │ │ │ ├── sts.md │ │ │ │ ├── tok.md │ │ │ │ └── word2vec.md │ │ │ └── utils/ │ │ │ ├── index.md │ │ │ └── io_util.md │ │ ├── restful.rst │ │ ├── restful_golang.md │ │ ├── restful_java.md │ │ └── trie/ │ │ ├── dictionary.md │ │ ├── index.md │ │ └── trie.md │ ├── conf.py │ ├── configure.md │ ├── contributing.md │ ├── data_format.md │ ├── index.md │ ├── install.md │ ├── references.bib │ ├── references.rst │ └── tutorial.md ├── hanlp/ │ ├── __init__.py │ ├── callbacks/ │ │ ├── __init__.py │ │ └── fine_csv_logger.py │ ├── common/ │ │ ├── __init__.py │ │ ├── component.py │ │ ├── dataset.py │ │ ├── keras_component.py │ │ ├── structure.py │ │ ├── torch_component.py │ │ ├── transform.py │ │ ├── transform_tf.py │ │ ├── vocab.py │ │ └── vocab_tf.py │ ├── components/ │ │ ├── __init__.py │ │ ├── amr/ │ │ │ ├── __init__.py │ │ │ ├── amrbart/ │ │ │ │ ├── __init__.py │ │ │ │ ├── bart_amr_generation.py │ │ │ │ ├── bart_amr_parser.py │ │ │ │ ├── common/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── constant.py │ │ │ │ │ ├── penman_interface.py │ │ │ │ │ └── postprocessing.py │ │ │ │ ├── data_interface/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── dataset.py │ │ │ │ ├── model_interface/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── modeling_bart.py │ │ │ │ │ └── tokenization_bart.py │ │ │ │ └── preprocess/ │ │ │ │ ├── __init__.py │ │ │ │ ├── amr_io.py │ │ │ │ ├── penman_interface.py │ │ │ │ └── read_and_process.py │ │ │ └── seq2seq/ │ │ │ ├── __init__.py │ │ │ ├── dataset/ │ │ │ │ ├── IO.py │ │ │ │ ├── __init__.py │ │ │ │ ├── dataset.py │ │ │ │ ├── linearization.py │ │ │ │ ├── penman.py │ │ │ │ ├── postprocessing.py │ │ │ │ ├── tokenization_bart.py │ │ │ │ └── tokenization_t5.py │ │ │ ├── evaluation.py │ │ │ ├── optim.py │ │ │ └── seq2seq_amr_parser.py │ │ ├── classifiers/ │ │ │ ├── __init__.py │ │ │ ├── 
fasttext_classifier.py │ │ │ ├── transformer_classifier.py │ │ │ ├── transformer_classifier_hf.py │ │ │ ├── transformer_classifier_tf.py │ │ │ └── transformer_regression_hf.py │ │ ├── distillation/ │ │ │ ├── __init__.py │ │ │ ├── distillable_component.py │ │ │ ├── losses.py │ │ │ └── schedulers.py │ │ ├── eos/ │ │ │ ├── __init__.py │ │ │ └── ngram.py │ │ ├── lambda_wrapper.py │ │ ├── lemmatizer.py │ │ ├── lm/ │ │ │ ├── __init__.py │ │ │ └── mlm.py │ │ ├── mtl/ │ │ │ ├── __init__.py │ │ │ ├── multi_task_learning.py │ │ │ └── tasks/ │ │ │ ├── __init__.py │ │ │ ├── amr.py │ │ │ ├── constituency.py │ │ │ ├── dep.py │ │ │ ├── dep_2nd.py │ │ │ ├── lem.py │ │ │ ├── ner/ │ │ │ │ ├── __init__.py │ │ │ │ ├── biaffine_ner.py │ │ │ │ └── tag_ner.py │ │ │ ├── pos.py │ │ │ ├── sdp.py │ │ │ ├── srl/ │ │ │ │ ├── __init__.py │ │ │ │ ├── bio_srl.py │ │ │ │ └── rank_srl.py │ │ │ ├── tok/ │ │ │ │ ├── __init__.py │ │ │ │ ├── reg_tok.py │ │ │ │ └── tag_tok.py │ │ │ └── ud.py │ │ ├── ner/ │ │ │ ├── __init__.py │ │ │ ├── biaffine_ner/ │ │ │ │ ├── __init__.py │ │ │ │ ├── biaffine_ner.py │ │ │ │ └── biaffine_ner_model.py │ │ │ ├── ner_tf.py │ │ │ ├── rnn_ner.py │ │ │ └── transformer_ner.py │ │ ├── parsers/ │ │ │ ├── __init__.py │ │ │ ├── alg.py │ │ │ ├── alg_tf.py │ │ │ ├── biaffine/ │ │ │ │ ├── __init__.py │ │ │ │ ├── biaffine.py │ │ │ │ ├── biaffine_2nd_dep.py │ │ │ │ ├── biaffine_dep.py │ │ │ │ ├── biaffine_model.py │ │ │ │ ├── biaffine_sdp.py │ │ │ │ ├── mlp.py │ │ │ │ ├── structual_attention.py │ │ │ │ └── variationalbilstm.py │ │ │ ├── biaffine_parser_tf.py │ │ │ ├── biaffine_tf/ │ │ │ │ ├── __init__.py │ │ │ │ ├── alg.py │ │ │ │ ├── layers.py │ │ │ │ └── model.py │ │ │ ├── chu_liu_edmonds.py │ │ │ ├── conll.py │ │ │ ├── constituency/ │ │ │ │ ├── __init__.py │ │ │ │ ├── crf_constituency_model.py │ │ │ │ ├── crf_constituency_parser.py │ │ │ │ └── treecrf.py │ │ │ ├── parse_alg.py │ │ │ └── ud/ │ │ │ ├── __init__.py │ │ │ ├── lemma_edit.py │ │ │ ├── tag_decoder.py │ │ │ ├── ud_model.py 
│ │ │ ├── ud_parser.py │ │ │ ├── udify_util.py │ │ │ └── util.py │ │ ├── pipeline.py │ │ ├── rnn_language_model_tf.py │ │ ├── srl/ │ │ │ ├── __init__.py │ │ │ ├── span_bio/ │ │ │ │ ├── __init__.py │ │ │ │ ├── baffine_tagging.py │ │ │ │ └── span_bio.py │ │ │ └── span_rank/ │ │ │ ├── __init__.py │ │ │ ├── highway_variational_lstm.py │ │ │ ├── inference_utils.py │ │ │ ├── layer.py │ │ │ ├── span_rank.py │ │ │ ├── span_ranking_srl_model.py │ │ │ ├── srl_eval_utils.py │ │ │ └── util.py │ │ ├── sts/ │ │ │ ├── __init__.py │ │ │ └── transformer_sts.py │ │ ├── taggers/ │ │ │ ├── __init__.py │ │ │ ├── cnn_tagger_tf.py │ │ │ ├── ngram_conv/ │ │ │ │ ├── __init__.py │ │ │ │ └── ngram_conv_tagger.py │ │ │ ├── pos_tf.py │ │ │ ├── rnn/ │ │ │ │ ├── __init__.py │ │ │ │ └── rnntaggingmodel.py │ │ │ ├── rnn_tagger.py │ │ │ ├── rnn_tagger_tf.py │ │ │ ├── tagger.py │ │ │ ├── tagger_tf.py │ │ │ ├── transformers/ │ │ │ │ ├── __init__.py │ │ │ │ ├── metrics_tf.py │ │ │ │ ├── transformer_tagger.py │ │ │ │ ├── transformer_tagger_tf.py │ │ │ │ └── transformer_transform_tf.py │ │ │ └── util.py │ │ └── tokenizers/ │ │ ├── __init__.py │ │ ├── multi_criteria_cws_transformer.py │ │ ├── tok.py │ │ ├── tok_tf.py │ │ └── transformer.py │ ├── datasets/ │ │ ├── __init__.py │ │ ├── classification/ │ │ │ ├── __init__.py │ │ │ └── sentiment.py │ │ ├── coref/ │ │ │ ├── __init__.py │ │ │ └── loaders/ │ │ │ ├── __init__.py │ │ │ └── conll12coref.py │ │ ├── eos/ │ │ │ ├── __init__.py │ │ │ ├── eos.py │ │ │ └── loaders/ │ │ │ ├── __init__.py │ │ │ └── nn_eos.py │ │ ├── lm/ │ │ │ ├── __init__.py │ │ │ └── loaders/ │ │ │ ├── __init__.py │ │ │ └── lm_dataset.py │ │ ├── lu/ │ │ │ ├── __init__.py │ │ │ └── glue.py │ │ ├── ner/ │ │ │ ├── __init__.py │ │ │ ├── conll03.py │ │ │ ├── loaders/ │ │ │ │ ├── __init__.py │ │ │ │ ├── json_ner.py │ │ │ │ └── tsv.py │ │ │ ├── msra.py │ │ │ ├── resume.py │ │ │ └── weibo.py │ │ ├── parsing/ │ │ │ ├── __init__.py │ │ │ ├── amr.py │ │ │ ├── ctb5.py │ │ │ ├── ctb7.py │ │ │ ├── 
ctb8.py │ │ │ ├── ctb9.py │ │ │ ├── loaders/ │ │ │ │ ├── __init__.py │ │ │ │ ├── _ctb_utils.py │ │ │ │ ├── conll_dataset.py │ │ │ │ └── constituency_dataset.py │ │ │ ├── pmt1.py │ │ │ ├── ptb.py │ │ │ ├── semeval15.py │ │ │ ├── semeval16.py │ │ │ └── ud/ │ │ │ ├── __init__.py │ │ │ ├── ud210.py │ │ │ ├── ud210m.py │ │ │ ├── ud23.py │ │ │ ├── ud23m.py │ │ │ ├── ud27.py │ │ │ └── ud27m.py │ │ ├── pos/ │ │ │ ├── __init__.py │ │ │ └── ctb5.py │ │ ├── qa/ │ │ │ ├── __init__.py │ │ │ └── hotpotqa.py │ │ ├── srl/ │ │ │ ├── __init__.py │ │ │ ├── loaders/ │ │ │ │ ├── __init__.py │ │ │ │ ├── conll2012.py │ │ │ │ └── ontonotes_loader.py │ │ │ └── ontonotes5/ │ │ │ ├── __init__.py │ │ │ ├── _utils.py │ │ │ ├── chinese.py │ │ │ └── english.py │ │ ├── sts/ │ │ │ ├── __init__.py │ │ │ └── stsb.py │ │ └── tokenization/ │ │ ├── __init__.py │ │ ├── ctb6.py │ │ ├── loaders/ │ │ │ ├── __init__.py │ │ │ ├── chunking_dataset.py │ │ │ ├── multi_criteria_cws/ │ │ │ │ ├── __init__.py │ │ │ │ └── mcws_dataset.py │ │ │ └── txt.py │ │ └── sighan2005/ │ │ ├── __init__.py │ │ ├── as_.py │ │ ├── cityu.py │ │ ├── msr.py │ │ └── pku.py │ ├── layers/ │ │ ├── __init__.py │ │ ├── cnn_encoder.py │ │ ├── crf/ │ │ │ ├── __init__.py │ │ │ ├── crf.py │ │ │ ├── crf_layer_tf.py │ │ │ └── crf_tf.py │ │ ├── dropout.py │ │ ├── embeddings/ │ │ │ ├── __init__.py │ │ │ ├── char_cnn.py │ │ │ ├── char_cnn_tf.py │ │ │ ├── char_rnn.py │ │ │ ├── char_rnn_tf.py │ │ │ ├── concat_embedding.py │ │ │ ├── contextual_string_embedding.py │ │ │ ├── contextual_string_embedding_tf.py │ │ │ ├── contextual_word_embedding.py │ │ │ ├── embedding.py │ │ │ ├── fast_text.py │ │ │ ├── fast_text_tf.py │ │ │ ├── util.py │ │ │ ├── util_tf.py │ │ │ ├── word2vec.py │ │ │ └── word2vec_tf.py │ │ ├── feed_forward.py │ │ ├── feedforward.py │ │ ├── scalar_mix.py │ │ ├── time_distributed.py │ │ ├── transformers/ │ │ │ ├── __init__.py │ │ │ ├── encoder.py │ │ │ ├── loader_tf.py │ │ │ ├── pt_imports.py │ │ │ ├── relative_transformer.py │ │ │ ├── 
resource.py │ │ │ ├── tf_imports.py │ │ │ ├── utils.py │ │ │ └── utils_tf.py │ │ └── weight_normalization.py │ ├── losses/ │ │ ├── __init__.py │ │ └── sparse_categorical_crossentropy.py │ ├── metrics/ │ │ ├── __init__.py │ │ ├── accuracy.py │ │ ├── amr/ │ │ │ ├── __init__.py │ │ │ └── smatch_eval.py │ │ ├── chunking/ │ │ │ ├── __init__.py │ │ │ ├── binary_chunking_f1.py │ │ │ ├── bmes_tf.py │ │ │ ├── chunking_f1.py │ │ │ ├── chunking_f1_tf.py │ │ │ ├── conlleval.py │ │ │ ├── iobes_tf.py │ │ │ └── sequence_labeling.py │ │ ├── f1.py │ │ ├── metric.py │ │ ├── mtl.py │ │ ├── parsing/ │ │ │ ├── __init__.py │ │ │ ├── attachmentscore.py │ │ │ ├── conllx_eval.py │ │ │ ├── labeled_f1.py │ │ │ ├── labeled_f1_tf.py │ │ │ ├── labeled_score.py │ │ │ ├── semdep_eval.py │ │ │ └── span.py │ │ ├── spearman_correlation.py │ │ └── srl/ │ │ ├── __init__.py │ │ └── srlconll.py │ ├── optimizers/ │ │ ├── __init__.py │ │ └── adamw/ │ │ ├── __init__.py │ │ └── optimization.py │ ├── pretrained/ │ │ ├── __init__.py │ │ ├── amr.py │ │ ├── amr2text.py │ │ ├── classifiers.py │ │ ├── constituency.py │ │ ├── dep.py │ │ ├── eos.py │ │ ├── fasttext.py │ │ ├── glove.py │ │ ├── mtl.py │ │ ├── ner.py │ │ ├── pos.py │ │ ├── rnnlm.py │ │ ├── sdp.py │ │ ├── srl.py │ │ ├── sts.py │ │ ├── tok.py │ │ └── word2vec.py │ ├── transform/ │ │ ├── __init__.py │ │ ├── conll_tf.py │ │ ├── glue_tf.py │ │ ├── table_tf.py │ │ ├── tacred_tf.py │ │ ├── text_tf.py │ │ ├── transformer_tokenizer.py │ │ ├── tsv_tf.py │ │ └── txt_tf.py │ ├── utils/ │ │ ├── __init__.py │ │ ├── component_util.py │ │ ├── file_read_backwards/ │ │ │ ├── __init__.py │ │ │ ├── buffer_work_space.py │ │ │ └── file_read_backwards.py │ │ ├── init_util.py │ │ ├── io_util.py │ │ ├── lang/ │ │ │ ├── __init__.py │ │ │ ├── en/ │ │ │ │ ├── __init__.py │ │ │ │ └── english_tokenizer.py │ │ │ ├── ja/ │ │ │ │ ├── __init__.py │ │ │ │ └── bert_tok.py │ │ │ └── zh/ │ │ │ ├── __init__.py │ │ │ ├── char_table.py │ │ │ └── localization.py │ │ ├── log_util.py │ │ ├── 
rules.py │ │ ├── span_util.py │ │ ├── string_util.py │ │ ├── tf_util.py │ │ ├── time_util.py │ │ └── torch_util.py │ └── version.py ├── plugins/ │ ├── README.md │ ├── hanlp_common/ │ │ ├── README.md │ │ ├── __init__.py │ │ ├── hanlp_common/ │ │ │ ├── __init__.py │ │ │ ├── amr.py │ │ │ ├── configurable.py │ │ │ ├── conll.py │ │ │ ├── constant.py │ │ │ ├── document.py │ │ │ ├── io.py │ │ │ ├── reflection.py │ │ │ ├── structure.py │ │ │ ├── util.py │ │ │ └── visualization.py │ │ └── setup.py │ ├── hanlp_demo/ │ │ ├── README.md │ │ ├── hanlp_demo/ │ │ │ ├── __init__.py │ │ │ ├── block_windows.py │ │ │ ├── en/ │ │ │ │ ├── __init__.py │ │ │ │ ├── demo_amr.py │ │ │ │ ├── demo_dep.py │ │ │ │ ├── demo_lm.py │ │ │ │ ├── demo_ner.py │ │ │ │ ├── demo_pipeline.py │ │ │ │ ├── demo_pos.py │ │ │ │ ├── demo_sdp.py │ │ │ │ ├── demo_sentiment_analysis.py │ │ │ │ ├── demo_tok.py │ │ │ │ └── train_sst2_albert_base.py │ │ │ ├── ja/ │ │ │ │ ├── __init__.py │ │ │ │ └── demo_mtl.py │ │ │ ├── mul/ │ │ │ │ ├── __init__.py │ │ │ │ ├── demo_lid.py │ │ │ │ ├── demo_lid_restful.py │ │ │ │ ├── demo_mtl.py │ │ │ │ └── train/ │ │ │ │ ├── __init__.py │ │ │ │ └── mul_base.py │ │ │ ├── sent_split.py │ │ │ └── zh/ │ │ │ ├── __init__.py │ │ │ ├── abstractive_summarization_restful.ipynb │ │ │ ├── amr_restful.ipynb │ │ │ ├── amr_stl.ipynb │ │ │ ├── classification_restful.ipynb │ │ │ ├── con_mtl.ipynb │ │ │ ├── con_restful.ipynb │ │ │ ├── con_stl.ipynb │ │ │ ├── cor_restful.ipynb │ │ │ ├── demo_amr.py │ │ │ ├── demo_custom_dict.py │ │ │ ├── demo_custom_dict_stl.py │ │ │ ├── demo_del_tasks.py │ │ │ ├── demo_document.py │ │ │ ├── demo_mlm.py │ │ │ ├── demo_mtl.py │ │ │ ├── demo_ner_dict.py │ │ │ ├── demo_parse_constituency.py │ │ │ ├── demo_pipeline.py │ │ │ ├── demo_pos_dict.py │ │ │ ├── demo_sts.py │ │ │ ├── demo_word2vec.py │ │ │ ├── dep_mtl.ipynb │ │ │ ├── dep_restful.ipynb │ │ │ ├── dep_stl.ipynb │ │ │ ├── extractive_summarization_restful.ipynb │ │ │ ├── gec_restful.ipynb │ │ │ ├── 
keyphrase_restful.ipynb │ │ │ ├── lid_restful.ipynb │ │ │ ├── lid_stl.ipynb │ │ │ ├── ner_mtl.ipynb │ │ │ ├── ner_restful.ipynb │ │ │ ├── ner_stl.ipynb │ │ │ ├── pos_mtl.ipynb │ │ │ ├── pos_restful.ipynb │ │ │ ├── pos_stl.ipynb │ │ │ ├── sdp_mtl.ipynb │ │ │ ├── sdp_restful.ipynb │ │ │ ├── sdp_stl.ipynb │ │ │ ├── sentiment_restful.ipynb │ │ │ ├── srl_mtl.ipynb │ │ │ ├── srl_restful.ipynb │ │ │ ├── srl_stl.ipynb │ │ │ ├── sts_restful.ipynb │ │ │ ├── sts_stl.ipynb │ │ │ ├── tf/ │ │ │ │ ├── __init__.py │ │ │ │ ├── demo_classifier.py │ │ │ │ ├── demo_client.py │ │ │ │ ├── demo_cws.py │ │ │ │ ├── demo_cws_trie.py │ │ │ │ ├── demo_dep.py │ │ │ │ ├── demo_fasttext.py │ │ │ │ ├── demo_multiprocess.py │ │ │ │ ├── demo_ner.py │ │ │ │ ├── demo_pipeline.py │ │ │ │ ├── demo_pos.py │ │ │ │ ├── demo_sdp.py │ │ │ │ ├── demo_serving.py │ │ │ │ └── train/ │ │ │ │ ├── __init__.py │ │ │ │ ├── cws/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── train_ctb6_cws_albert.py │ │ │ │ │ ├── train_ctb6_cws_bert.py │ │ │ │ │ ├── train_ctb6_cws_convseg.py │ │ │ │ │ ├── train_large_bert_cws.py │ │ │ │ │ ├── train_large_conv_cws.py │ │ │ │ │ ├── train_large_cws_albert.py │ │ │ │ │ ├── train_large_cws_electra.py │ │ │ │ │ ├── train_large_rnn_cws.py │ │ │ │ │ ├── train_msr_cws_albert.py │ │ │ │ │ ├── train_msr_cws_bert.py │ │ │ │ │ ├── train_msr_cws_ngram_conv.py │ │ │ │ │ ├── train_msr_cws_ngram_conv_embed.py │ │ │ │ │ ├── train_pku980106_conv_cws.py │ │ │ │ │ ├── train_pku980106_rnn_cws.py │ │ │ │ │ └── train_pku_conv_cws.py │ │ │ │ ├── finetune_msra_ner_albert.py │ │ │ │ ├── train_chnsenticorp_bert.py │ │ │ │ ├── train_conll03_ner_bert.py │ │ │ │ ├── train_conll03_ner_flair.py │ │ │ │ ├── train_ctb5_dep.py │ │ │ │ ├── train_ctb5_pos_rnn.py │ │ │ │ ├── train_ctb7_dep.py │ │ │ │ ├── train_ctb9_pos_albert.py │ │ │ │ ├── train_ctb9_pos_electra.py │ │ │ │ ├── train_msra_ner_albert.py │ │ │ │ ├── train_msra_ner_bert.py │ │ │ │ ├── train_msra_ner_electra.py │ │ │ │ ├── train_msra_ner_ngram_conv.py │ │ │ │ ├── 
train_msra_ner_rnn.py │ │ │ │ ├── train_ptb_dep_biaffine_albert.py │ │ │ │ ├── train_ptb_dep_biaffine_bert.py │ │ │ │ ├── train_ptb_dep_biaffine_bert_96.6.py │ │ │ │ ├── train_ptb_dep_biaffine_bert_positional.py │ │ │ │ ├── train_ptb_dep_sa_albert.py │ │ │ │ ├── train_ptb_dep_sa_albert_topk.py │ │ │ │ ├── train_ptb_dep_sa_bert.py │ │ │ │ ├── train_ptb_dep_sa_pos_bert.py │ │ │ │ ├── train_ptb_pos_rnn_fasttext.py │ │ │ │ ├── train_semeval15_dm.py │ │ │ │ ├── train_semeval15_pas.py │ │ │ │ ├── train_semeval15_psd.py │ │ │ │ ├── train_semeval16_news.py │ │ │ │ └── train_semeval16_text.py │ │ │ ├── tok_mtl.ipynb │ │ │ ├── tok_restful.ipynb │ │ │ ├── tok_stl.ipynb │ │ │ ├── train/ │ │ │ │ ├── __init__.py │ │ │ │ ├── finetune_ner.py │ │ │ │ ├── open_base.py │ │ │ │ └── open_small.py │ │ │ ├── train_sota_bert_pku.py │ │ │ ├── tst_restful.ipynb │ │ │ └── tutorial.ipynb │ │ └── setup.py │ ├── hanlp_restful/ │ │ ├── README.md │ │ ├── hanlp_restful/ │ │ │ └── __init__.py │ │ ├── setup.py │ │ └── tests/ │ │ ├── __init__.py │ │ └── test_client.py │ ├── hanlp_restful_golang/ │ │ └── README.md │ ├── hanlp_restful_java/ │ │ ├── pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ └── java/ │ │ │ └── com/ │ │ │ └── hankcs/ │ │ │ └── hanlp/ │ │ │ └── restful/ │ │ │ ├── BaseInput.java │ │ │ ├── CoreferenceResolutionOutput.java │ │ │ ├── DocumentInput.java │ │ │ ├── HanLPClient.java │ │ │ ├── SentenceInput.java │ │ │ ├── Span.java │ │ │ ├── TokenInput.java │ │ │ └── mrp/ │ │ │ ├── Anchor.java │ │ │ ├── Edge.java │ │ │ ├── MeaningRepresentation.java │ │ │ └── Node.java │ │ └── test/ │ │ └── java/ │ │ └── com/ │ │ └── hankcs/ │ │ └── hanlp/ │ │ └── restful/ │ │ ├── HanLPClientTest.java │ │ └── MeaningRepresentationTest.java │ └── hanlp_trie/ │ ├── README.md │ ├── hanlp_trie/ │ │ ├── __init__.py │ │ ├── dictionary.py │ │ └── trie.py │ ├── setup.py │ └── tests/ │ ├── __init__.py │ ├── test_trie.py │ └── test_trie_dict.py ├── setup.py └── tests/ ├── __init__.py ├── test_config_tracker.py ├── 
test_mtl.py ├── test_pipeline.py ├── test_rules.py └── test_string_util.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .github/ISSUE_TEMPLATE/bug_report.md ================================================ --- name: 🐛发现一个bug about: 需提交版本号、触发代码、错误日志 title: '' labels: bug assignees: hankcs --- **Describe the bug** A clear and concise description of what the bug is. **Code to reproduce the issue** Provide a reproducible test case that is the bare minimum necessary to generate the problem. ```python ``` **Describe the current behavior** A clear and concise description of what happened. **Expected behavior** A clear and concise description of what you expected to happen. **System information** - OS Platform and Distribution (e.g., Linux Ubuntu 16.04): - Python version: - HanLP version: **Other info / logs** Include any logs or source code that would be helpful to diagnose the problem. If including tracebacks, please include the full traceback. Large logs and files should be attached. * [ ] I've completed this form and searched the web for solutions. ================================================ FILE: .github/ISSUE_TEMPLATE/config.yml ================================================ blank_issues_enabled: false contact_links: - name: ⁉️ 提问求助请上论坛 url: https://bbs.hankcs.com/ about: 欢迎前往蝴蝶效应论坛求助 ================================================ FILE: .github/ISSUE_TEMPLATE/feature_request.md ================================================ --- name: 🚀新功能请愿 about: 建议增加一个新功能 title: '' labels: feature request assignees: hankcs --- **Describe the feature and the current behavior/state.** **Will this change the current api? 
How?** **Who will benefit from this feature?** **Are you willing to contribute it (Yes/No):** **System information** - OS Platform and Distribution (e.g., Linux Ubuntu 16.04): - Python version: - HanLP version: **Any other info** * [ ] I've carefully completed this form. ================================================ FILE: .github/pull_request_template.md ================================================ # Title of Your Pull Request ## Description Please include a summary of the change and which issue is fixed. Please also include relevant motivation and context. List any dependencies that are required for this change. Fixes # (issue) ## Type of Change Please check any relevant options and delete the rest. - [ ] Bug fix (non-breaking change which fixes an issue) - [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected) - [ ] New feature (non-breaking change which adds functionality) - [ ] This change requires a documentation update ## How Has This Been Tested? Please describe the tests that you ran to verify your changes. Provide instructions so we can reproduce. Please also list any relevant details for your test configuration. ## Checklist Check all items that apply. 
- [ ] ⚠️Changes **must** be made on `dev` branch instead of `master` - [ ] I have added tests that prove my fix is effective or that my feature works - [ ] New and existing unit tests pass locally with my changes - [ ] My code follows the style guidelines of this project - [ ] I have commented my code, particularly in hard-to-understand areas - [ ] I have made corresponding changes to the documentation - [ ] My changes generate no new warnings - [ ] I have checked my code and corrected any misspellings ================================================ FILE: .github/workflows/unit-tests.yml ================================================ name: Unit Tests on: push: branches: [ "**" ] pull_request: branches: [ "**" ] jobs: build: runs-on: ${{ matrix.os }} env: HANLP_HOME: ${{ github.workspace }}/data strategy: fail-fast: false matrix: os: [ ubuntu-latest, macos-latest, windows-latest ] python-version: [ 3.6, 3.7, 3.8, 3.9, '3.10' ] exclude: # GHA doesn't list 3.6 for ubuntu-22.04 - os: ubuntu-latest python-version: "3.6" # MacOS 14.4.1 for arm64 doesn't support Python < 3.8 - os: macos-latest python-version: "3.6" - os: macos-latest python-version: "3.7" include: # GHA doesn't list 3.6 for ubuntu-22 - os: ubuntu-20.04 python-version: "3.6" # MacOS 13 required for Python < 3.8 - os: macos-13 python-version: "3.6" - os: macos-13 python-version: "3.7" steps: - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v3 with: python-version: ${{ matrix.python-version }} - name: Install dependencies shell: bash run: | python -m pip install -e plugins/hanlp_trie python -m pip install -e plugins/hanlp_common python -m pip install -e . 
python -m pip install pytest - name: Cache data uses: actions/cache@v3 with: path: ${{ env.HANLP_HOME }} key: hanlp-data - name: Test with pytest shell: bash run: | pytest tests pytest plugins/hanlp_trie/tests deploy: needs: build if: github.event_name == 'push' && github.ref == 'refs/heads/master' runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - name: Install dependencies run: | python -m pip install setuptools wheel twine - name: Deploy to PyPI run: | python setup.py sdist bdist_wheel python -m twine upload dist/* env: TWINE_USERNAME: __token__ TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }} TWINE_REPOSITORY: pypi ================================================ FILE: .gitignore ================================================ # Created by .ignore support plugin (hsz.mobi) ### Python template # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ pip-wheel-metadata/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover .hypothesis/ .pytest_cache/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
# However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # celery beat schedule file celerybeat-schedule # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ ### Java template # Compiled class file *.class # Log file # BlueJ files *.ctxt # Mobile Tools for Java (J2ME) .mtj.tmp/ # Package Files # *.jar *.war *.nar *.ear *.zip *.tar.gz *.rar # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml hs_err_pid* ### Eclipse template .metadata bin/ tmp/ *.tmp *.bak *.swp *~.nib local.properties .settings/ .loadpath .recommenders # External tool builders .externalToolBuilders/ # Locally stored "Eclipse launch configurations" *.launch # PyDev specific (Python IDE for Eclipse) *.pydevproject # CDT-specific (C/C++ Development Tooling) .cproject # CDT- autotools .autotools # Java annotation processor (APT) .factorypath # PDT-specific (PHP Development Tools) .buildpath # sbteclipse plugin .target # Tern plugin .tern-project # TeXlipse plugin .texlipse # STS (Spring Tool Suite) .springBeans # Code Recommenders .recommenders/ # Annotation Processing .apt_generated/ # Scala IDE specific (Scala & Java development for Eclipse) .cache-main .scala_dependencies .worksheet ### VisualStudioCode template .vscode/* !.vscode/settings.json !.vscode/tasks.json !.vscode/launch.json !.vscode/extensions.json ### JetBrains template # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 # User-specific stuff .idea/**/workspace.xml 
.idea/**/tasks.xml .idea/**/usage.statistics.xml .idea/**/dictionaries .idea/**/shelf # Generated files .idea/**/contentModel.xml # Sensitive or high-churn files .idea/**/dataSources/ .idea/**/dataSources.ids .idea/**/dataSources.local.xml .idea/**/sqlDataSources.xml .idea/**/dynamic.xml .idea/**/uiDesigner.xml .idea/**/dbnavigator.xml # Gradle .idea/**/gradle.xml .idea/**/libraries # Gradle and Maven with auto-import # When using Gradle or Maven with auto-import, you should exclude module files, # since they will be recreated, and may cause churn. Uncomment if using # auto-import. # .idea/modules.xml # .idea/*.iml # .idea/modules # *.iml # *.ipr # CMake cmake-build-*/ # Mongo Explorer plugin .idea/**/mongoSettings.xml # File-based project format *.iws # IntelliJ out/ # mpeltonen/sbt-idea plugin .idea_modules/ # JIRA plugin atlassian-ide-plugin.xml # Cursive Clojure plugin .idea/replstate.xml # Crashlytics plugin (for Android Studio and IntelliJ) com_crashlytics_export_strings.xml crashlytics.properties crashlytics-build.properties fabric.properties # Editor-based Rest HanLPClient .idea/httpRequests # Android studio 3.1+ serialized cache file .idea/caches/build_file_checksums.ser .idea *.iml data .vscode *.pkl *.pdf _static/ _build/ _templates/ ================================================ FILE: CITATION.cff ================================================ cff-version: 1.2.0 message: "If you use this software, please cite it as below." authors: - family-names: He given-names: Han orcid: "https://orcid.org/0009-0005-1778-917X" title: "HanLP: Han Language Processing" version: 2.1 date-released: 2015-05-27 url: "https://github.com/hankcs/HanLP" preferred-citation: type: conference-paper authors: - family-names: He given-names: Han - family-names: Choi given-names: Jinho D. 
title: "The Stem Cell Hypothesis: Dilemma behind Multi-Task Learning with Transformer Encoders" editors: - family-names: Moens given-names: Marie-Francine - family-names: Huang given-names: Xuanjing - family-names: Specia given-names: Lucia - family-names: Yih given-names: Scott Wen-tau year: 2021 month: 11 date-released: 2021-11 conference: name: "2021 Conference on Empirical Methods in Natural Language Processing" place: "Online and Punta Cana, Dominican Republic" url: "https://aclanthology.org/2021.emnlp-main.451" doi: "10.18653/v1/2021.emnlp-main.451" url: "https://aclanthology.org/2021.emnlp-main.451" publisher: "Association for Computational Linguistics" booktitle: "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing" location: "Online and Punta Cana, Dominican Republic" pages: "5555-5577" ================================================ FILE: LICENSE ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. 
"Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." 
"Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. 
You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. 
Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. 
Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: README.md ================================================

HanLP: Han Language Processing

Unit Tests PyPI Version Python Versions Downloads 在线运行

English | 日本語 | 文档 | 论文 | 论坛 | docker | ▶️在线运行

面向生产环境的多语种自然语言处理工具包,基于PyTorch和TensorFlow 2.x双引擎,目标是普及落地最前沿的NLP技术。HanLP具备功能完善、精度准确、性能高效、语料时新、架构清晰、可自定义的特点。 [![demo](https://raw.githubusercontent.com/hankcs/OpenCC-to-HanLP/img/demo.gif)](https://mybinder.org/v2/gh/hankcs/HanLP/doc-zh?filepath=plugins%2Fhanlp_demo%2Fhanlp_demo%2Fzh%2Ftutorial.ipynb) 借助世界上最大的多语种语料库,HanLP2.1支持包括简繁中英日俄法德在内的[130种语言](https://hanlp.hankcs.com/docs/api/hanlp/pretrained/mtl.html#hanlp.pretrained.mtl.UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_MMINILMV2L6)上的10种联合任务以及多种单任务。HanLP预训练了十几种任务上的数十个模型并且正在持续迭代语料库与模型:
| 功能 | RESTful | 多任务 | 单任务 | 模型 | 标注标准 | | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | | [分词](https://hanlp.hankcs.com/demos/tok.html) | [教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/tok_restful.ipynb) | [教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/tok_mtl.ipynb) | [教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/tok_stl.ipynb) | [tok](https://hanlp.hankcs.com/docs/api/hanlp/pretrained/tok.html) | [粗分](https://hanlp.hankcs.com/docs/annotations/tok/msr.html)、[细分](https://hanlp.hankcs.com/docs/annotations/tok/ctb.html) | | [词性标注](https://hanlp.hankcs.com/demos/pos.html) | [教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/pos_restful.ipynb) | [教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/pos_mtl.ipynb) | [教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/pos_stl.ipynb) | [pos](https://hanlp.hankcs.com/docs/api/hanlp/pretrained/pos.html) | [CTB](https://hanlp.hankcs.com/docs/annotations/pos/ctb.html)、[PKU](https://hanlp.hankcs.com/docs/annotations/pos/pku.html)、[863](https://hanlp.hankcs.com/docs/annotations/pos/863.html) | | [命名实体识别](https://hanlp.hankcs.com/demos/ner.html) | [教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/ner_restful.ipynb) | [教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/ner_mtl.ipynb) | [教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/ner_stl.ipynb) | [ner](https://hanlp.hankcs.com/docs/api/hanlp/pretrained/ner.html) | 
[PKU](https://hanlp.hankcs.com/docs/annotations/ner/pku.html)、[MSRA](https://hanlp.hankcs.com/docs/annotations/ner/msra.html)、[OntoNotes](https://hanlp.hankcs.com/docs/annotations/ner/ontonotes.html) | | [依存句法分析](https://hanlp.hankcs.com/demos/dep.html) | [教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/dep_restful.ipynb) | [教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/dep_mtl.ipynb) | [教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/dep_stl.ipynb) | [dep](https://hanlp.hankcs.com/docs/api/hanlp/pretrained/dep.html) | [SD](https://hanlp.hankcs.com/docs/annotations/dep/sd_zh.html)、[UD](https://hanlp.hankcs.com/docs/annotations/dep/ud.html#chinese)、[PMT](https://hanlp.hankcs.com/docs/annotations/dep/pmt.html) | | [成分句法分析](https://hanlp.hankcs.com/demos/con.html) | [教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/con_restful.ipynb) | [教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/con_mtl.ipynb) | [教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/con_stl.ipynb) | [con](https://hanlp.hankcs.com/docs/api/hanlp/pretrained/constituency.html) | [Chinese Tree Bank](https://hanlp.hankcs.com/docs/annotations/constituency/ctb.html) | | [语义依存分析](https://hanlp.hankcs.com/demos/sdp.html) | [教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/sdp_restful.ipynb) | [教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/sdp_mtl.ipynb) | [教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/sdp_stl.ipynb) | [sdp](https://hanlp.hankcs.com/docs/api/hanlp/pretrained/sdp.html) | [CSDP](https://hanlp.hankcs.com/docs/annotations/sdp/semeval16.html#) | | [语义角色标注](https://hanlp.hankcs.com/demos/srl.html) | [教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/srl_restful.ipynb) | 
[教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/srl_mtl.ipynb) | [教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/srl_stl.ipynb) | [srl](https://hanlp.hankcs.com/docs/api/hanlp/pretrained/srl.html) | [Chinese Proposition Bank](https://hanlp.hankcs.com/docs/annotations/srl/cpb.html) | | [抽象意义表示](https://hanlp.hankcs.com/demos/amr.html) | [教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/amr_restful.ipynb) | 暂无 | [教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/amr_stl.ipynb) | [amr](https://hanlp.hankcs.com/docs/api/hanlp/pretrained/amr.html) | [CAMR](https://www.hankcs.com/nlp/corpus/introduction-to-chinese-abstract-meaning-representation.html) | | [指代消解](https://hanlp.hankcs.com/demos/cor.html) | [教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/cor_restful.ipynb) | 暂无 | 暂无 | 暂无 | OntoNotes | | [语义文本相似度](https://hanlp.hankcs.com/demos/sts.html) | [教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/sts_restful.ipynb) | 暂无 | [教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/sts_stl.ipynb) | [sts](https://hanlp.hankcs.com/docs/api/hanlp/pretrained/sts.html) | 暂无 | | [文本风格转换](https://hanlp.hankcs.com/demos/tst.html) | [教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/tst_restful.ipynb) | 暂无 | 暂无 | 暂无 | 暂无 | | [关键词短语提取](https://hanlp.hankcs.com/demos/keyphrase.html) | [教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/keyphrase_restful.ipynb) | 暂无 | 暂无 | 暂无 | 暂无 | | [抽取式自动摘要](https://hanlp.hankcs.com/demos/exsum.html) | [教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/extractive_summarization_restful.ipynb) | 暂无 | 暂无 | 暂无 | 暂无 | | [生成式自动摘要](https://hanlp.hankcs.com/demos/absum.html) | 
[教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/abstractive_summarization_restful.ipynb) | 暂无 | 暂无 | 暂无 | 暂无 | | [文本语法纠错](https://hanlp.hankcs.com/demos/gec.html) | [教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/gec_restful.ipynb) | 暂无 | 暂无 | 暂无 | 暂无 | | [文本分类](https://hanlp.hankcs.com/demos/classification.html) | [教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/classification_restful.ipynb) | 暂无 | 暂无 | 暂无 | 暂无 | | [情感分析](https://hanlp.hankcs.com/demos/sentiment.html) | [教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/sentiment_restful.ipynb) | 暂无 | 暂无 | 暂无 | `[-1,+1]` | | [语种检测](https://hanlp.hankcs.com/demos/classification.html) | [教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/lid_restful.ipynb) | 暂无 | [教程](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/lid_stl.ipynb) | 暂无 | [ISO 639-1编码](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) |
- 词干提取、词法语法特征提取请参考[英文教程](https://hanlp.hankcs.com/docs/tutorial.html);[词向量](https://hanlp.hankcs.com/docs/api/hanlp/pretrained/word2vec.html)和[完形填空](https://hanlp.hankcs.com/docs/api/hanlp/pretrained/mlm.html)请参考相应文档。 - 简繁转换、拼音、新词发现、文本聚类请参考[1.x教程](https://github.com/hankcs/HanLP/tree/1.x)。 量体裁衣,HanLP提供**RESTful**和**native**两种API,分别面向轻量级和海量级两种场景。无论何种API何种语言,HanLP接口在语义上保持一致,在代码上坚持开源。如果您在研究中使用了HanLP,请引用我们的[EMNLP论文](https://aclanthology.org/2021.emnlp-main.451/)。 ### 轻量级RESTful API 仅数KB,适合敏捷开发、移动APP等场景。简单易用,无需GPU配环境,秒速安装。语料更多、模型更大、精度更高,**强烈推荐**。服务器GPU算力有限,匿名用户配额较少,[建议申请**免费公益**API密钥`auth`](https://bbs.hanlp.com/t/hanlp2-1-restful-api/53)。 #### Python ```shell pip install hanlp_restful ``` 创建客户端,填入服务器地址和密钥: ```python from hanlp_restful import HanLPClient HanLP = HanLPClient('https://www.hanlp.com/api', auth=None, language='zh') # auth不填则匿名,zh中文,mul多语种 ``` #### Golang 安装 `go get -u github.com/hankcs/gohanlp@main` ,创建客户端,填入服务器地址和密钥: ```go HanLP := hanlp.HanLPClient(hanlp.WithAuth(""),hanlp.WithLanguage("zh")) // auth不填则匿名,zh中文,mul多语种 ``` #### Java 在`pom.xml`中添加依赖: ```xml <dependency> <groupId>com.hankcs.hanlp.restful</groupId> <artifactId>hanlp-restful</artifactId> <version>0.0.12</version> </dependency> ``` 创建客户端,填入服务器地址和密钥: ```java HanLPClient HanLP = new HanLPClient("https://www.hanlp.com/api", null, "zh"); // auth不填则匿名,zh中文,mul多语种 ``` #### 快速上手 无论何种开发语言,调用`parse`接口,传入一篇文章,得到HanLP精准的分析结果。 ```java HanLP.parse("2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。阿婆主来到北京立方庭参观自然语义科技公司。") ``` 更多功能包括语义相似度、风格转换、指代消解等,请参考[文档](https://hanlp.hankcs.com/docs/api/restful.html)和[测试用例](https://github.com/hankcs/HanLP/blob/master/plugins/hanlp_restful/tests/test_client.py)。 ### 海量级native API 依赖PyTorch、TensorFlow等深度学习技术,适合**专业**NLP工程师、研究者以及本地海量数据场景。要求Python 3.6至3.10,支持Windows,推荐*nix。可以在CPU上运行,推荐GPU/TPU。安装PyTorch版: ```bash pip install hanlp ``` - HanLP每次发布都通过了Linux、macOS和Windows上Python3.6至3.10的[单元测试](https://github.com/hankcs/HanLP/actions?query=branch%3Amaster),不存在安装问题。 HanLP发布的模型分为多任务和单任务两种,多任务速度快省显存,单任务精度高更灵活。 #### 多任务模型 HanLP的工作流程为加载模型然后将其当作函数调用,例如下列联合多任务模型: ```python import hanlp
HanLP = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH) # 世界最大中文语料库 HanLP(['2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', '阿婆主来到北京立方庭参观自然语义科技公司。']) ``` Native API的输入单位为句子,需使用[多语种分句模型](https://github.com/hankcs/HanLP/blob/master/plugins/hanlp_demo/hanlp_demo/sent_split.py)或[基于规则的分句函数](https://github.com/hankcs/HanLP/blob/master/hanlp/utils/rules.py#L19)先行分句。RESTful和native两种API的语义设计完全一致,用户可以无缝互换。简洁的接口也支持灵活的参数,常用的技巧有: - 灵活的`tasks`任务调度,任务越少,速度越快,详见[教程](https://mybinder.org/v2/gh/hankcs/HanLP/doc-zh?filepath=plugins%2Fhanlp_demo%2Fhanlp_demo%2Fzh%2Ftutorial.ipynb)。在内存有限的场景下,用户还可以[删除不需要的任务](https://github.com/hankcs/HanLP/blob/master/plugins/hanlp_demo/hanlp_demo/zh/demo_del_tasks.py)达到模型瘦身的效果。 - 高效的trie树自定义词典,以及强制、合并、校正3种规则,请参考[demo](https://github.com/hankcs/HanLP/blob/doc-zh/plugins/hanlp_demo/hanlp_demo/zh/tok_mtl.ipynb)和[文档](https://hanlp.hankcs.com/docs/api/hanlp/components/tokenizers/transformer.html)。规则系统的效果将无缝应用到后续统计模型,从而快速适应新领域。 #### 单任务模型 根据我们的[最新研究](https://aclanthology.org/2021.emnlp-main.451),多任务学习的优势在于速度和显存,然而精度往往不如单任务模型。所以,HanLP预训练了许多单任务模型并设计了优雅的[流水线模式](https://hanlp.hankcs.com/docs/api/hanlp/components/pipeline.html#hanlp.components.pipeline.Pipeline)将其组装起来。 ```python import hanlp HanLP = hanlp.pipeline() \ .append(hanlp.utils.rules.split_sentence, output_key='sentences') \ .append(hanlp.load('FINE_ELECTRA_SMALL_ZH'), output_key='tok') \ .append(hanlp.load('CTB9_POS_ELECTRA_SMALL'), output_key='pos') \ .append(hanlp.load('MSRA_NER_ELECTRA_SMALL_ZH'), output_key='ner', input_key='tok') \ .append(hanlp.load('CTB9_DEP_ELECTRA_SMALL', conll=0), output_key='dep', input_key='tok')\ .append(hanlp.load('CTB9_CON_ELECTRA_SMALL'), output_key='con', input_key='tok') HanLP('2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。阿婆主来到北京立方庭参观自然语义科技公司。') ``` 更多功能,请参考[demo](https://github.com/hankcs/HanLP/tree/doc-zh/plugins/hanlp_demo/hanlp_demo/zh)和[文档](https://hanlp.hankcs.com/docs/api/hanlp/pretrained/index.html)了解更多模型与用法。 ### 输出格式 
无论何种API何种开发语言何种自然语言,HanLP的输出统一为`json`格式兼容`dict`的[`Document`](https://hanlp.hankcs.com/docs/api/common/document.html): ```json { "tok/fine": [ ["2021年", "HanLPv2.1", "为", "生产", "环境", "带来", "次", "世代", "最", "先进", "的", "多", "语种", "NLP", "技术", "。"], ["阿婆主", "来到", "北京", "立方庭", "参观", "自然", "语义", "科技", "公司", "。"] ], "tok/coarse": [ ["2021年", "HanLPv2.1", "为", "生产", "环境", "带来", "次世代", "最", "先进", "的", "多语种", "NLP", "技术", "。"], ["阿婆主", "来到", "北京立方庭", "参观", "自然语义科技公司", "。"] ], "pos/ctb": [ ["NT", "NR", "P", "NN", "NN", "VV", "JJ", "NN", "AD", "JJ", "DEG", "CD", "NN", "NR", "NN", "PU"], ["NN", "VV", "NR", "NR", "VV", "NN", "NN", "NN", "NN", "PU"] ], "pos/pku": [ ["t", "nx", "p", "vn", "n", "v", "b", "n", "d", "a", "u", "a", "n", "nx", "n", "w"], ["n", "v", "ns", "ns", "v", "n", "n", "n", "n", "w"] ], "pos/863": [ ["nt", "w", "p", "v", "n", "v", "a", "nt", "d", "a", "u", "a", "n", "ws", "n", "w"], ["n", "v", "ns", "n", "v", "n", "n", "n", "n", "w"] ], "ner/pku": [ [], [["北京立方庭", "ns", 2, 4], ["自然语义科技公司", "nt", 5, 9]] ], "ner/msra": [ [["2021年", "DATE", 0, 1], ["HanLPv2.1", "ORGANIZATION", 1, 2]], [["北京", "LOCATION", 2, 3], ["立方庭", "LOCATION", 3, 4], ["自然语义科技公司", "ORGANIZATION", 5, 9]] ], "ner/ontonotes": [ [["2021年", "DATE", 0, 1], ["HanLPv2.1", "ORG", 1, 2]], [["北京立方庭", "FAC", 2, 4], ["自然语义科技公司", "ORG", 5, 9]] ], "srl": [ [[["2021年", "ARGM-TMP", 0, 1], ["HanLPv2.1", "ARG0", 1, 2], ["为生产环境", "ARG2", 2, 5], ["带来", "PRED", 5, 6], ["次世代最先进的多语种NLP技术", "ARG1", 6, 15]], [["最", "ARGM-ADV", 8, 9], ["先进", "PRED", 9, 10], ["技术", "ARG0", 14, 15]]], [[["阿婆主", "ARG0", 0, 1], ["来到", "PRED", 1, 2], ["北京立方庭", "ARG1", 2, 4]], [["阿婆主", "ARG0", 0, 1], ["参观", "PRED", 4, 5], ["自然语义科技公司", "ARG1", 5, 9]]] ], "dep": [ [[6, "tmod"], [6, "nsubj"], [6, "prep"], [5, "nn"], [3, "pobj"], [0, "root"], [8, "amod"], [15, "nn"], [10, "advmod"], [15, "rcmod"], [10, "assm"], [13, "nummod"], [15, "nn"], [15, "nn"], [6, "dobj"], [6, "punct"]], [[2, "nsubj"], [0, "root"], [4, "nn"], [2, "dobj"], [2, "conj"], [9, 
"nn"], [9, "nn"], [9, "nn"], [5, "dobj"], [2, "punct"]] ], "sdp": [ [[[6, "Time"]], [[6, "Exp"]], [[5, "mPrep"]], [[5, "Desc"]], [[6, "Datv"]], [[13, "dDesc"]], [[0, "Root"], [8, "Desc"], [13, "Desc"]], [[15, "Time"]], [[10, "mDegr"]], [[15, "Desc"]], [[10, "mAux"]], [[8, "Quan"], [13, "Quan"]], [[15, "Desc"]], [[15, "Nmod"]], [[6, "Pat"]], [[6, "mPunc"]]], [[[2, "Agt"], [5, "Agt"]], [[0, "Root"]], [[4, "Loc"]], [[2, "Lfin"]], [[2, "ePurp"]], [[8, "Nmod"]], [[9, "Nmod"]], [[9, "Nmod"]], [[5, "Datv"]], [[5, "mPunc"]]] ], "con": [ ["TOP", [["IP", [["NP", [["NT", ["2021年"]]]], ["NP", [["NR", ["HanLPv2.1"]]]], ["VP", [["PP", [["P", ["为"]], ["NP", [["NN", ["生产"]], ["NN", ["环境"]]]]]], ["VP", [["VV", ["带来"]], ["NP", [["ADJP", [["NP", [["ADJP", [["JJ", ["次"]]]], ["NP", [["NN", ["世代"]]]]]], ["ADVP", [["AD", ["最"]]]], ["VP", [["JJ", ["先进"]]]]]], ["DEG", ["的"]], ["NP", [["QP", [["CD", ["多"]]]], ["NP", [["NN", ["语种"]]]]]], ["NP", [["NR", ["NLP"]], ["NN", ["技术"]]]]]]]]]], ["PU", ["。"]]]]]], ["TOP", [["IP", [["NP", [["NN", ["阿婆主"]]]], ["VP", [["VP", [["VV", ["来到"]], ["NP", [["NR", ["北京"]], ["NR", ["立方庭"]]]]]], ["VP", [["VV", ["参观"]], ["NP", [["NN", ["自然"]], ["NN", ["语义"]], ["NN", ["科技"]], ["NN", ["公司"]]]]]]]], ["PU", ["。"]]]]]] ] } ``` 特别地,Python RESTful和native API支持基于等宽字体的[可视化](https://hanlp.hankcs.com/docs/tutorial.html#visualization),能够直接将语言学结构在控制台内可视化出来: ```python HanLP(['2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', '阿婆主来到北京立方庭参观自然语义科技公司。']).pretty_print() Dep Tree Token Relati PoS Tok NER Type Tok SRL PA1 Tok SRL PA2 Tok PoS 3 4 5 6 7 8 9 ──────────── ───────── ────── ─── ───────── ──────────────── ───────── ──────────── ───────── ──────────── ───────── ───────────────────────────────────────────────────────── ┌─────────► 2021年 tmod NT 2021年 ───►DATE 2021年 ───►ARGM-TMP 2021年 2021年 NT ───────────────────────────────────────────►NP ───┐ │┌────────► HanLPv2.1 nsubj NR HanLPv2.1 ───►ORGANIZATION HanLPv2.1 ───►ARG0 HanLPv2.1 HanLPv2.1 NR 
───────────────────────────────────────────►NP────┤ ││┌─►┌───── 为 prep P 为 为 ◄─┐ 为 为 P ───────────┐ │ │││ │ ┌─► 生产 nn NN 生产 生产 ├►ARG2 生产 生产 NN ──┐ ├────────────────────────►PP ───┐ │ │││ └─►└── 环境 pobj NN 环境 环境 ◄─┘ 环境 环境 NN ──┴►NP ───┘ │ │ ┌┼┴┴──────── 带来 root VV 带来 带来 ╟──►PRED 带来 带来 VV ──────────────────────────────────┐ │ │ ││ ┌─► 次 amod JJ 次 次 ◄─┐ 次 次 JJ ───►ADJP──┐ │ ├►VP────┤ ││ ┌───►└── 世代 nn NN 世代 世代 │ 世代 世代 NN ───►NP ───┴►NP ───┐ │ │ │ ││ │ ┌─► 最 advmod AD 最 最 │ 最 ───►ARGM-ADV 最 AD ───────────►ADVP──┼►ADJP──┐ ├►VP ───┘ ├►IP ││ │┌──►├── 先进 rcmod JJ 先进 先进 │ 先进 ╟──►PRED 先进 JJ ───────────►VP ───┘ │ │ │ ││ ││ └─► 的 assm DEG 的 的 ├►ARG1 的 的 DEG──────────────────────────┤ │ │ ││ ││ ┌─► 多 nummod CD 多 多 │ 多 多 CD ───►QP ───┐ ├►NP ───┘ │ ││ ││┌─►└── 语种 nn NN 语种 语种 │ 语种 语种 NN ───►NP ───┴────────►NP────┤ │ ││ │││ ┌─► NLP nn NR NLP NLP │ NLP NLP NR ──┐ │ │ │└─►└┴┴──┴── 技术 dobj NN 技术 技术 ◄─┘ 技术 ───►ARG0 技术 NN ──┴────────────────►NP ───┘ │ └──────────► 。 punct PU 。 。 。 。 PU ──────────────────────────────────────────────────┘ Dep Tree Tok Relat Po Tok NER Type Tok SRL PA1 Tok SRL PA2 Tok Po 3 4 5 6 ──────────── ─── ───── ── ─── ──────────────── ─── ──────── ─── ──────── ─── ──────────────────────────────── ┌─► 阿婆主 nsubj NN 阿婆主 阿婆主 ───►ARG0 阿婆主 ───►ARG0 阿婆主 NN───────────────────►NP ───┐ ┌┬────┬──┴── 来到 root VV 来到 来到 ╟──►PRED 来到 来到 VV──────────┐ │ ││ │ ┌─► 北京 nn NR 北京 ───►LOCATION 北京 ◄─┐ 北京 北京 NR──┐ ├►VP ───┐ │ ││ └─►└── 立方庭 dobj NR 立方庭 ───►LOCATION 立方庭 ◄─┴►ARG1 立方庭 立方庭 NR──┴►NP ───┘ │ │ │└─►┌─────── 参观 conj VV 参观 参观 参观 ╟──►PRED 参观 VV──────────┐ ├►VP────┤ │ │ ┌───► 自然 nn NN 自然 ◄─┐ 自然 自然 ◄─┐ 自然 NN──┐ │ │ ├►IP │ │ │┌──► 语义 nn NN 语义 │ 语义 语义 │ 语义 NN │ ├►VP ───┘ │ │ │ ││┌─► 科技 nn NN 科技 ├►ORGANIZATION 科技 科技 ├►ARG1 科技 NN ├►NP ───┘ │ │ └─►└┴┴── 公司 dobj NN 公司 ◄─┘ 公司 公司 ◄─┘ 公司 NN──┘ │ └──────────► 。 punct PU 。 。 。 。 PU──────────────────────────┘ ``` 
关于标注集含义,请参考[《语言学标注规范》](https://hanlp.hankcs.com/docs/annotations/index.html)及[《格式规范》](https://hanlp.hankcs.com/docs/data_format.html)。我们购买、标注或采用了世界上量级最大、种类最多的语料库用于联合多语种多任务学习,所以HanLP的标注集也是覆盖面最广的。 ## 训练你自己的领域模型 写深度学习模型一点都不难,难的是复现较高的准确率。下列[代码](https://github.com/hankcs/HanLP/blob/master/plugins/hanlp_demo/hanlp_demo/zh/train_sota_bert_pku.py)展示了如何在sighan2005 PKU语料库上花6分钟训练一个超越学术界state-of-the-art的中文分词模型。 ```python tokenizer = TransformerTaggingTokenizer() save_dir = 'data/model/cws/sighan2005_pku_bert_base_96.73' tokenizer.fit( SIGHAN2005_PKU_TRAIN_ALL, SIGHAN2005_PKU_TEST, # Conventionally, no devset is used. See Tian et al. (2020). save_dir, 'bert-base-chinese', max_seq_len=300, char_level=True, hard_constraint=True, sampler_builder=SortingSamplerBuilder(batch_size=32), epochs=3, adam_epsilon=1e-6, warmup_steps=0.1, weight_decay=0.01, word_dropout=0.1, seed=1660853059, ) tokenizer.evaluate(SIGHAN2005_PKU_TEST, save_dir) ``` 其中,由于指定了随机数种子,结果一定是`96.73`。不同于那些虚假宣传的学术论文或商业项目,HanLP保证所有结果可复现。如果你有任何质疑,我们将当作最高优先级的致命性bug第一时间排查问题。 请参考[demo](https://github.com/hankcs/HanLP/tree/master/plugins/hanlp_demo/hanlp_demo/zh/train)了解更多训练脚本。 ## 性能
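<table>
<thead>
<tr><th rowspan="2">lang</th><th rowspan="2">corpora</th><th rowspan="2">model</th><th colspan="2">tok</th><th colspan="4">pos</th><th colspan="3">ner</th><th rowspan="2">dep</th><th rowspan="2">con</th><th rowspan="2">srl</th><th colspan="4">sdp</th><th rowspan="2">lem</th><th rowspan="2">fea</th><th rowspan="2">amr</th></tr>
<tr><th>fine</th><th>coarse</th><th>ctb</th><th>pku</th><th>863</th><th>ud</th><th>pku</th><th>msra</th><th>ontonotes</th><th>SemEval16</th><th>DM</th><th>PAS</th><th>PSD</th></tr>
</thead>
<tbody>
<tr><td rowspan="2">mul</td><td rowspan="2">UD2.7<br>OntoNotes5</td><td>small</td><td>98.62</td><td>-</td><td>-</td><td>-</td><td>-</td><td>93.23</td><td>-</td><td>-</td><td>74.42</td><td>79.10</td><td>76.85</td><td>70.63</td><td>-</td><td>91.19</td><td>93.67</td><td>85.34</td><td>87.71</td><td>84.51</td><td>-</td></tr>
<tr><td>base</td><td>98.97</td><td>-</td><td>-</td><td>-</td><td>-</td><td>90.32</td><td>-</td><td>-</td><td>80.32</td><td>78.74</td><td>71.23</td><td>73.63</td><td>-</td><td>92.60</td><td>96.04</td><td>81.19</td><td>85.08</td><td>82.13</td><td>-</td></tr>
<tr><td rowspan="5">zh</td><td rowspan="2">open</td><td>small</td><td>97.25</td><td>-</td><td>96.66</td><td>-</td><td>-</td><td>-</td><td>-</td><td>-</td><td>95.00</td><td>84.57</td><td>87.62</td><td>73.40</td><td>84.57</td><td>-</td><td>-</td><td>-</td><td>-</td><td>-</td><td>-</td></tr>
<tr><td>base</td><td>97.50</td><td>-</td><td>97.07</td><td>-</td><td>-</td><td>-</td><td>-</td><td>-</td><td>96.04</td><td>87.11</td><td>89.84</td><td>77.78</td><td>87.11</td><td>-</td><td>-</td><td>-</td><td>-</td><td>-</td><td>-</td></tr>
<tr><td rowspan="3">close</td><td>small</td><td>96.70</td><td>95.93</td><td>96.87</td><td>97.56</td><td>95.05</td><td>-</td><td>96.22</td><td>95.74</td><td>76.79</td><td>84.44</td><td>88.13</td><td>75.81</td><td>74.28</td><td>-</td><td>-</td><td>-</td><td>-</td><td>-</td><td>-</td></tr>
<tr><td>base</td><td>97.52</td><td>96.44</td><td>96.99</td><td>97.59</td><td>95.29</td><td>-</td><td>96.48</td><td>95.72</td><td>77.77</td><td>85.29</td><td>88.57</td><td>76.52</td><td>73.76</td><td>-</td><td>-</td><td>-</td><td>-</td><td>-</td><td>-</td></tr>
<tr><td>ernie</td><td>96.95</td><td>97.29</td><td>96.76</td><td>97.64</td><td>95.22</td><td>-</td><td>97.31</td><td>96.47</td><td>77.95</td><td>85.67</td><td>89.17</td><td>78.51</td><td>74.10</td><td>-</td><td>-</td><td>-</td><td>-</td><td>-</td><td>-</td></tr>
</tbody>
</table>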
- 根据我们的[最新研究](https://aclanthology.org/2021.emnlp-main.451),单任务学习的性能往往优于多任务学习。在乎精度甚于速度的话,建议使用[单任务模型](https://hanlp.hankcs.com/docs/api/hanlp/pretrained/index.html)。 HanLP采用的数据预处理与拆分比例与流行方法未必相同,比如HanLP采用了[完整版的MSRA命名实体识别语料](https://bbs.hankcs.com/t/topic/3033),而非大众使用的阉割版;HanLP使用了语法覆盖更广的[Stanford Dependencies标准](https://hanlp.hankcs.com/docs/annotations/dep/sd_zh.html),而非学术界沿用的Zhang and Clark (2008)标准;HanLP提出了[均匀分割CTB的方法](https://bbs.hankcs.com/t/topic/3024),而不采用学术界不均匀且遗漏了51个黄金文件的方法。HanLP开源了[一整套语料预处理脚本与相应语料库](https://github.com/hankcs/HanLP/blob/master/plugins/hanlp_demo/hanlp_demo/zh/train/open_small.py),力图推动中文NLP的透明化。 总之,HanLP只做我们认为正确、先进的事情,而不一定是流行、权威的事情。 ## 引用 如果你在研究中使用了HanLP,请按如下格式引用: ```bibtex @inproceedings{he-choi-2021-stem, title = "The Stem Cell Hypothesis: Dilemma behind Multi-Task Learning with Transformer Encoders", author = "He, Han and Choi, Jinho D.", booktitle = "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing", month = nov, year = "2021", address = "Online and Punta Cana, Dominican Republic", publisher = "Association for Computational Linguistics", url = "https://aclanthology.org/2021.emnlp-main.451", pages = "5555--5577", abstract = "Multi-task learning with transformer encoders (MTL) has emerged as a powerful technique to improve performance on closely-related tasks for both accuracy and efficiency while a question still remains whether or not it would perform as well on tasks that are distinct in nature. We first present MTL results on five NLP tasks, POS, NER, DEP, CON, and SRL, and depict its deficiency over single-task learning. We then conduct an extensive pruning analysis to show that a certain set of attention heads get claimed by most tasks during MTL, who interfere with one another to fine-tune those heads for their own objectives. 
Based on this finding, we propose the Stem Cell Hypothesis to reveal the existence of attention heads naturally talented for many tasks that cannot be jointly trained to create adequate embeddings for all of those tasks. Finally, we design novel parameter-free probes to justify our hypothesis and demonstrate how attention heads are transformed across the five tasks during MTL through label analysis.", } ``` ## License ### 源代码 HanLP源代码的授权协议为 **Apache License 2.0**,可免费用作商业用途。请在产品说明中附加HanLP的链接和授权协议。HanLP受版权法保护,侵权必究。 ##### 自然语义(青岛)科技有限公司 HanLP从v1.7版起独立运作,由自然语义(青岛)科技有限公司作为项目主体,主导后续版本的开发,并拥有后续版本的版权。 ##### 上海林原公司 HanLP 早期得到了上海林原公司的大力支持,该公司拥有1.28及前序版本的版权,相关版本也曾在上海林原公司网站发布。 ### 预训练模型 机器学习模型的授权在法律上没有定论,但本着尊重开源语料库原始授权的精神,如不特别说明,HanLP的多语种模型授权沿用[CC BY-NC-SA 4.0](https://creativecommons.org/licenses/by-nc-sa/4.0/),中文模型授权为仅供研究与教学使用。 ## References https://hanlp.hankcs.com/docs/references.html ================================================ FILE: docs/Makefile ================================================ # Minimal makefile for Sphinx documentation # # You can set these variables from the command line, and also # from the environment for the first two. SPHINXOPTS ?= SPHINXBUILD ?= sphinx-build SOURCEDIR = . BUILDDIR = _build # Put it first so that "make" without argument is like "make help". help: @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) .PHONY: help Makefile # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) ================================================ FILE: docs/annotations/constituency/ctb.md ================================================ # Chinese Tree Bank See also [The Bracketing Guidelines for the Penn Chinese Treebank (3.0)](https://repository.upenn.edu/cgi/viewcontent.cgi?article=1040&context=ircs_reports).
| Tag | Definition | 定义 | 例子 | |------|----------------------------------------------|----------------------------------------------------|-------------------| | ADJP | adjective phrase | 形容词短语,以形容词为中心词 | 不完全、大型 | | ADVP | adverbial phrase headed by AD (adverb) | 副词短语,以副词为中心词 | 非常、很 | | CLP | classifier phrase | 由量词构成的短语 | 系列、大批 | | CP | clause headed by C (complementizer) | 从句,通常带补语(如“的”、“吗”等) | 张三喜欢李四吗? | | DNP | phrase formed by ‘‘XP + DEG’’ | 结构为XP + DEG(的)的短语,其中XP可以是ADJP、DP、QP、PP等等,用于修饰名词短语。 | 大型的、前几年的、五年的、在上海的 | | DP | determiner phrase | 限定词短语,通常由限定词和数量词构成 | 这三个、任何 | | DVP | phrase formed by ‘‘XP + DEV’’ | 结构为XP+地的短语,用于修饰动词短语VP | 心情失落地、大批地 | | FRAG | fragment | 片段 | (完) | | INTJ | interjection | 插话,感叹语 | 哈哈、切 | | IP | simple clause headed by I (INFL) | 简单子句或句子,通常不带补语(如“的”、“吗”等) | 张三喜欢李四。 | | LCP | phrase formed by ‘‘XP + LC’’ | 用于表示地点+方位词(LC)的短语 | 生活中、田野上 | | LST | list marker | 列表短语,包括标点符号 | 一. | | MSP | some particles | 其他小品词 | 所、而、来、去 | | NN | common noun | 名词 | HanLP、技术 | | NP | noun phrase | 名词短语,中心词通常为名词 | 美好生活、经济水平 | | PP | preposition phrase | 介词短语,中心词通常为介词 | 在北京、据报道 | | PRN | parenthetical | 插入语 | ,(张三说), | | QP | quantifier phrase | 量词短语 | 三个、五百辆 | | TOP | root node | 根节点 | 根节点 | | UCP | unidentical coordination phrase | 不对称的并列短语,指并列词两侧的短语类型不一致 | (养老、医疗)保险 | | VCD | coordinated verb compound | 复合动词 | 出版发行 | | VCP | verb compounds formed by VV + VC | VV + VC形式的动词短语 | 看作是 | | VNV | verb compounds formed by A-not-A or A-one-A | V不V形式的动词短语 | 能不能、信不信 | | VP | verb phrase | 动词短语,中心词通常为动词 | 完成任务、努力工作 | | VPT | potential form V-de-R or V-bu-R | V不R、V得R形式的动词短语 | 打不赢、打得过 | | VRD | verb resultative compound | 动补结构短语 | 研制成功、降下来 | | VSB | verb compounds formed by a modifier + a head | 修饰语+中心词构成的动词短语 | 拿来支付、仰头望去 | ================================================ FILE: docs/annotations/constituency/index.md ================================================ # Constituency Parsing ## Chinese ```{toctree} ctb ``` ## English ```{toctree} ptb ``` ## Japanese
```{toctree} npcmj ``` ================================================ FILE: docs/annotations/constituency/npcmj.md ================================================ # NPCMJ | Tag | Description | |-----------------|-----------------------------------------| | ADVP | adverb phrase | | ADVP-CMPL | complement adverb phrase | | ADVP-MSR | measurement adverb phrase | | ADVP-PRD | predicate adverb phrase | | ADVP-TMP | temporal adverb phrase | | CONJP | conjunction phrase | | CP-EXL | exclamative | | CP-IMP | imperative | | CP-FINAL | projection for sentence final particle | | CP-QUE | question (direct or indirect) | | CP-QUE-ADV | question used adverbially | | CP-QUE-OB1 | question used as object | | CP-QUE-PRD | question used as a nominal predicate | | CP-QUE-SBJ | question used as subject | | CP-THT | complementizer clause | | CP-THT-ADV | complementizer clause used adverbially | | CP-THT-OB1 | complementizer clause used as object | | CP-THT-PRD | complementizer clause used as predicate | | CP-THT-PRP | purposive complementizer clause | | CP-THT-SBJ | complementizer clause used as subject | | FRAG | fragment | | FS | false start | | INTJP | interjection phrase | | IP-ADV | adverbial clause | | IP-ADV-CONJ | coordinated clause | | IP-ADV-PRD | adverbial clause used as predicate | | IP-ADV-SCON | subordinate clause | | IP-ADV-SCON-CND | conditional clause | | IP-EMB | gapless noun-modifying clause | | IP-IMP | imperative clause | | IP-MAT | matrix clause | | IP-NMZ | nominalized clause | | IP-NMZ-PRD | nominalized clause used as predicate | | IP-REL | relative clause | | IP-SMC | small clause | | IP-SMC-CNT | small clause in continuative form | | IP-SMC-OB1 | small clause used as object | | IP-SMC-SBJ | small clause used as subject | | IP-SUB | clause under CP* layer | | multi-sentence | multiple sentence | | NML | intermediate nominal layer | | NP | noun phrase | | NP-ADV | adverbial noun phrase | | NP-CZZ | causee noun phrase | | NP-DOB1 | derived primary object
noun phrase | | NP-DSBJ | derived subject noun phrase | | NP-LGS | logical subject noun phrase | | NP-LOC | locational noun phrase | | NP-MSR | measure noun phrase | | NP-OB1 | primary object noun phrase | | NP-OB2 | secondary object noun phrase | | NP-POS | possessive noun phrase | | NP-PRD | predicate noun phrase | | NP-SBJ | subject noun phrase | | NP-SBJ2 | secondary subject noun phrase | | NP-TMP | temporal noun phrase | | NP-TPC | topic noun phrase | | NP-VOC | vocative noun phrase | | NUMCLP | numeral-classifier phrase | | PNLP | prenominal phrase | | PP | particle phrase | | PP-ADV | adverbial particle phrase | | PP-CMPL | complement particle phrase | | PP-CONJ | coordination particle phrase | | PP-CZZ | causee particle phrase | | PP-DOB1 | derived primary object particle phrase | | PP-DSBJ | derived subject particle phrase | | PP-LGS | logical subject particle phrase | | PP-LOC | locational particle phrase | | PP-MSR | measure particle phrase | | PP-OB1 | primary object particle phrase | | PP-OB2 | secondary object particle phrase | | PP-PRD | predicate particle phrase | | PP-PRP | purposive particle phrase | | PP-SBJ | subject particle phrase | | PP-SBJ2 | secondary subject particle phrase | | PP-SCON | subordination particle phrase | | PP-SCON-CND | conditional particle phrase | | PP-TMP | temporal particle phrase | | PP-TPC | topic particle phrase | | PP-VOC | vocative particle phrase | | PRN | parenthetical | ================================================ FILE: docs/annotations/constituency/ptb.md ================================================ # Penn Treebank | Tag | Description | |--------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | ADJP | Adjective Phrase. | | ADVP | Adverb Phrase. | | CONJP | Conjunction Phrase. | | FRAG | Fragment. | | INTJ | Interjection.
Corresponds approximately to the part-of-speech tag UH. | | LST | List marker. Includes surrounding punctuation. | | NAC | Not a Constituent; used to show the scope of certain prenominal modifiers within an NP. | | NP | Noun Phrase. | | NX | Used within certain complex NPs to mark the head of the NP. Corresponds very roughly to N-bar level but used quite differently. | | PP | Prepositional Phrase. | | PRN | Parenthetical | | PRT | Particle. Category for words that should be tagged RP. | | QP | Quantifier Phrase (i.e. complex measure/amount phrase); used within NP. | | ROOT | No description | | RRC | Reduced Relative Clause. | | S | Simple declarative clause, i.e. one that is not introduced by a (possibly empty) subordinating conjunction or a wh-word and that does not exhibit subject-verb inversion. | | SBAR | Clause introduced by a (possibly empty) subordinating conjunction. | | SBARQ | Direct question introduced by a wh-word or a wh-phrase. Indirect questions and relative clauses should be bracketed as SBAR, not SBARQ. | | SINV | Inverted declarative sentence, i.e. one in which the subject follows the tensed verb or modal. | | SQ | Inverted yes/no question, or main clause of a wh-question, following the wh-phrase in SBARQ. | | UCP | Unlike Coordinated Phrase. | | VP | Verb Phrase. | | WHADJP | Wh-adjective Phrase. Adjectival phrase containing a wh-adverb, as in how hot. | | WHADVP | Wh-adverb Phrase. Introduces a clause with an NP gap. May be null (containing the 0 complementizer) or lexical, containing a wh-adverb such as how or why. | | WHNP | Wh-noun Phrase. Introduces a clause with an NP gap. May be null (containing the 0 complementizer) or lexical, containing some wh-word, e.g. who, which book, whose daughter, none of which, or how many leopards. | | WHPP | Wh-prepositional Phrase. Prepositional phrase containing a wh-noun phrase (such as of which or by whose authority) that either introduces a PP gap or is contained by a WHNP. | | X | Unknown, uncertain, or unbracketable.
X is often used for bracketing typos and in bracketing the…​the-constructions. | ================================================ FILE: docs/annotations/dep/index.md ================================================ # Dependency Parsing ## Chinese ```{toctree} sd_zh pmt ``` ## English ```{toctree} sd_en ``` ## Multilingual ```{toctree} ud ``` ================================================ FILE: docs/annotations/dep/pmt.md ================================================ # PKU Multi-view Chinese Treebank ```{eval-rst} See also :cite:`qiu-etal-2014-multi`. ``` | Tag | Description | 依存关系 | | ---- | ------------------------------------------- | -------------- | | ACT | action object | 行为宾语 | | ADV | adverbial | 状语 | | APP | appositive element | 同位 | | ATT | attribute | 定语 | | CMP | complement | 补语 | | COO | other coordination element | 一般并列 | | COS | share-right-child coordination element | 共享并列 | | DE | de (modifier of 的(special function word)) | 的字 | | DEI | dei (modifier of 得(special function word)) | 得字 | | DI | di (modifier of 地(special function word)) | 地字 | | FOC | focus | 强调 | | HED | root of a sentence | 核心 | | IC | independent clause | 小句 | | IOB | indirect object | 间接宾语 | | IS | independent structure | 独立结构 | | ISC | non-shared independent structure | 并列式独立结构 | | LAD | left additive | 前附加 | | MT | modality and time | 时体 | | NUM | number | 数字 | | POB | prepositional object | 介宾 | | PUN | punctuation | 标点 | | PUS | cross-clause punctuation | 跨句标点 | | QUC | post-positional quantity | 数量补语 | | QUCC | non-shared post-positional quantity | 非共享数量补语 | | QUN | quantity | 数量 | | RAD | right additive | 后附加 | | RADC | non-shared right additive | 非共享后附加 | | RED | reduplicate element | 重叠 | | SBV | subject | 主语 | | TPC | topic | 话题 | | VOB | direct object | 宾语 | | VV | serial verb construction | 连动 | ================================================ FILE: docs/annotations/dep/sd_en.md ================================================ # Stanford Dependencies English See
also [Stanford typed dependencies manual](https://nlp.stanford.edu/software/dependencies_manual.pdf). | Tag | Description | |------------|-----------------------------------| | abbrev | abbreviation modifier | | acomp | adjectival complement | | advcl | adverbial clause modifier | | advmod | adverbial modifier | | agent | agent | | amod | adjectival modifier | | appos | appositional modifier | | arg | argument | | attr | attributive | | aux | auxiliary | | auxpass | passive auxiliary | | cc | coordination | | ccomp | clausal complement | | comp | complement | | complm | complementizer | | conj | conjunct | | cop | copula | | csubj | clausal subject | | csubjpass | clausal passive subject | | dep | dependent | | det | determiner | | discourse | discourse element | | dobj | direct object | | expl | expletive | | goeswith | goes with | | iobj | indirect object | | mark | marker | | mod | modifier | | mwe | multi-word expression | | neg | negation modifier | | nn | noun compound modifier | | npadvmod | noun phrase as adverbial modifier | | nsubj | nominal subject | | nsubjpass | passive nominal subject | | num | numeric modifier | | number | element of compound number | | obj | object | | parataxis | parataxis | | pcomp | prepositional complement | | pobj | object of a preposition | | poss | possession modifier | | possessive | possessive modifier | | preconj | preconjunct | | pred | predicate | | predet | predeterminer | | prep | prepositional modifier | | prepc | prepositional clausal modifier | | prt | phrasal verb particle | | punct | punctuation | | purpcl | purpose clause modifier | | quantmod | quantifier phrase modifier | | rcmod | relative clause modifier | | ref | referent | | rel | relative | | root | root | | sdep | semantic dependent | | subj | subject | | tmod | temporal modifier | | vmod | verb modifier | | xcomp | open clausal complement | | xsubj | controlling subject | ================================================ FILE: 
docs/annotations/dep/sd_zh.md ================================================ # Stanford Dependencies Chinese ```{eval-rst} See also :cite:`chang-etal-2009-discriminative`. ``` |Tag|Description|中文简称|例句|依存弧| | ---- | ---- | ---- | ---- | ---- | |nn|noun compound modifier|复合名词修饰|服务中心|nn(中心,服务)| |punct|punctuation|标点符号|海关统计表明,|punct(表明,,)| |nsubj|nominal subject|名词性主语|梅花盛开|nsubj (盛开,梅花)| |conj|conjunct (links two conjuncts)|连接性状语|设备和原材料|conj(原材料,设备)| |dobj|direct object|直接宾语|浦东颁布了七十一件文件|dobj(颁布,文件)| |advmod|adverbial modifier|副词性状语|部门先送上文件|advmod(送上,先)| |prep|prepositional modifier|介词性修饰语|在实践中逐步完善|prep(完善,在)| |nummod|number modifier|数词修饰语|七十一件文件|nummod(件,七十一)| |amod|adjectival modifier|形容词修饰语|跨世纪工程|amod(工程,跨世纪)| |pobj|prepositional object|介词性宾语|根据有关规定|pobj (根据,规定)| |rcmod|relative clause modifier|关系从句修饰语|不曾遇到过的情况|rcmod(情况,遇到)| |cpm|complementizer|补语|开发浦东的经济活动|cpm(开发,的)| |assm|associative marker|关联标记|企业的商品|assm(企业,的)| |assmod|associative modifier|关联修饰|企业的商品|assmod(商品,企业)| |cc|coordinating conjunction|并列关系|设备和原材料|cc(原材料,和)| |clf|classifier modifier|类别修饰|七十一件文件|clf(文件,件)| |ccomp|clausal complement|从句补充|银行决定先取得信用评级|ccomp(决定,取得)| |det|determiner|限定语|这些经济活动|det(活动,这些)| |lobj|localizer object|范围宾语|近年来|lobj(来,近年)| |range|dative object that is a quantifier phrase|数量词间接宾语|成交药品一亿多元|range(成交,元)| |asp|aspect marker|时态标记|发挥了作用|asp(发挥,了)| |tmod|temporal modifier|时间修饰语|以前不曾遇到过|tmod(遇到,以前)| |plmod|localizer modifier of a preposition|介词性地点修饰|在这片热土上|plmod(在,上)| |attr|attributive|属性|贸易额为二百亿美元|attr(为,美元)| |mmod|modal verb modifier|情态动词|利益能得到保障|mmod(得到,能)| |loc|localizer|位置补语|占九成以上|loc(占,以上)| |top|topic|主题|建筑是主要活动|top(是,建筑)| |pccomp|clausal complement of a preposition|介词补语|据有关部门介绍|pccomp(据,介绍)| |etc|etc modifier|省略关系|科技、文教等领域|etc(文教,等)| |lccomp|clausal complement of a localizer|位置补语|中国对外开放中升起的明星|lccomp(中,开放)| |ordmod|ordinal number modifier|量词修饰|第七个机构|ordmod(个,第七)| |xsubj|controlling subject|控制主语|银行决定先取得信用评级|xsubj (取得,银行)| |neg|negative modifier|否定修饰|以前不曾遇到过|neg(遇到,不)| |rcomp|resultative 
complement|结果补语|研究成功|rcomp(研究,成功)| |comod|coordinated verb compound modifier|并列联合动词|颁布实行|comod(颁布,实行)| |vmod|verb modifier|动词修饰|其在支持外商企业方面的作用|vmod(方面,支持)| |prtmod|particles such as 所,以,来,而|小品词|在产业化所取得的成就|prtmod(取得,所)| |ba|“ba” construction|把字关系|把注意力转向市场|ba(转向,把)| |dvpm|manner DE(地)modifier|地字修饰|有效地防止流失|dvpm(有效,地)| |dvpmod|a "XP+DEV" phrase that modifies VP|地字动词短语|有效地防止流失|dvpmod(防止,有效)| |prnmod|parenthetical modifier|插入词修饰|八五期间(1990-1995 )|prnmod(期间,1995)| |cop|copular|系动词|原是自给自足的经济|cop(自给自足,是)| |pass|passive marker|被动标记|被认定为高技术产业|pass(认定,被)| |nsubjpass|nominal passive subject|被动名词主语|镍被称作现代工业的维生素|nsubjpass(称作,镍)| |dep|dependent|其他依赖关系|新华社北京二月十二日电|dep(电,新华社)| ================================================ FILE: docs/annotations/dep/ud.md ================================================ # Universal Dependencies ## Cross-Linguistic See also [Universal Dependencies](https://universaldependencies.org/docs/u/dep/index.html). | Tag | Description | |------------|----------------------------------------------| | acl | clausal modifier of noun (adjectival clause) | | advcl | adverbial clause modifier | | advmod | adverbial modifier | | amod | adjectival modifier | | appos | appositional modifier | | aux | auxiliary | | auxpass | passive auxiliary | | case | case marking | | cc | coordinating conjunction | | ccomp | clausal complement | | compound | compound | | conj | conjunct | | cop | copula | | csubj | clausal subject | | csubjpass | clausal passive subject | | dep | unspecified dependency | | det | determiner | | discourse | discourse element | | dislocated | dislocated elements | | dobj | direct object | | expl | expletive | | foreign | foreign words | | goeswith | goes with | | iobj | indirect object | | list | list | | mark | marker | | mwe | multi-word expression | | name | name | | neg | negation modifier | | nmod | nominal modifier | | nsubj | nominal subject | | nsubjpass | passive nominal subject | | nummod | numeric modifier | | parataxis | parataxis | | punct
| punctuation | | remnant | remnant in ellipsis | | reparandum | overridden disfluency | | root | root | | vocative | vocative | | xcomp | open clausal complement | ## Localization ### Chinese | Tag | 简称 | 例句 | | :--------------- |---------:| -----------------------------------------------------------: | | acl | 形容词子句 | ![acl](https://file.hankcs.com/img/ud/1303b5cbe9413044cb800b3c3514b70b.svg) | | advcl:loc | 状语从句修饰语 | ![advcl:loc](https://file.hankcs.com/img/ud/e8865563caf0eda7a80043eda8cc43a6.svg) | | advmod | 状语 | ![advmod](https://file.hankcs.com/img/ud/3ce9276f4e18d92edb48e58956bbaee7.svg) | | advmod:dvp | 状语:地 | ![advmod:dvp](https://file.hankcs.com/img/ud/e90870682b9f0a80736d25977565f96a.svg) | | advmod:loc | 状语:限定 | ![advmod:loc](https://file.hankcs.com/img/ud/135e9143e73e5f45290d204d4ad5b30e.svg) | | advmod:rcomp | 状语:因果 | ![advmod:rcomp](https://file.hankcs.com/img/ud/aa75be342648bed0846f54a88f71e7a7.svg) | | amod | 形容 | ![amod](https://file.hankcs.com/img/ud/dee0097c244c1bd0a1d1ed117932346d.svg) | | amod:ordmod | 形容:数量 | ![amod:ordmod](https://file.hankcs.com/img/ud/8bb79245311a4190836dce8439591e91.svg) | | appos | 同位 | ![appos](https://file.hankcs.com/img/ud/a74f6a31f68ba5697d0a8906e8476b47.svg) | | aux:asp | 助语:时态 | ![aux:asp](https://file.hankcs.com/img/ud/8c32de9b4858c0e4d24ee6da5fb80a6e.svg) | | aux:ba | 助语:把 | ![aux:ba](https://file.hankcs.com/img/ud/2c712e3af49fcdbd5914398895904f3c.svg) | | aux:modal | 助语:情态 | ![aux:modal](https://file.hankcs.com/img/ud/606946c569e4bfbacbb1b9e13336e247.svg) | | aux:prtmod | 助语:分词 | ![aux:prtmod](https://file.hankcs.com/img/ud/fc49d338487dd63687941433a0633f5d.svg) | | auxpass | 被动 | ![auxpass](https://file.hankcs.com/img/ud/a6e4a8aabb7bb1bb5c4e9cdf7876e3f7.svg) | | case | 条件 | ![case](https://file.hankcs.com/img/ud/35a021e15a9355880cb8720ba34ed936.svg) | | cc | 并列连词 | ![cc](https://file.hankcs.com/img/ud/18c6a22520cec2ba60ce636bb410f651.svg) | | ccomp | 从句补语 | 
![ccomp](https://file.hankcs.com/img/ud/8cc4ea0c6a090f1ba03d02926240c35b.svg) | | compound:nn | 复合名词 | ![compound:nn](https://file.hankcs.com/img/ud/587e12141aa42aa9862ea0ac0eb30e09.svg) | | compound:vc | 复合动词 | ![compound:vc](https://file.hankcs.com/img/ud/f72cedcb6cec8563d88063b118544a9d.svg) | | conj | 连接 | ![conj](https://file.hankcs.com/img/ud/fc924f495d1d5a3a828a0e2262da06cd.svg) | | cop | 系动 | ![cop](https://file.hankcs.com/img/ud/a7da58f57adbe9e6bd166ecb514f2d1c.svg) | | csubj | 从句主语 | ![csubj](https://file.hankcs.com/img/ud/0adda481e81b3765ed7f4f9d55c153c4.svg) | | dep | 未定义 | ![dep](https://file.hankcs.com/img/ud/db15b792f1bfd5e42982832b04c65a79.svg) | | det | 限定 | ![det](https://file.hankcs.com/img/ud/17376d13a4e7b0677cd18d13e0990dab.svg) | | discourse | 语气 | ![discourse](https://file.hankcs.com/img/ud/d7eb37d5fd13462b237140a08f0ed9a4.svg) | | dobj | 直接宾语 | ![dobj](https://file.hankcs.com/img/ud/f5e801103ddc57a9aeff0e272b8f7b44.svg) | | etc | 省略 | ![etc](https://file.hankcs.com/img/ud/86d3fd24cae9f585b7730119edaa0248.svg) | | mark | 标记 | ![mark](https://file.hankcs.com/img/ud/b17b4027ab368c76a3b6f085d5b561d9.svg) | | mark:clf | 标记:量词 | ![mark:clf](https://file.hankcs.com/img/ud/5974c92e3587aa64ba1d572243b9c5cc.svg) | | name | 名称 | ![name](https://file.hankcs.com/img/ud/63ea082457dfe6f4fc04f635a8c019f3.svg) | | neg | 否定 | ![neg](https://file.hankcs.com/img/ud/e38814231ff9a31dcce5672556375c94.svg) | | nmod | 名词修饰 | ![nmod](https://file.hankcs.com/img/ud/e948a8dbcd43984d14c257f0ace1753d.svg) | | nmod:assmod | 名词修饰:关联 | ![nmod:assmod](https://file.hankcs.com/img/ud/76349f30cef2c4978a03118d65ac6c81.svg) | | nmod:poss | 名词修饰:所有格 | ![nmod:poss](https://file.hankcs.com/img/ud/5b4937dbea42cdff7054e9dd0904bedb.svg) | | nmod:prep | 名词修饰:介词 | ![nmod:prep](https://file.hankcs.com/img/ud/63b92981638b758681a82e9f4a9aa04c.svg) | | nmod:range | 名词修饰:范围 | ![nmod:range](https://file.hankcs.com/img/ud/217ec98756cfe3750c76f5e5e89b7f54.svg) | | nmod:tmod | 名词修饰:时间 | 
![nmod:tmod](https://file.hankcs.com/img/ud/166e3b8fb72db52f0ec332d444ea017f.svg) | | nmod:topic | 名词修饰:主题 | ![nmod:topic](https://file.hankcs.com/img/ud/93c83c98c188b131211ac5e9ff5242c0.svg) | | nsubj | 名词主语 | ![nsubj](https://file.hankcs.com/img/ud/63e3902d4a3045d1d696a0c4ed203563.svg) | | nsubj:xsubj | 名词主语: 补语 | ![nsubj:xsubj](https://file.hankcs.com/img/ud/80cb355b9f9732fd888186a1f658b0ac.svg) | | nsubjpass | 被动态主语 | ![nsubjpass](https://file.hankcs.com/img/ud/6327fab58ab42d5a417b2e5c7018ac3a.svg) | | nummod | 数量 | ![nummod](https://file.hankcs.com/img/ud/0fd20559645265c2c937f06631aa74df.svg) | | parataxis:prnmod | 并列 | ![parataxis:prnmod](https://file.hankcs.com/img/ud/783a0faf4cd935bb61f5d225a388b79e.svg) | | punct | 标点符号 | ![punct](https://file.hankcs.com/img/ud/983410055658352080ae476a5d85e6b5.svg) | | root | 根 | ![root](https://file.hankcs.com/img/ud/588101bec0440ffb769172f8b7e9f98e.svg) | | xcomp | 从句补语 | ![xcomp](https://file.hankcs.com/img/ud/c72071875f1c01e51acb9e1ec4893113.svg) | ================================================ FILE: docs/annotations/index.md ================================================ # Annotations ```{toctree} tok/index pos/index ner/index dep/index sdp/index srl/index constituency/index ``` ================================================ FILE: docs/annotations/ner/index.md ================================================ # Named Entity Recognition ## Chinese ```{toctree} pku msra ``` ## Multilingual ```{toctree} ontonotes ``` ================================================ FILE: docs/annotations/ner/msra.md ================================================ # msra | Category | Subcategory | Tag-set of Format-1 | Tag-set of Format-2 | |----------|----------------|---------------------|---------------------| | NAMEX | Person | P | PERSON | | | Location | L | LOCATION | | | Organization | O | ORGANIZATION | | TIMEX | Date | dat | DATE | | | Duration | dur | DURATION | | | Time | tim | TIME | | NUMEX | Percent | per | PERCENT | |
| Money | mon | MONEY | | | Frequency | fre | FREQUENCY | | | Integer | int | INTEGER | | | Fraction | fra | FRACTION | | | Decimal | dec | DECIMAL | | | Ordinal | ord | ORDINAL | | | Rate | rat | RATE | | MEASUREX | Age | age | AGE | | | Weight | wei | WEIGHT | | | Length | len | LENGTH | | | Temperature | tem | TEMPERATURE | | | Angle | ang | ANGLE | | | Area | are | AREA | | | Capacity | cap | CAPACITY | | | Speed | spe | SPEED | | | Acceleration | acc | ACCELERATION | | | Other measures | mea | MEASURE | | ADDREX | Email | ema | EMAIL | | | Phone | pho | PHONE | | | Fax | fax | FAX | | | Telex | tel | TELEX | | | WWW | WWW | WWW | | | Postalcode | pos | POSTALCODE | ================================================ FILE: docs/annotations/ner/ontonotes.md ================================================ # ontonotes | TAG | Description | |--------------|------------------------------------------------------| | PERSON | People, including fictional | | NORP | Nationalities or religious or political groups | | FACILITY | Buildings, airports, highways, bridges, etc. | | ORGANIZATION | Companies, agencies, institutions, etc. | | GPE | Countries, cities, states | | LOCATION | Non-GPE locations, mountain ranges, bodies of water | | PRODUCT | Vehicles, weapons, foods, etc. (Not services) | | EVENT | Named hurricanes, battles, wars, sports events, etc. | | WORK OF ART | Titles of books, songs, etc. 
| | LAW | Named documents made into laws | | DATE | Absolute or relative dates or periods | | TIME | Times smaller than a day | | PERCENT | Percentage | | MONEY | Monetary values, including unit | | QUANTITY | Measurements, as of weight or distance | | ORDINAL | “first”, “second” | | CARDINAL | Numerals that do not fall under another type | ================================================ FILE: docs/annotations/ner/pku.md ================================================ # pku | 序号 | 词性 | 名称 | 帮助记忆的诠释 | 例子及注解 | | ---- | ---- | -------- | ------------------------------------------------------ | ------------------------------------------------------------ | | 1 | nr | 人名 | 名词代码n和“人(ren)”的声母并在一起。 | 1. 汉族人及与汉族起名方式相同的非汉族人的姓和名单独切分,并分别标注为nr。张/nr 仁伟/nr, 欧阳/nr 修/nr, 阮/nr 志雄/nr, 朴/nr 贞爱/nr汉族人除有单姓和复姓外,还有双姓,即有的女子出嫁后,在原来的姓上加上丈夫的姓。如:陈方安生。这种情况切分、标注为:陈/nr 方/nr 安生/nr;唐姜氏,切分、标注为:唐/nr 姜氏/nr。2. 姓名后的职务、职称或称呼要分开。江/nr 主席/n, 小平/nr 同志/n, 江/nr 总书记/n,张/nr 教授/n, 王/nr 部长/n, 陈/nr 老总/n, 李/nr 大娘/n, 刘/nr 阿姨/n, 龙/nr 姑姑/n3. 对人的简称、尊称等若为两个字,则合为一个切分单位,并标以nr。老张/nr, 大李/nr, 小郝/nr, 郭老/nr, 陈总/nr4. 明显带排行的亲属称谓要切分开,分不清楚的则不切开。三/m 哥/n, 大婶/n, 大/a 女儿/n, 大哥/n, 小弟/n, 老爸/n5. 一些著名作者的或不易区分姓和名的笔名通常作为一个切分单位。鲁迅/nr, 茅盾/nr, 巴金/nr, 三毛/nr, 琼瑶/nr, 白桦/nr6. 外国人或少数民族的译名(包括日本人的姓名)不予切分,标注为nr。克林顿/nr, 叶利钦/nr, 才旦卓玛/nr, 小林多喜二/nr, 北研二/nr,华盛顿/nr, 爱因斯坦/nr有些西方人的姓名中有小圆点,也不分开。卡尔·马克思/nr | | 2 | ns | 地名 | 名词代码n和处所词代码s并在一起。 | 安徽/ns,深圳/ns,杭州/ns,拉萨/ns,哈尔滨/ns, 呼和浩特/ns, 乌鲁木齐/ns,长江/ns,黄海/ns,太平洋/ns, 泰山/ns, 华山/ns,亚洲/ns, 海南岛/ns,太湖/ns,白洋淀/ns, 俄罗斯/ns,哈萨克斯坦/ns,彼得堡/ns, 伏尔加格勒/ns 1. 国名不论长短,作为一个切分单位。中国/ns, 中华人民共和国/ns, 日本国/ns, 美利坚合众国/ns, 美国/ns2. 地名后有“省”、“市”、“县”、“区”、“乡”、“镇”、“村”、“旗”、“州”、“都”、“府”、“道”等单字的行政区划名称时,不切分开,作为一个切分单位。四川省/ns, 天津市/ns,景德镇/ns沙市市/ns, 牡丹江市/ns,正定县/ns,海淀区/ns, 通州区/ns,东升乡/ns, 双桥镇/ns 南化村/ns,华盛顿州/ns,俄亥俄州/ns,东京都/ns, 大阪府/ns,北海道/ns, 长野县/ns,开封府/ns,宣城县/ns3. 地名后的行政区划有两个以上的汉字,则将地名同行政区划名称切开,不过要将地名同行政区划名称用方括号括起来,并标以短语NS。[芜湖/ns 专区/n] NS,[宣城/ns 地区/n]ns,[内蒙古/ns 自治区/n]NS,[深圳/ns 特区/n]NS, [厦门/ns 经济/n 特区/n]NS, [香港/ns 特别/a 行政区/n]NS,[香港/ns 特区/n]NS, [华盛顿/ns 特区/n]NS,4. 
地名后有表示地形地貌的一个字的普通名词,如“江、河、山、洋、海、岛、峰、湖”等,不予切分。鸭绿江/ns,亚马逊河/ns, 喜马拉雅山/ns, 珠穆朗玛峰/ns,地中海/ns,大西洋/ns,洞庭湖/ns, 塞普路斯岛/ns 5. 地名后接的表示地形地貌的普通名词若有两个以上汉字,则应切开。然后将地名同该普通名词标成短语NS。[台湾/ns 海峡/n]NS,[华北/ns 平原/n]NS,[帕米尔/ns 高原/n]NS, [南沙/ns 群岛/n]NS,[京东/ns 大/a 峡谷/n]NS [横断/b 山脉/n]NS6.地名后有表示自然区划的一个字的普通名词,如“ 街,路,道,巷,里,町,庄,村,弄,堡”等,不予切分。 中关村/ns,长安街/ns,学院路/ns, 景德镇/ns, 吴家堡/ns, 庞各庄/ns, 三元里/ns,彼得堡/ns, 北菜市巷/ns, 7.地名后接的表示自然区划的普通名词若有两个以上汉字,则应切开。然后将地名同自然区划名词标成短语NS。[米市/ns 大街/n]NS, [蒋家/nz 胡同/n]NS , [陶然亭/ns 公园/n]NS , 8. 大小地名相连时的标注方式为:北京市/ns 海淀区/ns 海淀镇/ns [南/f 大街/n]NS [蒋家/nz 胡同/n]NS 24/m 号/q , | | 3 | nt | 机构团体 | “团”的声母为t,名词代码n和t并在一起。 | (参见2。短语标记说明--NT)联合国/nt,中共中央/nt,国务院/nt, 北京大学/nt1.大多数团体、机构、组织的专有名称一般是短语型的,较长,且含有地名或人名等专名,再组合,标注为短语NT。[中国/ns 计算机/n 学会/n]NT, [香港/ns 钟表业/n 总会/n]NT, [烟台/ns 大学/n]NT, [香港/ns 理工大学/n]NT, [华东/ns 理工大学/n]NT,[合肥/ns 师范/n 学院/n]NT, [北京/ns 图书馆/n]NT, [富士通/nz 株式会社/n]NT, [香山/ns 植物园/n]NT, [安娜/nz 美容院/n]NT,[上海/ns 手表/n 厂/n]NT, [永和/nz 烧饼铺/n]NT,[北京/ns 国安/nz 队/n]NT,2. 对于在国际或中国范围内的知名的唯一的团体、机构、组织的名称即使前面没有专名,也标为nt或NT。联合国/nt,国务院/nt,外交部/nt, 财政部/nt,教育部/nt, 国防部/nt,[世界/n 贸易/n 组织/n]NT, [国家/n 教育/vn 委员会/n]NT,[信息/n 产业/n 部/n]NT,[全国/n 信息/n 技术/n 标准化/vn 委员会/n]NT,[全国/n 总/b 工会/n]NT,[全国/n 人民/n 代表/n 大会/n]NT,美国的“国务院”,其他国家的“外交部、财政部、教育部”,必须在其所属国的国名之后出现时,才联合标注为NT。[美国/ns 国务院/n]NT,[法国/ns 外交部/n]NT,[美/j 国会/n]NT,日本有些政府机构名称很特别,无论是否出现在“日本”国名之后都标为nt。[日本/ns 外务省/nt]NT,[日/j 通产省/nt]NT通产省/nt 3. 前后相连有上下位关系的团体机构组织名称的处理方式如下:[联合国/nt 教科文/j 组织/n]NT, [中国/ns 银行/n 北京/ns 分行/n]NT,[河北省/ns 正定县/ns 西平乐乡/ns 南化村/ns 党支部/n]NT, 当下位名称含有专名(如“北京/ns 分行/n”、“南化村/ns 党支部/n”、“昌平/ns 分校/n”)时,也可脱离前面的上位名称单独标注为NT。[中国/ns 银行/n]NT [北京/ns 分行/n]NT,北京大学/nt [昌平/ns 分校/n]NT,4. 
团体、机构、组织名称中用圆括号加注简称时:[宝山/ns 钢铁/n (/w 宝钢/j )/w 总/b 公司/n]NT,[宝山/ns 钢铁/n 总/b 公司/n]NT,(/w 宝钢/j )/w | ================================================ FILE: docs/annotations/pos/863.md ================================================ # 863 | 词性 | 名称 | 说明 | 例子 | | :-- | -----: | ---------------------------: | -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | | a | 形容词 | 取英语形容词adjective的第1个字母 | [重要/a 步伐/n]NP ,美丽/a ,看似/v 抽象/a , | | c | 连词 | 取英语连词conjunction的第1个字母。 | 合作/vn 与/c 伙伴/n | | d | 副词 | 取adverb的第2个字母,因其第1个字母已用于形容词。 | 进一步/d 发展/v , | | e | 叹词 | 取英语叹词exclamation的第1个字母。 | 啊/e ,/w 那/r 金灿灿/z 的/u 麦穗/n , | | f | 方位词 | 取汉字“方”。 | 军人/n 的/u 眼睛/n 里/f 不/d 是/v 没有/v 风景/n , | | g | 语素字 | | | | h | 前接成分 | 取英语head的第1个字母。 | 许多/m 非/h 主角/n 人物/n ,办事处/n 的/u “/w 准/h 政府/n ”/w 功能/n 不断/d 加强/v , | | i | 成语 | 取英语成语idiom的第1个字母。 | 一言一行/i ,义无反顾/i , | | j | 简称略语 | 取汉字“简”的声母。 | [德/j 外长/n]NP ,文教/j , | | k | 后接成分 | 后接成分。 | 少年儿童/l 朋友/n 们/k ,身体/n 健康/a 者/k , | | m | 数词 | 取英语numeral的第3个字母,n,u已有他用。 | 1.数量词组应切分为数词和量词。 三/m 个/q, 10/m 公斤/q, 一/m 盒/q 点心/n ,但少数数量词已是词典的登录单位,则不再切分。 一个/m , 一些/m ,2. 基数、序数、小数、分数、百分数一律不予切分,为一个切分单位,标注为 m 。一百二十三/m,20万/m, 123.54/m, 一个/m, 第一/m, 第三十五/m, 20%/m, 三分之二/m, 千分之三十/m, 几十/m 人/n, 十几万/m 元/q, 第一百零一/m 个/q ,3. 
约数,前加副词、形容词或后加“来、多、左右”等助数词的应予分开。约/d 一百/m 多/m 万/m,仅/d 一百/m 个/q, 四十/m 来/m 个/q,二十/m 余/m 只/q, 十几/m 个/q,三十/m 左右/m ,两个数词相连的及“成百”、“上千”等则不予切分。五六/m 年/q, 七八/m 天/q,十七八/m 岁/q, 成百/m 学生/n,上千/m 人/n, 4.表序关系的“数+名”结构,应予切分。二/m 连/n , 三/m 部/n , | | mq | 数量词 | | | | n | 名词 | 取英语名词noun的第1个字母。 | (参见 动词--v)岗位/n , 城市/n , 机会/n ,她/r 是/v 责任/n 编辑/n , | | nd | 方位名词 | 方位名词(nd),表示位置的相对方向 | 上 下 左 右 前 后 里 外 中 东 西 南 北前边 左面 里头 中间 外部 | | nh | 人名 | 人名(nh),表示人的名称的专有名词 | 华罗庚 阿凡提 诸葛亮 司马相如 松赞干布 卡尔·马克思 | | nhf | 姓 | | | | nhs | 名 | | | | ni | 机构名 | 机构名(ni),表示团体、组织、机构名称的专有名词 | 联合国 教育部 北京大学 中国科学院 | | nl | 处所名词 | 处所名词(nl),表示处所 | 空中 高处 隔壁 门口 附近 边疆 一旁 野外 | | ns | 地名 | 地名(ns),表示地理区域名称的专有名词 | 亚洲 大西洋 地中海 阿尔卑斯山 加拿大中国 北京 浙江 景德镇 呼和浩特 中关村 | | nt | 时间名词 | 时间名词(nt),包括一般所说的时量词 | 年 月 日 分 秒现在 过去 昨天 去年 将来 宋朝 星期一 | | nz | 其他专有名词 | 其他专有名词(nz) | 五粮液 宫爆鸡丁 桑塔纳 | | o | 拟声词 | 取英语拟声词onomatopoeia的第1个字母。 | 哈哈/o 一/m 笑/v ,装载机/n 隆隆/o 推进/v , | | p | 介词 | 取英语介词prepositional的第1个字母。 | 对/p 子孙后代/n 负责/v ,以/p 煤/n 养/v 农/Ng ,为/p 治理/v 荒山/n 服务/v , 把/p 青年/n 推/v 上/v 了/u 领导/vn 岗位/n , | | q | 量词 | 取英语quantity的第1个字母。 | (参见数词m)首/m 批/q ,一/m 年/q , | | r | 代词 | 取英语代词pronoun的第2个字母,因p已用于介词。 | 单音节代词“本”、“每”、“各”、“诸”后接单音节名词时,和后接的单音节名词合为代词;当后接双音节名词时,应予切分。本报/r, 每人/r, 本社/r, 本/r 地区/n, 各/r 部门/n | | u | 助词 | 取英语助词auxiliary。 | [[俄罗斯/ns 和/c 北约/j]NP-BL 之间/f [战略/n 伙伴/n 关系/n]NP 的/u 建立/vn]NP 填平/v 了/u [[欧洲/ns 安全/a 政治/n]NP 的/u 鸿沟/n]NP | | v | 动词 | 取英语动词verb的第一个字母。 | (参见 名词--n)[[[欧盟/j 扩大/v]S 的/u [历史性/n 决定/n]NP]NP 和/c [北约/j 开放/v]S]NP-BL [为/p [创建/v [一/m 种/q 新/a 的/u 欧洲/ns 安全/a 格局/n]NP]VP-SBI]PP-MD [奠定/v 了/u 基础/n]V-SBI ,, | | vd | 趋向动词 | 趋向动词(vd),表示趋向 | (走)上 (趴)下 (进)来 (回)去(跑)上来 (掉)下去 (提)起来 (扔)过去 | | vl | 联系动词 | 联系动词(vl),表示关系的判断 | 是 | | vu | 能愿动词 | 能愿动词(vu),表示可能、意愿 | 能够 能 应该 可以 可能 情愿 愿意 要 | | w | 标点符号 | | ”/w :/w | | ws | 非汉字字符串 | 非汉字字符串(ws),如: | HanLP office windows | | x | 非语素字 | 非语素字只是一个符号,字母x通常用于代表未知数、符号。 | ================================================ FILE: docs/annotations/pos/ctb.md ================================================ # ctb See also [The Part-Of-Speech Tagging 
Guidelines for the Penn Chinese Treebank (3.0)](https://repository.upenn.edu/cgi/viewcontent.cgi?article=1039&context=ircs_reports). | Tag | Description | Chinese | Chinese Description | Examples | |-----|------------------------------------------------------|---------|---------------------------------------------------------|------------------------| | AD | adverb | 副词 | 副词 | 仍然、很、大大、约 | | AS | aspect marker | 动态助词 | 助词 | 了、着、过 | | BA | `bǎ` in ba-construction | 把字句 | 当“把”、“将”出现在结构“NP0 + BA + NP1+VP”时的词性 | 把、将 | | CC | coordinating conjunction | 并列连接词 | 并列连词 | 与、和、或者、还是 | | CD | cardinal number | 概数词 | 数词或表达数量的词 | 一百、好些、若干 | | CS | subordinating conjunction | 从属连词 | 从属连词 | 如果、那么、就 | | DEC | `de` as a complementizer or a nominalizer | 补语成分“的” | 当“的”或“之”作补语标记或名词化标记时的词性,其结构为:S/VP DEC {NP},如,喜欢旅游的大学生 | 的、之 | | DEG | `de` as a genitive marker and an associative marker | 属格“的” | 当“的”或“之”作所有格时的词性,其结构为:NP/PP/JJ/DT DEG {NP}, 如,他的车、经济的发展 | 的、之 | | DER | resultative `de`, `de` in V-de const and V-de-R | 表结果的“得” | 当“得”出现在结构“V-得-R”时的词性,如,他跑得很快 | 得 | | DEV | manner `de`, `de` before VP | 表方式的“地” | 当“地”出现在结构“X-地-VP”时的词性,如,高兴地说 | 地 | | DT | determiner | 限定词 | 代冠词,通常用来修饰名词 | 这、那、该、每、各 | | ETC | for words like "etc." 
| 表示省略 | “等”、“等等”的词性 | 等、等等 | | EM | emoji | 表情符 | 表情符、或称颜文字 | :) | | FW | foreign words | 外来语 | 外来词 | 卡拉、A型 | | IC | incomplete component | 不完整成分 | 不完整成分,尤指ASR导致的错误 | 好*xin*、那个*ba* | | IJ | interjection | 句首感叹词 | 感叹词,通常出现在句子首部 | 啊 | | JJ | other noun-modifier | 其他名词修饰语 | 形容词 | 共同、新 | | LB | `bèi` in long bei-const | 长句式表被动 | 当“被”、“叫”、“给”出现在结构“NP0 + LB + NP1+ VP”时的词性,如,他被我训了一顿 | 被、叫、给 | | LC | localizer | 方位词 | 方位词以及表示范围的限定词 | 前、旁、到、在内、以来、为止 | | M | measure word | 量词 | 量词 | 个、群、公里 | | MSP | other particle | 其他小品词 | 其他虚词,包括“所”、“以”、“来”和“而”等出现在VP前的词 | 所、以、来、而 | | NN | common noun | 其他名词 | 除专有名词和时间名词外的所有名词 | 桌子、生活、经济 | | NOI | noise that characters are written in the wrong order | 噪声 | 汉字顺序颠倒产生的噪声 | 事/NOI 类/NOI 各/NOI 故/NOI | | NR | proper noun | 专有名词 | 专有名词,通常表示地名、人名、机构名等 | 北京、乔丹、微软 | | NT | temporal noun | 时间名词 | 表示时间概念的名词 | 一月、汉朝、当今 | | OD | ordinal number | 序数词 | 序列词 | 第一百 | | ON | onomatopoeia | 象声词 | 象声词 | 哗哗、呼、咯吱 | | P | preposition e.g., "from" and "to" | 介词 | 介词 | 从、对、根据 | | PN | pronoun | 代词 | 代词,通常用来指代名词 | 我、这些、其、自己 | | PU | punctuation | 标点符号 | 标点符号 | ?、。、; | | SB | `bèi` in short bei-const | 短句式表被动 | 当“被”、“给”出现在结构“NP0 + SB + VP”时的词性,如,他被训了一顿 | 被、叫 | | SP | sentence final particle | 句末助词 | 经常出现在句尾的词 | 吧、呢、啊、啊 | | URL | web address | 网址 | 网址 | www.hankcs.com | | VA | predicative adjective | 表语形容词 | 可以接在“很”后面的形容词谓语 | 雪白、厉害 | | VC | copula, be words | 系动词 | 系动词,表示“是”或“非”概念的动词 | 是、为、非 | | VE | `yǒu` as the main verb | 动词有无 | 表示“有”或“无”概念的动词 | 有、没有、无 | | VV | other verb | 其他动词 | 其他普通动词,包括情态词、控制动词、动作动词、心理动词等等 | 可能、要、走、喜欢 | ================================================ FILE: docs/annotations/pos/index.md ================================================ # Part-of-Speech Tagging ## Chinese ```{toctree} ctb pku 863 ``` ## Japanese ```{toctree} npcmj ``` ## Multilingual ```{toctree} ud ``` ================================================ FILE: docs/annotations/pos/npcmj.md ================================================ # NPCMJ | Tag | Description |
|-----------|-----------------------------------| | ADJI | イ-adjective | | ADJI-MD | modal イ-adjective | | ADJN | ナ-adjective | | ADJN-MD | modal ナ-adjective | | ADV | adverb | | AX | auxiliary verb (including copula) | | AXD | auxiliary verb, past tense | | CL | classifier | | CONJ | coordinating conjunction | | D | determiner | | FN | formal noun | | FW | foreign word | | INTJ | interjection | | MD | modal element | | N | noun | | N-MENTION | mentioned expression | | NEG | negation | | NPR | proper noun | | NUM | numeral | | P-COMP | complementizer particle | | P-CONN | conjunctional particle | | P-FINAL | final particle | | P-INTJ | interjectional particle | | P-OPTR | toritate particle | | P-ROLE | role particle | | PASS | direct passive | | PASS2 | indirect passive | | PNL | prenominal | | PRO | pronoun | | PU | punctuation | | PUL | left bracket | | PUR | right bracket | | Q | quantifier | | QUOT | quote | | SYM | symbol | | VB | verb (or verb stem) | | VB0 | light verb | | VB2 | secondary verb | | WADV | indeterminate adverb | | WD | indeterminate determiner | | WNUM | indeterminate numeral | | WPRO | indeterminate pronoun | ================================================ FILE: docs/annotations/pos/pku.md ================================================ # pku | 序号 | 词性 | 名称 | 帮助记忆的诠释 | 例子及注解 | | ---- | ---- | -------- | ------------------------------------------------------ | ------------------------------------------------------------ | | 1 | Ag | 形语素 | 形容词性语素。形容词代码为a,语素代码g前面置以A。 | 绿色/n 似/d 锦/Ag , | | 2 | a | 形容词 | 取英语形容词adjective的第1个字母 | [重要/a 步伐/n]NP ,美丽/a ,看似/v 抽象/a , | | 3 | ad | 副形词 | 直接作状语的形容词。形容词代码a和副词代码d并在一起。 | [积极/ad 谋求/v]V-ZZ ,幻象/n 易/ad 逝/Vg , | | 4 | an | 名形词 | 具有名词功能的形容词。形容词代码a和名词代码n并在一起。 | [外交/n 和/c 安全/an]NP-BL , | | 5 | Bg | 区别语素 | 区别词性语素。区别词代码为b,语素代码g前面置以B。 | 赤/Ag 橙/Bg 黄/a 绿/a 青/a 蓝/a 紫/a , | | 6 | b | 区别词 | 取汉字“别”的声母。 | 女/b 司机/n, 金/b 手镯/n, 慢性/b 胃炎/n, 古/b 钱币/n, 副/b 主任/n, 总/b 公司/n 单音节区别词和单音节名词或名语素组合,作为一个词,并标以名词词性n。 | | 7 | c | 连词 | 
取英语连词conjunction的第1个字母。 | 合作/vn 与/c 伙伴/n | | 8 | Dg | 副语素 | 副词性语素。副词代码为d,语素代码g前面置以D。 | 了解/v 甚/Dg 深/a ,煞/Dg 是/v 喜人/a , | | 9 | d | 副词 | 取adverb的第2个字母,因其第1个字母已用于形容词。 | 进一步/d 发展/v , | | 10 | e | 叹词 | 取英语叹词exclamation的第1个字母。 | 啊/e ,/w 那/r 金灿灿/z 的/u 麦穗/n , | | 11 | f | 方位词 | 取汉字“方”。 | 军人/n 的/u 眼睛/n 里/f 不/d 是/v 没有/v 风景/n , | | 12 | h | 前接成分 | 取英语head的第1个字母。 | 许多/m 非/h 主角/n 人物/n ,办事处/n 的/u “/w 准/h 政府/n ”/w 功能/n 不断/d 加强/v , | | 13 | i | 成语 | 取英语成语idiom的第1个字母。 | 一言一行/i ,义无反顾/i , | | 14 | j | 简称略语 | 取汉字“简”的声母。 | [德/j 外长/n]NP ,文教/j , | | 15 | k | 后接成分 | 后接成分。 | 少年儿童/l 朋友/n 们/k ,身体/n 健康/a 者/k , | | 16 | l | 习用语 | 习用语尚未成为成语,有点“临时性”,取“临”的声母。 | 少年儿童/l 朋友/n 们/k ,落到实处/l , | | 17 | Mg | 数语素 | 数词性语素。数词代码为m,语素代码g前面置以M。 | 甲/Mg 减下/v 的/u 人/n 让/v 乙/Mg 背上/v ,凡/d “/w 寅/Mg 年/n ”/w 中/f 出生/v 的/u 人/n 生肖/n 都/d 属/v 虎/n , | | 18 | m | 数词 | 取英语numeral的第3个字母,n,u已有他用。 | 1.数量词组应切分为数词和量词。 三/m 个/q, 10/m 公斤/q, 一/m 盒/q 点心/n ,但少数数量词已是词典的登录单位,则不再切分。 一个/m , 一些/m ,2. 基数、序数、小数、分数、百分数一律不予切分,为一个切分单位,标注为 m 。一百二十三/m,20万/m, 123.54/m, 一个/m, 第一/m, 第三十五/m, 20%/m, 三分之二/m, 千分之三十/m, 几十/m 人/n, 十几万/m 元/q, 第一百零一/m 个/q ,3. 约数,前加副词、形容词或后加“来、多、左右”等助数词的应予分开。约/d 一百/m 多/m 万/m,仅/d 一百/m 个/q, 四十/m 来/m 个/q,二十/m 余/m 只/q, 十几/m 个/q,三十/m 左右/m ,两个数词相连的及“成百”、“上千”等则不予切分。五六/m 年/q, 七八/m 天/q,十七八/m 岁/q, 成百/m 学生/n,上千/m 人/n, 4.表序关系的“数+名”结构,应予切分。二/m 连/n , 三/m 部/n , | | 19 | Ng | 名语素 | 名词性语素。名词代码为n,语素代码g前面置以N。 | 出/v 过/u 两/m 天/q 差/Ng, 理/v 了/u 一/m 次/q 发/Ng, | | 20 | n | 名词 | 取英语名词noun的第1个字母。 | (参见 动词--v)岗位/n , 城市/n , 机会/n ,她/r 是/v 责任/n 编辑/n , | | 21 | nr | 人名 | 名词代码n和“人(ren)”的声母并在一起。 | 1. 汉族人及与汉族起名方式相同的非汉族人的姓和名单独切分,并分别标注为nr。张/nr 仁伟/nr, 欧阳/nr 修/nr, 阮/nr 志雄/nr, 朴/nr 贞爱/nr汉族人除有单姓和复姓外,还有双姓,即有的女子出嫁后,在原来的姓上加上丈夫的姓。如:陈方安生。这种情况切分、标注为:陈/nr 方/nr 安生/nr;唐姜氏,切分、标注为:唐/nr 姜氏/nr。2. 姓名后的职务、职称或称呼要分开。江/nr 主席/n, 小平/nr 同志/n, 江/nr 总书记/n,张/nr 教授/n, 王/nr 部长/n, 陈/nr 老总/n, 李/nr 大娘/n, 刘/nr 阿姨/n, 龙/nr 姑姑/n3. 对人的简称、尊称等若为两个字,则合为一个切分单位,并标以nr。老张/nr, 大李/nr, 小郝/nr, 郭老/nr, 陈总/nr4. 明显带排行的亲属称谓要切分开,分不清楚的则不切开。三/m 哥/n, 大婶/n, 大/a 女儿/n, 大哥/n, 小弟/n, 老爸/n5. 
一些著名作者的或不易区分姓和名的笔名通常作为一个切分单位。鲁迅/nr, 茅盾/nr, 巴金/nr, 三毛/nr, 琼瑶/nr, 白桦/nr6. 外国人或少数民族的译名(包括日本人的姓名)不予切分,标注为nr。克林顿/nr, 叶利钦/nr, 才旦卓玛/nr, 小林多喜二/nr, 北研二/nr,华盛顿/nr, 爱因斯坦/nr有些西方人的姓名中有小圆点,也不分开。卡尔·马克思/nr | | 22 | ns | 地名 | 名词代码n和处所词代码s并在一起。 | 安徽/ns,深圳/ns,杭州/ns,拉萨/ns,哈尔滨/ns, 呼和浩特/ns, 乌鲁木齐/ns,长江/ns,黄海/ns,太平洋/ns, 泰山/ns, 华山/ns,亚洲/ns, 海南岛/ns,太湖/ns,白洋淀/ns, 俄罗斯/ns,哈萨克斯坦/ns,彼得堡/ns, 伏尔加格勒/ns 1. 国名不论长短,作为一个切分单位。中国/ns, 中华人民共和国/ns, 日本国/ns, 美利坚合众国/ns, 美国/ns2. 地名后有“省”、“市”、“县”、“区”、“乡”、“镇”、“村”、“旗”、“州”、“都”、“府”、“道”等单字的行政区划名称时,不切分开,作为一个切分单位。四川省/ns, 天津市/ns,景德镇/ns沙市市/ns, 牡丹江市/ns,正定县/ns,海淀区/ns, 通州区/ns,东升乡/ns, 双桥镇/ns 南化村/ns,华盛顿州/ns,俄亥俄州/ns,东京都/ns, 大阪府/ns,北海道/ns, 长野县/ns,开封府/ns,宣城县/ns3. 地名后的行政区划有两个以上的汉字,则将地名同行政区划名称切开,不过要将地名同行政区划名称用方括号括起来,并标以短语NS。[芜湖/ns 专区/n] NS,[宣城/ns 地区/n]ns,[内蒙古/ns 自治区/n]NS,[深圳/ns 特区/n]NS, [厦门/ns 经济/n 特区/n]NS, [香港/ns 特别/a 行政区/n]NS,[香港/ns 特区/n]NS, [华盛顿/ns 特区/n]NS,4. 地名后有表示地形地貌的一个字的普通名词,如“江、河、山、洋、海、岛、峰、湖”等,不予切分。鸭绿江/ns,亚马逊河/ns, 喜马拉雅山/ns, 珠穆朗玛峰/ns,地中海/ns,大西洋/ns,洞庭湖/ns, 塞普路斯岛/ns 5. 地名后接的表示地形地貌的普通名词若有两个以上汉字,则应切开。然后将地名同该普通名词标成短语NS。[台湾/ns 海峡/n]NS,[华北/ns 平原/n]NS,[帕米尔/ns 高原/n]NS, [南沙/ns 群岛/n]NS,[京东/ns 大/a 峡谷/n]NS [横断/b 山脉/n]NS6.地名后有表示自然区划的一个字的普通名词,如“ 街,路,道,巷,里,町,庄,村,弄,堡”等,不予切分。 中关村/ns,长安街/ns,学院路/ns, 景德镇/ns, 吴家堡/ns, 庞各庄/ns, 三元里/ns,彼得堡/ns, 北菜市巷/ns, 7.地名后接的表示自然区划的普通名词若有两个以上汉字,则应切开。然后将地名同自然区划名词标成短语NS。[米市/ns 大街/n]NS, [蒋家/nz 胡同/n]NS , [陶然亭/ns 公园/n]NS , 8. 大小地名相连时的标注方式为:北京市/ns 海淀区/ns 海淀镇/ns [南/f 大街/n]NS [蒋家/nz 胡同/n]NS 24/m 号/q , | | 23 | nt | 机构团体 | “团”的声母为t,名词代码n和t并在一起。 | (参见2。短语标记说明--NT)联合国/nt,中共中央/nt,国务院/nt, 北京大学/nt1.大多数团体、机构、组织的专有名称一般是短语型的,较长,且含有地名或人名等专名,再组合,标注为短语NT。[中国/ns 计算机/n 学会/n]NT, [香港/ns 钟表业/n 总会/n]NT, [烟台/ns 大学/n]NT, [香港/ns 理工大学/n]NT, [华东/ns 理工大学/n]NT,[合肥/ns 师范/n 学院/n]NT, [北京/ns 图书馆/n]NT, [富士通/nz 株式会社/n]NT, [香山/ns 植物园/n]NT, [安娜/nz 美容院/n]NT,[上海/ns 手表/n 厂/n]NT, [永和/nz 烧饼铺/n]NT,[北京/ns 国安/nz 队/n]NT,2. 
对于在国际或中国范围内的知名的唯一的团体、机构、组织的名称即使前面没有专名,也标为nt或NT。联合国/nt,国务院/nt,外交部/nt, 财政部/nt,教育部/nt, 国防部/nt,[世界/n 贸易/n 组织/n]NT, [国家/n 教育/vn 委员会/n]NT,[信息/n 产业/n 部/n]NT,[全国/n 信息/n 技术/n 标准化/vn 委员会/n]NT,[全国/n 总/b 工会/n]NT,[全国/n 人民/n 代表/n 大会/n]NT,美国的“国务院”,其他国家的“外交部、财政部、教育部”,必须在其所属国的国名之后出现时,才联合标注为NT。[美国/ns 国务院/n]NT,[法国/ns 外交部/n]NT,[美/j 国会/n]NT,日本有些政府机构名称很特别,无论是否出现在“日本”国名之后都标为nt。[日本/ns 外务省/nt]NT,[日/j 通产省/nt]NT通产省/nt 3. 前后相连有上下位关系的团体机构组织名称的处理方式如下:[联合国/nt 教科文/j 组织/n]NT, [中国/ns 银行/n 北京/ns 分行/n]NT,[河北省/ns 正定县/ns 西平乐乡/ns 南化村/ns 党支部/n]NT, 当下位名称含有专名(如“北京/ns 分行/n”、“南化村/ns 党支部/n”、“昌平/ns 分校/n”)时,也可脱离前面的上位名称单独标注为NT。[中国/ns 银行/n]NT [北京/ns 分行/n]NT,北京大学/nt [昌平/ns 分校/n]NT,4. 团体、机构、组织名称中用圆括号加注简称时:[宝山/ns 钢铁/n (/w 宝钢/j )/w 总/b 公司/n]NT,[宝山/ns 钢铁/n 总/b 公司/n]NT,(/w 宝钢/j )/w | | 24 | nx | 外文字符 | 外文字符。 | A/nx 公司/n ,B/nx 先生/n ,X/nx 君/Ng ,24/m K/nx 镀金/n ,C/nx 是/v 光速/n ,Windows98/nx ,PentiumIV/nx ,I LOVE THIS GAME/nx ,HanLP/nx | | 25 | nz | 其他专名 | “专”的声母的第1个字母为z,名词代码n和z并在一起。 | (参见2。短语标记说明--NZ)除人名、国名、地名、团体、机构、组织以外的其他专有名词都标以nz。满族/nz,俄罗斯族/nz,汉语/nz,罗马利亚语/nz, 捷克语/nz,中文/nz, 英文/nz, 满人/nz, 哈萨克人/nz, 诺贝尔奖/nz, 茅盾奖/nz, 1.包含专有名称(或简称)的交通线,标以nz;短语型的,标为NZ。津浦路/nz, 石太线/nz, [京/j 九/j 铁路/n]NZ, [京/j 津/j 高速/b 公路/n]NZ, 2. 历史上重要事件、运动等专有名称一般是短语型的,按短语型专有名称处理,标以NZ。[卢沟桥/ns 事件/n]NZ, [西安/ns 事变/n]NZ,[五四/t 运动/n]NZ, [明治/nz 维新/n]NZ,[甲午/t 战争/n]NZ,3.专有名称后接多音节的名词,如“语言”、“文学”、“文化”、“方式”、“精神”等,失去专指性,则应分开。欧洲/ns 语言/n, 法国/ns 文学/n, 西方/ns 文化/n, 贝多芬/nr 交响乐/n, 雷锋/nr 精神/n, 美国/ns 方式/n,日本/ns 料理/n, 宋朝/t 古董/n 4. 商标(包括专名及后接的“牌”、“型”等)是专指的,标以nz,但其后所接的商品仍标以普通名词n。康师傅/nr 方便面/n, 中华牌/nz 香烟/n, 牡丹III型/nz 电视机/n, 联想/nz 电脑/n, 鳄鱼/nz 衬衣/n, 耐克/nz 鞋/n5. 以序号命名的名称一般不认为是专有名称。2/m 号/q 国道/n ,十一/m 届/q 三中全会/j如果前面有专名,合起来作为短语型专名。[中国/ns 101/m 国道/n]NZ, [中共/j 十一/m 届/q 三中全会/j]NZ,6. 书、报、杂志、文档、报告、协议、合同等的名称通常有书名号加以标识,不作为专有名词。由于这些名字往往较长,名字本身按常规处理。《/w 宁波/ns 日报/n 》/w ,《/w 鲁迅/nr 全集/n 》/w,中华/nz 读书/vn 报/n, 杜甫/nr 诗选/n,少数书名、报刊名等专有名称,则不切分。红楼梦/nz, 人民日报/nz,儒林外史/nz 7. 
当有些专名无法分辨它们是人名还是地名或机构名时,暂标以nz。[巴黎/ns 贝尔希/nz 体育馆/n]NT,其中“贝尔希”只好暂标为nz。 | | 26 | o | 拟声词 | 取英语拟声词onomatopoeia的第1个字母。 | 哈哈/o 一/m 笑/v ,装载机/n 隆隆/o 推进/v , | | 27 | p | 介词 | 取英语介词prepositional的第1个字母。 | 对/p 子孙后代/n 负责/v ,以/p 煤/n 养/v 农/Ng ,为/p 治理/v 荒山/n 服务/v , 把/p 青年/n 推/v 上/v 了/u 领导/vn 岗位/n , | | 28 | q | 量词 | 取英语quantity的第1个字母。 | (参见数词m)首/m 批/q ,一/m 年/q , | | 29 | Rg | 代语素 | 代词性语素。代词代码为r,在语素的代码g前面置以R。 | 读者/n 就/d 是/v 这/r 两/m 棵/q 小树/n 扎根/v 于/p 斯/Rg 、/w 成长/v 于/p 斯/Rg 的/u 肥田/n 沃土/n , | | 30 | r | 代词 | 取英语代词pronoun的第2个字母,因p已用于介词。 | 单音节代词“本”、“每”、“各”、“诸”后接单音节名词时,和后接的单音节名词合为代词;当后接双音节名词时,应予切分。本报/r, 每人/r, 本社/r, 本/r 地区/n, 各/r 部门/n | | 31 | s | 处所词 | 取英语space的第1个字母。 | 家里/s 的/u 电脑/n 都/d 联通/v 了/u 国际/n 互联网/n ,西部/s 交通/n 咽喉/n , | | 32 | Tg | 时语素 | 时间词性语素。时间词代码为t,在语素的代码g前面置以T。 | 3日/t 晚/Tg 在/p 总统府/n 发表/v 声明/n ,尊重/v 现/Tg 执政/vn 当局/n 的/u 权威/n , | | 33 | t | 时间词 | 取英语time的第1个字母。 | 1. 年月日时分秒,按年、月、日、时、分、秒切分,标注为t 。1997年/t 3月/t 19日/t 下午/t 2时/t 18分/t若数字后无表示时间的“年、月、日、时、分、秒”等的标为数词m。1998/m 中文/n 信息/n 处理/vn 国际/n 会议/n 2. 历史朝代的名称虽然有专有名词的性质,仍标注为t。西周/t, 秦朝/t, 东汉/t, 南北朝/t, 清代/t“牛年、虎年”等一律不予切分,标注为:牛年/t, 虎年/t, 甲午年/t, 甲午/t 战争/n, 庚子/t 赔款/n, 戊戌/t 变法/n | | 34 | u | 助词 | 取英语助词auxiliary。 | [[俄罗斯/ns 和/c 北约/j]NP-BL 之间/f [战略/n 伙伴/n 关系/n]NP 的/u 建立/vn]NP 填平/v 了/u [[欧洲/ns 安全/a 政治/n]NP 的/u 鸿沟/n]NP | | 35 | Vg | 动语素 | 动词性语素。动词代码为v。在语素的代码g前面置以V。 | 洗/v 了/u 一个/m 舒舒服服/z 的/u 澡/Vg | | 36 | v | 动词 | 取英语动词verb的第一个字母。 | (参见 名词--n)[[[欧盟/j 扩大/v]S 的/u [历史性/n 决定/n]NP]NP 和/c [北约/j 开放/v]S]NP-BL [为/p [创建/v [一/m 种/q 新/a 的/u 欧洲/ns 安全/a 格局/n]NP]VP-SBI]PP-MD [奠定/v 了/u 基础/n]V-SBI ,, | | 37 | vd | 副动词 | 直接作状语的动词。动词和副词的代码并在一起。 | 形势/n 会/v 持续/vd 好转/v ,认为/v 是/v 电话局/n 收/v 错/vd 了/u 费/n , | | 38 | vn | 名动词 | 指具有名词功能的动词。动词和名词的代码并在一起。 | 引起/v 人们/n 的/u 关注/vn 和/c 思考/vn ,收费/vn 电话/n 的/u 号码/n , | | 39 | w | 标点符号 | | ”/w :/w | | 40 | x | 非语素字 | 非语素字只是一个符号,字母x通常用于代表未知数、符号。 | | | 41 | Yg | 语气语素 | 语气词性语素。语气词代码为y。在语素的代码g前面置以Y。 | 唯/d 大力/d 者/k 能/v 致/v 之/u 耳/Yg | | 42 | y | 语气词 | 取汉字“语”的声母。 | 会/v 泄露/v 用户/n 隐私/n 吗/y ,又/d 何在/v 呢/y ? 
| | 43 | z | 状态词 | 取汉字“状”的声母的前一个字母。 | 取得/v 扎扎实实/z 的/u 突破性/n 进展/vn ,四季/n 常青/z 的/u 热带/n 树木/n ,短短/z 几/m 年/q 间, | ================================================ FILE: docs/annotations/pos/ud.md ================================================ # Universal Dependencies See also [Universal Dependencies](https://universaldependencies.org/u/pos/). | Tag | Description | |------------|----------------------------------------------| | ADJ | adjective | | ADP | adposition | | ADV | adverb | | AUX | auxiliary | | CCONJ | coordinating conjunction | | DET | determiner | | INTJ | interjection | | NOUN | noun | | NUM | numeral | | PART | particle | | PRON | pronoun | | PROPN | proper noun | | PUNCT | punctuation | | SCONJ | subordinating conjunction | | SYM | symbol | | VERB | verb | | X | other | ================================================ FILE: docs/annotations/sdp/dm.md ================================================ # The reduction of Minimal Recursion Semantics Please refer to [Minimal Recursion Semantics An Introduction](https://www.cl.cam.ac.uk/~aac10/papers/mrs.pdf). ================================================ FILE: docs/annotations/sdp/index.md ================================================ # Semantic Dependency Parsing ## Chinese ```{toctree} semeval16 ``` ## English ```{toctree} dm pas psd ``` ================================================ FILE: docs/annotations/sdp/pas.md ================================================ # Predicate-Argument Structures Please refer to [Probabilistic disambiguation models for wide-coverage HPSG parsing](https://www.aclweb.org/anthology/P05-1011.pdf). ================================================ FILE: docs/annotations/sdp/psd.md ================================================ # Prague Czech-English Dependency Treebank Please refer to [Prague Czech-English Dependency Treebank](http://ufal.mff.cuni.cz/pcedt2.0/en/index.html). 
================================================ FILE: docs/annotations/sdp/semeval16.md ================================================ # SemEval2016 ## CSDP SemEval2016 adopts the CSDP guideline listed as follows. ### 语义关系标注标签集 | 分类 | | | | | ------------ | ------------ | --------------- | ------------------------------------------------------------ | | 语义周边角色 | 主体角色 | 施事AGT; | 施事Agt;感事Aft | | | | 当事EXP; | 当事Exp;领事Poss | | | 客体角色 | 受事PAT; | 受事Pat | | | | 客事CONT; | 客事Cont;成事Prod;结局Cons | | | | 涉事DATV; | 涉事Datv;比较Comp;源事Orig | | | | 系事LINK; | 类事Clas;属事Belg | | | 情境角色 | 工具TOOL; | 工具Tool | | | | 材料MATL; | 材料Matl | | | | 方式MANN; | 方式Mann;依据Accd | | | | 范围SCO; | 范围Sco | | | | 缘由REAS; | 缘故Reas;意图Int | | | | 时间TIME; | 时间Time;时间起点Tini;时间终点Tfin;时段Tdur;时距Trang | | | | 空间LOC; | 空间Loc;原处所Lini;终处所Lfin;通过处所Lthru;趋向Dir | | | | 度量MEAS; | 数量Quan;起始量Nini;终止量Nfin;数量短语Qp;频率Freq;顺序Seq;变化量Nvar | | | | 状态STAT; | 状态Stat;起始状态Sini;终止状态Sfin;历经状态Sproc | | | | 修饰FEAT; | 描写Desc;宿主Host;名词修饰语Nmod;时间修饰语Tmod | | 语义结构关系 | 反关系 | 反施事rAGT; | 反施事rAgt;反感事rAft | | | | 反当事rEXP。 | 反当事rExp;反领事rPoss | | | | 反受事rPAT; | 反受事rPat | | | | 反客事rCONT; | 反客事rCont;反成事rProd;反结局rCons | | | | 反涉事rDATV; | 反涉事rDatv;反比较rComp;反源事rOrig | | | | 反系事rLINK。 | 反类事rClas;反属事rBelg | | | | 反工具rTOOL; | 反工具rTool | | | | 反材料rMATL; | 反材料rMatl | | | | 反方式RMANN; | 反方式rMann;反依据rAccd | | | | 反范围rSCO; | 反范围rSco | | | | 反缘由rREAS; | 反缘故rReas;反意图rInt | | | | 反时间rTIME; | 反时间rTime;反时间起点rTini;反时间终点rTfin;反时段rTdur;反时距rTrang | | | | 反空间rLOC; | 反空间rLoc;反原处所rLini;反终处所rLfin;反通过处所rLthru;反趋向rDir | | | | 反度量rMEAS; | 反数量rQuan;反起始量rNini;反终止量rNfin;反数量短语rQp;反频率rFreq;反顺序rSeq;反变化量rNvar | | | | 反状态rSTAT; | 反状态rStat;反起始状态rSini;反终止状态rSfin;反历经状态rSproc | | | | 反修饰rFEAT; | 反描写rDesc;反宿主rHost; 反名词修饰语rNmod; 反时间修饰语rTmod | | | 嵌套事件关系 | 嵌套施事dAGT; | 嵌套施事dAgt;嵌套感事dAft | | | | 嵌套当事dEXP。 | 嵌套当事dExp;嵌套领事dPoss | | | | 嵌套受事dPAT; | 嵌套受事dPat | | | | 嵌套客事dCONT; | 嵌套客事dCont;嵌套成事dProd;嵌套结局dCons | | | | 嵌套涉事dDATV; | 嵌套涉事dDatv;嵌套比较dComp;嵌套源事dOrig | | | | 嵌套系事dLINK。 | 嵌套类事dClas;嵌套属事dBelg | 
| | | 嵌套工具dTOOL; | 嵌套工具dTool | | | | 嵌套材料dMATL; | 嵌套材料dMatl | | | | 嵌套方式dMANN; | 嵌套方式dMann;嵌套依据dAccd | | | | 嵌套范围dSCO; | 嵌套范围dSco | | | | 嵌套缘由dREAS; | 嵌套缘故dReas;嵌套意图dInt | | | | 嵌套时间dTIME; | 嵌套时间dTime;嵌套时间起点dTini;嵌套时间终点dTfin;嵌套时段dTdur;嵌套时距dTrang | | | | 嵌套空间dLOC; | 嵌套空间dLoc;嵌套原处所dLini;嵌套终处所dLfin;嵌套通过处所dLthru;嵌套趋向dDir | | | | 嵌套度量dMEAS; | 嵌套数量dQuan;嵌套起始量dNini;嵌套终止量dNfin;嵌套数量短语dQp;嵌套频率dFreq;嵌套顺序dSeq;嵌套变化量dNvar | | | | 嵌套状态dSTAT; | 嵌套状态dStat;嵌套起始状态dSini;嵌套终止状态dSfin;嵌套历经状态dSproc | | | | 嵌套修饰dFEAT; | 嵌套描写dDesc;嵌套宿主dHost; 嵌套名词修饰语dNmod; 嵌套时间修饰语dTmod | | | 事件关系 | 并列关系eCOO; | 并列eCoo;等同eEqu;分叙eRect;选择eSelt;割舍eAban;选取ePref;总括eSum | | | | 先行关系ePREC; | 先行ePrec;原因eCau;条件eCond;假设eSupp;手段eMetd;让步eConc | | | | 后继关系eSUCC; | 后继eSucc;递进eProg;转折 eAdvt;目的ePurp;结果eResu;推论eInf | | 语义依附标记 | 标点标记 | 标点标记mPUNC; | 标点标记mPunc | | | 依附标记 | 否定标记mNEG; | 否定标记mNeg | | | | 关系标记mRELA; | 连词标记mConj;介词标记mPrep | | | | 依附标记mDEPD; | 语气标记mTone;时间标记mTime;范围标记mRang;情态标记mMod; 频率标记mFreq;程度标记mDegr;趋向标记mDir;的字标记mAux; 多数标记mMaj;插入语标记mPars;离合标记mSepa;实词虚化标记mVain 重复标记mRept | ## SemEval2016 The following table is a subset of CSDP but offers some examples to illustrate the idea. 
| 关系类型 | Tag | Description | Example | |--------|---------------|--------------------|-----------------------------| | 施事关系 | Agt | Agent | 我送她一束花 (我 <– 送) | | 当事关系 | Exp | Experiencer | 我跑得快 (跑 –> 我) | | 感事关系 | Aft | Affection | 我思念家乡 (思念 –> 我) | | 领事关系 | Poss | Possessor | 他有一本好读 (他 <– 有) | | 受事关系 | Pat | Patient | 他打了小明 (打 –> 小明) | | 客事关系 | Cont | Content | 他听到鞭炮声 (听 –> 鞭炮声) | | 成事关系 | Prod | Product | 他写了本小说 (写 –> 小说) | | 源事关系 | Orig | Origin | 我军缴获敌人四辆坦克 (缴获 –> 坦克) | | 涉事关系 | Datv | Dative | 他告诉我个秘密 ( 告诉 –> 我 ) | | 比较角色 | Comp | Comitative | 他成绩比我好 (他 –> 我) | | 属事角色 | Belg | Belongings | 老赵有俩女儿 (老赵 <– 有) | | 类事角色 | Clas | Classification | 他是中学生 (是 –> 中学生) | | 依据角色 | Accd | According | 本庭依法宣判 (依法 <– 宣判) | | 缘故角色 | Reas | Reason | 他在愁女儿婚事 (愁 –> 婚事) | | 意图角色 | Int | Intention | 为了金牌他拼命努力 (金牌 <– 努力) | | 结局角色 | Cons | Consequence | 他跑了满头大汗 (跑 –> 满头大汗) | | 方式角色 | Mann | Manner | 球慢慢滚进空门 (慢慢 <– 滚) | | 工具角色 | Tool | Tool | 她用砂锅熬粥 (砂锅 <– 熬粥) | | 材料角色 | Malt | Material | 她用小米熬粥 (小米 <– 熬粥) | | 时间角色 | Time | Time | 唐朝有个李白 (唐朝 <– 有) | | 空间角色 | Loc | Location | 这房子朝南 (朝 –> 南) | | 历程角色 | Proc | Process | 火车正在过长江大桥 (过 –> 大桥) | | 趋向角色 | Dir | Direction | 部队奔向南方 (奔 –> 南) | | 范围角色 | Sco | Scope | 产品应该比质量 (比 –> 质量) | | 数量角色 | Quan | Quantity | 一年有365天 (有 –> 天) | | 数量数组 | Qp | Quantity-phrase | 三本书 (三 –> 本) | | 频率角色 | Freq | Frequency | 他每天看书 (每天 <– 看) | | 顺序角色 | Seq | Sequence | 他跑第一 (跑 –> 第一) | | 描写角色 | Desc(Feat) | Description | 他长得胖 (长 –> 胖) | | 宿主角色 | Host | Host | 住房面积 (住房 <– 面积) | | 名字修饰角色 | Nmod | Name-modifier | 果戈里大街 (果戈里 <– 大街) | | 时间修饰角色 | Tmod | Time-modifier | 星期一上午 (星期一 <– 上午) | | 反角色 | r + main role | | 打篮球的小姑娘 (打篮球 <– 姑娘) | | 嵌套角色 | d + main role | | 爷爷看见孙子在跑 (看见 –> 跑) | | 并列关系 | eCoo | event Coordination | 我喜欢唱歌和跳舞 (唱歌 –> 跳舞) | | 选择关系 | eSelt | event Selection | 您是喝茶还是喝咖啡 (茶 –> 咖啡) | | 等同关系 | eEqu | event Equivalent | 他们三个人一起走 (他们 –> 三个人) | | 先行关系 | ePrec | event Precedent | 首先,先 | | 顺承关系 | eSucc | event Successor | 随后,然后 | | 递进关系 | eProg | event Progression | 
况且,并且 | | 转折关系 | eAdvt | event adversative | 却,然而 | | 原因关系 | eCau | event Cause | 因为,既然 | | 结果关系 | eResu | event Result | 因此,以致 | | 推论关系 | eInf | event Inference | 才,则 | | 条件关系 | eCond | event Condition | 只要,除非 | | 假设关系 | eSupp | event Supposition | 如果,要是 | | 让步关系 | eConc | event Concession | 纵使,哪怕 | | 手段关系 | eMetd | event Method | | | 目的关系 | ePurp | event Purpose | 为了,以便 | | 割舍关系 | eAban | event Abandonment | 与其,也不 | | 选取关系 | ePref | event Preference | 不如,宁愿 | | 总括关系 | eSum | event Summary | 总而言之 | | 分叙关系 | eRect | event Recount | 例如,比方说 | | 连词标记 | mConj | Conjunction | 和,或 | | 的字标记 | mAux | Auxiliary | 的,地,得 | | 介词标记 | mPrep | Preposition | 把,被 | | 语气标记 | mTone | Tone | 吗,呢 | | 时间标记 | mTime | Time | 才,曾经 | | 范围标记 | mRang | Range | 都,到处 | | 程度标记 | mDegr | Degree | 很,稍微 | | 频率标记 | mFreq | Frequency Marker | 再,常常 | | 趋向标记 | mDir | Direction Marker | 上去,下来 | | 插入语标记 | mPars | Parenthesis Marker | 总的来说,众所周知 | | 否定标记 | mNeg | Negation Marker | 不,没,未 | | 情态标记 | mMod | Modal Marker | 幸亏,会,能 | | 标点标记 | mPunc | Punctuation Marker | ,。! | | 重复标记 | mPept | Repetition Marker | 走啊走 (走 –> 走) | | 多数标记 | mMaj | Majority Marker | 们,等 | | 实词虚化标记 | mVain | Vain Marker | | | 离合标记 | mSepa | Separation Marker | 吃了个饭 (吃 –> 饭) 洗了个澡 (洗 –> 澡) | | 根节点 | Root | Root | 全句核心节点 | See also [SemEval-2016 Task 9](https://www.hankcs.com/nlp/sdp-corpus.html) and [CSDP](https://csdp-doc.readthedocs.io/zh_CN/latest/%E9%99%84%E5%BD%95/).
================================================ FILE: docs/annotations/srl/cpb.md ================================================ # Chinese Proposition Bank | | 标签 | 角色 | 例子 | |------|----------|-------|-------------------------| | 中心角色 | ARG0 | 施事者 | (ARG0中国政府)提供援助 | | | ARG1 | 受事者 | 中国政府提供(ARG1援助) | | | ARG2 | 依谓词而定 | 失业率控制(ARG2在百分之十内) | | | ARG3 | 依谓词而定 | (ARG3从城市)扩大到农村 | | | ARG4 | 依谓词而定 | 提高(ARG4百分之二十) | | 附属角色 | ARGM-ADV | 状语 | (ARGM-ADV共同)承担 | | | ARGM-BNF | 受益者 | (ARGM-BNF为其他国家)进行融资 | | | ARGM-CND | 条件 | (ARGM-CND如果成功),他就留下 | | | ARGM-DIR | 方向 | (ARGM-DIR向和平)迈出一大步 | | | ARGM-EXT | 范围 | 在北京逗留(ARGM-EXT两天) | | | ARGM-FRQ | 频率 | 每半年执行(ARGM-FRQ一次) | | | ARGM-LOC | 地点、位置 | (ARGM-LOC在机场)被捕获 | | | ARGM-MNR | 方式 | (ARGM-MNR以中英文)发行 | | | ARGM-PRP | 目的或原因 | (ARGM-PRP由于危机)而破产 | | | ARGM-TMP | 时间 | 公司(ARGM-TMP去年)成立 | | | ARGM-TPC | 主题 | (ARGM-TPC稳定政策),核心是... | | | ARGM-DIS | 话语标记 | (ARGM-DIS因此),他感到不公 | | | ARGM-CRD | 并列论元 | (ARGM-CRD与台湾)非正式接触 | | | ARGM-PRD | 次谓词 | 指控廉政公署五人(ARGM-PRD接受贿赂) | ```{note} Although ARG0 and ARG1 share general definitions across all predicates, word sense disambiguation is required to find the corresponding definition of semantic roles. Given the word sense of `变化`, say `变化-2`, [its second frameset](http://verbs.colorado.edu/chinese/cpb/html_frames/0183-bian-hua.html) can be found which defines the following 2 arguments: 1. ARG0: agent/cause 2. ARG1: entity arg0 changes These definitions are different from that of frameset `变化-1`: 1. ARG0: entity undergoing change Sometimes, the number of arguments and definitions can vary a lot across framesets.
In summary, word sense disambiguation is essential if SRL is to be used to best effect in practical applications ``` ================================================ FILE: docs/annotations/srl/index.md ================================================ # Semantic Role Labeling ## Chinese ```{toctree} cpb ``` ## English ```{toctree} propbank ``` ================================================ FILE: docs/annotations/srl/propbank.md ================================================ # English PropBank | Role | Description | |------|----------------------------------------| | ARG0 | agent | | ARG1 | patient | | ARG2 | instrument, benefactive, attribute | | ARG3 | starting point, benefactive, attribute | | ARG4 | ending point | | ARGM | modifier | | COM | Comitative | | LOC | Locative | | DIR | Directional | | GOL | Goal | | MNR | Manner | | TMP | Temporal | | EXT | Extent | | REC | Reciprocals | | PRD | Secondary Predication | | PRP | Purpose | | CAU | Cause | | DIS | Discourse | | ADV | Adverbials | | ADJ | Adjectival | | MOD | Modal | | NEG | Negation | | DSP | Direct Speech | | LVB | Light Verb | | CXN | Construction | ================================================ FILE: docs/annotations/tok/ctb.md ================================================ The Segmentation Guidelines for the Penn Chinese Treebank (3.0) =============================================================== Fei Xia *University of Pennsylvania* This is an OCR version. See also the [PDF version](https://repository.upenn.edu/cgi/viewcontent.cgi?article=1038&context=ircs_reports). ## Abstract This document describes the segmentation guidelines for the Penn Chinese Treebank Project. The goal of the project is the creation of a 100-thousand-word corpus of Mandarin Chinese text with syntactic bracketing. The Chinese Treebank has been released via the Linguistic Data Consortium (LDC) and is available to the public. 
The segmentation guidelines have been revised several times during the two-year period of the project. The previous two versions were completed in December 1998 and March 1999, respectively. This document is the third and final version. We have added an introduction chapter in order to explain some rationale behind certain decisions in the guidelines. We also include the English gloss to the Chinese words in the guidelines. In this document, we first discuss the notion of word and tests for wordhood that have been proposed in the literature. Then we give the specification for word segmentation. The specification is organized according to the potential Part-of-Speech tag of an expression and the internal structure of the expression. Next, we specify the treatment for some common collocations. Finally, we compare our guidelines with two segmentation standards: the first (Liu et al., 1993) is used in Mainland China and the second (CKIP, 1996) is used in Academia Sinica in Taiwan. ## Chapter 1 Introduction This document is designed for the Penn Chinese Treebank Project [XPX+ 00]. The goal of the project is the creation of a 100-thousand word corpus of Mandarin Chinese text with syntactic bracketing. The annotation consists of two stages: the first stage is word segmentation and part-of-speech (POS) tagging and the second stage is syntactic bracketing. Each stage includes at least two passes, that is, the data are annotated by one annotator, then the resulting files are checked by another annotator. The segmentation guidelines, like POS guidelines and bracketing guidelines, have been revised several times during the project. So far, we have released all three versions on our web site: the first draft was completed in December 1998, after the first pass of word segmentation and POS tagging; the second draft in March 1999, after the second pass of word segmentation and POS tagging. This document, which is the third draft, is revised after the second pass of bracketing.
The major changes in the third draft, compared with the previous two drafts, are (1) we add an introduction chapter in order to explain some rationale behind the guideline, (2) we add the gloss to the Chinese words in the guidelines,1 and (3) we also turn the guidelines into a technical report, which is published by the Institute for Research in Cognitive Science (IRCS) of the University of Pennsylvania. ### 1.1 Notion of *word* The difficulty in defining the notion of word is not unique to Chinese,2 but the problem is certainly more severe for Chinese for a number of reasons. First, Chinese is not written with word delimiters so segmenting a sentence into "words" is not a natural task even for a native speaker. Second, Chinese has little inflectional morphology to ease word identification. Third, there is little consensus in the community on difficult constructions that could affect word segmentation. For instance, the segmentation of verb resultative compounds depends on the syntactic analysis of the construction. One view on how a verb resultative compound is formed says that a simple sentence with a compound is actually bi-clausal and the compound is formed by movement, therefore, the compound should be treated as two words. Another view believes that the compound is formed in the lexicon, and therefore should be one word. The segmentation of the verb resultative compounds depends on which view we adopt for this construction. Fourth, many monosyllabic morphemes that used to be able to stand alone in non-Modern Chinese become bound in Modern Chinese. The influence of non-Modern Chinese makes it difficult to draw the line between bound morphemes and free morphemes, the notions which could otherwise have been very useful for deciding word boundaries. Our approach is based on both linguistic and engineering consideration. 
The notion word in our Treebank is roughly a syntactic atom as defined in [SW87], that is, anything that can be inserted into an X° position in syntax. This includes both compounds and simple words. ### 1.2 Tests of wordhood What tests can be used to decide whether a string of hanzi[Chinese character] is a word or not? Without loss of generalization, we assume the string that we are trying to segment is X-Y, which has two morphemes X and Y. The following tests for establishing word boundaries have been proposed by various authors: - Bound morpheme: a bound morpheme should be attached to its neighboring morpheme to form a word when possible. - Productivity: if a rule that combines the expression X-Y does not apply generally (i.e., it is not productive), then X-Y is likely to be a word. - Frequency of co-occurrence: if the expression X-Y occurs very often, it is likely to be a word. - Complex internal structure: strings with complex internal structures should be segmented when possible. - Compositionality: if the meaning of X-Y is not compositional, it is likely to be a word. - Insertion: if another morpheme can be inserted between X and Y, then X-Y is unlikely to be a word. - XP-substitution: if a morpheme can not be replaced by a phrase of the same type, then it is likely to be part of a word. - The number of syllables: several guidelines [LTS93, Chi96] have used syllable numbers on certain cases. For example, in [LTS93], a verb resultative compound is treated as one word if the resultative part is monosyllabic, and it is treated as two words if the resultative part has more than one syllable. All of these tests are very useful. However, none of them is sufficient by itself for covering the entire range of difficult cases. 
Either the test is applicable only to limited cases (e.g., the XP-substitution test) or there is no objective way to perform the test as the test refers to vaguely defined properties (e.g., in the productivity test, it is not clear where to draw the line between a productive rule and a non-productive rule). For more discussion on this topic from the linguistics point of view, please refer to [Pac98, SW87]. Since no single test is sufficient, we chose a set of tests for our segmentation guidelines which includes all of the ones mentioned except for the productivity test and the frequency test. Rather than have the annotators try to memorize the entire set and make each decision from these principles, in the guidelines we spell out what the results of applying the tests would be for all of the relevant phenomena. For example, for the treatment of verb resultative compounds, we select the relevant tests (e.g., the number of syllables and the insertion test), and give several examples of the results of applying these tests to verb resultative compounds. This makes it straightforward, and thus efficient, for the annotators to follow the guidelines. ### 1.3 Compatibility with other guidelines We have studied other groups' guidelines, such as the Segmentation Standard in China [LTS93] and the one in Taiwan [Chi96], and tried to accommodate them in our guidelines if possible. Since the final result of the Treebank is a list of bracketed sentences, our guidelines have some flexibility with regard to the segmentation of certain constructions. For example, the string 走上来[walk up] is treated as two segments in [LTS93], but one segment in [Chi96]. In our Treebank, we will segment it into two parts, and then group them together as a compound — that is, (走[walk]/V 上来[up]/V)/V. We call 走上来 a word with internal structures. Our annotation, in this case, is compatible with both [LTS93] and [Chi96]. The comparisons of these three guidelines can be found in Appendix A.
Note: For the sake of annotation efficiency, the grouping of the words with internal structure is done at the bracketing stage, rather than at the segmentation stage. In this document, we show the grouping format, but keep in mind that the format is the one AFTER the bracketing is completed. For example, we consider 走上来[walk up] as one word. It is segmented into “走[walk]/V 上来[up]/V” at the segmentation stage, and it will be grouped into (走[walk]/V 上来[up]/V)/V at the bracketing stage. In the paper, we just say 走上来[walk up] should be annotated as (走[walk]/V 上来[up]/V)/V. Most disagreements among these three guidelines do not make much difference to parsing or sentence interpretation. For most patterns for which the guidelines give different treatments (e.g., numbers and reduplication strings), simple conversion programs can be written to convert the data from one format to another. Our goal is: in the final output, the word boundary (the highest-level X° in the parse tree) should be as accurate as possible, while the internal structure serves as a bridge for the resource sharing with other systems. ### 1.4 Treatment for unclear cases There are two types of unclear cases: - A construction is easy to identify but there is no consensus on its treatment. Ex: A-not-A, V-de construction, V-R, potential form (i.e., V-de-R). Our approach: we will choose one analysis, and annotate the data according to that analysis. Make sure that the annotation is easy to convert to the structures for other analyses if necessary. - Two constructions are difficult to tell apart by existing tests. Ex: some N+N are compounds, others are phrases. Our approach: for the sake of consistency and efficiency, we don't disambiguate the two constructions unless making the distinction is crucial for various reasons.
### 1.5 Organization of this guidelines The guidelines are organized according to the internal structure of the corresponding expressions (e.g., a verb resultative compound is represented as V+V, while a verb-object expression is as V+N), so it is easy for the annotators to search the guidelines for reference. The Part-of-speech tags used in this paper are identical to the ones used in the POS tagging task except that the tags for verbs are merged into V and the ones for nouns are merged into N. For the descriptions of the complete POS tagset, please refer to our Part-of-Speech Tagging Guidelines for the Penn Chinese Treebank (3.0). The list of POS tags can be found in Appendix B. In these guidelines, we list mainly the decision for each case without going into detail elaborating other alternatives and the reasoning behind each decision. Chapter 2 Specification --------- In this chapter, we assume that a sentence has been segmented into large chunks, and the next step is to decide whether each chunk should be further divided. The chapter is arranged by the potential POS of the chunk if the chunk is a word. To search through the section, first use the “POS” of the chunk to find the subsection, then use the “word” formation information to find the subsection; or simply use the “word” formation information. ### 2.1 Common noun: NN #### 2.1.1 Name of relative Treat it as one word. Ex:三叔[uncle]/NN,表叔[uncle]/NN,大姑父[uncle]/NN. #### 2.1.2 CD+N If a measure word can be inserted between CD and N without changing the meaning, tag it as CD+N; otherwise, tag it as one word (N). One word:三排[the third platoon]/NN,一方[one side]/NN,三者[three entities]/NN, 一行[a group traveling together]/NN,21世纪[the 21st century]/NT. Two words: 一[one]/CD 学生[student]/NN. #### 2.1.3 DT+N Treat it as one word if both DT and N are monosyllabic and either DT or N is bound; otherwise, treat it as two words.
Sometimes, it is difficult to decide whether a morpheme is bound or not because of the influence of non-Modern Chinese. To be consistent, we maintain a list of nouns and a list of determiners. If a morpheme is in one of the lists, we consider it as bound: - monosyllabic bound nouns: /^.[school], ^ (when it means the earth). - monosyllabic bound determiners:当[this/that] We also treat 本人[oneself]/NN as one word and tag it as NN. One word:本人[oneself]/NN,本校[our school]/NN,全球[whole world]/NN,当地[the place mentioned]/NN,当今[present time]/NT,当代[the contemporary era]/NN. Two words:本[one’s]/DT 单位[organization]/NN. #### 2.1.4 PN+N Treat it as one word if both PN and N are monosyllabic and N is bound; otherwise, treat it as two words. In this case, the current list of bound nouns is:校[school]. One word:我校[my school]/NN. Two words:我[my]/PN 单祆[organization]/NN. #### 2.1.5 JJ+N The pattern is: X+N, where X modifies the N, and X is either a JJ or a prefix. Note: JJ+N can be a phrase. For example, in one of the files we annotated,全国性[nationwide]/JJ 网络[network]/NN is extended into “全国性[nationwide]/JJ 观测[observe]/VV 苏梅克一列桌/NR 9 号[number 9]/NN 彗星[comet]/NN 撞击[hit]/W 木星[Jupiter]/NN 的/DEC 网络[network]/NN”. Segment X+N according to the type of X: - X is a prefix: treat X+N as one word.[1](#bookmark93) A list of prefixes:啊,非[non-]. Ex:啊爸[father]/NN,非商业化[non-commercial]/JJ 宗旨[purpose]/NN. A list of JJs:原[former],前[former] Ex:原[former]/JJ 在[at]/P 华[China]/NR 老挝[Laos]/NR 难民[refugee]/NN; 前[former]/JJ 民主德国[German Democratic Republic]/NR. - X is a non-predicate adjective:[2](#bookmark94) if both JJ and N are monosyllabic, tag it as one word; otherwise, treat it as JJ+N. One word:女人[woman]/NN. Two words:共同[mutual]/JJ 利益[interest]/NN. - X is an adjective: treat it as one word if X or N is bound or the meaning of X+N is non-compositional. 
For unclear cases, if both JJ and N are monosyllabic, treat JJ+N as one word (e.g” 鲜花[fresh flower]/NN,强队[strong team]/NN, •红茶[black tea]/NN,好评[favorable comment]/NN). One word:小媳妇[daughter-in-law]/NN,大洲[continent]/NN,大海[sea]/NN. Two words:厚[thick]/JJ 书[book]/NN. #### 2.1.6 LC+N If both LC and N are monosyllabic, treat the string as one word, and tag it as NN or NT according to its meaning. Ex:前院[front yard]/NN,前天[day before yesterday]/NT,左肩[left shoulder]/NN. #### 2.1.7 N+LC Treat N+LC as one word if:[3](#bookmark95) - the N and LC are monosyllabic; and - in this context, the N is non-referential or bound; and - in this context, the N can not be modified by Det-M or other modifiers. Otherwise, treat it as two words. - One word (some of them might be two words in other context):室内[indoor](室内[indoor]/NN 训练[training]/NN),台下[off stage],眼前[at present],境外[foreign](境外[foreign]/NN 集团[group]/NN 境内外[domestic and international /NN,海外[oversea](海外[oversea]/NN 市场[market]/NN),背后[at the back]/NN,天下[world]/NN,国内[domestic]/NN,午后[afternoon]/NT,赛前[before the contest]/NT. - Two words:中午[noon]/NT 以后[afterwards]/LC. #### 2.1.8 N+N: N1 modifies N2 If it is 1-hl or 2+1 (i.e., N1 has one or two hanzi and N2 has one hanzi), treat N1+N2 as one word (i.e.,we treat all monosyllabic nouns as potential “接尾词. If a noun with no more than 2 hanzi is followed by multiple "接尾词" monosyllabic noun attaches to the preceding the whole string is treated as one word (e.g•,物理学家[physicist]/NN). For other cases, the string is treated as two words. - One word:北京市[Beijing]/NR,研究室[research lab]/NN,发展史[developmental history]/NN,始祖鸟[proto-bird]/NN, 残疾人[the physically challenged]/NN, 清晰度[visibility]/NN, [sense of urgency]/NN, 大奖赛[tournament]/NN,太阳系[the solar system]/NN. - Two words:北京[Beijing]/NR 大学[University]/NN,坑具[toy]/NN 工厂[factory]/NN,合作[collaboration]/NN, 领城[area]/NN,史学[history]/NN 研究[research]/NN. #### 2.1.9 PN+LC If both PN and LC are monosyllabic, treat PN+LC as one word and tag it as NT or NN. 
One word:此间[here]/NN,此前[before this]/NN,其中[among them]/NN,何时[when]/NT. Two words:这[this]/PN 以后[after]/LC. #### 2.1.10 V+N In this pattern, we assume V is VV (For VA+N, please refer to the section for JJ+N). If V modifies N, treat V+N as one word and tag it as a noun. One word:烤肉[barbecue]/NN,炒菜[stir-fried dishes]/NN,证明信[certificate]/NN,讨论会[symposium]/NN.[4](#bookmark96) ### 2.2 Proper Noun: NR Currently, if the proper noun is composed of multiple words, we don't group them. #### 2.2.1 Personal name Treat it as one word. Don't give the internal structure unless there is a space between two names (in foreign alphabet). Ex:张胜利/NR,卡尔[Karl].马克斯[Marx]/NR, John/NR Smith/NR. #### 2.2.2 Personal name with affixes Treat it as one word. Ex:老张/NR,张老/NR #### 2.2.3 Personal name + title Treat it as two words. Ex:张/NR 教授[professor]/NN,张/NR 李/NR 两[two]/CD 位/M 教授[professor]/NN. #### 2.2.4 Name of Organization/Country/School/.. If the pattern is N1+N2, where N2 is a common noun, then if N2 is monosyllabic, treat N1+N2 as one word, else treat N1+N2 as two words. Simple names:北京市[Beijing]/NR,黄河[the Yellow River]/NR,沙市[Sha City]/NR,黑龙江省[Heilongjiang Province]/NR. Complex names:北京[Beijing]/NR 大学[University]/NN,北京[Beijing]/NR 第一[First]/OD 服装厂[Clothing Factory]/NN,美国[the United States]/NR 国会[Congress]/NN. #### 2.2.5 NR+NR: coordination without conjunction Treat it as two words. Ex:中[China]/NR 美[the United States]/NR,中[China]/NR 美[the United States]/NR 关系[relation]/NN, 东[Eastern Asia]/NR 新[Singapore]/NR 澳[Macao]/NR. ### 2.3 Temporal noun: NT The names of years/months/day/hour and so on are words. Ex: 1998年[1998]/NT 3月[March]/NT 21日[21st]/NT, 5点钟[5 o’clock]/NT,初一[the first day of a lunar month]/NT,去年[last year]/NT. #### 2.3.1 CD+N If CD+N is the name of a time, treat it as one word (NT). If it is the count of the time, treat it as two words (CD+M). One word: 1998年[1998]/NT, 5点钟[5 o’clock]/NT, 90年代[the 90s]/NT. Two words: 3/CD 年[year]/M, 3/CD 个/M 月[month]/NN.
### 2.4 Localizer: LC Localizers are separated from the nouns that they attach to except for the case mentioned in Section 2.1.7 (i.e., N+LC). A localizer is either one or two syllables: - monosyllabic localizers: e.g., 内[in],后[after]. - bisyllabic localizers: e.g., 之间[between],以来[since],以后[afterwards],左右[around]. ### 2.5 Pronoun: PN Treat it as one word. Ex:他们[they]/PN,他自己[himself]/PN,自己[self]/PN. ### 2.6 Determiner: DT We separate DTs from the succeeding words. Ex:这[this]/DT 三[three]/CD 个/M 人[people]/NN,各[each]/DT 国[nation]/NN. Currently, we treat 这些[these] as one word, and tag it as DT. Some examples of bisyllabic DTs:全体[all],其余[the rest],一切[all],这些[these],那些[those],所有[all]. ### 2.7 Cardinal number: CD Treat it as one word. Note: the internal structure of a CD is very easy to recover if needed. Some examples: - Pure numbers: 一亿三千万[one hundred and thirty million]/CD, 30.1/CD, 123,456/CD, 35.6%/CD, 30万[three hundred thousand]/CD, 30几[thirty odd]/CD. - Estimation:三四十[between thirty and forty-nine]/CD 岁[years old]/M. - CD + X + CD(5.5.4): X is a morpheme such as 余[odd],分之[fraction],点[point]: 三十几亿[three billion odd]/CD,三分之一[one third]/CD,三点一[three point one]/CD,好几[multiple]/CD 个/M. - CD+X: X is a morpheme such as 余[odd],来[over/odd]:四千一百余[four thousand and one hundred odd]/CD 人[people]/NN,三十来[about thirty]/CD 个/M. ### 2.8 Ordinal number: OD Treat it as one word. Ex:第一[first]/OD,第三十一[thirty-first]/OD. ### 2.9 Measure word: M Treat the measure word, including a reduplicated or a compound measure word, as one word. Treat the string such as 分钟[minute] as one word. Ex:杯[cup]/M,杯杯[cup-cup]/M,架次[number of flights]/M,分钟[minute]/M. ### 2.10 Verb: VA, VC, VE, and VV #### 2.10.1 Reduplication: AA, ABAB, AABB, AAB, ABB, ABAC Treat it as one word. - AA, A is a verb: AA/V Ex:看看[see]/VV,红红[vivid red]/VA. - ABAB: AB is a verb: ABAB/V Ex:研究研究[research]/VV,雪白雪白[snow white]/VA. - AABB, AB is a verb: AABB/V Ex:来来往往[come and go]/VV,高高兴兴[happy]/VA Note: most of the time, AA or BB is not a word.
- AAB(except for AA-看 in 2.10.2):AAB/V Ex:蒙蒙亮 Note: most of the time, AA or B is not a word. - ABB: ABB/V Ex:绿油油[bright green]/VA,红彤彤[bright red]/VA. Note: most of the time, A or BB is not a word. - ABAC, etc.: ABAC/V Ex:马里马虎[careless]/VA,有条有理[orderly]/VA,一清二楚[very clear]/VA. #### 2.10.2 “Reduplication”: AA-kan, A-one-A, A-le-one-A,A-le-A Treat it as one word with internal structure. - AA-看:(AA/V 看/V)/V Ex:(说说[say]/W 看/VV)/V. The basic meaning of the word 看 is to “see”,but in this context,it roughly means "try to do something". - A-one-A: (A/V one/CD A/V)/V Ex:(想[think]/W — [one]/CD 想[think]/VV)/V. - A-le-A: (A/V le/AS A/V)/V Ex:(想[think]/W 了/AS 想[think]/W)/V. - A-l^on^A: (A/V le/AS one/CD A/V)/V Ex:(想[think]/W 了/AS — [one]/CD 想[think]/W)/V. Note: V+CD+M is treated as three words, e.g. [look]/V [one]/CD [eye]/M (take a look). #### 2.10.3 A-not-A Treat it as one word with internal structure. Ex:(来[come]/VV 没[not]/AD 来[come]/VV)/V,(高[happy]/VA 不[not]/AD 高兴[happy]/VA)/V, (喜[like]/VV 不[not]/AD 喜欢[like]/VV)/V. #### 2.10.4 AD+V If one or more of the following hold, treat AD+V as one word (V): - no free word can intervene between AD and V, - the V cannot be a predicate without the AD, - the subcategorization frame of AD+V is different from that of the V. Otherwise, treat it as two words. - One word:胡说[talk nonsense],胡来[mess things up],敬献[present with great respect],尚余[remain] [(尚余[still remain]/VV 七十五[75]/^D 名)M 难民[refugee]〉NN),历任[have served successively as],并列[tie处 不喪[not afraid o月. - Two words:已经[already]/AD 采取[take]/VV,不[not]/AD 应该[should]/VV,没[not]/AD 完成[complete]/VV. #### 2.10.5 MSP+V If the V can not be a predicate without the MSP, treat MSP+V as one word (V). One word:以期[in order to]/W (以期[in order to]/W 在[at] 与[with] 美国[the United States]、 瑞典[Sweden]、挪威[Norway]、这些 [these]、世界[world]、强队[strong teams] 、交锋[competition] 、中[during]...). #### 2.10.6 N+V Some subject-predicate strings Coin be either a phrase or a word depending on the context. 
If a VP-modifier can be inserted between the subject and the predicate part and the “subject” is referential, then the string is a phrase, otherwise it is a word. One word:头疼[headache]/VA in “他[he]/PN 让[make]/VV 我[me]/PN 很[very]/AD 〈He gives me a headache}”. Two words:头[head]/NN 疼[ache]/VA in “我[I]/PN 头[head]/NN {很[very]/AD}疼[ache]/VA〈I have a headache}’’. #### 2.10.7 V+N If the V and the N axe separated (by the aspect markers, by the modifiers of the N, or because the V is reduplicated), treat V+N as two words. If the V and the N are adjacent,[6](#bookmark98) - If V-N is semantically transitive and its object can occur after N only when VN are adjacent (therefore the V is not a ditransitive verb),treat V+N as one word (e.g.,投资[invest]/VV, 出席[be present]/W,关心[care]/VV,为期[scheduled for a specific duration of time]/W). - If V and VN have similar meaning and both axe semantically intransitive, treat VN as one word (e.g.,睡觉[sleep]/VV). - If N is “bound”, treat VN as one word (e.g.,游泳[swim]/VV,无望[hopeless]/VV,无效[invalid]/VV, 无法[unable to]/VV,辞职[resign]/W). - If V-N is 1+1 AND the meaning is non-compositional,treat V-N as one word (e.g.,念书[study]/VV, 流血[bleed]/VV). Examples of V-N as two words:访[visit]/VY 华[China]/NR in the sentence 他[he]/PN 曾[previously]/AD 七[seven]/CD 次[time]/M 访[visit]/W 华[China]/NR〈He has visited China seven times、 #### 2.10.8 V+R The tests for verb resultative compounds (V-Hs): both V and R are verbs and the potential forms (V-de-R, V-not-R) exist. So our definition of V-R includes resultative and directional verb com-pounds (e.g.,看见[see] and 走上来[walk up]),but it does NOT include words such as 改善[improve] and 鼓动[agitate]. - We treat it as one word. For the sake of compatibility with other guidelines, we give the internal structure for the words if they have more than 2 syllables or if the R is the following:完[finish]/W. - Words without internal structure:吃掉[eat up]/VV,看见[see]/W,擦净[wipe clean]/VV. 
- Words with internal structures:(做[do]/VV 完[finish]/W)/V,(擦[wipe]/VV 干净[clean]/VV)/V, (认识[realize]/W 到[reach]/VV)/V. #### 2.10.9 Potential form: V-de/bu-R We treat it as one word. - If V-R exists, give the internal structure of V-de/bu-R, otherwise, don^ give one. Ex: words with internal structure:(擦[wipe]/VV 不[not]/AD 冷[clean]/VA)/V,(擦[wipe]/VV 得/DER 净[clean]/VA)/V. " - words without internal structure:吃不了 [unable to eat anymore]/W,买不起[cannot afford]/VV. Note: the string WV de R,? can be ambiguous between potential form and V-de construction. For example, “这[this]张[M]桌子[table]擦[wipe]得pER]干净[clean]吗[SP]?’’ can either be a potential form (which means Can this table be wiped clean?), or it could be a V-de construction (which means Has the table been wiped clean?). The two constructions have different syntactic structures. Normally, we can tell them apart by meaning, by the position of the object or by checking whether adverbs can be inserted between the de and the R. #### 2.10.10 V+DIR See Section 2.10.8 (i.e., the section for V+R). Words with internal structure:(走[walk]/VV 出去[out]/VV)/V,(走[walk]/VV 不[not]/AD 出去[o叫 Words without internal structure:走出[walk out of]/VV,想出[think of]/VV. #### 2.10.11 V+AS Treat it as two words.[7](#bookmark99) Ex:走[walk]/VV 了/AS. #### 2.10.12 V+DER The pattern is V-de in V-de construction. We treat V-de as two words.[8](#bookmark100) Ex:走[walk]/VV 得/DER (走[walk]/W 得/DER 很[very]/AD 快[fast]/VA). #### 2.10.13 Verb coordination without conjunctive words If the pattern is 1+1, treat it as a word; otherwise, treat it as multiple words. One word:修建[build]/VV. Two words:宣传[propagate]/VV 鼓动[agitate]/VV. #### 2.10.14 V+coverb The pattern is V+X, where X is monosyllabic and it is either a P or a V.[9](#bookmark101) - We first decide whether V+X is a word. If it is, we use its syllable count to decide whether to show its internal structure. That is, if V is monosyllabic, don^ give the internal structure; otherwise, give the internal structure. 
- treat V+X as one word if X is in the following list:给[give];为[become],成[become],作[treat as],到[arrive],出[out];自[from],向[toward],入[in],以[with]. Ex: - 给[give]:送给[give/send to]/VV,交给[hand in]/VV,(赠送[give as a gift to]/VV 给[give]/VV)/V. - 为[to],成[become/into],作[do/as],到[arrive],出[out]:(翻译[translate]/VV 成[become] 当作[treat as]/VV,起到[take effect]>V,找到[find]/VV,(认识[realize]/VV 到[reach]/VV)/V,决出[decide victors]/VV. - 自[from],向[toward],入[in],以[with]:来自[come from]面向[face toward]/ into]/VV,迈向[step toward],VV,报以[respond with]/VV,加以[supplement with]/VV. - treat V+X as two words if X is in the following list:在[at],似[like]. - Ex:生[to be born]/W 在[at]/P,坐[sit]/W 在[at]/P,留[stay]/W 在[at]/P,深[deep]/VA 似pike]/P 海[sea]〉NN. - treat V+X as one word or two words (V+P) according to the meaning of the X, if X is in the following list:于[at]. - If 于 in V + 于 can be replaced by 在[at], tag V+于 £us two words (V+P). Otherwise, tag it as one word. - One word:等于[equal to]/VV,缘于[due to]/VV,大于[bigger than]/VV,小于[smaller than]/VV, 无助于[of no help to]/VY 低于[lower than]/W,利于[be beneficial for]/W,有利于[be beneficial for]/VV. - Two words:生[to be born]/W 于[at]/P,建[build]/VV 于[at]/P. #### 2.10.15 Others Generally, in X+V(or V+X) where X modifies V, if X cannot modify other verbs, or V cannot be a predicate without the X, treat X+V as one word. - Ex:以期[in order to]/W ### 2.11 Adverb: AD Adverbs are separated from the XP that it modifies. Adverbs that modify numbers:近[almost]/AD 三十[thirty]/CD,5[five]/CD 分[minute]/M 多[odd]^ 钟[minute]/NN.[10](#bookmark102) The string such as fe^[extremely big] is an adverb when it modifies VPs, not AD+VA, because the VA(大[big]) cannot modify VPs without the AD(极[extremely]). #### 2.11.1 Reduplication When VA(or AD) reduplicates, the resulting word can be an AD. Ex:妤好[well]/AD 干[do]/W,常常[always]/AD,仅仅[only]/AD. 
#### 2.11.2 DT+M/N The following are tagged as ADs when they modify VP/S:这样[this way]/AD (这样[this way]/AD 做[do]/W),同机[on the same airplane]/AD (同机[on the same airplane]/AD 到达[arrive]/W). #### 2.11.3 P+PN We treat the following as two words:为[for]/P 此[this]/PN. #### 2.11.4 P+N The following can be seen as frozen PPs. Since they have the same function as the ADs, we treat them as words, and tag them as ADs:迄今[until now],沿途[on the way],即席[impromptu], 为何[why](为何[why]/AD 愈演愈洩[get worse and worse]/VA),为什么[why]/AD 来[come]/VV #### 2.11.5 PN+LC If a PN+LC totally loses the function of an NP and the string acts like an adverb, treat it as an adverb. We treat the following as ADs:此外[in addition]/AD. #### 2.11.6 Others If in that context a string totally loses the function of the XP(where X is the head of the string) and the string behaves like an adverb, tag it as AD. We treat the following as ADs:进一步[a step further]/AD. ### 2.12 Preposition: P Separate it from NP/S that follows it. Most prepositions are monosyllabic. Some common bisyllabic prepositions are:为了 [in order to],随着[along with],沿着[along],本着[in conformity with],鉴于[due to],除了[except],经过[through], 作为[being/regard as],截止[until]. When a coverb follows a verb, we have to decide whether the word is part of a verb compound. A list of such coverbs are:于,给,为, See Section 2.10.14 for details. ### 2.13 Subordinating Conjunction: CS Separate it from the XP that follows it. Strings such as 只有[only] is ambiguous: - CS:只有[only if]/CS ...才[then]/AD .... - AD+VE:他[he]只[only]/AD 有[have]/VE 三[three]/CD 块/M 钱[money]/NN〈He only has three dollars). ### 2.14 Conjunction: CC Separate it from the XPs that it conjoins. Ex:和[and]/CC,与 ### 2.15 Particle: DEC, DEG, DEV, DER,AS, SP,ETC,and MSP Separate it from the XP that it attaches to.[11](#bookmark103) Most particles axe monosyllabic. One of bisyllabic particles is 的话[if so]/SP. ### 2.16 Interjection: IJ Treat it as one word. Ex: 哈[expressing satisfaction and so on]/IJ. 
### 2.17 Onomatopoeia: ON Treat it as one word. Ex:哈哈[sound of laughter]/ON,哔啦啦[sound of water/rain]/ON ### 2.18 Other noun-modifier: JJ Separate it from the measure word (M) or the noun (N) that it modifies. Ex:三[three]/CD 大[big]/JJ 杯[glass]/M 水[water]/NN "When JJs modify nouns, the JJs can be adjectives,区别词(非谓形容词),or “phrasal words”. Most of the <4phrasal words,? have two parts: X+Y, both X and Y are monosyllabic, and X or Y is the short-form of the corresponding words. Some examples of the "phrasal words" are as follows: #### 2.18.1 V+N V+N:随军[being with the army]/JJ.妓女[prostitute]/NN,旅英[having studied in England]/JJ 学者[scholar]/N^ 成套[forming a complete set]/JJ 设备[equipment]/NN,.发稿[sending manuscripts to press]/JJ 时间[time]/NN, ^^[receiving award]/JJ #i[scholar]/NN, 驻华[being stationed in China]/JJ 使馆[embassy]/NN, ^4[giving benefit]/JJ 国家[nation]/NN, #### 2.18.2 AD+VA AD+VA:最新[the newest]/JJ 消息[news]/NN,超大[extra-large]/JJ 规模[scale]/NN 集成[integrate]/NN 电路[circuit]/NN,较大[relatively big]/JJ 增长[growth]/NN. The common “AD”:最[the most],超[extra-],较[relatively]. #### 2.18.3 VA+N VA+N/M:高层[high-ranking]/JJ 人士[official]/NN,高速[high speed]/JJ 公路[highway]/NN,大幅[big size]/JJ 标语[slogan]/NN. #### 2.18.4 CD+N CD+N/M:两国[two~nation]/JJ 关系[relation]/NN,多国[multi-nation|/JJ 部队[troop]/NN #### 2.18.5 P+N P+N/LC:对外[foreign]/JJ 政策[policy]/NN #### 2.18.6 Others others:关贸[tariff and trade]/JJ 总协定[treaty]/NN,年均[annual average]/JJ 增长率[growth rate]/NN, 上述[aforementioned]/JJ 三[three]/CD 国[nation]/NN,历届[all previous sessions]/JJ 世界[world]/NN 体操[gymnastics]/NN 大赛[championship]/NN,有关[related]/JJ 方面[parties]/]S[N. ### 2.19 Punctuation: PU Treat it as one word, except when it is part of another word; for example, 4V? in a number (e.g., 123,456/CD) or in proper names,(e.g.,卡尔[Karl].马克斯[Marx]/NR). ### 2.20 Foreign word: FW Treat it as one word, except when it is part of another word (e.g., [Karaoke]/NN). 
### 2.21 Others #### 2.21.1 Idioms The frozen idioms (成语) are treated as words when they function as an NP or a VP. Ex:各有所好[each has his likes and dislikes]/V, 一比高低[compete]/V. #### 2.21.2 Telescopic strings Telescopic strings are treated as one word if they are not too long (less than four characters). If they are too long, segment them according to pauses. Short strings:进出口[imports and exports]/NN 贸易[trade]/NN,国内外[foreign and domestic]/NN 形势[situation]/NN. Long strings:交响[symphony]/JJ 乐团[orchestra]/NN,北京[Beijing]/NR 市长[mayor]/NN. #### 2.21.3 Short form Ex:三好[three-merit]/JJ 学生[student]/NN,教科文[education, science, and culture]/NN 组织[organization] (UNESCO),七中[the seventh central government]/NN 全会[convention]/NN. Shortened part is treated as one word. If the shortened part is longer than 3 syllables, segment them according to phonologic evidence (e.g., pauses). The structure of the short form might be different from that of the full form. Chapter 3 Collocation with Some Morphemes --------- ### 3.1 Strings with zhe5 Some prepositions end with 着. Ex:随着[along with]/P. ### 3.2 Strings with zhi1 zhi1+LC, where LC is monosyllabic, is treated as one word (LC). - Ex:之外[aside from]/LC,之中[among]/LC. - zhi1+CD is treated as DEG+CD (e.g.,方法[method]/NN 之/DEG 一[one]/CD,方法[method]/NN 之/DEG 三[three]/CD). For simplicity,之一 in a sentence such as 中国是发展中国家之一 is treated as one word and tagged as an NN. zhi1+N is treated as DEG+N (e.g.,少年[Children]/NN 之/DEG 家[Club/Center]/NN). ### 3.3 Strings with bu4 If X in X+不[not] (or 不[not]+X) must co-occur with bu4 or the meaning of X+不[not] is not compositional, we treat X+bu4 as one word. Words that include bu4(不[not]):不到[less than] (不到[less than] 5 分钟[minutes]),不足[less than] (不足[less than] 5 公斤[kilogram]),不便[inconvenient],不久[not before long]. ### 3.4 Strings with shi4 For simplicity, we treat 特别是[particularly]/AD as one word. ### 3.5 Strings with xie1 The following are treated as one word: 这些[these]/PN(or DT), 一些[some]/CD.
### 3.6 Strings with you3 V+有[have] is often a verb; for example,刻有[engraved with]/VV,真有possess]/VV,富有[rich]/VV. mei2you3(没有) is always treated as one word(VV or VE or SP). Many idioms include the word 有[have]; for example,若有所思[as if lost in thought]/W. The following are two words:有[have]/V 所/MSP,仅[only]/AD 有[have]/V,有[have]/V 可能[possibility]/NN. The following are ambiguous without the context: - you3-dian3(有点):V[have]+M or AD[a little bit] It is V+M when 点 can be dropped or replaced by 一点[a little bit]. you3-dian3 is an AD when it can be replaced by other degree adverbs such as ^[very] or when it is followed by a VP. - 他[he]/PN 有点[a little bit]/AD 下不了 [unable to get off]/VV 台[stage]/NN〈He felt embarrassed}. - 这[this]/DT 本/M 书[book]/NN 有[have]/V 点/M 意思[meaning]/NN〈This book is interesting〉. - 这[this]/DT 本/M 书[book]/NN 有[have]/V 点/M 看头[worth reading]/NN〈This book is worth reading). - you3-de5(有的):V[have]+DEC or DT[some] - 他[he]有[have]/V 的/DEC 书[book]我[I]也[also]有[have]〈The books that lie has, I have, too〉. - 有的[some]/DT 人[people]已经[already]走[leave] 了[AS]〈Some people have already left〉. - you3-xie1 (有些):V[have]+M or DT[some]: - 我[I]只[only]有[have]/VV 些[some]/M 旧书[old books]〈I only have some old books.} - 他[he]不[not]像[like]有些[certain]/DT 人[people]专门[especially]爱[like]抬杜[argue]〈沿 like certain people who especially like to argue). - zhi3-you3(只有):AD[only]+V[have] or CS[only if]: - 你[you]只有[only]/CS 学习[learn]才[then]/AD 能[able to]改进[improve]工作[work]〈You can only improve your work by learning). - 他[he]只[only]/AD 有[have]/VV 10 块[M].钱[dollars]〈He only has ten dollars〉. ### 3.7 Strings with zai4 One word:正在[in the process of]/AD. ### 3.8 Strings with zi4ji3 Always treat PN+zi4ji3 (自己[self]) as one word. Ex:他自己/PN. Chapter 4 Common Collocations --------- ### 4.1 As one word - AD:迄今为止[皿til today],迄今[皿til now],进一步[one step further],越来越[more and more],同机[on the same airplane],沿途[on the way],即席[impromptu]. - DT:这些[these]. 
- JJ:对外[foreign] (e.g.,对外[foreign]/JJ 政策[policy]/NN),各界[all circles]/JJ. - LC:之间[between],在内[inside]. - NN:其中[among them], —行[group traveling together]. - P:为了[in order to]. - V:来自[come from],面向[face toward],流入[flow in],迈向[step toward],报以[respond with],为期[scheduled for a specific duration of time],有利于[be beneficial for]. ### 4.2 As two words - AD-like:并[yet]/AD 未[not]/AD. - CC-like:及[and]/CC 其[his/its/her]/PN,而[and]/CC 又[in addition]/AD. - DT-like:各[each]/DT 个/M. - NN-like:超大[extra-large]/JJ 祝模[scale]/NN,我[our]/PN 国[nation]/NN. - NT-like:零点[midnight]/NT 零一分[one]/NT〈one minute past midnight〉. ### 4.3 Other cases V-V:(迎上[step forward]/W 前去[go forward]/VV)/V. Appendix A Comparison with Other Guidelines ---------- In this appendix, we compare our guidelines with the guidelines from PRC [LTS93] and from Rocling [Chi96]. The grouping of words in our system is done in bracketing stage. | | Ours | PRC | Rocling | Example | | --- | --- | --- | --- | --- | | Verb | | | | | | AA | AA | AA | AA | 看看 | | ABAB | ABAB | ABAB | ABAB | 研究研究 | | AABB | AABB | AABB | AABB | 高高兴兴 | | ABB | ABB | ABB | ABB | 绿油油 | | AAB(excl AA-看) | AAB | AAB | AAB | 蒙蒙亮 | | ABAC etc. | ABAC | ABAC | ABAC | 有条有理 | | AA-看 | (AA/V kan/V)/V | AA kan | AA kan | | | A-yi-A | (A/V yi/CD A/V)/V | AyiA | AyiA | 走一走 | | A-l^A | (A/V le/AS A/V)/V | A le A | A le A | 走了走 | | A-le-yi-A | (A/V le/AS yi/CD A/V)/V | A le yi A | A le yi A | 走了一走 | | nonreduced A-not-A | (A/V not/AD A/V)/V | A not A | A not A | 喜欢不喜欢 | | reduced A-not-A | (A/V not/AD A/V)/V | A-not-A | A-not-A | 喜不喜欢 | | V-R(R is monosyl.) | v-r except v/V 完/V | v-r | v-r | 打破 | | V-R(R is bisyl.) | (v/V r/V)/V | v r | v r | 扫千净 | | V-de/bu-R | (v/V de/DER r/v)/V | v de r | v de r | 打得破 | | (V-R exists) | (v/V bu4/AD r/v)/V | v bu r | y bu r | 打不破 | | V-de/bu-R | y-de-r/V | ?? | y-de-r | 来得及 | | (V-R doesn’t exist) | v-bu-r/V | ?? 
| y-bu-r | 来不及 | | V-DIR | (v/V dix/V)/V | v dir | v-dir | 走上来 | | V-x-0 | v/V x/X o/N | v x n | v x n | 吃了饭 | | VO | depends | depends | depends | 关心,吃饭 | | V-de | y/V de/DER | v de5 | v de5 | 走得 | | V-AS | y/V as/AS | v as | v as | 走了 | **Table A. 1: Comparison with PRC’s and Rocling’s Guidelines** | | Ours | PRC | Rocling | Example | | --- | --- | --- | --- | --- | | Nouns | Proper Names (NR) LstNm+Fst Nm | one seg | two segs | one seg | 王鸣 | | IstNm+title | name/NR title/NN | name title | name title | 王市长 | | NR +接尾词 | nr-nn/NR | depends | nr-nn | 北京市 | | NR + common noun | nr/NR nn/NN | nr nn | nr nn | 北京大学 | | complex names | several segs | depends | several segs | 北京第一服装厂 | | Common nouns N+men5 | one seg | one seg | two segs | 学生们 | | VA+N | depends | depends | depends | 小媳妇 | | N+N | depends | depends | depends | 牛肉 | | Temporal nouns name of time | cd-year/NT | cd year | cd-year | 1998年 | | count of time | cd/CD year/NN | cd year | cd year | 3年 | | DP-related CD | one seg | ?? | one seg | 一万三千 | | CD+X+CD | one seg | several | one seg | 三分之一 | | AD + CD | ad/AD + cd/CD | ad cd | ad cd | 约三百 | | CD + X | cd-X/CD | cdX | cd-X | 三百多 | | di4-CD | di 娈 cd/OD | di4 cd | di4-cd | 第一 | | CD+M | cd/CD m/M | cd m | cd m | 这个 | | M + M | m-m/M | m-m | m-m | 片片 | | yi1+M+M | yi1/CD m-m/M | yi1 m-m | yi1-mm | 一片片 | | yi1-M-yi1-M | yi1/CD m/M yi1/CD m/M | ?? | yi1 m yi1 m | —^^~-个 | | Markers V-AS | v/V as/AS | v AS | v AS | 打了 | | V-de | v/V de/DER | v de5 | v de5 | 走得 | | SP | one seg | one seg | one seg | 吗 | | de5(的,地) | one seg | one seg | one seg | 我的,高兴地 | | zhi1(之)+CD/N | two segs | two segs | two segs | 方法之三 | | zhi1(之)+LOC | one seg | ?? | one seg | | | Others 成语(no insertion) | one seg | one seg | one seg | 鼠目寸光 | | ACROM | one seg | one seg | one seg | 北大 | **Table A.2: Comparison with PRC and Rocling^ Guidelines(Ctd)** Appendix B Treebank Part-of-Speech Tagset ---------- The following is the Part-of-Speech Tagset used in our Penn Chinese Treebank. 
| | | | | --- | --- | --- | | AD | adverb | 还 | | AS | aspect marker | 着 | | BA | 把 in ba-construction | 把,将 | | CC | coordinating conjunction | 和 | | CD | cardinal number | 一百 | | CS | subordinating conjunction | 虽然 | | DEC | 的 in a relative-clause | 的 | | DEG | associative 的 | 的 | | DER | 得 in V-de const. and V-de-R | 得 | | DEV | 地 before VP | 地 | | DT | determiner | 这 | | ETC | for words 等,等等 | 等,等等 | | FW | foreign words | ISO | | IJ | interjection | 啊 | | JJ | other noun-modifier | 男,共同 | | LB | 被 in long bei-const | 被,给 | | LC | localizer | 里 | | M | measure word | 个 | | MSP | other particle | 所 | | NN | common noun | 书 | | NR | proper noun | 美国 | | NT | temporal noun | 今天 | | OD | ordinal number | 第一 | | ON | onomatopoeia | 哔哔 | | P | preposition excl. 被 and 把 | 从 | | PN | pronoun | 他 | | PU | punctuation | | | SB | 被 in short bei-const | 被,给 | | SP | sentence-final particle | 吗 | | VA | predicative adjective | 红 | | VC | 是 | 是 | | VE | 有 as the main verb | 有 | | VV | other verb | 走 | **Table B.1: Our POS tagset in alphabetical order** Bibliography ------------ [Chi96] Chinese Knowledge Information Processing Group. Shouwen Jiezi - A study of Chinese Word Boundaries and Segmentation Standard for Information Processing (in Chinese). Technical report, Taipei: Academia Sinica, 1996. [1D92] John Xiang ling Dai. The Head in Wo Pao De Kuai. Journal of Chinese Linguistics, 1992. [LTS93] Y. Liu, Q. Tan, and X. Shen. Segmentation Standard for Modern Chinese Information Processing and Automatic Segmentation Methodology, 1993. [Pac98] Jerome L. Packard, editor. New Approaches to Chinese Word Formation, Mouton de Gruyter, 1998. [SW87] Anna Maria Di Sciullo and Edwin Williams. On the Definition of Word. The MIT Press, 1987. [XPX+00] Fei Xia, Martha Palmer, Nianwen Xue, Mary Ellen Okurowski, John Kovarik, Shizhe Huang, Tony Kroch, and Mitch Marcus. Developing Guidelines and Ensuring Consistency for Chinese Text Annotation. In Proc.
of the 2nd International Conference on Language Resources and Evaluation (LREC-2000), Athens, Greece, 2000. [1](#footnote1) The difference between a JJ and a prefix is that the latter, not the former, is bound. As mentioned before, sometimes, it is difficult to tell whether a morpheme is bound or not, so we keep a list of morphemes that we regard as prefixes. In this case, if the N in X+N can be replaced with an NP, we treat X as a JJ, rather than a prefix. [2](#footnote2) A word is a non-predicate adjective if it can not appear as a predicate after the subject without the help of 是...的. [3](#footnote3) N+LC1+LC2, where LC1 and LC2 denote opposite directions, is treated similarly. [4](#footnote4) In either of the last two examples, the first morpheme is bisyllabic, and it could be tagged as nouns in some context. Because the second morpheme is monosyllabic, the expression should be treated as one word regardless of the POS tag of the first morpheme. [5](#footnote5) [6](#footnote6) The V+N combination is among the hardest cases for the word definition. The tests proposed here are not perfect. They tend to treat idiomatic phrases (similar to "kick the bucket" in English) as words. However, those errors can be easily corrected if later a dictionary becomes available. [7](#footnote7) It has been argued that aspect markers are affixes (e.g., [1D92]). Right now, we do not group the V and the AS together. [8](#footnote8) The function of de in the V-de construction is controversial. It ranges from an affix, a particle, to a verb. We will not get into details here. [9](#footnote9) Many of Xs in this pattern are “coverbs”, and it is highly debated which tag, V or P, X should have in this pattern and whether V+X forms a word by the process such as reanalysis. [10](#footnote10) Note: 50 多分钟 is segmented as 50 多[50\_odd]/CD 分钟/M. [11](#footnote11) In the literature (e.g., [1D92]), it has been argued that some of the particles such as 得,了 are affixes.
For the sake of compatibility with other guidelines and also because it is very easy to automatically group these particles with preceding words, we separate the particles from the preceding words. ================================================ FILE: docs/annotations/tok/index.md ================================================ # Tokenization ## Chinese ```{toctree} ctb msr ``` ================================================ FILE: docs/annotations/tok/msr.md ================================================ # MSR中文文本标注规范 (5.0 版) [**Tokenization Guidelines of Chinese Text (V5.0)**](http://sighan.cs.uchicago.edu/bakeoff2006/MSRAsamp/msra-spec.pdf) 黄昌宁 李玉梅 朱晓丹 Chang-Ning Huang, Yumei Li, and Xiaodan Zhu 微软亚洲研究院 Microsoft Research Asia 2006 年 3 月 27 日 March 27, 2006 微软《中文文本标注规范(5.0 版)》 ## 第一章 概述 ### 1.1 版本说明 微软亚洲研究院《命名实体标注规范》3.0版是为30万词《人民日报》语料的命名实体(NE)标注任务制定的。其英文版‘Guideline on Chinese Named Entity Annotation’成稿于2003年2月,用于LSP(Lexical Service Platform)课题。当时在研究院,命名实体识别(Name Entity Recognition)和自动分词(Word Segmentation)是文本处理中互相独立的两个过程,所以未曾深入考虑分词词表(lexicon)对命名实体标注带来的影响。2005年3月至7月在准备第二届国际自动分词评测(SIGHANBakeoff2005)的237万词训练语料的过程中修订了该规范,形成4.0版。《命名实体标注规范》4.0版的一个最大特点是把命名实体识别有机地融入到中文自动分词的整体过程中去。因此,除了命名实体自身的定义以外,还需要系统地阐明词表词和各类实体之间的复杂关系。本规范是在微软亚洲研究院《命名实体标注规范》4.0版的基础上编制的。由于规范实际上涵盖了文本中词语和各类实体的标注规则与实例,所以更名为《中文文本标注规范》(Tokenization Guidelines of Chinese Text) 5.0版。 ### 1.2导读 规范的第一章(概述)、第二章(专有名词标注总则)、第六章(数字串标注总则)以及第九章(分词歧义消解细则)是每个标注人员必读的材料。其它章节收集了大量的实体标注规则与实例,用以补充各类实体定义的不足。凭借这些具有上下文信息的词例化实例可以进一步提高文本标注的精度和一致性,所以它们是供标注人员经常查阅的参考资料。诚恳欢迎读者对本规范和带标语料中的错误提出宝贵意见,以便及时更正。批评和意见请寄[黄昌宁](mailto:cnhuang@msrchina.research.microsoft.com)。 ### 1.3标注格式 format-1是面向标注人员的格式: /十月九日/上午/ ->/[dat十月九日]/[tim上午]/ format-2是基于XML的标注格式: /十月九日/上午/ -> `十月九日 上午` *TIMEX* 是时间表达式,日期 *DATE* 和时间 *TIME* 是它的两个子类。 考虑到本规范主要是为标注人员编写的,以后的例子主要以第一种格式(format-1)表示。想了解更多 XML 格式的读者,请参见 MET-2 Guideline1。 1MET(MultipleEntityTask)是1997年第七届MUC(Meassage Understading 
Conference)会议多实体识别任务的简称。MET-2是当年美国NIST公布的命名实体标注规范。可查阅:http://www.itl.nist.gov/iaui/894.02/related_projects/muc/proceedings/ne_task.html ### 1.4命名实体标记集 表1-1是本规范定义的全部命名实体标记,包括专有名词(*NAMEX*)、时间表达式(*TIMEX*)、数字表达式(*NUMEX*)、度量表达式(*MEASUREX*)和地址表达式(*ADDREX*)等类五大类及其下属的三十个子类。 | 大类 | 子类 | Format-1标注集 | Format-2标注集 | | --- | --- | --- | --- | | NAMEX | Person | P | PERSON | | Location | L | LOCATION | | Organization | O | ORGANIZATION | | TIMEX | Date | dat | DATE | | Duration | dur | DURATION | | Time | tim | TIME | | NUMEX | Percent | per | PERCENT | | Money | mon | MONEY | | Frequency | fre | FREQUENCY | | Integer | int | INTEGER | | Fraction | fra | FRACTION | | Decimal | dec | DECIMAL | | Ordinal | ord | ORDINAL | | Rate | rat | RATE | | MEASUREX | Age | age | AGE | | Weight | wei | WEIGHT | | Length | len | LENGTH | | Temperature | tem | TEMPERATURE | | Angle | ang | ANGLE | | Area | are | AREA | | Capacity | cap | CAPACITY | | Speed | spe | SPEED | | Acceleration | acc | ACCELERATION | | Othermeasures | mea | MEASURE | | ADDREX | Email | ema | EMAIL | | Phone | pho | PHONE | | Fax | fax | FAX | | Telex | tel | TELEX | | WWW | www | WWW | | Postalcode | pos | POSTALCODE | **表1-1命名实体的标记集** ### 1.5基本原则 #### 1.5.1基本考虑 通用性:尽量遵循国际标准MET-2和ER-992,不同之处在本规范中阐明。 实用性:可用于LSP (Lexical Service Platform), TTS (Text To Speech conversion), IR (Information Retrieval), IE (Information Extraction), QA (Question Answering), IME(Input Method Editor)等应用系统。 #### 1.5.2标注对象 ##### 1.5.2.1词表词与未登录词 本规范认为:文本中的任何一个词要么是词表词(LW),要么是未登录词(OOV)。两者都是文本的标注对象。未登录词可以进一步分成命名实体(NE)、词法派生词(MDW)和新词(NW)等三部分。本规范定义的命名实体是未登录词的主体。 (1)命名实体(NE) 命名实体可以进一步分成如下五大类共三十个子类(详见表1-1): - 专有名词(*NAMEX*)包括人名(*P*)、地名(*L*)和机构名(*O*)等3种。 - 时间表达式(*TIMEX*)包括日期(*dat*)、时间(*tim*)和时段(*dur*)等3种。 - 数字表达式(*NUMEX*)包括百分数(*per*)、钱款(*mon*)、频度(*fre*)、整数(*int*)、分数(*fra*)、小数(*dec*)、序数(*ord*)和比率(*rat*)等8种。 - 度量表达式(*MEASUREX*)包括年龄(*age*)、温度(*tem*)、角度(*ang*)、长度(*len*)、 - 面积(*are*)、容积(*cap*)、重量(*wei*)、速度(*spe*)、加速度(acc)和其它(*mea*)等10种。 - 
地址表达式(*ADDREX*)包括电子邮箱(*ema*)、电话(*pho*)、传真(*fax*)、电报挂号(*tel*)、邮政编码(*pos*)和网址(*www*)等6种。 在标注过的文本中,词的边界一律用斜线(slash)表示。除了词表词以外,每个独立的命名实体(即非嵌入到词表词内部的实体,见1.5.2.3)也被视为一个词,其标注符号及形式详见本规范。 (2)词法派生词(MDW) 以词表词AB的重迭形式AABB和AB/AB为例: /*转轨*/*哪*/*有*/*像*/*人*/*说*/*得*/*那般*/*轻轻松松*/? /*积累*/*多*/*了*/*,*/*抽出*/*时间*/*,*/*认真*/*整理*/*整理*/, (3)新词(NW) 一个新词的左右两侧用符号&标示,其内部的切分符保留3,如: /&*桑拿*&/*浴*/ /*天时地利*/&*人*/*和*&/*;*/ /[L*罗*]/*货币*/&*列*/*伊*&/ 以下是一些真实的例句,例句中的实体标注符号请参阅表1-1。 [Example-1] ``` /[dat 6月29日]/、/[dat 30日]/[tim 晚上]/,/[L 北京市]/下/了/[int 两场]/大雨/,/笔者/ 居住/的/宿舍/楼/前/,/宽/[len 六七米]/、/长/[len 30多米]/的/路/上/积水/达/膝盖/之上/。 6月29日 30日 晚 上 北京市 两 场 大 雨 笔者居住宿舍 六七米 30多米 积水膝盖之上 ``` [Example-2] ``` /[dat 6月中下旬]/,/笔者/到/[L 意大利]/、/ [L 西班牙]/等/国/访问/时/,/一个/很/深/的/感受 /是/[L 意]/、/[L 西]/两国/的/高速公路/非常/发达/,/东西南北/,/纵横/成/网/,/.四通八达/。 6月中下旬笔者意大利西班牙访问 一 个 感 受 西两国高速公路非 常发达东西南北纵横四通八达 ``` [Example-3] ``` /[O 县委]/决定/选派/任/了/[dur 八年]/[O 城建局]/长/的/[P 周欣光]/担任/[O 老干部局]/长/。 县 委 决 定 选 派 八 年 城建局 周欣光 担 任 老干部局 ``` [Example-4] ``` /[L喇嘛寺村]/地处/[L承德避暑山庄]/,/[L山庄]/寺庙/林立/,/僧侣/穿梭/,/[L山庄]/[L外八庙]/的/[ord第一个]/庙/就/是/[L喇嘛寺]/。 喇嘛寺村地处 承德避暑山庄 >山庄寺庙林立僧侣 穿 梭 > 山 庄 外八庙 第一个喇嘛寺 ``` ##### 1.5.2.2*L*, *P*,*O*, *dat*,*tim*,*dur*等实体的边界允许跨越多个词表词 例如: /[L*瑞典*]/[O*斯德哥尔摩国际和平研究所*]/ /[O*中国工商银行上海市分行*]/ /[tim*下午当地时间*5*时*59*分*]/ 1.5.2.3专名的标记(L,P,O)可以插入到一个词表词的内部 例如,词表词抗日战争*和*事后诸葛亮*中的地名和人名应分别予以标注。 /*抗*[L*日*]*战争*/----正确标注。 /*抗日战争*/*----未标出*L,是错误标注。 /*抗*/[L*日*]/*战争*/ ----插入分词标记,是错误标注。 /*事后*[P*诸葛亮*]/ ##### 1.5.2.4数字串(除专名以外的其他四类表达式)的标记不得插入到词表词的内部 ###### 1.5.2.4.1dat,tim等标记不得插入到一个词表词的内部 词表词*夏令营、*春耕、*冬训*、*早出晚归*中的*夏、春、冬、早、晚*等词素都有*dat*和*tim*的意思,但不得标注。例如, /[dat*冬*]*训*/ ---错误标注。 /[tim*早*]*出*[dat*晚*]*归* / ---错误标注。 然而词表词被整体标注为*dat*和*tim*的情况是常有的,例如: /[dat*初冬*]/ ----*初冬*是词表词。 /*[dat*夏季*]/*----*夏季*是词表词。 /告别/*了*/[dat*冬日*]/*的*/*凝重*/*、*/[dat*春天*]/*的*/*轻盈*/*、*/[dat*夏日*]/*的*/*浪漫*/, - 注:在文本中具有比喻意义的*春、夏、秋、冬*、历史上的*今天、昨天、明天*不作标注。例如: /[dat*今年*]/*又*/*迎来*/*了*/*一个*/*科学*/*的*/*春天*/ /"/*在*/*陆地*/*资源*/*日渐*/*减少*/*的*/*今天*/*,*/ /*他们*/*的*/*明天*/*将*/*更加*/*辉煌*/*。*/ ###### 1.5.2.4.2int,ord等标记不得插入到到一个词表词的内部 
词表词*五湖四海*、*不管三七二十一*、*三纲五常中的数词不允许标注*int(整数)。例如, /*[int*五*]*湖*[int*四*]*海*/*----错误标注。 /*十年动乱*/*结束*/*不久*/*,*/ ----*十年动乱*是词表词。*十年*不标。 /*不管三七二十一*/ /*三纲五常*/ ##### 1.5.2.5数词首、半、双、两等 ###### 1.5.2.5.1序数词素首 词表中有许多词含有词素*首*,如*首创、首倡、首选、首发、首航、首飞、首演、首映、首战、首展、首席代表、首席科学家、首席执行官、首富、榜首、魁首、居首*等。但不可把词表词中的词素*首*单独作为*ord*(序数)来标注。 /*首席执行官*/----正确标注。 /*[ord首席]执行官*/----在词表词中插标*ord*是错误的。 以下的词表词属于"首+量词"结构,可以整体作为*ord*标注。例如: *[ord*首届*]*,*[ord*首次*]*,*[ord*首批*]*,*[ord*首位*]*,*[ord*首例*]* ###### 1.5.2.5.2分数词素半 词表中有许多词含有词素*半**如半价、半票、半饱、半身、半世、半辈子、上半时、下半场、半边*等,但不可把上述词表词中的词素*半*标注为*fra*(分数)。 /*下半场*/*比赛*/[O*中国队*]/*未进*/*一*/*球*/ /*上半时*/ /*下*[fra*半*]*场*/----在词表词中插标*fra*是错误的。 以下的词表词可作为不同的数字串(*dur*,*tim*,*fra*,*int*,*age*)标注: *[dur*半年*]*,*[dur*半天*]*,*[tim|dur*半夜*]*,*[fra*半个*]*,*[int|age*半百*]* - 注:半个西瓜中的半个,与四半中的半概念不一样,前一个半是指二分之一, 后一个半是量词,所以标注也不同!!/*[int*一个*]*/*西瓜*/*分为*/[int*四半*]/ /[fra*半个*]/*西瓜*/ ###### 1.5.2.5.3整数词素双 当数词双成为词表词的一个词素时,如"双方、双边、双手、双打、双杠、双轨、双层、双目、双亲"等,一律不作为整数(*int*)标注。对于非词表词,只标[*int双*]。例如: /*窗外*/*又*/*起风*/*了*/*,*/*双层*/*的*/*窗*/*硬是*/*阻挡*/*不住*/*沙尘*/*的*/*侵扰*/*。*/ /*双方*/*认为*/*,*/[L*中*][L*美*]/*两国*/*应该*/*从*/*战略*/*的*/*高度*/*和*/*长远*/*的*/*角度*/ - 注:一般情况下,数词和"方"之间不切分整体标为*int*。但"四方"是词表词所以不标。 [*int三方*]/*会谈*/ /*举行*/*四方*/*会谈*/ 以下是相关的例子: /*用*/*任何*/*一*/*部*/[*int双*]/*音频*/*电话*/*只需*/*拨打*/[pho*2580*]/*就*/*可以*/ /*部队*/*进行*/*的*/*海上*/*训练*/*、*/[int*双*]/*机*/*穿云*/*、*/*超低空*/*等*/*高难*/*课目*/*训练*/ /*全村*/[are*700亩*]/*旱地*/*都*/*种上*/*了*/[int*双*]/*膜*/*棉*/*,*/ ###### 1.5.2.5.4整数词素两 当数词"两"成为词表词的一个词素时,如"两国、两会、两地、两者、两头、 两手、两边、两旁、两侧"等,一律不作为数位串(*int*)标注。例如: /*使*/*两国*/*的*/*友好*/*合作*/*得到*/*巩固*/*和*/*发展*/。 - 注:一般情况下,数词和"国"之间是要切分的,如:[*int五*]/*国*/*元首*/ /*前*/*些*/*年*/*我*/*对*/*参加*/*『*/*两会*/*』*/*总是*/*有点*/*发怵*/*。*/ /*大街*/*两旁*/*店铺*/*林立*/ /*戏台*/*两侧*/*立柱*/*上*/*有*/*这样*/*一*/*副*/*对联*/*:*/ /*中间*/[int*两间*]/*是*/*客厅*/*,*/*两边*/*是*/*卧室*/*和*/*书房*/*。*/ /*对*/*分散*/*居住*/*的*/*"*/*五保*/*"*/*户*/*,*/*镇*/*、*/*村*/[int*两*]/*级*/*拨*/*专款*/ /[int*两*]/*车*/*饮料*/*以及*/*办公*/*桌椅*/*,*/ - 注:临时量词"车、船、床、桌、屋子、院子"等不进入int标注。/*成为*/[O*议会*]/[int*两*]/*院*/*审议*/*的*/*重点*/*和*/*舆论*/*关注*/*的*/*焦点*/ - 
注:两院不是词表词,所以应当切分标注如上。 /*及早*/*进行*/*政治*/*谈判*/*推动*/[L*两岸*]/*关系*/*发展*/ /*沿江*/[int*两*]/*岸*/*苗家*/*吊脚楼*/*上*/*的*/*观众*/ - 注:词表词两岸是专指台湾海峡两岸的地名。如果泛指江河两岸,则不作为 地名标注,而且要切分并标数词"两"为*int*。 /*一下*/*进*/*了*/[int*两*]/*球*/ #### 1.5.3基本规定 1)标注时,不得在原来的文本中加入回车换行符。 2)对于NIST制定的两个中文NE标准:MET-2和ER-99。前者已有系统参加评测,它们的评测结果可供后来者参考;后者是前者的修订版,但尚未有系统参加测试。本规范与这两种标准不同之处将尽可能在注释中加以说明。例如: /[dat*去年上半年*]/ - 注:MET-2把去年上半年*整体视为*dat*;ER-99则只将上半年*视为*dat*。 3)对于微软研究院根据自己的需要而加入的标记,本规范也将在注释中加以说明。比如本规范要求的如下标注: /[P*邓小平*]/*理论*/ - 注:MET-2和ER-99规定,*理论,主义,思想,定律*等词前面的人名均不作为专名标注(见2.8)。 ## 第二章 专有名词标注总则 ### 2.1专有名词(NAMEX)标注通则 对于人名、地名和机构名这三类专有名词,MET-2和ER-99之间的差异甚微,在它们给出的示例中,只有两处不同:中南美*和长江流域*(具体情况见后)。所以在制订人名、地名、机构名的标注规范时,我们没有刻意去区分这两个标准,而是力图把它们统一地融入本规范。 下面给出人名、地名、机构名的定义。 ### 2.2专有名词是具体的、特定的,而不是抽象的、泛指的 比如:*上苍、老外、姑娘,小镇,企业*等就不应视为专有名词。 ### 2.3复合专有名词的标注不允许嵌套 在MET-2和ER-99标准中,任何命名实体都不允许嵌套。换句话讲,只标一个实体的最长边界,不标其内部包含的其它实体。 ### 2.4人名、地名、机构名中的数字串不单独标出 例如: /[P*龟山一郎*]/ /[L*德富路二四一至二六三号*]/ /[O*北京*101*中学*]/ /[O*北京*[ord*四*]*中*]/ ----这种嵌套式的的标注是错误的! ### 2.5含有外文和数字的命名实体应整体一起标注 例如: /[O*American航空公司*]/ /[O*SONY公司*]/ ### 2.6当两个实体用虚词的连接时应分别标注为两个实体 例如: /[L*美国*]/*的*/[L*纽约*]/ /[L*美国*]/*的*/[P*理查德本森*]/ 但当*的*成为实体的一部分时,要整体一起标注。例如: /[O*美的电器集团*]/ ### 2.7实体前后有引号或书名号的情况 如果一个命名实体中间有引号或书名号,则引号或书名号是该实体的一部分。如果一个实体被外面的引号或书名号括起来,那么其引号或书名号就不作为实体的一部分标注。例如: /[O"*阿克布拉克*"*中哈合资企业*]/ /[O*美国《幸福》杂志*]/ /*《*/[O*星岛日报*]/*》*/*的*/*社论*/*说*/ ### 2.8短语内部包含实体、但整体又不是命名实体的情况 ER-99规定:如果一个短语内部包含实体、但整体又不是命名实体,则一律不作标注。本规范则要求对该短语中的实体部分加以标注。例如: /[L*巴拿马运河*]/*条约*/ - 注:ER-99认为,巴拿马运河条约*整体不能分解,其中的地名不应标注。但本规范把其中的巴拿马运河*单独标为地名。 /[L*巴拿马运河*]/*----巴拿马运河*单独出现时,作为地名标注。 /[L*香港*]/*脚*/ - 注:英文为"HongKongfoot",类似于"athletesfoot",不可分解,所以ER-99规定整体不标。本规范,仍将*香港*标为地名。 /[L*美国*]/*小姐*/ - 注:原文为"MissAmerica",指选美活动中获全美第一名的小姐。对此ER-99规定整体不标。本规范,仍将*美国*标为地名。 /[L*美国*]/*姑娘*/*----ER-99对本例的美国*也是标注的。 /[ord*第四十六届*]/[O*太平洋亚洲旅行协会*]/*年会*/ - 注:此例在ER-99中整体不标,理由是不可分解。本规范认为找不出充分理由说明其不可分解。所以我们把太平洋亚洲旅行协会*标为机构名。第四十六届太平洋亚洲旅行协会年会*整体不是机构名。 /[P*毛泽东*]*思想*/ /[P*马克思*]*主义*/ /[P*马克思*]/*主义*/ ----*错误标注!因为*马克思主义*是词表词。 /[P*阿佛加罗*]/*定律*/ - 
注:ER-99规定,在理论、主义、思想、定律*等词前面出现人名时,是整体不可分解的字符串;因此该字符串和其中的人名都不标注。但本规范仍将标注其中的人名。 ### 2.9与军队相关的情况 当泛指某个国家的军队(如英军、美军*等)时,不是机构名;当指一个具体的军种(如空军、陆军、海军*等)时,要标注为机构名。例如: /[L*美*]/*军*/*飞机*/ /[O*斯里兰卡空军*]/ /[O*英国皇家空军*]/ 但是,有如下特殊情况: *[L*济南军区*]/ ----*军区是*L*而不是*O*。 /[L*彼得森空军基地*]/ -----军事基地是L而不是O。 /[L*西非*]/&*维*/*和*&/*部队*/ ------部队不作为机构名标准。 ### 2.10多媒体、产品和条约中的人名、地名、机构名 ER-99规定:当人名、地名、机构名属于多媒体、产品和条约时,均不加标注。但本规范对上述实体名还是要标注的。例如: /[P*邓小平*]/*一*/*片*/*的*/*播出*/ - 注:ER-99规定,电视节目的名字邓小平*不标。本规范仍把它标为人名。此外,邓小平*作为片名,在规范的文本中应当用书名号括出,如《邓小平》。 /*二战*/ ----*二战*是事件,所以不标注。 /[L*香港*]/*百*/*题*/*今天*/*为*/*您*/*解答*/ - 注:ER-99规定,香港百题是电视片的标题,所以专名香港不予标注。但本规范仍把香港标为地名。下面其它的例子就不一一解释了。 /*这*/*本*/*介绍*/[P*毛泽东*]/*的*/*小说*/ ----*毛泽东*要标注。 /*这*/*本*/*名*/*为*/[P*毛泽东*]/*的*/*小说*/ ----ER-99*毛泽东*不标。 /[L*广州*]/*条约*/ ----ER-99*广州*不标。 /[L*辽*][L*沈*]*战役*/ ----ER-99*辽沈*不标。 本规范在后面还要对人名、地名、机构名中不加标注的情况作专门的说明,详见下面的各章节标注细则。 ### 2.11别名或简称的标注 对人名、地名、机构名的别名或简称要标注。例如: /[O*IBM*]/ /[L*深*]/[L*沪*]/*股市*/ /[O*北约*]/ /[L*中*][L*美*]/*首脑*/*互访*/ /[L*中*]/[L*文*]/*双方*/*一致*/*认为*/ - 注:由于中美是词表词,标注地名时不可插入分词标记。中文也是词表词,但这里是指中国和文莱,所以标成地名时需要在两个简称中插入分词符号。这样的词表词还有中意、意中、中巴、日中、中肯、中非等。巴中是一个地名,但表示巴基斯坦和中国时需要用分词符号把两个简称分隔开。 - 注:对于简称中嵌套的人名、地名、机构名不予标注,如: /[O*中共*]/ ----*中*指*中国*,但不标。 /[O*中共中央政治局*]/ - ---同理,不标注*中*。 ## 第三章人名 人名一般包含姓和名两部分,姓是表明家族的字,有单姓和复姓之别;名也就是名字,是一种称号,由一个或几个字组成,跟姓合在一起,用来代表一个人,以区别于别的人。下面将对人名的标注规则进行详解。 ### 3.1人名标注规则 正常情况下,人名一般包含姓和名两部分,标注规则如下表所示: | **序号** | **情况** | **标记方法** |例子 | | --- | --- | --- | --- | | 1 | 只含姓,没有名 | 标出姓氏部分 | *[P*庄*]*、*[P*欧阳*]*、*[P*司马*]* | | 2 | 只包含名字 | 标出名字部分 | *[P*育焜*]* | | 3 | 姓名 | 姓名整体标出 | *[P*苏宗哲*]*、*[P*萝莉胡吉温*]* | | 4 | 姓名|姓|名+称谓称谓+姓名| | 5 | 前缀+姓名|姓|名姓名| | 6 | 姓名+姓名 | 分开来标 | *[P*李向东*]/[P*李向阳*]* | | 7 | 外国人名 | 作为一个整体来标 | *[P*罗马里奥*]*[P*马拉多纳*]*[P*比尔*•*盖茨*]* | - 说明:当人名中包含•时,整体标注为人名,如[P*比尔•盖茨*]。 ### 3.2人名标注细则 #### 3.2.1人名的示例和详细说明 #### 3.2.1.1人名实例 /[P*颜惠忠*]/ /[P*连战*]/ /[P*凡*•*高*]/ /[P*陈方安生*]/ ---当妻子与丈夫的名字写在一起时,要作为一个人名整体标注为P! 
#### 3.2.1.2称谓、绰号、官职不作为人名的一部分 称谓、绰号、官职(如先生、总理等)不作为人名的一部分。例如, /[P*张*]/*经理*/ /[P*李*]/*市长*/ /[P*陈*]/*姓*/*游客*/*说*/ /[P*刘*]/[ord*二*]/*嫂*/ /[P*周*]/*总理*/ /[P*雷锋*]/*同志*/ /[P*奥尔布赖特*]/*国务卿*/ #### 3.2.1.3当称谓和姓名不可分时应整体标注为人名 /[P*李主席登辉*]/*先生*/ /*处*/[P*李犯清龙*]/*死刑*/*,*/ /[P*李犯*×*龙*]/*持*/*刀*/*行凶*/*杀害*/*无辜*/*青年*/*,*/ #### 3.2.1.4几世、几代要作为人名的一部分 /[P*十四世达赖丹增加措*]/ /[L*英国*]/*女王*/[P*伊丽莎白二世*]/ #### 3.2.1.5家族实体 /[P*蒋*]/*氏*/*父子*/ /[P*西迪*]/*兄弟*/ #### 3.2.1.6圣人和宗教人物要标注为人名 /[P*释迦穆尼*]/ /[P*达赖*]/*喇嘛*/ ### 3.3虚构的人物、动物的名字要标注为人名 #### 3.3.1在童话、小说中虚构人物要标注为人名 /[P*孙悟空*]/ /[P*玉皇大帝*]/ #### 3.3.2虚构的动物和非人的人物要标注为人名 /[P*唐老鸭*]/ /[P*花仙子*]/ /"/[P*盼盼*]/"/*是*/*国内外*/*著名*/*的*/*熊猫*/*明星*/*,*/ /*争相*/*目睹*/*狮*/*王*/[P*木法沙*]/*和*/*王后*/[P*色拉碧*]*产下*/*的*/*小*/*王子*/[P*辛巴*]/ /*走进*/*一家*/*饭馆*/*,*/*发现*/*老板*/*就*/*是*/*大*/*灰*/*狼*/[P*罗克*]/*。*/ #### 3.3.3用称谓或朝代等名号来指称特定人时要标注为人名 例如: /[P*康熙*]/ /[P*乾隆*]/ /[P*秦始皇*]/ /[P*老子*]/ /[P*孔子*]/ ### 3.4不标注为人名的各种情况 #### 3.4.1虚构的非人的植物的名字不作为人名标注 如: /"/*彩霞*/*,*/"/*石子*/*小声*/*嘟哝*/*着*/*,*/"/*多*/*恶心*/*的*/*名字*/*!*/"/ /*电磨*/*姐姐*/*故意*/*气*/*气*/*小*/*毛驴*/*,*/*说*/*:*/"/*输*/*了*/*,*/*可*/*不能*/*哭鼻子*/*。*/"/ /"/*卡车*/*哥哥*/*,*/*我*/*和*/*你*/*来*/*比*/*一*/*比*/*谁*/*运*/*得*/*多*/*,*/*怎么样*/*?*/"/ /*好像*/*在*/*说*/*:*/"/*荷花*/*姐姐*/*,*/*你*/*好*/*!*/ #### 3.4.2对于嵌套在地名和机构名中的人名,不作标注 如: /[L*嘉诚广场*]/ /[O*中山大学*]/ /[O*宋庆龄基金会*]/ #### 3.4.3作为书名或画名的人名 作为书名或画名的人名ER-99不标(见2.8),但本规范是要作标注的。如: /*世界*/*名画*/*《*/[P*蒙娜莉萨*]/*》*/ */《/*[P*蒋介石*]/*与*/[P*毛泽东*]/*》*/ 3.4.4法律、法庭事件、天气形成、疾病和奖金等五种情况 当人名后面紧跟法律名、法庭事件、天气形成、疾病、奖金这五种情况时,人名不标注。 例如: /*里*/*氏*/[ord*六点二级*]/ -----*里*不标。 /*专家*/*呼吁*/*人们*/*要*/*注意*/*沙*/*氏*/*杆菌*/ -----*沙*不标。 /[P*诺贝尔*]*奖*/ -----ER-99*诺贝尔*不标。 #### 3.4.5在人名后面出现基金会时要整体标注为机构名 /[O*李嘉诚基金会*]/ 所以基金会*和奖、奖金*是不同的两种情况,需加以区别。又如 /[O*李嘉诚股份有限公司*]/ /[O*诺贝尔股份有限公司*]/ ## 第四章 地名 地名包括洲、海洋、国家、省、市、县、地区、街道、乡、镇、村、机场、军事基地、军区、铁路、公路、桥梁、海峡、海湾、港湾、河流、湖、公园、草原、煤矿、牧场、养殖场、音乐厅、剧院、教堂、寺庙、图书馆、博物馆、美术馆、展览中心、公园、动物园、植物园、火车站、广场、大厦、大楼、体育场(馆)、游泳馆(池)、赛车场、商城、超市、书店(城)等城市公共设施,还包括某些特定的城市建筑和虚构的处所。详见下表。 ### 4.1地名标注规则 | **序号** | **情况** | 标记方法 |例子 | | --- | --- | --- | --- | | 1 | 只是单独地名 | 标出地名部分 | 
*[L*中国*]*[L*竹塘乡*] | | 2 | 地名+地理(行政)单位 | 作为整体标出 | *[L*北京市*]*[L*台北县*]*地理单位如:省、地区、市、县、乡、镇、村、店、庙、沟、屯、坟、崖、海洋、河、川、江、峡谷、海湾、港湾、丘陵、湖、半岛、三角洲、区、街、路、街、街道、社区、小区、公寓、音乐厅、剧院、图书馆、博物馆、美术馆、展览馆、公园、动(植)物园、火车站、广场、大厦、大楼、体育场(馆)、游泳馆(池)、赛车场、商城、超市、书店(城)等城市公共设施及象征性建筑物、军事基地、军区等。*[L*天安门广场*]*[L*艾菲尔铁塔*]* | | 3 | 包含上、下位的地名(即合成地名)以及并列的地名 | 一律分别单独标出 | *[L*山东省*]/[L*青岛市*]/[L*胜利广场*]*[L*青岛市*]/[L*孙中山广场*]*[L*北京市*]/[L*海淀区*]/[L*知春路*]/[L*希格玛大厦*]*[L*北京*]/*、*/[L*天津*]/*、*/[L*上海*] | | 4 | 地名简称 | 单独标出 | *[L*鲁*]/*、*/[L*冀*]/*、*/[L*京*]* | | 5 | 并列的简称 | 单独标出 | [L*中*]/[L*俄*]/*两国*/*领导人*/*进行*/*了*/*会晤*[L*港*][L*澳*][L*台*]/地区 | | 6 | 地名包含人名以及地名包含地名的情况 | 地名中的人名、地名不标 | *[L*李嘉诚广场*]*[L*南京路*] | | 7 | 地名+地名关键词表达一个完整的概念时 | 相对完整的地名 | *[L*南非共和国*]*[L*宁夏回族自治区*]*[L*香港特别行政区*]* | ### 4.2地名标注细则 #### 4.2.1地名实体示例 /[L*北京*]/ /[L*亚洲*]/ /[dat*2008年*]/[L*奥*]*运会*/*,*/[L*中国*]*人*/ /[L*中国*]*人民*/ ----*中国人、中国人民*都是词表词。 /[L*朝鲜*]/*南北*/*对话*/ ----*不标注南*,北。 - 注:词表词"京剧、京白、京腔、京味儿"中的"京"字要标注为: /[L*京*]*剧*/*、*/[L*京*]*白、*/[L*京*]*腔*/*、*/[L*京*]*味儿*/ /[L*台东火车站*]/ /[L*卑南文化公园*]/ /[L*基隆文化中心广场*]/ /[L*高雄港第一港口*]/ /[L*苏澳镇*]/[L*南方澳渔港*]/ /*环*/[L*渤海湾*]/*地区*/*的*/*天然气*/*市场*/ /*来自*/[L*沈阳军区*]/*各*/*集团军*/ /[L*梅狮路后段*]/ /[L*中横公路天祥段*]/ /[L*华禄溪*]/*及*/[L*碧绿隧道*]/ /[L*南二高*]/[L*高雄支线*]/ /[L*台廿一线*]/ /[L*美国空军基地*]/ /[L*上海*]/[L*国际航运大厦*]/ /[L*上海*]/[L*虹口足球场*]/ /[L*上海博物馆*]/ /[L*上海*]/[L*城市规划展示馆*]/ /[L*石家庄*]/[L*富强电力新村*]/ /[L*西安第二长途通讯大楼*]/ /[L*北京市*]/[L*王府井百货大楼*]/ /[L*广深铁路*]/*以及*/[O*深圳发展银行*]/*部分*/*高官*/*也*/*被*/*免职*/ /[L*汉江*]/*上*/*的*/[L*圣水大桥*]/ /[L*新亚欧大陆桥*]/ ---从世界知识知道此处大陆桥的名字叫*新亚欧大陆桥*,是不可分解的。 #### 4.2.2地名指示词(如国、省、市等)视为地名的一部分一起标注 地名指示词(如国、省、市等)视为地名的一部分一起标注。复杂的、具有包含关系的地名要分开标注,但分开标注时不可把一个有完整意义的地名拆散。以下是正确的标注: /[L*德国联邦*]/*政府*/*总理*/ /[L*基隆市*]/ /[L*台东县*]/ /[L*南山部落*]/ /[L*美国*]/ [L*马里兰州*]/ /[L*约旦河*]/ /[L*朝鲜半岛*]/ /[L*长江三角洲*]/ -----*长江三角洲*是词表词。 /[L*吉林省*]/[L*延边朝鲜族自治州*]/[L*图们市*]/ 以下两例均为错误的标注,因为*延边朝鲜族自治州*是具有完整意义的地名: /[L*吉林省*]/[L*延边*]/[L*朝鲜族自治州*]/[L*图们市*]/ /[L*吉林省延边朝鲜族自治州*]/[L*图们市*]/ - 注:在ER-99的标准测试集中,把中国西昌卫星发射基地*整体标为地名。我们认为这是错误的,因为在一个地名中不应当包含具有上、下位关系的另一地名。正确的标注是: /[L*中国*]/[L*西昌卫星发射基地*]/ 
/[L*美国洛克希德·马丁卫星测控中心*]/*和*/[L*中卫公司测控站*]/ /*从*/[L*法*]/*属*/[L*圭亚那*]/[L*库鲁航天中心*]/*发射*/ - 注:本规范不采用ER-99的标注:*[L*法属圭亚那库鲁航天中心*]*。 /[L*武汉*]/[L*长江大桥*]/ /[L*上海*]/[L*中山公园*]/ - 注:尽管其它城市也有长江大桥和中山公园,但在当地它们已构成完整的地名,所以应单独标注。 /*位于*/[L*朝阳门*]/*外*/*商务*/*区*/*之中*/*,*/ /[L*盛华公寓*]/*坐落*/*于*/[L*西直门*]/*内*/[L*冠英园小区*]/ - 注:内、外都不在标注范围之内,但如果地名中的内、外去掉不能说明是一个完整的地名时,内、外要标注在地名内。如: /[O*外交部*]/*位于*/[L*北京市*]/[L*朝阳门内南小街*52*号*]/ /[L*西直门外大街*71*号*]/ 4.2.3并列的地名应分别标注 对于并列的多个地名应分别标注。对于嵌套在地名中的人名、地名和机构名不再单独标注。例如: /[L*中*]/[L*意*]/*双方*/ ----*中意*是词表词,作为国名时要切开。 /[L*香港*]/*和*/[L*澳门特别行政区*]/ /*目前*/*已*/*有*/[int*12个*]/[L*中*]/*、*/[L*东欧*]/*国家*/ /[L*北京*]/[L*上海*]/ /[L*科*]/[L*伊*]/*边境*/ #### 4.2.4跨国家的和国家内部的地名 /[L*西非*]/*国家领导人*/ /*从*/[L*陕*]/[L*甘*]/*革命*/*老区*/*到*/*沿海*/*经济特区*/*,*/ /[L*亚太*]/----亚太是词表词,它是一个地名,而不是两个地名。 /[L*近东*]/*和*/[L*北非*]/ ##### 4.2.4.1表示地理方位的名词 一些表示地理方位的名词如*南半球、北半球、江南、江北、西南、西北、华南、华北、华中、东北*等虽然不完全具备确指性,也要作为地名标注为*L*。 /[L*汉水*]/*流域*/*、*[L*西南*]/*地区*/*东部*/ /[L*江南*]/*大*/*部*/*、*[L*华南*]/*有*/*小*/*到*/*中雨*/ /*近*/[dur*两天*]/*造成*/[L*东北*]/*、*/[L*华北*]/*地区*/*的*/*降雨*/*天气*/*系统*/ /*迫使*/[L*北半球*]/*的*/*副热带*/*高压带*/*在*/[L*青*][L*藏*]/*地区*/"/*断裂*/"/ - 注:上述地名后面的方位词*南部、北部、东部、西部*不应包括在地名的括号里, 因为其所指的区域是更不确定的。 ##### 4.2.4.2方位词修饰地名实体时要整体标注为L /[L*东西九龙*]/ ----这是一个并列的地名。 /*一代*/*又*/*一代*/*海*/*测*/*官兵*/*犁*/*波*/*耕*/*浪*/*于*/[L*南中国海*]/*,* /[L*北爱尔兰*]/ /[L*中西伯利亚*]/ - 注:ER-99将此例标为*中*/ [L*西伯利亚*]。我们认为它整体是一个专指性的地名。 /[L*中南美*]/ /[L*东南亚*]/ - 注:ER-99要求把上面两个地名分别标注为[L*中*]/[L*南美*]*和*/[L*东*]/[L*南亚*]/*。其实中南美*指*中美*和*南美*两个地名,而*东南亚*是一个地名。这样的细节需要专门的地理知识才能做出判断。所以我们不遵循ER-99的这条规则。 #### 4.2.5地名实体受时间词修饰时,时间词不标 /*前*/[L*苏联*]/ /*前*/[L*南*]/*地区*----*南*指南斯拉夫,时间词*前*不标。 #### 4.2.6 只有经纬度在一起时才能标注为 **L** 只有经纬度在一起时才能标注为L,否则经度或纬度单独标为角度*ang*。如: /*震*/*中*/*位于*/[L*北纬三十六点二零度,东经九十点二九度*]/ /*并*/*将*/*卫星*/*定点*/*在*/[L*东经*110.5*度赤道*]/*上空*/*。*/ /*震*/*中*/*位于*/[ang*北纬*30.5*度*]/*,*/ #### 4.2.7天体的标注 /[L*宇宙*]/ /[L*地球*]/ /[L*太阳*]/ /[L*太阳系*]/ /[L*银河*]/ /[L*银河系*]/ /[L*月亮*]/ /[L*海王星*]/ /[L*东方红三号*]/ /[L"*鑫诺1号*"*卫星*]/ - 注:火箭只是卫星的发射工具,故火箭型号不作为星体标注。 /[dat*96年2月15日*]/*长征*/[ord*三号乙*]/*火箭*/*发射*/*失利*/, /*长*/[ord*二*]/*捆*/*火箭*/ 
----*全名为*"*长征二号捆绑式运载火箭*"。 ### 4.3不作地名标注的示例 /[L*阿*]/[L*以*]/*冲突*/ - 注:ER-99和MT-2认为阿(阿拉伯)不是一个特定国家的简称,本规范不采纳他们的规定。 /*回答*/*了*/[L*中*]*外*/*记者*/*的*/*提问*/ ---*外*不标。 #### 4.3.1地区一般不作为地名的一部分标注 仅当*地区*特指行政单位时,才被视为地名的一部分。一般情况下,*地区*泛指一片地方,不是地名的一部分。若不能确定时,*地区*不作为地名的一部分标注。 /[L*港*][L*澳*][L*台*]/*地区*/ -----*港澳台*是词表词。 /[L*巴尔干*]*地区*/ /[L*临沂*]/*地区*/*现*/*更名*/*为*/[L*临沂市*]/ #### 4.3.2平原、山脉、山区、盆地、沙漠、流域不在标注范围内 *平原、山脉、山区、盆地、沙漠、戈壁、流域、故里、故居、纪念馆、风景区、开发区、经济区*等都不在地名标注范围内。但当某某故居、故里、纪念馆成为一个对外开放的旅游景点时,才作为地名标注。如: /[L*云*][L*贵*]*高原*/ ----*云贵高原*是词表词不可分割,但云、贵要分别标注*L*。 /[L*成都*]/*平原*/ /[L*秦岭山*]/*脉*/ /[L*秦*]/[L*巴*]/*山区*/ /[L*四川*]/*盆地*/ /[L*撒哈拉*]*沙漠*/ ----*撒哈拉沙漠*是词表词。 /[L*长江*]/*流域*/ /[L*毕加索故居*]/ /*造型*/*典雅*//*毗邻*/[L*青云岩*]/*风景区*/*及*/[L*北山湾*]/*旅游区*/ *[L*约旦河西岸*]*----因为*约旦河西岸*是专指。 /[L*海峡两岸*] / ----指*台湾湾海峡两岸*。 /[L*两岸*]/ - 注:词表词*两岸*只有在表示台湾海峡两岸时,才作为地名标注为*L*,当作为*江河、湖泊*的两岸时,*两岸*要切分标注。如: /[L*长江*]/*的*/*丰姿*/*和*/[int*两*]/*岸*/*的*/*美景*/*尽收眼底*/*。*/ /*祖国*/[L*大陆*]/ - 注:内地虽然指中国大陆,但不作为地名标注,这里遵从了ER-99的规定。特区只有在确指是香港和澳门时才作标注。如: /*来自*/*内地*/*和*/[L*香港特区*]/ /[L*特区*]/*政府*/*和*/[L*香港*]/*同胞*/*正*/*以*/*喜悦*/*的*/*心情*/ /[L*中国*]/[L*厦门*]/*经济特区*/ #### 4.3.3对语言文字前的单音节地名不标,双音节的地名标注为L *英语*----对*英*不标注。 *汉语*----对*汉*不标注。 *中文*----对*中*不标注。 /*对*/[L*西藏*]/*地区*/*的*/*藏语*/*广播*/ /*主张*/*台语*/*在*/[L*台*]/ /*用*/[L*四川*]/*话*/ ----如果*语、文*前面的地名为双音节时,就要标注。 /[L*荷兰*]/*语*/ #### 4.3.4以族或裔结尾的词组中地名也要标注 MT-2和ER-99规定:以族或裔结尾的词组中的地名不标注。因此*华裔*、*汉族*中的*华*和*汉(指汉族)*都不作为地名标,但*华人、华侨、华商、中医、中草药、中餐馆、亚运会、奥运会*里的*华、中、亚、奥*仍需标注*L*。本规范不采用这一规则。作为民族的名字,单音节的不标,双音节的标*L*。 下面是一些标准实例: /[L*美*]*籍*[L*华*]人----"美籍华人"是词表词。 /*目的*/*是*/*促进*/[L*塞浦路斯*]/*西*/*族*/*与*/*土*/*族*/*的*/*和解* /*她*/*和*/*同*/*是*/[L*日*]/*裔*/[int*三*]/*世*/*的*/*男*/*友*/ /*通过*/*在*/[L*中*]*医药*/*宝库*/*里*/*寻找*/*线索*/ /*人们*/*纷纷*/*拥向*/[L*中*]*餐*/*馆*/*,*/*一时间*/*人满为患*/ /[L*吉普赛*]/*人*/----*吉普赛*不是词表词。 /[L*印地安*]/*民族*/*;*/ ----*印地安人*是词表词。 ## 第五章 机构名 机构名包括:股票(证券)交易所、国家或国际组织、商业团体(公司、企业、工厂)、电视台、广播电台、报刊杂志、出版社、政党或党派、学校、科研院所、医院、诊所、邮电局、乐队、体育运动队、联盟、议会或代表大会、军队、咖啡厅、酒吧、饭店、旅馆,以及虚构的机构等。 ### 5.1机构名标注规则 机构名的后缀应视为机构名的一部分。 | **序号** | **情况** | 标记方法 |例子 | | --- | --- | --- | --- | | 1 | 
普通名字+机构名 | 整体标出 | *[O*板桥市胜捷公司*]* | | 2 | 地名+机构名 | 机构名整体标出 | [O*北京市电信局*]*[O*台北县立莺歌高职*]*[O*台北看守所*]*[O*基隆长庚医院*]*[O*东直门敬老院*]机构名的关键词如:幼儿园、各级学校、科学院、部委、实验室、工厂、公司、报刊杂志、出版社、大使馆、领事馆、咖啡店、快餐店、饭店、酒店、旅馆等 | | 3 | 人名+机构名 | 机构名整体标出 | *[O*李嘉诚基金会*]* | | 4 | 简称 | 一律整体标注 | *[O*北约*]*[O*上轮集团*]----*指上海轮胎集团*[O*白宫*]/*官员*/表示 | ### 5.2机构名标注细则 #### 5.2.1机构名标注实体示例 /[O*国防部*]/*长*/[P*迟浩田*]/ /[O*美国国防部*]/*长*/[P*佩里*]/ /[O*台北县地政局地权课*]/ /[O*地政局*]/ /[O*政风室*]/*接*/*获*/*检举*/*调查*/ /[O*国军北投医院*]/ /[O*三重地政事务所*]/ /[O*台湾银行宜兰分行*]/ /[O*省立关山工商*]/ /[O*基隆市光隆家商*]/ /[O*东信国小*]/ /[O*安乐国中*]/ /[O*原住民委员会*]/ /[O*连萧全国竞选总部*]/ /[O*北京钓鱼台国宾馆*]/ /[L*浙江*]/[O*温州大酒店*]/ /[O*松下电工株式会社*]/ /[O*公司*]/*英文*/*名称*/[O *HUNAN* FORE *SCAPE* TECHNOLOGY*CO*.,*LTD*]/ /[O*朝鲜人民武装力量部*]/*副*/*部长*/ /[O*美国海军*]/ /[O*欧共体*]/ /[O*中国国家生育委员会*]/ /[O*中国奥林匹克队*]/ /[O*披头四*]/ /[O*飞虎队*]/ /*敢死队*/ -----泛指不标。 /*但是*/[O*共和党*]/*人*/*说*/ /[O*土耳其议会外交关系委员会*]/ /[O*终战*50*周年国会议员联盟*]/ /*记者*/*来到*/[O*中山医科大学第一附属医院住院部*]/ /[O*中共中央政治局*]/*常委*/*、*/[O*中央纪委*]/*书记*/[P*尉健行*]/ - 注:中国共产党的简称中共或共要标注为O。例如: /[ord*第二次*]/[O*国*]/[O*共*]/*合作*/ - 注:类似的简称党,由于专指性不强,不标,如: /但/这种/现象/的/产生/,/是/同/党/和/国家/尊师重教/的/方针/背道而驰/的/, /全国/"/[dat三八]/"/红旗手/、/全国/优秀/共青团员/ - 注:"三八红旗手"是词表词。但如果"三八"在文中被双引号断开,就要单独表为dat。另外,词表词共青团员、共产党员、共产党人、中的机构名不确指,所以一律不标。 /[O*中共中央政治局常委会*]/ - 注:常委会可以是机构名,常委则不是。 /*党*/*的*/[O*十四大*]/*以来*/ - 注:中共的*X中全会*不是机构名,除了词表词*三中全会*什么也不标以外,数词*X*应单独标注为*ord*。例如: /*根据*/*党*/*的*/[ord*十五届*]/[ord*二*]/*中*/*全会*/ /[O*八届全国人大*]/*代表*/[P*陈妙珍*]/ /[O*西藏政协*]/*委员*/*强调*/*,*/*必须*/*旗帜*/*鲜明*/*地*/*反对*/*民族*/*分裂*/ [O*澳门中华总商会*]/*会*/*董*/*兼*/[O*青年委员会*]/*副*/*主任*/ /[O*足协*]/*杯赛*/*冠军*/[O*北京国安队*]/ ----*杯赛*是词表词。 /[O*以国家电视一台*]/ ----指以色列国家电视一台 /[L*汉城*]/[O*路透*]/*电*/ /*前*/[L*苏联*]/[O*切尔诺贝利核电站*]/*泄漏*/*事件*/ /*参加*/*这次*/*比赛*/*的*/*还有*/[O*日本*]/*、*/[O*俄罗斯*]/*、*/[O*美国*]/*、*/[O*德国*]/ 和/[O*意大利队*]/*。*/ /*前往*/[O*解放军驻港部队总部*]/*慰问*/*驻军*/ /[O*第四届和平小天使台湾访问团*]/*抵达*/[L*重庆直辖市*]/ /[O*塔里班*]/*部队*/*已经*/*到达*/[P*杜斯塔姆*]/*将军*/*的*/*家乡*/ /*用*/*公款*/*购买*/[O*靖国神社*]/*和*/[O*护国神社*]/*的*/*祭祀*/*品*/ 
/*纪念币*/*正面*/*是*/*由*/[O*解放军*]/*军徽*/*光,*/*八一南昌起义*/*和*/[O*解放军*]/[O*陆*]/[O*海*]/[O*空*]/*三军*/*战士*/*的*/*图案*/ - 注:词表词八一南昌起义*是一个事件,不是机构名。三军*是词表词,所以数字*三*不作为*int*标注。 - 注:股市报导中的企业和公司名不论其前后有没有外文字符,一律作为一个整体 标注成*O*。例如: /[O*ST辽物资*]/[dec*14.141*]/[O*宁波中百*]/[dec*20.354*]/ /[O*DR沪港机*]/[dec*11.194*]/[O*鲁北化工*]/[dec*8.051*]/ - 注:商城或百货公司本应标注为L,但作为股市中时企业时应标注为O。 - 注:股票指数在没有明确说明是多少元的情况下一律标注为*int*或*dec*。 - 注:被命名的轮船、飞机、机车应标注为*O*。例如: /*却*/*购*/*回*/*了*/[int*3张*]/ [*O"长月"号轮船*]/*船票*/*,*/ /[O*泰坦尼克号游轮*]/*上*/*的*/*这*/*对*/*情人*/*实在*/*浅*/*得*/*很*/*。*/ /[O*美国"哥伦比亚"号航天飞机*]/*上*/*的*/*宇航员*/ #### 5.2.2机构名的后缀是机构名的一部分 机构名的后缀是机构名的一部分,即要准确的标出机构名的最长边界(机构名的全称)。机构名中可以包含人名、地名和机构名,但对于它们不再单独标注。例如: /[O*苗栗县环保局*]/ /[O*卫生署桃园医院*]/ /[O*兰阳民生医院*]/*前身*/*为*/[O*吴外妇科*]/ /[O*台北爱乐青年管弦乐团*]/ /[O*行政院农委会林业试验所福山分所*]/ /[O*宋庆龄基金会*]/ /[O*上海轮胎橡胶(集团)股份有限公司*]/ /[O*中国驻日本大使馆*]/ /[O*美国白宫*]/ /*前*/[O*中国新华社香港分社*]/*社长*/[P*许家屯*]/ [O*清华大学计算机系人工智能实验室*]/ [O*中保财产保险四川省分公司*]/ #### 5.2.3国家(或国际)立法部门或行政部门标注为机构名 /*当选*/[O*国会*]/*议员*/ /[O*内阁*]/*改组*/*将*/*会*/*在*/[dat*八月底*]/*前*/*完成*/ /*前*/[O*内阁官房*]/*长官*/[P*山静六*]/ /[P*刹瓦什*]/*向*/[O*宪政法庭*]/*提出*/*动议*/ 5.2.4地名和机构名紧邻时的情况 地名和机构名的关系一般有以下两种情况: (1)表示所属关系(如:法国航空航天局,航空航天局隶属法国)。 (2)表示地理位置关系(如:北京邮电大学表示大学位于北京,而不是隶属于北京)。 地名和机构名之间还可能有更复杂的情况,这里不予讨论。 ##### 5.2.4.1规则一 如果机构名以一个地名开头,而且删除这个地名后所剩部分不再是一个具有特指性的机构名,那么该地名必须留在机构名中作为该机构名的一部分标注; /[O*北京大学*]/ /[O*深圳中学*]/ /[O*复旦大学专用集成电路与系统实验室*]/ /[O*东南大学*]/[O*深圳宝安设计院*]/ ##### 5.2.4.2规则二 如果机构名前面还有一个或多个地名,那么该机构名与前面紧邻的地名应当分开标注。 如: /[L*中国*]/[O*北京大学*]/ /[L*中国*]/[L*广东*]/[O*深圳中学*]/ /[L*北京*]/[L*昌平*]/[O*十三陵抽水蓄能电站*]/ ##### 5.2.4.3规则三 如果一个机构名的开头不是地名,那么当它前面邻接一个或多个地名时,只有其中与该机构名紧邻的那个地名需一起标注。例如: /[O*上海同济大学*]/ /[L*中国*]/[O*上海同济大学*]/ /[O*湖北省武钢三中*]/ ##### 5.2.4.4规则四 如果一个机构名本身以两个或两个以上并列的地名开头,则这些地名都要留在该机构名中。如果在它前面再出现其它地名时,一律同该机构名分开标注。但是如果上一级地名不能管辖下一级地名时,要把上一级地名标注在机构名内。 例如: /[L*洛杉矶*]/[O*亚太法律中心*]/ /[L*香港*]/[O*中港贸易协会*]/ /[O*广东亚洲大酒店*]/ /[O*澳大利亚维多利亚投资公司上海办事处*]/*》*/, /[O*澳大利亚维多利亚投资公司*]/*》*/ - 注:"广东"与"亚洲、澳大利亚与维多利亚"都不属于上、下级管辖关系,所以要把上一级地名标注在机构名内。 ##### 5.2.4.5更复杂的情况 在更复杂的情况下,我们可能无法判定某机构名究竟是以一个还是两个地名开头的。这时可按规则5.2.5和5.2.6来处理。 
例如,*洛杉矶台北经济文化办事处* 究竟是A:*[L*洛杉矶*]/[O*台北经济文化办事处*]* 还是B:*[O*洛杉矶台北经济文化办事处*]* 这时,默认的标注方式是B(理由见5.2.8)。 ##### 5.2.4.6地名概念比较模糊的情况 如果该地名比较模糊,而标注者又没有足够的知识来判断某机构名的开头是否是一个地名。就标注到一个比较明确的地名, 例如:*印度尼西亚莫巴蒂努山打腊航空公司*中的*莫巴蒂*·*努山打腊*不知道是不是地名。但至少知道一旦拿走了这个字符串,剩下的字符串已不构成专指性的地名。此时,按规则2.5的标注方式应是: /[L*印度尼西亚*]/[O*莫巴蒂*·*努山打腊航空公司*]/ /[O*河北沙岭子电厂*]/ ----*沙岭子*是一个乡镇的地名,河北和内蒙古都有一个沙岭子镇,地名的概念比较模糊,故标注在机构名内。 /*国际*/*著名*/*的*/[O*加拿大*B+*H国际建筑师事务所*]/ ##### 5.2.4.7紧邻的地名和机构名不构成修饰关系的情况 一个地名后紧邻一个机构名,但它们不构成修饰关系,则一律分开标注。 /*促进*/*了*/[L*中国*]/[O*东盟*]/*的*/*合作*/ /*在*/[L*日内瓦*]/[O*联合国*]/&*人*/*权*&/*会议*/*上*/ 更典型的例子需借助上下文来判断,如: /*促进*/*了*/[L*中国*]/[O*微软*]*的合作*/ /[O*中国微软*]/*即将*/*发布*/*新产品*/ - 注:如果标注者不能判断它们是不是修饰关系,则默认为分开标注,如: /[L*中国*]/[O*微软*]/ /[O*美国众议院*]/ /[L*重庆*]/[O*长江救助打捞公司*]/ /[L*日本*]/[O*东京股市*]/ ----错误标注! /[L*日本*]/[L*东京*]/*股市*/ ----正确标注。 /[L*美国*]/[L*华盛顿*]/[O*三普证券公司*]/ ----错误标注! /[L*美国*]/[O*华盛顿三普证券公司*]/ ----正确标注。 /[L*华盛顿*]/[O*美国国务院*]/ /[L*瑞典*]/[O*斯德哥尔摩国际和平研究所*]/ #### 5.2.5会议、晚会、运动会等以会结尾的短语是事件,不作机构名标注 /*泛*/[L*美*]/*运动会*/ /[L*中国*]/[ord*第一届*]/*人工智能*/*大会*/ /[ord*第四届*]/[L*中*]/[L*法*]/*经济*/*研讨会*/ /[ord*第三届*]/[L*海峡两岸*]/*水利*/*科技*/*交流*/*研讨会*/ ----以上几例为事件,不是机构名。 /[O*中国人工智能协会*]/ /[O*中国人工智能联合会*]/ ----为机构名。 当会议指议会(congress)或代表大会(chamberofdeputies)时,应视为机构名。但是要注意:虽然议会或代表大会是机构名,但是议会或代表大会中的某一次会议是一个事件,不是机构名。为了更明确的区分各种情况,我们用以下例子说明: /*通报*/*了*/[O*八届政协*]/[ord*五次*]/*会议*/*的*/*各*/*项*/*安排*/ /[O*全国政协*]/[ord*八届*]/[[ord*五次*]/*会议*/*将*/*于*/ /*听取*/*和*/*审议*/[O*全国政协八届五次会议常务委员会*]/*报告*/ /*审议*/[ord*八届*]/[ord*五次*]/*会议*/*提案*/*审查*/*情况*/*的*/*报告*/ - 注:*八届五次会议*、*五次会议*是一个事件,不应标注为机构名。但是这次会议的组委会、委员会应视为机构名。例如: /[O*八届全国人大*]/[ord*五次*]/*会议*/ /[O*政协九届一次会议*]/ --错误标注! 
/[O*中国共产党第十五次全国代表大会*]/ /[O*九届人大*]/[ord*一次*]/*会议*/ /[O*中国全国人大*]/ /[O*中共十五大*]/ /*各级*/*人大*/*常委会*/ --不是专指,故不标。 /[O*中国科协*]/[ord*第五次*]/*全国代表大会*/ /[L*湖南省*]/[ord*六届*]/[ord*二次*]/*全*/*委*/*会议*/ /*向*/*同级*/*人民代表大会*/*或*/*人民代表大会常务委员会*/*提请*/*审议*/ - 注:*全国人民代表大会*和确指的省、市人民代表大会及其常委会、常务委员会需作为机构名标注。泛指的人大、中央银行、人民银行、&*农*/*发*/*行*&不作为机构名标注。 /[O*临澧县人大*]/*抓*/*村*/*级*/*财务监督*/*一瞥*/*(*/*监督*/*广角*/) /*由于*/*各级*/*人大*/*代表*/*的*/*有效*/*监督*/*,*/[dat*去年*]/*以来*/*该*/*县*/*各*/*村*/*村*/*务*/*情况*/*出现*/*好转*/*,*/ - 注:在地名*国会大厦*中,*国会*不可作为机构名标注,否则就出现嵌套了。 /[L*国会大厦*]/ - 注:"联合国大会"及其简称"联大"都是词表词,但不可整体标为O。如: /[O*联合国*]*大会*/*于*/[dat*1992年*]/*批准*/*了*/*这*/*一*/*条约*/*。* /[P*沈国放*]/[dat*27日*]/*在*/[O*联*]*大*/*全体*/*会议*/*上*/*表示*/, - 注:*会*也可能出现在一般的机构名中,如: /[O*红十字协会*]/ #### 5.2.6用我们、我等代词修饰的机构名,只对机构名进行标注 /*我国*/[O*共产党*]/ /*我们*/[O*清华大学*]/ - 注:根据上下文是确指的某公司、单位名称的简称要标注为机构,否则不标注!但如果在公司、集团等词前面有本、我、该等字样时,此处的公司、集团不进行标注。其他特殊情况依据上下文进行标注。如: /*凡*/*《*/[O*克罗伏特缓冲器股份有限公司*]/*股份*/*》*/*记名*/*的*/*持有*/*人*/*均*/*为*/*本*/*公司*/*股东*/*。*/ /*我*/*公司*/*出资*/*总额*/[mon*50万元*]/ /[O*港资陕西华懋实业公司*]/*总经理*/[P*商铭渔*]/*,*/*受*/[O*公司董事会*]/*委托*/*来到*/[O*咸阳市西北地勘局二一五医院*]/*看望*/[O*公司*]/*保安*/*员*/[P*韩玉刚*]/*,*/ #### 5.2.7大使馆和领事馆的标注 当大使馆(或领事馆或其它外交使团)所代表的国家和所在地区相连时,整体标为机构名。如: /*后来*/*调*/*任*/[O*美国驻洪都拉斯大使馆*]/ 当大使馆(或领事馆或其它外交使团)所代表的国家或所在地没有出现在上下文中,或者在描述范围内不连续,那么存在两种情况: (1)大使馆所代表的国家和大使馆(领事馆)相连,此地名和大使馆一起标记 为机构名。如: /*前往*/[L*香港*]/*的*/[O*洪都拉斯领事馆*]/ (2)大使馆所在地和大使馆(领事馆)相连,此地名应单独标记,整体不作为机构名。如: /[L*美国*]/*在*/*通过*/*驻*/[L*金沙萨*]/*大使馆*/*和*/*其它*/*正常*/*管道*/ - 注:虽然*驻金沙萨大使馆*是一个连续的短语,但它的实际意思是*美国(或*X*国)驻金沙萨大使馆*,而不是什么*金沙萨(的)大使馆*。因此在这里*大使馆*不视为机构名。 #### 5.2.8生产厂家要标注为机构名,产品则不标 这里定义的产品范围较广,不仅包括生产厂家生产出来的产品(如自行车等),还包括计算出来的产品(如:股票指数)、媒体产品(如:电视节目) /[O*道琼*]/*工业*/*平均*/*指数*/ ----因为股票指数可以视为产品,那么*道琼*就可以视为生产厂家。 /[O*纳斯达克*]/*指数*/ ---原因同前。 /[O*太原刚玉*]/[dec*10.581*]/ /[O*咸阳偏转*]/[dec*16.112*]/ /[O*深华发A*]/[dec*15.663*]/ /[O*渝开发*A]/ #### 5.2.9报纸、广播电台、电视台和杂志的名字要标为机构名 新闻媒体(如:报纸、广播电台、电视台和杂志等)的名字要标为*O*,但报刊、电视栏目的名字不标。例如: /[O*美国之音*]/*记者*/*表示*/ /[O*人民日报*]/*海外*/*版*/[ord*第三版*]/ /*《*/[O*泰晤士报*]/*》*/*援引*/*一个*/*国际*/*专家*/*委员会*/ 
/[O*中央电视台*]/*《*/*焦点*/*访谈*/*》*/*、*/*《*/*东方*/*时空*/*》*/*主持人*/ /[O*武汉电视台*]/*《*/*科技*/*之*/*光*/*》*/*栏目*/*的*/*《*/*科学家*/*,*/*您好*/*》*/*专栏*/ /[O*美国《科学》杂志*]/ /[O*美国探索电视网*]/ /*创办*/*《*/[O*深圳房地产快讯*]/*》*/ /*办*/*好*/*《*/[O*中外房地产导报*]/*》*/ #### 5.2.10特殊情况 ***民族不作为机构名*** ***泛指的*部队不作为机构名** ***政府不作为机构名*** ***学术或商务会议(conference,meeting)不作为机构名*** ***交易会不作为机构名*** ***运动会不作为机构名*** ***联赛不作为机构名*** #### 5.2.11特殊情况示例 /[L*中国*]/[L*天津*]/*出口*/*商品*/*交易会*/ /[L*中国*]/[O*天津出口商品交易会*]/ ----错误标注! /[L*中国*]/*政府*/ ----*不把政府*标为机构名。 /[L*非洲*]/*维持*/*和平*/*部队*/ ----*不把部队*标为机构名。 /[L*中国*]/*公安*/*部门*/ ----*不把部门*标为机构名。 /[O*中国公安部门*]/ ----错误的标注! - 注:标注并列的机构名(*O*)时,连接词和标点符号不进入标注范围。例如: /[O*上海*]/*、*/[O*北京人类基因组研究中心*]/ /[P*贺国中*]/*分别*/*任*/[O*一*]/*、*/[O*四*]/*、*/[O*七团*]/*党代表*/ - 注:上述情况和标注并列的序数(*ord*)不同,连接词和标点符号是否进入标注范围取决于序数词所修饰的词语。例如: /*获得*/*个人*/[ord*一、二、三等*]/*奖*/ /[ord*一*]/*、*/[ord*二*]/*、*/[ord*三*]/*产业*/ /*书店*/[ord*三、四层*]/ - 注:*中央*不作为机构名,但党中央*标为机构名。 /*在*/*中央*/*的*/*领导*/*下*/ /*以*/[P*胡锦涛*]/*同志*/*为*/*核心*/*的*/[O*党中央*]/*周围*/ #### 5.2.12地名和机构名容易混淆的情况 /[L*人民大会堂*] ----地名。 /[O*五角大楼*]/*发言人*/*说*/*,* /[O*白宫*] ----机构名。 /[O*克里姆林宫*]/*表示* ----机构名。 /*在*/[L*总统府*]/*分别*/*约见*/*了*/*多*/*位*/[O*国民党*]/*中*/*常委*/*检察官*/ - 注:*总统府*标注为L而不是*O*。这是因为有的国家有多处总统府,所以不能把它们视为国家或政府的唯一代表。 - 注:下面的例子中出现的类似单位名称的,因不是确指,而且是出现在各种条令、合同中,适合任何一个省、市、县的单位机构名称,所以不能作为一个机构名称标注为*O*。如: /*本*/*合同*/*正本*/[int*三份*]/*,*/*出租*/*人*/*、*/*承租*/*人*/*、*/*市*/*公证处*/*各*/*执*/*一*/*份*/*。*/*副本*/*若干*/*份*/*,*/*报*/*市*/*经济*/*委员会*/*、*/*市*/*经济体制*/*改革*/*委员会*/*、*/*市*/*财政*/*局*/*、*/*劳动局*/*、*/*税务局*/*、*/*审计*/*局*/*、*/*工商*/*行政管理*/*局*/*、*/[O*中国人民银行*]/*市*/*分行*/*、*/[O*中国工商银行*]/*、*/*市*/*分行*/*等*/*有关*/*部门*/*备案*/ */*本*/*合同*/*在*/*履行*/*中*/*如*/*发生*/*争议*/*,*/*双方*/*应*/*协商*/*解决*/*;/*协商*/*不*/*成*/*时*/ /*任何*/*一方*/*均*/*可*/*向*/*工商*/*行政管理*/*局*/*合同*/*仲裁*/*委员会*/*申请*/*调解*/*或*/*仲裁*/ ## 第六章 数字串标注总则 数字串(**Factoid**)包括时间表达式(**TIMEX**) 、数字表达式( **NUMEX** )、度量表达式(**MEASUREX**)和地址表达式(**ADDREX**)等**4**大类,***27***个小类,详见表**1-1**。标注数字串的一条重要原则就是:它的标记不得插入到词表词的内部(见**1.5.2.4**)。 ### 6.1时间表达式 
时间表达式(*TIMEX*)包括日期(*dat*)、时间(*tim*)和时段(*dur*)三小类。所有小于一天的时间都被定义为时间(*tim*),如秒,分,小时。一天或者大于一天的时间单位则属于日期(*dat*),如*天,日,星期,礼拜,月,季度,年,五年,十年,世纪*等。时段(dur)通常也使用日期和时间中的单位,如月、年、时、分*等。对此标注者要注意区分。 将日期、时间同时段区分开来有时是困难的,下面分别给出它们的定义。 #### 6.1.1日期(dat)和时间(tim)的定义 日期和时间在一维的时间坐标轴上有相对确定的位置。小于一天的时间都被定义为时间。一天或者大于一天的时间则属于日期。 /[tim*8*点30*分*]/ /[dat*今天*]/[tim*晚上*]/ ----*晚上*是词表词。 /[dat*昨天*]/[tim*夜里*]/ ----*昨天*和*夜里*都是词表词。 /[tim*昨夜*]/ ----*昨夜、昨晚*都是词表词,只能整体标*tim*。 /[dat*昨*]/[tim*晚*]/ ----错误的标注! /[dat*春节*]/---在每一年中,是比较固定一天或几天。 /[dat*1999*年*]/---以*年*为单位,与别的年份相区别。 /*在*/"/[dat*六五*]/"/*中*/---以*五年*为单位,与别的*五年*相区别 - 注:严格地说,每一个*dat*或*tim*都占据了一个时间段,因此这里出现的*期间*和*中*,不能作为标注时段的理由。 /"/[dat*九五*]/"/计划 /[dat"*九五*"*初*]/ /*仅*/*"*[dat*八五*]/*"*/*期间*/*就*/*达*/[mon一百一十五亿元]/。 /[dat*下半年*]/---以*半年*为单位,与*上半年*相区别。 /[dat*二十世纪*]/---以一百年为单位,与别的*世纪*相区别。 /*为*/*庆祝*/[O*北京大学*]/*建*/*校*/[dat*100周年*]/*,*/ /[dat*民国八十六年*]/ /[dat*民国六十年代*]/ /[dat*八十八年下半年*]/*及*/[dat*八十九年*]/*中央*/*统筹*/*分配*/*款*/*,*/ /[dat*公元二千年*]/ /[dat*今年九月*]/ /*"*/[O*迈特兴华*]/*"*/*杯*/[ord*首届*]/*全国*/*象棋*/*大师*/*赛*/*于*/[dat*今日*]/*收*/*秤* /[dat*1997年下半年*]/*,*/ /*可*/*于*/[dat*农历年*]/*前*/*迁居*/*。*/ /[tim*第七十三分钟*]/ /[tim*中午*12*点*]/ /[tim*格林威治时间*5*时*59*分*]/----含有地名。 /[dat*第二天*]/[tim*一大早*]/*,*----*一大早*是词表词。 /*在*/[dat*今年暑期*]/*大学生*/*送*/*科技*/*下乡*/*活动*/*中*/, /*大约*/[tim*七点*]/*到达*/*----大约*不标。 /[tim*晚上大约七点*]/*到达*/ - 注:*大约*被两个*tim*包围,分割不开,所以整体标上。这条标注规则遵照了ER-99和MET-2的标准。 - 注:事件戊戌变法、辛亥革命、甲午战争、五四运动等都是词表词,其中的日期不标注。但当戊戌、辛亥、五四单独出现时,应作为日期来标注。例如: /*与*/*稍*/*后*/*的*/*辛亥革命*/*,*/*都*/*有*/*相通*/*的*/*地方*/, /*在*/[L*香港*]/*回归*/[dat*周年*]/*前夕*/*和*/*"*/*七七事变*/*"*/*纪念日*/*,*/[dat*戊戌*]/*思潮*/*与*/*前此*/*的*/*洋务运动*/, #### 6.1.2时段(dur)的定义 时段既可以长于一天,也可以短于一天。它不同于日期和时间,在一维的时间坐标轴上没有确定的位置。例如: /[dur*三年*]/ /[dur*半年*]/ /[dur*四分之一个世纪*]/ /[dur*廿四个月*]/ /*时间*/*长*/*达*/[dur*六分钟*]/ /[dur*两个星期*]/ /[dur*一个月*]/*后*/ /*曾*/*在*/[dur*5、6年*]/*前*/*撰文*/*陈述*/ /*早产*/[dur*十二周*]/*左右*/ /*大水*/[dur*十天*]/*后*/*才*/*退*/*尽*/ /[dur*一至两年*]/ /[dur*一小时卅分钟*]/ /*这*/[dur*几天*]/ /[dur*卅天*]/*会期*/*只*/*开*/*了*/[dur*九天*]/ /*虽*/*经*/[dur*一整天*]/*磋商*/*,*----*一整天*不是词表词,但要标为*dur*。 
与*/*洪水*/*奋战*/[dur*一天一夜*]/*,*----*一天一夜*也不是词表词。 时间表达式的标注细则详见第七章。 ### 6.2数字表达式 数字表达式(*NUMEX*)包括百分数(*per*)、钱款(*mon*)、频度(*fre*)、整数(*int*)、分数(*fra*)、小数(*dec*)、序数(*ord*)、比率(*rat*)等8小类。 #### 6.2.1百分数(per) /[per*百分之二十五*]/ /[per*百分之一点七*]/ ---虽然是小数,但要标作per。 /[per*六点五百分点*]/ /[per*五成*]/*以上*/ /[per*六折*]/ /[fra*百万分之八*]/ ----注意标的是*fra*而不是*per*。 /*大约/*[per5%]/ ----约数*大约*不进入标注。 6.2.2钱款(mon) /[mon*四亿元台币*]/ /[mon*43.6亿美元*]/ /[mon*卅万元*]/ /[mon*四万五千块钱*]/ /[mon*四万五千元人民币*]/ /*只*/*增加*/*了*/[mon*几元钱*]/*的*/*成本*/ /*决不*/*乱*/*花*/*国家*/*的*/[mon*一分钱*]/。 - 注:同一笔钱的不同货币形式需分开标注。货币中的地名不标。 [mon*26万英镑*]/ (/[mon*43.6亿美元*]/)/ - 注:*约*是一个不确切的概念,故不标注。但*上*、*数*、*好*要和数字串捆绑在一起标注。但*近*作为特例,不与数词捆绑!! /*约*/[mon*十万元*]/ /*大概*/*需要*/*花费*/[mon*上千万美元*]/*的*/*投资*/*和*/[dur*3*年*]/*左右*/*时间*/*,*/ /*多*/*收入*/[mon*好几十元*]/ #### 6.2.3频度(fre) /[fre*数度*]/ /[fre*两次*]/ /[fre*26次*]/ /[fre*十多次*]/ /[fre*多次*]/ - 注:动量词次除了一次不标注以外,其余的全部标注为*fre*。 /[fre*一次次*]/ /[fre*再次*]/ /[fre*无数次*]/ /[fre*数次*]/ #### 6.2.4整数(int) *int*标注的是数词和量词组合成数量词组。 /[int*卅七件*]/ /[int*一百卅项*]/ /[int*三种*]/ /[int*九个*]/*课室*/ /[int*几家*]/ /*后*/[int*几名*]/ /[int*十*]/*多*/*人*/ /[int*四条*]/*断层*/ /[int*五十户*]/ /[int*百余名*]/ /[int*上万*]/*人潮*/ /*"*/[int*双*]/[int*百*]/*"*/*方针*/, - 注:"双百方针"是词表词,由于文中"双百"用引号括起,而且它们是两个数字,所以要分别按数字串标注。类似情况还有词表词"五四运动",这是个事件不标。但是如果文中日期"五四"被引号括起,就要单独标为:/"/[dat五四]/"/运动/。又如"六一儿童节、六一国际儿童节、六一节"都是词表词。由于"六一"和"儿童节"是同一个日期,即使在文中"六一"被引号括起,也可以整体标为dat,如:/[dat"六一"儿童节]/。 - 注:人次应标注为*mea*而不是*int*,例如: /*近*/[dur*3年*]/*中*/*,*/*该*/*市*/*采取*/*多*/*形式*/*的*/*农技*/*培训*/*近*/[mea*万人次*]/, - 注:"*数词*+*强*"不一定表示序数,因此只单独标注数词为*int*。例如: */*在*/*这次*/*从*/[int*十六*]/*强*/*到*/*冠*/*、*/*亚军*/*的*/*一次性*/*竞猜*/*中*/*,*/* /[O*宝钢*]/*为*/*跻身*/*世界*/[int*500*]/*强*/*而*/*采取*/*的*/*重要*/*步骤*/*。*/ #### 6.2.5分数(fra) /[fra*数倍*]/ /[fra*一半*]/ /[fra*千百倍*]/ /[fra*3/4]/ /[fra*四分之三*]/ /[fra*百万分之三百六十四*]/ *----*注意标记是*fra*而不是*per*。 /[fra*半个*]/ /[fra*4倍半*]/ *----*倍数是分数的一种表示,应标*fra*。 /[fra*4倍半*]/ /[fra*4.5倍*]/ ----*虽然* *4.5*是个小数,但不标*dec*。 /*有效*/*载*/*力*/*提高*/[fra*2至3倍*]/ 注:"过半数"是词表词,因此不作为分数fra标注。例如:*/*都*/*难以*/*获得*/*过半数*/*的*/[int*207张*]/*选票*/*,*/* 
#### 6.2.6小数(dec) /[dec*3.14]/ /[dec*三点一四*]/ /*看*/*了*/*那么*/*长*/*时间*/*的*/*电视*/*,*/*视力*/*依旧*/[dec*1*.*5*]/ /*我*/*有着*/*足以*/*令*/*我*/*自豪*/*的*/[dec*1*.*2*]/*视力*/ ----视力的多少是一个量级,没有单位,故按数量标注整数或小数。/*并*/*以*/[dec*6139.69点*]/*收盘*/ /*以*/ [dec*33.8*]/*收盘*/ /*比重*/*:*/[dec*1.02*]/ #### 6.2.7序数(ord) /[ord*第一任*]/ /[ord*第一期*]/ /[ord*十六楼*]/ /[ord*第三次*]/*世界大战*/ /[ord*首*]/*日*/*销售*/*欠佳*/ /[ord*第二*]/*故乡*/ /[ord*三等*]/*奖*/ /[ord*前*6*名*]/ /*地震烈度*/*不*/*超过*/[ord*8度*]/ /*这*/[ord*第二条*]/*尤为*/*重要*/ /*位居*/*金牌*/*榜*/[ord*第二名*]/ /[O*北京市*]/[ord*首家*]/*就业*/*与*/*创业*/*组合*/*市场*/ /[ord*1174号*]/*文件*/ /[ord*6*路*]/*汽车*/ /[ord*六年级*]/*学生*/ /[dat*今年*]/*读*/[ord*大三*]/ /*发展*/*第一产业*/* ----第一产业*是词表词。 /*发展*/[ord*第一*]*产业*/ ----错误的标注。 /*阵风*/[ord*五级*] /[ord*一、二、三等*]/*奖*/。 /*他*/*亲手*/*接*/*治*/[L*墨西哥*]/[ord*首例*]/*艾滋病*/*患*/*儿*/ #### 6.2.8比率(rat) / [rat*一比廿五*]/ /*以*/[rat*0∶6*]/*失利*/ /*上*/*一*/*届*/*世界杯*/*赛*/*就*/*以*/[rat*1∶0*]/*胜过*/*。*/ /*最终*/*以*/[rat*三比三*]/*握手言和*/*。*/ /*用*/*原液*/*与*/*水*/*稀释*/[rat*1*∶*20*倍*]/*。*/ 数字表达式的标注细则详见第八章。 ### 6.3度量表达式 度量表达式(*MEASUREX*)包括年龄(*age*)、温度(*tem*)、角度(*ang*)、长度(*len*)、面积(*are*)、容积(*cap*)、重量(*wei*)、速度(*spe*)、加速度(acc)和其它(*mea*)等10小类。 #### 6.3.1年龄(age) /[age*卅五岁*]/ /[age*廿一岁*]/ /[age*六十五岁*]/ /[age*34岁*]/ /[age*六十寿辰*]/ /[age*花甲*]/*老人*/ ----*花甲*是词表词。 /*如同*/[age*年过半百*]/*的*/*老*/*妇*/*。*/ ----*年过半百*是词表词。 /[P*李元*]/*、*/[P*卞德培*]/[int*两位*]/*先生*/*都*/*已*/[age*年逾古稀*]/。 #### 6.3.2温度(tem) /*寒流*/*耍*/*酷*/*平地*/[tem*6℃*]/ /*才*/*会*/*微*/*升*/[tem*6.1℃*]/ /*但*/*平地*/*温度*/*还*/*会*/*下*/*探*/[tem*5℃*]/*左右*/ /*积温*/*高*/*(*/[tem*2800度*]/*)*/----注意! 
/[tem*零下*5*到*6*摄氏度*]/ - 注:数字范围的标注方式详见7.1.1。 /*大约*/[tem*5~7℃*]/ /*低温*/*反而*/*只*/*有*/[tem*10℃*]/~/[tem*12℃*]/ /[tem*摄氏19*-*24度*]/ /[tem*摄氏*19*度*]/ -/[tem*24度*]/ #### 6.3.3角度(ang) /*钝角*/*就*/*是*/*大于*/ [ang*90度*]/*的*/*角*/ /*并*/*将*/*卫星*/*定点*/*在*/[ang*东经*110.5*度*]/[L*赤道*]/*上空*/*。*/ /*震*/*中*/*位于*/[ang*北纬*30.5*度*]/*,*/ ----详见4.2.4.2 #### 6.3.4长度(len) /*开掘*/*到*/ [len*一米六七*]/*深度*/*时*/ /*高*/ [len*五米*]/*宽*/ [len*一百米*]/ /[len*109×78厘米*]/ /[len*1纳米*]/=/[len*十的负九次方米*]/ /*应用*/*于*/*紧*/*固*/*件*/*直径*/*为*/[len*1*/*4″*]/*(*/[len6m]/*)*/ /*最高*/*速度*/*每*/*秒*/ [len*360米*]/ /*发生*/*每*/*秒*/*速度*/*达*/[len*四十二米*]/*的*/*大风*/*。*/ /[L*三峡*]/*截流*/*落差*/*在*/[len*0.7-0.8米*]/*之间*/*,*/ #### 6.3.5面积(are) /[are*廿七公顷*]/*土地*/ /*占*/*地*/[are*六百多公顷*]/ /*兴建*/[are*五千坪*]/*大*/*的*/*厂房*/ /[are*七百余坪*]/ /*每*/*套*/*住宅*/*面积*/[are*140*-*160m2*]/*,*/ /[are*997万平方公里*]/ /*农田*/ [are*20万亩*]/ #### 6.3.6容积(cap) /*运输量*/*为*/ [cap*34个立方*]/ /[cap*一两箩*]/*谷子*/ /*选定*/*的*/*设计*/*流量*/*是*/*每*/*秒*/[cap*1.4*万至*1.9万立方米*]/*。*/ /*工程*/*已*/*完成*/*土方*/[cap*2300多万方*]/*,*/ /*全国*/*消费*/*了*/[cap*25万升*]/*啤酒*/*。*/ #### 6.3.7重量(wei) /[wei*九百至一千吨*]/ /[wei*零点三公克*]/ /[wei*三千二百英吨*]/ /*重*/*约*/[wei*五、六公斤*]/*的*/*鲤鱼*/ /[wei*十台斤*]/ /[wei*三点五公吨*]/ /*产量*/*达到*/ [wei*数千万吨*]/ /[wei*几万吨*]/ /[wei*二十万吨*]/*级*/*以上*/ - 注:ER-99把上例标为:[wei*二十万吨级*]/以上。 #### 6.3.8速度(spe) /*最高*/*速度*/ [spe*360米每秒*]/ /*打印*/*速度*/*:*/[spe12cps]/ ----"*cps*"表示"characters per second(每秒字符数)"。 #### 6.3.9加速度(acc) /*抗震*/*能力*/*:*/*地面*/*水平*/*加速度*/*≤*/[acc*0.4m*/*s2*]/ /*地面*/*垂直*/*加速度*/*≤*/[acc*0.2m*/*s2*]/ #### 6.3.10其它度量表达式(mea) 除了上面提到的度量单位元之外,物理、化学及其它度量单位统一标注为*mea*。/*额定*/*电压*/*至*/[mea*660V*]/ /[mea*5.5瓦特*]/ /*参观*/*人数*/*达*/[mea*620万人次*]/ /*工资*/[mea*3500元*/*人*/*月*]/ /[mea*25元*/*公斤*]/ /*风*/*压*/*不*/*超过*/[mea*700Pa*]/*(*/*相当于*/*风速*/[spe*34m*/*s*]/*)*/*。* /*迁移*/*到*/[mea*千兆比特*]/*的*/*能力*/*能够*/*降低*/*拥有*/*总*/*成本*/*的*/*管理*/*方案*/ /*这些*/*快速*/*以太*/*网*/*和*/[mea*千兆位*]/*以太*/*网*/*服务器*/ ### 6.4地址表达式 地址表达式(*ADDREX*)包括电子邮箱(*ema*)、电话(*pho*)、传真(*fax*)、电报挂号(*tel*)、邮政编码(*pos*)和网址(*www*)等6种。 #### 6.4.1电子邮箱(ema) /[ema *exp@email.com.cn*]/
/[ema*cnhuang@msrchina.research.microsoft.com*]/ #### 6.4.2电话(pho) 在标注电话号码时,要把国际区号、国内区号、本地区号等作为一个整体标注。如果有分机号码也要一并标注。当有多个分机号码时,要分别标注。如: *预约*/*订*/*位*/*电话*/[pho*九五一八六二八*]/ /*洽*/*询*/*电话*/[pho*二四九三一零二零*]/ /*订*/*席*/*专线*/[pho(*8610*)-78906617]/ /*查询*/*电话*/*是*/(/[pho*零三八六二一一零零转二五二*]/)/ /*查询*/*电话*/[pho*三六九九七二一转二三三一*]/*或*/[pho*二三三二*]/ /[pho*120*]/ /[pho*119*]/ #### 6.4.3传真(fax) /*全国*/*客户*/*服务*/*传真*/*:*/[fax*010-58722727*]/ /*传真*/*号码*/:/[fax*86-10-66665555*]/ /*公司*/*传真*/*:*/[fax*86-10-66665555*]/ #### 6.4.4电报挂号(tel) /[O*搜狐公司*]/*电报挂号*/*是*/*:*/[tel(8610)*62726666*]/ /*电报挂号*/*:*/[tel*86-10-66665555*]/ /*联系*/*电话*/*:*/[tel*86-10-66665555*]/ #### 6.4.5邮政编码(pos) /[O*清华大学*]/*的*/*邮政编码*/*是*/*:*/ [pos*100080*]/ /[L*安徽*]/[L*阜阳*]*/*地区*/*的*/*邮政编码*/*是*/*:/[pos*233600*]/ #### 6.4.6网址(www) /*活动*/*报名*/*网址*/*:*/[www http://www.acer.net/event/apply]/ /[O*蕃薯藤*]/*购物*/*网*/*(*/[www http://shopping.yam.com]/*)*/ ## 第七章 时间表达式标注细则 时间表达式(*TIMEX*)包括日期(*dat*)、时间(*tim*)和时段(*dur*)三小类。 ### 7.1日期(dat) /[dat*明治三十九年*]/*(*/[dat*公元一九零六年*]/*)*/ /[dat*大正十四年*]/*(*/[dat*公元一九二五年*]/*)*/ /[dat*昭和二年*]/*(*/[dat*公元一九二七年*]/*)*/ /[dat*清*]/[dat*道光十四年*]/ /[dat*清*]/[dat*咸丰十一年*]/ /[dat*民国六十八年*]/*拆除*/*后*/*迁到*/[L*芦洲*]/*,*/[dat*八十一年*]/*间*/*又*/*扩建*/ /[dat*一九九九*]/ /[dat*一九九九年十二月三十号*]/ /[dat*公元*1990*年*4*月*22*日*]/ /[dat*旧石器时代*]/ /[dat*八十年代*]/ /[dat*下半年*]/ /[dat*1989财年*]/ ----注意! /[dat*1989*财年第三季度*]/ /[dat*1990*上半财年*]/ /[dat*1991*财政年度*]/ /[dat*秋季*]/*报告*/ /[dat*第四季度*]/ /[dat*十五世纪*]/ /*努力*/*成为*/*一*/*名*/*高*/*素质*/*的*/[dat*跨世纪*]/*人才*/*。*/ /*值*/*此*/[dat*世纪之交*]/*的*/*时候*/*,* /*走*/*向*/[dat*新世纪*]/*的*/[L*中国*]/*律师*/*业*/ /[dat*新旧世纪交替*]/*之际*/ /*黑色*/[dat*星期一*]----注意! /[L*北京*]/*在*/[dat*23号*]/*发表*/*了*/*报告*/ - 注:数字串*23号*若不表示日期,则不标。 /[dat*五月上旬*]/ ----*上、中、下旬*要标注。 /*科技*/*之*/[dat*夏*]/ ----注意!
/[dat*夏*]/[dat*秋*]/*之间*/ /[dur*一年*]/*中*/*四季*/*分明*/ ----*四季*是词表词不标注。 /[L*南极*]/*的*/[dat*夏季*]/ /[L*中国*]/[dat*汉代*]/ /[dat*春节*]/ ----日期确定的节日要标注。 /[dat*肉孜节*]/ /[dat*开斋节*]/ /[dat*中秋*]/*时节*/ ----注意*时节*不标。 /[L*美国*]/*的*/[dat*独立日*]/----美国独立日为每年7月4日。 /[dat*27年*]/*是*/*一个*/*多*/*事*/*的*/*年份*/ - 注:*27*年*可能表示时段,标注者须根据上下文注意区分。 /*现在*/*是*/[dat*26号*]/*,*/[dat*星期三*]/ ----同一个时间的不同表达,要分开标注。 /*现在*/*是*/[dat*二月九号*]/*,*/[dat*农历大年初三*]/ /*大约*/[dat*五月四日*]/*----大约,大致,大概*等词不标。 /[dat*第二个十年*]/ /[dat*第二年3月*]/ /[dat*当年*9*月*]/ /[dat*今春*]/ ----*今春*不是词表词。 #### 7.1.1日期起讫表达式的标注 当日期表达式中有至、到和连结符-时,处在至、到和连结符-前后的日期表达式分别叫做前式和后式。如果前式和后式都是完整的日期表达式,则它们应分别进行*da*t标注;否则前、后式要整体标注为*dat*。 这条规则同样适合于其它各类数字串的标注,如:*tim*,*dur*,*int*,*tem*,*wei*,*mon*等。其一般表达式为: /X+量词/到/X+量词/ /X+量词/至/X+量词/ /X+量词/-/X+量词/ /X+至+X+量词/ /X+到+X+量词/ /X+-+X+量词/ /X+、+X+量词/ 例如: /[dat*三月三日*]/*至*/[dat*三月卅一日*]/ /[dat*一月十八日*]/*到*/[dat*廿一日*]/ /[dat*三月三至廿一日*]/ /[dat*二月十八日*]/-/[dat*廿一日*]/ /*于*/[dat*今明两年*]/*陆续*/*推出*/*。*/ /[dat*民国五十五、五十六年*]/ /[dat*今明两天*]/ /[dat*今*]/*、*/[dat*明*]/[dur*两日*]/ /[dat*1980年*]*到*[dat*1990*年*1月*]/ - 注:含有比喻意义的今天、昨天、明天、今日、昨日、明日全不标注。 */*"*/*一失足成千古恨*/*,*/*同学*/*们*/*,*/*看到*/*今天*/*的*/*我*/*,*/*你们*/*是否*/*感悟*/*到*/了*/*什么*/*?*/*"*/ /*尽管*/*炮火*/*已*/*消失*/*在*/*昨天*/*那*/*段*/*苦难*/*,*/ /[O"四方"集团*]/*的*/*明天*/*将*/*会*/*更加*/*灿烂*/*美好*/*。*/ - 注:当年、同年、当月等词语后有具体的日期时,要整体标注dat,如果当年、当月、同年等词语单独出现,而其前后有确指的日期时也要标注为dat,否则不作标注!当日、当天等词后有具体的时间时标注为dat,否则不作标注!如: /[dat*当年7月*]/*在*/[L*莫斯科*]/*举行*/ /*然后*/*于*/[dat*同年8月*]/*奉调*/*回国*/*。*/ /[P*克林顿*]/*在*/[dat*当月13日*]/*表示*/*,*/ /*那*/*是*/[dat*当天*]/[tim*中午1时*]/*的*/*汇率*/ /*发言人*/*于*/[dat*当日*]/[tim*午夜*]/*发表*/*声明*/ #### 7.1.2前、头、下+时段(dur)应整体标注为dat /[dat*头两个礼拜*]/ /[dat*前3天*]/ /[dat*今年头四个月*]/ /*比*/[dat*上一年*]/*增长*/[per*10.4%*]/*。*/ /*集中*/*研究*/*解决*/[dat*下半年*]/*纠风*/*工作*/*如何*/*突出*/*重点*/*,*/ /[dur*两周*]/*前*/ /[dat*1993年之初*]/ ----注意! 
/[dat*公元之初*]/ #### 7.1.3当乾隆、康熙、道光等表示年代时标注为dat 当乾隆、康熙、道光*等表示年代时标注为*dat*,而当*乾隆、康熙、道光*等表示皇帝本人的名字时标为P。如: /*最近*/*发现*/*一*/*张*/*在*/*农家*/*珍藏*/*的*/[dat*清代*]/[P*康熙*]/*、*/[P*雍正*]/*、*/[P*乾隆*]/*、*/[P*嘉庆*]/*、*/[P*道光*]/[int*五*]/*皇帝*/*诰封*/*圣旨*/[int*九道*]/*,*/ /*收藏*/*了*/*自*/[dat*清代*]/[dat*乾隆*]/*年间*/*至今*/*各个*/*历史*/*时期*/*的*/*鼻烟壶*/*艺术*/*珍品*/*,*/ #### 7.1.4朝代名的默认值为dat 当朝代名被上下文确认为国家名时标注*L*,否则默认为*dat*。如: /*如果*/[P*刘伯温*]/*不是*/*一直*/*压抑*/*着*/*对*/[dat*元*]/*王朝*/*的*/*不满*/*,*/ /[dat*楚*]/*霸王*/[P*项羽*]/*带领*/[int*两万*]/*兵*/*将*/*,*/ /*只*/*带*/[dur*三天*]/*粮食*/*,*/*渡过*/[L*漳河*]/*去*/*与*/*强大*/*的*/[dat*秦*]/*兵*/*作战*/*。*/*结果*/*,*/[dat*楚*]/*军*/*大败*/[dat*秦*]/*军*/*。*/ /[dat*吴*]/*王*/[*P*夫差]/*战胜*/*了*/[dat*越*]/*王*/[P*勾践*]/, /[dat*战国*]/*时*/[L*赵国*]/*良*/*相*/[P*蔺相如*]/*曾*/*为*/[L*赵国*]/*立*/*下*/*汗马功劳*/; *[P*唐睢*]/*出使*/[L*秦国*]/*,*/* 《*/[L*水浒*]/*全传*/*》*/*描述*/*的*/*是*/[dat*北宋末年*]/*震撼*/[dat*宋*]/*室*/*江山*/*的*/[P*宋江*]*起义*/*。*/* /*从*/*侧面*/*表现*/*了*/[dat*清*]/*政府*/*的*/*腐败*/*无能*/*,*/*激起*/*了*/*深*/*埋*/*在*/*人们*/*心底* /*对*/*侵略者*/*的*/*敌视*/*和*/*对*/[dat*清*]/*政府*/*的*/*愤怒*/*,*/ /*但是*/*,*/*战争*/*最终*/*因*/[dat*清*]/*政府*/*的*/*妥协*/*、*/*投降*/*而*/*告*/*失败*/*。*/ /*无奈*/*夜郎自大*/*、*/*腐败*/*不堪*/*的*/*大*/[L*清国*]/*武器*/*太*/*落后*/*,*/ #### 7.1.5在"过去、今后、未来+时段(dur)"等修饰成分不进入标注范围 /*过去*/[dur*3年*]/*中*/*,* /*将*/*在*/*未来*/[dur*几年*]/*内*/*出现*/ /*未来*/[dur*两天*]/*沿江*/*地区*/*仍*/*有*/*中*/*到*/*大雨*/*,*/ /[dat*今年七八月*]/*间*/ #### 7.1.6词表词近年来、近些年、近几年来、近几年、几年来等均不标注 按规定,词表词*近年来、近几年、近几年、几年来、多年来、近些年*等内部的*dat*、*tim*、*dur*都是不标的。但对非词表词则要分开标注。例如: /[L*瑞士*]/*多年来*/*是*/[ord*第一次*]/*。* /近几年/,/[L中]/[L菲]/关系/ /*近*/[dur*五年*]/*来*/ /*时至今日*/*仍*/*在*/*缓刑*/*期间*/*。*/*-----时至今日*是词表词。 ### 7.2时间 /[tim*凌晨零时*]/ /[tim*清晨六时卅五分*]/*到*/[tim*四十分*]/ /[tim*凌晨二至四点*]/ /[tim*中午十二时*]/-/[tim*晚上九时*]/ /[tim*上午十一时*]/*至*/[tim*下午二时*]/ /[tim*第七十三分钟*]/ /[tim*格林威治时间*5*时*59*分*]/ ----含有地名。 /[tim*下午当地时间*5*时*59*分*]/ /[tim*九点整*]/*到达*/[L*北京站*]/ /[dat*九月十三日*]/*大约*/[tim*七点*]/*到达*/[L*北京*]/ - 注:这里*大约*不标。因为它虽被一个*dat*和一个*tim*包围,但是仍可以分割开。 ### 7.3时段 /[dur*两个星期*]/ /[dur*一个月*]/*后*/ /*曾*/*在*/[dur*5、6年*]/*前*/*撰文*/*陈述*/ /*早产*/[dur*十二周*]/*左右*/ 
/*大水*/[dur*十天*]/*后*/*才*/*退*/*尽*/ /[dur*一至两年*]/ /[dur*一小时卅分钟*]/ /*这*/[dur*几天*]/ /[dur*卅天*]/*会期*/*只*/*开*/*了*/[dur*九天*]/ /[dur*10个月*]/ /*虽*/*经*/[dur*一整天*]/*磋商*/ ----*一整天*不是词表词,但要标为*dur*。 /*与*/*洪水*/*奋战*/[dur*一天一夜*]/*,*/ ----*一天一夜*也不是词表词。 */*历经*/[dur*一二十年*]/*创建*/*了*/*庞大*/*的*/*船队*/*,*/* /*让*/*我们*/*全家*/*人*/*感动*/*了*/[dur*好几天*]/ /*在*/*水门*/*丑闻*/ [dur*四分之一世纪*]/*时*/*发表*/*的*/*评论。*/ - 注:按照前面的原则:*水门*/*丑闻*/ [dur*四分之一世纪*]/*时*在时间坐标轴上有比较固定的位置,因此应当标为*dat*。但这种与事件(水门丑闻)相关的时间表达,在ER-99和MET-2中都是不标注的。这样,只有*四分之一世纪*需要标注为*dur*。 /[dur*十多年*]/ /[dur*几年*]/*以来*/ /*在*/[dur*半年*]/*时间*/*内*/----注意:*上半年*是*dat*。 /*在*/*总结*/[dur*14年*]/*改革开放*/*经验*/*的*/*基础*/*上*/ - 注:*14年*、*30*年*也可能表示dat。标注者要注意区分。*/*我们*/*在*/*美国*/*奔波*/*了*/[dur*30年*]/ /[dur*27年*]/*的*/*军旅*/*生涯*/ /*整整*/[dur*十五年*]/ ----*整整*不标。 /*大约/*[dur*十年*]*/的/时间*/ ----*大约*不标。 /[dur*十年*]/*来*/ /[dur*十几年*]/*的*/*时间*/ ----注意! /[dur*十几年*]/*来*/ /[dur*十来年*]/ /[dur*数年*]/ /[dur*多年*]/ ----ER99不标。 #### 7.3.1一年都标为dur /*新*/*的*/[dur*一年*]/*即将*/*开始*/ /*硬*/*是*/*在*/*地下室*/*干*/*了*/ [dur*一年*]/*的*/*公司*/ /[dur*一年*]/*创*/*产值*/*效益*/…/…/ /*聘金*/*为*/[dur*一年*]/ [mon*900万美元*]/*的*/*价码*/ - 注:*/*这*/*一年*/*、/*那*/*一年*/*中的一年不是确指不作标注。 */*这*/*一年*/*,*/*企业*/*增收节支*/*达*/[mon*110万元*]/ /*在*/[O*北大*]/*就读*/*的*/*那*/*一年*/*,*/ - 注:整天、整日、整夜一律标注为*dur*,如: /[dur*整天*]/*都*/*很*/*安静*/*,*/ /*还*/*东奔西走*/[dur*整日*]/*忙*/*个*/*不停*/*,*/ /*让*/*人*/[dur*整夜*]/*不得*/*入睡*/ - 注:当年、月、日、周等词修饰后面的工资、交易(销售)额、创汇等词语时,要作为时段(*dur*)来标注。如: /[dur*月*]/*收入*/*就*/*在*/[mon*千元*]/*以上*/ /[dur*年*]/*交易额*/*近*/[mon*1000亿元*]/*。* /*这*/*一*/*工程*/[dur*日*]/*处理*/*污水*/[cap*2万立方米*]/*。* #### 7.3.2一天的标注有以下三种情况,需区别对待: ##### 7.3.2.1"前一天",不论其前面有没有定语修饰统统标注为dat(参见7.4.1): /[dat*前一天*]/*还*/*静止*/*的*/*电梯*/[dat*今天*]/*动*/*起来*/*了*/*,*/ /[L*香港*]/[O*恒生*]/*指数*/*比*/[dat*前一天*]/*下跌*/[int*412点*]/*,*/ /*这次*/[L*中*]/[L*韩*]/*足球*/*对抗赛*/*是*/*在*/[O*韩国队*]/*准备*/*赴*/[L*法*]/*出征*/ *世界杯*/*的*/[dat*前一天*]/*举行*/*的*/*,*/ ##### 7.3.2.2"一天"的意思是指时间段(24小时),标注为dur: /*每人*/*每月*/*接待*/*来访*/[dur*一天*]/*,* /[P*汤*]/[P*尤*]/*杯*/[dur*一天*]/*不*/*拿*/*回来*/*,*/ 
/*仅*/[dat*5月31日*]/[dur*一天*]/*,*/[L*莫斯科市*]/*税*/*警*/*就*/*查出*/[int*1600个*]/*违法*/*经营者*/*。*/ /*青年人*/*辛苦*/*忙碌*/*了*/[dur*一天*]/*来*/*此*/*坐*/*坐*/*,*/ /*在*/[L*墨西哥*]/*最后*/[dur*一天*]/*的*/*访问*/*中*/*,*/ /*每*/*枚*/*多*/*赚*/[mon*7分钱*]/*,*/[dur*一天*]/*下来*/*能*/*多*/*收入*/[mon*好几十元*]/*。* ##### 7.3.2.3"一天"的意思相当于"有一天",由于不是确指的日期所以什么也不标: /*但愿*/*有一天*/*我们*/*轻松*/*地*/*说*/*:*/*消费*/*着*/*是*/*美丽*/*的*/*。*/ /[dat*1997年*]/*的*/*一天*/*,*/[P*吴佩民*]/*在*/*办公室*/*热情*/*接待*/*了*/*一个*/*素不相识*/*的*/*中年*/*妇女*/*。*/ /*一天*/[tim*下午*]/*,*/*记者*/*到*/*那*/*店*/*里*/*专门*/*拜访*/*了*/[P*佛朗科*]/*师傅*/*。*/ /*一天*/*,*/[P*列宁*]/*收到*/*一*/*封*/*前线*/*发*/*来*/*的*/*要求*/*支援*/*武器*/*和*/*服装*/*的*/*电报*/*。*/ /*一天*/*上*/*晚*/*自习*/*回来*/*,*/*有*/*一*/*条*/*狗*/*总*/*跟着*/*她*/*,*/ /*一天*/[tim*深夜*]/*,*/*一*/*人*/*酒后*/*拦截*/*过往*/*的*/*外地*/*车辆*/*,*/ /*一天*/*,*/*我*/*走过*/*他*/*的*/*门前*/*,*/ /*一天*/[tim*晚上*]/*,*/*新*/*上任*/*的*/[L*河北省*]/[O*栾城县委*]/*书记*/[dat*六月八日*]/*,*/ - 注:"这/一天、那/一天"中的"一天"也非确指,所以也不标。 /*记住*/*这*/*一天*/*,*/*也是*/*表达*/*我*/*对*/[L*香港*]/*回归祖国*/*的*/*预祝*/*。*/ /[P*王龙雨*]/*从*/*上任*/*的*/*那*/*一天*/*起*/*,*/ ### 7.4有关时间表达式的规则 #### 7.4.1前(后)+日期|时间要整体标注 /[dat*今年前五个月*]/ /[dat*前三天*]/ - 注:以下的标注是正确的: /*在*/*上半时*/*结束*/*前*/[dur*1分钟*]/ ----*上半时*是词表词。 /*比赛*/*前*/[dur*十分钟*]/ /*在*/*上*/*半场*/[tim*第27分钟*]/*时*/ #### 7.4.2反例——不应该标注的例子 刚才、最近、开始军备谈判以来、一会儿*等表示不确定时间的词语,不标。如果节日没有确定的时间,也不标。如: /[L*印度*]/*国际*/*电影节*/ /[L*中国*]/*旅游年*/ #### 7.4.3特例 若两个短语属于不同的子类*dat*和*tim*,就需分开标注。 /[dat*2*月*12日*]/[tim*上午*8*点*]/ /[dat*星期一*]/[tim*8点*]/ - 注1:时间中的地名,如北京时间下午*5*点,在ER-99中不标注,而在NET-2中要标注。本规范按NET-2标注(参照前面的例子)。如果*dat*和*tim*分不开,就整体标注。 /[tim*北京时间*1997*年*2*月*9*号*19*点*28*分*]/ - 注2:*去年、昨天、今早*等词在MET-2中要标,在ER-99中不标。本规范只参照MET-2: /[dat*去年上半年*]/ /[dat*今年夏天*]/ /[dat*今年三月一日*]/ /[dat*去年春夏之交*]/ /[dat*昨天*]/[tim*夜里*]/ ---*夜里*是词表词。 /[dat*今天*]/[tim*晚上*]/ ---*晚上*是词表词。 /[dat*今*]/[tim*早六点*]/ ---*今早*不是词表词。 /[tim*早上六点*]/ ---*早上*是词表词。 /[dat*5月份*]/*产品*/*出口*/*和*/*转口*/*总值*/*比*/[dat*去年同月*]/*下降*/[per*3.2%*]/*,* /[dat*同一天*][tim*晚上*]/ /[dat*当日*]/[tim*下午*]/ - 注3:当日是词表词。如果在上下文中能确定*当日、当天*或*同一天*的具体日期时,就标注;否则不标。 /*每日*/[tim*上午11时*]/*至*/[tim*深夜3时*]/ ----*深夜*是词表词。 
/[tim*昨夜*]/ ----*昨夜*是词表词。 /*每*/[dat*周四,二,一*]/ - 注:MET-2和ER-99对*早上六点*的标注是相同的。但ER-99认为*早上六点*与*今早六点*不同。原因可以从英语的表达来理解:前者是"6:00 am",后者是"6:00 this morning"。"this morning"在ER-99中被视为"相对时间",不标注。但在MET-2中,"相对时间"是要标的。本规范遵循MET-2。 /[dat*11月24至27日*]/ /[dat*3*月*15日*]/*至*/[dat*17日*]/ /[dat*1949年*]/-/[dat*1972年*]/ /[L*美国*]/*南北战争*/*(*/[dat*1861—1865年*]/*)*/*中*/ /*软件*/*最*/*长*/*的*/*寿命*/*为*/[dur*两到三年*]/*,*/ ---请注意这里日期范围的标注方式。 *迄今*----*词表词不标-,MET-2标今*。 *今后*----*词表词不标-,MET-2标今。 *晨练*----*词表词中的*晨*不标。*- *晚宴*----*词表词中的*晚*不标。*- *春联*----词表词中的*春*不标。 *他们*/*的*/*今天*/*,*/*仿佛*/*就是*/*我们*/*的*/*明天*/*。*----泛指不标。 *参加*/*半决赛*----*半决赛*是词表词,*半*不标。 *双边*/*会谈*----*双边*是词表词,因此*双*不标。 #### 7.4.4每年和年不标注 本规则也适用于*月,天,小时*等其它时间单位。例如: */*年产值*/*…*/*…*/* /*每年*/*创*/*产值*/*效益*/*…*/*…*/ /*每年*/*收入*/*…*/*…*/ ## 第八章 数字表达式标注细则 数字表达式(*NUMEX*)包括百分数(*per*)、钱款(*mon*)、频度(*fre*)、整数(*int*)、分数(*fra*)、小数(*dec*)、序数(*ord*)、比率(*rat*)等8小类。以下是数字表达式的一些标注规则。 ### 8.1如果整数、分数、小数、序数后面有量词,数量短语要整体标注 例如: /[int*几千万盆*]/ /[int*几家*]/*工厂*/ /*一*/*家*/ [int*5*]/*人*/ /*一*/*家*/ [int*5口*]/*人*/ /*铁人*/[int*三项*]/*比赛*/*是*/*多*/*项目*/*的*/*综合*/*运动*/*,* /*计算机*/*配置*/*:*/586/*以上*/*,*/[int*8兆*]/*内存*/*以上*/ /*打印*/*分辨率*/*:*/[mea*180dpi*]/ 注:*dpi*表示每英寸的点数,所以作为*mea*标注。 /*评为*/*"*/[int*十*]/*星*/*级*/*乡镇*/*"*/*、*/*"*/[int*十*]/*星*/*级*/*支部*/*"* ### 8.2单纯的数字、词表词(包括俗语)中的数字都不作标注 如: /*自然数*/5/*和*/6/*都是*/*整数*/ /*大家*/*听*/*口令*/*,*/*齐步走*/*,*/*一*/*二*/*一*/*,一*/*二*/*一*/*,*/*一*/*二*/*三*/*四*/*,/* /*但是*/*卷子*/*上*/*的*/"/6/"/*还是*/*颠*/*巍巍*/*地*/*变成*/*了*/"/8/"/*。*/ /[L*瑞士*]/*、*/[L*西班牙*]/*、*/[L*比利时*]/*、*/[L*丹麦*]/[int*四*]/*国*/ /*并*/*促进*/*了*/[L*中*][L*美*]/*两国*/*的*/*交流*/*与*/*合作*/ ----*两国*是词表词。 /*并*/*促进*/*了*/[L*中*][L*美*]/[int*两*]*国*/*的*/*交流*/*与*/*合作*/*,* ----错误! /*垄断*/*了*/[L*神奈川*]/*、*/[L*青森*]/*等*/[int*5*]/*县*/*的*/*交通*/*信号*/*维修*/*业务*/*。* /[L*两岸*]/*经济*/*合作*/*和*/*直接*/*三通*/ ----*三通*是词表词。 /[L*两岸*]/*经济*/*合作*/*和*/*直接*/[int*三*]*通*/ ----错误! /*到*/[L*云*]/[L*贵*]/[L*川*]/*的*/*大三线*/*地区*/*,----大三线*是词表词。 /*到*/[L*云*]/[L*贵*]/[L*川*]/*的*/*大*[int*三*]*线*/*地区*/*,----错误!
/*十年寒窗*/ ----*词表词中的十年*不标。 /*千载难逢*/ ----*词表词中的千载*不标。 /*十*/*年*/*九*/*旱*/*----非词表词。虚指的十年*不标。 /*眼*/*观*/*六*/*路*/*,耳*/*听*/*八*/*方*/ ----非词表词。虚指的六、八不标。 /*利*/*在*/*千秋*/*的*/*大事*/ ----*虚指的*"*千秋*"不标。 /*十*/*年*/*如*/*一*/*日*/ ---*-虚指的十年*和*一日*,不标。 /*万里*[L*长城*]/ ---*-虚指的万里*,不标。 /*三皇五帝*/----*三皇五帝*是词表词。 /*乌七八糟*/*的*/*东西*/*几乎*/*扫荡*/*殆尽*/*----乌七八糟*是词表词。 /*三大球*/*在*/*走*/*向*/*市场*/*时*/----*三大球*是词表词。 /*第二次世界大战*/*的*/*反法西斯*/*斗争*/----*第二次世界大战*是词表词。 /*三五成群*/*地*/*散落*/*着*/*警察*/*,*----*三五成群*是词表词。 - 注:*一会儿,一起,唯一,付之一炬,一流,千方百计,一分为二,一切,二娃*等词表词中的数字一律不标。 /*本职*/*创*/"/*一流*/"/*活动*/ /[int*亿万*]/*人民*/ /[int*百万*]/*民众*/ - 注:按照ER-99,*亿万、百万*不是一个抽象的数字,因此是要标注的。 ### 8.3约、近是一个不确切概念,故不同后面的数字串一起标注 *上*、*数*、*几*、*好*则要和数字串捆绑在一起标注,而*约、近*作为特例,不与数词捆绑。 /*大约*/[int*12亿*]/*人口*/ /*约*/[int*四五千*]/*人*/*在*/[L*金边奥林匹克运动中心*]/*举行*/*集会*/*,*/ /*约*/[mon*十万元*]/ /*近*/[mon*千万元*]/ /*大概*/*需要*/*花费*/[mon*上千万美元*]/*的*/*投资*/*和*/[dur*3年*]/*左右*/*时间*/*,*/ /[O*省电力公司*]/*还*/*投资*/[mon*好几百万元*]/*,*/ /*多于*/[mon$90,000]/ /[mon*几百万新元*]/ /*统计*/*了*/[int*上百种*]/*数字*/*,*/ /*每年*/*都*/*要*/*花费*/*大量*/*外汇*/*引进*/[int*上百套*]/*系统*/ /*每年*/*搞*/[int*一两个*]/*工程*/*,*/ /*邀请*/*全国*/*近*/[int*百名*]/*书法*/*名家*/*,*/ /*近*/[int*千名*]/*员工*/ - 注:余、多本不应标注,但当它们位于量词前分割不开,所以整体加以标注。 /[mon*二十七万余元*]/ /[mon*五百多万元*]/ ### 8.4钱款式中的地名 钱款表达式中的地名不论是单音节还是多音节的,Er-99和MET-2都不标,否则就形成嵌套。 如果货币字符串在文本中单独出现,字符串中没有数字修饰,那么双音节的地名要标注为*L*,单音节的地名不标注。例如非词表词*泰铢*中的*泰*不标。注意词表词*日元*、*美元*中的单音节的地名也不标。 /[mon*2000新元*]/ /[mon*2000新加坡元*]/ /*泰*/*铢*/*汇率*/*稳定*/*在*/[mon*38铢*]/—/[mon*39铢*]/*兑*/[mon*美元*]/*水平*/ /*纷纷*/*抛*/*出*/*日元*/*购*/*进*/[L*德国*]/*马克*/*,*/ /[L*菲律宾*]/*比索*/*对*/*美元*/*汇率*/*也*/*下跌*/*。*/ ### 8.5钱款标注中的特例 MET-2规定:如果没有表示钱款的单位,则不标。ER-99则不然。本规范采用ER-99的规定。 /*这*/*辆*/*汽车*/*值*/[mon*20万*]/ /*卷标*/*上*/*的*/*价格*/*是*/ [mon*50*]/ /[O*纳斯达克*]/*跌*/ [int*140*]/ ### 8.6频率的特例 /[fre*四年一度*]/ ----*四年一度*并非词表词,但整体标注为*fre*。 /[fre*一年一度*]/ ----*一年一度*是词表词,整体标注为*fre*。 */*主要*/*在*/*交流*/[fre*50Hz*]/*,/*额定*/*电压*/*至*/[mea*660V*]/ *---*交流电的频率是*50Hz*(赫兹)*,*即每秒变化*50*周。所以理应标成*fre*而不是*mea*。 /*频率*/*高*/*(*/[fre*30*-*60KHz*]/) /*卫星*/*每年*/*发射*/[fre*6至7次*]/。 - 
注:又一次、再一次全部标注为fre,但/*一次*/*又*/*一次*/例外,不作标注。 如: /*此间*/*舆论*/[fre*又一次*]/*注意*/*到*/[L*亚*]/[L*非*]/*足球*/*的*/*差距*/ /*精湛*/*演技*/*,*/[fre*再一次*]/*赢得*/*了*/*首都*/*观众*/*的*/*由衷*/*赞赏*/ ### 8.7名词方没有与之搭配的量词,因此可以和前面的数词直接结合 在我方、校方中的名词方没有与之搭配的量词,因此可以和前面的数词直接结合,如: /[int*三方*]/*已*/*就*/[O*劳斯莱斯*]/*汽车*/*的*/*前景*/*达成*/*协定*/*,*/ ### 8.8一相当于英语的冠词a,一般不标 一相当于英语的冠词a,一般不标,但一倍是例外,要标fra。例如: /*一个*/*条件*/ /*一*/*座*/*城市*/ /*最大*/*的*/*企业*/*之一*/ /*荣立*/*一等功*/ ----*一等功*是词表词,不可标注。 /荣立*/[ord*一等*]*功*/----错误的标注! /*获*/*县*/*政府*/*新技术*/*推广*/[ord*一、二等*]/*奖*/*。*/ /*我*/*的*/*收入*/*是*/*她*/*的*/[fra*一倍*]/ ----*一倍*是要标的。 ### 8.9一(1)+量词不标注*int* #### 8.9.1一+量词是词表词的情况 词表词一个、一种、一类、一批、一次、一套、一阵等作为数量短语不予切分,也不标注*int*。其中有些量词重迭形式也是词表词,如一个个、一天天,应保持其整词形式,而其它非词表词的数量短语和量词重迭形式都是要切开的。 /*一个*/*人*/ /*一个个*/*观众*/ /*一种*/*算法*/ /*一套*/*特种*/*邮票*/ /*一次*/*讨论*/ /*一*/*匹*/*黄骠马*/ /*一*/*栋*/*栋*/*楼房*/ /*一天天*/*暖和*/*起来*/ #### 8.9.2词表词一起、一块、一道、一面用作数量短语时应切开 词表词*一起、一块、一道、一面*有副词和其它词性的用法,但当它们用作数量短语时一律切开,而且不标注*int*。 /*一*/*块*/*石头*/ /*一*/*起*/*交通*/*事故*/ /*一*/*面*/*镜子*/ ### 8.10一(1)"+物理单位元需按度量表达式标注 一(1)"+物理单位元(如米、公斤、摄氏度等)需按度量表达式(见6.3)标注。如: /[wei*一公斤*]/*大米*/ /[mea*一度*]/*电*/ ### 8.11分数词素半 #### 8.11.1词表词中的词素半不可标注为fra(分数) 词表词*如半价、半票、半饱、半身、半世、半辈子、上半时、下半场、半边*等,但不可把上述词表词中的词素*半*标注为*fra*(分数)。 /*上*/[fra*半*]/*场*/*比赛*/[L*中国*]*队*/*未进*/*一*/*球*/ /*下半场*/----词表词,是正确标注。 /*下*[fra*半*]*场*/----在词表词中插标*fra*是错误的。 /*目前*/*还*/*空闲*/*着*/[fra*一大半*]/*的*/*营业*/*面积*/*。*/ /*他们*/*之中*/*肯定*/*有*/[fra*一多半*]/*人*/*没有*/*球*/*票*/ /*有*/[fra*大半个*]/*篮球*/*场*/*那么*/*大*/ - 注:当半作为一个独立的词时要标注,标注的原则是:半+量词或名词时标注,半+动词或形容词时不作标注,如: /*下半场*/*后*/[fra*半*]/*段*/ /*地处*/*偏僻*/[fra*半*]/*山区*/ /*部分*/*企业*/*停产*/*或*/*半*/*停产*/ /*而*/*处于*/*半*/*死亡*/*或*/*休眠*/*状态*/*,*/ /*干旱*/*半*/*干旱*/*地区*/*径流*/*造林*/*技术*/*、*/ #### 8.11.2以下的词表词不作为分数标注,而作为其它不同的数字串标注 /[dur*半年*]/ /[dur*半天*]/ /[tim|dur*半夜*]/ /[int|age*半百*]/ #### 8.11.3例外 半个西瓜中的半个,与四半中的半概念不一样,前一个半是指二分之一,后一个半是量词,所以标注也不同。 /[fra*半个*]/*西瓜*/ /[int*一个*]/*西瓜*/*分为*/[int*四半*]/ ### 8.12序数词素首 #### 8.12.1词表词中的词素首不可标注为ord(序数) 
词表中有许多词含有词素*首*,如*首创、首倡、首选、首发、首航、首飞、首演、首映、首战、首展、首席代表、首席科学家、首席执行官、首富、榜首、魁首、居首*等。但不可把词表词中的词素*首*单独作为*ord*(序数)来标注。 /*首席执行官*/----正确标注。 /[ord*首席*]*执行官*/----在词表词中插标*ord*是错误的。 #### 8.12.2具有首+量词结构的词表词或非词表词,应整体作为ord标注 具有"首+量词"结构的词表词有:*[ord*首届*]*,*[ord*首次*]*,*[ord*首批*]*,*[ord*首位*]*,[ord*首例*]等。 具有首+量词结构的非词表词,如: /[L*北京市*]/[ord*首家*]/*就业*/*与*/*创业*/*组合*/*市场*/ /[P*满文军*]/*则*/*以*/*自己*/*的*/[ord*首张*]/*个人*/*专辑*/ /[dat*首日*]/*销售*/*欠佳*/ ----这里首日不能作序数词来标注,应标注为日期*dat*(详见7.1)。 - 注:头版、头条是词表词。它们和头一回统统标注为*ord*。如: /*在*/[dat*4月11日*]/*的*/*《*/[O*人民日报*]/*》*/[ord*头版*]/[ord*头条*]/*社论*/*位置*/*发表*/*出来*/*,* /*由于*/*是*/[ord*头一回*]/*,*/*总*/*怕*/*有*/*个*/*闪失*/*,* - 注:"头"的上述标注不可类推到其它词组中,例如, *上*/*半场*/*表现*/*不好*/*,*/*头*/[dur*10分钟*]/*甚至*/*有些*/*拖泥带水*/*。* *----*注:这里半场是词表词,但不标注为*fra*。 ### 8.13序数词+量词结构,应整体作为ord标注 /[ord*第一期*]/ /[ord*第二*]/*故乡*/ /[ord*三等*]/*奖*/ /[dat*第一天*]/ *---*相对日期,标*dat,*而不是** [ord*第一*]/*天*。 /[dat*第二年*]/ *---*相对日期,标*dat,*而不是** [ord*第二*]/*年*。 /[O*波音*]/747 */* ----*产品序号不标*。 /*地震烈度*/*不*/*超过*/[ord*8度*]/ /*这*/[ord*第二条*]/*尤为*/*重要*/*,*/ /*位居*/*金牌*/*榜*/[ord*第二名*]/*。*/ /*作为*/*大豆*/*行动*/*计划*/*的*/[ord*第二步*]/ /[ord*1174号*]/*文件*/ /[ord*6路*]/*汽车*/ /[ord*六年级*]/*学生*/ /[dat*今年*]/*读*/[ord*大三*]/ /*发展*/*第一产业*/ ----*第一产业*是词表词。 /*发展*/[ord*第一*]*产业*/ ----错误的标注。 /*阵风*/[ord*五级*]/ /*通过*/*大学*/*英语*/[ord*六级*]/ - 注:联赛中的A/组、B/组等不作为序数字串标注。如: /*在*/[L*里昂*]/*进行*/*的*/*世界杯*/*G*/*组*/*比赛*/*中*/ - 注:"甲级、甲/A、乙/级、乙/A"等不作为序数ord标注。如: */*当即*/*停止*/*该*/*场*/*比赛*/*主*/*裁判员*/*执法*/*全国*/*足球*/*甲*/*A*/*联赛*/*; /*获得*/[ord*前两名*]/*的*/*球队*/*晋级*/*甲*/*A*/*行列*/*。*/ /[dat*1998年*]/*全国*/[O*男篮*]/*甲*/*B*/*联赛*/ /*判处*/*以*/[P*东条英机*]/*为首*/*的*/[int*7名*]/*甲级*/*战犯*/*死刑*/*。*/ ### 8.14仅当形容词前表示比赛名次时才和后面的序数结构一起标注 仅当形容词前表示比赛名次,如前*6*名、前四(指前四名)时,才和后面的序数结构一起标注为*ord*。其余的情况如前两次、前三组、前三场、前两项等,前都不得进入被标注的数字表达式。 /*获得*/[ord*前十名*]/*的*/*是*/*:*/*在*/*前*/[int*两轮*]/*小组*/*赛*/*中*/ /*列*/*前*/[int*两位*]/*的*/*是*/[O*澳大利亚队*]/*和*/[O*日本队*]/*。* ### 8.15文本中表示标号的数字不标 规范、条例中的条款标号,包括一、二、三、Ⅰ、Ⅱ、Ⅲ、1,2,3、第一条、第二条、第三条等,一律不予标注。只有当这些条款被正文引用时,才作为序号ord被标注。例如: /*第二*/*,*/*制定*/*必要*/*的*/*行规*/*、*/*行约*/*,*/*共同*/*规范*/*,*/*共同*/*遵守*/*,*/
/*一*/*无*/*资金*/*,*/*二*/*无*/*场地*/ /*一*/*靠*/*政策*/*调动*/*农民*/*的*/*积极性;*/ /*二*/*靠*/*科技;*/ /*一*/*是*/*继续*/*加强*/*农业;*/ /*二*/*是*/*采取*/*措施*/*稳定*/*物价*/*,*/*抑制*/*通货膨胀;*/ /*1*/*.*/*自卑*/*的*/*羞耻*/*感*/*。*/ /*2*/*.*/*依赖*/*的*/*恐惧*/*感*/*。*/ /*(1)*/*加强*/*爱国主义*/*的*/*宣传*/*教育。*/ /*(2)*/*加强*/*正确*/*的*/*理想*/*、*/*信念*/*、*/*人生观*/*、*/*价值观*/*的*/*宣传*/*教育*/*。* "*第*+*数词*+*条*"视为词表词,但作为文中陈述的标号时不标注*ord*。仅当其在文中被引用时才作为*ord*标注。例如: /*第一条*/*、*/*消费者*/*永远*/*是*/*对*/*的*/*;* /*第二条*/*、*/*如果*/*消费者*/*真*/*的*/*错*/*了*/*,*/*清*/*参照*/[ord*第一条*]/*。*/ - 注:当上述数字表示等级序号时,则要标注为*ord*。例如: *污秽*/*等级*/*:*/[ord*Ⅰ*]/*、*/[ord*Ⅱ*]/*、*/[ord*Ⅲ*]/*、*/[ord*Ⅳ*]/*。* ### 8.16人名、地名、机构名中的数字,不单独标注int /[P*佐腾一郎*]/ /[L*梅竹蹊六十七号茶花庄*]/ /[O*子弟一中*]/ /[O*三明市*]/ /*任*/*队长*/*的*/ [O*1205钻井队*]/ ### 8.17外文字符串的标注 由于外文的词与词之间都有空格作为分隔符,因此无需再去切分,只在标点符号的前后加切分标记。遇到字母词、名称缩写等情况也不作切分,如:/COM/经济/(网络经济)、/E/产品/(电子产品)、/卡拉/OK/等。 /Good morning/ ,/everyone/./ /*最近*/*引进*/*一*/*台*/JT-ESWL-*Ⅲ*/*型*/*体*/*外*/*震波*/*粉碎*/*肾结石*/*机*/*,*/ "*/[L *ZHONG* HUA *REN* MIN *GONG* HE*GUO]/"/*,*/*这是*/[L*中华人民共和国*]/*的*/*汉语拼音*/*。*/* "*/Brother/*,*/I *love* you *all* the *time/*,*/ Thank *you* very *much/*!*/"/ "/Happy *birthday* to*you/*!*/" /Dip *one* end *of* a *straw* in *the* solution/./Blow *gently* through *the* straw/./ */A* soap *bubble* forms/./What *happens* when *you* keep *on* blowing/?/ /The *bubble* bursts *because* the *pressure* inside *the* bubble *is* more *than* the*pressure *outside* the*bubble/./ ### 8.18数学公式和机型标号均作为一个整体来切分和标注 例如: /*△*S/=/[len*12*(*S1*+*S2*)*mm*]/ /*IEC298*.*265*.*129*.*694. 
*420*.*56.* 529*.*932*/ /*GB3804.* *3906*.*11022*/ /IEC60129A2/*(*/[dat*1996*]/)*UES*-*K3*/*2/ /UEMC40K8U*/*1*/ */1* V/FJ220001R2/ /*SFL12*/*17.5*/IVD *P575303RI/ /S*FL24A/IVDP5753/O2RI/ ## 第九章 分词歧义消解细则 本章中的歧义切分实例是从微软亚洲研究院237万词训练语料、10万词测试语料和20万词散页语料中抽取出来的。这些歧义字段可粗分为交集型歧义(OAS)和组合型歧义(CAS)两大类。交集型歧义又包含用正反向最大匹配(MM)算法侦查不到的所谓隐藏的CAS。下面就分别介绍不同歧义字段的消解规则。 ### 9.1交集型歧义字段(OAS) #### 9.1.1交集型歧义字段示例 由于交集型歧义字段的例子太多,不便穷举,所以下面只列举少量实例供参考。 (1)/矛头/所/指/正是/以/包/代/管/、/负/盈/不/负/亏/、/ (2)/[L四川]/一/私营企业/家/向/下岗/女工/捐款/ (3)/柚木/购/进/后/市场价格/大/跌/,/ (4)/图/为/[O保险公司]/向/受灾/企业/赔/付/现场/ (5)/地方政府/亟需/在/加强/压/锭/监管/力度/方面/下功夫/,/ (6)/与/厂/内/存留/的/旧/纱/机/一并/销毁/。/ (7)经/请示/,/自行/将/本/厂/经/改造/的/应/压缩/设备 (8)擅自/新/增/棉纺/生产能力/, (9)/有人/钻/政策/空子/、/骗/财政补贴/。 (10)日益/猖獗/的/走私/犯罪/活动/, (11)/他/在/教务/活动/中/积极/研究/、 (12)对/全/山/的/商业/网点/和/摊/区/重新/进行/了/规划/和/建设/。 (13)/加强/各级/领导班子/建设/。 (14)全体/员工/开展/了/"/人家/学/我们/,/我们/怎么办/"/的/大/讨论/, (15)/[O欧佩克]/提高/原油/配额/和/暖冬/等/因素/影响/, (16)/保护/国家/和/人民群众/的/生命/财产/安全/。 (17)/以/维护/民族团结/为/己任/, (18)/各级/领导干部/要/站/在/党/和/国家/全局/的/高度/, (19)/只有/坚持/解放/思想/、/实事求是/的/思想/路线/, (20)/呈现/了/"/部队/添/战斗力/,/企业/增/生产力/, (21)/共建/双方/通过/自上而下/层/层/签约/, (22)/电力/部门/还/专门/建立/了/正规/的/转业军人/业务/培训/机制/, (23)/这/条/线/不/停电/,/官兵/跳伞/太/危险/了/。 (24)如同/[L华中]/电网/强大/的/发电机/群/按照/同一/频率/转动/一样/, (25)通过/举办/一些/全/集团/参与/的/拥军/活动/, (26)/在/全国/工业/[ord500]/强/中/名列前茅/的/大型/企业集团/。 (27)/本/次/检测/中/性能/系数/最高/者/。 (28)/不是/主张/所有/的/会议/都/开/成/电视电话/会议/。 #### 9.1.2隐藏的交集型歧义字段 隐藏的交集型歧义字段是指那些用正、反向最大匹配(MM)算法无法侦查到的交集型歧义字段。 注:以下例句中,双百分号右面为改正后的切分。 (1)/[L新疆]/经济/社会/发展/一定/会展/现出/越来越/美好/的/前景/ /[L新疆]/经济/社会/发展/一定/会/展现/出/越来越/美好/的/前景/ (2)/成立/了/专/司空/中和/地面/服务/质量/监管/的/服务/质量/督察/办公室/,/ /成立/了/专/司/空中/和/地面/服务/质量/监管/的/服务/质量/督察/办公室/,/ (4)/其内/容或/规则/已/译/成/[int15]/国/语言/,/ /其/内容/或/规则/已/译/成/[int15]/国/语言/,/ (5)/这/一发/现有/可能/加速/艾滋病/新药/和/疫苗/的/研制/。/ 这/一/发现/有/可能/加速/艾滋病/新药/和/疫苗/的/研制/。/ (6)/[L韩国]/对/日出/口中/,/ /[L韩国]/对/[L日]/出口/中/,/ (7)/恰/在/此时/,/奉/党委/派赴/[O共产国际]/工作/的/[P张太雷]/于/[dat8月]/回国/ /恰/在/此时/,/奉/党/委派/赴/[O共产国际]/工作/的/[P张太雷]/于/[dat8月]/回国 (8)/站/在建/设有/[L中国]/特色/社会主义/全局/ /站/在/建设/有/[L中国]/特色/社会主义/全局/ (9)/金融/危机/就/可能/会演/变为/经济危机/ 金融/危机/就/可能/会/演变/为/经济危机/ 
(10)/如/少数/司机/在/东侧/门楼/外道/路上/违章/占/道/停车/,/ 如/少数/司机/在/东侧/门楼/外/道路/上/违章/占/道/停车/,/ (11)/有关/部门/要/下决心/下力/气管/好/电子/游戏/室/。/ 有关/部门/要/下决心/下/力气/管/好/电子/游戏/室/。/ (12)/相近/似的/设施/化/保护/菜地/面积/达/[are1300万亩]/;/ /相/近似/的/设施/化/保护/菜地/面积/达/[are1300万亩]/;/ (13)/表明/了/财政部/门对/落实/科教/兴/国/战略/采取/的/实际/行动/。/ /表明/了/财政/部门/对/落实/科教/兴/国/战略/采取/的/实际/行动/。/ (14)儿时/站/在家/门口/向/四面/望/,/ 儿时/站/在/家门/口/向/四面/望/,/ (15)/就/不可能/正确/地理/解和/执行/党/的/路线/方针/政策/,/ /就/不可能/正确/地/理解/和/执行/党/的/路线/方针/政策/,/ (16)/使得/高校/中原/有的/个别/的/知识/物化/行为/迅速/扩展/为/一种/专门/职能/。/ /使得/高校/中/原有/的/个别/的/知识/物化/行为/迅速/扩展/为/一种/专门/职能/。/ (17)/这是/大都/市里/的/一个/皮货/修理/店/,/ /这是/大/都市/里/的/一个/皮货/修理/店/,/ (18)/需/招集/体制/女性/业务员/[int四名]/,/ /需/招/集体/制/女性/业务员/[int四名]/,/ (19)/连同/应/交费/用以/划/支票/寄/还/。/ /连同/应/交/费用/以/划/支票/寄/还/。/ (20)/我们/一口气/跑/到家/门口/的/一/棵/大树/前/,/ /我们/一口气/跑/到/家门/口/的/一/棵/大树/前/,/ (21)/"/唉/,/要是/好/好/复习/,/可不/会考/得/这样/糟/。/"/ "/唉/,/要是/好/好/复习/,/可/不会/考/得/这样/糟/。/"/ (22)/特制/定本/规定/。/ /特/制定/本/规定/。/ (23)/股东/会所/议事/项/作/成/会议/记录/,/ /股东/会/所/议/事项/作/成/会议/记录/,/ ### 9.2组合型歧义字段(CAS) 组合型歧义字段在真实文本中大量出现,有的是比较常见的,有的是非常罕见的。尤其是有的CAS即使根据上下文也很难判断其正确切分,如正在、就是、还是、只有、只是、一道、一起等等。因此有必要针对那些高频的CAS逐条加以说明。 #### 9.2.1常见的组合型歧义字段 下面对一些常见的组合型歧义字段加以解释。 ##### 9.2.1.1数词一和量词组成的CAS 词表词一个、一种、一类、一批、一次、一套、一阵等作为数量短语不予切分,也不标注int。其中有些量词重迭形式也是词表词,如一个个、一天天,应保持其整词形式,而其它非词表词的数量短语和量词重迭形式都是要切开的。(详见8.9) /*一个*/*人*/ /*一个个*/*观众*/ /*一天天*/*暖和*/*起来*/ /*一套*/*特种*/*邮票*/ /*一次*/*讨论*/ /*一*/*匹*/*黄骠马*/ /*一*/*栋*/*栋*/*楼房*/ 词表词一起、一道、一样、一手、一面、一口、一头、一气等既可以用作连词、副词、名词或形容词等,又可以切分开来成为数量短语。但像一套这样的词表词,除了数量短语的用法以外,不再有其它用法,因此不存在切分问题。词表词有一套是有本事的意思时,也不切分。这类词的切分问题只能逐个加以描述。 ##### 9.2.1.2动量词次与频率int的标注 动量词中只有*次*被标注为频率*fre*,如[fre*再次*]*、*[fre*数次*]*、*[fre*一次次*]*、*[fre*无数次*]*、*[fre*好几次*]*,而*遍、回、趟*不标注为频率,一*/*遍、一*/*回、一*/*趟、一次(词表词不切分)、一*/*遍*/*又*/*一*/*遍、一*/*回*/*又*/*一*/*回、一*/*趟*/*又*/*一*/*趟,一次*/*又*/*一次*也不标注为*fre*。这条规则的理由如下: (1)遍表达的是动作从开始到结束的全过程;次、回描写动作的重复;趟只用于表示行走意义的动词。*去一趟*可以说成*去一次*、*去一回*,但*做一次*、*做一回*不能说成*做一趟*。 *遍、次、回*有时可通用,如*你再唱一遍*,可以说成*你再唱一次*或*你再唱一回*而意思不变。但单纯表示动作数量时,只用*次*,不用*遍*,如*他表示了多次*、*敌人的三次进攻都被击退了*。 *次*与*回*区别在于,*次*既用于书面语又用于口语;*回*只用于口语。如*多次、数次*等带文言色彩的短语,就不能说成*多回、数回*。 
(2)*这本书我看了一遍*,是指从书的开头到末尾的全过程。*这本书我看了一次*,着重指看的次数,不指看的全过程。 ##### 9.2.1.3一(1)+物理单位元量词构成度量表达式 当一(1)后面是长度、重量等物理单位元时应分别按度量表达式标注为*len*,*wei*,如[*len*一米*]*、*[wei*1*公斤*](见8.10)。 #### 9.2.2CAS示例 下面是一些常见CAS的切分规则和示例。 (1)人为作形容词时不切分。 (1a)而是/深究/灾难/中/的/人为/因素/。(1b)以/人/为/本/ (2)为人: (2a)也/包括/最/基本/的/为人/处事/的/行为/准则/ (2b)/始终/主宰/着/他/的/为人/之/道/和/为/艺/之/方/。(2c)/我/把/不大/为/人/所/知/的/一些/往事/写/下来/, (3)一起:作名词和副词使用时不切分,作为"数+量词"时切分。(3a)/和/市民/一起/聊天/,/听取/群众/反映/。 (3b)/[dat4月17日]/发生/在/[L北京]/[L海淀区]/[L阜石路]/的/一/起/车祸/,/ (4)一点:形容词,意思是少许,不切分。但作为数量短语时要切开。(4a)/文/中/还有/一点/小/差误/,/也/顺便/提/提/。/ (4b)/都/清楚/地/意识到/了/这/一/点/,/ (5)一道:作副词使用时不切分;作为"数+量词"时切分。 (5a)/而且/要求/未来/的/丈夫/同/她/一道/挑起/照顾/[P穆]/大爷/的/担子/。 (5b)/已/成为/百里/油田/的/一/道/风景/线/。 (5c)/在/我/的/前额/刻下/了/一/道/道/弯曲/的/青春/印记/。 (6)一面:作名词和副词使用时不切分,作为"数+量词"时切分。 (6a)/虽然/在/现代汉语/里/含有/贬义/,/但/其/积极/的/一面/应该/肯定/。 (6b)/一面/学习/,/一面/实践/,/贯彻/到/筹组/[L澳门特别行政区]/的/工作/中/去/。(6c)/爱/是/一/面/辽阔/光滑/的/回音壁/,/微小/的/爱/意/反复/回响/着/, (7)一口:作形容词和副词使用时不切分,作为"数+量词"时切分。(7a)/[P崔]/又/一口/回绝/并/与/其/发生/争吵/。 (7b)要不/则/是/一/脸/匪/相/或者/一口/痞/气/, (7c)不由得/倒/吸/了/一/口/冷气/打/了/一个/寒战/, (7d)一/口/大/锅/解决/了/[int两家]/的/再就业/难题/。 (8)一手既有名词和副词的用时,又有"数+量词"的用法,但在文本中一律不予区分。 (8a)所有/的/为/官/为/政/者/都/能/写/一手/好/文章/,(8b)/整个/事件/是/他/一手/策划/的/, (8c)/他/一手/划水/,/一手/搂/着/女/青年/游/向/岸边/。 (9)一头:作名词和副词使用时不切分,作为"数+量词"时切分。(9a)/二来/街道/一头/联/着/片/内/的/企业单位/,/一头/联/着/居民/, (9b)/此后/,/他/一头/钻进/常年/云雾/缭绕/的/云雾山/,/拜访/民间/郎中/,(9c)/一/头/经过/救助/已/恢复/健康/的/灰/鲸/『/[P杰杰]/』/ (10)一路:作名词和副词使用时不切分;作为"数+量词"时切分。 (10a)有时/公共汽车/挤/不/上/,/干脆/快步/当/车/一路/小跑/。 (10b)我们/一路/攀登/来到/[P王永祥]/简陋/的/护林/小屋/。 (10c)另/一/路/是/探索/[L火星]/、/[L木星]/等/星球/。 (11)一下:用作副词和数量词使用时不切分;当一作副词下作动词时要切分。 (11a)/只要/通融/一下/,/既/能/得到/一/笔/大钱/,/又/能/保持/友情/。 (11b)/相互/拍打/一下/:/"/你/猜/[rat几比几]/?/" (11c)/书包/斜/背/在/肩/上/,/带子/太/长/,/随着/步子/一/上/一/下/跳跃/着/拍打/在/屁股/上/。 (12)一片:作形容词使用时不切分;作数量短语时切分。 (12a)/台上/台/下/那/一片/亲切/和谐/的/气氛/, (12b)/融入/一片/[dat夏日]/的/浓绿/之中 (12c)/地板/上/看/不/到/一/片/碎/纸屑/。 (12d)/宽宽大大/的/粽/叶/,/她/总/要/一/片/片/洗/净/。 (13)一则:作副词使用时不切分;作数量短语时切分。 (13a)/一则/表达/对/同乡/画/马/大师/[P徐悲鸿]/的/敬仰/,/二/则/愿/家乡/建设/如/骏马/奔腾/一日千里/。 (13b)/[L法国]/报纸/刊/出/一/则/特写/, (14)不见:是动词见的否定形式不切分。当它同前面的动词形成V/*得*/*见*、*V/*不*/*见*的* 
可能式动补结构时,要切分。类似的可能式动补结构还有*V/*得*/*下去*/*、*V/*不*/*下去*/*, *V/*得*/*来*/*、*V/*不*/*来*,* *V/*得*/*起、*V/*不*/*起*,** V/*得*/*了*/*、*V/*不*/*了*/*,*V/*得*/*成*/*、*V/*不*/*成*/*,*长*/*得*/*大*/*、长*/*不*/*大*/*等*。 (14a)/全/都是/"/不见/兔子/不/撒/鹰/"/。 (14b)/人武部/就/看/不/见/一/盏/长明灯/, (15)不对:作形容词表示不正确时不切分;如果对作为介词,就要切开。 (15a)父母/这么/想/当然/不对/,/可/也/不能/全/怪/他们/的/愚钝/和/落后/。 (15b)/中国/主张/和平/的/外交/政策/,/中国/不/对/任何/国家/构成/威胁/。 (16)不等:作形容词表示不相同时不切分;如果等作为动词,就要切开。 (16a)/按照/用户/要求/生产/大小/不等/的/编织/塑料/袋/, (16b)不/等/妻子/说/什么/,/他/自己/悄悄/地/找/开/了/出路/。 (16c)/时间/不/等/人/ (17)不下:表示不少于时不切;作为动词下的否定式和可能式动补结构(见14),就要切开。 (17a)/每天/她/经手/的/业务/不下/[int百笔]/, (17b)/架子/还/放/不/下/,/面子/还/丢/不/开/, (17c)/刑/不/上/大夫/,/礼/不/下/庶人/ (17d)/[L俄罗斯]/整个/国家/开支/居/高/不/下/, (18)不成:作动词、形容词和助词使用时不切分;当它作为可能式动补结构(见14)时,一律切开。 (18a)/难道/自己/这/一辈子/就/这么/过/不成/? (18b)/毛虾/已/不成/汛/, (18c)攀/"/亲/"/不成/反/折本/, (18d)/往往/是/有/点/而/形/不/成/网/, (19)上下:用作动词时一律切开,如"上/下/火车";用作名词(包括并列意义)时则不切,如"上下/两册"。 (19a)/经过/上下/的/共同/努力/, (19b)/上/下/车/、/船/,/须/待/车/、/船/停/稳/后/先/下/客/后/上/客/, (20)从前:作时间名词时不切分;如果从作介词前作方位词,就要切开。 (20a)/有的/是/从前/在/队/中/当/板凳/球员/, (20b)/导致/美元/对/马克/的/汇价/从/前/一/交易/日/的/[rat1比1·7766]/降/至/[rat1比1·7762]/。 (20c)/从/前/不久/[L深圳]/一家/公司/大规模/地/恶意/抢/注/商标/案/, (21)以为:作动词时不切分。 (21a)/以为/强大/的/[P卡斯珀罗夫]/恢复/了/他/的/本来面目/。 (21b)有/一些/干部/想/不/通/,/以为/是/搞/形式/,/出风头/。 (21c)/我们/引/以/为/自豪/的/风格/多少/应/有些/改变/了/。 (21d)/代表/们/以/为/人民/高度/负责/的/精神/,/提出/批评/和/意见/。 (22)正当:作形容词时不切分。 (22a)/我们/是否/能/以/某/种/不/正当/的/方式/反对/, (22b)/正/当/禾苗/生长/关键/时期/,/ (23)正在: (23a)/对/各地/已/建成/尚未/售出/和/正在/建设/的/住房/, (23b)/记者/正/在/回/[L巴黎]/的/高速/列车/上/。 (23c)/[O世界卫生组织]/正/在/[L科特迪瓦]/召开/国际/会议/, (24)会上: (24a)/[L苏州]/等/省市/及/有关单位/在/会上/介绍/了/经验/, (24b)/在/[O国际泳联]/[dat二十四日]/举行/的/听证/会/上/, (25)台上: (25a)/表演/完/节目/后/竟/在/台上/掩/面/而/泣/。/ (25b)/预赛/是/在/有/围/绳/的/拳击/台/上/ (26)走向:用作名词时不切分;用作动词+介词时,一律切分。 (26a)/[L北京]/输/气/管道/工程/线路/走向/示意图/(/示意图/:/[P孙伟]/绘/)/ (26b)/迈出/了/我国/航天/事业/走/向/世界/的/[ord第一步]/。 (27)才能:用作名词时不切分;用作副词+能愿动词时,必须切开。 (27a)/但/如果/施展/才能/的/空间/很/大/,/而且/能/充分/发挥/所/学/专长/,/不妨/一/试/。 (27b)/勤奋/才/能/有/真知灼见/; (28)人才:用作名词时不切分。 (28a)要/想/成为/[dat跨世纪]/人才/,/光/有/专业知识/不够/, (28b)/这/恐怕/只有/浪漫/的/[L法国]/人/才/想/得/出来/。 
(29)上来:作趋向动词和动词时不切分;当上作方位词时,就要切开。 (29a)/一/届/新/班子/上来/以后/,/ (29b)/把/工作/重点/转移/到/社会主义/现代化/建设/上/来/,/ (30)上去:作趋向动词和动词时不切分;当上作方位词时,就要切开。 (30a)/显然/是/上/了/学/的/[L瑶族]/娃子/写/上去/的/。/ (30b)/把/科研/技术/成果/转移/到/社会/应用/上/去/。/ (31)上前: (31a)/他/的/四个/弟兄/挨次/伸出/手/来/上前/祝贺/ (31b)/当即/冲/上/前/去/,/扭/住/一/名/歹徒/不/放/,/ (32)上路:作动词时不切分;上作动词是要切分。 (32a)/我/背/起/你/的/薄被/送/你/上路/, (32b)/过去/村里/也是/上/路/打场/,/ (33)得了:取助词用法时不切分;但作为动词+助词(了)和可能式动补结构(见14)时要切分。 (33a)/没/叫/到/你/的/时候/,/安心/等/着/就/得了/。 (33b)/经/医生/诊断/他/得/了/胃癌/。 (33c)/书记/何以/承受/得/了/, (34)得出:作动词时不切;作可能式动补结构(见14)时要切分。 (34a)[L天津]/近几年/的/实践/得出/了/肯定/的/答案/。 (34b)为了/让/[dat今年]/蒜农/的/产品/卖/得/出/、/卖/出/好/价钱/,/ (35)人称:作名词时不切分;当称作动词是要切分。 (35a)作者/用/第一/人称/的/叙述/手法/, (35b)据/用/过/的/人/称/,/打/国际/长途/如/从/[L北京]/到/[L美国]/,/每/分钟/只需/传统/电话/费用/的/[fra1/4]/。 (36)同行:用作名词时不切分,读作tonghang;用作动词时读作tongxing,一律切分。 (36a)/这时/恰/有/同行/到来/,/只好/借/[mon一元钱]/给/他/。 (36b)/笔者/与/她/骑车/同/行/。 (37)从小: (37a)/图文并茂/、/声/形/兼备/的/写作/能力/将要/从小/培养/, (37b)/企业/从无到有/,/从/小/到/大/, (38)中学: (38a)/在/长期/的/中学/教学/实践/中/我/体会/到/, (38b)/引导/他们/在/实践/中/学/会/正确/行使/民主/权利/。 (39)上门: (39a)/营业员/们/便/主动/上门/收款/。 (39b)/打/出/了/名气/,/找/上/门/来/的/工程/一个/接/一个/。 (39c)我/冲/出/门/去/,/随手/拉/上/门/。/ (40)声响:作名词使用时不切分。 (40a)/而/轻轻/地/挪动/椅子/走开/,/无/一点/声响/。/ (40b)/"/哗哗/"/的/潮水/声/响/成/一片/, (41)就此:作副词使用时不切分。 (41a)/国际/足球/界/一些/有识之士/就此/产生/一种/忧虑/, (41b)我们/也/欢迎/科技界/人士/就/此/问题/发表/意见/和/建议/。 (42)高层次作形容词时不切分;当该词前有副词修饰时需切分。 (42a)/着眼点/放/在/培养/造就/大批/高层次/科技/人才/上/。 (42b)/实现/更/大/规模/、/更/高/层次/的/扩张/和/发展/。/ (43)有的: (43a)/有的/用/汉语/,/有的/用/俄语/, (43b)/是/[L北大荒]/独/有/的/风味/。 (44)的话:作助词使用时不切。 (44a)/如果/要/使/谈判/取得/迅速/进展/的话/, (44b)/[P卡比拉]/先生/对/我/的/话/是/持/认真/态度/的/。 (45)话说:整体作动词使用时不切。 (45a)/话说/当年/,/他/言语/铿锵/:/"/在/当时/,/一切/都/得/打破常规/。 (45b)用/他/自己/的/话/说/, (46)标本:意思为生物/标本时不切;表示"直接和根本"并列的意思时要切开。 (46a)/他们/还/结合/挂图/、/标本/进行/讲解/。 (46b)/反/腐败/要/坚持/标/本/兼/治/, (47)上将:作为军衔使用时不切分。 (47a)/[O中央军委]/副/主席/、/国务委员/兼/[O国防部]/长/[P迟浩田]/上将/ (47b)/[L中国]/在/人口/问题/上/将/面临/新/的/挑战/。 (48)将军:作为军衔使用时不切分。 (48a)/党/和/国家/领导人/、/解放军/元帅/、/将军/、/政府/省/部级/干部 (48b)/将/军:将/军/体/与/群体/紧密/结合/,/开办/体育/知识/讲座 (49)之一: (49a)/企业/领导班子/不/适应/社会主义/市场经济/的/要求/是/主要/原因/之一/。 
(49b)/游人/视线/随/之/一/收/,/"/[L太和宫]/"/[int三个]/大字/豁然/在/目/。/ (49c)/我/不禁/为/之/一/震/。 (50)到家:作为形容词不切分。 (50a)/现在/不行/,/你/技术/不/过关/,/说明/练/得/还/不/到家/,/ (50b)/果不其然/,/此/儿/到/家/就/猝不及防/地/给/了/他/妈/一/刀/。 (50c)/[P赵匡胤]/终于/将/义/妹/[P京娘]/送/到/家/。 (51)在家: (51a)/一直/在家/等待/厂子/通知/上班/的/她/再/也/沉/不住/气/了/ (51b)/实现/访客/在/家/门口/与/住户/可/视/ (51c)/把/她/一/人/放/在/家/中/[P孙威锋]/放心不下/, (52)人均: (52a)/学生/拥有/计算机/的/人均/占有率/最高/ (52b)/[int两]/人/均/未/达到/[fra2/3]/的/当选/票/数/, (53)中用: (53a)/"/察/古/知/今/"/基本上/不/中用/了/, (53b)/天文学/上/把/[L宇宙]/中/用/光学/方法/看/不/到/的/物质/称/做/暗/物质/, (53c)/西/体/[L中]/用/我/也/反对/, (54)前去: (54a)/让/[P胡洁青]/前去/扶持/、/帮忙/。 (54b)一/名/应邀/到会/的/[L北京]/小学生/激动/地/跑/上/前/去/请/他/签名/。 (55)词表词"受过"只有"代人受过"的意思。当动词受和助词过构成"动+助"结构时,一律切开。 (55a)/它们/代/四/奸/受过/, (55b)/[P鲁迅]/虽然/在/[dat二十年代中期]/受/过/[P托洛茨基]/的/一定/影响/, (56)结果:有名词和动词两种用法,都不切分,动词结果的意思是杀死,而不是结出果实的意思。作为后一个意思,名词果是动词结的宾语,所以需切分。 (56a)/矫枉过正/的/结果/,/是/大家/几乎/忘/了/怎么/吃/,/ (56b)/种/果树/一般/要/三年/才/能/结/果/, #### 9.2.4就是、只有、只是、还是的切分规则 ##### 9.2.4.1就是 就是作副词、连词、助词使用时不切分。但作动词时,就是副词,是是动词,一律切分。 (A)作副词时,共有6个义项: (i)单用,表示同意,对; (ii)表示坚决,不可更改; (iii)强调肯定某种性质和状态,含有反驳意味;(iv)强调迅速果断; (v)确定范围,排除其它;(vi)表示没有别的情况;。 (1)我/一定/办到/,您/放心/就是/。/ (2)/反正/姥爷/就是/看/我/不/顺心/,/一点/也/不/喜欢/我/。/ (3)/望/着/车/来/车/往/的/马路/,/一/站/就是/[int几个小时]/。/ (4)/就是/节目/诉/求/为/非常/鲜明/的/单一/主题/, (B)作连词有2个义项: (i)表示假设的让步;即使(后面常用也作呼应); (ii)表示一种极端情况;纵然。如: (5)不是/播种/,/就是/锄地/;/不是/下/田/挖/野菜/,/就是/上山/打柴/。 (6)这个/建议/好/倒/是/好/,/就是/远水/不解/近/渴/。/ (C)就是作动词时,一律切分,就是副词,是是动词。如: (7)/[O光华国中]/职员/[P杨一中]/就/是/买/菜/变成/[O慈德]/会员/的/一/例/。 (8)/多元化/的/意思/就/是/有/了/更/多/的/选择/, (9)/最/特别/的/就/是/黄金/压制/的/邮票/。 (10)/最/关键/的/原则/就/是/「/避/凶/趋/吉/」/, (11)这/就/是/[L海尔-波普彗星]/。/ ##### 9.2.4.2只有 只有作为一个词表词有副词和连词两个义项。当他用作动词时,一律切开。(A)只有做副词相当于只好,表示唯一的选择。如: (1)/家属/最后/只有/寄/望/对岸/[O海协会]/能/请/[L大陆]/渔船/协/寻/。 (2)/[L中国]/的/体育/长期/是/国家/一/家/办/,/发达国家/是/国家/不/办/,/只有/社会/办/,/现在/国际/体育/的/潮流/是/国家/与/社会/共同/兴办/。/ (3)无/雪/的/[dat冬天]/是/难挨/的/,/我/只有/在/心中/落/着/一/场/场/大雪/。/ (4)协办员/和/见习员/在/通过/[int三道]/关/后/,/还要/经由/主办员/挑选/,/没有/主办员/ 挑选/的/也/只有/待岗/。/ (5)如果/"/邪恶/的/敌人/对/[L伊]/发动/侵略/,/[L伊拉克]/将/别无选择/,/只有/用/其/全部/的/潜力/、/经验/和/信仰/进行/自卫/"/。/ (B)"只有"作连词用表示必要条件,下文常用副词才、方呼应。如: 
(6)只有/掌握/了/最/先进/的/科学/,/我们/才/能/有/巩固/的/国防/。/ (7)高尚/的/世界/只/对/高尚/的/人们/存在/,/高尚/的/精神/境界/只有/高尚/的/人们/才/有/ 资格/领略/。/ (C)"只有"用作动词时一律切开。这时"只"做副词、"有"作动词。如: (8)/完成/管理/的/比率/只/有/[per百分之八十九]/, (9)/车行/时速/只/有/[len卅到五十公里]/左右/; (10)/目前/[O基隆邮局]/只/有/一个/集邮/柜台/, (11)/因/[O中嵙国小]/整个/学区/只/有/一个/[L中嵙里]/, ##### 9.2.4.3还是 还是有三种用法:连词、副词和动词。作动词时一律切分。 (A)作连词用时表示选择,通常跟无论、不管等连用。带连词还是的句子,除疑问句外,还是都可以换成或者,意思不变。例如: (1)无论是/说/新/话/,/提/新/观点/,/还是/放弃/前人/和/本本/上/的/过时/的/观点/、/错误/的/结论/,/都/需要/勇气/。/ (2)农民/[P张戎梅]/说/:/"/我们/村/不论/是/养猪/还是/种菜/的/,/现在/都/把/眼睛/盯/在/了/铁路/两头/。/"/ (3)不管/是/开工/还是/竣工/,/既/有/庆典/,/又/有/报导/,/或/称/世纪/工程/, (4)他们/不但/是/我们/公司/发展/的/"/动力/之/源/"/,/还是/我们/学习/的/好榜样/!/ (B)还是作副词用时有三个义项: (i)表示行为、动作或状态继续保持不变,相当于"仍然"、"依然"。 (ii)表示经过比较后做出的选择。 (iii)加强语气,相当于"到底"、"究竟"、"毕竟"。 还是/d用在动词、形容词前,可以省作还,而用在主语前不能省作还。 (5)/但/现场/交通/还是/十分/杂乱/。 (6)/该/基金/还是/可以/支应/灾民/最高/[mon一百万元]/的/贷款/额/, (7)/很多/居民/还是/使用/地下水/, (8)/[P陈]/还是/不/改/顽皮/个性/, (9)/多数人/还是/喜欢/为/宝宝/选/个/金/饰/, (10)/[P陈小弟]/的/父/母亲/还是/勇敢/地/生/下/他/, (C)还是用作动词时一律都要切开,即还作副词使用,是作动词使用。句型"是/v……的"可以帮助我们判断还是在句中是不是一种动词的用法。 (11)/关键/还/是/在/府/会/双方/态度/, (12)/初/到/部队/,/[age十五六岁]/,/还/是/个/没/见/过/世面/的/毛孩子/。/ (13)/她/不/相信/歌剧/这/门/综合/艺术/会/落入/低谷/,/认为/关键/还/是/提高/歌剧/自身/的 /品质/。/ (14)但/在/日常/工作/中/,/我/深感/除了/忙/还/是/忙/,/搞/得/焦头烂额/,/一天到晚/自己/不/属于/自己/。/ ##### 9.2.4.4只是 只是有三种用法:副词、连词和动词。作动词时统统切分。 (A)只是作副词使用时有两个义项: (i)表示限定某种情况或范围,相当于仅仅是。句末用而已或罢了等配合, 表示语气更为缓和。 (ii)强调在任何条件下情况都不变,有总是的意思。 (1)/只是/作为/预定/分娩/日/的/参考/。 (2)/只是/没有/焢/窑/经验/的/[P张]/课/长/, (3)/他/虽/表示/民意调查/结果/数据/只是/具有/参考/价值/, (4)/施工/初期/只是/修剪/树枝/, (B)只是作连词用,用在后一分句,表示轻微的转折,补充修正上文的意思,与不过的用法相近。 (5)/记者/在/重灾区/[L大河乡]/注意/到/,/群众/有/饭/吃/,/有/衣/穿/,/有/伤病/能/医治/,/只是/搭建/的/小/窝棚/难以/抵御/坝上/呼啸/的/寒风/。/ (6)[dat唐朝]/著名/诗人/[P李商隐]/『/夕阳/无限/好/,/只是/近/黄昏/』/的/诗句/是/对/黄昏/的/叹息/和/无奈/,/ (C)只是用作动词时一律要切开,即只作副词,是作动词。如: (7)/他/只/是/[dur一个月]/领/[mon二万多元]/的/工人/ (8)/事实上/[L盐埔乡]/公所/的/薪水/无/着落/只/是/冰山/一/角/, (9)/这些/需求/不只/是/钱/或/资源/,/ ================================================ FILE: docs/api/common/configurable.rst ================================================ .. _api/configurable: configurable ==================== ..
autoclass:: hanlp_common.configurable.Configurable :members: .. autoclass:: hanlp_common.configurable.AutoConfigurable :members: ================================================ FILE: docs/api/common/conll.rst ================================================ .. _api/conll: conll ==================== .. autoclass:: hanlp_common.conll.CoNLLWord :members: .. autoclass:: hanlp_common.conll.CoNLLUWord :members: .. autoclass:: hanlp_common.conll.CoNLLSentence :members: ================================================ FILE: docs/api/common/constant.rst ================================================ constant ==================== .. automodule:: hanlp_common.constant :members: ================================================ FILE: docs/api/common/document.rst ================================================ .. _api/document: document ==================== .. currentmodule:: hanlp_common .. autoclass:: hanlp_common.document.Document :members: ================================================ FILE: docs/api/common/index.md ================================================ # hanlp_common Common APIs shared between `hanlp` and `restful`. ```{toctree} document conll configurable constant ``` ================================================ FILE: docs/api/hanlp/common/component.rst ================================================ component ================= .. currentmodule:: hanlp.common .. autoclass:: hanlp.common.component.Component :members: ================================================ FILE: docs/api/hanlp/common/dataset.md ================================================ # dataset This module provides base definition for datasets, dataloaders and samplers. ## datasets ```{eval-rst} .. currentmodule:: hanlp.common .. autoclass:: hanlp.common.dataset.Transformable :members: .. autoclass:: hanlp.common.dataset.TransformableDataset :members: :special-members: :exclude-members: __init__, __repr__ ``` ## dataloaders ```{eval-rst} .. currentmodule:: hanlp.common .. 
autoclass:: hanlp.common.dataset.PadSequenceDataLoader :members: :special-members: :exclude-members: __init__, __repr__ .. autoclass:: hanlp.common.dataset.PrefetchDataLoader :members: :special-members: :exclude-members: __init__, __repr__ ``` ## samplers ```{eval-rst} .. currentmodule:: hanlp.common .. autoclass:: hanlp.common.dataset.BucketSampler :members: .. autoclass:: hanlp.common.dataset.KMeansSampler :members: .. autoclass:: hanlp.common.dataset.SortingSampler :members: ``` ## sampler builders ```{eval-rst} .. currentmodule:: hanlp.common .. autoclass:: hanlp.common.dataset.SamplerBuilder :members: .. autoclass:: hanlp.common.dataset.SortingSamplerBuilder :members: .. autoclass:: hanlp.common.dataset.KMeansSamplerBuilder :members: ``` ================================================ FILE: docs/api/hanlp/common/index.md ================================================ # common Common base classes. ```{toctree} structure vocab transform dataset component torch_component ``` ================================================ FILE: docs/api/hanlp/common/structure.md ================================================ # structure ```{eval-rst} .. currentmodule:: hanlp.common .. autoclass:: hanlp.common.structure.ConfigTracker :members: .. autoclass:: hanlp.common.structure.History :members: ``` ================================================ FILE: docs/api/hanlp/common/torch_component.md ================================================ # torch_component ```{eval-rst} .. currentmodule:: hanlp.common.torch_component .. autoclass:: hanlp.common.torch_component.TorchComponent :members: ``` ================================================ FILE: docs/api/hanlp/common/transform.md ================================================ # transform ```{eval-rst} .. currentmodule:: hanlp.common .. 
autoclass:: hanlp.common.transform.VocabDict :members: ``` ================================================ FILE: docs/api/hanlp/common/vocab.md ================================================ # vocab ```{eval-rst} .. currentmodule:: hanlp.common .. autoclass:: hanlp.common.transform.Vocab :members: :special-members: :exclude-members: __init__, __repr__, __call__, __str__ ``` ================================================ FILE: docs/api/hanlp/components/classifiers.md ================================================ # classifiers ```{eval-rst} .. currentmodule:: hanlp.components.classifiers .. autoclass:: hanlp.components.classifiers.transformer_classifier.TransformerClassifier :members: ``` ================================================ FILE: docs/api/hanlp/components/eos.md ================================================ # eos ```{eval-rst} .. currentmodule:: hanlp.components.eos .. autoclass:: hanlp.components.eos.ngram.NgramSentenceBoundaryDetector :members: ``` ================================================ FILE: docs/api/hanlp/components/index.md ================================================ # components NLP components. ```{toctree} mtl/index classifiers eos tokenizers/index lemmatizer taggers/index ner/index parsers/index srl/index pipeline sts ``` ================================================ FILE: docs/api/hanlp/components/lemmatizer.md ================================================ # lemmatizer ```{eval-rst} .. currentmodule:: hanlp.components.lemmatizer .. autoclass:: TransformerLemmatizer :members: ``` ================================================ FILE: docs/api/hanlp/components/mtl/index.md ================================================ # mtl Multi-Task Learning (MTL) framework. ```{toctree} mtl tasks/index ``` ================================================ FILE: docs/api/hanlp/components/mtl/mtl.md ================================================ # MultiTaskLearning ```{eval-rst} .. currentmodule:: hanlp.components.mtl .. 
autoclass:: hanlp.components.mtl.multi_task_learning.MultiTaskLearning :members: :special-members: :exclude-members: __init__, __repr__ ``` ================================================ FILE: docs/api/hanlp/components/mtl/tasks/constituency.md ================================================ # con Constituency parsing. ```{eval-rst} .. currentmodule:: hanlp.components.mtl .. autoclass:: hanlp.components.mtl.tasks.constituency.CRFConstituencyParsing :members: :exclude-members: execute_training_loop, fit_dataloader ``` ================================================ FILE: docs/api/hanlp/components/mtl/tasks/dep.md ================================================ # dep Dependency parsing. ```{eval-rst} .. currentmodule:: hanlp.components.mtl .. autoclass:: hanlp.components.mtl.tasks.dep.BiaffineDependencyParsing :members: :exclude-members: execute_training_loop, fit_dataloader ``` ================================================ FILE: docs/api/hanlp/components/mtl/tasks/index.md ================================================ # tasks Multi-Task Learning (MTL) tasks. ```{toctree} task constituency dep sdp ud lem pos tok ner/index srl/index ``` ================================================ FILE: docs/api/hanlp/components/mtl/tasks/lem.md ================================================ # lem Lemmatization. ```{eval-rst} .. currentmodule:: hanlp.components.mtl .. autoclass:: hanlp.components.mtl.tasks.lem.TransformerLemmatization :members: :exclude-members: execute_training_loop, fit_dataloader ``` ================================================ FILE: docs/api/hanlp/components/mtl/tasks/ner/biaffine_ner.md ================================================ # biaffine_ner Biaffine Named Entity Recognition. ```{eval-rst} .. currentmodule:: hanlp.components.mtl .. 
autoclass:: hanlp.components.mtl.tasks.ner.biaffine_ner.BiaffineNamedEntityRecognition :members: :exclude-members: execute_training_loop, fit_dataloader ``` ================================================ FILE: docs/api/hanlp/components/mtl/tasks/ner/index.md ================================================ # ner Named Entity Recognition. ```{toctree} tag_ner biaffine_ner ``` ================================================ FILE: docs/api/hanlp/components/mtl/tasks/ner/tag_ner.md ================================================ # tag_ner Tagging based Named Entity Recognition. ```{eval-rst} .. currentmodule:: hanlp.components.mtl .. autoclass:: hanlp.components.mtl.tasks.ner.tag_ner.TaggingNamedEntityRecognition :members: :exclude-members: execute_training_loop, fit_dataloader ``` ================================================ FILE: docs/api/hanlp/components/mtl/tasks/pos.md ================================================ # pos Part-of-speech tagging. ```{eval-rst} .. currentmodule:: hanlp.components.mtl .. autoclass:: hanlp.components.mtl.tasks.pos.TransformerTagging :members: :exclude-members: execute_training_loop, fit_dataloader ``` ================================================ FILE: docs/api/hanlp/components/mtl/tasks/sdp.md ================================================ # sdp Semantic Dependency Parsing. ```{eval-rst} .. currentmodule:: hanlp.components.mtl .. autoclass:: hanlp.components.mtl.tasks.sdp.BiaffineSemanticDependencyParsing :members: :exclude-members: execute_training_loop, fit_dataloader ``` ================================================ FILE: docs/api/hanlp/components/mtl/tasks/srl/bio_srl.md ================================================ # bio_srl BIO Tagging based Semantic Role Labeling. ```{eval-rst} .. currentmodule:: hanlp.components.mtl .. 
autoclass:: hanlp.components.mtl.tasks.srl.bio_srl.SpanBIOSemanticRoleLabeling :members: :exclude-members: execute_training_loop, fit_dataloader ``` ================================================ FILE: docs/api/hanlp/components/mtl/tasks/srl/index.md ================================================ # srl Semantic Role Labeling. ```{toctree} bio_srl rank_srl ``` ================================================ FILE: docs/api/hanlp/components/mtl/tasks/srl/rank_srl.md ================================================ # rank_srl Span Ranking Semantic Role Labeling. ```{eval-rst} .. currentmodule:: hanlp.components.mtl .. autoclass:: hanlp.components.mtl.tasks.srl.rank_srl.SpanRankingSemanticRoleLabeling :members: :exclude-members: execute_training_loop, fit_dataloader ``` ================================================ FILE: docs/api/hanlp/components/mtl/tasks/task.md ================================================ # Task ```{eval-rst} .. currentmodule:: hanlp.components.mtl .. autoclass:: hanlp.components.mtl.tasks.Task :members: :exclude-members: execute_training_loop, fit_dataloader ``` ================================================ FILE: docs/api/hanlp/components/mtl/tasks/tok.md ================================================ # tok Tokenization. ```{eval-rst} .. currentmodule:: hanlp.components.mtl .. autoclass:: hanlp.components.mtl.tasks.tok.tag_tok.TaggingTokenization :members: :exclude-members: execute_training_loop, fit_dataloader ``` ================================================ FILE: docs/api/hanlp/components/mtl/tasks/ud.md ================================================ # ud Universal Dependencies Parsing (lemmatization, features, PoS tagging and dependency parsing). ```{eval-rst} .. currentmodule:: hanlp.components.mtl .. 
autoclass:: hanlp.components.mtl.tasks.ud.UniversalDependenciesParsing :members: :exclude-members: execute_training_loop, fit_dataloader ``` ================================================ FILE: docs/api/hanlp/components/ner/biaffine_ner.md ================================================ # biaffine_ner Biaffine Named Entity Recognition. ```{eval-rst} .. currentmodule:: hanlp.components.ner.transformer_ner .. autoclass:: hanlp.components.ner.biaffine_ner.biaffine_ner.BiaffineNamedEntityRecognizer :members: ``` ================================================ FILE: docs/api/hanlp/components/ner/index.md ================================================ # ner Named Entity Recognition. ```{toctree} transformer_ner rnn_ner biaffine_ner ``` ================================================ FILE: docs/api/hanlp/components/ner/rnn_ner.md ================================================ # rnn_ner Tagging based Named Entity Recognition. ```{eval-rst} .. currentmodule:: hanlp.components.ner.rnn_ner .. autoclass:: hanlp.components.ner.rnn_ner.RNNNamedEntityRecognizer :members: ``` ================================================ FILE: docs/api/hanlp/components/ner/transformer_ner.md ================================================ # transformer_ner Tagging based Named Entity Recognition. ```{eval-rst} .. currentmodule:: hanlp.components.ner.transformer_ner .. autoclass:: hanlp.components.ner.transformer_ner.TransformerNamedEntityRecognizer :members: ``` ================================================ FILE: docs/api/hanlp/components/parsers/biaffine_dep.md ================================================ # biaffine_dep Biaffine dependency parser. ```{eval-rst} .. currentmodule:: hanlp.components .. 
autoclass:: hanlp.components.parsers.biaffine.biaffine_dep.BiaffineDependencyParser :members: ``` ================================================ FILE: docs/api/hanlp/components/parsers/biaffine_sdp.md ================================================ # biaffine_sdp Biaffine semantic dependency parser. ```{eval-rst} .. currentmodule:: hanlp.components .. autoclass:: hanlp.components.parsers.biaffine.biaffine_sdp.BiaffineSemanticDependencyParser :members: ``` ================================================ FILE: docs/api/hanlp/components/parsers/crf_constituency_parser.md ================================================ # crf_constituency_parser CRF constituency parser. ```{eval-rst} .. currentmodule:: hanlp.components .. autoclass:: hanlp.components.parsers.constituency.crf_constituency_parser.CRFConstituencyParser :members: ``` ================================================ FILE: docs/api/hanlp/components/parsers/index.md ================================================ # parsers Parsers. ```{toctree} biaffine_dep biaffine_sdp ud_parser crf_constituency_parser ``` ================================================ FILE: docs/api/hanlp/components/parsers/ud_parser.md ================================================ # ud_parser Universal Dependencies Parsing (lemmatization, features, PoS tagging and dependency parsing). ```{eval-rst} .. currentmodule:: hanlp.components .. autoclass:: hanlp.components.parsers.ud.ud_parser.UniversalDependenciesParser :members: ``` ================================================ FILE: docs/api/hanlp/components/pipeline.md ================================================ # pipeline ```{eval-rst} .. currentmodule:: hanlp.components.pipeline .. autoclass:: hanlp.components.pipeline.Pipe :members: .. autoclass:: hanlp.components.pipeline.Pipeline :members: ``` ================================================ FILE: docs/api/hanlp/components/srl/index.md ================================================ # srl Semantic Role Labelers.
```{toctree} span_rank span_bio ``` ================================================ FILE: docs/api/hanlp/components/srl/span_bio.md ================================================ # span_bio Span BIO tagging based SRL. ```{eval-rst} .. currentmodule:: hanlp.components.srl.span_bio.span_bio .. autoclass:: SpanBIOSemanticRoleLabeler :members: ``` ================================================ FILE: docs/api/hanlp/components/srl/span_rank.md ================================================ # span_rank Span Rank based SRL. ```{eval-rst} .. currentmodule:: hanlp.components.srl.span_rank.span_rank .. autoclass:: SpanRankingSemanticRoleLabeler :members: ``` ================================================ FILE: docs/api/hanlp/components/sts.md ================================================ # sts ```{eval-rst} .. currentmodule:: hanlp.components.sts .. autoclass:: hanlp.components.sts.transformer_sts.TransformerSemanticTextualSimilarity :members: ``` ================================================ FILE: docs/api/hanlp/components/taggers/index.md ================================================ # taggers Taggers. ```{toctree} transformer_tagger rnn_tagger ``` ================================================ FILE: docs/api/hanlp/components/taggers/rnn_tagger.md ================================================ # rnn_tagger RNN based tagger. ```{eval-rst} .. currentmodule:: hanlp.components .. autoclass:: hanlp.components.taggers.rnn_tagger.RNNTagger :members: ``` ================================================ FILE: docs/api/hanlp/components/taggers/transformer_tagger.md ================================================ # transformer_tagger Transformer based tagger. ```{eval-rst} .. currentmodule:: hanlp.components .. 
autoclass:: hanlp.components.taggers.transformers.transformer_tagger.TransformerTagger :members: ``` ================================================ FILE: docs/api/hanlp/components/tokenizers/index.md ================================================ # tokenizers Tokenizers. ```{toctree} transformer multi_criteria ``` ================================================ FILE: docs/api/hanlp/components/tokenizers/multi_criteria.md ================================================ # multi_criteria Transformer based Multi-Criteria Word tokenizer. ```{eval-rst} .. currentmodule:: hanlp.components.tokenizers.multi_criteria_cws_transformer .. autoclass:: hanlp.components.tokenizers.multi_criteria_cws_transformer.MultiCriteriaTransformerTaggingTokenizer :members: ``` ================================================ FILE: docs/api/hanlp/components/tokenizers/transformer.md ================================================ # transformer Transformer based tokenizer. ```{eval-rst} .. currentmodule:: hanlp.components.tokenizers.transformer .. autoclass:: hanlp.components.tokenizers.transformer.TransformerTaggingTokenizer :members: ``` ================================================ FILE: docs/api/hanlp/datasets/constituency/constituency_dataset.md ================================================ # constituency_dataset ```{eval-rst} .. autoclass:: hanlp.datasets.parsing.loaders.constituency_dataset.ConstituencyDataset :members: ``` ================================================ FILE: docs/api/hanlp/datasets/constituency/index.md ================================================ # con Constituency parsing datasets. ```{toctree} constituency_dataset resources ``` ================================================ FILE: docs/api/hanlp/datasets/constituency/resources.md ================================================ # resources ## Chinese Treebank ### CTB8 ````{margin} **Discussion** ```{seealso} About our data split on [our forum](https://bbs.hankcs.com/t/topic/3024). 
``` ```` ```{eval-rst} .. autodata:: hanlp.datasets.parsing.ctb8.CTB8_BRACKET_LINE_NOEC_TRAIN .. autodata:: hanlp.datasets.parsing.ctb8.CTB8_BRACKET_LINE_NOEC_DEV .. autodata:: hanlp.datasets.parsing.ctb8.CTB8_BRACKET_LINE_NOEC_TEST ``` ### CTB9 ````{margin} **Discussion** ```{seealso} About our data split on [our forum](https://bbs.hankcs.com/t/topic/3024). ``` ```` ```{eval-rst} .. autodata:: hanlp.datasets.parsing.ctb9.CTB9_BRACKET_LINE_NOEC_TRAIN .. autodata:: hanlp.datasets.parsing.ctb9.CTB9_BRACKET_LINE_NOEC_DEV .. autodata:: hanlp.datasets.parsing.ctb9.CTB9_BRACKET_LINE_NOEC_TEST ``` ## English Treebank ### PTB ```{eval-rst} .. autodata:: hanlp.datasets.parsing.ptb.PTB_TRAIN .. autodata:: hanlp.datasets.parsing.ptb.PTB_DEV .. autodata:: hanlp.datasets.parsing.ptb.PTB_TEST ``` ================================================ FILE: docs/api/hanlp/datasets/dep/conll_dataset.md ================================================ # conll ```{eval-rst} .. currentmodule:: hanlp.datasets.parsing.loaders.conll_dataset .. autoclass:: CoNLLParsingDataset :members: ``` ================================================ FILE: docs/api/hanlp/datasets/dep/index.md ================================================ # dep Dependency parsing datasets. ```{toctree} conll_dataset resources ``` ================================================ FILE: docs/api/hanlp/datasets/dep/resources.md ================================================ # resources ## PKU Multiview Treebank PKU Multi-view Chinese Treebank, released by PKU-ICL. It contains the sentences from People's Daily(19980101-19980110). The number of sentences in it is 14463. ```{eval-rst} .. automodule:: hanlp.datasets.parsing.pmt1 :members: ``` ## Chinese Treebank ### CTB5 ```{eval-rst} .. automodule:: hanlp.datasets.parsing.ctb5 :members: ``` ### CTB7 ```{eval-rst} .. automodule:: hanlp.datasets.parsing.ctb7 :members: ``` ### CTB8 ```{eval-rst} .. 
Attention:: We propose a new data split for CTB which is different from the academic conventions with the following 3 advantages. - Easy to reproduce. Files ending with ``8`` go to the dev set, ending with ``9`` go to the test set, otherwise go to the training set. - Full use of CTB8. The conventional academic split omits 50 gold files while we include them. - More balanced split across genres. Proportions of samples in each genre are similar. We also use Stanford Dependencies 3.3.0 which offers fine-grained relations and more grammars than the conventional head finding rules introduced by :cite:`zhang-clark-2008-tale`. Therefore, scores on our preprocessed CTB8 are not directly comparable to those in most of the literature. We have experimented with the same model on the conventionally baked CTB8 and the scores could be 4~5 points higher. We believe it's worthwhile since HanLP is made for practical purposes, not just for producing pretty numbers. ``` ````{margin} **Discussion** ```{seealso} We have a discussion on [our forum](https://bbs.hankcs.com/t/topic/3024). ``` ```` ```{eval-rst} .. autodata:: hanlp.datasets.parsing.ctb8.CTB8_SD330_TRAIN .. autodata:: hanlp.datasets.parsing.ctb8.CTB8_SD330_DEV .. autodata:: hanlp.datasets.parsing.ctb8.CTB8_SD330_TEST ``` ### CTB9 ```{eval-rst} .. Attention:: Similar preprocessing and splits with CTB8 are applied. See the notice above. ``` ```{eval-rst} .. autodata:: hanlp.datasets.parsing.ctb9.CTB9_SD330_TRAIN .. autodata:: hanlp.datasets.parsing.ctb9.CTB9_SD330_DEV .. autodata:: hanlp.datasets.parsing.ctb9.CTB9_SD330_TEST ``` ## English Treebank ### PTB ```{eval-rst} .. autodata:: hanlp.datasets.parsing.ptb.PTB_SD330_TRAIN .. autodata:: hanlp.datasets.parsing.ptb.PTB_SD330_DEV .. autodata:: hanlp.datasets.parsing.ptb.PTB_SD330_TEST ``` ## Universal Dependencies ### Languages ```{eval-rst} .. automodule:: hanlp.datasets.parsing.ud.ud27 :members: ``` ### Multilingual ```{eval-rst} ..
automodule:: hanlp.datasets.parsing.ud.ud27m :members: ``` ================================================ FILE: docs/api/hanlp/datasets/eos/eos.md ================================================ # eos ```{eval-rst} .. currentmodule:: hanlp.datasets.eos.eos .. autoclass:: SentenceBoundaryDetectionDataset :members: ``` ================================================ FILE: docs/api/hanlp/datasets/eos/index.md ================================================ # eos Sentence boundary detection datasets. ```{toctree} eos resources ``` ================================================ FILE: docs/api/hanlp/datasets/eos/resources.md ================================================ # resources ## nn_eos ```{eval-rst} .. automodule:: hanlp.datasets.eos.loaders.nn_eos :members: ``` ================================================ FILE: docs/api/hanlp/datasets/index.md ================================================ # datasets ```{eval-rst} NLP datasets grouped by tasks. For each task, we provide at least one ``torch.utils.data.Dataset`` compatible class and several open-source resources. Their file format and description can be found in their ``Dataset.load_file`` documents. Their contents are split into ``TRAIN``, ``DEV`` and ``TEST`` portions, each of which is stored in a Python constant which can be fetched using :meth:`~hanlp.utils.io_util.get_resource`. ``` ````{margin} **Professionals use Linux** ```{note} Many preprocessing scripts written by professionals make heavy use of Linux/Unix tool chains like shell, perl, gcc, etc., which are unavailable or buggy on Windows. You may need a *nix environment to run these scripts. ``` ```` ```{toctree} eos/index tok/index pos/index ner/index dep/index srl/index constituency/index ``` ================================================ FILE: docs/api/hanlp/datasets/ner/index.md ================================================ # ner NER datasets.
```{toctree} tsv json resources ``` ================================================ FILE: docs/api/hanlp/datasets/ner/json.md ================================================ # json ```{eval-rst} .. currentmodule:: hanlp.datasets.ner.loaders.json_ner .. autoclass:: JsonNERDataset :members: ``` ================================================ FILE: docs/api/hanlp/datasets/ner/resources.md ================================================ # resources ## CoNLL 2003 ```{eval-rst} .. automodule:: hanlp.datasets.ner.conll03 :members: ``` ## MSRA ```{eval-rst} .. automodule:: hanlp.datasets.ner.msra :members: ``` ## OntoNotes5 ```{eval-rst} .. autodata:: hanlp.datasets.srl.ontonotes5.chinese.ONTONOTES5_CONLL12_CHINESE_TRAIN .. autodata:: hanlp.datasets.srl.ontonotes5.chinese.ONTONOTES5_CONLL12_CHINESE_DEV .. autodata:: hanlp.datasets.srl.ontonotes5.chinese.ONTONOTES5_CONLL12_CHINESE_TEST .. autodata:: hanlp.datasets.srl.ontonotes5.chinese.ONTONOTES5_CONLL12_NER_CHINESE_TRAIN .. autodata:: hanlp.datasets.srl.ontonotes5.chinese.ONTONOTES5_CONLL12_NER_CHINESE_DEV .. autodata:: hanlp.datasets.srl.ontonotes5.chinese.ONTONOTES5_CONLL12_NER_CHINESE_TEST ``` ## Resume ```{eval-rst} .. automodule:: hanlp.datasets.ner.resume :members: ``` ## Weibo ```{eval-rst} .. automodule:: hanlp.datasets.ner.weibo :members: ``` ================================================ FILE: docs/api/hanlp/datasets/ner/tsv.md ================================================ # tsv ```{eval-rst} .. currentmodule:: hanlp.datasets.ner.loaders.tsv .. autoclass:: TSVTaggingDataset :members: ``` ================================================ FILE: docs/api/hanlp/datasets/pos/index.md ================================================ # pos PoS datasets. ```{eval-rst} PoS is a normal tagging task which uses :class:`hanlp.datasets.ner.loaders.tsv.TSVTaggingDataset` for loading. 
``` ```{toctree} resources ``` ================================================ FILE: docs/api/hanlp/datasets/pos/resources.md ================================================ # resources ## CTB5 ```{eval-rst} .. automodule:: hanlp.datasets.pos.ctb5 :members: ``` ## CTB8 ```{eval-rst} .. autodata:: hanlp.datasets.parsing.ctb8.CTB8_POS_TRAIN .. autodata:: hanlp.datasets.parsing.ctb8.CTB8_POS_DEV .. autodata:: hanlp.datasets.parsing.ctb8.CTB8_POS_TEST ``` ## CTB9 ```{eval-rst} .. autodata:: hanlp.datasets.parsing.ctb9.CTB9_POS_TRAIN .. autodata:: hanlp.datasets.parsing.ctb9.CTB9_POS_DEV .. autodata:: hanlp.datasets.parsing.ctb9.CTB9_POS_TEST ``` ================================================ FILE: docs/api/hanlp/datasets/srl/conll2012_dataset.md ================================================ # conll2012_dataset ```{eval-rst} .. autoclass:: hanlp.datasets.srl.loaders.conll2012.CoNLL2012SRLDataset :members: ``` ================================================ FILE: docs/api/hanlp/datasets/srl/index.md ================================================ # srl Semantic Role Labeling datasets. ```{toctree} conll2012_dataset resources ``` ================================================ FILE: docs/api/hanlp/datasets/srl/resources.md ================================================ # resources ## OntoNotes 5 ### Chinese ```{eval-rst} .. autodata:: hanlp.datasets.srl.ontonotes5.chinese.ONTONOTES5_CONLL12_CHINESE_TRAIN :noindex: .. autodata:: hanlp.datasets.srl.ontonotes5.chinese.ONTONOTES5_CONLL12_CHINESE_DEV :noindex: .. autodata:: hanlp.datasets.srl.ontonotes5.chinese.ONTONOTES5_CONLL12_CHINESE_TEST :noindex: ``` ================================================ FILE: docs/api/hanlp/datasets/tok/index.md ================================================ # tok Tokenization datasets. 
```{toctree} txt mcws_dataset resources ``` ================================================ FILE: docs/api/hanlp/datasets/tok/mcws_dataset.md ================================================ # mcws_dataset ```{eval-rst} .. currentmodule:: hanlp.datasets.tokenization.loaders.multi_criteria_cws.mcws_dataset .. autoclass:: MultiCriteriaTextTokenizingDataset :members: ``` ================================================ FILE: docs/api/hanlp/datasets/tok/resources.md ================================================ # resources ## sighan2005 [The Second International Chinese Word Segmentation Bakeoff](http://sighan.cs.uchicago.edu/bakeoff2005/) took place over the summer of 2005. ### pku ```{eval-rst} .. automodule:: hanlp.datasets.tokenization.sighan2005.pku :members: ``` ### msr ```{eval-rst} .. automodule:: hanlp.datasets.tokenization.sighan2005.msr :members: ``` ### as ```{eval-rst} .. automodule:: hanlp.datasets.tokenization.sighan2005.as_ :members: ``` ### cityu ```{eval-rst} .. automodule:: hanlp.datasets.tokenization.sighan2005.cityu :members: ``` ## CTB6 ```{eval-rst} .. automodule:: hanlp.datasets.tokenization.ctb6 :members: ``` ## CTB8 ```{eval-rst} .. automodule:: hanlp.datasets.parsing.ctb8 .. autodata:: CTB8_CWS_TRAIN .. autodata:: CTB8_CWS_DEV .. autodata:: CTB8_CWS_TEST ``` ## CTB9 ```{eval-rst} .. automodule:: hanlp.datasets.parsing.ctb9 .. autodata:: CTB9_CWS_TRAIN .. autodata:: CTB9_CWS_DEV .. autodata:: CTB9_CWS_TEST ``` ================================================ FILE: docs/api/hanlp/datasets/tok/txt.md ================================================ # txt ```{eval-rst} .. currentmodule:: hanlp.datasets.tokenization.loaders.txt .. autoclass:: TextTokenizingDataset :members: ``` ================================================ FILE: docs/api/hanlp/hanlp.rst ================================================ .. _api/main: hanlp ========== .. currentmodule:: hanlp .. autofunction:: load .. 
autofunction:: pipeline ================================================ FILE: docs/api/hanlp/index.md ================================================ # hanlp Core APIs for `hanlp`. ```{toctree} hanlp common/index components/index pretrained/index datasets/index utils/index layers/index ``` ================================================ FILE: docs/api/hanlp/layers/decoders/biaffine_ner.md ================================================ # biaffine_ner ```{eval-rst} .. autoclass:: hanlp.components.ner.biaffine_ner.biaffine_ner_model.BiaffineNamedEntityRecognitionDecoder :members: ``` ================================================ FILE: docs/api/hanlp/layers/decoders/index.md ================================================ # decoders ```{toctree} linear_crf biaffine_ner ``` ================================================ FILE: docs/api/hanlp/layers/decoders/linear_crf.md ================================================ # linear_crf ```{eval-rst} .. autoclass:: hanlp.components.mtl.tasks.pos.LinearCRFDecoder :members: ``` ================================================ FILE: docs/api/hanlp/layers/embeddings/char_cnn.md ================================================ # char_cnn ```{eval-rst} .. autoclass:: hanlp.layers.embeddings.char_cnn.CharCNN :members: .. autoclass:: hanlp.layers.embeddings.char_cnn.CharCNNEmbedding :members: ``` ================================================ FILE: docs/api/hanlp/layers/embeddings/char_rnn.md ================================================ # char_rnn ```{eval-rst} .. autoclass:: hanlp.layers.embeddings.char_rnn.CharRNN :members: .. autoclass:: hanlp.layers.embeddings.char_rnn.CharRNNEmbedding :members: ``` ================================================ FILE: docs/api/hanlp/layers/embeddings/embedding.md ================================================ # embedding ```{eval-rst} .. autoclass:: hanlp.layers.embeddings.embedding.Embedding :members: .. 
autoclass:: hanlp.layers.embeddings.embedding.ConcatModuleList :members: .. autoclass:: hanlp.layers.embeddings.embedding.EmbeddingList :members: ``` ================================================ FILE: docs/api/hanlp/layers/embeddings/fasttext.md ================================================ # fasttext ```{eval-rst} .. autoclass:: hanlp.layers.embeddings.fast_text.FastTextEmbedding :members: .. autoclass:: hanlp.layers.embeddings.fast_text.FastTextEmbeddingModule :members: ``` ================================================ FILE: docs/api/hanlp/layers/embeddings/index.md ================================================ # embeddings ```{toctree} embedding word2vec fasttext char_cnn char_rnn transformer ``` ================================================ FILE: docs/api/hanlp/layers/embeddings/transformer.md ================================================ # transformer ```{eval-rst} .. autoclass:: hanlp.layers.embeddings.contextual_word_embedding.ContextualWordEmbedding :members: .. autoclass:: hanlp.layers.embeddings.contextual_word_embedding.ContextualWordEmbeddingModule :members: ``` ================================================ FILE: docs/api/hanlp/layers/embeddings/word2vec.md ================================================ # word2vec ```{eval-rst} .. autoclass:: hanlp.layers.embeddings.word2vec.Word2VecEmbedding :members: .. autoclass:: hanlp.layers.embeddings.word2vec.Word2VecEmbeddingModule :members: ``` ================================================ FILE: docs/api/hanlp/layers/index.md ================================================ # layers ```{toctree} embeddings/index transformers/index decoders/index ``` ================================================ FILE: docs/api/hanlp/layers/transformers/encoder.md ================================================ # encoder ```{eval-rst} .. 
autoclass:: hanlp.layers.transformers.encoder.TransformerEncoder :members: ``` ================================================ FILE: docs/api/hanlp/layers/transformers/index.md ================================================ # transformers ```{toctree} encoder tokenizer ``` ================================================ FILE: docs/api/hanlp/layers/transformers/tokenizer.md ================================================ # tokenizer ```{eval-rst} .. autoclass:: hanlp.transform.transformer_tokenizer.TransformerSequenceTokenizer :members: ``` ================================================ FILE: docs/api/hanlp/pretrained/amr.md ================================================ --- jupytext: formats: ipynb,md:myst text_representation: extension: .md format_name: myst format_version: '0.8' jupytext_version: 1.4.2 kernelspec: display_name: Python 3 language: python name: python3 --- # amr AMR captures “who is doing what to whom” in a sentence. Each sentence is represented as a rooted, directed, acyclic graph with labels on edges (relations) and leaves (concepts). Before loading an AMR model, make sure to install HanLP with the `amr` dependencies: ```shell pip install hanlp[amr] -U ``` To parse a raw sentence into AMR: ```{eval-rst} .. margin:: Batching is Faster .. Hint:: Parse multiple sentences at once for faster speed! ``` ```{code-cell} ipython3 :tags: [output_scroll] import hanlp amr_parser = hanlp.load(hanlp.pretrained.amr.AMR3_SEQ2SEQ_BART_LARGE) amr = amr_parser('The boy wants the girl to believe him.') print(amr) ``` All the pre-trained parsers and their scores are listed below. ```{eval-rst} .. 
automodule:: hanlp.pretrained.amr :members: ``` ================================================ FILE: docs/api/hanlp/pretrained/amr2text.md ================================================ --- jupytext: formats: ipynb,md:myst text_representation: extension: .md format_name: myst format_version: '0.8' jupytext_version: 1.4.2 kernelspec: display_name: Python 3 language: python name: python3 --- # amr2text AMR captures “who is doing what to whom” in a sentence. Each sentence is represented as a rooted, directed, acyclic graph with labels on edges (relations) and leaves (concepts). The goal of AMR-to-Text Generation is to recover the original sentence realization given an AMR. This task can be seen as the reverse of the structured prediction found in AMR parsing. Before loading an AMR model, make sure to install HanLP with the `amr` dependencies: ```shell pip install hanlp[amr] -U ``` To generate a sentence given an AMR: ```{eval-rst} .. margin:: Batching is Faster .. Hint:: Generate multiple sentences at once for faster speed! ``` ```{code-cell} ipython3 :tags: [output_scroll] import hanlp generation = hanlp.load(hanlp.pretrained.amr2text.AMR3_GRAPH_PRETRAIN_GENERATION) print(generation(''' (z0 / want-01 :ARG0 (z1 / boy) :ARG1 (z2 / believe-01 :ARG0 (z3 / girl) :ARG1 z1)) ''')) ``` All the pre-trained parsers and their scores are listed below. ```{eval-rst} .. automodule:: hanlp.pretrained.amr2text :members: ``` ================================================ FILE: docs/api/hanlp/pretrained/constituency.md ================================================ --- jupytext: formats: ipynb,md:myst text_representation: extension: .md format_name: myst format_version: '0.8' jupytext_version: 1.4.2 kernelspec: display_name: Python 3 language: python name: python3 --- # constituency Constituency Parsing is the process of analyzing the sentences by breaking down it into sub-phrases also known as constituents. 
To parse a tokenized sentence into constituency tree, first load a parser: ```{eval-rst} .. margin:: Batching is Faster .. Hint:: To speed up, parse multiple sentences at once, and use a GPU. ``` ```{code-cell} ipython3 :tags: [output_scroll] import hanlp con = hanlp.load(hanlp.pretrained.constituency.CTB9_CON_FULL_TAG_ELECTRA_SMALL) ``` Then parse a sequence or multiple sequences of tokens to it. ```{code-cell} ipython3 :tags: [output_scroll] tree = con(["2021年", "HanLPv2.1", "带来", "最", "先进", "的", "多", "语种", "NLP", "技术", "。"]) ``` The constituency tree is a nested list of constituencies: ```{code-cell} ipython3 :tags: [output_scroll] tree ``` You can `str` or `print` it to get its bracketed form: ```{code-cell} ipython3 :tags: [output_scroll] print(tree) ``` All the pre-trained parsers and their scores are listed below. ```{eval-rst} .. automodule:: hanlp.pretrained.constituency :members: ``` ================================================ FILE: docs/api/hanlp/pretrained/dep.md ================================================ # dep ```{eval-rst} .. automodule:: hanlp.pretrained.dep :members: ``` ================================================ FILE: docs/api/hanlp/pretrained/eos.md ================================================ # eos ```{eval-rst} .. automodule:: hanlp.pretrained.eos :members: ``` ================================================ FILE: docs/api/hanlp/pretrained/fasttext.md ================================================ # fasttext ```{eval-rst} .. automodule:: hanlp.pretrained.fasttext :members: ``` ================================================ FILE: docs/api/hanlp/pretrained/glove.md ================================================ # glove ```{eval-rst} .. automodule:: hanlp.pretrained.glove :members: ``` ================================================ FILE: docs/api/hanlp/pretrained/index.md ================================================ # pretrained ```{eval-rst} NLP components grouped by tasks. 
For each task, we provide at least one :class:`~hanlp.common.component.Component` compatible class and several pretrained models. Each of them is stored in a Python constant which can be fetched using :meth:`hanlp.load`. ``` ```{toctree} mtl eos tok pos ner dep constituency srl sdp amr amr2text sts word2vec glove fasttext mlm ``` ================================================ FILE: docs/api/hanlp/pretrained/mlm.md ================================================ --- jupytext: formats: ipynb,md:myst text_representation: extension: .md format_name: myst format_version: '0.8' jupytext_version: 1.4.2 kernelspec: display_name: Python 3 language: python name: python3 --- # mlm Masked Language Model (MLM) predicts words that were originally hidden intentionally in a sentence. To perform such prediction, first load a pre-trained MLM (e.g., `bert-base-chinese`): ````{margin} Batching is Faster ```{hint} Predict multiple sentences in batch mode for faster speed! ``` ```` ````{margin} Multilingual Support ```{note} HanLP always support multilingual. Feel free to use a multilingual model listed [here](https://huggingface.co/models?pipeline_tag=fill-mask&sort=downloads). ``` ```` ```{code-cell} ipython3 :tags: [output_scroll] from hanlp.components.lm.mlm import MaskedLanguageModel mlm = MaskedLanguageModel() mlm.load('bert-base-chinese') ``` Represent blanks (masked tokens) with `[MASK]` and let MLM fills them: ```{code-cell} ipython3 :tags: [output_scroll] mlm('生活的真谛是[MASK]。') ``` Batching is always faster: ```{code-cell} ipython3 :tags: [output_scroll] mlm(['生活的真谛是[MASK]。', '巴黎是[MASK][MASK]的首都。']) ``` All the pre-trained MLM models and their details are listed in the [docs](https://huggingface.co/models?pipeline_tag=fill-mask&sort=downloads) of Hugging Face 🤗 Transformers. ================================================ FILE: docs/api/hanlp/pretrained/mtl.md ================================================ # mtl ```{eval-rst} .. 
automodule:: hanlp.pretrained.mtl :members: ``` ================================================ FILE: docs/api/hanlp/pretrained/ner.md ================================================ # ner ```{eval-rst} .. automodule:: hanlp.pretrained.ner :members: ``` ================================================ FILE: docs/api/hanlp/pretrained/pos.md ================================================ --- jupytext: formats: ipynb,md:myst text_representation: extension: .md format_name: myst format_version: '0.8' jupytext_version: 1.4.2 kernelspec: display_name: Python 3 language: python name: python3 --- # pos The process of classifying words into their **parts of speech** and labeling them accordingly is known as **part-of-speech tagging**, **POS-tagging**, or simply **tagging**. To tag a tokenized sentence: ````{margin} Batching is Faster ```{hint} Tag multiple sentences at once for faster speed! ``` ```` ```{code-cell} ipython3 :tags: [output_scroll] import hanlp pos = hanlp.load(hanlp.pretrained.pos.CTB9_POS_ELECTRA_SMALL) pos(['我', '的', '希望', '是', '希望', '世界', '和平']) ``` ````{margin} Custom Dictionary Supported ```{seealso} See [this tutorial](https://github.com/hankcs/HanLP/blob/master/plugins/hanlp_demo/hanlp_demo/zh/demo_pos_dict.py) for custom dictionary. ``` ```` All the pre-trained taggers and their details are listed below. ```{eval-rst} .. automodule:: hanlp.pretrained.pos :members: ``` ================================================ FILE: docs/api/hanlp/pretrained/sdp.md ================================================ # sdp ```{eval-rst} .. 
automodule:: hanlp.pretrained.sdp :members: ``` ================================================ FILE: docs/api/hanlp/pretrained/srl.md ================================================ --- jupytext: formats: ipynb,md:myst text_representation: extension: .md format_name: myst format_version: '0.8' jupytext_version: 1.4.2 kernelspec: display_name: Python 3 language: python name: python3 --- # srl Semantic Role Labeling (SRL) is one shallow semantic parsing that produces predicate-argument structures which are semantic roles (or participants) such as agent, patient, and theme associated with verbs. Inputs to SRL are tokenized sentences: ````{margin} Batching is Faster ```{hint} Feed in multiple sentences at once for faster speed! ``` ```` ```{code-cell} ipython3 :tags: [output_scroll] import hanlp srl = hanlp.load(hanlp.pretrained.srl.CPB3_SRL_ELECTRA_SMALL) srl(['男孩', '希望', '女孩', '相信', '他', '。']) ``` All the pre-trained labelers and their details are listed below. ```{eval-rst} .. automodule:: hanlp.pretrained.srl :members: ``` ================================================ FILE: docs/api/hanlp/pretrained/sts.md ================================================ --- jupytext: formats: ipynb,md:myst text_representation: extension: .md format_name: myst format_version: '0.8' jupytext_version: 1.4.2 kernelspec: display_name: Python 3 language: python name: python3 --- # sts `sts` package holds pre-trained Semantic Textual Similarity (STS) models. We surveyed both supervised and unsupervised models and we believe that unsupervised models are still immature at this moment. Unsupervised STS is good for IR but not NLP especially on sentences with little lexical overlap. ```{eval-rst} .. 
automodule:: hanlp.pretrained.sts :members: ``` ```{code-cell} ipython3 import hanlp sim = hanlp.load(hanlp.pretrained.sts.STS_ELECTRA_BASE_ZH) sim([ ['看图猜一电影名', '看图猜电影'], ['无线路由器怎么无线上网', '无线上网卡和无线路由器怎么用'], ['北京到上海的动车票', '上海到北京的动车票'], ]) ``` ================================================ FILE: docs/api/hanlp/pretrained/tok.md ================================================ --- jupytext: formats: ipynb,md:myst text_representation: extension: .md format_name: myst format_version: '0.8' jupytext_version: 1.4.2 kernelspec: display_name: Python 3 language: python name: python3 --- # tok Tokenization is a way of separating a sentence into smaller units called tokens. In lexical analysis, tokens usually refer to words. ````{margin} Batching is Faster ```{hint} Tokenize multiple sentences at once for faster speed! ``` ```` ````{margin} Custom Dictionary Supported ```{seealso} See [this tutorial](https://github.com/hankcs/HanLP/blob/master/plugins/hanlp_demo/hanlp_demo/zh/demo_custom_dict.py) for custom dictionary. ``` ```` To tokenize raw sentences: ```{code-cell} ipython3 :tags: [output_scroll] import hanlp tok = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH) tok(['商品和服务。', '晓美焰来到北京立方庭参观自然语义科技公司']) ``` All the pre-trained tokenizers and their details are listed below. ```{eval-rst} .. automodule:: hanlp.pretrained.tok :members: ``` ================================================ FILE: docs/api/hanlp/pretrained/word2vec.md ================================================ --- jupytext: formats: ipynb,md:myst text_representation: extension: .md format_name: myst format_version: '0.8' jupytext_version: 1.4.2 kernelspec: display_name: Python 3 language: python name: python3 --- # word2vec Word2Vec is a family of model architectures and optimizations that can be used to learn word embeddings from large unlabeled datasets. In this document, it is narrowly defined as a component to map discrete words to distributed representations which are dense vectors. 
To perform such mapping: ````{margin} Batching is Faster ```{hint} Map multiple tokens in batch mode for faster speed! ``` ```` ````{margin} Multilingual Support ```{note} HanLP always support multilingual. Feel free to use a multilingual model listed [here](http://vectors.nlpl.eu/repository/). ``` ```` ```{code-cell} ipython3 :tags: [output_scroll] import hanlp word2vec = hanlp.load(hanlp.pretrained.word2vec.CONVSEG_W2V_NEWS_TENSITE_WORD_PKU) word2vec('先进') ``` These vectors have already been normalized to facilitate similarity computation: ```{code-cell} ipython3 :tags: [output_scroll] import torch print(torch.nn.functional.cosine_similarity(word2vec('先进'), word2vec('优秀'), dim=0)) print(torch.nn.functional.cosine_similarity(word2vec('先进'), word2vec('水果'), dim=0)) ``` Using these similarity scores, the most similar words can be found: ```{code-cell} ipython3 :tags: [output_scroll] word2vec.most_similar('上海') ``` Word2Vec usually can not process OOV or phrases: ```{code-cell} ipython3 :tags: [output_scroll] word2vec.most_similar('非常寒冷') # phrases are usually OOV ``` Doc2Vec, as opposite to Word2Vec model, can create a vectorised representation by averaging a group of words. To enable Doc2Vec for OOV and phrases, pass `doc2vec=True`: ```{code-cell} ipython3 :tags: [output_scroll] word2vec.most_similar('非常寒冷', doc2vec=True) ``` All the pre-trained word2vec models and their details are listed below. ```{eval-rst} .. automodule:: hanlp.pretrained.word2vec :members: ``` ================================================ FILE: docs/api/hanlp/utils/index.md ================================================ # utils Utilities. ```{toctree} io_util ``` ================================================ FILE: docs/api/hanlp/utils/io_util.md ================================================ # io_util ```{eval-rst} .. currentmodule:: hanlp.utils .. 
automodule:: hanlp.utils.io_util :members: ``` ================================================ FILE: docs/api/restful.rst ================================================ .. _api/hanlp_restful: hanlp_restful ==================== .. currentmodule:: hanlp_restful .. autoclass:: HanLPClient :members: :special-members: :exclude-members: __init__, __repr__, __weakref__ ================================================ FILE: docs/api/restful_golang.md ================================================ # Golang RESTful API ## Install ```shell script go get -u github.com/hankcs/gohanlp@main ``` ## Quick Start Obtain an `auth` from any compatible service provider like our [free service](https://bbs.hankcs.com/t/apply-for-free-hanlp-restful-apis/3178), then initiate a `HanLPClient` and call its `Parse` interface. ```java package main import ( "fmt" "github.com/hankcs/gohanlp/hanlp" ) func main() { client := hanlp.HanLPClient(hanlp.WithAuth("The auth you applied for")) // anonymous users can skip auth s, _ := client.Parse("In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environments.",hanlp.WithLanguage("mul")) fmt.Println(s) } ``` Refer to our [testcases](https://github.com/hankcs/gohanlp/blob/main/main_test.go) and [data format](../data_format) for more details. ================================================ FILE: docs/api/restful_java.md ================================================ # Java RESTful API Add the following dependency into the `pom.xml` file of your project. ```xml com.hankcs.hanlp.restful hanlp-restful 0.0.15 ``` Obtain an `auth` from any compatible service provider like our [free service](https://bbs.hankcs.com/t/apply-for-free-hanlp-restful-apis/3178), then initiate a `HanLPClient` and call its `parse` interface. 
```java HanLPClient client = new HanLPClient("https://hanlp.hankcs.com/api", null); // Replace null with your auth System.out.println(client.parse("2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。晓美焰来到北京立方庭参观自然语义科技公司。")); ``` Refer to our [testcases](https://github.com/hankcs/HanLP/blob/master/plugins/hanlp_restful_java/src/test/java/com/hankcs/hanlp/restful/HanLPClientTest.java) and [data format](../data_format) for more details. ================================================ FILE: docs/api/trie/dictionary.md ================================================ # dictionary ```{eval-rst} .. currentmodule:: hanlp_trie .. autoclass:: hanlp_trie.dictionary.DictInterface :members: .. autoclass:: hanlp_trie.dictionary.TrieDict :members: ``` ================================================ FILE: docs/api/trie/index.md ================================================ # hanlp_trie HanLP trie/dictionary interface and referential implementation. ```{toctree} trie dictionary ``` ================================================ FILE: docs/api/trie/trie.md ================================================ # trie ```{eval-rst} .. currentmodule:: hanlp_trie .. autoclass:: hanlp_trie.trie.Node :members: .. autoclass:: hanlp_trie.trie.Trie :members: ``` ================================================ FILE: docs/conf.py ================================================ # -- Project information ----------------------------------------------------- import sys import os from datetime import datetime sys.path.append(os.path.abspath('..')) sys.path.append(os.path.abspath('../plugins/hanlp_common')) sys.path.append(os.path.abspath('../plugins/hanlp_trie')) sys.path.append(os.path.abspath('../plugins/hanlp_restful')) import hanlp project = 'HanLP' copyright = f'2020-{datetime.now().year}, hankcs' author = 'hankcs' # The short X.Y version. version = hanlp.__version__ # The full version, including alpha/beta/rc tags. release = hanlp.__version__ # The language for content autogenerated by Sphinx. 
Refer to documentation # for a list of supported languages. language = 'en' master_doc = "index" # -- General configuration --------------------------------------------------- # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ "myst_nb", "sphinx_copybutton", "sphinx_togglebutton", "sphinxcontrib.bibtex", 'sphinx_astrorefs', # astrophysics style, similar to ACL "sphinx_thebe", "sphinx.ext.autodoc", "sphinx.ext.intersphinx", "sphinx.ext.viewcode", "ablog", 'sphinx.ext.napoleon', ] # Add any paths that contain templates here, relative to this directory. templates_path = ["_templates"] # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] intersphinx_mapping = { "python": ("https://docs.python.org/3.8", None), "sphinx": ("https://www.sphinx-doc.org/en/3.x", None), } nitpick_ignore = [ ("py:class", "docutils.nodes.document"), ("py:class", "docutils.parsers.rst.directives.body.Sidebar"), ] autoclass_content = 'both' numfig = True myst_admonition_enable = True myst_deflist_enable = True myst_url_schemes = ("http", "https", "mailto") panels_add_bootstrap_css = False # -- Options for HTML output ------------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. 
# html_theme = "sphinx_book_theme" html_title = "HanLP Documentation" html_logo = "_static/logo.png" html_favicon = "_static/favicon.png" html_copy_source = True html_sourcelink_suffix = "" html_sidebars = { # "reference/blog/*": [ # "sidebar-search-bs.html", # "postcard.html", # "recentposts.html", # "tagcloud.html", # "categories.html", # "archives.html", # "sbt-sidebar-nav.html", # "sbt-sidebar-footer.html", # ] } # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ["_static"] jupyter_execute_notebooks = "cache" thebe_config = { "repository_url": "https://github.com/binder-examples/jupyter-stacks-datascience", "repository_branch": "master", } html_theme_options = { "theme_dev_mode": False, "path_to_docs": "docs", "repository_url": "https://github.com/hankcs/HanLP", # "repository_branch": "gh-pages", # For testing # "launch_buttons": { # # "binderhub_url": "https://mybinder.org", # # "jupyterhub_url": "https://datahub.berkeley.edu", # For testing # "colab_url": "https://colab.research.google.com/", # "notebook_interface": "jupyterlab", # "thebe": True, # }, "use_edit_page_button": True, "use_issues_button": True, "use_repository_button": True, "use_download_button": True, # For testing # "home_page_in_toc": True, # "single_page": True, # "extra_footer": "Test", # DEPRECATED KEY # "extra_navbar": "Test", } html_baseurl = "https://hanlp.hankcs.com/docs/" # -- ABlog config ------------------------------------------------- blog_path = "reference/blog" blog_post_pattern = "reference/blog/*.md" blog_baseurl = "https://hanlp.hankcs.com/docs/" fontawesome_included = True post_auto_image = 1 post_auto_excerpt = 2 execution_show_tb = "READTHEDOCS" in os.environ # Localization nb_render_priority = { "gettext": ( "application/vnd.jupyter.widget-view+json", "application/javascript", 
"text/html", "image/svg+xml", "image/png", "image/jpeg", "text/markdown", "text/latex", "text/plain", ) } locale_dirs = ['locale/'] # bibtex bibtex_default_style = 'unsrtalpha' ================================================ FILE: docs/configure.md ================================================ # Configuration ## Customize ``HANLP_HOME`` All resources HanLP use will be cached into a directory called `HANLP_HOME`. It is an environment variable which you can customize to any path you like. By default, `HANLP_HOME` resolves to `~/.hanlp` and `%appdata%\hanlp` on *nix and Windows respectively. If you want to redirect `HANLP_HOME` to a different location, say `/data/hanlp`, the following shell command can be very helpful. ```bash export HANLP_HOME=/data/hanlp ``` ## Use GPUs By default, HanLP tries to use the least occupied GPU so that mostly you don't need to worry about it, HanLP makes the best choice for you. This behavior is very useful when you're using a public server shared across your lab or company with your colleagues. HanLP also honors the ``CUDA_VISIBLE_DEVICES`` used by PyTorch and TensorFlow to limit which devices HanLP can choose from. For example, the following command will only keep the `0`th and `1`st GPUs. ```bash export CUDA_VISIBLE_DEVICES=0,1 ``` ```{eval-rst} If you need fine grained control over each component, ``hanlp.load(..., devices=...)`` is what you're looking for. See documents for :meth:`hanlp.load`. ``` ### External Resources For deep learning beginners, you might need to learn how to set up a working GPU environment first. Here are some resources. - [CUDA Toolkit](https://developer.nvidia.com/cuda-toolkit) - It's a good practice to install the driver shipped with a CUDA package. - [PyTorch](https://pytorch.org/get-started/locally/) - If no existing PyTorch found, `pip install hanlp` will have the CPU-only PyTorch installed, which is universal and assumes no GPU or CUDA dependencies. 
- You will need to install a GPU-enabled PyTorch according to your CUDA and OS versions. - Cloud servers - There are many cloud services providing out-of-the-box deep learning images. HanLP works fine on these platforms. They could save your time and efforts. - Google Colab - Colab allows you to write excutable notebooks with full GPU support. PyTorch and TensorFlow have been pre-installed and configured to the best state. - In fact, you can click [![Open In Colab](https://file.hankcs.com/img/colab-badge.svg)](https://colab.research.google.com/drive/1KPX6t1y36TOzRIeB4Kt3uJ1twuj6WuFv?usp=sharing) to play with the GPU-enabled HanLP tutorial right now. ## Use Mirror Sites By default, models are downloaded from a global CDN we maintain. However, in some regions the downloading speed can be slow occasionally. If you happen to be in one of those regions, you can find some third party mirror sites on our [bbs](https://bbs.hankcs.com/). When you find a working URL, say [https://ftp.hankcs.com/hanlp/](https://ftp.hankcs.com/hanlp/), you can set a `HANLP_URL` environment variable and HanLP will pick it up at the next startup. ```bash export HANLP_URL=https://ftp.hankcs.com/hanlp/ ``` ## Control Verbosity By default, HanLP will print progressive message to the console when you load a model. If you want to silence it, use the following environment variable. ```bash export HANLP_VERBOSE=0 ``` ================================================ FILE: docs/contributing.md ================================================ # Contributing Guide Thank you for being interested in contributing to `HanLP`! You are awesome ✨. This guideline contains information about our conventions around coding style, pull request workflow, commit messages and more. This page also contains information to help you get started with development on this project. 
## Development ### Set-up Get the source code of this project using git: ```bash git clone https://github.com/hankcs/HanLP --branch master cd HanLP pip install -e plugins/hanlp_trie pip install -e plugins/hanlp_common pip install -e plugins/hanlp_restful pip install -e . ``` To work on this project, you need Python 3.6 or newer. ### Running Tests This project has a test suite to ensure certain important APIs work properly. The tests can be run using: ```bash python -m unittest discover ./tests ``` ```{tip} It's hard to cover every API especially those of deep learning models, due to the limited computation resource of CI. However, we suggest all inference APIs to be tested at least. ``` ## Repository Structure This repository is a split into a few critical folders: hanlp/ : The HanLP core package, containing the Python code. plugins/ : Contains codes shared across several individual packages or non core APIs. docs/ : The documentation for HanLP, which is in markdown format mostly. : The build configuration is contained in `conf.py`. tests/ : Testing infrastructure that uses `unittest` to ensure the output of API is what we expect it to be. .github/ : Contains Continuous-integration (CI) workflows, run on commits/PRs to the GitHub repository. ================================================ FILE: docs/data_format.md ================================================ --- jupytext: formats: ipynb,md:myst text_representation: extension: .md format_name: myst format_version: '0.8' jupytext_version: 1.4.2 kernelspec: display_name: Python 3 language: python name: python3 --- # Data Format ## Input Format ### RESTful Input #### Definition To make a RESTful call, one needs to send a `json` HTTP POST request to the server, which contains at least a `text` field or a `tokens` field. The input to RESTful API is very flexible. It can be one of the following 3 formats: 1. It can be a document of raw `str` filled into `text`. The server will split it into sentences. 1. 
It can be a `list` of sentences, each sentence is a raw `str`, filled into `text`. 1. It can be a `list` of tokenized sentences, each sentence is a list of `str` typed tokens, filled into `tokens`. ```{eval-rst} Additionally, fine-grained controls are performed with the arguments defined in :meth:`hanlp_restful.HanLPClient.parse`. ``` #### Examples ```shell script curl -X 'POST' \ 'https://hanlp.hankcs.com/api/parse' \ -H 'accept: application/json' \ -H 'Content-Type: application/json' \ -d '{ "language": "zh", "text": "HanLP为生产环境带来次世代最先进的多语种NLP技术。晓美焰来到北京参观自然语义科技公司。" }' ``` ### Model Input ````{margin} **How about training inputs?** ```{seealso} We mostly follow the conventional file format of each NLP task instead of re-inventing them. Thus, we use `.tsv` for tagging and `.conllu` for parsing etc. For more details, refer to [datasets](https://hanlp.hankcs.com/docs/api/hanlp/datasets/index.html). ``` ```` The input format to models is specified per model and per task. Generally speaking, if a model has no tokenizer built in, then its input is a sentence in `list[str]` form (a list of tokens), or multiple such sentences nested in a `list`. If a model has a tokenizer built in, each sentence is in `str` form. Additionally, you can use `skip_tasks='tok*'` to ask the model to use your tokenized inputs instead of tokenizing them, in which case, each of your sentence needs to be in `list[str]` form, as if there was no tokenizer. ```{eval-rst} For any model, its input is of sentence level, which means you have to split a document into sentences beforehand. You may want to try :class:`~hanlp.components.eos.ngram.NgramSentenceBoundaryDetector` for sentence splitting. ``` ## Output Format ```{eval-rst} The outputs of both :class:`~hanlp_restful.HanLPClient` and :class:`~hanlp.components.mtl.multi_task_learning.MultiTaskLearning` are unified as the same :class:`~hanlp_common.document.Document` format. ``` For example, the following RESTful codes will output such an instance. 
```{code-cell} ipython3 :tags: [output_scroll] from hanlp_restful import HanLPClient HanLP = HanLPClient('https://hanlp.hankcs.com/api', auth=None) # Fill in your auth print(HanLP('2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。晓美焰来到北京立方庭参观自然语义科技公司。')) ``` The outputs above is represented as a `json` dictionary where each key is a task name and its value is the output of the corresponding task. For each output, if it's a nested `list` then it contains multiple sentences otherwise it's just one single sentence. We make the following naming convention of NLP tasks, each consists of 3 letters. ````{margin} **How about annotations?** ```{seealso} Each NLP task can exploit multiple datasets with their annotations, see our [annotations](annotations/index) for details. ``` ```` ### Naming Convention | key | Task | Chinese | | ---- | ------------------------------------------------------------ | ------------ | | tok | Tokenization. Each element is a token. | 分词 | | pos | Part-of-Speech Tagging. Each element is a tag. | 词性标注 | | lem | Lemmatization. Each element is a lemma. | 词干提取 | | fea | Features of Universal Dependencies. Each element is a feature. | 词法语法特征 | | ner | Named Entity Recognition. Each element is a tuple of `(entity, type, begin, end)`, where `end`s are exclusive offsets. | 命名实体识别 | | dep | Dependency Parsing. Each element is a tuple of `(head, relation)` where `head` starts with index `0` (which is `ROOT`). | 依存句法分析 | | con | Constituency Parsing. Each list is a bracketed constituent. | 短语成分分析 | | srl | Semantic Role Labeling. Similar to `ner`, each element is a tuple of `(arg/pred, label, begin, end)`, where the predicate is labeled as `PRED`. | 语义角色标注 | | sdp | Semantic Dependency Parsing. Similar to `dep`, however each token can have any number (including zero) of heads and corresponding relations. | 语义依存分析 | | amr | Abstract Meaning Representation. Each AMR graph is represented as list of logical triples. 
See [AMR guidelines](https://github.com/amrisi/amr-guidelines/blob/master/amr.md#example). | 抽象意义表示 | When there are multiple models performing the same task, their keys are appended with a secondary identifier. For example, `tok/fine` and `tok/coarse` mean a fine-grained tokenization model and a coarse-grained one respectively. ================================================ FILE: docs/index.md ================================================ # HanLP: Han Language Processing [![GitHub stars](https://img.shields.io/github/stars/hankcs/HanLP)](https://github.com/hankcs/HanLP/stargazers) [![GitHub forks](https://img.shields.io/github/forks/hankcs/HanLP)](https://github.com/hankcs/HanLP/network) ![pypi](https://img.shields.io/pypi/v/HanLP) [![Downloads](https://static.pepy.tech/badge/HanLP)](https://pepy.tech/project/HanLP) [![GitHub license](https://img.shields.io/github/license/hankcs/HanLP)](https://github.com/hankcs/HanLP/blob/master/LICENSE) [![Open In Colab](https://file.hankcs.com/img/colab-badge.svg)](https://colab.research.google.com/drive/1KPX6t1y36TOzRIeB4Kt3uJ1twuj6WuFv?usp=sharing) The multilingual NLP library for researchers and companies, built on PyTorch and TensorFlow 2.x, for advancing state-of-the-art deep learning techniques in both academia and industry. HanLP was designed from day one to be efficient, user friendly and extendable. It comes with pretrained models for various human languages including English, Chinese, Japanese and many others. 
## Tutorials ```{toctree} :maxdepth: 1 :caption: Introduction tutorial install configure data_format annotations/index contributing Live Demo ``` ## Python API ```{toctree} :caption: Python API :maxdepth: 2 api/hanlp/index api/common/index api/restful api/trie/index ``` ## Java API ```{toctree} :maxdepth: 1 :caption: Java API 1.x API api/restful_java ``` ## Golang API ```{toctree} :maxdepth: 1 :caption: Golang API api/restful_golang ``` ## References ```{toctree} :caption: References :maxdepth: 2 references ``` ## Acknowledgements HanLPv2.1 is heavily inspired by [AllenNLP](https://allennlp.org/) and [SuPar](https://pypi.org/project/supar/). [pypi-badge]: https://img.shields.io/pypi/v/hanlp.svg [pypi-link]: https://pypi.org/project/hanlp ================================================ FILE: docs/install.md ================================================ # Install ```{figure} _static/install-versions.svg --- width: 100% figclass: caption alt: HanLP versions name: hanlp-versions --- Choose your HanLP version ``` ## Install RESTful Packages [![Downloads](https://static.pepy.tech/badge/hanlp-restful)](https://pepy.tech/project/hanlp-restful) [![Downloads](https://static.pepy.tech/badge/hanlp-restful/month)](https://pepy.tech/project/hanlp-restful) [![Downloads](https://static.pepy.tech/badge/hanlp-restful/week)](https://pepy.tech/project/hanlp-restful) ```{eval-rst} .. margin:: **Beginners Attention** .. Hint:: New to NLP? Just install RESTful packages and call :meth:`~hanlp_restful.HanLPClient.parse` without pain. ``` For beginners, the recommended RESTful packages are easier to start with. The only requirement is [an auth key](https://bbs.hankcs.com/t/apply-for-free-hanlp-restful-apis/3178). We officially released the following language bindings: ### Python ```shell script pip install hanlp_restful ``` ### Java See [Java instructions](https://hanlp.hankcs.com/docs/api/restful_java.html). 
### Golang See [Golang instructions](https://hanlp.hankcs.com/docs/api/restful_golang.html). ## Install Native Package [![Downloads](https://static.pepy.tech/badge/hanlp)](https://pepy.tech/project/hanlp) [![Downloads](https://static.pepy.tech/badge/hanlp/month)](https://pepy.tech/project/hanlp) [![Downloads](https://static.pepy.tech/badge/hanlp/week)](https://pepy.tech/project/hanlp) The native package running locally can be installed via pip. ````{margin} **Install from Source** ```{note} See [developer guideline](https://hanlp.hankcs.com/docs/contributing.html#development). ``` ```` ``` pip install hanlp ``` HanLP requires Python 3.6 or later. GPU/TPU is suggested but not mandatory. Depending on your preference, HanLP offers the following flavors: ````{margin} **Windows Support** ```{note} Installation on Windows is **perfectly** supported. No need to install Microsoft Visual C++ Build Tools anymore. ``` ```` ````{margin} **Apple Silicon** ```{note} HanLP also perfectly supports accelerating on Apple Silicon M1 chips, see [tutorial](https://www.hankcs.com/nlp/hanlp-official-m1-support.html). ``` ```` | Flavor | Description | | ------- | ------------------------------------------------------------ | | default | This installs the default version which delivers the most commonly used functionalities. However, some heavy dependencies like TensorFlow are not installed. | | tf | This installs TensorFlow and fastText. | | amr | To support Abstract Meaning Representation (AMR) models, this installs AMR related dependencies like `penman`. | | full | For experts who seek to maximize the efficiency via TensorFlow and C++ extensions, `pip install hanlp[full]` installs all the above dependencies. | ## Install Models In short, you don't need to manually install any model. Instead, they are automatically downloaded to a directory called [`HANLP_HOME`](https://hanlp.hankcs.com/docs/configure.html#customize-hanlp-home) when you call `hanlp.load`. 
Occasionally, some errors might occur the first time you load a model, in which case you can refer to the following tips. ### Download Error #### HanLP Models If the auto-download of a HanLP model fails, you can either: 1. Retry as our file server might be busy serving users from all over the world. 1. Follow the message on your terminal, which often guides you to manually download a `zip` file to a particular path. 1. Use a [mirror site](https://hanlp.hankcs.com/docs/configure.html#use-mirror-sites) which could be faster and more stable in your region. #### Hugging Face 🤗 Transformers Models If the auto-download of a Hugging Face 🤗 Transformers model fails, e.g., the following exception is thrown: ```bash lib/python3.8/site-packages/transformers/file_utils.py", line 2102, in get_from_cache raise ValueError( ValueError: Connection error, and we cannot find the requested files in the cached path. Please try again or make sure your Internet connection is on. ``` You can either: 1. Retry as the Internet is quite unstable in some regions (e.g., China). 2. Force Hugging Face 🤗 Transformers to use cached models instead of checking updates from the Internet **if you have ever successfully loaded it before**, by setting the following environment variable: ```bash export TRANSFORMERS_OFFLINE=1 ``` ### Server without Internet If your server has no Internet access at all, just debug your code on your local PC and copy the following directories to your server via a USB disk or something. 1. `~/.hanlp`: the home directory for HanLP models. 1. `~/.cache/huggingface`: the home directory for Hugging Face 🤗 Transformers. ### Import Error Some TensorFlow/fastText models will ask you to install the missing TensorFlow/fastText modules, in which case you'll need to install the full version: ```shell script pip install hanlp[full] ``` ```{danger} NEVER install third-party packages (TensorFlow/fastText etc.) 
by yourself, as higher or lower versions of third-party packages have not been tested and might not work properly. ``` ================================================ FILE: docs/references.bib ================================================ %% This BibTeX bibliography file was created using BibDesk. %% https://bibdesk.sourceforge.io/ %% Created for hankcs at 2022-12-07 15:02:16 -0500 %% Saved with string encoding Unicode (UTF-8) @inproceedings{bai-etal-2022-graph, address = {Dublin, Ireland}, author = {Bai, Xuefeng and Chen, Yulong and Zhang, Yue}, booktitle = {Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)}, date-added = {2022-12-07 15:02:15 -0500}, date-modified = {2022-12-07 15:02:15 -0500}, month = may, pages = {6001--6015}, publisher = {Association for Computational Linguistics}, title = {Graph Pre-training for {AMR} Parsing and Generation}, url = {https://aclanthology.org/2022.acl-long.415}, year = {2022}, bdsk-url-1 = {https://aclanthology.org/2022.acl-long.415}} @inproceedings{wang-etal-2021-minilmv2, address = {Online}, author = {Wang, Wenhui and Bao, Hangbo and Huang, Shaohan and Dong, Li and Wei, Furu}, booktitle = {Findings of the Association for Computational Linguistics: ACL-IJCNLP 2021}, date-added = {2022-06-14 20:10:18 -0400}, date-modified = {2022-06-14 20:10:18 -0400}, doi = {10.18653/v1/2021.findings-acl.188}, month = aug, pages = {2140--2151}, publisher = {Association for Computational Linguistics}, title = {{M}ini{LM}v2: Multi-Head Self-Attention Relation Distillation for Compressing Pretrained Transformers}, url = {https://aclanthology.org/2021.findings-acl.188}, year = {2021}, bdsk-url-1 = {https://aclanthology.org/2021.findings-acl.188}, bdsk-url-2 = {https://doi.org/10.18653/v1/2021.findings-acl.188}} @article{zhang2021mengzi, author = {Zhang, Zhuosheng and Zhang, Hanqing and Chen, Keming and Guo, Yuhang and Hua, Jingyun and Wang, Yulong and Zhou, Ming}, date-added = 
{2022-04-15 10:32:14 -0400}, date-modified = {2022-04-15 10:32:14 -0400}, journal = {arXiv preprint arXiv:2110.06696}, title = {Mengzi: Towards Lightweight yet Ingenious Pre-trained Models for Chinese}, year = {2021}} @inproceedings{samuel-straka-2020-ufal, abstract = {We present PERIN, a novel permutation-invariant approach to sentence-to-graph semantic parsing. PERIN is a versatile, cross-framework and language independent architecture for universal modeling of semantic structures. Our system participated in the CoNLL 2020 shared task, Cross-Framework Meaning Representation Parsing (MRP 2020), where it was evaluated on five different frameworks (AMR, DRG, EDS, PTG and UCCA) across four languages. PERIN was one of the winners of the shared task. The source code and pretrained models are available at http://www.github.com/ufal/perin.}, address = {Online}, author = {Samuel, David and Straka, Milan}, booktitle = {Proceedings of the CoNLL 2020 Shared Task: Cross-Framework Meaning Representation Parsing}, date-added = {2022-04-12 22:36:23 -0400}, date-modified = {2022-04-12 22:36:23 -0400}, doi = {10.18653/v1/2020.conll-shared.5}, month = nov, pages = {53--64}, publisher = {Association for Computational Linguistics}, title = {{{\'U}FAL} at {MRP} 2020: Permutation-invariant Semantic Parsing in {PERIN}}, url = {https://aclanthology.org/2020.conll-shared.5}, year = {2020}, bdsk-url-1 = {https://aclanthology.org/2020.conll-shared.5}, bdsk-url-2 = {https://doi.org/10.18653/v1/2020.conll-shared.5}} @inproceedings{qiu-etal-2014-multi, address = {Dublin, Ireland}, author = {Qiu, Likun and Zhang, Yue and Jin, Peng and Wang, Houfeng}, booktitle = {Proceedings of {COLING} 2014, the 25th International Conference on Computational Linguistics: Technical Papers}, date-added = {2022-02-15 04:42:58 -0500}, date-modified = {2022-02-15 04:42:58 -0500}, month = aug, pages = {257--268}, publisher = {Dublin City University and Association for Computational Linguistics}, title = {Multi-view 
{C}hinese Treebanking}, url = {https://aclanthology.org/C14-1026}, year = {2014}, bdsk-url-1 = {https://aclanthology.org/C14-1026}} @inproceedings{li-etal-2018-analogical, abstract = {Analogical reasoning is effective in capturing linguistic regularities. This paper proposes an analogical reasoning task on Chinese. After delving into Chinese lexical knowledge, we sketch 68 implicit morphological relations and 28 explicit semantic relations. A big and balanced dataset CA8 is then built for this task, including 17813 questions. Furthermore, we systematically explore the influences of vector representations, context features, and corpora on analogical reasoning. With the experiments, CA8 is proved to be a reliable benchmark for evaluating Chinese word embeddings.}, address = {Melbourne, Australia}, author = {Li, Shen and Zhao, Zhe and Hu, Renfen and Li, Wensi and Liu, Tao and Du, Xiaoyong}, booktitle = {Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)}, date-added = {2022-01-30 22:52:52 -0500}, date-modified = {2022-01-30 22:52:52 -0500}, doi = {10.18653/v1/P18-2023}, month = jul, pages = {138--143}, publisher = {Association for Computational Linguistics}, title = {Analogical Reasoning on {C}hinese Morphological and Semantic Relations}, url = {https://aclanthology.org/P18-2023}, year = {2018}, bdsk-url-1 = {https://aclanthology.org/P18-2023}, bdsk-url-2 = {https://doi.org/10.18653/v1/P18-2023}} @inproceedings{NIPS2013_9aa42b31, author = {Mikolov, Tomas and Sutskever, Ilya and Chen, Kai and Corrado, Greg S and Dean, Jeff}, booktitle = {Advances in Neural Information Processing Systems}, date-added = {2022-01-30 18:17:28 -0500}, date-modified = {2022-01-30 18:17:28 -0500}, editor = {C. J. C. Burges and L. Bottou and M. Welling and Z. Ghahramani and K. Q. 
Weinberger}, publisher = {Curran Associates, Inc.}, title = {Distributed Representations of Words and Phrases and their Compositionality}, url = {https://proceedings.neurips.cc/paper/2013/file/9aa42b31882ec039965f3c4923ce901b-Paper.pdf}, volume = {26}, year = {2013}, bdsk-url-1 = {https://proceedings.neurips.cc/paper/2013/file/9aa42b31882ec039965f3c4923ce901b-Paper.pdf}} @inproceedings{bevilacqua-etal-2021-one, author = {Bevilacqua, Michele and Blloshmi, Rexhina and Navigli, Roberto}, booktitle = {Proceedings of AAAI}, date-added = {2022-01-25 11:58:03 -0500}, date-modified = {2022-01-25 11:58:03 -0500}, title = {One {SPRING} to Rule Them Both: {S}ymmetric {AMR} Semantic Parsing and Generation without a Complex Pipeline}, year = {2021}} @inproceedings{lewis-etal-2020-bart, abstract = {We present BART, a denoising autoencoder for pretraining sequence-to-sequence models. BART is trained by (1) corrupting text with an arbitrary noising function, and (2) learning a model to reconstruct the original text. It uses a standard Tranformer-based neural machine translation architecture which, despite its simplicity, can be seen as generalizing BERT (due to the bidirectional encoder), GPT (with the left-to-right decoder), and other recent pretraining schemes. We evaluate a number of noising approaches, finding the best performance by both randomly shuffling the order of sentences and using a novel in-filling scheme, where spans of text are replaced with a single mask token. BART is particularly effective when fine tuned for text generation but also works well for comprehension tasks. It matches the performance of RoBERTa on GLUE and SQuAD, and achieves new state-of-the-art results on a range of abstractive dialogue, question answering, and summarization tasks, with gains of up to 3.5 ROUGE. BART also provides a 1.1 BLEU increase over a back-translation system for machine translation, with only target language pretraining. 
We also replicate other pretraining schemes within the BART framework, to understand their effect on end-task performance.}, address = {Online}, author = {Lewis, Mike and Liu, Yinhan and Goyal, Naman and Ghazvininejad, Marjan and Mohamed, Abdelrahman and Levy, Omer and Stoyanov, Veselin and Zettlemoyer, Luke}, booktitle = {Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics}, date-added = {2022-01-25 11:56:10 -0500}, date-modified = {2022-01-25 11:56:10 -0500}, doi = {10.18653/v1/2020.acl-main.703}, month = jul, pages = {7871--7880}, publisher = {Association for Computational Linguistics}, title = {{BART}: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension}, url = {https://www.aclweb.org/anthology/2020.acl-main.703}, year = {2020}, bdsk-url-1 = {https://www.aclweb.org/anthology/2020.acl-main.703}, bdsk-url-2 = {https://doi.org/10.18653/v1/2020.acl-main.703}} @article{knight2014abstract, author = {Knight, Kevin and Baranescu, Lauren and Bonial, Claire and Georgescu, Madalina and Griffitt, Kira and Hermjakob, Ulf and Marcu, Daniel and Palmer, Martha and Schneifer, Nathan}, date-added = {2022-01-25 11:54:11 -0500}, date-modified = {2022-01-25 11:54:11 -0500}, journal = {Web download}, title = {Abstract meaning representation (amr) annotation release 1.0}, year = {2014}} @inproceedings{he-choi-2021-stem, abstract = {Multi-task learning with transformer encoders (MTL) has emerged as a powerful technique to improve performance on closely-related tasks for both accuracy and efficiency while a question still remains whether or not it would perform as well on tasks that are distinct in nature. We first present MTL results on five NLP tasks, POS, NER, DEP, CON, and SRL, and depict its deficiency over single-task learning. 
We then conduct an extensive pruning analysis to show that a certain set of attention heads get claimed by most tasks during MTL, who interfere with one another to fine-tune those heads for their own objectives. Based on this finding, we propose the Stem Cell Hypothesis to reveal the existence of attention heads naturally talented for many tasks that cannot be jointly trained to create adequate embeddings for all of those tasks. Finally, we design novel parameter-free probes to justify our hypothesis and demonstrate how attention heads are transformed across the five tasks during MTL through label analysis.}, address = {Online and Punta Cana, Dominican Republic}, author = {He, Han and Choi, Jinho D.}, booktitle = {Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing}, date-added = {2021-11-06 18:24:44 -0400}, date-modified = {2021-11-06 18:24:44 -0400}, month = nov, pages = {5555--5577}, publisher = {Association for Computational Linguistics}, title = {The Stem Cell Hypothesis: Dilemma behind Multi-Task Learning with Transformer Encoders}, url = {https://aclanthology.org/2021.emnlp-main.451}, year = {2021}, bdsk-url-1 = {https://aclanthology.org/2021.emnlp-main.451}} @inproceedings{he-choi-2019, abstract = {This paper presents new state-of-the-art models for three tasks, part-of-speech tagging, syntactic parsing, and semantic parsing, using the cutting-edge contextualized embedding framework known as BERT. For each task, we first replicate and simplify the current state-of-the-art approach to enhance its model efficiency. We then evaluate our simplified approaches on those three tasks using token embeddings generated by BERT. 12 datasets in both English and Chinese are used for our experiments. The BERT models outperform the previously best-performing models by 2.5\% on average (7.5\% for the most significant case). 
All models and source codes are available in public so that researchers can improve upon and utilize them to establish strong baselines for the next decade.}, author = {Han He and Jinho Choi}, booktitle = {The Thirty-Third International Flairs Conference}, conference = {Florida Artificial Intelligence Research Society Conference}, date-added = {2021-10-16 21:09:00 -0400}, date-modified = {2021-10-16 21:09:00 -0400}, keywords = {part-of-speech tagging, syntactic parsing, semantic parsing, Transformer, BERT}, title = {Establishing Strong Baselines for the New Decade: Sequence Tagging, Syntactic and Semantic Parsing with BERT}, url = {https://www.aaai.org/ocs/index.php/FLAIRS/FLAIRS20/paper/view/18438}, year = {2020}, bdsk-url-1 = {https://www.aaai.org/ocs/index.php/FLAIRS/FLAIRS20/paper/view/18438}} @inproceedings{xiao-etal-2021-ernie, abstract = {Coarse-grained linguistic information, such as named entities or phrases, facilitates adequately representation learning in pre-training. Previous works mainly focus on extending the objective of BERT{'}s Masked Language Modeling (MLM) from masking individual tokens to contiguous sequences of n tokens. We argue that such contiguously masking method neglects to model the intra-dependencies and inter-relation of coarse-grained linguistic information. As an alternative, we propose ERNIE-Gram, an explicitly n-gram masking method to enhance the integration of coarse-grained information into pre-training. In ERNIE-Gram, n-grams are masked and predicted directly using explicit n-gram identities rather than contiguous sequences of n tokens. Furthermore, ERNIE-Gram employs a generator model to sample plausible n-gram identities as optional n-gram masks and predict them in both coarse-grained and fine-grained manners to enable comprehensive n-gram prediction and relation modeling. We pre-train ERNIE-Gram on English and Chinese text corpora and fine-tune on 19 downstream tasks. 
Experimental results show that ERNIE-Gram outperforms previous pre-training models like XLNet and RoBERTa by a large margin, and achieves comparable results with state-of-the-art methods. The source codes and pre-trained models have been released at https://github.com/PaddlePaddle/ERNIE.}, address = {Online}, author = {Xiao, Dongling and Li, Yu-Kun and Zhang, Han and Sun, Yu and Tian, Hao and Wu, Hua and Wang, Haifeng}, booktitle = {Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies}, date-added = {2021-09-04 14:09:52 -0400}, date-modified = {2021-09-04 14:09:52 -0400}, doi = {10.18653/v1/2021.naacl-main.136}, month = jun, pages = {1702--1715}, publisher = {Association for Computational Linguistics}, title = {{ERNIE}-Gram: Pre-Training with Explicitly N-Gram Masked Language Modeling for Natural Language Understanding}, url = {https://aclanthology.org/2021.naacl-main.136}, year = {2021}, bdsk-url-1 = {https://aclanthology.org/2021.naacl-main.136}, bdsk-url-2 = {https://doi.org/10.18653/v1/2021.naacl-main.136}} @inproceedings{akbik-etal-2018-contextual, abstract = {Recent advances in language modeling using recurrent neural networks have made it viable to model language as distributions over characters. By learning to predict the next character on the basis of previous characters, such models have been shown to automatically internalize linguistic concepts such as words, sentences, subclauses and even sentiment. In this paper, we propose to leverage the internal states of a trained character language model to produce a novel type of word embedding which we refer to as contextual string embeddings. 
Our proposed embeddings have the distinct properties that they (a) are trained without any explicit notion of words and thus fundamentally model words as sequences of characters, and (b) are contextualized by their surrounding text, meaning that the same word will have different embeddings depending on its contextual use. We conduct a comparative evaluation against previous embeddings and find that our embeddings are highly useful for downstream tasks: across four classic sequence labeling tasks we consistently outperform the previous state-of-the-art. In particular, we significantly outperform previous work on English and German named entity recognition (NER), allowing us to report new state-of-the-art F1-scores on the CoNLL03 shared task. We release all code and pre-trained language models in a simple-to-use framework to the research community, to enable reproduction of these experiments and application of our proposed embeddings to other tasks: https://github.com/zalandoresearch/flair}, address = {Santa Fe, New Mexico, USA}, author = {Akbik, Alan and Blythe, Duncan and Vollgraf, Roland}, booktitle = {Proceedings of the 27th International Conference on Computational Linguistics}, date-added = {2021-09-01 13:10:59 -0400}, date-modified = {2021-09-01 13:10:59 -0400}, month = aug, pages = {1638--1649}, publisher = {Association for Computational Linguistics}, title = {Contextual String Embeddings for Sequence Labeling}, url = {https://aclanthology.org/C18-1139}, year = {2018}, bdsk-url-1 = {https://aclanthology.org/C18-1139}} @inproceedings{he-choi-2021-levi, abstract = {Coupled with biaffine decoders, transformers have been effectively adapted to text-to-graph transduction and achieved state-of-the-art performance on AMR parsing. Many prior works, however, rely on the biaffine decoder for either or both arc and label predictions although most features used by the decoder may be learned by the transformer already. 
This paper presents a novel approach to AMR parsing by combining heterogeneous data (tokens, concepts, labels) as one input to a transformer to learn attention, and use only attention matrices from the transformer to predict all elements in AMR graphs (concepts, arcs, labels). Although our models use significantly fewer parameters than the previous state-of-the-art graph parser, they show similar or better accuracy on AMR 2.0 and 3.0.}, address = {Online}, author = {He, Han and Choi, Jinho D.}, booktitle = {Proceedings of the 17th International Conference on Parsing Technologies and the IWPT 2021 Shared Task on Parsing into Enhanced Universal Dependencies (IWPT 2021)}, date-added = {2021-09-01 13:09:14 -0400}, date-modified = {2021-09-01 13:09:14 -0400}, doi = {10.18653/v1/2021.iwpt-1.5}, month = aug, pages = {50--57}, publisher = {Association for Computational Linguistics}, title = {Levi Graph {AMR} Parser using Heterogeneous Attention}, url = {https://aclanthology.org/2021.iwpt-1.5}, year = {2021}, bdsk-url-1 = {https://aclanthology.org/2021.iwpt-1.5}, bdsk-url-2 = {https://doi.org/10.18653/v1/2021.iwpt-1.5}} @inproceedings{conneau-etal-2020-unsupervised, abstract = {This paper shows that pretraining multilingual language models at scale leads to significant performance gains for a wide range of cross-lingual transfer tasks. We train a Transformer-based masked language model on one hundred languages, using more than two terabytes of filtered CommonCrawl data. Our model, dubbed XLM-R, significantly outperforms multilingual BERT (mBERT) on a variety of cross-lingual benchmarks, including +14.6{\%} average accuracy on XNLI, +13{\%} average F1 score on MLQA, and +2.4{\%} F1 score on NER. XLM-R performs particularly well on low-resource languages, improving 15.7{\%} in XNLI accuracy for Swahili and 11.4{\%} for Urdu over previous XLM models. 
We also present a detailed empirical analysis of the key factors that are required to achieve these gains, including the trade-offs between (1) positive transfer and capacity dilution and (2) the performance of high and low resource languages at scale. Finally, we show, for the first time, the possibility of multilingual modeling without sacrificing per-language performance; XLM-R is very competitive with strong monolingual models on the GLUE and XNLI benchmarks. We will make our code and models publicly available.}, address = {Online}, author = {Conneau, Alexis and Khandelwal, Kartikay and Goyal, Naman and Chaudhary, Vishrav and Wenzek, Guillaume and Guzm{\'a}n, Francisco and Grave, Edouard and Ott, Myle and Zettlemoyer, Luke and Stoyanov, Veselin}, booktitle = {Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics}, date-added = {2021-09-01 12:41:50 -0400}, date-modified = {2021-09-01 12:41:50 -0400}, doi = {10.18653/v1/2020.acl-main.747}, month = jul, pages = {8440--8451}, publisher = {Association for Computational Linguistics}, title = {Unsupervised Cross-lingual Representation Learning at Scale}, url = {https://aclanthology.org/2020.acl-main.747}, year = {2020}, bdsk-url-1 = {https://aclanthology.org/2020.acl-main.747}, bdsk-url-2 = {https://doi.org/10.18653/v1/2020.acl-main.747}} @inproceedings{xue-etal-2021-mt5, abstract = {The recent {``}Text-to-Text Transfer Transformer{''} (T5) leveraged a unified text-to-text format and scale to attain state-of-the-art results on a wide variety of English-language NLP tasks. In this paper, we introduce mT5, a multilingual variant of T5 that was pre-trained on a new Common Crawl-based dataset covering 101 languages. We detail the design and modified training of mT5 and demonstrate its state-of-the-art performance on many multilingual benchmarks. 
We also describe a simple technique to prevent {``}accidental translation{''} in the zero-shot setting, where a generative model chooses to (partially) translate its prediction into the wrong language. All of the code and model checkpoints used in this work are publicly available.}, address = {Online}, author = {Xue, Linting and Constant, Noah and Roberts, Adam and Kale, Mihir and Al-Rfou, Rami and Siddhant, Aditya and Barua, Aditya and Raffel, Colin}, booktitle = {Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies}, date-added = {2021-09-01 12:40:34 -0400}, date-modified = {2021-09-01 12:40:34 -0400}, doi = {10.18653/v1/2021.naacl-main.41}, month = jun, pages = {483--498}, publisher = {Association for Computational Linguistics}, title = {m{T}5: A Massively Multilingual Pre-trained Text-to-Text Transformer}, url = {https://aclanthology.org/2021.naacl-main.41}, year = {2021}, bdsk-url-1 = {https://aclanthology.org/2021.naacl-main.41}, bdsk-url-2 = {https://doi.org/10.18653/v1/2021.naacl-main.41}} @misc{https://doi.org/10.35111/gvd0-xk91, author = {Xue, Nianwen and {Zhang, Xiuhong} and {Jiang, Zixin} and {Palmer, Martha} and {Xia, Fei} and {Chiou, Fu-Dong} and {Chang, Meiyu}}, date-added = {2021-09-01 12:32:05 -0400}, date-modified = {2021-09-01 12:36:22 -0400}, doi = {10.35111/GVD0-XK91}, publisher = {Linguistic Data Consortium}, title = {Chinese Treebank 9.0}, url = {https://catalog.ldc.upenn.edu/LDC2016T13}, year = {2016}, bdsk-url-1 = {https://catalog.ldc.upenn.edu/LDC2016T13}, bdsk-url-2 = {https://doi.org/10.35111/GVD0-XK91}} @inproceedings{clark2020electra, author = {Kevin Clark and Minh-Thang Luong and Quoc V. Le and Christopher D. 
Manning}, booktitle = {ICLR}, date-added = {2021-08-07 15:53:27 -0400}, date-modified = {2021-08-07 15:53:27 -0400}, title = {{ELECTRA}: Pre-training Text Encoders as Discriminators Rather Than Generators}, url = {https://openreview.net/pdf?id=r1xMH1BtvB}, year = {2020}, bdsk-url-1 = {https://openreview.net/pdf?id=r1xMH1BtvB}} @inproceedings{chang-etal-2009-discriminative, address = {Boulder, Colorado}, author = {Chang, Pi-Chuan and Tseng, Huihsin and Jurafsky, Dan and Manning, Christopher D.}, booktitle = {Proceedings of the Third Workshop on Syntax and Structure in Statistical Translation ({SSST}-3) at {NAACL} {HLT} 2009}, date-added = {2021-03-17 13:37:03 -0400}, date-modified = {2021-03-17 13:37:03 -0400}, month = jun, pages = {51--59}, publisher = {Association for Computational Linguistics}, title = {Discriminative Reordering with {C}hinese Grammatical Relations Features}, url = {https://www.aclweb.org/anthology/W09-2307}, year = {2009}, bdsk-url-1 = {https://www.aclweb.org/anthology/W09-2307}} @inproceedings{pennington-etal-2014-glove, address = {Doha, Qatar}, author = {Pennington, Jeffrey and Socher, Richard and Manning, Christopher}, booktitle = {Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing ({EMNLP})}, date-added = {2020-12-31 15:07:29 -0500}, date-modified = {2020-12-31 15:07:29 -0500}, doi = {10.3115/v1/D14-1162}, month = oct, pages = {1532--1543}, publisher = {Association for Computational Linguistics}, title = {{G}lo{V}e: Global Vectors for Word Representation}, url = {https://www.aclweb.org/anthology/D14-1162}, year = {2014}, bdsk-url-1 = {https://www.aclweb.org/anthology/D14-1162}, bdsk-url-2 = {https://doi.org/10.3115/v1/D14-1162}} @incollection{he2018dual, author = {He, Han and Wu, Lei and Yang, Xiaokun and Yan, Hua and Gao, Zhimin and Feng, Yi and Townsend, George}, booktitle = {Information Technology-New Generations}, date-added = {2020-12-31 15:03:58 -0500}, date-modified = {2020-12-31 15:03:58 -0500}, 
pages = {421--426}, publisher = {Springer}, title = {Dual long short-term memory networks for sub-character representation learning}, year = {2018}} @inproceedings{devlin-etal-2019-bert, abstract = {We introduce a new language representation model called BERT, which stands for Bidirectional Encoder Representations from Transformers. Unlike recent language representation models (Peters et al., 2018a; Radford et al., 2018), BERT is designed to pre-train deep bidirectional representations from unlabeled text by jointly conditioning on both left and right context in all layers. As a result, the pre-trained BERT model can be fine-tuned with just one additional output layer to create state-of-the-art models for a wide range of tasks, such as question answering and language inference, without substantial task-specific architecture modifications. BERT is conceptually simple and empirically powerful. It obtains new state-of-the-art results on eleven natural language processing tasks, including pushing the GLUE score to 80.5 (7.7 point absolute improvement), MultiNLI accuracy to 86.7{\%} (4.6{\%} absolute improvement), SQuAD v1.1 question answering Test F1 to 93.2 (1.5 point absolute improvement) and SQuAD v2.0 Test F1 to 83.1 (5.1 point absolute improvement).}, address = {Minneapolis, Minnesota}, author = {Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina}, booktitle = {Proceedings of the 2019 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)}, date-added = {2020-12-31 14:46:54 -0500}, date-modified = {2020-12-31 14:46:54 -0500}, doi = {10.18653/v1/N19-1423}, month = jun, pages = {4171--4186}, publisher = {Association for Computational Linguistics}, title = {{BERT}: Pre-training of Deep Bidirectional Transformers for Language Understanding}, url = {https://www.aclweb.org/anthology/N19-1423}, year = {2019}, bdsk-url-1 = 
{https://www.aclweb.org/anthology/N19-1423}, bdsk-url-2 = {https://doi.org/10.18653/v1/N19-1423}} @inproceedings{Lan2020ALBERT:, author = {Zhenzhong Lan and Mingda Chen and Sebastian Goodman and Kevin Gimpel and Piyush Sharma and Radu Soricut}, booktitle = {International Conference on Learning Representations}, date-added = {2020-12-31 14:44:52 -0500}, date-modified = {2020-12-31 14:44:52 -0500}, title = {ALBERT: A Lite BERT for Self-supervised Learning of Language Representations}, url = {https://openreview.net/forum?id=H1eA7AEtvS}, year = {2020}, bdsk-url-1 = {https://openreview.net/forum?id=H1eA7AEtvS}} @inproceedings{wang-xu-2017-convolutional, abstract = {Character-based sequence labeling framework is flexible and efficient for Chinese word segmentation (CWS). Recently, many character-based neural models have been applied to CWS. While they obtain good performance, they have two obvious weaknesses. The first is that they heavily rely on manually designed bigram feature, i.e. they are not good at capturing $n$-gram features automatically. The second is that they make no use of full word information. For the first weakness, we propose a convolutional neural model, which is able to capture rich $n$-gram features without any feature engineering. For the second one, we propose an effective approach to integrate the proposed model with word embeddings. We evaluate the model on two benchmark datasets: PKU and MSR. Without any feature engineering, the model obtains competitive performance {---} 95.7{\%} on PKU and 97.3{\%} on MSR. 
Armed with word embeddings, the model achieves state-of-the-art performance on both datasets {---} 96.5{\%} on PKU and 98.0{\%} on MSR, without using any external labeled resource.}, address = {Taipei, Taiwan}, author = {Wang, Chunqi and Xu, Bo}, booktitle = {Proceedings of the Eighth International Joint Conference on Natural Language Processing (Volume 1: Long Papers)}, date-added = {2020-12-31 14:42:35 -0500}, date-modified = {2020-12-31 14:42:35 -0500}, month = nov, pages = {163--172}, publisher = {Asian Federation of Natural Language Processing}, title = {Convolutional Neural Network with Word Embeddings for {C}hinese Word Segmentation}, url = {https://www.aclweb.org/anthology/I17-1017}, year = {2017}, bdsk-url-1 = {https://www.aclweb.org/anthology/I17-1017}} @article{bojanowski2017enriching, author = {Bojanowski, Piotr and Grave, Edouard and Joulin, Armand and Mikolov, Tomas}, date-added = {2020-12-25 22:31:59 -0500}, date-modified = {2020-12-25 22:31:59 -0500}, issn = {2307-387X}, journal = {Transactions of the Association for Computational Linguistics}, pages = {135--146}, title = {Enriching Word Vectors with Subword Information}, volume = {5}, year = {2017}} @article{collins-koo-2005-discriminative, author = {Collins, Michael and Koo, Terry}, date-added = {2020-12-25 17:25:59 -0500}, date-modified = {2020-12-25 17:25:59 -0500}, doi = {10.1162/0891201053630273}, journal = {Computational Linguistics}, number = {1}, pages = {25--70}, title = {Discriminative Reranking for Natural Language Parsing}, url = {https://www.aclweb.org/anthology/J05-1003}, volume = {31}, year = {2005}, bdsk-url-1 = {https://www.aclweb.org/anthology/J05-1003}, bdsk-url-2 = {https://doi.org/10.1162/0891201053630273}} @inproceedings{zhang-clark-2008-tale, address = {Honolulu, Hawaii}, author = {Zhang, Yue and Clark, Stephen}, booktitle = {Proceedings of the 2008 Conference on Empirical Methods in Natural Language Processing}, date-added = {2020-12-25 15:10:10 -0500}, date-modified = 
{2020-12-25 15:10:10 -0500}, month = oct, pages = {562--571}, publisher = {Association for Computational Linguistics}, title = {A Tale of Two Parsers: {I}nvestigating and Combining Graph-based and Transition-based Dependency Parsing}, url = {https://www.aclweb.org/anthology/D08-1059}, year = {2008}, bdsk-url-1 = {https://www.aclweb.org/anthology/D08-1059}} @inproceedings{pradhan-etal-2012-conll, address = {Jeju Island, Korea}, author = {Pradhan, Sameer and Moschitti, Alessandro and Xue, Nianwen and Uryupina, Olga and Zhang, Yuchen}, booktitle = {Joint Conference on {EMNLP} and {C}o{NLL} - Shared Task}, date-added = {2020-12-24 23:42:41 -0500}, date-modified = {2020-12-24 23:42:41 -0500}, month = jul, pages = {1--40}, publisher = {Association for Computational Linguistics}, title = {{C}o{NLL}-2012 Shared Task: Modeling Multilingual Unrestricted Coreference in {O}nto{N}otes}, url = {https://www.aclweb.org/anthology/W12-4501}, year = {2012}, bdsk-url-1 = {https://www.aclweb.org/anthology/W12-4501}} @inproceedings{levow-2006-third, address = {Sydney, Australia}, author = {Levow, Gina-Anne}, booktitle = {Proceedings of the Fifth {SIGHAN} Workshop on {C}hinese Language Processing}, date-added = {2020-12-24 23:21:14 -0500}, date-modified = {2020-12-24 23:21:14 -0500}, month = jul, pages = {108--117}, publisher = {Association for Computational Linguistics}, title = {The Third International {C}hinese Language Processing Bakeoff: Word Segmentation and Named Entity Recognition}, url = {https://www.aclweb.org/anthology/W06-0115}, year = {2006}, bdsk-url-1 = {https://www.aclweb.org/anthology/W06-0115}} @inproceedings{tjong-kim-sang-de-meulder-2003-introduction, author = {Tjong Kim Sang, Erik F. 
and De Meulder, Fien}, booktitle = {Proceedings of the Seventh Conference on Natural Language Learning at {HLT}-{NAACL} 2003}, date-added = {2020-12-24 23:19:00 -0500}, date-modified = {2020-12-24 23:19:00 -0500}, pages = {142--147}, title = {Introduction to the {C}o{NLL}-2003 Shared Task: Language-Independent Named Entity Recognition}, url = {https://www.aclweb.org/anthology/W03-0419}, year = {2003}, bdsk-url-1 = {https://www.aclweb.org/anthology/W03-0419}} @inproceedings{koehn2005europarl, author = {Koehn, Philipp}, booktitle = {MT summit}, date-added = {2020-12-24 23:06:03 -0500}, date-modified = {2020-12-24 23:06:03 -0500}, organization = {Citeseer}, pages = {79--86}, title = {Europarl: A parallel corpus for statistical machine translation}, volume = {5}, year = {2005}} @inproceedings{Schweter:Ahmed:2019, author = {Stefan Schweter and Sajawel Ahmed}, booktitle = {Proceedings of the 15th Conference on Natural Language Processing (KONVENS)}, date-added = {2020-12-24 23:03:23 -0500}, date-modified = {2020-12-24 23:03:23 -0500}, location = {Erlangen, Germany}, note = {accepted}, title = {{Deep-EOS: General-Purpose Neural Networks for Sentence Boundary Detection}}, year = 2019} @incollection{he2019effective, author = {He, Han and Wu, Lei and Yan, Hua and Gao, Zhimin and Feng, Yi and Townsend, George}, booktitle = {Smart Intelligent Computing and Applications}, date-added = {2020-12-24 19:35:03 -0500}, date-modified = {2020-12-24 19:35:03 -0500}, pages = {133--142}, publisher = {Springer}, title = {Effective neural solution for multi-criteria word segmentation}, year = {2019}} @inproceedings{dozat2017stanford, author = {Dozat, Timothy and Qi, Peng and Manning, Christopher D}, booktitle = {Proceedings of the CoNLL 2017 Shared Task: Multilingual Parsing from Raw Text to Universal Dependencies}, date-added = {2020-12-24 15:02:18 -0500}, date-modified = {2020-12-24 15:02:18 -0500}, pages = {20--30}, title = {Stanford's graph-based neural dependency parser at the conll 
2017 shared task}, year = {2017}} @inproceedings{he-etal-2018-jointly, abstract = {Recent BIO-tagging-based neural semantic role labeling models are very high performing, but assume gold predicates as part of the input and cannot incorporate span-level features. We propose an end-to-end approach for jointly predicting all predicates, arguments spans, and the relations between them. The model makes independent decisions about what relationship, if any, holds between every possible word-span pair, and learns contextualized span representations that provide rich, shared input features for each decision. Experiments demonstrate that this approach sets a new state of the art on PropBank SRL without gold predicates.}, address = {Melbourne, Australia}, author = {He, Luheng and Lee, Kenton and Levy, Omer and Zettlemoyer, Luke}, booktitle = {Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)}, date-added = {2020-12-24 14:23:45 -0500}, date-modified = {2020-12-24 14:23:45 -0500}, doi = {10.18653/v1/P18-2058}, month = jul, pages = {364--369}, publisher = {Association for Computational Linguistics}, title = {Jointly Predicting Predicates and Arguments in Neural Semantic Role Labeling}, url = {https://www.aclweb.org/anthology/P18-2058}, year = {2018}, bdsk-url-1 = {https://www.aclweb.org/anthology/P18-2058}, bdsk-url-2 = {https://doi.org/10.18653/v1/P18-2058}} @inproceedings{yu-etal-2020-named, abstract = {Named Entity Recognition (NER) is a fundamental task in Natural Language Processing, concerned with identifying spans of text expressing references to entities. NER research is often focused on flat entities only (flat NER), ignoring the fact that entity references can be nested, as in [Bank of [China]] (Finkel and Manning, 2009). In this paper, we use ideas from graph-based dependency parsing to provide our model a global view on the input via a biaffine model (Dozat and Manning, 2017). 
The biaffine model scores pairs of start and end tokens in a sentence which we use to explore all spans, so that the model is able to predict named entities accurately. We show that the model works well for both nested and flat NER through evaluation on 8 corpora and achieving SoTA performance on all of them, with accuracy gains of up to 2.2 percentage points.}, address = {Online}, author = {Yu, Juntao and Bohnet, Bernd and Poesio, Massimo}, booktitle = {Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics}, date-added = {2020-12-24 13:35:09 -0500}, date-modified = {2020-12-24 13:35:09 -0500}, doi = {10.18653/v1/2020.acl-main.577}, month = jul, pages = {6470--6476}, publisher = {Association for Computational Linguistics}, title = {Named Entity Recognition as Dependency Parsing}, url = {https://www.aclweb.org/anthology/2020.acl-main.577}, year = {2020}, bdsk-url-1 = {https://www.aclweb.org/anthology/2020.acl-main.577}, bdsk-url-2 = {https://doi.org/10.18653/v1/2020.acl-main.577}} @inproceedings{10.1145/1457838.1457895, abstract = {Many computer applications require the storage of large amounts of information within the computer's memory where it will be readily available for reference and updating. Quite commonly, more storage space is required than is available in the computer's high-speed working memory. It is, therefore, a common practice to equip computers with magnetic tapes, disks, or drums, or a combination of these to provide additional storage. 
This additional storage is always slower in operation than the computer's working memory and therefore care must be taken when using it to avoid excessive operating time.}, address = {New York, NY, USA}, author = {De La Briandais, Rene}, booktitle = {Papers Presented at the the March 3-5, 1959, Western Joint Computer Conference}, date-added = {2020-12-24 13:07:31 -0500}, date-modified = {2020-12-24 13:07:31 -0500}, doi = {10.1145/1457838.1457895}, isbn = {9781450378659}, location = {San Francisco, California}, numpages = {4}, pages = {295--298}, publisher = {Association for Computing Machinery}, series = {IRE-AIEE-ACM '59 (Western)}, title = {File Searching Using Variable Length Keys}, url = {https://doi.org/10.1145/1457838.1457895}, year = {1959}, bdsk-url-1 = {https://doi.org/10.1145/1457838.1457895}} @article{lafferty2001conditional, author = {Lafferty, John and McCallum, Andrew and Pereira, Fernando CN}, date-added = {2020-12-24 11:46:30 -0500}, date-modified = {2020-12-24 12:08:29 -0500}, journal = {Departmental Papers (CIS)}, title = {Conditional random fields: Probabilistic models for segmenting and labeling sequence data}, year = {2001}} @inproceedings{clark-etal-2019-bam, abstract = {It can be challenging to train multi-task neural networks that outperform or even match their single-task counterparts. To help address this, we propose using knowledge distillation where single-task models teach a multi-task model. We enhance this training with teacher annealing, a novel method that gradually transitions the model from distillation to supervised learning, helping the multi-task model surpass its single-task teachers. We evaluate our approach by multi-task fine-tuning BERT on the GLUE benchmark. Our method consistently improves over standard single-task and multi-task training.}, address = {Florence, Italy}, author = {Clark, Kevin and Luong, Minh-Thang and Khandelwal, Urvashi and Manning, Christopher D. 
and Le, Quoc V.}, booktitle = {Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics}, date-added = {2020-12-24 11:26:54 -0500}, date-modified = {2020-12-24 11:26:54 -0500}, doi = {10.18653/v1/P19-1595}, month = jul, pages = {5931--5937}, publisher = {Association for Computational Linguistics}, title = {{BAM}! Born-Again Multi-Task Networks for Natural Language Understanding}, url = {https://www.aclweb.org/anthology/P19-1595}, year = {2019}, bdsk-url-1 = {https://www.aclweb.org/anthology/P19-1595}, bdsk-url-2 = {https://doi.org/10.18653/v1/P19-1595}} @inproceedings{kondratyuk-straka-2019-75, address = {Hong Kong, China}, author = {Kondratyuk, Dan and Straka, Milan}, booktitle = {Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)}, date-added = {2020-12-23 23:51:07 -0500}, date-modified = {2020-12-23 23:51:07 -0500}, pages = {2779--2795}, publisher = {Association for Computational Linguistics}, title = {75 Languages, 1 Model: Parsing Universal Dependencies Universally}, url = {https://www.aclweb.org/anthology/D19-1279}, year = {2019}, bdsk-url-1 = {https://www.aclweb.org/anthology/D19-1279}} @inproceedings{dozat:17a, author = {Dozat, Timothy and Manning, Christopher D.}, booktitle = {Proceedings of the 5th International Conference on Learning Representations}, date-added = {2020-12-23 23:46:20 -0500}, date-modified = {2020-12-23 23:46:20 -0500}, series = {ICLR'17}, title = {{Deep Biaffine Attention for Neural Dependency Parsing}}, url = {https://openreview.net/pdf?id=Hk95PK9le}, year = {2017}, bdsk-url-1 = {http://arxiv.org/abs/1611.01734}, bdsk-url-2 = {https://openreview.net/pdf?id=Hk95PK9le}} @inproceedings{smith-smith-2007-probabilistic, address = {Prague, Czech Republic}, author = {Smith, David A. 
and Smith, Noah A.}, booktitle = {Proceedings of the 2007 Joint Conference on Empirical Methods in Natural Language Processing and Computational Natural Language Learning ({EMNLP}-{C}o{NLL})}, date-added = {2020-12-23 21:46:06 -0500}, date-modified = {2020-12-23 21:46:06 -0500}, month = jun, pages = {132--140}, publisher = {Association for Computational Linguistics}, title = {Probabilistic Models of Nonprojective Dependency Trees}, url = {https://www.aclweb.org/anthology/D07-1014}, year = {2007}, bdsk-url-1 = {https://www.aclweb.org/anthology/D07-1014}} @inproceedings{ijcai2020-560, author = {Zhang, Yu and Zhou, Houquan and Li, Zhenghua}, booktitle = {Proceedings of the Twenty-Ninth International Joint Conference on Artificial Intelligence, {IJCAI-20}}, date-added = {2020-12-23 21:36:56 -0500}, date-modified = {2020-12-23 21:36:56 -0500}, doi = {10.24963/ijcai.2020/560}, editor = {Christian Bessiere}, month = {7}, note = {Main track}, pages = {4046--4053}, publisher = {International Joint Conferences on Artificial Intelligence Organization}, title = {Fast and Accurate Neural CRF Constituency Parsing}, url = {https://doi.org/10.24963/ijcai.2020/560}, year = {2020}, bdsk-url-1 = {https://doi.org/10.24963/ijcai.2020/560}} @inproceedings{buchholz-marsi-2006-conll, address = {New York City}, author = {Buchholz, Sabine and Marsi, Erwin}, booktitle = {Proceedings of the Tenth Conference on Computational Natural Language Learning ({C}o{NLL}-X)}, date-added = {2020-12-22 22:57:41 -0500}, date-modified = {2020-12-22 22:57:41 -0500}, month = jun, pages = {149--164}, publisher = {Association for Computational Linguistics}, title = {{C}o{NLL}-{X} Shared Task on Multilingual Dependency Parsing}, url = {https://www.aclweb.org/anthology/W06-2920}, year = {2006}, bdsk-url-1 = {https://www.aclweb.org/anthology/W06-2920}} ================================================ FILE: docs/references.rst ================================================ References ================== .. 
bibliography:: references.bib :cited: :style: astrostyle ================================================ FILE: docs/tutorial.md ================================================ --- jupytext: formats: ipynb,md:myst text_representation: extension: .md format_name: myst format_version: '0.8' jupytext_version: 1.4.2 kernelspec: display_name: Python 3 language: python name: python3 --- # Tutorial Natural Language Processing is an exciting field consisting of many closely related tasks like lexical analysis and parsing. Each task involves many datasets and models, all requiring a high degree of expertise. Things become even more complex when dealing with multilingual text, as there are simply no datasets for some low-resource languages. However, with HanLP 2.1, core NLP tasks have been made easy to access and efficient in production environments. In this tutorial, we'll walk through the APIs in HanLP step by step. HanLP offers an out-of-the-box RESTful API and a native Python API, which share very similar interfaces but are designed for different scenarios. ```{code-cell} ipython3 :tags: [remove_cell] import hanlp_common.constant hanlp_common.constant.IPYTHON = False # Avoid pretty_print prints html which doesn't play well with this theme ``` ## RESTful API The RESTful API is an endpoint to which you send your documents and then get the parsed annotations back. We are hosting a **non-commercial** API service and you are welcome to [apply for an auth key](https://bbs.hankcs.com/t/apply-for-free-hanlp-restful-apis/3178). An auth key is a password which gives you access to our API and protects our server from being abused. Once you have obtained an auth key, you can parse your document with our RESTful client which can be installed via: ````{margin} **Non-Commercial** ```{seealso} Our models and RESTful APIs are under the [CC BY-NC-SA 4.0](https://creativecommons.org/licenses/by-nc-sa/4.0/) licence. 
``` ```` ````{margin} **Zero-Shot Learning** ```{note} Although UD covers 104 languages, OntoNotes (NER, CON, SRL) covers only English, Chinese and Arabic. So NER/CON/SRL of languages other than the 3 are considered as Zero-Shot and their accuracies can be very low. ``` ```` ```bash pip install hanlp_restful ``` ```{eval-rst} Then initiate a :class:`~hanlp_restful.HanLPClient` with your auth key and send a document to have it parsed. ``` ```{code-cell} ipython3 :tags: [output_scroll] from hanlp_restful import HanLPClient # Fill in your auth, set language='zh' to use Chinese models HanLP = HanLPClient('https://hanlp.hankcs.com/api', auth=None, language='mul') doc = HanLP('In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environments. ' \ '2021年、HanLPv2.1は次世代の最先端多言語NLP技術を本番環境に導入します。' \ '2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。') print(doc) ``` ````{margin} **But what do these annotations mean?** ```{seealso} See our [data format](data_format) and [annotations](annotations/index) for details. ``` ```` ## Visualization ```{eval-rst} The returned :class:`~hanlp_common.document.Document` has a handy method :meth:`~hanlp_common.document.Document.pretty_print` which offers visualization in any mono-width text environment. ``` ````{margin} **Non-ASCII** ```{note} Non-ASCII text might be skewed in terminals but in Jupyter Notebook it will align correctly. You can also use our [live demo](https://hanlp.hankcs.com/). ``` ```` ````{margin} **Non-Projective** ```{note} Non-projective dependency trees cannot be visualized and won't be printed out at this moment. ``` ```` ```{code-cell} ipython3 doc.pretty_print() ``` ## Native API ### Multi-Task Learning If you want to run our models locally or you want to implement your own RESTful server, you can [install the native API](https://hanlp.hankcs.com/docs/install.html#install-native-package) and call it just like the RESTful one. 
def load(save_dir: str, verbose=None, **kwargs) -> hanlp.common.component.Component:
    """Load a pretrained component given its identifier.

    Args:
        save_dir (str): The identifier of the saved component. A key found in ``hanlp.pretrained.ALL`` is replaced
            by the value it maps to; anything else (a remote URL or a local path) is used as-is.
        verbose: ``True`` to print loading progress. When ``None``, ``hanlp_common.constant.HANLP_VERBOSE``
            is used instead.
        **kwargs: Arguments passed to :func:`hanlp.common.torch_component.TorchComponent.load`, e.g.,
            ``devices`` is a useful argument to specify which GPU devices a PyTorch component will use.

    Examples::

        import hanlp
        # Load component onto the 0-th GPU.
        hanlp.load(..., devices=0)
        # Load component onto the 0-th and 1-st GPUs using data parallelization.
        hanlp.load(..., devices=[0, 1])

    .. Note::
        A component can have dependencies on other components or resources, which will be recursively loaded. So it's
        common to see multiple downloading messages per single load.

    Returns:
        hanlp.common.component.Component: A pretrained component.
    """
    # Fall back to the raw argument when it is not a registered identifier.
    resolved_dir = hanlp.pretrained.ALL.get(save_dir, save_dir)
    from hanlp.utils.component_util import load_from_meta_file
    show_progress = verbose
    if show_progress is None:
        # The caller expressed no preference, so use the package-wide default.
        from hanlp_common.constant import HANLP_VERBOSE
        show_progress = HANLP_VERBOSE
    return load_from_meta_file(resolved_dir, 'meta.json', verbose=show_progress, **kwargs)
class StreamTableFormatter(object):
    """Right-aligns the cells of table rows that arrive one at a time.

    Column widths are remembered across calls and only ever grow, so feeding a row twice (or feeding the header
    after the values) yields cells padded to the widest content seen so far in each column.
    """

    def __init__(self) -> None:
        super().__init__()
        # Widest formatted cell seen so far per column; ``None`` until the first row is formatted.
        self.col_widths = None

    def format_row(self, cells) -> List[str]:
        """Format one row and update the remembered column widths.

        Args:
            cells: An iterable of cell values. The first non-empty row fixes the number of columns.

        Returns:
            The formatted cells, each right-aligned to the current width of its column.
        """
        if not isinstance(cells, list):
            cells = list(cells)
        if not self.col_widths:
            self.col_widths = [0] * len(cells)
        # First pass: widen columns whenever a formatted cell does not fit.
        for i, c in enumerate(cells):
            self.col_widths[i] = max(self.col_widths[i], len(self.format_cell(c, self.col_widths[i])))
        # Second pass: render every cell with the final widths.
        return [self.format_cell(cell, width) for cell, width in zip(cells, self.col_widths)]

    def format_cell(self, cell, min_width) -> str:
        """Format a single cell, right-aligned to at least ``min_width`` characters.

        Args:
            cell: The value to format. Floating point values are rendered with 4 decimal places.
            min_width: The minimal width of the formatted cell.

        Returns:
            The formatted cell.
        """
        # ``np.float`` was merely an alias of the builtin ``float`` and has been removed from NumPy 1.24, so the
        # builtin is tested directly; ``np.floating`` covers ``np.float32`` and the other NumPy float scalars.
        if isinstance(cell, (float, np.floating)):
            return '{:>{}.4f}'.format(cell, min_width)
        return '{:>{}}'.format(cell, min_width)
class Component(Configurable, ABC):
    @abstractmethod
    def predict(self, *args, **kwargs):
        """Predict on data. Sub-classes are expected to override this method; the implementation here only raises.

        Args:
            *args: Any type of data subject to sub-classes
            **kwargs: Additional arguments

        Returns: Any predicted annotations.

        Raises:
            NotImplementedError: Always, with a message naming the concrete class and this method.
        """
        owner_name = self.__class__.__name__
        # Entry 0 of the stack is the current frame and its field 3 is the name of the running function.
        method_name = inspect.stack()[0][3]
        raise NotImplementedError('%s.%s()' % (owner_name, method_name))

    def __call__(self, *args, **kwargs):
        """A shortcut for :meth:`~hanlp.common.component.Component.predict`.

        Args:
            *args: Any type of data subject to sub-classes
            **kwargs: Additional arguments

        Returns: Any predicted annotations.

        """
        prediction = self.predict(*args, **kwargs)
        return prediction
""" assert transform is not None, 'None transform not allowed' if not self.transform: self.transform = TransformList(transform) elif not isinstance(self.transform, TransformList): if self.transform != transform: self.transform = TransformList(self.transform, transform) else: if transform not in self.transform: self.transform.append(transform) return self def insert_transform(self, index: int, transform: Callable): """Insert a transform to a certain position. Args: index: A certain position. transform: A new transform. Returns: Itself. """ assert transform is not None, 'None transform not allowed' if not self.transform: self.transform = TransformList(transform) elif not isinstance(self.transform, TransformList): if self.transform != transform: self.transform = TransformList(self.transform) self.transform.insert(index, transform) else: if transform not in self.transform: self.transform.insert(index, transform) return self def transform_sample(self, sample: dict, inplace=False) -> dict: """Apply transforms to a sample. Args: sample: A sample, which is a ``dict`` holding features. inplace: ``True`` to apply transforms inplace. .. Attention:: If any transform modifies existing features, it will modify again and again when ``inplace=True``. For example, if a transform insert a ``BOS`` token to a list inplace, and it is called twice, then 2 ``BOS`` will be inserted which might not be an intended result. Returns: Transformed sample. """ if not inplace: sample = copy(sample) if self.transform: sample = self.transform(sample) return sample class TransformableDataset(Transformable, Dataset, ABC): def __init__(self, data: Union[str, List], transform: Union[Callable, List] = None, cache=None, generate_idx=None) -> None: """A :class:`~torch.utils.data.Dataset` which can be applied with a list of transform functions. Args: data: The local or remote path to a dataset, or a list of samples where each sample is a dict. transform: Predefined transform(s). 
cache: ``True`` to enable caching, so that transforms won't be called twice. generate_idx: Create a :const:`~hanlp_common.constants.IDX` field for each sample to store its order in dataset. Useful for prediction when samples are re-ordered by a sampler. """ super().__init__(transform) if generate_idx is None: generate_idx = isinstance(data, list) data_ = self.load_data(data, generate_idx) # assert data_, f'No samples loaded from {data}' if data_: assert isinstance(data_[0], dict ), f'TransformDataset expects each sample to be a dict but got {type(data_[0])} instead.' self.data = data_ if cache: self.cache = [None] * len(data_) else: self.cache = None def load_data(self, data, generate_idx=False): """A intermediate step between constructor and calling the actual file loading method. Args: data: If data is a file, this method calls :meth:`~hanlp.common.dataset.TransformableDataset.load_file` to load it. generate_idx: Create a :const:`~hanlp_common.constants.IDX` field for each sample to store its order in dataset. Useful for prediction when samples are re-ordered by a sampler. Returns: Loaded samples. """ if self.should_load_file(data): if isinstance(data, str): data = get_resource(data) data = list(self.load_file(data)) if generate_idx: for i, each in enumerate(data): each[IDX] = i # elif isinstance(data, list): # data = self.load_list(data) return data # noinspection PyMethodMayBeStatic # def load_list(self, data: list) -> List[Dict[str, Any]]: # return data def should_load_file(self, data) -> bool: """Determines whether data is a filepath. Args: data: Data to check. Returns: ``True`` to indicate it's a filepath. """ return isinstance(data, str) @abstractmethod def load_file(self, filepath: str): """The actual file loading logic. Args: filepath: The path to a dataset. """ pass def __getitem__(self, index: Union[int, slice]) -> Union[dict, List[dict]]: """ Get the index-th sample in this dataset. Args: index: Either a integer index of a list of indices. 
def split(self, *ratios):
    """Split dataset into consecutive subsets.

    Args:
        *ratios: The ratios for each subset. They can be any type of numbers which will be normalized. For
                example, ``8, 1, 1`` are equivalent to ``0.8, 0.1, 0.1``.

    Returns:
        list[TransformableDataset]: A list of subsets, one shallow copy of this dataset per ratio, each
        holding its own slice of ``data`` (and of ``cache`` when caching is active).

    Raises:
        ValueError: If no ratio is given or the ratios do not sum to a positive number.
    """
    total = sum(ratios)
    if not ratios or total <= 0:
        raise ValueError(f'ratios must be non-empty and sum to a positive number, got {ratios}')
    size = len(self)
    bounds = []
    begin = 0
    for ratio in ratios:
        # ceil() may overshoot, so never let a boundary run past the end of the data.
        end = min(size, begin + math.ceil(size * (ratio / total)))
        bounds.append((begin, end))
        begin = end
    # The last subset always absorbs whatever is left.
    bounds[-1] = (bounds[-1][0], size)
    outputs = []
    for begin, end in bounds:
        dataset = copy(self)
        dataset.data = dataset.data[begin:end]
        if dataset.cache:
            dataset.cache = dataset.cache[begin:end]
        outputs.append(dataset)
    return outputs
def shuffle(self):
    """Shuffle this dataset inplace.

    When caching is active, ``data`` and ``cache`` are permuted together so every cached entry
    stays aligned with its sample. Both attributes remain ``list`` objects afterwards, which is
    required because cache entries are assigned by index later on.
    """
    if not self.cache:
        random.shuffle(self.data)
        return
    paired = list(zip(self.data, self.cache))
    random.shuffle(paired)
    # ``zip(*paired)`` would hand back tuples, which cannot be written to or shuffled again.
    self.data = [sample for sample, _ in paired]
    self.cache = [cached for _, cached in paired]
def __iter__(self):
    """Yield the batches of the underlying loader with their tensors moved to ``self.device``.

    Only values that are :class:`torch.Tensor` instances are moved; every other field is left
    as is. Batches are yielded unchanged when ``self.device`` is ``None``.
    """
    for batch in super(DeviceDataLoader, self).__iter__():
        target = self.device
        if target is not None:
            # Collect the names first, then overwrite the entries of the same dict.
            tensor_fields = [name for name, value in batch.items() if isinstance(value, torch.Tensor)]
            for name in tensor_fields:
                batch[name] = batch[name].to(target)
        yield batch
def __iter__(self):
    """Yield every batch of the underlying loader after passing it through :meth:`tensorize`.

    ``tensorize`` receives the vocabs, the padding dict and the device stored on this loader.
    """
    batches = super(PadSequenceDataLoader, self).__iter__()
    for batch in batches:
        tensors = self.tensorize(batch, vocabs=self.vocabs, pad_dict=self.pad, device=self.device)
        yield tensors
@staticmethod
def pad_data(data: Union[torch.Tensor, Iterable], pad, dtype=None, device=None):
    """Perform the actual padding for a given data.

    Three layouts are handled, decided by looking at ``data[0]``:

    - a sequence of tensors is padded with ``pad_sequence`` (batch first);
    - a sequence of sequences is padded to a 2-D tensor, or to a 3-D tensor when the inner
      elements are themselves iterable (e.g. characters of words of sentences);
    - a flat list is converted to a tensor without padding.

    Args:
        data: Data to be padded. Must not be empty since its first element is inspected.
        pad: Padding value. It is used for every padded position, including the 3-D layout.
        dtype: Data type. When omitted it is inferred with ``dtype_of`` from the first
            non-empty element.
        device: Device to be moved onto.

    Returns:
        torch.Tensor: A ``torch.Tensor``.

    """
    if isinstance(data[0], torch.Tensor):
        data = pad_sequence(data, True, pad)
    elif isinstance(data[0], Iterable):
        inner_is_iterable = False
        # The first non-empty element decides both the nesting depth and (if needed) the dtype.
        for each in data:
            if len(each):
                if isinstance(each[0], Iterable):
                    inner_is_iterable = True
                    if len(each[0]):
                        if not dtype:
                            dtype = dtype_of(each[0][0])
                else:
                    inner_is_iterable = False
                    if not dtype:
                        dtype = dtype_of(each[0])
                break
        if inner_is_iterable:
            max_seq_len = len(max(data, key=len))
            max_word_len = len(max([chars for words in data for chars in words], key=len))
            ids = torch.zeros(len(data), max_seq_len, max_word_len, dtype=dtype, device=device)
            if pad:
                # zeros() alone silently ignored a non-zero padding value in this branch.
                ids.fill_(pad)
            for i, words in enumerate(data):
                for j, chars in enumerate(words):
                    ids[i][j][:len(chars)] = torch.tensor(chars, dtype=dtype, device=device)
            data = ids
        else:
            data = pad_sequence([torch.tensor(x, dtype=dtype, device=device) for x in data], True, pad)
    elif isinstance(data, list):
        data = torch.tensor(data, dtype=dtype, device=device)
    return data
def _prefetch_generator(dataloader, queue, batchify=None):
    """Feed ``queue`` with the batches of ``dataloader`` forever.

    The dataloader is iterated over and over again; this function never returns on its own and
    is meant to be the target of a worker process that gets terminated from outside.

    Args:
        dataloader: Any iterable of batches that can be iterated repeatedly.
        queue: An object with a ``put`` method receiving each batch.
        batchify: Optional callable applied to every batch before it is enqueued.
    """
    enqueue = queue.put
    while True:
        for raw in dataloader:
            enqueue(batchify(raw) if batchify else raw)
@property
def batchify(self):
    """The callable applied to every batch, or ``None`` when batches are used as they are."""
    return self._batchify

@batchify.setter
def batchify(self, batchify):
    """Replace the batchify callable.

    When prefetching is active, the worker process was started with the previous callable, so
    it is closed and started again with the same prefetch size in order to pick up the new one.
    When prefetching is disabled there is no queue or process and only the attribute changes.

    Args:
        batchify: The new callable, or ``None``.
    """
    self._batchify = batchify
    if self.prefetch:
        self.close()
        self._fire_process(self.dataset, self.prefetch)
def __iter__(self):
    """Yield batches of sample indices, bucket by bucket.

    Each bucket is cut into ``self.chunks[bucket]`` pieces whose sizes differ by at most one.
    With ``shuffle`` enabled both the order of buckets and the order of samples inside a bucket
    are random permutations; otherwise the original orders are kept.
    """
    order = torch.randperm if self.shuffle else torch.arange
    for bucket_id in order(len(self.buckets)).tolist():
        bucket = self.buckets[bucket_id]
        n_chunks = self.chunks[bucket_id]
        sizes = [(len(bucket) - k - 1) // n_chunks + 1 for k in range(n_chunks)]
        # ``split`` with explicit sizes is used instead of ``torch.chunk``, which may return a
        # different number of chunks than requested.
        for positions in order(len(bucket)).split(sizes):
            yield [bucket[p] for p in positions.tolist()]
class SortingSampler(Sampler):
    # noinspection PyMissingConstructor
    def __init__(self, lengths: List[int], batch_size=None, batch_max_tokens=None, use_effective_tokens=False,
                 shuffle=False) -> None:
        """A sampler which sorts samples according to their lengths. It takes a continuous chunk of sorted samples to
        make a batch. The effective batch size is determined by ``batch_size``, ``batch_max_tokens`` and
        ``use_effective_tokens``.

        All batches are computed once in the constructor and stored in ``self.batch_indices``.

        Args:
            lengths: Lengths of each sample, usually measured by number of tokens.
            batch_max_tokens: Maximum tokens per batch. A single sample longer than this still forms a batch
                of its own.
            use_effective_tokens: Whether to calculate the effective number of tokens after padding when applying the
                ``batch_max_tokens``, i.e. every sample counts as the longest sample of its batch.
            batch_size: Maximum samples per batch.
            shuffle: ``True`` to shuffle batches. The samples inside a batch stay sorted by length.
        """
        self.shuffle = shuffle
        self.batch_size = batch_size
        self.batch_indices = []
        num_tokens = 0
        mini_batch = []
        for i in torch.argsort(torch.tensor(lengths), descending=True).tolist():
            # Samples arrive longest first, so ``mini_batch[0]`` is the longest of the current batch.
            effective_tokens = lengths[i] if (not mini_batch or not use_effective_tokens) else lengths[mini_batch[0]]
            if (batch_max_tokens is None or num_tokens + effective_tokens <= batch_max_tokens) and (
                    batch_size is None or len(mini_batch) < batch_size):
                mini_batch.append(i)
                num_tokens += effective_tokens
            else:
                if not mini_batch:  # this sequence is longer than batch_max_tokens
                    mini_batch.append(i)
                    self.batch_indices.append(mini_batch)
                    mini_batch = []
                    num_tokens = 0
                else:
                    self.batch_indices.append(mini_batch)
                    mini_batch = [i]
                    # The new batch starts with sample ``i`` alone, so its cost is the sample's own
                    # length, not the longest length of the batch that was just closed.
                    num_tokens = lengths[i]
        if mini_batch:
            self.batch_indices.append(mini_batch)

    def __iter__(self):
        """Yield the precomputed batches, in random order when ``shuffle`` is enabled."""
        if self.shuffle:
            random.shuffle(self.batch_indices)
        for batch in self.batch_indices:
            yield batch

    def __len__(self) -> int:
        """Number of batches."""
        return len(self.batch_indices)
def build(self, lengths: List[int], shuffle=False, gradient_accumulation=1, **kwargs) -> Sampler:
    """Build a :class:`~hanlp.common.dataset.SortingSampler` from the settings of this builder.

    Args:
        lengths: The lengths of samples.
        shuffle: ``True`` to shuffle batches.
        gradient_accumulation: Number of mini-batches per update step. ``batch_size`` and
            ``batch_max_tokens`` are scaled down by it through ``self.scale``.
        **kwargs: Not used.

    Returns:
        The sampler.
    """
    batch_size, batch_max_tokens = self.scale(gradient_accumulation)
    # Keywords are used on purpose: the 4th positional parameter of ``SortingSampler`` is
    # ``use_effective_tokens``, so passing ``shuffle`` positionally silently disabled shuffling
    # and dropped the ``use_effective_tokens`` setting of this builder.
    return SortingSampler(lengths, batch_size=batch_size, batch_max_tokens=batch_max_tokens,
                          use_effective_tokens=self.use_effective_tokens, shuffle=shuffle)
def __init__(self, transform: Transform) -> None:
    """Create a component around ``transform``.

    The component and its transform end up sharing one config object: any entries already
    present in ``transform.config`` are copied into the component's fresh config, which is then
    installed on the transform, so arguments do not need to be passed around.

    Args:
        transform: The transform used by this component.
    """
    super().__init__()
    self.meta = dict(class_path=classpath_of(self), hanlp_version=hanlp.version.__version__)
    self.model: Optional[tf.keras.Model] = None
    shared = SerializableDict()
    self.config = shared
    self.transform = transform
    inherited = self.transform.config
    if inherited:
        for key, value in inherited.items():
            shared[key] = value
    self.transform.config = shared
def num_samples_in(self, dataset):
    """Return the number of samples in ``dataset`` as reported by ``size_of_dataset``.

    Args:
        dataset: The dataset to measure.
    """
    total = size_of_dataset(dataset)
    return total
def save_meta(self, save_dir, filename='meta.json', **kwargs):
    """Stamp the meta information with the current time and write it as JSON.

    Args:
        save_dir: The directory to write into.
        filename: Name of the meta file inside ``save_dir``.
        **kwargs: Extra entries merged into ``self.meta`` before saving. They are applied after
            the timestamp, so a ``create_time`` passed here wins.
    """
    # This used to be ``self.meta['create_time']: now_datetime()``, i.e. an annotation rather
    # than an assignment, so the timestamp was never stored.
    # NOTE(review): assumes ``now_datetime()`` returns something ``save_json`` can serialize - confirm.
    self.meta['create_time'] = now_datetime()
    self.meta.update(kwargs)
    save_json(self.meta, os.path.join(save_dir, filename))
def load_weights(self, save_dir, filename='model.h5', **kwargs):
    """Load model weights from ``save_dir``.

    Args:
        save_dir: The directory (passed through ``get_resource``) holding the weights file.
        filename: Name of the weights file inside ``save_dir``.
        **kwargs: Not used.

    Raises:
        AssertionError: If the model is neither built nor owns any weights yet.
    """
    # The message used to point at ``self.model.built()``; ``built`` is the flag checked here,
    # the method a user has to call is ``build()``.
    assert self.model.built or self.model.weights, 'You must call self.model.build() in build_model() ' \
                                                   'in order to load it'
    save_dir = get_resource(save_dir)
    self.model.load_weights(os.path.join(save_dir, filename))
def build(self, logger, **kwargs):
    """Build the model together with its optimizer, loss and metrics, then compile it.

    The steps run in a fixed order: the transform config is built, the model is created, the
    vocabs are locked, and only then are optimizer, loss and metrics created from ``self.config``.

    Args:
        logger: Logger forwarded to :meth:`build_metrics`.
        **kwargs: ``training``, ``loss`` and ``metrics`` are read from here (defaulting to
            ``None``, ``None`` and ``'accuracy'``); other entries are not used by this method.

    Returns:
        tuple: ``(model, optimizer, loss, metrics)``.
    """
    self.transform.build_config()
    # NOTE(review): merge_dict is assumed to overlay the keyword arguments on self.config - confirm.
    self.model = self.build_model(**merge_dict(self.config, training=kwargs.get('training', None),
                                               loss=kwargs.get('loss', None)))
    self.transform.lock_vocabs()
    optimizer = self.build_optimizer(**self.config)
    # build_loss() requires a ``loss`` argument, so supply ``None`` when the config has none.
    loss = self.build_loss(
        **self.config if 'loss' in self.config else dict(list(self.config.items()) + [('loss', None)]))
    metrics = self.build_metrics(**merge_dict(self.config, metrics=kwargs.get('metrics', 'accuracy'),
                                              logger=logger, overwrite=True))
    # A single Metric object is wrapped into a list; other non-list values are passed on as is.
    if not isinstance(metrics, list):
        if isinstance(metrics, tf.keras.metrics.Metric):
            metrics = [metrics]
    if not self.model.built:
        # Prefer running the model once on sample data; fall back to building from shapes.
        sample_inputs = self.sample_data
        if sample_inputs is not None:
            self.model(sample_inputs)
        else:
            if len(self.transform.output_shapes[0]) == 1 and self.transform.output_shapes[0][0] is None:
                x_shape = self.transform.output_shapes[0]
            else:
                x_shape = list(self.transform.output_shapes[0])
                for i, shape in enumerate(x_shape):
                    x_shape[i] = [None] + shape  # batch + X.shape
            self.model.build(input_shape=x_shape)
    self.compile_model(optimizer, loss, metrics)
    return self.model, optimizer, loss, metrics
def fit(self, trn_data, dev_data, save_dir, batch_size, epochs, run_eagerly=False, logger=None, verbose=True,
        finetune: str = None, **kwargs):
    """Train this component and restore the best checkpoint at the end.

    Args:
        trn_data: Training data, handed to :meth:`build_vocab` and :meth:`build_train_dataset`.
        dev_data: Validation data, handed to :meth:`build_valid_dataset`.
        save_dir: Directory for config, vocabs, meta, weights and history. A temporary directory
            is created through ``tempdir_human`` when it is empty.
        batch_size: Batch size used for both training and validation.
        epochs: Number of epochs.
        run_eagerly: Stored in the config (read back by the model compilation step).
        logger: Logger to use. When omitted, one writing into ``save_dir`` is created.
        verbose: ``True`` for ``INFO`` level logging, otherwise ``WARN``. Only used when the
            logger is created here.
        finetune: Path to pretrained weights, or to a directory containing ``model.h5``, loaded by
            name into the freshly built model while skipping mismatches.
        **kwargs: Further hyperparameters captured into the config.

    Returns:
        The training history. After a ``KeyboardInterrupt`` it is the ``History`` callback found
        among the callbacks, if any.
    """
    # Must stay the very first statement so that ``locals()`` holds nothing but the arguments.
    self._capture_config(locals())
    self.transform = self.build_transform(**self.config)
    if not save_dir:
        save_dir = tempdir_human()
    if not logger:
        logger = init_logger(name='train', root_dir=save_dir, level=logging.INFO if verbose else logging.WARN)
    # Emitted only now: ``logger`` defaults to ``None`` and used to be dereferenced before it
    # was created, which crashed every default call on Python >= 3.10.
    if sys.version_info >= (3, 10):
        logger.warning(f'Training with TensorFlow {tf.__version__} has not been tested on Python '
                       f'{sys.version_info.major}.{sys.version_info.minor}. Please downgrade to '
                       f'Python<=3.9 in case any compatibility issues arise.')
    logger.info('Hyperparameter:\n' + self.config.to_json())
    num_examples = self.build_vocab(trn_data, logger)
    logger.info('Building...')
    # Step counts are only known when build_vocab reports the number of training examples.
    train_steps_per_epoch = math.ceil(num_examples / batch_size) if num_examples else None
    self.config.train_steps = train_steps_per_epoch * epochs if num_examples else None
    model, optimizer, loss, metrics = self.build(**merge_dict(self.config, logger=logger, training=True))
    logger.info('Model built:\n' + summary_of_model(self.model))
    if finetune:
        finetune = get_resource(finetune)
        if os.path.isdir(finetune):
            finetune = os.path.join(finetune, 'model.h5')
        model.load_weights(finetune, by_name=True, skip_mismatch=True)
        logger.info(f'Loaded pretrained weights from {finetune} for finetuning')
    self.save_config(save_dir)
    self.save_vocabs(save_dir)
    self.save_meta(save_dir)
    trn_data = self.build_train_dataset(trn_data, batch_size, num_examples)
    dev_data = self.build_valid_dataset(dev_data, batch_size)
    callbacks = self.build_callbacks(save_dir, **merge_dict(self.config, overwrite=True, logger=logger))
    # need to know #batches, otherwise progbar crashes
    dev_steps = math.ceil(self.num_samples_in(dev_data) / batch_size)
    checkpoint = get_callback_by_class(callbacks, tf.keras.callbacks.ModelCheckpoint)
    timer = Timer()
    try:
        history = self.train_loop(**merge_dict(self.config, trn_data=trn_data, dev_data=dev_data, epochs=epochs,
                                               num_examples=num_examples,
                                               train_steps_per_epoch=train_steps_per_epoch, dev_steps=dev_steps,
                                               callbacks=callbacks, logger=logger, model=model, optimizer=optimizer,
                                               loss=loss,
                                               metrics=metrics, overwrite=True))
    except KeyboardInterrupt:
        print()
        # ``np.inf`` instead of the ``np.Inf`` alias, which no longer exists in NumPy 2.
        if not checkpoint or checkpoint.best in (np.inf, -np.inf):
            # Nothing has been checkpointed yet, so keep the current weights.
            self.save_weights(save_dir)
            logger.info('Aborted with model saved')
        else:
            logger.info(f'Aborted with model saved with best {checkpoint.monitor} = {checkpoint.best:.4f}')
        history = get_callback_by_class(callbacks, tf.keras.callbacks.History)
    delta_time = timer.stop()
    best_epoch_ago = 0
    if history and hasattr(history, 'epoch'):
        trained_epoch = len(history.epoch)
        logger.info('Trained {} epochs in {}, each epoch takes {}'.
                    format(trained_epoch, delta_time, delta_time / trained_epoch if trained_epoch else delta_time))
        save_json(history.history, io_util.path_join(save_dir, 'history.json'), cls=NumpyEncoder)
        # Without a checkpoint callback there is no monitored value to look up.
        monitor_history: List = history.history.get(checkpoint.monitor, None) if checkpoint else None
        if monitor_history:
            best_epoch_ago = len(monitor_history) - monitor_history.index(checkpoint.best)
        if checkpoint and monitor_history and checkpoint.best != monitor_history[-1]:
            logger.info(f'Restored the best model saved with best '
                        f'{checkpoint.monitor} = {checkpoint.best:.4f} '
                        f'saved {best_epoch_ago} epochs ago')
            self.load_weights(save_dir)  # restore best model
    return history
def build_callbacks(self, save_dir, logger, **kwargs):
    """Assemble the Keras callbacks used during training.

    A best-only weights checkpoint, a TensorBoard logger and a CSV logger are always created.
    A learning rate scheduler, a plateau annealer and early stopping are appended when the
    corresponding config entries (``lr_decay_per_epoch``, ``anneal_factor``,
    ``early_stopping_patience``) are set.

    Args:
        save_dir: Directory receiving ``model.h5``, ``logs`` and ``train.log``.
        logger: Logger for reporting which callbacks were created.
        **kwargs: ``metrics`` selects the monitored quantity (its last item if it is a list or tuple).

    Returns:
        list: The callbacks.
    """
    monitored = kwargs.get('metrics', 'accuracy')
    if isinstance(monitored, (list, tuple)):
        monitored = monitored[-1]
    monitor = f'val_{monitored}'

    weights_path = os.path.join(save_dir, 'model.h5')
    checkpoint = tf.keras.callbacks.ModelCheckpoint(weights_path, monitor=monitor, save_best_only=True,
                                                    mode='max', save_weights_only=True)
    logger.debug(f'Monitor {checkpoint.monitor} for checkpoint')
    log_dir = io_util.makedirs(io_util.path_join(save_dir, 'logs'))
    tensorboard = tf.keras.callbacks.TensorBoard(log_dir=log_dir)
    csv_log = FineCSVLogger(os.path.join(save_dir, 'train.log'), separator=' | ', append=True)
    callbacks = [checkpoint, tensorboard, csv_log]

    decay = self.config.get('lr_decay_per_epoch', None)
    if decay:
        base_lr = self.model.optimizer.get_config().get('learning_rate', None)
        if base_lr:
            logger.debug(f'Created LearningRateScheduler with lr_decay_per_epoch={decay}')

            def schedule(epoch):
                # Inverse-time decay of the optimizer's configured learning rate.
                return base_lr / (1 + decay * epoch)

            callbacks.append(tf.keras.callbacks.LearningRateScheduler(schedule))
        else:
            logger.warning('Learning rate decay not supported for optimizer={}'.format(repr(self.model.optimizer)))

    factor = self.config.get('anneal_factor', None)
    if factor:
        patience = self.config.get('anneal_patience', 10)
        callbacks.append(tf.keras.callbacks.ReduceLROnPlateau(factor=factor, patience=patience))

    stop_patience = self.config.get('early_stopping_patience', None)
    if stop_patience:
        callbacks.append(tf.keras.callbacks.EarlyStopping(monitor=monitor, mode='max', verbose=1,
                                                          patience=stop_patience))
    return callbacks
@staticmethod
def from_meta(meta: dict, **kwargs):
    """Instantiate the component named by ``meta`` and load it.

    Args:
        meta: Must provide ``class_path`` (resolved to a class) and ``load_path`` (handed to ``load``).
        **kwargs: Not used.

    Returns:
        The loaded component.
    """
    component_type = str_to_type(meta['class_path'])
    component = component_type()
    # Checked after construction, exactly as before, so failures surface in the same order.
    assert 'load_path' in meta, f'{meta} doesn\'t contain load_path field'
    component.load(meta['load_path'])
    return component
def serve(self, export_dir=None, grpc_port=8500, rest_api_port=0, overwrite=False, dry_run=False):
    """Export the model and launch ``tensorflow_model_server`` in the background.

    Args:
        export_dir: Directory to export into; forwarded to ``export_model_for_serving``.
        grpc_port: Value for the server's ``--port`` option.
        rest_api_port: Value for the server's ``--rest_api_port`` option.
        overwrite: Forwarded to ``export_model_for_serving``.
        dry_run: ``True`` to only log the server command. The model is kept in memory and the
            server is not started; ``saved_model_cli show`` still runs.
    """
    import shlex  # local import: quoting is only needed here

    export_dir = self.export_model_for_serving(export_dir, show_hint=False, overwrite=overwrite)
    if not dry_run:
        del self.model  # free memory
    logger.info('The inputs of exported model is shown below.')
    # Every interpolated value is shell-quoted so paths containing spaces or shell
    # metacharacters neither break the command nor get interpreted by the shell.
    version_dir = shlex.quote(f'{export_dir}/1')
    os.system(f'saved_model_cli show --all --dir {version_dir}')
    model_name = shlex.quote(os.path.splitext(os.path.basename(self.meta["load_path"]))[0])
    base_path = shlex.quote(str(export_dir))
    cmd = f'nohup tensorflow_model_server --model_name={model_name} ' \
          f'--model_base_path={base_path} --port={shlex.quote(str(grpc_port))} ' \
          f'--rest_api_port={shlex.quote(str(rest_api_port))} ' \
          f'>serve.log 2>&1 &'
    logger.info(f'Running ...\n{cmd}')
    if not dry_run:
        os.system(cmd)
class History(object):
    """A history of training context.

    It counts the mini-batches seen so far and answers two questions: whether the current
    mini-batch should trigger a parameter update, and how many updates a dataloader of a
    given size will produce under ``gradient_accumulation``.
    """

    def __init__(self):
        self.num_mini_batches = 0

    def step(self, gradient_accumulation):
        """Register one mini-batch and tell whether an update is due.

        Args:
            gradient_accumulation: Number of batches per update.

        Returns:
            bool: ``True`` to update.
        """
        self.num_mini_batches += 1
        remainder = self.num_mini_batches % gradient_accumulation
        return remainder == 0

    def num_training_steps(self, num_batches, gradient_accumulation):
        """Calculate the number of training steps.

        Args:
            num_batches: Size of dataloader.
            gradient_accumulation: Number of batches per update.

        Returns:
            int: How many of the next ``num_batches`` mini-batches trigger an update.
        """
        first = self.num_mini_batches + 1
        upcoming = range(first, first + num_batches)
        return sum(1 for batch_no in upcoming if batch_no % gradient_accumulation == 0)
def save_weights(self, save_dir, filename='model.pt', trainable_only=True, **kwargs):
    """Save model weights to a directory.

    Args:
        save_dir: The directory to save weights into.
        filename: A file name for weights.
        trainable_only: ``True`` to keep only the entries of the state dict whose names belong
            to parameters with ``requires_grad`` set. Useful when the model contains lots of
            static embeddings.
        **kwargs: Not used for now.
    """
    module = self.model_
    weights = module.state_dict()
    if trainable_only:
        keep = {name for name, param in module.named_parameters() if param.requires_grad}
        weights = {name: tensor for name, tensor in weights.items() if name in keep}
    target = os.path.join(save_dir, filename)
    torch.save(weights, target)
def load_config(self, save_dir, filename='config.json', **kwargs):
    """Load config from a directory.

    Entries that are dicts carrying a ``classpath`` key are turned back into objects through
    ``Configurable.from_config``; afterwards ``on_config_ready`` is invoked.

    Args:
        save_dir: The directory to load config.
        filename: A file name for config.
        **kwargs: K-V pairs to override config.
    """
    save_dir = get_resource(save_dir)
    config_path = os.path.join(save_dir, filename)
    self.config.load_json(config_path)
    # Caller-supplied values take precedence over what was stored on disk.
    self.config.update(kwargs)
    for name, value in list(self.config.items()):
        if not isinstance(value, dict):
            continue
        if 'classpath' in value:
            self.config[name] = Configurable.from_config(value)
    self.on_config_ready(**self.config, save_dir=save_dir)
def fit(self,
        trn_data,
        dev_data,
        save_dir,
        batch_size,
        epochs,
        devices=None,
        logger=None,
        seed=None,
        finetune: Union[bool, str] = False,
        eval_trn=True,
        _device_placeholder=False,
        **kwargs):
    """Fit to data, triggers the training procedure. For training set and dev set, they shall be local or remote
    files.

    Args:
        trn_data: Training set.
        dev_data: Development set.
        save_dir: The directory to save trained component.
        batch_size: The number of samples in a batch.
        epochs: Number of epochs.
        devices: Devices this component will live on.
        logger: Any :class:`logging.Logger` instance.
        seed: Random seed to reproduce this training.
        finetune: ``True`` to load from ``save_dir`` instead of creating a randomly initialized component. ``str``
            to specify a different ``save_dir`` to load from.
        eval_trn: Evaluate training set after each update. This can slow down the training but provides a quick
            diagnostic for debugging.
        _device_placeholder: ``True`` to create a placeholder tensor which triggers PyTorch to occupy devices so
            other components won't take these devices as first choices.
        **kwargs: Hyperparameters used by sub-classes.

    Returns:
        Whatever ``execute_training_loop`` returns.
    """
    # Must stay the first statement: locals() has to contain nothing but the arguments.
    config = self._capture_config(locals())
    if not logger:
        logger = self.build_logger('train', save_dir)
    if seed is None:
        self.config.seed = 233 if isdebugging() else int(time.time())
    set_seed(self.config.seed)
    logger.info(self._savable_config.to_json(sort=True))

    # Resolve the requested devices into concrete ones.
    if isinstance(devices, list) or devices is None or isinstance(devices, float):
        flash('[yellow]Querying CUDA devices [blink]...[/blink][/yellow]')
        devices = -1 if isdebugging() else cuda_devices(devices)
        flash('')
    if isinstance(devices, list):
        first_device = (devices[0] if devices else -1)
    elif isinstance(devices, dict):
        first_device = next(iter(devices.values()))
    elif isinstance(devices, int):
        first_device = devices
    else:
        first_device = -1
    if _device_placeholder and first_device >= 0:
        _dummy_placeholder = self._create_dummy_placeholder_on(first_device)

    if finetune:
        if isinstance(finetune, str):
            self.load(finetune, devices=devices)
        else:
            self.load(save_dir, devices=devices)
        self.config.finetune = finetune
        self.vocabs.unlock()  # For extending vocabs
        logger.info(
            f'Finetune model loaded with {sum(p.numel() for p in self.model.parameters() if p.requires_grad)}'
            f'/{sum(p.numel() for p in self.model.parameters())} trainable/total parameters.')
    self.on_config_ready(**self.config, save_dir=save_dir)

    trn = self.build_dataloader(**merge_dict(config, data=trn_data, batch_size=batch_size, shuffle=True,
                                             training=True, device=first_device, logger=logger,
                                             vocabs=self.vocabs, overwrite=True))
    dev = self.build_dataloader(**merge_dict(config, data=dev_data, batch_size=batch_size, shuffle=False,
                                             training=None, device=first_device, logger=logger,
                                             vocabs=self.vocabs, overwrite=True)) if dev_data else None

    flash('[yellow]Building model [blink]...[/blink][/yellow]')
    # NOTE(review): when ``finetune`` is set, the model loaded above is replaced by this freshly
    # built one - confirm that sub-classes restore the loaded weights where that is expected.
    self.model = self.build_model(**merge_dict(config, training=True), logger=logger)
    flash('')
    # Checked before the model is dereferenced below, so a broken ``build_model`` is reported
    # with this message instead of an AttributeError on ``None``.
    assert self.model, 'build_model is not properly implemented.'
    logger.info(f'Model built with {sum(p.numel() for p in self.model.parameters() if p.requires_grad)}'
                f'/{sum(p.numel() for p in self.model.parameters())} trainable/total parameters.')
    _description = repr(self.model)
    if len(_description.split('\n')) < 10:
        logger.info(_description)
    self.save_config(save_dir)
    self.save_vocabs(save_dir)
    self.to(devices, logger)
    if _device_placeholder and first_device >= 0:
        del _dummy_placeholder

    criterion = self.build_criterion(**merge_dict(config, trn=trn))
    optimizer = self.build_optimizer(**merge_dict(config, trn=trn, criterion=criterion))
    metric = self.build_metric(**self.config)
    if hasattr(trn, 'dataset') and dev and hasattr(dev, 'dataset'):
        if trn.dataset and dev.dataset:
            logger.info(f'{len(trn.dataset)}/{len(dev.dataset)} samples in trn/dev set.')
    if hasattr(trn, '__len__') and dev and hasattr(dev, '__len__'):
        trn_size = len(trn) // self.config.get('gradient_accumulation', 1)
        # Width of the longest possible "step/total" string, used to align log messages.
        ratio_width = len(f'{trn_size}/{trn_size}')
    else:
        ratio_width = None
    return self.execute_training_loop(**merge_dict(config, trn=trn, dev=dev, epochs=epochs, criterion=criterion,
                                                   optimizer=optimizer, metric=metric, logger=logger,
                                                   save_dir=save_dir, devices=devices, ratio_width=ratio_width,
                                                   trn_data=trn_data, dev_data=dev_data, eval_trn=eval_trn,
                                                   overwrite=True))
@property
def _savable_config(self):
    """A copy of the config prepared for saving.

    Values exposing a ``config`` attribute are replaced by it, sets and tuples become lists,
    and dicts are processed recursively. ``classpath`` and ``hanlp_version`` are added.
    """

    def to_savable(value):
        # Keep this operand order: ``hasattr`` must not be evaluated on a SerializableDict.
        if not isinstance(value, SerializableDict) and hasattr(value, 'config'):
            value = value.config
        elif isinstance(value, (set, tuple)):
            value = list(value)
        if isinstance(value, dict):
            value = {key: to_savable(item) for key, item in value.items()}
        return value

    ordered = sorted(self.config.items())
    config = SerializableDict((key, to_savable(value)) for key, value in ordered)
    extra = {
        'classpath': classpath_of(self),
        'hanlp_version': hanlp.__version__,
    }
    config.update(extra)
    return config
def evaluate(self, tst_data, save_dir=None, logger: logging.Logger = None, batch_size=None, output=False,
             **kwargs):
    """Evaluate test set.

    Args:
        tst_data: Test set, which is usually a file path.
        save_dir: The directory to save evaluation scores or predictions.
        logger: Logger for reporting progress.
        batch_size: Batch size for test dataloader. Defaults to ``batch_size`` in config, or 32.
        output: Whether to save outputs into some file. ``True`` derives the file name from
            ``tst_data`` and ``save_dir``.
        **kwargs: Forwarded to ``evaluate_dataloader`` on top of the config.

    Returns:
        (metric, outputs) where outputs are the return values of ``evaluate_dataloader``.

    Raises:
        RuntimeError: If no model has been fitted or loaded.
    """
    if not self.model:
        raise RuntimeError('Call fit or load before evaluate.')
    if isinstance(tst_data, str):
        tst_data = get_resource(tst_data)
        filename = os.path.basename(tst_data)
    else:
        filename = None
    if output is True:
        output = self.generate_prediction_filename(tst_data if isinstance(tst_data, str) else 'test.txt',
                                                   save_dir)
    if logger is None:
        _logger_name = basename_no_ext(filename) if filename else None
        logger = self.build_logger(_logger_name, save_dir)
    if not batch_size:
        batch_size = self.config.get('batch_size', 32)
    data = self.build_dataloader(**merge_dict(self.config, data=tst_data, batch_size=batch_size, shuffle=False,
                                              device=self.devices[0], logger=logger, overwrite=True))
    # Unwrap nested ``.dataset`` attributes to reach the innermost dataset.
    dataset = data
    while dataset and hasattr(dataset, 'dataset'):
        dataset = dataset.dataset
    num_samples = len(dataset) if dataset else None
    if output and isinstance(dataset, TransformableDataset):
        def add_idx(samples):
            # Tag every non-empty sample with its position.
            for idx, sample in enumerate(samples):
                if sample:
                    sample[IDX] = idx

        add_idx(dataset.data)
        if dataset.cache:
            add_idx(dataset.cache)

    criterion = self.build_criterion(**self.config)
    metric = self.build_metric(**self.config)
    # A monotonic clock: wall-clock time may jump, yielding a zero or negative duration.
    start = time.perf_counter()
    outputs = self.evaluate_dataloader(data, criterion=criterion, filename=filename, output=output,
                                       input=tst_data, save_dir=save_dir, test=True, num_samples=num_samples,
                                       **merge_dict(self.config, batch_size=batch_size, metric=metric,
                                                    logger=logger, **kwargs))
    # Clamped so the speed report can never divide by zero.
    elapsed = max(time.perf_counter() - start, 1e-9)
    if logger:
        if num_samples:
            logger.info(f'speed: {num_samples / elapsed:.0f} samples/second')
        else:
            logger.info(f'speed: {len(data) / elapsed:.0f} batches/second')
    return metric, outputs
def to(self,
       devices: Union[int, float, List[int], Dict[str, Union[int, torch.device]]] = None,
       logger: logging.Logger = None, verbose=HANLP_VERBOSE):
    """Move this component to devices.

    Args:
        devices: Target devices. ``None``, an ``int`` or a ``float`` is resolved through
            ``cuda_devices``; ``-1`` or ``[-1]`` means no device (the model is left where it is);
            a list moves the model to its first entry and parallelizes over the rest; a dict maps
            regular expressions over module names to devices; a ``torch.device`` is used as is.
        logger: Logger for printing progress report, as copying a model from CPU to GPU can takes several seconds.
        verbose: ``True`` to print progress when logger is None.

    Raises:
        ValueError: If ``devices`` is of an unsupported type.
    """
    if devices is None:
        devices = cuda_devices(devices)
    elif devices == -1 or devices == [-1]:
        devices = []
    elif isinstance(devices, (int, float)):
        devices = cuda_devices(devices)

    if not devices:
        if logger:
            logger.info('Using [red]CPU[/red]')
        return

    if logger:
        logger.info(f'Using GPUs: [on_blue][cyan][bold]{devices}[/bold][/cyan][/on_blue]')
    if isinstance(devices, list):
        if verbose:
            flash(f'Moving model to GPUs {devices} [blink][yellow]...[/yellow][/blink]')
        self.model = self.model.to(devices[0])
        if len(devices) > 1 and not isdebugging() and not isinstance(self.model, nn.DataParallel):
            self.model = self.parallelize(devices)
    elif isinstance(devices, dict):
        for name, module in self.model.named_modules():
            # Only the displayed name changes for the root module. Matching always uses the real
            # name, so later patterns are no longer compared against '*'.
            shown_name = name if name else '*'
            for regex, device in devices.items():
                try:
                    on_device = next(module.parameters()).device
                except StopIteration:
                    continue  # nothing to move in a module without parameters
                if on_device == device:
                    continue
                if isinstance(device, int) and on_device.index == device:
                    continue
                if re.match(regex, name):
                    flash(f'Moving module [yellow]{shown_name}[/yellow] to [on_yellow][magenta][bold]{device}'
                          f'[/bold][/magenta][/on_yellow]: [red]{regex}[/red]\n')
                    module.to(device)
    elif isinstance(devices, torch.device):
        if verbose:
            flash(f'Moving model to {devices} [blink][yellow]...[/yellow][/blink]')
        self.model = self.model.to(devices)
    else:
        raise ValueError(f'Unrecognized devices {devices}')
    if verbose:
        flash('')
@property
def devices(self):
    """The devices this component lives on.

    Returns:
        ``None`` when no model has been built, the model's ``device_ids`` when it has that
        attribute, otherwise a one-element list holding the device of the first parameter.
        A model without any parameter yields an empty list.
    """
    model = self.model
    if model is None:
        return None
    if hasattr(model, 'device_ids'):
        return model.device_ids
    # A default avoids leaking StopIteration out of the property for parameterless models.
    first_param = next(model.parameters(), None)
    if first_param is None:
        return []
    return [first_param.device]
class VocabList(list):
    """A list of :class:`FieldToIndex` transforms which can be saved and loaded together."""

    def __init__(self, *fields) -> None:
        """
        Args:
            *fields: Items accepted by :meth:`append`.
        """
        super().__init__()
        for each in fields:
            # ``append`` performs the conversion; building FieldToIndex here without a vocab
            # used to raise a TypeError.
            self.append(each)

    def append(self, item: Union[str, Tuple[str, Vocab], Tuple[str, str, Vocab], FieldToIndex]) -> None:
        """Append a transform, converting shorthand forms into :class:`FieldToIndex`.

        Args:
            item: A field name, a ``(src, vocab)`` pair, a ``(src, dst, vocab)`` triple or a
                ready-made :class:`FieldToIndex`.

        Raises:
            ValueError: If ``item`` has an unsupported type or length.
        """
        if isinstance(item, str):
            # No vocab is given for a bare field name, so pass ``None`` explicitly.
            item = FieldToIndex(src=item, vocab=None)
        elif isinstance(item, (list, tuple)):
            if len(item) == 2:
                item = FieldToIndex(src=item[0], vocab=item[1])
            elif len(item) == 3:
                item = FieldToIndex(src=item[0], dst=item[1], vocab=item[2])
            else:
                raise ValueError(f'Unsupported argument length: {item}')
        elif not isinstance(item, FieldToIndex):
            raise ValueError(f'Unsupported argument type: {item}')
        # ``super(self)`` passed an instance where a type is required and always raised.
        super().append(item)

    def save_vocab(self, save_dir):
        """Save the vocab of every transform into ``save_dir`` under its default file name."""
        for each in self:
            each.save_vocab(save_dir, None)

    def load_vocab(self, save_dir):
        """Load the vocab of every transform from ``save_dir`` using its default file name."""
        for each in self:
            each.load_vocab(save_dir, None)
""" save_dir = get_resource(save_dir) vocabs = SerializableDict() vocabs.load_json(os.path.join(save_dir, filename)) self._load_vocabs(self, vocabs, vocab_cls) @staticmethod def _load_vocabs(vd, vocabs: dict, vocab_cls=Vocab): """ Args: vd: vocabs: vocab_cls: Default class for the new vocab """ for key, value in vocabs.items(): if 'idx_to_token' in value: cls = value.get('type', None) if cls: cls = str_to_type(cls) else: cls = vocab_cls vocab = cls() vocab.copy_from(value) vd[key] = vocab else: # nested Vocab # noinspection PyTypeChecker vd[key] = nested = VocabDict() VocabDict._load_vocabs(nested, value, vocab_cls) def lock(self): """ Lock each vocab. """ for key, value in self.items(): if isinstance(value, Vocab): value.lock() def unlock(self): """ Unlock each vocab. """ for key, value in self.items(): if isinstance(value, Vocab): value.unlock() @property def mutable(self): status = [v.mutable for v in self.values() if isinstance(v, Vocab)] return len(status) == 0 or any(status) def __call__(self, sample: dict): for key, value in self.items(): if isinstance(value, Vocab): field = sample.get(key, None) if field is not None: sample[f'{key}_id'] = value(field) return sample def __getattr__(self, key): if key.startswith('__'): return dict.__getattr__(key) return self.__getitem__(key) def __setattr__(self, key, value): return self.__setitem__(key, value) def __getitem__(self, k: str) -> Vocab: return super().__getitem__(k) def __setitem__(self, k: str, v: Vocab) -> None: super().__setitem__(k, v) def summary(self, logger: logging.Logger = None): """Log a summary of vocabs using a given logger. Args: logger: The logger to use. """ for key, value in self.items(): if isinstance(value, Vocab): report = value.summary(verbose=False) if logger: logger.info(f'{key}{report}') else: print(f'{key}{report}') def put(self, **kwargs): """Put names and corresponding :class:`hanlp.common.vocab.Vocab` instances into self. 
class FilterField(object):
    """A transform which keeps only the selected fields of a sample."""

    def __init__(self, *keys) -> None:
        """
        Args:
            *keys: Names of the fields to keep.
        """
        self.keys = keys

    def __call__(self, sample: dict):
        """Build a new dict holding only the selected fields.

        Args:
            sample: The sample to filter. It is not modified.

        Returns:
            A new dict with the selected fields, in the order the keys were given.

        Raises:
            KeyError: If a selected field is absent from ``sample``.
        """
        return {name: sample[name] for name in self.keys}
class LowerCase(object):
    """A transform which lower-cases a string field or a field holding a list of strings."""

    def __init__(self, src, dst=None) -> None:
        """
        Args:
            src: Name of the field to read.
            dst: Name of the field to write. ``None`` means overwriting ``src``.
        """
        self.src = src
        self.dst = src if dst is None else dst

    def __call__(self, sample: dict) -> dict:
        """Lower-case ``sample[self.src]`` into ``sample[self.dst]``.

        Args:
            sample: The sample to transform in place.

        Returns:
            The same ``sample`` object. A value that is neither ``str`` nor ``list`` is left untouched
            and nothing is written to the destination field.
        """
        value = sample[self.src]
        if isinstance(value, list):
            sample[self.dst] = [item.lower() for item in value]
        elif isinstance(value, str):
            sample[self.dst] = value.lower()
        return sample
class BMESOtoIOBES(object):
    """A transform which rewrites ``M-`` prefixed tags into ``I-`` prefixed tags."""

    def __init__(self, field='tag') -> None:
        """
        Args:
            field: Name of the field holding the list of tags.
        """
        self.field = field

    def __call__(self, sample: dict) -> dict:
        """Convert every tag in ``sample[self.field]``.

        Args:
            sample: The sample to transform in place.

        Returns:
            The same ``sample`` object with the tag field replaced by the converted list.
        """
        sample[self.field] = [self.convert(y) for y in sample[self.field]]
        return sample

    @staticmethod
    def convert(y: str):
        """Convert a single tag.

        Args:
            y: A tag such as ``M-PER``.

        Returns:
            ``I-`` followed by the original label if ``y`` starts with ``M-``, otherwise ``y`` unchanged.
        """
        if y.startswith('M-'):
            # Only the 2-character prefix changes; the label after it must survive.
            return 'I-' + y[2:]
        return y
class PunctuationMask(ConfigurableNamedTransform):
    def __init__(self, src: str, dst: str = None) -> None:
        """Mask out all punctuations (set mask of punctuations to False).

        Args:
            src: Name of the field holding a token or a sequence of tokens.
            dst: Name of the field receiving the mask. A falsy value means ``{src}_punct_mask``.
        """
        super().__init__(src, dst if dst else f'{src}_punct_mask')

    def __call__(self, sample: dict) -> dict:
        """Compute the mask for ``sample[self.src]`` and store it in ``sample[self.dst]``.

        Args:
            sample: The sample to transform in place.

        Returns:
            The same ``sample`` object. The mask is a single ``bool`` for a ``str`` input, otherwise a
            list with one ``bool`` per element; each value is ``not ispunct(token)``.
        """
        tokens = sample[self.src]
        if isinstance(tokens, str):
            mask = not ispunct(tokens)
        else:
            mask = [not ispunct(token) for token in tokens]
        sample[self.dst] = mask
        return sample
def build_config(self):
    """Populate ``output_types``, ``output_shapes`` and ``padding_values``.

    The three values are taken from :meth:`create_types_shapes_values`. Subclasses may perform other
    building tasks here; remember to call ``super().build_config()``.
    """
    types, shapes, values = self.create_types_shapes_values()
    self.output_types = types
    self.output_shapes = shapes
    self.padding_values = values
def inputs_to_samples(self, inputs, gold=False):
    """Turn inputs into samples, lazily.

    Parameters
    ----------
    inputs
        An iterable of inputs.
    gold : bool
        When true, the items of ``inputs`` are yielded unchanged. Otherwise each item ``x`` is yielded
        as the pair ``(x, self.padding_values[-1])``.

    Yields
    ------
    The samples, one at a time.
    """
    if not gold:
        for features in inputs:
            # The last padding value is looked up per item, exactly when the pair is produced.
            yield features, self.padding_values[-1]
        return
    yield from inputs
def samples_to_dataset(self, samples: Generator, map_x=None, map_y=None, batch_size=32, shuffle=None, repeat=None,
                       drop_remainder=False, prefetch=1, cache=True) -> tf.data.Dataset:
    """Wrap a sample generator into a ``tf.data.Dataset``.

    The pipeline is assembled in this order: ``from_generator``, then optionally ``cache``, ``shuffle``,
    ``repeat``, ``padded_batch``, ``prefetch`` and finally ``map`` with ``x_to_idx`` / ``y_to_idx``.
    If the output types, shapes or padding values have not been set yet, ``build_config()`` is called
    first.

    Parameters
    ----------
    samples
        Passed to ``tf.data.Dataset.from_generator`` as the generator.
    map_x : bool
        Whether to apply ``x_to_idx``. ``None`` falls back to ``self.map_x``.
    map_y : bool
        Whether to apply ``y_to_idx``. ``None`` falls back to ``self.map_y``.
    batch_size
        Batch size for ``padded_batch``. A falsy value skips batching.
    shuffle
        Shuffle buffer size. ``True`` means 1024, a falsy value disables shuffling.
    repeat
        Passed to ``Dataset.repeat``. A falsy value disables repeating.
    drop_remainder
        Forwarded to ``padded_batch``.
    prefetch
        Passed to ``Dataset.prefetch``. A falsy value disables prefetching.
    cache
        A truthy value enables caching. A ``str`` is used as the cache filename, otherwise ``''`` is
        passed to ``Dataset.cache``.

    Returns
    -------
    tf.data.Dataset
    """
    output_types, output_shapes, padding_values = self.output_types, self.output_shapes, self.padding_values
    # All three specs are required below; check each of them (output_types included).
    if not all([output_types, output_shapes, padding_values]):
        # print('Did you forget to call build_config() on your transform?')
        self.build_config()
        output_types, output_shapes, padding_values = self.output_types, self.output_shapes, self.padding_values
    assert all([output_types, output_shapes, padding_values]), \
        'Your create_types_shapes_values returns None, which is not allowed'
    # Snapshot the py_funcs registered on the default graph so that the ones added while building
    # this dataset can be recorded in self.py_func_set_to_cleanup.
    if not hasattr(tf.compat.v1.get_default_graph(), '_py_funcs_used_in_graph'):
        tf.compat.v1.get_default_graph()._py_funcs_used_in_graph = []
    py_func_set_before = set(tf.compat.v1.get_default_graph()._py_funcs_used_in_graph)
    dataset = tf.data.Dataset.from_generator(samples, output_types=output_types, output_shapes=output_shapes)
    if cache:
        logger.debug('Dataset cache enabled')
        dataset = dataset.cache(cache if isinstance(cache, str) else '')
    if shuffle:
        if isinstance(shuffle, bool):
            shuffle = 1024
        dataset = dataset.shuffle(shuffle)
    if repeat:
        dataset = dataset.repeat(repeat)
    if batch_size:
        dataset = dataset.padded_batch(batch_size, output_shapes, padding_values, drop_remainder)
    if prefetch:
        dataset = dataset.prefetch(prefetch)
    if map_x is None:
        map_x = self.map_x
    if map_y is None:
        map_y = self.map_y
    if map_x or map_y:
        def mapper(X, Y):
            if map_x:
                X = self.x_to_idx(X)
            if map_y:
                Y = self.y_to_idx(Y)
            return X, Y

        dataset = dataset.map(mapper, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    py_func_set_after = set(tf.compat.v1.get_default_graph()._py_funcs_used_in_graph) - py_func_set_before
    self.py_func_set_to_cleanup |= py_func_set_after
    return dataset
def cleanup(self):
    """Remove the py_funcs recorded in ``self.py_func_set_to_cleanup`` from the default graph.

    The entries are dropped from the graph's ``_py_funcs_used_in_graph`` list; the remaining entries
    keep their relative order and duplicates are collapsed. ``self.py_func_set_to_cleanup`` is then
    reset to an empty set. If the default graph does not have the list at all, only the reset happens.
    """
    graph = tf.compat.v1.get_default_graph()
    # The attribute is private to TensorFlow and may be absent, so do not assume it exists.
    registered = getattr(graph, '_py_funcs_used_in_graph', None)
    if registered is not None:
        stale = self.py_func_set_to_cleanup
        # dict.fromkeys de-duplicates like a set would, but preserves registration order.
        graph._py_funcs_used_in_graph = list(dict.fromkeys(f for f in registered if f not in stale))
    self.py_func_set_to_cleanup = set()
def __contains__(self, key: Union[str, int]):
    """Check whether a token or an index belongs to this vocab.

    Args:
        key: ``str`` for a token, ``int`` for an index.

    Returns:
        ``True`` if the token is in ``token_to_idx`` or the index is known. When ``idx_to_token`` has
        been built, an index is known if it falls in ``[0, len(idx_to_token))``; when it has not, the
        values of ``token_to_idx`` are searched instead (linear time). Any other key type gives
        ``False``.
    """
    if isinstance(key, str):
        return key in self.token_to_idx
    elif isinstance(key, int):
        if self.idx_to_token is None:
            # idx_to_token is None until it is built, so len() on it would raise TypeError.
            return key in self.token_to_idx.values()
        return 0 <= key < len(self.idx_to_token)
    else:
        return False
def get_idx(self, token: str) -> int:
    """Get the idx of a token.

    If the token is unknown, it is added to the vocab when the vocab is mutable (not locked);
    otherwise the id stored for ``unk_token`` is returned, which is ``None`` if there is no such entry.

    Args:
        token: A token.

    Returns:
        The id of that token.
    """
    assert isinstance(token, str), 'token has to be `str`'
    table = self.token_to_idx
    known = table.get(token, None)
    if known is not None:
        return known
    if not self.mutable:
        return table.get(self.unk_token, None)
    # Unknown token in a mutable vocab: assign the next free id.
    fresh = len(table)
    table[token] = fresh
    return fresh
""" if self.locked: return self self.mutable = False self.build_idx_to_token() return self def build_idx_to_token(self): max_idx = max(self.token_to_idx.values()) self.idx_to_token = [None] * (max_idx + 1) for token, idx in self.token_to_idx.items(): self.idx_to_token[idx] = token def unlock(self): """Unlock this vocab so that new tokens can be added in. Returns: Itself. """ if not self.locked: return self.mutable = True self.idx_to_token = None return self @property def locked(self): """ ``True`` indicates this vocab is locked. """ return not self.mutable @property def unk_idx(self): """ The index of ``UNK`` token. """ if self.unk_token is None: return None else: return self.token_to_idx.get(self.unk_token, None) @property def pad_idx(self): """ The index of ``PAD`` token. """ if self.pad_token is None: return None else: return self.token_to_idx.get(self.pad_token, None) @property def tokens(self): """ A set of all tokens in this vocab. """ return self.token_to_idx.keys() def __str__(self) -> str: return self.token_to_idx.__str__() def summary(self, verbose=True) -> str: """Get or print a summary of this vocab. Args: verbose: ``True`` to print the summary to stdout. Returns: Summary in text form. 
""" # report = 'Length: {}\n'.format(len(self)) # report += 'Samples: {}\n'.format(str(list(self.token_to_idx.keys())[:min(50, len(self))])) # report += 'Mutable: {}'.format(self.mutable) # report = report.strip() report = '[{}] = '.format(len(self)) report += str(list(self.token_to_idx.keys())[:min(50, len(self))]) if verbose: print(report) return report def __call__(self, some_token: Union[str, Iterable[str]]) -> Union[int, List[int]]: if isinstance(some_token, (list, tuple, set)): indices = [] if len(some_token) and isinstance(some_token[0], (list, tuple, set)): for sent in some_token: inside = [] for token in sent: inside.append(self.get_idx(token)) indices.append(inside) return indices for token in some_token: indices.append(self.get_idx(token)) return indices else: return self.get_idx(some_token) def to_dict(self) -> dict: """Convert this vocab to a dict so that it can be json serialized. Returns: A dict. """ idx_to_token = self.idx_to_token pad_token = self.pad_token unk_token = self.unk_token mutable = self.mutable items = locals().copy() items.pop('self') return items def copy_from(self, item: dict): """Copy properties from a dict so that it can json de-serialized. Args: item: A dict holding ``token_to_idx`` Returns: Itself. """ for key, value in item.items(): setattr(self, key, value) self.token_to_idx = {k: v for v, k in enumerate(self.idx_to_token)} return self def lower(self): """Convert all tokens to lower case. Returns: Itself. """ self.unlock() token_to_idx = self.token_to_idx self.token_to_idx = {} for token in token_to_idx.keys(): self.add(token.lower()) return self @property def first_token(self): """The first token in this vocab. """ if self.idx_to_token: return self.idx_to_token[0] if self.token_to_idx: return next(iter(self.token_to_idx)) return None def merge(self, other): """Merge this with another vocab inplace. Args: other (Vocab): Another vocab. 
""" for word, idx in other.token_to_idx.items(): self.get_idx(word) @property def safe_pad_token(self) -> str: """Get the pad token safely. It always returns a pad token, which is the pad token or the first token if pad does not present in the vocab. """ if self.pad_token: return self.pad_token if self.first_token: return self.first_token return PAD @property def safe_pad_token_idx(self) -> int: """Get the idx to the pad token safely. It always returns an index, which corresponds to the pad token or the first token if pad does not present in the vocab. """ return self.token_to_idx.get(self.safe_pad_token, 0) @property def safe_unk_token(self) -> str: """Get the unk token safely. It always returns a unk token, which is the unk token or the first token if unk does not presented in the vocab. """ if self.unk_token: return self.unk_token if self.first_token: return self.first_token return UNK def __repr__(self) -> str: if self.idx_to_token is not None: return self.idx_to_token.__repr__() return self.token_to_idx.__repr__() def extend(self, tokens: Iterable[str]): self.unlock() self(tokens) def reload_idx_to_token(self, idx_to_token: List[str], pad_idx=0, unk_idx=1): self.idx_to_token = idx_to_token self.token_to_idx = dict((s, i) for i, s in enumerate(idx_to_token)) if pad_idx is not None: self.pad_token = idx_to_token[pad_idx] if unk_idx is not None: self.unk_token = idx_to_token[unk_idx] def set_unk_as_safe_unk(self): """Set ``self.unk_token = self.safe_unk_token``. It's useful when the dev/test set contains OOV labels. 
""" self.unk_token = self.safe_unk_token def clear(self): self.unlock() self.token_to_idx.clear() class CustomVocab(Vocab): def to_dict(self) -> dict: d = super().to_dict() d['type'] = classpath_of(self) return d class LowercaseVocab(CustomVocab): def get_idx(self, token: str) -> int: idx = self.token_to_idx.get(token, None) if idx is None: idx = self.token_to_idx.get(token.lower(), None) if idx is None: if self.mutable: idx = len(self.token_to_idx) self.token_to_idx[token] = idx else: idx = self.token_to_idx.get(self.unk_token, None) return idx class VocabWithNone(CustomVocab): def get_idx(self, token: str) -> int: if token is None: return -1 return super().get_idx(token) class VocabWithFrequency(CustomVocab): def __init__(self, counter: Counter = None, min_occur_cnt=0, pad_token=PAD, unk_token=UNK, specials=None) -> None: super().__init__(None, None, True, pad_token, unk_token) if specials: for each in specials: counter.pop(each, None) self.add(each) self.frequencies = [1] * len(self) if counter: for token, freq in counter.most_common(): if freq >= min_occur_cnt: self.add(token) self.frequencies.append(freq) self.lock() def to_dict(self) -> dict: d = super().to_dict() d['frequencies'] = self.frequencies return d def copy_from(self, item: dict): super().copy_from(item) self.frequencies = item['frequencies'] def get_frequency(self, token): idx = self.get_idx(token) if idx is not None: return self.frequencies[idx] return 0 class VocabCounter(CustomVocab): def __init__(self, idx_to_token: List[str] = None, token_to_idx: Dict = None, mutable=True, pad_token=PAD, unk_token=UNK) -> None: super().__init__(idx_to_token, token_to_idx, mutable, pad_token, unk_token) self.counter = Counter() def get_idx(self, token: str) -> int: if self.mutable: self.counter[token] += 1 return super().get_idx(token) def trim(self, min_frequency): assert self.mutable specials = {self.unk_token, self.pad_token} survivors = list((token, freq) for token, freq in self.counter.most_common() if 
freq >= min_frequency and token not in specials) survivors = [(x, -1) for x in specials if x] + survivors self.counter = Counter(dict(survivors)) self.token_to_idx = dict() self.idx_to_token = None for token, freq in survivors: idx = len(self.token_to_idx) self.token_to_idx[token] = idx def copy_from(self, item: dict): super().copy_from(item) self.counter = Counter(item['counter'].items()) if 'counter' in item else Counter() def to_dict(self) -> dict: d = super().to_dict() d['counter'] = dict(self.counter.items()) return d class Vocab3D(CustomVocab): def __call__(self, some_token: Union[str, Iterable[str], Iterable[Iterable[str]]]) \ -> Union[int, List[int], List[List[int]]]: """It supports 3D arrays of tokens. Args: some_token: Tokens of 1D to 3D Returns: A list of indices. """ if isinstance(some_token, (list, tuple, set)): indices = [] if len(some_token) and isinstance(some_token[0], (list, tuple, set)): for sent in some_token: inside = [] for token in sent: inside.append(self.get_idx(token)) indices.append(inside) return indices for token in some_token: if isinstance(token, str): indices.append(self.get_idx(token)) else: indices.append([self.get_idx(x) for x in token]) return indices else: return self.get_idx(some_token) def create_label_vocab() -> Vocab: return Vocab(pad_token=None, unk_token=None) ================================================ FILE: hanlp/common/vocab_tf.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2019-06-13 22:42 from typing import List, Dict, Union, Iterable from hanlp_common.structure import Serializable from hanlp_common.constant import PAD, UNK import tensorflow as tf from tensorflow.python.ops.lookup_ops import index_table_from_tensor class VocabTF(Serializable): def __init__(self, idx_to_token: List[str] = None, token_to_idx: Dict = None, mutable=True, pad_token=PAD, unk_token=UNK) -> None: super().__init__() if idx_to_token: t2i = dict((token, idx) for idx, token in 
def __contains__(self, key: Union[str, int]):
    """Check whether a token or an index belongs to this vocab.

    Args:
        key: ``str`` for a token, ``int`` for an index.

    Returns:
        ``True`` if the token is in ``token_to_idx`` or the index is known. When ``idx_to_token`` has
        been built, an index is known if it falls in ``[0, len(idx_to_token))``; when it has not, the
        values of ``token_to_idx`` are searched instead (linear time). Any other key type gives
        ``False``.
    """
    if isinstance(key, str):
        return key in self.token_to_idx
    elif isinstance(key, int):
        if self.idx_to_token is None:
            # idx_to_token is None until it is built, so len() on it would raise TypeError.
            return key in self.token_to_idx.values()
        return 0 <= key < len(self.idx_to_token)
    else:
        return False
Args: tokens: Iterable[str]: Returns: """ assert self.mutable, 'It is not allowed to update an immutable Vocab' for token in tokens: self.add(token) def get_idx(self, token: str) -> int: idx = self.token_to_idx.get(token, None) if idx is None: if self.mutable: idx = len(self.token_to_idx) self.token_to_idx[token] = idx else: idx = self.token_to_idx.get(self.unk_token, None) return idx def get_idx_without_add(self, token: str) -> int: idx = self.token_to_idx.get(token, None) if idx is None: idx = self.token_to_idx.get(self.safe_unk_token, None) return idx def get_token(self, idx: int) -> str: if self.idx_to_token: return self.idx_to_token[idx] if self.mutable: for token in self.token_to_idx: if self.token_to_idx[token] == idx: return token def has_key(self, token): return token in self.token_to_idx def __len__(self): return len(self.token_to_idx) def lock(self): if self.locked: return self self.mutable = False self.build_idx_to_token() self.build_lookup_table() return self def build_idx_to_token(self): max_idx = max(self.token_to_idx.values()) self.idx_to_token = [None] * (max_idx + 1) for token, idx in self.token_to_idx.items(): self.idx_to_token[idx] = token def build_lookup_table(self): tensor = tf.constant(self.idx_to_token, dtype=tf.string) self.token_to_idx_table = index_table_from_tensor(tensor, num_oov_buckets=1 if self.unk_idx is None else 0, default_value=-1 if self.unk_idx is None else self.unk_idx) # self.idx_to_token_table = index_to_string_table_from_tensor(self.idx_to_token, self.safe_unk_token) def unlock(self): if not self.locked: return self.mutable = True self.idx_to_token = None self.idx_to_token_table = None self.token_to_idx_table = None return self @property def locked(self): return not self.mutable @property def unk_idx(self): if self.unk_token is None: return None else: return self.token_to_idx.get(self.unk_token, None) @property def pad_idx(self): if self.pad_token is None: return None else: return self.token_to_idx.get(self.pad_token, 
None) @property def tokens(self): return self.token_to_idx.keys() def __str__(self) -> str: return self.token_to_idx.__str__() def summary(self, verbose=True) -> str: # report = 'Length: {}\n'.format(len(self)) # report += 'Samples: {}\n'.format(str(list(self.token_to_idx.keys())[:min(50, len(self))])) # report += 'Mutable: {}'.format(self.mutable) # report = report.strip() report = '[{}] = '.format(len(self)) report += str(list(self.token_to_idx.keys())[:min(50, len(self))]) if verbose: print(report) return report def __call__(self, some_token: Union[str, List[str]]) -> Union[int, List[int]]: if isinstance(some_token, list): indices = [] for token in some_token: indices.append(self.get_idx(token)) return indices else: return self.get_idx(some_token) def lookup(self, token_tensor: tf.Tensor) -> tf.Tensor: if self.mutable: self.lock() return self.token_to_idx_table.lookup(token_tensor) def to_dict(self) -> dict: idx_to_token = self.idx_to_token pad_token = self.pad_token unk_token = self.unk_token mutable = self.mutable items = locals().copy() items.pop('self') return items def copy_from(self, item: dict): for key, value in item.items(): setattr(self, key, value) self.token_to_idx = {k: v for v, k in enumerate(self.idx_to_token)} if not self.mutable: self.build_lookup_table() def lower(self): self.unlock() token_to_idx = self.token_to_idx self.token_to_idx = {} for token in token_to_idx.keys(): self.add(token.lower()) return self @property def first_token(self): if self.idx_to_token: return self.idx_to_token[0] if self.token_to_idx: return next(iter(self.token_to_idx)) return None def merge(self, other): for word, idx in other.token_to_idx.items(): self.get_idx(word) @property def safe_pad_token(self) -> str: """Get the pad token safely. It always returns a pad token, which is the token closest to pad if not presented in the vocab. 
Args: Returns: """ if self.pad_token: return self.pad_token if self.first_token: return self.first_token return PAD @property def safe_pad_token_idx(self) -> int: return self.token_to_idx.get(self.safe_pad_token, 0) @property def safe_unk_token(self) -> str: """Get the unk token safely. It always returns a unk token, which is the token closest to unk if not presented in the vocab. Args: Returns: """ if self.unk_token: return self.unk_token if self.first_token: return self.first_token return UNK def create_label_vocab() -> VocabTF: return VocabTF(pad_token=None, unk_token=None) ================================================ FILE: hanlp/components/__init__.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2019-08-26 16:10 from .pipeline import Pipeline ================================================ FILE: hanlp/components/amr/__init__.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2020-08-20 17:35 ================================================ FILE: hanlp/components/amr/amrbart/__init__.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2022-12-05 17:53 ================================================ FILE: hanlp/components/amr/amrbart/bart_amr_generation.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2022-12-05 17:56 import logging import os.path from typing import Callable, Union, List import penman import torch from torch.utils.data import DataLoader from hanlp.components.amr.amrbart.data_interface.dataset import AMR2TextDataSet from hanlp.common.dataset import SortingSamplerBuilder, PadSequenceDataLoader from hanlp.common.torch_component import TorchComponent from hanlp.components.amr.seq2seq.dataset.dataset import AMRDataset from hanlp.layers.transformers.pt_imports import AutoConfig_ from hanlp.utils.time_util import CountdownTimer from 
class BART_AMR_Generation(TorchComponent):
    """Text generation from AMR graphs with a BART model.

    Only the inference path is implemented here; the training and evaluation hooks are
    empty stubs.
    """

    def __init__(self, **kwargs) -> None:
        super().__init__(**kwargs)
        self.model: BartForConditionalGeneration = None
        self.transformer_config = None
        self.tokenizer: AMRBartTokenizer = None

    def build_dataloader(self, data, batch_size=32, shuffle=False, device=None, logger: logging.Logger = None,
                         sampler_builder=None, **kwargs) -> DataLoader:
        """Wrap ``data`` in a dataloader that linearizes and tokenizes each AMR sample.

        Args:
            data: Samples handed to ``AMRDataset``.
            batch_size: Forwarded to ``PadSequenceDataLoader``.
            shuffle: Forwarded to the sampler builder and the dataloader.
            device: Device forwarded to the dataloader.
            logger: Unused here.
            sampler_builder: Optional sampler builder; a ``SortingSamplerBuilder`` with
                ``batch_max_tokens=500`` is created when it is falsy.
            **kwargs: Unused here.

        Returns:
            A ``PadSequenceDataLoader`` padding ``input_ids`` and ``labels`` with the
            transformer config's pad token id.
        """

        def linearize(sample):
            # Store the DFS linearization of the graph as a space-joined string.
            return {**sample, 'lamr': ' '.join(dfs_linearize(sample['amr']))}

        def tokenize(sample):
            return AMR2TextDataSet.tokenize(sample, tokenizer=self.tokenizer, text='text', amr='lamr')

        dataset = AMRDataset(data, generate_idx=True, cache=True)
        dataset.append_transform(linearize)
        dataset.append_transform(tokenize)
        if not sampler_builder:
            sampler_builder = SortingSamplerBuilder(batch_max_tokens=500)
        lengths = [len(sample['input_ids']) for sample in dataset]
        sampler = sampler_builder.build(lengths, shuffle, 1)
        pad_id = self.transformer_config.pad_token_id
        return PadSequenceDataLoader(dataset, batch_size, shuffle, device=device, batch_sampler=sampler,
                                     pad={'input_ids': pad_id, 'labels': pad_id})

    def build_optimizer(self, **kwargs):
        """Not implemented; does nothing."""
        pass

    def build_criterion(self, **kwargs):
        """Not implemented; does nothing."""
        pass

    def build_metric(self, **kwargs):
        """Not implemented; does nothing."""
        pass

    def execute_training_loop(self, trn: DataLoader, dev: DataLoader, epochs, criterion, optimizer, metric, save_dir,
                              logger: logging.Logger, devices, ratio_width=None, **kwargs):
        """Not implemented; does nothing."""
        pass

    def fit_dataloader(self, trn: DataLoader, criterion, optimizer, metric, logger: logging.Logger, **kwargs):
        """Not implemented; does nothing."""
        pass

    def evaluate_dataloader(self, data: DataLoader, criterion: Callable, metric=None, output=False, **kwargs):
        """Not implemented; does nothing."""
        pass

    def build_model(self, training=True, transformer=None, **kwargs) -> torch.nn.Module:
        """Load the pretrained BART model and resize its embeddings to the tokenizer size.

        Args:
            training: When false, the model is switched to eval mode.
            transformer: Identifier or path passed to ``from_pretrained``.
            **kwargs: Unused here.

        Returns:
            The loaded model.
        """
        bart = BartForConditionalGeneration.from_pretrained(
            transformer,
            config=self.transformer_config,
        )
        if not training:
            bart.eval()
        bart.resize_token_embeddings(len(self.tokenizer))
        return bart

    def input_is_flat(self, data):
        """Return whether ``data`` is a single sample (a string or a ``penman.Graph``)."""
        return isinstance(data, (str, penman.Graph))

    def predict(
            self,
            data: Union[str, List[str]],
            num_beams=5,
            max_length=1024,
            beautiful_amr_graph=True,
            verbose=False,
            **kwargs
    ):
        """Generate text for one AMR graph or a list of them.

        Args:
            data: A single sample or a list of samples; string samples are parsed with
                ``penman.loads`` and the first graph is used.
            num_beams: Beam size forwarded to generation.
            max_length: Maximum generation length.
            beautiful_amr_graph: Accepted but not used by this method.
            verbose: Log progress with a ``CountdownTimer`` after each batch.
            **kwargs: Unused here.

        Returns:
            One decoded string for a single sample, otherwise a list of strings in input order.
        """
        flat = self.input_is_flat(data)
        if flat:
            data = [data]
        samples = []
        for item in data:
            graph = penman.loads(item)[0] if isinstance(item, str) else item
            samples.append({'amr': graph})
        dataloader = self.build_dataloader(samples, **self.config, device=self.device)
        timer = CountdownTimer(len(dataloader)) if verbose else None
        results, orders = [], []
        for batch in dataloader:
            results.extend(self.predict_batch(batch, num_beams, max_length))
            orders.extend(batch[IDX])
            if timer is not None:
                timer.log()
        # Batches may arrive in sampler order; restore the caller's order.
        results = reorder(results, orders)
        return results[0] if flat else results

    def predict_batch(self, batch, num_beams, max_length):
        """Generate and decode text for one batch.

        Args:
            batch: A batch providing ``input_ids``.
            num_beams: Beam size.
            max_length: Maximum generation length.

        Returns:
            A list of decoded, whitespace-stripped strings.
        """
        tok = self.tokenizer
        generated = self.model.generate(
            batch['input_ids'],
            num_beams=num_beams,
            use_cache=True,
            decoder_start_token_id=tok.eos_token_id,
            eos_token_id=tok.eos_token_id,
            no_repeat_ngram_size=0,
            max_length=max_length,
            min_length=0,
            length_penalty=1.0,
        )
        texts = tok.batch_decode(generated, skip_special_tokens=True)
        return [text.strip() for text in texts]

    def load_config(self, save_dir: str, filename='config.json', **kwargs):
        """Load the component config and the transformer config.

        If ``save_dir`` is a directory the saved config is loaded from it; otherwise
        ``save_dir`` itself is taken as the transformer identifier.
        """
        if not os.path.isdir(save_dir):
            transformer = save_dir
            self.config.transformer = transformer
        else:
            super().load_config(save_dir, filename, **kwargs)
            transformer = self.config.transformer
        self.transformer_config = AutoConfig_.from_pretrained(transformer)

    def load_vocabs(self, save_dir, filename='vocabs.json'):
        """Load the tokenizer named by ``self.config.transformer``; both arguments are unused."""
        self.tokenizer = AMRBartTokenizer.from_pretrained(
            self.config.transformer,
            use_fast=True,
        )

    def load_weights(self, save_dir, filename='model.pt', **kwargs):
        """Does nothing; the weights are loaded in ``build_model`` via ``from_pretrained``."""
        pass
class BART_AMR_Parser(TorchComponent):
    """Text-to-AMR parser backed by a BART model.

    Prediction and evaluation are implemented; the training hooks are empty stubs.
    """

    def __init__(self, **kwargs) -> None:
        super().__init__(**kwargs)
        self.tokenizer: AMRBartTokenizer = None
        self.transformer_config = None
        self.model: BartForConditionalGeneration = None

    def build_dataloader(self, data, batch_size=32, shuffle=False, device=None, logger: logging.Logger = None,
                         sampler_builder=None, **kwargs) -> DataLoader:
        """Wrap ``data`` in a dataloader that tokenizes the ``text`` field of each sample.

        Args:
            data: Samples handed to ``AMRDataset``. When it is a string, each sample's text
                is taken from the ``snt`` metadata of its ``amr`` graph.
            batch_size: Forwarded to ``PadSequenceDataLoader``.
            shuffle: Forwarded to the sampler builder and the dataloader.
            device: Device forwarded to the dataloader.
            logger: Unused here.
            sampler_builder: Optional sampler builder; a ``SortingSamplerBuilder`` with
                ``batch_max_tokens=500`` is created when it is falsy.
            **kwargs: Unused here.

        Returns:
            A ``PadSequenceDataLoader`` padding ``input_ids`` and ``labels`` with the
            transformer config's pad token id.
        """
        dataset = AMRDataset(data, generate_idx=True, cache=True)
        if isinstance(data, str):
            dataset.append_transform(lambda x: {**x, 'text': x['amr'].metadata['snt']})
        dataset.append_transform(
            lambda x: AMRParsingDataSet.tokenize(x, tokenizer=self.tokenizer, text='text')
        )
        if not sampler_builder:
            sampler_builder = SortingSamplerBuilder(batch_max_tokens=500)
        sampler = sampler_builder.build([len(x['input_ids']) for x in dataset], shuffle, 1)
        pad_id = self.transformer_config.pad_token_id
        return PadSequenceDataLoader(dataset, batch_size, shuffle, device=device, batch_sampler=sampler,
                                     pad={'input_ids': pad_id, 'labels': pad_id})

    def build_optimizer(self, **kwargs):
        """Not implemented; does nothing."""
        pass

    def build_criterion(self, **kwargs):
        """Not implemented; does nothing."""
        pass

    def build_metric(self, **kwargs):
        """Not implemented; does nothing."""
        pass

    def execute_training_loop(self, trn: DataLoader, dev: DataLoader, epochs, criterion, optimizer, metric, save_dir,
                              logger: logging.Logger, devices, ratio_width=None, **kwargs):
        """Not implemented; does nothing."""
        pass

    def fit_dataloader(self, trn: DataLoader, criterion, optimizer, metric, logger: logging.Logger, **kwargs):
        """Not implemented; does nothing."""
        pass

    def build_model(self, training=True, transformer=None, **kwargs) -> torch.nn.Module:
        """Load the pretrained BART model and resize its embeddings to the tokenizer size.

        Args:
            training: When false, the model is switched to eval mode.
            transformer: Identifier or path passed to ``from_pretrained``.
            **kwargs: Unused here.

        Returns:
            The loaded model.
        """
        model = BartForConditionalGeneration.from_pretrained(
            transformer,
            config=self.transformer_config,
        )
        if not training:
            model.eval()
        model.resize_token_embeddings(len(self.tokenizer))
        return model

    def input_is_flat(self, data):
        """Return whether ``data`` is a single sentence (a string)."""
        return isinstance(data, str)

    def predict(
            self,
            data: Union[str, List[str]],
            num_beams=5,
            max_length=1024,
            beautiful_amr_graph=True,
            verbose=False,
            **kwargs
    ):
        """Parse one sentence or a list of sentences into AMR graphs.

        Args:
            data: A sentence or a list of sentences.
            num_beams: Beam size forwarded to generation.
            max_length: Maximum generation length.
            beautiful_amr_graph: Accepted but not used by this method.
            verbose: Log progress with a ``CountdownTimer`` after each batch.
            **kwargs: Unused here.

        Returns:
            One graph for a single sentence, otherwise a list of graphs in input order.
        """
        flat = self.input_is_flat(data)
        if flat:
            data = [data]
        dataloader = self.build_dataloader([{'text': x} for x in data], **self.config, device=self.device)
        orders = []
        results = []
        timer = CountdownTimer(len(dataloader)) if verbose else None
        for batch in dataloader:
            results.extend(self.predict_batch(batch, num_beams, max_length))
            orders.extend(batch[IDX])
            if timer is not None:
                timer.log()
        # Batches may arrive in sampler order; restore the caller's order.
        results = reorder(results, orders)
        if flat:
            results = results[0]
        return results

    def predict_batch(self, batch, num_beams, max_length):
        """Generate ``num_beams`` candidates per input and keep the best-ranked graph of each.

        Candidates of one input are ordered by ``status.value`` ascending, with ties kept in
        beam order, and the first one is returned.

        Args:
            batch: A batch providing ``input_ids``.
            num_beams: Beam size; also the number of sequences returned per input.
            max_length: Maximum generation length.

        Returns:
            A list with one ``AMRGraph`` per input of the batch.
        """
        tokenizer = self.tokenizer
        input_ids = batch['input_ids']
        preds = self.model.generate(
            input_ids,
            num_beams=num_beams,
            num_return_sequences=num_beams,
            use_cache=True,
            decoder_start_token_id=tokenizer.amr_bos_token_id,
            eos_token_id=tokenizer.amr_eos_token_id,
            no_repeat_ngram_size=0,
            max_length=max_length,
            min_length=0,
            length_penalty=1.0,
        ).tolist()
        graphs = []
        # Generated sequences come in consecutive groups of num_beams per source input.
        for i in range(0, len(preds), num_beams):
            graphs_same_source = []
            for j in range(i, i + num_beams):
                ith_pred = preds[j]
                # Overwrite the decoder start token with bos, drop padding and map the AMR
                # eos id to the regular eos id before decoding.
                ith_pred[0] = tokenizer.bos_token_id
                ith_pred = [
                    tokenizer.eos_token_id if itm == tokenizer.amr_eos_token_id else itm
                    for itm in ith_pred if itm != tokenizer.pad_token_id
                ]
                graph, status, (lin, backr) = tokenizer.decode_amr(
                    ith_pred, restore_name_ops=False
                )
                graph.status = status
                graph.nodes = lin
                graph.backreferences = backr
                graph.tokens = ith_pred
                graphs_same_source.append(graph)
            # list.sort is stable, so equal statuses keep their beam order.
            graphs_same_source.sort(key=lambda g: g.status.value)
            graphs.append(graphs_same_source)
        pieces = []
        for candidates in graphs:
            best = candidates[0]
            pieces.append(AMRGraph(best.triples, best.top, best.epidata, best.metadata))
        return pieces

    def load_config(self, save_dir: str, filename='config.json', **kwargs):
        """Load the component config and the transformer config.

        If ``save_dir`` is a directory the saved config is loaded from it; otherwise
        ``save_dir`` itself is taken as the transformer identifier.
        """
        if os.path.isdir(save_dir):
            super().load_config(save_dir, filename, **kwargs)
            transformer = self.config.transformer
        else:
            self.config.transformer = transformer = save_dir
        self.transformer_config = AutoConfig_.from_pretrained(transformer)

    def load_vocabs(self, save_dir, filename='vocabs.json'):
        """Load the tokenizer named by ``self.config.transformer``; both arguments are unused."""
        self.tokenizer = AMRBartTokenizer.from_pretrained(
            self.config.transformer,
            use_fast=True,
        )

    def load_weights(self, save_dir, filename='model.pt', **kwargs):
        """Does nothing; the weights are loaded in ``build_model`` via ``from_pretrained``."""
        pass

    @torch.no_grad()
    def evaluate_dataloader(self, data: DataLoader, criterion: Callable, metric=None, output=False, ratio_width=None,
                            logger=None, input=None, use_fast=False, num_beams=5, max_length=1024, **kwargs):
        """Predict graphs for every batch, write them to ``output`` and score them.

        Args:
            data: Dataloader whose batches provide ``input_ids``, ``amr`` (gold graphs) and indices.
            criterion: Unused here.
            metric: Unused here.
            output: Destination passed to ``write_predictions`` and to the scorer.
            ratio_width: Unused here.
            logger: Logger for progress; also receives a warning if scoring fails.
            input: Gold file passed to the scorer.
            use_fast: Score with ``compute_smatch`` instead of ``smatch_eval``.
            num_beams: Beam size forwarded to ``predict_batch``.
            max_length: Maximum generation length.
            **kwargs: Unused here.

        Returns:
            The score returned by the scorer, or ``0`` if scoring failed or there were no batches.
        """
        self.model.eval()
        timer = CountdownTimer(len(data))
        graphs = []
        orders = []
        smatch = 0
        for idx, batch in enumerate(data):
            graphs_per_batch = self.predict_batch(batch, num_beams, max_length)
            # Copy meta data from gold graph
            for gp, gg in zip(graphs_per_batch, batch['amr']):
                metadata = gg.metadata.copy()
                metadata['annotator'] = f'{self.transformer_config.name_or_path}-amr'
                metadata['date'] = str(datetime.datetime.now())
                metadata.pop('save-date', None)
                gp.metadata = metadata
            graphs.extend(graphs_per_batch)
            orders.extend(batch[IDX])
            if idx == timer.total - 1:
                # Last batch: restore the dataset order, dump predictions and score them.
                graphs = reorder(graphs, orders)
                write_predictions(output, None, graphs)
                try:
                    if use_fast:
                        smatch = compute_smatch(output, input)
                    else:
                        smatch = smatch_eval(output, input, use_fast=False)
                except Exception as e:
                    # Scoring stays best-effort (the score remains 0), but the failure is
                    # reported and KeyboardInterrupt/SystemExit are no longer swallowed.
                    (logger or logging.getLogger(__name__)).warning('Failed to compute Smatch: %s', e)
                timer.log(smatch.cstr() if isinstance(smatch, MetricDict) else f'{smatch:.2%}', ratio_percentage=False,
                          logger=logger)
            else:
                timer.log(ratio_percentage=False, logger=logger)
        return smatch

    def evaluate(self, tst_data, save_dir=None, logger: logging.Logger = None, batch_size=None, output=True, **kwargs):
        """Delegate to the parent ``evaluate``; this override only changes the default of ``output`` to ``True``."""
        return super().evaluate(tst_data, save_dir, logger, batch_size, output, **kwargs)
furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. from transformers import ( AutoTokenizer, AutoModelForSeq2SeqLM, BartTokenizer, BartForConditionalGeneration, T5Tokenizer, T5Model, T5ForConditionalGeneration, ) from transformers.optimization import ( get_cosine_schedule_with_warmup, get_cosine_with_hard_restarts_schedule_with_warmup, get_linear_schedule_with_warmup, get_polynomial_decay_schedule_with_warmup, get_constant_schedule_with_warmup, ) raw_special_tokens = ['Ġcause-01', 'Ġpossible-01', 'Ġcontrast-01', 'Ġsay-01', 'Ġhave-03', 'Ġgovern-01', 'Ġstate-01', 'Ġthink-01', 'Ġdo-02', 'Ġwant-01', 'Ġknow-01', 'Ġrecommend-01', 'Ġsee-01', 'Ġresemble-01', 'Ġmean-01', 'Ġobligate-01', 'Ġuse-01', 'Ġgood-02', 'Ġneed-01', 'Ġwork-01', 'Ġpay-01', 'Ġget-01', 'Ġattack-01', 'Ġreal-04', 'Ġbelieve-01', 'Ġsupport-01', 'Ġreport-01', 'Ġtry-01', 'Ġsame-01', 'Ġtax-01', 'Ġoppose-01', 'Ġlive-01', 'Ġtell-01', 'Ġmake-02', 'Ġdie-01', 'Ġkill-01', 'Ġnew-01', 'Ġgive-01', 'Ġincrease-01', 'Ġagree-01', 'Ġactual-02', 'Ġgo-02', 'Ġright-05', 'Ġvote-01', 'Ġmake-01', 'Ġtake-01', 'Ġseem-01', 'Ġtalk-01', 'Ġissue-02', 'Ġbecome-01', 'Ġpost-01', 'Ġhelp-01', 'Ġstart-01', 'Ġend-01', 'Ġdevelop-02', 'Ġdecide-01', 'Ġfind-01', 'Ġclaim-01', 'Ġdefend-01', 'Ġlead-02', 'Ġhigh-02', 'Ġcontrol-01', 'Ġfree-04', 'Ġtraffic-01', 'Ġlong-03', 'Ġprovide-01', 'Ġcome-01', 'Ġplan-01', 'Ġproduce-01', 'Ġchange-01', 
'Ġdiffer-02', 'Ġmarry-01', 'Ġemploy-01', 'Ġchoose-01', 'Ġfight-01', 'Ġmeet-03', 'Ġcall-01', 'Ġread-01', 'Ġunderstand-01', 'Ġsure-02', 'Ġcapable-01', 'Ġallow-01', 'Ġcrime-02', 'Ġinclude-01', 'Ġsell-01', 'Ġinfer-01', 'Ġshow-01', 'Ġfeel-01', 'Ġwar-01', 'Ġquestion-01', 'Ġlook-01', 'Ġopine-01', 'Ġlegal-02', 'Ġlose-02', 'Ġstop-01', 'Ġcreate-01', 'Ġcost-01', 'Ġcontinue-01', 'Ġbad-07', 'Ġact-02', 'Ġcare-03', 'Ġwin-01', 'Ġdiscuss-01', 'Ġdestroy-01', 'Ġpolicy-01', 'Ġelect-01', 'Ġgo-01', 'Ġtrue-01', 'Ġlie-08', 'Ġbase-02', 'Ġinsure-02', 'Ġinvest-01', 'Ġfund-01', 'Ġliberal-02', 'Ġtrade-01', 'Ġspeak-01', 'Ġinvolve-01', 'Ġfail-01', 'Ġhear-01', 'Ġlet-01', 'Ġhope-01', 'Ġinterest-01', 'Ġthreaten-01', 'Ġgrow-01', 'Ġdeal-01', 'Ġspend-01', 'Ġexist-01', 'Ġbegin-01', 'Ġdepend-01', 'Ġarrest-01', 'Ġprove-01', 'Ġbuy-01', 'Ġput-01', 'Ġget-05', 'Ġactivity-06', 'Ġoffer-01', 'Ġpersonal-02', 'Ġprotect-01', 'Ġquote-01', 'Ġwrite-01', 'Ġown-01', 'Ġbuild-01', 'Ġbenefit-01', 'Ġrelation-03', 'Ġequal-01', 'Ġsurrender-01', 'Ġexpect-01', 'Ġlike-01', 'Ġcooperate-01', 'Ġmove-01', 'Ġexcept-01', 'Ġrealize-01', 'Ġstrong-02', 'Ġhate-01', 'Ġargue-01', 'Ġask-01', 'Ġanswer-01', 'Ġlow-04', 'Ġcase-03', 'Ġresult-01', 'Ġeasy-05', 'Ġhard-02', 'Ġconcern-01', 'Ġsuspect-01', 'Ġbear-02', 'Ġserve-01', 'Ġaccept-01', 'Ġclear-06', 'Ġlove-01', 'Ġdemand-01', 'Ġlaunch-01', 'Ġexplain-01', 'Ġwrong-04', 'Ġright-06', 'Ġrequire-01', 'Ġaffect-01', 'Ġeffort-01', 'Ġforce-01', 'Ġlook-02', 'Ġwatch-01', 'Ġout-06', 'Ġoperate-01', 'Ġattempt-01', 'Ġban-01', 'Ġstudy-01', 'Ġsuggest-01', 'Ġlikely-01', 'Ġconcern-02', 'Ġthank-01', 'Ġpublic-02', 'Ġwork-09', 'Ġexemplify-01', 'Ġintend-01', 'Ġprice-01', 'Ġrespond-01', 'Ġpropose-01', 'Ġvisit-01', 'Ġcomplete-02', 'Ġtransfer-01', 'Ġaccuse-01', 'Ġcounter-01', 'Ġcut-02', 'Ġsimple-02', 'Ġcare-01', 'Ġcharge-05', 'Ġrepresent-01', 'Ġsucceed-01', 'Ġlocal-02', 'Ġmurder-01', 'Ġremember-01', 'Ġsend-01', 'Ġevidence-01', 'Ġresearch-01', 'Ġmajor-02', 'Ġwait-01', 'Ġestablish-01', 'Ġremain-01', 'Ġtest-01', 'Ġkeep-02', 
'Ġexport-01', 'Ġannounce-01', 'Ġbomb-01', 'Ġfavor-01', 'Ġdeny-01', 'Ġrun-01', 'Ġexperience-01', 'Ġexpert-01', 'Ġprevent-01', 'Ġfair-01', 'Ġknow-02', 'Ġgeneral-02', 'Ġapprove-01', 'Ġwhite-02', 'Ġdescribe-01', 'Ġshare-01', 'Ġconsider-01', 'Ġcase-04', 'Ġreceive-01', 'Ġignore-01', 'Ġlink-01', 'Ġkeep-01', 'Ġcomment-01', 'Ġsex-01', 'Ġlaugh-01', 'Ġinvestigate-01', 'Ġview-02', 'Ġproliferate-01', 'Ġrefuse-01', 'Ġfear-01', 'Ġget-03', 'Ġwill-02', 'Ġrape-01', 'Ġallege-01', 'Ġget-04', 'Ġstay-01', 'Ġrise-01', 'Ġsupply-01', 'Ġdirect-02', 'Ġhonest-01', 'Ġdebate-01', 'Ġobvious-01', 'Ġappear-02', 'Ġcampaign-01', 'Ġblack-05', 'Ġreduce-01', 'Ġask-02', 'Ġcriticize-01', 'Ġguess-01', 'Ġlearn-01', 'Ġseek-01', 'Ġaccess-01', 'Ġsafe-01', 'Ġwish-01', 'Ġwrong-02', 'Ġeducate-01', 'Ġconflict-01', 'Ġrespect-01', 'Ġreach-01', 'Ġage-01', 'Ġmention-01', 'Ġexecute-01', 'Ġfind-02', 'Ġjudge-01', 'Ġbring-01', 'Ġblame-01', 'Ġhead-01', 'Ġwell-09', 'Ġensure-01', 'Ġarm-01', 'Ġcover-01', 'Ġserious-02', 'Ġtreat-01', 'Ġteach-01', 'Ġdoubt-01', 'Ġimmigrate-01', 'Ġinvade-01', 'Ġsmuggle-01', 'Ġlack-01', 'Ġearn-01', 'Ġhold-01', 'Ġlimit-01', 'Ġparticipate-01', 'Ġsentence-01', 'Ġdamage-01', 'Ġconsider-02', 'Ġname-01', 'Ġsorry-01', 'Ġrelate-01', 'Ġcriminal-03', 'Ġleft-19', 'Ġadmit-01', 'Ġadministrate-01', 'Ġtarget-01', 'Ġrun-02', 'Ġgo-06', 'Ġimprove-01', 'Ġconstruct-01', 'Ġmoral-02', 'Ġfollow-01', 'Ġcorrect-02', 'Ġprotest-01', 'Ġleave-11', 'Ġaid-01', 'Ġvalue-01', 'Ġsense-02', 'Ġdrop-01', 'Ġface-01', 'Ġserious-01', 'Ġseize-01', 'Ġtrain-01', 'Ġwarn-01', 'Ġavoid-01', 'Ġeffective-04', 'Ġdeserve-01', 'Ġplay-01', 'Ġenter-01', 'Ġregulate-01', 'Ġnear-02', 'Ġborder-01', 'Ġsolve-01', 'Ġprefer-01', 'Ġviolate-01', 'Ġrelease-01', 'Ġcite-01', 'Ġfocus-01', 'Ġadvise-01', 'Ġsound-01', 'Ġrisk-01', 'Ġreturn-01', 'Ġlist-01', 'Ġsignificant-02', 'Ġhire-01', 'Ġsurprise-01', 'Ġopen-01', 'Ġnice-01', 'Ġraise-01', 'Ġmaintain-01', 'Ġprivate-03', 'Ġimplement-01', 'Ġassist-01', 'Ġcall-02', 'Ġcompare-01', 'Ġprofit-01', 'Ġcontribute-01', 
'Ġhave-to-do-with-04', 'Ġcorrupt-01', 'Ġclose-10', 'Ġsuffer-01', 'Ġexpand-01', 'Ġwonder-01', 'Ġresponsible-01', 'Ġtotal-01', 'Ġspecific-02', 'Ġpass-01', 'Ġhappy-01', 'Ġassume-02', 'Ġchance-02', 'Ġremove-01', 'Ġadd-02', 'Ġmanufacture-01', 'Ġexpress-01', 'Ġinspect-01', 'Ġwalk-01', 'Ġgood-03', 'Ġrule-01', 'Ġmanage-01', 'Ġhold-04', 'Ġspecial-02', 'Ġinfluence-01', 'Ġexchange-01', 'Ġtake-10', 'Ġconvict-01', 'Ġprocess-02', 'Ġtravel-01', 'Ġcarry-01', 'Ġdefine-01', 'Ġdisagree-01', 'Ġsave-02', 'Ġpermit-01', 'Ġestimate-01', 'Ġrate-01', 'Ġcall-03', 'Ġsingle-02', 'Ġabuse-01', 'Ġsign-01', 'Ġrule-03', 'Ġact-01', 'Ġachieve-01', 'Ġintervene-01', 'Ġfall-01', 'Ġattend-02', 'Ġfeel-02', 'Ġadopt-01', 'Ġfollow-02', 'Ġgo-on-15', 'Ġloan-01', 'Ġnegotiate-01', 'Ġhit-01', 'Ġcondition-01', 'Ġshort-07', 'Ġpromise-01', 'Ġrebel-01', 'Ġpromote-02', 'Ġstrengthen-01', 'Ġsanction-02', 'Ġwarm-01', 'Ġbehave-01', 'Ġhave-06', 'Ġsuffice-01', 'Ġlead-03', 'Ġtry-02', 'Ġlike-02', 'Ġfire-01', 'Ġdrive-01', 'Ġfly-01', 'Ġgain-02', 'Ġafford-01', 'Ġexplode-01', 'Ġpoint-out-02', 'Ġconsume-01', 'Ġmeasure-02', 'Ġreform-01', 'Ġenjoy-01', 'Ġsit-01', 'Ġavailable-02', 'Ġstrike-01', 'Ġsign-02', 'Ġcome-03', 'Ġnatural-03', 'Ġorganize-01', 'Ġprepare-02', 'Ġreplace-01', 'Ġhanging-07', 'Ġleave-15', 'Ġretire-01', 'Ġimport-01', 'Ġrange-01', 'Ġokay-04', 'Ġcover-03', 'Ġimagine-01', 'Ġkey-02', 'Ġsurvive-01', 'Ġfree-03', 'Ġbase-01', 'Ġcomplain-01', 'Ġnormal-02', 'Ġcomplete-01', 'Ġreveal-01', 'Ġenforce-01', 'Ġdetermine-01', 'Ġvictimize-01', 'Ġrepeat-01', 'Ġinterview-01', 'Ġmake-05', 'Ġdonate-01', 'Ġsteal-01', 'Ġquick-02', 'Ġattract-01', 'Ġanalyze-01', 'Ġally-01', 'Ġsuppose-01', 'Ġresponsible-03', 'Ġclose-01', 'Ġcombat-01', 'Ġidentify-01', 'Ġsuppose-02', 'Ġrecord-01', 'Ġnominate-01', 'Ġrely-01', 'Ġturn-02', 'Ġhandle-01', 'Ġprocess-01', 'Ġpredict-01', 'Ġdeploy-01', 'Ġfortunate-01', 'Ġeat-01', 'Ġjustify-01', 'Ġexpend-01', 'Ġbullshit-01', 'Ġdiscover-01', 'Ġenrich-01', 'Ġcommit-02', 'Ġshoot-02', 'Ġcheap-02', 'Ġreject-01', 'Ġweak-02', 
'Ġpowerful-02', 'Ġdispute-01', 'Ġlegislate-01', 'Ġissue-01', 'Ġarrive-01', 'Ġjoin-01', 'Ġapply-02', 'Ġindicate-01', 'Ġengage-01', 'Ġinnocent-01', 'Ġfast-02', 'Ġpressure-01', 'Ġpublish-01', 'Ġobtain-01', 'Ġsad-02', 'Ġconfirm-01', 'Ġtreat-03', 'Ġlead-01', 'Ġlisten-01', 'Ġoffend-01', 'Ġaddress-02', 'Ġword-01', 'Ġright-08', 'Ġnote-01', 'Ġcontain-01', 'Ġpurchase-01', 'Ġrequest-01', 'Ġgood-04', 'Ġdesign-01', 'Ġnotice-01', 'Ġpresent-01', 'Ġshock-01', 'Ġright-02', 'Ġtransport-01', 'Ġdeliver-01', 'Ġburn-01', 'Ġfault-01', 'Ġmatter-01', 'Ġabort-01', 'Ġstick-01', 'Ġconnect-01', 'Ġconclude-01', 'Ġcontract-02', 'Ġpossess-01', 'Ġend-up-03', 'Ġsearch-01', 'Ġget-02', 'Ġqualify-02', 'Ġreact-01', 'Ġconfuse-01', 'Ġanger-01', 'Ġpursue-01', 'Ġreside-01', 'Ġrelevant-01', 'Ġoccupy-01', 'Ġwithdraw-01', 'Ġokay-01', 'Ġconform-01', 'Ġdemonstrate-01', 'Ġwear-01', 'Ġhave-04', 'Ġdecrease-01', 'Ġpunish-01', 'Ġpractice-01', 'Ġcapture-01', 'Ġgo-03', 'Ġpoll-01', 'Ġshow-04', 'Ġrefer-01', 'Ġcommit-01', 'Ġdisarm-01', 'Ġbelong-01', 'Ġdivide-02', 'Ġdrink-01', 'Ġdesire-01', 'Ġsave-01', 'Ġignorant-02', 'Ġperfect-02', 'Ġposition-02', 'Ġcrap-01', 'Ġinsult-01', 'Ġprivate-02', 'Ġwaste-01', 'Ġguilty-01', 'Ġeliminate-01', 'Ġmortgage-01', 'Ġworth-01', 'Ġinherit-01', 'Ġthrow-01', 'Ġtour-01', 'Ġsuspend-01', 'Ġharm-01', 'Ġimpose-01', 'Ġimprison-01', 'Ġrecognize-01', 'Ġprosecute-01', 'Ġview-01', 'Ġforget-01', 'Ġfound-01', 'Ġchallenge-01', 'Ġtrouble-01', 'Ġsecure-02', 'Ġorder-01', 'Ġpartner-01', 'Ġspend-02', 'Ġprogressive-02', 'Ġaccount-01', 'Ġblock-01', 'Ġguarantee-01', 'Ġconvince-01', 'Ġworry-02', 'Ġendanger-01', 'Ġmovement-07', 'Ġfuck-01', 'Ġextend-01', 'Ġseparate-02', 'Ġbalance-01', 'Ġlose-03', 'Ġpower-01', 'Ġsue-02', 'Ġurge-01', 'Ġcheck-01', 'Ġpoint-01', 'Ġturn-01', 'Ġprogress-01', 'Ġrecover-01', 'Ġridiculous-02', 'Ġaccompany-01', 'Ġappear-01', 'Ġworry-01', 'Ġplace-01', 'Ġattend-01', 'Ġsleep-01', 'Ġbreak-01', 'Ġfind-out-03', 'Ġbias-01', 'Ġaccord-03', 'Ġwide-02', 'Ġenable-01', 'Ġaffair-02', 'Ġhide-01', 'Ġhold-02', 
'Ġrecognize-02', 'Ġback-01', 'Ġbet-01', 'Ġhack-04', 'Ġacquire-01', 'Ġtake-04', 'Ġpenalize-01', 'Ġmessage-01', 'Ġready-02', 'Ġcease-01', 'Ġcrazy-03', 'Ġbad-04', 'Ġcompete-02', 'Ġcontact-01', 'Ġsource-01', 'Ġset-up-03', 'Ġrestrict-01', 'Ġregard-01', 'Ġwitness-01', 'Ġlabor-01', 'Ġsmoke-02', 'Ġkick-01', 'Ġcompete-01', 'Ġhouse-01', 'Ġhurt-01', 'Ġimprovise-01', 'Ġfinance-01', 'Ġinsist-01', 'Ġfarm-01', 'Ġapply-01', 'Ġstep-01', 'Ġdeep-02', 'Ġpride-01', 'Ġbill-01', 'Ġpretend-01', 'Ġfill-01', 'Ġfine-04', 'Ġstop-03', 'Ġoffend-03', 'Ġadvertise-01', 'Ġstand-01', 'Ġaim-02', 'Ġimpact-01', 'Ġfeed-01', 'Ġgrant-01', 'Ġlast-01', 'Ġform-01', 'Ġdrive-02', 'Ġengineer-01', 'Ġinjure-01', 'Ġdevelop-01', 'Ġpresent-02', 'Ġsubsidize-01', 'Ġbring-up-02', 'Ġintelligent-01', 'Ġwelcome-01', 'Ġtake-away-05', 'Ġresolve-01', 'Ġappropriate-02', 'Ġencourage-01', 'Ġperform-02', 'Ġgo-back-19', 'Ġdeclare-02', 'Ġfull-09', 'Ġhopeful-03', 'Ġconduct-01', 'Ġsurgery-01', 'Ġdetain-01', 'Ġrelative-05', 'Ġcount-01', 'Ġglad-02', 'Ġrare-02', 'Ġcome-out-09', 'Ġapproach-02', 'Ġrace-02', 'Ġbattle-01', 'Ġcross-02', 'Ġmove-02', 'Ġquestion-03', 'Ġadminister-01', 'Ġgrow-03', 'Ġmeet-02', 'Ġdown-03', 'Ġmeet-01', 'Ġcondemn-01', 'Ġreason-01', 'Ġcarry-out-03', 'Ġworth-02', 'Ġinform-01', 'Ġstable-03', 'Ġstand-11', 'Ġutilize-01', 'Ġperpetrate-01', 'Ġassociate-01', 'Ġapologize-01', 'Ġcredit-01', 'Ġdisgust-01', 'Ġspread-03', 'Ġcommand-02', 'Ġsense-01', 'Ġdetail-01', 'Ġdefeat-01', 'Ġdistribute-01', 'Ġgive-up-07', 'Ġpain-01', 'Ġship-01', 'Ġkeep-04', 'Ġaddict-01', 'Ġcompromise-01', 'Ġlegitimate-02', 'Ġregular-02', 'Ġpick-01', 'Ġsource-02', 'Ġraid-01', 'Ġhard-04', 'Ġrain-01', 'Ġcommunicate-01', 'Ġmarket-01', 'Ġlower-05', 'Ġill-01', 'Ġdefraud-01', 'Ġposition-01', 'Ġterrible-01', 'Ġdivorce-01', 'Ġamaze-01', 'Ġedit-01', 'Ġspread-02', 'Ġclarify-10', 'Ġargue-02', 'Ġpush-01', 'Ġmiss-01', 'Ġimply-01', 'Ġdiscriminate-02', 'Ġlight-06', 'Ġappoint-01', 'Ġdelay-01', 'Ġgross-03', 'Ġput-03', 'Ġintroduce-02', 'Ġstandard-02', 'Ġpull-01', 'Ġdraw-02', 
'Ġgo-08', 'Ġaim-01', 'Ġmodern-02', 'Ġdare-01', 'Ġneighbor-01', 'Ġconfront-01', 'Ġsuperior-01', 'Ġreasonable-02', 'Ġschedule-01', 'Ġadd-01', 'Ġnew-02', 'Ġlend-01', 'Ġdouble-01', 'Ġfinish-01', 'Ġraise-03', 'Ġexcuse-02', 'Ġmonitor-01', 'Ġobserve-01', 'Ġpopular-02', 'Ġcharge-01', 'Ġbudget-01', 'Ġnegative-03', 'Ġdirect-01', 'Ġrid-01', 'Ġmake-18', 'Ġmean-02', 'Ġfame-01', 'Ġjoke-01', 'Ġbeautiful-02', 'Ġtend-02', 'Ġrob-01', 'Ġriot-01', 'Ġsponsor-01', 'Ġentitle-01', 'Ġlobby-01', 'Ġbad-02', 'Ġcollapse-01', 'Ġexpose-01', 'Ġemphasize-01', 'Ġfriendly-01', 'Ġplay-02', 'Ġinitiate-01', 'Ġappreciate-02', 'Ġremind-01', 'Ġblack-04', 'Ġefficient-01', 'Ġconverse-01', 'Ġresponsible-02', 'Ġmeasure-01', 'Ġcome-04', 'Ġeffect-03', 'Ġsubject-01', 'Ġmistake-02', 'Ġpass-03', 'Ġsignal-07', 'Ġguard-01', 'Ġopen-04', 'Ġset-02', 'Ġfun-01', 'Ġcome-up-11', 'Ġflee-05', 'Ġlabel-01', 'Ġsize-01', 'Ġconfident-01', 'Ġsmart-06', 'Ġhost-01', 'Ġtough-02', 'Ġrecall-02', 'Ġscare-01', 'Ġdream-01', 'Ġassault-01', 'Ġfreeze-02', 'Ġtake-over-12', 'Ġrecession-02', 'Ġfunction-01', 'Ġwhine-01', 'Ġshort-06', 'Ġprosper-01', 'Ġadvanced-02', 'Ġvalue-02', 'Ġbother-01', 'Ġcomply-01', 'Ġright-04', 'Ġrevolution-03', 'Ġaccomplish-01', 'Ġgo-out-17', 'Ġfigure-out-05', 'Ġslow-05', 'Ġaccountable-02', 'Ġcool-01', 'Ġdocument-01', 'Ġauthorize-01', 'Ġembargo-01', 'Ġvolunteer-01', 'Ġregister-02', 'Ġfrequent-02', 'Ġrank-01', 'Ġresist-01', 'Ġbreak-up-08', 'Ġred-02', 'Ġcomfortable-02', 'Ġexamine-01', 'Ġadjust-01', 'Ġoriginate-01', 'Ġreply-01', 'Ġbreak-18', 'Ġshoot-01', 'Ġmiss-02', 'Ġdismiss-01', 'Ġcollect-01', 'Ġdraft-01', 'Ġsubmit-01', 'Ġrelieve-01', 'Ġembarrass-01', 'Ġreturn-02', 'Ġvoluntary-02', 'Ġpure-02', 'Ġbeat-01', 'Ġbear-01', 'Ġvary-01', 'Ġsick-05', 'Ġaffair-01', 'Ġtypical-02', 'Ġnegative-02', 'Ġserve-02', 'Ġeradicate-01', 'Ġrealize-02', 'Ġperceive-01', 'Ġleave-14', 'Ġgive-16', 'Ġback-up-04', 'Ġgenerate-01', 'Ġbail-out-02', 'Ġtouch-01', 'Ġcultivate-01', 'Ġconvert-01', 'Ġdismantle-01', 'Ġservice-05', 'Ġstraight-04', 'Ġbad-05', 
'Ġforce-04', 'Ġadvocate-01', 'Ġpray-01', 'Ġdecline-01', 'Ġinfect-01', 'Ġtitle-01', 'Ġdesperate-02', 'Ġupset-01', 'Ġtolerate-01', 'Ġprohibit-01', 'Ġmind-05', 'Ġbeat-03', 'Ġveto-01', 'Ġcrash-01', 'Ġside-01', 'Ġcombine-01', 'Ġclose-13', 'Ġgo-10', 'Ġequip-01', 'Ġrant-01', 'Ġjail-01', 'Ġcopy-01', 'Ġdrop-05', 'Ġconsistent-02', 'Ġspend-04', 'Ġsend-03', 'Ġcritical-02', 'Ġcarry-on-02', 'Ġraise-02', 'Ġmotivate-01', 'Ġguide-01', 'Ġwonderful-03', 'Ġtrust-01', 'Ġreverse-01', 'Ġjust-02', 'Ġclaim-02', 'Ġsurvey-01', 'Ġspy-01', 'Ġget-22', 'Ġhave-05', 'Ġcool-04', 'Ġpicture-01', 'Ġunion-02', 'Ġmanage-02', 'Ġinstruct-01', 'Ġblow-03', 'Ġsacrifice-01', 'Ġowe-01', 'Ġappeal-01', 'Ġexceed-01', 'Ġradiate-01', 'Ġhonor-01', 'Ġseparate-01', 'Ġarrange-01', 'Ġdominate-01', 'Ġtransact-01', 'Ġgrow-up-04', 'Ġverify-01', 'Ġgo-05', 'Ġfamiliarize-01', 'Ġrenew-01', 'Ġfire-02', 'Ġtake-out-11', 'Ġinterpret-01', 'Ġvalid-02', 'Ġshow-up-02', 'Ġconfiscate-01', 'Ġshut-down-05', 'Ġcheat-03', 'Ġharass-01', 'Ġtie-01', 'Ġabuse-02', 'Ġassess-01', 'Ġcompensate-01', 'Ġsensitive-03', 'Ġsettle-02', 'Ġencounter-01', 'Ġmatch-01', 'Ġrecover-02', 'Ġtrust-02', 'Ġperform-01', 'Ġborrow-01', 'Ġselect-01', 'Ġbetray-01', 'Ġride-01', 'Ġuseful-05', 'Ġsplit-01', 'Ġshift-01', 'Ġannoy-01', 'Ġmind-01', 'Ġfair-04', 'Ġoppress-01', 'Ġinterfere-01', 'Ġcredit-02', 'Ġlaunder-01', 'Ġamount-01', 'Ġleave-13', 'Ġrescue-01', 'Ġstaff-01', 'Ġplay-11', 'Ġkind-01', 'Ġauthor-01', 'Ġsympathize-01', 'Ġupgrade-02', 'Ġsuppress-01', 'Ġwake-up-02', 'Ġinvite-01', 'Ġcome-12', 'Ġdeter-01', 'Ġbrainwash-01', 'Ġshit-01', 'Ġfix-02', 'Ġwhite-03', 'Ġgroup-01', 'Ġabsent-01', 'Ġarmor-01', 'Ġup-03', 'Ġpraise-01', 'Ġreview-01', 'Ġdry-02', 'Ġintercept-01', 'Ġbroadcast-01', 'Ġworship-01', 'Ġterm-01', 'Ġobject-01', 'Ġpledge-01', 'Ġprepare-01', 'Ġopen-up-03', 'Ġlay-01', 'Ġfile-01', 'Ġcheck-out-05', 'Ġattach-01', 'Ġsatisfy-01', 'Ġdepart-01', 'Ġopposite-01', 'Ġworsen-01', 'Ġaward-01', 'Ġpollute-01', 'Ġretaliate-01', 'Ġdisrupt-01', 'Ġreturn-05', 'Ġpopulate-01', 
'Ġenvision-01', 'Ġplease-01', 'Ġrepair-01', 'Ġslaughter-01', 'Ġsin-01', 'Ġconstitute-01', 'Ġshop-01', 'Ġtranslate-01', 'Ġassure-01', 'Ġpay-off-02', 'Ġstimulate-01', 'Ġdamn-01', 'Ġswitch-01', 'Ġdisappear-01', 'Ġreelect-01', 'Ġspin-03', 'Ġtestify-01', 'Ġlegalize-01', 'Ġprint-01', 'Ġaverage-01', 'Ġright-03', 'Ġfix-03', 'Ġundermine-01', 'Ġcome-on-25', 'Ġlicense-01', 'Ġindict-01', 'Ġtransit-01', 'Ġwash-01', 'Ġbreathe-01', 'Ġbroad-02', 'Ġleave-17', 'Ġorder-02', 'Ġhead-02', 'Ġsing-01', 'Ġentertain-01', 'Ġcomplicate-01', 'Ġpush-02', 'Ġrealistic-03', 'Ġdisappoint-01', 'Ġbother-02', 'Ġtough-03', 'Ġdisplay-01', 'Ġflow-01', 'Ġdiffer-01', 'Ġlie-07', 'Ġpremise-01', 'Ġrelocate-01', 'Ġcorrect-01', 'Ġcoordinate-01', 'Ġabandon-01', 'Ġdictate-01', 'Ġplay-08', 'Ġrebuild-01', 'Ġclean-04', 'Ġwork-out-02', 'Ġrun-13', 'Ġcurious-01', 'Ġpromote-01', 'Ġspecialize-01', 'Ġstarve-01', 'Ġshame-02', 'Ġfit-06', 'Ġflaw-01', 'Ġfigure-01', 'Ġhunt-01', 'Ġexperiment-01', 'Ġmix-01', 'Ġregular-03', 'Ġfree-01', 'Ġdeclare-01', 'Ġescape-01', 'Ġput-02', 'Ġobsess-01', 'Ġbuild-up-05', 'Ġshut-up-06', 'Ġrally-01', 'Ġdissent-01', 'Ġprogram-01', 'Ġamend-01', 'Ġinvent-01', 'Ġleak-01', 'Ġtrigger-01', 'Ġdistinguish-01', 'Ġsymbolize-01', 'Ġexcellent-02', 'Ġlook-04', 'Ġcry-02', 'Ġassign-01', 'Ġrecruit-01', 'Ġcope-01', 'Ġmigrate-01', 'Ġtake-on-09', 'Ġbless-01', 'Ġsharp-02', 'Ġuse-02', 'Ġdisturb-01', 'Ġconsult-01', 'Ġlay-off-02', 'Ġbid-01', 'Ġaccord-02', 'Ġbusy-01', 'Ġprovoke-01', 'Ġisolate-01', 'Ġdirty-02', 'Ġblind-02', 'Ġstage-01', 'Ġboost-01', 'Ġoutrage-01', 'Ġtrack-01', 'Ġretard-01', 'Ġexclude-01', 'Ġpatent-01', 'Ġblog-01', 'Ġtorture-01', 'Ġplot-01', 'Ġcut-01', 'Ġhunger-01', 'Ġoverwhelm-01', 'Ġexploit-01', 'Ġland-01', 'Ġreserve-01', 'Ġbetter-01', 'Ġup-02', 'Ġremark-01', 'Ġpiss-03', 'Ġexcuse-01', 'Ġparalyze-01', 'Ġsummarize-01', 'Ġload-01', 'Ġdevote-01', 'Ġbury-01', 'Ġsurround-01', 'Ġdance-01', 'Ġdistort-01', 'Ġretain-01', 'Ġoverthrow-01', 'Ġrival-01', 'Ġready-01', 'Ġevolve-01', 'Ġimpoverish-01', 'Ġalarm-01', 
'Ġunify-01', 'Ġrepay-01', 'Ġassume-01', 'Ġclose-06', 'Ġadmire-01', 'Ġvow-01', 'Ġaverage-04', 'Ġsight-01', 'Ġinflate-01', 'Ġreference-04', 'Ġlook-up-05', 'Ġcivilize-01', 'Ġsuitable-04', 'Ġdetect-01', 'Ġpiss-off-02', 'Ġassassinate-01', 'Ġopen-05', 'Ġshave-01', 'Ġemail-01', 'Ġfuel-01', 'Ġincentivize-01', 'Ġmark-01', 'Ġsustain-01', 'Ġspeculate-01', 'Ġsurveil-01', 'Ġswim-01', 'Ġconquer-01', 'Ġgenocide-01', 'Ġhoax-01', 'Ġnotice-03', 'Ġbe-done-08', 'Ġopt-01', 'Ġbait-01', 'Ġcompile-01', 'Ġinnovate-01', 'Ġallocate-01', 'Ġshelter-01', 'Ġcontrary-01', 'Ġburden-01', 'Ġfreeze-01', 'Ġinspire-01', 'Ġgraduate-01', 'Ġwipe-out-02', 'Ġfall-05', 'Ġcover-up-04', 'Ġrepute-01', 'Ġenhance-01', 'Ġclassify-01', 'Ġgreen-03', 'Ġscore-01', 'Ġmodify-01', 'Ġreflect-01', 'Ġforce-02', 'Ġequate-01', 'Ġmerchandise-01', 'Ġregret-01', 'Ġovercome-01', 'Ġprocure-01', 'Ġscam-01', 'Ġquit-01', 'Ġdrill-01', 'Ġdisable-01', 'Ġgrasp-01', 'Ġorbit-01', 'Ġlaughable-03', 'Ġconsent-01', 'Ġendorse-01', 'Ġcatch-02', 'Ġleave-02', 'Ġweigh-01', 'Ġroll-01', 'Ġrestore-01', 'Ġshape-01', 'Ġcomprehend-01', 'Ġtrip-03', 'Ġget-away-08', 'Ġsingle-03', 'Ġphone-01', 'Ġintimidate-01', 'Ġinstall-01', 'Ġsuck-03', 'Ġback-02', 'Ġdeem-01', 'Ġmake-up-10', 'Ġplant-01', 'Ġhand-out-03', 'Ġgo-off-16', 'Ġspeed-01', 'Ġrefute-01', 'Ġimplicate-01', 'Ġdock-01', 'Ġcrack-down-06', 'Ġforecast-01', 'Ġrush-01', 'Ġgenerous-01', 'Ġunite-01', 'Ġgrab-01', 'Ġcompetent-01', 'Ġground-02', 'Ġevaluate-01', 'Ġadvance-01', 'Ġmainstream-02', 'Ġdiagnose-01', 'Ġpass-05', 'Ġuphold-01', 'Ġhalt-01', 'Ġhinder-01', 'Ġbefriend-01', 'Ġconvene-01', 'Ġawe-01', 'Ġapplaud-01', 'Ġmodernize-01', 'Ġintegrate-01', 'Ġexecute-02', 'Ġwound-01', 'Ġprostitute-01', 'Ġexercise-01', 'Ġbind-01', 'Ġphotograph-01', 'Ġfascinate-01', 'Ġreward-01', 'Ġclean-up-02', 'Ġrepeal-01', 'Ġtwist-01', 'Ġmodel-01', 'Ġmandate-01', 'Ġconspire-01', 'Ġtear-01', 'Ġbrutal-02', 'Ġcharge-08', 'Ġdry-08', 'Ġwow-01', 'Ġbank-01', 'Ġfuck-up-02', 'Ġstand-up-07', 'Ġportray-01', 'Ġnationalize-01', 'Ġliberate-01', 
'Ġexempt-01', 'Ġdefy-01', 'Ġshout-01', 'Ġdevastate-01', 'Ġhijack-01', 'Ġacknowledge-01', 'Ġcompromise-02', 'Ġconsist-01', 'Ġcoach-01', 'Ġintense-02', 'Ġdrag-01', 'Ġminor-01', 'Ġfulfill-01', 'Ġclear-01', 'Ġdeceive-01', 'Ġshake-01', 'Ġcold-01', 'Ġalign-01', 'Ġsupervise-01', 'Ġinternal-02', 'Ġgift-01', 'Ġstruggle-01', 'Ġcast-01', 'Ġfeature-01', 'Ġharsh-02', 'Ġemerge-01', 'Ġfollow-04', 'Ġcut-off-04', 'Ġmistake-01', 'Ġlocate-01', 'Ġslow-01', 'Ġaccelerate-01', 'Ġcover-02', 'Ġsoft-02', 'Ġidentical-01', 'Ġsail-01', 'Ġjump-03', 'Ġfacilitate-01', 'Ġexcessive-02', 'Ġalter-01', 'Ġescalate-01', 'Ġmad-04', 'Ġkid-01', 'Ġfloat-01', 'Ġmess-up-02', 'Ġkidnap-01', 'Ġbore-02', 'Ġclean-01', 'Ġforgive-01', 'Ġgo-through-20', 'Ġcare-04', 'Ġmeet-up-04', 'Ġmoisturize-01', 'Ġhighlight-01', 'Ġdislike-01', 'Ġboom-02', 'Ġblow-up-06', 'Ġappeal-02', 'Ġadhere-02', 'Ġcontradict-01', 'Ġleave-12', 'Ġdialogue-01', 'Ġpush-04', 'Ġcontaminate-01', 'Ġfinalize-01', 'Ġtape-02', 'Ġpatrol-01', 'Ġincite-01', 'Ġrenounce-01', 'Ġhallucinate-01', 'Ġundertake-01', 'Ġaverage-03', 'Ġcompel-01', 'Ġstruggle-02', 'Ġgo-12', 'Ġtrap-01', 'Ġquiet-04', 'Ġconvey-01', 'Ġopen-02', 'Ġclothe-01', 'Ġexclusive-02', 'Ġgather-03', 'Ġextensive-03', 'Ġapproach-01', 'Ġmanipulate-02', 'Ġinfringe-01', 'Ġruin-01', 'Ġstrive-01', 'Ġproductive-03', 'Ġexplore-01', 'Ġinhabit-01', 'Ġpress-01', 'Ġforbid-01', 'Ġhit-02', 'Ġabolish-01', 'Ġimpress-01', 'Ġprospect-02', 'Ġgoogle-01', 'Ġsink-01', 'Ġresign-01', 'Ġpull-out-02', 'Ġstation-01', 'Ġcenter-02', 'Ġindustrialize-01', 'Ġcounsel-01', 'Ġpropel-01', 'Ġsmell-01', 'Ġmoderate-03', 'Ġpresume-01', 'Ġrun-09', 'Ġkeep-up-10', 'Ġdeal-03', 'Ġapprehend-01', 'Ġsick-02', 'Ġsmell-02', 'Ġhave-11', 'Ġfrustrate-01', 'Ġcatch-01', 'Ġimpression-03', 'Ġspecify-01', 'Ġemploy-02', 'Ġthankful-02', 'Ġman-01', 'Ġprioritize-01', 'Ġattribute-01', 'Ġproject-01', 'Ġparrot-01', 'Ġbitch-01', 'Ġstand-04', 'Ġvoice-01', 'Ġpreserve-01', 'Ġpublicize-01', 'Ġexhibit-01', 'Ġundergo-28', 'Ġhelp-02', 'Ġbankrupt-01', 'Ġflood-01', 
'Ġprecede-01', 'Ġreinforce-01', 'Ġtask-01', 'Ġtype-03', 'Ġtransform-01', 'Ġdespair-01', 'Ġchase-01', 'Ġspread-01', 'Ġappall-01', 'Ġrestrain-01', 'Ġterrify-01', 'Ġfool-01', 'Ġaspire-01', 'Ġwarm-07', 'Ġbring-up-08', 'Ġbleed-01', 'Ġdepress-01', 'Ġcare-02', 'Ġalert-01', 'Ġwonder-02', 'Ġdrop-out-04', 'Ġspoil-01', 'Ġstink-01', 'Ġdrug-01', 'Ġoverturn-01', 'Ġheat-01', 'Ġmerge-01', 'Ġpeak-01', 'Ġset-01', 'Ġsolid-02', 'Ġinteract-01', 'Ġthrow-out-06', 'Ġholiday-01', 'Ġrefine-01', 'Ġallow-02', 'Ġsign-up-03', 'Ġbribe-01', 'Ġappease-01', 'Ġstress-02', 'Ġfine-01', 'Ġminor-02', 'Ġmine-01', 'Ġlove-02', 'Ġnetwork-01', 'Ġdeposit-01', 'Ġstore-01', 'Ġextract-01', 'Ġinterrogate-01', 'Ġturn-out-11', 'Ġimpregnate-01', 'Ġfake-02', 'Ġwhore-01', 'Ġconceal-01', 'Ġfire-03', 'Ġlean-01', 'Ġharmful-02', 'Ġout-05', 'Ġfall-07', 'Ġdodge-01', 'Ġorient-01', 'Ġbrand-01', 'Ġsocial-03', 'Ġcut-03', 'Ġcap-01', 'Ġoverpay-01', 'Ġbridge-01', 'Ġcollaborate-01', 'Ġaddress-03', 'Ġdivert-01', 'Ġpull-09', 'Ġrevise-01', 'Ġmolest-01', 'Ġextradite-01', 'Ġdismiss-02', 'Ġreprocess-01', 'Ġaccumulate-01', 'Ġoccasion-02', 'Ġobstruct-01', 'Ġbreak-down-12', 'Ġrumor-01', 'Ġfirm-03', 'Ġsettle-03', 'Ġorder-03', 'Ġstipulate-01', 'Ġaudit-01', 'Ġenact-01', 'Ġcelebrate-02', 'Ġbargain-01', 'Ġsucceed-03', 'Ġinject-01', 'Ġexcite-01', 'Ġgreet-01', 'Ġblack-07', 'Ġterminate-01', 'Ġdescend-01', 'Ġemerge-02', 'Ġwreck-01', 'Ġabsorb-01', 'Ġblow-01', 'Ġfine-03', 'Ġcirculate-01', 'Ġtight-05', 'Ġoffense-02', 'Ġactivate-01', 'Ġsecure-01', 'Ġpass-by-17', 'Ġbash-01', 'Ġprop-up-01', 'Ġcount-04', 'Ġslap-01', 'Ġbring-down-03', 'Ġamuse-01', 'Ġfilm-01', 'Ġintroduce-01', 'Ġdesignate-01', 'Ġhang-01', 'Ġwave-04', 'Ġprivilege-01', 'Ġtake-02', 'Ġcycle-02', 'Ġcancel-01', 'Ġbuy-05', 'Ġsweep-01', 'Ġhelp-out-03', 'Ġleft-20', 'Ġsuit-01', 'Ġenslave-01', 'Ġrest-01', 'Ġambush-01', 'Ġmean-04', 'Ġdistract-01', 'Ġmatch-03', 'Ġwarrant-01', 'Ġdisguise-01', 'Ġmake-up-07', 'Ġparty-01', 'Ġclose-11', 'Ġfall-10', 'Ġpump-01', 'Ġresort-01', 'Ġget-back-10', 'Ġregain-01', 
'Ġlose-01', 'Ġerr-01', 'Ġrun-out-05', 'Ġthat-is-it-00', 'Ġaggravate-01', 'Ġloot-01', 'Ġhappen-02', 'Ġscrew-02', 'Ġmake-it-14', 'Ġpick-up-04', 'Ġrefer-02', 'Ġbreak-13', 'Ġupdate-01', 'Ġshine-01', 'Ġcongratulate-01', 'Ġpilot-01', 'Ġdisgrace-01', 'Ġfabricate-01', 'Ġsicken-01', 'Ġcriticism-04', 'Ġpreach-01', 'Ġdeport-01', 'Ġdeal-02', 'Ġinflict-01', 'Ġgain-01', 'Ġresume-01', 'Ġoutlaw-01', 'Ġshoot-down-05', 'Ġpartition-01', 'Ġaddress-01', 'Ġenvy-01', 'Ġbreak-02', 'Ġspeak-out-03', 'Ġbroaden-01', 'Ġstress-01', 'Ġinfiltrate-01', 'Ġflat-06', 'Ġimpeach-01', 'Ġtransgress-01', 'Ġpardon-01', 'Ġuncover-01', 'Ġcomprise-01', 'Ġreconstruct-01', 'Ġlibel-01', 'Ġhand-01', 'Ġhint-01', 'Ġencourage-02', 'Ġprevail-02', 'Ġbrave-02', 'Ġforesee-01', 'Ġconcede-01', 'Ġdeteriorate-01', 'Ġtopple-01', 'Ġmobile-02', 'Ġpanic-01', 'Ġmisunderstand-01', 'Ġtire-01', 'Ġenthusiastic-03', 'Ġexercise-02', 'Ġpersist-01', 'Ġinferior-01', 'Ġbrilliant-01', 'Ġbuild-02', 'Ġscream-01', 'Ġanticipate-01', 'Ġout-03', 'Ġration-01', 'Ġcount-02', 'Ġconsistent-01', 'Ġawait-01', 'Ġschool-01', 'Ġrent-01', 'Ġarise-02', 'Ġappeal-03', 'Ġhelpful-04', 'Ġsee-03', 'Ġlock-01', 'Ġstereotype-01', 'Ġjoin-in-05', 'Ġscrew-up-01', 'Ġwithhold-01', 'Ġmoderate-01', 'Ġaffiliate-01', 'Ġwaive-01', 'Ġsuck-01', 'Ġgolf-01', 'Ġturn-out-17', 'Ġput-up-11', 'Ġkeep-up-05', 'Ġstraight-05', 'Ġdress-01', 'Ġdig-01', 'Ġplead-02', 'Ġlecture-01', 'Ġgo-09', 'Ġpervert-01', 'Ġcry-01', 'Ġmitigate-01', 'Ġsubstitute-01', 'Ġsend-02', 'Ġdown-01', 'Ġwesternize-01', 'Ġcolor-01', 'Ġrefer-03', 'Ġpersecute-01', 'Ġscheme-01', 'Ġreactionary-02', 'Ġsubscribe-01', 'Ġshield-01', 'Ġexile-01', 'Ġdetonate-01', 'Ġstall-01', 'Ġbroker-01', 'Ġcalculate-01', 'Ġnarrow-02', 'Ġstock-01', 'Ġturn-down-05', 'Ġparole-01', 'Ġjoin-04', 'Ġinstitute-01', 'Ġdisprove-01', 'Ġpass-20', 'Ġspew-01', 'Ġbid-03', 'Ġwage-01', 'Ġsample-01', 'Ġretail-01', 'Ġratify-01', 'Ġspank-01', 'Ġdispatch-01', 'Ġharvest-01', 'Ġrot-01', 'Ġdelude-01', 'Ġclimb-01', 'Ġfrighten-01', 'Ġyell-01', 'Ġcoerce-01', 'Ġscary-03', 
'Ġstretch-01', 'Ġdestabilize-01', 'Ġblood-02', 'Ġconfine-01', 'Ġoutrageous-02', 'Ġbeg-01', 'Ġwield-01', 'Ġscrap-01', 'Ġprivatize-01', 'Ġcure-01', 'Ġmature-02', 'Ġcoexist-01', 'Ġassert-02', 'Ġget-along-18', 'Ġreunify-01', 'Ġlook-forward-03', 'Ġnumber-01', 'Ġtrash-01', 'Ġrun-04', 'Ġgive-up-08', 'Ġbright-02', 'Ġout-01', 'Ġheal-01', 'Ġmassacre-01', 'Ġtackle-01', 'Ġstake-01', 'Ġopen-09', 'Ġknow-04', 'Ġcorrespond-02', 'Ġdisregard-01', 'Ġalienate-01', 'Ġinsure-01', 'Ġdisapprove-01', 'Ġdrain-01', 'Ġdeflect-01', 'Ġexit-01', 'Ġvacation-01', 'Ġcook-01', 'Ġadapt-01', 'Ġdissolve-01', 'Ġlift-01', 'Ġclose-down-04', 'Ġcome-down-23', 'Ġbully-01', 'Ġdenounce-01', 'Ġstab-01', 'Ġexpel-01', 'Ġabstain-01', 'Ġcut-out-06', 'Ġswallow-01', 'Ġcome-in-07', 'Ġstep-in-02', 'Ġseek-out-02', 'Ġpace-01', 'Ġwed-01', 'Ġgo-on-25', 'Ġsave-03', 'Ġcome-up-13', 'Ġsort-out-02', 'Ġtattoo-01', 'Ġleave-out-03', 'Ġkiss-01', 'Ġchance-01', 'Ġprolong-01', 'Ġtroll-01', 'Ġconcentrate-01', 'Ġchannel-01', 'Ġrecreation-02', 'Ġcenter-01', 'Ġweaponize-01', 'Ġexplicit-03', 'Ġdraft-02', 'Ġpose-02', 'Ġcrush-01', 'Ġdiscredit-01', 'Ġfurther-01', 'Ġdedicate-01', 'Ġsit-down-02', 'Ġleave-10', 'Ġforge-02', 'Ġcensor-01', 'Ġparade-02', 'Ġpaint-02', 'Ġcatch-03', 'Ġremortgage-01', 'Ġslow-down-03', 'Ġadmit-02', 'Ġbreak-19', 'Ġcounterfeit-01', 'Ġrun-10', 'Ġupgrade-01', 'Ġdeduct-01', 'Ġconfess-01', 'Ġdecline-02', 'Ġbar-01', 'Ġbrief-01', 'Ġconduct-02', 'Ġlynch-01', 'Ġacquit-01', 'Ġhyperlink-01', 'Ġlight-04', 'Ġconcrete-02', 'Ġreach-02', 'Ġmarch-01', 'Ġpurport-01', 'Ġcall-on-05', 'Ġpaddle-01', 'Ġfilter-02', 'Ġstrip-01', 'Ġcompose-01', 'Ġerupt-01', 'Ġwipe-01', 'Ġtrace-02', 'Ġdespise-01', 'Ġminimize-01', 'Ġneglect-01', 'Ġloyal-01', 'Ġslip-01', 'Ġrevive-01', 'Ġwork-07', 'Ġbeat-up-05', 'Ġdetermined-02', 'Ġpass-07', 'Ġprescribe-02', 'Ġfuss-01', 'Ġdemolish-01', 'Ġavail-01', 'Ġput-in-05', 'Ġlease-01', 'Ġembrace-01', 'Ġmerit-01', 'Ġintensify-01', 'Ġhearing-02', 'Ġweaken-01', 'Ġcolonize-01', 'Ġoffset-01', 'Ġgather-01', 'Ġtake-off-07', 
'Ġbright-03', 'Ġextend-02', 'Ġget-30', 'Ġpreexist-01', 'Ġsnow-01', 'Ġstrike-02', 'Ġgross-06', 'Ġdiminish-01', 'Ġprejudice-01', 'Ġrage-02', 'Ġnotify-01', 'Ġcontest-02', 'Ġhype-01', 'Ġrevisit-01', 'Ġdark-02', 'Ġstand-08', 'Ġcertify-01', 'Ġoversee-01', 'Ġname-02', 'Ġlock-up-03', 'Ġknow-03', 'Ġminimal-02', 'Ġtell-02', 'Ġrotate-01', 'Ġoperate-02', 'Ġfat-03', 'Ġindulge-01', 'Ġfeel-06', 'Ġset-08', 'Ġsurpass-01', 'Ġpull-06', 'Ġget-06', 'Ġcamp-02', 'Ġgut-01', 'Ġchair-01', 'Ġqualify-01', 'Ġspare-01', 'Ġblunt-02', 'Ġproceed-01', 'Ġdump-01', 'Ġreckon-01', 'Ġpierce-01', 'Ġmelt-01', 'Ġfeel-05', 'Ġstand-03', 'Ġelaborate-01', 'Ġreach-03', 'Ġspark-01', 'Ġcoincide-01', 'Ġslander-01', 'Ġjoin-up-02', 'Ġshame-01', 'Ġboard-01', 'Ġrule-out-02', 'Ġblockade-01', 'Ġincinerate-01', 'Ġderive-01', 'Ġget-by-17', 'Ġcharacterize-01', 'Ġstockpile-01', 'Ġpersuade-01', 'Ġdecapitate-01', 'Ġrun-08', 'Ġpack-01', 'Ġbust-01', 'Ġpolice-01', 'Ġtrick-01', 'Ġblast-05', 'Ġtreat-04', 'Ġrun-off-24', 'Ġapprentice-01', 'Ġdispose-01', 'Ġinhibit-01', 'Ġwire-01', 'Ġtop-01', 'Ġhand-over-02', 'Ġknow-06', 'Ġabet-01', 'Ġcatch-up-04', 'Ġsleep-02', 'Ġslam-02', 'Ġbreed-01', 'Ġcontend-02', 'Ġperjure-01', 'Ġmanipulate-01', 'Ġprobe-01', 'Ġtrend-01', 'Ġtighten-01', 'Ġboycott-01', 'Ġtable-01', 'Ġindoctrinate-01', 'Ġsafeguard-01', 'Ġevacuate-01', 'Ġinterdict-01', 'Ġpetition-01', 'Ġformulate-01', 'Ġpartake-01', 'Ġpass-04', 'Ġoverride-01', 'Ġemit-01', 'Ġcharacteristic-02', 'Ġtimely-03', 'Ġstun-01', 'Ġcrumble-01', 'Ġmaximize-01', 'Ġpass-away-16', 'Ġrun-07', 'Ġsmile-01', 'Ġinquire-01', 'Ġlag-01', 'Ġlive-up-04', 'Ġdistance-01', 'Ġcold-02', 'Ġdeep-03', 'Ġrelax-01', 'Ġill-02', 'Ġsignify-01', 'Ġhold-back-07', 'Ġtransplant-01', 'Ġsmoke-01', 'Ġcurb-01', 'Ġdelegate-01', 'Ġseal-01', 'Ġlure-01', 'Ġintimate-02', 'Ġfresh-04', 'Ġseat-01', 'Ġmove-03', 'Ġkeep-03', 'Ġoutweigh-01', 'Ġrevere-01', 'Ġclone-01', 'Ġenlist-01', 'Ġclick-01', 'Ġempty-02', 'Ġfire-04', 'Ġcontend-01', 'Ġabide-01', 'Ġcraft-01', 'Ġtip-05', 'Ġwrap-01', 'Ġbite-01', 'Ġtoss-01', 
'Ġpolite-01', 'Ġdesirable-02', 'Ġdefuse-01', 'Ġthrill-01', 'Ġproduce-02', 'Ġoblige-02', 'Ġdate-02', 'Ġalternate-01', 'Ġget-on-21', 'Ġramble-02', 'Ġhurt-02', 'Ġdistant-02', 'Ġhot-05', 'Ġpale-03', 'Ġproclaim-01', 'Ġclass-01', 'Ġcome-across-21', 'Ġsneak-01', 'Ġerode-01', 'Ġchampion-01', 'Ġneutral-02', 'Ġalien-01', 'Ġgrieve-01', 'Ġswear-01', 'Ġgo-21', 'Ġunderestimate-01', 'Ġaddictive-02', 'Ġpropagate-01', 'Ġlast-04', 'Ġcommence-01', 'Ġair-01', 'Ġmark-02', 'Ġaccommodate-01', 'Ġdemonize-01', 'Ġmock-01', 'Ġnuke-01', 'Ġswell-01', 'Ġbrag-01', 'Ġassert-03', 'Ġdisrespect-01', 'Ġwork-12', 'Ġremarkable-02', 'Ġpool-01', 'Ġpaint-03', 'Ġpour-01', 'Ġdecommission-01', 'Ġamplify-01', 'Ġmad-02', 'Ġcorrelate-01', 'Ġautomate-01', 'Ġmoney-01', 'Ġcontent-02', 'Ġstorm-01', 'Ġthrive-01', 'Ġliable-01', 'Ġhopeful-02', 'Ġexpire-01', 'Ġwork-06', 'Ġdisperse-01', 'Ġlay-04', 'Ġfall-apart-09', 'Ġterror-02', 'Ġphilander-01', 'Ġscrutinize-01', 'Ġfathom-01', 'Ġmake-up-08', 'Ġhumiliate-01', 'Ġcharge-06', 'Ġnatural-02', 'Ġfollow-up-03', 'Ġbend-01', 'Ġgrade-01', 'Ġenter-02', 'Ġpend-01', 'Ġprey-01', 'Ġmediate-01', 'Ġconclude-02', 'Ġmask-01', 'Ġreactivate-01', 'Ġevolve-02', 'Ġrestart-01', 'Ġencrypt-01', 'Ġget-through-12', 'Ġgrow-02', 'Ġbestow-01', 'Ġput-out-10', 'Ġdisplace-01', 'Ġcount-03', 'Ġstabilize-01', 'Ġembezzle-01', 'Ġpass-on-09', 'Ġform-02', 'Ġroot-02', 'Ġtrample-01', 'Ġmake-out-23', 'Ġfit-in-02', 'Ġhospitalize-01', 'Ġcut-down-11', 'Ġconstrain-01', 'Ġclash-01', 'Ġconsolidate-01', 'Ġmeddle-01', 'Ġreproduce-01', 'Ġclever-01', 'Ġdiversify-01', 'Ġpostpone-01', 'Ġstructure-01', 'Ġnarrow-01', 'Ġincur-01', 'Ġdraw-up-03', 'Ġdrive-04', 'Ġpin-01', 'Ġdelight-01', 'Ġput-on-08', 'Ġcoverage-06', 'Ġbring-about-05', 'Ġstir-up-04', 'Ġlet-down-04', 'Ġsigh-02', 'Ġspace-01', 'Ġcheat-02', 'Ġlessen-01', 'Ġrender-02', 'Ġrender-01', 'Ġmenace-01', 'Ġprevail-01', 'Ġreclaim-01', 'Ġpuzzle-01', 'Ġhesitate-01', 'Ġgo-23', 'Ġcharm-01', 'Ġturn-over-12', 'Ġwander-01', 'Ġrenovate-01', 'Ġpackage-01', 'Ġheadquarter-01', 'Ġline-01', 
'Ġstraight-06', 'Ġpark-01', 'Ġturn-on-13', 'Ġarbitrary-02', 'Ġconceive-01', 'Ġexert-01', 'Ġspell-01', 'Ġdye-01', 'Ġtune-01', 'Ġrip-01', 'Ġgarner-01', 'Ġsick-04', 'Ġshove-01', 'Ġwave-01', 'Ġrust-01', 'Ġkneel-01', 'Ġcelebrate-01', 'Ġmisrepresent-01', 'Ġincarcerate-01', 'Ġawake-03', 'Ġup-01', 'Ġslip-02', 'Ġconcentrate-02', 'Ġround-05', 'Ġloose-04', 'Ġcripple-01', 'Ġpart-01', 'Ġhoard-01', 'Ġchain-01', 'Ġtricky-02', 'Ġhook-up-02', 'Ġtype-01', 'Ġglance-01', 'Ġprize-01', 'Ġtransmit-01', 'Ġhold-03', 'Ġsurge-01', 'Ġheadline-01', 'Ġvote-02', 'Ġdraw-01', 'Ġtext-01', 'Ġshower-01', 'Ġcalm-down-02', 'Ġfeed-up-03', 'Ġslide-01', 'Ġgo-down-27', 'Ġforward-01', 'Ġproject-02', 'Ġempower-01', 'Ġmind-04', 'Ġpass-02', 'Ġneutralize-01', 'Ġrepress-01', 'Ġserve-04', 'Ġeye-01', 'Ġdiscriminate-01', 'Ġoverlook-01', 'Ġtop-02', 'Ġmobilize-01', 'Ġstart-out-05', 'Ġpunishable-02', 'Ġunderlie-01', 'Ġpenetrate-01', 'Ġgrind-01', 'Ġjump-01', 'Ġpertain-01', 'Ġincline-01', 'Ġhumble-01', 'Ġmoderate-02', 'Ġmeaningful-05', 'Ġmislead-01', 'Ġfinish-07', 'Ġdisgruntle-01', 'Ġturn-up-15', 'Ġknock-01', 'Ġtake-03', 'Ġlunch-01', 'Ġadd-03', 'Ġcommend-01', 'Ġpatient-01', 'Ġattain-01', 'Ġhike-02', 'Ġlurk-01', 'Ġbe-02', 'Ġblackmail-01', 'Ġdubious-02', 'Ġentrench-01', 'Ġget-off-23', 'Ġflame-01', 'Ġstand-02', 'Ġsurvive-02', 'Ġafford-02', 'Ġlive-02', 'Ġmoan-01', 'Ġportion-01', 'Ġslash-02', 'Ġbreak-through-22', 'Ġplague-01', 'Ġblunt-01', 'Ġabominable-02', 'Ġhonorable-03', 'Ġrelated-04', 'Ġdeprive-01', 'Ġdecay-01', 'Ġdistress-01', 'Ġredistribute-01', 'Ġforeclose-01', 'Ġwarm-06', 'Ġjealous-02', 'Ġcohere-01', 'Ġpaste-01', 'Ġprompt-01', 'Ġcurtail-01', 'Ġtrack-down-02', 'Ġpity-01', 'Ġticket-02', 'Ġtransition-01', 'Ġburst-02', 'Ġbroke-23', 'Ġrewrite-01', 'Ġdeliberate-01', 'Ġdisclose-01', 'Ġsituate-01', 'Ġreiterate-01', 'Ġprofess-01', 'Ġbabble-01', 'Ġlift-02', 'Ġdeclassify-01', 'Ġremand-01', 'Ġreconcile-01', 'Ġassemble-01', 'Ġextort-01', 'Ġcorroborate-01', 'Ġsnip-01', 'Ġnormalize-01', 'Ġclose-03', 'Ġremit-01', 'Ġsweep-06', 
'Ġbreach-01', 'Ġbehead-01', 'Ġsimulate-01', 'Ġastonish-01', 'Ġdeviate-01', 'Ġsmear-02', 'Ġgive-away-02', 'Ġdifferentiate-01', 'Ġintersect-01', 'Ġrectify-01', 'Ġlose-out-06', 'Ġtelephone-01', 'Ġrevolutionary-04', 'Ġblow-14', 'Ġexaggerate-01', 'Ġsoar-01', 'Ġcontent-01', 'Ġpreside-01', 'Ġcheck-07', 'Ġrefrain-01', 'Ġcrack-02', 'Ġdisintegrate-01', 'Ġexterminate-01', 'Ġridicule-01', 'Ġobey-01', 'Ġbundle-01', 'Ġcompound-01', 'Ġwine-01', 'Ġdine-01', 'Ġresent-01', 'Ġjeopardize-01', 'Ġusher-in-01', 'Ġcrowd-01', 'Ġelevate-01', 'Ġtear-down-05', 'Ġresolve-02', 'Ġearnest-01', 'Ġirritate-01', 'Ġgreen-02', 'Ġheed-01', 'Ġplay-10', 'Ġspread-out-04', 'Ġcruise-01', 'Ġcater-01', 'Ġstay-on-02', 'Ġstick-around-03', 'Ġcall-13', 'Ġbicker-01', 'Ġcurse-02', 'Ġopen-07', 'Ġrun-up-19', 'Ġtrump-01', 'Ġhappy-02', 'Ġredeem-01', 'Ġstrike-04', 'Ġbring-on-06', 'Ġenlighten-01', 'Ġgray-02', 'Ġnote-02', 'Ġshred-01', 'Ġgas-03', 'Ġlevy-01', 'Ġturn-18', 'Ġlevel-04', 'Ġbow-01', 'Ġturn-14', 'Ġrehabilitate-01', 'Ġcouple-01', 'Ġdent-01', 'Ġcautious-02', 'Ġbust-02', 'Ġshut-01', 'Ġflip-01', 'Ġvalidate-01', 'Ġkill-03', 'Ġhot-04', 'Ġchat-01', 'Ġcurious-02', 'Ġlump-01', 'Ġexacerbate-01', 'Ġsneaky-03', 'Ġconviction-02', 'Ġproceeding-02', 'Ġreorganize-01', 'Ġfit-05', 'Ġsee-05', 'Ġacquaint-01', 'Ġvile-02', 'Ġzap-01', 'Ġuniform-01', 'Ġreplicate-01', 'Ġintent-02', 'Ġgrip-01', 'Ġswear-02', 'Ġdecry-01', 'Ġsegregate-01', 'Ġspur-01', 'Ġstorm-02', 'Ġcap-02', 'Ġslant-01', 'Ġspan-01', 'Ġcut-back-05', 'Ġfledge-01', 'Ġfoster-01', 'Ġgripe-01', 'Ġquest-01', 'Ġpunch-01', 'Ġderegulate-01', 'Ġloathe-01', 'Ġimitate-01', 'Ġhang-out-06', 'Ġbaffle-01', 'Ġsuck-up-04', 'Ġtempt-01', 'Ġcondone-01', 'Ġassemble-02', 'Ġoust-01', 'Ġvent-01', 'Ġspout-01', 'Ġsound-02', 'Ġevade-01', 'Ġendure-01', 'Ġinvoke-01', 'Ġdevalue-01', 'Ġpose-01', 'Ġbear-06', 'Ġhypothesize-01', 'Ġspot-01', 'Ġdiscount-02', 'Ġrail-01', 'Ġhaul-01', 'Ġgauge-01', 'Ġcopyright-01', 'Ġgive-in-09', 'Ġimpede-01', 'Ġblast-01', 'Ġtrue-02', 'Ġbeware-01', 'Ġrestore-02', 'Ġnegative-05', 
'Ġsteady-01', 'Ġfluctuate-01', 'Ġdate-01', 'Ġbathe-01', 'Ġgo-22', 'Ġrestructure-01', 'Ġpile-01', 'Ġspin-01', 'Ġtake-down-22', 'Ġbake-01', 'Ġtriple-01', 'Ġdowngrade-02', 'Ġordain-01', 'Ġmultiply-01', 'Ġskip-01', 'Ġincorporate-02', 'Ġsettle-01', 'Ġpass-on-14', 'Ġcreepy-04', 'Ġstuff-01', 'Ġline-up-02', 'Ġimmune-02', 'Ġlust-01', 'Ġnotable-04', 'Ġbuy-into-04', 'Ġimpair-01', 'Ġfigure-04', 'Ġpiss-01', 'Ġgive-back-03', 'Ġboast-01', 'Ġlay-off-06', 'Ġdive-01', 'Ġcommute-02', 'Ġracket-02', 'Ġdip-01', 'Ġrotate-02', 'Ġdemagogue-01', 'Ġchange-02', 'Ġbarter-01', 'Ġalike-05', 'Ġbind-03', 'Ġwhip-up-03', 'Ġmanifest-01', 'Ġcheck-03', 'Ġyield-01', 'Ġslay-01', 'Ġtally-01', 'Ġget-through-13', 'Ġbreak-through-26', 'Ġrelinquish-01', 'Ġreopen-01', 'Ġdefame-01', 'Ġinterrupt-01', 'Ġcast-03', 'Ġpattern-01', 'Ġdose-01', 'Ġreenter-01', 'Ġmotivate-02', 'Ġstandardize-01', 'Ġdate-entity', 'Ġgovernment-organization', 'Ġtemporal-quantity', 'Ġamr-unknown', 'Ġmulti-sentence', 'Ġpolitical-party', 'Ġ:compared-to', 'Ġmonetary-quantity', 'Ġordinal-entity', 'Ġreligious-group', 'Ġpercentage-entity', 'Ġworld-region', 'Ġ:consist', 'Ġurl-entity', 'Ġpolitical-movement', 'Ġet-cetera', 'Ġat-least', 'Ġmass-quantity', 'Ġhave-org-role-91', 'Ġhave-rel-role-91', 'Ġinclude-91', 'Ġhave-concession-91', 'Ġhave-condition-91', 'Ġbe-located-at-91', 'Ġrate-entity-91', 'Ġinstead-of-91', 'Ġhyperlink-91', 'Ġrequest-confirmation-91', 'Ġhave-purpose-91', 'Ġbe-temporally-at-91', 'Ġregardless-91', 'Ġhave-polarity-91', 'Ġbyline-91', 'Ġhave-manner-91', 'Ġhave-part-91', 'Ġhave-quant-91', 'Ġpublication-91', 'Ġbe-from-91', 'Ġhave-mod-91', 'Ġhave-frequency-91', 'Ġscore-on-scale-91', 'Ġhave-li-91', 'Ġbe-compared-to-91', 'Ġbe-destined-for-91', 'Ġcourse-91', 'Ġhave-subevent-91', 'Ġstreet-address-91', 'Ġhave-extent-91', 'Ġstatistical-test-91', 'Ġhave-instrument-91', 'Ġhave-name-91', 'Ġbe-polite-91', '-00', '-01', '-02', '-03', '-04', '-05', '-06', '-07', '-08', '-09', '-10', '-11', '-12', '-13', '-14', '-15', '-16', '-17', '-18', '-19', 
'-20', '-21', '-22', '-23', '-24', '-25', '-26', '-27', '-28', '-29', '-20', '-31', '-32', '-33', '-34', '-35', '-36', '-37', '-38', '-39', '-40', '-41', '-42', '-43', '-44', '-45', '-46', '-47', '-48', '-49', '-50', '-51', '-52', '-53', '-54', '-55', '-56', '-57', '-58', '-59', '-60', '-61', '-62', '-63', '-64', '-65', '-66', '-67', '-68', '-69', '-70', '-71', '-72', '-73', '-74', '-75', '-76', '-77', '-78', '-79', '-80', '-81', '-82', '-83', '-84', '-85', '-86', '-87', '-88', '-89', '-90', '-91', '-92', '-93', '-94', '-95', '-96', '-97', '-98', '-of', 'Ġ:op1', 'Ġ:op2', 'Ġ:op3', 'Ġ:op4', 'Ġ:op5', 'Ġ:ARG0', 'Ġ:ARG1', 'Ġ:ARG2', 'Ġ:ARG3', 'Ġ:ARG4', 'Ġ:ARG5', 'Ġ:ARG6', 'Ġ:ARG7', 'Ġ:ARG8', 'Ġ:ARG9', 'Ġ:ARG10', 'Ġ:ARG11', 'Ġ:ARG12', 'Ġ:ARG13', 'Ġ:ARG14', 'Ġ:ARG15', 'Ġ:ARG16', 'Ġ:ARG17', 'Ġ:ARG18', 'Ġ:ARG19', 'Ġ:ARG20', 'Ġ:accompanier', 'Ġ:age', 'Ġ:beneficiary', 'Ġ:calendar', 'Ġ:cause', 'Ġ:century', 'Ġ:concession', 'Ġ:condition', 'Ġ:conj-as-if', 'Ġ:consist-of', 'Ġ:cost', 'Ġ:day', 'Ġ:dayperiod', 'Ġ:decade', 'Ġ:degree', 'Ġ:destination', 'Ġ:direction', 'Ġ:domain', 'Ġ:duration', 'Ġ:employed-by', 'Ġ:era', 'Ġ:example', 'Ġ:extent', 'Ġ:frequency', 'Ġ:instrument', 'Ġ:li', 'Ġ:location', 'Ġ:manner', 'Ġ:meaning', 'Ġ:medium', 'Ġ:mod', 'Ġ:mode', 'Ġ:month', 'Ġ:name', 'Ġ:ord', 'Ġ:part', 'Ġ:path', 'Ġ:polarity', 'Ġ:polite', 'Ġ:poss', 'Ġ:purpose', 'Ġ:quant', 'Ġ:quarter', 'Ġ:range', 'Ġ:relation', 'Ġ:role', 'Ġ:scale', 'Ġ:season', 'Ġ:source', 'Ġ:subevent', 'Ġ:subset', 'Ġ:superset', 'Ġ:time', 'Ġ:timezone', 'Ġ:topic', 'Ġ:unit', 'Ġ:value', 'Ġ:weekday', 'Ġ:wiki', 'Ġ:year', 'Ġ:year2', 'Ġ:snt0', 'Ġ:snt1', 'Ġ:snt2', 'Ġ:snt3', 'Ġ:snt4', 'Ġ:snt5', 'ĠCOUNTRY', 'ĠQUANTITY', 'ĠORGANIZATION', 'ĠDATE_ATTRS', 'ĠNATIONALITY', 'ĠLOCATION', 'ĠENTITY', 'ĠMISC', 'ĠORDINAL_ENTITY', 'ĠIDEOLOGY', 'ĠRELIGION', 'ĠSTATE_OR_PROVINCE', 'ĠCAUSE_OF_DEATH', 'ĠTITLE', 'ĠDATE', 'ĠNUMBER', 'ĠHANDLE', 'ĠSCORE_ENTITY', 'ĠDURATION', 'ĠORDINAL', 'ĠMONEY', 'ĠCRIMINAL_CHARGE', '_1', '_2', '_3', '_4', '_2', '_5', '_6', '_7', '_8', 
'_9', '_10', '_11', '_12', '_13', '_14', '_15', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', '', ''] special_tokens = [itm.lstrip("Ġ") for itm in raw_special_tokens] recategorizations = [ "\u0120COUNTRY", "\u0120QUANTITY", "\u0120ORGANIZATION", "\u0120DATE_ATTRS", "\u0120NATIONALITY", "\u0120LOCATION", "\u0120ENTITY", "\u0120MISC", "\u0120ORDINAL_ENTITY", "\u0120IDEOLOGY", "\u0120RELIGION", "\u0120STATE_OR_PROVINCE", "\u0120CAUSE_OF_DEATH", "\u0120TITLE", "\u0120DATE", "\u0120NUMBER", "\u0120HANDLE", "\u0120SCORE_ENTITY", "\u0120DURATION", "\u0120ORDINAL", "\u0120MONEY", "\u0120CRIMINAL_CHARGE", ] # special_tokens = ["", ""] arg_to_scheduler = { "linear": get_linear_schedule_with_warmup, "cosine": get_cosine_schedule_with_warmup, "cosine_w_restarts": get_cosine_with_hard_restarts_schedule_with_warmup, "polynomial": get_polynomial_decay_schedule_with_warmup, "constant": get_constant_schedule_with_warmup, } arg_to_scheduler_choices = sorted(arg_to_scheduler.keys()) arg_to_scheduler_metavar = "{" + ", ".join(arg_to_scheduler_choices) + "}" ROUGE_KEYS = ["rouge1", "rouge2", "rougeL", "rougeLsum"] arg_to_tokenizer = { "AutoTokenizer": AutoTokenizer, "BartTokenizer": BartTokenizer, "T5Tokenizer": T5Tokenizer, } arg_to_plm_model = { 
"AutoModelForSeq2SeqLM": AutoModelForSeq2SeqLM, "BartForConditionalGeneration": BartForConditionalGeneration, "T5Model": T5Model, "T5ForConditionalGeneration": T5ForConditionalGeneration, } ================================================ FILE: hanlp/components/amr/amrbart/common/penman_interface.py ================================================ # coding:utf-8 # MIT License # # Copyright (c) 2022 xfbai # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
def _remove_wiki(graph):
    """Return a copy of ``graph`` whose ``:wiki`` targets are all ``"+"``.

    Every triple whose role is ``:wiki`` is rebuilt with the same source and
    role but with the placeholder target ``"+"``; all other triples are
    reused as they are. The new graph is built from the triple list only and
    then receives the metadata object of the input graph.

    Args:
        graph: A penman ``Graph``.

    Returns:
        A new ``Graph`` with neutralised ``:wiki`` triples.
    """

    def _neutralise(triple):
        # Triples unpack into (source, role, target).
        source, role, _target = triple
        return Triple(source, role, "+") if role == ":wiki" else triple

    stripped = Graph([_neutralise(triple) for triple in graph.triples])
    stripped.metadata = graph.metadata
    return stripped
def token_processing(tok):
    """Normalise a single decoded token.

    * ``None`` is passed through.
    * An all-digit token is converted to the number it denotes as a Python
      literal; if it is not a valid literal (for example ``"007"`` or
      non-ASCII digits) the token is returned unchanged as a string.
    * A token with a double quote on only one side gets the missing quote
      added on the other side.
    * Anything else is returned unchanged.

    Args:
        tok: A string token or ``None``.

    Returns:
        ``None``, an ``int`` or a ``str`` as described above.
    """
    # Local import: ``ast`` is only needed here and keeps this block
    # independent of the file-level import list.
    import ast

    if tok is None:
        return None
    if tok.isdigit():
        # ``literal_eval`` accepts exactly the same digit strings as the
        # previous ``eval`` call but can never execute code, and only the
        # parse errors are treated as "not a number".
        try:
            return ast.literal_eval(tok)
        except (ValueError, SyntaxError):
            return tok
    if tok.startswith('"') and (not tok.endswith('"')):
        return tok + '"'
    if tok.endswith('"') and (not tok.startswith('"')):
        return '"' + tok
    return tok
tokens: tokens.append(subtok.lstrip(tokenizer.INIT)) backreferences.append(-1) current_token_i += 1 # backref can't be splitted elif subw_backr > -1: tokens.append(None) backreferences.append(subword_to_token_map[subw_backr]) current_token_i += 1 # after a special token release elif isinstance(tokens[-1], str) and rex_spc.match(tokens[-1]): tokens.append(subtok.lstrip(tokenizer.INIT)) backreferences.append(-1) current_token_i += 1 # after a subtoken ':' (which should be followed by the rest of the edge) ignore tokenizer.INIT # TODO: this is an ugly patch due to the fact that BART tokenizer splits after ':' elif (tokens[-1] == ":") and rex_arg.match(subtok): tokens[-1] = tokens[-1] + subtok[1:] # leading tokenizer.INIT elif subtok.startswith(tokenizer.INIT): tokens.append(subtok.lstrip(tokenizer.INIT)) backreferences.append(-1) current_token_i += 1 # very ugly patch for some cases in which tokenizer.INIT is not in the following token to the edge elif ( isinstance(tokens[-1], str) and tokens[-1].startswith(":") and tokens[-1][-1].isdigit() and (subtok != "-of") ): tokens.append(subtok.lstrip(tokenizer.INIT)) backreferences.append(-1) current_token_i += 1 # in any other case attach to the previous else: tokens[-1] = tokens[-1] + subtok # strip INIT and fix byte-level tokens = [ tokenizer.convert_tokens_to_string(list(t)).lstrip() if isinstance(t, str) else t for t in tokens ] # tokens = [t.replace(tokenizer.INIT, '') if isinstance(t, str) else t for t in tokens] # unks are substituted with thing tokens = [t if t != "" else "thing" for t in tokens] old_tokens = tokens old_backreferences = backreferences # Barack Obama -> "Barack Obama" tokens = [] backreferences = [] token_to_token_map = {} start_search = 0 removed = 0 while True: try: lit_start = old_tokens.index("", start_search) token_addition = old_tokens[start_search:lit_start] for i, t in enumerate(token_addition, start=start_search): token_to_token_map[i] = i - removed tokens += token_addition 
def index_of(element, iterable, default=None, start=None, end=None):
    """Find the first position in ``iterable`` that matches ``element``.

    Args:
        element: Either a predicate (any callable taking one item) or a plain
            value, which is compared as ``element == item``.
        iterable: An indexable sequence.
        default: Value returned when nothing matches.
        start: First index to inspect; ``None`` means ``0``.
        end: Index at which to stop (exclusive); ``None`` means
            ``len(iterable)``.

    Returns:
        The index of the first match in ``[start, end)``, or ``default``.
    """
    matches = element if callable(element) else (lambda candidate: element == candidate)
    lower = 0 if start is None else start
    upper = len(iterable) if end is None else end
    for position in range(lower, upper):
        if matches(iterable[position]):
            return position
    return default
start_index = stop_index + 1 src_node, src_backr = nodes[old_start_index], backreferences[old_start_index] if src_node == "": continue trg_nodes_edges = nodes[old_start_index:stop_index] trg_nodes_edges_backr = backreferences[old_start_index:stop_index] trg_nodes_edges_indices = list(range(old_start_index, stop_index)) if isinstance(src_node, str): if src_node in ("", "", ""): continue elif ("/" in src_node) or (":" in src_node) or ("(" in src_node) or (")" in src_node): src_node = "thing" if src_node is not None: src_node = str(src_node) src_var = src_node[0].lower() if not src_var not in "abcdefghijklmnopqrstuvwxyz": src_var = "x" # src_var = f'{src_var}_{len(variable2index)}' src_var = f"{src_var}{len(variable2index)}" src_var_i = old_start_index variable2index[src_var] = src_var_i index2variable[src_var_i] = src_var triple = penman.Triple(src_var, ":instance", src_node) if triple not in triples_added: triples.append(triple) triples_added.add(triple) else: if src_backr in index2variable: src_var = index2variable[src_backr] # more resilient logic here (trg_edges, trg_nodes), (_, trg_nodes_backr), (_, trg_nodes_indices) = separate_edges_nodes( trg_nodes_edges, trg_nodes_edges, trg_nodes_edges_backr, trg_nodes_edges_indices ) for n, e, nb, ni in zip(trg_nodes, trg_edges, trg_nodes_backr, trg_nodes_indices): if isinstance(n, str) and n.startswith(":"): continue if isinstance(n, str) and n.startswith("<") and n.endswith(">"): continue if e == ":li": pass elif len(e) < 4 or (not e.startswith(":")): continue # same edge more than once num = cnt[src_var][e] # num = 0 if num: if e.startswith(":op") or e.startswith(":snt"): continue # elif e.startswith(':ARG'): # continue elif num > 3: continue if n is None: if nb not in index2variable: continue trg_var = index2variable[nb] trg = trg_var elif e == ":mode": trg = n elif ( (not isinstance(n, str)) or re.match(r"^[+-]?\d+\.?\d*$", n) or (n == "-") or (n == "+") ): trg = str(n) elif n.startswith('"') and n.endswith('"') and 
def connect_graph_if_not_connected(graph):
    """Make ``graph`` encodable by joining its components under an ``and`` node.

    The graph is first passed to ``encode``. If that succeeds it is returned
    untouched with ``ParsedStatus.OK``. Otherwise a new variable
    ``a<len(variables) + 1>`` with concept ``and`` is created and linked via
    ``:op1``, ``:op2``, ... to one variable of every connected component (the
    one with the smallest numeric suffix), and the rebuilt graph is returned
    with ``ParsedStatus.FIXED``.

    Args:
        graph: A penman ``Graph`` whose variable names consist of one
            character followed by an integer (required by the component
            sorting below).

    Returns:
        A ``(graph, ParsedStatus)`` tuple.

    Raises:
        Whatever ``encode`` raises if even the repaired graph cannot be
        encoded; that final call is intentionally not guarded.
    """
    try:
        encode(graph)
    except Exception:
        # Best effort: any encoding failure means "try to repair". Only
        # ``Exception`` is caught so KeyboardInterrupt/SystemExit propagate.
        pass
    else:
        return graph, ParsedStatus.OK

    # Undirected view of the variable-to-variable structure; a self loop keeps
    # variables that only have constant/attribute targets in the graph.
    nxgraph = nx.MultiGraph()
    variables = graph.variables()
    for v1, _, v2 in graph.triples:
        if v1 in variables and v2 in variables:
            nxgraph.add_edge(v1, v2)
        elif v1 in variables:
            nxgraph.add_edge(v1, v1)

    triples = graph.triples.copy()
    new_triples = []
    addition = f"a{len(variables) + 1}"
    triples.append(penman.Triple(addition, ":instance", "and"))
    for i, conn_set in enumerate(nx.connected_components(nxgraph), start=1):
        edge = f":op{i}"
        # Pick the variable with the smallest numeric suffix as the
        # representative of this component.
        conn_set = sorted(conn_set, key=lambda x: int(x[1:]))
        conn_set = [c for c in conn_set if c in variables]
        node = conn_set[0]
        new_triples.append(penman.Triple(addition, edge, node))
    # The new ``:opN`` edges go first, ahead of the original triples.
    triples = new_triples + triples
    metadata = graph.metadata
    graph = penman.Graph(triples)
    graph.metadata.update(metadata)
    # Validate the repaired graph; a failure here is reported to the caller.
    encode(graph)

    return graph, ParsedStatus.FIXED
class AMR2TextDataSet(object):
    """Sample-level tokenisation for AMR-to-text generation."""

    @staticmethod
    def tokenize(sample: dict, tokenizer, max_src_length=400, max_tgt_length=1024, unified_input=True, amr="src",
                 text="tgt"):
        """Add ``input_ids`` (and ``labels`` when a target exists) to ``sample``.

        Args:
            sample: Mapping holding the linearised AMR under key ``amr`` and,
                optionally, the target text under key ``text``.
            tokenizer: Object providing ``tokenize_amr``, ``as_target_tokenizer``,
                ``__call__`` and the ``bos/eos/mask/amr_bos/amr_eos`` token ids.
            max_src_length: Upper bound on the length of ``input_ids``.
            max_tgt_length: ``max_length`` passed to the tokenizer for the text.
            unified_input: When true, prefix the AMR with ``bos, mask, eos``.
            amr: Key of the AMR string in ``sample``.
            text: Key of the target text in ``sample``.

        Returns:
            The same ``sample`` dict, updated in place.
        """
        src = sample[amr]  # AMR tokens
        tgt = sample.get(text, None)  # Text tokens
        amr_ids = tokenizer.tokenize_amr(src.split())
        if not unified_input:
            # amr_bos + AMR + amr_eos: two positions are reserved.
            src_ids = [tokenizer.amr_bos_token_id] + amr_ids[:max_src_length - 2] + [tokenizer.amr_eos_token_id]
        else:
            # bos, mask, eos, amr_bos + AMR + amr_eos: five positions are reserved.
            src_ids = [tokenizer.bos_token_id, tokenizer.mask_token_id, tokenizer.eos_token_id] + [
                tokenizer.amr_bos_token_id] + amr_ids[:max_src_length - 5] + [tokenizer.amr_eos_token_id]
        sample["input_ids"] = src_ids

        if tgt is not None:
            with tokenizer.as_target_tokenizer():
                label_ids = tokenizer(
                    tgt, max_length=max_tgt_length, padding=False, truncation=True
                )["input_ids"]
            # Drop the first id of every target sequence. A single string
            # yields a flat list of ids, while a list of strings yields one
            # list per string; indexing each element (as was done before)
            # fails on the flat form because the elements are ints.
            if label_ids and isinstance(label_ids[0], (list, tuple)):
                sample["labels"] = [ids[1:] for ids in label_ids]
            else:
                sample["labels"] = label_ids[1:]
        return sample
================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2022-12-03 20:33 ================================================ FILE: hanlp/components/amr/amrbart/model_interface/modeling_bart.py ================================================ # coding=utf-8 # Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ PyTorch BART model.""" import copy import math import random import warnings from typing import List, Optional, Tuple, Union import torch import torch.utils.checkpoint from torch import nn from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from transformers.activations import ACT2FN from transformers.modeling_outputs import ( BaseModelOutput, BaseModelOutputWithPastAndCrossAttentions, CausalLMOutputWithCrossAttentions, Seq2SeqLMOutput, Seq2SeqModelOutput, Seq2SeqQuestionAnsweringModelOutput, Seq2SeqSequenceClassifierOutput, ) from transformers.modeling_utils import PreTrainedModel from transformers.utils import ( add_code_sample_docstrings, add_end_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings, ) from transformers.models.bart.configuration_bart import BartConfig logger = logging.get_logger(__name__) _CHECKPOINT_FOR_DOC = "facebook/bart-base" _CONFIG_FOR_DOC = "BartConfig" _TOKENIZER_FOR_DOC = "BartTokenizer" # Base model docstring _EXPECTED_OUTPUT_SHAPE = [1, 8, 768] # 
def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
    """
    Build decoder inputs by shifting ``input_ids`` one position to the right.

    Column 0 of the result is ``decoder_start_token_id``, columns 1.. hold the
    input without its last column, and every ``-100`` that ends up in the
    result is replaced by ``pad_token_id``. The input tensor is not modified.

    Raises:
        ValueError: if ``pad_token_id`` is ``None``.
    """
    shifted = input_ids.new_zeros(input_ids.shape)
    # The two column assignments are independent of each other.
    shifted[:, 0] = decoder_start_token_id
    shifted[:, 1:] = input_ids[:, :-1].clone()

    if pad_token_id is None:
        raise ValueError("self.model.config.pad_token_id has to be defined.")

    # -100 marks ignored label positions; they must become real token ids.
    ignored = shifted == -100
    return shifted.masked_fill(ignored, pad_token_id)
class BartLearnedPositionalEmbedding(nn.Embedding):
    """
    Learned absolute position embeddings with a fixed index offset of 2.

    The table is allocated with ``num_embeddings + 2`` rows and every position
    is looked up at ``position + 2``.
    """

    def __init__(self, num_embeddings: int, embedding_dim: int):
        # The offset has to exist before the parent allocates the table,
        # because the table size depends on it.
        self.offset = 2
        super().__init__(num_embeddings + self.offset, embedding_dim)

    def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0):
        """Return the embeddings for the positions described by ``input_ids_shape`` ([bsz x seqlen])."""
        _, seq_len = input_ids_shape[:2]
        # Positions continue after the cached prefix and are shifted by the offset.
        first_row = past_key_values_length + self.offset
        position_rows = torch.arange(
            first_row, first_row + seq_len, dtype=torch.long, device=self.weight.device
        )
        return super().forward(position_rows)
) self.scaling = self.head_dim**-0.5 self.is_decoder = is_decoder self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() def forward( self, hidden_states: torch.Tensor, key_value_states: Optional[torch.Tensor] = None, past_key_value: Optional[Tuple[torch.Tensor]] = None, attention_mask: Optional[torch.Tensor] = None, layer_head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: """Input shape: Batch x Time x Channel""" # if key_value_states are provided this layer is used as a cross-attention layer # for the decoder is_cross_attention = key_value_states is not None bsz, tgt_len, _ = hidden_states.size() # get query proj query_states = self.q_proj(hidden_states) * self.scaling # get key, value proj if is_cross_attention and past_key_value is not None: # reuse k,v, cross_attentions key_states = past_key_value[0] value_states = past_key_value[1] elif is_cross_attention: # cross_attentions key_states = self._shape(self.k_proj(key_value_states), -1, bsz) value_states = self._shape(self.v_proj(key_value_states), -1, bsz) elif past_key_value is not None: # reuse k, v, self_attention key_states = self._shape(self.k_proj(hidden_states), -1, bsz) value_states = self._shape(self.v_proj(hidden_states), -1, bsz) key_states = torch.cat([past_key_value[0], key_states], dim=2) value_states = torch.cat([past_key_value[1], value_states], dim=2) else: # self_attention key_states = self._shape(self.k_proj(hidden_states), -1, bsz) value_states = self._shape(self.v_proj(hidden_states), -1, bsz) if self.is_decoder: # if cross_attention save Tuple(torch.Tensor, 
torch.Tensor) of all cross attention key/value_states. # Further calls to cross_attention layer can then reuse all cross-attention # key/value_states (first "if" case) # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of # all previous decoder key/value_states. Further calls to uni-directional self-attention # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) # if encoder bi-directional self-attention `past_key_value` is always `None` past_key_value = (key_states, value_states) proj_shape = (bsz * self.num_heads, -1, self.head_dim) query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) key_states = key_states.view(*proj_shape) value_states = value_states.view(*proj_shape) src_len = key_states.size(1) attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): raise ValueError( f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" f" {attn_weights.size()}" ) if attention_mask is not None: if attention_mask.size() != (bsz, 1, tgt_len, src_len): raise ValueError( f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" ) attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) attn_weights = nn.functional.softmax(attn_weights, dim=-1) if layer_head_mask is not None: if layer_head_mask.size() != (self.num_heads,): raise ValueError( f"Head mask for a single layer should be of size {(self.num_heads,)}, but is" f" {layer_head_mask.size()}" ) attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len) attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) if output_attentions: # this operation is a bit awkward, but it's required to # make sure that 
class BartEncoderLayer(nn.Module):
    """One encoder block: self-attention followed by a two-layer feed-forward network.

    Each sub-block applies dropout, adds the residual and then applies layer
    normalisation (normalisation comes after the residual sum).
    """

    def __init__(self, config: BartConfig):
        super().__init__()
        self.embed_dim = config.d_model
        self.self_attn = BartAttention(
            embed_dim=self.embed_dim,
            num_heads=config.encoder_attention_heads,
            dropout=config.attention_dropout,
        )
        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
        self.dropout = config.dropout
        self.activation_fn = ACT2FN[config.activation_function]
        self.activation_dropout = config.activation_dropout
        self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
        self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
        self.final_layer_norm = nn.LayerNorm(self.embed_dim)

    def forward(
        self,
        hidden_states: torch.FloatTensor,
        attention_mask: torch.FloatTensor,
        layer_head_mask: torch.FloatTensor,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
                (the attention module unpacks the size as batch, time, channel)
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.

        Returns:
            A tuple whose first element is the transformed hidden states; the attention weights are appended
            as a second element only when `output_attentions` is true.
        """
        # Self-attention sub-block: attention -> dropout -> residual -> layer norm.
        residual = hidden_states
        hidden_states, attn_weights, _ = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            output_attentions=output_attentions,
        )
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states
        hidden_states = self.self_attn_layer_norm(hidden_states)

        # Feed-forward sub-block: fc1 -> activation -> dropout -> fc2 -> dropout -> residual -> layer norm.
        residual = hidden_states
        hidden_states = self.activation_fn(self.fc1(hidden_states))
        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
        hidden_states = self.fc2(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states
        hidden_states = self.final_layer_norm(hidden_states)

        # In half precision, clamp to just below the dtype maximum whenever
        # the result contains an inf or a NaN.
        if hidden_states.dtype == torch.float16 and (
            torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()
        ):
            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)

        return outputs
self.dropout = config.dropout self.activation_fn = ACT2FN[config.activation_function] self.activation_dropout = config.activation_dropout self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) self.encoder_attn = BartAttention( self.embed_dim, config.decoder_attention_heads, dropout=config.attention_dropout, is_decoder=True, ) self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim) self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) self.final_layer_norm = nn.LayerNorm(self.embed_dim) def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, encoder_hidden_states: Optional[torch.Tensor] = None, encoder_attention_mask: Optional[torch.Tensor] = None, layer_head_mask: Optional[torch.Tensor] = None, cross_attn_layer_head_mask: Optional[torch.Tensor] = None, past_key_value: Optional[Tuple[torch.Tensor]] = None, output_attentions: Optional[bool] = False, use_cache: Optional[bool] = True, ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: """ Args: hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` attention_mask (`torch.FloatTensor`): attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. encoder_hidden_states (`torch.FloatTensor`): cross attention input to the layer of shape `(batch, seq_len, embed_dim)` encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size `(encoder_attention_heads,)`. cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of size `(decoder_attention_heads,)`. 
past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail. """ residual = hidden_states # Self Attention # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None # add present self-attn cache to positions 1,2 of present_key_value tuple hidden_states, self_attn_weights, present_key_value = self.self_attn( hidden_states=hidden_states, past_key_value=self_attn_past_key_value, attention_mask=attention_mask, layer_head_mask=layer_head_mask, output_attentions=output_attentions, ) hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states hidden_states = self.self_attn_layer_norm(hidden_states) # Cross-Attention Block cross_attn_present_key_value = None cross_attn_weights = None if encoder_hidden_states is not None: residual = hidden_states # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn( hidden_states=hidden_states, key_value_states=encoder_hidden_states, attention_mask=encoder_attention_mask, layer_head_mask=cross_attn_layer_head_mask, past_key_value=cross_attn_past_key_value, output_attentions=output_attentions, ) hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states hidden_states = self.encoder_attn_layer_norm(hidden_states) # add cross-attn to positions 3,4 of present_key_value tuple present_key_value = present_key_value + cross_attn_present_key_value # Fully Connected residual = hidden_states 
hidden_states = self.activation_fn(self.fc1(hidden_states)) hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) hidden_states = self.fc2(hidden_states) hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states hidden_states = self.final_layer_norm(hidden_states) outputs = (hidden_states,) if output_attentions: outputs += (self_attn_weights, cross_attn_weights) if use_cache: outputs += (present_key_value,) return outputs class BartClassificationHead(nn.Module): """Head for sentence-level classification tasks.""" def __init__( self, input_dim: int, inner_dim: int, num_classes: int, pooler_dropout: float, ): super().__init__() self.dense = nn.Linear(input_dim, inner_dim) self.dropout = nn.Dropout(p=pooler_dropout) self.out_proj = nn.Linear(inner_dim, num_classes) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states = self.dropout(hidden_states) hidden_states = self.dense(hidden_states) hidden_states = torch.tanh(hidden_states) hidden_states = self.dropout(hidden_states) hidden_states = self.out_proj(hidden_states) return hidden_states class BartPretrainedModel(PreTrainedModel): config_class = BartConfig base_model_prefix = "model" supports_gradient_checkpointing = True _keys_to_ignore_on_load_unexpected = [r"encoder.version", r"decoder.version"] def _init_weights(self, module): std = self.config.init_std if isinstance(module, nn.Linear): module.weight.data.normal_(mean=0.0, std=std) if module.bias is not None: module.bias.data.zero_() elif isinstance(module, nn.Embedding): module.weight.data.normal_(mean=0.0, std=std) if module.padding_idx is not None: module.weight.data[module.padding_idx].zero_() def _set_gradient_checkpointing(self, module, value=False): if isinstance(module, (BartDecoder, BartEncoder)): module.gradient_checkpointing = value @property def dummy_inputs(self): pad_token = self.config.pad_token_id 
class PretrainedBartModel(BartPretrainedModel):
    """Deprecated alias of `BartPretrainedModel`, kept so that existing subclasses keep working.

    Defining a subclass of this class emits a `FutureWarning` that points users to
    `BartPretrainedModel`.
    """

    def __init_subclass__(cls, **kwargs):
        # Chain to the parent hook so cooperative `__init_subclass__` implementations
        # further up the MRO still run.
        super().__init_subclass__(**kwargs)
        warnings.warn(
            "The class `PretrainedBartModel` has been deprecated, please use `BartPretrainedModel` instead.",
            FutureWarning,
            # Attribute the warning to the subclass definition rather than to this hook.
            stacklevel=2,
        )
) >>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors="pt") >>> # Generate Summary >>> summary_ids = model.generate(inputs["input_ids"], num_beams=2, min_length=0, max_length=20) >>> tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] 'PG&E scheduled the blackouts in response to forecasts for high winds amid dry conditions' ``` Mask filling example: ```python >>> from transformers import BartTokenizer, BartForConditionalGeneration >>> tokenizer = BartTokenizer.from_pretrained("facebook/bart-base") >>> model = BartForConditionalGeneration.from_pretrained("facebook/bart-base") >>> TXT = "My friends are but they eat too many carbs." >>> input_ids = tokenizer([TXT], return_tensors="pt")["input_ids"] >>> logits = model(input_ids).logits >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item() >>> probs = logits[0, masked_index].softmax(dim=0) >>> values, predictions = probs.topk(5) >>> tokenizer.decode(predictions).split() ['not', 'good', 'healthy', 'great', 'very'] ``` """ BART_INPUTS_DOCSTRING = r""" Args: input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide it. Indices can be obtained using [`BartTokenizer`]. See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input IDs?](../glossary#input-ids) attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - 1 for tokens that are **not masked**, - 0 for tokens that are **masked**. [What are attention masks?](../glossary#attention-mask) decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): Indices of decoder input sequence tokens in the vocabulary. Indices can be obtained using [`BartTokenizer`]. 
See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are decoder input IDs?](../glossary#decoder-input-ids) Bart uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`). For translation and summarization training, `decoder_input_ids` should be provided. If no `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right for denoising pre-training following the paper. decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also be used by default. If you want to change padding behavior, you should read [`modeling_bart._prepare_decoder_attention_mask`] and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more information on the default strategy. head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*): Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`: - 1 indicates the head is **not masked**, - 0 indicates the head is **masked**. decoder_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`: - 1 indicates the head is **not masked**, - 0 indicates the head is **masked**. cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0, 1]`: - 1 indicates the head is **not masked**, - 0 indicates the head is **masked**. 
encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*): Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`) `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`. inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*): Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded representation. If `past_key_values` is used, optionally only the last `decoder_inputs_embeds` have to be input (see `past_key_values`). 
This is useful if you want more control over how to convert `decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix. If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, `decoder_inputs_embeds` takes the value of `inputs_embeds`. use_cache (`bool`, *optional*): If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see `past_key_values`). output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail. output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. """ class BartEncoder(BartPretrainedModel): """ Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a [`BartEncoderLayer`]. 
Args: config: BartConfig embed_tokens (nn.Embedding): output embedding """ def __init__(self, config: BartConfig, embed_tokens: Optional[nn.Embedding] = None): super().__init__(config) self.dropout = config.dropout self.layerdrop = config.encoder_layerdrop embed_dim = config.d_model self.padding_idx = config.pad_token_id self.max_source_positions = config.max_position_embeddings self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0 if embed_tokens is not None: self.embed_tokens = embed_tokens else: self.embed_tokens = nn.Embedding(config.vocab_size, embed_dim, self.padding_idx) self.embed_positions = BartLearnedPositionalEmbedding( config.max_position_embeddings, embed_dim, ) self.layers = nn.ModuleList([BartEncoderLayer(config) for _ in range(config.encoder_layers)]) self.layernorm_embedding = nn.LayerNorm(embed_dim) self.gradient_checkpointing = False # Initialize weights and apply final processing self.post_init() def get_input_embeddings(self): return self.embed_tokens def set_input_embeddings(self, value): self.embed_tokens = value def forward( self, input_ids: torch.LongTensor = None, attention_mask: Optional[torch.Tensor] = None, head_mask: Optional[torch.Tensor] = None, inputs_embeds: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, BaseModelOutput]: r""" Args: input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide it. Indices can be obtained using [`BartTokenizer`]. See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input IDs?](../glossary#input-ids) attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): Mask to avoid performing attention on padding token indices. 
Mask values selected in `[0, 1]`: - 1 for tokens that are **not masked**, - 0 for tokens that are **masked**. [What are attention masks?](../glossary#attention-mask) head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*): Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`: - 1 indicates the head is **not masked**, - 0 indicates the head is **masked**. inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail. output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
""" output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict # retrieve input_ids and inputs_embeds if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: input_shape = input_ids.size() input_ids = input_ids.view(-1, input_shape[-1]) elif inputs_embeds is not None: input_shape = inputs_embeds.size()[:-1] else: raise ValueError("You have to specify either input_ids or inputs_embeds") if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale embed_pos = self.embed_positions(input_shape) hidden_states = inputs_embeds + embed_pos hidden_states = self.layernorm_embedding(hidden_states) hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) # expand attention_mask if attention_mask is not None: # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] attention_mask = _expand_mask(attention_mask, inputs_embeds.dtype) encoder_states = () if output_hidden_states else None all_attentions = () if output_attentions else None # check if head_mask has a correct number of layers specified if desired if head_mask is not None: if head_mask.size()[0] != (len(self.layers)): raise ValueError( f"The head_mask should be specified for {len(self.layers)} layers, but it is for" f" {head_mask.size()[0]}." 
class BartDecoder(BartPretrainedModel):
    """
    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`BartDecoderLayer`]

    Args:
        config: BartConfig
        embed_tokens (nn.Embedding): token embedding table used to embed `input_ids`. When `None`, a new
            `nn.Embedding(config.vocab_size, config.d_model, config.pad_token_id)` is created.
    """

    def __init__(self, config: BartConfig, embed_tokens: Optional[nn.Embedding] = None):
        super().__init__(config)
        self.dropout = config.dropout
        self.layerdrop = config.decoder_layerdrop
        self.padding_idx = config.pad_token_id
        self.max_target_positions = config.max_position_embeddings
        self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0

        if embed_tokens is not None:
            self.embed_tokens = embed_tokens
        else:
            self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model, self.padding_idx)

        self.embed_positions = BartLearnedPositionalEmbedding(
            config.max_position_embeddings,
            config.d_model,
        )
        self.layers = nn.ModuleList([BartDecoderLayer(config) for _ in range(config.decoder_layers)])
        self.layernorm_embedding = nn.LayerNorm(config.d_model)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the token embedding module."""
        return self.embed_tokens

    def set_input_embeddings(self, value):
        """Replace the token embedding module with `value`."""
        self.embed_tokens = value

    def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
        """Combine the causal mask with the (optional) padding mask.

        A causal mask is only built when more than one target position is being decoded; the expanded
        padding mask, when given, is added to it. Returns `None` when neither mask applies.
        """
        # create causal mask
        # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
        combined_attention_mask = None
        if input_shape[-1] > 1:
            combined_attention_mask = _make_causal_mask(
                input_shape, inputs_embeds.dtype, past_key_values_length=past_key_values_length
            ).to(inputs_embeds.device)

        if attention_mask is not None:
            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
            expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1])
            combined_attention_mask = (
                expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
            )

        return combined_attention_mask

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
        r"""
        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
                provide it.

                Indices can be obtained using [`BartTokenizer`]. See [`PreTrainedTokenizer.encode`] and
                [`PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
                of the decoder.
            encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
                Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
                selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the cross-attention modules in the decoder to avoid performing
                cross-attention on hidden heads. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
                Tuple of `tuple(torch.FloatTensor)` with one entry per decoder layer, with each tuple having 2 tensors
                of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional tensors of
                shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.

                Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
                cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.

                If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
                that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
                all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            use_cache (`bool`, *optional*):
                Whether or not to collect and return the per-layer key/value cache. Defaults to `config.use_cache`.
                It is forced to `False` while gradient checkpointing is active during training.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.

        Raises:
            ValueError: if both or neither of `input_ids` / `inputs_embeds` are given, or if `head_mask` /
                `cross_attn_head_mask` does not have one row per decoder layer.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # retrieve input_ids and inputs_embeds
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
        elif input_ids is not None:
            input_shape = input_ids.size()
            input_ids = input_ids.view(-1, input_shape[-1])
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")

        # past_key_values_length: number of already-decoded positions held in the cache
        past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale

        attention_mask = self._prepare_decoder_attention_mask(
            attention_mask, input_shape, inputs_embeds, past_key_values_length
        )

        # expand encoder attention mask
        if encoder_hidden_states is not None and encoder_attention_mask is not None:
            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
            encoder_attention_mask = _expand_mask(encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1])

        # embed positions
        positions = self.embed_positions(input_shape, past_key_values_length)

        hidden_states = inputs_embeds + positions
        hidden_states = self.layernorm_embedding(hidden_states)

        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
        next_decoder_cache = () if use_cache else None

        # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
        for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]):
            if attn_mask is not None and attn_mask.size()[0] != len(self.layers):
                # Report the size of the mask being validated. Formatting with `head_mask` here would
                # fail on `None` (or report the wrong count) when `cross_attn_head_mask` is the bad one.
                raise ValueError(
                    f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
                    f" {attn_mask.size()[0]}."
                )

        for idx, decoder_layer in enumerate(self.layers):
            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
            if output_hidden_states:
                all_hidden_states += (hidden_states,)
            dropout_probability = random.uniform(0, 1)
            if self.training and (dropout_probability < self.layerdrop):
                continue

            past_key_value = past_key_values[idx] if past_key_values is not None else None

            if self.gradient_checkpointing and self.training:

                if use_cache:
                    logger.warning(
                        "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                    )
                    use_cache = False

                def create_custom_forward(module):
                    def custom_forward(*inputs):
                        # None for past_key_value
                        return module(*inputs, output_attentions, use_cache)

                    return custom_forward

                layer_outputs = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(decoder_layer),
                    hidden_states,
                    attention_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    head_mask[idx] if head_mask is not None else None,
                    cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None,
                    None,
                )
            else:

                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=attention_mask,
                    encoder_hidden_states=encoder_hidden_states,
                    encoder_attention_mask=encoder_attention_mask,
                    layer_head_mask=(head_mask[idx] if head_mask is not None else None),
                    cross_attn_layer_head_mask=(
                        cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None
                    ),
                    past_key_value=past_key_value,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                )
            hidden_states = layer_outputs[0]

            if use_cache:
                # The cache follows the two attention tensors when attentions are returned.
                next_decoder_cache += (layer_outputs[3 if output_attentions else 1],)

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

                if encoder_hidden_states is not None:
                    all_cross_attentions += (layer_outputs[2],)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        next_cache = next_decoder_cache if use_cache else None
        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions]
                if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
            cross_attentions=all_cross_attentions,
        )
config.vocab_size self.shared = nn.Embedding(vocab_size, config.d_model, padding_idx) self.encoder = BartEncoder(config, self.shared) self.decoder = BartDecoder(config, self.shared) # Initialize weights and apply final processing self.post_init() def get_input_embeddings(self): return self.shared def set_input_embeddings(self, value): self.shared = value self.encoder.embed_tokens = self.shared self.decoder.embed_tokens = self.shared def get_encoder(self): return self.encoder def get_decoder(self): return self.decoder @add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING) @add_code_sample_docstrings( processor_class=_TOKENIZER_FOR_DOC, checkpoint=_CHECKPOINT_FOR_DOC, output_type=Seq2SeqModelOutput, config_class=_CONFIG_FOR_DOC, expected_output=_EXPECTED_OUTPUT_SHAPE, ) def forward( self, input_ids: torch.LongTensor = None, attention_mask: Optional[torch.Tensor] = None, decoder_input_ids: Optional[torch.LongTensor] = None, decoder_attention_mask: Optional[torch.LongTensor] = None, head_mask: Optional[torch.Tensor] = None, decoder_head_mask: Optional[torch.Tensor] = None, cross_attn_head_mask: Optional[torch.Tensor] = None, encoder_outputs: Optional[List[torch.FloatTensor]] = None, past_key_values: Optional[List[torch.FloatTensor]] = None, inputs_embeds: Optional[torch.FloatTensor] = None, decoder_inputs_embeds: Optional[torch.FloatTensor] = None, use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, Seq2SeqModelOutput]: # different to other models, Bart automatically creates decoder_input_ids from # input_ids if no decoder_input_ids are provided if decoder_input_ids is None and decoder_inputs_embeds is None: if input_ids is None: raise ValueError( "If no `decoder_input_ids` or `decoder_inputs_embeds` are " "passed, `input_ids` cannot be `None`. Please pass either " "`input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`." 
) decoder_input_ids = shift_tokens_right( input_ids, self.config.pad_token_id, self.config.decoder_start_token_id ) output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) use_cache = use_cache if use_cache is not None else self.config.use_cache return_dict = return_dict if return_dict is not None else self.config.use_return_dict if encoder_outputs is None: encoder_outputs = self.encoder( input_ids=input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): encoder_outputs = BaseModelOutput( last_hidden_state=encoder_outputs[0], hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, ) # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn) decoder_outputs = self.decoder( input_ids=decoder_input_ids, attention_mask=decoder_attention_mask, encoder_hidden_states=encoder_outputs[0], encoder_attention_mask=attention_mask, head_mask=decoder_head_mask, cross_attn_head_mask=cross_attn_head_mask, past_key_values=past_key_values, inputs_embeds=decoder_inputs_embeds, use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) if not return_dict: return decoder_outputs + encoder_outputs return Seq2SeqModelOutput( last_hidden_state=decoder_outputs.last_hidden_state, past_key_values=decoder_outputs.past_key_values, decoder_hidden_states=decoder_outputs.hidden_states, decoder_attentions=decoder_outputs.attentions, 
cross_attentions=decoder_outputs.cross_attentions, encoder_last_hidden_state=encoder_outputs.last_hidden_state, encoder_hidden_states=encoder_outputs.hidden_states, encoder_attentions=encoder_outputs.attentions, ) @add_start_docstrings( "The BART Model with a language modeling head. Can be used for summarization.", BART_START_DOCSTRING ) class BartForConditionalGeneration(BartPretrainedModel): base_model_prefix = "model" _keys_to_ignore_on_load_missing = [r"final_logits_bias", r"lm_head.weight"] def __init__(self, config: BartConfig): super().__init__(config) self.model = BartModel(config) self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings))) self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False) # Initialize weights and apply final processing self.post_init() def get_encoder(self): return self.model.get_encoder() def get_decoder(self): return self.model.get_decoder() def resize_token_embeddings(self, new_num_tokens: int) -> nn.Embedding: new_embeddings = super().resize_token_embeddings(new_num_tokens) self._resize_final_logits_bias(new_num_tokens) return new_embeddings def _resize_final_logits_bias(self, new_num_tokens: int) -> None: old_num_tokens = self.final_logits_bias.shape[-1] if new_num_tokens <= old_num_tokens: new_bias = self.final_logits_bias[:, :new_num_tokens] else: extra_bias = torch.zeros((1, new_num_tokens - old_num_tokens), device=self.final_logits_bias.device) new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1) self.register_buffer("final_logits_bias", new_bias) def get_output_embeddings(self): return self.lm_head def set_output_embeddings(self, new_embeddings): self.lm_head = new_embeddings @add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC) @add_end_docstrings(BART_GENERATION_EXAMPLE) def forward( self, input_ids: torch.LongTensor = None, attention_mask: 
Optional[torch.Tensor] = None, decoder_input_ids: Optional[torch.LongTensor] = None, decoder_attention_mask: Optional[torch.LongTensor] = None, head_mask: Optional[torch.Tensor] = None, decoder_head_mask: Optional[torch.Tensor] = None, cross_attn_head_mask: Optional[torch.Tensor] = None, encoder_outputs: Optional[List[torch.FloatTensor]] = None, past_key_values: Optional[List[torch.FloatTensor]] = None, inputs_embeds: Optional[torch.FloatTensor] = None, decoder_inputs_embeds: Optional[torch.FloatTensor] = None, labels: Optional[torch.LongTensor] = None, use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, Seq2SeqLMOutput]: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. 
def prepare_inputs_for_generation(
    self,
    decoder_input_ids,
    past=None,
    attention_mask=None,
    head_mask=None,
    decoder_head_mask=None,
    cross_attn_head_mask=None,
    use_cache=None,
    encoder_outputs=None,
    **kwargs
):
    """Assemble the keyword arguments for one generation step.

    When ``past`` is given, only the last column of ``decoder_input_ids`` is
    forwarded; otherwise the ids are passed through untouched. ``input_ids``
    is always ``None`` in the returned dict because ``encoder_outputs`` is
    supplied instead. Extra ``**kwargs`` are accepted and ignored.
    """
    # With a cache available, only the newest decoder position is needed.
    step_ids = decoder_input_ids[:, -1:] if past is not None else decoder_input_ids
    model_inputs = dict(
        input_ids=None,  # encoder_outputs is defined. input_ids not needed
        encoder_outputs=encoder_outputs,
        past_key_values=past,
        decoder_input_ids=step_ids,
        attention_mask=attention_mask,
        head_mask=head_mask,
        decoder_head_mask=decoder_head_mask,
        cross_attn_head_mask=cross_attn_head_mask,
        use_cache=use_cache,  # change this to avoid caching (presumably for debugging)
    )
    return model_inputs
@add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
    processor_class=_TOKENIZER_FOR_DOC,
    checkpoint=_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION,
    output_type=Seq2SeqSequenceClassifierOutput,
    config_class=_CONFIG_FOR_DOC,
    expected_output=_SEQ_CLASS_EXPECTED_OUTPUT,
    expected_loss=_SEQ_CLASS_EXPECTED_LOSS,
)
def forward(
    self,
    input_ids: torch.LongTensor = None,
    attention_mask: Optional[torch.Tensor] = None,
    decoder_input_ids: Optional[torch.LongTensor] = None,
    decoder_attention_mask: Optional[torch.LongTensor] = None,
    head_mask: Optional[torch.Tensor] = None,
    decoder_head_mask: Optional[torch.Tensor] = None,
    cross_attn_head_mask: Optional[torch.Tensor] = None,
    encoder_outputs: Optional[List[torch.FloatTensor]] = None,
    inputs_embeds: Optional[torch.FloatTensor] = None,
    decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
    labels: Optional[torch.LongTensor] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
) -> Union[Tuple, Seq2SeqSequenceClassifierOutput]:
    r"""
    labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
        Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
        config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
    """
    # NOTE: the docstring above deliberately starts with an argument entry; the
    # decorator prepends the shared inputs documentation in front of it.
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict
    if labels is not None:
        # A loss is being computed, so the decoding cache is switched off.
        use_cache = False

    if input_ids is None and inputs_embeds is not None:
        # The sentence representation below is located through `input_ids`,
        # so an embeddings-only call cannot be served.
        raise NotImplementedError(
            f"Passing input embeddings is currently not supported for {self.__class__.__name__}"
        )

    outputs = self.model(
        input_ids,
        attention_mask=attention_mask,
        decoder_input_ids=decoder_input_ids,
        decoder_attention_mask=decoder_attention_mask,
        head_mask=head_mask,
        decoder_head_mask=decoder_head_mask,
        cross_attn_head_mask=cross_attn_head_mask,
        encoder_outputs=encoder_outputs,
        inputs_embeds=inputs_embeds,
        decoder_inputs_embeds=decoder_inputs_embeds,
        use_cache=use_cache,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )
    hidden_states = outputs[0]  # last hidden state

    # Boolean mask of the positions holding the EOS token id.
    eos_mask = input_ids.eq(self.config.eos_token_id)

    # The reshape below only works when every row selects the same number of
    # positions, i.e. every example contains the same number of EOS tokens.
    if len(torch.unique_consecutive(eos_mask.sum(1))) > 1:
        raise ValueError("All examples must have the same number of <eos> tokens.")
    # Gather the hidden states at EOS positions, regroup them per example and
    # keep the one belonging to the last EOS token of each example.
    sentence_representation = hidden_states[eos_mask, :].view(hidden_states.size(0), -1, hidden_states.size(-1))[
        :, -1, :
    ]
    logits = self.classification_head(sentence_representation)

    loss = None
    if labels is not None:
        if self.config.problem_type is None:
            # Infer the problem type once from the label count / label dtype
            # and remember it on the config.
            if self.config.num_labels == 1:
                self.config.problem_type = "regression"
            elif self.config.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                self.config.problem_type = "single_label_classification"
            else:
                self.config.problem_type = "multi_label_classification"

        if self.config.problem_type == "regression":
            loss_fct = MSELoss()
            if self.config.num_labels == 1:
                loss = loss_fct(logits.squeeze(), labels.squeeze())
            else:
                loss = loss_fct(logits, labels)
        elif self.config.problem_type == "single_label_classification":
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))
        elif self.config.problem_type == "multi_label_classification":
            loss_fct = BCEWithLogitsLoss()
            loss = loss_fct(logits, labels)

    if not return_dict:
        # Tuple output: logits replace the first element of the base model
        # output; the loss is put in front when it was computed.
        output = (logits,) + outputs[1:]
        return ((loss,) + output) if loss is not None else output

    return Seq2SeqSequenceClassifierOutput(
        loss=loss,
        logits=logits,
        past_key_values=outputs.past_key_values,
        decoder_hidden_states=outputs.decoder_hidden_states,
        decoder_attentions=outputs.decoder_attentions,
        cross_attentions=outputs.cross_attentions,
        encoder_last_hidden_state=outputs.encoder_last_hidden_state,
        encoder_hidden_states=outputs.encoder_hidden_states,
        encoder_attentions=outputs.encoder_attentions,
    )
decoder_inputs_embeds: Optional[torch.FloatTensor] = None, use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, Seq2SeqQuestionAnsweringModelOutput]: r""" start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for position (index) of the start of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (*sequence_length*). Position outside of the sequence are not taken into account for computing the loss. end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for position (index) of the end of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (*sequence_length*). Position outside of the sequence are not taken into account for computing the loss. """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict if start_positions is not None and end_positions is not None: use_cache = False outputs = self.model( input_ids, attention_mask=attention_mask, decoder_input_ids=decoder_input_ids, decoder_attention_mask=decoder_attention_mask, head_mask=head_mask, decoder_head_mask=decoder_head_mask, cross_attn_head_mask=cross_attn_head_mask, encoder_outputs=encoder_outputs, inputs_embeds=inputs_embeds, decoder_inputs_embeds=decoder_inputs_embeds, use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) sequence_output = outputs[0] logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) start_logits = start_logits.squeeze(-1).contiguous() end_logits = end_logits.squeeze(-1).contiguous() total_loss = None if start_positions is not None and end_positions is not None: # If we are on multi-GPU, split add a dimension if len(start_positions.size()) > 1: 
start_positions = start_positions.squeeze(-1) if len(end_positions.size()) > 1: end_positions = end_positions.squeeze(-1) # sometimes the start/end positions are outside our model inputs, we ignore these terms ignored_index = start_logits.size(1) start_positions = start_positions.clamp(0, ignored_index) end_positions = end_positions.clamp(0, ignored_index) loss_fct = CrossEntropyLoss(ignore_index=ignored_index) start_loss = loss_fct(start_logits, start_positions) end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 if not return_dict: output = ( start_logits, end_logits, ) + outputs[1:] return ((total_loss,) + output) if total_loss is not None else output return Seq2SeqQuestionAnsweringModelOutput( loss=total_loss, start_logits=start_logits, end_logits=end_logits, past_key_values=outputs.past_key_values, decoder_hidden_states=outputs.decoder_hidden_states, decoder_attentions=outputs.decoder_attentions, cross_attentions=outputs.cross_attentions, encoder_last_hidden_state=outputs.encoder_last_hidden_state, encoder_hidden_states=outputs.encoder_hidden_states, encoder_attentions=outputs.encoder_attentions, ) class BartDecoderWrapper(BartPretrainedModel): """ This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is used in combination with the [`EncoderDecoderModel`] framework. 
@replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
def forward(
    self,
    input_ids: torch.LongTensor = None,
    attention_mask: Optional[torch.Tensor] = None,
    encoder_hidden_states: Optional[torch.FloatTensor] = None,
    encoder_attention_mask: Optional[torch.FloatTensor] = None,
    head_mask: Optional[torch.Tensor] = None,
    cross_attn_head_mask: Optional[torch.Tensor] = None,
    past_key_values: Optional[List[torch.FloatTensor]] = None,
    inputs_embeds: Optional[torch.FloatTensor] = None,
    labels: Optional[torch.LongTensor] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
) -> Union[Tuple, CausalLMOutputWithCrossAttentions]:
    r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
            provide it.

            Indices can be obtained using [`BartTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
            if the model is configured as a decoder.
        encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used
            in the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
            shape `(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional tensors of
            shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. The two additional
            tensors are only required when the model is used as a decoder in a Sequence to Sequence model.

            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
            cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.

            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
            that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
            all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
            (see `past_key_values`).
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under
            returned tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
            for more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.

    Returns:

    Example:

    ```python
    >>> from transformers import BartTokenizer, BartForCausalLM

    >>> tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
    >>> model = BartForCausalLM.from_pretrained("facebook/bart-base", add_cross_attention=False)
    >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
    >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
    >>> outputs = model(**inputs)

    >>> logits = outputs.logits
    >>> expected_shape = [1, inputs.input_ids.shape[-1], model.config.vocab_size]
    >>> list(logits.shape) == expected_shape
    True
    ```"""
    # NOTE: keep the bare "Returns:" line in the docstring above; it is the
    # placeholder the `replace_return_docstrings` decorator is applied to.

    # Fall back to the configuration for every flag the caller left unset.
    if output_attentions is None:
        output_attentions = self.config.output_attentions
    if output_hidden_states is None:
        output_hidden_states = self.config.output_hidden_states
    if return_dict is None:
        return_dict = self.config.use_return_dict

    # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
    decoder_kwargs = dict(
        input_ids=input_ids,
        attention_mask=attention_mask,
        encoder_hidden_states=encoder_hidden_states,
        encoder_attention_mask=encoder_attention_mask,
        head_mask=head_mask,
        cross_attn_head_mask=cross_attn_head_mask,
        past_key_values=past_key_values,
        inputs_embeds=inputs_embeds,
        use_cache=use_cache,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )
    outputs = self.model.decoder(**decoder_kwargs)

    # Project the decoder features onto the vocabulary.
    logits = self.lm_head(outputs[0])

    loss = None
    if labels is not None:
        loss_fct = CrossEntropyLoss()
        flat_logits = logits.view(-1, self.config.vocab_size)
        loss = loss_fct(flat_logits, labels.view(-1))

    if return_dict:
        return CausalLMOutputWithCrossAttentions(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            cross_attentions=outputs.cross_attentions,
        )

    # Tuple output: logits replace the decoder features, loss goes first if any.
    output = (logits,) + outputs[1:]
    if loss is None:
        return output
    return (loss,) + output
# encoder_outputs is defined. input_ids not needed "attention_mask": attention_mask, "past_key_values": past, "use_cache": use_cache, } @staticmethod def _reorder_cache(past, beam_idx): reordered_past = () for layer_past in past: reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),) return reordered_past ================================================ FILE: hanlp/components/amr/amrbart/model_interface/tokenization_bart.py ================================================ # coding:utf-8 # this is a simplified version of "https://github.com/SapienzaNLP/spring/blob/main/spring_amr/tokenization_bart.py" # MIT License # # Copyright (c) 2022 xfbai # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
@classmethod
def from_pretrained(cls, pretrained_model_path, *args, **kwargs):
    """Load a tokenizer and set up its AMR vocabulary.

    All arguments are forwarded unchanged to the parent class'
    ``from_pretrained``; ``init_amr_vocabulary()`` is then called on the
    loaded instance before it is returned.
    """
    tokenizer = super().from_pretrained(pretrained_model_path, *args, **kwargs)
    tokenizer.init_amr_vocabulary()
    return tokenizer
Modified in order to handle sentences with recategorization pointers""" bpe_tokens = [] for tok_span in text.lstrip().split(' '): tok_span = tok_span.strip() recats = tok_span.rsplit('_', 1) if len(recats) == 2 and recats[0] in self.recategorizations and ('_' + recats[1]) in self.encoder: bpe_tokens.extend([self.INIT + recats[0], '_' + recats[1]]) else: for token in re.findall(self.pat, ' ' + tok_span): token = "".join( self.byte_encoder[b] for b in token.encode("utf-8") ) # Maps all our bytes to unicode strings, avoiding controle tokens of the BPE (spaces in our case) bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" ")) return bpe_tokens def _tok_bpe(self, token): tokk = [] tok = token.strip() recats = tok.rsplit('_', 1) if len(recats) == 2 and recats[0] in self.recategorizations and ('_' + recats[1]) in self.encoder: tokk.extend([self.INIT + recats[0], '_' + recats[1]]) else: for tok in self.patterns.findall(' ' + token): tok = "".join( self.byte_encoder[b] for b in tok.encode("utf-8")) toks = self.bpe(tok).split(' ') tokk.extend(toks) return tokk def tokenize_amr(self, amr_tokens): bpe_tokens = [] for i, tokk in enumerate(amr_tokens): is_in_enc = self.INIT + tokk in self.encoder is_rel = tokk.startswith(':') and len(tokk) > 1 is_spc = tokk.startswith('<') and tokk.endswith('>') is_of = tokk.startswith(':') and tokk.endswith('-of') is_frame = re.match(r'.+-\d\d', tokk) is not None if tokk.startswith('"') and tokk.endswith('"'): # dealing with examples like "The_United_Kingdom_of_xxx" tokk = tokk[1:-1].replace('_', ' ') bpe_toks = [self.INIT + ""] bpe_toks += self._tok_bpe(tokk) bpe_toks.append(self.INIT + "") elif (is_rel or is_spc or is_frame or is_of): if is_in_enc: bpe_toks = [self.INIT + tokk] elif is_frame: bpe_toks = self._tok_bpe(tokk[:-3]) + [tokk[-3:]] elif is_of: rel = tokk[:-3] if self.INIT + rel in self.encoder: bpe_toks = [self.INIT + rel, '-of'] else: bpe_toks = [self.INIT + ':'] + self._tok_bpe(rel[1:]) + ['-of'] elif is_rel: 
def decode_amr(self, tokens, restore_name_ops=None):
    """Turn predicted tokens into a graph, falling back on any failure.

    The work happens in three stages: decoding ``tokens`` into nodes and
    backreferences, building a graph from the nodes, and connecting the graph
    if it is not connected. An exception in any stage yields the back-off
    graph and back-off status from ``postprocessing``.

    ``restore_name_ops`` is accepted but not used by this method.

    Returns:
        A ``(graph, status, (nodes, backreferences))`` tuple. The last element
        is ``(None, None)`` when one of the first two stages failed.
    """
    # Stage 1: tokens -> nodes and backreferences.
    try:
        nodes, backreferences = postprocessing.decode_into_node_and_backreferences(tokens, self)
    except Exception:
        return postprocessing.BACKOFF, postprocessing.ParsedStatus.BACKOFF, (None, None)

    # Stage 2: nodes -> graph.
    try:
        graph = self._fix_and_make_graph(nodes)
    except Exception:
        return postprocessing.BACKOFF, postprocessing.ParsedStatus.BACKOFF, (None, None)

    decoded = (nodes, backreferences)

    # Stage 3: make sure the graph is connected. From here on the decoded
    # nodes are reported even when the stage fails.
    try:
        graph, status = postprocessing.connect_graph_if_not_connected(graph)
    except Exception:
        return postprocessing.BACKOFF, postprocessing.ParsedStatus.BACKOFF, decoded
    return graph, status, decoded
not None: nodes_.append(pst) else: nodes_.append(nxt) i += 1 nodes = nodes_ i = 1 nodes_ = [nodes[0]] while i < len(nodes): nxt = nodes[i] if isinstance(nxt, str) and nxt.startswith(' 0: line = line[:i].strip() break old_line = line while True: open_count = len(re.findall(r'\(', line)) close_count = len(re.findall(r'\)', line)) if open_count > close_count: line += ')' * (open_count - close_count) elif close_count > open_count: for i in range(close_count - open_count): line = line.rstrip(')') line = line.rstrip(' ') if old_line == line: break old_line = line """ graph = penman.decode(linearized + ' ') triples = [] newvars = 2000 for triple in graph.triples: x, rel, y = triple if x is None: pass elif rel == ':instance' and y is None: triples.append(penman.Triple(x, rel, 'thing')) elif y is None: var = f'z{newvars}' newvars += 1 triples.append(penman.Triple(x, rel, var)) triples.append(penman.Triple(var, ':instance', 'thing')) else: triples.append(triple) graph = penman.Graph(triples) linearized = encode(graph) def fix_text(linearized=linearized): n = 0 def _repl1(match): nonlocal n out = match.group(1) + match.group(2) + str(3000 + n) + ' / ' + match.group(2) + match.group(3) n += 1 return out linearized = re.sub(r'(\(\s?)([a-z])([^\/:\)]+[:\)])', _repl1, linearized, flags=re.IGNORECASE | re.MULTILINE) def _repl2(match): return match.group(1) linearized = re.sub(r'(\(\s*[a-z][\d+]\s*\/\s*[^\s\)\(:\/]+\s*)((?:/\s*[^\s\)\(:\/]+\s*)+)', _repl2, linearized, flags=re.IGNORECASE | re.MULTILINE) # adds a ':' to args w/o it linearized = re.sub(r'([^:])(ARG)', r'\1 :\2', linearized) # removes edges with no node # linearized = re.sub(r':[^\s\)\(:\/]+?\s*\)', ')', linearized, flags=re.MULTILINE) return linearized linearized = fix_text(linearized) g = penman.decode(linearized) return g def _classify(self, node): if not isinstance(node, str): return "CONST" elif node == 'i': return "I" elif re.match(r'^[a-z]\d*$', node) is not None: return "VAR" elif node[0].isdigit(): return 
"CONST" elif node.startswith('"') and node.endswith('"'): return "CONST" elif node in ('+', '-'): return "CONST" elif node == ':mode': return 'MODE' elif node.startswith(':'): return "EDGE" elif node in ['/', '(', ')']: return node elif node[0].isalpha(): for char in (',', ':', '/', '(', ')', '.', '!', '?', '\\'): if char in node: return "CONST" return "INST" else: return 'CONST' ================================================ FILE: hanlp/components/amr/amrbart/preprocess/__init__.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2022-12-03 20:33 ================================================ FILE: hanlp/components/amr/amrbart/preprocess/amr_io.py ================================================ # coding:utf-8 # the code is migrated from https://github.com/SapienzaNLP/spring # MIT License # # Copyright (c) 2022 xfbai # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
def read_raw_amr_data(
        paths: List[Union[str, Path]],
        use_recategorization=False,
        dereify=True,
        remove_wiki=False,
):
    """Load AMR graphs from one or more files or glob patterns.

    Args:
        paths: A path/glob pattern, or a list of them. A single ``str`` or
            ``Path`` is treated as one pattern.
        use_recategorization: When true, the original sentence is kept under
            ``snt_orig`` and ``snt`` is rebuilt from the ``tokens`` metadata,
            dropping tokens of the form ``-L...-`` / ``-R...-``.
        dereify: Forwarded to the penman loader.
        remove_wiki: Forwarded to the penman loader.

    Returns:
        The list of loaded graphs.

    Raises:
        AssertionError: If ``paths`` is empty or no graph could be loaded.
    """
    import ast

    assert paths
    # A plain string is itself iterable; without this check it would be
    # walked character by character instead of being used as one pattern.
    if isinstance(paths, (str, Path)) or not isinstance(paths, Iterable):
        paths = [paths]

    graphs = []
    for pattern in paths:
        for match in glob.glob(str(pattern)):
            graphs.extend(pm_load(Path(match), dereify=dereify, remove_wiki=remove_wiki))

    assert graphs

    if use_recategorization:
        for g in graphs:
            metadata = g.metadata
            metadata["snt_orig"] = metadata["snt"]
            # The token list is read from the data file, so parse it as a
            # literal rather than executing it with eval().
            tokens = ast.literal_eval(metadata["tokens"])
            metadata["snt"] = " ".join(
                t for t in tokens
                if not (t.startswith(("-L", "-R")) and t.endswith("-"))
            )

    return graphs
def _get_model(dereify):
    """Select the penman model for the given ``dereify`` flag.

    ``None`` selects ``DEFAULT``; a truthy value selects ``op_model``; any
    other value selects ``noop_model``.
    """
    if dereify is None:
        return DEFAULT
    return op_model if dereify else noop_model


def _remove_wiki(graph):
    """Return a new graph whose ``:wiki`` triples all point to ``"+"``.

    The metadata object of the input graph is attached to the new graph.
    """
    rewritten = []
    for triple in graph.triples:
        source, role, _ = triple
        rewritten.append(Triple(source, role, "+") if role == ":wiki" else triple)
    cleaned = Graph(rewritten)
    cleaned.metadata = graph.metadata
    return cleaned


def load(source, dereify=None, remove_wiki=False):
    """Load graphs from ``source`` and optionally neutralize ``:wiki`` targets."""
    graphs = load_(source=source, model=_get_model(dereify))
    if remove_wiki:
        for position, graph in enumerate(graphs):
            graphs[position] = _remove_wiki(graph)
    return graphs


def loads(string, dereify=None, remove_wiki=False):
    """Parse graphs from ``string`` and optionally neutralize ``:wiki`` targets."""
    graphs = loads_(string=string, model=_get_model(dereify))
    if remove_wiki:
        for position, graph in enumerate(graphs):
            graphs[position] = _remove_wiki(graph)
    return graphs


def encode(g, top=None, indent=-1, compact=False):
    """Serialize ``g`` with penman's encoder, always using ``amr_model``."""
    return encode_(g=g, top=top, indent=indent, compact=compact, model=amr_model)
and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. import re import copy import json import yaml import penman from tqdm import tqdm from pathlib import Path from hanlp.components.amr.amrbart.preprocess.amr_io import read_raw_amr_data def _tokenize_encoded_graph(encoded): linearized = re.sub(r"(\".+?\")", r" \1 ", encoded) pieces = [] for piece in linearized.split(): if piece.startswith('"') and piece.endswith('"'): pieces.append(piece) else: piece = piece.replace("(", " ( ") piece = piece.replace(")", " ) ") piece = piece.replace(":", " :") piece = piece.replace("/", " / ") piece = piece.strip() pieces.append(piece) linearized = re.sub(r"\s+", " ", " ".join(pieces)).strip() return linearized.split(" ") def dfs_linearize(graph, remove_pars=False, use_pointer_tokens=True): graph_ = copy.deepcopy(graph) graph_.metadata = {} linearized = penman.encode(graph_) linearized_nodes = _tokenize_encoded_graph(linearized) if use_pointer_tokens: remap = {} for i in range(1, len(linearized_nodes)): nxt = linearized_nodes[i] lst = linearized_nodes[i - 1] if nxt == "/": remap[lst] = f"" i = 1 
def main():
    """Command-line entry point: linearize an AMR file and write three outputs.

    Reads the YAML config given by ``--config``, loads the graphs of
    ``--input_file``, linearizes each graph with ``dfs_linearize`` and writes
    ``<output_prefix>.amr`` (linearizations), ``<output_prefix>.txt``
    (sentences) and ``<output_prefix>.jsonl`` (one ``{"sent", "amr"}`` object
    per line).
    """
    from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter

    parser = ArgumentParser(
        description="AMR processing script",
        formatter_class=ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument('--config', type=Path, default='default.yaml',
                        help='Use the following config for hparams.')
    # Both options are used unconditionally below, so fail early with a clear
    # argparse message instead of a later assertion or TypeError.
    parser.add_argument('--input_file', type=str, required=True,
                        help='The input AMR file.')
    parser.add_argument('--output_prefix', type=str, required=True,
                        help='The output_prefix.')
    args, _ = parser.parse_known_args()

    with args.config.open() as y:
        # NOTE(review): FullLoader can build more than plain data; if configs
        # may come from untrusted sources, consider yaml.safe_load instead.
        config = yaml.load(y, Loader=yaml.FullLoader)

    graphs = read_raw_amr_data(
        [args.input_file],
        use_recategorization=config["use_recategorization"],
        remove_wiki=config["remove_wiki"],
        dereify=config["dereify"],
    )

    line_amr, sentences = [], []
    for g in tqdm(graphs):
        lin_tokens = dfs_linearize(g)
        sentences.append(g.metadata["snt"])
        line_amr.append(" ".join(lin_tokens))

    print(f"all {len(line_amr)} AMRs processed")

    res_out = [json.dumps({"sent": sent, "amr": lamr}) for lamr, sent in zip(line_amr, sentences)]

    outputs = ((".amr", line_amr), (".txt", sentences), (".jsonl", res_out))
    for suffix, lines in outputs:
        with open(args.output_prefix + suffix, "w", encoding="utf-8") as fout:
            fout.write("\n".join(lines) + "\n")
def read_raw_amr_data(
        paths: List[Union[str, Path]],
        use_recategorization=False,
        dereify=True,
        remove_wiki=False,
):
    """Load AMR graphs from one or more files or glob patterns.

    Args:
        paths: A path/glob pattern, or a list of them. A single ``str`` or
            ``Path`` is treated as one pattern.
        use_recategorization: When true, the original sentence is kept under
            ``snt_orig`` and ``snt`` is rebuilt from the ``tokens`` metadata,
            dropping tokens of the form ``-L...-`` / ``-R...-``.
        dereify: Forwarded to ``pm_load``.
        remove_wiki: Forwarded to ``pm_load``.

    Returns:
        The list of loaded graphs.

    Raises:
        AssertionError: If ``paths`` is empty or no graph could be loaded.
    """
    import ast

    assert paths
    # A plain string is itself iterable; without this check it would be
    # walked character by character instead of being used as one pattern.
    if isinstance(paths, (str, Path)) or not isinstance(paths, Iterable):
        paths = [paths]

    graphs = []
    for pattern in paths:
        for match in glob.glob(str(pattern)):
            path = Path(match)
            assert path.exists(), f'{path} not exist'
            graphs.extend(pm_load(path, dereify=dereify, remove_wiki=remove_wiki))

    assert graphs, 'No graphs loaded'

    if use_recategorization:
        for g in graphs:
            metadata = g.metadata
            metadata['snt_orig'] = metadata['snt']
            # The token list is read from the data file, so parse it as a
            # literal rather than executing it with eval().
            tokens = ast.literal_eval(metadata['tokens'])
            metadata['snt'] = ' '.join(
                t for t in tokens
                if not (t.startswith(('-L', '-R')) and t.endswith('-'))
            )

    return graphs
class AMRDataset(TransformableDataset):
    """Dataset of AMR graphs read from raw AMR files."""

    def __init__(self,
                 data: Union[str, List],
                 use_recategorization=False,
                 remove_wiki=False,
                 dereify=False,
                 transform: Union[Callable, List] = None,
                 cache=None,
                 generate_idx=None) -> None:
        """Store the loading options, then delegate to the base class.

        The three flags are assigned before ``super().__init__`` runs so that
        they are already available to ``load_file``.
        """
        self.dereify = dereify
        self.remove_wiki = remove_wiki
        self.use_recategorization = use_recategorization
        super().__init__(data, transform, cache, generate_idx)

    def load_file(self, filepath: str):
        """Yield one ``{'amr': graph}`` sample per graph found in ``filepath``."""
        loaded = read_raw_amr_data(
            [filepath],
            self.use_recategorization,
            remove_wiki=self.remove_wiki,
            dereify=self.dereify,
        )
        for graph in loaded:
            yield {'amr': graph}

    def get_roles(self):
        """Count the roles of all triples, folding reverted roles into their base role.

        Returns:
            A ``Counter`` mapping role to frequency; for roles accepted by
            ``role_is_reverted`` the last three characters are removed first.
        """
        counts = Counter()
        for sample in self.data:
            for _, role, _ in sample['amr'].triples:
                key = role[:-3] if role_is_reverted(role) else role
                counts[key] += 1
        return counts

    def get_frames(self):
        """Count instance targets that look like ``<lemma>-NN`` frames.

        Returns:
            A ``Counter`` mapping frame to frequency.
        """
        counts = Counter()
        for sample in self.data:
            for instance in sample['amr'].instances():
                concept = instance.target
                parts = concept.split('-')
                # NOTE(review): targets with more than one hyphen (e.g.
                # "look-up-01") are not counted -- confirm this is intended.
                if len(parts) != 2:
                    continue
                sense = parts[1]
                if len(sense) == 2 and sense.isdigit():
                    counts[concept] += 1
        return counts
def dfs_linearize_rgcn(sample: dict, tokenizer: PENMANBartTokenizer) -> dict:
    """Add linearized-graph fields, text tokens and a dependency score tensor to ``sample``.

    Args:
        sample: Sample dict; reads ``'tok'`` and ``'dep'`` (and ``'amr'`` when
            present) and is updated in place.
        tokenizer: Tokenizer providing ``linearize`` and ``cls_token``.

    Returns:
        The same ``sample`` dict.
    """
    graph = sample.get('amr', None)
    if graph:
        token_ids, extra = tokenizer.linearize(graph)
        sample['graph_tokens'] = extra['linearized_graphs']
        sample['graph_token_ids'] = token_ids

    # Each word gets a leading space; the tokenizer's cls token goes first.
    words = [' ' + word for word in sample['tok']]
    sample['text'] = [tokenizer.cls_token] + words

    scores = sample['dep']['scores']
    # assumes arc_scores has one dimension fewer than rel_scores so the
    # trailing axis broadcasts -- TODO confirm against the parser output
    weighted = scores['arc_scores'][:, :, None] * scores['rel_scores']
    # Prepend an all-zero slice along dim 0.
    leading = torch.zeros((1,) + weighted.shape[1:])
    sample['dep_graph'] = torch.cat([leading, weighted], dim=0)
    return sample
if len(pairs) == 2: con, token = pairs ids.append(tokenizer.convert_tokens_to_ids(tokenizer.INIT + con)) ids.extend(tokenizer.encode(token, add_special_tokens=False)) else: ids.append(tokenizer.convert_tokens_to_ids(tokenizer.INIT + each)) if remove_space: text = ''.join(text.split()) sample['text'] = text sample['text_token_ids'] = [tokenizer.bos_token_id] + ids + [tokenizer.eos_token_id] return sample def dfs_linearize_tokenize_with_linguistic_structures(sample: dict, tokenizer: PENMANBartTokenizer, remove_space=False, text_key='snt') -> dict: amr = sample.get('amr', None) if amr: l, e = tokenizer.linearize(amr) sample['graph_tokens'] = e['linearized_graphs'] sample['graph_token_ids'] = l text = amr.metadata[text_key] if remove_space: text = ''.join(text.split()) sample['text'] = text tok = json.loads(amr.metadata['tok']) text_token_ids = tokenizer.batch_encode_plus(tok, add_special_tokens=False).input_ids sample['text_token_ids'] = [tokenizer.bos_token_id] + sum(text_token_ids, []) + [tokenizer.eos_token_id] pos = amr.metadata.get('pos', None) if pos: flat_pos = [] pos = json.loads(pos) for subtokens, tag in zip(text_token_ids, pos): flat_pos.extend([tag] * len(subtokens)) sample['pos'] = [BOS] + flat_pos + [EOS] ner = amr.metadata.get('ner', None) if ner is not None: flat_ner = [] ner_spans = json.loads(ner) ner = ['O'] * len(text_token_ids) for form, tag, start, end in ner_spans: ner[start:end] = [tag] * (end - start) for subtokens, tag in zip(text_token_ids, ner): flat_ner.extend([tag] * len(subtokens)) sample['ner'] = [BOS] + flat_ner + [EOS] dep = amr.metadata.get('dep', None) if dep: token_to_1st_subtoken = [0] num_subtokens = 1 # 1 for BOS for subtokens in text_token_ids: token_to_1st_subtoken.append(num_subtokens) num_subtokens += len(subtokens) flat_arc, flat_rel = [0], [BOS] dep = json.loads(dep) for subtokens, (arc, rel) in zip(text_token_ids, dep): flat_arc.extend([token_to_1st_subtoken[arc]] * len(subtokens)) flat_rel.extend([rel] * len(subtokens)) 
def dep_to_levi(tok: List[str], dep: List[Tuple[int, str]]):
    """Linearize a dependency tree into a bracketed relation/token sequence.

    Args:
        tok: The tokens of the sentence.
        dep: One ``(head, relation)`` pair per token; heads are 1-based and
            ``0`` marks the root.

    Returns:
        A list alternating relation and token, with the dependents of a token
        enclosed in ``'('`` ... ``')'`` right after it.

    Raises:
        IndexError: If no token has head ``0``.
    """
    roots = [position for position, arc in enumerate(dep) if arc[0] == 0]
    output = []
    # Only the first root is expanded.
    dfs(tok, dep, roots[0], output)
    return output


def dfs(tok: List[str], dep: List[Tuple[int, str]], s, seq):
    """Append the subtree rooted at token index ``s`` to ``seq`` in pre-order.

    Args:
        tok: The tokens of the sentence.
        dep: One ``(head, relation)`` pair per token, heads 1-based.
        s: 0-based index of the subtree root.
        seq: Output list, extended in place.
    """
    seq.extend((dep[s][1], tok[s]))
    # Heads are 1-based, so the dependents of token ``s`` carry head ``s + 1``.
    dependents = [position for position, arc in enumerate(dep) if arc[0] == s + 1]
    if not dependents:
        return
    seq.append('(')
    for dependent in dependents:
        dfs(tok, dep, dependent, seq)
    seq.append(')')
class BaseLinearizer(metaclass=abc.ABCMeta):
    """Abstract interface of a linearizer; subclasses must implement ``linearize``."""

    @abc.abstractmethod
    def linearize(self, *args, **kwargs) -> SemanticGraph:
        """Produce the linearization of the given input (no default behaviour)."""
        return None
def index_default(
        item: T, list_: List[T],
        start: Optional[int] = None,
        stop: Optional[int] = None,
        default: Optional[int] = None
):
    """Return the index of the first occurrence of ``item``, or ``default``.

    Args:
        item: The value to look for (compared with ``==``).
        list_: The list to search.
        start: First index of the searched slice; ``None`` means ``0``.
        stop: End of the searched slice (exclusive, slice semantics, so a
            negative value counts from the end); ``None`` means ``len(list_)``.
        default: The value returned when ``item`` is not in the slice.

    Returns:
        The position of ``item`` counted from ``start``, or ``default``.
    """
    first = 0 if start is None else start
    last = len(list_) if stop is None else stop
    for position, candidate in enumerate(list_[first:last], start=first):
        if candidate == item:
            return position
    return default
for expansion_candidate in itertools.chain(range(order - 1, -1), range(order + 1, len(amr.triples))): if var == amr.triples[expansion_candidate][2]: expansion = expansion_candidate break else: expansion = 0 var = 'var:' + var var2instance[var] = instance graph.add_node(var, instance=instance, order=order, expansion=expansion) for triple in amr.edges(): var1, rel, var2 = triple order = triples2order[triple] if rel == ':instance': continue var1 = 'var:' + var1 var2 = 'var:' + var2 graph.add_edge(var1, var2, rel=rel, order=order) for triple in amr.attributes(): var, rel, attr = triple order = triples2order[triple] if rel == ':instance': continue var = 'var:' + var graph.add_edge(var, attr, rel=rel, order=order) # nodes that are not reachable from the root (e.g. because of reification) # will be present in the not_explored queue # undirected_graph = graph.to_undirected() # print(amr.variables()) not_explored = deque(sorted(variables, key=lambda x: nx.get_node_attributes(graph, 'order')[x])) # ( # len(nx.shortest_path(undirected_graph, 'var:' + amr.top, x)), # -graph.out_degree(x), # ) first_index = {} explored = set() added_to_queue = set() nodes_visit = [AMRTokens.BOS_N] edges_visit = [AMRTokens.BOS_E] backreferences = [0] queue = deque() queue.append('var:' + amr.top) while queue or not_explored: if queue: node1 = queue.popleft() else: node1 = not_explored.popleft() if node1 in added_to_queue: continue if not list(graph.successors(node1)): continue if node1 in variables: if node1 in explored: continue if node1 in first_index: nodes_visit.append(AMRTokens.BACKR_TRG_N) backreferences.append(first_index[node1]) else: backreferences.append(len(nodes_visit)) first_index[node1] = len(nodes_visit) nodes_visit.append(node1) edges_visit.append(AMRTokens.START_E) successors = [] for node2 in graph.successors(node1): for edge_data in graph.get_edge_data(node1, node2).values(): rel = edge_data['rel'] order = edge_data['order'] successors.append((order, rel, node2)) successors = 
def _interleave(self, graph: SemanticGraph) -> SemanticGraph:
    """Merge the parallel node and edge lists of ``graph`` into one sequence.

    The input is read in segments that each end at a ``AMRTokens.STOP_N``
    node. For every segment the source node is emitted first, then each edge
    label followed by its target node, then the stop node. Backreferences are
    remapped to positions in the new sequence.

    Args:
        graph: A graph whose ``nodes_var``, ``edges`` and ``backreferences``
            are index-aligned lists.

    Returns:
        A new ``SemanticGraph`` with the interleaved sequence as nodes,
        ``None`` as edges, and the ``var2instance`` and ``extra`` of ``graph``.
    """
    # new_backreferences_map[k] is the position in ``new_nodes`` of the k-th
    # node passed to ``add_node``.
    new_backreferences_map = []
    new_nodes = []
    new_edges = None
    new_backreferences = []

    # to isolate sublist to the stop token
    # The search starts at index 1 and the stop bound of -1 leaves the last
    # element out; -1 is also the "not found" value that ends the loop below.
    start_i = 1
    end_i = index_default(AMRTokens.STOP_N, graph.nodes_var, start_i, -1, -1)

    def add_node(node, backr=None):
        # Append a node and record where its (old-index) backreference now points.
        old_n_node = len(new_backreferences_map)
        new_n_node = len(new_nodes)

        # Without an explicit backreference the node refers to itself.
        if backr is None:
            backr = old_n_node

        new_backreferences_map.append(new_n_node)
        new_nodes.append(node)
        if old_n_node == backr:
            new_backreferences.append(new_n_node)
        else:
            new_backreferences.append(new_backreferences_map[backr])

    def add_edge(edge):
        # Edges go into the same output sequence and always reference their own
        # position (both output lists grow in lockstep).
        new_nodes.append(edge)
        new_backreferences.append(len(new_backreferences))

    add_node(AMRTokens.BOS_N)

    while end_i > -1:

        # src node
        add_node(graph.nodes_var[start_i], graph.backreferences[start_i])

        # edges and trg nodes, interleaved
        nodes = graph.nodes_var[start_i + 1:end_i]
        edges = graph.edges[start_i + 1:end_i]
        backr = graph.backreferences[start_i + 1:end_i]
        for n, e, b in zip(nodes, edges, backr):
            add_edge(e)
            add_node(n, b)

        # stop
        add_node(graph.nodes_var[end_i], graph.backreferences[end_i])

        # Continue with the segment that follows the stop token just emitted.
        start_i = end_i + 1
        end_i = index_default(AMRTokens.STOP_N, graph.nodes_var, start_i, -1, -1)

    add_node(AMRTokens.EOS_N)

    new_graph = SemanticGraph(
        new_nodes,
        None,
        new_backreferences,
        graph.var2instance,
        extra=graph.extra,
    )
    return new_graph
def token_processing(tok):
    """Normalize a single decoded token.

    Args:
        tok: A token string, or ``None``.

    Returns:
        ``None`` for ``None``; an ``int`` when the token consists of digits
        that form a valid Python integer literal; the token with the missing
        double quote added when exactly one end is quoted; otherwise the token
        unchanged.
    """
    import ast

    if tok is None:
        return None
    if tok.isdigit():
        # Parse as a literal instead of eval(). Digit strings that are not
        # valid literals (e.g. "007" or non-ASCII digits) are kept as text.
        try:
            return ast.literal_eval(tok)
        except (ValueError, SyntaxError):
            return tok
    if tok.startswith('"') and (not tok.endswith('"')):
        return tok + '"'
    if tok.endswith('"') and (not tok.startswith('"')):
        return '"' + tok
    return tok
re.compile(f"^{tokenizer.INIT}(op|snt|conj|prep)") rex_spc = re.compile(r"<(s|/s|lit|/lit|stop|unk|pad|mask)>") # get strings subtokens = tokenizer.convert_ids_to_tokens(subtoken_ids) # fix backreferences subtoken_backreferences = [max(t - len(tokenizer), -1) for t in subtoken_ids] # strip padding no_pad = [(s, b) for s, b in zip(subtokens, subtoken_backreferences) if s != (tokenizer.INIT + '')] if no_pad: subtokens, subtoken_backreferences = zip(*no_pad) else: subtokens, subtoken_backreferences = [''], [-1] # subword collapse tokens = [] backreferences = [] subword_to_token_map = {} current_token_i = 0 for subw_i, (subw_backr, subtok) in enumerate(zip(subtoken_backreferences, subtokens)): subword_to_token_map[subw_i] = current_token_i # if empty you cannot do anything but add a new word if not tokens: tokens.append(subtok.lstrip(tokenizer.INIT)) backreferences.append(-1) current_token_i += 1 # backref can't be splitted elif subw_backr > -1: tokens.append(None) backreferences.append(subword_to_token_map[subw_backr]) current_token_i += 1 # after a special token release elif isinstance(tokens[-1], str) and rex_spc.match(tokens[-1]): tokens.append(subtok.lstrip(tokenizer.INIT)) backreferences.append(-1) current_token_i += 1 # after a subtoken ':' (which should be followed by the rest of the edge) ignore tokenizer.INIT # TODO: this is an ugly patch due to the fact that BART tokenizer splits after ':' elif (tokens[-1] == ':') and rex_arg.match(subtok): tokens[-1] = tokens[-1] + subtok[1:] # leading tokenizer.INIT elif subtok.startswith(tokenizer.INIT): tokens.append(subtok.lstrip(tokenizer.INIT)) backreferences.append(-1) current_token_i += 1 # very ugly patch for some cases in which tokenizer.INIT is not in the following token to the edge elif isinstance(tokens[-1], str) and tokens[-1].startswith(':') and tokens[-1][-1].isdigit() and ( subtok != '-of'): tokens.append(subtok.lstrip(tokenizer.INIT)) backreferences.append(-1) current_token_i += 1 # in any other case 
attach to the previous else: tokens[-1] = tokens[-1] + subtok # strip INIT and fix byte-level tokens = [tokenizer.convert_tokens_to_string(list(t)).lstrip() if isinstance(t, str) else t for t in tokens] # tokens = [t.replace(tokenizer.INIT, '') if isinstance(t, str) else t for t in tokens] # unks are substituted with thing tokens = [t if t != '' else 'thing' for t in tokens] old_tokens = tokens old_backreferences = backreferences # Barack Obama -> "Barack Obama" tokens = [] backreferences = [] token_to_token_map = {} start_search = 0 removed = 0 while True: try: lit_start = old_tokens.index('', start_search) token_addition = old_tokens[start_search:lit_start] for i, t in enumerate(token_addition, start=start_search): token_to_token_map[i] = i - removed tokens += token_addition backreferences_addition = [token_to_token_map[b] if b > -1 else -1 for b in old_backreferences[start_search:lit_start]] backreferences += backreferences_addition lit_end = min(lit_start + 2, len(old_tokens) - 1) while lit_end < len(old_tokens): old_tok = old_tokens[lit_end] if isinstance(old_tok, str) and ( (old_tok.startswith(':') and len(old_tok) > 3) or (old_tok == '')): res_tok = old_tokens[lit_start + 1:lit_end] for i in range(lit_start, lit_end): token_to_token_map[i] = len(tokens) # Remove possible wrong None res = old_tokens[lit_start + 1:lit_end] res = [str(r) for r in res if r is not None] res = '"' + '_'.join(res) + '"' removed += len(res_tok) start_search = lit_end tokens += [res, old_tok] backreferences += [-1, -1] break elif old_tok == '': res_tok = old_tokens[lit_start + 1:lit_end] for i in range(lit_start, lit_end + 1): token_to_token_map[i] = len(tokens) # Remove possible wrong None res = old_tokens[lit_start + 1:lit_end] res = [str(r) for r in res if r is not None] res = '"' + '_'.join(res) + '"' removed += len(res_tok) + 1 start_search = lit_end + 1 tokens.append(res) backreferences.append(-1) break else: lit_end += 1 start_search = lit_end except ValueError: token_addition 
= old_tokens[start_search:] for i, t in enumerate(token_addition, start=start_search): token_to_token_map[i] = i - removed backreferences_addition = [token_to_token_map[b] if b > -1 else b for b in old_backreferences[start_search:]] tokens += token_addition backreferences += backreferences_addition break tokens = [token_processing(t) for t in tokens] shift = 1 if len(tokens) > 1 and tokens[1] == '': shift = 2 tokens = tokens[shift:] backreferences = [b if b == -1 else b - shift for b in backreferences[shift:]] if tokens and tokens[-1] == '': tokens.pop() backreferences.pop() return tokens, backreferences def decode_into_node_and_backreferences_without_space(subtoken_ids, tokenizer): rex_arg = re.compile(f"^{tokenizer.INIT}(op|snt|conj|prep)") rex_spc = re.compile(r"<(s|/s|lit|/lit|stop|unk|pad|mask)>") # get strings subtokens = tokenizer.convert_ids_to_tokens(subtoken_ids) # fix backreferences subtoken_backreferences = [max(t - len(tokenizer), -1) for t in subtoken_ids] # strip padding no_pad = [(s, b) for s, b in zip(subtokens, subtoken_backreferences) if s != (tokenizer.INIT + '')] if no_pad: subtokens, subtoken_backreferences = zip(*no_pad) else: subtokens, subtoken_backreferences = [''], [-1] # subword collapse tokens = [] backreferences = [] subword_to_token_map = {} current_token_i = 0 prev_is_pointer = False prev_is_rel = False for subw_i, (subw_backr, subtok) in enumerate(zip(subtoken_backreferences, subtokens)): subword_to_token_map[subw_i] = current_token_i is_pointer = subtok.startswith('') is_rel = subtok.startswith(':') and len(subtok) > 1 is_bracket = subtok in '()' # if empty you cannot do anything but add a new word if not tokens: tokens.append(subtok) backreferences.append(-1) current_token_i += 1 # backref can't be splitted elif subw_backr > -1: tokens.append(None) backreferences.append(subword_to_token_map[subw_backr]) current_token_i += 1 # after a special token release elif isinstance(tokens[-1], str) and rex_spc.match(tokens[-1]): 
tokens.append(subtok) backreferences.append(-1) current_token_i += 1 # after a subtoken ':' (which should be followed by the rest of the edge) ignore tokenizer.INIT # TODO: this is an ugly patch due to the fact that BART tokenizer splits after ':' elif (tokens[-1] == ':') and rex_arg.match(subtok): tokens[-1] = tokens[-1] + subtok[1:] # current or prev is a control token elif (is_pointer or is_rel or prev_is_pointer or prev_is_rel or is_bracket or subtok == '') \ and subtok != '-of': tokens.append(subtok) backreferences.append(-1) current_token_i += 1 # very ugly patch for some cases in which tokenizer.INIT is not in the following token to the edge elif isinstance(tokens[-1], str) and tokens[-1].startswith(':') and tokens[-1][-1].isdigit() and ( subtok != '-of'): tokens.append(subtok) backreferences.append(-1) current_token_i += 1 # in any other case attach to the previous else: tokens[-1] = tokens[-1] + subtok prev_is_pointer = is_pointer prev_is_rel = is_rel # strip INIT and fix byte-level tokens = [tokenizer.convert_tokens_to_string(list(t)).lstrip() if isinstance(t, str) else t for t in tokens] # tokens = [t.replace(tokenizer.INIT, '') if isinstance(t, str) else t for t in tokens] # unks are substituted with thing tokens = [t if t != '' else 'thing' for t in tokens] old_tokens = tokens old_backreferences = backreferences # Barack Obama -> "Barack Obama" tokens = [] backreferences = [] token_to_token_map = {} start_search = 0 removed = 0 while True: try: lit_start = old_tokens.index('', start_search) token_addition = old_tokens[start_search:lit_start] for i, t in enumerate(token_addition, start=start_search): token_to_token_map[i] = i - removed tokens += token_addition backreferences_addition = [token_to_token_map[b] if b > -1 else -1 for b in old_backreferences[start_search:lit_start]] backreferences += backreferences_addition lit_end = min(lit_start + 2, len(old_tokens) - 1) while lit_end < len(old_tokens): old_tok = old_tokens[lit_end] if isinstance(old_tok, 
def index_of(element, iterable, default=None, start=None, end=None):
    """Find the first position in ``iterable`` that matches ``element``.

    Args:
        element: Either a value compared with ``==`` against each item, or a
            callable used as a predicate on each item.
        iterable: An indexable sequence.
        default: Value returned when nothing matches.
        start: First index to inspect; ``None`` means 0.
        end: Index at which to stop (exclusive); ``None`` means
            ``len(iterable)``.

    Returns:
        The index of the first match, or ``default``.
    """
    if callable(element):
        matches = element
    else:
        def matches(candidate):
            return element == candidate

    first = 0 if start is None else start
    stop = len(iterable) if end is None else end

    for position in range(first, stop):
        if matches(iterable[position]):
            return position
    return default
def _split_name_ops(graph):
    """Expand collapsed ``:opN`` literals of ``name`` nodes into one op per word.

    For every variable whose instance is ``name``, its ``:op*`` triples are
    ordered by their numeric suffix, each target is stripped of double quotes
    and split on ``_``, and the resulting words are re-emitted as consecutive
    ``:op1``, ``:op2``, ... triples at the position of the earliest original op.

    Args:
        graph: A penman graph.

    Returns:
        A new ``penman.Graph`` with the rewritten triples and the metadata of
        ``graph``.
    """
    original = graph.triples

    # Variables that are instances of the concept "name".
    name_holders = {src for src, role, tgt in original if role == ':instance' and tgt == 'name'}

    # Collect (position, role, unquoted literal) for every op of a name node.
    ops_by_var = defaultdict(list)
    for position, (src, role, tgt) in enumerate(original):
        if src in name_holders and role.startswith(':op'):
            ops_by_var[src].append((position, role, tgt.strip('"')))

    rewritten = original.copy()
    for var, found in ops_by_var.items():
        ordered = sorted(found, key=lambda entry: int(entry[1][3:]))
        positions = [entry[0] for entry in ordered]
        for position in positions:
            rewritten[position] = None
        words = [word for entry in ordered for word in entry[2].split('_')]
        # A list in a slot marks "several triples go here"; it is flattened below.
        rewritten[min(positions)] = [
            penman.Triple(var, ':op' + str(number), '"' + word + '"')
            for number, word in enumerate(words, start=1)
        ]

    flattened = []
    for item in rewritten:
        if item is None:
            continue
        if isinstance(item, list):
            flattened.extend(item)
        else:
            flattened.append(item)

    result = penman.Graph(flattened)
    result.metadata = graph.metadata
    return result
trg_nodes_edges_indices = list(range(old_start_index, stop_index)) if isinstance(src_node, str): if src_node in ('', '', ''): continue elif ('/' in src_node) or (':' in src_node) or ('(' in src_node) or (')' in src_node): src_node = 'thing' if src_node is not None: src_node = str(src_node) src_var = src_node[0].lower() if not src_var not in 'abcdefghijklmnopqrstuvwxyz': src_var = 'x' # src_var = f'{src_var}_{len(variable2index)}' src_var = f'{src_var}{len(variable2index)}' src_var_i = old_start_index variable2index[src_var] = src_var_i index2variable[src_var_i] = src_var triple = penman.Triple(src_var, ':instance', src_node) if triple not in triples_added: triples.append(triple) triples_added.add(triple) else: if src_backr in index2variable: src_var = index2variable[src_backr] # more resilient logic here (trg_edges, trg_nodes), (_, trg_nodes_backr), (_, trg_nodes_indices) = \ separate_edges_nodes( trg_nodes_edges, trg_nodes_edges, trg_nodes_edges_backr, trg_nodes_edges_indices) for n, e, nb, ni in zip(trg_nodes, trg_edges, trg_nodes_backr, trg_nodes_indices): if isinstance(n, str) and n.startswith(':'): continue if isinstance(n, str) and n.startswith('<') and n.endswith('>'): continue if e == ':li': pass elif len(e) < 4 or (not e.startswith(':')): continue # same edge more than once num = cnt[src_var][e] # num = 0 if num: if e.startswith(':op') or e.startswith(':snt'): continue # elif e.startswith(':ARG'): # continue elif num > 3: continue if n is None: if nb not in index2variable: continue trg_var = index2variable[nb] trg = trg_var elif e == ':mode': trg = n elif (not isinstance(n, str)) or re.match(r"^[+-]?\d+\.?\d*$", n) or (n == '-') or (n == '+'): trg = str(n) elif (n.startswith('"') and n.endswith('"') and len(n) > 2): trg = '"' + n.replace('"', '') + '"' elif ('/' in n) or (':' in n) or ('(' in n) or (')' in n) or ('=' in n): trg = f'"{n}"' elif n == '"': continue elif (n.startswith('"') and (not n.endswith('"'))) or (not n.startswith('"') and 
def connect_graph_if_not_connected(graph):
    """Return an encodable graph, joining disconnected components if needed.

    The graph is first passed to ``pm_encode``. If that succeeds it is returned
    untouched. Otherwise a new ``and`` node is added with one ``:opN`` edge to
    a representative variable of every connected component, and the repaired
    graph is encoded once more as a check.

    Args:
        graph: A penman graph.

    Returns:
        A tuple ``(graph, status)`` where ``status`` is ``ParsedStatus.OK`` for
        an untouched graph and ``ParsedStatus.FIXED`` for a repaired one.

    Raises:
        Exception: Whatever ``pm_encode`` raises if the repaired graph still
            cannot be encoded, or any error raised while building the repair.
    """
    try:
        pm_encode(graph)
    except Exception:
        # Any encoding failure triggers the repair below. Catching Exception
        # rather than using a bare except lets KeyboardInterrupt and
        # SystemExit propagate.
        encodable = False
    else:
        encodable = True
    if encodable:
        return graph, ParsedStatus.OK

    nxgraph = nx.MultiGraph()
    variables = graph.variables()
    for v1, _, v2 in graph.triples:
        if v1 in variables and v2 in variables:
            nxgraph.add_edge(v1, v2)
        elif v1 in variables:
            # Self-loop: registers the variable as a node without linking it
            # to a non-variable target.
            nxgraph.add_edge(v1, v1)

    triples = graph.triples.copy()
    new_triples = []
    addition = f'a{len(variables) + 1}'
    triples.append(penman.Triple(addition, ':instance', 'and'))
    for i, conn_set in enumerate(nx.connected_components(nxgraph), start=1):
        edge = f':op{i}'
        # The representative is the variable with the smallest numeric suffix.
        conn_set = sorted(conn_set, key=lambda x: int(x[1:]))
        conn_set = [c for c in conn_set if c in variables]
        node = conn_set[0]
        new_triples.append(penman.Triple(addition, edge, node))
    triples = new_triples + triples

    metadata = graph.metadata
    graph = penman.Graph(triples)
    graph.metadata.update(metadata)
    # Not caught on purpose: callers treat a failure here as a failed repair.
    pm_encode(graph)

    return graph, ParsedStatus.FIXED
@classmethod
def from_pretrained(cls, pretrained_model_path, additional_tokens: Iterable[str] = None,
                    recategorization_tokens: Iterable[str] = None, *args, **kwargs):
    """Load a tokenizer through the parent class and set up the AMR vocabulary.

    Args:
        pretrained_model_path: Forwarded to the parent ``from_pretrained``.
        additional_tokens: Passed to ``init_amr_vocabulary`` as ``additions``.
        recategorization_tokens: Passed to ``init_amr_vocabulary``.
        *args: Forwarded to the parent ``from_pretrained``.
        **kwargs: Forwarded to the parent ``from_pretrained``.

    Returns:
        The loaded tokenizer after ``init_amr_vocabulary`` has been called on it.
    """
    tokenizer = super().from_pretrained(pretrained_model_path, *args, **kwargs)
    tokenizer.init_amr_vocabulary(
        additions=additional_tokens,
        recategorization_tokens=recategorization_tokens,
    )
    return tokenizer
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
    """Wrap one or two id sequences with the BOS/EOS ids.

    Args:
        token_ids_0: List of ids of the first sequence.
        token_ids_1: Optional list of ids of the second sequence.

    Returns:
        ``[bos] + token_ids_0 + [eos]`` for a single sequence, and that list
        followed by ``[eos] + token_ids_1 + [eos]`` for a pair.
    """
    bos, eos = [self.bos_token_id], [self.eos_token_id]
    single = bos + token_ids_0 + eos
    if token_ids_1 is None:
        return single
    return single + eos + token_ids_1 + eos
Modified in order to handle sentences with recategorization pointers""" bpe_tokens = [] for tok_span in text.lstrip().split(' '): tok_span = tok_span.strip() recats = tok_span.rsplit('_', 1) if len(recats) == 2 and recats[0] in self.recategorizations and ('_' + recats[1]) in self.encoder: bpe_tokens.extend([self.INIT + recats[0], '_' + recats[1]]) else: for token in re.findall(self.pat, ' ' + tok_span): token = "".join( self.byte_encoder[b] for b in token.encode("utf-8") ) # Maps all our bytes to unicode strings, avoiding controle tokens of the BPE (spaces in our case) bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" ")) return bpe_tokens def _tok_bpe(self, token, add_space=True): # if add_space: # token = ' ' + token.lstrip() tokk = [] tok = token.strip() recats = tok.rsplit('_', 1) if len(recats) == 2 and recats[0] in self.recategorizations and ('_' + recats[1]) in self.encoder: tokk.extend([self.INIT + recats[0], '_' + recats[1]]) else: for tok in self.patterns.findall(' ' + token): tok = "".join( self.byte_encoder[b] for b in tok.encode("utf-8")) toks = self.bpe(tok).split(' ') tokk.extend(toks) return tokk def _get_nodes_and_backreferences(self, graph): lin = self.linearizer.linearize(graph) linearized_nodes, backreferences = lin.nodes, lin.backreferences return linearized_nodes, backreferences def tokenize_amr(self, graph): linearized_nodes, backreferences = self._get_nodes_and_backreferences(graph) bpe_tokens = [] bpe_backreferences = [] counter = 0 for i, (backr, tokk) in enumerate(zip(backreferences, linearized_nodes)): is_in_enc = self.INIT + tokk in self.encoder is_rel = tokk.startswith(':') and len(tokk) > 1 is_spc = tokk.startswith('<') and tokk.endswith('>') is_of = tokk.startswith(':') and tokk.endswith('-of') is_frame = re.match(r'.+-\d\d', tokk) is not None if tokk.startswith('"') and tokk.endswith('"'): tokk = tokk[1:-1].replace('_', ' ') bpe_toks = [self.INIT + AMRTokens.LIT_START] bpe_toks += self._tok_bpe(tokk, 
def linearize(self, graph):
    """Turn a graph into a flat id sequence with backreferences folded in.

    Positions that refer to themselves keep their token id; positions that
    refer back to an earlier position are encoded as that position plus the
    size of ``self.encoder``.

    Args:
        graph: The graph handed to ``self.tokenize_amr``.

    Returns:
        A tuple ``(token_uni_ids, extra)`` where ``extra`` holds the token
        strings under ``'linearized_graphs'`` and the input under ``'graphs'``.
    """
    vocab_size = len(self.encoder)
    tokens, token_ids, backreferences = self.tokenize_amr(graph)
    extra = {'linearized_graphs': tokens, 'graphs': graph}

    token_uni_ids = []
    for position, (token_id, target) in enumerate(zip(token_ids, backreferences)):
        token_uni_ids.append(token_id if position == target else target + vocab_size)

    closing = self.INIT + AMRTokens.EOS_N
    # NOTE(review): this compares an entry of token_uni_ids with a token
    # string; if those entries are ints the test is always true and an EOS is
    # appended unconditionally. Confirm the intent before changing it.
    if token_uni_ids[-1] != closing:
        tokens.append(closing)
        token_ids.append(self.eos_token_id)
        token_uni_ids.append(self.eos_token_id)
        backreferences.append(len(backreferences))
    return token_uni_ids, extra
def batch_encode_graphs_from_linearized(self, linearized, extras=None, device=torch.device('cpu')):
    """Pad linearized id lists into a tensor and split it into inputs/labels.

    Args:
        linearized: Iterable of id lists, one per graph.
        extras: Optional iterable of dicts with ``'graphs'`` and
            ``'linearized_graphs'`` entries, gathered into the returned extra.
        device: Device the padded tensor is moved to.

    Returns:
        A tuple ``(batch, batch_extra)``. ``batch`` maps
        ``'decoder_input_ids'`` to every column but the last and
        ``'lm_labels'`` to every column but the first. ``batch_extra`` is empty
        when ``extras`` is ``None``.
    """
    batch_extra = {}
    if extras is not None:
        graphs, linearized_graphs = [], []
        for item in extras:
            graphs.append(item['graphs'])
            linearized_graphs.append(item['linearized_graphs'])
        batch_extra = {'linearized_graphs': linearized_graphs, 'graphs': graphs}

    rows = list(linearized)
    width = max(map(len, rows), default=0)
    # Right-pad every row with the pad id up to the longest row.
    padded = [row + [self.pad_token_id] * (width - len(row)) for row in rows]
    tensor = torch.tensor(padded).to(device)

    batch = {'decoder_input_ids': tensor[:, :-1], 'lm_labels': tensor[:, 1:]}
    return batch, batch_extra
def _tokenize_encoded_graph(self, encoded):
    """Split an encoded graph string into node/edge/bracket tokens.

    Double-quoted spans are padded with spaces first. Any whitespace-separated
    piece that does not both start and end with a double quote gets spaces
    inserted around ``(``, ``)`` and ``/`` and before ``:``.

    Args:
        encoded: The encoded graph as a string.

    Returns:
        A list starting with ``AMRTokens.BOS_N`` followed by the tokens.
    """
    spaced = re.sub(r"(\".+?\")", r' \1 ', encoded)

    pieces = []
    for chunk in spaced.split():
        if not (chunk.startswith('"') and chunk.endswith('"')):
            for symbol, padded in (('(', ' ( '), (')', ' ) '), (':', ' :'), ('/', ' / ')):
                chunk = chunk.replace(symbol, padded)
            chunk = chunk.strip()
        pieces.append(chunk)

    flat = re.sub(r'\s+', ' ', ' '.join(pieces)).strip()
    return [AMRTokens.BOS_N] + flat.split(' ')
1 elif lst.startswith(':'): nxt = remap[nxt] linearized_nodes_.append(nxt) i += 1 linearized_nodes = linearized_nodes_ if self.remove_pars: linearized_nodes = [n for n in linearized_nodes if n != '('] backreferences = list(range(len(linearized_nodes))) return linearized_nodes, backreferences def _classify(self, node): if not isinstance(node, str): return "CONST" elif node == 'i': return "I" elif re.match(r'^[a-z]\d*$', node) is not None: return "VAR" elif node[0].isdigit(): return "CONST" elif node.startswith('"') and node.endswith('"'): return "CONST" elif node in ('+', '-'): return "CONST" elif node == ':mode': return 'MODE' elif node.startswith(':'): return "EDGE" elif node in ['/', '(', ')']: return node elif node[0].isalpha(): for char in (',', ':', '/', '(', ')', '.', '!', '?', '\\'): if char in node: return "CONST" return "INST" else: return 'CONST' def _fix_and_make_graph(self, nodes): nodes_ = [] for n in nodes: if isinstance(n, str): if n.startswith('<') and n.endswith('>') and (not n.startswith('') if e != len(nxt) - 1: pst = nxt[e + 1:] nxt = nxt[:e + 1] nodes_.append(nxt) if pst is not None: nodes_.append(pst) else: nodes_.append(nxt) i += 1 nodes = nodes_ i = 1 nodes_ = [nodes[0]] while i < len(nodes): nxt = nodes[i] if isinstance(nxt, str) and nxt.startswith(' 0: line = line[:i].strip() break old_line = line while True: open_count = len(re.findall(r'\(', line)) close_count = len(re.findall(r'\)', line)) if open_count > close_count: line += ')' * (open_count - close_count) elif close_count > open_count: for i in range(close_count - open_count): line = line.rstrip(')') line = line.rstrip(' ') if old_line == line: break old_line = line """ graph = penman.decode(linearized + ' ') triples = [] newvars = 2000 for triple in graph.triples: x, rel, y = triple if x is None: pass elif rel == ':instance' and y is None: triples.append(penman.Triple(x, rel, 'thing')) elif y is None: var = f'x{newvars}' newvars += 1 triples.append(penman.Triple(x, rel, var)) 
triples.append(penman.Triple(var, ':instance', 'thing')) else: triples.append(triple) graph = penman.Graph(triples) linearized = pm_encode(graph) def fix_text(linearized=linearized): n = 0 def _repl1(match): nonlocal n out = match.group(1) + match.group(2) + str(3000 + n) + ' / ' + match.group(2) + match.group(3) n += 1 return out linearized = re.sub(r'(\(\s?)([a-z])([^\/:\)]+[:\)])', _repl1, linearized, flags=re.IGNORECASE | re.MULTILINE) def _repl2(match): return match.group(1) linearized = re.sub(r'(\(\s*[a-z][\d+]\s*\/\s*[^\s\)\(:\/]+\s*)((?:/\s*[^\s\)\(:\/]+\s*)+)', _repl2, linearized, flags=re.IGNORECASE | re.MULTILINE) # adds a ':' to args w/o it linearized = re.sub(r'([^:])(ARG)', r'\1 :\2', linearized) # removes edges with no node # linearized = re.sub(r':[^\s\)\(:\/]+?\s*\)', ')', linearized, flags=re.MULTILINE) return linearized linearized = fix_text(linearized) g = penman.decode(linearized) return g def decode_amr(self, tokens, restore_name_ops=None): try: if self.raw_graph: nodes = self._tokenize_encoded_graph(self.decode(tokens)) backreferences = list(range(len(nodes))) else: nodes, backreferences = postprocessing.decode_into_node_and_backreferences(tokens, self) nodes_ = nodes except Exception as e: print('Decoding failure:', file=sys.stderr) print(e, file=sys.stderr) return postprocessing.BACKOFF, postprocessing.ParsedStatus.BACKOFF, (None, None) try: graph_ = graph = self._fix_and_make_graph(nodes) if self.collapse_name_ops: graph_ = graph = postprocessing._split_name_ops(graph) except Exception as e: print('Building failure:', file=sys.stderr) print(nodes, file=sys.stderr) print(backreferences, file=sys.stderr) print(e, file=sys.stderr) return postprocessing.BACKOFF, postprocessing.ParsedStatus.BACKOFF, (None, None) try: graph, status = postprocessing.connect_graph_if_not_connected(graph) if status == postprocessing.ParsedStatus.BACKOFF: print('Reconnection 1 failure:') print(nodes, file=sys.stderr) print(backreferences, file=sys.stderr) 
@classmethod
def from_pretrained(cls, pretrained_model_path, additional_tokens: Iterable[str] = None,
                    recategorization_tokens: Iterable[str] = None, *args, **kwargs):
    """Build the tokenizer via the parent loader, then register AMR tokens.

    Args:
        pretrained_model_path: Forwarded to the parent ``from_pretrained``.
        additional_tokens: Handed to ``init_amr_vocabulary`` as ``additions``.
        recategorization_tokens: Handed to ``init_amr_vocabulary``.
        *args: Forwarded to the parent ``from_pretrained``.
        **kwargs: Forwarded to the parent ``from_pretrained``.

    Returns:
        The tokenizer instance, with ``init_amr_vocabulary`` already applied.
    """
    loaded = super().from_pretrained(pretrained_model_path, *args, **kwargs)
    vocabulary_options = {
        'additions': additional_tokens,
        'recategorization_tokens': recategorization_tokens,
    }
    loaded.init_amr_vocabulary(**vocabulary_options)
    return loaded
encoder but it's not a problem for Chinese # for tok in self.all_special_tokens: # ntok = self.INIT + tok # i = self.encoder[tok] # self.decoder[i] = ntok # del self.encoder[tok] # self.encoder[ntok] = i tokens = [AMRTokens.BOS_N] if additions: tokens.extend(additions) if recategorization_tokens: for tok in recategorization_tokens: if not tok.startswith('_'): self.recategorizations.add(tok) tokens.append(tok) if self.use_pointer_tokens: for cnt in range(512): tokens.append(f"") tokens += self.ADDITIONAL tokens = [self.INIT + t if t[0] not in ('_', '-') else t for t in tokens] self.old_enc_size = len(self) self.add_tokens(tokens) self.modified = len(tokens) def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): output = [self.bos_token_id] + token_ids_0 + [self.eos_token_id] if token_ids_1 is None: return output return output + [self.eos_token_id] + token_ids_1 + [self.eos_token_id] def _tokenize(self, text): """ Tokenize a string. Modified in order to handle sentences with recategorization pointers""" bpe_tokens = [] for tok_span in text.lstrip().split(' '): tok_span = tok_span.strip() recats = tok_span.rsplit('_', 1) if len(recats) == 2 and recats[0] in self.recategorizations and ('_' + recats[1]) in self.encoder: bpe_tokens.extend([self.INIT + recats[0], '_' + recats[1]]) else: for token in re.findall(self.pat, ' ' + tok_span): token = "".join( self.byte_encoder[b] for b in token.encode("utf-8") ) # Maps all our bytes to unicode strings, avoiding controle tokens of the BPE (spaces in our case) bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" ")) return bpe_tokens def _tok_bpe(self, token, add_space=True): # if add_space: # token = ' ' + token.lstrip() tokk = [] tok = token.strip() recats = tok.rsplit('_', 1) if len(recats) == 2 and recats[0] in self.recategorizations and ('_' + recats[1]) in self.encoder: tokk.extend([self.INIT + recats[0], '_' + recats[1]]) else: for tok in self.patterns.findall(token): 
tokk.extend(self.tokenize(tok)) return tokk def _get_nodes_and_backreferences(self, graph): lin = self.linearizer.linearize(graph) linearized_nodes, backreferences = lin.nodes, lin.backreferences return linearized_nodes, backreferences def tokenize_amr(self, graph): linearized_nodes, backreferences = self._get_nodes_and_backreferences(graph) bpe_tokens = [] bpe_backreferences = [] counter = 0 encoder = self.encoder for i, (backr, tokk) in enumerate(zip(backreferences, linearized_nodes)): is_in_enc = self.INIT + tokk in encoder is_rel = tokk.startswith(':') and len(tokk) > 1 is_spc = tokk.startswith('<') and tokk.endswith('>') is_of = tokk.startswith(':') and tokk.endswith('-of') is_frame = re.match(r'.+-\d\d', tokk) is not None if tokk.startswith('"') and tokk.endswith('"'): tokk = tokk[1:-1].replace('_', ' ') bpe_toks = [self.INIT + AMRTokens.LIT_START] bpe_toks += self._tok_bpe(tokk, add_space=True) bpe_toks.append(self.INIT + AMRTokens.LIT_END) elif (is_rel or is_spc or is_frame or is_of): if is_in_enc: bpe_toks = [self.INIT + tokk] elif is_frame: bpe_toks = self._tok_bpe(tokk[:-3], add_space=True) + [tokk[-3:]] elif is_of: rel = tokk[:-3] if self.INIT + rel in encoder: bpe_toks = [self.INIT + rel, '-of'] else: bpe_toks = [self.INIT + ':'] + self._tok_bpe(rel[1:], add_space=True) + ['-of'] elif is_rel: bpe_toks = [self.INIT + ':'] + self._tok_bpe(tokk[1:], add_space=True) else: raise else: if is_in_enc: bpe_toks = [self.INIT + tokk] else: bpe_toks = self._tok_bpe(tokk, add_space=True) bpe_tokens.append(bpe_toks) if i == backr: bpe_backr = list(range(counter, counter + len(bpe_toks))) counter += len(bpe_toks) bpe_backreferences.append(bpe_backr) else: bpe_backreferences.append(bpe_backreferences[backr][0:1]) counter += 1 bpe_tokens = [b for bb in bpe_tokens for b in bb] bpe_token_ids = self.convert_tokens_to_ids(bpe_tokens) bpe_backreferences = [b for bb in bpe_backreferences for b in bb] return bpe_tokens, bpe_token_ids, bpe_backreferences def 
batch_encode_sentences(self, sentences, device=torch.device('cpu')): sentences = [s for s in sentences] extra = {'sentences': sentences} batch = super().batch_encode_plus(sentences, return_tensors='pt', pad_to_max_length=True) batch = {k: v.to(device) for k, v in batch.items()} return batch, extra def linearize(self, graph): shift = len(self) tokens, token_ids, backreferences = self.tokenize_amr(graph) extra = {'linearized_graphs': tokens, 'graphs': graph} token_uni_ids = \ [idx if i == b else b + shift for i, (idx, b) in enumerate(zip(token_ids, backreferences))] if token_uni_ids[-1] != (self.INIT + AMRTokens.EOS_N): tokens.append(self.INIT + AMRTokens.EOS_N) token_ids.append(self.eos_token_id) token_uni_ids.append(self.eos_token_id) backreferences.append(len(backreferences)) return token_uni_ids, extra def batch_encode_graphs(self, graphs, device=torch.device('cpu')): linearized, extras = zip(*[self.linearize(g) for g in graphs]) return self.batch_encode_graphs_from_linearized(linearized, extras, device=device) def batch_encode_graphs_from_linearized(self, linearized, extras=None, device=torch.device('cpu')): if extras is not None: batch_extra = {'linearized_graphs': [], 'graphs': []} for extra in extras: batch_extra['graphs'].append(extra['graphs']) batch_extra['linearized_graphs'].append(extra['linearized_graphs']) else: batch_extra = {} maxlen = 0 batch = [] for token_uni_ids in linearized: maxlen = max(len(token_uni_ids), maxlen) batch.append(token_uni_ids) batch = [x + [self.pad_token_id] * (maxlen - len(x)) for x in batch] batch = torch.tensor(batch).to(device) batch = {'decoder_input_ids': batch[:, :-1], 'lm_labels': batch[:, 1:]} return batch, batch_extra def decode_amr(self, tokens, restore_name_ops=False): try: nodes, backreferences = postprocessing.decode_into_node_and_backreferences(tokens, self) except Exception as e: print('Decoding failure:', file=sys.stderr) traceback.print_exc() return postprocessing.BACKOFF, postprocessing.ParsedStatus.BACKOFF, 
(None, None) if self.use_pointer_tokens: nodes, backreferences = postprocessing.restore_backreferences_from_pointers(nodes) try: graph_ = graph = postprocessing.build_graph(nodes, backreferences, restore_name_ops=restore_name_ops) except Exception as e: print('Building failure:', file=sys.stderr) traceback.print_exc() print(nodes, file=sys.stderr) print(backreferences, file=sys.stderr) print(e, file=sys.stderr) return postprocessing.BACKOFF, postprocessing.ParsedStatus.BACKOFF, (None, None) try: graph, status = postprocessing.connect_graph_if_not_connected(graph) if status == postprocessing.ParsedStatus.BACKOFF: print('Reconnection 1 failure:') print(nodes, file=sys.stderr) print(backreferences, file=sys.stderr) print(graph_, file=sys.stderr) return graph, status, (nodes, backreferences) except Exception as e: print('Reconnction 2 failure:', file=sys.stderr) traceback.print_exc() print(nodes, file=sys.stderr) print(backreferences, file=sys.stderr) print(graph_, file=sys.stderr) return postprocessing.BACKOFF, postprocessing.ParsedStatus.BACKOFF, (nodes, backreferences) class PENMANT5Tokenizer(AMRT5Tokenizer): def __init__(self, *args, raw_graph=False, **kwargs): super().__init__(*args, **kwargs) self.linearizer = None self.remove_pars = False self.raw_graph = raw_graph def _tokenize_encoded_graph(self, encoded): linearized = re.sub(r"(\".+?\")", r' \1 ', encoded) pieces = [] for piece in linearized.split(): if piece.startswith('"') and piece.endswith('"'): pieces.append(piece) else: piece = piece.replace('(', ' ( ') piece = piece.replace(')', ' ) ') piece = piece.replace(':', ' :') piece = piece.replace('/', ' / ') piece = piece.strip() pieces.append(piece) linearized = re.sub(r'\s+', ' ', ' '.join(pieces)).strip() # T5 uses pad instead of # linearized_nodes = [AMRTokens.BOS_N] + linearized.split(' ') linearized_nodes = [self.pad_token] + linearized.split(' ') return linearized_nodes def tokenize_amr(self, graph): if self.raw_graph: graph_ = copy.deepcopy(graph) 
graph_.metadata = {} linearized = penman.encode(graph_) linearized = re.sub(r"\s+", ' ', linearized) bpe_tokens = [self.bos_token] + self._tokenize(linearized)[:1022] bpe_token_ids = [self.encoder.get(b, self.unk_token_id) for b in bpe_tokens] bpe_backreferences = list(range(len(bpe_token_ids))) return bpe_tokens, bpe_token_ids, bpe_backreferences else: return super().tokenize_amr(graph) def _get_nodes_and_backreferences(self, graph): graph_ = copy.deepcopy(graph) graph_.metadata = {} linearized = penman.encode(graph_) linearized_nodes = self._tokenize_encoded_graph(linearized) if self.use_pointer_tokens: remap = {} for i in range(1, len(linearized_nodes)): nxt = linearized_nodes[i] lst = linearized_nodes[i - 1] if nxt == '/': remap[lst] = f'' i = 1 linearized_nodes_ = [linearized_nodes[0]] while i < (len(linearized_nodes)): nxt = linearized_nodes[i] lst = linearized_nodes_[-1] if nxt in remap: if lst == '(' and linearized_nodes[i + 1] == '/': nxt = remap[nxt] i += 1 elif lst.startswith(':'): nxt = remap[nxt] linearized_nodes_.append(nxt) i += 1 linearized_nodes = linearized_nodes_ if self.remove_pars: linearized_nodes = [n for n in linearized_nodes if n != '('] backreferences = list(range(len(linearized_nodes))) return linearized_nodes, backreferences def _classify(self, node): if not isinstance(node, str): return "CONST" elif node == 'i': return "I" elif re.match(r'^[a-z]\d*$', node) is not None: return "VAR" elif node[0].isdigit(): return "CONST" elif node.startswith('"') and node.endswith('"'): return "CONST" elif node in ('+', '-'): return "CONST" elif node == ':mode': return 'MODE' elif node.startswith(':'): return "EDGE" elif node in ['/', '(', ')']: return node elif node[0].isalpha(): for char in (',', ':', '/', '(', ')', '.', '!', '?', '\\'): if char in node: return "CONST" return "INST" else: return 'CONST' def _fix_and_make_graph(self, nodes): nodes_ = [] for n in nodes: if isinstance(n, str): if n.startswith('<') and n.endswith('>') and (not 
n.startswith('') if e != len(nxt) - 1: pst = nxt[e + 1:] nxt = nxt[:e + 1] nodes_.append(nxt) if pst is not None: nodes_.append(pst) else: nodes_.append(nxt) i += 1 nodes = nodes_ i = 1 nodes_ = [nodes[0]] while i < len(nodes): nxt = nodes[i] if isinstance(nxt, str) and nxt.startswith(' 0: line = line[:i].strip() break old_line = line while True: open_count = len(re.findall(r'\(', line)) close_count = len(re.findall(r'\)', line)) if open_count > close_count: line += ')' * (open_count - close_count) elif close_count > open_count: for i in range(close_count - open_count): line = line.rstrip(')') line = line.rstrip(' ') if old_line == line: break old_line = line """ graph = penman.decode(linearized + ' ') triples = [] newvars = 2000 for triple in graph.triples: x, rel, y = triple if x is None: pass elif rel == ':instance' and y is None: triples.append(penman.Triple(x, rel, 'thing')) elif y is None: var = f'z{newvars}' newvars += 1 triples.append(penman.Triple(x, rel, var)) triples.append(penman.Triple(var, ':instance', 'thing')) else: triples.append(triple) graph = penman.Graph(triples) linearized = pm_encode(graph) def fix_text(linearized=linearized): n = 0 def _repl1(match): nonlocal n out = match.group(1) + match.group(2) + str(3000 + n) + ' / ' + match.group(2) + match.group(3) n += 1 return out linearized = re.sub(r'(\(\s?)([a-z])([^\/:\)]+[:\)])', _repl1, linearized, flags=re.IGNORECASE | re.MULTILINE) def _repl2(match): return match.group(1) linearized = re.sub(r'(\(\s*[a-z][\d+]\s*\/\s*[^\s\)\(:\/]+\s*)((?:/\s*[^\s\)\(:\/]+\s*)+)', _repl2, linearized, flags=re.IGNORECASE | re.MULTILINE) # adds a ':' to args w/o it linearized = re.sub(r'([^:])(ARG)', r'\1 :\2', linearized) # removes edges with no node # linearized = re.sub(r':[^\s\)\(:\/]+?\s*\)', ')', linearized, flags=re.MULTILINE) return linearized linearized = fix_text(linearized) g = penman.decode(linearized) return g def decode_amr(self, tokens, restore_name_ops=None): try: if self.raw_graph: nodes = 
def write_predictions(predictions_path, tokenizer, graphs):
    """Serialize graphs with ``penman.encode`` and write them to a file.

    Args:
        predictions_path: Destination file path.
        tokenizer: Optional tokenizer; when truthy, every occurrence of its ``INIT``
            prefix is removed from the serialized text.
        graphs: Iterable of graphs to encode.

    Returns:
        ``predictions_path``, unchanged.
    """
    # Graphs are separated by one blank line.
    serialized = '\n\n'.join(penman.encode(graph) for graph in graphs)
    if tokenizer:
        serialized = serialized.replace(tokenizer.INIT, '')
    target = Path(predictions_path)
    target.write_text(serialized)
    return predictions_path
def compute_smatch(pred, gold): from perin_parser.thirdparty.mtool import smatch with Path(pred).open() as p, Path(gold).open() as g: score = next(smatch.score_amr_pairs(p, g)) return score[2] def compute_bleu(gold_sentences, pred_sentences): from sacrebleu import corpus_bleu return corpus_bleu(pred_sentences, [gold_sentences]) ================================================ FILE: hanlp/components/amr/seq2seq/optim.py ================================================ # taken from import math import torch from torch.optim.optimizer import Optimizer class RAdam(Optimizer): def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, degenerated_to_sgd=True): if not 0.0 <= lr: raise ValueError("Invalid learning rate: {}".format(lr)) if not 0.0 <= eps: raise ValueError("Invalid epsilon value: {}".format(eps)) if not 0.0 <= betas[0] < 1.0: raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) if not 0.0 <= betas[1] < 1.0: raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) self.degenerated_to_sgd = degenerated_to_sgd if isinstance(params, (list, tuple)) and len(params) > 0 and isinstance(params[0], dict): for param in params: if 'betas' in param and (param['betas'][0] != betas[0] or param['betas'][1] != betas[1]): param['buffer'] = [[None, None, None] for _ in range(10)] defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, buffer=[[None, None, None] for _ in range(10)]) super(RAdam, self).__init__(params, defaults) def __setstate__(self, state): super(RAdam, self).__setstate__(state) def step(self, closure=None): loss = None if closure is not None: loss = closure() for group in self.param_groups: for p in group['params']: if p.grad is None: continue grad = p.grad.data.float() if grad.is_sparse: raise RuntimeError('RAdam does not support sparse gradients') p_data_fp32 = p.data.float() state = self.state[p] if len(state) == 0: state['step'] = 0 state['exp_avg'] = 
torch.zeros_like(p_data_fp32) state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) else: state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] beta1, beta2 = group['betas'] exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2) exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1) state['step'] += 1 buffered = group['buffer'][int(state['step'] % 10)] if state['step'] == buffered[0]: N_sma, step_size = buffered[1], buffered[2] else: buffered[0] = state['step'] beta2_t = beta2 ** state['step'] N_sma_max = 2 / (1 - beta2) - 1 N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t) buffered[1] = N_sma # more conservative since it's an approximated value if N_sma >= 5: step_size = math.sqrt( (1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / ( N_sma_max - 2)) / (1 - beta1 ** state['step']) elif self.degenerated_to_sgd: step_size = 1.0 / (1 - beta1 ** state['step']) else: step_size = -1 buffered[2] = step_size # more conservative since it's an approximated value if N_sma >= 5: if group['weight_decay'] != 0: p_data_fp32.add_(p_data_fp32, alpha=-group['weight_decay'] * group['lr']) denom = exp_avg_sq.sqrt().add_(group['eps']) p_data_fp32.addcdiv_(exp_avg, denom, value=-step_size * group['lr']) p.data.copy_(p_data_fp32) elif step_size > 0: if group['weight_decay'] != 0: p_data_fp32.add_(p_data_fp32, alpha=-group['weight_decay'] * group['lr']) p_data_fp32.add_(exp_avg, alpha=-step_size * group['lr']) p.data.copy_(p_data_fp32) return loss ================================================ FILE: hanlp/components/amr/seq2seq/seq2seq_amr_parser.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2021-04-28 17:33 import datetime import functools import logging import os from typing import Union, List, Callable import torch from torch.utils.data import DataLoader from 
transformers import get_constant_schedule_with_warmup, T5ForConditionalGeneration from transformers.models.bart.modeling_bart import BartForConditionalGeneration from hanlp.common.dataset import SamplerBuilder, SortingSamplerBuilder, PadSequenceDataLoader from hanlp.common.structure import History from hanlp.common.torch_component import TorchComponent from hanlp.common.vocab import Vocab from hanlp.components.amr.seq2seq.dataset.dataset import AMRDataset, dfs_linearize_tokenize from hanlp.components.amr.seq2seq.dataset.penman import AMRGraph from hanlp.components.amr.seq2seq.dataset.tokenization_bart import PENMANBartTokenizer from hanlp.components.amr.seq2seq.dataset.tokenization_t5 import PENMANT5Tokenizer from hanlp.components.amr.seq2seq.evaluation import write_predictions, compute_smatch from hanlp.components.amr.seq2seq.optim import RAdam from hanlp.layers.transformers.pt_imports import PretrainedConfig, AutoConfig_ from hanlp.layers.transformers.resource import get_model_mirror, get_tokenizer_mirror from hanlp.metrics.amr.smatch_eval import smatch_eval from hanlp.metrics.mtl import MetricDict from hanlp.utils.time_util import CountdownTimer from hanlp_common.constant import IDX from hanlp_common.util import merge_locals_kwargs, reorder class Seq2seq_AMR_Parser(TorchComponent): def __init__(self, **kwargs): super().__init__(**kwargs) self._transformer_config: PretrainedConfig = None self._tokenizer: PENMANBartTokenizer = None self.model: BartForConditionalGeneration = None def build_dataloader(self, data, batch_size, gradient_accumulation=1, shuffle=False, sampler_builder: SamplerBuilder = None, device=None, logger: logging.Logger = None, **kwargs) -> DataLoader: dataset = self.build_dataset(data, not shuffle) if self.vocabs.mutable: self.build_vocabs(dataset, logger) self.finalize_dataset(dataset, logger) if isinstance(data, str): dataset.purge_cache() timer = CountdownTimer(len(dataset)) max_num_tokens = 0 # lc = Counter() for each in dataset: 
max_num_tokens = max(max_num_tokens, len(each['text_token_ids'])) # lc[len(each['text_token_ids'])] += 1 timer.log(f'Preprocessing and caching samples (longest sequence {max_num_tokens})' f'[blink][yellow]...[/yellow][/blink]') # print(lc.most_common()) if self.vocabs.mutable: self.vocabs.lock() self.vocabs.summary(logger) if not sampler_builder: sampler_builder = SortingSamplerBuilder(batch_max_tokens=500) sampler = sampler_builder.build([len(x['text_token_ids']) for x in dataset], shuffle, gradient_accumulation if dataset.cache else 1) return self._create_dataloader(dataset, batch_size, device, sampler, shuffle) def _create_dataloader(self, dataset, batch_size, device, sampler, shuffle): return PadSequenceDataLoader(dataset, batch_size, shuffle, device=device, batch_sampler=sampler, pad=self._get_pad_dict()) def _get_pad_dict(self): return {'text_token_ids': self._transformer_config.pad_token_id, 'graph_token_ids': self._transformer_config.pad_token_id} def finalize_dataset(self, dataset, logger: logging.Logger = None): dataset.append_transform(functools.partial(dfs_linearize_tokenize, tokenizer=self._tokenizer, remove_space='chinese' in self.config.transformer)) def build_dataset(self, data, generate_idx): dataset = AMRDataset(data, generate_idx=generate_idx) return dataset def collect_additional_tokens(self, additional_tokens, dataset): pred_min = self.config.pred_min frames = dataset.get_frames() for token, freq in frames.items(): if freq >= pred_min: additional_tokens.add(token) for token, freq in dataset.get_roles().items(): additional_tokens.add(token) additional_tokens.update(self.config.additional_tokens) def build_tokenizer(self, additional_tokens) -> PENMANBartTokenizer: transformer = self.config.transformer if 't5-' in transformer: cls = PENMANT5Tokenizer elif 'bart-' in transformer: cls = PENMANBartTokenizer else: raise NotImplemented(f'Unsupported transformer {transformer}') transformer = get_tokenizer_mirror(transformer) self._tokenizer = 
cls.from_pretrained( transformer, collapse_name_ops=self.config.collapse_name_ops, use_pointer_tokens=self.config.use_pointer_tokens, raw_graph=self.config.raw_graph, additional_tokens=additional_tokens, recategorization_tokens=self.config.recategorization_tokens, config=self._transformer_config, ) return self._tokenizer def build_optimizer(self, trn, lr, epochs, gradient_accumulation, warmup_steps, weight_decay, **kwargs): num_training_steps = len(trn) * epochs // gradient_accumulation if isinstance(warmup_steps, float): warmup_steps = int(num_training_steps * warmup_steps) optimizer = RAdam( self.model.parameters(), lr=lr, weight_decay=weight_decay) scheduler = get_constant_schedule_with_warmup( optimizer, num_warmup_steps=warmup_steps) return optimizer, scheduler def build_criterion(self, **kwargs): pass def build_metric(self, **kwargs): pass def execute_training_loop(self, trn: DataLoader, dev: DataLoader, epochs, criterion, optimizer, metric, save_dir, logger: logging.Logger, devices, ratio_width=None, dev_data=None, eval_after=None, **kwargs): best_epoch, best_metric = 0, -1 if isinstance(eval_after, float): eval_after = int(epochs * eval_after) timer = CountdownTimer(epochs) history = History() for epoch in range(1, epochs + 1): logger.info(f"[yellow]Epoch {epoch} / {epochs}:[/yellow]") self.fit_dataloader(trn, criterion, optimizer, metric, logger, history=history, ratio_width=ratio_width, **self.config) if epoch > eval_after: dev_metric = self.evaluate_dataloader(dev, criterion, logger=logger, ratio_width=ratio_width, output=os.path.join(save_dir, 'dev.pred.txt'), input=dev_data, use_fast=True) timer.update() report = f"{timer.elapsed_human} / {timer.total_time_human} ETA: {timer.eta_human}" if epoch > eval_after: if dev_metric > best_metric: best_epoch, best_metric = epoch, dev_metric self.save_weights(save_dir) report += ' [red](saved)[/red]' else: report += f' ({epoch - best_epoch})' # if epoch - best_epoch >= patience: # report += ' early stop' 
logger.info(report) # if epoch - best_epoch >= patience: # break if not best_epoch: self.save_weights(save_dir) elif best_epoch != epoch: self.load_weights(save_dir) logger.info(f"Max score of dev is {best_metric} at epoch {best_epoch}") logger.info(f"Average time of each epoch is {timer.elapsed_average_human}") logger.info(f"{timer.elapsed_human} elapsed") return best_metric def fit_dataloader(self, trn: DataLoader, criterion, optimizer, metric, logger: logging.Logger, history: History = None, gradient_accumulation=1, ratio_percentage=None, **kwargs): optimizer, scheduler = optimizer self.model.train() timer = CountdownTimer(history.num_training_steps(len(trn), gradient_accumulation=gradient_accumulation)) total_loss = 0 for batch in trn: output_dict = self.feed_batch(batch) loss = output_dict['loss'] if gradient_accumulation and gradient_accumulation > 1: loss /= gradient_accumulation loss.backward() total_loss += loss.item() if history.step(gradient_accumulation): self._step(optimizer, scheduler) timer.log(self.report_metrics(total_loss / (timer.current + 1)), ratio_percentage=ratio_percentage, logger=logger) del loss del output_dict return total_loss / max(timer.total, 1) def _step(self, optimizer, scheduler): if self.config.grad_norm: torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.config.grad_norm) optimizer.step() if scheduler: scheduler.step() optimizer.zero_grad() def report_metrics(self, loss): return f'loss: {loss:.4f}' def feed_batch(self, batch): input_ids, labels = batch['text_token_ids'], batch.get('graph_token_ids') attention_mask = input_ids.ne(self.model.config.pad_token_id).to(torch.long) if labels is not None: decoder_input_ids = labels[:, :-1] labels = labels[:, 1:].contiguous() else: decoder_input_ids = None return self.model(input_ids=input_ids, attention_mask=attention_mask, decoder_input_ids=decoder_input_ids, labels=labels) @torch.no_grad() def evaluate_dataloader(self, data: DataLoader, criterion: Callable, metric=None, 
output=False, ratio_width=None, logger=None, input=None, use_fast=False, **kwargs): self.model.eval() timer = CountdownTimer(len(data)) graphs = [] orders = [] smatch = 0 for idx, batch in enumerate(data): graphs_per_batch = self.predict_amrs(batch) graphs_per_batch = [x[0] for x in graphs_per_batch] # Copy meta data from gold graph for gp, gg in zip(graphs_per_batch, batch['amr']): metadata = gg.metadata.copy() metadata['annotator'] = f'{self.config.transformer}-amr' metadata['date'] = str(datetime.datetime.now()) if 'save-date' in metadata: del metadata['save-date'] gp.metadata = metadata graphs.extend(graphs_per_batch) orders.extend(batch[IDX]) if idx == timer.total - 1: graphs = reorder(graphs, orders) write_predictions(output, self._tokenizer, graphs) try: if use_fast: smatch = compute_smatch(output, input) else: smatch = smatch_eval(output, input, use_fast=False) except: pass timer.log(smatch.cstr() if isinstance(smatch, MetricDict) else f'{smatch:.2%}', ratio_percentage=False, logger=logger) else: timer.log(ratio_percentage=False, logger=logger) return smatch def predict_amrs(self, batch, beam_size=1): out = self._model_generate(batch, beam_size) tokens = [] for i1 in range(0, out.size(0), beam_size): tokens_same_source = [] tokens.append(tokens_same_source) for i2 in range(i1, i1 + beam_size): tokk = out[i2].tolist() tokens_same_source.append(tokk) tokens = [t for tt in tokens for t in tt] graphs = [] tokenizer = self._tokenizer for i1 in range(0, len(tokens), beam_size): graphs_same_source = [] graphs.append(graphs_same_source) for i2 in range(i1, i1 + beam_size): tokk = tokens[i2] graph, status, (lin, backr) = tokenizer.decode_amr(tokk, restore_name_ops=False) graph.status = status graph.nodes = lin graph.backreferences = backr graph.tokens = tokk graphs_same_source.append(graph) graphs_same_source[:] = \ tuple(zip(*sorted(enumerate(graphs_same_source), key=lambda x: (x[1].status.value, x[0]))))[1] return graphs def _model_generate(self, batch, 
beam_size): input_ids = batch['text_token_ids'] attention_mask = input_ids.ne(self.model.config.pad_token_id).to(torch.long) out = self.model.generate( input_ids=input_ids, attention_mask=attention_mask, max_length=1024, decoder_start_token_id=0, num_beams=beam_size, num_return_sequences=beam_size) return out def build_model(self, training=True, **kwargs) -> torch.nn.Module: # noinspection PyTypeChecker transformer = self.config.transformer cls = self._get_model_cls(transformer) transformer = get_model_mirror(self.config.transformer) model: cls = cls.from_pretrained( transformer, config=self._transformer_config) if training else cls(self._transformer_config) if not training: self.build_tokenizer(self.vocabs['additional_tokens']) tokenizer = self._tokenizer model.resize_token_embeddings(len(tokenizer.encoder)) if training: self._init_new_embeddings(model if cls == T5ForConditionalGeneration else model.model, tokenizer) return model def _get_model_cls(self, transformer: str): if 't5-' in transformer: cls = T5ForConditionalGeneration elif 'bart-' in transformer: cls = BartForConditionalGeneration else: raise NotImplemented(f'Unsupported transformer {transformer}') return cls @staticmethod def _init_new_embeddings(model, tokenizer): modified = 0 encoder = tokenizer.encoder for tok, idx in encoder.items(): tok = tok.lstrip(tokenizer.INIT) if idx < tokenizer.old_enc_size: continue elif tok.startswith(''): tok_split = ['pointer', str(tok.split(':')[1].strip('>'))] elif tok.startswith('<'): continue elif tok.startswith(':'): if tok.startswith(':op'): tok_split = ['relation', 'operator', str(int(tok[3:]))] elif tok.startswith(':snt'): tok_split = ['relation', 'sentence', str(int(tok[4:]))] elif tok.startswith(':ARG'): tok_split = ['relation', 'argument', str(int(tok[4:]))] else: tok_split = ['relation'] + tok.lstrip(':').split('-') else: tok_split = tok.split('-') tok_split_ = tok_split tok_split = [] for s in tok_split_: s_ = s + tokenizer.INIT if s_ in encoder: 
tok_split.append(s_) else: tok_split.extend(tokenizer._tok_bpe(s)) vecs = [] for s in tok_split: idx_split = encoder.get(s, -1) if idx_split > -1: vec_split = model.encoder.embed_tokens.weight.data[idx_split].clone() vecs.append(vec_split) if vecs: vec = torch.stack(vecs, 0).mean(0) noise = torch.empty_like(vec) noise.uniform_(-0.1, +0.1) model.encoder.embed_tokens.weight.data[idx] = vec + noise modified += 1 def input_is_flat(self, data): return isinstance(data, str) def predict(self, data: Union[str, List[str]], beautiful_amr_graph=True, **kwargs): flat = self.input_is_flat(data) if flat: data = [data] dataloader = self.build_dataloader([{'text': x} for x in data], **self.config, device=self.device) orders = [] results = [] for batch in dataloader: graphs = self.predict_amrs(batch) graphs = [x[0] for x in graphs] if beautiful_amr_graph: graphs = [AMRGraph(x.triples, x.top, x.epidata, x.metadata) for x in graphs] results.extend(graphs) orders.extend(batch[IDX]) results = reorder(results, orders) if flat: results = results[0] return results def fit(self, trn_data, dev_data, save_dir, batch_size=32, epochs=30, transformer='facebook/bart-base', lr=5e-05, grad_norm=2.5, weight_decay=0.004, warmup_steps=1, dropout=0.25, attention_dropout=0.0, pred_min=5, eval_after=0.5, collapse_name_ops=False, use_pointer_tokens=True, raw_graph=False, gradient_accumulation=1, recategorization_tokens=( 'PERSON', 'COUNTRY', 'QUANTITY', 'ORGANIZATION', 'DATE_ATTRS', 'NATIONALITY', 'LOCATION', 'ENTITY', 'CITY', 'MISC', 'ORDINAL_ENTITY', 'IDEOLOGY', 'RELIGION', 'STATE_OR_PROVINCE', 'URL', 'CAUSE_OF_DEATH', 'O', 'TITLE', 'DATE', 'NUMBER', 'HANDLE', 'SCORE_ENTITY', 'DURATION', 'ORDINAL', 'MONEY', 'SET', 'CRIMINAL_CHARGE', '_1', '_2', '_3', '_4', '_2', '_5', '_6', '_7', '_8', '_9', '_10', '_11', '_12', '_13', '_14', '_15'), additional_tokens=( 'date-entity', 'government-organization', 'temporal-quantity', 'amr-unknown', 'multi-sentence', 'political-party', 'monetary-quantity', 
'ordinal-entity', 'religious-group', 'percentage-entity', 'world-region', 'url-entity', 'political-movement', 'et-cetera', 'at-least', 'mass-quantity', 'have-org-role-91', 'have-rel-role-91', 'include-91', 'have-concession-91', 'have-condition-91', 'be-located-at-91', 'rate-entity-91', 'instead-of-91', 'hyperlink-91', 'request-confirmation-91', 'have-purpose-91', 'be-temporally-at-91', 'regardless-91', 'have-polarity-91', 'byline-91', 'have-manner-91', 'have-part-91', 'have-quant-91', 'publication-91', 'be-from-91', 'have-mod-91', 'have-frequency-91', 'score-on-scale-91', 'have-li-91', 'be-compared-to-91', 'be-destined-for-91', 'course-91', 'have-subevent-91', 'street-address-91', 'have-extent-91', 'statistical-test-91', 'have-instrument-91', 'have-name-91', 'be-polite-91', '-00', '-01', '-02', '-03', '-04', '-05', '-06', '-07', '-08', '-09', '-10', '-11', '-12', '-13', '-14', '-15', '-16', '-17', '-18', '-19', '-20', '-21', '-22', '-23', '-24', '-25', '-26', '-27', '-28', '-29', '-20', '-31', '-32', '-33', '-34', '-35', '-36', '-37', '-38', '-39', '-40', '-41', '-42', '-43', '-44', '-45', '-46', '-47', '-48', '-49', '-50', '-51', '-52', '-53', '-54', '-55', '-56', '-57', '-58', '-59', '-60', '-61', '-62', '-63', '-64', '-65', '-66', '-67', '-68', '-69', '-70', '-71', '-72', '-73', '-74', '-75', '-76', '-77', '-78', '-79', '-80', '-81', '-82', '-83', '-84', '-85', '-86', '-87', '-88', '-89', '-90', '-91', '-92', '-93', '-94', '-95', '-96', '-97', '-98', '-of'), devices=None, logger=None, seed=None, finetune: Union[bool, str] = False, eval_trn=True, _device_placeholder=False, **kwargs): """ Args: trn_data: dev_data: save_dir: batch_size: epochs: transformer: lr: grad_norm: weight_decay: warmup_steps: dropout: attention_dropout: pred_min: eval_after: collapse_name_ops: ``True`` to merge name ops. use_pointer_tokens: ``True`` to use pointer tokens to represent variables. raw_graph: ``True`` to use the raw graph as input and skip all pre/post-processing steps. 
gradient_accumulation: recategorization_tokens: Tokens used in re-categorization. They will be added to tokenizer too but do not put them into ``additional_tokens``. additional_tokens: Tokens to be added to the tokenizer vocab. devices: logger: seed: finetune: eval_trn: _device_placeholder: **kwargs: Returns: """ return super().fit(**merge_locals_kwargs(locals(), kwargs)) def on_config_ready(self, **kwargs): super().on_config_ready(**kwargs) config = AutoConfig_.from_pretrained(self.config.transformer) config.output_past = False config.no_repeat_ngram_size = 0 config.prefix = " " # config.output_attentions = True config.dropout = self.config.dropout config.attention_dropout = self.config.attention_dropout self._transformer_config = config def evaluate(self, tst_data, save_dir=None, logger: logging.Logger = None, batch_size=None, output=True, cache=None, ret_speed=False, **kwargs): return super().evaluate(tst_data, save_dir, logger, batch_size, output, cache, ret_speed, **kwargs) def build_vocabs(self, trn: torch.utils.data.Dataset, logger: logging.Logger): additional_tokens = set() self.collect_additional_tokens(additional_tokens, trn) additional_tokens = sorted(additional_tokens) self.build_tokenizer(additional_tokens) self.vocabs['additional_tokens'] = Vocab(idx_to_token=list(additional_tokens)) ================================================ FILE: hanlp/components/classifiers/__init__.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2019-11-10 13:18 ================================================ FILE: hanlp/components/classifiers/fasttext_classifier.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2022-09-28 13:31 import os import sys from typing import List, Union import fasttext from fasttext.FastText import _FastText import hanlp from hanlp.common.component import Component from hanlp.utils.io_util import get_resource, stdout_redirected from 
hanlp_common.io import load_json from hanlp_common.reflection import classpath_of from hanlp_common.structure import SerializableDict class FastTextClassifier(Component): def __init__(self) -> None: super().__init__() self._model: _FastText = None self.config = SerializableDict({ 'classpath': classpath_of(self), 'hanlp_version': hanlp.__version__, }) def load(self, save_dir, model_path=None, **kwargs): config_path = os.path.join(save_dir, 'config.json') if os.path.isfile(config_path): self.config: dict = load_json(config_path) model_path = self.config.get('model_path', model_path) else: model_path = model_path or save_dir self.config['model_path'] = model_path filepath = get_resource(model_path) with stdout_redirected(to=os.devnull, stdout=sys.stderr): self._model = fasttext.load_model(filepath) def predict(self, text: Union[str, List[str]], topk=False, prob=False, max_len=None, **kwargs): """ Classify text. Args: text: A document or a list of documents. topk: ``True`` or ``int`` to return the top-k labels. prob: Return also probabilities. max_len: Strip long document into ``max_len`` characters for faster prediction. **kwargs: Not used Returns: Classification results. 
""" num_labels = len(self._model.get_labels()) flat = isinstance(text, str) if flat: text = [text] if not isinstance(topk, list): topk = [topk] * len(text) if not isinstance(prob, list): prob = [prob] * len(text) if max_len: text = [x[:max_len] for x in text] text = [x.replace('\n', ' ') for x in text] batch_labels, batch_probs = self._model.predict(text, k=num_labels) results = [] for labels, probs, k, p in zip(batch_labels, batch_probs, topk, prob): labels = [self._strip_prefix(x) for x in labels] if k is False: labels = labels[0] elif k is True: pass elif k: labels = labels[:k] if p: probs = probs.tolist() if k is False: result = labels, probs[0] else: result = dict(zip(labels, probs)) else: result = labels results.append(result) if flat: results = results[0] return results @property def labels(self): return [self._strip_prefix(x) for x in self._model.get_labels()] @staticmethod def _strip_prefix(label: str): return label[len('__label__'):] ================================================ FILE: hanlp/components/classifiers/transformer_classifier.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2020-06-08 16:31 import logging from abc import ABC from typing import Callable, Union from typing import List import torch from torch import nn from torch.utils.data import DataLoader from hanlp_common.constant import IDX from hanlp.common.dataset import TableDataset, SortingSampler, PadSequenceDataLoader, TransformableDataset from hanlp.common.torch_component import TorchComponent from hanlp.common.vocab import Vocab from hanlp.components.distillation.schedulers import LinearTeacherAnnealingScheduler from hanlp.layers.scalar_mix import ScalarMixWithDropoutBuilder from hanlp.layers.transformers.encoder import TransformerEncoder from hanlp.layers.transformers.pt_imports import PreTrainedModel, AutoTokenizer, BertTokenizer, AutoTokenizer_ from hanlp.layers.transformers.utils import transformer_sliding_window, 
build_optimizer_scheduler_with_transformer from hanlp.metrics.accuracy import CategoricalAccuracy from hanlp.transform.transformer_tokenizer import TransformerTextTokenizer from hanlp.utils.time_util import CountdownTimer from hanlp_common.util import merge_locals_kwargs, merge_dict, isdebugging class TransformerClassificationModel(nn.Module): def __init__(self, transformer: PreTrainedModel, num_labels: int, max_seq_length=512) -> None: super().__init__() self.max_seq_length = max_seq_length self.transformer = transformer self.dropout = nn.Dropout(transformer.config.hidden_dropout_prob) self.classifier = nn.Linear(transformer.config.hidden_size, num_labels) def forward(self, input_ids, attention_mask, token_type_ids): seq_length = input_ids.size(-1) if seq_length > self.max_seq_length: sequence_output = transformer_sliding_window(self.transformer, input_ids, max_pieces=self.max_seq_length, ret_cls='max') else: sequence_output = self.transformer(input_ids, attention_mask, token_type_ids)[0][:, 0, :] sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) return logits class TransformerComponent(TorchComponent, ABC): def __init__(self, **kwargs) -> None: """ The base class for transorfmer based components. If offers methods to build transformer tokenizers , optimizers and models. Args: **kwargs: Passed to config. 
""" super().__init__(**kwargs) self.transformer_tokenizer = None def build_optimizer(self, trn, epochs, lr, adam_epsilon, weight_decay, warmup_steps, transformer_lr=None, teacher=None, **kwargs): num_training_steps = len(trn) * epochs // self.config.get('gradient_accumulation', 1) if transformer_lr is None: transformer_lr = lr transformer = self.model.encoder.transformer optimizer, scheduler = build_optimizer_scheduler_with_transformer(self.model, transformer, lr, transformer_lr, num_training_steps, warmup_steps, weight_decay, adam_epsilon) if teacher: lambda_scheduler = LinearTeacherAnnealingScheduler(num_training_steps) scheduler = (scheduler, lambda_scheduler) return optimizer, scheduler def fit(self, trn_data, dev_data, save_dir, transformer=None, lr=5e-5, transformer_lr=None, adam_epsilon=1e-8, weight_decay=0, warmup_steps=0.1, batch_size=32, gradient_accumulation=1, grad_norm=5.0, transformer_grad_norm=None, average_subwords=False, scalar_mix: Union[ScalarMixWithDropoutBuilder, int] = None, word_dropout=None, hidden_dropout=None, max_seq_len=None, ret_raw_hidden_states=False, batch_max_tokens=None, epochs=3, logger=None, devices: Union[float, int, List[int]] = None, **kwargs): return super().fit(**merge_locals_kwargs(locals(), kwargs)) def on_config_ready(self, **kwargs): super().on_config_ready(**kwargs) if 'albert_chinese' in self.config.transformer: self.transformer_tokenizer = BertTokenizer.from_pretrained(self.config.transformer, use_fast=True) else: self.transformer_tokenizer = AutoTokenizer_.from_pretrained(self.config.transformer, use_fast=True) def build_transformer(self, training=True): transformer = TransformerEncoder(self.config.transformer, self.transformer_tokenizer, self.config.average_subwords, self.config.scalar_mix, self.config.word_dropout, ret_raw_hidden_states=self.config.ret_raw_hidden_states, training=training) transformer_layers = self.config.get('transformer_layers', None) if transformer_layers: transformer.transformer.encoder.layer = 
transformer.transformer.encoder.layer[:transformer_layers] return transformer class TransformerClassifier(TransformerComponent): def __init__(self, **kwargs) -> None: """A classifier using transformer as encoder. Args: **kwargs: Passed to config. """ super().__init__(**kwargs) self.model: TransformerClassificationModel = None def build_criterion(self, **kwargs): criterion = nn.CrossEntropyLoss() return criterion def build_metric(self, **kwargs): return CategoricalAccuracy() def execute_training_loop(self, trn: DataLoader, dev: DataLoader, epochs, criterion, optimizer, metric, save_dir, logger: logging.Logger, devices, **kwargs): best_epoch, best_metric = 0, -1 timer = CountdownTimer(epochs) ratio_width = len(f'{len(trn)}/{len(trn)}') for epoch in range(1, epochs + 1): logger.info(f"[yellow]Epoch {epoch} / {epochs}:[/yellow]") self.fit_dataloader(trn, criterion, optimizer, metric, logger) if dev: self.evaluate_dataloader(dev, criterion, metric, logger, ratio_width=ratio_width) report = f'{timer.elapsed_human}/{timer.total_time_human}' dev_score = metric.get_metric() if dev_score > best_metric: self.save_weights(save_dir) best_metric = dev_score report += ' [red]saved[/red]' timer.log(report, ratio_percentage=False, newline=True, ratio=False) @property def label_vocab(self): return self.vocabs[self.config.label_key] def fit_dataloader(self, trn: DataLoader, criterion, optimizer, metric, logger: logging.Logger, **kwargs): self.model.train() timer = CountdownTimer(len(trn)) optimizer, scheduler = optimizer total_loss = 0 metric.reset() for batch in trn: optimizer.zero_grad() logits = self.feed_batch(batch) target = batch['label_id'] loss = self.compute_loss(criterion, logits, target, batch) loss.backward() optimizer.step() scheduler.step() total_loss += loss.item() self.update_metric(metric, logits, target) timer.log(f'loss: {total_loss / (timer.current + 1):.4f} acc: {metric.get_metric():.2%}', ratio_percentage=None, logger=logger) del loss return total_loss / 
timer.total def update_metric(self, metric, logits: torch.Tensor, target, output=None): metric(logits, target) if output: label_ids = logits.argmax(-1) return label_ids def compute_loss(self, criterion, logits, target, batch): loss = criterion(logits, target) return loss def feed_batch(self, batch) -> torch.LongTensor: logits = self.model(*[batch[key] for key in ['input_ids', 'attention_mask', 'token_type_ids']]) return logits # noinspection PyMethodOverriding def evaluate_dataloader(self, data: DataLoader, criterion: Callable, metric, logger, ratio_width=None, filename=None, output=None, **kwargs): self.model.eval() timer = CountdownTimer(len(data)) total_loss = 0 metric.reset() num_samples = 0 if output: output = open(output, 'w') for batch in data: logits = self.feed_batch(batch) target = batch['label_id'] loss = self.compute_loss(criterion, logits, target, batch) total_loss += loss.item() label_ids = self.update_metric(metric, logits, target, output) if output: labels = [self.vocabs[self.config.label_key].idx_to_token[i] for i in label_ids.tolist()] for i, label in enumerate(labels): # text_a text_b pred gold columns = [batch[self.config.text_a_key][i]] if self.config.text_b_key: columns.append(batch[self.config.text_b_key][i]) columns.append(label) columns.append(batch[self.config.label_key][i]) output.write('\t'.join(columns)) output.write('\n') num_samples += len(target) report = f'loss: {total_loss / (timer.current + 1):.4f} acc: {metric.get_metric():.2%}' if filename: report = f'{filename} {report} {num_samples / timer.elapsed:.0f} samples/sec' timer.log(report, ratio_percentage=None, logger=logger, ratio_width=ratio_width) if output: output.close() return total_loss / timer.total # noinspection PyMethodOverriding def build_model(self, transformer, training=True, **kwargs) -> torch.nn.Module: # config: PretrainedConfig = AutoConfig.from_pretrained(transformer) # config.num_labels = len(self.vocabs.label) # config.hidden_dropout_prob = 
# noinspection PyMethodOverriding
def build_dataloader(self, data, batch_size, shuffle, device, text_a_key, text_b_key, label_key,
                     logger: logging.Logger = None, sorting=True, **kwargs) -> DataLoader:
    """Build a padded dataloader over a table dataset.

    While the vocabs are still mutable, the text/label column keys are guessed from the dataset
    headers if neither text key was supplied, and the label vocab is built from the dataset.

    Args:
        data: Anything accepted by :meth:`build_dataset`.
        batch_size: Batch size; falls back to ``self.config.batch_size`` when falsy.
        shuffle: Forwarded to the sampler and the dataloader.
        device: Device passed to the dataloader.
        text_a_key: Column key of the first text; only consulted to decide whether to guess keys.
        text_b_key: Column key of the second text; only consulted to decide whether to guess keys.
        label_key: Not read here; keys are taken from ``self.config``.
        logger: Optional logger for the key-guess message and vocab summary.
        sorting: Whether to batch samples through a ``SortingSampler`` (disabled while debugging).
        **kwargs: Not used.

    Returns:
        A ``PadSequenceDataLoader`` over the transformed dataset.

    Raises:
        ValueError: If keys must be guessed and the dataset has fewer than 2 headers.
    """
    if not batch_size:
        batch_size = self.config.batch_size
    dataset = self.build_dataset(data)
    dataset.append_transform(self.vocabs)
    if self.vocabs.mutable:
        if not any([text_a_key, text_b_key]):
            headers = dataset.headers
            if len(headers) == 2:
                self.config.text_a_key = headers[0]
                self.config.label_key = headers[1]
            elif len(headers) >= 3:
                self.config.text_a_key, self.config.text_b_key, self.config.label_key = \
                    headers[0], headers[1], headers[-1]
            else:
                raise ValueError('Wrong dataset format')
            # A tuple (not a set) keeps the reported keys in a stable, readable order.
            report = ', '.join(f'{k}={self.config[k]}'
                               for k in ('text_a_key', 'text_b_key', 'label_key') if self.config[k])
            # ``logger`` defaults to None, so only log when one was actually supplied.
            if logger:
                logger.info(f'Guess [bold][blue]{report}[/blue][/bold] according to the headers of training dataset: '
                            f'[blue]{dataset}[/blue]')
        self.build_vocabs(dataset, logger)
        dataset.purge_cache()
    dataset.append_transform(TransformerTextTokenizer(tokenizer=self.transformer_tokenizer,
                                                      text_a_key=self.config.text_a_key,
                                                      text_b_key=self.config.text_b_key,
                                                      max_seq_length=self.config.max_seq_length,
                                                      truncate_long_sequences=self.config.truncate_long_sequences,
                                                      output_key=''))
    batch_sampler = None
    if sorting and not isdebugging():
        if dataset.cache and len(dataset) > 1000:
            # Large cached datasets take a while to tokenize, so show progress while measuring lengths.
            timer = CountdownTimer(len(dataset))
            lens = []
            for sample in dataset:
                lens.append(len(sample['input_ids']))
                timer.log('Pre-processing and caching dataset [blink][yellow]...[/yellow][/blink]',
                          ratio_percentage=None)
        else:
            lens = [len(sample['input_ids']) for sample in dataset]
        batch_sampler = SortingSampler(lens, batch_size=batch_size, shuffle=shuffle,
                                       batch_max_tokens=self.config.batch_max_tokens)
    return PadSequenceDataLoader(dataset, batch_size, shuffle, batch_sampler=batch_sampler, device=device)
class TransformerClassifierHF(TorchComponent):
    """Prediction-only text classifier wrapping a ``transformers`` sequence classification model.

    The tokenizer and the model are both loaded from the save directory with ``from_pretrained``.
    All training hooks raise ``NotImplementedError``.
    """

    def __init__(self, **kwargs) -> None:
        super().__init__(**kwargs)
        # Populated by load_vocabs().
        self._tokenizer: PreTrainedTokenizer = None

    def build_dataloader(self, data, sampler_builder=None, shuffle=False, device=None, logger: logging.Logger = None,
                         **kwargs) -> DataLoader:
        """Wrap already tokenized samples (each containing ``input_ids``) into a padded dataloader.

        Args:
            data: Samples passed to ``TableDataset``.
            sampler_builder: Optional sampler builder; defaults to ``SortingSamplerBuilder(batch_size=32)``.
            shuffle: Forwarded to the sampler builder.
            device: Device passed to the dataloader.
            logger: Not used.
            **kwargs: Not used.
        """
        dataset = TableDataset(data)
        lens = [len(sample['input_ids']) for sample in dataset]
        if sampler_builder:
            sampler = sampler_builder.build(lens, shuffle, 1)
        else:
            sampler = SortingSamplerBuilder(batch_size=32).build(lens, shuffle, 1)
        loader = PadSequenceDataLoader(dataset=dataset,
                                       batch_sampler=sampler,
                                       pad={'input_ids': self._tokenizer.pad_token_id},
                                       device=device,
                                       vocabs=self.vocabs)
        return loader

    def build_optimizer(self, **kwargs):
        raise NotImplementedError()

    def build_criterion(self, **kwargs):
        raise NotImplementedError()

    def build_metric(self, **kwargs):
        raise NotImplementedError()

    def execute_training_loop(self, trn: DataLoader, dev: DataLoader, epochs, criterion, optimizer, metric, save_dir,
                              logger: logging.Logger, devices, ratio_width=None, **kwargs):
        raise NotImplementedError()

    def fit_dataloader(self, trn: DataLoader, criterion, optimizer, metric, logger: logging.Logger, **kwargs):
        raise NotImplementedError()

    def evaluate_dataloader(self, data: DataLoader, criterion: Callable, metric=None, output=False, **kwargs):
        raise NotImplementedError()

    def load_vocabs(self, save_dir, filename='vocabs.json'):
        """Load the tokenizer from ``save_dir``; ``filename`` is ignored."""
        self._tokenizer = AutoTokenizer.from_pretrained(save_dir)

    def load_weights(self, save_dir, filename='model.pt', **kwargs):
        """No-op: the weights are already loaded by :meth:`build_model`."""
        pass

    def build_model(self, training=True, save_dir=None, **kwargs) -> torch.nn.Module:
        """Load the sequence classification model from ``save_dir``."""
        return AutoModelForSequenceClassification.from_pretrained(save_dir)

    def predict(self, text: Union[str, List[str]], topk=False, prob=False, **kwargs):
        """Classify text.

        Inputs are truncated to the model's ``max_position_embeddings`` tokens.

        Args:
            text: A document or a list of documents.
            topk: ``False`` to return only the best label, ``True`` to return all labels ranked by
                score, or an ``int`` to return the top-k labels. May also be a list with one
                setting per document.
            prob: Return also probabilities. May also be a list with one setting per document.
            **kwargs: Not used

        Returns:
            Classification results, one per document (a single result if ``text`` is a ``str``).
            Each result is a label, a list of labels, a ``(label, probability)`` tuple when
            ``prob`` is set without ``topk``, or a ``dict`` from label to probability when both
            are set.
        """
        flat = isinstance(text, str)
        if flat:
            text = [text]
        # Nothing to classify: skip tokenization and batching altogether.
        if not text:
            return []
        if not isinstance(topk, list):
            topk = [topk] * len(text)
        if not isinstance(prob, list):
            prob = [prob] * len(text)
        # noinspection PyTypeChecker
        dataloader = self.build_dataloader(
            split_dict(self._tokenizer(text, max_length=self.model.config.max_position_embeddings, truncation=True,
                                       return_token_type_ids=False, return_attention_mask=False)),
            device=self.device)
        results = []
        order = []
        id2label = self.model.config.id2label
        for batch in dataloader:
            logits = self.model(input_ids=batch['input_ids']).logits
            # Sort first so that labels and probabilities below share the same descending order.
            logits, batch_labels = logits.sort(descending=True)
            batch_labels = [[id2label[l] for l in ls] for ls in batch_labels.tolist()]
            batch_probs = logits.softmax(dim=-1).tolist()
            for labels, probs, i in zip(batch_labels, batch_probs, batch[IDX]):
                k = topk[i]
                p = prob[i]
                if k is False:
                    labels = labels[0]
                elif k is True:
                    pass
                elif k:
                    labels = labels[:k]
                if p:
                    if k is False:
                        result = labels, probs[0]
                    else:
                        # zip() stops at the shorter sequence, so only the kept labels get a probability.
                        result = dict(zip(labels, probs))
                else:
                    result = labels
                results.append(result)
            order.extend(batch[IDX])
        # Batches come out of the sampler in its own order; restore the input order.
        results = reorder(results, order)
        if flat:
            results = results[0]
        return results

    @property
    def labels(self):
        """Label names ordered by their id in the model config."""
        return [x[1] for x in sorted(self.model.config.id2label.items())]
def x_to_idx(self, x) -> Union[tf.Tensor, Tuple]:
    """Abort: this transform never maps ``x`` to indices.

    ``inputs_to_samples`` already yields token ids, and this class defaults ``map_x`` to ``False``,
    so reaching this method means the transform was configured with ``map_x=True``.

    Raises:
        SystemExit: Always, with exit code 1.
    """
    logger.fatal('map_x should always be set to False')
    # Same effect as the ``exit(1)`` builtin, without depending on the ``site`` module.
    raise SystemExit(1)
class TransformerClassifierTF(KerasComponent):
    """Keras text classifier built on a transformer encoder."""

    def __init__(self, bert_text_transform=None) -> None:
        """
        Args:
            bert_text_transform: The transform to use; a fresh ``TransformerTextTransform`` when omitted.
        """
        if not bert_text_transform:
            bert_text_transform = TransformerTextTransform()
        super().__init__(bert_text_transform)
        self.model: tf.keras.Model
        self.transform: TransformerTextTransform = bert_text_transform

    # noinspection PyMethodOverriding
    def fit(self, trn_data: Any, dev_data: Any, save_dir: str, transformer: str, max_length: int = 128,
            optimizer='adamw', warmup_steps_ratio=0.1, use_amp=False, batch_size=32, epochs=3, logger=None,
            verbose=1, **kwargs):
        """Forward every argument (including defaults) to the parent ``fit``."""
        return super().fit(**merge_locals_kwargs(locals(), kwargs))

    def evaluate_output(self, tst_data, out, num_batches, metric):
        """Write ``sentence / pred / gold`` rows to ``out`` and return the accuracy.

        Args:
            tst_data: Iterable of ``(inputs, gold)`` batches.
            out: Writable text stream.
            num_batches: Total number of batches, used for the progress display only.
            metric: Name printed in the progress display.

        Returns:
            ``correct / total`` over all samples seen (``0`` if there are no batches).
        """
        out.write('sentence\tpred\tgold\n')
        total, correct, score = 0, 0, 0
        for idx, batch in enumerate(tst_data):
            outputs = self.model.predict_on_batch(batch[0])
            outputs = tf.argmax(outputs, axis=1)
            for X, Y_pred, Y_gold, in zip(batch[0][0], outputs, batch[1]):
                feature = ' '.join(self.transform.tokenizer.convert_ids_to_tokens(X.numpy()))
                feature = feature.replace(' ##', '')  # fix sub-word generated by BERT tagger
                out.write('{}\t{}\t{}\n'.format(feature, self._y_id_to_str(Y_pred), self._y_id_to_str(Y_gold)))
                total += 1
                correct += int(tf.equal(Y_pred, Y_gold).numpy())
            score = correct / total
            print('\r{}/{} {}: {:.2f}'.format(idx + 1, num_batches, metric, score * 100), end='')
        print()
        return score

    def _y_id_to_str(self, Y_pred) -> str:
        """Map a scalar label-id tensor to its label string."""
        return self.transform.label_vocab.idx_to_token[Y_pred.numpy()]

    def build_loss(self, loss, **kwargs):
        """Return the user supplied loss, or a default chosen by the ``multi_label`` config flag.

        Args:
            loss: A ``tf.keras.losses.Loss`` instance, or a falsy value to use the default.
            **kwargs: Not used.
        """
        if loss:
            # The Keras base class is ``Loss``; ``tf.keras.losses.loss`` does not exist and made
            # this check fail with AttributeError for every custom loss.
            assert isinstance(loss, tf.keras.losses.Loss), 'Must specify loss as an instance in tf.keras.losses'
            return loss
        elif self.config.get('multi_label', None):
            # Loss to be BinaryCrossentropy for multi-label:
            loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
        else:
            loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
        return loss

    # noinspection PyMethodOverriding
    def build_optimizer(self, optimizer, use_amp, train_steps, warmup_steps, **kwargs):
        """Build the optimizer and record its serialized form in ``self.config.optimizer``.

        Args:
            optimizer: ``'adamw'`` for the warmup AdamW optimizer, anything else is delegated to the parent.
            use_amp: Wrap the optimizer in a dynamic loss scaler for mixed precision.
            train_steps: Total number of training steps.
            warmup_steps: Number of warmup steps.
            **kwargs: Not used.
        """
        if optimizer == 'adamw':
            opt = create_optimizer(init_lr=5e-5, num_train_steps=train_steps, num_warmup_steps=warmup_steps)
            self.config.optimizer = tf.keras.utils.serialize_keras_object(opt)
            lr_config = self.config.optimizer['config']['learning_rate']['config']
            if hasattr(lr_config['decay_schedule_fn'], 'get_config'):
                # Replace the schedule object by its plain config so the optimizer config is serializable.
                # Call the method that was just checked for (``get_config``), not ``config``.
                lr_config['decay_schedule_fn'] = dict(
                    (k, v) for k, v in lr_config['decay_schedule_fn'].get_config().items() if not k.startswith('_'))
        else:
            opt = super().build_optimizer(optimizer)
        if use_amp:
            # loss scaling is currently required when using mixed precision
            opt = tf.keras.mixed_precision.experimental.LossScaleOptimizer(opt, 'dynamic')
        return opt

    # noinspection PyMethodOverriding
    def build_model(self, transformer, max_length, **kwargs):
        """Build the transformer model and attach its tokenizer to the transform."""
        model, self.transform.tokenizer = build_transformer(transformer, max_length, len(self.transform.label_vocab),
                                                            tagging=False)
        return model

    def build_vocab(self, trn_data, logger):
        """Build the vocab via the parent and derive ``config.warmup_steps`` from the dataset size.

        Returns:
            Whatever the parent ``build_vocab`` returns (used here as the number of training examples).
        """
        train_examples = super().build_vocab(trn_data, logger)
        warmup_steps_per_epoch = math.ceil(train_examples * self.config.warmup_steps_ratio / self.config.batch_size)
        self.config.warmup_steps = warmup_steps_per_epoch * self.config.epochs
        return train_examples

    def build_metrics(self, metrics, logger, **kwargs):
        """Return a single accuracy metric matching the ``multi_label`` config flag."""
        if self.config.get('multi_label', None):
            metric = tf.keras.metrics.BinaryAccuracy('binary_accuracy')
        else:
            metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
        return [metric]
class TransformerRegressionHF(TorchComponent):
    """Prediction-only text regressor wrapping a ``transformers`` sequence classification model.

    The tokenizer and the model are both loaded from the save directory with ``from_pretrained``.
    All training hooks raise ``NotImplementedError``.
    """

    def __init__(self, **kwargs) -> None:
        super().__init__(**kwargs)
        # Populated by load_vocabs().
        self._tokenizer: PreTrainedTokenizer = None

    def build_dataloader(self, data, sampler_builder=None, shuffle=False, device=None, logger: logging.Logger = None,
                         **kwargs) -> DataLoader:
        """Wrap already tokenized samples (each containing ``input_ids``) into a padded dataloader.

        Args:
            data: Samples passed to ``TableDataset``.
            sampler_builder: Optional sampler builder; defaults to ``SortingSamplerBuilder(batch_size=32)``.
            shuffle: Forwarded to the sampler builder.
            device: Device passed to the dataloader.
            logger: Not used.
            **kwargs: Not used.
        """
        dataset = TableDataset(data)
        lens = [len(sample['input_ids']) for sample in dataset]
        if sampler_builder:
            sampler = sampler_builder.build(lens, shuffle, 1)
        else:
            sampler = SortingSamplerBuilder(batch_size=32).build(lens, shuffle, 1)
        loader = PadSequenceDataLoader(dataset=dataset,
                                       batch_sampler=sampler,
                                       pad={'input_ids': self._tokenizer.pad_token_id},
                                       device=device,
                                       vocabs=self.vocabs)
        return loader

    def build_optimizer(self, **kwargs):
        raise NotImplementedError()

    def build_criterion(self, **kwargs):
        raise NotImplementedError()

    def build_metric(self, **kwargs):
        raise NotImplementedError()

    def execute_training_loop(self, trn: DataLoader, dev: DataLoader, epochs, criterion, optimizer, metric, save_dir,
                              logger: logging.Logger, devices, ratio_width=None, **kwargs):
        raise NotImplementedError()

    def fit_dataloader(self, trn: DataLoader, criterion, optimizer, metric, logger: logging.Logger, **kwargs):
        raise NotImplementedError()

    def evaluate_dataloader(self, data: DataLoader, criterion: Callable, metric=None, output=False, **kwargs):
        raise NotImplementedError()

    def load_vocabs(self, save_dir, filename='vocabs.json'):
        """Load the tokenizer from ``save_dir``; ``filename`` is ignored."""
        self._tokenizer = AutoTokenizer.from_pretrained(save_dir)

    def load_weights(self, save_dir, filename='model.pt', **kwargs):
        """No-op: the weights are already loaded by :meth:`build_model`."""
        pass

    def build_model(self, training=True, save_dir=None, **kwargs) -> torch.nn.Module:
        """Load the sequence classification model from ``save_dir``."""
        return AutoModelForSequenceClassification.from_pretrained(save_dir)

    def predict(self, text: Union[str, List[str]], **kwargs):
        """
        Score text with the regression head.

        Inputs are truncated to the model's ``max_position_embeddings`` tokens. The last dimension
        of the model's logits is squeezed and the values are clipped to ``[-1, 1]``.

        Args:
            text: A document or a list of documents.
            **kwargs: Not used

        Returns:
            One clipped score per document in input order, or a single score if ``text`` is a ``str``.
        """
        flat = isinstance(text, str)
        if flat:
            text = [text]
        # Nothing to score: skip tokenization and batching altogether.
        if not text:
            return []
        # noinspection PyTypeChecker
        dataloader = self.build_dataloader(
            split_dict(self._tokenizer(text, max_length=self.model.config.max_position_embeddings, truncation=True,
                                       return_token_type_ids=False, return_attention_mask=False)),
            device=self.device)
        results = []
        order = []
        for batch in dataloader:
            logits = self.model(input_ids=batch['input_ids']).logits
            logits = logits.squeeze(-1).clip(-1, 1)
            logits = logits.tolist()
            results.extend(logits)
            order.extend(batch[IDX])
        # Batches come out of the sampler in its own order; restore the input order.
        results = reorder(results, order)
        if flat:
            results = results[0]
        return results
class DistillableComponent(TorchComponent, ABC):
    """Mixin-style base for components that can be trained by distilling from a teacher component."""

    # noinspection PyMethodMayBeStatic,PyTypeChecker
    def build_teacher(self, teacher: str, devices) -> TorchComponent:
        """Load the teacher component identified by ``teacher`` onto ``devices`` via ``hanlp.load``."""
        return hanlp.load(teacher, load_kwargs={'devices': devices})

    def distill(self, teacher: str, trn_data, dev_data, save_dir, batch_size=None, epochs=None,
                kd_criterion='kd_ce_loss', temperature_scheduler='flsw', devices=None, logger=None, seed=None,
                **kwargs):
        """Train this component as a student of ``teacher``.

        The student reuses the teacher's vocabs and starts from a copy of the teacher's config,
        which is then updated with ``kwargs``.

        Args:
            teacher: Identifier passed to :meth:`build_teacher`.
            trn_data: Training data, forwarded to ``fit``.
            dev_data: Development data, forwarded to ``fit``.
            save_dir: Save directory, forwarded to ``fit``.
            batch_size: Falls back to the teacher config's ``batch_size`` when falsy.
            epochs: Falls back to the teacher config's ``epochs`` when falsy.
            kd_criterion: A loss object, or the name of one (wrapped in ``KnowledgeDistillationLoss``).
            temperature_scheduler: A scheduler object, or a name resolved by
                ``TemperatureScheduler.from_name``.
            devices: Devices to use; defaults to ``cuda_devices()`` when falsy.
            logger: Forwarded to ``fit``.
            seed: Forwarded to ``fit``.
            **kwargs: Overrides applied on top of the copied teacher config.

        Returns:
            Whatever the parent ``fit`` returns.
        """
        devices = devices or cuda_devices()
        if isinstance(kd_criterion, str):
            kd_criterion = KnowledgeDistillationLoss(kd_criterion)
        if isinstance(temperature_scheduler, str):
            temperature_scheduler = TemperatureScheduler.from_name(temperature_scheduler)
        # From here on ``teacher`` is the loaded component, not the identifier string.
        teacher = self.build_teacher(teacher, devices=devices)
        self.vocabs = teacher.vocabs
        config = copy(teacher.config)
        batch_size = batch_size or config.get('batch_size', None)
        epochs = epochs or config.get('epochs', None)
        config.update(kwargs)
        # NOTE: every local variable of this method is forwarded to ``fit`` by name (minus the
        # excluded ones), so renaming or adding locals here changes what ``fit`` receives.
        return super().fit(**merge_locals_kwargs(locals(),
                                                 config,
                                                 excludes=('self', 'kwargs', '__class__', 'config')))

    @property
    def _savable_config(self):
        """The parent's savable config, with a ``teacher`` entry replaced by its ``load_path``."""
        config = super(DistillableComponent, self)._savable_config
        if 'teacher' in config:
            config.teacher = config.teacher.load_path
        return config
def kd_ce_loss(logits_S, logits_T, temperature=1):
    '''
    Calculate the cross entropy between logits_S and logits_T

    Both logits are divided by the temperature before the softmax; the teacher distribution is
    the target and the result is averaged over all leading dimensions.

    :param logits_S: Tensor of shape (batch_size, length, num_labels) or (batch_size, num_labels)
    :param logits_T: Tensor of shape (batch_size, length, num_labels) or (batch_size, num_labels)
    :param temperature: A float or a tensor of shape (batch_size, length) or (batch_size,)
    '''
    per_position = isinstance(temperature, torch.Tensor) and temperature.dim() > 0
    # A non-scalar temperature needs a trailing axis to broadcast over the label dimension.
    tau = temperature.unsqueeze(-1) if per_position else temperature
    teacher_probs = F.softmax(logits_T / tau, dim=-1)
    student_log_probs = F.log_softmax(logits_S / tau, dim=-1)
    return -(teacher_probs * student_log_probs).sum(dim=-1).mean()
def att_mse_sum_loss(attention_S, attention_T, mask=None):
    '''
    * Calculates the mse loss between `attention_S` and `attention_T`.
    * If the shape is (*batch_size*, *num_heads*, *length*, *length*), sums along the `num_heads`
      dimension and then calculates the mse loss between the two matrices.
    * Without a mask, entries ``<= -1e-3`` are replaced by zero before the loss is taken.
    * If `mask` is given, masks the positions where ``mask==0``.

    :param torch.Tensor attention_S: tensor of shape (*batch_size*, *num_heads*, *length*, *length*) or (*batch_size*, *length*, *length*)
    :param torch.Tensor attention_T: tensor of shape (*batch_size*, *num_heads*, *length*, *length*) or (*batch_size*, *length*, *length*)
    :param torch.Tensor mask: tensor of shape (*batch_size*, *length*)
    '''
    if attention_S.dim() == 4:
        attention_T = attention_T.sum(dim=1)
        attention_S = attention_S.sum(dim=1)

    if mask is not None:
        mask = mask.to(attention_S)
        # Each sample contributes (number of valid tokens)^2 valid cells.
        valid_count = torch.pow(mask.sum(dim=1), 2).sum()
        squared_error = F.mse_loss(attention_S, attention_T, reduction='none')
        return (squared_error * mask.unsqueeze(-1) * mask.unsqueeze(1)).sum() / valid_count

    student = torch.where(attention_S <= -1e-3, torch.zeros_like(attention_S), attention_S)
    teacher = torch.where(attention_T <= -1e-3, torch.zeros_like(attention_T), attention_T)
    return F.mse_loss(student, teacher)
def att_ce_mean_loss(attention_S, attention_T, mask=None):
    '''
    * Calculates the cross-entropy loss between `attention_S` and `attention_T`, where softmax is
      applied on ``dim=-1``.
    * If the shape is (*batch_size*, *num_heads*, *length*, *length*), averages over dimension
      `num_heads` and then computes cross-entropy loss between the two matrices.
    * Without a mask, teacher probabilities at positions whose teacher score is ``<= -1e-3`` are
      zeroed out.
    * If `mask` is given, masks the positions where ``mask==0``.

    :param torch.tensor attention_S: tensor of shape (*batch_size*, *num_heads*, *length*, *length*) or (*batch_size*, *length*, *length*)
    :param torch.tensor attention_T: tensor of shape (*batch_size*, *num_heads*, *length*, *length*) or (*batch_size*, *length*, *length*)
    :param torch.tensor mask: tensor of shape (*batch_size*, *length*)
    '''
    if attention_S.dim() == 4:
        attention_S = attention_S.mean(dim=1)  # (bs, len, len)
        attention_T = attention_T.mean(dim=1)
    teacher_probs = F.softmax(attention_T, dim=-1)

    if mask is not None:
        mask = mask.to(attention_S)
        # mask.unsqueeze(1) drops masked key positions, the trailing ``* mask`` drops masked query rows.
        row_terms = (teacher_probs * F.log_softmax(attention_S, dim=-1) * mask.unsqueeze(1)).sum(dim=-1) * mask
        return -row_terms.sum() / mask.sum()

    kept_probs = torch.where(attention_T <= -1e-3, torch.zeros_like(attention_T), teacher_probs)
    return -(kept_probs * F.log_softmax(attention_S, dim=-1)).sum(dim=-1).mean()
def pkd_loss(state_S, state_T, mask=None):
    '''
    * Computes normalized vector mse loss at position 0 along `length` dimension. This is the loss
      used in BERT-PKD (Patient Knowledge Distillation for BERT Model Compression).
    * If the hidden sizes of student and teacher are different, 'proj' option is required in
      `intermediate_matches` to match the dimensions.

    :param torch.Tensor state_S: tensor of shape (*batch_size*, *length*, *hidden_size*)
    :param torch.Tensor state_T: tensor of shape (*batch_size*, *length*, *hidden_size*)
    :param mask: not used.
    '''

    def _unit_first_token(states):
        # Take position 0 of every sequence and scale it to unit L2 norm: (batch_size, hidden_dim)
        first = states[:, 0]
        return first / torch.norm(first, dim=1, keepdim=True)

    teacher = _unit_first_token(state_T)
    student = _unit_first_token(state_S)
    return (student - teacher).pow(2).sum(dim=-1).mean()
Computes the similarity matrix between the two matrices in `state_S` ( with the resulting shape (*batch_size*, *hidden_size*, *hidden_size*) ) and the ones in B ( with the resulting shape (*batch_size*, *hidden_size*, *hidden_size*) ), then computes the mse loss between the similarity matrices: .. math:: loss = mean((S_{1}^T \cdot S_{2} - T_{1}^T \cdot T_{2})^2) * It is a Variant of FSP loss in `A Gift from Knowledge Distillation: Fast Optimization, Network Minimization and Transfer Learning `_. * If the `inputs_mask` is given, masks the positions where ``input_mask==0``. * If the hidden sizes of student and teacher are different, 'proj' option is required in `inetermediate_matches` to match the dimensions. :param torch.tensor state_S: list of two tensors, each tensor is of the shape (*batch_size*, *length*, *hidden_size*) :param torch.tensor state_T: list of two tensors, each tensor is of the shape (*batch_size*, *length*, *hidden_size*) :param torch.tensor mask: tensor of the shape (*batch_size*, *length*) Example in `intermediate_matches`:: intermediate_matches = [ {'layer_T':[0,0], 'layer_S':[0,0], 'feature':'hidden','loss': 'fsp', 'weight' : 1, 'proj':['linear',384,768]}, ...] 
def mmd_loss(state_S, state_T, mask=None):
    r'''
    * Takes in two lists of matrices `state_S` and `state_T`. Each list contains 2 matrices of the
      shape (*batch_size*, *length*, *hidden_size*). `hidden_size` of matrices in `state_S` doesn't
      need to be the same as that of `state_T`. Computes the similarity matrix between the two
      matrices in `state_S` ( with the resulting shape (*batch_size*, *length*, *length*) ) and the
      ones in `state_T` ( with the resulting shape (*batch_size*, *length*, *length*) ), each
      divided by its own hidden size, then computes the mse loss between the similarity matrices:

    .. math::

        loss = mean((S_{1} \cdot S_{2}^T - T_{1} \cdot T_{2}^T)^2)

    * It is a variant of the NST loss in `Like What You Like: Knowledge Distill via Neuron
      Selectivity Transfer`.
    * If `mask` is given, masks the positions where ``mask==0``. An all-ones mask yields the same
      value as passing no mask.

    :param torch.tensor state_S: list of two tensors, each tensor is of the shape (*batch_size*, *length*, *hidden_size*)
    :param torch.tensor state_T: list of two tensors, each tensor is of the shape (*batch_size*, *length*, *hidden_size*)
    :param torch.tensor mask: tensor of the shape (*batch_size*, *length*)

    Example in `intermediate_matches`::

        intermediate_matches = [
        {'layer_T':[0,0], 'layer_S':[0,0], 'feature':'hidden','loss': 'nst', 'weight' : 1},
        ...]
    '''
    state_S_0 = state_S[0]  # (batch_size , length, hidden_dim_S)
    state_S_1 = state_S[1]  # (batch_size , length, hidden_dim_S)
    state_T_0 = state_T[0]  # (batch_size , length, hidden_dim_T)
    state_T_1 = state_T[1]  # (batch_size , length, hidden_dim_T)
    # The dot products sum over the hidden dimension, so each Gram matrix is normalised by its own
    # hidden size. The masked branch used to divide by the sequence length instead, which made it
    # disagree with the unmasked branch and left student/teacher on different scales whenever
    # their hidden sizes differ.
    gram_S = torch.bmm(state_S_0, state_S_1.transpose(1, 2)) / state_S_1.size(2)  # (batch_size, length, length)
    gram_T = torch.bmm(state_T_0, state_T_1.transpose(1, 2)) / state_T_1.size(2)
    if mask is None:
        loss = F.mse_loss(gram_S, gram_T)
    else:
        mask = mask.to(state_S[0])
        # Each sample contributes (number of valid tokens)^2 valid cells.
        valid_count = torch.pow(mask.sum(dim=1), 2).sum()
        loss = (F.mse_loss(gram_S, gram_T, reduction='none') * mask.unsqueeze(-1) * mask.unsqueeze(
            1)).sum() / valid_count
    return loss
def flsw_temperature_scheduler_builder(beta, gamma, eps=1e-4, *args):
    '''
    Build a per-sample temperature scheduler (adapted from arXiv:1911.07471).

    The returned callable takes ``(logits_S, logits_T, base_temperature)`` and returns
    ``base_temperature + (mean(w) - w) * beta`` where ``w = (1 - cos(logits_S, logits_T)) ** gamma`` is computed
    along the last dimension. ``eps`` is added to each norm before dividing. Extra positional arguments are
    accepted and ignored.
    '''

    def flsw_temperature_scheduler(logits_S, logits_T, base_temperature):
        student = logits_S.detach()
        teacher = logits_T.detach()
        with torch.no_grad():
            # Scale both logit vectors to (almost) unit length; eps keeps the division finite for zero vectors.
            student_unit = student / (torch.norm(student, dim=-1, keepdim=True) + eps)
            teacher_unit = teacher / (torch.norm(teacher, dim=-1, keepdim=True) + eps)
            cosine = (student_unit * teacher_unit).sum(dim=-1)
            weight = (1 - cosine) ** gamma
            # Samples with above-average weight get a temperature below the base, and vice versa.
            temperature = base_temperature + (weight.mean() - weight) * beta
        return temperature

    return flsw_temperature_scheduler
class NgramSentenceBoundaryDetectionModel(nn.Module):
    """Character-window classifier: embedding -> RNN -> last time step -> dropout -> linear logit.

    Args:
        char_vocab_size: Number of rows of the character embedding table.
        embedding_size: Dimension of each character embedding.
        rnn_type: ``'LSTM'`` or ``'GRU'`` (case-insensitive).
        rnn_size: Hidden size of the RNN.
        rnn_layers: Number of stacked RNN layers.
        rnn_bidirectional: Whether the RNN is bidirectional.
        dropout: Dropout probability applied between stacked RNN layers (only when ``rnn_layers > 1``) and on
            the last RNN output. A falsy value disables dropout.
        **kwargs: Ignored, so that a whole config can be splatted in.

    Raises:
        NotImplementedError: If ``rnn_type`` is neither LSTM nor GRU.
    """

    def __init__(self,
                 char_vocab_size,
                 embedding_size=128,
                 rnn_type: str = 'LSTM',
                 rnn_size=256,
                 rnn_layers=1,
                 rnn_bidirectional=False,
                 dropout=0.2,
                 **kwargs
                 ):
        super(NgramSentenceBoundaryDetectionModel, self).__init__()
        self.embed = nn.Embedding(num_embeddings=char_vocab_size, embedding_dim=embedding_size)
        rnn_classes = {'lstm': nn.LSTM, 'gru': nn.GRU}
        rnn_type = rnn_type.lower()
        if rnn_type not in rnn_classes:
            raise NotImplementedError(f"'{rnn_type}' has to be one of [LSTM, GRU]")
        # Use the `dropout` argument here: `self.dropout` does not exist yet at this point, and the RNN wants a
        # float probability rather than a module. Inter-layer dropout only applies to stacked RNNs.
        inter_layer_dropout = float(dropout) if dropout and rnn_layers > 1 else 0.0
        self.rnn = rnn_classes[rnn_type](input_size=embedding_size,
                                         hidden_size=rnn_size,
                                         num_layers=rnn_layers,
                                         dropout=inter_layer_dropout,
                                         bidirectional=rnn_bidirectional,
                                         batch_first=True)
        self.dropout = nn.Dropout(p=dropout) if dropout else None
        self.dense = nn.Linear(in_features=rnn_size * (2 if rnn_bidirectional else 1), out_features=1)

    def forward(self, x: torch.Tensor):
        """Return one logit per window.

        Args:
            x: Character ids, indexed as (batch, length) by the batch-first RNN.

        Returns:
            A tensor of shape (batch,) holding the logit computed from the last RNN time step.
        """
        output = self.embed(x)
        self.rnn.flatten_parameters()
        output, _ = self.rnn(output)
        # Always keep only the last time step, whether or not dropout is enabled.
        output = output[:, -1, :]
        if self.dropout is not None:
            output = self.dropout(output)
        return self.dense(output).squeeze(-1)
""" super().__init__(**kwargs) def build_optimizer(self, **kwargs): optimizer = optim.Adam(self.model.parameters(), lr=self.config.lr) return optimizer def build_criterion(self, **kwargs): return BCEWithLogitsLoss() def build_metric(self, **kwargs): return F1() def execute_training_loop(self, trn: DataLoader, dev: DataLoader, epochs, criterion, optimizer, metric, save_dir, logger: logging.Logger, devices, **kwargs): best_epoch, best_metric = 0, -1 timer = CountdownTimer(epochs) ratio_width = len(f'{len(trn)}/{len(trn)}') for epoch in range(1, epochs + 1): logger.info(f"[yellow]Epoch {epoch} / {epochs}:[/yellow]") self.fit_dataloader(trn, criterion, optimizer, metric, logger) if dev: self.evaluate_dataloader(dev, criterion, metric, logger, ratio_width=ratio_width) report = f'{timer.elapsed_human}/{timer.total_time_human}' dev_score = metric.score if dev_score > best_metric: self.save_weights(save_dir) best_metric = dev_score report += ' [red]saved[/red]' timer.log(report, ratio_percentage=False, newline=True, ratio=False) def fit_dataloader(self, trn: DataLoader, criterion, optimizer, metric, logger: logging.Logger, **kwargs): self.model.train() timer = CountdownTimer(len(trn)) total_loss = 0 self.reset_metrics(metric) for batch in trn: optimizer.zero_grad() prediction = self.feed_batch(batch) loss = self.compute_loss(prediction, batch, criterion) self.update_metrics(batch, prediction, metric) loss.backward() if self.config.grad_norm: torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.config.grad_norm) optimizer.step() total_loss += loss.item() timer.log(self.report_metrics(total_loss / (timer.current + 1), metric), ratio_percentage=None, logger=logger) del loss return total_loss / timer.total def compute_loss(self, prediction, batch, criterion): loss = criterion(prediction, batch['label_id']) return loss # noinspection PyMethodOverriding def evaluate_dataloader(self, data: DataLoader, criterion: Callable, metric, logger, ratio_width=None, output=False, 
def predict(self, data: Union[str, List[str]], batch_size: int = None, strip=True, **kwargs):
    """Sentence split.

    Args:
        data: A paragraph or a list of paragraphs.
        batch_size: Number of samples per batch. ``None`` keeps the ``batch_size`` stored in the config.
        strip: Strip out blank characters at the head and tail of each sentence.

    Returns:
        A list of sentences or a list of lists of sentences.
    """
    if not data:
        return []
    self.model.eval()
    flat = isinstance(data, str)
    if flat:
        data = [data]
    eos_chars = self.config.eos_chars
    window_size = self.config.window_size

    # One sample per candidate character: the candidate plus up to `window_size` characters on each side.
    samples = []
    for doc_id_, corpus in enumerate(data):
        chars = list(corpus)
        for i, c in enumerate(chars):
            if c in eos_chars:
                window = chars[max(0, i - window_size): i + window_size + 1]
                samples.append({'char': window, 'offset_': i, 'doc_id_': doc_id_})

    # Offsets of the candidates classified as sentence ends, per document.
    eos_prediction = [[] for _ in data]
    if samples:
        # Let an explicit `batch_size` override the configured one; passing both as keywords would clash.
        loader_config = dict(self.config)
        if batch_size is not None:
            loader_config['batch_size'] = batch_size
        dataloader = self.build_dataloader(samples, **loader_config, device=self.device, shuffle=False,
                                           logger=None)
        for batch in dataloader:
            logits = self.feed_batch(batch)
            prediction = (logits > 0).tolist()
            for doc_id_, offset_, eos in zip(batch['doc_id_'], batch['offset_'], prediction):
                if eos:
                    eos_prediction[doc_id_].append(offset_)

    outputs = []
    for corpus, offsets in zip(data, eos_prediction):
        sents_per_document = []
        prev_offset = 0
        for offset in offsets:
            end = offset + 1  # the sentence includes its closing punctuation
            sents_per_document.append(corpus[prev_offset:end])
            prev_offset = end
        if prev_offset != len(corpus):
            # Trailing text after the last detected boundary forms the final sentence.
            sents_per_document.append(corpus[prev_offset:])
        if strip:
            sents_per_document = [x.strip() for x in sents_per_document]
        sents_per_document = [x for x in sents_per_document if x]
        outputs.append(sents_per_document)
    if flat:
        outputs = outputs[0]
    return outputs
class LambdaComponent(Component):
    """Adapter that exposes a plain callable through the component ``predict`` interface."""

    def __init__(self, function: Callable) -> None:
        """Wrap ``function`` and record the classpaths needed to rebuild this component.

        Args:
            function: The callable invoked by :meth:`predict`.
        """
        super().__init__()
        self.config = {}
        self.function = function
        # Classpaths of the wrapped function and of this class are what `from_config` reads back.
        self.config.update(function=classpath_of(function), classpath=classpath_of(self))

    def predict(self, data: Any, **kwargs):
        """Call the wrapped function on ``data``.

        Args:
            data: The argument of the function, or an iterable of positional arguments when the keyword
                ``_hanlp_unpack`` is truthy.
            **kwargs: Forwarded to the function, except ``_hanlp_unpack`` which is consumed here.

        Returns:
            Whatever the wrapped function returns.
        """
        if kwargs.pop('_hanlp_unpack', None):
            return self.function(*data, **kwargs)
        return self.function(data, **kwargs)

    @staticmethod
    def from_config(meta: dict, **kwargs):
        """Rebuild a component from the ``classpath`` and ``function`` entries of ``meta``."""
        component_type = str_to_type(meta['classpath'])
        wrapped = object_from_classpath(meta['function'])
        return component_type(wrapped)
class TransformerLemmatizer(TransformerTagger):

    def __init__(self, **kwargs) -> None:
        """A transition based lemmatizer using transformer as encoder.

        Args:
            **kwargs: Predefined config.
        """
        super().__init__(**kwargs)

    def build_dataset(self, data, transform=None, **kwargs):
        """Build the dataset with ``add_lemma_rules_to_sample`` appended to the transforms.

        Args:
            data: Passed through to the parent implementation.
            transform: A list of transforms to extend in place; anything that is not a list is replaced by
                a fresh ``TransformList``.
            **kwargs: Passed through to the parent implementation.
        """
        transforms = transform if isinstance(transform, list) else TransformList()
        transforms.append(add_lemma_rules_to_sample)
        return super().build_dataset(data, transforms, **kwargs)

    def prediction_to_human(self, pred, vocab: List[str], batch, token=None):
        """Lazily yield, for each sentence, the lemmas obtained by applying predicted rules to its tokens.

        Args:
            pred: Passed to the parent ``prediction_to_human``, which yields the rules of each sentence.
            vocab: Passed to the parent ``prediction_to_human``.
            batch: The batch; its ``'token'`` field is used when ``token`` is not given.
            token: Optional tokens per sentence overriding ``batch['token']``.
        """
        sentences = batch['token'] if token is None else token
        predicted_rules = super().prediction_to_human(pred, vocab, batch)
        for words, word_rules in zip(sentences, predicted_rules):
            yield [apply_lemma_rule(word, rule) for word, rule in zip(words, word_rules)]
class MaskedLanguageModel(TorchComponent):
    """Prediction-only wrapper around a pretrained masked language model; every training hook is unimplemented."""

    def __init__(self, **kwargs) -> None:
        super().__init__(**kwargs)
        self.tokenizer: PreTrainedTokenizer = None  # set by load_vocabs

    def build_dataloader(self, data, batch_size, shuffle=False, device=None, logger: logging.Logger = None,
                         verbose=False, **kwargs) -> DataLoader:
        """Tokenize ``data`` and batch it with a length-sorting sampler.

        Args:
            data: An iterable of texts; each one becomes a sample under the ``'token'`` key.
            batch_size: Forwarded to ``SortingSampler``.
            shuffle: Unused here.
            device: Forwarded to ``PadSequenceDataLoader``.
            logger: Unused here.
            verbose: When truthy, log progress while the samples are iterated once.
            **kwargs: Ignored.
        """
        dataset = MaskedLanguageModelDataset([{'token': x} for x in data], generate_idx=True,
                                             transform=TransformerTextTokenizer(self.tokenizer, text_a_key='token'))
        if verbose:
            verbose = CountdownTimer(len(dataset))
        lens = []
        # One pass over the dataset collects the tokenized length of every sample for the sampler.
        for each in dataset:
            lens.append(len(each['token_input_ids']))
            if verbose:
                verbose.log('Preprocessing and caching samples [blink][yellow]...[/yellow][/blink]')
        dataloader = PadSequenceDataLoader(dataset, batch_sampler=SortingSampler(lens, batch_size=batch_size),
                                           device=device)
        return dataloader

    def build_optimizer(self, **kwargs):
        raise NotImplementedError()

    def build_criterion(self, **kwargs):
        raise NotImplementedError()

    def build_metric(self, **kwargs):
        raise NotImplementedError()

    def execute_training_loop(self, trn: DataLoader, dev: DataLoader, epochs, criterion, optimizer, metric, save_dir,
                              logger: logging.Logger, devices, ratio_width=None, **kwargs):
        raise NotImplementedError()

    def fit_dataloader(self, trn: DataLoader, criterion, optimizer, metric, logger: logging.Logger, **kwargs):
        raise NotImplementedError()

    def evaluate_dataloader(self, data: DataLoader, criterion: Callable, metric=None, output=False, **kwargs):
        raise NotImplementedError()

    def build_model(self, training=True, transformer=None, **kwargs) -> torch.nn.Module:
        """Load the masked language model identified by ``transformer``."""
        return AutoModelForMaskedLM.from_pretrained(transformer)

    def input_is_flat(self, masked_sents):
        """Return True when the input is a single string rather than a list of strings."""
        return isinstance(masked_sents, str)

    def predict(self, masked_sents: Union[str, List[str]], batch_size=32, topk=10, **kwargs):
        """Predict the most probable tokens for every mask token.

        Args:
            masked_sents: A sentence or a list of sentences containing the tokenizer's mask token.
            batch_size: Number of sentences per batch.
            topk: Number of candidates kept per mask.
            **kwargs: Ignored.

        Returns:
            For each sentence, a list with one dict per mask mapping candidate token to probability. A sentence
            without any mask gets its own empty list. A single list is returned when the input is a string.
        """
        flat = self.input_is_flat(masked_sents)
        if flat:
            masked_sents = [masked_sents]
        dataloader = self.build_dataloader(masked_sents, **self.config, device=self.device, batch_size=batch_size)
        orders = []
        results = []
        for batch in dataloader:
            input_ids = batch['token_input_ids']
            outputs = self.model(input_ids=input_ids, attention_mask=batch['token_attention_mask'])
            mask = input_ids == self.tokenizer.mask_token_id
            if mask.any():
                num_masks = mask.sum(dim=-1).tolist()
                masked_logits = outputs.logits[mask]
                # Special tokens must never be proposed as fillers.
                masked_logits[:, self.tokenizer.all_special_ids] = -math.inf
                probs, indices = torch.nn.functional.softmax(masked_logits, dim=-1).topk(topk)
                br = [dict(zip(self.tokenizer.convert_ids_to_tokens(index), p))
                      for p, index in zip(probs.tolist(), indices.tolist())]
                # `br` is flat over all masks of the batch; cut it back into per-sentence slices.
                offset = 0
                for n in num_masks:
                    results.append(br[offset:offset + n])
                    offset += n
            else:
                # A distinct list per sentence: `[[]] * n` would alias one list across all of them.
                results.extend([] for _ in range(input_ids.size(0)))
            orders.extend(batch[IDX])
        # Batches are sorted by length, so restore the input order.
        results = reorder(results, orders)
        if flat:
            results = results[0]
        return results

    def load_config(self, save_dir, filename='config.json', **kwargs):
        """Use ``save_dir`` itself as the transformer identifier; no config file is read."""
        self.config.transformer = save_dir

    def load_vocabs(self, save_dir, filename='vocabs.json'):
        """Load the tokenizer of ``self.config.transformer``; no vocab file is read."""
        self.tokenizer = AutoTokenizer_.from_pretrained(self.config.transformer)

    def load_weights(self, save_dir, filename='model.pt', **kwargs):
        """No-op: ``build_model`` already returns a pretrained model."""
        pass
class MultiTaskDataLoader(DataLoader):
    """A dataloader that multiplexes the dataloaders of several tasks.

    Iterating it yields ``(task_name, batch)`` pairs. In training mode the task of
    each step is drawn at random with a probability proportional to
    ``len(dataloader) ** tau``; otherwise every dataloader is exhausted in turn.
    """

    def __init__(self, training=True, tau: float = 0.8, **dataloaders) -> None:
        """
        Args:
            training: ``True`` to sample tasks randomly, ``False`` to iterate them sequentially.
            tau: Exponent applied to the number of batches of each task when computing
                its sampling weight.
            **dataloaders: The dataloader of each task, keyed by task name.
        """
        # noinspection PyTypeChecker
        super().__init__(None)
        self.tau = tau
        self.training = training
        self.dataloaders: Dict[str, DataLoader] = dataloaders if dataloaders else {}

    def __len__(self) -> int:
        if self.dataloaders:
            return sum(len(x) for x in self.dataloaders.values())
        return 0

    @staticmethod
    def _restarting(dataloader):
        """Yield batches from ``dataloader`` forever, re-iterating it whenever it is exhausted.

        Unlike ``itertools.cycle``, no batch is retained after it has been yielded and
        every pass calls ``iter(dataloader)`` again instead of replaying stored batches.
        An empty dataloader ends the generator instead of spinning forever.
        """
        while True:
            empty = True
            for batch in dataloader:
                empty = False
                yield batch
            if empty:
                return

    def __iter__(self):
        if self.training:
            sampling_weights, total_size = self.sampling_weights
            task_names = list(self.dataloaders.keys())
            iterators = dict((k, self._restarting(v)) for k, v in self.dataloaders.items())
            for _ in range(total_size):
                task_name = np.random.choice(task_names, p=sampling_weights)
                yield task_name, next(iterators[task_name])
        else:
            for task_name, dataloader in self.dataloaders.items():
                for batch in dataloader:
                    yield task_name, batch

    @property
    def sampling_weights(self):
        """The sampling probability of each task and the total number of batches.

        Returns:
            A tuple of ``(weights, total_size)`` where ``weights`` follows the order of
            ``self.dataloaders`` and ``total_size`` is the sum of all dataloader lengths.
        """
        sizes = self.sizes
        total_size = sum(sizes)
        scaled = [pow(v, self.tau) for v in sizes]
        Z = sum(scaled)
        return [v / Z for v in scaled], total_size

    @property
    def sizes(self):
        """The number of batches of each dataloader, in insertion order."""
        return [len(v) for v in self.dataloaders.values()]
def build_dataloader(self, data, batch_size, shuffle=False, device=None, logger: logging.Logger = None,
                     gradient_accumulation=1, tau: float = 0.8, prune=None, prefetch=None,
                     tasks_need_custom_eval=None, cache=False, debug=False, **kwargs) -> DataLoader:
    """Build one dataloader per task and wrap them into a :class:`MultiTaskDataLoader`.

    Args:
        data: ``'trn'``, ``'dev'`` or ``'tst'`` to use the corresponding split declared by
            each task; anything else is passed to every task as-is.
        batch_size: Not used here.
        shuffle: Passed as ``training`` to the :class:`MultiTaskDataLoader`.
        device: The device passed to each task's ``build_dataloader``.
        logger: Used to report progress. It is called whenever ``data`` is a ``str``.
        gradient_accumulation: Passed to each task's ``build_dataloader``.
        tau: The exponent used by :class:`MultiTaskDataLoader` to compute sampling weights.
        prune: Not used (the pruning code is disabled).
        prefetch: If truthy, wrap the result into a ``PrefetchDataLoader``, except for
            non-training data when ``tasks_need_custom_eval`` is set.
        tasks_need_custom_eval: See ``prefetch``.
        cache: If truthy, wrap the ``'trn'`` and ``'dev'`` dataloader of each task into a
            ``CachedDataLoader``. A ``str`` value is used as the directory of the cache file.
        debug: ``True`` to train on the dev set of each task.
        **kwargs: Not used.

    Returns:
        The dataloader multiplexing all tasks.
    """
    # This method is only called during training or evaluation but not prediction
    dataloader = MultiTaskDataLoader(training=shuffle, tau=tau)
    for i, (task_name, task) in enumerate(self.tasks.items()):
        encoder_transform, transform = self.build_transform(task)
        # Stays ``None`` when ``data`` is not one of the named splits
        training = None
        if data == 'trn':
            if debug:
                _data = task.dev
            else:
                _data = task.trn
            training = True
        elif data == 'dev':
            _data = task.dev
            training = False
        elif data == 'tst':
            _data = task.tst
            training = False
        else:
            _data = data
        if isinstance(data, str):
            logger.info(f'[yellow]{i + 1} / {len(self.tasks)}[/yellow] Building [blue]{data}[/blue] dataset for '
                        f'[cyan]{task_name}[/cyan] ...')
        # Adjust Tokenizer according to task config
        config = copy(task.config)
        # ``transform`` is passed positionally below, so it must not be repeated in ``**config``
        config.pop('transform', None)
        task_dataloader: DataLoader = task.build_dataloader(_data, transform, training, device, logger,
                                                            tokenizer=encoder_transform.tokenizer,
                                                            gradient_accumulation=gradient_accumulation,
                                                            cache=isinstance(data, str), **config)
        # NOTE: ``prune`` is accepted for compatibility but pruning is currently disabled.
        if cache and data in ('trn', 'dev'):
            task_dataloader: CachedDataLoader = CachedDataLoader(
                task_dataloader,
                f'{cache}/{os.getpid()}-{data}-{task_name.replace("/", "-")}-cache.pt' if isinstance(cache,
                                                                                                     str) else None
            )
        dataloader.dataloaders[task_name] = task_dataloader
    if data == 'trn':
        # Log a table showing how the tau-scaled sampling redistributes batches among tasks
        sampling_weights, total_size = dataloader.sampling_weights
        headings = ['task', '#batches', '%batches', '#scaled', '%scaled', '#epoch']
        matrix = []
        min_epochs = []
        for (task_name, dataset), weight in zip(dataloader.dataloaders.items(), sampling_weights):
            # Ratio between the real number of batches and the number expected to be sampled
            epochs = len(dataset) / weight / total_size
            matrix.append(
                [f'{task_name}', len(dataset), f'{len(dataset) / total_size:.2%}', int(total_size * weight),
                 f'{weight:.2%}', f'{epochs:.2f}'])
            min_epochs.append(epochs)
        longest = int(torch.argmax(torch.tensor(min_epochs)))
        table = markdown_table(headings, matrix)
        rows = table.splitlines()
        # Highlight the ``#epoch`` cell of the task with the largest value; the offset of 2
        # presumably skips the heading and separator rows of the table
        cells = rows[longest + 2].split('|')
        cells[-2] = cells[-2].replace(f'{min_epochs[longest]:.2f}',
                                      f'[bold][red]{min_epochs[longest]:.2f}[/red][/bold]')
        rows[longest + 2] = '|'.join(cells)
        logger.info(f'[bold][yellow]{"Samples Distribution": ^{len(rows[0])}}[/yellow][/bold]')
        logger.info('\n'.join(rows))
    if prefetch and (data == 'trn' or not tasks_need_custom_eval):
        dataloader = PrefetchDataLoader(dataloader, prefetch=prefetch)
    return dataloader
def build_criterion(self, **kwargs):
    """Build the criterion of every task.

    Args:
        **kwargs: Passed to ``build_criterion`` of each task.

    Returns:
        A ``dict`` mapping each task name to the criterion built by that task for its decoder.
    """
    criteria = {}
    for task_name, task in self.tasks.items():
        decoder = self.model_.decoders[task_name]
        criteria[task_name] = task.build_criterion(decoder=decoder, **kwargs)
    return criteria
# noinspection PyMethodOverriding
def fit_dataloader(self, trn: DataLoader, criterion, optimizer, metric, logger: logging.Logger, history: History,
                   ratio_width=None, gradient_accumulation=1, encoder_grad_norm=None, decoder_grad_norm=None,
                   patience=0.5, eval_trn=False, **kwargs):
    """Train the model for one epoch.

    Args:
        trn: A dataloader yielding ``(task_name, batch)`` pairs.
        criterion: The criterion of each task, keyed by task name.
        optimizer: A tuple of ``(encoder_optimizer, encoder_scheduler, decoder_optimizers)``
            where ``decoder_optimizers`` maps a task name to either an optimizer or an
            ``(optimizer, scheduler)`` tuple.
        metric: The metrics to update when ``eval_trn`` is enabled.
        logger: The logger progress is reported to.
        history: Decides when an optimization step is due given ``gradient_accumulation``.
        ratio_width: Passed to the progress timer.
        gradient_accumulation: Number of batches to accumulate gradients over.
        encoder_grad_norm: If set, the max gradient norm of the encoder parameters.
        decoder_grad_norm: If set, the max gradient norm of the decoder parameters.
        patience: Not used.
        eval_trn: ``True`` to decode and evaluate on training batches.
        **kwargs: Not used.

    Returns:
        The accumulated loss divided by the number of batches.
    """
    self.model.train()
    encoder_optimizer, encoder_scheduler, decoder_optimizers = optimizer
    timer = CountdownTimer(len(trn))
    total_loss = 0
    self.reset_metrics(metric)
    model = self.model_
    # ``parameters()`` returns a one-shot generator. It must be materialized, otherwise it is
    # exhausted by the first ``clip_grad_norm_`` call and every later call clips nothing.
    encoder_parameters = list(model.encoder.parameters())
    decoder_parameters = list(model.decoders.parameters())
    for idx, (task_name, batch) in enumerate(trn):
        decoder_optimizer = decoder_optimizers.get(task_name, None)
        output_dict, _ = self.feed_batch(batch, task_name)
        loss = self.compute_loss(batch, output_dict[task_name]['output'], criterion[task_name],
                                 self.tasks[task_name])
        if gradient_accumulation and gradient_accumulation > 1:
            loss /= gradient_accumulation
        loss.backward()
        total_loss += float(loss.item())
        if history.step(gradient_accumulation):
            if self.config.get('grad_norm', None):
                clip_grad_norm(model, self.config.grad_norm)
            if encoder_grad_norm:
                torch.nn.utils.clip_grad_norm_(encoder_parameters, encoder_grad_norm)
            if decoder_grad_norm:
                torch.nn.utils.clip_grad_norm_(decoder_parameters, decoder_grad_norm)
            encoder_optimizer.step()
            encoder_optimizer.zero_grad()
            encoder_scheduler.step()
            if decoder_optimizer:
                # A task with a separate optimizer may also provide its own scheduler
                if isinstance(decoder_optimizer, tuple):
                    decoder_optimizer, decoder_scheduler = decoder_optimizer
                else:
                    decoder_scheduler = None
                decoder_optimizer.step()
                decoder_optimizer.zero_grad()
                if decoder_scheduler:
                    decoder_scheduler.step()
        if eval_trn:
            self.decode_output(output_dict, batch, task_name)
            self.update_metrics(batch, output_dict, metric, task_name)
        timer.log(self.report_metrics(total_loss / (timer.current + 1), metric if eval_trn else None),
                  ratio_percentage=None,
                  ratio_width=ratio_width,
                  logger=logger)
        # Drop the references to the graph of this step before the next one starts
        del loss
        del output_dict
    return total_loss / timer.total
self.config.get('tasks_need_custom_eval', None) tasks_need_custom_eval = tasks_need_custom_eval or {} tasks_need_custom_eval = dict((k, None) for k in tasks_need_custom_eval) for each in tasks_need_custom_eval: tasks_need_custom_eval[each] = data.dataloaders.pop(each) timer = CountdownTimer(len(data) + len(tasks_need_custom_eval)) total_loss = 0 for idx, (task_name, batch) in enumerate(data): output_dict, _ = self.feed_batch(batch, task_name) loss = self.compute_loss(batch, output_dict[task_name]['output'], criterion[task_name], self.tasks[task_name]) total_loss += loss.item() self.decode_output(output_dict, batch, task_name) self.update_metrics(batch, output_dict, metric, task_name) timer.log(self.report_metrics(total_loss / (timer.current + 1), metric), ratio_percentage=None, logger=logger, ratio_width=ratio_width) del loss del output_dict for task_name, dataset in tasks_need_custom_eval.items(): task = self.tasks[task_name] decoder = self.model_.decoders[task_name] task.evaluate_dataloader( dataset, task.build_criterion(decoder=decoder), metric=metric[task_name], input=task.dev if input == 'dev' else task.tst, split=input, decoder=decoder, h=functools.partial(self._encode, task_name=task_name, cls_is_bos=task.cls_is_bos, sep_is_eos=task.sep_is_eos) ) data.dataloaders[task_name] = dataset timer.log(self.report_metrics(total_loss / (timer.current + 1), metric), ratio_percentage=None, logger=logger, ratio_width=ratio_width) return total_loss / timer.total, metric, data def build_model(self, training=False, **kwargs) -> torch.nn.Module: tasks = self.tasks encoder: ContextualWordEmbedding = self.config.encoder transformer_module = encoder.module(training=training) encoder_size = transformer_module.get_output_dim() scalar_mixes = torch.nn.ModuleDict() decoders = torch.nn.ModuleDict() use_raw_hidden_states = dict() for task_name, task in tasks.items(): decoder = task.build_model(encoder_size, training=training, **task.config) assert decoder, f'Please implement 
def predict(self, data: Union[str, List[str]], tasks: Optional[Union[str, List[str]]] = None,
            skip_tasks: Optional[Union[str, List[str]]] = None, resolved_tasks=None, **kwargs) -> Document:
    """Predict on data.

    The first task of the first group builds the samples and the dataloader. For each
    batch, that task runs first and the remaining tasks run group by group, all sharing
    the ``output_dict`` of the batch.

    Args:
        data: A sentence or a list of sentences.
        tasks: The tasks to predict.
        skip_tasks: The tasks to skip.
        resolved_tasks: The resolved tasks to override ``tasks`` and ``skip_tasks``.
        **kwargs: Not used.

    Returns:
        A :class:`~hanlp_common.document.Document`. If the first task considers ``data``
        flat, every field holds the result of that single sample instead of a list.
    """
    doc = Document()
    target_tasks = resolved_tasks or self.resolve_tasks(tasks, skip_tasks)
    if data == []:
        # Nothing to predict: every requested task gets an empty result
        for group in target_tasks:
            for task_name in group:
                doc[task_name] = []
        return doc
    flatten_target_tasks = [self.tasks[t] for group in target_tasks for t in group]
    # The shared input carries BOS/EOS as soon as any target task asks for it
    cls_is_bos = any([x.cls_is_bos for x in flatten_target_tasks])
    sep_is_eos = any([x.sep_is_eos for x in flatten_target_tasks])
    # Now build the dataloaders and execute tasks
    first_task_name: str = list(target_tasks[0])[0]
    first_task: Task = self.tasks[first_task_name]
    encoder_transform, transform = self.build_transform(first_task)
    # Override the tokenizer config of the 1st task
    encoder_transform.sep_is_eos = sep_is_eos
    encoder_transform.cls_is_bos = cls_is_bos
    average_subwords = self.model.encoder.average_subwords
    flat = first_task.input_is_flat(data)
    if flat:
        data = [data]
    device = self.device
    samples = first_task.build_samples(data, cls_is_bos=cls_is_bos, sep_is_eos=sep_is_eos)
    dataloader = first_task.build_dataloader(samples, transform=transform, device=device)
    results = defaultdict(list)
    # Sample indices in the order they were batched, used to restore the input order
    order = []
    for batch in dataloader:
        order.extend(batch[IDX])
        # Run the first task, let it make the initial batch for the successors
        output_dict = self.predict_task(first_task, first_task_name, batch, results, run_transform=True,
                                        cls_is_bos=cls_is_bos, sep_is_eos=sep_is_eos)
        # Run each task group in order
        for group_id, group in enumerate(target_tasks):
            # We could parallelize this in the future
            for task_name in group:
                if task_name == first_task_name:
                    continue
                output_dict = self.predict_task(self.tasks[task_name], task_name, batch, results, output_dict,
                                                run_transform=True, cls_is_bos=cls_is_bos, sep_is_eos=sep_is_eos)
            if group_id == 0:
                # We are kind of hard coding here. If the first task is a tokenizer,
                # we need to convert the hidden and mask to token level
                if first_task_name.startswith('tok'):
                    spans = []
                    tokens = []
                    output_spans = first_task.config.get('output_spans', None)
                    # Only the tail of the results belongs to the current batch
                    for span_per_sent, token_per_sent in zip(output_dict[first_task_name]['prediction'],
                                                             results[first_task_name][-len(batch[IDX]):]):
                        if output_spans:
                            # Each result is a sequence whose first element is the token
                            token_per_sent = [x[0] for x in token_per_sent]
                        if cls_is_bos:
                            span_per_sent = [(-1, 0)] + span_per_sent
                            token_per_sent = [BOS] + token_per_sent
                        if sep_is_eos:
                            span_per_sent = span_per_sent + [(span_per_sent[-1][0] + 1, span_per_sent[-1][1] + 1)]
                            token_per_sent = token_per_sent + [EOS]
                        # The offsets start with 0 while [CLS] is zero
                        if average_subwords:
                            span_per_sent = [list(range(x[0] + 1, x[1] + 1)) for x in span_per_sent]
                        else:
                            span_per_sent = [x[0] + 1 for x in span_per_sent]
                        spans.append(span_per_sent)
                        tokens.append(token_per_sent)
                    spans = PadSequenceDataLoader.pad_data(spans, 0, torch.long, device=device)
                    output_dict['hidden'] = pick_tensor_for_each_token(output_dict['hidden'], spans,
                                                                       average_subwords)
                    batch['token_token_span'] = spans
                    batch['token'] = tokens
                    # noinspection PyTypeChecker
                    batch['token_length'] = torch.tensor([len(x) for x in tokens], dtype=torch.long,
                                                         device=device)
                    # The old mask no longer matches the new ``token_length``
                    batch.pop('mask', None)
    # Put results into doc in the order of tasks
    for k in self.config.task_names:
        v = results.get(k, None)
        if v is None:
            continue
        doc[k] = reorder(v, order)
    # Allow task to perform finalization on document
    for group in target_tasks:
        for task_name in group:
            task = self.tasks[task_name]
            task.finalize_document(doc, task_name)
    # If no tok in doc, use raw input as tok
    if not any(k.startswith('tok') for k in doc):
        doc['tok'] = data
    if flat:
        # Unwrap the single sample that was wrapped into a list above
        for k, v in list(doc.items()):
            doc[k] = v[0]
    # If there is only one field, don't bother to wrap it
    # if len(doc) == 1:
    #     return list(doc.values())[0]
    return doc
def _resolve_task_name(self, dependencies):
    """Resolve task names, wildcards or prefixes into a set of task names.

    Args:
        dependencies: A ``str`` or an iterable of them. A ``str`` is resolved as an exact
            task name first, then as a wildcard if it ends with ``*``, and finally through
            ``prefix_match`` against ``self.config.task_names``.

    Returns:
        The set of resolved names, which is empty if ``dependencies`` is neither a ``str``
        nor an iterable.
    """
    if isinstance(dependencies, str):
        if dependencies in self.tasks:
            return {dependencies}
        if dependencies.endswith('*'):
            stem = dependencies[:-1]
            return {name for name in self.tasks if name.startswith(stem)}
        prefix_matched = prefix_match(dependencies, self.config.task_names)
        assert prefix_matched, f'No prefix matching for {dependencies}. ' \
                               f'Check your dependencies definition: {list(self.tasks.values())}'
        return {prefix_matched}
    if isinstance(dependencies, Iterable):
        resolved = set()
        for each in dependencies:
            resolved |= self._resolve_task_name(each)
        return resolved
    return set()
def _encode(self, batch, task_name, output_dict=None, cls_is_bos=False, sep_is_eos=False):
    """Get the encoder output a task should decode from.

    Args:
        batch: The batch fed into the encoder if no cached output is available.
        task_name: The name of the task to encode for.
        output_dict: Outputs of previous tasks on this batch. If truthy, its ``hidden`` and
            ``raw_hidden`` are reused instead of running the encoder again.
        cls_is_bos: ``True`` if the hidden states start with a BOS position.
        sep_is_eos: ``True`` if the hidden states end with an EOS position.

    Returns:
        A tuple of the hidden states for this task and the ``output_dict`` holding the
        encoder outputs.
    """
    net = self.model
    if not output_dict:
        encoded = net.encoder(batch)
        if isinstance(encoded, tuple):
            enc_hidden, enc_raw = encoded
        else:
            enc_hidden, enc_raw = encoded, None
        output_dict = {'hidden': enc_hidden, 'raw_hidden': enc_raw}
    else:
        enc_hidden = output_dict['hidden']
        enc_raw = output_dict['raw_hidden']

    states = enc_raw if net.use_raw_hidden_states[task_name] else enc_hidden
    mixes = net.scalar_mixes
    if task_name in mixes:
        h = mixes[task_name](states)
    elif mixes:
        # If any task enables scalar_mix, hidden_states will be a 4d tensor
        h = states[-1, :, :, :]
    else:
        h = states

    # Strip the positions the shared input has but this task does not expect
    task = self.tasks[task_name]
    if cls_is_bos and not task.cls_is_bos:
        h = h[:, 1:, :]
    if sep_is_eos and not task.sep_is_eos:
        h = h[:, :-1, :]
    return h, output_dict
def __delitem__(self, task_name: str):
    """Delete a task (and every resource it owns) from this component.

    Besides removing the task itself, the cached scheduling structures are kept
    consistent: the task is removed from the dependencies of the remaining tasks, groups
    left empty are dropped, and the group index of each remaining task is refreshed.
    Otherwise a later prediction would pick its first task from an empty group or look up
    the deleted task while resolving dependencies.

    Args:
        task_name: The name of the task to be deleted.

    Raises:
        KeyError: If there is no such task.

    Examples:
        >>> del mtl['dep']  # Delete dep from MTL
    """
    del self.config[task_name]
    self.config.task_names.remove(task_name)
    del self.tasks[task_name]
    del self.model.decoders[task_name]
    # NOTE(review): ``model.scalar_mixes`` is left untouched on purpose, since ``_encode``
    # relies on it being non-empty to tell whether hidden states are stacked per layer.
    del self._computation_graph[task_name]
    for dependencies in self._computation_graph.values():
        dependencies.discard(task_name)
    self._task_topological_order.pop(task_name)
    remaining_groups = []
    for group in self._tasks_in_topological_order:
        group: set = group
        group.discard(task_name)
        if group:
            remaining_groups.append(group)
    # Update in place so that any alias of the cached order stays valid
    self._tasks_in_topological_order[:] = remaining_groups
    for i, group in enumerate(remaining_groups):
        for name in group:
            self._task_topological_order[name] = i
# noinspection PyMissingConstructor
def __init__(self,
             trn: str = None,
             dev: str = None,
             tst: str = None,
             sampler_builder: SamplerBuilder = None,
             dependencies: str = None,
             scalar_mix: ScalarMixWithDropoutBuilder = None,
             use_raw_hidden_states=False,
             lr=None,
             separate_optimizer=False,
             cls_is_bos=False,
             sep_is_eos=False,
             **kwargs) -> None:
    """
    A task in the multi-task learning framework.

    Args:
        trn: Path to training set.
        dev: Path to dev set.
        tst: Path to test set.
        sampler_builder: A builder which builds a sampler. Defaults to a
            ``SortingSamplerBuilder`` with a batch size of 32.
        dependencies: Its dependencies on other tasks.
        scalar_mix: A builder which builds a `ScalarMixWithDropout` object.
        use_raw_hidden_states: Whether to use raw hidden states from transformer without any pooling.
        lr: Learning rate for this task.
        separate_optimizer: Use customized separate optimizer for this task.
        cls_is_bos: ``True`` to treat the first token as ``BOS``.
        sep_is_eos: ``True`` to treat the last token as ``EOS``.
        **kwargs: Additional config.
    """
    # Must stay the first statement: ``locals()`` is snapshotted here, so any local
    # variable defined earlier would leak into the tracked config.
    ConfigTracker.__init__(self, merge_locals_kwargs(locals(), kwargs))
    for f, n in zip([trn, dev, tst], ['trn', 'dev', 'tst']):
        if f and os.path.isfile(f):  # anonymize local file names
            self.config.pop(n)
    self.separate_optimizer = separate_optimizer
    self.lr = lr
    self.use_raw_hidden_states = use_raw_hidden_states
    if sampler_builder is None:
        sampler_builder = SortingSamplerBuilder(batch_size=32)
    self.sampler_builder: Union[SortingSamplerBuilder, KMeansSamplerBuilder] = sampler_builder
    self.dependencies = dependencies
    # The paths stay available as attributes even when removed from the config above
    self.tst = tst
    self.dev = dev
    self.trn = trn
    self.scalar_mix = scalar_mix
    self.cls_is_bos = cls_is_bos
    self.sep_is_eos = sep_is_eos
Args: data: Either a path or a list of samples. transform: The transform from MTL, which is usually [TransformerSequenceTokenizer, FieldLength('token')] training: Whether this method is called on training set. device: The device dataloader is intended to work with. logger: Logger for printing message indicating progress. cache: Whether the dataloader should be cached. gradient_accumulation: Gradient accumulation to be passed to sampler builder. **kwargs: Additional experimental arguments. """ pass def build_optimizer(self, decoder: torch.nn.Module, **kwargs): pass def build_batch_wise_scheduler(self, decoder: torch.nn.Module, **kwargs): pass @abstractmethod def compute_loss(self, batch: Dict[str, Any], output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any], criterion, ) -> Union[torch.FloatTensor, Dict[str, torch.FloatTensor]]: pass @abstractmethod def decode_output(self, output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any], mask: torch.BoolTensor, batch: Dict[str, Any], decoder: torch.nn.Module, **kwargs) -> Union[Dict[str, Any], Any]: pass @abstractmethod def update_metrics(self, batch: Dict[str, Any], output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any], prediction: Dict[str, Any], metric: Union[MetricDict, Metric]): pass # noinspection PyMethodOverriding @abstractmethod def build_model(self, encoder_size, training=True, **kwargs) -> torch.nn.Module: pass @abstractmethod def build_metric(self, **kwargs): pass def fit_dataloader(self, trn: DataLoader, criterion, optimizer, metric, logger: logging.Logger, **kwargs): pass def evaluate_dataloader(self, data: DataLoader, criterion: Callable, output=False, **kwargs): pass def execute_training_loop(self, trn: DataLoader, dev: DataLoader, epochs, criterion, optimizer, metric, save_dir, logger: logging.Logger, devices, **kwargs): pass # noinspection PyMethodMayBeStatic def compute_lens(self, data: Union[List[Dict[str, Any]], str], 
dataset: TransformableDataset, input_ids='token_input_ids'): """ Args: data: Samples to be measured or path to dataset during training time. dataset: During training time, use this dataset to measure the length of each sample inside. input_ids: Field name corresponds to input ids. Returns: Length list of this samples """ if dataset.cache is None: warnings.warn(f'Caching for the dataset is not enabled, ' f'try `dataset.purge_cache()` if possible. The dataset is {dataset}.') if isinstance(data, str): timer = CountdownTimer(len(dataset)) for each in dataset: timer.log('Preprocessing and caching samples [blink][yellow]...[/yellow][/blink]') timer.erase() return [len(x[input_ids]) for x in dataset] def feed_batch(self, h: torch.FloatTensor, batch: Dict[str, torch.Tensor], mask: torch.BoolTensor, decoder: torch.nn.Module): return decoder(h, batch=batch, mask=mask) def input_is_flat(self, data) -> bool: """ Check whether the data is flat (meaning that it's only a single sample, not even batched). Returns: bool: ``True`` to indicate the input data is flat. """ raise NotImplementedError( '`input_is_flat()` needs to be implemented for the task component to accept raw input from user.' ) @abstractmethod def prediction_to_result(self, prediction: Dict[str, Any], batch: Dict[str, Any]) -> List: raise NotImplementedError() # noinspection PyMethodMayBeStatic def transform_batch(self, batch: Dict[str, Any], # inputs: List[List[str]], results: Dict[str, Any] = None, cls_is_bos=False, sep_is_eos=False) -> Dict[str, Any]: """ Let the task transform the batch before feeding the batch into its decoder. The default behavior is to adjust the head and tail of tokens, according to ``cls_is_bos``, ``sep_is_eos`` passed in and the two settings of the task itself. Args: batch: A batch of samples. results: Predicted results from other tasks which might be useful for this task to utilize. 
Say a dep task uses both token and pos as features, then it will need both tok and pos results to make a batch. cls_is_bos: First token in this batch is BOS. sep_is_eos: Last token in this batch is EOS. Returns: A batch. """ if cls_is_bos != self.cls_is_bos or sep_is_eos != self.sep_is_eos: batch = copy(batch) tokens = self._adjust_token(batch, cls_is_bos, sep_is_eos, 'token') delta = len(tokens[0]) - len(batch['token'][0]) batch['token_length'] = batch['token_length'] + delta batch['token'] = tokens if 'token_' in batch: if isinstance(batch['token_'][0], list): batch['token_'] = self._adjust_token(batch, cls_is_bos, sep_is_eos, 'token_') else: batch['token_'] = tokens return batch def _adjust_token(self, batch, cls_is_bos, sep_is_eos, token_key): tokens = [] for sent in batch[token_key]: if cls_is_bos: if not self.cls_is_bos: sent = sent[1:] elif self.cls_is_bos: sent = [BOS] + sent if sep_is_eos: if not self.sep_is_eos: sent = sent[:-1] elif self.sep_is_eos: sent = sent + [EOS] tokens.append(sent) return tokens # noinspection PyMethodMayBeStatic def build_samples(self, inputs, cls_is_bos=False, sep_is_eos=False): """ Build samples for this task. Called when this task is the first task. Default behaviour is to take inputs as list of tokens and put these tokens into a dict per sample. Args: inputs: Inputs from users, usually a list of lists of tokens. cls_is_bos: Insert BOS to the head of each sentence. sep_is_eos: Append EOS to the tail of each sentence. Returns: List of samples. """ if cls_is_bos: inputs = [[BOS] + x for x in inputs] if sep_is_eos: inputs = [x + [EOS] for x in inputs] return [{'token': token} for token in inputs] def build_tokenizer(self, tokenizer: TransformerSequenceTokenizer): """Build a transformer tokenizer for this task. Args: tokenizer: A tokenizer which is shared but can be adjusted to provide per-task settings. Returns: A TransformerSequenceTokenizer. 
""" if tokenizer.cls_is_bos != self.cls_is_bos or tokenizer.sep_is_eos != self.sep_is_eos: tokenizer = copy(tokenizer) tokenizer.cls_is_bos = self.cls_is_bos tokenizer.sep_is_eos = self.sep_is_eos return tokenizer # noinspection PyMethodMayBeStatic def finalize_document(self, doc: Document, task_name: str): pass ================================================ FILE: hanlp/components/mtl/tasks/amr.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2020-12-12 16:05 import logging from typing import Dict, Any, List, Union, Iterable, Callable import torch from stog.data.dataset_readers.amr_parsing.amr import AMRGraph from stog.data.dataset_readers.amr_parsing.node_utils import NodeUtilities from stog.data.dataset_readers.amr_parsing.postprocess.node_restore import NodeRestore from torch.utils.data import DataLoader from hanlp_common.constant import CLS from hanlp.common.dataset import PrefetchDataLoader, SamplerBuilder from hanlp.common.transform import VocabDict from hanlp.components.amr.amr_parser.graph_amr_decoder import GraphAbstractMeaningRepresentationDecoder from hanlp.components.amr.amr_parser.graph_parser import GraphAbstractMeaningRepresentationParser from hanlp.components.amr.amr_parser.postprocess import PostProcessor from hanlp.components.amr.amr_parser.work import parse_batch from hanlp.components.mtl.tasks import Task from hanlp.datasets.parsing.amr import batchify, get_concepts from hanlp.layers.scalar_mix import ScalarMixWithDropoutBuilder from hanlp.metrics.amr.smatch_eval import SmatchScores, get_amr_utils from hanlp.metrics.f1 import F1_ from hanlp.metrics.metric import Metric from hanlp.metrics.mtl import MetricDict from hanlp.utils.io_util import get_resource from hanlp_common.util import merge_list_of_dict, merge_locals_kwargs class GraphAbstractMeaningRepresentationParsing(Task, GraphAbstractMeaningRepresentationParser): def __init__(self, trn: str = None, dev: str = None, tst: str = None, 
sampler_builder: SamplerBuilder = None, dependencies: str = None, scalar_mix: ScalarMixWithDropoutBuilder = None, use_raw_hidden_states=False, lr=1e-3, separate_optimizer=False, cls_is_bos=True, sep_is_eos=False, char2concept_dim=128, cnn_filters=((3, 256),), concept_char_dim=32, concept_dim=300, dropout=0.2, embed_dim=512, eval_every=20, ff_embed_dim=1024, graph_layers=2, inference_layers=4, num_heads=8, rel_dim=100, snt_layers=4, unk_rate=0.33, vocab_min_freq=5, beam_size=8, alpha=0.6, max_time_step=100, amr_version='2.0', **kwargs) -> None: super().__init__(**merge_locals_kwargs(locals(), kwargs)) self.vocabs = VocabDict() utils_dir = get_resource(get_amr_utils(amr_version)) self.sense_restore = NodeRestore(NodeUtilities.from_json(utils_dir)) def build_dataloader(self, data, transform: Callable = None, training=False, device=None, logger: logging.Logger = None, cache=False, gradient_accumulation=1, **kwargs) -> DataLoader: if isinstance(data, list): data = GraphAbstractMeaningRepresentationParser.build_samples(self, data) dataset, lens = GraphAbstractMeaningRepresentationParser.build_dataset(self, data, logger=logger, transform=transform, training=training) if self.vocabs.mutable: GraphAbstractMeaningRepresentationParser.build_vocabs(self, dataset, logger) dataloader = PrefetchDataLoader( DataLoader(batch_sampler=self.sampler_builder.build(lens, shuffle=training, gradient_accumulation=gradient_accumulation), dataset=dataset, collate_fn=merge_list_of_dict, num_workers=0), batchify=self.build_batchify(device, training), prefetch=None) return dataloader def compute_loss(self, batch: Dict[str, Any], output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any], criterion) -> Union[torch.FloatTensor, Dict[str, torch.FloatTensor]]: concept_loss, arc_loss, rel_loss, graph_arc_loss = output concept_loss, concept_correct, concept_total = concept_loss rel_loss, rel_correct, rel_total = rel_loss loss = concept_loss + arc_loss + rel_loss return loss def 
decode_output(self, output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any], mask: torch.BoolTensor, batch: Dict[str, Any], decoder: torch.nn.Module, **kwargs) -> Union[Dict[str, Any], Any]: return output def update_metrics(self, batch: Dict[str, Any], output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any], prediction: Dict[str, Any], metric: Union[MetricDict, Metric]): pass def build_model(self, encoder_size, training=True, **kwargs) -> torch.nn.Module: return GraphAbstractMeaningRepresentationDecoder(vocabs=self.vocabs, encoder_size=encoder_size, **self.config) def build_metric(self, **kwargs): return SmatchScores({'Smatch': F1_(0, 0, 0)}) def input_is_flat(self, data) -> bool: return GraphAbstractMeaningRepresentationParser.input_is_flat(self, data) def prediction_to_result(self, prediction: Dict[str, Any], batch: Dict[str, Any]) -> List: pp = PostProcessor(self.vocabs['rel']) for concept, relation, score in zip(prediction['concept'], prediction['relation'], prediction['score']): amr = pp.to_amr(concept, relation) amr_graph = AMRGraph(amr) self.sense_restore.restore_graph(amr_graph) yield amr_graph def evaluate_dataloader(self, data: DataLoader, criterion: Callable, metric=None, output=False, input=None, decoder=None, h=None, split=None, **kwargs): # noinspection PyTypeChecker GraphAbstractMeaningRepresentationParser.evaluate_dataloader(self, data, logger=None, metric=metric, input=input, model=decoder, h=lambda x: h(x)[0], use_fast=True) def feed_batch(self, h: torch.FloatTensor, batch: Dict[str, torch.Tensor], mask: torch.BoolTensor, decoder: torch.nn.Module): if decoder.training: return super().feed_batch(h, batch, mask, decoder) beam_size = self.config.get('beam_size', 8) alpha = self.config.get('alpha', 0.6) max_time_step = self.config.get('max_time_step', 100) res = parse_batch(decoder, batch, beam_size, alpha, max_time_step, h=h) return res def transform_batch(self, batch: Dict[str, Any], results: 
Dict[str, Any] = None, cls_is_bos=False, sep_is_eos=False) -> Dict[str, Any]: batch = super().transform_batch(batch, results, cls_is_bos, sep_is_eos) batch['lemma'] = [[CLS] + x for x in results['lem']] copy_seq = merge_list_of_dict( [get_concepts({'token': t[1:], 'lemma': l[1:]}, self.vocabs.predictable_concept) for t, l in zip(batch['token'], batch['lemma'])]) copy_seq.pop('token') copy_seq.pop('lemma') batch.update(copy_seq) ret = batchify(batch, self.vocabs, device=batch['token_input_ids'].device) return ret ================================================ FILE: hanlp/components/mtl/tasks/constituency.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2020-11-29 16:52 import logging from typing import Dict, Any, List, Union, Iterable, Callable import torch from phrasetree.tree import Tree from hanlp_common.constant import BOS, EOS from hanlp_common.document import Document from hanlp.components.parsers.biaffine.biaffine_dep import BiaffineDependencyParser from torch.utils.data import DataLoader from hanlp.common.dataset import SamplerBuilder, PadSequenceDataLoader from hanlp.common.transform import VocabDict from hanlp.components.mtl.tasks import Task from hanlp.components.parsers.constituency.crf_constituency_model import CRFConstituencyDecoder from hanlp.components.parsers.constituency.crf_constituency_parser import CRFConstituencyParser from hanlp.layers.scalar_mix import ScalarMixWithDropoutBuilder from hanlp.metrics.metric import Metric from hanlp.metrics.mtl import MetricDict from hanlp.utils.time_util import CountdownTimer from hanlp_common.util import merge_locals_kwargs, prefix_match class CRFConstituencyParsing(Task, CRFConstituencyParser): def __init__(self, trn: str = None, dev: str = None, tst: str = None, sampler_builder: SamplerBuilder = None, dependencies: str = None, scalar_mix: ScalarMixWithDropoutBuilder = None, use_raw_hidden_states=False, lr=None, separate_optimizer=False, cls_is_bos=True, 
sep_is_eos=True, delete=('', ':', '``', "''", '.', '?', '!', '-NONE-', 'TOP', ',', 'S1'), equal=(('ADVP', 'PRT'),), mbr=True, n_mlp_span=500, n_mlp_label=100, mlp_dropout=.33, no_subcategory=True, **kwargs ) -> None: r"""Two-stage CRF Parsing (:cite:`ijcai2020-560`). Args: trn: Path to training set. dev: Path to dev set. tst: Path to test set. sampler_builder: A builder which builds a sampler. dependencies: Its dependencies on other tasks. scalar_mix: A builder which builds a `ScalarMixWithDropout` object. use_raw_hidden_states: Whether to use raw hidden states from transformer without any pooling. lr: Learning rate for this task. separate_optimizer: Use customized separate optimizer for this task. cls_is_bos: ``True`` to treat the first token as ``BOS``. sep_is_eos: ``True`` to treat the last token as ``EOS``. delete: Constituencies to be deleted from training and evaluation. equal: Constituencies that are regarded as equal during evaluation. mbr: ``True`` to enable Minimum Bayes Risk (MBR) decoding (:cite:`smith-smith-2007-probabilistic`). n_mlp_span: Number of features for span decoder. n_mlp_label: Number of features for label decoder. mlp_dropout: Dropout applied to MLPs. no_subcategory: Strip out subcategories. **kwargs: Not used. 
""" if isinstance(equal, tuple): equal = dict(equal) super().__init__(**merge_locals_kwargs(locals(), kwargs)) self.vocabs = VocabDict() # noinspection DuplicatedCode def build_dataloader(self, data, transform: Callable = None, training=False, device=None, logger: logging.Logger = None, cache=False, gradient_accumulation=1, **kwargs) -> DataLoader: dataset = CRFConstituencyParsing.build_dataset(self, data, transform) dataset.purge_cache() if self.vocabs.mutable: CRFConstituencyParsing.build_vocabs(self, dataset, logger) if isinstance(data, str): timer = CountdownTimer(len(dataset)) # noinspection PyCallByClass BiaffineDependencyParser.cache_dataset(self, dataset, timer, training, logger) return PadSequenceDataLoader( batch_sampler=self.sampler_builder.build(self.compute_lens(data, dataset), shuffle=training, gradient_accumulation=gradient_accumulation), device=device, dataset=dataset) def feed_batch(self, h: torch.FloatTensor, batch: Dict[str, torch.Tensor], mask: torch.BoolTensor, decoder: torch.nn.Module): return { 'output': decoder(h), 'mask': CRFConstituencyParser.compute_mask( self, batch, offset=1 if 'constituency' in batch or batch['token'][0][-1] == EOS else -1) } def compute_loss(self, batch: Dict[str, Any], output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any], criterion) -> Union[torch.FloatTensor, Dict[str, torch.FloatTensor]]: out, mask = output['output'], output['mask'] loss, span_probs = CRFConstituencyParser.compute_loss(self, out, batch['chart_id'], mask, crf_decoder=criterion) output['span_probs'] = span_probs return loss def decode_output(self, output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any], mask: torch.BoolTensor, batch: Dict[str, Any], decoder: torch.nn.Module, **kwargs) -> Union[Dict[str, Any], Any]: out, mask = output['output'], output['mask'] tokens = [] for sent in batch['token']: if sent[0] == BOS: sent = sent[1:] if sent[-1] == EOS: sent = sent[:-1] tokens.append(sent) return 
CRFConstituencyParser.decode_output(self, out, mask, batch, output.get('span_probs', None), decoder=decoder, tokens=tokens) def update_metrics(self, batch: Dict[str, Any], output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any], prediction: Dict[str, Any], metric: Union[MetricDict, Metric]): return CRFConstituencyParser.update_metrics(self, metric, batch, prediction) def build_model(self, encoder_size, training=True, **kwargs) -> torch.nn.Module: return CRFConstituencyDecoder(n_labels=len(self.vocabs.chart), n_hidden=encoder_size) def build_metric(self, **kwargs): return CRFConstituencyParser.build_metric(self) def input_is_flat(self, data) -> bool: return CRFConstituencyParser.input_is_flat(self, data) def prediction_to_result(self, prediction: List, batch: Dict[str, Any]) -> List: return prediction def finalize_document(self, doc: Document, task_name: str): pos_key = prefix_match('pos', doc) pos: List[List[str]] = doc.get(pos_key, None) if pos: for tree, pos_per_sent in zip(doc[task_name], pos): tree: Tree = tree offset = 0 for subtree in tree.subtrees(lambda t: t.height() == 2): tag = subtree.label() if tag == '_': subtree.set_label(pos_per_sent[offset]) offset += 1 def build_samples(self, inputs, cls_is_bos=False, sep_is_eos=False): return CRFConstituencyParser.build_samples(self, inputs) ================================================ FILE: hanlp/components/mtl/tasks/dep.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2020-08-13 21:39 import logging from typing import Dict, Any, Union, Iterable, List import torch from torch.optim import Adam from torch.optim.lr_scheduler import ExponentialLR from torch.utils.data import DataLoader from hanlp.common.dataset import SamplerBuilder, PadSequenceDataLoader from hanlp.common.transform import VocabDict, TransformList from hanlp.components.mtl.tasks import Task from hanlp.components.parsers.biaffine.biaffine_dep import 
BiaffineDependencyParser
from hanlp.components.parsers.biaffine.biaffine_model import BiaffineDecoder
from hanlp.datasets.parsing.loaders.conll_dataset import append_bos
from hanlp.layers.scalar_mix import ScalarMixWithDropoutBuilder
from hanlp.metrics.metric import Metric
from hanlp.metrics.mtl import MetricDict
from hanlp.utils.time_util import CountdownTimer
from hanlp_common.constant import EOS
from hanlp_common.util import merge_locals_kwargs


class BiaffineDependencyParsing(Task, BiaffineDependencyParser):
    def __init__(self,
                 trn: str = None,
                 dev: str = None,
                 tst: str = None,
                 sampler_builder: SamplerBuilder = None,
                 dependencies: str = None,
                 scalar_mix: ScalarMixWithDropoutBuilder = None,
                 use_raw_hidden_states=False,
                 lr=2e-3,
                 separate_optimizer=False,
                 cls_is_bos=True,
                 sep_is_eos=False,
                 punct=False,
                 tree=False,
                 proj=False,
                 n_mlp_arc=500,
                 n_mlp_rel=100,
                 mlp_dropout=.33,
                 mu=.9,
                 nu=.9,
                 epsilon=1e-12,
                 decay=.75,
                 decay_steps=5000,
                 use_pos=False,
                 max_seq_len=None,
                 **kwargs) -> None:
        """Biaffine dependency parsing (:cite:`dozat:17a`).

        Args:
            trn: Path to training set.
            dev: Path to dev set.
            tst: Path to test set.
            sampler_builder: A builder which builds a sampler.
            dependencies: Its dependencies on other tasks.
            scalar_mix: A builder which builds a `ScalarMixWithDropout` object.
            use_raw_hidden_states: Whether to use raw hidden states from transformer without any pooling.
            lr: Learning rate for this task.
            separate_optimizer: Use customized separate optimizer for this task.
            cls_is_bos: ``True`` to treat the first token as ``BOS``.
            sep_is_eos: ``True`` to treat the last token as ``EOS``.
            punct: ``True`` to include punctuations in evaluation.
            tree: ``True`` to enforce tree constraint.
            proj: ``True`` for projective parsing.
            n_mlp_arc: Number of features for arc representation.
            n_mlp_rel: Number of features for rel representation.
            mlp_dropout: Dropout applied to MLPs.
            mu: First coefficient used for computing running averages of gradient and its square in Adam.
            nu: Second coefficient used for computing running averages of gradient and its square in Adam.
            epsilon: Term added to the denominator to improve numerical stability
            decay: Decay rate for exponential lr scheduler.
            decay_steps: Decay every ``decay_steps`` steps.
            use_pos: Use pos feature.
            max_seq_len: Prune samples longer than this length.
            **kwargs: Not used.
        """
        # ``locals()`` has to be captured before any other local is bound.
        super().__init__(**merge_locals_kwargs(locals(), kwargs))
        self.vocabs = VocabDict()

    def update_metrics(self,
                       batch: Dict[str, Any],
                       output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
                       prediction: Dict[str, Any],
                       metric: Union[MetricDict, Metric]):
        """Feed the unpacked ``prediction``, gold arcs/relations and the mask in ``output[1]`` to the parser.

        The punctuation mask is optional; ``None`` is passed when the batch has no ``punct_mask``.
        """
        punct_mask = batch.get('punct_mask', None)
        BiaffineDependencyParser.update_metric(self, *prediction, batch['arc'], batch['rel_id'], output[1],
                                               punct_mask, metric, batch)

    def decode_output(self,
                      output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
                      mask: torch.BoolTensor,
                      batch: Dict[str, Any],
                      decoder,
                      **kwargs) -> Union[Dict[str, Any], Any]:
        """Decode with the scores and mask packed in ``output``; the ``mask`` argument is overridden."""
        scores, mask = output
        arc_scores, rel_scores = scores
        return BiaffineDependencyParser.decode(self, arc_scores, rel_scores, mask, batch)

    def compute_loss(self,
                     batch: Dict[str, Any],
                     output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
                     criterion) -> \
            Union[torch.FloatTensor, Dict[str, torch.FloatTensor]]:
        """Compute the parser loss from the scores and mask packed in ``output`` against the gold batch."""
        scores, mask = output
        arc_scores, rel_scores = scores
        return BiaffineDependencyParser.compute_loss(self, arc_scores, rel_scores, batch['arc'], batch['rel_id'],
                                                     mask, criterion, batch)

    def build_model(self, encoder_size, training=True, **kwargs) -> torch.nn.Module:
        """Build a ``BiaffineDecoder`` sized from the config and the ``rel`` vocab."""
        config = self.config
        return BiaffineDecoder(encoder_size,
                               config.n_mlp_arc,
                               config.n_mlp_rel,
                               config.mlp_dropout,
                               len(self.vocabs.rel))

    def build_metric(self, **kwargs):
        """Delegate to ``BiaffineDependencyParser.build_metric``."""
        return BiaffineDependencyParser.build_metric(self, **kwargs)

    def build_dataloader(self,
                         data,
                         transform: TransformList = None,
                         training=False,
                         device=None,
                         logger: logging.Logger = None,
                         gradient_accumulation=1,
                         **kwargs) -> DataLoader:
        """Build a padded dataloader over ``data``.

        ``append_bos`` is inserted at the front of ``transform`` in place.
        NOTE(review): this mutates the caller's list and fails when ``transform`` is left at
        its ``None`` default - confirm every caller passes a fresh ``TransformList``.

        Caching and pruning by ``max_seq_len`` only happen when ``data`` is a ``str``.
        """
        transform.insert(0, append_bos)
        dataset = BiaffineDependencyParser.build_dataset(self, data, transform)
        dataset.purge_cache()
        if self.vocabs.mutable:
            BiaffineDependencyParser.build_vocabs(self, dataset, logger, transformer=True)

        from_path = isinstance(data, str)
        if from_path:
            progress = CountdownTimer(len(dataset))
            BiaffineDependencyParser.cache_dataset(self, dataset, progress, training, logger)
        max_seq_len = self.config.get('max_seq_len', None)
        if max_seq_len and from_path:
            dataset.prune(lambda sample: len(sample['token_input_ids']) > max_seq_len, logger)

        lens = self.compute_lens(data, dataset)
        batch_sampler = self.sampler_builder.build(lens, shuffle=training,
                                                   gradient_accumulation=gradient_accumulation)
        return PadSequenceDataLoader(batch_sampler=batch_sampler,
                                     device=device,
                                     dataset=dataset,
                                     pad=self.get_pad_dict())

    def feed_batch(self,
                   h: torch.FloatTensor,
                   batch: Dict[str, torch.Tensor],
                   mask: torch.BoolTensor,
                   decoder: torch.nn.Module):
        """Run the default forward pass and return it with a copy of ``mask`` whose first column is zeroed."""
        logits = super().feed_batch(h, batch, mask, decoder)
        # Work on a clone so the caller's mask is left untouched.
        masked = mask.clone()
        masked[:, 0] = 0
        return logits, masked

    def build_optimizer(self, decoder: torch.nn.Module, **kwargs):
        """Return an ``Adam`` optimizer and an ``ExponentialLR`` scheduler configured from ``self.config``.

        The scheduler's gamma is ``decay ** (1 / decay_steps)``.
        """
        config = self.config
        betas = (config.mu, config.nu)
        optimizer = Adam(decoder.parameters(), config.lr, betas, config.epsilon)
        gamma = config.decay ** (1 / config.decay_steps)
        scheduler = ExponentialLR(optimizer, gamma)
        return optimizer, scheduler

    def input_is_flat(self, data) -> bool:
        """Delegate to ``BiaffineDependencyParser.input_is_flat`` with the configured ``use_pos``."""
        return BiaffineDependencyParser.input_is_flat(self, data, self.config.use_pos)

    def prediction_to_result(self, prediction: Dict[str, Any], batch: Dict[str, Any]) -> List:
        """Lazily yield, per sentence, a list of ``(arc, relation label)`` pairs.

        Note that this is a generator, so nothing is computed until it is iterated.
        """
        arcs, rels = prediction
        # Skip the ROOT
        arcs = arcs[:, 1:].tolist()
        rels = rels[:, 1:].tolist()
        idx_to_rel = self.vocabs['rel'].idx_to_token
        for arcs_per_sent, rels_per_sent, tokens in zip(arcs, rels, batch['token']):
            # The first token is dropped as well before measuring the sentence.
            sent_len = len(tokens[1:])
            labels = [idx_to_rel[r] for r in rels_per_sent[:sent_len]]
            yield list(zip(arcs_per_sent[:sent_len], labels))

    def build_samples(self, inputs, cls_is_bos=False, sep_is_eos=False):
        """Wrap each input under the ``FORM`` key, appending ``EOS`` when ``sep_is_eos`` is set.

        ``cls_is_bos`` is not used here.
        """
        suffix = [EOS] if sep_is_eos else []
        return [{'FORM': token + suffix} for token in inputs]
================================================ FILE: hanlp/components/mtl/tasks/dep_2nd.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2020-08-07 14:14 import logging from typing import Dict, Any, Union, Iterable, Callable, List import torch from hanlp_common.util import merge_locals_kwargs from torch.utils.data import DataLoader import hanlp.utils.torch_util from hanlp.common.dataset import SamplerBuilder, PadSequenceDataLoader from hanlp.common.transform import VocabDict from hanlp.components.mtl.tasks import Task from hanlp.components.parsers.biaffine.biaffine_2nd_dep import BiaffineSecondaryParser, BiaffineJointDecoder, \ BiaffineSeparateDecoder from hanlp.layers.scalar_mix import ScalarMixWithDropoutBuilder from hanlp.metrics.metric import Metric from hanlp.metrics.mtl import MetricDict class BiaffineSecondaryDependencyDecoder(torch.nn.Module): def __init__(self, hidden_size, config) -> None: super().__init__() self.decoder = BiaffineJointDecoder(hidden_size, config) if config.joint \ else BiaffineSeparateDecoder(hidden_size, config) def forward(self, contextualized_embeddings: torch.FloatTensor, batch: Dict[str, torch.Tensor], mask=None): if mask is None: mask = hanlp.utils.torch_util.lengths_to_mask(batch['token_length']) else: mask = mask.clone() scores = self.decoder(contextualized_embeddings, mask) mask[:, 0] = 0 return scores, mask class BiaffineSecondaryDependencyParsing(Task, BiaffineSecondaryParser): def __init__(self, trn: str = None, dev: str = None, tst: str = None, sampler_builder: SamplerBuilder = None, dependencies: str = None, scalar_mix: ScalarMixWithDropoutBuilder = None, use_raw_hidden_states=False, lr=2e-3, separate_optimizer=False, punct=False, tree=False, apply_constraint=True, n_mlp_arc=500, n_mlp_rel=100, mlp_dropout=.33, pad_rel=None, joint=True, mu=.9, nu=.9, epsilon=1e-12, cls_is_bos=True, **kwargs) -> None: super().__init__(**merge_locals_kwargs(locals(), kwargs)) self.vocabs = 
VocabDict() def build_dataloader(self, data, transform: Callable = None, training=False, device=None, logger: logging.Logger = None, gradient_accumulation=1, **kwargs) -> DataLoader: dataset = BiaffineSecondaryParser.build_dataset(self, data, transform) dataset.purge_cache() if self.vocabs.mutable: BiaffineSecondaryParser.build_vocabs(self, dataset, logger, transformer=True) return PadSequenceDataLoader( batch_sampler=self.sampler_builder.build(self.compute_lens(data, dataset), shuffle=training, gradient_accumulation=gradient_accumulation), device=device, dataset=dataset, pad={'arc': 0, 'arc_2nd': False}) def update_metrics(self, batch: Dict[str, Any], output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any], prediction: Dict[str, Any], metric: Union[MetricDict, Metric]): BiaffineSecondaryParser.update_metric(self, *prediction, batch['arc'], batch['rel_id'], output[1], batch['punct_mask'], metric, batch) def decode_output(self, output: Dict[str, Any], batch: Dict[str, Any], decoder, **kwargs) \ -> Union[Dict[str, Any], Any]: return BiaffineSecondaryParser.decode(self, *output[0], output[1], batch) def compute_loss(self, batch: Dict[str, Any], output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any], criterion) -> \ Union[torch.FloatTensor, Dict[str, torch.FloatTensor]]: return BiaffineSecondaryParser.compute_loss(self, *output[0], batch['arc'], batch['rel_id'], output[1], criterion, batch) def build_model(self, encoder_size, training=True, **kwargs) -> torch.nn.Module: return BiaffineSecondaryDependencyDecoder(encoder_size, self.config) def build_metric(self, **kwargs): return BiaffineSecondaryParser.build_metric(self, **kwargs) def build_criterion(self, **kwargs): return BiaffineSecondaryParser.build_criterion(self, **kwargs) def build_optimizer(self, decoder: torch.nn.Module, **kwargs): config = self.config optimizer = torch.optim.Adam(decoder.parameters(), config.lr, (config.mu, config.nu), config.epsilon) return 
optimizer def input_is_flat(self, data) -> bool: return BiaffineSecondaryParser.input_is_flat(self, data) def prediction_to_result(self, prediction: Dict[str, Any], batch: Dict[str, Any]) -> List: outputs = [] return BiaffineSecondaryParser.predictions_to_human(self, prediction, outputs, batch['token'], use_pos=False) ================================================ FILE: hanlp/components/mtl/tasks/lem.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2020-12-09 16:37 import logging from typing import Dict, Any, Union, Iterable, Callable, List import torch from hanlp.common.dataset import SamplerBuilder, PadSequenceDataLoader from hanlp.common.transform import VocabDict from hanlp.components.lemmatizer import TransformerLemmatizer from hanlp.components.mtl.tasks import Task from hanlp.layers.scalar_mix import ScalarMixWithDropoutBuilder from hanlp.metrics.metric import Metric from hanlp.metrics.mtl import MetricDict from hanlp_common.util import merge_locals_kwargs from torch.utils.data import DataLoader class LinearDecoder(torch.nn.Module): def __init__(self, hidden_size, num_labels) -> None: super().__init__() self.classifier = torch.nn.Linear(hidden_size, num_labels) def forward(self, contextualized_embeddings: torch.FloatTensor, batch: Dict[str, torch.Tensor], mask=None): return self.classifier(contextualized_embeddings) class TransformerLemmatization(Task, TransformerLemmatizer): def __init__(self, trn: str = None, dev: str = None, tst: str = None, sampler_builder: SamplerBuilder = None, dependencies: str = None, scalar_mix: ScalarMixWithDropoutBuilder = None, use_raw_hidden_states=False, lr=1e-3, separate_optimizer=False, cls_is_bos=False, sep_is_eos=False, max_seq_len=None, sent_delimiter=None, char_level=False, hard_constraint=False, token_key='token', **kwargs) -> None: """ Transition based lemmatization (:cite:`kondratyuk-straka-2019-75`). Args: trn: Path to training set. dev: Path to dev set. 
tst: Path to test set. sampler_builder: A builder which builds a sampler. dependencies: Its dependencies on other tasks. scalar_mix: A builder which builds a `ScalarMixWithDropout` object. use_raw_hidden_states: Whether to use raw hidden states from transformer without any pooling. lr: Learning rate for this task. separate_optimizer: Use customized separate optimizer for this task. cls_is_bos: ``True`` to treat the first token as ``BOS``. sep_is_eos: ``True`` to treat the last token as ``EOS``. max_seq_len: Sentences longer than ``max_seq_len`` will be split into shorter ones if possible. sent_delimiter: Delimiter between sentences, like period or comma, which indicates a long sentence can be split here. char_level: Whether the sequence length is measured at char level, which is never the case for lemmatization. hard_constraint: Whether to enforce hard length constraint on sentences. If there is no ``sent_delimiter`` in a sentence, it will be split at a token anyway. token_key: The key to tokens in dataset. This should always be set to ``token`` in MTL. **kwargs: Not used. 
""" super().__init__(**merge_locals_kwargs(locals(), kwargs)) self.vocabs = VocabDict() def build_dataloader(self, data: List[List[str]], transform: Callable = None, training=False, device=None, logger: logging.Logger = None, cache=False, gradient_accumulation=1, **kwargs) -> DataLoader: args = dict((k, self.config[k]) for k in ['delimiter', 'max_seq_len', 'sent_delimiter', 'char_level', 'hard_constraint'] if k in self.config) dataset = self.build_dataset(data, cache=True, transform=transform, **args) dataset.append_transform(self.vocabs) if self.vocabs.mutable: self.build_vocabs(dataset, logger) return PadSequenceDataLoader( batch_sampler=self.sampler_builder.build(self.compute_lens(data, dataset), shuffle=training, gradient_accumulation=gradient_accumulation), device=device, dataset=dataset) def compute_loss(self, batch: Dict[str, Any], output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any], criterion) -> Union[torch.FloatTensor, Dict[str, torch.FloatTensor]]: return TransformerLemmatizer.compute_loss(self, criterion, output, batch['tag_id'], batch['mask']) def decode_output(self, output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any], mask: torch.BoolTensor, batch: Dict[str, Any], decoder, **kwargs) -> Union[Dict[str, Any], Any]: return TransformerLemmatizer.decode_output(self, output, mask, batch, decoder) def update_metrics(self, batch: Dict[str, Any], output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any], prediction: Dict[str, Any], metric: Union[MetricDict, Metric]): return TransformerLemmatizer.update_metrics(self, metric, output, batch['tag_id'], batch['mask']) def build_model(self, encoder_size, training=True, **kwargs) -> torch.nn.Module: return LinearDecoder(encoder_size, len(self.vocabs['tag'])) def build_metric(self, **kwargs): return TransformerLemmatizer.build_metric(self, **kwargs) def input_is_flat(self, data) -> bool: return TransformerLemmatizer.input_is_flat(self, 
class BiaffineNamedEntityRecognition(Task, BiaffineNamedEntityRecognizer):

    def __init__(self, trn: str = None, dev: str = None, tst: str = None, sampler_builder: SamplerBuilder = None,
                 dependencies: str = None, scalar_mix: ScalarMixWithDropoutBuilder = None,
                 use_raw_hidden_states=False, lr=None, separate_optimizer=False, doc_level_offset=True,
                 is_flat_ner=True, tagset=None, ret_tokens=' ', ffnn_size=150, loss_reduction='mean',
                 **kwargs) -> None:
        """An implementation of Named Entity Recognition as Dependency Parsing (:cite:`yu-etal-2020-named`). It treats
        every possible span as a candidate of entity and predicts its entity label. Non-entity spans are assigned NULL
        label to be excluded. The label prediction is done with a biaffine layer (:cite:`dozat:17a`). As it makes no
        assumption about the spans, it naturally supports flat NER and nested NER.

        Args:
            trn: Path to training set.
            dev: Path to dev set.
            tst: Path to test set.
            sampler_builder: A builder which builds a sampler.
            dependencies: Its dependencies on other tasks.
            scalar_mix: A builder which builds a `ScalarMixWithDropout` object.
            use_raw_hidden_states: Whether to use raw hidden states from transformer without any pooling.
            lr: Learning rate for this task.
            separate_optimizer: Use customized separate optimizer for this task.
            doc_level_offset: ``True`` to indicate the offsets in ``jsonlines`` are of document level.
            is_flat_ner: ``True`` for flat NER, otherwise nested NER.
            tagset: Optional tagset to prune entities outside of this tagset from datasets.
            ret_tokens: A delimiter between tokens in entities so that the surface form of an entity can be rebuilt.
            ffnn_size: Feedforward size for MLPs extracting the head/tail representations.
            loss_reduction: The loss reduction used in aggregating losses.
            **kwargs: Not used.
        """
        # ``locals()`` is handed to the helper as-is, so this call must stay the first statement of the body.
        super().__init__(**merge_locals_kwargs(locals(), kwargs))
        self.vocabs = VocabDict()

    def update_metrics(self, batch: Dict[str, Any],
                       output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
                       prediction: Dict[str, Any], metric: Union[MetricDict, Metric]):
        """Update ``metric`` with ``prediction`` for ``batch``; ``output`` is not used here."""
        BiaffineNamedEntityRecognizer.update_metrics(self, batch, prediction, metric)

    def decode_output(self,
                      output: Dict[str, Any],
                      mask: torch.BoolTensor,
                      batch: Dict[str, Any],
                      decoder,
                      **kwargs) -> Union[Dict[str, Any], Any]:
        """Turn the decoder output into predicted entities.

        Only ``batch['token']`` and ``output['candidate_ner_scores']`` are read; ``mask`` and ``decoder`` are
        not used.
        """
        return self.get_pred_ner(batch['token'], output['candidate_ner_scores'])

    def compute_loss(self, batch: Dict[str, Any],
                     output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any], criterion) -> \
            Union[torch.FloatTensor, Dict[str, torch.FloatTensor]]:
        """Return the loss stored under ``output['loss']``; ``criterion`` is not used."""
        return output['loss']

    def build_dataloader(self, data, transform: TransformList = None, training=False, device=None,
                         logger: logging.Logger = None, gradient_accumulation=1, **kwargs) -> DataLoader:
        """Build a padded dataloader over ``data`` for this task.

        Args:
            data: The samples (or anything else ``BiaffineNamedEntityRecognizer.build_dataset`` accepts).
            transform: Transforms to apply; a shallow copy is extended with ``unpack_ner`` so the caller's list is
                left untouched. ``None`` (the default) is treated as an empty list of transforms.
            training: Used as the ``shuffle`` flag of the batch sampler.
            device: Device handed to the dataloader.
            logger: Logger handed to ``build_vocabs`` when vocabs are still mutable.
            gradient_accumulation: Forwarded to the sampler builder.
            **kwargs: Not used.

        Returns:
            A ``PadSequenceDataLoader`` wrapping the dataset.
        """
        # The declared default is ``None``; ``copy(None).append`` would raise, so start from an empty list instead.
        transform = TransformList() if transform is None else copy(transform)
        transform.append(unpack_ner)
        dataset = BiaffineNamedEntityRecognizer.build_dataset(self, data, self.vocabs, transform)
        dataset.purge_cache()
        if self.vocabs.mutable:
            BiaffineNamedEntityRecognizer.build_vocabs(self, dataset, logger, self.vocabs)
        batch_sampler = self.sampler_builder.build(self.compute_lens(data, dataset), shuffle=training,
                                                   gradient_accumulation=gradient_accumulation)
        return PadSequenceDataLoader(batch_sampler=batch_sampler, device=device, dataset=dataset)

    def build_model(self, encoder_size, training=True, **kwargs) -> torch.nn.Module:
        """Create the biaffine NER decoder sized by ``encoder_size``, the config and the label vocab."""
        return BiaffineNamedEntityRecognitionDecoder(encoder_size, self.config.ffnn_size, len(self.vocabs.label),
                                                     self.config.loss_reduction)

    def build_metric(self, **kwargs):
        """Delegate metric construction to ``BiaffineNamedEntityRecognizer.build_metric``."""
        return BiaffineNamedEntityRecognizer.build_metric(self, **kwargs)

    def input_is_flat(self, data) -> bool:
        """Delegate to ``BiaffineNamedEntityRecognizer.input_is_flat`` (called without ``self``)."""
        return BiaffineNamedEntityRecognizer.input_is_flat(data)

    def prediction_to_result(self, prediction: Dict[str, Any], batch: Dict[str, Any]) -> List:
        """Convert ``prediction`` into a list of results using the tokens in ``batch['token']``.

        The delimiter is read from the config key ``ret_tokens`` and falls back to ``' '`` when absent.
        """
        results = []
        BiaffineNamedEntityRecognizer.prediction_to_result(batch['token'], prediction, results,
                                                           ret_tokens=self.config.get('ret_tokens', ' '))
        return results
tagging_scheme=None, crf=False, delimiter_in_entity=None, merge_types: List[str] = None, secondary_encoder=None, token_key='token', dict_whitelist: Union[DictInterface, Union[Dict[str, Any], Set[str]]] = None, dict_blacklist: Union[DictInterface, Union[Dict[str, Any], Set[str]]] = None, dict_tags: Union[ DictInterface, Union[Dict[Union[str, Sequence[str]], Union[str, Sequence[str]]]]] = None, **kwargs) -> None: r"""A simple tagger using a linear layer with an optional CRF (:cite:`lafferty2001conditional`) layer for NER task. It can utilize whitelist gazetteers which is dict mapping from entity name to entity type. During decoding, it performs longest-prefix-matching of these words to override the prediction from underlying statistical model. It also uses a blacklist to mask out mis-predicted entities. .. Note:: For algorithm beginners, longest-prefix-matching is the prerequisite to understand what dictionary can do and what it can't do. The tutorial in `this book `_ can be very helpful. Args: trn: Path to training set. dev: Path to dev set. tst: Path to test set. sampler_builder: A builder which builds a sampler. dependencies: Its dependencies on other tasks. scalar_mix: A builder which builds a `ScalarMixWithDropout` object. use_raw_hidden_states: Whether to use raw hidden states from transformer without any pooling. lr: Learning rate for this task. separate_optimizer: Use customized separate optimizer for this task. max_seq_len: Sentences longer than ``max_seq_len`` will be split into shorter ones if possible. sent_delimiter: Delimiter between sentences, like period or comma, which indicates a long sentence can be split here. char_level: Whether the sequence length is measured at char level, which is never the case for lemmatization. hard_constraint: Whether to enforce hard length constraint on sentences. If there is no ``sent_delimiter`` in a sentence, it will be split at a token anyway. token_key: The key to tokens in dataset. 
def build_dataloader(self, data, transform: Callable = None, training=False, device=None,
                     logger: logging.Logger = None, cache=False, gradient_accumulation=1, **kwargs) -> DataLoader:
    """Build a padded dataloader over ``data`` for this task.

    Args:
        data: The samples (or anything else ``self.build_dataset`` accepts).
        transform: Transform handed to ``self.build_dataset``.
        training: Used as the ``shuffle`` flag of the batch sampler.
        device: Device handed to the dataloader.
        logger: Logger handed to ``self.build_vocabs`` when vocabs are still mutable.
        cache: Forwarded to ``self.build_dataset``.
        gradient_accumulation: Forwarded to the sampler builder.
        **kwargs: Not used.

    Returns:
        A ``PadSequenceDataLoader`` wrapping the dataset.
    """
    # Forward only those dataset options that the config actually defines.
    option_names = ('delimiter', 'max_seq_len', 'sent_delimiter', 'char_level', 'hard_constraint')
    dataset_options = {name: self.config[name] for name in option_names if name in self.config}
    dataset = self.build_dataset(data, cache=cache, transform=transform, **dataset_options)
    dataset.append_transform(self.vocabs)
    # The cache is purged right after the vocab transform has been appended.
    dataset.purge_cache()
    if self.vocabs.mutable:
        self.build_vocabs(dataset, logger)
    lengths = self.compute_lens(data, dataset)
    batch_sampler = self.sampler_builder.build(lengths, shuffle=training,
                                               gradient_accumulation=gradient_accumulation)
    return PadSequenceDataLoader(batch_sampler=batch_sampler, device=device, dataset=dataset)
def decode_output(self, output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
                  mask: torch.BoolTensor, batch: Dict[str, Any], decoder, **kwargs) -> Union[Dict[str, Any], Any]:
    """Decode ``output`` by delegating to ``TransformerNamedEntityRecognizer.decode_output``.

    Args:
        output: Whatever the decoder produced for this batch.
        mask: Not forwarded; the mask stored under ``batch['mask']`` is passed on instead.
        batch: The batch, which must contain a ``'mask'`` entry.
        decoder: The decoder module, forwarded unchanged.
        **kwargs: Not used.

    Returns:
        The value returned by ``TransformerNamedEntityRecognizer.decode_output``.
    """
    batch_mask = batch['mask']
    return TransformerNamedEntityRecognizer.decode_output(self, output, batch_mask, batch, decoder)
class LinearCRFDecoder(torch.nn.Module):
    def __init__(self, hidden_size, num_labels, crf=False) -> None:
        """A linear projection onto the tag set, optionally accompanied by a CRF
        (:cite:`lafferty2001conditional`) layer.

        Args:
            hidden_size: Size of hidden states.
            num_labels: Size of tag set.
            crf: ``True`` to enable CRF (:cite:`lafferty2001conditional`).
        """
        super().__init__()
        # The classifier is created before the CRF so that sub-module registration order stays the same.
        self.classifier = torch.nn.Linear(hidden_size, num_labels)
        if crf:
            self.crf = CRF(num_labels)
        else:
            self.crf = None

    def forward(self, contextualized_embeddings: torch.FloatTensor, batch: Dict[str, torch.Tensor], mask=None):
        """Project hidden states to per-tag scores.

        Args:
            contextualized_embeddings: Hidden states for contextual layer.
            batch: A dict of a batch (not read here).
            mask: Mask for tokens (not read here).

        Returns:
            Logits. Users are expected to call ``CRF.decode`` on these emissions during decoding and
            ``CRF.forward`` during training; the CRF itself is never applied in this method.
        """
        logits = self.classifier(contextualized_embeddings)
        return logits
Note:: For algorithm beginners, longest-prefix-matching is the prerequisite to understand what dictionary can do and what it can't do. The tutorial in `this book `_ can be very helpful. Args: trn: Path to training set. dev: Path to dev set. tst: Path to test set. sampler_builder: A builder which builds a sampler. dependencies: Its dependencies on other tasks. scalar_mix: A builder which builds a `ScalarMixWithDropout` object. use_raw_hidden_states: Whether to use raw hidden states from transformer without any pooling. lr: Learning rate for this task. separate_optimizer: Use customized separate optimizer for this task. cls_is_bos: ``True`` to treat the first token as ``BOS``. sep_is_eos: ``True`` to treat the last token as ``EOS``. max_seq_len: Sentences longer than ``max_seq_len`` will be split into shorter ones if possible. sent_delimiter: Delimiter between sentences, like period or comma, which indicates a long sentence can be split here. char_level: Whether the sequence length is measured at char level, which is never the case for lemmatization. hard_constraint: Whether to enforce hard length constraint on sentences. If there is no ``sent_delimiter`` in a sentence, it will be split at a token anyway. crf: ``True`` to enable CRF (:cite:`lafferty2001conditional`). token_key: The key to tokens in dataset. This should always be set to ``token`` in MTL. dict_tags: A custom dictionary to override predicted tags by performing longest-prefix-matching. **kwargs: Not used. 
def build_dataloader(self, data, transform: Callable = None, training=False, device=None,
                     logger: logging.Logger = None, cache=False, gradient_accumulation=1, **kwargs) -> DataLoader:
    """Build a padded dataloader over ``data`` for the tagging task.

    Args:
        data: The samples (or anything else ``self.build_dataset`` accepts).
        transform: Transform handed to ``self.build_dataset``.
        training: Used as the ``shuffle`` flag of the batch sampler.
        device: Device handed to the dataloader.
        logger: Logger handed to ``self.build_vocabs`` when vocabs are still mutable.
        cache: Not consulted by this implementation; the dataset is always built with ``cache=True``.
        gradient_accumulation: Forwarded to the sampler builder.
        **kwargs: Not used.

    Returns:
        A ``PadSequenceDataLoader`` wrapping the dataset.
    """
    candidate_keys = ['delimiter', 'max_seq_len', 'sent_delimiter', 'char_level', 'hard_constraint']
    # Keep only the options the config defines; missing ones fall back to the dataset's own defaults.
    present = {key: self.config[key] for key in candidate_keys if key in self.config}
    dataset = self.build_dataset(data, cache=True, transform=transform, **present)
    dataset.append_transform(self.vocabs)
    if self.vocabs.mutable:
        self.build_vocabs(dataset, logger)
    sampler = self.sampler_builder.build(
        self.compute_lens(data, dataset),
        shuffle=training,
        gradient_accumulation=gradient_accumulation,
    )
    return PadSequenceDataLoader(batch_sampler=sampler, device=device, dataset=dataset)
class BiaffineSemanticDependencyParsing(Task, BiaffineSemanticDependencyParser):
    def __init__(self, trn: str = None, dev: str = None, tst: str = None, sampler_builder: SamplerBuilder = None,
                 dependencies: str = None, scalar_mix: ScalarMixWithDropoutBuilder = None, use_raw_hidden_states=False,
                 lr=2e-3, separate_optimizer=False, punct=False, tree=True, pad_rel=None, apply_constraint=False,
                 single_root=True, no_zero_head=None, n_mlp_arc=500, n_mlp_rel=100, mlp_dropout=.33, mu=.9, nu=.9,
                 epsilon=1e-12, decay=.75, decay_steps=5000, cls_is_bos=True, use_pos=False, **kwargs) -> None:
        r"""Implementation of "Stanford's graph-based neural dependency parser at the conll 2017 shared task"
        (:cite:`dozat2017stanford`) and "Establishing Strong Baselines for the New Decade" (:cite:`he-choi-2019`).

        Args:
            trn: Path to training set.
            dev: Path to dev set.
            tst: Path to test set.
            sampler_builder: A builder which builds a sampler.
            dependencies: Its dependencies on other tasks.
            scalar_mix: A builder which builds a `ScalarMixWithDropout` object.
            use_raw_hidden_states: Whether to use raw hidden states from transformer without any pooling.
            lr: Learning rate for this task.
            separate_optimizer: Use customized separate optimizer for this task.
            punct: ``True`` to include punctuations in evaluation.
            tree: Not read by the code of this class; presumably consumed by the parent parser (to be confirmed).
            pad_rel: Padding token for relations.
            apply_constraint: Enforce constraints (see following parameters).
            single_root: Force single root.
            no_zero_head: Every token has at least one head.
            n_mlp_arc: Number of features for arc representation.
            n_mlp_rel: Number of features for rel representation.
            mlp_dropout: Dropout applied to MLPs.
            mu: First coefficient used for computing running averages of gradient and its square in Adam.
            nu: Second coefficient used for computing running averages of gradient and its square in Adam.
            epsilon: Term added to the denominator to improve numerical stability.
            decay: Decay rate of the exponential lr scheduler.
            decay_steps: Decay every ``decay_steps`` steps.
            cls_is_bos: ``True`` to treat the first token as ``BOS``.
            use_pos: Use pos feature.
            **kwargs: Not used.
        """
        # ``locals()`` is handed to the helper as-is, so this call must stay the first statement of the body.
        super().__init__(**merge_locals_kwargs(locals(), kwargs))
        self.vocabs = VocabDict()

    def update_metrics(self, batch: Dict[str, Any],
                       output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
                       prediction: Dict[str, Any], metric: Union[MetricDict, Metric]):
        """Update ``metric`` from the unpacked ``prediction`` and the gold ``arc`` / ``rel_id`` of ``batch``.

        ``output[1]`` and ``output[-1]`` are the two masks that ``feed_batch`` returns after the logits.
        """
        gold_arcs = batch['arc']
        gold_rels = batch['rel_id']
        BiaffineSemanticDependencyParser.update_metric(self, *prediction, gold_arcs, gold_rels,
                                                       output[1], output[-1], metric, batch)

    def decode_output(self,
                      output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
                      mask: torch.BoolTensor,
                      batch: Dict[str, Any],
                      decoder, **kwargs) -> Union[Dict[str, Any], Any]:
        """Decode arcs and relations from ``output``.

        The mask packed inside ``output`` replaces the ``mask`` argument; the punctuation mask is not needed.
        """
        scores, mask, _ = output
        arc_scores, rel_scores = scores
        return BiaffineSemanticDependencyParser.decode(self, arc_scores, rel_scores, mask, batch)

    def compute_loss(self, batch: Dict[str, Any],
                     output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any], criterion) -> \
            Union[torch.FloatTensor, Dict[str, torch.FloatTensor]]:
        """Compute the parsing loss from the scores and mask packed in ``output`` and the gold labels of ``batch``."""
        scores, mask, _ = output
        arc_scores, rel_scores = scores
        return BiaffineSemanticDependencyParser.compute_loss(self, arc_scores, rel_scores,
                                                             batch['arc'], batch['rel_id'],
                                                             mask, criterion, batch)

    def build_model(self, encoder_size, training=True, **kwargs) -> torch.nn.Module:
        """Create the biaffine decoder sized by ``encoder_size``, the config and the ``rel`` vocab."""
        config = self.config
        return BiaffineDecoder(
            encoder_size,
            config.n_mlp_arc,
            config.n_mlp_rel,
            config.mlp_dropout,
            len(self.vocabs.rel),
        )

    def build_metric(self, **kwargs):
        """Delegate metric construction to ``BiaffineSemanticDependencyParser.build_metric``."""
        return BiaffineSemanticDependencyParser.build_metric(self, **kwargs)

    def build_dataloader(self, data, transform: TransformList = None, training=False, device=None,
                         logger: logging.Logger = None, gradient_accumulation=1, **kwargs) -> DataLoader:
        """Build a padded dataloader over ``data`` for this task.

        Vocabs are built when still mutable, and the dataset is cached only when ``data`` is a ``str``.
        """
        dataset = BiaffineSemanticDependencyParser.build_dataset(self, data, transform)
        dataset.purge_cache()
        if self.vocabs.mutable:
            BiaffineSemanticDependencyParser.build_vocabs(self, dataset, logger, transformer=True)
        if isinstance(data, str):
            progress = CountdownTimer(len(dataset))
            BiaffineSemanticDependencyParser.cache_dataset(self, dataset, progress, training, logger)
        batch_sampler = self.sampler_builder.build(self.compute_lens(data, dataset), shuffle=training,
                                                   gradient_accumulation=gradient_accumulation)
        return PadSequenceDataLoader(batch_sampler=batch_sampler, device=device, dataset=dataset,
                                     pad=self.get_pad_dict())

    def feed_batch(self, h: torch.FloatTensor, batch: Dict[str, torch.Tensor], mask: torch.BoolTensor,
                   decoder: torch.nn.Module):
        """Run the decoder and return ``(logits, mask, punct_mask)``.

        The incoming ``mask`` is cloned before its first column is zeroed, so the caller's tensor is untouched.
        """
        logits = super().feed_batch(h, batch, mask, decoder)
        arc_scores = logits[0]
        token_mask = mask.clone()
        token_mask[:, 0] = 0
        mask3d = self.convert_to_3d_mask(arc_scores, token_mask)
        punct_mask = self.convert_to_3d_puncts(batch.get('punct_mask', None), mask3d)
        return logits, mask3d, punct_mask

    def build_optimizer(self, decoder: torch.nn.Module, **kwargs):
        """Create an Adam optimizer and an exponential lr scheduler for ``decoder``.

        The per-step gamma is ``decay ** (1 / decay_steps)``, so the lr is multiplied by ``decay`` once every
        ``decay_steps`` scheduler steps.
        """
        cfg = self.config
        optimizer = Adam(decoder.parameters(), lr=cfg.lr, betas=(cfg.mu, cfg.nu), eps=cfg.epsilon)
        gamma = cfg.decay ** (1 / cfg.decay_steps)
        scheduler = ExponentialLR(optimizer, gamma=gamma)
        return optimizer, scheduler

    def input_is_flat(self, data) -> bool:
        """Delegate to ``BiaffineSemanticDependencyParser.input_is_flat`` with the ``use_pos`` config flag."""
        return BiaffineSemanticDependencyParser.input_is_flat(self, data, self.config.use_pos)

    def prediction_to_result(self, prediction: Dict[str, Any], batch: Dict[str, Any]) -> List:
        """Lazily yield, per sentence, a list with one entry per token holding its ``(head, relation)`` pairs.

        The first row of every sentence (the ROOT) and the first token of ``batch['token']`` are skipped.
        """
        arc_tensor, rel_tensor = prediction
        arcs = arc_tensor[:, 1:, :].tolist()  # Skip the ROOT
        rels = rel_tensor[:, 1:, :].tolist()
        id_to_rel = self.vocabs['rel'].idx_to_token
        for sent_arcs, sent_rels, tokens in zip(arcs, rels, batch['token']):
            sent_len = len(tokens[1:])
            # Head indices range over ROOT (0) plus every token, hence ``sent_len + 1`` candidates.
            yield [
                [(head, id_to_rel[rel_row[head]]) for head in range(sent_len + 1) if arc_row[head]]
                for arc_row, rel_row in zip(sent_arcs[:sent_len], sent_rels[:sent_len])
            ]

    def build_samples(self, inputs, cls_is_bos=False, sep_is_eos=False):
        """Delegate to the parser's ``build_samples``; ``cls_is_bos`` and ``sep_is_eos`` are not forwarded."""
        return BiaffineSemanticDependencyParser.build_samples(self, inputs, self.config.use_pos)
class SpanBIOSemanticRoleLabeling(Task, SpanBIOSemanticRoleLabeler):

    def __init__(self, trn: str = None, dev: str = None, tst: str = None, sampler_builder: SamplerBuilder = None,
                 dependencies: str = None, scalar_mix: ScalarMixWithDropoutBuilder = None, use_raw_hidden_states=False,
                 lr=None, separate_optimizer=False, cls_is_bos=False, sep_is_eos=False, crf=False, n_mlp_rel=300,
                 mlp_dropout=0.2, loss_reduction='mean', doc_level_offset=True, **kwargs) -> None:
        """A span based Semantic Role Labeling task using BIO scheme for tagging the role of each token. Given a
        predicate and a token, it uses biaffine (:cite:`dozat:17a`) to predict their relations as one of BIO-ROLE.

        Args:
            trn: Path to training set.
            dev: Path to dev set.
            tst: Path to test set.
            sampler_builder: A builder which builds a sampler.
            dependencies: Its dependencies on other tasks.
            scalar_mix: A builder which builds a `ScalarMixWithDropout` object.
            use_raw_hidden_states: Whether to use raw hidden states from transformer without any pooling.
            lr: Learning rate for this task.
            separate_optimizer: Use customized separate optimizer for this task.
            cls_is_bos: ``True`` to treat the first token as ``BOS``.
            sep_is_eos: ``True`` to treat the last token as ``EOS``.
            crf: ``True`` to enable CRF (:cite:`lafferty2001conditional`).
            n_mlp_rel: Output size of MLPs for representing predicate and tokens.
            mlp_dropout: Dropout applied to MLPs.
            loss_reduction: Loss reduction for aggregating losses.
            doc_level_offset: ``True`` to indicate the offsets in ``jsonlines`` are of document level.
            **kwargs: Not used.
        """
        # ``locals()`` is handed to the helper as-is, so this call must stay the first statement of the body.
        super().__init__(**merge_locals_kwargs(locals(), kwargs))
        self.vocabs = VocabDict()

    def build_dataloader(self, data, transform: Callable = None, training=False, device=None,
                         logger: logging.Logger = None, cache=False, gradient_accumulation=1,
                         **kwargs) -> DataLoader:
        """Build a padded dataloader over ``data``; ``cache`` is not consulted by this implementation."""
        transforms = [transform, self.vocabs]
        dataset = self.build_dataset(data, transform=transforms)
        dataset.purge_cache()
        if self.vocabs.mutable:
            SpanBIOSemanticRoleLabeler.build_vocabs(self, dataset, logger)
        batch_sampler = self.sampler_builder.build(self.compute_lens(data, dataset), shuffle=training,
                                                   gradient_accumulation=gradient_accumulation)
        return PadSequenceDataLoader(batch_sampler=batch_sampler, device=device, dataset=dataset)

    def compute_loss(self, batch: Dict[str, Any],
                     output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any], criterion) -> \
            Union[torch.FloatTensor, Dict[str, torch.FloatTensor]]:
        """Compute the loss of the predictions packed in ``output`` against ``batch['srl_id']``."""
        scores, mask3d = output
        return SpanBIOSemanticRoleLabeler.compute_loss(self, criterion, scores, batch['srl_id'], mask3d)

    def decode_output(self,
                      output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
                      mask: torch.BoolTensor,
                      batch: Dict[str, Any], decoder: torch.nn.Module, **kwargs) -> Union[Dict[str, Any], Any]:
        """Decode ``output``; the mask packed inside ``output`` replaces the ``mask`` argument."""
        scores, mask3d = output
        return SpanBIOSemanticRoleLabeler.decode_output(self, scores, mask3d, batch, decoder)

    def update_metrics(self, batch: Dict[str, Any],
                       output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
                       prediction: Dict[str, Any], metric: Union[MetricDict, Metric]):
        """Update ``metric`` with ``prediction`` for ``batch``; ``output`` is not used here."""
        return SpanBIOSemanticRoleLabeler.update_metrics(self, metric, prediction, batch)

    def build_model(self, encoder_size, training=True, **kwargs) -> torch.nn.Module:
        """Create the biaffine tagging decoder sized by the ``srl`` vocab, ``encoder_size`` and the config."""
        num_labels = len(self.vocabs['srl'])
        config = self.config
        return BiaffineTaggingDecoder(num_labels, encoder_size, config.n_mlp_rel, config.mlp_dropout, config.crf)

    def feed_batch(self, h: torch.FloatTensor, batch: Dict[str, torch.Tensor], mask: torch.BoolTensor,
                   decoder: torch.nn.Module):
        """Run ``decoder`` on ``h`` and return ``(pred, mask3d)``.

        With an empty ``h`` the decoder is skipped and ``([], None)`` is returned. When CRF is enabled in the
        config, the scores are flattened over their first two dimensions, indexed by ``mask3d[0]`` and
        converted to log-probabilities.
        """
        if h.numel() == 0:  # No tokens, don't bother to run the decoder
            return [], None
        scores = decoder(h)
        mask3d = self.compute_mask(mask)
        if not self.config.crf:
            return scores, mask3d
        token_index = mask3d[0]
        selected = scores.flatten(end_dim=1)[token_index]
        return F.log_softmax(selected, dim=-1), mask3d

    def build_metric(self, **kwargs):
        """Delegate to ``SpanBIOSemanticRoleLabeler.build_metric``; ``kwargs`` are not forwarded."""
        return SpanBIOSemanticRoleLabeler.build_metric(self)

    def input_is_flat(self, data) -> bool:
        """Delegate to ``SpanBIOSemanticRoleLabeler.input_is_flat``."""
        return SpanBIOSemanticRoleLabeler.input_is_flat(self, data)

    def prediction_to_result(self, prediction: List, batch: Dict[str, Any]) -> List:
        """Lazily yield every result produced by ``SpanBIOSemanticRoleLabeler.prediction_to_result``."""
        yield from SpanBIOSemanticRoleLabeler.prediction_to_result(self, prediction, batch)
def __init__(self, trn: str = None, dev: str = None, tst: str = None, sampler_builder: SamplerBuilder = None,
             dependencies: str = None, scalar_mix: ScalarMixWithDropoutBuilder = None, use_raw_hidden_states=False,
             lr=1e-3, separate_optimizer=False, lexical_dropout=0.5, dropout=0.2, span_width_feature_size=20,
             ffnn_size=150, ffnn_depth=2, argument_ratio=0.8, predicate_ratio=0.4, max_arg_width=30,
             mlp_label_size=100, enforce_srl_constraint=False, use_gold_predicates=False, doc_level_offset=True,
             use_biaffine=False, loss_reduction='mean', with_argument=' ', **kwargs) -> None:
    r""" An implementation of "Jointly Predicting Predicates and Arguments in Neural Semantic Role Labeling"
    (:cite:`he-etal-2018-jointly`). It generates candidate triples of (predicate, arg_start, arg_end) and ranks
    them.

    Args:
        trn: Path to training set.
        dev: Path to dev set.
        tst: Path to test set.
        sampler_builder: A builder which builds a sampler.
        dependencies: Its dependencies on other tasks.
        scalar_mix: A builder which builds a `ScalarMixWithDropout` object.
        use_raw_hidden_states: Whether to use raw hidden states from transformer without any pooling.
        lr: Learning rate for this task.
        separate_optimizer: Use customized separate optimizer for this task.
        lexical_dropout: Dropout applied to hidden states of encoder.
        dropout: Dropout used for other layers except the encoder.
        span_width_feature_size: Span width feature size.
        ffnn_size: Feedforward size.
        ffnn_depth: Number of layers of feedforward MLPs.
        argument_ratio: Ratio of candidate arguments over number of tokens.
        predicate_ratio: Ratio of candidate predicates over number of tokens.
        max_arg_width: Maximum argument width.
        mlp_label_size: Feature size for label representation.
        enforce_srl_constraint: Enforce SRL constraints (number of core ARGs etc.).
        use_gold_predicates: Use gold predicates instead of predicting them.
        doc_level_offset: ``True`` to indicate the offsets in ``jsonlines`` are of document level.
        use_biaffine: ``True`` to use biaffine (:cite:`dozat:17a`) instead of linear layer for label prediction.
        loss_reduction: The loss reduction used in aggregating losses.
        with_argument: The delimiter between tokens in arguments to be used for joining tokens for outputs.
        **kwargs: Not used.
    """
    # ``locals()`` is handed to the helper as-is, so this call must stay the first statement of the body:
    # any local variable introduced before it would be passed along as well.
    super().__init__(**merge_locals_kwargs(locals(), kwargs))
    self.vocabs = VocabDict()
def generate_token_span_tuple(sample: dict):
    """Derive token spans from the prefix mask of a sample.

    The first and last entries of ``sample['text_prefix_mask']`` are dropped; every
    remaining position (other than position 0) whose mask value is truthy opens a new
    span. The resulting ``(begin, end)`` pairs, with exclusive ``end``, are stored under
    ``sample['span_tuple']``. A sample without a (non-empty) prefix mask is left untouched.

    Args:
        sample: The sample to annotate in place.

    Returns:
        The very same ``sample`` object.
    """
    prefix_mask = sample.get('text_prefix_mask', None)
    if not prefix_mask:
        return sample
    inner_mask = prefix_mask[1:-1]
    # Span boundaries: the start, every later position flagged as a prefix, and the end
    boundaries = [0]
    boundaries.extend(pos for pos, is_prefix in enumerate(inner_mask) if pos and is_prefix)
    boundaries.append(len(inner_mask))
    sample['span_tuple'] = list(zip(boundaries, boundaries[1:]))
    return sample
def build_dataloader(self, data, transform: TransformList = None, training=False, device=None,
                     logger: logging.Logger = None, tokenizer: PreTrainedTokenizer = None,
                     **kwargs) -> DataLoader:
    """Build the dataloader feeding the regression tokenizer.

    Args:
        data: The corpus, or a list of samples (a list switches on ``generate_idx``).
        transform: Not used; this task assembles its own transforms.
        training: ``True`` to shuffle batches.
        device: The device the batches are moved to.
        logger: Not used.
        tokenizer: The transformer tokenizer used to split text into subtokens. Required.
        **kwargs: Not used.

    Returns:
        A :class:`PadSequenceDataLoader` over a :class:`TextTokenizingDataset`.

    Raises:
        AssertionError: If no ``tokenizer`` is given.
    """
    assert tokenizer, 'A transformer tokenizer is required to build the dataloader'
    dataset = TextTokenizingDataset(
        data,
        cache=True,
        # Fix: this used to pass ``sent_delimiter``, which silently ignored the ``delimiter``
        # argument of the constructor.
        delimiter=self.config.get('delimiter', None),
        generate_idx=isinstance(data, list),
        max_seq_len=self.config.max_seq_len,
        sent_delimiter=self.config.sent_delimiter,
        transform=[
            TransformerSequenceTokenizer(tokenizer,
                                         'text',
                                         ret_prefix_mask=True,
                                         ret_subtokens=True,
                                         ),
            # ``delta=-2`` is passed to FieldLength; presumably it discounts the two special
            # tokens that decoding strips with ``[1:-1]`` - confirm against FieldLength.
            FieldLength('text_input_ids', 'text_input_ids_length', delta=-2),
            generate_token_span_tuple])
    return PadSequenceDataLoader(
        batch_sampler=self.sampler_builder.build(self.compute_lens(data, dataset, 'text_input_ids'),
                                                 shuffle=training),
        device=device,
        dataset=dataset)
class LinearCRFDecoder(torch.nn.Module):
    """A linear tag classifier with an optional CRF layer attached.

    The CRF, when enabled, is only created and stored here; ``forward`` itself returns
    the raw classifier scores.
    """

    def __init__(self, hidden_size, num_labels, crf=False) -> None:
        """
        Args:
            hidden_size: Size of the incoming hidden states.
            num_labels: Number of tags to score.
            crf: Truthy to create a ``CRF(num_labels)`` layer in ``self.crf``,
                otherwise ``self.crf`` is ``None``.
        """
        super().__init__()
        self.classifier = torch.nn.Linear(hidden_size, num_labels)
        if crf:
            self.crf = CRF(num_labels)
        else:
            self.crf = None

    def forward(self, contextualized_embeddings: torch.FloatTensor, batch: Dict[str, torch.Tensor], mask=None):
        """Score every position except the first and the last one along dimension 1.

        Args:
            contextualized_embeddings: Hidden states indexed as ``[:, 1:-1, :]``.
            batch: Not used.
            mask: Not used.

        Returns:
            The classifier output for the retained positions.
        """
        inner_positions = contextualized_embeddings[:, 1:-1, :]
        return self.classifier(inner_positions)
def build_dataloader(self, data, transform: TransformList = None, training=False, device=None,
                     logger: logging.Logger = None, cache=False, gradient_accumulation=1, **kwargs) -> DataLoader:
    """Build the dataloader of the tagging tokenizer.

    Args:
        data: The data handed to ``self.build_dataset``.
        transform: A transform list containing a ``TransformerSequenceTokenizer``; everything
            after that tokenizer is dropped.
        training: ``True`` to shuffle batches.
        device: The device the batches are moved to.
        logger: Logger handed to ``self.build_vocabs``.
        cache: Forwarded to ``self.build_dataset``.
        gradient_accumulation: Forwarded to the sampler builder.
        **kwargs: Not used.

    Returns:
        A :class:`PadSequenceDataLoader` over the built dataset.
    """
    dataset_option_keys = ('delimiter', 'max_seq_len', 'sent_delimiter', 'char_level', 'hard_constraint')
    dataset_options = {key: self.config[key] for key in dataset_option_keys if key in self.config}
    # Only the transforms up to and including the transformer tokenizer are needed
    tokenizer_position = transform.index_by_type(TransformerSequenceTokenizer)
    assert tokenizer_position is not None
    transform = transform[:tokenizer_position + 1]
    if self.transform:
        # The user supplied transform runs before everything else
        transform.insert(0, self.transform)
    transform.append(self.last_transform())
    dataset = self.build_dataset(data, cache=cache, transform=transform, **dataset_options)
    dataset.purge_cache()
    if self.vocabs.mutable:
        self.build_vocabs(dataset, logger)
    batch_sampler = self.sampler_builder.build(self.compute_lens(data, dataset),
                                               shuffle=training,
                                               gradient_accumulation=gradient_accumulation)
    return PadSequenceDataLoader(batch_sampler=batch_sampler, device=device, dataset=dataset)
def update_metrics(self, batch: Dict[str, Any],
                   output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
                   prediction: Dict[str, Any], metric: Union[MetricDict, Metric]):
    """Delegate the metric update to :class:`TransformerTaggingTokenizer`.

    Args:
        batch: The current batch; its ``tag_id`` entry is passed on as the gold tags.
        output: The decoder output.
        prediction: The decoded prediction.
        metric: The metric to update.
    """
    gold_tags = batch['tag_id']
    # The fifth positional argument is deliberately left as ``None``
    TransformerTaggingTokenizer.update_metrics(self, metric, output, gold_tags, None, batch, prediction)
def build_samples(self, inputs, cls_is_bos=False, sep_is_eos=False):
    """Wrap every input sentence into a sample dict keyed by ``self.config.token_key``.

    Args:
        inputs: An iterable of sentences.
        cls_is_bos: Not used.
        sep_is_eos: Not used.

    Returns:
        A list with one single-entry dict per sentence.
    """
    samples = []
    for sentence in inputs:
        samples.append({self.config.token_key: sentence})
    return samples
""" return batch ================================================ FILE: hanlp/components/mtl/tasks/ud.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2020-12-17 21:54 import logging from typing import Dict, Any, List, Union, Iterable, Callable import torch from torch.utils.data import DataLoader from hanlp.common.dataset import SamplerBuilder, PadSequenceDataLoader from hanlp_common.document import Document from hanlp.common.transform import VocabDict, PunctuationMask from hanlp.components.mtl.tasks import Task from hanlp_common.conll import CoNLLUWord from hanlp.components.parsers.ud.ud_model import UniversalDependenciesDecoder from hanlp.components.parsers.ud.ud_parser import UniversalDependenciesParser from hanlp.components.parsers.ud.util import generate_lemma_rule, append_bos from hanlp.layers.scalar_mix import ScalarMixWithDropoutBuilder from hanlp.metrics.metric import Metric from hanlp.metrics.mtl import MetricDict from hanlp_common.util import merge_locals_kwargs class UniversalDependenciesParsing(Task, UniversalDependenciesParser): def __init__(self, trn: str = None, dev: str = None, tst: str = None, sampler_builder: SamplerBuilder = None, dependencies: str = None, scalar_mix: ScalarMixWithDropoutBuilder = None, use_raw_hidden_states=False, lr=None, separate_optimizer=False, cls_is_bos=True, sep_is_eos=False, n_mlp_arc=768, n_mlp_rel=256, mlp_dropout=.33, tree=False, proj=False, punct=False, max_seq_len=None, **kwargs) -> None: r"""Universal Dependencies Parsing (lemmatization, features, PoS tagging and dependency parsing) implementation of "75 Languages, 1 Model: Parsing Universal Dependencies Universally" (:cite:`kondratyuk-straka-2019-75`). Args: trn: Path to training set. dev: Path to dev set. tst: Path to test set. sampler_builder: A builder which builds a sampler. dependencies: Its dependencies on other tasks. scalar_mix: A builder which builds a `ScalarMixWithDropout` object. 
def build_dataloader(self, data, transform: Callable = None, training=False, device=None,
                     logger: logging.Logger = None, cache=False, gradient_accumulation=1, **kwargs) -> DataLoader:
    """Build the dataloader of the UD parsing task.

    Args:
        data: The data to load; several steps only apply when it is a ``str``.
        transform: A transform appended after the task's own transforms.
        training: ``True`` to shuffle batches.
        device: The device the batches are moved to.
        logger: Logger used while building vocabs and pruning.
        cache: Not used.
        gradient_accumulation: Forwarded to the sampler builder.
        **kwargs: Not used.

    Returns:
        A :class:`PadSequenceDataLoader` which pads ``arc`` with ``0``.
    """
    data_is_str = isinstance(data, str)
    transforms = [generate_lemma_rule, append_bos, self.vocabs, transform]
    if data_is_str and not self.config.punct:
        # Punctuation is excluded from evaluation, so mark it
        transforms.append(PunctuationMask('token', 'punct_mask'))
    dataset = UniversalDependenciesParser.build_dataset(self, data, transforms)
    dataset.purge_cache()
    if self.vocabs.mutable:
        UniversalDependenciesParser.build_vocabs(self, dataset, logger, transformer=True)
    max_seq_len = self.config.get('max_seq_len', None)
    if max_seq_len and data_is_str:
        # Drop samples whose subtoken sequence exceeds the limit
        dataset.prune(lambda x: len(x['token_input_ids']) > max_seq_len, logger)
    batch_sampler = self.sampler_builder.build(self.compute_lens(data, dataset),
                                               shuffle=training,
                                               gradient_accumulation=gradient_accumulation)
    return PadSequenceDataLoader(batch_sampler=batch_sampler,
                                 device=device,
                                 dataset=dataset,
                                 pad={'arc': 0})
def finalize_document(self, doc: Document, task_name: str):
    """Promote the per-word UD annotations of ``task_name`` to top-level fields of ``doc``.

    For every sentence in ``doc[task_name]`` the ``lemma``, ``upos``, ``feats`` and
    ``(head, deprel)`` of each word are collected and stored under ``lem``, ``pos``,
    ``fea`` and ``dep`` respectively, unless ``doc`` already has such a field. When all
    four fields were promoted, the original ``task_name`` entry is removed.

    Args:
        doc: The document to update in place.
        task_name: The key under which this task stored its sentences of words.
    """
    lem, pos, fea, dep = [], [], [], []
    for sent in doc[task_name]:
        lem.append([word.lemma for word in sent])
        pos.append([word.upos for word in sent])
        fea.append([word.feats for word in sent])
        dep.append([(word.head, word.deprel) for word in sent])
    promoted = 0
    # Fix: the features used to be guarded by ``'feat' not in doc`` while being written to
    # ``doc['fea']``, so an existing ``fea`` field was overwritten. Each field is now
    # checked under the same key it is written to.
    for key, values in (('lem', lem), ('pos', pos), ('fea', fea), ('dep', dep)):
        if key not in doc:
            doc[key] = values
            promoted += 1
    if promoted == 4:
        # Everything now lives in dedicated fields, so the combined entry is redundant
        doc.pop(task_name)
def build_optimizer(self, trn, epochs, lr, adam_epsilon, weight_decay, warmup_steps, transformer_lr, **kwargs):
    """Create the optimizer together with its learning rate scheduler.

    Without a transformer, Adam is paired with ``ReduceLROnPlateau`` (mode ``max``, factor
    ``0.5``, patience ``2``). With a transformer, both objects come from
    ``build_optimizer_scheduler_with_transformer``.

    Args:
        trn: The training dataloader; its length determines the number of training steps.
        epochs: Number of epochs.
        lr: Learning rate passed to the transformer-aware builder.
        adam_epsilon: Epsilon of Adam passed to the transformer-aware builder.
        weight_decay: Weight decay passed to the transformer-aware builder.
        warmup_steps: Warmup steps passed to the transformer-aware builder.
        transformer_lr: Learning rate of the transformer.
        **kwargs: Not used.

    Returns:
        A tuple of ``(optimizer, scheduler)``.
    """
    if not self.use_transformer:
        optimizer = torch.optim.Adam(self.model.parameters(), self.config.lr)
        # NOTE(review): ``verbose`` is deprecated in recent PyTorch releases - confirm
        # against the torch version this project pins.
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer=optimizer,
            mode='max',
            factor=0.5,
            patience=2,
            verbose=True,
        )
        return optimizer, scheduler
    accumulation = self.config.get('gradient_accumulation', 1)
    num_training_steps = len(trn) * epochs // accumulation
    # noinspection PyProtectedMember
    optimizer, scheduler = build_optimizer_scheduler_with_transformer(self.model,
                                                                      self._get_transformer(),
                                                                      lr,
                                                                      transformer_lr,
                                                                      num_training_steps,
                                                                      warmup_steps,
                                                                      weight_decay,
                                                                      adam_epsilon)
    return optimizer, scheduler
def fit_dataloader(self, trn: DataLoader, criterion, optimizer, metric, logger: logging.Logger,
                   linear_scheduler=None, history: History = None, gradient_accumulation=1, **kwargs):
    """Train the model for one pass over ``trn``.

    Args:
        trn: The training dataloader.
        criterion: Not used; the loss is read from the model output.
        optimizer: The optimizer to step.
        metric: The metric updated with every batch.
        logger: Logger used for progress reports.
        linear_scheduler: An optional scheduler stepped after every optimizer step.
        history: Tracks training steps and decides when an optimizer step is due.
        gradient_accumulation: Number of mini-batches accumulated per optimizer step.
        **kwargs: Not used.

    Returns:
        The accumulated loss divided by ``timer.total``.
    """
    self.model.train()
    timer = CountdownTimer(history.num_training_steps(len(trn), gradient_accumulation=gradient_accumulation))
    total_loss = 0
    self.reset_metrics(metric)
    # Fix: gradients used to be cleared at the top of every mini-batch, which threw away
    # everything accumulated so far whenever ``gradient_accumulation > 1``. They are now
    # cleared once up front and again right after each optimizer step.
    optimizer.zero_grad()
    for batch in trn:
        output_dict = self.feed_batch(batch)
        self.update_metrics(batch, output_dict, metric)
        loss = output_dict['loss']
        if gradient_accumulation and gradient_accumulation > 1:
            # Scale so the accumulated gradient matches that of one large batch
            loss /= gradient_accumulation
        loss.backward()
        total_loss += loss.item()
        # NOTE(review): ``history.step`` is assumed to return True when an optimizer
        # update is due - confirm against ``History``.
        if history.step(gradient_accumulation):
            if self.config.grad_norm:
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.config.grad_norm)
            optimizer.step()
            if linear_scheduler:
                linear_scheduler.step()
            optimizer.zero_grad()
            timer.log(self.report_metrics(total_loss / (timer.current + 1), metric), ratio_percentage=None,
                      logger=logger)
        # Drop the reference so the graph of this batch can be freed
        del loss
    return total_loss / timer.total
def build_dataloader(self, data, batch_size, shuffle, device, logger: logging.Logger = None,
                     vocabs=None, sampler_builder=None, gradient_accumulation=1, **kwargs) -> DataLoader:
    """Build the dataloader of the biaffine NER component.

    Args:
        data: The data handed to ``self.build_dataset``.
        batch_size: Not read by this method; batching is driven by ``sampler_builder``.
        shuffle: Forwarded to the sampler builder.
        device: The device the batches are moved to.
        logger: Logger handed to ``self.build_vocabs``.
        vocabs: The vocabs to use, defaulting to ``self.vocabs``.
        sampler_builder: Optional builder of the batch sampler.
        gradient_accumulation: Forwarded to the sampler builder.
        **kwargs: Not used.

    Returns:
        A :class:`PadSequenceDataLoader` over the built dataset.
    """
    if vocabs is None:
        vocabs = self.vocabs
    transform = TransformList(unpack_ner, FieldLength('token'))
    if isinstance(self.config.embed, Embedding):
        transform.append(self.config.embed.transform(vocabs=vocabs))
    transform.append(self.vocabs)
    dataset = self.build_dataset(data, vocabs, transform)
    if vocabs.mutable:
        self.build_vocabs(dataset, logger, vocabs)
    if 'token' in vocabs:
        # Fix: this used to collect the token lists themselves instead of their lengths,
        # unlike the ``token_input_ids`` branch below.
        lens = [len(x['token']) for x in dataset]
    else:
        lens = [len(x['token_input_ids']) for x in dataset]
    if sampler_builder:
        sampler = sampler_builder.build(lens, shuffle, gradient_accumulation)
    else:
        sampler = None
    return PadSequenceDataLoader(batch_sampler=sampler,
                                 device=device,
                                 dataset=dataset)
ret_tokens=True, **kwargs): if not data: return [] flat = self.input_is_flat(data) if flat: data = [data] dataloader = self.build_dataloader([{'token': x} for x in data], batch_size, False, self.device) predictions = [] orders = [] for batch in dataloader: output_dict = self.feed_batch(batch) token = batch['token'] prediction = output_dict['prediction'] self.prediction_to_result(token, prediction, predictions, ret_tokens) orders.extend(batch[IDX]) predictions = reorder(predictions, orders) if flat: return predictions[0] return predictions @staticmethod def prediction_to_result(token, prediction, predictions: List, ret_tokens: Union[bool, str]): for tokens, ner in zip(token, prediction): prediction_per_sent = [] for i, (b, e, l) in enumerate(ner): if ret_tokens is not None: entity = tokens[b: e + 1] if isinstance(ret_tokens, str): entity = ret_tokens.join(entity) prediction_per_sent.append((entity, l, b, e + 1)) else: prediction_per_sent.append((b, e + 1, l)) predictions.append(prediction_per_sent) @staticmethod def input_is_flat(data): return isinstance(data[0], str) # noinspection PyMethodOverriding def fit(self, trn_data, dev_data, save_dir, embed: Embedding, context_layer, sampler='sorting', n_buckets=32, batch_size=50, lexical_dropout=0.5, ffnn_size=150, is_flat_ner=True, doc_level_offset=True, lr=1e-3, transformer_lr=1e-5, adam_epsilon=1e-6, weight_decay=0.01, warmup_steps=0.1, grad_norm=5.0, epochs=50, loss_reduction='sum', gradient_accumulation=1, ret_tokens=True, tagset=None, sampler_builder=None, devices=None, logger=None, seed=None, **kwargs ): """ Args: trn_data: Path to training set. dev_data: Path to dev set. save_dir: The directory to save trained component. embed: Embeddings to use. context_layer: A contextualization layer (transformer or RNN). sampler: Sampler to use. n_buckets: Number of buckets to use in KMeans sampler. batch_size: The number of samples in a batch. lexical_dropout: Dropout applied to hidden states of context layer. 
ffnn_size: Feedforward size for MLPs extracting the head/tail representations. is_flat_ner: ``True`` for flat NER, otherwise nested NER. doc_level_offset: ``True`` to indicate the offsets in ``jsonlines`` are of document level. lr: Learning rate for decoder. transformer_lr: Learning rate for encoder. adam_epsilon: The epsilon to use in Adam. weight_decay: The weight decay to use. warmup_steps: The number of warmup steps. grad_norm: Gradient norm for clipping. epochs: The number of epochs to train. loss_reduction: The loss reduction used in aggregating losses. gradient_accumulation: Number of mini-batches per update step. ret_tokens: A delimiter between tokens in entities so that the surface form of an entity can be rebuilt. tagset: Optional tagset to prune entities outside of this tagset from datasets. sampler_builder: The builder to build sampler, which will override batch_size. devices: Devices this component will live on. logger: Any :class:`logging.Logger` instance. seed: Random seed to reproduce this training. **kwargs: Not used. Returns: The best metrics on training set. 
def get_pred_ner(self, sentences, span_scores):
    """Greedily pick non-conflicting entity spans from the scores of all candidate spans.

    Args:
        sentences: Tokenized sentences of the batch.
        span_scores: One row of label scores per candidate span. Rows follow the enumeration order
            ``(sentence, begin, end)`` with ``begin <= end`` used below.

    Returns:
        For each sentence, a list of ``(begin, end, label)`` with an inclusive ``end``, sorted by offsets.
    """
    flat_only = self.config.is_flat_ner
    candidates = [(sid, begin, end)
                  for sid, sent in enumerate(sentences)
                  for begin in range(len(sent))
                  for end in range(begin, len(sent))]

    scores = span_scores.tolist()
    best_labels = torch.argmax(span_scores, dim=-1).tolist()
    scored = [[] for _ in range(len(sentences))]
    for row, label_id in enumerate(best_labels):
        if label_id <= 0:
            # label 0 is the null label, i.e. the span is not an entity
            continue
        sid, begin, end = candidates[row]
        scored[sid].append((begin, end, label_id, scores[row][label_id]))

    def clashes(ns, ne, ts, te):
        # partially overlapping spans are never allowed, nested or flat
        if ns < ts <= ne < te or ts < ns <= te < ne:
            return True
        # in flat NER a span may not contain, or be contained in, another one
        return flat_only and (ns <= ts <= te <= ne or ts <= ns <= ne <= te)

    prediction = []
    for spans in scored:
        kept = []
        # highest score first, so that better spans win conflicts
        for ns, ne, label_id, _ in sorted(spans, reverse=True, key=lambda span: span[3]):
            if not any(clashes(ns, ne, ts, te) for ts, te, _ in kept):
                kept.append((ns, ne, label_id))
        prediction.append(kept)

    idx_to_label = self.vocabs['label'].idx_to_token
    return [[(begin, end, idx_to_label[label_id]) for begin, end, label_id in sorted(set(kept))]
            for kept in prediction]
class BiaffineNamedEntityRecognitionDecoder(nn.Module):
    def __init__(self, hidden_size, ffnn_size, label_space_size, loss_reduction='sum') -> None:
        """An implementation of the biaffine decoder in "Named Entity Recognition as Dependency Parsing"
        (:cite:`yu-etal-2020-named`).

        Args:
            hidden_size: Size of hidden states.
            ffnn_size: Feedforward size for MLPs extracting the head/tail representations.
            label_space_size: Size of tag set.
            loss_reduction: The loss reduction used in aggregating losses.
        """
        super().__init__()
        self.loss_reduction = loss_reduction

        # MLPs
        def new_mlp():
            return TimeDistributed(nn.Linear(hidden_size, ffnn_size))

        self.start_mlp = new_mlp()
        self.end_mlp = new_mlp()
        self.biaffine = Biaffine(ffnn_size, label_space_size)

    def forward(self, contextualized_embeddings: torch.FloatTensor, batch: Dict[str, torch.Tensor], mask=None):
        """Score all candidate spans of a batch and, if gold spans are present, compute the loss.

        Args:
            contextualized_embeddings: Hidden states of the tokens fed to the start/end MLPs.
            batch: A dict from which ``token_length`` and, optionally, ``begin_offset``, ``end_offset`` and
                ``label_id`` are read. Missing keys are treated as ``None``.
            mask: Token mask. Built from ``token_length`` when not provided.

        Returns:
            The dict produced by :meth:`decode`.
        """
        keys = 'token_length', 'begin_offset', 'end_offset', 'label_id'
        sent_lengths, gold_starts, gold_ends, gold_labels = [batch.get(k, None) for k in keys]
        if mask is None:
            mask = hanlp.utils.torch_util.lengths_to_mask(sent_lengths)
        num_sentences, max_sent_length = mask.size()
        return self.decode(contextualized_embeddings, gold_starts, gold_ends, gold_labels, mask,
                           max_sent_length, num_sentences, sent_lengths)

    def get_dense_span_labels(self, span_starts, span_ends, span_labels, max_sentence_length):
        """Scatter sparse gold spans into a dense ``[num_sentences, max_len, max_len]`` label tensor.

        Args:
            span_starts: ``[num_sentences, max_spans_num]`` start offsets of gold spans.
            span_ends: ``[num_sentences, max_spans_num]`` end offsets of gold spans.
            span_labels: ``[num_sentences, max_spans_num]`` label ids of gold spans.
            max_sentence_length: Size of the two span dimensions of the result.

        Returns:
            A dense ``torch.long`` tensor whose entry ``[i, s, e]`` holds the label of span ``(s, e)`` in
            sentence ``i`` and ``0`` elsewhere. Labels sharing the same coordinate are summed.
        """
        num_sentences, max_spans_num = span_starts.size()
        sentence_indices = torch.arange(0, num_sentences, device=span_starts.device) \
            .unsqueeze(1).expand(-1, max_spans_num)
        # one (sentence, start, end) coordinate per column: [3, num_sentences * max_spans_num]
        coordinates = torch.stack([sentence_indices, span_starts, span_ends], dim=0).reshape(3, -1)
        # torch.sparse.LongTensor(indices, values, size) is a deprecated legacy constructor;
        # torch.sparse_coo_tensor is its documented replacement.
        dense_labels = torch.sparse_coo_tensor(
            coordinates,
            span_labels.reshape(-1),
            size=(num_sentences, max_sentence_length, max_sentence_length),
            dtype=torch.long,
        ).to_dense()
        return dense_labels

    def decode(self, contextualized_embeddings, gold_starts, gold_ends, gold_labels, masks, max_sent_length,
               num_sentences, sent_lengths):
        """Score candidate spans and optionally compute the cross entropy loss against gold spans.

        Args:
            contextualized_embeddings: Hidden states of the tokens.
            gold_starts: Start offsets of gold spans or ``None`` when predicting.
            gold_ends: End offsets of gold spans.
            gold_labels: Label ids of gold spans.
            masks: ``[num_sentences, max_sent_length]`` boolean token mask.
            max_sent_length: Length of the longest sentence in the batch.
            num_sentences: Number of sentences in the batch.
            sent_lengths: Sentence lengths; only its device is used here.

        Returns:
            A dict with ``candidate_ner_scores``, the scores of the spans kept by the mask, and ``loss``
            when gold spans are given.
        """
        # Apply MLPs to starts and ends, [num_sentences, max_sentences_length,emb]
        candidate_starts_emb = self.start_mlp(contextualized_embeddings)
        candidate_ends_emb = self.end_mlp(contextualized_embeddings)
        # NOTE(review): presumably moves the label dimension last, i.e. [batch, start, end, label] - confirm
        # against Biaffine
        candidate_ner_scores = self.biaffine(candidate_starts_emb, candidate_ends_emb).permute([0, 2, 3, 1])

        # generate candidate spans with argument pruning
        # Generate masks: both endpoints of a span have to be real tokens
        candidate_scores_mask = masks.unsqueeze(1) & masks.unsqueeze(2)
        device = sent_lengths.device
        # NOTE(review): despite its name this mask presumably keeps spans with end >= start, matching the
        # candidate enumeration used at prediction time - confirm against lengths_to_mask
        sentence_ends_leq_starts = (
            ~hanlp.utils.torch_util.lengths_to_mask(torch.arange(max_sent_length, device=device), max_sent_length)) \
            .unsqueeze_(0).expand(num_sentences, -1, -1)
        candidate_scores_mask &= sentence_ends_leq_starts
        candidate_ner_scores = candidate_ner_scores[candidate_scores_mask]
        predict_dict = {
            "candidate_ner_scores": candidate_ner_scores,
        }
        if gold_starts is not None:
            gold_ner_labels = self.get_dense_span_labels(gold_starts, gold_ends, gold_labels, max_sent_length)
            loss = torch.nn.functional.cross_entropy(candidate_ner_scores,
                                                     gold_ner_labels[candidate_scores_mask],
                                                     reduction=self.loss_reduction)
            predict_dict['loss'] = loss
        return predict_dict
class RNNNamedEntityRecognizerTF(RNNTaggerTF, IOBES_NamedEntityRecognizer):
    """An RNN tagger for NER which combines ``RNNTaggerTF`` with ``IOBES_NamedEntityRecognizer``."""

    def fit(self, trn_data: str, dev_data: str = None, save_dir: str = None, embeddings=100, embedding_trainable=False,
            rnn_input_dropout=0.2, rnn_units=100, rnn_output_dropout=0.2, epochs=20, logger=None,
            loss: Union[tf.keras.losses.Loss, str] = None,
            optimizer: Union[str, tf.keras.optimizers.Optimizer] = 'adam', metrics='f1', batch_size=32,
            dev_batch_size=32, lr_decay_per_epoch=None,
            run_eagerly=False,
            verbose=True, **kwargs):
        """Train this component by delegating to the parent ``fit``.

        This override only declares NER specific defaults; the arguments are handed to the parent
        implementation, which defines their meaning.

        Returns:
            Whatever the parent ``fit`` returns.
        """
        # assert kwargs.get('run_eagerly', True), 'This component can only run eagerly'
        # kwargs['run_eagerly'] = True
        # NOTE(review): ``locals()`` is captured here, so any local variable introduced above this line would
        # presumably be forwarded as well - verify against merge_locals_kwargs before adding one.
        return super().fit(**merge_locals_kwargs(locals(), kwargs))

    def build_loss(self, loss, **kwargs):
        """Build the loss, falling back to sparse categorical cross entropy when none is given.

        Args:
            loss: The loss to use. When falsy, a ``SparseCategoricalCrossentropy`` over logits with
                ``SUM_OVER_BATCH_SIZE`` reduction is used.
            **kwargs: Passed to the parent ``build_loss``.

        Returns:
            Whatever the parent ``build_loss`` returns.
        """
        if not loss:
            loss = tf.keras.losses.SparseCategoricalCrossentropy(
                reduction=tf.keras.losses.Reduction.SUM_OVER_BATCH_SIZE,
                from_logits=True)
        return super().build_loss(loss, **kwargs)
class RNNNamedEntityRecognizer(RNNTagger):

    def __init__(self, **kwargs) -> None:
        """An old-school RNN tagger using word2vec or fasttext embeddings.

        Args:
            **kwargs: Predefined config.
        """
        super().__init__(**kwargs)

    def build_metric(self, **kwargs):
        """Build a span level F1 metric for the tagging scheme of this component."""
        return SpanF1(self.tagging_scheme)

    def evaluate_dataloader(self, data, criterion, logger=None, ratio_width=None, **kwargs):
        """Evaluate with the parent implementation and log the last item of the metric report.

        Returns:
            The ``(loss, metric)`` pair returned by the parent implementation.
        """
        evaluation = super().evaluate_dataloader(data, criterion, logger, ratio_width, **kwargs)
        dev_loss, dev_metric = evaluation
        if logger:
            report = dev_metric.result(True, False)
            logger.info(report[-1])
        return dev_loss, dev_metric

    def fit(self, trn_data, dev_data, save_dir, batch_size=50, epochs=100, embed=100, rnn_input=None, rnn_hidden=256,
            drop=0.5, lr=0.001, patience=10, crf=True, optimizer='adam', token_key='token', tagging_scheme=None,
            anneal_factor: float = 0.5, delimiter=None, anneal_patience=2, devices=None,
            token_delimiter=None,
            logger=None,
            verbose=True, **kwargs):
        """Train this component by delegating to the parent ``fit`` with NER specific defaults."""
        # ``locals()`` is captured below, hence no local variable is introduced in this method
        return super().fit(**merge_locals_kwargs(locals(), kwargs))

    def update_metrics(self, metric, logits, y, mask, batch, prediction):
        """Decode ``logits`` into tags and feed them to ``metric`` together with the gold ``batch['tag']``."""
        decoded = self.decode_output(logits, mask, batch)
        if isinstance(decoded, torch.Tensor):
            decoded = decoded.tolist()
        predicted_tags = self._id_to_tags(decoded)
        metric(predicted_tags, batch['tag'])

    def predict(self, tokens: Any, batch_size: int = None, **kwargs):
        """Predict named entities of ``tokens`` using the parent implementation."""
        return super().predict(tokens, batch_size, **kwargs)

    def predict_data(self, data, batch_size, **kwargs):
        """Tag ``data`` and convert the tags of each sentence into entities.

        Returns:
            For each sentence, a list of ``(entity, type, begin, end)`` where ``end`` is exclusive and
            ``entity`` is the covered tokens joined by ``config.token_delimiter``.

        Raises:
            ValueError: If the tagging scheme is none of ``IOBES``, ``BIO`` and ``BIOUL``.
        """
        outputs = super().predict_data(data, batch_size)
        scheme = self.tagging_scheme
        if scheme == 'IOBES':
            to_spans = hanlp.utils.span_util.iobes_tags_to_spans
        elif scheme == 'BIO':
            to_spans = hanlp.utils.span_util.bio_tags_to_spans
        elif scheme == 'BIOUL':
            to_spans = hanlp.utils.span_util.bioul_tags_to_spans
        else:
            raise ValueError(f'Unrecognized tag scheme {scheme}')
        entities = [to_spans(tags) for tags in outputs]
        for index, (tokens, spans) in enumerate(zip(data, entities)):
            decorated = []
            for label, (begin, end) in spans:
                surface = self.config.token_delimiter.join(tokens[begin:end + 1])
                decorated.append((surface, label, begin, end + 1))
            outputs[index] = decorated
        return outputs

    def save_config(self, save_dir, filename='config.json'):
        """Save the config, guessing ``token_delimiter`` first when it has not been set.

        The guess is an empty string if the last 100 tokens of the vocabulary are all single characters,
        otherwise a space.
        """
        if self.config.token_delimiter is None:
            recent_tokens = self.vocabs[self.config.token_key].idx_to_token[-100:]
            if all([len(token) == 1 for token in recent_tokens]):
                self.config.token_delimiter = ''
            else:
                self.config.token_delimiter = ' '
        super().save_config(save_dir, filename)
def tag_to_span(self, batch_tags, batch):
    """Convert the tags of every sentence into entity spans, applying the optional dictionaries.

    For each sentence the tags are decoded with ``get_entities``. Then, in this order: matches of the
    whitelist dictionary are written over the tags, consecutive entities of the types in
    ``config.merge_types`` are merged, and entities whose surface form is in the blacklist are dropped.

    Args:
        batch_tags: The tags of each sentence. When the whitelist matches a sentence, its tag list is
            modified in place.
        batch: The batch, from which the tokens are read under ``config.token_key``.

    Returns:
        For each sentence, a list of ``(label, start, end)`` entities with an exclusive ``end``.
    """
    spans = []
    sents = batch[self.config.token_key]
    dict_whitelist = self.dict_whitelist
    dict_blacklist = self.dict_blacklist
    merge_types = self.config.get('merge_types', None)
    for tags, tokens in zip(batch_tags, sents):
        entities = get_entities(tags)
        if dict_whitelist:
            # matches are unpacked as (start, end, label) below
            matches = dict_whitelist.tokenize(tokens)
            if matches:
                # Fix O E-LOC O like predictions
                # NOTE(review): tags have not changed since the call above, so this second call looks
                # redundant provided get_entities has no side effects - confirm
                entities = get_entities(tags)
                # rewrite every decoded entity as a well-formed S or B I* E sequence
                for label, start, end in entities:
                    if end - start == 1:
                        tags[start] = 'S-' + label
                    else:
                        tags[start] = 'B-' + label
                        for i in range(start + 1, end - 1):
                            tags[i] = 'I-' + label
                        tags[end - 1] = 'E-' + label
                for start, end, label in matches:
                    # only apply a match whose boundaries do not fall inside an existing entity
                    # NOTE(review): the guard tests the prefixes 'M'/'E' and 'B'/'M' whereas the code in
                    # this method writes 'I-' for inner tags and never 'M-' - confirm which scheme is
                    # intended, as an 'I-' tag at a boundary does not block a match
                    if (not tags[start][0] in 'ME') and (not tags[end - 1][0] in 'BM'):
                        if end - start == 1:
                            tags[start] = 'S-' + label
                        else:
                            tags[start] = 'B-' + label
                            for i in range(start + 1, end - 1):
                                tags[i] = 'I-' + label
                            tags[end - 1] = 'E-' + label
                # decode again to pick up the tags written by the whitelist
                entities = get_entities(tags)
        if merge_types and len(entities) > 1:
            merged_entities = []
            # index of the first entity of the run currently being merged
            begin = 0
            for i in range(1, len(entities)):
                # close the run when the label changes, the entities are not adjacent or the label is
                # not mergeable
                if entities[begin][0] != entities[i][0] or entities[i - 1][2] != entities[i][1] \
                        or entities[i][0] not in merge_types:
                    merged_entities.append((entities[begin][0], entities[begin][1], entities[i - 1][2]))
                    begin = i
            # flush the last run
            merged_entities.append((entities[begin][0], entities[begin][1], entities[-1][2]))
            entities = merged_entities

        if dict_blacklist:
            pruned = []
            delimiter_in_entity = self.config.get('delimiter_in_entity', ' ')
            for label, start, end in entities:
                # the blacklist is looked up by the surface form of the entity
                entity = delimiter_in_entity.join(tokens[start:end])
                if entity not in dict_blacklist:
                    pruned.append((label, start, end))
            entities = pruned
        spans.append(entities)
    return spans
merge_types: The types of consecutive entities to be merged. average_subwords: ``True`` to average subword representations. word_dropout: Dropout rate to randomly replace a subword with MASK. hidden_dropout: Dropout rate applied to hidden states. layer_dropout: Randomly zero out hidden states of a transformer layer. scalar_mix: Layer attention. grad_norm: Gradient norm for clipping. lr: Learning rate for decoder. transformer_lr: Learning for encoder. adam_epsilon: The epsilon to use in Adam. weight_decay: The weight decay to use. warmup_steps: The number of warmup steps. crf: ``True`` to enable CRF (:cite:`lafferty2001conditional`). secondary_encoder: An optional secondary encoder to provide enhanced representation by taking the hidden states from the main encoder as input. reduction: The loss reduction used in aggregating losses. batch_size: The number of samples in a batch. sampler_builder: The builder to build sampler, which will override batch_size. epochs: The number of epochs to train. tagset: Optional tagset to prune entities outside of this tagset from datasets. token_key: The key to tokens in dataset. max_seq_len: The maximum sequence length. Sequence longer than this will be handled by sliding window. sent_delimiter: Delimiter between sentences, like period or comma, which indicates a long sentence can be split here. char_level: Whether the sequence length is measured at char level, which is never the case for lemmatization. hard_constraint: Whether to enforce hard length constraint on sentences. If there is no ``sent_delimiter`` in a sentence, it will be split at a token anyway. transform: An optional transform to be applied to samples. Usually a character normalization transform is passed in. devices: Devices this component will live on. logger: Any :class:`logging.Logger` instance. seed: Random seed to reproduce this training. **kwargs: Not used. Returns: The best metrics on training set. 
def build_dataset(self, data, transform=None, **kwargs):
    """Build a dataset with the parent implementation and optionally prune its tagset.

    When ``data`` is a ``str`` and ``config.tagset`` is set, a ``prune_ner_tagset`` transform bound to that
    tagset is appended to the dataset.

    Args:
        data: The data to build a dataset from.
        transform: An optional transform passed to the parent implementation.
        **kwargs: Passed to the parent implementation.

    Returns:
        The dataset.
    """
    built = super().build_dataset(data, transform, **kwargs)
    if not isinstance(data, str):
        return built
    allowed_tags = self.config.get('tagset', None)
    if allowed_tags:
        prune = functools.partial(prune_ner_tagset, tagset=allowed_tags)
        built.append_transform(prune)
    return built
def kmeans(x, k, max_it=32):
    r"""
    KMeans algorithm for clustering the sentences by length.

    Args:
        x (list[int]):
            The list of sentence lengths. An empty list yields no centroids and no clusters.
        k (int):
            The number of clusters.
            This is an approximate value. The final number of clusters can be less or equal to `k`.
        max_it (int):
            Maximum number of iterations.
            If centroids does not converge after several iterations, the algorithm will be early stopped.

    Returns:
        list[float], list[list[int]]:
            The first list contains average lengths of sentences in each cluster.
            The second is the list of clusters holding indices of data points.

    Examples:
        >>> x = torch.randint(10,20,(10,)).tolist()
        >>> x
        [15, 10, 17, 11, 18, 13, 17, 19, 18, 14]
        >>> centroids, clusters = kmeans(x, 3)
        >>> centroids
        [10.5, 14.0, 17.799999237060547]
        >>> clusters
        [[1, 3], [0, 5, 9], [2, 4, 6, 7, 8]]
        >>> kmeans([], 3)
        ([], [])
    """
    # without datapoints the reductions below would run over a zero-sized dimension and raise
    if len(x) == 0:
        return [], []

    # the number of clusters must not be greater than the number of datapoints
    x, k = torch.tensor(x, dtype=torch.float), min(len(x), k)
    # collect unique datapoints
    d = x.unique()
    # initialize k centroids randomly
    c = d[torch.randperm(len(d))[:k]]
    # assign each datapoint to the cluster with the closest centroid
    dists, y = torch.abs_(x.unsqueeze(-1) - c).min(-1)

    for _ in range(max_it):
        # if an empty cluster is encountered,
        # choose the farthest datapoint from the biggest cluster and move that the empty one
        mask = torch.arange(k).unsqueeze(-1).eq(y)
        none = torch.where(~mask.any(-1))[0].tolist()
        while len(none) > 0:
            for i in none:
                # the biggest cluster
                b = torch.where(mask[mask.sum(-1).argmax()])[0]
                # the datapoint farthest from the centroid of cluster b
                f = dists[b].argmax()
                # update the assigned cluster of f
                y[b[f]] = i
                # re-calculate the mask
                mask = torch.arange(k).unsqueeze(-1).eq(y)
            none = torch.where(~mask.any(-1))[0].tolist()
        # update the centroids
        c, old = (x * mask).sum(-1) / mask.sum(-1), c
        # re-assign all datapoints to clusters
        dists, y = torch.abs_(x.unsqueeze(-1) - c).min(-1)
        # stop iteration early if the centroids converge
        if c.equal(old):
            break
    # assign all datapoints to the new-generated clusters
    # the empty ones are discarded
    assigned = y.unique().tolist()
    # get the centroids of the assigned clusters
    centroids = c[assigned].tolist()
    # map all values of datapoints to buckets
    clusters = [torch.where(y.eq(i))[0].tolist() for i in assigned]

    return centroids, clusters
Args: scores (~torch.Tensor): ``[batch_size, seq_len, seq_len]``. Scores of all dependent-head pairs. mask (~torch.BoolTensor): ``[batch_size, seq_len]``. The mask to avoid parsing over padding tokens. The first column serving as pseudo words for roots should be ``False``. Returns: ~torch.Tensor: A tensor with shape ``[batch_size, seq_len]`` for the resulting projective parse trees. Examples: >>> scores = torch.tensor([[[-13.5026, -18.3700, -13.0033, -16.6809], [-36.5235, -28.6344, -28.4696, -31.6750], [ -2.9084, -7.4825, -1.4861, -6.8709], [-29.4880, -27.6905, -26.1498, -27.0233]]]) >>> mask = torch.tensor([[False, True, True, True]]) >>> eisner(scores, mask) tensor([[0, 2, 0, 2]]) .. _Online Large-Margin Training of Dependency Parsers: https://www.aclweb.org/anthology/P05-1012/ """ lens = mask.sum(1) batch_size, seq_len, _ = scores.shape scores = scores.permute(2, 1, 0) s_i = torch.full_like(scores, float('-inf')) s_c = torch.full_like(scores, float('-inf')) p_i = scores.new_zeros(seq_len, seq_len, batch_size).long() p_c = scores.new_zeros(seq_len, seq_len, batch_size).long() s_c.diagonal().fill_(0) for w in range(1, seq_len): n = seq_len - w starts = p_i.new_tensor(range(n)).unsqueeze(0) # ilr = C(i->r) + C(j->r+1) ilr = stripe(s_c, n, w) + stripe(s_c, n, w, (w, 1)) # [batch_size, n, w] il = ir = ilr.permute(2, 0, 1) # I(j->i) = max(C(i->r) + C(j->r+1) + s(j->i)), i <= r < j il_span, il_path = il.max(-1) s_i.diagonal(-w).copy_(il_span + scores.diagonal(-w)) p_i.diagonal(-w).copy_(il_path + starts) # I(i->j) = max(C(i->r) + C(j->r+1) + s(i->j)), i <= r < j ir_span, ir_path = ir.max(-1) s_i.diagonal(w).copy_(ir_span + scores.diagonal(w)) p_i.diagonal(w).copy_(ir_path + starts) # C(j->i) = max(C(r->i) + I(j->r)), i <= r < j cl = stripe(s_c, n, w, (0, 0), 0) + stripe(s_i, n, w, (w, 0)) cl_span, cl_path = cl.permute(2, 0, 1).max(-1) s_c.diagonal(-w).copy_(cl_span) p_c.diagonal(-w).copy_(cl_path + starts) # C(i->j) = max(I(i->r) + C(r->j)), i < r <= j cr = stripe(s_i, 
def stripe(x, n, w, offset=(0, 0), dim=1):
    r"""
    Returns a diagonal stripe of the tensor.

    Args:
        x (~torch.Tensor): The tensor to take the stripe from. Its first two dimensions are strided.
        n (int): The number of rows of the stripe. Each row starts one step further down the diagonal.
        w (int): The width of the stripe.
        offset (tuple): The ``(row, column)`` position of the first element. Default: ``(0, 0)``.
        dim (int): ``1`` if each row of the stripe runs horizontally in ``x``, ``0`` if vertically.
            Default: ``1``.

    Returns:
        A strided view of shape ``[n, w, *x.shape[2:]]`` over the contiguous version of ``x``.

    Examples:
        >>> x = torch.arange(25).view(5, 5)
        >>> x
        tensor([[ 0,  1,  2,  3,  4],
                [ 5,  6,  7,  8,  9],
                [10, 11, 12, 13, 14],
                [15, 16, 17, 18, 19],
                [20, 21, 22, 23, 24]])
        >>> stripe(x, 2, 3, (1, 1))
        tensor([[ 6,  7,  8],
                [12, 13, 14]])
        >>> stripe(x, 2, 3, dim=0)
        tensor([[ 0,  5, 10],
                [ 6, 11, 16]])
    """
    seq_len = x.size(1)
    x = x.contiguous()
    # number of scalars behind a single [row, column] position
    numel = x[0, 0].numel()
    strides = list(x.stride())
    # moving to the next row of the stripe is one step along the diagonal
    strides[0] = (seq_len + 1) * numel
    # moving inside a row of the stripe is one step to the right (dim=1) or down (dim=0)
    strides[1] = numel if dim == 1 else seq_len * numel
    start = (offset[0] * seq_len + offset[1]) * numel
    return x.as_strided(size=(n, w, *x.shape[2:]), stride=strides, storage_offset=start)
def tarjan(sequence):
    r"""
    Tarjan algorithm for finding Strongly Connected Components (SCCs) of a graph.

    Args:
        sequence (list):
            List of head indices. The head of the ``i``-th word (1-based) is ``sequence[i - 1]``.

    Yields:
        A list of indices that make up a SCC. All self-loops are ignored.

    Examples:
        >>> next(tarjan([2, 5, 0, 3, 1]))  # (1 -> 5 -> 2 -> 1) is a cycle
        [2, 5, 1]
    """
    # shift by one so that indices of words start from 1; node 0 has no head
    heads = [-1] + sequence
    size = len(heads)
    # the timestep at which each node was first visited, -1 if not visited yet
    visited_at = [-1] * size
    # the smallest timestep reachable from each node within its SCC
    lowest = [-1] * size
    # nodes visited but not yet assigned to a SCC
    pending, is_pending = [], [False] * size
    clock = 0

    def visit(node):
        nonlocal clock
        visited_at[node] = lowest[node] = clock
        clock += 1
        pending.append(node)
        is_pending[node] = True

        for dependent, head in enumerate(heads):
            if head != node:
                continue
            if visited_at[dependent] == -1:
                yield from visit(dependent)
                lowest[node] = min(lowest[node], lowest[dependent])
            elif is_pending[dependent]:
                lowest[node] = min(lowest[node], visited_at[dependent])

        # only the root of a SCC pops its members
        if lowest[node] != visited_at[node]:
            return
        component = [pending.pop()]
        while component[-1] != node:
            is_pending[component[-1]] = False
            component.append(pending.pop())
        is_pending[node] = False
        # ignore the self-loop
        if len(component) > 1:
            yield component

    for node in range(size):
        if visited_at[node] == -1:
            yield from visit(node)
_Non-projective Dependency Parsing using Spanning Tree Algorithms: https://www.aclweb.org/anthology/H05-1066/ """ s[0, 1:] = float('-inf') # prevent self-loops s.diagonal()[1:].fill_(float('-inf')) # select heads with highest scores tree = s.argmax(-1) # return the cycle finded by tarjan algorithm lazily cycle = next(tarjan(tree.tolist()[1:]), None) # if the tree has no cycles, then it is a MST if not cycle: return tree # indices of cycle in the original tree cycle = torch.tensor(cycle) # indices of noncycle in the original tree noncycle = torch.ones(len(s)).index_fill_(0, cycle, 0) noncycle = torch.where(noncycle.gt(0))[0] def contract(s): # heads of cycle in original tree cycle_heads = tree[cycle] # scores of cycle in original tree s_cycle = s[cycle, cycle_heads] # calculate the scores of cycle's potential dependents # s(c->x) = max(s(x'->x)), x in noncycle and x' in cycle s_dep = s[noncycle][:, cycle] # find the best cycle head for each noncycle dependent deps = s_dep.argmax(1) # calculate the scores of cycle's potential heads # s(x->c) = max(s(x'->x) - s(a(x')->x') + s(cycle)), x in noncycle and x' in cycle # a(v) is the predecessor of v in cycle # s(cycle) = sum(s(a(v)->v)) s_head = s[cycle][:, noncycle] - s_cycle.view(-1, 1) + s_cycle.sum() # find the best noncycle head for each cycle dependent heads = s_head.argmax(0) contracted = torch.cat((noncycle, torch.tensor([-1]))) # calculate the scores of contracted graph s = s[contracted][:, contracted] # set the contracted graph scores of cycle's potential dependents s[:-1, -1] = s_dep[range(len(deps)), deps] # set the contracted graph scores of cycle's potential heads s[-1, :-1] = s_head[heads, range(len(heads))] return s, heads, deps # keep track of the endpoints of the edges into and out of cycle for reconstruction later s, heads, deps = contract(s) # y is the contracted tree y = chuliu_edmonds(s) # exclude head of cycle from y y, cycle_head = y[:-1], y[-1] # fix the subtree with no heads coming from the cycle 
def mst(scores, mask, multiroot=False):
    r"""
    MST algorithm for decoding non-projective trees.
    This is a wrapper for ChuLiu/Edmonds algorithm.

    The algorithm first runs ChuLiu/Edmonds to parse a tree and then checks it for multiple roots.
    If ``multiroot=False`` and the parsed tree indeed has more than one root, each predicted root is tried
    in turn as the only token allowed to attach to the pseudo root, ChuLiu/Edmonds is re-run,
    and the highest-scoring of these single-root trees is kept.
    Otherwise the tree from the first run is directly taken as the final output.

    Args:
        scores (~torch.Tensor): ``[batch_size, seq_len, seq_len]``.
            Scores of all dependent-head pairs.
        mask (~torch.BoolTensor): ``[batch_size, seq_len]``.
            The mask to avoid parsing over padding tokens.
            The first column serving as pseudo words for roots should be ``False``.
        multiroot (bool):
            If ``False``, ensures to parse a single-root tree. Default: ``False``.

    Returns:
        ~torch.Tensor:
            A tensor with shape ``[batch_size, seq_len]`` for the resulting non-projective parse trees.

    Examples:
        >>> scores = torch.tensor([[[-11.9436, -13.1464,  -6.4789, -13.8917],
                                    [-60.6957, -60.2866, -48.6457, -63.8125],
                                    [-38.1747, -49.9296, -45.2733, -49.5571],
                                    [-19.7504, -23.9066,  -9.9139, -16.2088]]])
        >>> scores[:, 0, 1:] = float('-inf')
        >>> scores.diagonal(0, 1, 2)[1:].fill_(float('-inf'))
        >>> mask = torch.tensor([[False,  True,  True,  True]])
        >>> mst(scores, mask)
        tensor([[0, 2, 0, 2]])
    """
    seq_len = scores.shape[1]
    # decoding is done sentence by sentence on CPU
    # NOTE(review): chuliu_edmonds writes -inf into the matrix it is given; when ``scores`` already
    # lives on CPU these slices presumably share storage with the caller's tensor - confirm.
    scores = scores.detach().cpu().unbind()

    preds = []
    for i, length in enumerate(mask.sum(1).tolist()):
        # keep the pseudo root plus the ``length`` real tokens
        s = scores[i][:length + 1, :length + 1]
        tree = chuliu_edmonds(s)
        # tokens (1-based) whose predicted head is the pseudo root
        roots = torch.where(tree[1:].eq(0))[0] + 1
        if not multiroot and len(roots) > 1:
            # original root scores, read from the matrix before it is replaced by a copy below
            s_root = s[:, 0]
            s_best = float('-inf')
            # out-of-place fill: ``s`` becomes a fresh matrix with the root column disabled
            s = s.index_fill(1, torch.tensor(0), float('-inf'))
            for root in roots:
                # allow exactly one candidate to attach to the pseudo root
                s[:, 0] = float('-inf')
                s[root, 0] = s_root[root]
                t = chuliu_edmonds(s)
                # total score of the arcs selected by the candidate tree
                s_tree = s[1:].gather(1, t[1:].unsqueeze(-1)).sum()
                if s_tree > s_best:
                    s_best, tree = s_tree, t
        preds.append(tree)

    return pad(preds, total_length=seq_len).to(mask.device)
Examples: >>> s_arc = torch.tensor([[[ -2.8092, -7.9104, -0.9414, -5.4360], [-10.3494, -7.9298, -3.6929, -7.3985], [ 1.1815, -3.8291, 2.3166, -2.7183], [ -3.9776, -3.9063, -1.6762, -3.1861]]]) >>> s_sib = torch.tensor([[[[ 0.4719, 0.4154, 1.1333, 0.6946], [ 1.1252, 1.3043, 2.1128, 1.4621], [ 0.5974, 0.5635, 1.0115, 0.7550], [ 1.1174, 1.3794, 2.2567, 1.4043]], [[-2.1480, -4.1830, -2.5519, -1.8020], [-1.2496, -1.7859, -0.0665, -0.4938], [-2.6171, -4.0142, -2.9428, -2.2121], [-0.5166, -1.0925, 0.5190, 0.1371]], [[ 0.5827, -1.2499, -0.0648, -0.0497], [ 1.4695, 0.3522, 1.5614, 1.0236], [ 0.4647, -0.7996, -0.3801, 0.0046], [ 1.5611, 0.3875, 1.8285, 1.0766]], [[-1.3053, -2.9423, -1.5779, -1.2142], [-0.1908, -0.9699, 0.3085, 0.1061], [-1.6783, -2.8199, -1.8853, -1.5653], [ 0.3629, -0.3488, 0.9011, 0.5674]]]]) >>> mask = torch.tensor([[False, True, True, True]]) >>> eisner2o((s_arc, s_sib), mask) tensor([[0, 2, 0, 2]]) .. _Online Learning of Approximate Dependency Parsing Algorithms: https://www.aclweb.org/anthology/E06-1011/ """ # the end position of each sentence in a batch lens = mask.sum(1) s_arc, s_sib = scores batch_size, seq_len, _ = s_arc.shape # [seq_len, seq_len, batch_size] s_arc = s_arc.permute(2, 1, 0) # [seq_len, seq_len, seq_len, batch_size] s_sib = s_sib.permute(2, 1, 3, 0) s_i = torch.full_like(s_arc, float('-inf')) s_s = torch.full_like(s_arc, float('-inf')) s_c = torch.full_like(s_arc, float('-inf')) p_i = s_arc.new_zeros(seq_len, seq_len, batch_size).long() p_s = s_arc.new_zeros(seq_len, seq_len, batch_size).long() p_c = s_arc.new_zeros(seq_len, seq_len, batch_size).long() s_c.diagonal().fill_(0) for w in range(1, seq_len): # n denotes the number of spans to iterate, # from span (0, w) to span (n, n+w) given width w n = seq_len - w starts = p_i.new_tensor(range(n)).unsqueeze(0) # I(j->i) = max(I(j->r) + S(j->r, i)), i < r < j | # C(j->j) + C(i->j-1)) # + s(j->i) # [n, w, batch_size] il = stripe(s_i, n, w, (w, 1)) + stripe(s_s, n, w, (1, 0), 0) il += 
stripe(s_sib[range(w, n + w), range(n)], n, w, (0, 1)) # [n, 1, batch_size] il0 = stripe(s_c, n, 1, (w, w)) + stripe(s_c, n, 1, (0, w - 1)) # il0[0] are set to zeros since the scores of the complete spans starting from 0 are always -inf il[:, -1] = il0.index_fill_(0, lens.new_tensor(0), 0).squeeze(1) il_span, il_path = il.permute(2, 0, 1).max(-1) s_i.diagonal(-w).copy_(il_span + s_arc.diagonal(-w)) p_i.diagonal(-w).copy_(il_path + starts + 1) # I(i->j) = max(I(i->r) + S(i->r, j), i < r < j | # C(i->i) + C(j->i+1)) # + s(i->j) # [n, w, batch_size] ir = stripe(s_i, n, w) + stripe(s_s, n, w, (0, w), 0) ir += stripe(s_sib[range(n), range(w, n + w)], n, w) ir[0] = float('-inf') # [n, 1, batch_size] ir0 = stripe(s_c, n, 1) + stripe(s_c, n, 1, (w, 1)) ir[:, 0] = ir0.squeeze(1) ir_span, ir_path = ir.permute(2, 0, 1).max(-1) s_i.diagonal(w).copy_(ir_span + s_arc.diagonal(w)) p_i.diagonal(w).copy_(ir_path + starts) # [n, w, batch_size] slr = stripe(s_c, n, w) + stripe(s_c, n, w, (w, 1)) slr_span, slr_path = slr.permute(2, 0, 1).max(-1) # S(j, i) = max(C(i->r) + C(j->r+1)), i <= r < j s_s.diagonal(-w).copy_(slr_span) p_s.diagonal(-w).copy_(slr_path + starts) # S(i, j) = max(C(i->r) + C(j->r+1)), i <= r < j s_s.diagonal(w).copy_(slr_span) p_s.diagonal(w).copy_(slr_path + starts) # C(j->i) = max(C(r->i) + I(j->r)), i <= r < j cl = stripe(s_c, n, w, (0, 0), 0) + stripe(s_i, n, w, (w, 0)) cl_span, cl_path = cl.permute(2, 0, 1).max(-1) s_c.diagonal(-w).copy_(cl_span) p_c.diagonal(-w).copy_(cl_path + starts) # C(i->j) = max(I(i->r) + C(r->j)), i < r <= j cr = stripe(s_i, n, w, (0, 1)) + stripe(s_c, n, w, (1, w), 0) cr_span, cr_path = cr.permute(2, 0, 1).max(-1) s_c.diagonal(w).copy_(cr_span) # disable multi words to modify the root s_c[0, w][lens.ne(w)] = float('-inf') p_c.diagonal(w).copy_(cr_path + starts + 1) def backtrack(p_i, p_s, p_c, heads, i, j, flag): if i == j: return if flag == 'c': r = p_c[i, j] backtrack(p_i, p_s, p_c, heads, i, r, 'i') backtrack(p_i, p_s, p_c, heads, 
def pad(tensors, padding_value=0, total_length=None):
    r"""
    Stack tensors of different sizes into a single batch tensor, filling the gaps with ``padding_value``.

    Args:
        tensors (list[~torch.Tensor]):
            A non-empty list of tensors that all have the same number of dimensions.
        padding_value:
            The value written to every position not covered by an input tensor. Default: 0.
        total_length (int):
            If given, the second dimension of the output (the first dimension of each input)
            is extended to this length. It must not be smaller than the longest input.
            Default: ``None``.

    Returns:
        ~torch.Tensor:
            A tensor of shape ``[len(tensors), *max_sizes]`` created from ``tensors[0]`` via ``.new``,
            where ``max_sizes`` is the per-dimension maximum over the inputs.
    """
    size = [len(tensors)] + [max(tensor.size(i) for tensor in tensors)
                             for i in range(tensors[0].dim())]
    if total_length is not None:
        assert total_length >= size[1]
        size[1] = total_length
    out_tensor = tensors[0].data.new(*size).fill_(padding_value)
    for i, tensor in enumerate(tensors):
        # the index must be a tuple: indexing with a list of slices relies on deprecated
        # "non-tuple sequence" indexing
        region = tuple(slice(0, n) for n in tensor.size())
        out_tensor[i][region] = tensor
    return out_tensor
""" lens = mask.sum(1) arc_preds = s_arc.argmax(-1) bad = [not istree(seq[1:i + 1], proj) for i, seq in zip(lens.tolist(), arc_preds.tolist())] if tree and any(bad): if proj: alg = eisner else: alg = mst s_arc.diagonal(0, 1, 2)[1:].fill_(float('-inf')) arc_preds[bad] = alg(s_arc[bad], mask[bad]) return arc_preds ================================================ FILE: hanlp/components/parsers/alg_tf.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2019-12-26 19:49 # Ported from the PyTorch implementation https://github.com/zysite/biaffine-parser from typing import List import numpy as np import tensorflow as tf from collections import defaultdict def nonzero(t: tf.Tensor) -> tf.Tensor: return tf.where(t > 0) def view(t: tf.Tensor, *dims) -> tf.Tensor: return tf.reshape(t, dims) def arange(n: int) -> tf.Tensor: return tf.range(n) def randperm(n: int) -> tf.Tensor: return tf.random.shuffle(arange(n)) def tolist(t: tf.Tensor) -> List: if isinstance(t, tf.Tensor): t = t.numpy() return t.tolist() def kmeans(x, k, seed=None): """See https://github.com/zysite/biaffine-parser/blob/master/parser/utils/alg.py#L7 Args: x(list): Lengths of sentences k(int): seed: (Default value = None) Returns: """ x = tf.constant(x, dtype=tf.float32) # count the frequency of each datapoint d, indices, f = tf.unique_with_counts(x, tf.int32) f = tf.cast(f, tf.float32) # calculate the sum of the values of the same datapoints total = d * f # initialize k centroids randomly c, old = tf.random.shuffle(d, seed)[:k], None # assign labels to each datapoint based on centroids dists = tf.abs(tf.expand_dims(d, -1) - c) y = tf.argmin(dists, axis=-1, output_type=tf.int32) dists = tf.gather_nd(dists, tf.transpose(tf.stack([tf.range(tf.shape(dists)[0], dtype=tf.int32), y]))) # make sure number of datapoints is greater than that of clusters assert len(d) >= k, f"unable to assign {len(d)} datapoints to {k} clusters" while old is None or not tf.reduce_all(c == 
old): # if an empty cluster is encountered, # choose the farthest datapoint from the biggest cluster # and move that the empty one for i in range(k): if not tf.reduce_any(y == i): mask = tf.cast(y == tf.expand_dims(tf.range(k, dtype=tf.int32), -1), tf.float32) lens = tf.reduce_sum(mask, axis=-1) biggest = view(nonzero(mask[tf.argmax(lens)]), -1) farthest = tf.argmax(tf.gather(dists, biggest)) tf.tensor_scatter_nd_update(y, tf.expand_dims(tf.expand_dims(biggest[farthest], -1), -1), [i]) mask = tf.cast(y == tf.expand_dims(tf.range(k, dtype=tf.int32), -1), tf.float32) # update the centroids c, old = tf.cast(tf.reduce_sum(total * mask, axis=-1), tf.float32) / tf.cast(tf.reduce_sum(f * mask, axis=-1), tf.float32), c # re-assign all datapoints to clusters dists = tf.abs(tf.expand_dims(d, -1) - c) y = tf.argmin(dists, axis=-1, output_type=tf.int32) dists = tf.gather_nd(dists, tf.transpose(tf.stack([tf.range(tf.shape(dists)[0], dtype=tf.int32), y]))) # assign all datapoints to the new-generated clusters # without considering the empty ones y, (assigned, _) = tf.gather(y, indices), tf.unique(y) # get the centroids of the assigned clusters centroids = tf.gather(c, assigned).numpy().tolist() # map all values of datapoints to buckets clusters = [tf.squeeze(tf.where(y == i), axis=-1).numpy().tolist() for i in assigned] return centroids, clusters # *************************************************************** class Tarjan: """Computes Tarjan's algorithm for finding strongly connected components (cycles) of a graph""" def __init__(self, prediction, tokens): """ Parameters ---------- prediction : numpy.ndarray a predicted dependency tree where prediction[dep_idx] = head_idx tokens : numpy.ndarray the tokens we care about (i.e. 
exclude _GO, _EOS, and _PAD) """ self._edges = defaultdict(set) self._vertices = set((0,)) for dep, head in enumerate(prediction[tokens]): self._vertices.add(dep + 1) self._edges[head].add(dep + 1) self._indices = {} self._lowlinks = {} self._onstack = defaultdict(lambda: False) self._SCCs = [] index = 0 stack = [] for v in self.vertices: if v not in self.indices: self.strongconnect(v, index, stack) # ============================================================= def strongconnect(self, v, index, stack): """ Args: v: index: stack: Returns: """ self._indices[v] = index self._lowlinks[v] = index index += 1 stack.append(v) self._onstack[v] = True for w in self.edges[v]: if w not in self.indices: self.strongconnect(w, index, stack) self._lowlinks[v] = min(self._lowlinks[v], self._lowlinks[w]) elif self._onstack[w]: self._lowlinks[v] = min(self._lowlinks[v], self._indices[w]) if self._lowlinks[v] == self._indices[v]: self._SCCs.append(set()) while stack[-1] != v: w = stack.pop() self._onstack[w] = False self._SCCs[-1].add(w) w = stack.pop() self._onstack[w] = False self._SCCs[-1].add(w) return # ====================== @property def edges(self): return self._edges @property def vertices(self): return self._vertices @property def indices(self): return self._indices @property def SCCs(self): return self._SCCs def tarjan(parse_probs, length, tokens_to_keep, ensure_tree=True): """Adopted from Timothy Dozat https://github.com/tdozat/Parser/blob/master/lib/models/nn.py Args: parse_probs(NDArray): seq_len x seq_len, the probability of arcs length(NDArray): sentence length including ROOT tokens_to_keep(NDArray): mask matrix ensure_tree: (Default value = True) Returns: """ if ensure_tree: I = np.eye(len(tokens_to_keep)) # block loops and pad heads parse_probs = parse_probs * tokens_to_keep * (1 - I) parse_preds = np.argmax(parse_probs, axis=1) tokens = np.arange(1, length) roots = np.where(parse_preds[tokens] == 0)[0] + 1 # ensure at least one root if len(roots) < 1: # The current 
root probabilities root_probs = parse_probs[tokens, 0] # The current head probabilities old_head_probs = parse_probs[tokens, parse_preds[tokens]] # Get new potential root probabilities new_root_probs = root_probs / old_head_probs # Select the most probable root new_root = tokens[np.argmax(new_root_probs)] # Make the change parse_preds[new_root] = 0 # ensure at most one root elif len(roots) > 1: # The probabilities of the current heads root_probs = parse_probs[roots, 0] # Set the probability of depending on the root zero parse_probs[roots, 0] = 0 # Get new potential heads and their probabilities new_heads = np.argmax(parse_probs[roots][:, tokens], axis=1) + 1 new_head_probs = parse_probs[roots, new_heads] / root_probs # Select the most probable root new_root = roots[np.argmin(new_head_probs)] # Make the change parse_preds[roots] = new_heads parse_preds[new_root] = 0 # remove cycles tarjan = Tarjan(parse_preds, tokens) for SCC in tarjan.SCCs: if len(SCC) > 1: dependents = set() to_visit = set(SCC) while len(to_visit) > 0: node = to_visit.pop() if not node in dependents: dependents.add(node) to_visit.update(tarjan.edges[node]) # The indices of the nodes that participate in the cycle cycle = np.array(list(SCC)) # The probabilities of the current heads old_heads = parse_preds[cycle] old_head_probs = parse_probs[cycle, old_heads] # Set the probability of depending on a non-head to zero non_heads = np.array(list(dependents)) parse_probs[np.repeat(cycle, len(non_heads)), np.repeat([non_heads], len(cycle), axis=0).flatten()] = 0 # Get new potential heads and their probabilities new_heads = np.argmax(parse_probs[cycle][:, tokens], axis=1) + 1 new_head_probs = parse_probs[cycle, new_heads] / old_head_probs # Select the most probable change change = np.argmax(new_head_probs) changed_cycle = cycle[change] old_head = old_heads[change] new_head = new_heads[change] # Make the change parse_preds[changed_cycle] = new_head tarjan.edges[new_head].add(changed_cycle) 
def rel_argmax(rel_probs, length, root, ensure_tree=True):
    """Fix the relation prediction by heuristic rules

    The plain prediction is the arg-max relation of every row. With ``ensure_tree`` the rows
    ``1 .. length - 1`` are post-processed so that exactly one of them carries the ``root`` relation.
    ``rel_probs`` itself is never modified.

    Args:
      rel_probs(NDArray): seq_len x rel_size
      length: number of leading rows to consider; row 0 is skipped
      root: index of the root relation in the second dimension of ``rel_probs``
      ensure_tree: whether to enforce a single root relation (Default value = True)

    Returns:
      NDArray: the predicted relation index for every row of ``rel_probs``

    """
    rel_preds = np.argmax(rel_probs, axis=1)
    # nothing to repair when the constraint is off or there is no token besides row 0
    if not ensure_tree or length <= 1:
        return rel_preds

    tokens = np.arange(1, length)
    roots = np.where(rel_preds[tokens] == root)[0] + 1
    if len(roots) < 1:
        # no root predicted: promote the token with the highest root probability
        rel_preds[1 + np.argmax(rel_probs[tokens, root])] = root
    elif len(roots) > 1:
        root_probs = rel_probs[roots, root]
        # integer-array indexing returns a copy, so zeroing the root column here
        # does not leak into the caller's array
        candidates = rel_probs[roots]
        candidates[:, root] = 0
        new_rel_preds = np.argmax(candidates, axis=1)
        new_rel_probs = candidates[np.arange(len(roots)), new_rel_preds] / root_probs
        # keep as root the token whose best alternative is worst relative to its root probability
        new_root = roots[np.argmin(new_rel_probs)]
        rel_preds[roots] = new_rel_preds
        rel_preds[new_root] = root
    return rel_preds
class Biaffine(nn.Module):
    r"""
    Biaffine layer for first-order scoring.

    The layer owns a weight tensor :math:`W` and scores a vector pair :math:`(x, y)` as
    :math:`x^T W y`, where a constant ``1`` may be appended to :math:`x` and/or :math:`y`
    to act as a bias term.

    References:
        - Timothy Dozat and Christopher D. Manning. 2017.
          `Deep Biaffine Attention for Neural Dependency Parsing`_.

    Args:
        n_in (int):
            The size of the input feature.
        n_out (int):
            The number of output channels. Default: 1.
        bias_x (bool):
            If ``True``, adds a bias term for tensor :math:`x`. Default: ``True``.
        bias_y (bool):
            If ``True``, adds a bias term for tensor :math:`y`. Default: ``True``.

    .. _Deep Biaffine Attention for Neural Dependency Parsing:
        https://openreview.net/forum?id=Hk95PK9le
    """

    def __init__(self, n_in, n_out=1, bias_x=True, bias_y=True):
        super().__init__()

        self.n_in, self.n_out = n_in, n_out
        self.bias_x, self.bias_y = bias_x, bias_y
        # each enabled bias widens the corresponding side of the weight by one
        rows = n_in + bias_x
        cols = n_in + bias_y
        self.weight = nn.Parameter(torch.Tensor(n_out, rows, cols))

        self.reset_parameters()

    def __repr__(self):
        fields = [f"n_in={self.n_in}", f"n_out={self.n_out}"]
        if self.bias_x:
            fields.append(f"bias_x={self.bias_x}")
        if self.bias_y:
            fields.append(f"bias_y={self.bias_y}")
        return f"{self.__class__.__name__}({', '.join(fields)})"

    def reset_parameters(self):
        # the weight starts out as all zeros
        nn.init.zeros_(self.weight)

    def forward(self, x, y):
        r"""
        Args:
            x (torch.Tensor): ``[batch_size, seq_len, n_in]``.
            y (torch.Tensor): ``[batch_size, seq_len, n_in]``.

        Returns:
            ~torch.Tensor:
                A scoring tensor of shape ``[batch_size, n_out, seq_len, seq_len]``.
                If ``n_out=1``, the dimension for ``n_out`` will be squeezed automatically.
        """
        if self.bias_x:
            ones = torch.ones_like(x[..., :1])
            x = torch.cat((x, ones), -1)
        if self.bias_y:
            ones = torch.ones_like(y[..., :1])
            y = torch.cat((y, ones), -1)
        # [batch_size, n_out, seq_len, seq_len]
        scores = torch.einsum('bxi,oij,byj->boxy', x, self.weight, y)
        # squeeze(1) only drops dim 1 when n_out == 1
        return scores.squeeze(1)
config.n_mlp_rel, config.mlp_dropout, config.n_rels_2nd) def forward(self, x, mask): return tuple(zip(self.biaffine_decoder(x, mask), self.biaffine_decoder_2nd(x, mask))) class BiaffineJointDecoder(BiaffineDecoder): def __init__(self, hidden_size, config) -> None: super().__init__(hidden_size, config.n_mlp_arc, config.n_mlp_rel, config.mlp_dropout, config.n_rels) # the Biaffine layers for secondary dep self.arc_attn_2nd = Biaffine(n_in=config.n_mlp_arc, bias_x=True, bias_y=False) self.rel_attn_2nd = Biaffine(n_in=config.n_mlp_rel, n_out=config.n_rels, bias_x=True, bias_y=True) def forward(self, x, mask=None, **kwargs: Any): arc_d, arc_h, rel_d, rel_h = self.apply_mlps(x) s_arc, s_rel = self.decode(arc_d, arc_h, rel_d, rel_h, mask, self.arc_attn, self.rel_attn) s_arc_2nd, s_rel_2nd = self.decode(arc_d, arc_h, rel_d, rel_h, mask, self.arc_attn_2nd, self.rel_attn_2nd) return (s_arc, s_arc_2nd), (s_rel, s_rel_2nd) class BiaffineSecondaryModel(torch.nn.Module): def __init__(self, config, pretrained_embed: torch.Tensor = None, transformer: PreTrainedModel = None, transformer_tokenizer: PreTrainedTokenizer = None): super().__init__() self.encoder = EncoderWithContextualLayer(config, pretrained_embed, transformer, transformer_tokenizer) self.decoder = BiaffineJointDecoder(self.encoder.hidden_size, config) if config.joint \ else BiaffineSeparateDecoder(self.encoder.hidden_size, config) def forward(self, words=None, feats=None, input_ids=None, token_span=None, mask=None, lens=None, **kwargs): x, mask = self.encoder(words, feats, input_ids, token_span, mask, lens) return self.decoder(x, mask) class BiaffineSecondaryParser(BiaffineDependencyParser): def __init__(self) -> None: super().__init__() self.model: BiaffineSecondaryModel = None def build_dataset(self, data, bos_transform=None): transform = TransformList(functools.partial(append_bos, pos_key='UPOS'), functools.partial(unpack_deps_to_head_deprel, pad_rel=self.config.pad_rel, arc_key='arc_2nd', rel_key='rel_2nd')) if 
self.config.joint: transform.append(merge_head_deprel_with_2nd) if bos_transform: transform.append(bos_transform) return super().build_dataset(data, transform) def build_criterion(self, **kwargs): # noinspection PyCallByClass return super().build_criterion(**kwargs), (BiaffineSemanticDependencyParser.build_criterion(self, **kwargs)) def fit(self, trn_data, dev_data, save_dir, feat=None, n_embed=100, pretrained_embed=None, transformer=None, average_subwords=False, word_dropout: float = 0.2, transformer_hidden_dropout=None, layer_dropout=0, scalar_mix: int = None, embed_dropout=.33, n_lstm_hidden=400, n_lstm_layers=3, hidden_dropout=.33, n_mlp_arc=500, n_mlp_rel=100, mlp_dropout=.33, lr=2e-3, transformer_lr=5e-5, mu=.9, nu=.9, epsilon=1e-12, clip=5.0, decay=.75, decay_steps=5000, patience=100, batch_size=None, sampler_builder=None, lowercase=False, epochs=50000, tree=False, punct=False, min_freq=2, apply_constraint=True, joint=False, no_cycle=False, root=None, logger=None, verbose=True, unk=UNK, pad_rel=None, max_sequence_length=512, devices: Union[float, int, List[int]] = None, transform=None, **kwargs): return super().fit(**merge_locals_kwargs(locals(), kwargs)) def build_vocabs(self, dataset, logger=None, transformer=None): self.vocabs['rel_2nd'] = rel_2nd = Vocab(pad_token=self.config.pad_rel, unk_token=self.config.pad_rel) if self.config.joint: self.vocabs['rel'] = rel_2nd super().build_vocabs(dataset, logger, transformer) self.config.n_rels_2nd = len(rel_2nd) def create_model(self, pretrained_embed, transformer): return BiaffineSecondaryModel(self.config, pretrained_embed, transformer, self.transformer_tokenizer) def compute_loss(self, arc_scores, rel_scores, arcs, rels, mask, criterion, batch=None): arc_scores_1st, arc_scores_2nd, rel_scores_1st, rel_scores_2nd = self.unpack_scores(arc_scores, rel_scores) loss_1st = super().compute_loss(arc_scores_1st, rel_scores_1st, arcs, rels, mask, criterion[0], batch) mask = self.compute_mask(arc_scores_2nd, batch, mask) 
# noinspection PyCallByClass loss_2st = BiaffineSemanticDependencyParser.compute_loss(self, arc_scores_2nd, rel_scores_2nd, batch['arc_2nd'], batch['rel_2nd_id'], mask, criterion[1], batch) return loss_1st + loss_2st @staticmethod def compute_mask(arc_scores_2nd, batch, mask_1st): mask = batch.get('mask_2nd', None) if mask is None: batch['mask_2nd'] = mask = BiaffineSemanticDependencyParser.convert_to_3d_mask(arc_scores_2nd, mask_1st) return mask def unpack_scores(self, arc_scores, rel_scores): arc_scores_1st, arc_scores_2nd = arc_scores rel_scores_1st, rel_scores_2nd = rel_scores return arc_scores_1st, arc_scores_2nd, rel_scores_1st, rel_scores_2nd def get_pad_dict(self): d = super(BiaffineSecondaryParser, self).get_pad_dict() d.update({'arc_2nd': False}) return d def decode(self, arc_scores, rel_scores, mask, batch=None, predicting=None): output_1st, output_2nd = batch.get('outputs', (None, None)) if output_1st is None: arc_scores_1st, arc_scores_2nd, rel_scores_1st, rel_scores_2nd = self.unpack_scores(arc_scores, rel_scores) output_1st = super().decode(arc_scores_1st, rel_scores_1st, mask) mask = self.compute_mask(arc_scores_2nd, batch, mask) # noinspection PyCallByClass output_2nd = BiaffineSemanticDependencyParser.decode(self, arc_scores_2nd, rel_scores_2nd, mask, batch) if self.config.get('no_cycle'): assert predicting, 'No cycle constraint for evaluation is not implemented yet. If you are ' \ 'interested, welcome to submit a pull request.' 
def collect_outputs_extend(self, predictions: list, arc_preds, rel_preds, lens, mask):
    """Append every item of ``rel_preds[1]`` to ``predictions``.

    Only the second element of ``rel_preds`` is consumed; ``arc_preds``,
    ``lens`` and ``mask`` are accepted to keep the signature shared with the
    parent class and are not read.
    """
    second_order = rel_preds[1]
    predictions.extend(second_order)
rel_vocab[hrs[0][1]] deps = [(h, rel_vocab[r]) for h, r in hrs[1:]] sent.append(CoNLLUWord(idx + 1, token, upos=pos, head=head, deprel=deprel, deps=deps)) outputs.append(sent) ================================================ FILE: hanlp/components/parsers/biaffine/biaffine_dep.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2020-05-08 20:51 import os from collections import Counter from typing import Union, Any, List from hanlp.layers.transformers.pt_imports import PreTrainedTokenizer, AutoModel_, AutoTokenizer_ import torch from hanlp.utils.torch_util import lengths_to_mask from torch import nn from torch.optim import Adam from torch.optim.lr_scheduler import ExponentialLR from torch.utils.data import DataLoader from hanlp_common.constant import ROOT, UNK, IDX from hanlp.common.dataset import PadSequenceDataLoader from hanlp.common.structure import History from hanlp.common.torch_component import TorchComponent from hanlp.common.transform import LowerCase, FieldLength, PunctuationMask from hanlp.common.vocab import Vocab from hanlp.components.parsers.alg import decode_dep from hanlp.components.parsers.biaffine.biaffine_model import BiaffineDependencyModel from hanlp_common.conll import CoNLLWord, CoNLLSentence from hanlp.datasets.parsing.loaders.conll_dataset import CoNLLParsingDataset, append_bos from hanlp.layers.embeddings.util import index_word2vec_with_vocab from hanlp.layers.transformers.utils import build_optimizer_scheduler_with_transformer from hanlp.metrics.parsing.attachmentscore import AttachmentScore from hanlp.transform.transformer_tokenizer import TransformerSequenceTokenizer from hanlp.utils.time_util import CountdownTimer from hanlp_common.util import isdebugging, merge_locals_kwargs, merge_dict, reorder class BiaffineDependencyParser(TorchComponent): def __init__(self) -> None: """Biaffine dependency parsing (:cite:`dozat:17a`). 
def input_is_flat(self, data, use_pos=None):
    """Tell whether ``data`` is a single sentence rather than a list of sentences.

    Args:
        data: Non-empty input; only its first element is inspected.
        use_pos: Whether tokens come as ``(token, pos)`` pairs. When ``None``
            it defaults to whether a ``CPOS`` vocabulary exists.

    Returns:
        ``True`` if the first element is itself a token: a ``str`` without
        part-of-speech tags, or a list/tuple whose first item is a ``str``
        with them.
    """
    if use_pos is None:
        use_pos = 'CPOS' in self.vocabs
    first = data[0]
    if not use_pos:
        return isinstance(first, str)
    return isinstance(first, (list, tuple)) and isinstance(first[0], str)
def predictions_to_human(self, predictions, outputs, data, use_pos, conll=True):
    """Convert predicted heads and relation ids into readable sentences.

    One sentence is appended to ``outputs`` for every ``(data, prediction)``
    pair; pairing stops at the shorter of the two inputs.

    Args:
        predictions: Iterable of ``(arcs, rels)`` pairs, one per sentence.
        outputs: List that receives the converted sentences (mutated in place).
        data: The input sentences, aligned with ``predictions``.
        use_pos: Whether each cell of a sentence is a ``(token, pos)`` pair.
        conll: If truthy, emit ``CoNLLSentence`` objects made of ``CoNLLWord``;
            otherwise emit plain lists of ``(head, relation)`` tuples.
    """
    if not conll:
        # Plain mode ignores the tokens but still pairs with ``data`` so the
        # number of emitted sentences is bounded by both inputs.
        for _, (heads, rel_ids) in zip(data, predictions):
            outputs.append([(head, self.vocabs['rel'][rel_id]) for head, rel_id in zip(heads, rel_ids)])
        return
    for cells, (heads, rel_ids) in zip(data, predictions):
        sentence = CoNLLSentence()
        # Word ids are 1-based.
        for position, (cell, head, rel_id) in enumerate(zip(cells, heads, rel_ids), start=1):
            if use_pos:
                token, pos = cell
            else:
                token, pos = cell, None
            sentence.append(CoNLLWord(position, token, cpos=pos, head=head, deprel=self.vocabs['rel'][rel_id]))
        outputs.append(sentence)
min_freq=2, logger=None, verbose=True, unk=UNK, max_sequence_length=512, batch_size=None, sampler_builder=None, gradient_accumulation=1, devices: Union[float, int, List[int]] = None, transform=None, secondary_encoder=None, **kwargs): return super().fit(**merge_locals_kwargs(locals(), kwargs)) def execute_training_loop(self, trn, dev, devices, epochs, logger, patience, save_dir, optimizer, gradient_accumulation, **kwargs): optimizer, scheduler, transformer_optimizer, transformer_scheduler = optimizer criterion = self.build_criterion() best_e, best_metric = 0, self.build_metric() timer = CountdownTimer(epochs) history = History() ratio_width = len(f'{len(trn) // gradient_accumulation}/{len(trn) // gradient_accumulation}') for epoch in range(1, epochs + 1): # train one epoch and update the parameters logger.info(f"[yellow]Epoch {epoch} / {epochs}:[/yellow]") self.fit_dataloader(trn, optimizer, scheduler, criterion, epoch, logger, history, transformer_optimizer, transformer_scheduler, gradient_accumulation=gradient_accumulation) loss, dev_metric = self.evaluate_dataloader(dev, criterion, ratio_width=ratio_width, logger=logger) timer.update() # logger.info(f"{'Dev' + ' ' * ratio_width} loss: {loss:.4f} {dev_metric}") # save the model if it is the best so far report = f"{timer.elapsed_human} / {timer.total_time_human} ETA: {timer.eta_human}" if dev_metric > best_metric: best_e, best_metric = epoch, dev_metric self.save_weights(save_dir) report += ' ([red]saved[/red])' else: if patience != epochs: report += f' ({epoch - best_e}/{patience})' else: report += f' ({epoch - best_e})' logger.info(report) if patience is not None and epoch - best_e >= patience: logger.info(f'LAS has stopped improving for {patience} epochs, early stop.') break timer.stop() if not best_e: self.save_weights(save_dir) elif best_e != epoch: self.load_weights(save_dir) logger.info(f"Max score of dev is {best_metric.score:.2%} at epoch {best_e}") logger.info(f"Average time of each epoch is 
def build_optimizer(self, epochs, trn, gradient_accumulation, **kwargs):
    """Build the optimizers and learning-rate schedulers for training.

    Three configurations are handled:

    * No transformer, or a transformer with a falsy ``transformer_lr`` (which
      is frozen via ``requires_grad_(False)``): a single Adam optimizer with an
      exponential decay schedule.
    * Transformer trained with ``separate_optimizer``: Adam plus exponential
      decay for the non-transformer parameters, and a dedicated
      optimizer/scheduler pair for the transformer.
    * Transformer trained jointly: one optimizer/scheduler pair built by
      ``build_optimizer_scheduler_with_transformer`` for the whole model.

    Args:
        epochs: Number of training epochs, used to derive the total step count.
        trn: Training dataloader; only ``len(trn)`` is used.
        gradient_accumulation: Number of batches accumulated per optimizer step.
        **kwargs: Ignored.

    Returns:
        ``(optimizer, scheduler, transformer_optimizer, transformer_scheduler)``.
        The last two are ``None`` unless the transformer has its own optimizer.
        ``scheduler`` is never ``None``.
    """
    config = self.config
    model = self.model
    if isinstance(model, nn.DataParallel):
        model = model.module
    scheduler = None
    transformer_optimizer, transformer_scheduler = None, None
    if config.transformer:
        transformer = model.encoder.transformer
        # Keep the model's own parameter order instead of going through a set
        # difference, whose iteration order is not reproducible between runs.
        transformer_param_ids = {id(p) for p in transformer.parameters()}
        optimizer = Adam([p for p in model.parameters() if id(p) not in transformer_param_ids],
                         config.lr,
                         (config.mu, config.nu),
                         config.epsilon)
        if config.transformer_lr:
            num_training_steps = len(trn) * epochs // gradient_accumulation
            if config.separate_optimizer:
                transformer_optimizer, transformer_scheduler = \
                    build_optimizer_scheduler_with_transformer(transformer,
                                                               transformer,
                                                               config.transformer_lr,
                                                               config.transformer_lr,
                                                               num_training_steps,
                                                               config.warmup_steps,
                                                               config.weight_decay,
                                                               adam_epsilon=1e-8)
            else:
                optimizer, scheduler = build_optimizer_scheduler_with_transformer(model,
                                                                                  transformer,
                                                                                  config.lr,
                                                                                  config.transformer_lr,
                                                                                  num_training_steps,
                                                                                  config.warmup_steps,
                                                                                  config.weight_decay,
                                                                                  adam_epsilon=1e-8)
        else:
            # No transformer learning rate: keep the transformer frozen.
            transformer.requires_grad_(False)
    else:
        optimizer = Adam(model.parameters(),
                         config.lr,
                         (config.mu, config.nu),
                         config.epsilon)
    if scheduler is None:
        # Every path except joint transformer training uses exponential decay.
        # Previously this was only created when ``separate_optimizer`` was
        # truthy, which left ``scheduler`` unbound for e.g. the default
        # non-transformer setup.
        scheduler = ExponentialLR(optimizer, config.decay ** (1 / config.decay_steps))
    return optimizer, scheduler, transformer_optimizer, transformer_scheduler
def cache_dataset(self, dataset, timer, training=False, logger=None):
    """Walk through ``dataset`` once, reporting progress on ``timer`` per sample.

    The samples themselves are discarded here.
    NOTE(review): presumably iterating is what makes the dataset preprocess and
    cache each sample (as the progress message suggests) -- confirm against the
    dataset implementation.

    Args:
        dataset: Iterable of samples.
        timer: Object whose ``log(message)`` is called once per sample.
        training: Unused in this implementation.
        logger: Unused in this implementation.
    """
    progress_message = 'Preprocessing and caching samples [blink][yellow]...[/yellow][/blink]'
    for _ in dataset:
        timer.log(progress_message)
def build_model(self, training=True, **kwargs) -> torch.nn.Module:
    """Build the embeddings and wrap them into the parser model.

    Args:
        training: Forwarded to ``build_embeddings``.
        **kwargs: Ignored.

    Returns:
        Whatever ``create_model`` builds from the pretrained embedding and the
        transformer.
    """
    embeddings, encoder = self.build_embeddings(training=training)
    if embeddings is not None:
        # The width of the pretrained table overrides the configured embedding size.
        self.config.n_embed = embeddings.size(-1)
    return self.create_model(embeddings, encoder)
def build_embeddings(self, training=True): pretrained_embed = None if self.config.get('pretrained_embed', None): pretrained_embed = index_word2vec_with_vocab(self.config.pretrained_embed, self.vocabs['token'], init='zeros', normalize=True) transformer = self.config.transformer if transformer: transformer = AutoModel_.from_pretrained(transformer, training=training) return pretrained_embed, transformer # noinspection PyMethodOverriding def fit_dataloader(self, trn, optimizer, scheduler, criterion, epoch, logger, history: History, transformer_optimizer=None, transformer_scheduler=None, gradient_accumulation=1, **kwargs): self.model.train() timer = CountdownTimer(history.num_training_steps(len(trn), gradient_accumulation)) metric = self.build_metric(training=True) total_loss = 0 for idx, batch in enumerate(trn): arc_scores, rel_scores, mask, puncts = self.feed_batch(batch) arcs, rels = batch['arc'], batch['rel_id'] loss = self.compute_loss(arc_scores, rel_scores, arcs, rels, mask, criterion, batch) if gradient_accumulation > 1: loss /= gradient_accumulation loss.backward() total_loss += loss.item() arc_preds, rel_preds = self.decode(arc_scores, rel_scores, mask, batch) self.update_metric(arc_preds, rel_preds, arcs, rels, mask, puncts, metric, batch) if history.step(gradient_accumulation): self._step(optimizer, scheduler, transformer_optimizer, transformer_scheduler) report = self._report(total_loss / (timer.current + 1), metric) timer.log(report, ratio_percentage=False, logger=logger) del loss def _step(self, optimizer, scheduler, transformer_optimizer, transformer_scheduler): if self.config.get('grad_norm', None): nn.utils.clip_grad_norm_(self.model.parameters(), self.config.grad_norm) optimizer.step() optimizer.zero_grad() scheduler.step() if self.config.transformer and self.config.transformer_lr and transformer_optimizer: transformer_optimizer.step() transformer_optimizer.zero_grad() transformer_scheduler.step() def feed_batch(self, batch): words, feats, lens, 
def compute_loss(self, arc_scores, rel_scores, arcs, rels, mask, criterion, batch=None):
    """Sum the arc loss and the relation loss over the positions selected by ``mask``.

    Relation scores are scored against the gold head of each token: after
    masking, the row belonging to the gold head is picked for every token.

    Args:
        arc_scores: Arc scores, indexed by ``mask`` along the leading dimensions.
        rel_scores: Relation scores, indexed by ``mask`` along the leading dimensions.
        arcs: Gold head indices.
        rels: Gold relation ids.
        mask: Boolean mask selecting the tokens that contribute to the loss.
        criterion: Callable ``criterion(scores, targets)`` returning a loss.
        batch: Unused here.

    Returns:
        ``criterion(arc) + criterion(rel)``.
    """
    gold_heads = arcs[mask]
    gold_rels = rels[mask]
    head_scores = arc_scores[mask]
    # Keep only the relation scores that correspond to each token's gold head.
    token_index = torch.arange(len(gold_heads))
    label_scores = rel_scores[mask][token_index, gold_heads]
    arc_loss = criterion(head_scores, gold_heads)
    rel_loss = criterion(label_scores, gold_rels)
    return arc_loss + rel_loss
def update_metric(self, arc_preds, rel_preds, arcs, rels, mask, puncts, metric, batch=None):
    """Feed one batch of predictions and gold annotations into ``metric``.

    Unless ``config.punct`` is truthy, positions excluded by ``puncts`` are
    removed from the evaluation mask first.

    Args:
        arc_preds: Predicted heads.
        rel_preds: Predicted relation ids.
        arcs: Gold heads.
        rels: Gold relation ids.
        mask: Boolean mask of the positions to evaluate. It is not modified.
        puncts: Boolean mask combined with ``mask`` via ``&`` when punctuation
            is ignored.
        metric: Callable ``metric(arc_preds, rel_preds, arcs, rels, mask)``.
        batch: Unused here.
    """
    # ignore all punctuation if not specified
    if not self.config.punct:
        # Build a new tensor rather than ``mask &= puncts``: callers may hand in
        # a mask they keep around (e.g. one cached in the batch), and an
        # in-place AND would silently corrupt it.
        mask = mask & puncts
    metric(arc_preds, rel_preds, arcs, rels, mask)
torch.nn.utils.rnn import (pack_padded_sequence, pad_packed_sequence, pad_sequence) from hanlp.components.parsers.biaffine.biaffine import Biaffine from hanlp.components.parsers.biaffine.mlp import MLP from hanlp.components.parsers.biaffine.variationalbilstm import VariationalLSTM from hanlp.layers.dropout import IndependentDropout, SharedDropout, WordDropout from hanlp.layers.transformers.encoder import TransformerEncoder from hanlp.layers.transformers.pt_imports import PreTrainedModel, PreTrainedTokenizer from hanlp.layers.transformers.utils import transformer_encode class EncoderWithContextualLayer(nn.Module): def __init__(self, config, pretrained_embed: torch.Tensor = None, transformer: PreTrainedModel = None, transformer_tokenizer: PreTrainedTokenizer = None, ): super(EncoderWithContextualLayer, self).__init__() self.secondary_encoder = config.get('secondary_encoder', None) self.config = config if not transformer: self.pad_index = config.pad_index self.unk_index = config.unk_index if config.word_dropout: oov = self.unk_index excludes = [self.pad_index] self.word_dropout = WordDropout(p=config.word_dropout, oov_token=oov, exclude_tokens=excludes) else: self.word_dropout = None if transformer: input_size = 0 if self.config.transformer_lr: hidden_size = transformer.config.hidden_size else: input_size = transformer.config.hidden_size hidden_size = config.n_lstm_hidden * 2 if config.feat == 'pos': self.feat_embed = nn.Embedding(num_embeddings=config.n_feats, embedding_dim=config.n_embed) self.embed_dropout = IndependentDropout(p=config.embed_dropout) if self.config.transformer_lr: hidden_size += config.n_embed else: input_size += config.n_embed if not self.config.transformer_lr: self.lstm = VariationalLSTM(input_size=input_size, hidden_size=config.n_lstm_hidden, num_layers=config.n_lstm_layers, dropout=config.hidden_dropout, bidirectional=True) else: # the embedding layer input_size = config.n_embed self.word_embed = nn.Embedding(num_embeddings=config.n_words, 
embedding_dim=config.n_embed) if pretrained_embed is not None: if not isinstance(pretrained_embed, torch.Tensor): pretrained_embed = torch.Tensor(pretrained_embed) self.pretrained = nn.Embedding.from_pretrained(pretrained_embed) nn.init.zeros_(self.word_embed.weight) if config.feat == 'pos': self.feat_embed = nn.Embedding(num_embeddings=config.n_feats, embedding_dim=config.n_embed) self.embed_dropout = IndependentDropout(p=config.embed_dropout) input_size += config.n_embed # the word-lstm layer hidden_size = config.n_lstm_hidden * 2 self.lstm = VariationalLSTM(input_size=input_size, hidden_size=config.n_lstm_hidden, num_layers=config.n_lstm_layers, dropout=config.hidden_dropout, bidirectional=True) self.hidden_size = hidden_size self.hidden_dropout = SharedDropout(p=config.hidden_dropout) if transformer: transformer = TransformerEncoder(transformer, transformer_tokenizer, config.average_subwords, word_dropout=config.word_dropout, max_sequence_length=config.max_sequence_length) self.transformer = transformer def forward(self, words, feats, input_ids, token_span, mask, lens): if mask is None: # get the mask and lengths of given batch mask = words.ne(self.pad_index) if lens is None: lens = mask.sum(dim=1) batch_size, seq_len = mask.shape if self.config.transformer: # trans_embed = self.run_transformer(input_ids, token_span=token_span) trans_embed = self.transformer.forward(input_ids, token_span=token_span) if hasattr(self, 'feat_embed'): feat_embed = self.feat_embed(feats) trans_embed, feat_embed = self.embed_dropout(trans_embed, feat_embed) embed = torch.cat((trans_embed, feat_embed), dim=-1) else: embed = trans_embed if hasattr(self, 'lstm'): x = self.run_rnn(embed, lens, seq_len) else: x = embed if self.secondary_encoder: x = self.secondary_encoder(x, mask) x = self.hidden_dropout(x) else: if self.word_dropout: words = self.word_dropout(words) # set the indices larger than num_embeddings to unk_index ext_mask = words.ge(self.word_embed.num_embeddings) ext_words = 
def run_rnn(self, embed, lens, seq_len):
    """Run ``self.lstm`` over a padded batch and return a padded batch again.

    The batch is packed (batch-first, without requiring length-sorted input),
    passed through the LSTM, and unpacked to exactly ``seq_len`` steps.

    Args:
        embed: Padded, batch-first input embeddings.
        lens: Length of each sequence in the batch.
        seq_len: Number of time steps the output is padded to.

    Returns:
        The batch-first LSTM output padded to ``seq_len``.
    """
    packed = pack_padded_sequence(embed, lens, batch_first=True, enforce_sorted=False)
    encoded, _ = self.lstm(packed)
    padded, _ = pad_packed_sequence(encoded, batch_first=True, total_length=seq_len)
    return padded
@staticmethod
def decode(arc_d, arc_h, rel_d, rel_h, mask, arc_attn, rel_attn):
    """Score arcs and relations with the given biaffine attention modules.

    Args:
        arc_d: Dependent representations for arc scoring.
        arc_h: Head representations for arc scoring.
        rel_d: Dependent representations for relation scoring.
        rel_h: Head representations for relation scoring.
        mask: Optional boolean mask of valid positions; ``None`` disables masking.
        arc_attn: Callable producing the arc scores from ``(arc_d, arc_h)``.
        rel_attn: Callable producing the relation scores from ``(rel_d, rel_h)``.

    Returns:
        ``(s_arc, s_rel)``: arc scores of shape ``[batch_size, seq_len, seq_len]``
        and relation scores of shape ``[batch_size, seq_len, seq_len, n_rels]``.
    """
    # [batch_size, seq_len, seq_len]
    arc_scores = arc_attn(arc_d, arc_h)
    # [batch_size, seq_len, seq_len, n_rels] once the label axis is moved last
    rel_scores = rel_attn(rel_d, rel_h).permute(0, 2, 3, 1)
    if mask is None:
        return arc_scores, rel_scores
    # Heads that lie beyond the sentence length can never be selected.
    invalid_heads = ~mask.unsqueeze(1)
    arc_scores.masked_fill_(invalid_heads, float('-inf'))
    return arc_scores, rel_scores
@staticmethod
def convert_to_3d_mask(arc_scores, mask):
    """Expand a per-token mask into a pairwise mask shaped like ``arc_scores``.

    Each token's mask value is first broadcast along a new last axis. For every
    column except the first, an entry stays ``True`` only if its transposed
    counterpart is ``True`` as well. The first column is kept as broadcast
    because it predicts root.

    Args:
        arc_scores: Tensor whose shape the resulting mask takes.
        mask: Boolean mask with one entry per token; it is not modified.

    Returns:
        A new boolean tensor with the shape of ``arc_scores``.
    """
    # 3d masks; cloned so the expanded view can be written to
    pairwise = mask.unsqueeze(-1).expand_as(arc_scores).clone()
    # Keep the 1st column because it predicts root
    both_valid = pairwise[:, :, 1:] & pairwise.transpose(1, 2)[:, :, 1:]
    pairwise[:, :, 1:] = both_valid
    return pairwise
def cache_dataset(self, dataset, timer, training=False, logger=None):
    """Iterate over the dataset once and, when training, derive decoding constraints.

    Without ``config.apply_constraint`` this defers to the parent
    implementation. Otherwise, while walking through the training set, it
    counts how many tokens per sentence attach to the root (column 0 of
    ``arc``), checks whether any ``DEPS`` entry equals ``'_'``, and tallies the
    relation labels used on root arcs. Afterwards it fills in
    ``config.single_root`` and ``config.no_zero_head`` where they are still
    ``None``, and stores the id of the most frequent root relation in
    ``config.root_rel_id``.

    Args:
        dataset: Iterable of samples providing ``arc``, ``rel`` and ``DEPS``.
        timer: Object whose ``log(message)`` is called once per sample.
        training: Whether ``dataset`` is the training set; statistics are only
            collected and written to the config when it is.
        logger: Optional logger that receives a summary of the statistics.
    """
    if not self.config.apply_constraint:
        return super(BiaffineSemanticDependencyParser, self).cache_dataset(dataset, timer, training, logger=logger)
    num_roots = Counter()
    no_zero_head = True
    root_rels = Counter()
    for each in dataset:
        if training:
            arcs = each['arc']
            num_roots[sum(heads[0] for heads in arcs)] += 1
            no_zero_head &= all(x != '_' for x in each['DEPS'])
            for i, heads in enumerate(arcs):
                if heads[0]:
                    # Token i is attached to root: remember the label of that arc.
                    root_rels[each['rel'][i][0]] += 1
        timer.log('Preprocessing and caching samples [blink][yellow]...[/yellow][/blink]')
    if training:
        if self.config.single_root is None:
            # True only if every sentence has exactly one token attached to root.
            self.config.single_root = len(num_roots) == 1 and num_roots.most_common()[0][0] == 1
        if self.config.no_zero_head is None:
            self.config.no_zero_head = no_zero_head
        root_rel = None
        if root_rels:
            root_rel = root_rels.most_common()[0][0]
            self.config.root_rel_id = self.vocabs['rel'].get_idx(root_rel)
        # else: no root arc was observed (e.g. an empty training set), so there
        # is no root relation to record; ``root_rel_id`` is left untouched
        # instead of failing with an IndexError.
        if logger:
            logger.info(f'Training set properties: [blue]single_root = {self.config.single_root}[/blue], '
                        f'[blue]no_zero_head = {no_zero_head}[/blue], '
                        f'[blue]root_rel = {root_rel}[/blue]')
arc_scores[:, :, 0] = -inf arc_scores[:, :, 0].scatter_(dim=-1, index=root_mask, value=inf) root_rel_id = self.config.root_rel_id rel_scores[:, :, 0, root_rel_id] = inf rel_scores[:, :, 1:, root_rel_id] = -inf arc_scores_T = arc_scores.transpose(-1, -2) arc = ((arc_scores > 0) & (arc_scores_T < arc_scores)) if self.config.get('no_zero_head', False): arc_scores_T[arc] = -inf # avoid cycle between a pair of nodes arc_scores_fix = arc_scores_T.argmax(dim=-2).unsqueeze_(-1).expand_as(arc_scores) arc.scatter_(dim=-1, index=arc_scores_fix, value=True) else: arc = arc_scores > 0 rel = rel_scores.argmax(dim=-1) return arc, rel def collect_outputs_extend(self, predictions, arc_preds, rel_preds, lens, mask): predictions.extend(zip(arc_preds.tolist(), rel_preds.tolist(), mask.tolist())) # all_arcs.extend(seq.tolist() for seq in arc_preds[mask].split([x * x for x in lens])) # all_rels.extend(seq.tolist() for seq in rel_preds[mask].split([x * x for x in lens])) def predictions_to_human(self, predictions, outputs, data, use_pos, conll=True): for d, (arcs, rels, masks) in zip(data, predictions): sent = CoNLLSentence() for idx, (cell, a, r) in enumerate(zip(d, arcs[1:], rels[1:])): if use_pos: token, pos = cell else: token, pos = cell, None heads = [i for i in range(len(d) + 1) if a[i]] deprels = [self.vocabs['rel'][r[i]] for i in range(len(d) + 1) if a[i]] sent.append( CoNLLUWord(idx + 1, token, upos=pos, head=None, deprel=None, deps=list(zip(heads, deprels)))) outputs.append(sent) def fit(self, trn_data, dev_data, save_dir, feat=None, n_embed=100, pretrained_embed=None, transformer=None, average_subwords=False, word_dropout: float = 0.2, transformer_hidden_dropout=None, layer_dropout=0, mix_embedding: int = None, embed_dropout=.33, n_lstm_hidden=400, n_lstm_layers=3, hidden_dropout=.33, n_mlp_arc=500, n_mlp_rel=100, mlp_dropout=.33, arc_dropout=None, rel_dropout=None, arc_loss_interpolation=0.4, lr=2e-3, transformer_lr=5e-5, mu=.9, nu=.9, epsilon=1e-12, clip=5.0, decay=.75, 
decay_steps=5000, weight_decay=0, warmup_steps=0.1, separate_optimizer=True, patience=100, batch_size=None, sampler_builder=None, lowercase=False, epochs=50000, apply_constraint=False, single_root=None, no_zero_head=None, punct=False, min_freq=2, logger=None, verbose=True, unk=UNK, pad_rel=None, max_sequence_length=512, gradient_accumulation=1, devices: Union[float, int, List[int]] = None, transform=None, **kwargs): return super().fit(**merge_locals_kwargs(locals(), kwargs)) ================================================ FILE: hanlp/components/parsers/biaffine/mlp.py ================================================ # MIT License # # Copyright (c) 2020 Yu Zhang # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
import torch.nn as nn

from hanlp.layers.dropout import SharedDropout


class MLP(nn.Module):
    r"""
    Applies a linear transformation together with a non-linear activation to the incoming tensor:
    :math:`y = \mathrm{Activation}(x A^T + b)`

    The activation is a LeakyReLU with negative slope 0.1; the linear weight is initialised orthogonally
    and the bias with zeros.

    Args:
        n_in (int):
            The size of each input feature.
        n_out (int):
            The size of each output feature.
        dropout (float):
            If non-zero, introduce a :class:`SharedDropout` layer on the output with this dropout ratio. Default: 0.
        activation (bool):
            Whether to use activations. Default: True.
    """

    def __init__(self, n_in, n_out, dropout=0, activation=True):
        super().__init__()

        self.n_in = n_in
        self.n_out = n_out
        self.linear = nn.Linear(n_in, n_out)
        self.activation = nn.LeakyReLU(negative_slope=0.1) if activation else nn.Identity()
        self.dropout = SharedDropout(p=dropout)

        self.reset_parameters()

    def __repr__(self):
        # Only the sizes and a non-zero dropout ratio are shown; the activation is not part of the repr.
        s = f"n_in={self.n_in}, n_out={self.n_out}"
        if self.dropout.p > 0:
            s += f", dropout={self.dropout.p}"

        return f"{self.__class__.__name__}({s})"

    def reset_parameters(self):
        """Initialise the linear weight orthogonally and the bias with zeros."""
        nn.init.orthogonal_(self.linear.weight)
        nn.init.zeros_(self.linear.bias)

    def forward(self, x):
        r"""
        Args:
            x (~torch.Tensor):
                The size of each input feature is `n_in`.

        Returns:
            A tensor with the size of each output feature `n_out`.
        """
        x = self.linear(x)
        x = self.activation(x)
        x = self.dropout(x)

        return x


================================================
FILE: hanlp/components/parsers/biaffine/structual_attention.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-06-26 10:40
from typing import Union, List

import torch
import torch.nn.functional as F
from hanlp.utils.torch_util import lengths_to_mask
from torch import nn

from hanlp.common.torch_component import TorchComponent
from hanlp.components.parsers.biaffine.biaffine_dep import BiaffineDependencyParser
from hanlp.components.parsers.biaffine.biaffine_model import BiaffineDecoder
from hanlp.layers.transformers.encoder import TransformerEncoder
from hanlp.layers.transformers.pt_imports import PreTrainedModel, PreTrainedTokenizer
from hanlp.metrics.accuracy import CategoricalAccuracy
from hanlp.transform.transformer_tokenizer import TransformerSequenceTokenizer
from hanlp_common.util import merge_locals_kwargs


class StructuralAttentionLayer(nn.Module):
    # Scores arcs and relations with a biaffine decoder, then re-weights the token representations with the
    # resulting (soft) arc/relation distribution.

    def __init__(self, hidden_size, n_mlp_arc, n_mlp_rel, mlp_dropout, n_rels, projeciton=None) -> None:
        # ``projeciton`` (sic) is the spelling callers see; when truthy the input is first projected to that
        # size and all later layers use the projected size.
        super().__init__()
        self.biaffine = BiaffineDecoder(hidden_size, n_mlp_arc, n_mlp_rel, mlp_dropout, n_rels)
        if projeciton:
            self.projection = nn.Linear(hidden_size, projeciton)
            hidden_size = projeciton
        else:
            self.projection = None
        self.head_WV = nn.Parameter(torch.randn(n_rels, hidden_size, hidden_size))
        self.dense = nn.Linear(hidden_size * n_rels, hidden_size)
        self.activation = nn.GELU()

    def forward(self, x, mask):
        """Return ``(s_arc, s_rel, x)``: the biaffine scores and the re-weighted representation."""
        s_arc, s_rel = self.biaffine(x, mask)
        p_arc = F.softmax(s_arc, dim=-1) * mask.unsqueeze(-1)
        p_rel = F.softmax(s_rel, -1)
        A = p_arc.unsqueeze(-1) * p_rel
        if self.projection:
            x = self.projection(x)
        # NOTE(review): ``j`` only occurs in ``A``, so ``A`` is summed over ``j`` and multiplied with the
        # token's own vector ``x[b, i]`` rather than with ``x[b, j]`` - confirm this is the intended attention.
        Ax = torch.einsum('bijk,bih->bihk', A, x)
        # NOTE(review): ``m`` does not occur in the output, so ``head_WV`` is summed over its last axis and
        # acts as a per (relation, feature) scale - confirm 'bihk,khm->bimk' was not intended.
        AxW = torch.einsum('bihk,khm->bihk', Ax, self.head_WV)
        AxW = AxW.flatten(2)
        x = self.dense(AxW)
        x = self.activation(x)
        return s_arc, s_rel, x


class StructuralAttentionModel(nn.Module):
    # Transformer encoder -> structural attention layer -> linear head over the tokenizer vocabulary.

    def __init__(self,
                 config,
                 transformer: PreTrainedModel = None,
                 transformer_tokenizer: PreTrainedTokenizer = None
                 ) -> None:
        super().__init__()
        # NOTE(review): ``config.scalar_mix`` is read here while the ``fit`` signature in this file exposes
        # ``mix_embedding`` - confirm the config actually carries ``scalar_mix``.
        self.encoder = TransformerEncoder(transformer,
                                          transformer_tokenizer,
                                          config.average_subwords,
                                          config.scalar_mix,
                                          None,  # No word_dropout since SA is predicting masked tokens
                                          config.transformer_hidden_dropout,
                                          config.layer_dropout,
                                          config.max_sequence_length)
        # ``transformer`` defaults to ``None`` in the signature but is dereferenced here, so it is required.
        hidden_size = transformer.config.hidden_size
        self.sa = StructuralAttentionLayer(hidden_size,
                                           config.n_mlp_arc,
                                           config.n_mlp_rel,
                                           config.mlp_dropout,
                                           config.n_rels,
                                           config.projection
                                           )
        if config.projection:
            hidden_size = config.projection
        self.mlm = nn.Linear(hidden_size, transformer_tokenizer.vocab_size)

    def forward(self,
                input_ids: torch.LongTensor,
                attention_mask=None,
                token_type_ids=None,
                token_span=None,
                mask=None,
                batch=None,
                **kwargs):
        """Return ``(s_arc, s_rel, x)`` where ``x`` are the vocabulary logits of the masked-token head.

        ``batch`` and ``kwargs`` are accepted but not used.
        """
        h = self.encoder(input_ids, attention_mask, token_type_ids, token_span)
        s_arc, s_rel, h = self.sa(h, mask)
        x = self.mlm(h)
        return s_arc, s_rel, x


class MaskedTokenGenerator(object):
    # Randomly replaces token ids by the tokenizer's mask token id.

    def __init__(self, transformer_tokenizer: PreTrainedTokenizer, mask_prob=0.15) -> None:
        super().__init__()
        self.mask_prob = mask_prob
        self.transformer_tokenizer = transformer_tokenizer
        self.oov = transformer_tokenizer.mask_token_id
        self.pad = transformer_tokenizer.pad_token_id
        self.cls = transformer_tokenizer.cls_token_id
        self.sep = transformer_tokenizer.sep_token_id
        # Ids that must never be replaced.
        self.excludes = [self.pad, self.cls, self.sep]

    def __call__(self, tokens: torch.LongTensor, prefix_mask: torch.LongTensor):
        # Returns ``(result, oov_mask)``: the ids with some positions replaced and the boolean mask of the
        # replaced positions.
        padding_mask = tokens.new_ones(tokens.size(), dtype=torch.bool)
        for pad in self.excludes:
            padding_mask &= (tokens != pad)
        padding_mask &= prefix_mask  # Only mask prefixes since the others won't be attended
        # Create a uniformly random mask selecting either the original words or OOV tokens
        dropout_mask = (tokens.new_empty(tokens.size(), dtype=torch.float).uniform_() < self.mask_prob)
        oov_mask = dropout_mask & padding_mask

        oov_fill = tokens.new_empty(tokens.size(),
dtype=torch.long).fill_(self.oov) result = torch.where(oov_mask, oov_fill, tokens) return result, oov_mask class StructuralAttentionParser(BiaffineDependencyParser): def __init__(self) -> None: super().__init__() self.model: StructuralAttentionModel = None self.mlm_generator: MaskedTokenGenerator = None def build_model(self, training=True, **kwargs) -> torch.nn.Module: transformer = TransformerEncoder.build_transformer(config=self.config, training=training) model = StructuralAttentionModel(self.config, transformer, self.transformer_tokenizer) return model def fit(self, trn_data, dev_data, save_dir, transformer=None, mask_prob=0.15, projection=None, average_subwords=False, transformer_hidden_dropout=None, layer_dropout=0, mix_embedding: int = None, embed_dropout=.33, n_mlp_arc=500, n_mlp_rel=100, mlp_dropout=.33, lr=2e-3, transformer_lr=5e-5, mu=.9, nu=.9, epsilon=1e-12, clip=5.0, decay=.75, decay_steps=5000, patience=100, sampler='kmeans', n_buckets=32, batch_max_tokens=5000, batch_size=None, epochs=50000, tree=False, punct=False, logger=None, verbose=True, max_sequence_length=512, devices: Union[float, int, List[int]] = None, transform=None, **kwargs): return TorchComponent.fit(self, **merge_locals_kwargs(locals(), kwargs)) def feed_batch(self, batch): if self.model.training: input_ids = batch['input_ids'] prefix_mask = batch['prefix_mask'] batch['gold_input_ids'] = input_ids batch['input_ids'], batch['input_ids_mask'] = self.mlm_generator(input_ids, prefix_mask) words, feats, lens, puncts = batch.get('token_id', None), batch.get('pos_id', None), batch['sent_length'], \ batch.get('punct_mask', None) mask = lengths_to_mask(lens) arc_scores, rel_scores, pred_input_ids = self.model(words=words, feats=feats, mask=mask, batch=batch, **batch) batch['pred_input_ids'] = pred_input_ids # ignore the first token of each sentence # RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation if self.model.training: mask = 
mask.clone() mask[:, 0] = 0 return arc_scores, rel_scores, mask, puncts def on_config_ready(self, **kwargs): super().on_config_ready(**kwargs) self.mlm_generator = MaskedTokenGenerator(self.transformer_tokenizer, self.config.mask_prob) def compute_loss(self, arc_scores, rel_scores, arcs, rels, mask, criterion, batch=None): parse_loss = BiaffineDependencyParser.compute_loss(self, arc_scores, rel_scores, arcs, rels, mask, criterion, batch) if self.model.training: gold_input_ids = batch['gold_input_ids'] pred_input_ids = batch['pred_input_ids'] input_ids_mask = batch['input_ids_mask'] token_span = batch['token_span'] gold_input_ids = batch['gold_input_ids'] = gold_input_ids.gather(1, token_span[:, :, 0]) input_ids_mask = batch['input_ids_mask'] = input_ids_mask.gather(1, token_span[:, :, 0]) mlm_loss = F.cross_entropy(pred_input_ids[input_ids_mask], gold_input_ids[input_ids_mask]) loss = parse_loss + mlm_loss return loss return parse_loss def build_tokenizer_transform(self): return TransformerSequenceTokenizer(self.transformer_tokenizer, 'token', '', ret_prefix_mask=True, ret_token_span=True, cls_is_bos=True, max_seq_length=self.config.get('max_sequence_length', 512), truncate_long_sequences=False) def build_metric(self, training=None, **kwargs): parse_metric = super().build_metric(**kwargs) if training: mlm_metric = CategoricalAccuracy() return parse_metric, mlm_metric return parse_metric def update_metric(self, arc_scores, rel_scores, arcs, rels, mask, puncts, metric, batch=None): if isinstance(metric, tuple): parse_metric, mlm_metric = metric super().update_metric(arc_scores, rel_scores, arcs, rels, mask, puncts, parse_metric) gold_input_ids = batch['gold_input_ids'] input_ids_mask = batch['input_ids_mask'] pred_input_ids = batch['pred_input_ids'] pred_input_ids = pred_input_ids[input_ids_mask] gold_input_ids = gold_input_ids[input_ids_mask] if len(pred_input_ids): mlm_metric(pred_input_ids, gold_input_ids) else: super().update_metric(arc_scores, rel_scores, arcs, 
rels, mask, puncts, metric)

    def _report(self, loss, metric):
        """Format the parent's report; for a ``(parse, mlm)`` metric pair the mlm metric is appended."""
        if isinstance(metric, tuple):
            parse_metric, mlm_metric = metric
            return super()._report(loss, parse_metric) + f' {mlm_metric}'
        else:
            return super()._report(loss, metric)


================================================
FILE: hanlp/components/parsers/biaffine/variationalbilstm.py
================================================
# MIT License
#
# Copyright (c) 2020 Yu Zhang
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import torch
import torch.nn as nn
from torch.nn.modules.rnn import apply_permutation
from torch.nn.utils.rnn import PackedSequence, pack_padded_sequence, pad_packed_sequence

from hanlp.common.structure import ConfigTracker
from hanlp.layers.dropout import SharedDropout


class VariationalLSTM(nn.Module):
    r"""
    VariationalLSTM is a variant of the vanilla bidirectional LSTM adopted by the Biaffine Parser,
    differing only in its dropout strategy.
It drops nodes in the LSTM layers (input and recurrent connections)
    and applies the same dropout mask at every recurrent timestep.

    APIs are roughly the same as :class:`~torch.nn.LSTM` except that only
    :class:`~torch.nn.utils.rnn.PackedSequence` is allowed as input.

    References:
        - Timothy Dozat and Christopher D. Manning. 2017.
          `Deep Biaffine Attention for Neural Dependency Parsing`_.

    Args:
        input_size (int):
            The number of expected features in the input.
        hidden_size (int):
            The number of features in the hidden state `h`.
        num_layers (int):
            The number of recurrent layers. Default: 1.
        bidirectional (bool):
            If ``True``, becomes a bidirectional LSTM. Default: ``False``
        dropout (float):
            If non-zero, introduces a :class:`SharedDropout` layer on the outputs of each LSTM layer except the last layer.
            Default: 0.

    .. _Deep Biaffine Attention for Neural Dependency Parsing:
        https://openreview.net/forum?id=Hk95PK9le
    """

    def __init__(self, input_size, hidden_size, num_layers=1, bidirectional=False, dropout=0):
        super().__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.dropout = dropout
        # 1 for a unidirectional, 2 for a bidirectional LSTM (bool is added as an int).
        self.num_directions = 1 + self.bidirectional

        self.f_cells = nn.ModuleList()
        if bidirectional:
            self.b_cells = nn.ModuleList()
        for _ in range(self.num_layers):
            self.f_cells.append(nn.LSTMCell(input_size=input_size, hidden_size=hidden_size))
            if bidirectional:
                self.b_cells.append(nn.LSTMCell(input_size=input_size, hidden_size=hidden_size))
            # Every layer after the first consumes the concatenated outputs of the previous layer.
            input_size = hidden_size * self.num_directions

        self.reset_parameters()

    def __repr__(self):
        s = f"{self.input_size}, {self.hidden_size}"
        if self.num_layers > 1:
            s += f", num_layers={self.num_layers}"
        if self.bidirectional:
            s += f", bidirectional={self.bidirectional}"
        if self.dropout > 0:
            s += f", dropout={self.dropout}"

        return f"{self.__class__.__name__}({s})"

    def reset_parameters(self):
        """Initialise every parameter with more than one dimension orthogonally and all others with zeros."""
        for param in self.parameters():
            # apply orthogonal_ to weight
            if len(param.shape) > 1:
                nn.init.orthogonal_(param)
            # apply zeros_ to bias
            else:
                nn.init.zeros_(param)

    def permute_hidden(self, hx, permutation):
        """Apply ``permutation`` to both tensors of ``hx``; ``hx`` is returned unchanged for ``None``."""
        if permutation is None:
            return hx
        h = apply_permutation(hx[0], permutation)
        c = apply_permutation(hx[1], permutation)

        return h, c

    def layer_forward(self, x, hx, cell, batch_sizes, reverse=False):
        """Run one direction of one layer over the per-timestep chunks ``x`` of a packed sequence.

        Returns:
            ``(output, hx_n)``: the concatenated hidden states of all timesteps and the final ``[h, c]``.
        """
        hx_0 = hx_i = hx
        hx_n, output = [], []
        steps = reversed(range(len(x))) if reverse else range(len(x))
        if self.training:
            # One mask for the whole sequence: the same hidden units are dropped at every timestep.
            hid_mask = SharedDropout.get_mask(hx_0[0], self.dropout)

        for t in steps:
            last_batch_size, batch_size = len(hx_i[0]), batch_sizes[t]
            if last_batch_size < batch_size:
                # More sequences are active at this step: add their initial states.
                hx_i = [torch.cat((h, ih[last_batch_size:batch_size]))
                        for h, ih in zip(hx_i, hx_0)]
            else:
                # Sequences that just ended: store their final states and keep only the active ones.
                hx_n.append([h[batch_size:] for h in hx_i])
                hx_i = [h[:batch_size] for h in hx_i]
            hx_i = [h for h in cell(x[t], hx_i)]
            # The undropped hidden state is emitted; dropout only affects the recurrent connection.
            output.append(hx_i[0])
            if self.training:
                hx_i[0] = hx_i[0] * hid_mask[:batch_size]
        if reverse:
            hx_n = hx_i
            output.reverse()
        else:
            hx_n.append(hx_i)
            hx_n = [torch.cat(h) for h in zip(*reversed(hx_n))]
        output = torch.cat(output)

        return output, hx_n

    def forward(self, sequence, hx=None):
        r"""
        Args:
            sequence (~torch.nn.utils.rnn.PackedSequence):
                A packed variable length sequence.
            hx (~torch.Tensor, ~torch.Tensor):
                A tuple composed of two tensors `h` and `c`.
                `h` of shape ``[num_layers*num_directions, batch_size, hidden_size]`` holds the initial hidden state
                for each element in the batch.
                `c` of shape ``[num_layers*num_directions, batch_size, hidden_size]`` holds the initial cell state
                for each element in the batch.
                If `hx` is not provided, both `h` and `c` default to zero.
                Default: ``None``.

        Returns:
            ~torch.nn.utils.rnn.PackedSequence, (~torch.Tensor, ~torch.Tensor):
                The first is a packed variable length sequence.
                The second is a tuple of tensors `h` and `c`.
                `h` of shape ``[num_layers*num_directions, batch_size, hidden_size]`` holds the hidden state for `t=seq_len`.
Like output, the layers can be separated using ``h.view(num_layers, num_directions, batch_size, hidden_size)`` and similarly for c. `c` of shape ``[num_layers*num_directions, batch_size, hidden_size]`` holds the cell state for `t=seq_len`. """ x, batch_sizes = sequence.data, sequence.batch_sizes.tolist() batch_size = batch_sizes[0] h_n, c_n = [], [] if hx is None: ih = x.new_zeros(self.num_layers * self.num_directions, batch_size, self.hidden_size) h, c = ih, ih else: h, c = self.permute_hidden(hx, sequence.sorted_indices) h = h.view(self.num_layers, self.num_directions, batch_size, self.hidden_size) c = c.view(self.num_layers, self.num_directions, batch_size, self.hidden_size) for i in range(self.num_layers): x = torch.split(x, batch_sizes) if self.training: mask = SharedDropout.get_mask(x[0], self.dropout) x = [i * mask[:len(i)] for i in x] x_i, (h_i, c_i) = self.layer_forward(x=x, hx=(h[i, 0], c[i, 0]), cell=self.f_cells[i], batch_sizes=batch_sizes) if self.bidirectional: x_b, (h_b, c_b) = self.layer_forward(x=x, hx=(h[i, 1], c[i, 1]), cell=self.b_cells[i], batch_sizes=batch_sizes, reverse=True) x_i = torch.cat((x_i, x_b), -1) h_i = torch.stack((h_i, h_b)) c_i = torch.stack((c_i, c_b)) x = x_i h_n.append(h_i) c_n.append(h_i) x = PackedSequence(x, sequence.batch_sizes, sequence.sorted_indices, sequence.unsorted_indices) hx = torch.cat(h_n, 0), torch.cat(c_n, 0) hx = self.permute_hidden(hx, sequence.unsorted_indices) return x, hx class VariationalLSTMEncoder(VariationalLSTM, ConfigTracker): def __init__(self, input_size, hidden_size, num_layers=1, bidirectional=False, variational_dropout=0, word_dropout=0, ): super().__init__(input_size, hidden_size, num_layers, bidirectional, variational_dropout) ConfigTracker.__init__(self, locals()) self.lstm_dropout = SharedDropout(p=word_dropout) # noinspection PyMethodOverriding def forward(self, embed, mask): batch_size, seq_len = mask.shape x = pack_padded_sequence(embed, mask.sum(1), True, False) x, _ = super().forward(x) 
x, _ = pad_packed_sequence(x, True, total_length=seq_len) x = self.lstm_dropout(x) return x def get_output_dim(self): return self.hidden_size * self.num_directions ================================================ FILE: hanlp/components/parsers/biaffine_parser_tf.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2019-12-22 12:47 import logging import math import os from typing import List import numpy as np import tensorflow as tf from hanlp.components.parsers.parse_alg import unique_root, adjust_root_score, chu_liu_edmonds from hanlp.layers.transformers.loader_tf import build_transformer from hanlp.common.keras_component import KerasComponent from hanlp.components.parsers.alg_tf import tarjan from hanlp.components.parsers.biaffine_tf.model import BiaffineModelTF from hanlp.transform.conll_tf import CoNLL_DEP_Transform, CoNLL_Transformer_Transform, CoNLL_SDP_Transform from hanlp.layers.embeddings.util_tf import build_embedding from hanlp.layers.transformers.tf_imports import PreTrainedTokenizer, TFAutoModel, TFPreTrainedModel, AutoTokenizer, \ TFAutoModelWithLMHead, BertTokenizerFast, AlbertConfig, BertTokenizer, TFBertModel from hanlp.layers.transformers.utils_tf import build_adamw_optimizer from hanlp.metrics.parsing.labeled_f1_tf import LabeledF1TF from hanlp.metrics.parsing.labeled_score import LabeledScore from hanlp_common.util import merge_locals_kwargs class BiaffineDependencyParserTF(KerasComponent): def __init__(self, transform: CoNLL_DEP_Transform = None) -> None: if not transform: transform = CoNLL_DEP_Transform() super().__init__(transform) self.transform: CoNLL_DEP_Transform = transform self.model: BiaffineModelTF = None def build_model(self, pretrained_embed, n_embed, training, **kwargs) -> tf.keras.Model: if training: self.config.n_words = len(self.transform.form_vocab) else: self.config.lstm_dropout = 0. 
# keras will use cuda lstm when config.lstm_dropout is 0
        self.config.n_feats = len(self.transform.cpos_vocab)
        self._init_config()
        pretrained: tf.keras.layers.Embedding = build_embedding(pretrained_embed, self.transform.form_vocab,
                                                                self.transform) if pretrained_embed else None
        if pretrained_embed:
            # The embedding file decides the embedding size, overriding the ``n_embed`` argument.
            self.config.n_embed = pretrained.output_dim
        model = BiaffineModelTF(self.config, pretrained)
        return model

    def _init_config(self):
        """Copy vocabulary derived sizes and special indices into ``self.config``."""
        self.config.n_rels = len(self.transform.rel_vocab)
        self.config.pad_index = self.transform.form_vocab.pad_idx
        self.config.unk_index = self.transform.form_vocab.unk_idx
        self.config.bos_index = 2  # hard-coded - presumably the index of the BOS token in form_vocab; verify

    def load_weights(self, save_dir, filename='model.h5', functional=False, **kwargs):
        """Load weights and optionally replace the model by its functional form."""
        super().load_weights(save_dir, filename)
        if functional:
            self.model = self.model.to_functional()

    def fit(self, trn_data, dev_data, save_dir,
            n_embed=100,
            pretrained_embed=None,
            embed_dropout=.33,
            n_lstm_hidden=400,
            n_lstm_layers=3,
            lstm_dropout=.33,
            n_mlp_arc=500,
            n_mlp_rel=100,
            mlp_dropout=.33,
            optimizer='adam',
            lr=2e-3,
            mu=.9,
            nu=.9,
            epsilon=1e-12,
            clip=5.0,
            decay=.75,
            decay_steps=5000,
            patience=100,
            arc_loss='sparse_categorical_crossentropy',
            rel_loss='sparse_categorical_crossentropy',
            metrics=('UAS', 'LAS'),
            n_buckets=32,
            batch_size=5000,
            epochs=50000,
            early_stopping_patience=100,
            tree=False,
            punct=False,
            min_freq=2,
            run_eagerly=False,
            logger=None,
            verbose=True,
            **kwargs):
        """Forward every argument to the parent ``fit``.

        The arguments are collected through ``locals()``, so no local variable may be introduced before
        the call below.
        """
        return super().fit(**merge_locals_kwargs(locals(), kwargs))

    # noinspection PyMethodOverriding
    def train_loop(self, trn_data, dev_data, epochs, num_examples, train_steps_per_epoch, dev_steps, model, optimizer,
                   loss, metrics, callbacks,
                   logger: logging.Logger, arc_loss, rel_loss,
                   **kwargs):
        """Custom training loop that drives the Keras callbacks by hand.

        The ``arc_loss``/``rel_loss`` arguments are immediately replaced by the pair unpacked from ``loss``,
        and ``train_steps_per_epoch`` is recomputed by iterating ``trn_data`` once.
        """
        arc_loss, rel_loss = loss
        # because we are customizing batching
        train_steps_per_epoch = len(list(iter(trn_data)))
        # progbar: tf.keras.callbacks.ProgbarLogger = callbacks[-1]
        c: tf.keras.callbacks.Callback = None
        metric = self._build_metrics()
        for c in callbacks:
            if not hasattr(c, 'params'):
                c.params = dict()
            c.params['epochs'] = epochs
            c.params['trn_data'] = trn_data
            c.params['metrics'] = ['loss'] + self.config.metrics
            # Every training metric gets a ``val_`` twin for the dev set.
            c.params['metrics'] = c.params['metrics'] + [f'val_{k}' for k in c.params['metrics']]
            c.on_train_begin()
        for epoch in range(epochs):
            metric.reset_states()
            for c in callbacks:
                c.params['steps'] = train_steps_per_epoch
                c.on_epoch_begin(epoch)
            for idx, ((words, feats), (arcs, rels)) in enumerate(iter(trn_data)):
                logs = {}
                for c in callbacks:
                    c.on_batch_begin(idx, logs)
                # Padding and BOS positions are excluded from loss and metrics.
                mask = tf.not_equal(words, self.config.pad_index) & tf.not_equal(words, self.config.bos_index)
                loss, arc_scores, rel_scores = self.train_batch(words, feats, arcs, rels, mask,
                                                                optimizer, arc_loss, rel_loss)
                self.run_metrics(arcs, rels, arc_scores, rel_scores, words, mask, metric)
                logs['loss'] = loss
                logs.update(metric.to_dict())
                if epoch == epochs - 1:
                    self.model.stop_training = True
                for c in callbacks:
                    c.on_batch_end(idx, logs)
            # evaluate on dev
            metric.reset_states()
            logs = {}
            for idx, ((words, feats), (arcs, rels)) in enumerate(iter(dev_data)):
                arc_scores, rel_scores, loss, mask, arc_preds, rel_preds = self.evaluate_batch(words, feats, arcs, rels,
                                                                                               arc_loss, rel_loss,
                                                                                               metric)
                logs['val_loss'] = loss
                logs.update((f'val_{k}', v) for k, v in metric.to_dict().items())

            for c in callbacks:
                c.on_epoch_end(epoch, logs)
            # A callback (or the last-epoch branch above) may have requested a stop.
            if getattr(self.model, 'stop_training', None):
                break

        for c in callbacks:
            c.on_train_end()

    def evaluate(self, input_path: str, save_dir=None, output=False, batch_size=None, logger: logging.Logger = None,
                 callbacks: List[tf.keras.callbacks.Callback] = None, warm_up=False, verbose=True, **kwargs):
        """Delegate to the parent ``evaluate``, defaulting ``batch_size`` to the configured one."""
        if batch_size is None:
            batch_size = self.config.batch_size
        return super().evaluate(input_path, save_dir, output, batch_size, logger, callbacks, warm_up, verbose, **kwargs)

    def evaluate_batch(self, words, feats, arcs, rels, arc_loss, rel_loss, metric):
        """Score one batch, compute its loss and update ``metric``.

        Returns:
            ``(arc_scores, rel_scores, loss, mask, arc_preds, rel_preds)``.
        """
        mask = tf.not_equal(words, self.config.pad_index) & tf.not_equal(words, self.config.bos_index)
        arc_scores, rel_scores = self.model((words, feats))
        loss = self.get_loss(arc_scores, rel_scores, arcs, rels, mask, arc_loss, rel_loss)
        arc_preds, rel_preds = self.run_metrics(arcs, rels, arc_scores, rel_scores, words, mask, metric)
        return arc_scores, rel_scores, loss, mask, arc_preds, rel_preds

    def _build_metrics(self):
        """Return ``LabeledScore`` for ``['UAS', 'LAS']`` and ``LabeledF1TF`` for anything else.

        Side effect: a tuple ``config.metrics`` is converted to a list.
        """
        if isinstance(self.config.metrics, tuple):
            self.config.metrics = list(self.config.metrics)
        if self.config.metrics == ['UAS', 'LAS']:
            metric = LabeledScore()
        else:
            metric = LabeledF1TF()
        return metric

    def run_metrics(self, arcs, rels, arc_scores, rel_scores, words, mask, metric):
        """Decode the scores, update ``metric`` and return ``(arc_preds, rel_preds)``."""
        arc_preds, rel_preds = self.decode(arc_scores, rel_scores, mask)
        # ignore all punctuation if not specified
        if not self.config.punct:
            mask &= tf.reduce_all(tf.not_equal(tf.expand_dims(words, axis=-1), self.transform.puncts), axis=-1)
        metric(arc_preds, rel_preds, arcs, rels, mask)
        return arc_preds, rel_preds

    def train_batch(self, words, feats, arcs, rels, mask, optimizer, arc_loss, rel_loss):
        """Run one optimisation step and return ``(loss, arc_scores, rel_scores)``."""
        with tf.GradientTape() as tape:
            arc_scores, rel_scores = self.model((words, feats), training=True)
            loss = self.get_loss(arc_scores, rel_scores, arcs, rels, mask, arc_loss, rel_loss)
        grads = tape.gradient(loss, self.model.trainable_variables)
        optimizer.apply_gradients(zip(grads, self.model.trainable_variables))
        return loss, arc_scores, rel_scores

    def get_loss(self, arc_scores, rel_scores, arcs, rels, mask, arc_loss, rel_loss):
        """Return the sum of the arc loss and the relation loss over the masked positions."""
        arc_scores, arcs = arc_scores[mask], arcs[mask]
        rel_scores, rels = rel_scores[mask], rels[mask]
        # For every kept token pick the relation scores that belong to its gold head.
        rel_scores = tf.gather_nd(rel_scores, tf.stack([tf.range(len(arcs), dtype=tf.int64), arcs], axis=1))
        arc_loss = arc_loss(arcs, arc_scores)
        rel_loss = rel_loss(rels, rel_scores)
        loss = arc_loss + rel_loss

        return loss

    def build_optimizer(self, optimizer='adam', lr=2e-3, mu=.9, nu=.9, epsilon=1e-12, clip=5.0, decay=.75,
                        decay_steps=5000, **kwargs):
        """Build ``AdamTF`` with an exponentially decaying learning rate for ``'adam'``; else defer to the parent."""
        if optimizer == 'adam':
            scheduler = tf.keras.optimizers.schedules.ExponentialDecay(initial_learning_rate=lr,
                                                                       decay_steps=decay_steps,
                                                                       decay_rate=decay)
            from hanlp.optimizers.adamw.optimization import AdamTF
            optimizer = AdamTF(learning_rate=scheduler,
                               beta_1=mu,
                               beta_2=nu,
                               epsilon=epsilon,
                               clipnorm=clip)
            return optimizer
        return super().build_optimizer(optimizer, **kwargs)

    # noinspection PyMethodOverriding
    def build_loss(self, arc_loss, rel_loss, **kwargs):
        """Resolve the two loss names to loss objects and return them as ``(arc_loss, rel_loss)``.

        Names other than the ones handled here are resolved by the parent ``build_loss``.
        """
        if arc_loss == 'binary_crossentropy':
            arc_loss = tf.losses.BinaryCrossentropy(from_logits=True)
        else:
            arc_loss = tf.keras.losses.SparseCategoricalCrossentropy(
                from_logits=True) if arc_loss == 'sparse_categorical_crossentropy' else super().build_loss(arc_loss)
        rel_loss = tf.keras.losses.SparseCategoricalCrossentropy(
            from_logits=True) if rel_loss == 'sparse_categorical_crossentropy' else super().build_loss(rel_loss)
        return arc_loss, rel_loss

    @property
    def sample_data(self):
        # Two tiny int64 tensors of shape (2, 3), one for each of the model's two inputs.
        return tf.constant([[2, 3, 4], [2, 5, 0]], dtype=tf.int64), tf.constant([[1, 2, 3], [4, 5, 0]], dtype=tf.int64)

    def num_samples_in(self, dataset):
        """Sum the length of the first input of every batch in ``dataset``."""
        return sum(len(x[0][0]) for x in iter(dataset))

    def build_train_dataset(self, trn_data, batch_size, num_examples):
        """Build a shuffled dataset from ``trn_data``; ``num_examples`` is not used."""
        trn_data = self.transform.file_to_dataset(trn_data, batch_size=batch_size,
                                                  shuffle=True,
                                                  repeat=None)
        return trn_data

    # noinspection PyMethodOverriding
    def build_callbacks(self, save_dir, logger, metrics, **kwargs):
        """Extend the parent's callbacks with a progress bar and attach params and model to all of them."""
        callbacks = super().build_callbacks(save_dir, logger, metrics=metrics, **kwargs)
        if isinstance(metrics, tuple):
            metrics = list(metrics)
        callbacks.append(self.build_progbar(metrics))
        params = {'verbose': 1, 'epochs': 1}
        for c in callbacks:
            c.set_params(params)
            c.set_model(self.model)
        return callbacks

    def build_progbar(self, metrics, training=True):
        # The conditional covers the whole sum: when ``training`` is falsy no metric is stateful.
        return tf.keras.callbacks.ProgbarLogger(count_mode='steps',
                                                stateful_metrics=metrics + [f'val_{k}' for k in
                                                                            metrics] if training else [])

    def decode(self, arc_scores, rel_scores, mask):
        """Return ``(arc_preds, rel_preds)``.

        With ``config.tree`` set to ``'tarjan'`` or ``'mst'`` every sentence is decoded separately with the
        corresponding algorithm; any other truthy value raises ``ValueError``. Otherwise a plain argmax is used.
        """
        if self.config.tree:
            root_rel_idx = self.transform.root_rel_idx
            root_rel_onehot = np.eye(len(self.transform.rel_vocab))[root_rel_idx]
            arc_preds = np.zeros_like(mask, dtype=np.int64)
            rel_preds = np.zeros_like(mask, dtype=np.int64)
            for arc, rel, m, arc_pred, rel_pred in zip(arc_scores, rel_scores, mask, arc_preds, rel_preds):
                # ``+ 1`` accounts for the BOS position, which the mask excludes.
                length = int(tf.math.count_nonzero(m)) + 1
                arc = arc[:length, :length]
                arc_probs = tf.nn.softmax(arc).numpy()
                m = np.expand_dims(m.numpy()[:length], -1)
                if self.config.tree == 'tarjan':
                    heads = tarjan(arc_probs, length, m)
                elif self.config.tree == 'mst':
                    heads, head_probs, tokens = unique_root(arc_probs, m, length)
                    arc = arc.numpy()
                    adjust_root_score(arc, heads, root_rel_idx)
                    heads = chu_liu_edmonds(arc, length)
                else:
                    raise ValueError(f'Unknown tree algorithm {self.config.tree}')
                # ``arc_pred``/``rel_pred`` are rows of the preallocated arrays, so this writes in place.
                arc_pred[:length] = heads
                root = np.where(heads[np.arange(1, length)] == 0)[0] + 1
                rel_prob = tf.nn.softmax(rel[:length, :length, :]).numpy()
                rel_prob = rel_prob[np.arange(length), heads]
                # The root token must take the root relation and no other token may take it.
                rel_prob[root] = root_rel_onehot
                rel_prob[np.arange(length) != root, np.arange(len(self.transform.rel_vocab)) == root_rel_idx] = 0
                # rels = rel_argmax(rel_prob, length, root_rel_idx)
                rels = np.argmax(rel_prob, axis=1)
                rel_pred[:length] = rels
            arc_preds = tf.constant(arc_preds)
            rel_preds = tf.constant(rel_preds)
        else:
            arc_preds = tf.argmax(arc_scores, -1)
            rel_preds = tf.argmax(rel_scores, -1)
            # Keep only the relation that belongs to the predicted head of each token.
            rel_preds = tf.squeeze(tf.gather(rel_preds, tf.expand_dims(arc_preds, -1), batch_dims=2), axis=-1)

        return arc_preds, rel_preds

    def evaluate_dataset(self, tst_data, callbacks, output, num_batches, ret_scores=None, **kwargs):
        # The ``callbacks`` argument is replaced by a single progress bar below; ``num_batches`` is not used.
        if 'mask_p' in self.config:
            self.config['mask_p'] = None
        arc_loss, rel_loss = self.build_loss(**self.config)
        callbacks = [self.build_progbar(self.config['metrics'])]
        steps_per_epoch = len(list(iter(tst_data)))
        metric = self._build_metrics()
        params = {'verbose': 1, 'epochs': 1, 'metrics': ['loss'] + self.config.metrics, 'steps': steps_per_epoch}
        for c in callbacks:
            c.set_params(params)
            c.on_test_begin()
            c.on_epoch_end(0)
        logs = {}
        if ret_scores:
            scores = []
        if output:
            ext = os.path.splitext(output)[-1]
            # From here on ``output`` is the open file object, no longer the path.
            output = open(output, 'w', encoding='utf-8')
        for idx, ((words,
feats), Y) in enumerate(iter(tst_data)): arcs, rels = Y[0], Y[1] for c in callbacks: c.on_test_batch_begin(idx, logs) arc_scores, rel_scores, loss, mask, arc_preds, rel_preds = self.evaluate_batch(words, feats, arcs, rels, arc_loss, rel_loss, metric) if ret_scores: scores.append((arc_scores.numpy(), rel_scores.numpy(), mask.numpy())) if output: for sent in self.transform.XY_to_inputs_outputs((words, feats, mask), (arc_preds, rel_preds), conll=ext, arc_scores=arc_scores, rel_scores=rel_scores): output.write(str(sent)) output.write('\n\n') logs['loss'] = loss logs.update(metric.to_dict()) for c in callbacks: c.on_test_batch_end(idx, logs) for c in callbacks: c.on_epoch_end(0) c.on_test_end() if output: output.close() loss = float(c.progbar._values['loss'][0] / c.progbar._values['loss'][1]) outputs = loss, metric.to_dict(), False if ret_scores: outputs += (scores,) return outputs def predict_batch(self, batch, inputs=None, conll=True, **kwargs): ((words, feats), (arcs, rels)) = batch mask = tf.not_equal(words, self.config.pad_index) & tf.not_equal(words, self.config.bos_index) arc_scores, rel_scores = self.model((words, feats)) arc_preds, rel_preds = self.decode(arc_scores, rel_scores, mask) for sent in self.transform.XY_to_inputs_outputs((words, feats, mask), (arc_preds, rel_preds), gold=False, inputs=inputs, conll=conll): yield sent def compile_model(self, optimizer, loss, metrics): super().compile_model(optimizer, loss, metrics) class BiaffineSemanticDependencyParserTF(BiaffineDependencyParserTF): def __init__(self, transform: CoNLL_SDP_Transform = None) -> None: if not transform: transform = CoNLL_SDP_Transform() # noinspection PyTypeChecker super().__init__(transform) self.transform: CoNLL_SDP_Transform = transform def fit(self, trn_data, dev_data, save_dir, n_embed=100, pretrained_embed=None, embed_dropout=.33, n_lstm_hidden=400, n_lstm_layers=3, lstm_dropout=.33, n_mlp_arc=500, n_mlp_rel=100, mlp_dropout=.33, optimizer='adam', lr=2e-3, mu=.9, nu=.9, 
def get_loss(self, arc_scores, rel_scores, arcs, rels, mask, arc_loss, rel_loss):
    """Compute the joint arc + relation loss for graph (semantic dependency) parsing.

    Args:
        arc_scores: Arc scores produced by the model.
        rel_scores: Relation scores produced by the model.
        arcs: Gold arcs; after masking they are also used as a boolean
            selector for the relation terms.
        rels: Gold relations.
        mask: Rank-2 token mask; it is expanded to a pairwise mask that is
            true only where both positions are unmasked.
        arc_loss: Callable ``(y_true, y_pred)`` for arcs.
        rel_loss: Callable ``(y_true, y_pred)`` for relations.

    Returns:
        The sum of the arc loss and the relation loss.
    """
    width = tf.shape(mask)[-1]
    pair_mask = tf.tile(tf.expand_dims(mask, -1), [1, 1, width])
    pair_mask = pair_mask & tf.transpose(pair_mask, [0, 2, 1])

    kept_arc_scores = arc_scores[pair_mask]
    kept_arcs = arcs[pair_mask]
    # Relations are only scored on positions that carry a gold arc.
    kept_rel_scores = rel_scores[pair_mask][kept_arcs]
    kept_rels = rels[pair_mask][kept_arcs]

    arc_term = arc_loss(kept_arcs, kept_arc_scores)
    rel_term = rel_loss(kept_rels, kept_rel_scores)
    return arc_term + rel_term
isinstance(transformer, str): if 'albert_chinese' in transformer: tokenizer = BertTokenizerFast.from_pretrained(transformer, add_special_tokens=False) transformer: TFPreTrainedModel = TFAutoModel.from_pretrained(transformer, name=transformer, from_pt=True) elif transformer.startswith('albert') and transformer.endswith('zh'): transformer, tokenizer, path = build_transformer(transformer) transformer.config = AlbertConfig.from_json_file(os.path.join(path, "albert_config.json")) tokenizer = BertTokenizer.from_pretrained(os.path.join(path, "vocab_chinese.txt"), add_special_tokens=False) elif 'chinese-roberta' in transformer: tokenizer = BertTokenizer.from_pretrained(transformer) transformer = TFBertModel.from_pretrained(transformer, name=transformer, from_pt=True) else: tokenizer: PreTrainedTokenizer = AutoTokenizer.from_pretrained(transformer) try: transformer: TFPreTrainedModel = TFAutoModel.from_pretrained(transformer, name=transformer) except (TypeError, OSError): transformer: TFPreTrainedModel = TFAutoModel.from_pretrained(transformer, name=transformer, from_pt=True) elif transformer[0] == 'AutoModelWithLMHead': tokenizer: PreTrainedTokenizer = AutoTokenizer.from_pretrained(transformer[1]) transformer: TFAutoModelWithLMHead = TFAutoModelWithLMHead.from_pretrained(transformer[1]) else: raise ValueError(f'Unknown identifier {transformer}') self.transform.tokenizer = tokenizer if self.config.get('fp16', None) or self.config.get('use_amp', None): policy = tf.keras.mixed_precision.experimental.Policy('mixed_float16') tf.keras.mixed_precision.experimental.set_policy(policy) # tf.config.optimizer.set_experimental_options({"auto_mixed_precision": True}) transformer.set_weights([w.astype('float16') for w in transformer.get_weights()]) self.transform.transformer_config = transformer.config return transformer # noinspection PyMethodOverriding def fit(self, trn_data, dev_data, save_dir, transformer, max_seq_length=256, transformer_dropout=.33, d_positional=None, n_mlp_arc=500, 
@property
def sample_data(self):
    """Return the input part of the first batch built from two toy sentences.

    When ``use_pos`` is set in the config the toy tokens are ``(form, tag)``
    pairs, otherwise they are plain word strings.
    """
    to_dataset = self.transform.inputs_to_dataset
    if self.config.get('use_pos', None):
        sentences = [[('Hello', 'NN'), ('world', 'NN')], [('HanLP', 'NN'), ('is', 'NN'), ('good', 'NN')]]
    else:
        sentences = [['Hello', 'world'], ['HanLP', 'is', 'good']]
    first_batch = next(iter(to_dataset(sentences)))
    return first_batch[0]
def on_train_begin(self, logs=None):
    """Prepare gradient-accumulation state before training starts.

    Args:
        logs: Unused. Accepted so the method matches the
            ``tf.keras.callbacks.Callback.on_train_begin(logs=None)`` hook and
            can be invoked with or without the argument.
    """
    # One zero-initialised, non-trainable accumulator per trainable variable.
    self.params['accum_grads'] = [tf.Variable(tf.zeros_like(tv.read_value()), trainable=False) for tv in
                                  self.model.trainable_variables]
    # Number of samples accumulated since the last optimizer step.
    self.params['trained_samples'] = 0
    # Names used later to route gradients to the transformer-specific optimizer.
    self.params['transformer_variable_names'] = {x.name for x in self.model.transformer.trainable_variables}
class BiaffineTransformerSemanticDependencyParser(BiaffineTransformerDependencyParserTF):
    """Transformer-based biaffine parser for dependency graphs.

    Reuses the transformer parser for everything except the loss and the
    decoding step, which are delegated to ``BiaffineSemanticDependencyParserTF``.
    """

    def __init__(self, transform: CoNLL_Transformer_Transform = None) -> None:
        # Default to a transform constructed with ``graph=True`` when none is supplied.
        if not transform:
            transform = CoNLL_Transformer_Transform(graph=True)
        super().__init__(transform)

    def get_loss(self, arc_scores, rel_scores, arcs, rels, mask, arc_loss, rel_loss):
        # Borrow the graph loss from the non-transformer semantic parser;
        # called unbound because that class is not one of this class's bases.
        return BiaffineSemanticDependencyParserTF.get_loss(self, arc_scores, rel_scores, arcs, rels, mask, arc_loss,
                                                           rel_loss)

    def fit(self, trn_data, dev_data, save_dir, transformer, max_seq_length=256, transformer_dropout=.33,
            d_positional=None, n_mlp_arc=500, n_mlp_rel=100, mlp_dropout=.33, optimizer='adamw', learning_rate=5e-5,
            learning_rate_transformer=None, weight_decay_rate=0, epsilon=1e-8, clipnorm=None, fp16=False,
            warmup_steps_ratio=0, arc_loss='binary_crossentropy', rel_loss='sparse_categorical_crossentropy',
            metrics=('UF', 'LF'), batch_size=3000, samples_per_batch=150, max_samples_per_batch=None, epochs=100,
            tree=False, punct=False, token_mapping=None, enhanced_only=False, run_eagerly=False, logger=None,
            verbose=True, **kwargs):
        # Same entry point as the parent's ``fit`` with different defaults
        # (binary cross-entropy arc loss, 'UF'/'LF' metrics) plus ``enhanced_only``.
        # NOTE(review): every argument is forwarded through ``locals()``, so no
        # extra local variable may be introduced before this call — confirm
        # against ``merge_locals_kwargs``.
        return super().fit(**merge_locals_kwargs(locals(), kwargs))

    def decode(self, arc_scores, rel_scores, mask):
        # Delegate decoding to the non-transformer semantic parser (unbound call, see ``get_loss``).
        return BiaffineSemanticDependencyParserTF.decode(self, arc_scores, rel_scores, mask)
def kmeans(x, k, seed=None):
    """Cluster sentence lengths into at most ``k`` buckets with 1-D k-means.

    See https://github.com/zysite/biaffine-parser/blob/master/parser/utils/alg.py#L7

    Args:
        x(list): Lengths of sentences
        k(int): Number of clusters requested; must not exceed the number of
            distinct values in ``x``.
        seed: Seed forwarded to ``tf.random.shuffle`` when picking the initial
            centroids. (Default value = None)

    Returns:
        A pair ``(centroids, clusters)``: the centroids of the non-empty
        clusters, and for each of them the list of indices into ``x`` that were
        assigned to it.
    """
    x = tf.constant(x, dtype=tf.float32)
    # count the frequency of each datapoint
    d, indices, f = tf.unique_with_counts(x, tf.int32)
    f = tf.cast(f, tf.float32)
    # calculate the sum of the values of the same datapoints
    total = d * f
    # initialize k centroids randomly
    c, old = tf.random.shuffle(d, seed)[:k], None
    # assign labels to each datapoint based on centroids
    dists = tf.abs(tf.expand_dims(d, -1) - c)
    y = tf.argmin(dists, axis=-1, output_type=tf.int32)
    dists = tf.gather_nd(dists, tf.transpose(tf.stack([tf.range(tf.shape(dists)[0], dtype=tf.int32), y])))
    # make sure number of datapoints is greater than that of clusters
    assert len(d) >= k, f"unable to assign {len(d)} datapoints to {k} clusters"

    while old is None or not tf.reduce_all(c == old):
        # if an empty cluster is encountered,
        # choose the farthest datapoint from the biggest cluster
        # and move that the empty one
        for i in range(k):
            if not tf.reduce_any(y == i):
                mask = tf.cast(y == tf.expand_dims(tf.range(k, dtype=tf.int32), -1), tf.float32)
                lens = tf.reduce_sum(mask, axis=-1)
                biggest = view(nonzero(mask[tf.argmax(lens)]), -1)
                farthest = tf.argmax(tf.gather(dists, biggest))
                # TF tensors are immutable: the scatter op returns a NEW tensor,
                # so the result must be bound back to ``y``. Dropping it leaves
                # the cluster empty, which yields 0/0 = NaN centroids below and
                # a loop condition that can never become true.
                y = tf.tensor_scatter_nd_update(y, tf.expand_dims(tf.expand_dims(biggest[farthest], -1), -1), [i])
        mask = tf.cast(y == tf.expand_dims(tf.range(k, dtype=tf.int32), -1), tf.float32)
        # update the centroids
        c, old = tf.cast(tf.reduce_sum(total * mask, axis=-1), tf.float32) / tf.cast(tf.reduce_sum(f * mask, axis=-1),
                                                                                     tf.float32), c
        # re-assign all datapoints to clusters
        dists = tf.abs(tf.expand_dims(d, -1) - c)
        y = tf.argmin(dists, axis=-1, output_type=tf.int32)
        dists = tf.gather_nd(dists, tf.transpose(tf.stack([tf.range(tf.shape(dists)[0], dtype=tf.int32), y])))
    # assign all datapoints to the new-generated clusters
    # without considering the empty ones
    y, (assigned, _) = tf.gather(y, indices), tf.unique(y)
    # get the centroids of the assigned clusters
    centroids = tf.gather(c, assigned).numpy().tolist()
    # map all values of datapoints to buckets
    clusters = [tf.squeeze(tf.where(y == i), axis=-1).numpy().tolist() for i in assigned]

    return centroids, clusters
exclude _GO, _EOS, and _PAD) """ self._edges = defaultdict(set) self._vertices = set((0,)) for dep, head in enumerate(prediction[tokens]): self._vertices.add(dep + 1) self._edges[head].add(dep + 1) self._indices = {} self._lowlinks = {} self._onstack = defaultdict(lambda: False) self._SCCs = [] index = 0 stack = [] for v in self.vertices: if v not in self.indices: self.strongconnect(v, index, stack) # ============================================================= def strongconnect(self, v, index, stack): """ Args: v: index: stack: Returns: """ self._indices[v] = index self._lowlinks[v] = index index += 1 stack.append(v) self._onstack[v] = True for w in self.edges[v]: if w not in self.indices: self.strongconnect(w, index, stack) self._lowlinks[v] = min(self._lowlinks[v], self._lowlinks[w]) elif self._onstack[w]: self._lowlinks[v] = min(self._lowlinks[v], self._indices[w]) if self._lowlinks[v] == self._indices[v]: self._SCCs.append(set()) while stack[-1] != v: w = stack.pop() self._onstack[w] = False self._SCCs[-1].add(w) w = stack.pop() self._onstack[w] = False self._SCCs[-1].add(w) return # ====================== @property def edges(self): return self._edges @property def vertices(self): return self._vertices @property def indices(self): return self._indices @property def SCCs(self): return self._SCCs def tarjan(parse_probs, length, tokens_to_keep, ensure_tree=True): """Adopted from Timothy Dozat https://github.com/tdozat/Parser/blob/master/lib/models/nn.py Args: parse_probs(NDArray): seq_len x seq_len, the probability of arcs length(NDArray): sentence length including ROOT tokens_to_keep(NDArray): mask matrix ensure_tree: (Default value = True) Returns: """ if ensure_tree: I = np.eye(len(tokens_to_keep)) # block loops and pad heads parse_probs = parse_probs * tokens_to_keep * (1 - I) parse_preds = np.argmax(parse_probs, axis=1) tokens = np.arange(1, length) roots = np.where(parse_preds[tokens] == 0)[0] + 1 # ensure at least one root if len(roots) < 1: # The current 
root probabilities root_probs = parse_probs[tokens, 0] # The current head probabilities old_head_probs = parse_probs[tokens, parse_preds[tokens]] # Get new potential root probabilities new_root_probs = root_probs / old_head_probs # Select the most probable root new_root = tokens[np.argmax(new_root_probs)] # Make the change parse_preds[new_root] = 0 # ensure at most one root elif len(roots) > 1: # The probabilities of the current heads root_probs = parse_probs[roots, 0] # Set the probability of depending on the root zero parse_probs[roots, 0] = 0 # Get new potential heads and their probabilities new_heads = np.argmax(parse_probs[roots][:, tokens], axis=1) + 1 new_head_probs = parse_probs[roots, new_heads] / root_probs # Select the most probable root new_root = roots[np.argmin(new_head_probs)] # Make the change parse_preds[roots] = new_heads parse_preds[new_root] = 0 # remove cycles tarjan = Tarjan(parse_preds, tokens) for SCC in tarjan.SCCs: if len(SCC) > 1: dependents = set() to_visit = set(SCC) while len(to_visit) > 0: node = to_visit.pop() if not node in dependents: dependents.add(node) to_visit.update(tarjan.edges[node]) # The indices of the nodes that participate in the cycle cycle = np.array(list(SCC)) # The probabilities of the current heads old_heads = parse_preds[cycle] old_head_probs = parse_probs[cycle, old_heads] # Set the probability of depending on a non-head to zero non_heads = np.array(list(dependents)) parse_probs[np.repeat(cycle, len(non_heads)), np.repeat([non_heads], len(cycle), axis=0).flatten()] = 0 # Get new potential heads and their probabilities new_heads = np.argmax(parse_probs[cycle][:, tokens], axis=1) + 1 new_head_probs = parse_probs[cycle, new_heads] / old_head_probs # Select the most probable change change = np.argmax(new_head_probs) changed_cycle = cycle[change] old_head = old_heads[change] new_head = new_heads[change] # Make the change parse_preds[changed_cycle] = new_head tarjan.edges[new_head].add(changed_cycle) 
def rel_argmax(rel_probs, length, root, ensure_tree=True):
    """Fix the relation prediction by heuristic rules so exactly one token is labelled root.

    Args:
        rel_probs(NDArray): seq_len x rel_size relation probabilities. The
            array is not modified.
        length: real sentence length (position 0 is skipped when looking for
            root-labelled tokens)
        root: index of the root relation
        ensure_tree: when false, the plain row-wise argmax is returned.
            (Default value = True)

    Returns:
        NDArray with one relation index per row of ``rel_probs``.
    """
    rel_preds = np.argmax(rel_probs, axis=1)
    if not ensure_tree:
        return rel_preds

    tokens = np.arange(1, length)
    if len(tokens) == 0:
        # Nothing but position 0: there is no token that could carry the root
        # relation, and argmax over an empty slice would raise.
        return rel_preds

    roots = np.where(rel_preds[tokens] == root)[0] + 1
    if len(roots) < 1:
        # No token is labelled root: promote the token most likely to be root.
        rel_preds[1 + np.argmax(rel_probs[tokens, root])] = root
    elif len(roots) > 1:
        # Several tokens are labelled root: keep the one whose best alternative
        # is least likely relative to its root probability, relabel the others.
        root_probs = rel_probs[roots, root]
        # Work on a private copy so the caller's probabilities stay untouched.
        rel_probs = np.array(rel_probs, copy=True)
        rel_probs[roots, root] = 0
        new_rel_preds = np.argmax(rel_probs[roots], axis=1)
        new_rel_probs = rel_probs[roots, new_rel_preds] / root_probs
        new_root = roots[np.argmin(new_rel_probs)]
        rel_preds[roots] = new_rel_preds
        rel_preds[new_root] = root
    return rel_preds
class IndependentDropout(tf.keras.layers.Layer):
    """Drop each input independently per position and rescale the survivors.

    A separate Bernoulli mask over the first two dimensions is drawn for every
    input. At each position the kept inputs are scaled by
    ``len(inputs) / number_kept`` so the summed magnitude is preserved.
    """

    def __init__(self, p=0.5, trainable=True, name=None, dtype=None, dynamic=False, **kwargs):
        """Dropout on the first two dimensions

        Args:
            p: Probability of dropping an input at a position.
            trainable, name, dtype, dynamic, **kwargs: Forwarded to
                ``tf.keras.layers.Layer``.
        """
        super().__init__(trainable, name, dtype, dynamic, **kwargs)
        self.p = p

    def extra_repr(self):
        return f"p={self.p}"

    def call(self, inputs, training=None, **kwargs):
        """Apply the dropout to a list of tensors.

        Args:
            inputs: List of tensors sharing their first two dimensions.
            training: Dropout is applied only when this is truthy.

        Returns:
            The list of (possibly masked and rescaled) tensors.
        """
        if training and self.p > 0:
            masks = [tf_bernoulli(tf.shape(x)[:2], 1 - self.p) for x in inputs]
            total = sum(masks)
            # Element-wise max(total, 1): positions where every input was
            # dropped must not divide by zero. A global reduce_max over a tensor
            # of ones is always 1 and would make the scale a constant.
            scale = len(inputs) / tf.maximum(total, tf.ones_like(total))
            masks = [mask * scale for mask in masks]
            inputs = [item * tf.expand_dims(mask, axis=-1) for item, mask in zip(inputs, masks)]
        return inputs
if not transformer: # the embedding layer self.word_embed = tf.keras.layers.Embedding(input_dim=config.n_words, output_dim=config.n_embed, embeddings_initializer=tf.keras.initializers.zeros() if embed else normal, name='word_embed') self.feat_embed = tf.keras.layers.Embedding(input_dim=config.n_feats, output_dim=config.n_embed, embeddings_initializer=tf.keras.initializers.zeros() if embed else normal, name='feat_embed') self.embed_dropout = IndependentDropout(p=config.embed_dropout, name='embed_dropout') # the word-lstm layer self.lstm = tf.keras.models.Sequential(name='lstm') for _ in range(config.n_lstm_layers): self.lstm.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM( units=config.n_lstm_hidden, dropout=config.lstm_dropout, recurrent_dropout=config.lstm_dropout, return_sequences=True, kernel_initializer='orthogonal', unit_forget_bias=False, # turns out to hinder performance ))) self.lstm_dropout = SharedDropout(p=config.lstm_dropout, name='lstm_dropout') else: self.transformer = transformer transformer_dropout = config.get('transformer_dropout', None) if transformer_dropout: self.transformer_dropout = SharedDropout(p=config.transformer_dropout, name='transformer_dropout') d_positional = config.get('d_positional', None) if d_positional: max_seq_length = config.get('max_seq_length', 256) self.position_table = self.add_weight(shape=(max_seq_length, d_positional), initializer='random_normal', trainable=True) # the MLP layers self.mlp_arc_h = MLP(n_hidden=config.n_mlp_arc, dropout=config.mlp_dropout, name='mlp_arc_h') self.mlp_arc_d = MLP(n_hidden=config.n_mlp_arc, dropout=config.mlp_dropout, name='mlp_arc_d') self.mlp_rel_h = MLP(n_hidden=config.n_mlp_rel, dropout=config.mlp_dropout, name='mlp_rel_h') self.mlp_rel_d = MLP(n_hidden=config.n_mlp_rel, dropout=config.mlp_dropout, name='mlp_rel_d') # the Biaffine layers self.arc_attn = Biaffine(n_in=config.n_mlp_arc, bias_x=True, bias_y=False, name='arc_attn') self.rel_attn = Biaffine(n_in=config.n_mlp_rel, 
n_out=config.n_rels, bias_x=True, bias_y=True, name='rel_attn') if embed is not None: self.pretrained = embed self.pad_index = tf.constant(config.pad_index, dtype=tf.int64) self.unk_index = tf.constant(config.unk_index, dtype=tf.int64) # noinspection PyMethodOverriding def call(self, inputs, mask_inf=True, **kwargs): # batch_size, seq_len = words.shape # get the mask and lengths of given batch # mask = words.ne(self.pad_index) if hasattr(self, 'lstm'): words, feats = inputs mask = tf.not_equal(words, self.pad_index) # set the indices larger than num_embeddings to unk_index # ext_mask = words.ge(self.word_embed.num_embeddings) ext_mask = tf.greater_equal(words, self.word_embed.input_dim) ext_words = tf.where(ext_mask, self.unk_index, words) # get outputs from embedding layers word_embed = self.word_embed(ext_words) if hasattr(self, 'pretrained'): word_embed += self.pretrained(words) feat_embed = self.feat_embed(feats) word_embed, feat_embed = self.embed_dropout([word_embed, feat_embed]) # concatenate the word and feat representations embed = tf.concat((word_embed, feat_embed), axis=-1) x = self.lstm(embed, mask=mask) x = self.lstm_dropout(x) else: words, (input_ids, input_mask, prefix_offset) = inputs mask = tf.not_equal(words, self.pad_index) x = self.run_transformer(input_ids, input_mask, prefix_offset) # apply MLPs to the BiLSTM output states arc_h = self.mlp_arc_h(x) arc_d = self.mlp_arc_d(x) rel_h = self.mlp_rel_h(x) rel_d = self.mlp_rel_d(x) # get arc and rel scores from the bilinear attention # [batch_size, seq_len, seq_len] s_arc = self.arc_attn(arc_d, arc_h) # [batch_size, seq_len, seq_len, n_rels] s_rel = tf.transpose(self.rel_attn(rel_d, rel_h), [0, 2, 3, 1]) # set the scores that exceed the length of each sentence to -inf if mask_inf: s_arc = tf.where(tf.expand_dims(mask, 1), s_arc, float('-inf')) return s_arc, s_rel def run_transformer(self, input_ids, input_mask, prefix_offset): if isinstance(self.transformer, TFPreTrainedModel): sequence_output = 
def decode_mst(
    energy: numpy.ndarray, length: int, has_labels: bool = True
) -> Tuple[numpy.ndarray, numpy.ndarray]:
    """Decode the optimal tree with the Chu-Liu-Edmonds algorithm for maximum
    spanning arborescences on graphs.

    Note: Counter to typical intuition, this function decodes the _maximum_ spanning tree.

    Adopted from https://github.com/allenai/allennlp/blob/master/allennlp/nn/chu_liu_edmonds.py
    which is licensed under the Apache License 2.0

    Args:
        energy: numpy.ndarray: A tensor with shape (num_labels, timesteps, timesteps)
            containing the energy of each edge. If has_labels is `False`, the tensor
            should have shape (timesteps, timesteps) instead.
        length: int: The length of this sequence, as the energy may have come
            from a padded batch.
        has_labels: bool: Whether the graph has labels or not. (Default value = True)

    Returns:
        A pair ``(heads, head_type)``. ``heads`` is an int32 array of size
        ``energy.shape[-1]`` holding the parent chosen for each child (zero
        where no edge was produced). ``head_type`` is an int32 array of the
        same size, initialised to ones and filled with the best label id of
        each chosen edge, or ``None`` when ``has_labels`` is false.

    Raises:
        ValueError: If the rank of ``energy`` does not match ``has_labels``.
    """
    if has_labels and energy.ndim != 3:
        raise ValueError("The dimension of the energy array is not equal to 3.")
    elif not has_labels and energy.ndim != 2:
        raise ValueError("The dimension of the energy array is not equal to 2.")
    input_shape = energy.shape
    max_length = input_shape[-1]

    # Our energy matrix might have been batched -
    # here we clip it to contain only non padded tokens.
    if has_labels:
        energy = energy[:, :length, :length]
        # get best label for each edge.
        label_id_matrix = energy.argmax(axis=0)
        energy = energy.max(axis=0)
    else:
        energy = energy[:length, :length]
        label_id_matrix = None
    # get original score matrix
    # NOTE(review): in the unlabeled case this is a slice (view) of the
    # caller's array, so zeroing its diagonal below also writes into the
    # caller's ``energy`` — confirm callers do not rely on the diagonal.
    original_score_matrix = energy
    # initialize score matrix to original score matrix
    score_matrix = numpy.array(original_score_matrix, copy=True)

    # old_input / old_output remember, for every (possibly contracted) edge,
    # which original parent / child it stands for.
    old_input = numpy.zeros([length, length], dtype=numpy.int32)
    old_output = numpy.zeros([length, length], dtype=numpy.int32)
    current_nodes = [True for _ in range(length)]
    representatives: List[Set[int]] = []

    for node1 in range(length):
        # Self-loops get a zero score.
        original_score_matrix[node1, node1] = 0.0
        score_matrix[node1, node1] = 0.0
        # Initially every node represents only itself.
        representatives.append({node1})

        for node2 in range(node1 + 1, length):
            old_input[node1, node2] = node1
            old_output[node1, node2] = node2

            old_input[node2, node1] = node2
            old_output[node2, node1] = node1

    final_edges: Dict[int, int] = {}

    # The main algorithm operates inplace.
    chu_liu_edmonds(
        length, score_matrix, current_nodes, final_edges, old_input, old_output, representatives
    )

    heads = numpy.zeros([max_length], numpy.int32)
    if has_labels:
        head_type = numpy.ones([max_length], numpy.int32)
    else:
        head_type = None

    # final_edges maps child -> parent.
    for child, parent in final_edges.items():
        heads[child] = parent
        if has_labels:
            head_type[child] = label_id_matrix[parent, child]

    return heads, head_type
parents = [-1] for node1 in range(1, length): parents.append(0) if current_nodes[node1]: max_score = score_matrix[0, node1] for node2 in range(1, length): if node2 == node1 or not current_nodes[node2]: continue new_score = score_matrix[node2, node1] if new_score > max_score: max_score = new_score parents[node1] = node2 # Check if this solution has a cycle. has_cycle, cycle = _find_cycle(parents, length, current_nodes) # If there are no cycles, find all edges and return. if not has_cycle: final_edges[0] = -1 for node in range(1, length): if not current_nodes[node]: continue parent = old_input[parents[node], node] child = old_output[parents[node], node] final_edges[child] = parent return # Otherwise, we have a cycle so we need to remove an edge. # From here until the recursive call is the contraction stage of the algorithm. cycle_weight = 0.0 # Find the weight of the cycle. index = 0 for node in cycle: index += 1 cycle_weight += score_matrix[parents[node], node] # For each node in the graph, find the maximum weight incoming # and outgoing edge into the cycle. cycle_representative = cycle[0] for node in range(length): if not current_nodes[node] or node in cycle: continue in_edge_weight = float("-inf") in_edge = -1 out_edge_weight = float("-inf") out_edge = -1 for node_in_cycle in cycle: if score_matrix[node_in_cycle, node] > in_edge_weight: in_edge_weight = score_matrix[node_in_cycle, node] in_edge = node_in_cycle # Add the new edge score to the cycle weight # and subtract the edge we're considering removing. 
score = ( cycle_weight + score_matrix[node, node_in_cycle] - score_matrix[parents[node_in_cycle], node_in_cycle] ) if score > out_edge_weight: out_edge_weight = score out_edge = node_in_cycle score_matrix[cycle_representative, node] = in_edge_weight old_input[cycle_representative, node] = old_input[in_edge, node] old_output[cycle_representative, node] = old_output[in_edge, node] score_matrix[node, cycle_representative] = out_edge_weight old_output[node, cycle_representative] = old_output[node, out_edge] old_input[node, cycle_representative] = old_input[node, out_edge] # For the next recursive iteration, we want to consider the cycle as a # single node. Here we collapse the cycle into the first node in the # cycle (first node is arbitrary), set all the other nodes not be # considered in the next iteration. We also keep track of which # representatives we are considering this iteration because we need # them below to check if we're done. considered_representatives: List[Set[int]] = [] for i, node_in_cycle in enumerate(cycle): considered_representatives.append(set()) if i > 0: # We need to consider at least one # node in the cycle, arbitrarily choose # the first. current_nodes[node_in_cycle] = False for node in representatives[node_in_cycle]: considered_representatives[i].add(node) if i > 0: representatives[cycle_representative].add(node) chu_liu_edmonds( length, score_matrix, current_nodes, final_edges, old_input, old_output, representatives ) # Expansion stage. # check each node in cycle, if one of its representatives # is a key in the final_edges, it is the one we need. 
found = False key_node = -1 for i, node in enumerate(cycle): for cycle_rep in considered_representatives[i]: if cycle_rep in final_edges: key_node = node found = True break if found: break previous = parents[key_node] while previous != key_node: child = old_output[parents[previous], previous] parent = old_input[parents[previous], previous] final_edges[child] = parent previous = parents[previous] def _find_cycle( parents: List[int], length: int, current_nodes: List[bool] ) -> Tuple[bool, List[int]]: added = [False for _ in range(length)] added[0] = True cycle = set() has_cycle = False for i in range(1, length): if has_cycle: break # don't redo nodes we've already # visited or aren't considering. if added[i] or not current_nodes[i]: continue # Initialize a new possible cycle. this_cycle = set() this_cycle.add(i) added[i] = True has_cycle = True next_node = i while parents[next_node] not in this_cycle: next_node = parents[next_node] # If we see a node we've already processed, # we can stop, because the node we are # processing would have been in that cycle. 
def collapse_enhanced_empty_nodes(sent: list):
    """Drop empty nodes from a sentence and re-route the enhanced arcs that pass through them.

    An empty node is a row whose id (``cells[0]``) is a ``float``. For every such
    node, each arc in column 8 of any row that points at the node is re-attached to
    the node's own head, with the label rewritten to ``{head}:{deprel}>{label}``.
    The node's column 7 is appended to column 7 of its head row as ``>{label}``.

    Args:
        sent: Rows of one sentence. Column 8 of every row must hold ``head:label``
            pairs joined by ``|`` whenever the sentence contains an empty node.
            Rows are modified in place.

    Returns:
        The rows of ``sent`` that are not empty nodes, in their original order.
    """
    # Heads in column 8 are strings, so index the ordinary rows by token id to be
    # able to find the head row (indexing `sent` with the string raises TypeError).
    rows_by_id = {cells[0]: cells for cells in sent if not isinstance(cells[0], float)}
    collapsed = []
    for cells in sent:
        if not isinstance(cells[0], float):
            collapsed.append(cells)
            continue

        empty_id = str(cells[0])
        head, deprel = cells[8].split(':', 1)
        for x in sent:
            arrows = [s.split(':', 1) for s in x[8].split('|')]
            arrows = sorted((head, f'{head}:{deprel}>{r}') if h == empty_id else (h, r) for h, r in arrows)
            x[8] = '|'.join(f'{h}:{r}' for h, r in arrows)

        # The head may be the root (0) or another empty node, neither of which has
        # an ordinary row to relabel; skip the relabeling in that case.
        head_row = rows_by_id.get(int(head)) if head.isdigit() else None
        if head_row is not None:
            head_row[7] += f'>{cells[7]}'
    return collapsed
def read_conll(filepath: Union[str, TimingFileIterator], underline_to_none=False, enhanced_collapse_empty_nodes=False):
    """Lazily read sentences from a tab-separated CoNLL file.

    Lines starting with ``#`` are skipped. Every blank line ends the current
    sentence, which is then yielded (possibly as an empty list).

    Args:
        filepath: Either a path, which is resolved through ``get_resource`` and opened
            as UTF-8, or an already opened line iterator. The source is closed once
            the generator finishes, is closed early, or fails.
        underline_to_none: If ``True``, cells equal to ``_`` are replaced by ``None``.
        enhanced_collapse_empty_nodes: If true, rows whose id contains ``.`` are kept
            with a ``float`` id and each sentence is passed through
            ``collapse_enhanced_empty_nodes``. Otherwise rows whose id contains ``-``
            or ``.`` are dropped.

    Yields:
        One sentence at a time as a list of rows (lists of cells). Ids and heads of
        ordinary rows are converted to ``int``; a non-numeric head is logged and
        replaced by ``0``.
    """
    sent = []
    if isinstance(filepath, str):
        filepath: str = get_resource(filepath)
        # NOTE(review): the default is False, so this only fires when the caller passes None explicitly.
        if filepath.endswith('.conllu') and enhanced_collapse_empty_nodes is None:
            enhanced_collapse_empty_nodes = True
        src = open(filepath, encoding='utf-8')
    else:
        src = filepath
    # try/finally so that the handle is released even when the consumer stops early
    # (generator.close()) or a malformed line raises.
    try:
        for idx, line in enumerate(src):
            if line.startswith('#'):
                continue
            line = line.strip()
            cells = line.split('\t')
            if line and cells:
                if enhanced_collapse_empty_nodes and '.' in cells[0]:
                    cells[0] = float(cells[0])
                    cells[6] = None
                else:
                    if '-' in cells[0] or '.' in cells[0]:
                        # sent[-1][1] += cells[1]
                        continue
                    cells[0] = int(cells[0])
                    if cells[6] != '_':
                        try:
                            cells[6] = int(cells[6])
                        except ValueError:
                            cells[6] = 0
                            logger.exception(f'Wrong CoNLL format {filepath}:{idx + 1}\n{line}')
                if underline_to_none:
                    for i, x in enumerate(cells):
                        if x == '_':
                            cells[i] = None
                sent.append(cells)
            else:
                if enhanced_collapse_empty_nodes:
                    sent = collapse_enhanced_empty_nodes(sent)
                yield sent
                sent = []

        # The last sentence is not necessarily followed by a blank line.
        if sent:
            if enhanced_collapse_empty_nodes:
                sent = collapse_enhanced_empty_nodes(sent)
            yield sent
    finally:
        src.close()
class CRFConstituencyDecoder(nn.Module):
    r"""
    The scoring and decoding head of the CRF Constituency Parser, also called FANCY (abbr. of Fast and Accurate
    Neural Crf constituencY) Parser.

    References:
        - Yu Zhang, Houquan Zhou and Zhenghua Li. 2020.
          `Fast and Accurate Neural CRF Constituency Parsing`_.

    Args:
        n_labels (int):
            The number of labels.
        n_hidden (int):
            The size of the hidden states fed into the MLPs. Default: 400.
        n_mlp_span (int):
            Span MLP size. Default: 500.
        n_mlp_label (int):
            Label MLP size. Default: 100.
        mlp_dropout (float):
            The dropout ratio of MLP layers. Default: .33.
        **kwargs:
            Accepted and ignored.

    .. _Fast and Accurate Neural CRF Constituency Parsing:
        https://www.ijcai.org/Proceedings/2020/560/
    """

    def __init__(self,
                 n_labels,
                 n_hidden=400,
                 n_mlp_span=500,
                 n_mlp_label=100,
                 mlp_dropout=.33,
                 **kwargs
                 ):
        super().__init__()

        # Left/right projections: one pair for span scoring, one pair for label scoring.
        span_mlp = dict(n_in=n_hidden, n_out=n_mlp_span, dropout=mlp_dropout)
        label_mlp = dict(n_in=n_hidden, n_out=n_mlp_label, dropout=mlp_dropout)
        self.mlp_span_l = MLP(**span_mlp)
        self.mlp_span_r = MLP(**span_mlp)
        self.mlp_label_l = MLP(**label_mlp)
        self.mlp_label_r = MLP(**label_mlp)

        # Biaffine scorers on top of the projections.
        self.span_attn = Biaffine(n_in=n_mlp_span, bias_x=True, bias_y=False)
        self.label_attn = Biaffine(n_in=n_mlp_label, n_out=n_labels, bias_x=True, bias_y=True)

        self.crf = CRFConstituency()
        self.criterion = nn.CrossEntropyLoss()

    def forward(self, x, **kwargs):
        r"""
        Args:
            x (~torch.FloatTensor): ``[batch_size, seq_len, hidden_dim]``.
                Hidden states from encoder. The last dimension is split into two halves; the first half
                loses its last position and the second half loses its first position before they are
                concatenated again.

        Returns:
            ~torch.Tensor, ~torch.Tensor:
                The first tensor of shape ``[batch_size, seq_len, seq_len]`` holds scores of all possible spans.
                The second of shape ``[batch_size, seq_len, seq_len, n_labels]`` holds
                scores of all possible labels on each span.
        """
        first_half, second_half = x.chunk(2, -1)
        boundary = torch.cat((first_half[:, :-1], second_half[:, 1:]), -1)

        span_l = self.mlp_span_l(boundary)
        span_r = self.mlp_span_r(boundary)
        label_l = self.mlp_label_l(boundary)
        label_r = self.mlp_label_r(boundary)

        # [batch_size, seq_len, seq_len]
        s_span = self.span_attn(span_l, span_r)
        # [batch_size, seq_len, seq_len, n_labels]
        s_label = self.label_attn(label_l, label_r).permute(0, 2, 3, 1)
        return s_span, s_label

    def loss(self, s_span, s_label, charts, mask, mbr=True):
        r"""
        Args:
            s_span (~torch.Tensor): ``[batch_size, seq_len, seq_len]``.
                Scores of all spans
            s_label (~torch.Tensor): ``[batch_size, seq_len, seq_len, n_labels]``.
                Scores of all labels on each span.
            charts (~torch.LongTensor): ``[batch_size, seq_len, seq_len]``.
                The tensor of gold-standard labels, in which positions without labels are filled with -1.
            mask (~torch.BoolTensor): ``[batch_size, seq_len, seq_len]``.
                The mask for covering the unpadded tokens in each chart.
            mbr (bool):
                If ``True``, returns marginals for MBR decoding. Default: ``True``.

        Returns:
            ~torch.Tensor, ~torch.Tensor:
                The training loss (span CRF loss plus label cross entropy) and
                original span scores of shape ``[batch_size, seq_len, seq_len]`` if ``mbr=False``, or marginals otherwise.
        """
        # Gold constituents are the labelled chart cells that are not padding.
        gold_spans = charts.ge(0) & mask
        span_loss, span_probs = self.crf(s_span, mask, gold_spans, mbr)
        label_loss = self.criterion(s_label[gold_spans], charts[gold_spans])
        return span_loss + label_loss, span_probs

    def decode(self, s_span, s_label, mask):
        r"""
        Args:
            s_span (~torch.Tensor): ``[batch_size, seq_len, seq_len]``.
                Scores of all spans.
            s_label (~torch.Tensor): ``[batch_size, seq_len, seq_len, n_labels]``.
                Scores of all labels on each span.
            mask (~torch.BoolTensor): ``[batch_size, seq_len, seq_len]``.
                The mask for covering the unpadded tokens in each chart.

        Returns:
            list[list[tuple]]:
                For each sentence, the ``(i, j, label)`` triples of the spans returned by ``cky``, each paired
                with its highest-scoring label index.
        """
        span_preds = cky(s_span, mask)
        label_preds = s_label.argmax(-1).tolist()

        charts = []
        for spans, labels in zip(span_preds, label_preds):
            charts.append([(i, j, labels[i][j]) for i, j in spans])
        return charts


class CRFConstituencyModel(nn.Module):
    """An encoder followed by a :class:`CRFConstituencyDecoder`."""

    def __init__(self, encoder, decoder: CRFConstituencyDecoder) -> None:
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, batch):
        r"""
        Args:
            batch (~dict):
                Batch of input data.

        Returns:
            ~torch.Tensor, ~torch.Tensor:
                The first tensor of shape ``[batch_size, seq_len, seq_len]`` holds scores of all possible spans.
                The second of shape ``[batch_size, seq_len, seq_len, n_labels]`` holds
                scores of all possible labels on each span.
        """
        hidden = self.encoder(batch)
        scores = self.decoder(hidden)
        return scores
class CRFConstituencyParser(TorchComponent):
    def __init__(self, **kwargs) -> None:
        """Two-stage CRF Parsing (:cite:`ijcai2020-560`).

        Args:
            **kwargs: Predefined config.
        """
        super().__init__(**kwargs)
        self.model: CRFConstituencyModel = self.model

    def build_optimizer(self, trn, **kwargs):
        """Build the optimizer by delegating to ``TransformerComponent.build_optimizer``."""
        # noinspection PyCallByClass,PyTypeChecker
        return TransformerComponent.build_optimizer(self, trn, **kwargs)

    def build_criterion(self, decoder=None, **kwargs):
        """Return ``decoder`` itself, which is used as the criterion."""
        return decoder

    def build_metric(self, **kwargs):
        """Create a fresh :class:`SpanMetric`."""
        return SpanMetric()

    def execute_training_loop(self, trn: DataLoader, dev: DataLoader, epochs, criterion, optimizer, metric, save_dir,
                              logger: logging.Logger, devices, ratio_width=None, patience=0.5, eval_trn=True, **kwargs):
        """Train for up to ``epochs`` epochs with early stopping on the dev metric.

        Weights are saved whenever the dev metric improves. Training stops once
        ``patience`` epochs pass without improvement; a float ``patience`` is read as
        a fraction of ``epochs``. At the end the best weights are reloaded unless the
        last epoch was the best one.
        """
        if isinstance(patience, float):
            patience = int(patience * epochs)
        best_epoch, best_metric = 0, -1
        timer = CountdownTimer(epochs)
        history = History()
        for epoch in range(1, epochs + 1):
            logger.info(f"[yellow]Epoch {epoch} / {epochs}:[/yellow]")
            self.fit_dataloader(trn, criterion, optimizer, metric, logger, history=history, ratio_width=ratio_width,
                                eval_trn=eval_trn, **self.config)
            loss, dev_metric = self.evaluate_dataloader(dev, criterion, logger=logger, ratio_width=ratio_width)
            timer.update()
            report = f"{timer.elapsed_human} / {timer.total_time_human} ETA: {timer.eta_human}"
            if dev_metric > best_metric:
                best_epoch, best_metric = epoch, dev_metric
                self.save_weights(save_dir)
                report += ' [red](saved)[/red]'
            else:
                report += f' ({epoch - best_epoch})'
                if epoch - best_epoch >= patience:
                    report += ' early stop'
            logger.info(report)
            if epoch - best_epoch >= patience:
                break
        if not best_epoch:
            # No epoch ever beat the initial score: keep whatever the model holds now.
            self.save_weights(save_dir)
        elif best_epoch != epoch:
            self.load_weights(save_dir)
        logger.info(f"Max score of dev is {best_metric} at epoch {best_epoch}")
        logger.info(f"Average time of each epoch is {timer.elapsed_average_human}")
        logger.info(f"{timer.elapsed_human} elapsed")

    # noinspection PyMethodOverriding
    def fit_dataloader(self, trn: DataLoader, criterion, optimizer, metric: SpanMetric, logger: logging.Logger,
                       history: History, gradient_accumulation=1, grad_norm=None, ratio_width=None, eval_trn=True,
                       **kwargs):
        """Run one training epoch over ``trn``.

        ``optimizer`` is an ``(optimizer, scheduler)`` pair. The loss is divided by
        ``gradient_accumulation`` when that is greater than 1, and the optimizer is
        stepped whenever ``history.step`` says so. If ``eval_trn`` is true the
        training predictions are decoded and fed into ``metric`` as well.
        """
        optimizer, scheduler = optimizer
        metric.reset()
        self.model.train()
        timer = CountdownTimer(history.num_training_steps(len(trn), gradient_accumulation=gradient_accumulation))
        total_loss = 0
        for idx, batch in enumerate(trn):
            out, mask = self.feed_batch(batch)
            y = batch['chart_id']
            loss, span_probs = self.compute_loss(out, y, mask)
            if gradient_accumulation and gradient_accumulation > 1:
                loss /= gradient_accumulation
            loss.backward()
            total_loss += loss.item()
            if eval_trn:
                prediction = self.decode_output(out, mask, batch, span_probs)
                self.update_metrics(metric, batch, prediction)
            if history.step(gradient_accumulation):
                self._step(optimizer, scheduler, grad_norm)
                report = f'loss: {total_loss / (idx + 1):.4f} {metric}' if eval_trn \
                    else f'loss: {total_loss / (idx + 1):.4f}'
                timer.log(report, logger=logger, ratio_percentage=False, ratio_width=ratio_width)
            # Drop the references before the next batch is fed.
            del loss
            del out
            del mask

    def decode_output(self, out, mask, batch, span_probs=None, decoder=None, tokens=None):
        """Turn span and label scores into one tree per sentence.

        Args:
            out: The ``(s_span, s_label)`` pair returned by the model.
            mask: The chart mask from :meth:`compute_mask`.
            batch: The batch dict; ``token_`` (or else ``token``) is read when ``tokens``
                is not given, and the first and last token of each sentence are dropped.
            span_probs: Span scores or marginals to decode from. When ``None`` and
                ``config.mbr`` is set, marginals are computed through ``decoder.crf``.
            decoder: The decoder to use. Defaults to ``self.model.decoder``.
            tokens: Tokens of each sentence, used as given when provided.

        Returns:
            A list of trees built by ``build_tree``, one per sentence.
        """
        s_span, s_label = out
        if not decoder:
            decoder = self.model.decoder
        # Resolve the tokens first: the empty-mask branch below needs the number of
        # sentences, and `tokens` is None for every caller in this class.
        if tokens is None:
            tokens = batch.get('token_', None)  # Use the original tokens if any
            if tokens is None:
                tokens = batch['token']
            tokens = [x[1:-1] for x in tokens]
        if mask.any().item():
            if span_probs is None:
                if self.config.mbr:
                    s_span = decoder.crf(s_span, mask, mbr=True)
            else:
                s_span = span_probs
            chart_preds = decoder.decode(s_span, s_label, mask)
        else:
            # Nothing to parse in this batch: every sentence gets an empty chart.
            chart_preds = [[] for _ in tokens]
        idx_to_token = self.vocabs.chart.idx_to_token
        trees = [build_tree(token, [(i, j, idx_to_token[label]) for i, j, label in chart]) for token, chart in
                 zip(tokens, chart_preds)]
        # probs = [prob[:i - 1, 1:i].cpu() for i, prob in zip(lens, s_span.unbind())]
        return trees

    def update_metrics(self, metric, batch, prediction):
        """Copy gold POS tags onto the predicted pre-terminals, then update ``metric``.

        The predicted trees in ``prediction`` are relabelled in place.
        """
        # Add pre-terminals (pos tags) back to prediction for safe factorization (deletion based on pos)
        for pred, gold in zip(prediction, batch['constituency']):
            pred: Tree = pred
            gold: Tree = gold
            for p, g in zip(pred.subtrees(lambda t: t.height() == 2), gold.pos()):
                token, pos = g
                p: Tree = p
                assert p.label() == '_'
                p.set_label(pos)
        metric([factorize(tree, self.config.delete, self.config.equal) for tree in prediction],
               [factorize(tree, self.config.delete, self.config.equal) for tree in batch['constituency']])
        return metric

    def feed_batch(self, batch: dict):
        """Run the model on ``batch`` and return ``((s_span, s_label), mask)``."""
        mask = self.compute_mask(batch)
        s_span, s_label = self.model(batch)
        return (s_span, s_label), mask

    def compute_mask(self, batch, offset=1):
        """Build the chart mask from ``batch['token_length']``.

        With ``lens = token_length - offset``, cell ``(i, j)`` of a sentence is ``True``
        iff ``i < j < lens``, i.e. the strictly upper triangle restricted to the
        columns inside the sentence.
        """
        lens = batch['token_length'] - offset
        seq_len = lens.max()
        mask = lens.new_tensor(range(seq_len)) < lens.view(-1, 1, 1)
        mask = mask & mask.new_ones(seq_len, seq_len).triu_(1)
        return mask

    def compute_loss(self, out, y, mask, crf_decoder=None):
        """Compute the decoder loss for ``out`` against the gold charts ``y``.

        Returns:
            The loss and the span scores/marginals returned by the decoder. A negative
            loss is zeroed out.
        """
        if not crf_decoder:
            crf_decoder = self.model.decoder
        loss, span_probs = crf_decoder.loss(out[0], out[1], y, mask, self.config.mbr)
        if loss < 0:  # weird negative loss
            loss *= 0
        return loss, span_probs

    def _step(self, optimizer, scheduler, grad_norm):
        """Clip gradients, step the optimizer and the scheduler, then reset the gradients."""
        clip_grad_norm(self.model, grad_norm)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    @torch.no_grad()
    def evaluate_dataloader(self, data, criterion, logger=None, ratio_width=None, metric=None, output=None, **kwargs):
        """Evaluate the model on ``data``.

        Returns:
            The loss averaged over batches and the metric (a new one from
            :meth:`build_metric` unless ``metric`` is given, in which case it is reset first).
        """
        self.model.eval()
        total_loss = 0
        if not metric:
            metric = self.build_metric()
        else:
            metric.reset()
        timer = CountdownTimer(len(data))
        for idx, batch in enumerate(data):
            out, mask = self.feed_batch(batch)
            y = batch['chart_id']
            loss, span_probs = self.compute_loss(out, y, mask)
            total_loss += loss.item()
            prediction = self.decode_output(out, mask, batch, span_probs)
            self.update_metrics(metric, batch, prediction)
            timer.log(f'loss: {total_loss / (idx + 1):.4f} {metric}', ratio_percentage=False, logger=logger,
                      ratio_width=ratio_width)
        total_loss /= len(data)
        if output:
            output.close()
        return total_loss, metric

    # noinspection PyMethodOverriding
    def build_model(self, encoder, training=True, **kwargs) -> torch.nn.Module:
        """Assemble the model from the configured ``encoder`` and a new decoder sized by the chart vocab."""
        decoder = CRFConstituencyDecoder(n_labels=len(self.vocabs.chart), n_hidden=encoder.get_output_dim(), **kwargs)
        encoder = encoder.module(vocabs=self.vocabs, training=training)
        return CRFConstituencyModel(encoder, decoder)

    def build_dataloader(self,
                         data,
                         batch_size,
                         sampler_builder: SamplerBuilder = None,
                         gradient_accumulation=1,
                         shuffle=False,
                         device=None,
                         logger: logging.Logger = None,
                         **kwargs) -> DataLoader:
        """Wrap ``data`` into a :class:`PadSequenceDataLoader`.

        ``data`` is used directly when it already is a ``TransformableDataset``;
        otherwise a dataset is built with the encoder transform (preceded by
        ``config.transform`` if set). Vocabs are built here while they are still mutable.
        """
        if isinstance(data, TransformableDataset):
            dataset = data
        else:
            transform = self.config.encoder.transform()
            if self.config.get('transform', None):
                transform = TransformList(self.config.transform, transform)
            dataset = self.build_dataset(data, transform, logger)
        if self.vocabs.mutable:
            # noinspection PyTypeChecker
            self.build_vocabs(dataset, logger)
        lens = [len(x['token_input_ids']) for x in dataset]
        if sampler_builder:
            sampler = sampler_builder.build(lens, shuffle, gradient_accumulation)
        else:
            sampler = None
        return PadSequenceDataLoader(dataset, batch_size, shuffle, device=device, batch_sampler=sampler)

    def predict(self, data: Union[str, List[str]], **kwargs):
        """Parse one tokenized sentence or a list of them.

        Args:
            data: A list of tokens, or a list of such lists.
            **kwargs: Passed on to :meth:`build_dataloader`.

        Returns:
            A single tree for a flat input, otherwise a list of trees in input order.
            An empty list for empty input.
        """
        if not data:
            return []
        flat = self.input_is_flat(data)
        if flat:
            data = [data]
        samples = self.build_samples(data)
        dataloader = self.build_dataloader(samples, device=self.device, **kwargs)
        outputs = []
        orders = []
        for idx, batch in enumerate(dataloader):
            out, mask = self.feed_batch(batch)
            prediction = self.decode_output(out, mask, batch, span_probs=None)
            # prediction = [x[0] for x in prediction]
            outputs.extend(prediction)
            orders.extend(batch[IDX])
        # Batches may come in a different order than the input.
        outputs = reorder(outputs, orders)
        if flat:
            return outputs[0]
        return outputs

    def input_is_flat(self, data):
        """Whether ``data`` is a single sentence, judged by its first element being a ``str``."""
        return isinstance(data[0], str)

    def build_samples(self, data):
        """Wrap every sentence into a sample dict with ``BOS``/``EOS`` added around its tokens."""
        return [{'token': [BOS] + token + [EOS]} for token in data]

    # noinspection PyMethodOverriding
    def fit(self,
            trn_data,
            dev_data,
            save_dir,
            encoder,
            lr=5e-5,
            transformer_lr=None,
            adam_epsilon=1e-8,
            weight_decay=0,
            warmup_steps=0.1,
            grad_norm=1.0,
            n_mlp_span=500,
            n_mlp_label=100,
            mlp_dropout=.33,
            batch_size=None,
            batch_max_tokens=5000,
            gradient_accumulation=1,
            epochs=30,
            patience=0.5,
            mbr=True,
            sampler_builder=None,
            delete=('', ':', '``', "''", '.', '?', '!', '-NONE-', 'TOP', ',', 'S1'),
            equal=(('ADVP', 'PRT'),),
            no_subcategory=True,
            eval_trn=True,
            transform=None,
            devices=None,
            logger=None,
            seed=None,
            **kwargs):
        """Train the parser; all arguments are forwarded to ``TorchComponent.fit`` as config.

        A tuple ``equal`` is converted to a ``dict`` first. No other local may be
        introduced here because ``locals()`` is what gets forwarded.
        """
        if isinstance(equal, tuple):
            equal = dict(equal)
        return super().fit(**merge_locals_kwargs(locals(), kwargs))

    def build_dataset(self, data, transform, logger=None):
        """Create a :class:`ConstituencyDataset` over ``data`` with the parser's transforms.

        ``remove_subcategory`` is put in front of the pipeline unless
        ``config.no_subcategory`` is false. Caching is enabled when ``data`` is a path.
        """
        _transform = [
            unpack_tree_to_features,
            self.vocabs,
            FieldLength('token'),
            transform
        ]
        if self.config.get('no_subcategory', True):
            _transform.insert(0, remove_subcategory)
        dataset = ConstituencyDataset(data,
                                      transform=_transform,
                                      cache=isinstance(data, str))
        return dataset

    def build_vocabs(self, trn, logger, **kwargs):
        """Create the chart vocab and lock all vocabs after one pass over ``trn``.

        The loop itself only tracks the longest sequence for the progress message.
        """
        self.vocabs.chart = VocabWithNone(pad_token=None, unk_token=None)
        timer = CountdownTimer(len(trn))
        max_seq_len = 0
        for each in trn:
            max_seq_len = max(max_seq_len, len(each['token_input_ids']))
            timer.log(f'Building vocab [blink][yellow]...[/yellow][/blink] (longest sequence: {max_seq_len})')
        self.vocabs.chart.set_unk_as_safe_unk()
        self.vocabs.lock()
        self.vocabs.summary(logger)
class CRFConstituency(nn.Module):
    r"""
    TreeCRF for calculating partition functions and marginals in :math:`O(n^3)` for constituency trees.

    References:
        - Yu Zhang, Houquan Zhou and Zhenghua Li. 2020.
          `Fast and Accurate Neural CRF Constituency Parsing`_.

    .. _Fast and Accurate Neural CRF Constituency Parsing:
        https://www.ijcai.org/Proceedings/2020/560/
    """

    @torch.enable_grad()
    def forward(self, scores, mask, target=None, mbr=False):
        r"""
        Args:
            scores (~torch.Tensor): ``[batch_size, seq_len, seq_len]``.
                Scores of all possible constituents. Gradient tracking is switched on for this tensor in place.
            mask (~torch.BoolTensor): ``[batch_size, seq_len, seq_len]``.
                The mask to avoid parsing over padding tokens.
                For each square matrix in a batch, the positions except upper triangular part should be masked out.
            target (~torch.BoolTensor): ``[batch_size, seq_len, seq_len]``.
                The tensor of gold-standard constituents. ``True`` if a constituent exists. Default: ``None``.
            mbr (bool):
                If ``True``, marginals will be returned to perform minimum Bayes-risk (MBR) decoding. Default: ``False``.

        Returns:
            ~torch.Tensor, ~torch.Tensor:
                The first is the training loss averaged by the number of tokens, which won't be returned if ``target=None``.
                The second is a tensor of shape ``[batch_size, seq_len, seq_len]``, in which are marginals if ``mbr=True``,
                or original scores otherwise.
        """
        # Remember whether the caller is training: only then must the graph survive the autograd.grad call below.
        training = scores.requires_grad
        # always enable the gradient computation of scores in order for the computation of marginals
        logZ = self.inside(scores.requires_grad_(), mask)
        # marginals are used for decoding, and can be computed by combining the inside pass and autograd mechanism
        probs = scores
        if mbr:
            probs, = autograd.grad(logZ, scores, retain_graph=training)

        if target is None:
            return probs
        # Log partition minus the score of the gold constituents, normalized by the size of the first mask row.
        loss = (logZ - scores[mask & target].sum()) / mask[:, 0].sum()

        return loss, probs

    def inside(self, scores, mask):
        r"""
        Run the inside pass over spans of increasing width.

        Args:
            scores (~torch.Tensor): ``[batch_size, seq_len, seq_len]``.
                Scores of all possible constituents.
            mask (~torch.BoolTensor): ``[batch_size, seq_len, seq_len]``.
                The chart mask; the sum over its first row gives the end position of each sentence.

        Returns:
            ~torch.Tensor:
                The inside scores of the spans ``(0, len)`` summed over the batch.
        """
        lens = mask[:, 0].sum(-1)
        batch_size, seq_len, _ = scores.shape
        # [seq_len, seq_len, batch_size]
        scores, mask = scores.permute(1, 2, 0), mask.permute(1, 2, 0)
        s = torch.full_like(scores, float('-inf'))

        for w in range(1, seq_len):
            # n denotes the number of spans to iterate,
            # from span (0, w) to span (n, n+w) given width w
            n = seq_len - w

            if w == 1:
                # Spans of width 1 have no split point: their inside score is their own score.
                s.diagonal(w).copy_(scores.diagonal(w))
                continue
            # [n, w, batch_size]
            s_s = stripe(s, n, w - 1, (0, 1)) + stripe(s, n, w - 1, (1, w), 0)
            # [batch_size, n, w]
            s_s = s_s.permute(2, 0, 1)
            if s_s.requires_grad:
                # Replace NaN gradients by zeros during the backward pass.
                s_s.register_hook(lambda x: x.masked_fill_(torch.isnan(x), 0))
            s_s = s_s.logsumexp(-1)
            s.diagonal(w).copy_(s_s + scores.diagonal(w))

        return s[0].gather(0, lens.unsqueeze(0)).sum()
_Efficient Second-Order TreeCRF for Neural Dependency Parsing: https://www.aclweb.org/anthology/2020.acl-main.302/ """ def __init__(self): super().__init__() self.criterion = nn.CrossEntropyLoss() @torch.enable_grad() def forward(self, scores, mask, target=None, mbr=True, partial=False): r""" Args: scores (~torch.Tensor, ~torch.Tensor): Tuple of two tensors `s_arc` and `s_sib`. `s_arc` (``[batch_size, seq_len, seq_len]``) holds Scores of all possible dependent-head pairs. `s_sib` (``[batch_size, seq_len, seq_len, seq_len]``) holds the scores of dependent-head-sibling triples. mask (~torch.BoolTensor): ``[batch_size, seq_len]``. The mask to avoid aggregation on padding tokens. The first column serving as pseudo words for roots should be ``False``. target (~torch.LongTensor): ``[batch_size, seq_len]``. Tensors of gold-standard dependent-head pairs and dependent-head-sibling triples. If partially annotated, the unannotated positions should be filled with -1. Default: ``None``. mbr (bool): If ``True``, marginals will be returned to perform minimum Bayes-risk (MBR) decoding. Default: ``False``. partial (bool): ``True`` indicates that the trees are partially annotated. Default: ``False``. Returns: ~torch.Tensor, ~torch.Tensor: The first is the training loss averaged by the number of tokens, which won't be returned if ``target=None``. The second is a tensor of shape ``[batch_size, seq_len, seq_len]``, in which are marginals if ``mbr=True``, or original scores otherwise. 
""" s_arc, s_sib = scores training = s_arc.requires_grad batch_size, seq_len, _ = s_arc.shape # always enable the gradient computation of scores in order for the computation of marginals logZ = self.inside((s.requires_grad_() for s in scores), mask) # marginals are used for decoding, and can be computed by combining the inside pass and autograd mechanism probs = s_arc if mbr: probs, = autograd.grad(logZ, s_arc, retain_graph=training) if target is None: return probs arcs, sibs = target # the second inside process is needed if use partial annotation if partial: score = self.inside(scores, mask, arcs) else: arc_seq, sib_seq = arcs[mask], sibs[mask] arc_mask, sib_mask = mask, sib_seq.gt(0) sib_seq = sib_seq[sib_mask] s_sib = s_sib[mask][torch.arange(len(arc_seq)), arc_seq] s_arc = s_arc[arc_mask].gather(-1, arc_seq.unsqueeze(-1)) s_sib = s_sib[sib_mask].gather(-1, sib_seq.unsqueeze(-1)) score = s_arc.sum() + s_sib.sum() loss = (logZ - score) / mask.sum() return loss, probs def inside(self, scores, mask, cands=None): # the end position of each sentence in a batch lens = mask.sum(1) s_arc, s_sib = scores batch_size, seq_len, _ = s_arc.shape # [seq_len, seq_len, batch_size] s_arc = s_arc.permute(2, 1, 0) # [seq_len, seq_len, seq_len, batch_size] s_sib = s_sib.permute(2, 1, 3, 0) s_i = torch.full_like(s_arc, float('-inf')) s_s = torch.full_like(s_arc, float('-inf')) s_c = torch.full_like(s_arc, float('-inf')) s_c.diagonal().fill_(0) # set the scores of arcs excluded by cands to -inf if cands is not None: mask = mask.index_fill(1, lens.new_tensor(0), 1) mask = (mask.unsqueeze(1) & mask.unsqueeze(-1)).permute(2, 1, 0) cands = cands.unsqueeze(-1).index_fill(1, lens.new_tensor(0), -1) cands = cands.eq(lens.new_tensor(range(seq_len))) | cands.lt(0) cands = cands.permute(2, 1, 0) & mask s_arc = s_arc.masked_fill(~cands, float('-inf')) for w in range(1, seq_len): # n denotes the number of spans to iterate, # from span (0, w) to span (n, n+w) given width w n = seq_len - w # 
I(j->i) = logsum(exp(I(j->r) + S(j->r, i)) +, i < r < j # exp(C(j->j) + C(i->j-1))) # + s(j->i) # [n, w, batch_size] il = stripe(s_i, n, w, (w, 1)) + stripe(s_s, n, w, (1, 0), 0) il += stripe(s_sib[range(w, n + w), range(n)], n, w, (0, 1)) # [n, 1, batch_size] il0 = stripe(s_c, n, 1, (w, w)) + stripe(s_c, n, 1, (0, w - 1)) # il0[0] are set to zeros since the scores of the complete spans starting from 0 are always -inf il[:, -1] = il0.index_fill_(0, lens.new_tensor(0), 0).squeeze(1) if il.requires_grad: il.register_hook(lambda x: x.masked_fill_(torch.isnan(x), 0)) il = il.permute(2, 0, 1).logsumexp(-1) s_i.diagonal(-w).copy_(il + s_arc.diagonal(-w)) # I(i->j) = logsum(exp(I(i->r) + S(i->r, j)) +, i < r < j # exp(C(i->i) + C(j->i+1))) # + s(i->j) # [n, w, batch_size] ir = stripe(s_i, n, w) + stripe(s_s, n, w, (0, w), 0) ir += stripe(s_sib[range(n), range(w, n + w)], n, w) ir[0] = float('-inf') # [n, 1, batch_size] ir0 = stripe(s_c, n, 1) + stripe(s_c, n, 1, (w, 1)) ir[:, 0] = ir0.squeeze(1) if ir.requires_grad: ir.register_hook(lambda x: x.masked_fill_(torch.isnan(x), 0)) ir = ir.permute(2, 0, 1).logsumexp(-1) s_i.diagonal(w).copy_(ir + s_arc.diagonal(w)) # [n, w, batch_size] slr = stripe(s_c, n, w) + stripe(s_c, n, w, (w, 1)) if slr.requires_grad: slr.register_hook(lambda x: x.masked_fill_(torch.isnan(x), 0)) slr = slr.permute(2, 0, 1).logsumexp(-1) # S(j, i) = logsumexp(C(i->r) + C(j->r+1)), i <= r < j s_s.diagonal(-w).copy_(slr) # S(i, j) = logsumexp(C(i->r) + C(j->r+1)), i <= r < j s_s.diagonal(w).copy_(slr) # C(j->i) = logsumexp(C(r->i) + I(j->r)), i <= r < j cl = stripe(s_c, n, w, (0, 0), 0) + stripe(s_i, n, w, (w, 0)) cl.register_hook(lambda x: x.masked_fill_(torch.isnan(x), 0)) s_c.diagonal(-w).copy_(cl.permute(2, 0, 1).logsumexp(-1)) # C(i->j) = logsumexp(I(i->r) + C(r->j)), i < r <= j cr = stripe(s_i, n, w, (0, 1)) + stripe(s_c, n, w, (1, w), 0) cr.register_hook(lambda x: x.masked_fill_(torch.isnan(x), 0)) s_c.diagonal(w).copy_(cr.permute(2, 0, 
def loss(self, s_arc, s_sib, s_rel, arcs, sibs, rels, mask, mbr=True, partial=False):
    r"""Combine the structured arc loss with the label classification loss.

    Args:
        s_arc (~torch.Tensor): ``[batch_size, seq_len, seq_len]``.
            Scores of all possible arcs.
        s_sib (~torch.Tensor): ``[batch_size, seq_len, seq_len, seq_len]``.
            Scores of all possible dependent-head-sibling triples.
        s_rel (~torch.Tensor): ``[batch_size, seq_len, seq_len, n_labels]``.
            Scores of all possible labels on each arc.
        arcs (~torch.LongTensor): ``[batch_size, seq_len]``.
            The tensor of gold-standard arcs.
        sibs (~torch.LongTensor):
            The tensor of gold-standard siblings, forwarded untouched to :meth:`forward`.
        rels (~torch.LongTensor): ``[batch_size, seq_len]``.
            The tensor of gold-standard labels.
        mask (~torch.BoolTensor): ``[batch_size, seq_len]``.
            The mask for covering the unpadded tokens.
        mbr (bool):
            If ``True``, returns marginals for MBR decoding. Default: ``True``.
        partial (bool):
            ``True`` denotes the trees are partially annotated. Default: ``False``.

    Returns:
        ~torch.Tensor, ~torch.Tensor:
            The summed training loss and whatever :meth:`forward` returned as its second item
            (marginals if ``mbr=True``, original arc scores otherwise).
    """
    arc_loss, arc_probs = self.forward((s_arc, s_sib), mask, (arcs, sibs), mbr, partial)
    # with partial annotation, heads marked -1 carry no label supervision
    token_mask = mask & arcs.ge(0) if partial else mask
    gold_rels = rels[token_mask]
    gold_heads = arcs[token_mask]
    # pick, for every supervised token, the label scores attached to its gold head
    rel_scores = s_rel[token_mask][torch.arange(len(gold_rels)), gold_heads]
    rel_loss = self.criterion(rel_scores, gold_rels)
    return arc_loss + rel_loss, arc_probs
def decode(self, s_arc, s_sib, s_rel, mask, tree=False, mbr=True, proj=False):
    r"""Predict a head and a label for every token.

    Args:
        s_arc (~torch.Tensor): ``[batch_size, seq_len, seq_len]``.
            Scores of all possible arcs.
        s_sib (~torch.Tensor): ``[batch_size, seq_len, seq_len, seq_len]``.
            Scores of all possible dependent-head-sibling triples.
        s_rel (~torch.Tensor): ``[batch_size, seq_len, seq_len, n_labels]``.
            Scores of all possible labels on each arc.
        mask (~torch.BoolTensor): ``[batch_size, seq_len]``.
            The mask for covering the unpadded tokens.
        tree (bool):
            If ``True``, ensures to output well-formed trees. Default: ``False``.
        mbr (bool):
            If ``True``, performs MBR decoding. Default: ``True``.
        proj (bool):
            If ``True``, ensures to output projective trees. Default: ``False``.

    Returns:
        ~torch.Tensor, ~torch.Tensor:
            Predicted arcs and labels of shape ``[batch_size, seq_len]``.
    """
    lengths = mask.sum(1)
    # greedy first guess: the best-scoring head for every token
    heads = s_arc.argmax(-1)
    if tree:
        ill_formed = [not istree(seq[1:n + 1], proj)
                      for n, seq in zip(lengths.tolist(), heads.tolist())]
        if any(ill_formed):
            if proj and not mbr:
                # second-order Eisner re-decodes the whole batch, not only the ill-formed sentences
                heads = eisner2o((s_arc, s_sib), mask)
            elif proj:
                heads[ill_formed] = eisner(s_arc[ill_formed], mask[ill_formed])
            else:
                heads[ill_formed] = mst(s_arc[ill_formed], mask[ill_formed])
    # label of each token = best label on the arc to its predicted head
    labels = s_rel.argmax(-1).gather(-1, heads.unsqueeze(-1)).squeeze(-1)

    return heads, labels
exclude _GO, _EOS, and _PAD) """ self._edges = defaultdict(set) self._vertices = set((0,)) for dep, head in enumerate(prediction[tokens]): self._vertices.add(dep + 1) self._edges[head].add(dep + 1) self._indices = {} self._lowlinks = {} self._onstack = defaultdict(lambda: False) self._SCCs = [] index = 0 stack = [] for v in self.vertices: if v not in self.indices: self.strongconnect(v, index, stack) # ============================================================= def strongconnect(self, v, index, stack): """ Args: v: index: stack: Returns: """ self._indices[v] = index self._lowlinks[v] = index index += 1 stack.append(v) self._onstack[v] = True for w in self.edges[v]: if w not in self.indices: self.strongconnect(w, index, stack) self._lowlinks[v] = min(self._lowlinks[v], self._lowlinks[w]) elif self._onstack[w]: self._lowlinks[v] = min(self._lowlinks[v], self._indices[w]) if self._lowlinks[v] == self._indices[v]: self._SCCs.append(set()) while stack[-1] != v: w = stack.pop() self._onstack[w] = False self._SCCs[-1].add(w) w = stack.pop() self._onstack[w] = False self._SCCs[-1].add(w) return # ====================== @property def edges(self): return self._edges @property def vertices(self): return self._vertices @property def indices(self): return self._indices @property def SCCs(self): return self._SCCs class UnionFind(object): def __init__(self, n) -> None: super().__init__() self.parent = [x for x in range(n)] self.height = [0] * n def find(self, x): if self.parent[x] == x: return x self.parent[x] = self.find(self.parent[x]) return self.parent[x] def unite(self, x, y): x = self.find(x) y = self.find(y) if x == y: return if self.height[x] < self.height[y]: self.parent[x] = y else: self.parent[y] = x if self.height[x] == self.height[y]: self.height[x] += 1 def same(self, x, y): return self.find(x) == self.find(y) def tarjan(parse_probs, length, tokens_to_keep, ensure_tree=True): """Adopted from Timothy Dozat 
def chu_liu_edmonds(parse_probs, length):
    """Decode a maximum spanning tree from arc scores with the Chu-Liu/Edmonds algorithm.

    Args:
        parse_probs: Square matrix of arc scores indexed as ``[dependent, head]``
            (the layout used by the other helpers in this module, e.g. ``parse_probs[tokens, 0]``
            for root attachment). It is transposed before being handed to ``decode_mst``.
        length: Sentence length including ROOT.

    Returns:
        The head index of every token; position 0 (ROOT) is forced to 0.
    """
    # The previous code passed ``hanlp.utils.span_util.T`` and ignored ``parse_probs`` altogether,
    # which looks like a broken automated rename of ``parse_probs.T``.
    tree = decode_mst(parse_probs.T, length, False)[0]
    tree[0] = 0
    return tree
def dfs(graph, start, end):
    """Lazily enumerate simple paths from ``start`` to ``end`` by depth-first search.

    Args:
        graph: Adjacency structure where ``graph[node]`` iterates over the successors of ``node``.
        start: Node to start from.
        end: Node to reach.

    Yields:
        Each path as the list of nodes visited after ``start`` (``start`` itself is not included,
        ``end`` is the last element). A path must contain at least one step, so ``start == end``
        only yields genuine cycles.
    """
    pending = [(start, [])]
    while pending:
        node, trail = pending.pop()
        if trail and node == end:
            yield trail
        else:
            # never revisit a node that is already on the current trail
            pending += [(successor, trail + [successor])
                        for successor in graph[node]
                        if successor not in trail]
def adjust_root_score_then_add_secondary_arcs(arc_scores, rel_scores, tree, root_rel_idx):
    """Forbid extra root attachments, then enrich ``tree`` with secondary arcs.

    Args:
        arc_scores: Square matrix of arc scores; entries ``> 0`` count as predicted arcs.
        rel_scores: Relation scores whose last axis enumerates relations.
            NOTE: modified in place (the ``root_rel_idx`` column is set to ``-inf``); when the
            inputs are cropped the write goes through a slice of the caller's array.
        tree: Head index per token (list or array); its length defines the sentence length.
        root_rel_idx: Index of the root relation.

    Returns:
        Whatever ``add_secondary_arcs_by_scores`` returns for the adjusted inputs.
    """
    n = len(tree)
    # Crop padded score matrices down to the sentence. The previous check compared the length with
    # ``tree`` itself, which is always unequal for a list and raises for a multi-element ndarray.
    if len(arc_scores) != n:
        arc_scores = arc_scores[:n, :n]
        rel_scores = rel_scores[:n, :n, :]
    parse_preds = arc_scores > 0
    # no secondary arc may attach to ROOT: clear every predicted arc whose head is 0
    parse_preds[:, 0] = False
    # and make the root relation unselectable for every arc
    rel_scores[:, :, root_rel_idx] = -float('inf')
    return add_secondary_arcs_by_scores(arc_scores, rel_scores, tree, root_rel_idx, parse_preds)
def apply_lemma_rule(form, lemma_rule):
    """Applies the lemma rule to the form to generate the lemma

    A rule has the shape ``<casing>;<edit>``. ``<edit>`` is either ``a<lemma>`` (absolute lemma) or
    ``d<prefix script>¦<suffix script>`` where each script is made of ``→`` (copy a character),
    ``-`` (delete a character) and ``+c`` (insert character ``c``). ``<casing>`` is a ``¦``-separated
    list of ``↑offset`` / ``↓offset`` instructions applied left to right.

    Args:
        form: The surface word form.
        lemma_rule: The rule, as produced by :func:`gen_lemma_rule`.

    Returns:
        The lemma. A rule without ``;`` yields the lower-cased form; if the edit scripts do not fit
        the form, the lower-cased form is used as the lemma before casing is applied.

    Raises:
        AssertionError: If the edit part is malformed (not exactly two scripts, or an unknown
            operation character).
    """
    cells = lemma_rule.split(";", 1)
    if len(cells) == 1:
        # Some predicted lemma rules are _, which might be due to partial annotation
        return form.lower()
    casing, rule = cells
    if rule.startswith("a"):
        lemma = rule[1:]
    else:
        form = form.lower()
        rules, rule_sources = rule[1:].split("¦"), []
        assert len(rules) == 2
        # count how many characters of the form each script consumes
        for rule in rules:
            source, i = 0, 0
            while i < len(rule):
                if rule[i] == "→" or rule[i] == "-":
                    source += 1
                else:
                    assert rule[i] == "+"
                    i += 1  # skip the inserted character
                i += 1
            rule_sources.append(source)

        try:
            lemma = ""
            for i in range(2):
                # the prefix script starts at the beginning, the suffix script at the tail it consumes
                j, offset = 0, (0 if i == 0 else len(form) - rule_sources[1])
                while j < len(rules[i]):
                    if rules[i][j] == "→":
                        lemma += form[offset]
                        offset += 1
                    elif rules[i][j] == "-":
                        offset += 1
                    else:
                        assert (rules[i][j] == "+")
                        lemma += rules[i][j + 1]
                        j += 1
                    j += 1
                if i == 0:
                    # the untouched middle of the form is copied verbatim
                    lemma += form[rule_sources[0]: len(form) - rule_sources[1]]
        except (IndexError, AssertionError):
            # Best effort: the rule does not fit this form. Only the failures the script
            # interpreter can produce are caught, so Ctrl-C and friends are no longer swallowed.
            lemma = form

    for rule in casing.split("¦"):
        # "" happens when the casing script is empty (e.g. the rule generated for an empty lemma)
        if not rule or rule == "↓0":
            continue  # The lemma is lowercased initially
        case, offset = rule[0], int(rule[1:])
        lemma = lemma[:offset] + (lemma[offset:].upper() if case == "↑" else lemma[offset:].lower())

    return lemma
license: # MIT License # # Copyright (c) 2019 Dan Kondratyuk # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
class TagDecoder(torch.nn.Module):
    """A basic sequence tagger that decodes from inputs of word embeddings"""

    def __init__(self, input_dim, num_classes, label_smoothing: float = 0.03, adaptive: bool = False) -> None:
        """
        Args:
            input_dim: Size of the last dimension of the encoded text.
            num_classes: Number of tags.
            label_smoothing: Label smoothing passed to the loss functions.
            adaptive: Use an adaptive softmax output layer instead of a plain linear layer.
        """
        super(TagDecoder, self).__init__()
        self.label_smoothing = label_smoothing
        self.num_classes = num_classes
        self.adaptive = adaptive

        if self.adaptive:
            adaptive_cutoffs = [round(self.num_classes / 15), 3 * round(self.num_classes / 15)]
            self.task_output = AdaptiveLogSoftmaxWithLoss(input_dim,
                                                          self.num_classes,
                                                          cutoffs=adaptive_cutoffs,
                                                          div_value=4.0)
        else:
            # was ``self.output_dim``, an attribute this class never defines (AttributeError)
            self.task_output = Linear(input_dim, self.num_classes)

    def forward(self,
                encoded_text: torch.FloatTensor,
                mask: torch.LongTensor,
                gold_tags: torch.LongTensor,
                ) -> Dict[str, torch.Tensor]:
        """Score every position and, when ``gold_tags`` is not ``None``, compute the loss.

        Returns:
            A dict with ``logits``, ``class_probabilities`` and optionally ``loss``.
        """
        hidden = encoded_text

        batch_size, sequence_length, _ = hidden.size()
        output_dim = [batch_size, sequence_length, self.num_classes]

        loss_fn = self._adaptive_loss if self.adaptive else self._loss

        output_dict = loss_fn(hidden, mask, gold_tags, output_dim)

        return output_dict

    def _adaptive_loss(self, hidden, mask, gold_tags, output_dim):
        # the adaptive softmax consumes the hidden states directly, so they double as "logits";
        # note that ``class_probabilities`` holds log-probabilities on this path
        logits = hidden
        reshaped_log_probs = logits.reshape(-1, logits.size(2))

        class_probabilities = self.task_output.log_prob(reshaped_log_probs).view(output_dim)

        output_dict = {"logits": logits, "class_probabilities": class_probabilities}

        if gold_tags is not None:
            output_dict["loss"] = sequence_cross_entropy(class_probabilities,
                                                         gold_tags,
                                                         mask,
                                                         label_smoothing=self.label_smoothing)

        return output_dict

    def _loss(self, hidden, mask, gold_tags, output_dim):
        logits = self.task_output(hidden)
        reshaped_log_probs = logits.view(-1, self.num_classes)
        class_probabilities = F.softmax(reshaped_log_probs, dim=-1).view(output_dim)

        output_dict = {"logits": logits, "class_probabilities": class_probabilities}

        if gold_tags is not None:
            output_dict["loss"] = sequence_cross_entropy_with_logits(logits,
                                                                     gold_tags,
                                                                     mask,
                                                                     label_smoothing=self.label_smoothing)
        return output_dict

    def decode(self, output_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
        """Turn class probabilities into tag strings (lemma rules are applied to the words).

        NOTE(review): this method reads ``self.task`` and ``self.vocab``, neither of which is
        assigned anywhere in this class; it cannot run unless a caller sets them first — confirm
        whether it is still used.
        """
        all_words = output_dict["words"]

        all_predictions = output_dict["class_probabilities"][self.task].cpu().data.numpy()
        if all_predictions.ndim == 3:
            predictions_list = [all_predictions[i] for i in range(all_predictions.shape[0])]
        else:
            predictions_list = [all_predictions]
        all_tags = []
        for predictions, words in zip(predictions_list, all_words):
            argmax_indices = numpy.argmax(predictions, axis=-1)
            tags = [self.vocab.get_token_from_index(x, namespace=self.task)
                    for x in argmax_indices]

            if self.task == "lemmas":
                def decode_lemma(word, rule):
                    if rule == "_":
                        return "_"
                    if rule == "@@UNKNOWN@@":
                        return word
                    return apply_lemma_rule(word, rule)

                tags = [decode_lemma(word, rule) for word, rule in zip(words, tags)]

            all_tags.append(tags)
        output_dict[self.task] = all_tags

        return output_dict
class UniversalDependenciesDecoder(torch.nn.Module):
    """Multi-task head running lemma, UPOS, feature tagging and dependency parsing on shared hidden states."""

    def __init__(self,
                 hidden_size,
                 n_mlp_arc,
                 n_mlp_rel,
                 mlp_dropout,
                 num_rels,
                 num_lemmas,
                 num_upos,
                 num_feats,
                 mix_embedding: int = 13,
                 layer_dropout: float = 0.0,
                 ) -> None:
        """
        Args:
            hidden_size: Input size of every task decoder.
            n_mlp_arc, n_mlp_rel, mlp_dropout, num_rels: Forwarded to ``BiaffineDecoder``.
            num_lemmas, num_upos, num_feats: Number of classes of the three taggers.
            mix_embedding: Passed to ``ScalarMixWithDropout`` as ``(1, mix_embedding)``;
                a falsy value disables scalar mixing.
            layer_dropout: Dropout of the scalar mix.
        """
        super(UniversalDependenciesDecoder, self).__init__()

        # decoders
        self.decoders = torch.nn.ModuleDict({
            'lemmas': TagDecoder(hidden_size, num_lemmas, label_smoothing=0.03, adaptive=True),
            'upos': TagDecoder(hidden_size, num_upos, label_smoothing=0.03, adaptive=True),
            'deps': BiaffineDecoder(hidden_size, n_mlp_arc, n_mlp_rel, mlp_dropout, num_rels),
            'feats': TagDecoder(hidden_size, num_feats, label_smoothing=0.03, adaptive=True),
        })
        # batch key holding the gold ids of each tagging task ('deps' is handled separately)
        self.gold_keys = {
            'lemmas': 'lemma_id',
            'upos': 'pos_id',
            'feats': 'feat_id',
        }

        if mix_embedding:
            # one independent scalar mix per task
            self.scalar_mix = torch.nn.ModuleDict({
                task: ScalarMixWithDropout((1, mix_embedding),
                                           do_layer_norm=False,
                                           dropout=layer_dropout)
                for task in self.decoders
            })
        else:
            self.scalar_mix = None

    def forward(self,
                hidden,
                batch: Dict[str, torch.Tensor],
                mask) -> Dict[str, Any]:
        """Run every task decoder.

        Returns:
            A dict with per-task ``logits`` and ``class_probabilities``; ``loss`` (the sum of all
            task losses) is only included when the batch carries gold arcs under ``'arc'``.
        """
        mask_without_root = mask.clone()
        mask_without_root[:, 0] = False

        logits = {}
        class_probabilities = {}
        output_dict = {"logits": logits,
                       "class_probabilities": class_probabilities}
        loss = 0

        arc = batch.get('arc', None)
        # Run through each of the tasks on the shared encoder and save predictions
        for task in self.decoders:
            if self.scalar_mix:
                decoder_input = self.scalar_mix[task](hidden, mask)
            else:
                decoder_input = hidden

            if task == "deps":
                s_arc, s_rel = self.decoders[task](decoder_input, mask)
                pred_output = {'class_probabilities': {'s_arc': s_arc, 's_rel': s_rel}}
                if arc is not None:
                    # noinspection PyTypeChecker
                    pred_output['loss'] = BiaffineDependencyParser.compute_loss(None, s_arc, s_rel, arc,
                                                                                batch['rel_id'],
                                                                                mask_without_root,
                                                                                torch.nn.functional.cross_entropy)
            else:
                pred_output = self.decoders[task](decoder_input, mask_without_root,
                                                  batch.get(self.gold_keys[task], None))
            if 'logits' in pred_output:
                logits[task] = pred_output["logits"]
            if 'class_probabilities' in pred_output:
                class_probabilities[task] = pred_output["class_probabilities"]
            if 'loss' in pred_output:
                # Keep track of the loss if we have the gold tags available
                loss += pred_output["loss"]

        if arc is not None:
            output_dict["loss"] = loss

        return output_dict

    def decode(self, output_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
        """Let every tagging decoder convert its probabilities in ``output_dict`` to tags.

        The previous loop iterated ``self.tasks``, an attribute this class never defines. The
        tagging tasks are exactly the keys of ``self.gold_keys``; the 'deps' decoder is skipped
        because its scores are decoded by the parser component instead.
        """
        for task in self.gold_keys:
            self.decoders[task].decode(output_dict)

        return output_dict
def build_dataloader(self,
                     data,
                     batch_size,
                     shuffle=False,
                     device=None,
                     logger: logging.Logger = None,
                     sampler_builder=None,
                     gradient_accumulation=1,
                     transformer: ContextualWordEmbedding = None,
                     **kwargs) -> DataLoader:
    """Build a padded, length-sorted dataloader over ``data``.

    Args:
        data: Anything accepted by :meth:`build_dataset`.
        batch_size: Batch size used by the default sampler and the dataloader.
        shuffle: Forwarded to the sampler and the dataloader.
        device: Device the batches are placed on.
        logger: Logger used when the vocabularies still have to be built.
        sampler_builder: Optional custom sampler builder; a ``SortingSamplerBuilder`` is used otherwise.
        gradient_accumulation: Forwarded to the sampler builder.
        transformer: The contextual embedding whose transform tokenizes the samples.
        **kwargs: Ignored.

    Returns:
        A ``PadSequenceDataLoader`` padding ``arc`` with 0.
    """
    pipeline = [generate_lemma_rule, append_bos, self.vocabs, transformer.transform(), FieldLength('token')]
    if not self.config.punct:
        # punctuation is excluded from scoring, so it has to be marked
        pipeline.append(PunctuationMask('token', 'punct_mask'))
    dataset = self.build_dataset(data, pipeline)
    if self.vocabs.mutable:
        # vocabularies are still open, i.e. this is the first pass over the training set
        # noinspection PyTypeChecker
        self.build_vocabs(dataset, logger)
    lengths = [len(sample['token_input_ids']) for sample in dataset]
    builder = sampler_builder if sampler_builder else SortingSamplerBuilder(batch_size)
    batch_sampler = builder.build(lengths, shuffle, gradient_accumulation)
    return PadSequenceDataLoader(dataset, batch_size, shuffle, device=device, batch_sampler=batch_sampler,
                                 pad={'arc': 0}, )
def evaluate_dataloader(self,
                        data: DataLoader,
                        criterion: Callable,
                        metric: MetricDict = None,
                        output=False,
                        logger=None,
                        ratio_width=None,
                        **kwargs):
    """Evaluate the model on ``data``.

    Args:
        data: Dataloader yielding batches with gold annotations (``out['loss']`` is required).
        criterion: Unused; the model computes its own loss.
        metric: Metrics to reset and update.
        output: Unused.
        logger: Logger for progress reports.
        ratio_width: Width of the progress ratio.
        **kwargs: Ignored.

    Returns:
        The average loss per batch and the updated metric.
    """
    metric.reset()
    self.model.eval()
    timer = CountdownTimer(len(data))
    total_loss = 0
    # The training loop calls this method directly, so gradients must be disabled here:
    # nothing below backpropagates, and building the autograd graph only wastes memory.
    with torch.no_grad():
        for idx, batch in enumerate(data):
            out, mask = self.feed_batch(batch)
            loss = out['loss']
            total_loss += loss.item()
            self.decode_output(out, mask, batch)
            self.update_metrics(metric, batch, out, mask)
            report = f'loss: {total_loss / (idx + 1):.4f} {metric.cstr()}'
            timer.log(report, logger=logger, ratio_percentage=False, ratio_width=ratio_width)
            # release the batch outputs before the next forward pass
            del loss
            del out
            del mask
    return total_loss / len(data), metric
bool(mix_embedding), 'transformer.scalar_mix has to be 1 ' \ 'when mix_embedding is non-zero.' # noinspection PyTypeChecker return UniversalDependenciesModel(transformer.module(training=training), n_mlp_arc, n_mlp_rel, mlp_dropout, len(self.vocabs.rel), len(self.vocabs.lemma), len(self.vocabs.pos), len(self.vocabs.feat), mix_embedding, layer_dropout) def predict(self, data: Union[List[str], List[List[str]]], batch_size: int = None, **kwargs): if not data: return [] flat = self.input_is_flat(data) if flat: data = [data] samples = self.build_samples(data) if not batch_size: batch_size = self.config.batch_size dataloader = self.build_dataloader(samples, device=self.devices[0], shuffle=False, **merge_dict(self.config, batch_size=batch_size, overwrite=True, **kwargs)) order = [] outputs = [] for batch in dataloader: out, mask = self.feed_batch(batch) self.decode_output(out, mask, batch) outputs.extend(self.prediction_to_human(out, batch)) order.extend(batch[IDX]) outputs = reorder(outputs, order) if flat: return outputs[0] return outputs def build_samples(self, data: List[List[str]]): return [{'FORM': x} for x in data] def fit(self, trn_data, dev_data, save_dir, transformer: ContextualWordEmbedding, sampler_builder=None, mix_embedding: int = 13, layer_dropout: int = 0.1, n_mlp_arc=768, n_mlp_rel=256, mlp_dropout=.33, lr=1e-3, transformer_lr=2.5e-5, patience=0.1, batch_size=32, epochs=30, gradient_accumulation=1, adam_epsilon=1e-8, weight_decay=0, warmup_steps=0.1, grad_norm=1.0, tree=False, proj=False, punct=False, logger=None, verbose=True, devices: Union[float, int, List[int]] = None, **kwargs): return super().fit(**merge_locals_kwargs(locals(), kwargs)) def execute_training_loop(self, trn: DataLoader, dev: DataLoader, epochs, criterion, optimizer, metric, save_dir, logger: logging.Logger, devices, ratio_width=None, patience=0.5, eval_trn=True, **kwargs): if isinstance(patience, float): patience = int(patience * epochs) best_epoch, best_metric = 0, -1 timer = 
# noinspection PyMethodOverriding
def fit_dataloader(self, trn: DataLoader, criterion, optimizer, metric: MetricDict, logger: logging.Logger,
                   history: History, gradient_accumulation=1, grad_norm=None, ratio_width=None,
                   eval_trn=True, **kwargs):
    """Train the model for one pass over ``trn``.

    Args:
        trn: Dataloader yielding the training batches.
        criterion: Not used here; the loss is read from the model output under ``'loss'``.
        optimizer: A pair ``(optimizer, scheduler)``.
        metric: Metrics that are reset first and updated per batch when ``eval_trn`` is true.
        logger: Logger handed to the progress timer.
        history: Step bookkeeping; decides when an optimizer step is due.
        gradient_accumulation: Number of batches whose gradients are accumulated per optimizer step.
        grad_norm: Passed to ``self._step`` together with the optimizer and scheduler.
        ratio_width: Handed to the progress timer.
        eval_trn: Whether to decode the training batches and update ``metric`` with them.
        **kwargs: Not used here.
    """
    optimizer, scheduler = optimizer
    metric.reset()
    self.model.train()
    progress = CountdownTimer(history.num_training_steps(len(trn), gradient_accumulation=gradient_accumulation))
    scale_loss = gradient_accumulation and gradient_accumulation > 1
    loss_sum = 0
    for seen, batch in enumerate(trn, start=1):
        out, mask = self.feed_batch(batch)
        loss = out['loss']
        if scale_loss:
            # In-place division, so the tensor stored in ``out['loss']`` is scaled as well.
            loss /= gradient_accumulation
        loss.backward()
        loss_sum += loss.item()
        if eval_trn:
            self.decode_output(out, mask, batch)
            self.update_metrics(metric, batch, out, mask)
        if history.step(gradient_accumulation):
            self._step(optimizer, scheduler, grad_norm)
            # Logged once per optimizer step: the timer above is sized by the number of training steps.
            report = f'loss: {loss_sum / seen:.4f}'
            if eval_trn:
                report += f' {metric.cstr()}'
            progress.log(report, logger=logger, ratio_percentage=False, ratio_width=ratio_width)
        del loss, out, mask
def __call__(self, data, batch_size=None, **kwargs) -> Union[CoNLLSentence, List[CoNLLSentence]]:
    """Predict on ``data`` by delegating to the parent class's ``__call__``.

    This override adds no logic of its own; it only narrows the annotated return type.

    Args:
        data: Input forwarded unchanged to the parent implementation.
        batch_size: Forwarded as the second positional argument.
        **kwargs: Extra keyword arguments forwarded unchanged.

    Returns:
        Whatever the parent ``__call__`` returns, annotated as one ``CoNLLSentence`` or a list of them.
    """
    return super().__call__(data, batch_size, **kwargs)
def get_ud_treebank_files(dataset_dir: str, treebanks: List[str] = None) -> Dict[str, Tuple[str, str, str]]:
    """Retrieves all treebank data paths in the given directory.

    Adopted from https://github.com/Hyperparticle/udify
    MIT Licence

    Args:
        dataset_dir: Directory whose sub-directories are treebanks.
        treebanks: Names of the treebank sub-directories to inspect. When empty or ``None``, every
            sub-directory of ``dataset_dir`` is used, in sorted order.

    Returns:
        A dict mapping each treebank name to a ``(train, dev, test)`` tuple of file paths. An entry is
        ``None`` when the treebank has no ``.conllu`` file ending in ``train.conllu``, ``dev.conllu`` or
        ``test.conllu`` respectively; if several files match, the first in sorted order is used.
    """
    if not treebanks:
        # Auto-discovery: only directories can be treebanks. Stray files next to them (README, .DS_Store,
        # ...) used to make the os.listdir() call below fail with NotADirectoryError.
        treebanks = sorted(name for name in os.listdir(dataset_dir)
                           if os.path.isdir(os.path.join(dataset_dir, name)))
    datasets = {}
    for treebank in treebanks:
        treebank_path = os.path.join(dataset_dir, treebank)
        conllu_files = [name for name in sorted(os.listdir(treebank_path)) if name.endswith('.conllu')]
        splits = []
        for suffix in ('train.conllu', 'dev.conllu', 'test.conllu'):
            match = next((name for name in conllu_files if name.endswith(suffix)), None)
            splits.append(os.path.join(treebank_path, match) if match else None)
        datasets[treebank] = tuple(splits)
    return datasets
def sequence_cross_entropy_with_logits(
        logits: torch.FloatTensor,
        targets: torch.LongTensor,
        weights: Union[torch.FloatTensor, torch.BoolTensor],
        average: str = "batch",
        label_smoothing: float = None,
        gamma: float = None,
        alpha: Union[float, List[float], torch.FloatTensor] = None,
) -> torch.FloatTensor:
    """Computes the cross entropy loss of a sequence, weighted with respect to some user provided weights.

    Note that the weighting here is not the same as in the `torch.nn.CrossEntropyLoss()` criterion, which is
    weighting classes; here we are weighting the loss contribution from particular elements in the sequence.
    This allows loss computations for models which use padding.

    Args:
        logits: A tensor of size (batch_size, sequence_length, num_classes) which contains the unnormalized
            probability for each class.
        targets: A tensor of size (batch, sequence_length) which contains the index of the true class for
            each corresponding step.
        weights: A tensor of size (batch, sequence_length); it is cast to the dtype of ``logits``.
        average: If "batch", average the loss across the batches. If "token", average the loss across each
            item in the input. If `None`, return a vector of losses per batch element.
        label_smoothing: Whether or not to apply label smoothing to the cross-entropy loss. For example, with
            a label smoothing value of 0.2, a 4 class classification target would look like
            `[0.05, 0.05, 0.85, 0.05]` if the 3rd class was the correct label.
        gamma: Focal loss[*] focusing parameter to reduce the relative loss for well-classified examples and
            put more focus on hard ones. The greater `gamma` is, the more focus on hard examples.
        alpha: Focal loss[*] weighting factor to balance between classes. Can be used independently of
            `gamma`. If a single `float` is provided, the binary case is assumed, using `alpha` and
            `1 - alpha` for positive and negative respectively. If a list of `float` is provided, with the
            same length as the number of classes, the weights will match the classes.

            [*] T. Lin, P. Goyal, R. Girshick, K. He and P. Dollár, "Focal Loss for Dense Object Detection,"
            2017 IEEE International Conference on Computer Vision (ICCV), Venice, 2017, pp. 2999-3007.

    Returns:
        A tensor representing the cross entropy loss. If `average == "batch"` or `average == "token"`, the
        returned loss is a scalar. If `average is None`, the returned loss is a vector of shape
        (batch_size,).

    Raises:
        ValueError: If ``average`` is not one of `None`, "token" or "batch".
        TypeError: If ``alpha`` is neither a number, a list, a numpy array nor a tensor.
    """
    if average not in {None, "token", "batch"}:
        # The ``f`` prefix used to sit inside the literal, so the offending value was never shown.
        raise ValueError(f"Got average {average}, expected one of None, 'token', or 'batch'")

    # make sure weights are float
    weights = weights.to(logits.dtype)
    # sum all dim except batch
    non_batch_dims = tuple(range(1, len(weights.shape)))
    # shape : (batch_size,)
    # Taken before the focal/alpha factors are folded into ``weights`` below.
    weights_batch_sum = weights.sum(dim=non_batch_dims)
    # shape : (batch * sequence_length, num_classes)
    logits_flat = logits.view(-1, logits.size(-1))
    # shape : (batch * sequence_length, num_classes)
    log_probs_flat = torch.nn.functional.log_softmax(logits_flat, dim=-1)
    # shape : (batch * max_len, 1)
    targets_flat = targets.view(-1, 1).long()

    # focal loss coefficient
    if gamma:
        # shape : (batch * sequence_length, num_classes)
        probs_flat = log_probs_flat.exp()
        # shape : (batch * sequence_length, 1)
        probs_flat = torch.gather(probs_flat, dim=1, index=targets_flat)
        # shape : (batch * sequence_length, 1)
        focal_factor = (1.0 - probs_flat) ** gamma
        # shape : (batch, sequence_length)
        focal_factor = focal_factor.view(*targets.size())
        weights = weights * focal_factor

    if alpha is not None:
        # shape : () / (num_classes,)
        if isinstance(alpha, (float, int)):
            # shape : (2,)
            alpha_factor = torch.tensor(
                [1.0 - float(alpha), float(alpha)], dtype=weights.dtype, device=weights.device
            )
        elif isinstance(alpha, (list, numpy.ndarray, torch.Tensor)):
            # shape : (c,)
            alpha_factor = torch.tensor(alpha, dtype=weights.dtype, device=weights.device)
            if not alpha_factor.size():
                # A 0-d value is treated like the scalar case above.
                # shape : (1,)
                alpha_factor = alpha_factor.view(1)
                # shape : (2,)
                alpha_factor = torch.cat([1 - alpha_factor, alpha_factor])
        else:
            raise TypeError(
                ("alpha must be float, list of float, or torch.FloatTensor, {} provided.").format(
                    type(alpha)
                )
            )
        # shape : (batch, max_len)
        alpha_factor = torch.gather(alpha_factor, dim=0, index=targets_flat.view(-1)).view(
            *targets.size()
        )
        weights = weights * alpha_factor

    if label_smoothing is not None and label_smoothing > 0.0:
        num_classes = logits.size(-1)
        smoothing_value = label_smoothing / num_classes
        # Fill all the correct indices with 1 - smoothing value.
        one_hot_targets = torch.zeros_like(log_probs_flat).scatter_(
            -1, targets_flat, 1.0 - label_smoothing
        )
        smoothed_targets = one_hot_targets + smoothing_value
        negative_log_likelihood_flat = -log_probs_flat * smoothed_targets
        negative_log_likelihood_flat = negative_log_likelihood_flat.sum(-1, keepdim=True)
    else:
        # Contribution to the negative log likelihood only comes from the exact indices
        # of the targets, as the target distributions are one-hot. Here we use torch.gather
        # to extract the indices of the num_classes dimension which contribute to the loss.
        # shape : (batch * sequence_length, 1)
        negative_log_likelihood_flat = -torch.gather(log_probs_flat, dim=1, index=targets_flat)
    # shape : (batch, sequence_length)
    negative_log_likelihood = negative_log_likelihood_flat.view(*targets.size())
    # shape : (batch, sequence_length)
    negative_log_likelihood = negative_log_likelihood * weights

    if average == "batch":
        # shape : (batch_size,)
        per_batch_loss = negative_log_likelihood.sum(non_batch_dims) / (
            weights_batch_sum + tiny_value_of_dtype(negative_log_likelihood.dtype)
        )
        num_non_empty_sequences = (weights_batch_sum > 0).sum() + tiny_value_of_dtype(
            negative_log_likelihood.dtype
        )
        return per_batch_loss.sum() / num_non_empty_sequences
    elif average == "token":
        return negative_log_likelihood.sum() / (
            weights_batch_sum.sum() + tiny_value_of_dtype(negative_log_likelihood.dtype)
        )
    else:
        # shape : (batch_size,)
        per_batch_loss = negative_log_likelihood.sum(non_batch_dims) / (
            weights_batch_sum + tiny_value_of_dtype(negative_log_likelihood.dtype)
        )
        return per_batch_loss
def uncombine_initial_dims(tensor: torch.Tensor, original_size: torch.Size) -> torch.Tensor:
    """Restore the leading dimensions that were merged into one.

    Given a tensor of embeddings with shape (d1 * ... * dn, sequence_length, embedding_dim) and the
    original shape (d1, ..., dn, sequence_length), view the tensor as
    (d1, ..., dn, sequence_length, embedding_dim).

    Args:
        tensor: The tensor whose leading dimension is to be split up again.
        original_size: The size of the ids before their leading dimensions were merged.

    Returns:
        A view with the original leading dimensions followed by the last dimension of ``tensor``. If
        ``original_size`` has at most two dimensions, ``tensor`` itself is returned.
    """
    if len(original_size) > 2:
        return tensor.view(*original_size, tensor.size(-1))
    return tensor
class Pipe(Component):
    """A component together with the document keys it reads its input from and writes its output to."""

    def __init__(self, component: Component, input_key: str = None, output_key: str = None, **kwargs) -> None:
        """Wrap ``component`` as one stage of a pipeline.

        Args:
            component: The component to run.
            input_key: Key (or tuple/list of keys) of the document fields fed to the component. When
                falsy, the whole document is fed.
            output_key: Key (or tuple/list of keys) under which the output is stored. When falsy, the
                raw output of the component is returned by :meth:`predict`.
            **kwargs: Extra keyword arguments passed to the component on every call.
        """
        super().__init__()
        if not hasattr(self, 'config'):
            self.config = {'classpath': classpath_of(self)}
        self.output_key = output_key
        self.input_key = input_key
        self.component = component
        self.kwargs = kwargs
        self.config.update({
            'component': component.config,
            'input_key': self.input_key,
            'output_key': self.output_key,
            'kwargs': self.kwargs
        })

    def predict(self, doc: Document, **kwargs) -> Document:
        """Run the wrapped component on ``doc``.

        Args:
            doc: The document (or any other input when ``input_key`` is falsy).
            **kwargs: Keyword arguments for the component; the ones given to the constructor take
                precedence.

        Returns:
            The document with the output stored under ``output_key``, or the raw output of the component
            when ``output_key`` is falsy.
        """
        unpack = False
        if self.input_key:
            if isinstance(self.input_key, (tuple, list)):
                if isinstance(self.component, LambdaComponent):  # assume functions take multiple arguments
                    inputs = [doc[key] for key in self.input_key]
                    unpack = True
                else:
                    # Zip the selected fields together, sentence by sentence.
                    inputs = list(list(zip(*sent)) for sent in zip(*[doc[key] for key in self.input_key]))
            else:
                inputs = doc[self.input_key]
        else:
            inputs = doc
        if self.kwargs:
            kwargs.update(self.kwargs)
        if unpack:
            kwargs['_hanlp_unpack'] = True
        output = self.component(inputs, **kwargs)
        if isinstance(output, types.GeneratorType):
            output = list(output)
        if self.output_key:
            if not isinstance(doc, Document):
                doc = Document()
            # Accept lists as well as tuples, exactly like input_key above: a tuple stored in the config
            # comes back as a list after a JSON round trip, and a list cannot be used as a single key.
            if isinstance(self.output_key, (tuple, list)):
                for key, value in zip(self.output_key, output):
                    doc[key] = value
            else:
                doc[self.output_key] = output
            return doc
        return output

    def __repr__(self):
        name = self.component.function.__name__ if isinstance(self.component, LambdaComponent) \
            else self.component.__class__.__name__
        return f'{self.input_key}->{name}->{self.output_key}'

    @staticmethod
    def from_config(meta: dict, **kwargs):
        """Rebuild a pipe from the config dict written by the constructor.

        Args:
            meta: A dict with the keys ``classpath``, ``component``, ``input_key``, ``output_key`` and
                ``kwargs``.
            **kwargs: Not used.

        Returns:
            An instance of the class named by ``meta['classpath']``.
        """
        cls = str_to_type(meta['classpath'])
        component = load_from_meta(meta['component'])
        return cls(component, meta['input_key'], meta['output_key'], **meta['kwargs'])
def insert(self, index: int, component: Callable, input_key: Union[str, Iterable[str]] = None,
           output_key: Union[str, Iterable[str]] = None,
           **kwargs):
    """Insert a pipe into this pipeline at the given position.

    Args:
        index: The index of the new pipe.
        component: A component, or any other callable, which is then wrapped in a ``LambdaComponent``.
        input_key: The input key indicating which fields will be inputted to the pipe. ``None``: inherit
            from previous pipe; ``*``: use all the outputs from previous pipes wrapped in a
            :class:`~hanlp_common.document.Document`.
        output_key: The output key indicating where to store the outputs
        **kwargs: Extra arguments passed to the ``Pipe`` constructor.

    Returns:
        Pipeline: A pipeline.
    """
    if input_key == '*':
        source_key = None
    elif input_key or not len(self) or not index:
        # An explicit key was given, or there is no preceding pipe to inherit from.
        source_key = input_key
    else:
        source_key = self[index - 1].output_key
    wrapped = component if isinstance(component, Component) else LambdaComponent(component)
    super().insert(index, Pipe(wrapped, source_key, output_key, **kwargs))
    return self
""" if doc is None: doc = Document(**kwargs) for component in self: doc = component(doc) return doc def copy(self): return self.__copy__() def __copy__(self): config = self.meta return Pipeline.from_config(config) @property def meta(self): return { 'classpath': classpath_of(self), 'hanlp_version': hanlp.version.__version__, 'pipes': [pipe.config for pipe in self] } @meta.setter def meta(self, value): pass def save(self, filepath): save_json(self.meta, filepath) def load(self, filepath): meta = load_json(filepath) self.clear() self.extend(Pipeline.from_config(meta)) @staticmethod def from_config(meta: Union[dict, str], **kwargs): if isinstance(meta, str): meta = load_json(meta) return Pipeline(*[load_from_meta(pipe) for pipe in meta['pipes']]) ================================================ FILE: hanlp/components/rnn_language_model_tf.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2019-12-04 17:28 from typing import List, Union import tensorflow as tf from hanlp.common.keras_component import KerasComponent from hanlp.transform.text_tf import TextTransform class RNNLanguageModel(KerasComponent): def __init__(self, transform: TextTransform = None) -> None: if not transform: transform = TextTransform() super().__init__(transform) self.transform: TextTransform = transform def fit(self, trn_data, dev_data, save_dir, forward=True, embedding=100, rnn_input_dropout=0.1, rnn_units: int = 1024, rnn_output_dropout=0.1, seq_len: int = 250, optimizer='sgd', learning_rate=20, anneal_factor: float = 0.25, anneal_patience: int = 10, clipnorm=0.25, batch_size: int = 100, epochs=1000, run_eagerly=False, logger=None, verbose=True, **kwargs): return super().fit(**dict((k, v) for k, v in locals().items() if k not in ('self', 'kwargs'))) def build_model(self, embedding, rnn_input_dropout, rnn_units, rnn_output_dropout, batch_size, seq_len, training, **kwargs) -> tf.keras.Model: model = tf.keras.Sequential() extra_args = {} if 
def generate_text(self, text: Union[str, List[str]] = '\n', num_steps=50):
    """Extend a prompt by repeatedly predicting one more token.

    Args:
        text: The prompt, either a string (handled as a sequence of characters) or a list of tokens.
        num_steps: Number of tokens to generate.

    Returns:
        The extended prompt: a string if ``text`` was a string, otherwise a new list of tokens. A list
        passed in by the caller is left unmodified.
    """
    char_mode = isinstance(text, str)
    # Always work on a private list so that the caller's list is never mutated.
    tokens = list(text)
    forward = self.config['forward']
    # A slow implementation. Might better to let LSTM return states.
    # But anyway, this interface is for fun so let's take it easy
    for _ in range(num_steps):
        token = self.predict(tokens)[-1]
        if forward:
            # append() keeps the token as one element, like the backward branch does;
            # ``tokens += token`` would split a multi-character token into single characters.
            tokens.append(token)
        else:
            tokens = [token] + tokens
    return ''.join(tokens) if char_mode else tokens
def forward(self, batch, mask):
    """Embed the batch, optionally encode it, and decode the result.

    Args:
        batch: The batch handed to ``self.embed``.
        mask: Passed to the encoder as ``mask``; unused when ``self.encoder`` is falsy.

    Returns:
        The output of ``self.decoder``.
    """
    hidden = self.embed(batch)
    encoder = self.encoder
    if encoder:
        hidden = encoder(hidden, mask=mask)
    return self.decoder(hidden)
def build_optimizer(self, trn, epochs, lr, adam_epsilon, weight_decay, warmup_steps, transformer_lr=None,
                    gradient_accumulation=1, **kwargs):
    """Build the optimizer and the learning rate scheduler.

    Args:
        trn: The training dataloader; only its length is used.
        epochs: Number of epochs.
        lr: Learning rate.
        adam_epsilon: Passed through to ``build_optimizer_scheduler_with_transformer``.
        weight_decay: Passed through to ``build_optimizer_scheduler_with_transformer``.
        warmup_steps: Passed through to ``build_optimizer_scheduler_with_transformer``.
        transformer_lr: Learning rate for the transformer; ``lr`` is used when it is ``None``.
        gradient_accumulation: Number of batches per optimizer step, used to derive the total number of
            training steps.
        **kwargs: Not used.

    Returns:
        A tuple of optimizer and scheduler.
    """
    total_steps = len(trn) * epochs // gradient_accumulation
    encoder_lr = lr if transformer_lr is None else transformer_lr
    transformer = find_transformer(self.model.embed)
    optimizer, scheduler = build_optimizer_scheduler_with_transformer(
        self.model, transformer, lr, encoder_lr, total_steps, warmup_steps, weight_decay, adam_epsilon)
    return optimizer, scheduler
# noinspection PyMethodOverriding
def fit_dataloader(self, trn: DataLoader, criterion, optimizer, metric, logger: logging.Logger, history: History,
                   gradient_accumulation=1, grad_norm=None, ratio_width=None, eval_trn=False, **kwargs):
    """Run one training epoch over ``trn``.

    Args:
        trn: Training dataloader.
        criterion: Loss criterion forwarded to ``compute_loss``.
        optimizer: An ``(optimizer, scheduler)`` pair.
        metric: Metric updated on training batches when ``eval_trn`` is set.
        logger: Logger receiving progress reports.
        history: Step tracker; ``history.step`` decides when to apply an
            optimizer update.
        gradient_accumulation: Number of batches accumulated per update.
        grad_norm: Gradient clipping threshold forwarded to ``_step``.
        ratio_width: Forwarded to the progress timer.
        eval_trn: Whether to decode and score training batches.
        **kwargs: Not used.
    """
    optim, sched = optimizer
    self.model.train()
    num_steps = history.num_training_steps(len(trn), gradient_accumulation=gradient_accumulation)
    timer = CountdownTimer(num_steps)
    total_loss = 0
    for seen, batch in enumerate(trn, start=1):
        pred, mask = self.feed_batch(batch)
        loss = self.compute_loss(criterion, pred, batch['srl_id'], mask)
        if gradient_accumulation and gradient_accumulation > 1:
            # Scale so that the accumulated gradient matches one large batch.
            loss /= gradient_accumulation
        loss.backward()
        total_loss += loss.item()
        if eval_trn:
            prediction = self.decode_output(pred, mask, batch)
            self.update_metrics(metric, prediction, batch)
        if history.step(gradient_accumulation):
            self._step(optim, sched, grad_norm)
            report = f'loss: {total_loss / seen:.4f}'
            if eval_trn:
                report += f' {metric}'
            timer.log(report, logger=logger, ratio_percentage=False, ratio_width=ratio_width)
        # Drop the references to the batch tensors before the next iteration.
        del loss, pred, mask
def compute_mask(self, mask2d):
    """Expand a per-token mask into a token-pair mask.

    Entry ``[b, i, j]`` of the pair mask is true only when both token ``i``
    and token ``j`` of sentence ``b`` are true in ``mask2d``.

    Args:
        mask2d: Boolean tensor of shape ``[batch, seq_len]``. It is left
            untouched (the previous in-place ``unsqueeze_`` reshaped the
            caller's tensor as a side effect).

    Returns:
        Without CRF, the ``[batch, seq_len, seq_len]`` pair mask.
        With CRF, a tuple ``(token_index, mask)`` where ``token_index`` is the
        first column of the pair mask flattened to ``[batch * seq_len,
        seq_len]`` and ``mask`` holds only the rows selected by it.
    """
    seq_len = mask2d.size(1)
    # Out-of-place unsqueeze: never mutate the tensor owned by the caller.
    token_mask = mask2d.unsqueeze(-1).expand(-1, -1, seq_len)
    mask3d = token_mask & token_mask.transpose(1, 2)
    if not self.config.crf:
        return mask3d
    mask3d = mask3d.flatten(end_dim=1)
    # Column 0 pairs every token with the first token of its sentence, so a
    # row is kept when both that token and the first token are unmasked.
    token_index = mask3d[:, 0]
    return token_index, mask3d[token_index]
# noinspection PyMethodOverriding
def build_dataloader(self, data, batch_size, sampler_builder: SamplerBuilder = None, gradient_accumulation=1,
                     shuffle=False, device=None, logger: logging.Logger = None, transform=None,
                     **kwargs) -> DataLoader:
    """Wrap ``data`` into a padded dataloader.

    Args:
        data: Either a ready ``TransformableDataset`` or raw data handed to
            ``build_dataset``.
        batch_size: Batch size forwarded to the dataloader.
        sampler_builder: Optional builder of a length-aware batch sampler.
        gradient_accumulation: Forwarded to ``sampler_builder.build``.
        shuffle: Whether to shuffle samples.
        device: Device forwarded to the dataloader.
        logger: Logger used when vocabularies have to be built.
        transform: Optional extra transform run before the default ones.
        **kwargs: Not used.

    Returns:
        A ``PadSequenceDataLoader`` over the dataset.
    """
    if isinstance(data, TransformableDataset):
        dataset = data
    else:
        pipeline = [self.config.embed.transform(vocabs=self.vocabs), self.vocabs, FieldLength('token')]
        if transform:
            # A user supplied transform goes first in the pipeline.
            pipeline = [transform] + pipeline
        dataset = self.build_dataset(data, pipeline)
    if self.vocabs.mutable:
        # noinspection PyTypeChecker
        self.build_vocabs(dataset, logger)
    lens = [len(sample['token_input_ids']) for sample in dataset]
    sampler = sampler_builder.build(lens, shuffle, gradient_accumulation) if sampler_builder else None
    return PadSequenceDataLoader(dataset, batch_size, shuffle, device=device, batch_sampler=sampler)
def compute_loss(self, criterion, pred, srl, mask):
    """Compute the training loss for one batch.

    Args:
        criterion: A callable loss without CRF; with CRF, an object whose
            ``forward(pred, gold, mask, reduction=...)`` result is negated.
        pred: Model scores.
        srl: Gold label ids.
        mask: A boolean mask without CRF; with CRF, a ``(token_index, mask)``
            pair.

    Returns:
        The loss value.
    """
    if not self.config.crf:
        return criterion(pred[mask], srl[mask])
    token_index, pair_mask = mask
    # Select the gold rows with the same index that produced ``pair_mask``.
    gold = srl.flatten(end_dim=1)[token_index]
    crf_score = criterion.forward(pred, gold, pair_mask, reduction=self.config.loss_reduction)
    return -crf_score
def initializer_1d(input_tensor, initializer):
    """Apply a matrix initializer to a vector.

    The vector is viewed as a single column so that an initializer expecting
    two dimensions can be used on it.

    Args:
        input_tensor: A 1-dimensional tensor.
        initializer: Callable taking the ``[n, 1]`` view and returning a tensor.

    Returns:
        The initializer's result flattened back to one dimension.
    """
    assert input_tensor.dim() == 1
    column = input_tensor.view(-1, 1)
    return initializer(column).view(-1)
def reset_dropout_layer(self, batch_size):
    """Resample the dropout masks of every layer for a new batch.

    Args:
        batch_size: Forwarded to each dropout layer's ``reset_dropout_mask``.
    """
    directions = [self.f_dropout]
    if self.bidirectional:
        directions.append(self.b_dropout)
    # Forward mask first, then backward mask, layer by layer.
    for depth in range(self.num_layers):
        for dropouts in directions:
            dropouts[depth].reset_dropout_mask(batch_size)
self.batch_first: input = input.transpose(0, 1) # transpose: return the transpose matrix masks = torch.unsqueeze(masks.transpose(0, 1), dim=2) max_time, batch_size, _ = input.size() self.reset_dropout_layer(batch_size) # reset the dropout each batch forward masks = masks.expand(-1, -1, self.hidden_size) # expand: -1 means not expand that dimension if initial is None: initial = Variable(input.data.new(batch_size, self.hidden_size).zero_()) initial = (initial, initial) # h0, c0 h_n, c_n = [], [] for layer in range(self.num_layers): # hidden_mask, hidden_drop = None, None hidden_mask, hidden_drop = self.f_dropout[layer], self.f_hidden_dropout[layer] layer_output, (layer_h_n, layer_c_n) = HighwayBiLSTM._forward_rnn(cell=self.fcells[layer], \ gate=None, input=input, masks=masks, initial=initial, \ drop_masks=hidden_mask, hidden_drop=hidden_drop) h_n.append(layer_h_n) c_n.append(layer_c_n) if self.bidirectional: hidden_mask, hidden_drop = self.b_dropout[layer], self.b_hidden_dropout[layer] blayer_output, (blayer_h_n, blayer_c_n) = HighwayBiLSTM._forward_brnn(cell=self.bcells[layer], \ gate=None, input=layer_output, masks=masks, initial=initial, \ drop_masks=hidden_mask, hidden_drop=hidden_drop) h_n.append(blayer_h_n) c_n.append(blayer_c_n) input = blayer_output if self.bidirectional else layer_output h_n, c_n = torch.stack(h_n, 0), torch.stack(c_n, 0) if self.batch_first: input = input.transpose(1, 0) # transpose: return the transpose matrix return input, (h_n, c_n) class StackedHighwayBiLSTM(nn.Module): """A module that runs multiple steps of HighwayBiLSTM.""" def __init__(self, input_size, hidden_size, num_layers=1, batch_first=False, \ bidirectional=False, dropout_in=0, dropout_out=0): super(StackedHighwayBiLSTM, self).__init__() self.input_size = input_size self.hidden_size = hidden_size self.num_layers = num_layers self.batch_first = batch_first self.bidirectional = bidirectional self.dropout_in = dropout_in self.dropout_out = dropout_out self.num_directions = 2 if 
def reset_parameters(self):
    """Xavier-initialize the learned initial states and the highway projections."""
    learned_states = [*self.f_initial, *self.b_initial]
    for state in learned_states:
        init.xavier_uniform_(state)
    for projection in self.lstm_project_layer:
        init.xavier_uniform_(projection.weight)
        # The bias is a vector, so it goes through the 1-d adapter.
        initializer_1d(projection.bias, init.xavier_uniform_)
def reset_state(self, batch_size):
    """Broadcast the learned initial states to the batch size.

    Args:
        batch_size: Number of rows each state is expanded to.

    Returns:
        ``(f_states, b_states)``; each is a list with one ``[row0, row1]``
        pair per layer, where the rows are the two rows of that layer's
        initial-state parameter expanded to ``[batch_size, -1]``.
    """

    def broadcast(initial):
        return [initial[row].expand(batch_size, -1) for row in (0, 1)]

    paired = list(zip(self.f_initial, self.b_initial))
    f_states = [broadcast(fwd) for fwd, _ in paired]
    b_states = [broadcast(bwd) for _, bwd in paired]
    return f_states, b_states
def decode_spans(span_starts, span_ends, span_scores, labels_inv):
    """Greedily label candidate spans, best-scoring candidates first.

    Args:
        span_starts: Start offsets, one per candidate.
        span_ends: End offsets, one per candidate.
        span_scores: ``[num_candidates, num_labels]`` scores.
        labels_inv: Mapping from label id to label name.

    Returns:
        A list of ``(start, end, label_name)``. Candidates whose best label is
        id 0 are dropped, and so is any candidate whose ``(start, end)`` has
        already been emitted.
    """
    best_labels = np.argmax(span_scores, axis=1)  # [num_candidates]
    candidates = zip(span_starts, span_ends, best_labels, span_scores)
    # Rank by the score of each candidate's own best label.
    ranked = sorted(candidates, key=lambda cand: cand[3][cand[2]], reverse=True)
    taken = set()
    decoded = []
    for start, end, label, _ in ranked:
        span = (start, end)
        if label == 0 or span in taken:
            continue
        taken.add(span)
        decoded.append((start, end, labels_inv[label]))
    return decoded
def get_predicted_clusters(top_span_starts, top_span_ends, predicted_antecedents):
    """Group spans into clusters by following predicted antecedent links.

    Args:
        top_span_starts: Start offset of every span.
        top_span_ends: End offset of every span.
        predicted_antecedents: For every span, the index of its antecedent
            span, or a negative value when it has none.

    Returns:
        ``(clusters, mention_to_cluster)`` where ``clusters`` is a list of
        tuples of ``(start, end)`` mentions and ``mention_to_cluster`` maps
        each clustered mention to its cluster tuple.
    """
    cluster_of = {}
    clusters = []

    def span_at(k):
        return int(top_span_starts[k]), int(top_span_ends[k])

    for idx, antecedent_idx in enumerate(predicted_antecedents):
        if antecedent_idx < 0:
            continue
        # An antecedent must be an earlier span.
        assert idx > antecedent_idx
        antecedent = span_at(antecedent_idx)
        cluster_id = cluster_of.get(antecedent)
        if cluster_id is None:
            # First time this antecedent is seen: it starts a new cluster.
            cluster_id = len(clusters)
            clusters.append([antecedent])
            cluster_of[antecedent] = cluster_id
        mention = span_at(idx)
        clusters[cluster_id].append(mention)
        cluster_of[mention] = cluster_id

    frozen = [tuple(members) for members in clusters]
    return frozen, {mention: frozen[cid] for mention, cid in cluster_of.items()}
def _dp_decode_non_overlapping_spans(starts, ends, scores, max_len, labels_inv, pred_id, u_constraint=False):
    """Select the best-scoring set of non-overlapping labeled spans by dynamic programming.

    Spans are visited in ``(start, end)`` order. A table ``f[t][rs]`` holds the
    best total score of a selection ending before position ``t`` with core-role
    state ``rs``; back-pointers allow the chosen spans to be recovered.

    Args:
        starts: Start offset of every candidate span.
        ends: End offset (inclusive) of every candidate span.
        scores: ``[num_arg, num_roles]`` array; column 0 must be 0 for every
            span (asserted below).
        max_len: Sizes the table, which has ``max_len + 1`` positions.
        labels_inv: Mapping from role id to role name.
        pred_id: Position of the predicate; spans covering it are skipped.
            ``None`` disables the check.
        u_constraint: When true, each role listed in ``_CORE_ARGS`` may be
            used at most once; used roles are tracked as a bitmask.

    Returns:
        A list of ``(start, end, role_name)`` in left-to-right order.
    """
    num_roles = scores.shape[1]  # [num_arg, num_roles]
    labels = np.argmax(scores, axis=1).astype(np.int64)
    spans = list(zip(starts, ends, list(range(len(starts)))))
    spans = sorted(spans, key=lambda x: (x[0], x[1]))  # sort by span start, then span end

    # Second axis indexes the core-role bitmask: 128 states with the uniqueness
    # constraint (seven distinct bits in _CORE_ARGS), a single state without it.
    if u_constraint:
        f = np.zeros([max_len + 1, 128], dtype=float) - 0.1
    else:
        f = np.zeros([max_len + 1, 1], dtype=float) - 0.1

    f[0, 0] = 0
    states = {0: set([0])}  # Maps a position to the set of core-role states reached there.
    pointers = {}  # Maps (position, state) to (arg_id, role, prev_position, prev_state).
    best_state = [(0, 0)]  # One-element list so that the closure below can rebind it.

    def _update_state(t0, rs0, t1, rs1, delta, arg_id, role):
        # Relax the transition (t0, rs0) -> (t1, rs1) when it improves the score.
        if f[t0][rs0] + delta > f[t1][rs1]:
            f[t1][rs1] = f[t0][rs0] + delta
            if t1 not in states:
                states[t1] = set()
            states[t1].update([rs1])
            pointers[(t1, rs1)] = (arg_id, role, t0, rs0)  # remember how this state was reached
            if f[t1][rs1] > f[best_state[0][0]][best_state[0][1]]:
                best_state[0] = (t1, rs1)

    for start, end, i in spans:  # [arg_start, arg_end, arg_span_id]
        assert scores[i][0] == 0  # dummy score
        # The extra dummy score should be the same for all states, so arguments that overlap
        # with the predicate can safely be skipped.
        if pred_id is not None and start <= pred_id and pred_id <= end:  # the span contains the predicate
            continue
        r0 = labels[i]  # Locally best role assignment.
        # Strictly better to incorporate a dummy span if it has the highest local score.
        if r0 == 0:  # labels_inv[r0] == "O"
            continue
        r0_str = labels_inv[r0]
        # Enumerate explored states. The key list is copied because _update_state adds
        # entries to ``states`` while this loop is running.
        t_states = [t for t in list(states.keys()) if t <= start]  # states ending at or before this span's start
        for t in t_states:
            role_states = states[t]
            # Update states if best role is not a core arg.
            if not u_constraint or r0_str not in _CORE_ARGS:
                for rs in role_states:
                    _update_state(t, rs, end + 1, rs, scores[i][r0], i, r0)
            else:
                # Best role is a core arg: try every positively scored role whose
                # core bit is not yet set in the current state.
                for rs in role_states:
                    for r in range(1, num_roles):
                        if scores[i][r] > 0:
                            r_str = labels_inv[r]
                            core_state = _CORE_ARGS.get(r_str, 0)
                            if core_state & rs == 0:
                                _update_state(t, rs, end + 1, rs | core_state, scores[i][r], i, r)
    # Backtrack to decode.
    new_spans = []
    t, rs = best_state[0]
    while (t, rs) in pointers:
        i, r, t0, rs0 = pointers[(t, rs)]
        new_spans.append((int(starts[i]), int(ends[i]), labels_inv[r]))
        t = t0
        rs = rs0
    # Spans were collected right to left; reverse them.
    return new_spans[::-1]
def get_tensor_np(t):
    """Return the data of tensor ``t`` as a NumPy array on the CPU.

    Args:
        t: A torch tensor.

    Returns:
        The NumPy array obtained from ``t.data`` after moving it to the CPU.
    """
    host_tensor = t.data.cpu()
    return host_tensor.numpy()
class DropoutLayer3D(nn.Module):
    """Dropout with a mask that is sampled explicitly and reused until reset.

    In training mode the input is multiplied by the stored Bernoulli mask; in
    evaluation mode it is scaled by the keep probability ``1 - dropout_rate``.
    """

    def __init__(self, input_size, dropout_rate=0.0):
        """
        Args:
            input_size: Size of the last dimension of the mask.
            dropout_rate: Probability of zeroing a mask entry.
        """
        super(DropoutLayer3D, self).__init__()
        self.dropout_rate = dropout_rate
        self.input_size = input_size
        self.drop_mask = self._sample_mask(self.input_size)

    def _sample_mask(self, *shape):
        # Draw a 0/1 mask whose entries are 1 with probability ``1 - dropout_rate``.
        keep_prob = torch.FloatTensor(*shape).fill_(1 - self.dropout_rate)
        mask = torch.bernoulli(keep_prob)
        if torch.cuda.is_available():
            mask = mask.cuda()
        return mask

    def reset_dropout_mask(self, batch_size, length):
        """Sample a fresh ``[batch_size, length, input_size]`` mask."""
        self.drop_mask = self._sample_mask(batch_size, length, self.input_size)

    def forward(self, x):
        if self.training:
            # The mask is created on CUDA whenever CUDA is available, which may
            # not be where ``x`` lives; align it once and keep the moved copy.
            if self.drop_mask.device != x.device:
                self.drop_mask = self.drop_mask.to(x.device)
            return torch.mul(x, self.drop_mask)
        else:  # eval
            return x * (1.0 - self.dropout_rate)
class Biaffine(nn.Module):
    """Biaffine scorer between every position of ``input1`` and every position of ``input2``.

    A single bias-free linear layer maps ``input1`` to ``out_features`` vectors of the size of
    ``input2``'s features; these are then dotted with ``input2``. Optional constant-one columns
    appended to either input play the role of bias terms.

    Args:
        in1_features: Feature size of ``input1``.
        in2_features: Feature size of ``input2``.
        out_features: Number of scores produced per (position1, position2) pair.
        bias: Pair of flags; ``bias[0]`` appends a ones column to ``input1`` and ``bias[1]``
            appends one to ``input2``.
    """

    def __init__(self, in1_features, in2_features, out_features, bias=(True, True)):
        super(Biaffine, self).__init__()
        self.in1_features = in1_features
        self.in2_features = in2_features
        self.out_features = out_features
        self.bias = bias
        self.linear_input_size = in1_features + int(bias[0])
        self.linear_output_size = out_features * (in2_features + int(bias[1]))
        self.linear = nn.Linear(in_features=self.linear_input_size,
                                out_features=self.linear_output_size,
                                bias=False)
        self.reset_parameters()

    def reset_parameters(self):
        """Initialise the projection weight with Xavier-uniform values."""
        torch.nn.init.xavier_uniform_(self.linear.weight)

    def forward(self, input1, input2):
        """Score all position pairs.

        Args:
            input1: Tensor of shape ``[batch_size, len1, in1_features]``.
            input2: Tensor of shape ``[batch_size, len2, in2_features]``.

        Returns:
            Tensor of shape ``[batch_size, len2, len1, out_features]``.
        """
        batch_size, len1, _ = input1.size()
        batch_size, len2, dim2 = input2.size()
        if self.bias[0]:
            # ``new_ones`` keeps the dtype and device of the input.
            input1 = torch.cat((input1, input1.new_ones(batch_size, len1, 1)), dim=2)
        if self.bias[1]:
            input2 = torch.cat((input2, input2.new_ones(batch_size, len2, 1)), dim=2)
            dim2 += 1

        # [batch, len1, out * dim2] -> [batch, len1 * out, dim2]
        affine = self.linear(input1)
        affine = affine.view(batch_size, len1 * self.out_features, dim2)
        # [batch, dim2, len2]
        input2 = torch.transpose(input2, 1, 2)
        # bmm gives [batch, len1 * out, len2]; transpose to [batch, len2, len1 * out]
        biaffine = torch.transpose(torch.bmm(affine, input2), 1, 2)
        # The transpose makes the tensor non-contiguous, hence ``contiguous()`` before ``view``.
        return biaffine.contiguous().view(batch_size, len2, len1, self.out_features)

    def __repr__(self):
        return (f'{self.__class__.__name__} ('
                f'in1_features={self.in1_features}'
                f', in2_features={self.in2_features}'
                f', out_features={self.out_features})')
class VariationalLSTM(nn.Module):
    """A multi-layer, optionally bidirectional LSTM with per-sequence dropout masks.

    During training one input mask and one hidden-state mask are sampled per layer and reused
    at every time step (and by both directions) of that layer.

    Args:
        input_size: Feature size of the inputs.
        hidden_size: Hidden size of each direction.
        num_layers: Number of stacked layers.
        batch_first: If ``True``, inputs/outputs are ``[batch, time, feature]`` and ``masks``
            is ``[batch, time]``; otherwise inputs are ``[time, batch, feature]``.
        bidirectional: Whether to run an additional right-to-left cell per layer.
        dropout_in: Drop probability applied to each layer's input.
        dropout_out: Drop probability applied to the recurrent hidden state.
    """

    def __init__(self, input_size, hidden_size, num_layers=1, batch_first=False,
                 bidirectional=False, dropout_in=0, dropout_out=0):
        super(VariationalLSTM, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.batch_first = batch_first
        self.bidirectional = bidirectional
        self.dropout_in = dropout_in
        self.dropout_out = dropout_out
        self.num_directions = 2 if bidirectional else 1

        # The cells are kept in plain lists (not ``nn.ModuleList``); their parameters are
        # registered on this module under flat ``weight_ih_l{layer}{suffix}``-style names
        # below, so those names are what ``named_parameters()`` reports.
        self.fcells = []
        self.bcells = []
        for layer in range(num_layers):
            layer_input_size = input_size if layer == 0 else hidden_size * self.num_directions
            self.fcells.append(nn.LSTMCell(input_size=layer_input_size, hidden_size=hidden_size))
            if self.bidirectional:
                self.bcells.append(nn.LSTMCell(input_size=layer_input_size, hidden_size=hidden_size))

        self._all_weights = []
        for layer in range(num_layers):
            self._all_weights.append(self._expose_cell_parameters(self.fcells[layer], layer, ''))
            if self.bidirectional:
                self._all_weights.append(self._expose_cell_parameters(self.bcells[layer], layer, '_reverse'))
        self.reset_parameters()

    def _expose_cell_parameters(self, cell, layer, suffix):
        """Register the four parameters of ``cell`` on this module and return their names."""
        patterns = ('weight_ih_l{}{}', 'weight_hh_l{}{}', 'bias_ih_l{}{}', 'bias_hh_l{}{}')
        names = [pattern.format(layer, suffix) for pattern in patterns]
        params = (cell.weight_ih, cell.weight_hh, cell.bias_ih, cell.bias_hh)
        for name, param in zip(names, params):
            setattr(self, name, param)
        return names

    def reset_parameters(self):
        """Orthogonally initialise weights and draw biases from ``N(0, 0.01)``."""
        for name, param in self.named_parameters():
            if "weight" in name:
                nn.init.orthogonal_(param)
            if "bias" in name:
                nn.init.normal_(param, 0.0, 0.01)

    @staticmethod
    def _scan(cell, input, masks, initial, drop_masks, reverse):
        """Run ``cell`` over the time axis of ``input`` in the requested direction.

        At masked-out positions (mask value 0) the state is replaced by ``initial``. The
        collected outputs are the states *before* ``drop_masks`` is applied; the dropped-out
        hidden state is only what gets fed to the next step (and returned as the final state).
        """
        max_time = input.size(0)
        steps = range(max_time - 1, -1, -1) if reverse else range(max_time)
        output = []
        hx = initial
        for time in steps:
            h_next, c_next = cell(input=input[time], hx=hx)
            h_next = h_next * masks[time] + initial[0] * (1 - masks[time])
            c_next = c_next * masks[time] + initial[1] * (1 - masks[time])
            output.append(h_next)
            if drop_masks is not None:
                h_next = h_next * drop_masks
            hx = (h_next, c_next)
        if reverse:
            # Outputs were produced last-to-first; restore chronological order.
            output.reverse()
        return torch.stack(output, 0), hx

    @staticmethod
    def _forward_rnn(cell, input, masks, initial, drop_masks):
        """Left-to-right pass; returns ``(outputs [time, batch, hidden], (h_n, c_n))``."""
        return VariationalLSTM._scan(cell, input, masks, initial, drop_masks, reverse=False)

    @staticmethod
    def _forward_brnn(cell, input, masks, initial, drop_masks):
        """Right-to-left pass; outputs are returned in chronological order."""
        return VariationalLSTM._scan(cell, input, masks, initial, drop_masks, reverse=True)

    def forward(self, input, masks, initial=None):
        """Encode a batch of sequences.

        Args:
            input: ``[batch, time, feature]`` if ``batch_first`` else ``[time, batch, feature]``.
            masks: Numeric 0/1 mask. With ``batch_first`` it is ``[batch, time]``; otherwise it
                is used as given and must be expandable to ``[time, batch, hidden_size]``
                (e.g. ``[time, batch, 1]``).
            initial: Optional ``(h0, c0)`` pair, each ``[batch, hidden_size]``; zeros by default.

        Returns:
            ``(output, (h_n, c_n))`` where ``output`` has the same layout as ``input`` with
            ``hidden_size * num_directions`` features, and ``h_n``/``c_n`` are
            ``[num_layers, batch, hidden_size * num_directions]``.
        """
        if self.batch_first:
            input = input.transpose(0, 1)
            masks = torch.unsqueeze(masks.transpose(0, 1), dim=2)
        max_time, batch_size, _ = input.size()
        masks = masks.expand(-1, -1, self.hidden_size)  # -1 keeps that dimension unchanged

        if initial is None:
            zeros = input.new_zeros(batch_size, self.hidden_size)
            initial = (zeros, zeros)  # h0, c0
        h_n = []
        c_n = []

        for layer in range(self.num_layers):
            max_time, batch_size, input_size = input.size()
            hidden_mask = None
            if self.training:
                # One (inverted-dropout) mask per sequence, broadcast over the time axis.
                input_mask = torch.bernoulli(input.new_full((batch_size, input_size), 1 - self.dropout_in))
                input_mask = input_mask / (1 - self.dropout_in)
                input = input * input_mask.unsqueeze(0)

                hidden_mask = torch.bernoulli(input.new_full((batch_size, self.hidden_size), 1 - self.dropout_out))
                hidden_mask = hidden_mask / (1 - self.dropout_out)

            layer_output, (layer_h_n, layer_c_n) = VariationalLSTM._forward_rnn(
                cell=self.fcells[layer], input=input, masks=masks, initial=initial, drop_masks=hidden_mask)
            if self.bidirectional:
                blayer_output, (blayer_h_n, blayer_c_n) = VariationalLSTM._forward_brnn(
                    cell=self.bcells[layer], input=input, masks=masks, initial=initial, drop_masks=hidden_mask)
                layer_h_n = torch.cat([layer_h_n, blayer_h_n], 1)
                layer_c_n = torch.cat([layer_c_n, blayer_c_n], 1)
                layer_output = torch.cat([layer_output, blayer_output], 2)

            h_n.append(layer_h_n)
            c_n.append(layer_c_n)
            input = layer_output

        h_n = torch.stack(h_n, 0)
        c_n = torch.stack(c_n, 0)
        if self.batch_first:
            input = input.transpose(1, 0)
        return input, (h_n, c_n)
hanlp/components/srl/span_rank/span_rank.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2020-07-09 18:13 import logging from bisect import bisect from typing import Union, List, Callable, Tuple, Dict, Any from hanlp_common.constant import IDX from hanlp.layers.transformers.utils import build_optimizer_scheduler_with_transformer import torch from torch.utils.data import DataLoader from hanlp.common.dataset import PadSequenceDataLoader, SortingSampler from hanlp.common.torch_component import TorchComponent from hanlp.common.transform import FieldLength from hanlp.common.vocab import Vocab from hanlp.components.srl.span_rank.inference_utils import srl_decode from hanlp.components.srl.span_rank.span_ranking_srl_model import SpanRankingSRLModel from hanlp.components.srl.span_rank.srl_eval_utils import compute_srl_f1 from hanlp.datasets.srl.loaders.conll2012 import CoNLL2012SRLDataset, filter_v_args, unpack_srl, \ group_pa_by_p from hanlp.layers.embeddings.embedding import Embedding from hanlp.metrics.f1 import F1 from hanlp_common.visualization import markdown_table from hanlp.utils.time_util import CountdownTimer from hanlp_common.util import merge_locals_kwargs, reorder class SpanRankingSemanticRoleLabeler(TorchComponent): def __init__(self, **kwargs) -> None: """An implementation of "Jointly Predicting Predicates and Arguments in Neural Semantic Role Labeling" (:cite:`he-etal-2018-jointly`). It generates candidates triples of (predicate, arg_start, arg_end) and rank them. Args: **kwargs: Predefined config. 
def execute_training_loop(self, trn: DataLoader, dev: DataLoader, epochs, criterion, optimizer, metric, save_dir,
                          logger: logging.Logger, devices, **kwargs):
    """Train for ``epochs`` epochs, saving the weights whenever the end-to-end score improves.

    Args:
        trn: Training dataloader.
        dev: Development dataloader; evaluated after every epoch when truthy.
        epochs: Number of epochs to run.
        criterion: Passed through to ``fit_dataloader`` / ``evaluate_dataloader``.
        optimizer: An ``(optimizer, scheduler)`` pair as returned by ``build_optimizer``.
        metric: A ``(predicate_f1, end_to_end_f1)`` pair as returned by ``build_metric``.
        save_dir: Where ``save_weights`` stores the best weights.
        logger: Logger for progress messages.
        devices: Unused here.
        **kwargs: Unused here.
    """
    best_metric = -1
    _, end_to_end = metric
    optimizer, scheduler = optimizer
    timer = CountdownTimer(epochs)
    ratio_width = len(f'{len(trn)}/{len(trn)}')
    # ``build_dataloader`` shrinks each batch by this factor and ``build_optimizer`` divides the
    # number of scheduler steps by it, so the training pass must accumulate by the same factor;
    # otherwise the optimizer and scheduler would step on every (smaller) micro-batch.
    gradient_accumulation = self.config.get('gradient_accumulation', 1) or 1
    for epoch in range(1, epochs + 1):
        logger.info(f"[yellow]Epoch {epoch} / {epochs}:[/yellow]")
        self.fit_dataloader(trn, criterion, optimizer, metric, logger,
                            linear_scheduler=scheduler if self._get_transformer() else None,
                            gradient_accumulation=gradient_accumulation)
        if dev:
            self.evaluate_dataloader(dev, criterion, metric, logger, ratio_width=ratio_width)
        report = f'{timer.elapsed_human}/{timer.total_time_human}'
        # Without a dev set this is the score accumulated during the training pass.
        dev_score = end_to_end.score
        if not self._get_transformer():
            # The non-transformer scheduler is ReduceLROnPlateau, stepped once per epoch on the score.
            scheduler.step(dev_score)
        if dev_score > best_metric:
            self.save_weights(save_dir)
            best_metric = dev_score
            report += ' [red]saved[/red]'
        timer.log(report, ratio_percentage=False, newline=True, ratio=False)
def build_model(self, training=True, **kwargs) -> torch.nn.Module:
    """Instantiate the span-ranking SRL network from the current config and vocabularies.

    Args:
        training: Forwarded to the embedding builder.
        **kwargs: Unused.

    Returns:
        A freshly constructed ``SpanRankingSRLModel``.
    """
    config = self.config
    # noinspection PyTypeChecker
    embed = config.embed.module(vocabs=self.vocabs, training=training)
    context_layer = config.context_layer
    num_labels = len(self.vocabs.srl_label)
    return SpanRankingSRLModel(config, embed, context_layer, num_labels)
@staticmethod
def format_dict_to_results(data, outputs, exclusive_offset=False, with_predicate=False, with_argument=False,
                           label_first=False):
    """Convert per-sentence ``{predicate: [(start, end, label), ...]}`` dicts into nested lists.

    The input ``outputs`` (and the lists inside it) are never modified.

    Args:
        data: Tokenized sentences, aligned with ``outputs``.
        outputs: One dict per sentence mapping a predicate index to its argument tuples
            ``(start, end, label)``, assumed sorted by ``start``.
        exclusive_offset: If true, ``end`` becomes exclusive (``end + 1``).
        with_predicate: If true, a ``(p, p, 'PRED')`` entry is inserted in start order.
        with_argument: Anything but ``False`` appends the covered tokens to each tuple; a
            ``str`` value is used as the separator to join them into one string.
        label_first: If true, everything after the offsets is reversed and moved before them.

    Returns:
        For every sentence, a list with one list of tuples per predicate.
    """
    results = []
    for i, predicate_arguments in enumerate(outputs):
        tokens = data[i]
        output = []
        for p, a in predicate_arguments.items():
            # a: [(0, 0, 'ARG0')]
            # Work on a copy so the caller's predictions are left untouched; inserting into
            # the original list would add another 'PRED' entry on every call.
            a = list(a)
            if with_predicate:
                a.insert(bisect([x[0] for x in a], p), (p, p, 'PRED'))
            if with_argument is not False:
                a = [x + (tokens[x[0]:x[1] + 1],) for x in a]
                if isinstance(with_argument, str):
                    a = [x[:-1] + (with_argument.join(x[-1]),) for x in a]
            if exclusive_offset:
                a = [(x[0], x[1] + 1) + x[2:] for x in a]
            if label_first:
                a = [tuple(reversed(x[2:])) + x[:2] for x in a]
            output.append(a)
        results.append(output)
    return results
def decode_output(self, output_dict, batch, training=False):
    """Turn raw model outputs into one ``{key: [(start, end, label), ...]}`` dict per sentence.

    Args:
        output_dict: Model outputs; the training path reads ``predicates``, ``arg_starts``,
            ``arg_ends``, ``srl_mask`` and ``srl_scores``.
        batch: The input batch; only ``batch['token_length']`` is read, on the non-training path.
        training: Selects the cheap arg-max decoding below instead of ``srl_decode``.

    Returns:
        A list of dicts, one per sentence.
    """
    idx_to_label = self.vocabs['srl_label'].idx_to_token
    if not training:
        return srl_decode(batch['token_length'], output_dict, idx_to_label, self.config)

    # Fast decoding during training: take the arg-max label of every scored cell.
    predicates_per_sentence = output_dict['predicates'].tolist()
    spans_per_sentence = torch.stack([output_dict['arg_starts'], output_dict['arg_ends']], dim=-1).tolist()
    srl_mask = output_dict['srl_mask'].tolist()
    srl_scores = output_dict['srl_scores']
    best_labels = srl_scores.argmax(-1).tolist() if srl_scores.numel() else []

    prediction = []
    for n, (label_grid, predicate_indices, argument_spans) in enumerate(
            zip(best_labels, predicates_per_sentence, spans_per_sentence)):
        srl_per_sentence = {}
        # NOTE(review): results are keyed by the enumeration position ``p`` and the zipped
        # predicate index itself is never used - confirm this is intended.
        for p, (label_row, _predicate_index) in enumerate(zip(label_grid, predicate_indices)):
            for a, (label, span) in enumerate(zip(label_row, argument_spans)):
                # Label 0 means "no relation"; masked-out cells are ignored as well.
                if label and srl_mask[n][p][a]:
                    srl_per_sentence.setdefault(p, []).append((*span, idx_to_label[label]))
        prediction.append(srl_per_sentence)
    return prediction
input_tensor.view(-1) class SpanRankingSRLDecoder(nn.Module): def __init__(self, context_layer_output_dim, label_space_size, config) -> None: super().__init__() self.config = config self.label_space_size = label_space_size self.dropout = float(config.dropout) self.use_gold_predicates = config.use_gold_predicates # span width feature embedding self.span_width_embedding = nn.Embedding(self.config.max_arg_width, self.config.span_width_feature_size) # self.context_projective_layer = nn.Linear(2 * self.lstm_hidden_size, self.config.num_attention_heads) # span scores self.span_emb_size = 3 * context_layer_output_dim + self.config.span_width_feature_size self.arg_unary_score_layers = nn.ModuleList([nn.Linear(self.span_emb_size, self.config.ffnn_size) if i == 0 else nn.Linear(self.config.ffnn_size, self.config.ffnn_size) for i in range(self.config.ffnn_depth)]) # [,150] self.arg_dropout_layers = nn.ModuleList([nn.Dropout(self.dropout) for _ in range(self.config.ffnn_depth)]) self.arg_unary_score_projection = nn.Linear(self.config.ffnn_size, 1) # predicate scores self.pred_unary_score_layers = nn.ModuleList( [nn.Linear(context_layer_output_dim, self.config.ffnn_size) if i == 0 else nn.Linear(self.config.ffnn_size, self.config.ffnn_size) for i in range(self.config.ffnn_depth)]) # [,150] self.pred_dropout_layers = nn.ModuleList([nn.Dropout(self.dropout) for _ in range(self.config.ffnn_depth)]) self.pred_unary_score_projection = nn.Linear(self.config.ffnn_size, 1) # srl scores self.srl_unary_score_input_size = self.span_emb_size + context_layer_output_dim self.srl_unary_score_layers = nn.ModuleList([nn.Linear(self.srl_unary_score_input_size, self.config.ffnn_size) if i == 0 else nn.Linear(self.config.ffnn_size, self.config.ffnn_size) for i in range(self.config.ffnn_depth)]) self.srl_dropout_layers = nn.ModuleList([nn.Dropout(self.dropout) for _ in range(self.config.ffnn_depth)]) self.srl_unary_score_projection = nn.Linear(self.config.ffnn_size, self.label_space_size - 1) if 
def reset_parameters(self):
    """Re-initialise the span-width embedding and all three scorer stacks with Xavier-uniform values.

    For each of the argument, predicate and SRL scorers, the hidden layers are initialised
    first and the final projection last; biases go through ``initializer_1d``.
    """
    init.xavier_uniform_(self.span_width_embedding.weight)
    scorers = (
        (self.arg_unary_score_layers, self.arg_unary_score_projection),
        (self.pred_unary_score_layers, self.pred_unary_score_projection),
        (self.srl_unary_score_layers, self.srl_unary_score_projection),
    )
    for hidden_layers, projection in scorers:
        for linear in (*hidden_layers, projection):
            init.xavier_uniform_(linear.weight)
            initializer_1d(linear.bias, init.xavier_uniform_)
    return None
@staticmethod
def exclusive_cumsum(input: torch.Tensor, exclusive=True):
    """Cumulative sum of a 1-D tensor (e.g. sentence lengths), returned as a column.

    Args:
        input: 1-D tensor of values to accumulate.
        exclusive: If true (default), element ``i`` of the result is the sum of
            ``input[:i]`` (so the first entry is 0 and the last input value is excluded).
            If false, element ``i`` is the sum of ``input[:i + 1]``.

    Returns:
        Tensor of shape ``[len(input), 1]``.
    """
    if exclusive:
        # Shift right by one: prepend a zero and drop the last value before accumulating.
        leading_zero = input.new_zeros(1, dtype=torch.long)
        shifted = torch.cat([leading_zero, input], 0)[:-1]
        return torch.cumsum(shifted, 0).view(-1, 1)
    return torch.cumsum(input, 0).view(-1, 1)
span_width = 1 + flatted_candidate_ends - flatted_candidate_starts # [num_spans], generate the span width max_arg_width = config.max_arg_width # get the span width feature emb span_width_index = span_width - 1 span_width_emb = self.span_width_embedding(span_width_index) span_width_emb = F.dropout(span_width_emb, dropout, self.training) span_emb_feature_list.append(span_width_emb) """head features""" cpu_flatted_candidte_starts = flatted_candidate_starts span_indices = torch.arange(0, max_arg_width, device=flatted_context_emb.device).view(1, -1) + \ cpu_flatted_candidte_starts.view(-1, 1) # For all the i, where i in [begin, ..i, end] for span # reset the position index to the batch_word_num index with index - 1 span_indices = torch.clamp(span_indices, max=batch_word_num - 1) num_spans, spans_width = span_indices.size()[0], span_indices.size()[1] flatted_span_indices = span_indices.view(-1) # so Huge!!!, column is the span? # if torch.cuda.is_available(): flatted_span_indices = flatted_span_indices span_text_emb = flatted_context_emb.index_select(0, flatted_span_indices).view(num_spans, spans_width, -1) span_indices_mask = hanlp.utils.torch_util.lengths_to_mask(span_width, max_len=max_arg_width) # project context output to num head # head_scores = self.context_projective_layer.forward(flatted_context_emb) # get span attention # span_attention = head_scores.index_select(0, flatted_span_indices).view(num_spans, spans_width) # span_attention = torch.add(span_attention, expanded_span_indices_log_mask).unsqueeze(2) # control the span len # span_attention = F.softmax(span_attention, dim=1) span_text_emb = span_text_emb * span_indices_mask.unsqueeze(2).expand(-1, -1, span_text_emb.size()[-1]) span_head_emb = torch.mean(span_text_emb, 1) span_emb_feature_list.append(span_head_emb) span_emb = torch.cat(span_emb_feature_list, 1) return span_emb, None, span_text_emb, span_indices, span_indices_mask def get_arg_unary_scores(self, span_emb): """Compute span score with FFNN(span 
def extract_spans(self, candidate_scores, candidate_starts, candidate_ends, topk, max_sentence_length,
                  sort_spans, enforce_non_crossing):
    """Select the indices of the highest-scoring candidates of every sentence.

    Args:
        candidate_scores: Scores of shape ``[num_sentences, num_candidates]``.
        candidate_starts: Unused; kept for interface compatibility.
        candidate_ends: Unused; kept for interface compatibility.
        topk: 1-D integer tensor with the number of candidates to keep per sentence.
        max_sentence_length: Unused; kept for interface compatibility.
        sort_spans: Unused; kept for interface compatibility.
        enforce_non_crossing: Unused; kept for interface compatibility.

    Returns:
        Long tensor of shape ``[num_sentences, max(topk)]`` holding candidate indices in
        descending score order. Rows that keep fewer than ``max(topk)`` candidates are padded
        by repeating their last selected index.
    """
    budgets = topk.tolist()  # plain ints, so ``topk`` and the padding below never see 0-d tensors
    width = max(budgets)
    rows = []
    for scores, k in zip(candidate_scores, budgets):
        selected = scores.topk(k)[1]
        missing = width - selected.size(0)
        if missing:
            # Repeat the last kept index; fall back to index 0 if nothing was kept at all.
            filler = selected[-1:].expand(missing) if selected.numel() else selected.new_zeros(missing)
            selected = torch.cat([selected, filler])
        rows.append(selected)
    return torch.stack(rows)
def get_dense_span_labels(self, span_starts, span_ends, span_labels, max_sentence_length, span_parents=None):
    """Scatter sparse span labels into a dense lookup table.

    Args:
        span_starts: ``[num_sentences, max_spans_num]`` start indices. Not modified.
        span_ends: ``[num_sentences, max_spans_num]`` end indices.
        span_labels: ``[num_sentences, max_spans_num]`` label ids; 0 marks an empty slot.
        max_sentence_length: Size of every index dimension of the result.
        span_parents: Optional ``[num_sentences, max_spans_num]`` predicate indices, adding a
            fourth dimension to the result.

    Returns:
        Dense tensor of shape ``[num_sentences, max_len, max_len]`` (or with one more
        ``max_len`` dimension when ``span_parents`` is given) holding the label at
        ``[sentence, start, end(, parent)]`` and 0 elsewhere. Entries sharing the same index
        are summed by the sparse-to-dense conversion.
    """
    num_sentences = span_starts.size()[0]
    max_spans_num = span_starts.size()[1]

    # Work on a copy: the shift below must not leak into the caller's tensor.
    span_starts = span_starts.clone()
    # Shift the start of label-0 slots by one (where it stays in range) - "make start > end".
    span_starts[(span_labels == 0) & (span_starts < max_sentence_length - 1)] += 1

    sentence_indices = torch.arange(0, num_sentences, device=span_starts.device) \
        .unsqueeze(1).expand(-1, max_spans_num)
    index_columns = [sentence_indices, span_starts, span_ends]
    if span_parents is not None:
        index_columns.append(span_parents)  # semantic span predicate offset
    rank = len(index_columns)
    sparse_indices = torch.stack(index_columns, dim=2).view(num_sentences * max_spans_num, rank).t()

    # ``sparse_coo_tensor`` replaces the deprecated ``torch.sparse.LongTensor`` constructor.
    dense_labels = torch.sparse_coo_tensor(sparse_indices,
                                           span_labels.reshape(-1),
                                           torch.Size([num_sentences] + [max_sentence_length] * (rank - 1))) \
        .to_dense()
    return dense_labels
def get_srl_unary_scores(self, span_emb):
    """Run embeddings through the SRL feed-forward stack and its output projection.

    Every layer of ``self.srl_unary_score_layers`` is applied in order, each followed by a ReLU and
    by the dropout module stored at the same position of ``self.srl_dropout_layers``. The result is
    finally passed through ``self.srl_unary_score_projection``.

    Args:
        span_emb: Embeddings fed to the first feed-forward layer.

    Returns:
        The output of the final projection.
    """
    hidden = span_emb
    for layer_idx, feed_forward in enumerate(self.srl_unary_score_layers):
        activated = F.relu(feed_forward.forward(hidden))
        hidden = self.srl_dropout_layers[layer_idx].forward(activated)
    return self.srl_unary_score_projection.forward(hidden)
def decode(self, contextualized_embeddings, sent_lengths, masks, gold_arg_starts, gold_arg_ends, gold_arg_labels,
           gold_predicates):
    """Score argument/predicate pairs of a batch and, when gold labels are given, compute the loss.

    The method enumerates candidate argument spans, keeps the best-scoring arguments and predicates
    of every sentence, scores every kept (argument, predicate) pair and gathers all intermediate
    results into one dictionary.

    Args:
        contextualized_embeddings: Token representations of the batch. They are flattened with
            ``masks`` to build span embeddings and used directly as predicate candidates.
        sent_lengths: Per-sentence lengths; also determines the device of newly created tensors.
        masks: 2-D mask whose size is unpacked as ``(num_sentences, max_sent_length)``.
        gold_arg_starts: Gold argument start offsets, only read when ``gold_arg_labels`` is given.
        gold_arg_ends: Gold argument end offsets, only read when ``gold_arg_labels`` is given.
        gold_arg_labels: Gold label ids, or ``None`` to skip the loss computation.
        gold_predicates: Gold predicate offsets; used as the predicates themselves when
            ``self.use_gold_predicates`` is ``True``.

    Returns:
        A dict with the keys ``candidate_starts``, ``candidate_ends``, ``head_scores``, ``srl_mask``,
        ``candidate_arg_scores``, ``candidate_pred_scores``, ``predicates``, ``arg_starts``,
        ``arg_ends``, ``arg_scores``, ``pred_scores``, ``num_args``, ``num_preds`` and ``srl_scores``,
        plus ``loss`` when ``gold_arg_labels`` is not ``None``.
    """
    num_sentences, max_sent_length = masks.size()
    device = sent_lengths.device
    """generate candidate spans with argument pruning"""
    # candidate_starts [num_sentences, max_sent_length * max_arg_width]
    candidate_starts, candidate_ends, candidate_mask = self.get_candidate_spans(
        sent_lengths, max_sent_length, self.config.max_arg_width)
    flatted_candidate_mask = candidate_mask.view(-1)
    batch_word_offset = self.exclusive_cumsum(sent_lengths)  # get the word offset in a batch
    # choose the flatted_candidate_starts with the actual existing positions, i.e. exclude the illegal starts
    flatted_candidate_starts = candidate_starts + batch_word_offset
    flatted_candidate_starts = flatted_candidate_starts.view(-1)[flatted_candidate_mask].to(torch.long)
    flatted_candidate_ends = candidate_ends + batch_word_offset
    flatted_candidate_ends = flatted_candidate_ends.view(-1)[flatted_candidate_mask].to(torch.long)
    # flatten the lstm output according to the sentence mask, i.e. exclude the illegal (padding) lstm output
    flatted_context_output = self.flatten_emb_in_sentence(contextualized_embeddings, masks)
    """generate the span embedding"""
    candidate_span_emb, head_scores, span_head_emb, head_indices, head_indices_log_mask = self.get_span_emb(
        flatted_context_output, flatted_candidate_starts, flatted_candidate_ends,
        self.config, dropout=self.dropout)
    """Get the span ids"""
    # Map every valid (sentence, candidate slot) back to its row in the flat candidate_span_emb;
    # slots that are masked out keep the dense default of 0.
    candidate_span_number = candidate_span_emb.size()[0]
    max_candidate_spans_num_per_sentence = candidate_mask.size()[1]
    sparse_indices = candidate_mask.nonzero(as_tuple=False)
    sparse_values = torch.arange(0, candidate_span_number, device=device)
    candidate_span_ids = torch.sparse.FloatTensor(sparse_indices.t(), sparse_values,
                                                  torch.Size([num_sentences,
                                                              max_candidate_spans_num_per_sentence])).to_dense()
    # The log of the mask is added to scores below so masked candidates cannot win the top-k
    # (assumes a 0/1 mask, giving 0 for valid entries and -inf otherwise).
    spans_log_mask = torch.log(candidate_mask.to(torch.float))
    predict_dict = {"candidate_starts": candidate_starts, "candidate_ends": candidate_ends,
                    "head_scores": head_scores}
    """Get unary scores and topk of candidate argument spans."""
    flatted_candidate_arg_scores = self.get_arg_unary_scores(candidate_span_emb)
    candidate_arg_scores = flatted_candidate_arg_scores.index_select(0, candidate_span_ids.view(-1)) \
        .view(candidate_span_ids.size()[0], candidate_span_ids.size()[1])
    candidate_arg_scores = candidate_arg_scores + spans_log_mask
    arg_starts, arg_ends, arg_scores, num_args, top_arg_indices = \
        self.get_batch_topk(candidate_starts, candidate_ends, candidate_arg_scores,
                            self.config.argument_ratio, sent_lengths, max_sent_length,
                            sort_spans=False, enforce_non_crossing=False)
    """Get the candidate predicate"""
    # Every token position is a predicate candidate.
    candidate_pred_ids = torch.arange(0, max_sent_length, device=device).unsqueeze(0).expand(num_sentences, -1)
    candidate_pred_emb = contextualized_embeddings
    candidate_pred_scores = self.get_pred_unary_scores(candidate_pred_emb)
    candidate_pred_scores = candidate_pred_scores + torch.log(masks.to(torch.float).unsqueeze(2))
    candidate_pred_scores = candidate_pred_scores.squeeze(2)
    if self.use_gold_predicates is True:
        # NOTE(review): this branch reads gold_arg_labels unconditionally, so it presumably
        # requires gold annotations to be present - confirm against the callers.
        predicates = gold_predicates
        num_preds = (gold_arg_labels > 0).sum(dim=-1)
        pred_scores = torch.zeros_like(predicates)
        top_pred_indices = predicates
    else:
        predicates, _, pred_scores, num_preds, top_pred_indices = self.get_batch_topk(
            candidate_pred_ids, candidate_pred_ids, candidate_pred_scores, self.config.predicate_ratio,
            sent_lengths, max_sent_length,
            sort_spans=False, enforce_non_crossing=False)
    """Get top arg embeddings"""
    arg_span_indices = torch.gather(candidate_span_ids, 1, top_arg_indices)  # [num_sentences, max_num_args]
    arg_emb = candidate_span_emb.index_select(0, arg_span_indices.view(-1)).view(
        arg_span_indices.size()[0], arg_span_indices.size()[1], -1
    )  # [num_sentences, max_num_args, emb]
    """Get top predicate embeddings"""
    pred_emb = self.batch_index_select(candidate_pred_emb,
                                       top_pred_indices)  # [num_sentences, max_num_preds, emb]
    """Get the srl scores according to the arg emb and pre emb."""
    srl_scores = self.get_srl_scores(arg_emb, pred_emb, arg_scores, pred_scores, self.label_space_size, self.config,
                                     self.dropout)  # [num_sentences, max_num_args, max_num_preds, num_labels]
    if gold_arg_labels is not None:
        """Get the answers according to the labels"""
        srl_labels = self.get_srl_labels(arg_starts, arg_ends, predicates, gold_predicates, gold_arg_starts,
                                         gold_arg_ends, gold_arg_labels, max_sent_length)
        """Compute the srl loss"""
        srl_loss, srl_mask = self.get_srl_softmax_loss(srl_scores, srl_labels, num_args, num_preds)
        predict_dict.update({
            'srl_mask': srl_mask,
            'loss': srl_loss
        })
    else:
        # Without gold labels only the mask of valid (argument, predicate) pairs is reported.
        predict_dict['srl_mask'] = self.get_srl_loss_mask(srl_scores, num_args, num_preds)
    predict_dict.update({
        "candidate_arg_scores": candidate_arg_scores,
        "candidate_pred_scores": candidate_pred_scores,
        "predicates": predicates,
        "arg_starts": arg_starts,
        "arg_ends": arg_ends,
        "arg_scores": arg_scores,
        "pred_scores": pred_scores,
        "num_args": num_args,
        "num_preds": num_preds,
        # [num_sentences, num_args, num_preds] avoid max on empty tensor
        # "arg_labels": torch.max(srl_scores, 1)[1] if srl_scores.numel() else srl_scores[:, :, :, 0],
        "srl_scores": srl_scores,
    })
    return predict_dict
def evaluate_retrieval(span_starts, span_ends, span_scores, pred_starts, pred_ends, gold_spans, text_length,
                       evaluators, debugging=False):
    """Evaluation for unlabeled retrieval.

    Every evaluator in ``evaluators`` is updated exactly once with ``gold_spans`` and a set of
    predicted ``(start, end)`` pairs. The key ``k`` an evaluator is stored under selects which
    predictions it receives:

    * ``-3``: the candidate spans that are also gold spans.
    * ``-2``: the spans given by ``pred_starts`` / ``pred_ends``.
    * ``-1``: the ``len(gold_spans)`` highest-scoring candidates.
    * ``0``: every candidate whose score is greater than zero.
    * any other ``k``: the ``k * text_length // 100`` highest-scoring candidates.

    Args:
        span_starts: Start offsets of the candidate spans.
        span_ends: End offsets of the candidate spans, aligned with ``span_starts``.
        span_scores: Scores of the candidate spans, aligned with ``span_starts``.
        pred_starts: Start offsets of the predicted spans.
        pred_ends: End offsets of the predicted spans.
        gold_spans: Set of tuples of (start, end).
        text_length: Text length used to turn a percentage ``k`` into a number of predictions.
        evaluators: Mapping from ``k`` to an object exposing ``update(gold_set=, predicted_set=)``.
        debugging: Print the top-ranked candidates and the gold spans for ``k == -2``.
            (Default value = False)

    Returns:
        None. The evaluators are updated in place.
    """
    # Candidates ordered by descending score; sorting an empty zip simply yields an empty list, so
    # no special case is needed and the debugging output below is always well-defined.
    ranked = sorted(zip(span_starts, span_ends, span_scores), key=operator.itemgetter(2), reverse=True)
    sorted_spans = [(start, end) for start, end, _ in ranked]
    for k, evaluator in list(evaluators.items()):
        if k == -3:
            predicted_spans = set(zip(span_starts, span_ends)) & gold_spans
        elif k == -2:
            if debugging:
                print("Predicted", ranked[:len(gold_spans)])
                print("Gold", gold_spans)
            predicted_spans = set(zip(pred_starts, pred_ends))
        elif k == 0:
            # Filter element-wise so that plain lists work as well as arrays.
            predicted_spans = {(start, end)
                               for start, end, score in zip(span_starts, span_ends, span_scores)
                               if score > 0}
        else:
            if k == -1:
                num_predictions = len(gold_spans)
            else:
                # Integer division: a float cannot be used as a slice bound.
                num_predictions = int(k * text_length // 100)
            predicted_spans = set(sorted_spans[:num_predictions])
        evaluator.update(gold_set=gold_spans, predicted_set=predicted_spans)
def compute_srl_f1(sentences, gold_srl, predictions, gold_path=None) -> SRLScores:
    """Compute unofficial and official CoNLL F1 scores for SRL predictions.

    Args:
        sentences: The sentences, forwarded to ``print_to_conll`` when writing the CoNLL files.
        gold_srl: One dict per sentence mapping a predicate id to its gold arguments; every argument
            is indexable as ``(start, end, label)``.
        predictions: One dict per sentence with the same layout as ``gold_srl``.
        gold_path: Optional path of an existing gold CoNLL file. When omitted, the gold file is
            recreated from ``gold_srl`` in a scratch directory.

    Returns:
        An ``SRLScores`` tuple holding unlabeled and labeled precision/recall/F1, the scores of the
        official CoNLL-05 evaluation, a counter of ``(gold label, predicted label)`` pairs for spans
        with matching boundaries, and the number of sentences whose relations all matched.
    """
    import os  # local import: only needed to build paths inside the scratch directory

    assert len(gold_srl) == len(predictions)
    total_gold = 0
    total_predicted = 0
    total_matched = 0
    total_unlabeled_matched = 0
    num_sents = 0
    label_confusions = Counter()

    # Compute unofficial F1 of SRL relations.
    for gold, prediction in zip(gold_srl, predictions):
        gold_rels = 0
        pred_rels = 0
        matched = 0
        for pred_id, gold_args in gold.items():
            # The verb itself is not counted as an argument.
            filtered_gold_args = [a for a in gold_args if a[2] not in ["V", "C-V"]]
            total_gold += len(filtered_gold_args)
            gold_rels += len(filtered_gold_args)
            if pred_id not in prediction:
                continue
            for a0 in filtered_gold_args:
                for a1 in prediction[pred_id]:
                    if a0[0] == a1[0] and a0[1] == a1[1]:
                        total_unlabeled_matched += 1
                        label_confusions.update([(a0[2], a1[2]), ])
                        if a0[2] == a1[2]:
                            total_matched += 1
                            matched += 1
        for pred_id, args in prediction.items():
            filtered_args = [a for a in args if a[2] not in ["V"]]
            total_predicted += len(filtered_args)
            pred_rels += len(filtered_args)

        if gold_rels == matched and pred_rels == matched:
            num_sents += 1

    precision, recall, f1 = _calc_f1(total_gold, total_predicted, total_matched)
    unlabeled_precision, unlabeled_recall, unlabeled_f1 = _calc_f1(total_gold, total_predicted,
                                                                   total_unlabeled_matched)

    # Prepare to compute official F1. The CoNLL files live in a scratch directory that is removed
    # afterwards. Previously the path of an already discarded NamedTemporaryFile was reused, which
    # is racy and left the written files behind.
    with tempfile.TemporaryDirectory() as scratch_dir:
        if not gold_path:
            gold_path = os.path.join(scratch_dir, 'gold.conll')
            print_to_conll(sentences, gold_srl, gold_path, None)
            gold_predicates = None
        else:
            gold_predicates = read_gold_predicates(gold_path)

        temp_output = os.path.join(scratch_dir, 'pred.conll')
        print_to_conll(sentences, predictions, temp_output, gold_predicates)

        # Score the predictions against the gold file with the official evaluation.
        conll_precision, conll_recall, conll_f1 = official_conll_05_evaluate(temp_output, gold_path)
    return SRLScores(unlabeled_precision, unlabeled_recall, unlabeled_f1, precision, recall, f1,
                     conll_precision, conll_recall, conll_f1, label_confusions, num_sents)
def read_gold_predicates(gold_path):
    """Read the first column of a CoNLL file, grouped by sentence.

    Args:
        gold_path: Path of a UTF-8 encoded CoNLL file in which sentences are separated by blank
            lines.

    Returns:
        A list with one list per sentence containing the first whitespace-separated field of each
        line. Every blank line starts a new (possibly empty) sentence list.
    """
    print("gold path", gold_path)
    gold_predicates = [[], ]
    # The context manager closes the file even when reading or decoding fails.
    with codecs.open(gold_path, "r", "utf-8") as fin:
        for line in fin:
            line = line.strip()
            if line:
                gold_predicates[-1].append(line.split()[0])
            else:
                gold_predicates.append([])
    return gold_predicates
def block_orth_normal_initializer(input_size, output_size):
    """Build a weight matrix out of independently orthogonally initialized blocks.

    One ``(o, i)`` float block is created for every ``o`` in ``output_size`` and every ``i`` in
    ``input_size`` (outputs in the outer loop), initialized with ``torch.nn.init.orthogonal_`` and
    the blocks are concatenated along the first dimension.

    Args:
        input_size: Iterable of block input widths.
        output_size: Iterable of block output heights.

    Returns:
        The concatenation of all blocks.
    """
    blocks = [
        # orthogonal_ fills the uninitialized block in place and returns it.
        torch.nn.init.orthogonal_(torch.FloatTensor(rows, cols))
        for rows in output_size
        for cols in input_size
    ]
    return torch.cat(blocks)
# noinspection PyMethodOverriding
def build_dataloader(self, data, batch_size, sent_a_col=None, sent_b_col=None, similarity_col=None,
                     delimiter='auto', gradient_accumulation=1, sampler_builder=None, shuffle=False,
                     device=None, logger: logging.Logger = None, split=None, **kwargs) -> DataLoader:
    """Wrap ``data`` into a padded, length-aware dataloader of sentence pairs.

    Args:
        data: The samples, or a string handled by ``SemanticTextualSimilarityDataset``; the dataset
            cache is enabled only when a string is given.
        batch_size: Batch size of the default ``SortingSamplerBuilder``.
        sent_a_col: Column of the first sentence, forwarded to the dataset.
        sent_b_col: Column of the second sentence, forwarded to the dataset.
        similarity_col: Column of the similarity score, forwarded to the dataset.
        delimiter: Column delimiter, forwarded to the dataset.
        gradient_accumulation: Forwarded to the sampler builder.
        sampler_builder: Optional builder of the batch sampler.
        shuffle: Forwarded to the sampler builder.
        device: Device the batches are placed on.
        logger: Not used here.
        split: When equal to ``'trn'``, the observed score range is stored in the config as
            ``max_score`` and ``min_score``.
        **kwargs: Not used here.

    Returns:
        A ``PadSequenceDataLoader`` over the tokenized dataset.
    """
    is_path = isinstance(data, str)
    dataset = SemanticTextualSimilarityDataset(data,
                                               sent_a_col,
                                               sent_b_col,
                                               similarity_col,
                                               delimiter=delimiter,
                                               transform=self._tokenizer,
                                               cache=is_path)
    if split == 'trn':
        # Remember the score range of the training set in the config.
        gold_scores = [sample['similarity'] for sample in dataset]
        self.config.max_score = max(gold_scores)
        self.config.min_score = min(gold_scores)
    if not sampler_builder:
        sampler_builder = SortingSamplerBuilder(batch_size=batch_size)
    lengths = [len(sample['input_ids']) for sample in dataset]
    batch_sampler = sampler_builder.build(lengths, shuffle, gradient_accumulation)
    padding = {'similarity': 0.0, 'input_ids': self._tokenizer.tokenizer.pad_token_id}
    return PadSequenceDataLoader(dataset, batch_sampler=batch_sampler, device=device, pad=padding)
def fit_dataloader(self, trn: DataLoader, criterion, optimizer, metric: SpearmanCorrelation, logger: logging.Logger,
                   history=None, gradient_accumulation=1, **kwargs):
    """Train the model for one pass over ``trn``.

    Args:
        trn: The training dataloader.
        criterion: Not used here; the loss is read from the output of ``feed_batch``.
        optimizer: A pair ``(optimizer, scheduler)``; the scheduler may be falsy.
        metric: Metric that is reset first and then updated with every batch.
        logger: Logger handed to the progress timer.
        history: Training history used to count steps and to decide when to update the weights.
        gradient_accumulation: Number of batches whose gradients are accumulated per update.
        **kwargs: Not used here.

    Returns:
        The accumulated loss divided by ``timer.total``.
    """
    self.model.train()
    optimizer, scheduler = optimizer
    timer = CountdownTimer(history.num_training_steps(len(trn), gradient_accumulation=gradient_accumulation))
    total_loss = 0
    metric.reset()
    for batch in trn:
        output = self.feed_batch(batch)
        prediction = self.decode(output)
        metric(prediction, batch['similarity'])
        loss = output['loss']
        if gradient_accumulation and gradient_accumulation > 1:
            # Scale the loss in place before backward(); the scaled value is also what gets summed
            # into total_loss below.
            loss /= gradient_accumulation
        loss.backward()
        total_loss += loss.item()
        if history.step(gradient_accumulation):
            # Update the weights only when the history signals a step.
            if self.config.grad_norm:
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.config.grad_norm)
            optimizer.step()
            if scheduler:
                scheduler.step()
            optimizer.zero_grad()
            timer.log(self.report_metrics(total_loss / (timer.current + 1), metric), ratio_percentage=None,
                      logger=logger)
        del loss
    return total_loss / timer.total
def predict(self, data: Union[List[str], List[List[str]]], batch_size: int = None, **kwargs) -> Union[
    float, List[float]]:
    """
    Predict the similarity between sentence pairs.

    Args:
        data: Sentence pairs. Either a single pair ``[sent_a, sent_b]`` or a list of such pairs.
        batch_size: The number of samples in a batch. Falls back to ``self.config.batch_size``.
        **kwargs: Not used.

    Returns:
        Similarities between sentences: one float for a single pair, a list of floats (in input
        order) for a list of pairs, and an empty list for empty input.
    """
    if not data:
        return []
    # A single pair is given as two strings; wrap it so that both cases share one code path.
    flat = isinstance(data[0], str)
    if flat:
        data = [data]
    dataloader = self.build_dataloader([{'sent_a': x[0], 'sent_b': x[1]} for x in data],
                                       batch_size=batch_size or self.config.batch_size,
                                       device=self.device)
    orders = []
    predictions = []
    # Inference needs no gradients. Disabling autograd avoids building a graph for every batch,
    # matching evaluate_dataloader which also runs under torch.no_grad().
    with torch.no_grad():
        for batch in dataloader:
            output_dict = self.feed_batch(batch)
            prediction = self.decode(output_dict)
            predictions.extend(prediction.tolist())
            orders.extend(batch[IDX])
    # Restore the input order from the indices collected with each batch.
    predictions = reorder(predictions, orders)
    if flat:
        return predictions[0]
    return predictions
class WindowTokenTransform(TSVTaggingTransform):
    """Tagging transform that represents every token by a fixed-size window of surrounding tokens."""

    def fit(self, trn_path: str, **kwargs):
        """Build the word and tag vocabularies from the samples read out of ``trn_path``."""
        self.word_vocab = VocabTF()
        self.tag_vocab = VocabTF(pad_token=None, unk_token=None)
        for windows, tag_sequence in self.file_to_samples(trn_path):
            for window in windows:
                self.word_vocab.update(window)
            self.tag_vocab.update(tag_sequence)

    def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]:
        """Describe the dataset: string types, ``[None, 2 * radius + 1]`` windows and padding values."""
        window_size = 2 * self.config.window_radius + 1
        return (
            (tf.string, tf.string),
            ([None, window_size], [None]),
            (self.word_vocab.pad_token, self.tag_vocab.first_token),
        )

    def inputs_to_samples(self, inputs, gold=False):
        """Yield ``(windows, tags)`` for every input sentence.

        Args:
            inputs: Iterable of ``(words, tags)`` pairs when ``gold`` is true, of word sequences
                otherwise.
            gold: Whether the inputs carry tags. Without tags, the last padding value is repeated
                once per word.
        """
        radius = self.config.window_radius
        for sample in inputs:
            if gold:
                words, tags = sample
            else:
                words = sample
                tags = [self.padding_values[-1]] * len(sample)
            sentence_length = len(words)
            windows = []
            for center in range(sentence_length):
                window = []
                for position in range(center - radius, center + radius + 1):
                    if position < 0:
                        # Left of the sentence: the negative position is part of the feature.
                        window.append('bos{}'.format(position))
                    elif position >= sentence_length:
                        # Right of the sentence: 1-based distance past the last token.
                        window.append('eos+{}'.format(position - sentence_length + 1))
                    else:
                        window.append(words[position])
                windows.append(window)
            yield windows, tags

    def X_to_inputs(self, X: Union[tf.Tensor, Tuple[tf.Tensor]]) -> Iterable:
        """Recover the words of every sentence from the centre element of each window."""
        for sentence in X:
            yield [self.word_vocab.idx_to_token[int(window[len(window) // 2])] for window in sentence]
self.embed = embed self.embed_dropout = tf.keras.layers.Dropout(rate=dropout) self.conv2d = [] for k in kernels: self.conv2d.append( tf.keras.layers.Conv2D(filters=filters, kernel_size=k, data_format='channels_last', padding='same')) self.conv2d_dropout = tf.keras.layers.Dropout(rate=dropout) self.concat = tf.keras.layers.Concatenate() self.dense = tf.keras.layers.Dense(units=num_tags) def call(self, inputs, **kwargs): # if inputs.shape_h[0] is None: # return tf.zeros_like() # print(inputs) embeds = self.embed(inputs) embeds = self.embed_dropout(embeds) hs = [conv(embeds) for conv in self.conv2d] h = self.concat(hs) h = self.conv2d_dropout(h) shape_h = tf.shape(h) h = tf.reshape(h, [shape_h[0], shape_h[1], h.shape[2] * h.shape[3]]) o = self.dense(h) if h.shape[0]: mask = embeds._keras_mask[:, :, 0] o._keras_mask = mask return o class CNNTaggerTF(TaggerComponent, ABC): def __init__(self, transform: WindowTokenTransform = None) -> None: if not transform: transform = WindowTokenTransform() super().__init__(transform) self.model: CNNTaggingModel = self.model # refine the type self.transform: WindowTokenTransform = self.transform def build_model(self, embedding, **kwargs) -> tf.keras.Model: embed = build_embedding(embedding, self.transform.word_vocab, self.transform) self.transform.map_x = embed.dtype != tf.string model = CNNTaggingModel(num_tags=len(self.transform.tag_vocab), embed=embed, **kwargs) # model.build((None, None, 3)) return model # noinspection PyMethodOverriding def fit(self, trn_data: Any, dev_data: Any, save_dir: str, embedding=200, window_radius=3, kernels=(1, 2, 3, 4, 5), filters=200, dropout=0.3, loss: Union[tf.keras.losses.Loss, str] = None, optimizer: Union[str, tf.keras.optimizers.Optimizer] = 'adam', metrics='accuracy', batch_size=100, epochs=100, logger=None, verbose=True, **kwargs): kwargs.update(locals()) for k in 'self', 'kwargs', '__class__': kwargs.pop(k) super().fit(**kwargs) @property def input_shape(self) -> List: return [[None, None, 
def inputs_to_samples(self, inputs, gold=False):
    """Convert raw inputs into ``(features, tags)`` samples.

    Args:
        inputs: Iterable of ``(words, tags)`` pairs when ``gold`` is true, otherwise of word sequences.
        gold: Whether ``inputs`` already carries tags.

    Yields:
        A tuple whose first item is the word sequence followed by the n-gram feature sequences
        returned by ``extract_ngram_features``, paired with the tag sequence.
    """
    for sample in inputs:
        if gold:
            words, tags = sample
        else:
            words = sample
            tags = [self.tag_vocab.safe_pad_token] * len(sample)
        if not tags:
            # No usable tags at all: fall back to the first tag of the vocabulary, one per word.
            tags = [self.tag_vocab.first_token] * len(words)
        ngrams = extract_ngram_features(words, False, self.config.window_size)
        yield (words, *ngrams), tags
def fit(self, trn_path: str, **kwargs):
    """Build the word, n-gram and tag vocabularies from a training file.

    Args:
        trn_path: Path of the training file, handed to ``file_to_samples``.
        **kwargs: Not used.

    Returns:
        The number of training samples that were read.
    """
    words, ngrams, tags = VocabTF(), VocabTF(), VocabTF(pad_token=None, unk_token=None)
    sample_count = 0
    for features, labels in self.file_to_samples(trn_path, gold=True):
        sample_count += 1
        words.update(features[0])
        for column in features[1:]:
            # Falsy n-grams are kept out of the vocabulary.
            ngrams.update(filter(None, column))
        tags.update(labels)
    self.word_vocab = words
    # With a falsy window size no n-gram features are used, so no n-gram vocabulary is kept.
    self.ngram_vocab = ngrams if self.config.window_size else None
    self.tag_vocab = tags
    return sample_count
kernel_size, padding="same", name=name) if weight_norm: conv_norm = WeightNormalization(conv, name=name + '_norm', data_init=False) return conv_norm return conv for idx, filter in enumerate(filters): self.filters_w.append(create_conv1d(filter, 'Conv1Dw_{}'.format(idx))) self.filters_v.append(create_conv1d(filter, 'Conv1Dv_{}'.format(idx))) self.dropout_hidden = tf.keras.layers.Dropout(rate=dropout_hidden) self.dense = tf.keras.layers.Dense(num_tags, use_bias=False) def call(self, inputs, **kwargs): if hasattr(self, 'ngram_embed'): chars, ngrams = inputs[0], inputs[1:] embeds = [self.word_embed(chars)] mask = embeds[0]._keras_mask for ngram in ngrams: embeds.append(self.ngram_embed(ngram)) if len(embeds) > 1: embed_input = tf.concat(embeds, axis=2) else: embed_input = embeds[0] else: chars = inputs if isinstance(inputs, tf.Tensor) else inputs[0] embed_input = self.word_embed(chars) mask = embed_input._keras_mask mask_float = tf.dtypes.cast(mask, tf.float32) embed_input = self.dropout_embed(embed_input) hidden_output = embed_input for fw, fv in zip(self.filters_w.layers, self.filters_v.layers): w = fw(hidden_output) v = fv(hidden_output) hidden_output = w * tf.nn.sigmoid(v) # Mask paddings. 
class NgramConvTaggerTF(TaggerComponent):
    """Tagger component that wires an ``NgramTransform`` to an ``NgramConvTaggingModel``."""

    def __init__(self, transform: NgramTransform = None) -> None:
        """Create the component.

        Args:
            transform: Transform to use; a fresh ``NgramTransform`` is created when it is falsy.
        """
        transform = transform or NgramTransform()
        super().__init__(transform)
        self.transform: NgramTransform = transform

    def build_model(self, word_embed, ngram_embed, window_size, weight_norm, filters, kernel_size, dropout_embed,
                    dropout_hidden, **kwargs) -> tf.keras.Model:
        """Build the embeddings and the convolutional tagging model.

        Args:
            word_embed: Word embedding specification passed to ``build_embedding``.
            ngram_embed: N-gram embedding specification; ignored when ``window_size`` is falsy.
            window_size: N-gram window size; a falsy value disables n-gram embeddings.
            weight_norm: Forwarded to the model.
            filters: Forwarded to the model.
            kernel_size: Forwarded to the model.
            dropout_embed: Forwarded to the model.
            dropout_hidden: Forwarded to the model.
            **kwargs: Not used.

        Returns:
            The constructed ``NgramConvTaggingModel``.
        """
        transform = self.transform
        # Vocabularies are read before any embedding is built, exactly once.
        word_vocab, ngram_vocab, tag_vocab = transform.word_vocab, transform.ngram_vocab, transform.tag_vocab
        word_embed = build_embedding(word_embed, word_vocab, transform)
        config = self.config
        if 'map_x' in config:
            # Move a ``map_x`` entry of the config over to ``map_word_feature``.
            config.map_word_feature = config.map_x
            del config.map_x
        else:
            config.map_word_feature = True
        ngram_embed = build_embedding(ngram_embed, ngram_vocab, transform) if window_size else None
        return NgramConvTaggingModel(word_embed, ngram_embed, filters, kernel_size, dropout_embed, dropout_hidden,
                                     weight_norm, len(tag_vocab))

    def fit(self, trn_data: Any, dev_data: Any, save_dir: str, word_embed: Union[str, int, dict] = 200,
            ngram_embed: Union[str, int, dict] = 50, embedding_trainable=True, window_size=4, kernel_size=3,
            filters=(200, 200, 200, 200, 200), dropout_embed=0.2, dropout_hidden=0.2, weight_norm=True,
            loss: Union[tf.keras.losses.Loss, str] = None,
            optimizer: Union[str, tf.keras.optimizers.Optimizer] = 'adam', metrics='accuracy', batch_size=100,
            epochs=100, logger=None, verbose=True, **kwargs):
        """Train the tagger; every argument is forwarded to the parent ``fit``.

        Eager execution is mandatory: passing a falsy ``run_eagerly`` fails the assertion below.
        """
        # The body must not introduce local variables: ``locals()`` is snapshotted and forwarded.
        assert kwargs.get('run_eagerly', True), 'NgramConvTaggingModel can only run eagerly'
        kwargs['run_eagerly'] = True
        return super().fit(**merge_locals_kwargs(locals(), kwargs))
class CNNPartOfSpeechTaggerTF(CNNTaggerTF):
    """Part-of-speech tagger that reuses ``CNNTaggerTF`` without overriding anything."""
class RNNTaggingModel(nn.Module):
    """Bidirectional LSTM tagger: embedding, optional projection, BiLSTM, linear output, optional CRF."""

    def __init__(self,
                 embed: Union[nn.Embedding, int],
                 rnn_input,
                 rnn_hidden,
                 n_out,
                 drop=0.5,
                 crf=True,
                 crf_constraints=None):
        """Create the layers.

        Args:
            embed: Either an embedding module (its ``embedding_dim`` gives the feature size) or the
                feature size itself, in which case no embedding module is stored.
            rnn_input: Size of a linear projection applied before the LSTM; falsy to feed the
                embeddings to the LSTM directly.
            rnn_hidden: Hidden size of each LSTM direction.
            n_out: Number of output scores per token.
            drop: Dropout rate used before and after the LSTM.
            crf: Whether to create a CRF layer.
            crf_constraints: Forwarded to the CRF constructor.
        """
        super().__init__()
        # Embedding layer (or just its dimensionality).
        if isinstance(embed, nn.Module):
            self.embed = embed
            embed_dim = embed.embedding_dim
        else:
            self.embed = None
            embed_dim = embed
        # Optional projection from the embedding size to the LSTM input size.
        if rnn_input:
            self.embed_to_rnn = nn.Linear(embed_dim, rnn_input)
            lstm_input = rnn_input
        else:
            self.embed_to_rnn = None
            lstm_input = embed_dim
        # Word-level BiLSTM.
        self.word_lstm = nn.LSTM(input_size=lstm_input,
                                 hidden_size=rnn_hidden,
                                 batch_first=True,
                                 bidirectional=True)
        # Output layer over the concatenated forward/backward states.
        self.out = nn.Linear(rnn_hidden * 2, n_out)
        # Optional CRF layer.
        self.crf = CRF(n_out, crf_constraints) if crf else None
        self.drop = nn.Dropout(drop)
        self.reset_parameters()

    def reset_parameters(self):
        """Re-initialize the weight of the output layer with Xavier uniform."""
        nn.init.xavier_uniform_(self.out.weight)

    def forward(self, x: torch.Tensor, batch=None, **kwargs):
        """Score every token of a padded batch.

        Args:
            x: Token ids; every id greater than 0 counts as a real token.
            batch: Passed to the embedding when it is not a plain ``nn.Embedding``.
            **kwargs: Not used.

        Returns:
            A pair ``(scores, mask)``: the padded output scores and the boolean mask ``x > 0``.
        """
        # Real-token mask and the resulting sequence lengths.
        mask = x.gt(0)
        lengths = mask.sum(dim=1)
        # Embed only the real tokens, giving a flat (tokens, dim) tensor.
        if isinstance(self.embed, nn.Embedding):
            hidden = self.embed(x[mask])
        else:
            hidden = self.embed(batch, mask=mask)
            if hidden.dim() == 3:
                hidden = hidden[mask]
        hidden = self.drop(hidden)
        if self.embed_to_rnn is not None:
            hidden = self.embed_to_rnn(hidden)
        # Cut the flat tensor back into sentences; packing requires them sorted by decreasing length.
        sentences = torch.split(hidden, lengths.tolist())
        packed = pack_sequence(sentences, enforce_sorted=True)
        packed, _ = self.word_lstm(packed)
        hidden, _ = pad_packed_sequence(packed, batch_first=True)
        hidden = self.drop(hidden)
        return self.out(hidden), mask
def __init__(self, **kwargs) -> None:
    """An old-school tagger using non-contextualized embeddings and RNNs as context layer.

    Args:
        **kwargs: Predefined config, forwarded unchanged to the parent constructor.
    """
    super().__init__(**kwargs)
    # Only declares (and types) the attribute; no network is created at construction time.
    self.model: RNNTaggingModel = None
def fit_dataloader(self, trn: DataLoader, criterion, optimizer, metric, logger: logging.Logger, ratio_width=None,
                   **kwargs):
    """Train the model for one pass over ``trn``.

    Args:
        trn: Training batches; each batch must provide ``'tag_id'``.
        criterion: Loss object handed to ``compute_loss``.
        optimizer: Optimizer stepped once per batch.
        metric: Metric updated after every batch and shown in the progress report.
        logger: Logger receiving the progress report.
        ratio_width: Forwarded to the progress timer.
        **kwargs: Not used.
    """
    self.model.train()
    timer = CountdownTimer(len(trn))
    total_loss = 0
    for idx, batch in enumerate(trn):
        optimizer.zero_grad()
        out, mask = self.feed_batch(batch)
        y = batch['tag_id']
        loss = self.compute_loss(criterion, out, y, mask)
        loss.backward()
        # Gradients are clipped to a norm of 5.0 before every update.
        nn.utils.clip_grad_norm_(self.model.parameters(), 5.0)
        optimizer.step()
        total_loss += loss.item()
        prediction = self.decode_output(out, mask, batch)
        self.update_metrics(metric, out, y, mask, batch, prediction)
        # Report the running mean over the batches seen so far. The accumulated loss must be used
        # here: dividing only the current batch's loss by the batch count makes the number shrink
        # towards zero regardless of how training is going.
        timer.log(f'loss: {total_loss / (idx + 1):.4f} {metric}', ratio_percentage=False, logger=logger,
                  ratio_width=ratio_width)
        # Drop the references to this batch's tensors before the next iteration.
        del loss
        del out
        del mask
def _convert_embed(self):
    """Return the ``embed`` config entry, instantiating it first when it is still a ``dict``.

    Returns:
        The entry itself when it is not a ``dict``; otherwise the object built by
        ``Configurable.from_config``, which also replaces the entry in the config.
    """
    spec = self.config['embed']
    if not isinstance(spec, dict):
        return spec
    instance = Configurable.from_config(spec)
    self.config['embed'] = instance
    return instance
def __init__(self, transform: Transform = None) -> None:
    """Create the component.

    Args:
        transform: Transform to use; when falsy a ``TSVTaggingTransform`` is created and also stored
            on the instance before the parent constructor runs.
    """
    if not transform:
        transform = TSVTaggingTransform()
        self.transform = transform
    super().__init__(transform)
def save_weights(self, save_dir, filename='model.h5'):
    """Save the model weights, leaving out a frozen (non-trainable) embedding layer.

    Args:
        save_dir: Directory to save into.
        filename: Name of the weight file inside ``save_dir``.
    """
    import os

    # The first layer is the embedding; a frozen one is pre-trained and need not be stored.
    embedding_layer: tf.keras.layers.Embedding = self.model.get_layer(index=0)
    if embedding_layer.trainable:
        super().save_weights(save_dir, filename)
    else:
        truncated_model = tf.keras.Sequential(layers=self.model.layers[1:])
        truncated_model.build(input_shape=embedding_layer.output_shape)
        # Honour ``filename`` in this branch as well: the weights used to be written to ``save_dir``
        # itself, so the requested file was never produced.
        # NOTE(review): assumes the parent implementation writes to ``save_dir/filename`` - confirm.
        truncated_model.save_weights(os.path.join(save_dir, filename))
def build_criterion(self, model=None, reduction='mean', decoder=None, **kwargs):
    """Return the object used to compute the training loss.

    Args:
        model: Module owning the CRF layer; when falsy, ``decoder`` or ``self.model`` is used.
        reduction: Reduction of the cross-entropy loss; only used without CRF.
        decoder: Fallback owner of the CRF layer when ``model`` is falsy.
        **kwargs: Not used.

    Returns:
        The ``crf`` attribute of the selected module when the config enables ``crf``, otherwise a
        new ``nn.CrossEntropyLoss``.

    Raises:
        ValueError: If CRF is enabled and the selected module is wrapped in ``nn.DataParallel``.
    """
    if not self.config.get('crf', False):
        return nn.CrossEntropyLoss(reduction=reduction)
    if not model:
        model = decoder or self.model
    if isinstance(model, nn.DataParallel):
        # The statement that used to follow this raise could never run and has been removed.
        raise ValueError('DataParallel not supported when CRF is used')
    return model.crf
def id_to_tags(self, ids: torch.LongTensor, lens: List[int]):
    """Map predicted tag ids back to tag strings, cutting every row to its true length.

    Args:
        ids: One row of tag ids per sentence.
        lens: Number of leading ids to keep in each row.

    Returns:
        A list with one list of tag strings per sentence.
    """
    idx_to_token = self.vocabs['tag'].idx_to_token
    return [[idx_to_token[tag_id] for tag_id in row[:length]] for row, length in zip(ids, lens)]
def input_is_flat(self, tokens):
    """Tell whether ``tokens`` is a single sentence rather than a batch of sentences.

    Args:
        tokens: The raw input to inspect.

    Returns:
        ``True`` only for a non-empty ``list`` whose first element is a ``str``; ``False`` for
        everything else, including an empty list.
    """
    # Check emptiness before indexing: ``tokens[0]`` on ``[]`` used to raise ``IndexError``.
    return isinstance(tokens, list) and len(tokens) > 0 and isinstance(tokens[0], str)
def prediction_to_human(self, pred_ids, vocab: List[str], batch):
    """Convert predicted tag ids of a batch into tag strings, applying ``dict_tags`` overrides.

    Args:
        pred_ids: Predicted ids, one row per sentence; a tensor is converted to nested lists.
        vocab: Maps a tag id to its string.
        batch: The batch; sentences are read from ``'<token_key>_'`` when that entry is present and
            non-empty, otherwise from ``'<token_key>'``.

    Yields:
        The list of tags of each sentence, cut to the sentence length.
    """
    if isinstance(pred_ids, torch.Tensor):
        pred_ids = pred_ids.tolist()
    token_key = self.config.token_key
    sentences = batch.get(f'{token_key}_') or batch[token_key]
    overrides = self.dict_tags
    for row, sentence in zip(pred_ids, sentences):
        tags = [vocab[tag_id] for tag_id in row[:len(sentence)]]
        if overrides:
            # Each match of the custom dictionary replaces the predicted tags of its span.
            for begin, end, label in overrides.tokenize(sentence):
                tags[begin:end] = label
        yield tags
@dict_tags.setter
def dict_tags(self,
              dictionary: Union[DictInterface, Union[Dict[Union[str, Sequence[str]], Union[str, Sequence[str]]]]]):
    """Store the custom tag dictionary in the config, converting a plain ``dict`` to a trie first.

    Args:
        dictionary: ``None``, a ready-made ``DictInterface``, or a ``dict`` mapping a token (or a
            sequence of tokens) to a tag (or a sequence of tags). A string key becomes a one-element
            tuple and a string tag is repeated once per token of its key.
    """
    needs_conversion = dictionary is not None and not isinstance(dictionary, DictInterface)
    if needs_conversion:
        assert isinstance(dictionary, dict), f'Expected dictionary to be `dict` but got {type(dictionary)}.'
        normalized = {}
        for pattern, labels in dictionary.items():
            pattern = (pattern,) if isinstance(pattern, str) else pattern
            labels = (labels,) * len(pattern) if isinstance(labels, str) else labels
            normalized[pattern] = labels
        dictionary = TupleTrieDict(normalized)
    self.config.dict_tags = dictionary
class Accuracy(tf.keras.metrics.SparseCategoricalAccuracy):
    """Sparse categorical accuracy that skips positions whose gold label equals ``mask_value``."""

    def __init__(self, name='sparse_categorical_accuracy', dtype=None, mask_value=0):
        """Create the metric.

        Args:
            name: Name of the metric.
            dtype: Data type of the metric result.
            mask_value: Gold label marking positions to ignore.
        """
        super().__init__(name=name, dtype=dtype)
        self.mask_value = mask_value

    def update_state(self, y_true, y_pred, sample_weight=None):
        """Accumulate statistics, weighting each position by whether its gold label is not masked.

        Args:
            y_true: Gold labels.
            y_pred: Predictions.
            sample_weight: Ignored; it is always replaced by the mask computed from ``y_true``.

        Returns:
            Whatever the parent ``update_state`` returns.
        """
        keep = tf.not_equal(y_true, self.mask_value)
        return super().update_state(y_true, y_pred, sample_weight=keep)
# noinspection PyAbstractClass
class TransformerTaggingModel(nn.Module):
    """Transformer encoder followed by a linear classifier, with an optional CRF layer."""

    def __init__(self,
                 encoder: TransformerEncoder,
                 num_labels,
                 crf=False,
                 secondary_encoder=None,
                 extra_embeddings: EmbeddingDim = None) -> None:
        """
        A shallow tagging model which puts a linear classifier on top of a transformer encoder.

        Args:
            encoder: A pretrained transformer.
            num_labels: Size of tagset.
            crf: True to enable CRF.
            secondary_encoder: Optional module applied to the encoder outputs (called with ``mask=``).
            extra_embeddings: Extra embeddings which will be concatenated to the encoder outputs.
        """
        super().__init__()
        self.encoder = encoder
        self.secondary_encoder = secondary_encoder
        self.extra_embeddings = extra_embeddings
        # The classifier input is the transformer hidden size plus the width of any extra embeddings.
        # noinspection PyUnresolvedReferences
        in_features = encoder.transformer.config.hidden_size
        if extra_embeddings:
            in_features += extra_embeddings.get_output_dim()
        self.classifier = nn.Linear(in_features, num_labels)
        if crf:
            self.crf = CRF(num_labels)
        else:
            self.crf = None

    def forward(self, lens: torch.LongTensor, input_ids, token_span, token_type_ids=None, batch=None):
        """Encode a batch and project it onto the tag space.

        Returns:
            A pair ``(scores, mask)`` where ``scores`` is the classifier output and ``mask`` is built from
            ``lens`` by ``lengths_to_mask``.
        """
        mask = lengths_to_mask(lens)
        hidden = self.encoder(input_ids, token_span=token_span, token_type_ids=token_type_ids)
        if self.secondary_encoder:
            hidden = self.secondary_encoder(hidden, mask=mask)
        if self.extra_embeddings:
            # noinspection PyCallingNonCallable
            extra = self.extra_embeddings(batch, mask=mask)
            hidden = torch.cat([hidden, extra], dim=-1)
        return self.classifier(hidden), mask
# noinspection PyMethodOverriding
def fit_dataloader(self,
                   trn: DataLoader,
                   criterion,
                   optimizer,
                   metric,
                   logger: logging.Logger,
                   history: History,
                   gradient_accumulation=1,
                   grad_norm=None,
                   transformer_grad_norm=None,
                   teacher: Tagger = None,
                   kd_criterion=None,
                   temperature_scheduler=None,
                   ratio_width=None,
                   eval_trn=True,
                   **kwargs):
    """Run one pass over ``trn``, optionally distilling knowledge from ``teacher``.

    Args:
        trn: Training dataloader; every batch must provide a ``tag_id`` entry.
        criterion: Forwarded to ``self.compute_loss``.
        optimizer: A pair ``(optimizer, scheduler)``. When ``teacher`` is given, the second element is itself
            a pair ``(scheduler, lambda_scheduler)``.
        metric: Updated through ``self.update_metrics`` when ``eval_trn`` is true.
        logger: Receives the progress report.
        history: Decides (via ``history.step``) when an optimizer step is due.
        gradient_accumulation: Number of batches per optimizer step; the loss is divided by it when > 1.
        grad_norm: Forwarded to ``self._step`` for gradient clipping.
        transformer_grad_norm: Forwarded to ``self._step`` for gradient clipping.
        teacher: Optional teacher tagger enabling distillation.
        kd_criterion: Distillation criterion forwarded to ``self.compute_distill_loss``.
        temperature_scheduler: Forwarded to ``self.compute_distill_loss``.
        ratio_width: Forwarded to ``timer.log``.
        eval_trn: Whether to decode and score the training batches.
        **kwargs: Not used.
    """
    # ``optimizer`` arrives bundled with its scheduler(s); unpack them first.
    optimizer, scheduler = optimizer
    if teacher:
        scheduler, lambda_scheduler = scheduler
    else:
        lambda_scheduler = None
    self.model.train()
    timer = CountdownTimer(history.num_training_steps(len(trn), gradient_accumulation=gradient_accumulation))
    total_loss = 0
    for idx, batch in enumerate(trn):
        out, mask = self.feed_batch(batch)
        y = batch['tag_id']
        loss = self.compute_loss(criterion, out, y, mask)
        if gradient_accumulation and gradient_accumulation > 1:
            # Scale down so that the accumulated gradient matches one large batch.
            loss /= gradient_accumulation
        if teacher:
            # The teacher only provides targets, so no gradient is tracked for it.
            with torch.no_grad():
                out_T, _ = teacher.feed_batch(batch)
            # noinspection PyNoneFunctionAssignment
            kd_loss = self.compute_distill_loss(kd_criterion, out, out_T, mask, temperature_scheduler)
            # Interpolate between the supervised loss and the distillation loss.
            _lambda = float(lambda_scheduler)
            loss = _lambda * loss + (1 - _lambda) * kd_loss
        loss.backward()
        total_loss += loss.item()
        if eval_trn:
            prediction = self.decode_output(out, mask, batch)
            self.update_metrics(metric, out, y, mask, batch, prediction)
        if history.step(gradient_accumulation):
            self._step(optimizer, scheduler, grad_norm, transformer_grad_norm, lambda_scheduler)
            report = f'loss: {total_loss / (idx + 1):.4f} {metric if eval_trn else ""}'
            timer.log(report, logger=logger, ratio_percentage=False, ratio_width=ratio_width)
        # Drop references to this batch's tensors before the next iteration.
        del loss
        del out
        del mask
def build_model(self, training=True, extra_embeddings: Embedding = None, finetune=False, logger=None,
                **kwargs) -> torch.nn.Module:
    """Build the tagging model, optionally warm-starting it from the currently loaded model.

    Args:
        training: Forwarded to ``self.build_transformer``.
        extra_embeddings: Optional embedding builder; its ``module(self.vocabs)`` is passed to the model.
        finetune: If true and ``self.model`` is set, compatible parameters of ``self.model`` are copied
            into the new model.
        logger: Optional logger used to report parameters missing from the old model. May be ``None``.
        **kwargs: Not used.

    Returns:
        The newly constructed ``TransformerTaggingModel``.
    """
    model = TransformerTaggingModel(
        self.build_transformer(training=training),
        len(self.vocabs.tag),
        self.config.crf,
        self.config.get('secondary_encoder', None),
        extra_embeddings=extra_embeddings.module(self.vocabs) if extra_embeddings else None,
    )
    if finetune and self.model:
        model_state = model.state_dict()
        load_state = self.model.state_dict()
        safe_state = filter_state_dict_safely(model_state, load_state)
        missing_params = model_state.keys() - safe_state.keys()
        # ``logger`` defaults to None, so only report when one was actually supplied.
        if missing_params and logger is not None:
            logger.info(f'The following parameters were missing from the checkpoint: '
                        f'{", ".join(sorted(missing_params))}.')
        model.load_state_dict(safe_state, strict=False)
        n = self.model.classifier.bias.size(0)
        if model.classifier.bias.size(0) != n:
            # The tag set changed size: reuse the old classifier rows for the first ``n`` tags.
            model.classifier.weight.data[:n, :] = self.model.classifier.weight.data[:n, :]
            model.classifier.bias.data[:n] = self.model.classifier.bias.data[:n]
    return model
def build_vocabs(self, trn, logger, **kwargs):
    """Iterate over the training set once, then lock and summarise the vocabularies.

    Creates the ``tag`` vocab (without pad/unk tokens) when it does not exist yet, walks through ``trn``
    while reporting the longest sequence seen so far, and finally locks ``self.vocabs``.

    Args:
        trn: Training dataset; each sample is indexed by ``self.config.token_key``.
        logger: Passed to ``self.vocabs.summary``.
        **kwargs: Not used.
    """
    if 'tag' not in self.vocabs:
        self.vocabs.tag = Vocab(pad_token=None, unk_token=None)
    timer = CountdownTimer(len(trn))
    token_key = self.config.token_key
    max_seq_len = 0
    # NOTE(review): presumably the dataset transforms fill the vocabs as samples are visited - confirm.
    for sample in trn:
        seq_len = len(sample[token_key])
        if seq_len > max_seq_len:
            max_seq_len = seq_len
        timer.log(f'Building vocab [blink][yellow]...[/yellow][/blink] (longest sequence: {max_seq_len})')
    self.vocabs.tag.set_unk_as_safe_unk()
    self.vocabs.lock()
    self.vocabs.summary(logger)
def feed_batch(self, batch: dict):
    """Feed one batch to the model.

    The tokenizer transform's ``output_key`` lists the batch fields to read: when it names exactly two
    fields they are used as ``(input_ids, token_span)``, otherwise only the first one is used and
    ``token_span`` is ``None``.

    Args:
        batch: A batch holding the tokenizer outputs and the ``<token_key>_length`` field.

    Returns:
        The pair returned by the model.
    """
    features = [batch[name] for name in self.tokenizer_transform.output_key]
    input_ids = features[0]
    token_span = features[1] if len(features) == 2 else None
    token_key = self.config.token_key
    lens = batch[f'{token_key}_length']
    token_type_ids = batch.get(f'{token_key}_token_type_ids')
    logits, mask = self.model(lens, input_ids, token_span, token_type_ids, batch=batch)
    return logits, mask
class TransformerTaggerTF(TaggerComponent):
    """TensorFlow tagger built on top of a pretrained transformer."""

    def __init__(self, transform: TransformerTransform = None) -> None:
        """
        Args:
            transform: The transform to use; a fresh ``TransformerTransform`` is created when ``None``.
        """
        transform = TransformerTransform() if transform is None else transform
        super().__init__(transform)
        self.transform: TransformerTransform = transform

    def build_model(self, transformer, max_seq_length, **kwargs) -> tf.keras.Model:
        """Build the transformer model and attach its tokenizer to the transform."""
        num_tags = len(self.transform.tag_vocab)
        model, tokenizer = build_transformer(transformer, max_seq_length, num_tags, tagging=True)
        self.transform.tokenizer = tokenizer
        return model

    def fit(self, trn_data, dev_data, save_dir,
            transformer,
            optimizer='adamw',
            learning_rate=5e-5,
            weight_decay_rate=0,
            epsilon=1e-8,
            clipnorm=1.0,
            warmup_steps_ratio=0,
            use_amp=False,
            max_seq_length=128,
            batch_size=32,
            epochs=3,
            metrics='accuracy',
            run_eagerly=False,
            logger=None,
            verbose=True,
            **kwargs):
        """Train the tagger; every argument is merged with ``kwargs`` and forwarded to the parent ``fit``."""
        return super().fit(**merge_locals_kwargs(locals(), kwargs))

    # noinspection PyMethodOverriding
    def build_optimizer(self, optimizer, learning_rate, epsilon, weight_decay_rate, clipnorm, use_amp, train_steps,
                        warmup_steps, **kwargs):
        """Build AdamW for ``optimizer == 'adamw'``; otherwise defer to the parent implementation."""
        if optimizer != 'adamw':
            return super().build_optimizer(optimizer)
        return build_adamw_optimizer(self.config, learning_rate, epsilon, clipnorm, train_steps, use_amp,
                                     warmup_steps, weight_decay_rate)

    def build_vocab(self, trn_data, logger):
        """Build the vocab via the parent, then derive ``config.warmup_steps`` from the number of examples."""
        num_examples = super().build_vocab(trn_data, logger)
        config = self.config
        steps_per_epoch = math.ceil(num_examples * config.warmup_steps_ratio / config.batch_size)
        config.warmup_steps = steps_per_epoch * config.epochs
        return num_examples

    def train_loop(self, trn_data, dev_data, epochs, num_examples, train_steps_per_epoch, dev_steps, model,
                   optimizer, loss, metrics, callbacks, logger, **kwargs):
        """Delegate training to ``self.model.fit`` and return its history object."""
        # No ``class_weight`` is passed, so padding labels are not masked out at this level.
        return self.model.fit(trn_data,
                              epochs=epochs,
                              steps_per_epoch=train_steps_per_epoch,
                              validation_data=dev_data,
                              callbacks=callbacks,
                              validation_steps=dev_steps)

    def build_loss(self, loss, **kwargs):
        """Use ``SparseCategoricalCrossentropyOverBatchFirstDim`` when no loss is specified."""
        if loss:
            return super().build_loss(loss, **kwargs)
        return SparseCategoricalCrossentropyOverBatchFirstDim()

    def load_transform(self, save_dir) -> Transform:
        """Load the transform from ``save_dir`` and rebuild its tokenizer from the saved config."""
        super().load_transform(save_dir)
        tokenizer = build_transformer(self.config.transformer, self.config.max_seq_length,
                                      len(self.transform.tag_vocab), tagging=True, tokenizer_only=True)
        self.transform.tokenizer = tokenizer
        return self.transform
def inputs_to_samples(self, inputs, gold=False):
    """Convert raw inputs into ``((input_ids, input_mask, segment_ids), label_ids)`` samples.

    Args:
        inputs: An iterable of samples. With ``gold=True`` each sample is a ``(words, tags)`` pair;
            otherwise each sample is just the words and a placeholder tag (index 1 of the tag vocab) is
            used for every word.
        gold: Whether ``inputs`` carries gold tags.

    Yields:
        One ``((input_ids, input_mask, segment_ids), label_ids)`` tuple per sample.
    """
    max_seq_length = self.config.get('max_seq_length', 128)
    tokenizer = self._tokenizer
    # Only BERT-style layouts are produced here: both model-specific switches are fixed to False.
    xlnet = False
    roberta = False
    cls_token = '[CLS]'
    sep_token = '[SEP]'
    unk_token = self.unk

    pad_label_idx = self.tag_vocab.pad_idx
    pad_token_id = tokenizer.convert_tokens_to_ids([self.pad])[0]
    for sample in inputs:
        if gold:
            words, tags = sample
        else:
            words, tags = sample, [self.tag_vocab.idx_to_token[1]] * len(sample)

        input_ids, input_mask, segment_ids, label_ids = convert_examples_to_features(
            words,
            max_seq_length, tokenizer,
            tags,
            self.tag_vocab.token_to_idx,
            cls_token_at_end=xlnet,
            # xlnet has a cls token at the end
            cls_token=cls_token,
            cls_token_segment_id=2 if xlnet else 0,
            sep_token=sep_token,
            sep_token_extra=roberta,
            # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
            pad_on_left=xlnet,
            # pad on the left for xlnet
            pad_token_id=pad_token_id,
            pad_token_segment_id=4 if xlnet else 0,
            pad_token_label_id=pad_label_idx,
            unk_token=unk_token)

        # Debugging aid: dump any feature that contains an unmapped (None) entry.
        if None in input_ids:
            print(input_ids)
        if None in input_mask:
            print(input_mask)
        if None in segment_ids:
            print(segment_ids)
        yield (input_ids, input_mask, segment_ids), label_ids
class MultiCriteriaTransformerTaggingTokenizer(TransformerTaggingTokenizer):
    def __init__(self, **kwargs) -> None:
        r"""Transformer based implementation of "Effective Neural Solution for Multi-Criteria Word Segmentation"
        (:cite:`he2019effective`). It uses an artificial token ``[unused_i]`` instead of ``[SEP]`` in the input_ids to
        mark the i-th segmentation criteria.

        Args:
            **kwargs: Not used.
        """
        super().__init__(**kwargs)

    def build_dataset(self, data, **kwargs):
        """Wrap ``data`` into a ``MultiCriteriaTextTokenizingDataset``."""
        return MultiCriteriaTextTokenizingDataset(data, **kwargs)

    def on_config_ready(self, **kwargs):
        """Prepare the pool of ``[unusedN]`` tokens the first time the config becomes available.

        When ``criteria_token_map`` is not configured yet, ``[unused1]`` .. ``[unused99]`` are looked up in
        the transformer tokenizer, those known to it (id differs from the unk id) are stored in
        ``config.unused_tokens``, and an empty ``config.criteria_token_map`` is created.
        """
        super().on_config_ready(**kwargs)
        # noinspection PyAttributeOutsideInit
        if 'criteria_token_map' not in self.config:
            unused_tokens = [f'[unused{i}]' for i in range(1, 100)]
            ids = self.transformer_tokenizer.convert_tokens_to_ids(unused_tokens)
            self.config.unused_tokens = dict((x, ids[i]) for i, x in enumerate(unused_tokens) if
                                             ids[i] != self.transformer_tokenizer.unk_token_id)
            self.config.criteria_token_map = dict()

    def last_transform(self):
        """Extend the parent's transforms with the step that appends the criteria token to each sample."""
        transforms = super().last_transform()
        transforms.append(functools.partial(append_criteria_token,
                                            criteria_tokens=self.config.unused_tokens,
                                            criteria_token_map=self.config.criteria_token_map))
        return transforms

    def build_vocabs(self, trn, logger, **kwargs):
        """Build vocabs via the parent and log the criteria collected in ``criteria_token_map``."""
        super().build_vocabs(trn, logger, **kwargs)
        logger.info(f'criteria[{len(self.config.criteria_token_map)}] = {list(self.config.criteria_token_map)}')

    def feed_batch(self, batch: dict):
        """Run the tagger's ``feed_batch`` and drop the positions of the special tokens."""
        x, mask = TransformerTagger.feed_batch(self, batch)
        # strip [CLS], [SEP] and [unused_i]
        return x[:, 1:-2, :], mask

    def build_samples(self, data: List[str], criteria=None, **kwargs):
        """Build prediction samples tagged with a segmentation criteria.

        Args:
            data: Input sentences.
            criteria: The criteria to segment by; defaults to the first key of ``criteria_token_map``.
            **kwargs: Forwarded to the parent ``build_samples``.
        """
        if not criteria:
            criteria = next(iter(self.config.criteria_token_map.keys()))
        else:
            assert criteria in self.config.criteria_token_map, \
                f'Unsupported criteria {criteria}. Choose one from {list(self.config.criteria_token_map.keys())}'
        samples = super().build_samples(data, **kwargs)
        for sample in samples:
            sample['criteria'] = criteria
        return samples

    def build_metric(self, **kwargs):
        """Create one ``F1`` metric per known criteria, collected in a ``MetricDict``."""
        metrics = MetricDict()
        for criteria in self.config.criteria_token_map:
            metrics[criteria] = F1()
        return metrics

    def update_metrics(self, metric, logits, y, mask, batch, prediction):
        """Update the per-criteria F1 with predicted and gold spans of each sample in the batch."""
        # Pass ``batch`` to ``tag_to_span`` exactly as the parent tokenizer's ``update_metrics`` does.
        gold_spans = self.tag_to_span(batch['tag'], batch)
        for p, g, c in zip(prediction, gold_spans, batch['criteria']):
            pred = set(p)
            gold = set(g)
            metric[c](pred, gold)

    def fit(self, trn_data, dev_data, save_dir, transformer, average_subwords=False, word_dropout: float = 0.2,
            hidden_dropout=None, layer_dropout=0, scalar_mix=None, mix_embedding: int = 0, grad_norm=5.0,
            transformer_grad_norm=None, lr=5e-5,
            transformer_lr=None, transformer_layers=None, gradient_accumulation=1,
            adam_epsilon=1e-8, weight_decay=0, warmup_steps=0.1, crf=False, reduction='sum',
            batch_size=32, sampler_builder: SamplerBuilder = None, epochs=30, patience=5, token_key=None,
            tagging_scheme='BMES', delimiter=None,
            max_seq_len=None, sent_delimiter=None, char_level=False, hard_constraint=False, transform=None, logger=None,
            devices: Union[float, int, List[int]] = None, **kwargs):
        """Train the tokenizer; every argument is merged with ``kwargs`` and forwarded to the parent ``fit``."""
        return super().fit(**merge_locals_kwargs(locals(), kwargs))
def update_metrics(self, metric, logits, y, mask, batch):
    """Decode the batch and feed predicted and gold tags to ``metric``.

    Args:
        metric: Callable invoked as ``metric(predicted_tags, gold_tags)``.
        logits: Model outputs forwarded to ``self.decode_output``.
        y: Not used.
        mask: Forwarded to ``self.decode_output``.
        batch: The current batch; its ``tag`` entry provides the gold tags.
    """
    predicted_ids = self.decode_output(logits, mask, batch)
    predicted_tags = self._id_to_tags(predicted_ids)
    metric(predicted_tags, batch['tag'])
class NgramConvTokenizerTransform(TxtFormat, NgramTransform):
    """Transform that feeds n-gram features of raw text to the convolutional tokenizer."""

    def inputs_to_samples(self, inputs, gold=False):
        """Yield n-gram features (and tags) for every sentence; a single sentence is wrapped into a list."""
        sentences = [inputs] if self.input_is_single_sample(inputs) else inputs
        for sent in sentences:
            # bigram_only = false
            yield extract_ngram_features_and_tags(sent, False, self.config.window_size, gold)

    def input_is_single_sample(self, input: Union[List[str], List[List[str]]]) -> bool:
        """Treat empty input and a plain string as a single sample."""
        return not input or isinstance(input, str)

    def Y_to_outputs(self, Y: Union[tf.Tensor, Tuple[tf.Tensor]], gold=False, inputs=None, X=None,
                     **kwargs) -> Iterable:
        """Convert model outputs into tokens by delegating to ``TxtBMESFormat.Y_to_tokens``."""
        yield from TxtBMESFormat.Y_to_tokens(self, self.tag_vocab, Y, gold, inputs)
class RNNTokenizerTF(BMESTokenizerTF, RNNTaggerTF):
    """RNN based BMES tokenizer implemented in TensorFlow."""

    def __init__(self, transform: RNNTokenizerTransform = None) -> None:
        """
        Args:
            transform: The transform to use; a fresh ``RNNTokenizerTransform`` is created when ``None``.
        """
        # Test for None explicitly (as the sibling tokenizers do) so that a supplied transform is never
        # replaced merely because it evaluates as falsy.
        if transform is None:
            transform = RNNTokenizerTransform()
        super().__init__(transform)

    def fit(self, trn_data: str, dev_data: str = None, save_dir: str = None, embeddings=100,
            embedding_trainable=False,
            rnn_input_dropout=0.2, rnn_units=100, rnn_output_dropout=0.2, epochs=20, lower=False, max_seq_len=50,
            logger=None, loss: Union[tf.keras.losses.Loss, str] = None,
            optimizer: Union[str, tf.keras.optimizers.Optimizer] = 'adam', metrics='f1', batch_size=32,
            dev_batch_size=32, lr_decay_per_epoch=None, verbose=True, **kwargs):
        """Train the tokenizer; every argument is merged with ``kwargs`` and forwarded to the parent ``fit``."""
        return super().fit(**merge_locals_kwargs(locals(), kwargs))
class TransformerTaggingTokenizer(TransformerTagger):

    def __init__(self, **kwargs) -> None:
        """ A tokenizer using transformer tagger for span prediction. It features 2 high performance dictionaries
        to handle edge cases in real applications.

        - ``dict_force``: High priority dictionary which performs longest-prefix-matching on input text and takes
          higher priority over model predictions.
        - ``dict_combine``: Low priority dictionary which performs longest-prefix-matching on model predictions
          and then combines them.

        .. Note:: For algorithm beginners, longest-prefix-matching is the prerequisite to understand what a
            dictionary can do and what it can't do. The tutorial in `this book `_ can be very helpful.

        It also supports outputting the span of each token by setting ``config.output_spans = True``.

        Args:
            **kwargs: Predefined config.
        """
        super().__init__(**kwargs)

    @property
    def dict_force(self) -> DictInterface:
        r""" The high priority dictionary which performs longest-prefix-matching on inputs to split them into two
        subsets:

        1. spans containing no keywords, which are then fed into the tokenizer for further tokenization.
        2. keywords, which will be outputted without further tokenization.

        .. Caution::
            Longest-prefix-matching **NEVER** guarantees the presence of any keywords. Abuse of
            ``dict_force`` can lead to low quality results. For more details, refer to `this book `_.

        Examples:
            >>> tok.dict_force = {'和服', '服务行业'} # Force '和服' and '服务行业' by longest-prefix-matching
            >>> tok("商品和服务行业")
            ['商品', '和服', '务行业']
            >>> tok.dict_force = {'和服务': ['和', '服务']} # Force '和服务' to be tokenized as ['和', '服务']
            >>> tok("商品和服务行业")
            ['商品', '和', '服务', '行业']
        """
        return self.config.get('dict_force', None)

    @dict_force.setter
    def dict_force(self, dictionary: Union[DictInterface, Union[Dict[str, Any], Set[str]]]):
        # Anything that is not already a DictInterface (and not None) is wrapped into a trie.
        if dictionary is not None and not isinstance(dictionary, DictInterface):
            dictionary = TrieDict(dictionary)
        self.config.dict_force = dictionary
        # Keep the tokenizer transform in sync with the config.
        self.tokenizer_transform.dict = dictionary

    @property
    def dict_combine(self) -> DictInterface:
        """ The low priority dictionary which performs longest-prefix-matching on model predictions and
        combines them.

        Examples:
            >>> tok.dict_combine = {'和服', '服务行业'}
            >>> tok("商品和服务行业") # '和服' is not in the original results ['商品', '和', '服务']. '服务', '行业' are combined to '服务行业'
            ['商品', '和', '服务行业']
        """
        return self.config.get('dict_combine', None)

    @dict_combine.setter
    def dict_combine(self, dictionary: Union[DictInterface, Union[Dict[str, Any], Set[str]]]):
        if dictionary is not None and not isinstance(dictionary, DictInterface):
            if all(isinstance(k, str) for k in dictionary):
                # Pure string keys: a plain trie is enough.
                dictionary = TrieDict(dictionary)
            else:
                # Mixed keys: string keys are expanded through ``possible_tokenization`` and every
                # other key is kept as is, then a tuple-keyed trie is built from the union.
                _d = set()
                for k in dictionary:
                    if isinstance(k, str):
                        _d.update(possible_tokenization(k))
                    else:
                        _d.add(k)
                dictionary = TupleTrieDict(_d)
        self.config.dict_combine = dictionary

    def build_metric(self, **kwargs):
        """Return a fresh ``F1`` metric; ``kwargs`` are ignored."""
        return F1()

    # noinspection PyMethodOverriding
    def update_metrics(self, metric, logits, y, mask, batch, prediction):
        """Feed predicted and gold span sets of each sample into ``metric``.

        Gold spans are derived from ``batch['tag']`` through :meth:`tag_to_span`; ``logits``, ``y`` and
        ``mask`` are not read here.
        """
        for p, g in zip(prediction, self.tag_to_span(batch['tag'], batch)):
            pred = set(p)
            gold = set(g)
            metric(pred, gold)

    def decode_output(self, logits, mask, batch, model=None):
        """Decode with the parent tagger, map ids to tags and convert the tags to spans."""
        output = super().decode_output(logits, mask, batch, model)
        if isinstance(output, torch.Tensor):
            output = output.tolist()
        # The second argument holds the length of each ``batch['token']`` entry.
        prediction = self.id_to_tags(output, [len(x) for x in batch['token']])
        return self.tag_to_span(prediction, batch)

    def tag_to_span(self, batch_tags, batch: dict):
        """Convert tag sequences to spans, after patching the tags in place.

        Two in-place corrections are applied to ``batch_tags`` before calling ``bmes_to_spans``:
        tags covered by ``batch['custom_words']`` are overwritten so each custom word forms one span,
        and, when ``'token_subtoken_offsets_group'`` is absent from ``batch``, subtokens whose begin
        offset lies before the previous end offset are merged with their predecessor.

        Args:
            batch_tags: One mutable tag list per sample.
            batch: The batch dict providing ``custom_words`` and ``token_subtoken_offsets``.

        Returns:
            A list with the ``bmes_to_spans`` result of every sample.
        """
        spans = []
        if 'custom_words' in batch:
            # Pick the single/middle/end tag names of the active tagging scheme.
            if self.config.tagging_scheme == 'BMES':
                S = 'S'
                M = 'M'
                E = 'E'
            else:
                S = 'B'
                M = 'I'
                E = 'I'
            for tags, custom_words in zip(batch_tags, batch['custom_words']):
                # [batch['raw_token'][0][x[0]:x[1]] for x in subwords]
                if custom_words:
                    for start, end, label in custom_words:
                        if end - start == 1:
                            tags[start] = S
                        else:
                            tags[start] = 'B'
                            tags[end - 1] = E
                            for i in range(start + 1, end - 1):
                                tags[i] = M
                        # Force a new span to begin right after the custom word.
                        if end < len(tags):
                            tags[end] = 'B'
        if 'token_subtoken_offsets_group' not in batch:  # only check prediction on raw text for now
            # Check cases that a single char gets split into multiple subtokens, e.g., ‥ -> . + .
            for tags, subtoken_offsets in zip(batch_tags, batch['token_subtoken_offsets']):
                offset = -1  # BERT produces 'ᄒ', '##ᅡ', '##ᆫ' for '한' and they share the same span
                prev_tag = None
                for i, (tag, (b, e)) in enumerate(zip(tags, subtoken_offsets)):
                    if b < offset:
                        # This subtoken starts inside the previous one: reopen the previous tag
                        # and mark the current position as a continuation.
                        if prev_tag == 'S':
                            tags[i - 1] = 'B'
                        elif prev_tag == 'E':
                            tags[i - 1] = 'M'
                        tags[i] = tag = 'M'
                    offset = e
                    prev_tag = tag
        for tags in batch_tags:
            spans.append(bmes_to_spans(tags))
        return spans

    def write_prediction(self, prediction, batch, output: TextIO):
        """Write each sample's tokens as one space separated line to ``output``."""
        batch_tokens = self.spans_to_tokens(prediction, batch)
        for tokens in batch_tokens:
            output.write(' '.join(tokens))
            output.write('\n')

    @property
    def tokenizer_transform(self):
        """The lazily created ``TransformerSequenceTokenizer`` configured with ``dict_force``."""
        if not self._tokenizer_transform:
            self._tokenizer_transform = TransformerSequenceTokenizer(self.transformer_tokenizer,
                                                                     self.config.token_key,
                                                                     ret_subtokens=True,
                                                                     ret_subtokens_group=True,
                                                                     ret_token_span=False,
                                                                     dict_force=self.dict_force)
        return self._tokenizer_transform

    def spans_to_tokens(self, spans, batch, rebuild_span=False):
        """Materialize token strings from spans and optionally merge them with ``dict_combine``.

        Args:
            spans: Per-sample lists of ``(begin, end)`` subtoken spans.
            batch: The batch dict; ``batch['token_']`` (raw text) is used when present.
            rebuild_span: When truthy, ``spans`` is mutated in place so that spans of combined
                tokens are merged as well.

        Returns:
            A list of token lists, one per sample.
        """
        batch_tokens = []
        dict_combine = self.dict_combine
        raw_text = batch.get('token_', None)  # Use raw text to rebuild the token according to its offset
        for b, (spans_per_sent, sub_tokens) in enumerate(zip(spans, batch[self.config.token_key])):
            if raw_text:  # This will restore iPhone X as a whole
                text = raw_text[b]
                offsets = batch['token_subtoken_offsets'][b]
                tokens = [text[offsets[b][0]:offsets[e - 1][-1]] for b, e in spans_per_sent]
            else:  # This will merge iPhone X into iPhoneX
                tokens = [''.join(sub_tokens[span[0]:span[1]]) for span in spans_per_sent]

            if dict_combine:
                buffer = []
                offset = 0
                # Number of spans removed so far by in-place merging; it maps token indices
                # (which refer to the unmerged ``tokens``) back to indices in ``spans_per_sent``.
                delta = 0
                for start, end, label in dict_combine.tokenize(tokens):
                    if offset < start:
                        buffer.extend(tokens[offset:start])
                    if raw_text:
                        # noinspection PyUnboundLocalVariable
                        combined = text[offsets[spans_per_sent[start - delta][0]][0]:
                                        offsets[spans_per_sent[end - delta - 1][1] - 1][1]]
                    else:
                        combined = ''.join(tokens[start:end])
                    buffer.append(combined)
                    offset = end
                    if rebuild_span:
                        start -= delta
                        end -= delta
                        combined_span = (spans_per_sent[start][0], spans_per_sent[end - 1][1])
                        del spans_per_sent[start:end]
                        delta += end - start - 1
                        spans_per_sent.insert(start, combined_span)
                if offset < len(tokens):
                    buffer.extend(tokens[offset:])
                tokens = buffer
            batch_tokens.append(tokens)
        return batch_tokens

    def generate_prediction_filename(self, tst_data, save_dir):
        """Delegate to the parent after replacing ``.tsv`` with ``.txt`` in ``tst_data``."""
        return super().generate_prediction_filename(tst_data.replace('.tsv', '.txt'), save_dir)

    def prediction_to_human(self, pred, vocab, batch, rebuild_span=False):
        """Convert predicted spans to tokens, or to ``[token, begin, end]`` triples.

        When ``config.output_spans`` is truthy each token is returned together with offsets looked
        up in ``batch['token_subtoken_offsets']``; otherwise plain token lists are returned.
        ``vocab`` is not read here.
        """
        output_spans = self.config.get('output_spans', None)
        # Spans must be rebuilt when they are going to be reported alongside the tokens.
        tokens = self.spans_to_tokens(pred, batch, rebuild_span or output_spans)
        if output_spans:
            subtoken_spans = batch['token_subtoken_offsets']
            results = []
            for toks, offs, subs in zip(tokens, pred, subtoken_spans):
                r = []
                results.append(r)
                for t, (b, e) in zip(toks, offs):
                    r.append([t, subs[b][0], subs[e - 1][-1]])
            return results
        return tokens

    def input_is_flat(self, tokens):
        """An input is flat (a single sample) when it is a ``str``."""
        return isinstance(tokens, str)

    def build_dataset(self, data, **kwargs):
        """Wrap ``data`` in a ``TextTokenizingDataset``."""
        return TextTokenizingDataset(data, **kwargs)

    def last_transform(self):
        """Prepend subtoken tag generation (for the configured scheme) to the parent's last transform."""
        return TransformList(functools.partial(generate_tags_for_subtokens, tagging_scheme=self.config.tagging_scheme),
                             super().last_transform())

    def fit(self, trn_data, dev_data, save_dir, transformer,
            average_subwords=False,
            word_dropout: float = 0.2,
            hidden_dropout=None,
            layer_dropout=0,
            scalar_mix=None,
            grad_norm=5.0,
            transformer_grad_norm=None,
            lr=5e-5,
            eval_trn=True,
            transformer_lr=None,
            transformer_layers=None,
            gradient_accumulation=1,
            adam_epsilon=1e-8,
            weight_decay=0,
            warmup_steps=0.1,
            crf=False,
            reduction='sum',
            batch_size=32,
            sampler_builder: SamplerBuilder = None,
            epochs=30,
            patience=5,
            token_key=None,
            tagging_scheme='BMES',
            delimiter=None,
            max_seq_len=None,
            sent_delimiter=None,
            char_level=False,
            hard_constraint=False,
            transform=None,
            logger=None,
            devices: Union[float, int, List[int]] = None,
            **kwargs):
        """

        Args:
            trn_data: Training set.
            dev_data: Development set.
            save_dir: The directory to save trained component.
            transformer: An identifier of a pre-trained transformer.
            average_subwords: ``True`` to average subword representations.
            word_dropout: Dropout rate to randomly replace a subword with MASK.
            hidden_dropout: Dropout rate applied to hidden states.
            layer_dropout: Randomly zero out hidden states of a transformer layer.
            scalar_mix: Layer attention.
            grad_norm: Gradient norm for clipping.
            transformer_grad_norm: Gradient norm for clipping transformer gradient.
            lr: Learning rate for decoder.
            eval_trn: Presumably whether to evaluate on the training set during training -- TODO confirm
                against the parent ``fit``.
            transformer_lr: Learning rate for encoder.
            transformer_layers: The number of bottom layers to use.
            gradient_accumulation: Number of batches per update.
            adam_epsilon: The epsilon to use in Adam.
            weight_decay: The weight decay to use.
            warmup_steps: The number of warmup steps.
            crf: ``True`` to enable CRF (:cite:`lafferty2001conditional`).
            reduction: The loss reduction used in aggregating losses.
            batch_size: The number of samples in a batch.
            sampler_builder: The builder to build sampler, which will override batch_size.
            epochs: The number of epochs to train.
            patience: The number of patience epochs before early stopping.
            token_key: The key to tokens in dataset.
            tagging_scheme: Either ``BMES`` or ``BI``.
            delimiter: Delimiter between tokens used to split a line in the corpus.
            max_seq_len: Sentences longer than ``max_seq_len`` will be split into shorter ones if possible.
            sent_delimiter: Delimiter between sentences, like period or comma, which indicates a long sentence can
                be split here.
            char_level: Whether the sequence length is measured at char level.
            hard_constraint: Whether to enforce hard length constraint on sentences. If there is no
                ``sent_delimiter`` in a sentence, it will be split at a token anyway.
            transform: An optional transform to be applied to samples. Usually a character normalization
                transform is passed in.
            devices: Devices this component will live on.
            logger: Any :class:`logging.Logger` instance.
            seed: Random seed to reproduce this training. It is not a named parameter of this method, so it
                can only arrive through ``**kwargs``.
            **kwargs: Extra settings, merged with the named arguments and forwarded to the parent ``fit``.

        Returns:
            Best metrics on dev set.
        """
        # ``locals()`` is captured as the very first statement so that it only holds the arguments.
        return super().fit(**merge_locals_kwargs(locals(), kwargs))

    def feed_batch(self, batch: dict):
        """Run the parent forward pass and drop the first and last positions along dimension 1.

        NOTE(review): the trimmed positions are presumably the special begin/end tokens added by the
        transformer tokenizer -- confirm against the parent implementation.
        """
        x, mask = super().feed_batch(batch)
        return x[:, 1:-1, :], mask
class Ontonotes(_Ontonotes):
    """Ontonotes reader whose document iterator reports loading progress."""

    def dataset_document_iterator(self, file_path: str) -> Iterator[List[OntonotesSentence]]:
        """Iterate over a CONLL formatted file and yield one document at a time.

        Documents are yielded regardless of how many document annotations a file contains, which suits
        preprocessed data such as that of the 2012 CONLL Coreference Resolution task.

        Args:
            file_path: Path to the CONLL formatted file.

        Yields:
            The list of sentences of each document.
        """
        reader = TimingFileIterator(file_path)
        pending_rows = []
        sentences: List[OntonotesSentence] = []
        for raw in reader:
            reader.log(f'Loading {os.path.basename(file_path)}')
            stripped = raw.strip()
            if stripped and not stripped.startswith("#"):
                # An annotation row of the sentence currently being collected.
                pending_rows.append(stripped)
                continue
            # A blank or comment line terminates the sentence collected so far.
            if pending_rows:
                sentences.append(self._conll_rows_to_sentence(pending_rows))
                pending_rows = []
            if stripped.startswith("#end document"):
                yield sentences
                sentences = []
        reader.erase()
        # Flush what is left when the file lacks a trailing '#end document' marker.
        if sentences:
            yield sentences


class CONLL12CorefDataset(TransformableDataset):
    """Coreference dataset read from CONLL-2012 formatted files."""

    def __init__(self,
                 data: Union[str, List],
                 transform: Union[Callable, List] = None,
                 cache=None,
                 max_span_width=10,
                 max_sentences=None,
                 remove_singleton_clusters=False) -> None:
        """Store the instance building options, then let the parent handle ``data``.

        Args:
            data: Forwarded to the parent constructor.
            transform: Forwarded to the parent constructor.
            cache: Forwarded to the parent constructor.
            max_span_width: Forwarded to ``make_coref_instance``.
            max_sentences: Forwarded to ``make_coref_instance``.
            remove_singleton_clusters: Forwarded to ``make_coref_instance``.
        """
        # The options must exist before the parent constructor runs.
        self.max_span_width = max_span_width
        self.max_sentences = max_sentences
        self.remove_singleton_clusters = remove_singleton_clusters
        super().__init__(data, transform, cache)

    def load_file(self, filepath: str):
        """Yield one coreference instance per document found in ``filepath``."""
        reader = Ontonotes()
        for document in reader.dataset_document_iterator(filepath):
            clusters: DefaultDict[int, List[Tuple[int, int]]] = collections.defaultdict(list)
            doc_offset = 0
            for sentence in document:
                # Spans are annotated per sentence, so shift them by the number of tokens seen so far
                # to make them relative to the document.
                for span_id, (start, end) in sentence.coref_spans:
                    clusters[span_id].append((start + doc_offset, end + doc_offset))
                doc_offset += len(sentence.words)
            words_per_sentence = [s.words for s in document]
            yield self.text_to_instance(words_per_sentence, list(clusters.values()))

    def text_to_instance(
            self,  # type: ignore
            sentences: List[List[str]],
            gold_clusters: Optional[List[List[Tuple[int, int]]]] = None,
    ) -> dict:
        """Build an instance through ``make_coref_instance`` using the options stored on this dataset."""
        return make_coref_instance(sentences,
                                   self.max_span_width,
                                   gold_clusters,
                                   self.max_sentences,
                                   self.remove_singleton_clusters)
class SentenceBoundaryDetectionDataset(TransformableDataset):

    def __init__(self,
                 data: Union[str, List],
                 transform: Union[Callable, List] = None,
                 cache=None,
                 append_after_sentence=None,
                 eos_chars=None,
                 eos_char_min_freq=200,
                 eos_char_is_punct=True,
                 window_size=5,
                 **kwargs,
                 ) -> None:
        """Dataset for sentence boundary detection (eos).

        Args:
            data: The local or remote path to a dataset, or a list of samples where each sample is a dict.
            transform: Predefined transform(s).
            cache: ``True`` to enable caching, so that transforms won't be called twice.
            append_after_sentence: A :class:`str` to insert at the tail of each sentence. For example, English
                always has a space between sentences.
            eos_chars: Punctuations at the tail of sentences. If ``None``, then it will be built from training
                samples.
            eos_char_min_freq: Minimal frequency to keep an eos char.
            eos_char_is_punct: Limit eos chars to punctuations.
            window_size: Window size to extract ngram features.
            kwargs: Not used.
        """
        self.eos_char_is_punct = eos_char_is_punct
        self.append_after_sentence = append_after_sentence
        self.window_size = window_size
        self.eos_chars = eos_chars
        self.eos_char_min_freq = eos_char_min_freq
        super().__init__(data, transform, cache)

    def load_file(self, filepath: str):
        """Load eos corpus.

        Every non-blank line is one sentence. All sentences are concatenated into a single character
        sequence and one sample is yielded for each occurrence of an eos character. The sample holds
        the characters within ``window_size`` positions on either side (fewer near either end of the
        corpus) and a label telling whether the occurrence really ends a sentence.

        Args:
            filepath: Path to the corpus.

        Yields:
            ``{'char': window, 'label_id': 1.0 or 0.0}`` dicts.

        .. highlight:: bash
        .. code-block:: bash

            $ head -n 2 ctb8.txt
            中国经济简讯
            新华社北京十月二十九日电中国经济简讯

        """
        f = TimingFileIterator(filepath)
        sents = []
        eos_offsets = []
        offset = 0
        for line in f:
            if not line.strip():
                continue
            line = line.rstrip('\n')
            # The gold boundary is the last non-whitespace character of the line.
            eos_offsets.append(offset + len(line.rstrip()) - 1)
            offset += len(line)
            if self.append_after_sentence:
                line += self.append_after_sentence
                offset += len(self.append_after_sentence)
            f.log(line)
            sents.append(line)
        f.erase()
        corpus = list(itertools.chain.from_iterable(sents))

        if self.eos_chars:
            if not isinstance(self.eos_chars, set):
                self.eos_chars = set(self.eos_chars)
        else:
            # Learn the eos characters from the characters observed at gold boundaries.
            eos_chars = Counter()
            for i in eos_offsets:
                eos_chars[corpus[i]] += 1
            self.eos_chars = set(k for (k, v) in eos_chars.most_common() if
                                 v >= self.eos_char_min_freq and (not self.eos_char_is_punct or ispunct(k)))
            cprint(f'eos_chars = [yellow]{self.eos_chars}[/yellow]')

        eos_index = 0
        # Only boundaries whose character is a known eos character can ever be labeled positive.
        eos_offsets = [i for i in eos_offsets if corpus[i] in self.eos_chars]
        num_eos = len(eos_offsets)
        window_size = self.window_size
        for i, c in enumerate(corpus):
            if c in self.eos_chars:
                # Clamp the left edge: a negative start would be interpreted relative to the end of
                # the corpus and produce an empty or unrelated window near the beginning.
                window = corpus[max(0, i - window_size): i + window_size + 1]
                # Guard the lookup: eos characters may still occur after the last gold boundary.
                is_boundary = eos_index < num_eos and eos_offsets[eos_index] == i
                label_id = 1. if is_boundary else 0.
                if label_id > 0:
                    eos_index += 1
                yield {'char': window, 'label_id': label_id}
        assert eos_index == len(eos_offsets), f'{eos_index} != {len(eos_offsets)}'
class LanguageModelDataset(TransformSequentialDataset):
    """Language modeling dataset backed by a binary cache of token ids.

    The text file is tokenized, terminated with an EOS token and mapped to ids; ids are stored as
    4-byte little-endian integers in a cache file which ``__iter__`` reads chunk by chunk.
    """

    def __init__(self,
                 data: str,
                 batch_size,
                 seq_len,
                 tokenizer='char',
                 eos='\n',
                 strip=True,
                 vocab=None,
                 cache=False,
                 transform: Union[Callable, List] = None) -> None:
        """Build the transform pipeline and create (or reuse) the id cache for ``data``.

        Args:
            data: Identifier of the corpus; it is resolved through ``get_resource``.
            batch_size: Number of parallel sequences in every batch.
            seq_len: Either an ``int`` or a callable returning the length to use for the next batch.
            tokenizer: ``'char'`` or ``'whitespace'`` when given as a string. Any non-string value
                adds no tokenizer transform here.
            eos: The token appended after each line.
            strip: Whether to strip each line before processing.
            vocab: An existing vocabulary; a new one is created when ``None``.
            cache: Passed negated as the second argument of ``file_cache`` in :meth:`load_file`.
            transform: Transform(s) handed to the parent constructor.
        """
        self.cache = cache
        self.eos = eos
        self.strip = strip
        super().__init__(transform)
        if isinstance(tokenizer, str):
            available_tokenizers = {
                'char': ToChar('text', 'token'),
                'whitespace': WhitespaceTokenizer('text', 'token')
            }
            assert tokenizer in available_tokenizers, f'{tokenizer} not supported, available options: {available_tokenizers.keys()} '
            self.append_transform(available_tokenizers[tokenizer])

        if vocab is None:
            vocab = Vocab()
            self.training = True
        else:
            # A mutable vocabulary is taken as the sign of a training set.
            self.training = vocab.mutable
        self.append_transform(AppendEOS('token', eos=eos))
        # Must stay the last transform: the ``vocab`` property reads ``self.transform[-1].vocab``.
        self.append_transform(FieldToIndex('token', vocab))
        self.batch_size = batch_size
        data = get_resource(data)
        self.data = data
        self.num_tokens = None
        self.load_file(data)
        self._fp = None
        # Normalize ``seq_len`` to a zero-argument callable.
        if isinstance(seq_len, int):
            self.seq_len = lambda: seq_len
        else:
            self.seq_len = seq_len

    @property
    def vocab(self):
        """The vocabulary held by the last transform."""
        return self.transform[-1].vocab

    @property
    def vocab_path(self):
        """Path of the vocabulary file: the data path with its extension replaced by ``.vocab.json``."""
        return os.path.splitext(self.data)[0] + '.vocab.json'

    def load_file(self, filepath):
        """Create the binary id cache for ``filepath`` or reuse a valid one, and set ``num_tokens``."""
        cache, valid = file_cache(filepath, not self.cache)
        if not valid or (self.vocab.mutable and not os.path.isfile(self.vocab_path)):
            with open(cache, 'wb') as out:
                tokens, lines = 0, 0
                f = TimingFileIterator(filepath)
                for line in f:
                    if self.strip:
                        line = line.strip()
                    if not line:
                        continue
                    sample = {'text': line}
                    sample = self.transform_sample(sample, inplace=True)
                    # Each id occupies exactly 4 little-endian bytes in the cache.
                    for id in sample['token_id']:
                        out.write((id).to_bytes(4, 'little'))
                    tokens += len(sample['token_id'])
                    lines += 1
                    f.log(f'{tokens // 1000000}M tokens, {lines // 1000000}M lines\n'
                          f'{sample["token"][:10]}')
                f.erase()
                if self.vocab.mutable:
                    self.vocab.lock()
                    # NOTE(review): only the path is passed here and no vocabulary object; verify
                    # against the signature of ``hanlp_common.io.save_json``.
                    hanlp_common.io.save_json(self.vocab_path)
                self.num_tokens = tokens
        else:
            # 4 bytes per id, see the writer above.
            self.num_tokens = int(os.path.getsize(self.filecache) / 4)
            if self.vocab.mutable:
                # NOTE(review): the loaded value is discarded, so the vocabulary is not updated by
                # this call -- confirm whether that is intended.
                hanlp_common.io.load_json(self.vocab_path)

    def __iter__(self):
        """Yield ``(data, targets)`` tensor pairs read from the cache.

        The cache is treated as ``batch_size`` consecutive rows of ``max_seq_len`` ids. For every
        step a chunk of ``seq_len + 1`` ids is read from each row; ``data`` holds the first
        ``seq_len`` positions and ``targets`` the same chunk shifted by one position, flattened.
        """
        batch_size = self.batch_size
        max_seq_len = self.max_seq_len
        i = 0
        safety = 2 if self.training else 1
        with open(self.filecache, 'rb') as fp:
            while i < max_seq_len - safety:
                seq_len = self.seq_len()
                # Never read past the end of a row; one extra id is needed for the targets.
                seq_len = min(seq_len, max_seq_len - 1 - i)
                data = []
                for j in range(batch_size):
                    data.append(self._read_chunk(fp, max_seq_len * j + i, seq_len + 1))
                data = torch.LongTensor(data)
                # After the transpose the first dimension indexes positions, the second rows.
                data.transpose_(0, 1)
                data, targets = data[:seq_len, :], data[1:, :]
                yield data, targets.contiguous().view(-1)
                i += seq_len

    def estimate_num_batches(self, seq_len=None):
        """Estimate the number of batches as ``max_seq_len // seq_len``.

        Args:
            seq_len: Length to assume; when falsy, one value is drawn from ``self.seq_len()``.
        """
        if not seq_len:
            seq_len = self.seq_len()
        return self.max_seq_len // seq_len

    @property
    def max_seq_len(self):
        """Number of ids available to each of the ``batch_size`` rows."""
        max_seq_len = self.num_tokens // self.batch_size
        return max_seq_len

    @staticmethod
    def _read_chunk(fp, offset, length):
        """Read ``length`` ids from ``fp`` starting at id index ``offset`` (4 bytes per id)."""
        data = []
        fp.seek(offset * 4)
        for i in range(length):
            id = int.from_bytes(fp.read(4), 'little')
            data.append(id)
        return data

    def _debug_load_cache(self):
        """Load the first ``num_tokens`` ids of the cache into a ``LongTensor`` (debugging aid)."""
        with open(self.filecache, 'rb') as src:
            ids = []
            for i in range(self.num_tokens):
                id = int.from_bytes(src.read(4), 'little')
                ids.append(id)
            return torch.LongTensor(ids)

    @property
    def filecache(self):
        """Path of the cache file, i.e. the first element returned by ``file_cache(self.data)``."""
        return file_cache(self.data)[0]
class JsonNERDataset(TransformableDataset):

    def __init__(self,
                 data: Union[str, List],
                 transform: Union[Callable, List] = None,
                 cache=None,
                 generate_idx=None,
                 doc_level_offset=True,
                 tagset=None) -> None:
        """A dataset for ``.jsonlines`` format NER corpora.

        Args:
            data: The local or remote path to a dataset, or a list of samples where each sample is a dict.
            transform: Predefined transform(s).
            cache: ``True`` to enable caching, so that transforms won't be called twice.
            generate_idx: Create a :const:`~hanlp_common.constants.IDX` field for each sample to store its order
                in dataset. Useful for prediction when samples are re-ordered by a sampler.
            doc_level_offset: ``True`` to indicate the offsets in ``jsonlines`` are of document level.
            tagset: Optional tagset to prune entities outside of this tagset from datasets. When it is a
                ``dict``, kept labels are additionally replaced by their mapped values.
        """
        self.doc_level_offset = doc_level_offset
        self.tagset = tagset
        super().__init__(data, transform, cache, generate_idx)

    def load_file(self, filepath: str):
        """Load ``.jsonlines`` NER corpus. Samples of this corpus can be found using the following scripts.

        .. highlight:: python
        .. code-block:: python

            import json
            from hanlp_common.document import Document
            from hanlp.datasets.srl.ontonotes5.chinese import ONTONOTES5_CONLL12_CHINESE_DEV
            from hanlp.utils.io_util import get_resource

            with open(get_resource(ONTONOTES5_CONLL12_CHINESE_DEV)) as src:
                for line in src:
                    doc = json.loads(line)
                    print(Document(doc))
                    break

        Args:
            filepath: ``.jsonlines`` NER corpus.

        Yields:
            One ``{'token': ..., 'ner': ...}`` dict per sentence, where ``ner`` holds
            ``(begin, end, label)`` tuples with at most one entry per ``(begin, end)`` pair.
        """
        filename = os.path.basename(filepath)
        reader = TimingFileIterator(filepath)
        num_docs = 0
        num_sentences = 0
        for raw in reader:
            raw = raw.strip()
            if not raw:
                continue
            doc = json.loads(raw)
            num_docs += 1
            num_tokens_in_doc = 0
            for sentence, entities in zip(doc['sentences'], doc['ner']):
                if self.doc_level_offset:
                    # Turn document level offsets into sentence level ones.
                    entities = [(ent[0] - num_tokens_in_doc, ent[1] - num_tokens_in_doc, ent[2])
                                for ent in entities]
                else:
                    entities = [(ent[0], ent[1], ent[2]) for ent in entities]
                if self.tagset:
                    entities = [ent for ent in entities if ent[2] in self.tagset]
                    if isinstance(self.tagset, dict):
                        entities = [(ent[0], ent[1], self.tagset[ent[2]]) for ent in entities]
                # Keep only the first entity of every (begin, end) pair.
                seen_spans = set()
                unique_entities = []
                for begin, end, label in entities:
                    if (begin, end) in seen_spans:
                        continue
                    seen_spans.add((begin, end))
                    unique_entities.append((begin, end, label))
                yield {
                    'token': sentence,
                    'ner': unique_entities
                }
                num_sentences += 1
                num_tokens_in_doc += len(sentence)
            reader.log(
                f'{filename} {num_docs} documents, {num_sentences} sentences [blink][yellow]...[/yellow][/blink]')
        reader.erase()
def unpack_ner(sample: dict) -> dict:
    """Split ``sample['ner']`` into parallel ``begin_offset``, ``end_offset`` and ``label`` fields.

    Args:
        sample: A sample dict; it is modified in place. Nothing is added when it has no ``'ner'`` key
            or the value is ``None``.

    Returns:
        The same ``sample`` object.
    """
    ner: list = sample.get('ner', None)
    if ner is not None:
        if ner:
            sample['begin_offset'], sample['end_offset'], sample['label'] = zip(*ner)
        else:
            # It's necessary to create a null label when there is no NER in the sentence for the sake of padding.
            sample['begin_offset'], sample['end_offset'], sample['label'] = [0], [0], [NULL]
    return sample


def prune_ner_tagset(sample: dict, tagset: Union[set, Dict[str, str]]):
    """Replace tags whose entity type is outside ``tagset`` with ``'O'``.

    Tags are split once on ``'-'`` into a role prefix and an entity type. Tags without a ``'-'``
    are kept untouched. When ``tagset`` is a ``dict``, the entity type of every kept tag is replaced
    by its mapped value.

    Args:
        sample: A sample dict; ``sample['tag']`` is replaced by a new list when the key exists.
        tagset: The entity types to keep, or a mapping from kept types to their new names.

    Returns:
        The same ``sample`` object.
    """
    if 'tag' not in sample:
        return sample
    # The kind of ``tagset`` cannot change while looping, so decide once whether to rename.
    rename = isinstance(tagset, dict)
    pruned_tag = []
    for tag in sample['tag']:
        cells = tag.split('-', 1)
        if len(cells) == 2:
            role, ner_type = cells
            if ner_type not in tagset:
                tag = 'O'
            elif rename:
                tag = role + '-' + tagset[ner_type]
        pruned_tag.append(tag)
    sample['tag'] = pruned_tag
    return sample
def load_file(self, filepath):
    """Load a ``.tsv`` tagging file and yield one sample per (possibly shortened) sentence.

    A ``.tsv`` file for tagging is a tab separated text file in which each non-empty line carries a
    token and its tag, and an empty line ends the sentence.

    .. highlight:: bash
    .. code-block:: bash

        $ head eng.train.tsv
        -DOCSTART-      O

        EU      S-ORG
        rejects O
        German  S-MISC
        call    O
        to      O
        boycott O
        British S-MISC
        lamb    O

    Args:
        filepath: Path to a ``.tsv`` tagging file.

    Yields:
        A dict with ``token`` and ``tag`` lists. When ``self.max_seq_len`` is set, each sentence is
        first cut by ``split_long_sentence_into`` and one sample is yielded per piece, with the tags
        sliced to match.
    """
    filepath = get_resource(filepath)
    for words, tags in generate_words_tags_from_tsv(filepath, lower=False):
        if not self.max_seq_len:
            yield {'token': words, 'tag': tags}
            continue
        pieces = split_long_sentence_into(words, self.max_seq_len, self.sent_delimiter,
                                          char_level=self.char_level,
                                          hard_constraint=self.hard_constraint)
        # Walk the tag list in step with the pieces so each piece gets its own tags.
        offset = 0
        for piece in pieces:
            stop = offset + len(piece)
            yield {'token': piece, 'tag': tags[offset:stop]}
            offset = stop
128 tokens) MSRA (:cite:`levow-2006-third`) in token level.''' MSRA_NER_TOKEN_LEVEL_SHORT_IOBES_DEV = f'{_MSRA_NER_TOKEN_LEVEL_HOME}#word_level.dev.short.tsv' '''Dev set of shorten (<= 128 tokens) MSRA (:cite:`levow-2006-third`) in token level.''' MSRA_NER_TOKEN_LEVEL_SHORT_IOBES_TEST = f'{_MSRA_NER_TOKEN_LEVEL_HOME}#word_level.test.short.tsv' '''Test set of shorten (<= 128 tokens) MSRA (:cite:`levow-2006-third`) in token level.''' MSRA_NER_TOKEN_LEVEL_SHORT_JSON_TRAIN = f'{_MSRA_NER_TOKEN_LEVEL_HOME}#word_level.train.short.jsonlines' '''Training set of shorten (<= 128 tokens) MSRA (:cite:`levow-2006-third`) in token level and jsonlines format.''' MSRA_NER_TOKEN_LEVEL_SHORT_JSON_DEV = f'{_MSRA_NER_TOKEN_LEVEL_HOME}#word_level.dev.short.jsonlines' '''Dev set of shorten (<= 128 tokens) MSRA (:cite:`levow-2006-third`) in token level and jsonlines format.''' MSRA_NER_TOKEN_LEVEL_SHORT_JSON_TEST = f'{_MSRA_NER_TOKEN_LEVEL_HOME}#word_level.test.short.jsonlines' '''Test set of shorten (<= 128 tokens) MSRA (:cite:`levow-2006-third`) in token level and jsonlines format.''' ================================================ FILE: hanlp/datasets/ner/resume.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2020-06-08 12:10 from hanlp.common.dataset import TransformableDataset from hanlp.utils.io_util import get_resource, generate_words_tags_from_tsv _RESUME_NER_HOME = 'https://github.com/jiesutd/LatticeLSTM/archive/master.zip#' RESUME_NER_TRAIN = _RESUME_NER_HOME + 'ResumeNER/train.char.bmes' '''Training set of Resume in char level.''' RESUME_NER_DEV = _RESUME_NER_HOME + 'ResumeNER/dev.char.bmes' '''Dev set of Resume in char level.''' RESUME_NER_TEST = _RESUME_NER_HOME + 'ResumeNER/test.char.bmes' '''Test set of Resume in char level.''' ================================================ FILE: hanlp/datasets/ner/weibo.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2020-06-03 23:33 
class AbstractMeaningRepresentationDataset(TransformableDataset):
    """Dataset whose samples are the 5-tuples produced by ``AMRIO.read``."""

    def load_file(self, filepath: str):
        """Yield one sample dict per record read from ``filepath``.

        Args:
            filepath: Path handed to ``AMRIO.read``.

        Yields:
            A dict with the keys ``token``, ``lemma``, ``pos``, ``ner`` and ``amr``.
        """
        for token, lemma, pos, ner, amr in AMRIO.read(filepath):
            yield dict(token=token, lemma=lemma, pos=pos, ner=ner, amr=amr)
def get_concepts(sample: dict, vocab: VocabWithFrequency = None, rel_vocab: VocabWithFrequency = None):
    """Attach copy candidates and a sample-local vocabulary extension to ``sample``.

    For every lemma (paired with a token, so the shorter of the two lists bounds the length) two
    candidates are produced: the lemma followed by ``_`` (``cp_seq``) and the lemma itself (``mp_seq``).
    Candidates that ``vocab`` maps to its unknown index receive fresh indices starting at ``len(vocab)``.
    When ``rel_vocab`` is given, all of its tokens are numbered first, ahead of the sorted unknown
    candidates.

    Args:
        sample: A sample with ``lemma`` and ``token`` fields. It is modified in place.
        vocab: Vocabulary used to detect unknown candidates. Despite the ``None`` default it is
            required, since ``len(vocab)`` is always evaluated.
        rel_vocab: Optional vocabulary whose ``idx_to_token`` list is prepended to the local extension.

    Returns:
        The same ``sample``, now carrying ``cp_seq``, ``mp_seq``, ``token2idx`` and ``idx2token``.
    """
    mp_seq = [lemma for lemma, _ in zip(sample['lemma'], sample['token'])]
    cp_seq = [lemma + '_' for lemma in mp_seq]

    unknown = set()
    for candidates in zip(cp_seq, mp_seq):
        for candidate in candidates:
            if vocab.get_idx(candidate) == vocab.unk_idx:
                unknown.add(candidate)

    first_free = len(vocab)
    extension = sorted(unknown)
    if rel_vocab:
        extension = rel_vocab.idx_to_token + extension

    token2idx, idx2token = {}, {}
    for idx, token in enumerate(extension, first_free):
        token2idx[token] = idx
        idx2token[idx] = token

    sample['cp_seq'] = cp_seq
    sample['mp_seq'] = mp_seq
    sample['token2idx'] = token2idx
    sample['idx2token'] = idx2token
    return sample
vocabs['predictable_concept'], local_token2idx) _mp_seq = list_to_tensor(data['mp_seq'], vocabs['predictable_concept'], local_token2idx) ret = copy(data) if 'amr' in data: concept, edge = [], [] for amr in data['amr']: if levi_graph == 'kahn': concept_i, edge_i = amr.to_levi(rel_vocab.get_frequency, shuffle=shuffle_sibling) else: concept_i, edge_i, _ = amr.root_centered_sort(rel_vocab.get_frequency, shuffle=shuffle_sibling) concept.append(concept_i) edge.append(edge_i) if levi_graph is True: concept_with_rel, edge_with_rel = levi_amr(concept, edge, extra_arc=extra_arc) concept = concept_with_rel edge = edge_with_rel augmented_concept = [[DUM] + x + [END] for x in concept] _concept_in = list_to_tensor(augmented_concept, vocabs.get('concept_and_rel', vocabs['concept']), unk_rate=unk_rate)[:-1] _concept_char_in = lists_of_string_to_tensor(augmented_concept, vocabs['concept_char'])[:-1] _concept_out = list_to_tensor(augmented_concept, vocabs['predictable_concept'], local_token2idx)[1:] out_conc_len, bsz = _concept_out.shape _rel = np.full((1 + out_conc_len, bsz, out_conc_len), rel_vocab.pad_idx) # v: [, concept_0, ..., concept_l, ..., concept_{n-1}, ] u: [, concept_0, ..., concept_l, ..., concept_{n-1}] for bidx, (x, y) in enumerate(zip(edge, concept)): for l, _ in enumerate(y): if l > 0: # l=1 => pos=l+1=2 _rel[l + 1, bidx, 1:l + 1] = rel_vocab.get_idx(NIL) for v, u, r in x: if levi_graph: r = 1 else: r = rel_vocab.get_idx(r) assert v > u, 'Invalid typological order' _rel[v + 1, bidx, u + 1] = r ret.update( {'concept_in': _concept_in, 'concept_char_in': _concept_char_in, 'concept_out': _concept_out, 'rel': _rel}) else: augmented_concept = None token_length = ret.get('token_length', None) if token_length is not None and not isinstance(token_length, torch.Tensor): ret['token_length'] = torch.tensor(token_length, dtype=torch.long, device=device if ( isinstance(device, torch.device) or device >= 0) else 'cpu:0') ret.update({'lem': _lem, 'tok': _tok, 'pos': _pos, 'ner': 
def move_dict_to_device(ret, device):
    """Convert the array-like values of ``ret`` into contiguous tensors on ``device``, in place.

    ``numpy`` arrays are turned into tensors created on the target device, existing tensors are moved
    there, and every other value is left as it is.

    Args:
        ret: The dict to update in place.
        device: Target device. The value ``-1`` is interpreted as ``'cpu:0'``.
    """
    target = 'cpu:0' if device == -1 else device
    # Iterate over a snapshot; only existing keys are reassigned.
    for key, value in list(ret.items()):
        if isinstance(value, np.ndarray):
            ret[key] = torch.tensor(value, device=target).contiguous()
        elif isinstance(value, torch.Tensor):
            ret[key] = value.to(target).contiguous()
def linearize(concept: List, edge: List, label='', prefix=REL, extra_arc=False):
    """Turn labelled edges into extra relation nodes placed right after their head concept.

    For every edge ``(v, u, r)`` with ``u < v``, a node ``prefix + r`` is inserted after concept ``v``
    and the edge is replaced by two arcs, ``v -> relation node`` and ``relation node -> u``, both
    carrying ``label``.

    Args:
        concept: The concept sequence.
        edge: ``(v, u, r)`` triples indexing into ``concept``. If a ``(v, u)`` pair occurs more than
            once, the last relation wins.
        label: Label put on every generated arc.
        prefix: String prepended to a relation to form its node name.
        extra_arc: Whether to also emit a direct ``v -> u`` arc for each edge.

    Returns:
        A tuple of the new node sequence and the arcs over it.
    """
    # head -> {dependent: relation}
    relations = defaultdict(dict)
    for head, dependent, relation in edge:
        relations[head][dependent] = relation

    nodes = []
    position = dict()  # index in ``concept`` -> index in ``nodes``
    for idx, name in enumerate(concept):
        position[idx] = len(nodes)
        nodes.append(name)
        nodes.extend(prefix + relation for dependent, relation in relations[idx].items() if dependent < idx)
    for old, new in position.items():
        assert concept[old] == nodes[new]

    arcs = []
    for idx, _ in enumerate(concept):
        # NOTE(review): ``rank`` counts every dependent of ``idx``, so the relation node offset is
        # only right when all of them satisfy ``dependent < idx`` - confirm callers guarantee this.
        for rank, dependent in enumerate(relations[idx]):
            if dependent < idx:
                source = position[idx]
                target = position[dependent]
                middle = source + rank + 1
                arcs.append((source, middle, label))
                arcs.append((middle, target, label))
                if extra_arc:
                    arcs.append((source, target, label))
    return nodes, arcs
def remove_unconnected_components(concept: List, edge: List):
    """Keep only the largest connected component of a graph.

    Components are computed by ``scipy.sparse.csgraph.connected_components`` with ``directed=True``
    and its default connection type. When the graph has more than one component, the nodes outside
    the largest one are dropped and the remaining nodes are renumbered consecutively. When two
    components have the same size, the one with the larger component label is kept.

    Args:
        concept: Node names; a node's id is its position in this list.
        edge: ``(v, u, r)`` triples where ``v`` and ``u`` are node ids and ``r`` is the relation.

    Returns:
        A ``(concept, edge)`` tuple. The inputs are returned unchanged when the graph is empty or
        already forms a single component.
    """
    # Imported lazily so that scipy is only required when this function is used.
    from scipy.sparse import csr_matrix
    from scipy.sparse.csgraph import connected_components
    if not concept:
        # Nothing to prune, and a 0x0 adjacency matrix is not worth building.
        return concept, edge
    # ``np.int`` was removed in NumPy 1.24; the builtin ``int`` is the dtype it used to alias.
    row = np.array([x[0] for x in edge], dtype=int)
    col = np.array([x[1] for x in edge], dtype=int)
    data = np.ones(len(row), dtype=int)
    graph = csr_matrix((data, (row, col)), shape=(len(concept), len(concept)))
    n_components, labels = connected_components(csgraph=graph, directed=True, return_labels=True)
    if n_components > 1:
        unique, counts = np.unique(labels, return_counts=True)
        # Compare (size, label) pairs so that the biggest component wins.
        largest_component = max(zip(counts, unique))[-1]
        connected_nodes = set(np.where(labels == largest_component)[0])
        reorder = dict()  # old node id -> new node id
        good_concept = []
        for i, c in enumerate(concept):
            if i in connected_nodes:
                reorder[i] = len(good_concept)
                good_concept.append(c)
        good_edge = [(reorder[v], reorder[u], r) for v, u, r in edge
                     if v in connected_nodes and u in connected_nodes]
        concept, edge = good_concept, good_edge
    return concept, edge
def un_kahn(concept, edge):
    """Collapse ``concept -> relation node -> concept`` paths back into labelled edges.

    Example input::

        (['want', 'rel=ARG1', 'rel=ARG0', 'believe', 'rel=ARG1', 'rel=ARG0', 'boy', 'girl'],
         [(0, 1, 0.9999417066574097), (0, 2, 0.9999995231628418), (1, 3, 0.9999992847442627),
          (3, 4, 1.0), (3, 5, 0.9999996423721313), (2, 6, 0.9996106624603271),
          (4, 6, 0.9999767541885376), (5, 7, 0.9999860525131226)])

    An arc ``(a, b, p1)`` whose source is not a relation node is joined with every later arc
    ``(b, d, p2)``. For each ``(a, d)`` pair, the intermediate node with the highest ``p1 * p2`` is
    kept, and only if that product is positive.

    Args:
        concept: Node names, with relation nodes marked by the ``REL`` prefix.
        edge: ``(source, target, score)`` arcs over ``concept``.

    Returns:
        A tuple of the concepts without relation nodes and ``(v, u, r)`` triples over them, where
        ``v`` is the renumbered end of the path, ``u`` its renumbered start and ``r`` the relation
        name with ``REL`` stripped.
    """
    real_concept, reorder = separate_concept_rel(concept)
    best = dict()  # (start, end) -> (intermediate node, score)
    for pos, (start, middle, score_in) in enumerate(edge):
        if concept[start].startswith(REL):
            continue
        # Only arcs listed after the current one can continue the path.
        for source, end, score_out in edge[pos + 1:]:
            if middle == source:
                score = score_in * score_out
                _, current = best.get((start, end), (None, 0))
                if score > current:
                    best[(start, end)] = (middle, score)
    real_edge = []
    for (start, end), (middle, _) in best.items():
        u = reorder[start]
        relation = concept[middle][len(REL):]
        v = reorder[end]
        real_edge.append((v, u, relation))
    return real_concept, real_edge
_CTB7_HOME + 'test.conll' '''Test set for ctb7 dependency parsing.''' ================================================ FILE: hanlp/datasets/parsing/ctb8.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2020-10-14 20:54 from hanlp.datasets.parsing.loaders._ctb_utils import make_ctb _CTB8_HOME = 'https://wakespace.lib.wfu.edu/bitstream/handle/10339/39379/LDC2013T21.tgz#data/' CTB8_CWS_TRAIN = _CTB8_HOME + 'tasks/cws/train.txt' '''Training set for ctb8 Chinese word segmentation.''' CTB8_CWS_DEV = _CTB8_HOME + 'tasks/cws/dev.txt' '''Dev set for ctb8 Chinese word segmentation.''' CTB8_CWS_TEST = _CTB8_HOME + 'tasks/cws/test.txt' '''Test set for ctb8 Chinese word segmentation.''' CTB8_POS_TRAIN = _CTB8_HOME + 'tasks/pos/train.tsv' '''Training set for ctb8 PoS tagging.''' CTB8_POS_DEV = _CTB8_HOME + 'tasks/pos/dev.tsv' '''Dev set for ctb8 PoS tagging.''' CTB8_POS_TEST = _CTB8_HOME + 'tasks/pos/test.tsv' '''Test set for ctb8 PoS tagging.''' CTB8_BRACKET_LINE_TRAIN = _CTB8_HOME + 'tasks/par/train.txt' '''Training set for ctb8 constituency parsing with empty categories.''' CTB8_BRACKET_LINE_DEV = _CTB8_HOME + 'tasks/par/dev.txt' '''Dev set for ctb8 constituency parsing with empty categories.''' CTB8_BRACKET_LINE_TEST = _CTB8_HOME + 'tasks/par/test.txt' '''Test set for ctb8 constituency parsing with empty categories.''' CTB8_BRACKET_LINE_NOEC_TRAIN = _CTB8_HOME + 'tasks/par/train.noempty.txt' '''Training set for ctb8 constituency parsing without empty categories.''' CTB8_BRACKET_LINE_NOEC_DEV = _CTB8_HOME + 'tasks/par/dev.noempty.txt' '''Dev set for ctb8 constituency parsing without empty categories.''' CTB8_BRACKET_LINE_NOEC_TEST = _CTB8_HOME + 'tasks/par/test.noempty.txt' '''Test set for ctb8 constituency parsing without empty categories.''' CTB8_SD330_TRAIN = _CTB8_HOME + 'tasks/dep/train.conllx' '''Training set for ctb8 in Stanford Dependencies 3.3.0 standard.''' CTB8_SD330_DEV = _CTB8_HOME + 'tasks/dep/dev.conllx' '''Dev 
set for ctb8 in Stanford Dependencies 3.3.0 standard.''' CTB8_SD330_TEST = _CTB8_HOME + 'tasks/dep/test.conllx' '''Test set for ctb8 in Stanford Dependencies 3.3.0 standard.''' make_ctb(_CTB8_HOME) ================================================ FILE: hanlp/datasets/parsing/ctb9.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2020-10-14 20:54 from urllib.error import HTTPError from hanlp.datasets.parsing.loaders._ctb_utils import make_ctb from hanlp.utils.io_util import get_resource, path_from_url _CTB9_HOME = 'https://catalog.ldc.upenn.edu/LDC2016T13/ctb9.0_LDC2016T13.tgz#data/' CTB9_CWS_TRAIN = _CTB9_HOME + 'tasks/cws/train.txt' '''Training set for ctb9 Chinese word segmentation.''' CTB9_CWS_DEV = _CTB9_HOME + 'tasks/cws/dev.txt' '''Dev set for ctb9 Chinese word segmentation.''' CTB9_CWS_TEST = _CTB9_HOME + 'tasks/cws/test.txt' '''Test set for ctb9 Chinese word segmentation.''' CTB9_POS_TRAIN = _CTB9_HOME + 'tasks/pos/train.tsv' '''Training set for ctb9 PoS tagging.''' CTB9_POS_DEV = _CTB9_HOME + 'tasks/pos/dev.tsv' '''Dev set for ctb9 PoS tagging.''' CTB9_POS_TEST = _CTB9_HOME + 'tasks/pos/test.tsv' '''Test set for ctb9 PoS tagging.''' CTB9_BRACKET_LINE_TRAIN = _CTB9_HOME + 'tasks/par/train.txt' '''Training set for ctb9 constituency parsing with empty categories.''' CTB9_BRACKET_LINE_DEV = _CTB9_HOME + 'tasks/par/dev.txt' '''Dev set for ctb9 constituency parsing with empty categories.''' CTB9_BRACKET_LINE_TEST = _CTB9_HOME + 'tasks/par/test.txt' '''Test set for ctb9 constituency parsing with empty categories.''' CTB9_BRACKET_LINE_NOEC_TRAIN = _CTB9_HOME + 'tasks/par/train.noempty.txt' '''Training set for ctb9 constituency parsing without empty categories.''' CTB9_BRACKET_LINE_NOEC_DEV = _CTB9_HOME + 'tasks/par/dev.noempty.txt' '''Dev set for ctb9 constituency parsing without empty categories.''' CTB9_BRACKET_LINE_NOEC_TEST = _CTB9_HOME + 'tasks/par/test.noempty.txt' '''Test set for ctb9 constituency parsing 
without empty categories.''' CTB9_SD330_TRAIN = _CTB9_HOME + 'tasks/dep/train.conllx' '''Training set for ctb9 in Stanford Dependencies 3.3.0 standard.''' CTB9_SD330_DEV = _CTB9_HOME + 'tasks/dep/dev.conllx' '''Dev set for ctb9 in Stanford Dependencies 3.3.0 standard.''' CTB9_SD330_TEST = _CTB9_HOME + 'tasks/dep/test.conllx' '''Test set for ctb9 in Stanford Dependencies 3.3.0 standard.''' try: get_resource(_CTB9_HOME) except HTTPError: raise FileNotFoundError( 'Chinese Treebank 9.0 is a copyright dataset owned by LDC which we cannot re-distribute. ' f'Please apply for a licence from LDC (https://catalog.ldc.upenn.edu/LDC2016T13) ' f'then download it to {path_from_url(_CTB9_HOME)}' ) make_ctb(_CTB9_HOME) ================================================ FILE: hanlp/datasets/parsing/loaders/__init__.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2021-12-28 19:04 ================================================ FILE: hanlp/datasets/parsing/loaders/_ctb_utils.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2020-11-25 16:14 import os import shutil import sys from collections import defaultdict from os import listdir from os.path import join, isfile from typing import List from phrasetree.tree import Tree from hanlp.components.parsers.conll import read_conll from hanlp.utils.io_util import get_resource, get_exitcode_stdout_stderr, read_tsv_as_sents, run_cmd, pushd from hanlp.utils.log_util import cprint from hanlp.utils.time_util import CountdownTimer # See Shao et al., 2017 # CTB9_ACADEMIA_SPLITS = { # 'train': ''' # 0044-0143, 0170-0270, 0400-0899, # 1001-1017, 1019, 1021-1035, 1037- # 1043, 1045-1059, 1062-1071, 1073- # 1117, 1120-1131, 1133-1140, 1143- # 1147, 1149-1151, 2000-2915, 4051- # 4099, 4112-4180, 4198-4368, 5000- # 5446, 6000-6560, 7000-7013 # ''', # 'dev': ''' # 0301-0326, 2916-3030, 4100-4106, # 4181-4189, 4369-4390, 5447-5492, # 6561-6630, 7013-7014 # 
''', # 'test': ''' # 0001-0043, 0144-0169, 0271-0301, # 0900-0931, 1018, 1020, 1036, 1044, # 1060, 1061, 1072, 1118, 1119, 1132, # 1141, 1142, 1148, 3031-3145, 4107- # 4111, 4190-4197, 4391-4411, 5493- # 5558, 6631-6700, 7015-7017 # ''' # } # # # def _make_splits(splits: Dict[str, str]): # total = set() # for part, text in list(splits.items()): # if not isinstance(text, str): # continue # lines = text.replace('\n', '').split() # cids = set() # for line in lines: # for each in line.split(','): # each = each.strip() # if not each: # continue # if '-' in each: # start, end = each.split('-') # start, end = map(lambda x: int(x), [start, end]) # cids.update(range(start, end + 1)) # # cids.update(map(lambda x: f'{x:04d}', range(start, end))) # else: # cids.add(int(each)) # cids = set(f'{x:04d}' for x in cids) # assert len(cids & total) == 0, f'Overlap found in {part}' # splits[part] = cids # # return splits # # # _make_splits(CTB9_ACADEMIA_SPLITS) def convert_to_dependency(src, dst, language='zh', version='3.3.0', conllx=True, ud=False): cprint(f'Converting {os.path.basename(src)} to {os.path.basename(dst)} using Stanford Parser Version {version}. 
' f'It might take a while [blink][yellow]...[/yellow][/blink]') if version == '3.3.0': sp_home = 'https://nlp.stanford.edu/software/stanford-parser-full-2013-11-12.zip' elif version == '4.2.0': sp_home = 'https://nlp.stanford.edu/software/stanford-parser-4.2.0.zip' else: raise ValueError(f'Unsupported version {version}') sp_home = get_resource(sp_home) # jar_path = get_resource(f'{sp_home}#stanford-parser.jar') if ud: jclass = 'edu.stanford.nlp.trees.international.pennchinese.UniversalChineseGrammaticalStructure' if language == 'zh' \ else 'edu.stanford.nlp.trees.ud.UniversalDependenciesConverter' else: jclass = 'edu.stanford.nlp.trees.international.pennchinese.ChineseGrammaticalStructure' if language == 'zh' \ else 'edu.stanford.nlp.trees.EnglishGrammaticalStructure' cmd = f'java -cp {sp_home}/* {jclass} ' \ f'-treeFile {src}' if conllx: cmd += ' -conllx' if not ud: cmd += f' -basic -keepPunct' code, out, err = get_exitcode_stdout_stderr(cmd) with open(dst, 'w') as f: f.write(out) if code: raise RuntimeError(f'Conversion failed with code {code} for {src}. The err message is:\n {err}\n' f'Do you have java installed? 
def load_bracketed_trees(chtbs) -> List[Tree]:
    """Parse every bracketed tree found in the given files.

    Each file is read as UTF-8 and cut at blank lines (``'\\n\\n'``). Every non-blank chunk is parsed
    with ``Tree.fromstring``.

    Args:
        chtbs: Paths of the bracketed files to read.

    Returns:
        The parsed trees of all files, in file order and then in order of appearance.
    """
    trees = []
    for f in chtbs:
        with open(f, encoding='utf-8') as src:
            content = src.read()
        # Keep the raw chunks in their own name. Rebinding ``trees`` here would discard the trees of
        # earlier files and make the loop below extend the very list it iterates over.
        chunks = [x for x in content.split('\n\n') if x.strip()]
        trees.extend(Tree.fromstring(chunk) for chunk in chunks)
    return trees
def split_chtb(chtbs: List[str], splits=None):
    """Distribute treebank file names over the train, dev and test parts.

    File names are expected to look like ``<prefix>_<cid>.<domain>.<ext>``. With ``splits``, a file
    goes to the first of ``train``, ``dev`` and ``test`` whose collection contains its ``cid``, and
    is left out when none does. Without ``splits``, names ending in ``8`` go to dev, names ending in
    ``9`` go to test and the rest go to train.

    Args:
        chtbs: File names to distribute.
        splits: Optional mapping from ``train``/``dev``/``test`` to a collection of ``cid`` strings.

    Returns:
        A tuple of three lists: train, dev and test.
    """
    parts = ('train', 'dev', 'test')
    buckets = {part: [] for part in parts}
    for filename in chtbs:
        name, _domain, _ext = filename.split('.', 2)
        _prefix, cid = name.split('_')
        if splits:
            # ``None`` marks a file that belongs to no split; it is silently skipped.
            part = next((p for p in parts if cid in splits[p]), None)
        else:
            part = {'8': 'dev', '9': 'test'}.get(name[-1:], 'train')
        if part is not None:
            buckets[part].append(filename)
    return buckets['train'], buckets['dev'], buckets['test']
def ctb_pos_to_text_format(path, delimiter='_'):
    """Rewrite a tsv pos tagging corpus as plain text, one sentence per line.

    Every word is glued to its tag with ``delimiter`` and the pairs of a sentence are separated by spaces.
    The result is written next to the input, using the same name with a ``.txt`` extension.

    Args:
        path: File to be converted.
        delimiter: Delimiter between word and tag.
    """
    path = get_resource(path)
    stem = os.path.splitext(path)[0]
    with open(f'{stem}.txt', 'w', encoding='utf-8') as out:
        for sent in read_tsv_as_sents(path):
            pairs = [delimiter.join(cells) for cells in sent]
            out.write(' '.join(pairs))
            out.write('\n')
class CoNLLParsingDataset(TransformableDataset):

    def __init__(self,
                 data: Union[str, List],
                 transform: Union[Callable, List] = None,
                 cache=None,
                 generate_idx=None,
                 prune: Callable[[Dict[str, List[str]]], bool] = None) -> None:
        """General class for CoNLL style dependency parsing datasets.

        Args:
            data: The local or remote path to a dataset, or a list of samples where each sample is a dict.
            transform: Predefined transform(s).
            cache: ``True`` to enable caching, so that transforms won't be called twice.
            generate_idx: Create a :const:`~hanlp_common.constants.IDX` field for each sample to store its order
                in dataset. Useful for prediction when samples are re-ordered by a sampler.
            prune: A filter to prune unwanted samples. A sample is dropped when the filter returns a true value.
        """
        # The filter must be stored before the parent constructor runs, mirroring the original ordering.
        self._prune = prune
        super().__init__(data, transform, cache, generate_idx)

    def load_file(self, filepath):
        """Yield one column-oriented sample per sentence of a ``.conllx`` or ``.conllu`` file.

        Each sample maps a field name to the list of that field's values over the tokens of the sentence.
        Descriptions of the fields can be found in :class:`hanlp_common.conll.CoNLLWord` and
        :class:`hanlp_common.conll.CoNLLUWord` respectively.

        Args:
            filepath: ``.conllx`` or ``.conllu`` file path.
        """
        if filepath.endswith('.conllu'):
            # See https://universaldependencies.org/format.html
            columns = ['ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC']
        else:
            columns = ['ID', 'FORM', 'LEMMA', 'CPOS', 'POS', 'FEATS', 'HEAD', 'DEPREL', 'PHEAD', 'PDEPREL']
        reader = TimingFileIterator(filepath)
        for count, sentence in enumerate(read_conll(reader), start=1):
            sample = {column: [cell[k] for cell in sentence] for k, column in enumerate(columns)}
            # Keep the sample unless a prune filter exists and flags it.
            if not (self._prune and self._prune(sample)):
                yield sample
            reader.log(f'{count} samples [blink][yellow]...[/yellow][/blink]')

    def __len__(self) -> int:
        """Return the number of loaded samples."""
        return len(self.data)
def remove_subcategory(sample: dict):
    """Strip everything after the first ``-`` from every label of the sample's constituency tree.

    The tree stored under ``'constituency'`` is modified in place. Samples without such a tree, or whose tree
    is falsy, are returned untouched.

    Args:
        sample: A sample that may hold a tree under the ``'constituency'`` key.

    Returns:
        The same ``sample`` object.
    """
    tree = sample.get('constituency', None)
    if not tree:
        return sample
    for node in tree.subtrees():
        coarse = node.label().split('-')[0]
        node.set_label(coarse)
    return sample
def factorize(tree, delete_labels=None, equal_labels=None):
    r"""
    Factorizes the tree into a sequence of labeled spans, visiting the tree in pre-order.

    Args:
        tree (tree.Tree):
            The tree to be factorized.
        delete_labels (set[str]):
            A set of labels to be ignored. This is used for evaluation.
            If it is a pre-terminal label, delete the word along with the brackets.
            If it is a non-terminal label, just delete the brackets (don't delete children).
            In `EVALB`_, the default set is:
            {'TOP', 'S1', '-NONE-', ',', ':', '``', "''", '.', '?', '!', ''}
            Default: ``None``.
        equal_labels (dict[str, str]):
            The key-val pairs in the dict are considered equivalent (non-directional). This is used for evaluation.
            The default dict defined in `EVALB`_ is: {'ADVP': 'PRT'}
            Default: ``None``.

    Returns:
        The sequence of the factorized tree, as a list of ``(left, right, label)`` tuples.

    Examples:
        >>> tree = Tree.fromstring('''
                                   (TOP
                                     (S
                                       (NP (_ She))
                                       (VP (_ enjoys) (S (VP (_ playing) (NP (_ tennis)))))
                                       (_ .)))
                                   ''')
        >>> factorize(tree)
        [(0, 5, 'TOP'), (0, 5, 'S'), (0, 1, 'NP'), (1, 4, 'VP'), (2, 4, 'S'), (2, 4, 'VP'), (3, 4, 'NP')]
        >>> factorize(tree, delete_labels={'TOP', 'S1', '-NONE-', ',', ':', '``', "''", '.', '?', '!', ''})
        [(0, 5, 'S'), (0, 1, 'NP'), (1, 4, 'VP'), (2, 4, 'S'), (2, 4, 'VP'), (3, 4, 'NP')]

    .. _EVALB:
        https://nlp.cs.nyu.edu/evalb/
    """

    def collect(node, start):
        # Returns the right boundary reached by ``node`` and the spans found beneath (and including) it.
        name = node.label()
        if delete_labels is not None and name in delete_labels:
            name = None
        if equal_labels is not None:
            name = equal_labels.get(name, name)
        is_preterminal = len(node) == 1 and not isinstance(node[0], Tree)
        if is_preterminal:
            # A deleted pre-terminal does not consume a position; a kept one consumes exactly one.
            end = start if name is None else start + 1
            return end, []
        end = start
        found = []
        for child in node:
            if not isinstance(child, Tree):
                continue
            end, child_spans = collect(child, end)
            found.extend(child_spans)
        # Empty spans and deleted labels are not reported; the parent precedes its children.
        if name is not None and end > start:
            found.insert(0, (start, end, name))
        return end, found

    _, spans = collect(tree, 0)
    return spans
Examples: >>> tree = Tree.totree(['She', 'enjoys', 'playing', 'tennis', '.'], 'TOP') >>> sequence = [(0, 5, 'S'), (0, 4, 'S|<>'), (0, 1, 'NP'), (1, 4, 'VP'), (1, 2, 'VP|<>'), (2, 4, 'S+VP'), (2, 3, 'VP|<>'), (3, 4, 'NP'), (4, 5, 'S|<>')] >>> print(Tree.build_tree(root, sequence)) (TOP (S (NP (_ She)) (VP (_ enjoys) (S (VP (_ playing) (NP (_ tennis))))) (_ .))) """ if not tokens: # User passed in [], which is the tokenized result of '' return Tree('TOP', []) tree = Tree('TOP', [Tree('_', [t]) for t in tokens]) root = tree.label() leaves = [subtree for subtree in tree.subtrees() if not isinstance(subtree[0], Tree)] def track(node): i, j, label = next(node) if j == i + 1: children = [leaves[i]] else: children = track(node) + track(node) if label.endswith('|<>'): return children labels = label.split('+') tree = Tree(labels[-1], children) for label in reversed(labels[:-1]): tree = Tree(label, [tree]) return [tree] return Tree(root, track(iter(sequence))) ================================================ FILE: hanlp/datasets/parsing/pmt1.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2022-02-15 04:14 import os.path from hanlp.utils.io_util import get_resource from hanlp.utils.log_util import cprint from hanlp_common.conll import CoNLLSentence, CoNLLWord _HOME = 'https://github.com/qiulikun/PKUMultiviewTreebank/archive/refs/heads/master.zip' PTM_V1_RAW = _HOME + '#199801_dependency_treebank_2014pos.txt' PTM_V1_TRAIN = _HOME + '#train.conllx' 'The training set of PKU Multi-view Chinese Treebank (PMT) 1.0 (:cite:`qiu-etal-2014-multi`).' PTM_V1_DEV = _HOME + '#dev.conllx' 'The dev set of PKU Multi-view Chinese Treebank (PMT) 1.0 (:cite:`qiu-etal-2014-multi`).' PTM_V1_TEST = _HOME + '#test.conllx' 'The test set of PKU Multi-view Chinese Treebank (PMT) 1.0 (:cite:`qiu-etal-2014-multi`).' 
def _make_ptm():
    """Split the raw PMT 1.0 treebank into ``train``/``dev``/``test`` ``.conllx`` files.

    The files are written into the directory of the raw resource. Nothing is done when all three files
    already exist. The raw file is read as blank-line separated groups of four lines holding, in order,
    the tokens, pos tags, relations and heads of one sentence.

    NOTE(review): a group is only converted when a blank line follows it, so a final group at the very end
    of the file without a trailing blank line is not emitted.
    """
    raw = get_resource(PTM_V1_RAW)
    home = os.path.dirname(raw)
    parts = ['train', 'dev', 'test']
    if all(os.path.isfile(os.path.join(home, f'{part}.conllx')) for part in parts):
        return
    sents = []
    with open(raw) as src:
        buffer = []
        for line in src:
            line = line.strip()
            if line:
                buffer.append(line)
                continue
            if not buffer:
                continue
            tok, pos, rel, arc = [x.split() for x in buffer]
            sent = CoNLLSentence()
            for idx, (t, p, r, a) in enumerate(zip(tok, pos, rel, arc), start=1):
                sent.append(CoNLLWord(idx, form=t, cpos=p, head=a, deprel=r))
            sents.append(sent)
            buffer.clear()

    # Sentences 12001-13000 and 13001-14463 are used as the development and test set, respectively. The remaining
    # sentences are used as training data.
    prev_offset = 0
    for part, offset in zip(parts, [12000, 13000, 14463]):
        with open(os.path.join(home, f'{part}.conllx'), 'w') as out:
            portion = sents[prev_offset:offset]
            cprint(f'[yellow]{len(portion)}[/yellow] sentences [cyan][{prev_offset + 1}:{offset})[/cyan] in {part}')
            for sent in portion:
                out.write(str(sent) + '\n\n')
        prev_offset = offset
PoS tags are automatically predicted using 10-fold jackknifing (:cite:`collins-koo-2005-discriminative`).''' PTB_SD330_TRAIN = _PTB_HOME + 'ptb_train_3.3.0.sd.clean' '''Training set of PTB in Stanford Dependencies 3.3.0 format. PoS tags are automatically predicted using 10-fold jackknifing (:cite:`collins-koo-2005-discriminative`).''' PTB_SD330_DEV = _PTB_HOME + 'ptb_dev_3.3.0.sd.clean' '''Dev set of PTB in Stanford Dependencies 3.3.0 format. PoS tags are automatically predicted using 10-fold jackknifing (:cite:`collins-koo-2005-discriminative`).''' PTB_SD330_TEST = _PTB_HOME + 'ptb_test_3.3.0.sd.clean' '''Test set of PTB in Stanford Dependencies 3.3.0 format. PoS tags are automatically predicted using 10-fold jackknifing (:cite:`collins-koo-2005-discriminative`).''' PTB_TOKEN_MAPPING = { "-LRB-": "(", "-RRB-": ")", "-LCB-": "{", "-RCB-": "}", "-LSB-": "[", "-RSB-": "]", "``": '"', "''": '"', "`": "'", '«': '"', '»': '"', '‘': "'", '’': "'", '“': '"', '”': '"', '„': '"', '‹': "'", '›': "'", "\u2013": "--", # en dash "\u2014": "--", # em dash } ================================================ FILE: hanlp/datasets/parsing/semeval15.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2020-07-28 14:40 # from hanlp.datasets.parsing.conll_dataset import CoNLLParsingDataset # # # class SemEval15Dataset(CoNLLParsingDataset): # def load_file(self, filepath: str): # pass import warnings from hanlp_common.constant import ROOT, PAD from hanlp_common.conll import CoNLLSentence def unpack_deps_to_head_deprel(sample: dict, pad_rel=None, arc_key='arc', rel_key='rel'): if 'DEPS' in sample: deps = ['_'] + sample['DEPS'] sample[arc_key] = arc = [] sample[rel_key] = rel = [] for each in deps: arc_per_token = [False] * len(deps) rel_per_token = [None] * len(deps) if each != '_': for ar in each.split('|'): a, r = ar.split(':') a = int(a) arc_per_token[a] = True rel_per_token[a] = r if not pad_rel: pad_rel = r arc.append(arc_per_token) 
def merge_head_deprel_with_2nd(sample: dict):
    """Fold the primary dependency of every token into the second-order arc and relation matrices.

    For each token except the one at index 0, the cell addressed by the token's primary head is switched on
    in ``sample['arc_2nd']`` and its relation is stored in ``sample['rel_2nd']``. A warning is issued when
    that cell was already on with a different relation. Samples without an ``'arc'`` field are returned as is.

    Args:
        sample: A sample holding ``'arc'``, ``'rel'``, ``'arc_2nd'`` and ``'rel_2nd'``.

    Returns:
        The same ``sample`` object, with its second-order matrices updated in place.
    """
    if 'arc' not in sample:
        return sample
    arc_2nd = sample['arc_2nd']
    rel_2nd = sample['rel_2nd']
    for idx, (head, label) in enumerate(zip(sample['arc'], sample['rel'])):
        if idx == 0:
            # Index 0 is skipped; only real tokens are merged.
            continue
        conflicting = arc_2nd[idx][head] and rel_2nd[idx][head] != label
        if conflicting:
            sample_str = CoNLLSentence.from_dict(sample, conllu=True).to_markdown()
            warnings.warn(f'The main dependency conflicts with 2nd dependency at ID={idx}, '
                          'which means joint mode might not be suitable. '
                          f'The sample is\n{sample_str}')
        arc_2nd[idx][head] = True
        rel_2nd[idx][head] = label
    return sample
# Create a ``.conllu`` sibling for every original ``.conll`` file that does not have one yet.
for file in [SEMEVAL2016_NEWS_TRAIN, SEMEVAL2016_NEWS_DEV, SEMEVAL2016_NEWS_TEST,
             SEMEVAL2016_TEXT_TRAIN, SEMEVAL2016_TEXT_DEV, SEMEVAL2016_TEXT_TEST]:
    file = get_resource(file)
    conllu = os.path.splitext(file)[0] + '.conllu'
    if not os.path.isfile(conllu):
        eprint(f'Converting {os.path.basename(file)} to {os.path.basename(conllu)} ...')
        convert_conll_to_conllu(file)

# Concatenate the news and text portions of each split into a "full" file. Each merged file is written into
# the directory of its own split (train/, validation/, test/) so that it is found at the location the
# SEMEVAL2016_FULL_*_CONLLU constants point to; previously every merged file was written under train/.
root = get_resource(_SEMEVAL2016_HOME)
for group, (folder, part) in zip([[SEMEVAL2016_NEWS_TRAIN_CONLLU, SEMEVAL2016_TEXT_TRAIN_CONLLU],
                                  [SEMEVAL2016_NEWS_DEV_CONLLU, SEMEVAL2016_TEXT_DEV_CONLLU],
                                  [SEMEVAL2016_NEWS_TEST_CONLLU, SEMEVAL2016_TEXT_TEST_CONLLU]],
                                 [('train', 'train'), ('validation', 'valid'), ('test', 'test')]):
    dst = f'{root}/{folder}/full.{part}.conllu'
    if not os.path.isfile(dst):
        group = [get_resource(x) for x in group]
        eprint(f'Concatenating {os.path.basename(group[0])} and {os.path.basename(group[1])} '
               f'into full dataset {os.path.basename(dst)} ...')
        merge_files(group, dst)
def concat_treebanks(home, version):
    """Concatenate the train/dev/test files of all treebanks under a UD release into three big files.

    The output directory ``ud-multilingual-v{version}`` is placed two levels above the resolved ``home``.
    When that directory already exists it is returned immediately, without being rebuilt.

    Args:
        home: Path or identifier of the UD treebanks root, resolved through ``get_resource``.
        version: Version tag used in the name of the output directory.

    Returns:
        The absolute path of the directory holding ``train.conllu``, ``dev.conllu`` and ``test.conllu``.
    """
    ud_home = get_resource(home)
    treebanks = get_ud_treebank_files(ud_home)
    output_dir = os.path.abspath(os.path.join(ud_home, os.path.pardir, os.path.pardir, f'ud-multilingual-v{version}'))
    if os.path.isdir(output_dir):
        return output_dir
    os.makedirs(output_dir)
    try:
        train, dev, test = list(zip(*[treebanks[k] for k in treebanks]))
        for treebank, name in zip([train, dev, test], ["train.conllu", "dev.conllu", "test.conllu"]):
            flash(f'Concatenating {len(train)} treebanks into {name} [blink][yellow]...[/yellow][/blink]')
            # CoNLL-U files are UTF-8; be explicit so the result does not depend on the locale's default encoding.
            with open(os.path.join(output_dir, name), 'w', encoding='utf-8') as write:
                for t in treebank:
                    if not t:
                        # This treebank does not provide the current split.
                        continue
                    with open(t, 'r', encoding='utf-8') as read:
                        shutil.copyfileobj(read, write)
        flash('')
    except BaseException:
        # The mere existence of output_dir is treated as "already done" by later calls, so a failed or
        # interrupted run must not leave a partial directory behind.
        shutil.rmtree(output_dir, ignore_errors=True)
        raise
    return output_dir
'_UD_210_HOME') pass if __name__ == '__main__': main() UD_210_AFRIKAANS_AFRIBOOMS_TRAIN = _UD_210_HOME + "UD_Afrikaans-AfriBooms/af_afribooms-ud-train.conllu" "UD_210 train set of AFRIKAANS_AFRIBOOMS." UD_210_AFRIKAANS_AFRIBOOMS_DEV = _UD_210_HOME + "UD_Afrikaans-AfriBooms/af_afribooms-ud-dev.conllu" "UD_210 dev set of AFRIKAANS_AFRIBOOMS." UD_210_AFRIKAANS_AFRIBOOMS_TEST = _UD_210_HOME + "UD_Afrikaans-AfriBooms/af_afribooms-ud-test.conllu" "UD_210 test set of AFRIKAANS_AFRIBOOMS." UD_210_AKKADIAN_PISANDUB_TEST = _UD_210_HOME + "UD_Akkadian-PISANDUB/akk_pisandub-ud-test.conllu" "UD_210 test set of AKKADIAN_PISANDUB." UD_210_AKKADIAN_RIAO_TEST = _UD_210_HOME + "UD_Akkadian-RIAO/akk_riao-ud-test.conllu" "UD_210 test set of AKKADIAN_RIAO." UD_210_AKUNTSU_TUDET_TEST = _UD_210_HOME + "UD_Akuntsu-TuDeT/aqz_tudet-ud-test.conllu" "UD_210 test set of AKUNTSU_TUDET." UD_210_ALBANIAN_TSA_TEST = _UD_210_HOME + "UD_Albanian-TSA/sq_tsa-ud-test.conllu" "UD_210 test set of ALBANIAN_TSA." UD_210_AMHARIC_ATT_TEST = _UD_210_HOME + "UD_Amharic-ATT/am_att-ud-test.conllu" "UD_210 test set of AMHARIC_ATT." UD_210_ANCIENT_GREEK_PROIEL_TRAIN = _UD_210_HOME + "UD_Ancient_Greek-PROIEL/grc_proiel-ud-train.conllu" "UD_210 train set of ANCIENT_GREEK_PROIEL." UD_210_ANCIENT_GREEK_PROIEL_DEV = _UD_210_HOME + "UD_Ancient_Greek-PROIEL/grc_proiel-ud-dev.conllu" "UD_210 dev set of ANCIENT_GREEK_PROIEL." UD_210_ANCIENT_GREEK_PROIEL_TEST = _UD_210_HOME + "UD_Ancient_Greek-PROIEL/grc_proiel-ud-test.conllu" "UD_210 test set of ANCIENT_GREEK_PROIEL." UD_210_ANCIENT_GREEK_PERSEUS_TRAIN = _UD_210_HOME + "UD_Ancient_Greek-Perseus/grc_perseus-ud-train.conllu" "UD_210 train set of ANCIENT_GREEK_PERSEUS." UD_210_ANCIENT_GREEK_PERSEUS_DEV = _UD_210_HOME + "UD_Ancient_Greek-Perseus/grc_perseus-ud-dev.conllu" "UD_210 dev set of ANCIENT_GREEK_PERSEUS." UD_210_ANCIENT_GREEK_PERSEUS_TEST = _UD_210_HOME + "UD_Ancient_Greek-Perseus/grc_perseus-ud-test.conllu" "UD_210 test set of ANCIENT_GREEK_PERSEUS." 
UD_210_ANCIENT_HEBREW_PTNK_TRAIN = _UD_210_HOME + "UD_Ancient_Hebrew-PTNK/hbo_ptnk-ud-train.conllu" "UD_210 train set of ANCIENT_HEBREW_PTNK." UD_210_ANCIENT_HEBREW_PTNK_DEV = _UD_210_HOME + "UD_Ancient_Hebrew-PTNK/hbo_ptnk-ud-dev.conllu" "UD_210 dev set of ANCIENT_HEBREW_PTNK." UD_210_ANCIENT_HEBREW_PTNK_TEST = _UD_210_HOME + "UD_Ancient_Hebrew-PTNK/hbo_ptnk-ud-test.conllu" "UD_210 test set of ANCIENT_HEBREW_PTNK." UD_210_APURINA_UFPA_TEST = _UD_210_HOME + "UD_Apurina-UFPA/apu_ufpa-ud-test.conllu" "UD_210 test set of APURINA_UFPA." UD_210_ARABIC_NYUAD_TRAIN = _UD_210_HOME + "UD_Arabic-NYUAD/ar_nyuad-ud-train.conllu" "UD_210 train set of ARABIC_NYUAD." UD_210_ARABIC_NYUAD_DEV = _UD_210_HOME + "UD_Arabic-NYUAD/ar_nyuad-ud-dev.conllu" "UD_210 dev set of ARABIC_NYUAD." UD_210_ARABIC_NYUAD_TEST = _UD_210_HOME + "UD_Arabic-NYUAD/ar_nyuad-ud-test.conllu" "UD_210 test set of ARABIC_NYUAD." UD_210_ARABIC_PADT_TRAIN = _UD_210_HOME + "UD_Arabic-PADT/ar_padt-ud-train.conllu" "UD_210 train set of ARABIC_PADT." UD_210_ARABIC_PADT_DEV = _UD_210_HOME + "UD_Arabic-PADT/ar_padt-ud-dev.conllu" "UD_210 dev set of ARABIC_PADT." UD_210_ARABIC_PADT_TEST = _UD_210_HOME + "UD_Arabic-PADT/ar_padt-ud-test.conllu" "UD_210 test set of ARABIC_PADT." UD_210_ARABIC_PUD_TEST = _UD_210_HOME + "UD_Arabic-PUD/ar_pud-ud-test.conllu" "UD_210 test set of ARABIC_PUD." UD_210_ARMENIAN_ARMTDP_TRAIN = _UD_210_HOME + "UD_Armenian-ArmTDP/hy_armtdp-ud-train.conllu" "UD_210 train set of ARMENIAN_ARMTDP." UD_210_ARMENIAN_ARMTDP_DEV = _UD_210_HOME + "UD_Armenian-ArmTDP/hy_armtdp-ud-dev.conllu" "UD_210 dev set of ARMENIAN_ARMTDP." UD_210_ARMENIAN_ARMTDP_TEST = _UD_210_HOME + "UD_Armenian-ArmTDP/hy_armtdp-ud-test.conllu" "UD_210 test set of ARMENIAN_ARMTDP." UD_210_ARMENIAN_BSUT_TRAIN = _UD_210_HOME + "UD_Armenian-BSUT/hy_bsut-ud-train.conllu" "UD_210 train set of ARMENIAN_BSUT." UD_210_ARMENIAN_BSUT_DEV = _UD_210_HOME + "UD_Armenian-BSUT/hy_bsut-ud-dev.conllu" "UD_210 dev set of ARMENIAN_BSUT." 
UD_210_ARMENIAN_BSUT_TEST = _UD_210_HOME + "UD_Armenian-BSUT/hy_bsut-ud-test.conllu" "UD_210 test set of ARMENIAN_BSUT." UD_210_ASSYRIAN_AS_TEST = _UD_210_HOME + "UD_Assyrian-AS/aii_as-ud-test.conllu" "UD_210 test set of ASSYRIAN_AS." UD_210_BAMBARA_CRB_TEST = _UD_210_HOME + "UD_Bambara-CRB/bm_crb-ud-test.conllu" "UD_210 test set of BAMBARA_CRB." UD_210_BASQUE_BDT_TRAIN = _UD_210_HOME + "UD_Basque-BDT/eu_bdt-ud-train.conllu" "UD_210 train set of BASQUE_BDT." UD_210_BASQUE_BDT_DEV = _UD_210_HOME + "UD_Basque-BDT/eu_bdt-ud-dev.conllu" "UD_210 dev set of BASQUE_BDT." UD_210_BASQUE_BDT_TEST = _UD_210_HOME + "UD_Basque-BDT/eu_bdt-ud-test.conllu" "UD_210 test set of BASQUE_BDT." UD_210_BEJA_NSC_TEST = _UD_210_HOME + "UD_Beja-NSC/bej_nsc-ud-test.conllu" "UD_210 test set of BEJA_NSC." UD_210_BELARUSIAN_HSE_TRAIN = _UD_210_HOME + "UD_Belarusian-HSE/be_hse-ud-train.conllu" "UD_210 train set of BELARUSIAN_HSE." UD_210_BELARUSIAN_HSE_DEV = _UD_210_HOME + "UD_Belarusian-HSE/be_hse-ud-dev.conllu" "UD_210 dev set of BELARUSIAN_HSE." UD_210_BELARUSIAN_HSE_TEST = _UD_210_HOME + "UD_Belarusian-HSE/be_hse-ud-test.conllu" "UD_210 test set of BELARUSIAN_HSE." UD_210_BENGALI_BRU_TEST = _UD_210_HOME + "UD_Bengali-BRU/bn_bru-ud-test.conllu" "UD_210 test set of BENGALI_BRU." UD_210_BHOJPURI_BHTB_TEST = _UD_210_HOME + "UD_Bhojpuri-BHTB/bho_bhtb-ud-test.conllu" "UD_210 test set of BHOJPURI_BHTB." UD_210_BRETON_KEB_TEST = _UD_210_HOME + "UD_Breton-KEB/br_keb-ud-test.conllu" "UD_210 test set of BRETON_KEB." UD_210_BULGARIAN_BTB_TRAIN = _UD_210_HOME + "UD_Bulgarian-BTB/bg_btb-ud-train.conllu" "UD_210 train set of BULGARIAN_BTB." UD_210_BULGARIAN_BTB_DEV = _UD_210_HOME + "UD_Bulgarian-BTB/bg_btb-ud-dev.conllu" "UD_210 dev set of BULGARIAN_BTB." UD_210_BULGARIAN_BTB_TEST = _UD_210_HOME + "UD_Bulgarian-BTB/bg_btb-ud-test.conllu" "UD_210 test set of BULGARIAN_BTB." UD_210_BURYAT_BDT_TRAIN = _UD_210_HOME + "UD_Buryat-BDT/bxr_bdt-ud-train.conllu" "UD_210 train set of BURYAT_BDT." 
UD_210_BURYAT_BDT_TEST = _UD_210_HOME + "UD_Buryat-BDT/bxr_bdt-ud-test.conllu" "UD_210 test set of BURYAT_BDT." UD_210_CANTONESE_HK_TEST = _UD_210_HOME + "UD_Cantonese-HK/yue_hk-ud-test.conllu" "UD_210 test set of CANTONESE_HK." UD_210_CATALAN_ANCORA_TRAIN = _UD_210_HOME + "UD_Catalan-AnCora/ca_ancora-ud-train.conllu" "UD_210 train set of CATALAN_ANCORA." UD_210_CATALAN_ANCORA_DEV = _UD_210_HOME + "UD_Catalan-AnCora/ca_ancora-ud-dev.conllu" "UD_210 dev set of CATALAN_ANCORA." UD_210_CATALAN_ANCORA_TEST = _UD_210_HOME + "UD_Catalan-AnCora/ca_ancora-ud-test.conllu" "UD_210 test set of CATALAN_ANCORA." UD_210_CEBUANO_GJA_TEST = _UD_210_HOME + "UD_Cebuano-GJA/ceb_gja-ud-test.conllu" "UD_210 test set of CEBUANO_GJA." UD_210_CHINESE_CFL_TEST = _UD_210_HOME + "UD_Chinese-CFL/zh_cfl-ud-test.conllu" "UD_210 test set of CHINESE_CFL." UD_210_CHINESE_GSD_TRAIN = _UD_210_HOME + "UD_Chinese-GSD/zh_gsd-ud-train.conllu" "UD_210 train set of CHINESE_GSD." UD_210_CHINESE_GSD_DEV = _UD_210_HOME + "UD_Chinese-GSD/zh_gsd-ud-dev.conllu" "UD_210 dev set of CHINESE_GSD." UD_210_CHINESE_GSD_TEST = _UD_210_HOME + "UD_Chinese-GSD/zh_gsd-ud-test.conllu" "UD_210 test set of CHINESE_GSD." UD_210_CHINESE_GSDSIMP_TRAIN = _UD_210_HOME + "UD_Chinese-GSDSimp/zh_gsdsimp-ud-train.conllu" "UD_210 train set of CHINESE_GSDSIMP." UD_210_CHINESE_GSDSIMP_DEV = _UD_210_HOME + "UD_Chinese-GSDSimp/zh_gsdsimp-ud-dev.conllu" "UD_210 dev set of CHINESE_GSDSIMP." UD_210_CHINESE_GSDSIMP_TEST = _UD_210_HOME + "UD_Chinese-GSDSimp/zh_gsdsimp-ud-test.conllu" "UD_210 test set of CHINESE_GSDSIMP." UD_210_CHINESE_HK_TEST = _UD_210_HOME + "UD_Chinese-HK/zh_hk-ud-test.conllu" "UD_210 test set of CHINESE_HK." UD_210_CHINESE_PUD_TEST = _UD_210_HOME + "UD_Chinese-PUD/zh_pud-ud-test.conllu" "UD_210 test set of CHINESE_PUD." UD_210_CHUKCHI_HSE_TEST = _UD_210_HOME + "UD_Chukchi-HSE/ckt_hse-ud-test.conllu" "UD_210 test set of CHUKCHI_HSE." 
UD_210_CLASSICAL_CHINESE_KYOTO_TRAIN = _UD_210_HOME + "UD_Classical_Chinese-Kyoto/lzh_kyoto-ud-train.conllu" "UD_210 train set of CLASSICAL_CHINESE_KYOTO." UD_210_CLASSICAL_CHINESE_KYOTO_DEV = _UD_210_HOME + "UD_Classical_Chinese-Kyoto/lzh_kyoto-ud-dev.conllu" "UD_210 dev set of CLASSICAL_CHINESE_KYOTO." UD_210_CLASSICAL_CHINESE_KYOTO_TEST = _UD_210_HOME + "UD_Classical_Chinese-Kyoto/lzh_kyoto-ud-test.conllu" "UD_210 test set of CLASSICAL_CHINESE_KYOTO." UD_210_COPTIC_SCRIPTORIUM_TRAIN = _UD_210_HOME + "UD_Coptic-Scriptorium/cop_scriptorium-ud-train.conllu" "UD_210 train set of COPTIC_SCRIPTORIUM." UD_210_COPTIC_SCRIPTORIUM_DEV = _UD_210_HOME + "UD_Coptic-Scriptorium/cop_scriptorium-ud-dev.conllu" "UD_210 dev set of COPTIC_SCRIPTORIUM." UD_210_COPTIC_SCRIPTORIUM_TEST = _UD_210_HOME + "UD_Coptic-Scriptorium/cop_scriptorium-ud-test.conllu" "UD_210 test set of COPTIC_SCRIPTORIUM." UD_210_CROATIAN_SET_TRAIN = _UD_210_HOME + "UD_Croatian-SET/hr_set-ud-train.conllu" "UD_210 train set of CROATIAN_SET." UD_210_CROATIAN_SET_DEV = _UD_210_HOME + "UD_Croatian-SET/hr_set-ud-dev.conllu" "UD_210 dev set of CROATIAN_SET." UD_210_CROATIAN_SET_TEST = _UD_210_HOME + "UD_Croatian-SET/hr_set-ud-test.conllu" "UD_210 test set of CROATIAN_SET." UD_210_CZECH_CAC_TRAIN = _UD_210_HOME + "UD_Czech-CAC/cs_cac-ud-train.conllu" "UD_210 train set of CZECH_CAC." UD_210_CZECH_CAC_DEV = _UD_210_HOME + "UD_Czech-CAC/cs_cac-ud-dev.conllu" "UD_210 dev set of CZECH_CAC." UD_210_CZECH_CAC_TEST = _UD_210_HOME + "UD_Czech-CAC/cs_cac-ud-test.conllu" "UD_210 test set of CZECH_CAC." UD_210_CZECH_CLTT_TRAIN = _UD_210_HOME + "UD_Czech-CLTT/cs_cltt-ud-train.conllu" "UD_210 train set of CZECH_CLTT." UD_210_CZECH_CLTT_DEV = _UD_210_HOME + "UD_Czech-CLTT/cs_cltt-ud-dev.conllu" "UD_210 dev set of CZECH_CLTT." UD_210_CZECH_CLTT_TEST = _UD_210_HOME + "UD_Czech-CLTT/cs_cltt-ud-test.conllu" "UD_210 test set of CZECH_CLTT." 
UD_210_CZECH_FICTREE_TRAIN = _UD_210_HOME + "UD_Czech-FicTree/cs_fictree-ud-train.conllu" "UD_210 train set of CZECH_FICTREE." UD_210_CZECH_FICTREE_DEV = _UD_210_HOME + "UD_Czech-FicTree/cs_fictree-ud-dev.conllu" "UD_210 dev set of CZECH_FICTREE." UD_210_CZECH_FICTREE_TEST = _UD_210_HOME + "UD_Czech-FicTree/cs_fictree-ud-test.conllu" "UD_210 test set of CZECH_FICTREE." UD_210_CZECH_PDT_TRAIN = _UD_210_HOME + "UD_Czech-PDT/cs_pdt-ud-train.conllu" "UD_210 train set of CZECH_PDT." UD_210_CZECH_PDT_DEV = _UD_210_HOME + "UD_Czech-PDT/cs_pdt-ud-dev.conllu" "UD_210 dev set of CZECH_PDT." UD_210_CZECH_PDT_TEST = _UD_210_HOME + "UD_Czech-PDT/cs_pdt-ud-test.conllu" "UD_210 test set of CZECH_PDT." UD_210_CZECH_PUD_TEST = _UD_210_HOME + "UD_Czech-PUD/cs_pud-ud-test.conllu" "UD_210 test set of CZECH_PUD." UD_210_DANISH_DDT_TRAIN = _UD_210_HOME + "UD_Danish-DDT/da_ddt-ud-train.conllu" "UD_210 train set of DANISH_DDT." UD_210_DANISH_DDT_DEV = _UD_210_HOME + "UD_Danish-DDT/da_ddt-ud-dev.conllu" "UD_210 dev set of DANISH_DDT." UD_210_DANISH_DDT_TEST = _UD_210_HOME + "UD_Danish-DDT/da_ddt-ud-test.conllu" "UD_210 test set of DANISH_DDT." UD_210_DUTCH_ALPINO_TRAIN = _UD_210_HOME + "UD_Dutch-Alpino/nl_alpino-ud-train.conllu" "UD_210 train set of DUTCH_ALPINO." UD_210_DUTCH_ALPINO_DEV = _UD_210_HOME + "UD_Dutch-Alpino/nl_alpino-ud-dev.conllu" "UD_210 dev set of DUTCH_ALPINO." UD_210_DUTCH_ALPINO_TEST = _UD_210_HOME + "UD_Dutch-Alpino/nl_alpino-ud-test.conllu" "UD_210 test set of DUTCH_ALPINO." UD_210_DUTCH_LASSYSMALL_TRAIN = _UD_210_HOME + "UD_Dutch-LassySmall/nl_lassysmall-ud-train.conllu" "UD_210 train set of DUTCH_LASSYSMALL." UD_210_DUTCH_LASSYSMALL_DEV = _UD_210_HOME + "UD_Dutch-LassySmall/nl_lassysmall-ud-dev.conllu" "UD_210 dev set of DUTCH_LASSYSMALL." UD_210_DUTCH_LASSYSMALL_TEST = _UD_210_HOME + "UD_Dutch-LassySmall/nl_lassysmall-ud-test.conllu" "UD_210 test set of DUTCH_LASSYSMALL." 
# UD_210 split files, UD_210_ENGLISH_ATIS_TRAIN through
# UD_210_ENGLISH_LINES_TRAIN. Each value is _UD_210_HOME with the
# treebank-relative ".conllu" path appended; the bare string under a constant
# describes it.
UD_210_ENGLISH_ATIS_TRAIN = f"{_UD_210_HOME}UD_English-Atis/en_atis-ud-train.conllu"
"UD_210 train set of ENGLISH_ATIS."
UD_210_ENGLISH_ATIS_DEV = f"{_UD_210_HOME}UD_English-Atis/en_atis-ud-dev.conllu"
"UD_210 dev set of ENGLISH_ATIS."
UD_210_ENGLISH_ATIS_TEST = f"{_UD_210_HOME}UD_English-Atis/en_atis-ud-test.conllu"
"UD_210 test set of ENGLISH_ATIS."

UD_210_ENGLISH_ESL_TRAIN = f"{_UD_210_HOME}UD_English-ESL/en_esl-ud-train.conllu"
"UD_210 train set of ENGLISH_ESL."
UD_210_ENGLISH_ESL_DEV = f"{_UD_210_HOME}UD_English-ESL/en_esl-ud-dev.conllu"
"UD_210 dev set of ENGLISH_ESL."
UD_210_ENGLISH_ESL_TEST = f"{_UD_210_HOME}UD_English-ESL/en_esl-ud-test.conllu"
"UD_210 test set of ENGLISH_ESL."

UD_210_ENGLISH_EWT_TRAIN = f"{_UD_210_HOME}UD_English-EWT/en_ewt-ud-train.conllu"
"UD_210 train set of ENGLISH_EWT."
UD_210_ENGLISH_EWT_DEV = f"{_UD_210_HOME}UD_English-EWT/en_ewt-ud-dev.conllu"
"UD_210 dev set of ENGLISH_EWT."
UD_210_ENGLISH_EWT_TEST = f"{_UD_210_HOME}UD_English-EWT/en_ewt-ud-test.conllu"
"UD_210 test set of ENGLISH_EWT."

UD_210_ENGLISH_GUM_TRAIN = f"{_UD_210_HOME}UD_English-GUM/en_gum-ud-train.conllu"
"UD_210 train set of ENGLISH_GUM."
UD_210_ENGLISH_GUM_DEV = f"{_UD_210_HOME}UD_English-GUM/en_gum-ud-dev.conllu"
"UD_210 dev set of ENGLISH_GUM."
UD_210_ENGLISH_GUM_TEST = f"{_UD_210_HOME}UD_English-GUM/en_gum-ud-test.conllu"
"UD_210 test set of ENGLISH_GUM."

UD_210_ENGLISH_GUMREDDIT_TRAIN = f"{_UD_210_HOME}UD_English-GUMReddit/en_gumreddit-ud-train.conllu"
"UD_210 train set of ENGLISH_GUMREDDIT."
UD_210_ENGLISH_GUMREDDIT_DEV = f"{_UD_210_HOME}UD_English-GUMReddit/en_gumreddit-ud-dev.conllu"
"UD_210 dev set of ENGLISH_GUMREDDIT."
UD_210_ENGLISH_GUMREDDIT_TEST = f"{_UD_210_HOME}UD_English-GUMReddit/en_gumreddit-ud-test.conllu"
"UD_210 test set of ENGLISH_GUMREDDIT."

UD_210_ENGLISH_LINES_TRAIN = f"{_UD_210_HOME}UD_English-LinES/en_lines-ud-train.conllu"
"UD_210 train set of ENGLISH_LINES."
# UD_210 split files, UD_210_ENGLISH_LINES_DEV through
# UD_210_FAROESE_FARPAHC_DEV. Each value is _UD_210_HOME with the
# treebank-relative ".conllu" path appended; the bare string under a constant
# describes it.
UD_210_ENGLISH_LINES_DEV = f"{_UD_210_HOME}UD_English-LinES/en_lines-ud-dev.conllu"
"UD_210 dev set of ENGLISH_LINES."
UD_210_ENGLISH_LINES_TEST = f"{_UD_210_HOME}UD_English-LinES/en_lines-ud-test.conllu"
"UD_210 test set of ENGLISH_LINES."

UD_210_ENGLISH_PUD_TEST = f"{_UD_210_HOME}UD_English-PUD/en_pud-ud-test.conllu"
"UD_210 test set of ENGLISH_PUD."

UD_210_ENGLISH_PARTUT_TRAIN = f"{_UD_210_HOME}UD_English-ParTUT/en_partut-ud-train.conllu"
"UD_210 train set of ENGLISH_PARTUT."
UD_210_ENGLISH_PARTUT_DEV = f"{_UD_210_HOME}UD_English-ParTUT/en_partut-ud-dev.conllu"
"UD_210 dev set of ENGLISH_PARTUT."
UD_210_ENGLISH_PARTUT_TEST = f"{_UD_210_HOME}UD_English-ParTUT/en_partut-ud-test.conllu"
"UD_210 test set of ENGLISH_PARTUT."

UD_210_ENGLISH_PRONOUNS_TEST = f"{_UD_210_HOME}UD_English-Pronouns/en_pronouns-ud-test.conllu"
"UD_210 test set of ENGLISH_PRONOUNS."

UD_210_ERZYA_JR_TEST = f"{_UD_210_HOME}UD_Erzya-JR/myv_jr-ud-test.conllu"
"UD_210 test set of ERZYA_JR."

UD_210_ESTONIAN_EDT_TRAIN = f"{_UD_210_HOME}UD_Estonian-EDT/et_edt-ud-train.conllu"
"UD_210 train set of ESTONIAN_EDT."
UD_210_ESTONIAN_EDT_DEV = f"{_UD_210_HOME}UD_Estonian-EDT/et_edt-ud-dev.conllu"
"UD_210 dev set of ESTONIAN_EDT."
UD_210_ESTONIAN_EDT_TEST = f"{_UD_210_HOME}UD_Estonian-EDT/et_edt-ud-test.conllu"
"UD_210 test set of ESTONIAN_EDT."

UD_210_ESTONIAN_EWT_TRAIN = f"{_UD_210_HOME}UD_Estonian-EWT/et_ewt-ud-train.conllu"
"UD_210 train set of ESTONIAN_EWT."
UD_210_ESTONIAN_EWT_DEV = f"{_UD_210_HOME}UD_Estonian-EWT/et_ewt-ud-dev.conllu"
"UD_210 dev set of ESTONIAN_EWT."
UD_210_ESTONIAN_EWT_TEST = f"{_UD_210_HOME}UD_Estonian-EWT/et_ewt-ud-test.conllu"
"UD_210 test set of ESTONIAN_EWT."

UD_210_FAROESE_FARPAHC_TRAIN = f"{_UD_210_HOME}UD_Faroese-FarPaHC/fo_farpahc-ud-train.conllu"
"UD_210 train set of FAROESE_FARPAHC."
UD_210_FAROESE_FARPAHC_DEV = f"{_UD_210_HOME}UD_Faroese-FarPaHC/fo_farpahc-ud-dev.conllu"
"UD_210 dev set of FAROESE_FARPAHC."
# UD_210 split files, UD_210_FAROESE_FARPAHC_TEST through
# UD_210_FRENCH_GSD_TEST. Each value is _UD_210_HOME with the treebank-relative
# ".conllu" path appended; the bare string under a constant describes it.
UD_210_FAROESE_FARPAHC_TEST = f"{_UD_210_HOME}UD_Faroese-FarPaHC/fo_farpahc-ud-test.conllu"
"UD_210 test set of FAROESE_FARPAHC."

UD_210_FAROESE_OFT_TEST = f"{_UD_210_HOME}UD_Faroese-OFT/fo_oft-ud-test.conllu"
"UD_210 test set of FAROESE_OFT."

UD_210_FINNISH_FTB_TRAIN = f"{_UD_210_HOME}UD_Finnish-FTB/fi_ftb-ud-train.conllu"
"UD_210 train set of FINNISH_FTB."
UD_210_FINNISH_FTB_DEV = f"{_UD_210_HOME}UD_Finnish-FTB/fi_ftb-ud-dev.conllu"
"UD_210 dev set of FINNISH_FTB."
UD_210_FINNISH_FTB_TEST = f"{_UD_210_HOME}UD_Finnish-FTB/fi_ftb-ud-test.conllu"
"UD_210 test set of FINNISH_FTB."

UD_210_FINNISH_OOD_TEST = f"{_UD_210_HOME}UD_Finnish-OOD/fi_ood-ud-test.conllu"
"UD_210 test set of FINNISH_OOD."

UD_210_FINNISH_PUD_TEST = f"{_UD_210_HOME}UD_Finnish-PUD/fi_pud-ud-test.conllu"
"UD_210 test set of FINNISH_PUD."

UD_210_FINNISH_TDT_TRAIN = f"{_UD_210_HOME}UD_Finnish-TDT/fi_tdt-ud-train.conllu"
"UD_210 train set of FINNISH_TDT."
UD_210_FINNISH_TDT_DEV = f"{_UD_210_HOME}UD_Finnish-TDT/fi_tdt-ud-dev.conllu"
"UD_210 dev set of FINNISH_TDT."
UD_210_FINNISH_TDT_TEST = f"{_UD_210_HOME}UD_Finnish-TDT/fi_tdt-ud-test.conllu"
"UD_210 test set of FINNISH_TDT."

UD_210_FRENCH_FQB_TEST = f"{_UD_210_HOME}UD_French-FQB/fr_fqb-ud-test.conllu"
"UD_210 test set of FRENCH_FQB."

UD_210_FRENCH_FTB_TRAIN = f"{_UD_210_HOME}UD_French-FTB/fr_ftb-ud-train.conllu"
"UD_210 train set of FRENCH_FTB."
UD_210_FRENCH_FTB_DEV = f"{_UD_210_HOME}UD_French-FTB/fr_ftb-ud-dev.conllu"
"UD_210 dev set of FRENCH_FTB."
UD_210_FRENCH_FTB_TEST = f"{_UD_210_HOME}UD_French-FTB/fr_ftb-ud-test.conllu"
"UD_210 test set of FRENCH_FTB."

UD_210_FRENCH_GSD_TRAIN = f"{_UD_210_HOME}UD_French-GSD/fr_gsd-ud-train.conllu"
"UD_210 train set of FRENCH_GSD."
UD_210_FRENCH_GSD_DEV = f"{_UD_210_HOME}UD_French-GSD/fr_gsd-ud-dev.conllu"
"UD_210 dev set of FRENCH_GSD."
UD_210_FRENCH_GSD_TEST = f"{_UD_210_HOME}UD_French-GSD/fr_gsd-ud-test.conllu"
"UD_210 test set of FRENCH_GSD."
# UD_210 split files, UD_210_FRENCH_PUD_TEST through UD_210_GALICIAN_CTG_DEV.
# Each value is _UD_210_HOME with the treebank-relative ".conllu" path
# appended; the bare string under a constant describes it.
UD_210_FRENCH_PUD_TEST = f"{_UD_210_HOME}UD_French-PUD/fr_pud-ud-test.conllu"
"UD_210 test set of FRENCH_PUD."

UD_210_FRENCH_PARTUT_TRAIN = f"{_UD_210_HOME}UD_French-ParTUT/fr_partut-ud-train.conllu"
"UD_210 train set of FRENCH_PARTUT."
UD_210_FRENCH_PARTUT_DEV = f"{_UD_210_HOME}UD_French-ParTUT/fr_partut-ud-dev.conllu"
"UD_210 dev set of FRENCH_PARTUT."
UD_210_FRENCH_PARTUT_TEST = f"{_UD_210_HOME}UD_French-ParTUT/fr_partut-ud-test.conllu"
"UD_210 test set of FRENCH_PARTUT."

UD_210_FRENCH_PARISSTORIES_TRAIN = f"{_UD_210_HOME}UD_French-ParisStories/fr_parisstories-ud-train.conllu"
"UD_210 train set of FRENCH_PARISSTORIES."
UD_210_FRENCH_PARISSTORIES_TEST = f"{_UD_210_HOME}UD_French-ParisStories/fr_parisstories-ud-test.conllu"
"UD_210 test set of FRENCH_PARISSTORIES."

UD_210_FRENCH_RHAPSODIE_TRAIN = f"{_UD_210_HOME}UD_French-Rhapsodie/fr_rhapsodie-ud-train.conllu"
"UD_210 train set of FRENCH_RHAPSODIE."
UD_210_FRENCH_RHAPSODIE_DEV = f"{_UD_210_HOME}UD_French-Rhapsodie/fr_rhapsodie-ud-dev.conllu"
"UD_210 dev set of FRENCH_RHAPSODIE."
UD_210_FRENCH_RHAPSODIE_TEST = f"{_UD_210_HOME}UD_French-Rhapsodie/fr_rhapsodie-ud-test.conllu"
"UD_210 test set of FRENCH_RHAPSODIE."

UD_210_FRENCH_SEQUOIA_TRAIN = f"{_UD_210_HOME}UD_French-Sequoia/fr_sequoia-ud-train.conllu"
"UD_210 train set of FRENCH_SEQUOIA."
UD_210_FRENCH_SEQUOIA_DEV = f"{_UD_210_HOME}UD_French-Sequoia/fr_sequoia-ud-dev.conllu"
"UD_210 dev set of FRENCH_SEQUOIA."
UD_210_FRENCH_SEQUOIA_TEST = f"{_UD_210_HOME}UD_French-Sequoia/fr_sequoia-ud-test.conllu"
"UD_210 test set of FRENCH_SEQUOIA."

UD_210_FRISIAN_DUTCH_FAME_TEST = f"{_UD_210_HOME}UD_Frisian_Dutch-Fame/qfn_fame-ud-test.conllu"
"UD_210 test set of FRISIAN_DUTCH_FAME."

UD_210_GALICIAN_CTG_TRAIN = f"{_UD_210_HOME}UD_Galician-CTG/gl_ctg-ud-train.conllu"
"UD_210 train set of GALICIAN_CTG."
UD_210_GALICIAN_CTG_DEV = f"{_UD_210_HOME}UD_Galician-CTG/gl_ctg-ud-dev.conllu"
"UD_210 dev set of GALICIAN_CTG."
# UD_210 split files, UD_210_GALICIAN_CTG_TEST through UD_210_GREEK_GDT_TEST.
# Each value is _UD_210_HOME with the treebank-relative ".conllu" path
# appended; the bare string under a constant describes it.
UD_210_GALICIAN_CTG_TEST = f"{_UD_210_HOME}UD_Galician-CTG/gl_ctg-ud-test.conllu"
"UD_210 test set of GALICIAN_CTG."

UD_210_GALICIAN_TREEGAL_TRAIN = f"{_UD_210_HOME}UD_Galician-TreeGal/gl_treegal-ud-train.conllu"
"UD_210 train set of GALICIAN_TREEGAL."
UD_210_GALICIAN_TREEGAL_TEST = f"{_UD_210_HOME}UD_Galician-TreeGal/gl_treegal-ud-test.conllu"
"UD_210 test set of GALICIAN_TREEGAL."

UD_210_GERMAN_GSD_TRAIN = f"{_UD_210_HOME}UD_German-GSD/de_gsd-ud-train.conllu"
"UD_210 train set of GERMAN_GSD."
UD_210_GERMAN_GSD_DEV = f"{_UD_210_HOME}UD_German-GSD/de_gsd-ud-dev.conllu"
"UD_210 dev set of GERMAN_GSD."
UD_210_GERMAN_GSD_TEST = f"{_UD_210_HOME}UD_German-GSD/de_gsd-ud-test.conllu"
"UD_210 test set of GERMAN_GSD."

UD_210_GERMAN_HDT_TRAIN = f"{_UD_210_HOME}UD_German-HDT/de_hdt-ud-train.conllu"
"UD_210 train set of GERMAN_HDT."
UD_210_GERMAN_HDT_DEV = f"{_UD_210_HOME}UD_German-HDT/de_hdt-ud-dev.conllu"
"UD_210 dev set of GERMAN_HDT."
UD_210_GERMAN_HDT_TEST = f"{_UD_210_HOME}UD_German-HDT/de_hdt-ud-test.conllu"
"UD_210 test set of GERMAN_HDT."

UD_210_GERMAN_LIT_TEST = f"{_UD_210_HOME}UD_German-LIT/de_lit-ud-test.conllu"
"UD_210 test set of GERMAN_LIT."

UD_210_GERMAN_PUD_TEST = f"{_UD_210_HOME}UD_German-PUD/de_pud-ud-test.conllu"
"UD_210 test set of GERMAN_PUD."

UD_210_GOTHIC_PROIEL_TRAIN = f"{_UD_210_HOME}UD_Gothic-PROIEL/got_proiel-ud-train.conllu"
"UD_210 train set of GOTHIC_PROIEL."
UD_210_GOTHIC_PROIEL_DEV = f"{_UD_210_HOME}UD_Gothic-PROIEL/got_proiel-ud-dev.conllu"
"UD_210 dev set of GOTHIC_PROIEL."
UD_210_GOTHIC_PROIEL_TEST = f"{_UD_210_HOME}UD_Gothic-PROIEL/got_proiel-ud-test.conllu"
"UD_210 test set of GOTHIC_PROIEL."

UD_210_GREEK_GDT_TRAIN = f"{_UD_210_HOME}UD_Greek-GDT/el_gdt-ud-train.conllu"
"UD_210 train set of GREEK_GDT."
UD_210_GREEK_GDT_DEV = f"{_UD_210_HOME}UD_Greek-GDT/el_gdt-ud-dev.conllu"
"UD_210 dev set of GREEK_GDT."
UD_210_GREEK_GDT_TEST = f"{_UD_210_HOME}UD_Greek-GDT/el_gdt-ud-test.conllu"
"UD_210 test set of GREEK_GDT."
# UD_210 split files, UD_210_GUAJAJARA_TUDET_TEST through
# UD_210_HINDI_ENGLISH_HIENCS_TEST. Each value is _UD_210_HOME with the
# treebank-relative ".conllu" path appended; the bare string under a constant
# describes it.
UD_210_GUAJAJARA_TUDET_TEST = f"{_UD_210_HOME}UD_Guajajara-TuDeT/gub_tudet-ud-test.conllu"
"UD_210 test set of GUAJAJARA_TUDET."

UD_210_GUARANI_OLDTUDET_TEST = f"{_UD_210_HOME}UD_Guarani-OldTuDeT/gn_oldtudet-ud-test.conllu"
"UD_210 test set of GUARANI_OLDTUDET."

UD_210_HEBREW_HTB_TRAIN = f"{_UD_210_HOME}UD_Hebrew-HTB/he_htb-ud-train.conllu"
"UD_210 train set of HEBREW_HTB."
UD_210_HEBREW_HTB_DEV = f"{_UD_210_HOME}UD_Hebrew-HTB/he_htb-ud-dev.conllu"
"UD_210 dev set of HEBREW_HTB."
UD_210_HEBREW_HTB_TEST = f"{_UD_210_HOME}UD_Hebrew-HTB/he_htb-ud-test.conllu"
"UD_210 test set of HEBREW_HTB."

UD_210_HEBREW_IAHLTWIKI_TRAIN = f"{_UD_210_HOME}UD_Hebrew-IAHLTwiki/he_iahltwiki-ud-train.conllu"
"UD_210 train set of HEBREW_IAHLTWIKI."
UD_210_HEBREW_IAHLTWIKI_DEV = f"{_UD_210_HOME}UD_Hebrew-IAHLTwiki/he_iahltwiki-ud-dev.conllu"
"UD_210 dev set of HEBREW_IAHLTWIKI."
UD_210_HEBREW_IAHLTWIKI_TEST = f"{_UD_210_HOME}UD_Hebrew-IAHLTwiki/he_iahltwiki-ud-test.conllu"
"UD_210 test set of HEBREW_IAHLTWIKI."

UD_210_HINDI_HDTB_TRAIN = f"{_UD_210_HOME}UD_Hindi-HDTB/hi_hdtb-ud-train.conllu"
"UD_210 train set of HINDI_HDTB."
UD_210_HINDI_HDTB_DEV = f"{_UD_210_HOME}UD_Hindi-HDTB/hi_hdtb-ud-dev.conllu"
"UD_210 dev set of HINDI_HDTB."
UD_210_HINDI_HDTB_TEST = f"{_UD_210_HOME}UD_Hindi-HDTB/hi_hdtb-ud-test.conllu"
"UD_210 test set of HINDI_HDTB."

UD_210_HINDI_PUD_TEST = f"{_UD_210_HOME}UD_Hindi-PUD/hi_pud-ud-test.conllu"
"UD_210 test set of HINDI_PUD."

UD_210_HINDI_ENGLISH_HIENCS_TRAIN = f"{_UD_210_HOME}UD_Hindi_English-HIENCS/qhe_hiencs-ud-train.conllu"
"UD_210 train set of HINDI_ENGLISH_HIENCS."
UD_210_HINDI_ENGLISH_HIENCS_DEV = f"{_UD_210_HOME}UD_Hindi_English-HIENCS/qhe_hiencs-ud-dev.conllu"
"UD_210 dev set of HINDI_ENGLISH_HIENCS."
UD_210_HINDI_ENGLISH_HIENCS_TEST = f"{_UD_210_HOME}UD_Hindi_English-HIENCS/qhe_hiencs-ud-test.conllu"
"UD_210 test set of HINDI_ENGLISH_HIENCS."
# UD_210 split files, UD_210_HITTITE_HITTB_TEST through
# UD_210_INDONESIAN_GSD_DEV. Each value is _UD_210_HOME with the
# treebank-relative ".conllu" path appended; the bare string under a constant
# describes it.
UD_210_HITTITE_HITTB_TEST = f"{_UD_210_HOME}UD_Hittite-HitTB/hit_hittb-ud-test.conllu"
"UD_210 test set of HITTITE_HITTB."

UD_210_HUNGARIAN_SZEGED_TRAIN = f"{_UD_210_HOME}UD_Hungarian-Szeged/hu_szeged-ud-train.conllu"
"UD_210 train set of HUNGARIAN_SZEGED."
UD_210_HUNGARIAN_SZEGED_DEV = f"{_UD_210_HOME}UD_Hungarian-Szeged/hu_szeged-ud-dev.conllu"
"UD_210 dev set of HUNGARIAN_SZEGED."
UD_210_HUNGARIAN_SZEGED_TEST = f"{_UD_210_HOME}UD_Hungarian-Szeged/hu_szeged-ud-test.conllu"
"UD_210 test set of HUNGARIAN_SZEGED."

UD_210_ICELANDIC_ICEPAHC_TRAIN = f"{_UD_210_HOME}UD_Icelandic-IcePaHC/is_icepahc-ud-train.conllu"
"UD_210 train set of ICELANDIC_ICEPAHC."
UD_210_ICELANDIC_ICEPAHC_DEV = f"{_UD_210_HOME}UD_Icelandic-IcePaHC/is_icepahc-ud-dev.conllu"
"UD_210 dev set of ICELANDIC_ICEPAHC."
UD_210_ICELANDIC_ICEPAHC_TEST = f"{_UD_210_HOME}UD_Icelandic-IcePaHC/is_icepahc-ud-test.conllu"
"UD_210 test set of ICELANDIC_ICEPAHC."

UD_210_ICELANDIC_MODERN_TRAIN = f"{_UD_210_HOME}UD_Icelandic-Modern/is_modern-ud-train.conllu"
"UD_210 train set of ICELANDIC_MODERN."
UD_210_ICELANDIC_MODERN_DEV = f"{_UD_210_HOME}UD_Icelandic-Modern/is_modern-ud-dev.conllu"
"UD_210 dev set of ICELANDIC_MODERN."
UD_210_ICELANDIC_MODERN_TEST = f"{_UD_210_HOME}UD_Icelandic-Modern/is_modern-ud-test.conllu"
"UD_210 test set of ICELANDIC_MODERN."

UD_210_ICELANDIC_PUD_TEST = f"{_UD_210_HOME}UD_Icelandic-PUD/is_pud-ud-test.conllu"
"UD_210 test set of ICELANDIC_PUD."

UD_210_INDONESIAN_CSUI_TRAIN = f"{_UD_210_HOME}UD_Indonesian-CSUI/id_csui-ud-train.conllu"
"UD_210 train set of INDONESIAN_CSUI."
UD_210_INDONESIAN_CSUI_TEST = f"{_UD_210_HOME}UD_Indonesian-CSUI/id_csui-ud-test.conllu"
"UD_210 test set of INDONESIAN_CSUI."

UD_210_INDONESIAN_GSD_TRAIN = f"{_UD_210_HOME}UD_Indonesian-GSD/id_gsd-ud-train.conllu"
"UD_210 train set of INDONESIAN_GSD."
UD_210_INDONESIAN_GSD_DEV = f"{_UD_210_HOME}UD_Indonesian-GSD/id_gsd-ud-dev.conllu"
"UD_210 dev set of INDONESIAN_GSD."
# UD_210 split files, UD_210_INDONESIAN_GSD_TEST through
# UD_210_ITALIAN_PARTUT_TEST. Each value is _UD_210_HOME with the
# treebank-relative ".conllu" path appended; the bare string under a constant
# describes it.
UD_210_INDONESIAN_GSD_TEST = f"{_UD_210_HOME}UD_Indonesian-GSD/id_gsd-ud-test.conllu"
"UD_210 test set of INDONESIAN_GSD."

UD_210_INDONESIAN_PUD_TEST = f"{_UD_210_HOME}UD_Indonesian-PUD/id_pud-ud-test.conllu"
"UD_210 test set of INDONESIAN_PUD."

UD_210_IRISH_IDT_TRAIN = f"{_UD_210_HOME}UD_Irish-IDT/ga_idt-ud-train.conllu"
"UD_210 train set of IRISH_IDT."
UD_210_IRISH_IDT_DEV = f"{_UD_210_HOME}UD_Irish-IDT/ga_idt-ud-dev.conllu"
"UD_210 dev set of IRISH_IDT."
UD_210_IRISH_IDT_TEST = f"{_UD_210_HOME}UD_Irish-IDT/ga_idt-ud-test.conllu"
"UD_210 test set of IRISH_IDT."

UD_210_IRISH_TWITTIRISH_TEST = f"{_UD_210_HOME}UD_Irish-TwittIrish/ga_twittirish-ud-test.conllu"
"UD_210 test set of IRISH_TWITTIRISH."

UD_210_ITALIAN_ISDT_TRAIN = f"{_UD_210_HOME}UD_Italian-ISDT/it_isdt-ud-train.conllu"
"UD_210 train set of ITALIAN_ISDT."
UD_210_ITALIAN_ISDT_DEV = f"{_UD_210_HOME}UD_Italian-ISDT/it_isdt-ud-dev.conllu"
"UD_210 dev set of ITALIAN_ISDT."
UD_210_ITALIAN_ISDT_TEST = f"{_UD_210_HOME}UD_Italian-ISDT/it_isdt-ud-test.conllu"
"UD_210 test set of ITALIAN_ISDT."

UD_210_ITALIAN_MARKIT_TRAIN = f"{_UD_210_HOME}UD_Italian-MarkIT/it_markit-ud-train.conllu"
"UD_210 train set of ITALIAN_MARKIT."
UD_210_ITALIAN_MARKIT_DEV = f"{_UD_210_HOME}UD_Italian-MarkIT/it_markit-ud-dev.conllu"
"UD_210 dev set of ITALIAN_MARKIT."
UD_210_ITALIAN_MARKIT_TEST = f"{_UD_210_HOME}UD_Italian-MarkIT/it_markit-ud-test.conllu"
"UD_210 test set of ITALIAN_MARKIT."

UD_210_ITALIAN_PUD_TEST = f"{_UD_210_HOME}UD_Italian-PUD/it_pud-ud-test.conllu"
"UD_210 test set of ITALIAN_PUD."

UD_210_ITALIAN_PARTUT_TRAIN = f"{_UD_210_HOME}UD_Italian-ParTUT/it_partut-ud-train.conllu"
"UD_210 train set of ITALIAN_PARTUT."
UD_210_ITALIAN_PARTUT_DEV = f"{_UD_210_HOME}UD_Italian-ParTUT/it_partut-ud-dev.conllu"
"UD_210 dev set of ITALIAN_PARTUT."
UD_210_ITALIAN_PARTUT_TEST = f"{_UD_210_HOME}UD_Italian-ParTUT/it_partut-ud-test.conllu"
"UD_210 test set of ITALIAN_PARTUT."
# UD_210 split files, UD_210_ITALIAN_POSTWITA_TRAIN through
# UD_210_JAPANESE_BCCWJLUW_DEV. Each value is _UD_210_HOME with the
# treebank-relative ".conllu" path appended; the bare string under a constant
# describes it.
UD_210_ITALIAN_POSTWITA_TRAIN = f"{_UD_210_HOME}UD_Italian-PoSTWITA/it_postwita-ud-train.conllu"
"UD_210 train set of ITALIAN_POSTWITA."
UD_210_ITALIAN_POSTWITA_DEV = f"{_UD_210_HOME}UD_Italian-PoSTWITA/it_postwita-ud-dev.conllu"
"UD_210 dev set of ITALIAN_POSTWITA."
UD_210_ITALIAN_POSTWITA_TEST = f"{_UD_210_HOME}UD_Italian-PoSTWITA/it_postwita-ud-test.conllu"
"UD_210 test set of ITALIAN_POSTWITA."

UD_210_ITALIAN_TWITTIRO_TRAIN = f"{_UD_210_HOME}UD_Italian-TWITTIRO/it_twittiro-ud-train.conllu"
"UD_210 train set of ITALIAN_TWITTIRO."
UD_210_ITALIAN_TWITTIRO_DEV = f"{_UD_210_HOME}UD_Italian-TWITTIRO/it_twittiro-ud-dev.conllu"
"UD_210 dev set of ITALIAN_TWITTIRO."
UD_210_ITALIAN_TWITTIRO_TEST = f"{_UD_210_HOME}UD_Italian-TWITTIRO/it_twittiro-ud-test.conllu"
"UD_210 test set of ITALIAN_TWITTIRO."

UD_210_ITALIAN_VIT_TRAIN = f"{_UD_210_HOME}UD_Italian-VIT/it_vit-ud-train.conllu"
"UD_210 train set of ITALIAN_VIT."
UD_210_ITALIAN_VIT_DEV = f"{_UD_210_HOME}UD_Italian-VIT/it_vit-ud-dev.conllu"
"UD_210 dev set of ITALIAN_VIT."
UD_210_ITALIAN_VIT_TEST = f"{_UD_210_HOME}UD_Italian-VIT/it_vit-ud-test.conllu"
"UD_210 test set of ITALIAN_VIT."

UD_210_ITALIAN_VALICO_TEST = f"{_UD_210_HOME}UD_Italian-Valico/it_valico-ud-test.conllu"
"UD_210 test set of ITALIAN_VALICO."

UD_210_JAPANESE_BCCWJ_TRAIN = f"{_UD_210_HOME}UD_Japanese-BCCWJ/ja_bccwj-ud-train.conllu"
"UD_210 train set of JAPANESE_BCCWJ."
UD_210_JAPANESE_BCCWJ_DEV = f"{_UD_210_HOME}UD_Japanese-BCCWJ/ja_bccwj-ud-dev.conllu"
"UD_210 dev set of JAPANESE_BCCWJ."
UD_210_JAPANESE_BCCWJ_TEST = f"{_UD_210_HOME}UD_Japanese-BCCWJ/ja_bccwj-ud-test.conllu"
"UD_210 test set of JAPANESE_BCCWJ."

UD_210_JAPANESE_BCCWJLUW_TRAIN = f"{_UD_210_HOME}UD_Japanese-BCCWJLUW/ja_bccwjluw-ud-train.conllu"
"UD_210 train set of JAPANESE_BCCWJLUW."
UD_210_JAPANESE_BCCWJLUW_DEV = f"{_UD_210_HOME}UD_Japanese-BCCWJLUW/ja_bccwjluw-ud-dev.conllu"
"UD_210 dev set of JAPANESE_BCCWJLUW."
# UD_210 split files, UD_210_JAPANESE_BCCWJLUW_TEST through
# UD_210_KAZAKH_KTB_TRAIN. Each value is _UD_210_HOME with the
# treebank-relative ".conllu" path appended; the bare string under a constant
# describes it.
UD_210_JAPANESE_BCCWJLUW_TEST = f"{_UD_210_HOME}UD_Japanese-BCCWJLUW/ja_bccwjluw-ud-test.conllu"
"UD_210 test set of JAPANESE_BCCWJLUW."

UD_210_JAPANESE_GSD_TRAIN = f"{_UD_210_HOME}UD_Japanese-GSD/ja_gsd-ud-train.conllu"
"UD_210 train set of JAPANESE_GSD."
UD_210_JAPANESE_GSD_DEV = f"{_UD_210_HOME}UD_Japanese-GSD/ja_gsd-ud-dev.conllu"
"UD_210 dev set of JAPANESE_GSD."
UD_210_JAPANESE_GSD_TEST = f"{_UD_210_HOME}UD_Japanese-GSD/ja_gsd-ud-test.conllu"
"UD_210 test set of JAPANESE_GSD."

UD_210_JAPANESE_GSDLUW_TRAIN = f"{_UD_210_HOME}UD_Japanese-GSDLUW/ja_gsdluw-ud-train.conllu"
"UD_210 train set of JAPANESE_GSDLUW."
UD_210_JAPANESE_GSDLUW_DEV = f"{_UD_210_HOME}UD_Japanese-GSDLUW/ja_gsdluw-ud-dev.conllu"
"UD_210 dev set of JAPANESE_GSDLUW."
UD_210_JAPANESE_GSDLUW_TEST = f"{_UD_210_HOME}UD_Japanese-GSDLUW/ja_gsdluw-ud-test.conllu"
"UD_210 test set of JAPANESE_GSDLUW."

UD_210_JAPANESE_MODERN_TEST = f"{_UD_210_HOME}UD_Japanese-Modern/ja_modern-ud-test.conllu"
"UD_210 test set of JAPANESE_MODERN."

UD_210_JAPANESE_PUD_TEST = f"{_UD_210_HOME}UD_Japanese-PUD/ja_pud-ud-test.conllu"
"UD_210 test set of JAPANESE_PUD."

UD_210_JAPANESE_PUDLUW_TEST = f"{_UD_210_HOME}UD_Japanese-PUDLUW/ja_pudluw-ud-test.conllu"
"UD_210 test set of JAPANESE_PUDLUW."

UD_210_JAVANESE_CSUI_TEST = f"{_UD_210_HOME}UD_Javanese-CSUI/jv_csui-ud-test.conllu"
"UD_210 test set of JAVANESE_CSUI."

UD_210_KAAPOR_TUDET_TEST = f"{_UD_210_HOME}UD_Kaapor-TuDeT/urb_tudet-ud-test.conllu"
"UD_210 test set of KAAPOR_TUDET."

UD_210_KANGRI_KDTB_TEST = f"{_UD_210_HOME}UD_Kangri-KDTB/xnr_kdtb-ud-test.conllu"
"UD_210 test set of KANGRI_KDTB."

UD_210_KARELIAN_KKPP_TEST = f"{_UD_210_HOME}UD_Karelian-KKPP/krl_kkpp-ud-test.conllu"
"UD_210 test set of KARELIAN_KKPP."

UD_210_KARO_TUDET_TEST = f"{_UD_210_HOME}UD_Karo-TuDeT/arr_tudet-ud-test.conllu"
"UD_210 test set of KARO_TUDET."

UD_210_KAZAKH_KTB_TRAIN = f"{_UD_210_HOME}UD_Kazakh-KTB/kk_ktb-ud-train.conllu"
"UD_210 train set of KAZAKH_KTB."
# UD_210 split files, UD_210_KAZAKH_KTB_TEST through UD_210_LATIN_ITTB_DEV.
# Each value is _UD_210_HOME with the treebank-relative ".conllu" path
# appended; the bare string under a constant describes it.
UD_210_KAZAKH_KTB_TEST = f"{_UD_210_HOME}UD_Kazakh-KTB/kk_ktb-ud-test.conllu"
"UD_210 test set of KAZAKH_KTB."

UD_210_KHUNSARI_AHA_TEST = f"{_UD_210_HOME}UD_Khunsari-AHA/kfm_aha-ud-test.conllu"
"UD_210 test set of KHUNSARI_AHA."

UD_210_KICHE_IU_TEST = f"{_UD_210_HOME}UD_Kiche-IU/quc_iu-ud-test.conllu"
"UD_210 test set of KICHE_IU."

UD_210_KOMI_PERMYAK_UH_TEST = f"{_UD_210_HOME}UD_Komi_Permyak-UH/koi_uh-ud-test.conllu"
"UD_210 test set of KOMI_PERMYAK_UH."

UD_210_KOMI_ZYRIAN_IKDP_TEST = f"{_UD_210_HOME}UD_Komi_Zyrian-IKDP/kpv_ikdp-ud-test.conllu"
"UD_210 test set of KOMI_ZYRIAN_IKDP."

UD_210_KOMI_ZYRIAN_LATTICE_TEST = f"{_UD_210_HOME}UD_Komi_Zyrian-Lattice/kpv_lattice-ud-test.conllu"
"UD_210 test set of KOMI_ZYRIAN_LATTICE."

UD_210_KOREAN_GSD_TRAIN = f"{_UD_210_HOME}UD_Korean-GSD/ko_gsd-ud-train.conllu"
"UD_210 train set of KOREAN_GSD."
UD_210_KOREAN_GSD_DEV = f"{_UD_210_HOME}UD_Korean-GSD/ko_gsd-ud-dev.conllu"
"UD_210 dev set of KOREAN_GSD."
UD_210_KOREAN_GSD_TEST = f"{_UD_210_HOME}UD_Korean-GSD/ko_gsd-ud-test.conllu"
"UD_210 test set of KOREAN_GSD."

UD_210_KOREAN_KAIST_TRAIN = f"{_UD_210_HOME}UD_Korean-Kaist/ko_kaist-ud-train.conllu"
"UD_210 train set of KOREAN_KAIST."
UD_210_KOREAN_KAIST_DEV = f"{_UD_210_HOME}UD_Korean-Kaist/ko_kaist-ud-dev.conllu"
"UD_210 dev set of KOREAN_KAIST."
UD_210_KOREAN_KAIST_TEST = f"{_UD_210_HOME}UD_Korean-Kaist/ko_kaist-ud-test.conllu"
"UD_210 test set of KOREAN_KAIST."

UD_210_KOREAN_PUD_TEST = f"{_UD_210_HOME}UD_Korean-PUD/ko_pud-ud-test.conllu"
"UD_210 test set of KOREAN_PUD."

UD_210_KURMANJI_MG_TRAIN = f"{_UD_210_HOME}UD_Kurmanji-MG/kmr_mg-ud-train.conllu"
"UD_210 train set of KURMANJI_MG."
UD_210_KURMANJI_MG_TEST = f"{_UD_210_HOME}UD_Kurmanji-MG/kmr_mg-ud-test.conllu"
"UD_210 test set of KURMANJI_MG."

UD_210_LATIN_ITTB_TRAIN = f"{_UD_210_HOME}UD_Latin-ITTB/la_ittb-ud-train.conllu"
"UD_210 train set of LATIN_ITTB."
UD_210_LATIN_ITTB_DEV = f"{_UD_210_HOME}UD_Latin-ITTB/la_ittb-ud-dev.conllu"
"UD_210 dev set of LATIN_ITTB."
# UD_210 split files, UD_210_LATIN_ITTB_TEST through UD_210_LIGURIAN_GLT_TRAIN.
# Each value is _UD_210_HOME with the treebank-relative ".conllu" path
# appended; the bare string under a constant describes it.
UD_210_LATIN_ITTB_TEST = f"{_UD_210_HOME}UD_Latin-ITTB/la_ittb-ud-test.conllu"
"UD_210 test set of LATIN_ITTB."

UD_210_LATIN_LLCT_TRAIN = f"{_UD_210_HOME}UD_Latin-LLCT/la_llct-ud-train.conllu"
"UD_210 train set of LATIN_LLCT."
UD_210_LATIN_LLCT_DEV = f"{_UD_210_HOME}UD_Latin-LLCT/la_llct-ud-dev.conllu"
"UD_210 dev set of LATIN_LLCT."
UD_210_LATIN_LLCT_TEST = f"{_UD_210_HOME}UD_Latin-LLCT/la_llct-ud-test.conllu"
"UD_210 test set of LATIN_LLCT."

UD_210_LATIN_PROIEL_TRAIN = f"{_UD_210_HOME}UD_Latin-PROIEL/la_proiel-ud-train.conllu"
"UD_210 train set of LATIN_PROIEL."
UD_210_LATIN_PROIEL_DEV = f"{_UD_210_HOME}UD_Latin-PROIEL/la_proiel-ud-dev.conllu"
"UD_210 dev set of LATIN_PROIEL."
UD_210_LATIN_PROIEL_TEST = f"{_UD_210_HOME}UD_Latin-PROIEL/la_proiel-ud-test.conllu"
"UD_210 test set of LATIN_PROIEL."

UD_210_LATIN_PERSEUS_TRAIN = f"{_UD_210_HOME}UD_Latin-Perseus/la_perseus-ud-train.conllu"
"UD_210 train set of LATIN_PERSEUS."
UD_210_LATIN_PERSEUS_TEST = f"{_UD_210_HOME}UD_Latin-Perseus/la_perseus-ud-test.conllu"
"UD_210 test set of LATIN_PERSEUS."

UD_210_LATIN_UDANTE_TRAIN = f"{_UD_210_HOME}UD_Latin-UDante/la_udante-ud-train.conllu"
"UD_210 train set of LATIN_UDANTE."
UD_210_LATIN_UDANTE_DEV = f"{_UD_210_HOME}UD_Latin-UDante/la_udante-ud-dev.conllu"
"UD_210 dev set of LATIN_UDANTE."
UD_210_LATIN_UDANTE_TEST = f"{_UD_210_HOME}UD_Latin-UDante/la_udante-ud-test.conllu"
"UD_210 test set of LATIN_UDANTE."

UD_210_LATVIAN_LVTB_TRAIN = f"{_UD_210_HOME}UD_Latvian-LVTB/lv_lvtb-ud-train.conllu"
"UD_210 train set of LATVIAN_LVTB."
UD_210_LATVIAN_LVTB_DEV = f"{_UD_210_HOME}UD_Latvian-LVTB/lv_lvtb-ud-dev.conllu"
"UD_210 dev set of LATVIAN_LVTB."
UD_210_LATVIAN_LVTB_TEST = f"{_UD_210_HOME}UD_Latvian-LVTB/lv_lvtb-ud-test.conllu"
"UD_210 test set of LATVIAN_LVTB."

UD_210_LIGURIAN_GLT_TRAIN = f"{_UD_210_HOME}UD_Ligurian-GLT/lij_glt-ud-train.conllu"
"UD_210 train set of LIGURIAN_GLT."
# UD_210 split files, UD_210_LIGURIAN_GLT_TEST through UD_210_MANX_CADHAN_TEST.
# Each value is _UD_210_HOME with the treebank-relative ".conllu" path
# appended; the bare string under a constant describes it.
UD_210_LIGURIAN_GLT_TEST = f"{_UD_210_HOME}UD_Ligurian-GLT/lij_glt-ud-test.conllu"
"UD_210 test set of LIGURIAN_GLT."

UD_210_LITHUANIAN_ALKSNIS_TRAIN = f"{_UD_210_HOME}UD_Lithuanian-ALKSNIS/lt_alksnis-ud-train.conllu"
"UD_210 train set of LITHUANIAN_ALKSNIS."
UD_210_LITHUANIAN_ALKSNIS_DEV = f"{_UD_210_HOME}UD_Lithuanian-ALKSNIS/lt_alksnis-ud-dev.conllu"
"UD_210 dev set of LITHUANIAN_ALKSNIS."
UD_210_LITHUANIAN_ALKSNIS_TEST = f"{_UD_210_HOME}UD_Lithuanian-ALKSNIS/lt_alksnis-ud-test.conllu"
"UD_210 test set of LITHUANIAN_ALKSNIS."

UD_210_LITHUANIAN_HSE_TRAIN = f"{_UD_210_HOME}UD_Lithuanian-HSE/lt_hse-ud-train.conllu"
"UD_210 train set of LITHUANIAN_HSE."
UD_210_LITHUANIAN_HSE_DEV = f"{_UD_210_HOME}UD_Lithuanian-HSE/lt_hse-ud-dev.conllu"
"UD_210 dev set of LITHUANIAN_HSE."
UD_210_LITHUANIAN_HSE_TEST = f"{_UD_210_HOME}UD_Lithuanian-HSE/lt_hse-ud-test.conllu"
"UD_210 test set of LITHUANIAN_HSE."

UD_210_LIVVI_KKPP_TRAIN = f"{_UD_210_HOME}UD_Livvi-KKPP/olo_kkpp-ud-train.conllu"
"UD_210 train set of LIVVI_KKPP."
UD_210_LIVVI_KKPP_TEST = f"{_UD_210_HOME}UD_Livvi-KKPP/olo_kkpp-ud-test.conllu"
"UD_210 test set of LIVVI_KKPP."

UD_210_LOW_SAXON_LSDC_TEST = f"{_UD_210_HOME}UD_Low_Saxon-LSDC/nds_lsdc-ud-test.conllu"
"UD_210 test set of LOW_SAXON_LSDC."

UD_210_MADI_JARAWARA_TEST = f"{_UD_210_HOME}UD_Madi-Jarawara/jaa_jarawara-ud-test.conllu"
"UD_210 test set of MADI_JARAWARA."

UD_210_MAKURAP_TUDET_TEST = f"{_UD_210_HOME}UD_Makurap-TuDeT/mpu_tudet-ud-test.conllu"
"UD_210 test set of MAKURAP_TUDET."

UD_210_MALTESE_MUDT_TRAIN = f"{_UD_210_HOME}UD_Maltese-MUDT/mt_mudt-ud-train.conllu"
"UD_210 train set of MALTESE_MUDT."
UD_210_MALTESE_MUDT_DEV = f"{_UD_210_HOME}UD_Maltese-MUDT/mt_mudt-ud-dev.conllu"
"UD_210 dev set of MALTESE_MUDT."
UD_210_MALTESE_MUDT_TEST = f"{_UD_210_HOME}UD_Maltese-MUDT/mt_mudt-ud-test.conllu"
"UD_210 test set of MALTESE_MUDT."

UD_210_MANX_CADHAN_TEST = f"{_UD_210_HOME}UD_Manx-Cadhan/gv_cadhan-ud-test.conllu"
"UD_210 test set of MANX_CADHAN."
# UD_210 split files, UD_210_MARATHI_UFAL_TRAIN through
# UD_210_NORWEGIAN_BOKMAAL_DEV. Each value is _UD_210_HOME with the
# treebank-relative ".conllu" path appended; the bare string under a constant
# describes it.
UD_210_MARATHI_UFAL_TRAIN = f"{_UD_210_HOME}UD_Marathi-UFAL/mr_ufal-ud-train.conllu"
"UD_210 train set of MARATHI_UFAL."
UD_210_MARATHI_UFAL_DEV = f"{_UD_210_HOME}UD_Marathi-UFAL/mr_ufal-ud-dev.conllu"
"UD_210 dev set of MARATHI_UFAL."
UD_210_MARATHI_UFAL_TEST = f"{_UD_210_HOME}UD_Marathi-UFAL/mr_ufal-ud-test.conllu"
"UD_210 test set of MARATHI_UFAL."

UD_210_MBYA_GUARANI_DOOLEY_TEST = f"{_UD_210_HOME}UD_Mbya_Guarani-Dooley/gun_dooley-ud-test.conllu"
"UD_210 test set of MBYA_GUARANI_DOOLEY."

UD_210_MBYA_GUARANI_THOMAS_TEST = f"{_UD_210_HOME}UD_Mbya_Guarani-Thomas/gun_thomas-ud-test.conllu"
"UD_210 test set of MBYA_GUARANI_THOMAS."

UD_210_MOKSHA_JR_TEST = f"{_UD_210_HOME}UD_Moksha-JR/mdf_jr-ud-test.conllu"
"UD_210 test set of MOKSHA_JR."

UD_210_MUNDURUKU_TUDET_TEST = f"{_UD_210_HOME}UD_Munduruku-TuDeT/myu_tudet-ud-test.conllu"
"UD_210 test set of MUNDURUKU_TUDET."

UD_210_NAIJA_NSC_TRAIN = f"{_UD_210_HOME}UD_Naija-NSC/pcm_nsc-ud-train.conllu"
"UD_210 train set of NAIJA_NSC."
UD_210_NAIJA_NSC_DEV = f"{_UD_210_HOME}UD_Naija-NSC/pcm_nsc-ud-dev.conllu"
"UD_210 dev set of NAIJA_NSC."
UD_210_NAIJA_NSC_TEST = f"{_UD_210_HOME}UD_Naija-NSC/pcm_nsc-ud-test.conllu"
"UD_210 test set of NAIJA_NSC."

UD_210_NAYINI_AHA_TEST = f"{_UD_210_HOME}UD_Nayini-AHA/nyq_aha-ud-test.conllu"
"UD_210 test set of NAYINI_AHA."

UD_210_NEAPOLITAN_RB_TEST = f"{_UD_210_HOME}UD_Neapolitan-RB/nap_rb-ud-test.conllu"
"UD_210 test set of NEAPOLITAN_RB."

UD_210_NORTH_SAMI_GIELLA_TRAIN = f"{_UD_210_HOME}UD_North_Sami-Giella/sme_giella-ud-train.conllu"
"UD_210 train set of NORTH_SAMI_GIELLA."
UD_210_NORTH_SAMI_GIELLA_TEST = f"{_UD_210_HOME}UD_North_Sami-Giella/sme_giella-ud-test.conllu"
"UD_210 test set of NORTH_SAMI_GIELLA."

UD_210_NORWEGIAN_BOKMAAL_TRAIN = f"{_UD_210_HOME}UD_Norwegian-Bokmaal/no_bokmaal-ud-train.conllu"
"UD_210 train set of NORWEGIAN_BOKMAAL."
UD_210_NORWEGIAN_BOKMAAL_DEV = f"{_UD_210_HOME}UD_Norwegian-Bokmaal/no_bokmaal-ud-dev.conllu"
"UD_210 dev set of NORWEGIAN_BOKMAAL."
# UD_210 split files, UD_210_NORWEGIAN_BOKMAAL_TEST through
# UD_210_OLD_EAST_SLAVIC_BIRCHBARK_TEST. Each value is _UD_210_HOME with the
# treebank-relative ".conllu" path appended; the bare string under a constant
# describes it.
UD_210_NORWEGIAN_BOKMAAL_TEST = f"{_UD_210_HOME}UD_Norwegian-Bokmaal/no_bokmaal-ud-test.conllu"
"UD_210 test set of NORWEGIAN_BOKMAAL."

UD_210_NORWEGIAN_NYNORSK_TRAIN = f"{_UD_210_HOME}UD_Norwegian-Nynorsk/no_nynorsk-ud-train.conllu"
"UD_210 train set of NORWEGIAN_NYNORSK."
UD_210_NORWEGIAN_NYNORSK_DEV = f"{_UD_210_HOME}UD_Norwegian-Nynorsk/no_nynorsk-ud-dev.conllu"
"UD_210 dev set of NORWEGIAN_NYNORSK."
UD_210_NORWEGIAN_NYNORSK_TEST = f"{_UD_210_HOME}UD_Norwegian-Nynorsk/no_nynorsk-ud-test.conllu"
"UD_210 test set of NORWEGIAN_NYNORSK."

UD_210_NORWEGIAN_NYNORSKLIA_TRAIN = f"{_UD_210_HOME}UD_Norwegian-NynorskLIA/no_nynorsklia-ud-train.conllu"
"UD_210 train set of NORWEGIAN_NYNORSKLIA."
UD_210_NORWEGIAN_NYNORSKLIA_DEV = f"{_UD_210_HOME}UD_Norwegian-NynorskLIA/no_nynorsklia-ud-dev.conllu"
"UD_210 dev set of NORWEGIAN_NYNORSKLIA."
UD_210_NORWEGIAN_NYNORSKLIA_TEST = f"{_UD_210_HOME}UD_Norwegian-NynorskLIA/no_nynorsklia-ud-test.conllu"
"UD_210 test set of NORWEGIAN_NYNORSKLIA."

UD_210_OLD_CHURCH_SLAVONIC_PROIEL_TRAIN = f"{_UD_210_HOME}UD_Old_Church_Slavonic-PROIEL/cu_proiel-ud-train.conllu"
"UD_210 train set of OLD_CHURCH_SLAVONIC_PROIEL."
UD_210_OLD_CHURCH_SLAVONIC_PROIEL_DEV = f"{_UD_210_HOME}UD_Old_Church_Slavonic-PROIEL/cu_proiel-ud-dev.conllu"
"UD_210 dev set of OLD_CHURCH_SLAVONIC_PROIEL."
UD_210_OLD_CHURCH_SLAVONIC_PROIEL_TEST = f"{_UD_210_HOME}UD_Old_Church_Slavonic-PROIEL/cu_proiel-ud-test.conllu"
"UD_210 test set of OLD_CHURCH_SLAVONIC_PROIEL."

UD_210_OLD_EAST_SLAVIC_BIRCHBARK_TRAIN = f"{_UD_210_HOME}UD_Old_East_Slavic-Birchbark/orv_birchbark-ud-train.conllu"
"UD_210 train set of OLD_EAST_SLAVIC_BIRCHBARK."
UD_210_OLD_EAST_SLAVIC_BIRCHBARK_DEV = f"{_UD_210_HOME}UD_Old_East_Slavic-Birchbark/orv_birchbark-ud-dev.conllu"
"UD_210 dev set of OLD_EAST_SLAVIC_BIRCHBARK."
UD_210_OLD_EAST_SLAVIC_BIRCHBARK_TEST = f"{_UD_210_HOME}UD_Old_East_Slavic-Birchbark/orv_birchbark-ud-test.conllu"
"UD_210 test set of OLD_EAST_SLAVIC_BIRCHBARK."
# UD_210 split files, UD_210_OLD_EAST_SLAVIC_RNC_TRAIN through
# UD_210_PERSIAN_SERAJI_TEST. Each value is _UD_210_HOME with the
# treebank-relative ".conllu" path appended; the bare string under a constant
# describes it.
UD_210_OLD_EAST_SLAVIC_RNC_TRAIN = f"{_UD_210_HOME}UD_Old_East_Slavic-RNC/orv_rnc-ud-train.conllu"
"UD_210 train set of OLD_EAST_SLAVIC_RNC."
UD_210_OLD_EAST_SLAVIC_RNC_TEST = f"{_UD_210_HOME}UD_Old_East_Slavic-RNC/orv_rnc-ud-test.conllu"
"UD_210 test set of OLD_EAST_SLAVIC_RNC."

UD_210_OLD_EAST_SLAVIC_TOROT_TRAIN = f"{_UD_210_HOME}UD_Old_East_Slavic-TOROT/orv_torot-ud-train.conllu"
"UD_210 train set of OLD_EAST_SLAVIC_TOROT."
UD_210_OLD_EAST_SLAVIC_TOROT_DEV = f"{_UD_210_HOME}UD_Old_East_Slavic-TOROT/orv_torot-ud-dev.conllu"
"UD_210 dev set of OLD_EAST_SLAVIC_TOROT."
UD_210_OLD_EAST_SLAVIC_TOROT_TEST = f"{_UD_210_HOME}UD_Old_East_Slavic-TOROT/orv_torot-ud-test.conllu"
"UD_210 test set of OLD_EAST_SLAVIC_TOROT."

UD_210_OLD_FRENCH_SRCMF_TRAIN = f"{_UD_210_HOME}UD_Old_French-SRCMF/fro_srcmf-ud-train.conllu"
"UD_210 train set of OLD_FRENCH_SRCMF."
UD_210_OLD_FRENCH_SRCMF_DEV = f"{_UD_210_HOME}UD_Old_French-SRCMF/fro_srcmf-ud-dev.conllu"
"UD_210 dev set of OLD_FRENCH_SRCMF."
UD_210_OLD_FRENCH_SRCMF_TEST = f"{_UD_210_HOME}UD_Old_French-SRCMF/fro_srcmf-ud-test.conllu"
"UD_210 test set of OLD_FRENCH_SRCMF."

UD_210_OLD_TURKISH_TONQQ_TEST = f"{_UD_210_HOME}UD_Old_Turkish-Tonqq/otk_tonqq-ud-test.conllu"
"UD_210 test set of OLD_TURKISH_TONQQ."

UD_210_PERSIAN_PERDT_TRAIN = f"{_UD_210_HOME}UD_Persian-PerDT/fa_perdt-ud-train.conllu"
"UD_210 train set of PERSIAN_PERDT."
UD_210_PERSIAN_PERDT_DEV = f"{_UD_210_HOME}UD_Persian-PerDT/fa_perdt-ud-dev.conllu"
"UD_210 dev set of PERSIAN_PERDT."
UD_210_PERSIAN_PERDT_TEST = f"{_UD_210_HOME}UD_Persian-PerDT/fa_perdt-ud-test.conllu"
"UD_210 test set of PERSIAN_PERDT."

UD_210_PERSIAN_SERAJI_TRAIN = f"{_UD_210_HOME}UD_Persian-Seraji/fa_seraji-ud-train.conllu"
"UD_210 train set of PERSIAN_SERAJI."
UD_210_PERSIAN_SERAJI_DEV = f"{_UD_210_HOME}UD_Persian-Seraji/fa_seraji-ud-dev.conllu"
"UD_210 dev set of PERSIAN_SERAJI."
UD_210_PERSIAN_SERAJI_TEST = f"{_UD_210_HOME}UD_Persian-Seraji/fa_seraji-ud-test.conllu"
"UD_210 test set of PERSIAN_SERAJI."
# UD_210 split files, UD_210_POLISH_LFG_TRAIN through
# UD_210_PORTUGUESE_GSD_TEST. Each value is _UD_210_HOME with the
# treebank-relative ".conllu" path appended; the bare string under a constant
# describes it.
UD_210_POLISH_LFG_TRAIN = f"{_UD_210_HOME}UD_Polish-LFG/pl_lfg-ud-train.conllu"
"UD_210 train set of POLISH_LFG."
UD_210_POLISH_LFG_DEV = f"{_UD_210_HOME}UD_Polish-LFG/pl_lfg-ud-dev.conllu"
"UD_210 dev set of POLISH_LFG."
UD_210_POLISH_LFG_TEST = f"{_UD_210_HOME}UD_Polish-LFG/pl_lfg-ud-test.conllu"
"UD_210 test set of POLISH_LFG."

UD_210_POLISH_PDB_TRAIN = f"{_UD_210_HOME}UD_Polish-PDB/pl_pdb-ud-train.conllu"
"UD_210 train set of POLISH_PDB."
UD_210_POLISH_PDB_DEV = f"{_UD_210_HOME}UD_Polish-PDB/pl_pdb-ud-dev.conllu"
"UD_210 dev set of POLISH_PDB."
UD_210_POLISH_PDB_TEST = f"{_UD_210_HOME}UD_Polish-PDB/pl_pdb-ud-test.conllu"
"UD_210 test set of POLISH_PDB."

UD_210_POLISH_PUD_TEST = f"{_UD_210_HOME}UD_Polish-PUD/pl_pud-ud-test.conllu"
"UD_210 test set of POLISH_PUD."

UD_210_POMAK_PHILOTIS_TRAIN = f"{_UD_210_HOME}UD_Pomak-Philotis/qpm_philotis-ud-train.conllu"
"UD_210 train set of POMAK_PHILOTIS."
UD_210_POMAK_PHILOTIS_DEV = f"{_UD_210_HOME}UD_Pomak-Philotis/qpm_philotis-ud-dev.conllu"
"UD_210 dev set of POMAK_PHILOTIS."
UD_210_POMAK_PHILOTIS_TEST = f"{_UD_210_HOME}UD_Pomak-Philotis/qpm_philotis-ud-test.conllu"
"UD_210 test set of POMAK_PHILOTIS."

UD_210_PORTUGUESE_BOSQUE_TRAIN = f"{_UD_210_HOME}UD_Portuguese-Bosque/pt_bosque-ud-train.conllu"
"UD_210 train set of PORTUGUESE_BOSQUE."
UD_210_PORTUGUESE_BOSQUE_DEV = f"{_UD_210_HOME}UD_Portuguese-Bosque/pt_bosque-ud-dev.conllu"
"UD_210 dev set of PORTUGUESE_BOSQUE."
UD_210_PORTUGUESE_BOSQUE_TEST = f"{_UD_210_HOME}UD_Portuguese-Bosque/pt_bosque-ud-test.conllu"
"UD_210 test set of PORTUGUESE_BOSQUE."

UD_210_PORTUGUESE_GSD_TRAIN = f"{_UD_210_HOME}UD_Portuguese-GSD/pt_gsd-ud-train.conllu"
"UD_210 train set of PORTUGUESE_GSD."
UD_210_PORTUGUESE_GSD_DEV = f"{_UD_210_HOME}UD_Portuguese-GSD/pt_gsd-ud-dev.conllu"
"UD_210 dev set of PORTUGUESE_GSD."
UD_210_PORTUGUESE_GSD_TEST = f"{_UD_210_HOME}UD_Portuguese-GSD/pt_gsd-ud-test.conllu"
"UD_210 test set of PORTUGUESE_GSD."
# UD 2.10 treebank splits: Portuguese (PUD), Romanian, Russian (GSD, PUD).
# Each constant is _UD_210_HOME plus a "<treebank dir>/<file>.conllu" path; the bare string after it is its docstring.
UD_210_PORTUGUESE_PUD_TEST = _UD_210_HOME + "UD_Portuguese-PUD/pt_pud-ud-test.conllu"
"UD_210 test set of PORTUGUESE_PUD."
UD_210_ROMANIAN_ART_TEST = _UD_210_HOME + "UD_Romanian-ArT/ro_art-ud-test.conllu"
"UD_210 test set of ROMANIAN_ART."
UD_210_ROMANIAN_NONSTANDARD_TRAIN = _UD_210_HOME + "UD_Romanian-Nonstandard/ro_nonstandard-ud-train.conllu"
"UD_210 train set of ROMANIAN_NONSTANDARD."
UD_210_ROMANIAN_NONSTANDARD_DEV = _UD_210_HOME + "UD_Romanian-Nonstandard/ro_nonstandard-ud-dev.conllu"
"UD_210 dev set of ROMANIAN_NONSTANDARD."
UD_210_ROMANIAN_NONSTANDARD_TEST = _UD_210_HOME + "UD_Romanian-Nonstandard/ro_nonstandard-ud-test.conllu"
"UD_210 test set of ROMANIAN_NONSTANDARD."
UD_210_ROMANIAN_RRT_TRAIN = _UD_210_HOME + "UD_Romanian-RRT/ro_rrt-ud-train.conllu"
"UD_210 train set of ROMANIAN_RRT."
UD_210_ROMANIAN_RRT_DEV = _UD_210_HOME + "UD_Romanian-RRT/ro_rrt-ud-dev.conllu"
"UD_210 dev set of ROMANIAN_RRT."
UD_210_ROMANIAN_RRT_TEST = _UD_210_HOME + "UD_Romanian-RRT/ro_rrt-ud-test.conllu"
"UD_210 test set of ROMANIAN_RRT."
UD_210_ROMANIAN_SIMONERO_TRAIN = _UD_210_HOME + "UD_Romanian-SiMoNERo/ro_simonero-ud-train.conllu"
"UD_210 train set of ROMANIAN_SIMONERO."
UD_210_ROMANIAN_SIMONERO_DEV = _UD_210_HOME + "UD_Romanian-SiMoNERo/ro_simonero-ud-dev.conllu"
"UD_210 dev set of ROMANIAN_SIMONERO."
UD_210_ROMANIAN_SIMONERO_TEST = _UD_210_HOME + "UD_Romanian-SiMoNERo/ro_simonero-ud-test.conllu"
"UD_210 test set of ROMANIAN_SIMONERO."
UD_210_RUSSIAN_GSD_TRAIN = _UD_210_HOME + "UD_Russian-GSD/ru_gsd-ud-train.conllu"
"UD_210 train set of RUSSIAN_GSD."
UD_210_RUSSIAN_GSD_DEV = _UD_210_HOME + "UD_Russian-GSD/ru_gsd-ud-dev.conllu"
"UD_210 dev set of RUSSIAN_GSD."
UD_210_RUSSIAN_GSD_TEST = _UD_210_HOME + "UD_Russian-GSD/ru_gsd-ud-test.conllu"
"UD_210 test set of RUSSIAN_GSD."
UD_210_RUSSIAN_PUD_TEST = _UD_210_HOME + "UD_Russian-PUD/ru_pud-ud-test.conllu"
"UD_210 test set of RUSSIAN_PUD."
# UD 2.10 treebank splits: Russian (SynTagRus, Taiga), Sanskrit, Scottish Gaelic, Serbian.
# Each constant is _UD_210_HOME plus a "<treebank dir>/<file>.conllu" path; the bare string after it is its docstring.
UD_210_RUSSIAN_SYNTAGRUS_TRAIN = _UD_210_HOME + "UD_Russian-SynTagRus/ru_syntagrus-ud-train.conllu"
"UD_210 train set of RUSSIAN_SYNTAGRUS."
UD_210_RUSSIAN_SYNTAGRUS_DEV = _UD_210_HOME + "UD_Russian-SynTagRus/ru_syntagrus-ud-dev.conllu"
"UD_210 dev set of RUSSIAN_SYNTAGRUS."
UD_210_RUSSIAN_SYNTAGRUS_TEST = _UD_210_HOME + "UD_Russian-SynTagRus/ru_syntagrus-ud-test.conllu"
"UD_210 test set of RUSSIAN_SYNTAGRUS."
UD_210_RUSSIAN_TAIGA_TRAIN = _UD_210_HOME + "UD_Russian-Taiga/ru_taiga-ud-train.conllu"
"UD_210 train set of RUSSIAN_TAIGA."
UD_210_RUSSIAN_TAIGA_DEV = _UD_210_HOME + "UD_Russian-Taiga/ru_taiga-ud-dev.conllu"
"UD_210 dev set of RUSSIAN_TAIGA."
UD_210_RUSSIAN_TAIGA_TEST = _UD_210_HOME + "UD_Russian-Taiga/ru_taiga-ud-test.conllu"
"UD_210 test set of RUSSIAN_TAIGA."
UD_210_SANSKRIT_UFAL_TEST = _UD_210_HOME + "UD_Sanskrit-UFAL/sa_ufal-ud-test.conllu"
"UD_210 test set of SANSKRIT_UFAL."
UD_210_SANSKRIT_VEDIC_TRAIN = _UD_210_HOME + "UD_Sanskrit-Vedic/sa_vedic-ud-train.conllu"
"UD_210 train set of SANSKRIT_VEDIC."
UD_210_SANSKRIT_VEDIC_TEST = _UD_210_HOME + "UD_Sanskrit-Vedic/sa_vedic-ud-test.conllu"
"UD_210 test set of SANSKRIT_VEDIC."
UD_210_SCOTTISH_GAELIC_ARCOSG_TRAIN = _UD_210_HOME + "UD_Scottish_Gaelic-ARCOSG/gd_arcosg-ud-train.conllu"
"UD_210 train set of SCOTTISH_GAELIC_ARCOSG."
UD_210_SCOTTISH_GAELIC_ARCOSG_DEV = _UD_210_HOME + "UD_Scottish_Gaelic-ARCOSG/gd_arcosg-ud-dev.conllu"
"UD_210 dev set of SCOTTISH_GAELIC_ARCOSG."
UD_210_SCOTTISH_GAELIC_ARCOSG_TEST = _UD_210_HOME + "UD_Scottish_Gaelic-ARCOSG/gd_arcosg-ud-test.conllu"
"UD_210 test set of SCOTTISH_GAELIC_ARCOSG."
UD_210_SERBIAN_SET_TRAIN = _UD_210_HOME + "UD_Serbian-SET/sr_set-ud-train.conllu"
"UD_210 train set of SERBIAN_SET."
UD_210_SERBIAN_SET_DEV = _UD_210_HOME + "UD_Serbian-SET/sr_set-ud-dev.conllu"
"UD_210 dev set of SERBIAN_SET."
UD_210_SERBIAN_SET_TEST = _UD_210_HOME + "UD_Serbian-SET/sr_set-ud-test.conllu"
"UD_210 test set of SERBIAN_SET."
# UD 2.10 treebank splits: Skolt Sami, Slovak, Slovenian, Soi, South Levantine Arabic, Spanish (AnCora, GSD train/dev).
# Each constant is _UD_210_HOME plus a "<treebank dir>/<file>.conllu" path; the bare string after it is its docstring.
UD_210_SKOLT_SAMI_GIELLAGAS_TEST = _UD_210_HOME + "UD_Skolt_Sami-Giellagas/sms_giellagas-ud-test.conllu"
"UD_210 test set of SKOLT_SAMI_GIELLAGAS."
UD_210_SLOVAK_SNK_TRAIN = _UD_210_HOME + "UD_Slovak-SNK/sk_snk-ud-train.conllu"
"UD_210 train set of SLOVAK_SNK."
UD_210_SLOVAK_SNK_DEV = _UD_210_HOME + "UD_Slovak-SNK/sk_snk-ud-dev.conllu"
"UD_210 dev set of SLOVAK_SNK."
UD_210_SLOVAK_SNK_TEST = _UD_210_HOME + "UD_Slovak-SNK/sk_snk-ud-test.conllu"
"UD_210 test set of SLOVAK_SNK."
UD_210_SLOVENIAN_SSJ_TRAIN = _UD_210_HOME + "UD_Slovenian-SSJ/sl_ssj-ud-train.conllu"
"UD_210 train set of SLOVENIAN_SSJ."
UD_210_SLOVENIAN_SSJ_DEV = _UD_210_HOME + "UD_Slovenian-SSJ/sl_ssj-ud-dev.conllu"
"UD_210 dev set of SLOVENIAN_SSJ."
UD_210_SLOVENIAN_SSJ_TEST = _UD_210_HOME + "UD_Slovenian-SSJ/sl_ssj-ud-test.conllu"
"UD_210 test set of SLOVENIAN_SSJ."
UD_210_SLOVENIAN_SST_TRAIN = _UD_210_HOME + "UD_Slovenian-SST/sl_sst-ud-train.conllu"
"UD_210 train set of SLOVENIAN_SST."
UD_210_SLOVENIAN_SST_TEST = _UD_210_HOME + "UD_Slovenian-SST/sl_sst-ud-test.conllu"
"UD_210 test set of SLOVENIAN_SST."
UD_210_SOI_AHA_TEST = _UD_210_HOME + "UD_Soi-AHA/soj_aha-ud-test.conllu"
"UD_210 test set of SOI_AHA."
UD_210_SOUTH_LEVANTINE_ARABIC_MADAR_TEST = _UD_210_HOME + "UD_South_Levantine_Arabic-MADAR/ajp_madar-ud-test.conllu"
"UD_210 test set of SOUTH_LEVANTINE_ARABIC_MADAR."
UD_210_SPANISH_ANCORA_TRAIN = _UD_210_HOME + "UD_Spanish-AnCora/es_ancora-ud-train.conllu"
"UD_210 train set of SPANISH_ANCORA."
UD_210_SPANISH_ANCORA_DEV = _UD_210_HOME + "UD_Spanish-AnCora/es_ancora-ud-dev.conllu"
"UD_210 dev set of SPANISH_ANCORA."
UD_210_SPANISH_ANCORA_TEST = _UD_210_HOME + "UD_Spanish-AnCora/es_ancora-ud-test.conllu"
"UD_210 test set of SPANISH_ANCORA."
UD_210_SPANISH_GSD_TRAIN = _UD_210_HOME + "UD_Spanish-GSD/es_gsd-ud-train.conllu"
"UD_210 train set of SPANISH_GSD."
UD_210_SPANISH_GSD_DEV = _UD_210_HOME + "UD_Spanish-GSD/es_gsd-ud-dev.conllu"
"UD_210 dev set of SPANISH_GSD."
# UD 2.10 treebank splits: Spanish (GSD test, PUD), Swedish, Swedish Sign Language, Swiss German, Tagalog.
# Each constant is _UD_210_HOME plus a "<treebank dir>/<file>.conllu" path; the bare string after it is its docstring.
UD_210_SPANISH_GSD_TEST = _UD_210_HOME + "UD_Spanish-GSD/es_gsd-ud-test.conllu"
"UD_210 test set of SPANISH_GSD."
UD_210_SPANISH_PUD_TEST = _UD_210_HOME + "UD_Spanish-PUD/es_pud-ud-test.conllu"
"UD_210 test set of SPANISH_PUD."
UD_210_SWEDISH_LINES_TRAIN = _UD_210_HOME + "UD_Swedish-LinES/sv_lines-ud-train.conllu"
"UD_210 train set of SWEDISH_LINES."
UD_210_SWEDISH_LINES_DEV = _UD_210_HOME + "UD_Swedish-LinES/sv_lines-ud-dev.conllu"
"UD_210 dev set of SWEDISH_LINES."
UD_210_SWEDISH_LINES_TEST = _UD_210_HOME + "UD_Swedish-LinES/sv_lines-ud-test.conllu"
"UD_210 test set of SWEDISH_LINES."
UD_210_SWEDISH_PUD_TEST = _UD_210_HOME + "UD_Swedish-PUD/sv_pud-ud-test.conllu"
"UD_210 test set of SWEDISH_PUD."
UD_210_SWEDISH_TALBANKEN_TRAIN = _UD_210_HOME + "UD_Swedish-Talbanken/sv_talbanken-ud-train.conllu"
"UD_210 train set of SWEDISH_TALBANKEN."
UD_210_SWEDISH_TALBANKEN_DEV = _UD_210_HOME + "UD_Swedish-Talbanken/sv_talbanken-ud-dev.conllu"
"UD_210 dev set of SWEDISH_TALBANKEN."
UD_210_SWEDISH_TALBANKEN_TEST = _UD_210_HOME + "UD_Swedish-Talbanken/sv_talbanken-ud-test.conllu"
"UD_210 test set of SWEDISH_TALBANKEN."
UD_210_SWEDISH_SIGN_LANGUAGE_SSLC_TRAIN = _UD_210_HOME + "UD_Swedish_Sign_Language-SSLC/swl_sslc-ud-train.conllu"
"UD_210 train set of SWEDISH_SIGN_LANGUAGE_SSLC."
UD_210_SWEDISH_SIGN_LANGUAGE_SSLC_DEV = _UD_210_HOME + "UD_Swedish_Sign_Language-SSLC/swl_sslc-ud-dev.conllu"
"UD_210 dev set of SWEDISH_SIGN_LANGUAGE_SSLC."
UD_210_SWEDISH_SIGN_LANGUAGE_SSLC_TEST = _UD_210_HOME + "UD_Swedish_Sign_Language-SSLC/swl_sslc-ud-test.conllu"
"UD_210 test set of SWEDISH_SIGN_LANGUAGE_SSLC."
UD_210_SWISS_GERMAN_UZH_TEST = _UD_210_HOME + "UD_Swiss_German-UZH/gsw_uzh-ud-test.conllu"
"UD_210 test set of SWISS_GERMAN_UZH."
UD_210_TAGALOG_TRG_TEST = _UD_210_HOME + "UD_Tagalog-TRG/tl_trg-ud-test.conllu"
"UD_210 test set of TAGALOG_TRG."
UD_210_TAGALOG_UGNAYAN_TEST = _UD_210_HOME + "UD_Tagalog-Ugnayan/tl_ugnayan-ud-test.conllu"
"UD_210 test set of TAGALOG_UGNAYAN."
# UD 2.10 treebank splits: Tamil, Tatar, Teko, Telugu, Thai, Tupinamba, Turkish (Atis, BOUN).
# Each constant is _UD_210_HOME plus a "<treebank dir>/<file>.conllu" path; the bare string after it is its docstring.
UD_210_TAMIL_MWTT_TEST = _UD_210_HOME + "UD_Tamil-MWTT/ta_mwtt-ud-test.conllu"
"UD_210 test set of TAMIL_MWTT."
UD_210_TAMIL_TTB_TRAIN = _UD_210_HOME + "UD_Tamil-TTB/ta_ttb-ud-train.conllu"
"UD_210 train set of TAMIL_TTB."
UD_210_TAMIL_TTB_DEV = _UD_210_HOME + "UD_Tamil-TTB/ta_ttb-ud-dev.conllu"
"UD_210 dev set of TAMIL_TTB."
UD_210_TAMIL_TTB_TEST = _UD_210_HOME + "UD_Tamil-TTB/ta_ttb-ud-test.conllu"
"UD_210 test set of TAMIL_TTB."
UD_210_TATAR_NMCTT_TEST = _UD_210_HOME + "UD_Tatar-NMCTT/tt_nmctt-ud-test.conllu"
"UD_210 test set of TATAR_NMCTT."
UD_210_TEKO_TUDET_TEST = _UD_210_HOME + "UD_Teko-TuDeT/eme_tudet-ud-test.conllu"
"UD_210 test set of TEKO_TUDET."
UD_210_TELUGU_MTG_TRAIN = _UD_210_HOME + "UD_Telugu-MTG/te_mtg-ud-train.conllu"
"UD_210 train set of TELUGU_MTG."
UD_210_TELUGU_MTG_DEV = _UD_210_HOME + "UD_Telugu-MTG/te_mtg-ud-dev.conllu"
"UD_210 dev set of TELUGU_MTG."
UD_210_TELUGU_MTG_TEST = _UD_210_HOME + "UD_Telugu-MTG/te_mtg-ud-test.conllu"
"UD_210 test set of TELUGU_MTG."
UD_210_THAI_PUD_TEST = _UD_210_HOME + "UD_Thai-PUD/th_pud-ud-test.conllu"
"UD_210 test set of THAI_PUD."
UD_210_TUPINAMBA_TUDET_TEST = _UD_210_HOME + "UD_Tupinamba-TuDeT/tpn_tudet-ud-test.conllu"
"UD_210 test set of TUPINAMBA_TUDET."
UD_210_TURKISH_ATIS_TRAIN = _UD_210_HOME + "UD_Turkish-Atis/tr_atis-ud-train.conllu"
"UD_210 train set of TURKISH_ATIS."
UD_210_TURKISH_ATIS_DEV = _UD_210_HOME + "UD_Turkish-Atis/tr_atis-ud-dev.conllu"
"UD_210 dev set of TURKISH_ATIS."
UD_210_TURKISH_ATIS_TEST = _UD_210_HOME + "UD_Turkish-Atis/tr_atis-ud-test.conllu"
"UD_210 test set of TURKISH_ATIS."
UD_210_TURKISH_BOUN_TRAIN = _UD_210_HOME + "UD_Turkish-BOUN/tr_boun-ud-train.conllu"
"UD_210 train set of TURKISH_BOUN."
UD_210_TURKISH_BOUN_DEV = _UD_210_HOME + "UD_Turkish-BOUN/tr_boun-ud-dev.conllu"
"UD_210 dev set of TURKISH_BOUN."
UD_210_TURKISH_BOUN_TEST = _UD_210_HOME + "UD_Turkish-BOUN/tr_boun-ud-test.conllu"
"UD_210 test set of TURKISH_BOUN."
# UD 2.10 treebank splits: Turkish (FrameNet, GB, IMST, Kenet, PUD, Penn, Tourism train/dev).
# Each constant is _UD_210_HOME plus a "<treebank dir>/<file>.conllu" path; the bare string after it is its docstring.
UD_210_TURKISH_FRAMENET_TRAIN = _UD_210_HOME + "UD_Turkish-FrameNet/tr_framenet-ud-train.conllu"
"UD_210 train set of TURKISH_FRAMENET."
UD_210_TURKISH_FRAMENET_DEV = _UD_210_HOME + "UD_Turkish-FrameNet/tr_framenet-ud-dev.conllu"
"UD_210 dev set of TURKISH_FRAMENET."
UD_210_TURKISH_FRAMENET_TEST = _UD_210_HOME + "UD_Turkish-FrameNet/tr_framenet-ud-test.conllu"
"UD_210 test set of TURKISH_FRAMENET."
UD_210_TURKISH_GB_TEST = _UD_210_HOME + "UD_Turkish-GB/tr_gb-ud-test.conllu"
"UD_210 test set of TURKISH_GB."
UD_210_TURKISH_IMST_TRAIN = _UD_210_HOME + "UD_Turkish-IMST/tr_imst-ud-train.conllu"
"UD_210 train set of TURKISH_IMST."
UD_210_TURKISH_IMST_DEV = _UD_210_HOME + "UD_Turkish-IMST/tr_imst-ud-dev.conllu"
"UD_210 dev set of TURKISH_IMST."
UD_210_TURKISH_IMST_TEST = _UD_210_HOME + "UD_Turkish-IMST/tr_imst-ud-test.conllu"
"UD_210 test set of TURKISH_IMST."
UD_210_TURKISH_KENET_TRAIN = _UD_210_HOME + "UD_Turkish-Kenet/tr_kenet-ud-train.conllu"
"UD_210 train set of TURKISH_KENET."
UD_210_TURKISH_KENET_DEV = _UD_210_HOME + "UD_Turkish-Kenet/tr_kenet-ud-dev.conllu"
"UD_210 dev set of TURKISH_KENET."
UD_210_TURKISH_KENET_TEST = _UD_210_HOME + "UD_Turkish-Kenet/tr_kenet-ud-test.conllu"
"UD_210 test set of TURKISH_KENET."
UD_210_TURKISH_PUD_TEST = _UD_210_HOME + "UD_Turkish-PUD/tr_pud-ud-test.conllu"
"UD_210 test set of TURKISH_PUD."
UD_210_TURKISH_PENN_TRAIN = _UD_210_HOME + "UD_Turkish-Penn/tr_penn-ud-train.conllu"
"UD_210 train set of TURKISH_PENN."
UD_210_TURKISH_PENN_DEV = _UD_210_HOME + "UD_Turkish-Penn/tr_penn-ud-dev.conllu"
"UD_210 dev set of TURKISH_PENN."
UD_210_TURKISH_PENN_TEST = _UD_210_HOME + "UD_Turkish-Penn/tr_penn-ud-test.conllu"
"UD_210 test set of TURKISH_PENN."
UD_210_TURKISH_TOURISM_TRAIN = _UD_210_HOME + "UD_Turkish-Tourism/tr_tourism-ud-train.conllu"
"UD_210 train set of TURKISH_TOURISM."
UD_210_TURKISH_TOURISM_DEV = _UD_210_HOME + "UD_Turkish-Tourism/tr_tourism-ud-dev.conllu"
"UD_210 dev set of TURKISH_TOURISM."
# UD 2.10 treebank splits: Turkish (Tourism test), Turkish-German, Ukrainian, Umbrian, Upper Sorbian, Urdu, Uyghur.
# Each constant is _UD_210_HOME plus a "<treebank dir>/<file>.conllu" path; the bare string after it is its docstring.
UD_210_TURKISH_TOURISM_TEST = _UD_210_HOME + "UD_Turkish-Tourism/tr_tourism-ud-test.conllu"
"UD_210 test set of TURKISH_TOURISM."
UD_210_TURKISH_GERMAN_SAGT_TRAIN = _UD_210_HOME + "UD_Turkish_German-SAGT/qtd_sagt-ud-train.conllu"
"UD_210 train set of TURKISH_GERMAN_SAGT."
UD_210_TURKISH_GERMAN_SAGT_DEV = _UD_210_HOME + "UD_Turkish_German-SAGT/qtd_sagt-ud-dev.conllu"
"UD_210 dev set of TURKISH_GERMAN_SAGT."
UD_210_TURKISH_GERMAN_SAGT_TEST = _UD_210_HOME + "UD_Turkish_German-SAGT/qtd_sagt-ud-test.conllu"
"UD_210 test set of TURKISH_GERMAN_SAGT."
UD_210_UKRAINIAN_IU_TRAIN = _UD_210_HOME + "UD_Ukrainian-IU/uk_iu-ud-train.conllu"
"UD_210 train set of UKRAINIAN_IU."
UD_210_UKRAINIAN_IU_DEV = _UD_210_HOME + "UD_Ukrainian-IU/uk_iu-ud-dev.conllu"
"UD_210 dev set of UKRAINIAN_IU."
UD_210_UKRAINIAN_IU_TEST = _UD_210_HOME + "UD_Ukrainian-IU/uk_iu-ud-test.conllu"
"UD_210 test set of UKRAINIAN_IU."
UD_210_UMBRIAN_IKUVINA_TEST = _UD_210_HOME + "UD_Umbrian-IKUVINA/xum_ikuvina-ud-test.conllu"
"UD_210 test set of UMBRIAN_IKUVINA."
UD_210_UPPER_SORBIAN_UFAL_TRAIN = _UD_210_HOME + "UD_Upper_Sorbian-UFAL/hsb_ufal-ud-train.conllu"
"UD_210 train set of UPPER_SORBIAN_UFAL."
UD_210_UPPER_SORBIAN_UFAL_TEST = _UD_210_HOME + "UD_Upper_Sorbian-UFAL/hsb_ufal-ud-test.conllu"
"UD_210 test set of UPPER_SORBIAN_UFAL."
UD_210_URDU_UDTB_TRAIN = _UD_210_HOME + "UD_Urdu-UDTB/ur_udtb-ud-train.conllu"
"UD_210 train set of URDU_UDTB."
UD_210_URDU_UDTB_DEV = _UD_210_HOME + "UD_Urdu-UDTB/ur_udtb-ud-dev.conllu"
"UD_210 dev set of URDU_UDTB."
UD_210_URDU_UDTB_TEST = _UD_210_HOME + "UD_Urdu-UDTB/ur_udtb-ud-test.conllu"
"UD_210 test set of URDU_UDTB."
UD_210_UYGHUR_UDT_TRAIN = _UD_210_HOME + "UD_Uyghur-UDT/ug_udt-ud-train.conllu"
"UD_210 train set of UYGHUR_UDT."
UD_210_UYGHUR_UDT_DEV = _UD_210_HOME + "UD_Uyghur-UDT/ug_udt-ud-dev.conllu"
"UD_210 dev set of UYGHUR_UDT."
UD_210_UYGHUR_UDT_TEST = _UD_210_HOME + "UD_Uyghur-UDT/ug_udt-ud-test.conllu"
"UD_210 test set of UYGHUR_UDT."
# UD 2.10 treebank splits: Vietnamese, Warlpiri, Welsh, Western Armenian, Wolof, Xibe, Yakut, Yoruba.
# Each constant is _UD_210_HOME plus a "<treebank dir>/<file>.conllu" path; the bare string after it is its docstring.
UD_210_VIETNAMESE_VTB_TRAIN = _UD_210_HOME + "UD_Vietnamese-VTB/vi_vtb-ud-train.conllu"
"UD_210 train set of VIETNAMESE_VTB."
UD_210_VIETNAMESE_VTB_DEV = _UD_210_HOME + "UD_Vietnamese-VTB/vi_vtb-ud-dev.conllu"
"UD_210 dev set of VIETNAMESE_VTB."
UD_210_VIETNAMESE_VTB_TEST = _UD_210_HOME + "UD_Vietnamese-VTB/vi_vtb-ud-test.conllu"
"UD_210 test set of VIETNAMESE_VTB."
UD_210_WARLPIRI_UFAL_TEST = _UD_210_HOME + "UD_Warlpiri-UFAL/wbp_ufal-ud-test.conllu"
"UD_210 test set of WARLPIRI_UFAL."
UD_210_WELSH_CCG_TRAIN = _UD_210_HOME + "UD_Welsh-CCG/cy_ccg-ud-train.conllu"
"UD_210 train set of WELSH_CCG."
UD_210_WELSH_CCG_DEV = _UD_210_HOME + "UD_Welsh-CCG/cy_ccg-ud-dev.conllu"
"UD_210 dev set of WELSH_CCG."
UD_210_WELSH_CCG_TEST = _UD_210_HOME + "UD_Welsh-CCG/cy_ccg-ud-test.conllu"
"UD_210 test set of WELSH_CCG."
UD_210_WESTERN_ARMENIAN_ARMTDP_TRAIN = _UD_210_HOME + "UD_Western_Armenian-ArmTDP/hyw_armtdp-ud-train.conllu"
"UD_210 train set of WESTERN_ARMENIAN_ARMTDP."
UD_210_WESTERN_ARMENIAN_ARMTDP_DEV = _UD_210_HOME + "UD_Western_Armenian-ArmTDP/hyw_armtdp-ud-dev.conllu"
"UD_210 dev set of WESTERN_ARMENIAN_ARMTDP."
UD_210_WESTERN_ARMENIAN_ARMTDP_TEST = _UD_210_HOME + "UD_Western_Armenian-ArmTDP/hyw_armtdp-ud-test.conllu"
"UD_210 test set of WESTERN_ARMENIAN_ARMTDP."
UD_210_WOLOF_WTB_TRAIN = _UD_210_HOME + "UD_Wolof-WTB/wo_wtb-ud-train.conllu"
"UD_210 train set of WOLOF_WTB."
UD_210_WOLOF_WTB_DEV = _UD_210_HOME + "UD_Wolof-WTB/wo_wtb-ud-dev.conllu"
"UD_210 dev set of WOLOF_WTB."
UD_210_WOLOF_WTB_TEST = _UD_210_HOME + "UD_Wolof-WTB/wo_wtb-ud-test.conllu"
"UD_210 test set of WOLOF_WTB."
UD_210_XIBE_XDT_TEST = _UD_210_HOME + "UD_Xibe-XDT/sjo_xdt-ud-test.conllu"
"UD_210 test set of XIBE_XDT."
UD_210_YAKUT_YKTDT_TEST = _UD_210_HOME + "UD_Yakut-YKTDT/sah_yktdt-ud-test.conllu"
"UD_210 test set of YAKUT_YKTDT."
UD_210_YORUBA_YTB_TEST = _UD_210_HOME + "UD_Yoruba-YTB/yo_ytb-ud-test.conllu"
"UD_210 test set of YORUBA_YTB."
UD_210_YUPIK_SLI_TEST = _UD_210_HOME + "UD_Yupik-SLI/ess_sli-ud-test.conllu" "UD_210 test set of YUPIK_SLI." ================================================ FILE: hanlp/datasets/parsing/ud/ud210m.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2020-05-21 20:39 import os from hanlp.datasets.parsing.ud import concat_treebanks from hanlp.datasets.parsing.ud.ud210 import _UD_210_HOME _UD_210_MULTILINGUAL_HOME = concat_treebanks(_UD_210_HOME, '2.10') UD_210_MULTILINGUAL_TRAIN = os.path.join(_UD_210_MULTILINGUAL_HOME, 'train.conllu') "Training set of multilingual UD_210 obtained by concatenating all training sets." UD_210_MULTILINGUAL_DEV = os.path.join(_UD_210_MULTILINGUAL_HOME, 'dev.conllu') "Dev set of multilingual UD_210 obtained by concatenating all dev sets." UD_210_MULTILINGUAL_TEST = os.path.join(_UD_210_MULTILINGUAL_HOME, 'test.conllu') "Test set of multilingual UD_210 obtained by concatenating all test sets." ================================================ FILE: hanlp/datasets/parsing/ud/ud23.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2020-05-21 20:26 _UD_23_HOME = "https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-2895/ud-treebanks-v2.3.tgz?sequence=1&isAllowed=y" _UD_24_HOME = "https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-2988/ud-treebanks-v2.4.tgz?sequence=4&isAllowed=y" def _list_dir(path, home): prefix = home.lstrip('_').replace('_HOME', '') from hanlp.utils.io_util import get_resource import glob import os path = get_resource(path) with open('ud23.py', 'a') as out: for f in sorted(glob.glob(path + '/UD_*')): basename = os.path.basename(f) name = basename[len('UD_'):] name = name.upper().replace('-', '_') for split in 'train', 'dev', 'test': sp = glob.glob(f + f'/*{split}.conllu') if not sp: continue sp = os.path.basename(sp[0]) out.write(f'{prefix}_{name}_{split.upper()} = {home} + "#{basename}/{sp}"\n') 
def main(): _list_dir(_UD_23_HOME, '_UD_23_HOME') pass if __name__ == '__main__': main() UD_23_AFRIKAANS_AFRIBOOMS_TRAIN = _UD_23_HOME + "#UD_Afrikaans-AfriBooms/af_afribooms-ud-train.conllu" UD_23_AFRIKAANS_AFRIBOOMS_DEV = _UD_23_HOME + "#UD_Afrikaans-AfriBooms/af_afribooms-ud-dev.conllu" UD_23_AFRIKAANS_AFRIBOOMS_TEST = _UD_23_HOME + "#UD_Afrikaans-AfriBooms/af_afribooms-ud-test.conllu" UD_23_AKKADIAN_PISANDUB_TEST = _UD_23_HOME + "#UD_Akkadian-PISANDUB/akk_pisandub-ud-test.conllu" UD_23_AMHARIC_ATT_TEST = _UD_23_HOME + "#UD_Amharic-ATT/am_att-ud-test.conllu" UD_23_ANCIENT_GREEK_PROIEL_TRAIN = _UD_23_HOME + "#UD_Ancient_Greek-PROIEL/grc_proiel-ud-train.conllu" UD_23_ANCIENT_GREEK_PROIEL_DEV = _UD_23_HOME + "#UD_Ancient_Greek-PROIEL/grc_proiel-ud-dev.conllu" UD_23_ANCIENT_GREEK_PROIEL_TEST = _UD_23_HOME + "#UD_Ancient_Greek-PROIEL/grc_proiel-ud-test.conllu" UD_23_ANCIENT_GREEK_PERSEUS_TRAIN = _UD_23_HOME + "#UD_Ancient_Greek-Perseus/grc_perseus-ud-train.conllu" UD_23_ANCIENT_GREEK_PERSEUS_DEV = _UD_23_HOME + "#UD_Ancient_Greek-Perseus/grc_perseus-ud-dev.conllu" UD_23_ANCIENT_GREEK_PERSEUS_TEST = _UD_23_HOME + "#UD_Ancient_Greek-Perseus/grc_perseus-ud-test.conllu" UD_23_ARABIC_NYUAD_TRAIN = _UD_23_HOME + "#UD_Arabic-NYUAD/ar_nyuad-ud-train.conllu" UD_23_ARABIC_NYUAD_DEV = _UD_23_HOME + "#UD_Arabic-NYUAD/ar_nyuad-ud-dev.conllu" UD_23_ARABIC_NYUAD_TEST = _UD_23_HOME + "#UD_Arabic-NYUAD/ar_nyuad-ud-test.conllu" UD_23_ARABIC_PADT_TRAIN = _UD_23_HOME + "#UD_Arabic-PADT/ar_padt-ud-train.conllu" UD_23_ARABIC_PADT_DEV = _UD_23_HOME + "#UD_Arabic-PADT/ar_padt-ud-dev.conllu" UD_23_ARABIC_PADT_TEST = _UD_23_HOME + "#UD_Arabic-PADT/ar_padt-ud-test.conllu" UD_23_ARABIC_PUD_TEST = _UD_23_HOME + "#UD_Arabic-PUD/ar_pud-ud-test.conllu" UD_23_ARMENIAN_ARMTDP_TRAIN = _UD_23_HOME + "#UD_Armenian-ArmTDP/hy_armtdp-ud-train.conllu" UD_23_ARMENIAN_ARMTDP_TEST = _UD_23_HOME + "#UD_Armenian-ArmTDP/hy_armtdp-ud-test.conllu" UD_23_BAMBARA_CRB_TEST = _UD_23_HOME + 
"#UD_Bambara-CRB/bm_crb-ud-test.conllu" UD_23_BASQUE_BDT_TRAIN = _UD_23_HOME + "#UD_Basque-BDT/eu_bdt-ud-train.conllu" UD_23_BASQUE_BDT_DEV = _UD_23_HOME + "#UD_Basque-BDT/eu_bdt-ud-dev.conllu" UD_23_BASQUE_BDT_TEST = _UD_23_HOME + "#UD_Basque-BDT/eu_bdt-ud-test.conllu" UD_23_BELARUSIAN_HSE_TRAIN = _UD_23_HOME + "#UD_Belarusian-HSE/be_hse-ud-train.conllu" UD_23_BELARUSIAN_HSE_DEV = _UD_23_HOME + "#UD_Belarusian-HSE/be_hse-ud-dev.conllu" UD_23_BELARUSIAN_HSE_TEST = _UD_23_HOME + "#UD_Belarusian-HSE/be_hse-ud-test.conllu" UD_23_BRETON_KEB_TEST = _UD_23_HOME + "#UD_Breton-KEB/br_keb-ud-test.conllu" UD_23_BULGARIAN_BTB_TRAIN = _UD_23_HOME + "#UD_Bulgarian-BTB/bg_btb-ud-train.conllu" UD_23_BULGARIAN_BTB_DEV = _UD_23_HOME + "#UD_Bulgarian-BTB/bg_btb-ud-dev.conllu" UD_23_BULGARIAN_BTB_TEST = _UD_23_HOME + "#UD_Bulgarian-BTB/bg_btb-ud-test.conllu" UD_23_BURYAT_BDT_TRAIN = _UD_23_HOME + "#UD_Buryat-BDT/bxr_bdt-ud-train.conllu" UD_23_BURYAT_BDT_TEST = _UD_23_HOME + "#UD_Buryat-BDT/bxr_bdt-ud-test.conllu" UD_23_CANTONESE_HK_TEST = _UD_23_HOME + "#UD_Cantonese-HK/yue_hk-ud-test.conllu" UD_23_CATALAN_ANCORA_TRAIN = _UD_23_HOME + "#UD_Catalan-AnCora/ca_ancora-ud-train.conllu" UD_23_CATALAN_ANCORA_DEV = _UD_23_HOME + "#UD_Catalan-AnCora/ca_ancora-ud-dev.conllu" UD_23_CATALAN_ANCORA_TEST = _UD_23_HOME + "#UD_Catalan-AnCora/ca_ancora-ud-test.conllu" UD_23_CHINESE_CFL_TEST = _UD_23_HOME + "#UD_Chinese-CFL/zh_cfl-ud-test.conllu" UD_23_CHINESE_GSD_TRAIN = _UD_23_HOME + "#UD_Chinese-GSD/zh_gsd-ud-train.conllu" UD_23_CHINESE_GSD_DEV = _UD_23_HOME + "#UD_Chinese-GSD/zh_gsd-ud-dev.conllu" UD_23_CHINESE_GSD_TEST = _UD_23_HOME + "#UD_Chinese-GSD/zh_gsd-ud-test.conllu" UD_23_CHINESE_HK_TEST = _UD_23_HOME + "#UD_Chinese-HK/zh_hk-ud-test.conllu" UD_23_CHINESE_PUD_TEST = _UD_23_HOME + "#UD_Chinese-PUD/zh_pud-ud-test.conllu" UD_23_COPTIC_SCRIPTORIUM_TRAIN = _UD_23_HOME + "#UD_Coptic-Scriptorium/cop_scriptorium-ud-train.conllu" UD_23_COPTIC_SCRIPTORIUM_DEV = _UD_23_HOME + 
"#UD_Coptic-Scriptorium/cop_scriptorium-ud-dev.conllu" UD_23_COPTIC_SCRIPTORIUM_TEST = _UD_23_HOME + "#UD_Coptic-Scriptorium/cop_scriptorium-ud-test.conllu" UD_23_CROATIAN_SET_TRAIN = _UD_23_HOME + "#UD_Croatian-SET/hr_set-ud-train.conllu" UD_23_CROATIAN_SET_DEV = _UD_23_HOME + "#UD_Croatian-SET/hr_set-ud-dev.conllu" UD_23_CROATIAN_SET_TEST = _UD_23_HOME + "#UD_Croatian-SET/hr_set-ud-test.conllu" UD_23_CZECH_CAC_TRAIN = _UD_23_HOME + "#UD_Czech-CAC/cs_cac-ud-train.conllu" UD_23_CZECH_CAC_DEV = _UD_23_HOME + "#UD_Czech-CAC/cs_cac-ud-dev.conllu" UD_23_CZECH_CAC_TEST = _UD_23_HOME + "#UD_Czech-CAC/cs_cac-ud-test.conllu" UD_23_CZECH_CLTT_TRAIN = _UD_23_HOME + "#UD_Czech-CLTT/cs_cltt-ud-train.conllu" UD_23_CZECH_CLTT_DEV = _UD_23_HOME + "#UD_Czech-CLTT/cs_cltt-ud-dev.conllu" UD_23_CZECH_CLTT_TEST = _UD_23_HOME + "#UD_Czech-CLTT/cs_cltt-ud-test.conllu" UD_23_CZECH_FICTREE_TRAIN = _UD_23_HOME + "#UD_Czech-FicTree/cs_fictree-ud-train.conllu" UD_23_CZECH_FICTREE_DEV = _UD_23_HOME + "#UD_Czech-FicTree/cs_fictree-ud-dev.conllu" UD_23_CZECH_FICTREE_TEST = _UD_23_HOME + "#UD_Czech-FicTree/cs_fictree-ud-test.conllu" UD_23_CZECH_PDT_TRAIN = _UD_23_HOME + "#UD_Czech-PDT/cs_pdt-ud-train.conllu" UD_23_CZECH_PDT_DEV = _UD_23_HOME + "#UD_Czech-PDT/cs_pdt-ud-dev.conllu" UD_23_CZECH_PDT_TEST = _UD_23_HOME + "#UD_Czech-PDT/cs_pdt-ud-test.conllu" UD_23_CZECH_PUD_TEST = _UD_23_HOME + "#UD_Czech-PUD/cs_pud-ud-test.conllu" UD_23_DANISH_DDT_TRAIN = _UD_23_HOME + "#UD_Danish-DDT/da_ddt-ud-train.conllu" UD_23_DANISH_DDT_DEV = _UD_23_HOME + "#UD_Danish-DDT/da_ddt-ud-dev.conllu" UD_23_DANISH_DDT_TEST = _UD_23_HOME + "#UD_Danish-DDT/da_ddt-ud-test.conllu" UD_23_DUTCH_ALPINO_TRAIN = _UD_23_HOME + "#UD_Dutch-Alpino/nl_alpino-ud-train.conllu" UD_23_DUTCH_ALPINO_DEV = _UD_23_HOME + "#UD_Dutch-Alpino/nl_alpino-ud-dev.conllu" UD_23_DUTCH_ALPINO_TEST = _UD_23_HOME + "#UD_Dutch-Alpino/nl_alpino-ud-test.conllu" UD_23_DUTCH_LASSYSMALL_TRAIN = _UD_23_HOME + 
"#UD_Dutch-LassySmall/nl_lassysmall-ud-train.conllu" UD_23_DUTCH_LASSYSMALL_DEV = _UD_23_HOME + "#UD_Dutch-LassySmall/nl_lassysmall-ud-dev.conllu" UD_23_DUTCH_LASSYSMALL_TEST = _UD_23_HOME + "#UD_Dutch-LassySmall/nl_lassysmall-ud-test.conllu" UD_23_ENGLISH_ESL_TRAIN = _UD_23_HOME + "#UD_English-ESL/en_esl-ud-train.conllu" UD_23_ENGLISH_ESL_DEV = _UD_23_HOME + "#UD_English-ESL/en_esl-ud-dev.conllu" UD_23_ENGLISH_ESL_TEST = _UD_23_HOME + "#UD_English-ESL/en_esl-ud-test.conllu" UD_23_ENGLISH_EWT_TRAIN = _UD_23_HOME + "#UD_English-EWT/en_ewt-ud-train.conllu" UD_23_ENGLISH_EWT_DEV = _UD_23_HOME + "#UD_English-EWT/en_ewt-ud-dev.conllu" UD_23_ENGLISH_EWT_TEST = _UD_23_HOME + "#UD_English-EWT/en_ewt-ud-test.conllu" UD_23_ENGLISH_GUM_TRAIN = _UD_23_HOME + "#UD_English-GUM/en_gum-ud-train.conllu" UD_23_ENGLISH_GUM_DEV = _UD_23_HOME + "#UD_English-GUM/en_gum-ud-dev.conllu" UD_23_ENGLISH_GUM_TEST = _UD_23_HOME + "#UD_English-GUM/en_gum-ud-test.conllu" UD_23_ENGLISH_LINES_TRAIN = _UD_23_HOME + "#UD_English-LinES/en_lines-ud-train.conllu" UD_23_ENGLISH_LINES_DEV = _UD_23_HOME + "#UD_English-LinES/en_lines-ud-dev.conllu" UD_23_ENGLISH_LINES_TEST = _UD_23_HOME + "#UD_English-LinES/en_lines-ud-test.conllu" UD_23_ENGLISH_PUD_TEST = _UD_23_HOME + "#UD_English-PUD/en_pud-ud-test.conllu" UD_23_ENGLISH_PARTUT_TRAIN = _UD_23_HOME + "#UD_English-ParTUT/en_partut-ud-train.conllu" UD_23_ENGLISH_PARTUT_DEV = _UD_23_HOME + "#UD_English-ParTUT/en_partut-ud-dev.conllu" UD_23_ENGLISH_PARTUT_TEST = _UD_23_HOME + "#UD_English-ParTUT/en_partut-ud-test.conllu" UD_23_ERZYA_JR_TEST = _UD_23_HOME + "#UD_Erzya-JR/myv_jr-ud-test.conllu" UD_23_ESTONIAN_EDT_TRAIN = _UD_23_HOME + "#UD_Estonian-EDT/et_edt-ud-train.conllu" UD_23_ESTONIAN_EDT_DEV = _UD_23_HOME + "#UD_Estonian-EDT/et_edt-ud-dev.conllu" UD_23_ESTONIAN_EDT_TEST = _UD_23_HOME + "#UD_Estonian-EDT/et_edt-ud-test.conllu" UD_23_FAROESE_OFT_TEST = _UD_23_HOME + "#UD_Faroese-OFT/fo_oft-ud-test.conllu" UD_23_FINNISH_FTB_TRAIN = _UD_23_HOME + 
"#UD_Finnish-FTB/fi_ftb-ud-train.conllu" UD_23_FINNISH_FTB_DEV = _UD_23_HOME + "#UD_Finnish-FTB/fi_ftb-ud-dev.conllu" UD_23_FINNISH_FTB_TEST = _UD_23_HOME + "#UD_Finnish-FTB/fi_ftb-ud-test.conllu" UD_23_FINNISH_PUD_TEST = _UD_23_HOME + "#UD_Finnish-PUD/fi_pud-ud-test.conllu" UD_23_FINNISH_TDT_TRAIN = _UD_23_HOME + "#UD_Finnish-TDT/fi_tdt-ud-train.conllu" UD_23_FINNISH_TDT_DEV = _UD_23_HOME + "#UD_Finnish-TDT/fi_tdt-ud-dev.conllu" UD_23_FINNISH_TDT_TEST = _UD_23_HOME + "#UD_Finnish-TDT/fi_tdt-ud-test.conllu" UD_23_FRENCH_FTB_TRAIN = _UD_23_HOME + "#UD_French-FTB/fr_ftb-ud-train.conllu" UD_23_FRENCH_FTB_DEV = _UD_23_HOME + "#UD_French-FTB/fr_ftb-ud-dev.conllu" UD_23_FRENCH_FTB_TEST = _UD_23_HOME + "#UD_French-FTB/fr_ftb-ud-test.conllu" UD_23_FRENCH_GSD_TRAIN = _UD_23_HOME + "#UD_French-GSD/fr_gsd-ud-train.conllu" UD_23_FRENCH_GSD_DEV = _UD_23_HOME + "#UD_French-GSD/fr_gsd-ud-dev.conllu" UD_23_FRENCH_GSD_TEST = _UD_23_HOME + "#UD_French-GSD/fr_gsd-ud-test.conllu" UD_23_FRENCH_PUD_TEST = _UD_23_HOME + "#UD_French-PUD/fr_pud-ud-test.conllu" UD_23_FRENCH_PARTUT_TRAIN = _UD_23_HOME + "#UD_French-ParTUT/fr_partut-ud-train.conllu" UD_23_FRENCH_PARTUT_DEV = _UD_23_HOME + "#UD_French-ParTUT/fr_partut-ud-dev.conllu" UD_23_FRENCH_PARTUT_TEST = _UD_23_HOME + "#UD_French-ParTUT/fr_partut-ud-test.conllu" UD_23_FRENCH_SEQUOIA_TRAIN = _UD_23_HOME + "#UD_French-Sequoia/fr_sequoia-ud-train.conllu" UD_23_FRENCH_SEQUOIA_DEV = _UD_23_HOME + "#UD_French-Sequoia/fr_sequoia-ud-dev.conllu" UD_23_FRENCH_SEQUOIA_TEST = _UD_23_HOME + "#UD_French-Sequoia/fr_sequoia-ud-test.conllu" UD_23_FRENCH_SPOKEN_TRAIN = _UD_23_HOME + "#UD_French-Spoken/fr_spoken-ud-train.conllu" UD_23_FRENCH_SPOKEN_DEV = _UD_23_HOME + "#UD_French-Spoken/fr_spoken-ud-dev.conllu" UD_23_FRENCH_SPOKEN_TEST = _UD_23_HOME + "#UD_French-Spoken/fr_spoken-ud-test.conllu" UD_23_GALICIAN_CTG_TRAIN = _UD_23_HOME + "#UD_Galician-CTG/gl_ctg-ud-train.conllu" UD_23_GALICIAN_CTG_DEV = _UD_23_HOME + "#UD_Galician-CTG/gl_ctg-ud-dev.conllu" 
UD_23_GALICIAN_CTG_TEST = _UD_23_HOME + "#UD_Galician-CTG/gl_ctg-ud-test.conllu" UD_23_GALICIAN_TREEGAL_TRAIN = _UD_23_HOME + "#UD_Galician-TreeGal/gl_treegal-ud-train.conllu" UD_23_GALICIAN_TREEGAL_TEST = _UD_23_HOME + "#UD_Galician-TreeGal/gl_treegal-ud-test.conllu" UD_23_GERMAN_GSD_TRAIN = _UD_23_HOME + "#UD_German-GSD/de_gsd-ud-train.conllu" UD_23_GERMAN_GSD_DEV = _UD_23_HOME + "#UD_German-GSD/de_gsd-ud-dev.conllu" UD_23_GERMAN_GSD_TEST = _UD_23_HOME + "#UD_German-GSD/de_gsd-ud-test.conllu" UD_23_GERMAN_PUD_TEST = _UD_23_HOME + "#UD_German-PUD/de_pud-ud-test.conllu" UD_23_GOTHIC_PROIEL_TRAIN = _UD_23_HOME + "#UD_Gothic-PROIEL/got_proiel-ud-train.conllu" UD_23_GOTHIC_PROIEL_DEV = _UD_23_HOME + "#UD_Gothic-PROIEL/got_proiel-ud-dev.conllu" UD_23_GOTHIC_PROIEL_TEST = _UD_23_HOME + "#UD_Gothic-PROIEL/got_proiel-ud-test.conllu" UD_23_GREEK_GDT_TRAIN = _UD_23_HOME + "#UD_Greek-GDT/el_gdt-ud-train.conllu" UD_23_GREEK_GDT_DEV = _UD_23_HOME + "#UD_Greek-GDT/el_gdt-ud-dev.conllu" UD_23_GREEK_GDT_TEST = _UD_23_HOME + "#UD_Greek-GDT/el_gdt-ud-test.conllu" UD_23_HEBREW_HTB_TRAIN = _UD_23_HOME + "#UD_Hebrew-HTB/he_htb-ud-train.conllu" UD_23_HEBREW_HTB_DEV = _UD_23_HOME + "#UD_Hebrew-HTB/he_htb-ud-dev.conllu" UD_23_HEBREW_HTB_TEST = _UD_23_HOME + "#UD_Hebrew-HTB/he_htb-ud-test.conllu" UD_23_HINDI_HDTB_TRAIN = _UD_23_HOME + "#UD_Hindi-HDTB/hi_hdtb-ud-train.conllu" UD_23_HINDI_HDTB_DEV = _UD_23_HOME + "#UD_Hindi-HDTB/hi_hdtb-ud-dev.conllu" UD_23_HINDI_HDTB_TEST = _UD_23_HOME + "#UD_Hindi-HDTB/hi_hdtb-ud-test.conllu" UD_23_HINDI_PUD_TEST = _UD_23_HOME + "#UD_Hindi-PUD/hi_pud-ud-test.conllu" UD_23_HINDI_ENGLISH_HIENCS_TRAIN = _UD_23_HOME + "#UD_Hindi_English-HIENCS/qhe_hiencs-ud-train.conllu" UD_23_HINDI_ENGLISH_HIENCS_DEV = _UD_23_HOME + "#UD_Hindi_English-HIENCS/qhe_hiencs-ud-dev.conllu" UD_23_HINDI_ENGLISH_HIENCS_TEST = _UD_23_HOME + "#UD_Hindi_English-HIENCS/qhe_hiencs-ud-test.conllu" UD_23_HUNGARIAN_SZEGED_TRAIN = _UD_23_HOME + "#UD_Hungarian-Szeged/hu_szeged-ud-train.conllu" 
UD_23_HUNGARIAN_SZEGED_DEV = _UD_23_HOME + "#UD_Hungarian-Szeged/hu_szeged-ud-dev.conllu" UD_23_HUNGARIAN_SZEGED_TEST = _UD_23_HOME + "#UD_Hungarian-Szeged/hu_szeged-ud-test.conllu" UD_23_INDONESIAN_GSD_TRAIN = _UD_23_HOME + "#UD_Indonesian-GSD/id_gsd-ud-train.conllu" UD_23_INDONESIAN_GSD_DEV = _UD_23_HOME + "#UD_Indonesian-GSD/id_gsd-ud-dev.conllu" UD_23_INDONESIAN_GSD_TEST = _UD_23_HOME + "#UD_Indonesian-GSD/id_gsd-ud-test.conllu" UD_23_INDONESIAN_PUD_TEST = _UD_23_HOME + "#UD_Indonesian-PUD/id_pud-ud-test.conllu" UD_23_IRISH_IDT_TRAIN = _UD_23_HOME + "#UD_Irish-IDT/ga_idt-ud-train.conllu" UD_23_IRISH_IDT_TEST = _UD_23_HOME + "#UD_Irish-IDT/ga_idt-ud-test.conllu" UD_23_ITALIAN_ISDT_TRAIN = _UD_23_HOME + "#UD_Italian-ISDT/it_isdt-ud-train.conllu" UD_23_ITALIAN_ISDT_DEV = _UD_23_HOME + "#UD_Italian-ISDT/it_isdt-ud-dev.conllu" UD_23_ITALIAN_ISDT_TEST = _UD_23_HOME + "#UD_Italian-ISDT/it_isdt-ud-test.conllu" UD_23_ITALIAN_PUD_TEST = _UD_23_HOME + "#UD_Italian-PUD/it_pud-ud-test.conllu" UD_23_ITALIAN_PARTUT_TRAIN = _UD_23_HOME + "#UD_Italian-ParTUT/it_partut-ud-train.conllu" UD_23_ITALIAN_PARTUT_DEV = _UD_23_HOME + "#UD_Italian-ParTUT/it_partut-ud-dev.conllu" UD_23_ITALIAN_PARTUT_TEST = _UD_23_HOME + "#UD_Italian-ParTUT/it_partut-ud-test.conllu" UD_23_ITALIAN_POSTWITA_TRAIN = _UD_23_HOME + "#UD_Italian-PoSTWITA/it_postwita-ud-train.conllu" UD_23_ITALIAN_POSTWITA_DEV = _UD_23_HOME + "#UD_Italian-PoSTWITA/it_postwita-ud-dev.conllu" UD_23_ITALIAN_POSTWITA_TEST = _UD_23_HOME + "#UD_Italian-PoSTWITA/it_postwita-ud-test.conllu" UD_23_JAPANESE_BCCWJ_TRAIN = _UD_23_HOME + "#UD_Japanese-BCCWJ/ja_bccwj-ud-train.conllu" UD_23_JAPANESE_BCCWJ_DEV = _UD_23_HOME + "#UD_Japanese-BCCWJ/ja_bccwj-ud-dev.conllu" UD_23_JAPANESE_BCCWJ_TEST = _UD_23_HOME + "#UD_Japanese-BCCWJ/ja_bccwj-ud-test.conllu" UD_23_JAPANESE_GSD_TRAIN = _UD_23_HOME + "#UD_Japanese-GSD/ja_gsd-ud-train.conllu" UD_23_JAPANESE_GSD_DEV = _UD_23_HOME + "#UD_Japanese-GSD/ja_gsd-ud-dev.conllu" UD_23_JAPANESE_GSD_TEST = 
_UD_23_HOME + "#UD_Japanese-GSD/ja_gsd-ud-test.conllu" UD_23_JAPANESE_MODERN_TEST = _UD_23_HOME + "#UD_Japanese-Modern/ja_modern-ud-test.conllu" UD_23_JAPANESE_PUD_TEST = _UD_23_HOME + "#UD_Japanese-PUD/ja_pud-ud-test.conllu" UD_23_KAZAKH_KTB_TRAIN = _UD_23_HOME + "#UD_Kazakh-KTB/kk_ktb-ud-train.conllu" UD_23_KAZAKH_KTB_TEST = _UD_23_HOME + "#UD_Kazakh-KTB/kk_ktb-ud-test.conllu" UD_23_KOMI_ZYRIAN_IKDP_TEST = _UD_23_HOME + "#UD_Komi_Zyrian-IKDP/kpv_ikdp-ud-test.conllu" UD_23_KOMI_ZYRIAN_LATTICE_TEST = _UD_23_HOME + "#UD_Komi_Zyrian-Lattice/kpv_lattice-ud-test.conllu" UD_23_KOREAN_GSD_TRAIN = _UD_23_HOME + "#UD_Korean-GSD/ko_gsd-ud-train.conllu" UD_23_KOREAN_GSD_DEV = _UD_23_HOME + "#UD_Korean-GSD/ko_gsd-ud-dev.conllu" UD_23_KOREAN_GSD_TEST = _UD_23_HOME + "#UD_Korean-GSD/ko_gsd-ud-test.conllu" UD_23_KOREAN_KAIST_TRAIN = _UD_23_HOME + "#UD_Korean-Kaist/ko_kaist-ud-train.conllu" UD_23_KOREAN_KAIST_DEV = _UD_23_HOME + "#UD_Korean-Kaist/ko_kaist-ud-dev.conllu" UD_23_KOREAN_KAIST_TEST = _UD_23_HOME + "#UD_Korean-Kaist/ko_kaist-ud-test.conllu" UD_23_KOREAN_PUD_TEST = _UD_23_HOME + "#UD_Korean-PUD/ko_pud-ud-test.conllu" UD_23_KURMANJI_MG_TRAIN = _UD_23_HOME + "#UD_Kurmanji-MG/kmr_mg-ud-train.conllu" UD_23_KURMANJI_MG_TEST = _UD_23_HOME + "#UD_Kurmanji-MG/kmr_mg-ud-test.conllu" UD_23_LATIN_ITTB_TRAIN = _UD_23_HOME + "#UD_Latin-ITTB/la_ittb-ud-train.conllu" UD_23_LATIN_ITTB_DEV = _UD_23_HOME + "#UD_Latin-ITTB/la_ittb-ud-dev.conllu" UD_23_LATIN_ITTB_TEST = _UD_23_HOME + "#UD_Latin-ITTB/la_ittb-ud-test.conllu" UD_23_LATIN_PROIEL_TRAIN = _UD_23_HOME + "#UD_Latin-PROIEL/la_proiel-ud-train.conllu" UD_23_LATIN_PROIEL_DEV = _UD_23_HOME + "#UD_Latin-PROIEL/la_proiel-ud-dev.conllu" UD_23_LATIN_PROIEL_TEST = _UD_23_HOME + "#UD_Latin-PROIEL/la_proiel-ud-test.conllu" UD_23_LATIN_PERSEUS_TRAIN = _UD_23_HOME + "#UD_Latin-Perseus/la_perseus-ud-train.conllu" UD_23_LATIN_PERSEUS_TEST = _UD_23_HOME + "#UD_Latin-Perseus/la_perseus-ud-test.conllu" UD_23_LATVIAN_LVTB_TRAIN = _UD_23_HOME + 
"#UD_Latvian-LVTB/lv_lvtb-ud-train.conllu" UD_23_LATVIAN_LVTB_DEV = _UD_23_HOME + "#UD_Latvian-LVTB/lv_lvtb-ud-dev.conllu" UD_23_LATVIAN_LVTB_TEST = _UD_23_HOME + "#UD_Latvian-LVTB/lv_lvtb-ud-test.conllu" UD_23_LITHUANIAN_HSE_TRAIN = _UD_23_HOME + "#UD_Lithuanian-HSE/lt_hse-ud-train.conllu" UD_23_LITHUANIAN_HSE_DEV = _UD_23_HOME + "#UD_Lithuanian-HSE/lt_hse-ud-dev.conllu" UD_23_LITHUANIAN_HSE_TEST = _UD_23_HOME + "#UD_Lithuanian-HSE/lt_hse-ud-test.conllu" UD_23_MALTESE_MUDT_TRAIN = _UD_23_HOME + "#UD_Maltese-MUDT/mt_mudt-ud-train.conllu" UD_23_MALTESE_MUDT_DEV = _UD_23_HOME + "#UD_Maltese-MUDT/mt_mudt-ud-dev.conllu" UD_23_MALTESE_MUDT_TEST = _UD_23_HOME + "#UD_Maltese-MUDT/mt_mudt-ud-test.conllu" UD_23_MARATHI_UFAL_TRAIN = _UD_23_HOME + "#UD_Marathi-UFAL/mr_ufal-ud-train.conllu" UD_23_MARATHI_UFAL_DEV = _UD_23_HOME + "#UD_Marathi-UFAL/mr_ufal-ud-dev.conllu" UD_23_MARATHI_UFAL_TEST = _UD_23_HOME + "#UD_Marathi-UFAL/mr_ufal-ud-test.conllu" UD_23_NAIJA_NSC_TEST = _UD_23_HOME + "#UD_Naija-NSC/pcm_nsc-ud-test.conllu" UD_23_NORTH_SAMI_GIELLA_TRAIN = _UD_23_HOME + "#UD_North_Sami-Giella/sme_giella-ud-train.conllu" UD_23_NORTH_SAMI_GIELLA_TEST = _UD_23_HOME + "#UD_North_Sami-Giella/sme_giella-ud-test.conllu" UD_23_NORWEGIAN_BOKMAAL_TRAIN = _UD_23_HOME + "#UD_Norwegian-Bokmaal/no_bokmaal-ud-train.conllu" UD_23_NORWEGIAN_BOKMAAL_DEV = _UD_23_HOME + "#UD_Norwegian-Bokmaal/no_bokmaal-ud-dev.conllu" UD_23_NORWEGIAN_BOKMAAL_TEST = _UD_23_HOME + "#UD_Norwegian-Bokmaal/no_bokmaal-ud-test.conllu" UD_23_NORWEGIAN_NYNORSK_TRAIN = _UD_23_HOME + "#UD_Norwegian-Nynorsk/no_nynorsk-ud-train.conllu" UD_23_NORWEGIAN_NYNORSK_DEV = _UD_23_HOME + "#UD_Norwegian-Nynorsk/no_nynorsk-ud-dev.conllu" UD_23_NORWEGIAN_NYNORSK_TEST = _UD_23_HOME + "#UD_Norwegian-Nynorsk/no_nynorsk-ud-test.conllu" UD_23_NORWEGIAN_NYNORSKLIA_TRAIN = _UD_23_HOME + "#UD_Norwegian-NynorskLIA/no_nynorsklia-ud-train.conllu" UD_23_NORWEGIAN_NYNORSKLIA_TEST = _UD_23_HOME + "#UD_Norwegian-NynorskLIA/no_nynorsklia-ud-test.conllu" 
UD_23_OLD_CHURCH_SLAVONIC_PROIEL_TRAIN = _UD_23_HOME + "#UD_Old_Church_Slavonic-PROIEL/cu_proiel-ud-train.conllu" UD_23_OLD_CHURCH_SLAVONIC_PROIEL_DEV = _UD_23_HOME + "#UD_Old_Church_Slavonic-PROIEL/cu_proiel-ud-dev.conllu" UD_23_OLD_CHURCH_SLAVONIC_PROIEL_TEST = _UD_23_HOME + "#UD_Old_Church_Slavonic-PROIEL/cu_proiel-ud-test.conllu" UD_23_OLD_FRENCH_SRCMF_TRAIN = _UD_23_HOME + "#UD_Old_French-SRCMF/fro_srcmf-ud-train.conllu" UD_23_OLD_FRENCH_SRCMF_DEV = _UD_23_HOME + "#UD_Old_French-SRCMF/fro_srcmf-ud-dev.conllu" UD_23_OLD_FRENCH_SRCMF_TEST = _UD_23_HOME + "#UD_Old_French-SRCMF/fro_srcmf-ud-test.conllu" UD_23_PERSIAN_SERAJI_TRAIN = _UD_23_HOME + "#UD_Persian-Seraji/fa_seraji-ud-train.conllu" UD_23_PERSIAN_SERAJI_DEV = _UD_23_HOME + "#UD_Persian-Seraji/fa_seraji-ud-dev.conllu" UD_23_PERSIAN_SERAJI_TEST = _UD_23_HOME + "#UD_Persian-Seraji/fa_seraji-ud-test.conllu" UD_23_POLISH_LFG_TRAIN = _UD_23_HOME + "#UD_Polish-LFG/pl_lfg-ud-train.conllu" UD_23_POLISH_LFG_DEV = _UD_23_HOME + "#UD_Polish-LFG/pl_lfg-ud-dev.conllu" UD_23_POLISH_LFG_TEST = _UD_23_HOME + "#UD_Polish-LFG/pl_lfg-ud-test.conllu" UD_23_POLISH_SZ_TRAIN = _UD_23_HOME + "#UD_Polish-SZ/pl_sz-ud-train.conllu" UD_23_POLISH_SZ_DEV = _UD_23_HOME + "#UD_Polish-SZ/pl_sz-ud-dev.conllu" UD_23_POLISH_SZ_TEST = _UD_23_HOME + "#UD_Polish-SZ/pl_sz-ud-test.conllu" UD_23_PORTUGUESE_BOSQUE_TRAIN = _UD_23_HOME + "#UD_Portuguese-Bosque/pt_bosque-ud-train.conllu" UD_23_PORTUGUESE_BOSQUE_DEV = _UD_23_HOME + "#UD_Portuguese-Bosque/pt_bosque-ud-dev.conllu" UD_23_PORTUGUESE_BOSQUE_TEST = _UD_23_HOME + "#UD_Portuguese-Bosque/pt_bosque-ud-test.conllu" UD_23_PORTUGUESE_GSD_TRAIN = _UD_23_HOME + "#UD_Portuguese-GSD/pt_gsd-ud-train.conllu" UD_23_PORTUGUESE_GSD_DEV = _UD_23_HOME + "#UD_Portuguese-GSD/pt_gsd-ud-dev.conllu" UD_23_PORTUGUESE_GSD_TEST = _UD_23_HOME + "#UD_Portuguese-GSD/pt_gsd-ud-test.conllu" UD_23_PORTUGUESE_PUD_TEST = _UD_23_HOME + "#UD_Portuguese-PUD/pt_pud-ud-test.conllu" UD_23_ROMANIAN_NONSTANDARD_TRAIN = _UD_23_HOME + 
"#UD_Romanian-Nonstandard/ro_nonstandard-ud-train.conllu" UD_23_ROMANIAN_NONSTANDARD_DEV = _UD_23_HOME + "#UD_Romanian-Nonstandard/ro_nonstandard-ud-dev.conllu" UD_23_ROMANIAN_NONSTANDARD_TEST = _UD_23_HOME + "#UD_Romanian-Nonstandard/ro_nonstandard-ud-test.conllu" UD_23_ROMANIAN_RRT_TRAIN = _UD_23_HOME + "#UD_Romanian-RRT/ro_rrt-ud-train.conllu" UD_23_ROMANIAN_RRT_DEV = _UD_23_HOME + "#UD_Romanian-RRT/ro_rrt-ud-dev.conllu" UD_23_ROMANIAN_RRT_TEST = _UD_23_HOME + "#UD_Romanian-RRT/ro_rrt-ud-test.conllu" UD_23_RUSSIAN_GSD_TRAIN = _UD_23_HOME + "#UD_Russian-GSD/ru_gsd-ud-train.conllu" UD_23_RUSSIAN_GSD_DEV = _UD_23_HOME + "#UD_Russian-GSD/ru_gsd-ud-dev.conllu" UD_23_RUSSIAN_GSD_TEST = _UD_23_HOME + "#UD_Russian-GSD/ru_gsd-ud-test.conllu" UD_23_RUSSIAN_PUD_TEST = _UD_23_HOME + "#UD_Russian-PUD/ru_pud-ud-test.conllu" UD_23_RUSSIAN_SYNTAGRUS_TRAIN = _UD_23_HOME + "#UD_Russian-SynTagRus/ru_syntagrus-ud-train.conllu" UD_23_RUSSIAN_SYNTAGRUS_DEV = _UD_23_HOME + "#UD_Russian-SynTagRus/ru_syntagrus-ud-dev.conllu" UD_23_RUSSIAN_SYNTAGRUS_TEST = _UD_23_HOME + "#UD_Russian-SynTagRus/ru_syntagrus-ud-test.conllu" UD_23_RUSSIAN_TAIGA_TRAIN = _UD_23_HOME + "#UD_Russian-Taiga/ru_taiga-ud-train.conllu" UD_23_RUSSIAN_TAIGA_TEST = _UD_23_HOME + "#UD_Russian-Taiga/ru_taiga-ud-test.conllu" UD_23_SANSKRIT_UFAL_TEST = _UD_23_HOME + "#UD_Sanskrit-UFAL/sa_ufal-ud-test.conllu" UD_23_SERBIAN_SET_TRAIN = _UD_23_HOME + "#UD_Serbian-SET/sr_set-ud-train.conllu" UD_23_SERBIAN_SET_DEV = _UD_23_HOME + "#UD_Serbian-SET/sr_set-ud-dev.conllu" UD_23_SERBIAN_SET_TEST = _UD_23_HOME + "#UD_Serbian-SET/sr_set-ud-test.conllu" UD_23_SLOVAK_SNK_TRAIN = _UD_23_HOME + "#UD_Slovak-SNK/sk_snk-ud-train.conllu" UD_23_SLOVAK_SNK_DEV = _UD_23_HOME + "#UD_Slovak-SNK/sk_snk-ud-dev.conllu" UD_23_SLOVAK_SNK_TEST = _UD_23_HOME + "#UD_Slovak-SNK/sk_snk-ud-test.conllu" UD_23_SLOVENIAN_SSJ_TRAIN = _UD_23_HOME + "#UD_Slovenian-SSJ/sl_ssj-ud-train.conllu" UD_23_SLOVENIAN_SSJ_DEV = _UD_23_HOME + 
"#UD_Slovenian-SSJ/sl_ssj-ud-dev.conllu" UD_23_SLOVENIAN_SSJ_TEST = _UD_23_HOME + "#UD_Slovenian-SSJ/sl_ssj-ud-test.conllu" UD_23_SLOVENIAN_SST_TRAIN = _UD_23_HOME + "#UD_Slovenian-SST/sl_sst-ud-train.conllu" UD_23_SLOVENIAN_SST_TEST = _UD_23_HOME + "#UD_Slovenian-SST/sl_sst-ud-test.conllu" UD_23_SPANISH_ANCORA_TRAIN = _UD_23_HOME + "#UD_Spanish-AnCora/es_ancora-ud-train.conllu" UD_23_SPANISH_ANCORA_DEV = _UD_23_HOME + "#UD_Spanish-AnCora/es_ancora-ud-dev.conllu" UD_23_SPANISH_ANCORA_TEST = _UD_23_HOME + "#UD_Spanish-AnCora/es_ancora-ud-test.conllu" UD_23_SPANISH_GSD_TRAIN = _UD_23_HOME + "#UD_Spanish-GSD/es_gsd-ud-train.conllu" UD_23_SPANISH_GSD_DEV = _UD_23_HOME + "#UD_Spanish-GSD/es_gsd-ud-dev.conllu" UD_23_SPANISH_GSD_TEST = _UD_23_HOME + "#UD_Spanish-GSD/es_gsd-ud-test.conllu" UD_23_SPANISH_PUD_TEST = _UD_23_HOME + "#UD_Spanish-PUD/es_pud-ud-test.conllu" UD_23_SWEDISH_LINES_TRAIN = _UD_23_HOME + "#UD_Swedish-LinES/sv_lines-ud-train.conllu" UD_23_SWEDISH_LINES_DEV = _UD_23_HOME + "#UD_Swedish-LinES/sv_lines-ud-dev.conllu" UD_23_SWEDISH_LINES_TEST = _UD_23_HOME + "#UD_Swedish-LinES/sv_lines-ud-test.conllu" UD_23_SWEDISH_PUD_TEST = _UD_23_HOME + "#UD_Swedish-PUD/sv_pud-ud-test.conllu" UD_23_SWEDISH_TALBANKEN_TRAIN = _UD_23_HOME + "#UD_Swedish-Talbanken/sv_talbanken-ud-train.conllu" UD_23_SWEDISH_TALBANKEN_DEV = _UD_23_HOME + "#UD_Swedish-Talbanken/sv_talbanken-ud-dev.conllu" UD_23_SWEDISH_TALBANKEN_TEST = _UD_23_HOME + "#UD_Swedish-Talbanken/sv_talbanken-ud-test.conllu" UD_23_SWEDISH_SIGN_LANGUAGE_SSLC_TRAIN = _UD_23_HOME + "#UD_Swedish_Sign_Language-SSLC/swl_sslc-ud-train.conllu" UD_23_SWEDISH_SIGN_LANGUAGE_SSLC_DEV = _UD_23_HOME + "#UD_Swedish_Sign_Language-SSLC/swl_sslc-ud-dev.conllu" UD_23_SWEDISH_SIGN_LANGUAGE_SSLC_TEST = _UD_23_HOME + "#UD_Swedish_Sign_Language-SSLC/swl_sslc-ud-test.conllu" UD_23_TAGALOG_TRG_TEST = _UD_23_HOME + "#UD_Tagalog-TRG/tl_trg-ud-test.conllu" UD_23_TAMIL_TTB_TRAIN = _UD_23_HOME + "#UD_Tamil-TTB/ta_ttb-ud-train.conllu" 
UD_23_TAMIL_TTB_DEV = _UD_23_HOME + "#UD_Tamil-TTB/ta_ttb-ud-dev.conllu"
UD_23_TAMIL_TTB_TEST = _UD_23_HOME + "#UD_Tamil-TTB/ta_ttb-ud-test.conllu"
UD_23_TELUGU_MTG_TRAIN = _UD_23_HOME + "#UD_Telugu-MTG/te_mtg-ud-train.conllu"
UD_23_TELUGU_MTG_DEV = _UD_23_HOME + "#UD_Telugu-MTG/te_mtg-ud-dev.conllu"
UD_23_TELUGU_MTG_TEST = _UD_23_HOME + "#UD_Telugu-MTG/te_mtg-ud-test.conllu"
UD_23_THAI_PUD_TEST = _UD_23_HOME + "#UD_Thai-PUD/th_pud-ud-test.conllu"
UD_23_TURKISH_IMST_TRAIN = _UD_23_HOME + "#UD_Turkish-IMST/tr_imst-ud-train.conllu"
UD_23_TURKISH_IMST_DEV = _UD_23_HOME + "#UD_Turkish-IMST/tr_imst-ud-dev.conllu"
UD_23_TURKISH_IMST_TEST = _UD_23_HOME + "#UD_Turkish-IMST/tr_imst-ud-test.conllu"
UD_23_TURKISH_PUD_TEST = _UD_23_HOME + "#UD_Turkish-PUD/tr_pud-ud-test.conllu"
UD_23_UKRAINIAN_IU_TRAIN = _UD_23_HOME + "#UD_Ukrainian-IU/uk_iu-ud-train.conllu"
UD_23_UKRAINIAN_IU_DEV = _UD_23_HOME + "#UD_Ukrainian-IU/uk_iu-ud-dev.conllu"
UD_23_UKRAINIAN_IU_TEST = _UD_23_HOME + "#UD_Ukrainian-IU/uk_iu-ud-test.conllu"
UD_23_UPPER_SORBIAN_UFAL_TRAIN = _UD_23_HOME + "#UD_Upper_Sorbian-UFAL/hsb_ufal-ud-train.conllu"
UD_23_UPPER_SORBIAN_UFAL_TEST = _UD_23_HOME + "#UD_Upper_Sorbian-UFAL/hsb_ufal-ud-test.conllu"
UD_23_URDU_UDTB_TRAIN = _UD_23_HOME + "#UD_Urdu-UDTB/ur_udtb-ud-train.conllu"
UD_23_URDU_UDTB_DEV = _UD_23_HOME + "#UD_Urdu-UDTB/ur_udtb-ud-dev.conllu"
UD_23_URDU_UDTB_TEST = _UD_23_HOME + "#UD_Urdu-UDTB/ur_udtb-ud-test.conllu"
UD_23_UYGHUR_UDT_TRAIN = _UD_23_HOME + "#UD_Uyghur-UDT/ug_udt-ud-train.conllu"
UD_23_UYGHUR_UDT_DEV = _UD_23_HOME + "#UD_Uyghur-UDT/ug_udt-ud-dev.conllu"
UD_23_UYGHUR_UDT_TEST = _UD_23_HOME + "#UD_Uyghur-UDT/ug_udt-ud-test.conllu"
UD_23_VIETNAMESE_VTB_TRAIN = _UD_23_HOME + "#UD_Vietnamese-VTB/vi_vtb-ud-train.conllu"
UD_23_VIETNAMESE_VTB_DEV = _UD_23_HOME + "#UD_Vietnamese-VTB/vi_vtb-ud-dev.conllu"
UD_23_VIETNAMESE_VTB_TEST = _UD_23_HOME + "#UD_Vietnamese-VTB/vi_vtb-ud-test.conllu"
UD_23_WARLPIRI_UFAL_TEST = _UD_23_HOME + "#UD_Warlpiri-UFAL/wbp_ufal-ud-test.conllu"
UD_23_YORUBA_YTB_TEST = _UD_23_HOME + "#UD_Yoruba-YTB/yo_ytb-ud-test.conllu"

================================================
FILE: hanlp/datasets/parsing/ud/ud23m.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-05-21 20:39
import os

from hanlp.datasets.parsing.ud import concat_treebanks
from .ud23 import _UD_23_HOME

# Value returned by concat_treebanks for UD 2.3; this call runs when the module is imported.
_UD_23_MULTILINGUAL_HOME = concat_treebanks(_UD_23_HOME, '2.3')
# Paths of the train/dev/test files joined onto that location.
UD_23_MULTILINGUAL_TRAIN = os.path.join(_UD_23_MULTILINGUAL_HOME, 'train.conllu')
UD_23_MULTILINGUAL_DEV = os.path.join(_UD_23_MULTILINGUAL_HOME, 'dev.conllu')
UD_23_MULTILINGUAL_TEST = os.path.join(_UD_23_MULTILINGUAL_HOME, 'test.conllu')

================================================
FILE: hanlp/datasets/parsing/ud/ud27.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-07 21:03
import glob
import os

from hanlp.utils.io_util import uncompress, get_resource

# Download URL of the UD v2.7 bundle, and the prefix every UD_27_* constant below is built from.
_UD_27_URL = "https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-3424/allzip"
_UD_27_HOME = _UD_27_URL + '#ud-treebanks-v2.7/'
# The following runs at import time: resolve the URL to a local path and, when
# that path is a regular file, give it a '.zip' suffix and uncompress it, then
# uncompress the 'ud-treebanks-v2.7.tgz' expected under the original path.
# NOTE(review): all three steps are assumed to sit under the isfile() guard — confirm against upstream.
_path = get_resource(_UD_27_URL)
if os.path.isfile(_path):
    os.rename(_path, _path + '.zip')
    uncompress(_path + '.zip')
    uncompress(os.path.join(_path, 'ud-treebanks-v2.7.tgz'))


# noinspection PyShadowingNames
def _list_dir(path, home):
    """Generate the per-treebank constants of this module and append them to ``ud27.py``.

    ``path`` is resolved with ``get_resource`` and every
    ``ud-treebanks-v2.7/UD_*`` entry below the result is visited in sorted
    order. For each of the ``train``, ``dev`` and ``test`` splits that has a
    matching ``*{split}.conllu`` file, two lines are written: the assignment
    ``{prefix}_{NAME}_{SPLIT} = {home} + "{basename}/{file}"`` and a bare string
    ``"{prefix} {split} set of {NAME}."`` that documents it.

    Args:
        path: Resource identifier handed to ``get_resource``; the returned value
            is used as the directory to glob.
        home: Name of the home variable as a string (e.g. ``'_UD_27_HOME'``). It
            is written verbatim into the generated code and also used to derive
            the constant prefix.

    Returns:
        None. The only effect is appending text to ``ud27.py``, opened with a
        relative path, i.e. resolved against the current working directory.
    """
    # '_UD_27_HOME' -> 'UD_27': drop leading underscores and the '_HOME' suffix.
    prefix = home.lstrip('_').replace('_HOME', '')

    path = get_resource(path)
    # Append mode: repeated runs add further lines instead of replacing earlier ones.
    with open('ud27.py', 'a') as out:
        for f in sorted(glob.glob(path + '/ud-treebanks-v2.7/UD_*')):
            basename = os.path.basename(f)
            # Strip the 'UD_' directory prefix, then normalise to an identifier,
            # e.g. 'Afrikaans-AfriBooms' -> 'AFRIKAANS_AFRIBOOMS'.
            name = basename[len('UD_'):]
            name = name.upper().replace('-', '_')
            for split in 'train', 'dev', 'test':
                sp = glob.glob(f + f'/*{split}.conllu')
                if not sp:
                    # No file for this split in this treebank.
                    continue
                # Only the first match is used.
                sp = os.path.basename(sp[0])
                out.write(f'{prefix}_{name}_{split.upper()} = {home} + "{basename}/{sp}"\n')
                out.write(f'"{prefix} {split} set of {name}."\n')


def main():
    """Append the UD 2.7 constants to ``ud27.py`` by listing the unpacked treebanks."""
    _list_dir(_UD_27_URL, '_UD_27_HOME')
    pass


if __name__ == '__main__':
    main()
UD_27_AFRIKAANS_AFRIBOOMS_TRAIN = _UD_27_HOME + "UD_Afrikaans-AfriBooms/af_afribooms-ud-train.conllu" "UD_27 train set of AFRIKAANS_AFRIBOOMS." UD_27_AFRIKAANS_AFRIBOOMS_DEV = _UD_27_HOME + "UD_Afrikaans-AfriBooms/af_afribooms-ud-dev.conllu" "UD_27 dev set of AFRIKAANS_AFRIBOOMS." UD_27_AFRIKAANS_AFRIBOOMS_TEST = _UD_27_HOME + "UD_Afrikaans-AfriBooms/af_afribooms-ud-test.conllu" "UD_27 test set of AFRIKAANS_AFRIBOOMS." UD_27_AKKADIAN_PISANDUB_TEST = _UD_27_HOME + "UD_Akkadian-PISANDUB/akk_pisandub-ud-test.conllu" "UD_27 test set of AKKADIAN_PISANDUB." UD_27_AKKADIAN_RIAO_TEST = _UD_27_HOME + "UD_Akkadian-RIAO/akk_riao-ud-test.conllu" "UD_27 test set of AKKADIAN_RIAO." UD_27_AKUNTSU_TUDET_TEST = _UD_27_HOME + "UD_Akuntsu-TuDeT/aqz_tudet-ud-test.conllu" "UD_27 test set of AKUNTSU_TUDET." UD_27_ALBANIAN_TSA_TEST = _UD_27_HOME + "UD_Albanian-TSA/sq_tsa-ud-test.conllu" "UD_27 test set of ALBANIAN_TSA." UD_27_AMHARIC_ATT_TEST = _UD_27_HOME + "UD_Amharic-ATT/am_att-ud-test.conllu" "UD_27 test set of AMHARIC_ATT." UD_27_ANCIENT_GREEK_PROIEL_TRAIN = _UD_27_HOME + "UD_Ancient_Greek-PROIEL/grc_proiel-ud-train.conllu" "UD_27 train set of ANCIENT_GREEK_PROIEL." UD_27_ANCIENT_GREEK_PROIEL_DEV = _UD_27_HOME + "UD_Ancient_Greek-PROIEL/grc_proiel-ud-dev.conllu" "UD_27 dev set of ANCIENT_GREEK_PROIEL." UD_27_ANCIENT_GREEK_PROIEL_TEST = _UD_27_HOME + "UD_Ancient_Greek-PROIEL/grc_proiel-ud-test.conllu" "UD_27 test set of ANCIENT_GREEK_PROIEL." UD_27_ANCIENT_GREEK_PERSEUS_TRAIN = _UD_27_HOME + "UD_Ancient_Greek-Perseus/grc_perseus-ud-train.conllu" "UD_27 train set of ANCIENT_GREEK_PERSEUS." UD_27_ANCIENT_GREEK_PERSEUS_DEV = _UD_27_HOME + "UD_Ancient_Greek-Perseus/grc_perseus-ud-dev.conllu" "UD_27 dev set of ANCIENT_GREEK_PERSEUS." UD_27_ANCIENT_GREEK_PERSEUS_TEST = _UD_27_HOME + "UD_Ancient_Greek-Perseus/grc_perseus-ud-test.conllu" "UD_27 test set of ANCIENT_GREEK_PERSEUS." 
UD_27_APURINA_UFPA_TEST = _UD_27_HOME + "UD_Apurina-UFPA/apu_ufpa-ud-test.conllu" "UD_27 test set of APURINA_UFPA." UD_27_ARABIC_NYUAD_TRAIN = _UD_27_HOME + "UD_Arabic-NYUAD/ar_nyuad-ud-train.conllu" "UD_27 train set of ARABIC_NYUAD." UD_27_ARABIC_NYUAD_DEV = _UD_27_HOME + "UD_Arabic-NYUAD/ar_nyuad-ud-dev.conllu" "UD_27 dev set of ARABIC_NYUAD." UD_27_ARABIC_NYUAD_TEST = _UD_27_HOME + "UD_Arabic-NYUAD/ar_nyuad-ud-test.conllu" "UD_27 test set of ARABIC_NYUAD." UD_27_ARABIC_PADT_TRAIN = _UD_27_HOME + "UD_Arabic-PADT/ar_padt-ud-train.conllu" "UD_27 train set of ARABIC_PADT." UD_27_ARABIC_PADT_DEV = _UD_27_HOME + "UD_Arabic-PADT/ar_padt-ud-dev.conllu" "UD_27 dev set of ARABIC_PADT." UD_27_ARABIC_PADT_TEST = _UD_27_HOME + "UD_Arabic-PADT/ar_padt-ud-test.conllu" "UD_27 test set of ARABIC_PADT." UD_27_ARABIC_PUD_TEST = _UD_27_HOME + "UD_Arabic-PUD/ar_pud-ud-test.conllu" "UD_27 test set of ARABIC_PUD." UD_27_ARMENIAN_ARMTDP_TRAIN = _UD_27_HOME + "UD_Armenian-ArmTDP/hy_armtdp-ud-train.conllu" "UD_27 train set of ARMENIAN_ARMTDP." UD_27_ARMENIAN_ARMTDP_DEV = _UD_27_HOME + "UD_Armenian-ArmTDP/hy_armtdp-ud-dev.conllu" "UD_27 dev set of ARMENIAN_ARMTDP." UD_27_ARMENIAN_ARMTDP_TEST = _UD_27_HOME + "UD_Armenian-ArmTDP/hy_armtdp-ud-test.conllu" "UD_27 test set of ARMENIAN_ARMTDP." UD_27_ASSYRIAN_AS_TEST = _UD_27_HOME + "UD_Assyrian-AS/aii_as-ud-test.conllu" "UD_27 test set of ASSYRIAN_AS." UD_27_BAMBARA_CRB_TEST = _UD_27_HOME + "UD_Bambara-CRB/bm_crb-ud-test.conllu" "UD_27 test set of BAMBARA_CRB." UD_27_BASQUE_BDT_TRAIN = _UD_27_HOME + "UD_Basque-BDT/eu_bdt-ud-train.conllu" "UD_27 train set of BASQUE_BDT." UD_27_BASQUE_BDT_DEV = _UD_27_HOME + "UD_Basque-BDT/eu_bdt-ud-dev.conllu" "UD_27 dev set of BASQUE_BDT." UD_27_BASQUE_BDT_TEST = _UD_27_HOME + "UD_Basque-BDT/eu_bdt-ud-test.conllu" "UD_27 test set of BASQUE_BDT." UD_27_BELARUSIAN_HSE_TRAIN = _UD_27_HOME + "UD_Belarusian-HSE/be_hse-ud-train.conllu" "UD_27 train set of BELARUSIAN_HSE." 
UD_27_BELARUSIAN_HSE_DEV = _UD_27_HOME + "UD_Belarusian-HSE/be_hse-ud-dev.conllu" "UD_27 dev set of BELARUSIAN_HSE." UD_27_BELARUSIAN_HSE_TEST = _UD_27_HOME + "UD_Belarusian-HSE/be_hse-ud-test.conllu" "UD_27 test set of BELARUSIAN_HSE." UD_27_BHOJPURI_BHTB_TEST = _UD_27_HOME + "UD_Bhojpuri-BHTB/bho_bhtb-ud-test.conllu" "UD_27 test set of BHOJPURI_BHTB." UD_27_BRETON_KEB_TEST = _UD_27_HOME + "UD_Breton-KEB/br_keb-ud-test.conllu" "UD_27 test set of BRETON_KEB." UD_27_BULGARIAN_BTB_TRAIN = _UD_27_HOME + "UD_Bulgarian-BTB/bg_btb-ud-train.conllu" "UD_27 train set of BULGARIAN_BTB." UD_27_BULGARIAN_BTB_DEV = _UD_27_HOME + "UD_Bulgarian-BTB/bg_btb-ud-dev.conllu" "UD_27 dev set of BULGARIAN_BTB." UD_27_BULGARIAN_BTB_TEST = _UD_27_HOME + "UD_Bulgarian-BTB/bg_btb-ud-test.conllu" "UD_27 test set of BULGARIAN_BTB." UD_27_BURYAT_BDT_TRAIN = _UD_27_HOME + "UD_Buryat-BDT/bxr_bdt-ud-train.conllu" "UD_27 train set of BURYAT_BDT." UD_27_BURYAT_BDT_TEST = _UD_27_HOME + "UD_Buryat-BDT/bxr_bdt-ud-test.conllu" "UD_27 test set of BURYAT_BDT." UD_27_CANTONESE_HK_TEST = _UD_27_HOME + "UD_Cantonese-HK/yue_hk-ud-test.conllu" "UD_27 test set of CANTONESE_HK." UD_27_CATALAN_ANCORA_TRAIN = _UD_27_HOME + "UD_Catalan-AnCora/ca_ancora-ud-train.conllu" "UD_27 train set of CATALAN_ANCORA." UD_27_CATALAN_ANCORA_DEV = _UD_27_HOME + "UD_Catalan-AnCora/ca_ancora-ud-dev.conllu" "UD_27 dev set of CATALAN_ANCORA." UD_27_CATALAN_ANCORA_TEST = _UD_27_HOME + "UD_Catalan-AnCora/ca_ancora-ud-test.conllu" "UD_27 test set of CATALAN_ANCORA." UD_27_CHINESE_CFL_TEST = _UD_27_HOME + "UD_Chinese-CFL/zh_cfl-ud-test.conllu" "UD_27 test set of CHINESE_CFL." UD_27_CHINESE_GSD_TRAIN = _UD_27_HOME + "UD_Chinese-GSD/zh_gsd-ud-train.conllu" "UD_27 train set of CHINESE_GSD." UD_27_CHINESE_GSD_DEV = _UD_27_HOME + "UD_Chinese-GSD/zh_gsd-ud-dev.conllu" "UD_27 dev set of CHINESE_GSD." UD_27_CHINESE_GSD_TEST = _UD_27_HOME + "UD_Chinese-GSD/zh_gsd-ud-test.conllu" "UD_27 test set of CHINESE_GSD." 
# UD_27 split paths, Chinese-GSDSimp through Croatian-SET: each constant is
# _UD_27_HOME (defined outside this section) plus a treebank-relative .conllu path,
# and the bare string that follows it is its attribute docstring.
UD_27_CHINESE_GSDSIMP_TRAIN = _UD_27_HOME + "UD_Chinese-GSDSimp/zh_gsdsimp-ud-train.conllu"
"UD_27 train set of CHINESE_GSDSIMP."
UD_27_CHINESE_GSDSIMP_DEV = _UD_27_HOME + "UD_Chinese-GSDSimp/zh_gsdsimp-ud-dev.conllu"
"UD_27 dev set of CHINESE_GSDSIMP."
UD_27_CHINESE_GSDSIMP_TEST = _UD_27_HOME + "UD_Chinese-GSDSimp/zh_gsdsimp-ud-test.conllu"
"UD_27 test set of CHINESE_GSDSIMP."
UD_27_CHINESE_HK_TEST = _UD_27_HOME + "UD_Chinese-HK/zh_hk-ud-test.conllu"
"UD_27 test set of CHINESE_HK."
UD_27_CHINESE_PUD_TEST = _UD_27_HOME + "UD_Chinese-PUD/zh_pud-ud-test.conllu"
"UD_27 test set of CHINESE_PUD."
UD_27_CHUKCHI_HSE_TEST = _UD_27_HOME + "UD_Chukchi-HSE/ckt_hse-ud-test.conllu"
"UD_27 test set of CHUKCHI_HSE."
UD_27_CLASSICAL_CHINESE_KYOTO_TRAIN = _UD_27_HOME + "UD_Classical_Chinese-Kyoto/lzh_kyoto-ud-train.conllu"
"UD_27 train set of CLASSICAL_CHINESE_KYOTO."
UD_27_CLASSICAL_CHINESE_KYOTO_DEV = _UD_27_HOME + "UD_Classical_Chinese-Kyoto/lzh_kyoto-ud-dev.conllu"
"UD_27 dev set of CLASSICAL_CHINESE_KYOTO."
UD_27_CLASSICAL_CHINESE_KYOTO_TEST = _UD_27_HOME + "UD_Classical_Chinese-Kyoto/lzh_kyoto-ud-test.conllu"
"UD_27 test set of CLASSICAL_CHINESE_KYOTO."
UD_27_COPTIC_SCRIPTORIUM_TRAIN = _UD_27_HOME + "UD_Coptic-Scriptorium/cop_scriptorium-ud-train.conllu"
"UD_27 train set of COPTIC_SCRIPTORIUM."
UD_27_COPTIC_SCRIPTORIUM_DEV = _UD_27_HOME + "UD_Coptic-Scriptorium/cop_scriptorium-ud-dev.conllu"
"UD_27 dev set of COPTIC_SCRIPTORIUM."
UD_27_COPTIC_SCRIPTORIUM_TEST = _UD_27_HOME + "UD_Coptic-Scriptorium/cop_scriptorium-ud-test.conllu"
"UD_27 test set of COPTIC_SCRIPTORIUM."
UD_27_CROATIAN_SET_TRAIN = _UD_27_HOME + "UD_Croatian-SET/hr_set-ud-train.conllu"
"UD_27 train set of CROATIAN_SET."
UD_27_CROATIAN_SET_DEV = _UD_27_HOME + "UD_Croatian-SET/hr_set-ud-dev.conllu"
"UD_27 dev set of CROATIAN_SET."
UD_27_CROATIAN_SET_TEST = _UD_27_HOME + "UD_Croatian-SET/hr_set-ud-test.conllu"
"UD_27 test set of CROATIAN_SET."
# UD_27 split paths, Czech-CAC through Dutch-Alpino (dev): each constant is
# _UD_27_HOME (defined outside this section) plus a treebank-relative .conllu path,
# and the bare string that follows it is its attribute docstring.
UD_27_CZECH_CAC_TRAIN = _UD_27_HOME + "UD_Czech-CAC/cs_cac-ud-train.conllu"
"UD_27 train set of CZECH_CAC."
UD_27_CZECH_CAC_DEV = _UD_27_HOME + "UD_Czech-CAC/cs_cac-ud-dev.conllu"
"UD_27 dev set of CZECH_CAC."
UD_27_CZECH_CAC_TEST = _UD_27_HOME + "UD_Czech-CAC/cs_cac-ud-test.conllu"
"UD_27 test set of CZECH_CAC."
UD_27_CZECH_CLTT_TRAIN = _UD_27_HOME + "UD_Czech-CLTT/cs_cltt-ud-train.conllu"
"UD_27 train set of CZECH_CLTT."
UD_27_CZECH_CLTT_DEV = _UD_27_HOME + "UD_Czech-CLTT/cs_cltt-ud-dev.conllu"
"UD_27 dev set of CZECH_CLTT."
UD_27_CZECH_CLTT_TEST = _UD_27_HOME + "UD_Czech-CLTT/cs_cltt-ud-test.conllu"
"UD_27 test set of CZECH_CLTT."
UD_27_CZECH_FICTREE_TRAIN = _UD_27_HOME + "UD_Czech-FicTree/cs_fictree-ud-train.conllu"
"UD_27 train set of CZECH_FICTREE."
UD_27_CZECH_FICTREE_DEV = _UD_27_HOME + "UD_Czech-FicTree/cs_fictree-ud-dev.conllu"
"UD_27 dev set of CZECH_FICTREE."
UD_27_CZECH_FICTREE_TEST = _UD_27_HOME + "UD_Czech-FicTree/cs_fictree-ud-test.conllu"
"UD_27 test set of CZECH_FICTREE."
UD_27_CZECH_PDT_TRAIN = _UD_27_HOME + "UD_Czech-PDT/cs_pdt-ud-train.conllu"
"UD_27 train set of CZECH_PDT."
UD_27_CZECH_PDT_DEV = _UD_27_HOME + "UD_Czech-PDT/cs_pdt-ud-dev.conllu"
"UD_27 dev set of CZECH_PDT."
UD_27_CZECH_PDT_TEST = _UD_27_HOME + "UD_Czech-PDT/cs_pdt-ud-test.conllu"
"UD_27 test set of CZECH_PDT."
UD_27_CZECH_PUD_TEST = _UD_27_HOME + "UD_Czech-PUD/cs_pud-ud-test.conllu"
"UD_27 test set of CZECH_PUD."
UD_27_DANISH_DDT_TRAIN = _UD_27_HOME + "UD_Danish-DDT/da_ddt-ud-train.conllu"
"UD_27 train set of DANISH_DDT."
UD_27_DANISH_DDT_DEV = _UD_27_HOME + "UD_Danish-DDT/da_ddt-ud-dev.conllu"
"UD_27 dev set of DANISH_DDT."
UD_27_DANISH_DDT_TEST = _UD_27_HOME + "UD_Danish-DDT/da_ddt-ud-test.conllu"
"UD_27 test set of DANISH_DDT."
UD_27_DUTCH_ALPINO_TRAIN = _UD_27_HOME + "UD_Dutch-Alpino/nl_alpino-ud-train.conllu"
"UD_27 train set of DUTCH_ALPINO."
UD_27_DUTCH_ALPINO_DEV = _UD_27_HOME + "UD_Dutch-Alpino/nl_alpino-ud-dev.conllu"
"UD_27 dev set of DUTCH_ALPINO."
# UD_27 split paths, Dutch-Alpino (test) through English-GUMReddit: each constant is
# _UD_27_HOME (defined outside this section) plus a treebank-relative .conllu path,
# and the bare string that follows it is its attribute docstring.
UD_27_DUTCH_ALPINO_TEST = _UD_27_HOME + "UD_Dutch-Alpino/nl_alpino-ud-test.conllu"
"UD_27 test set of DUTCH_ALPINO."
UD_27_DUTCH_LASSYSMALL_TRAIN = _UD_27_HOME + "UD_Dutch-LassySmall/nl_lassysmall-ud-train.conllu"
"UD_27 train set of DUTCH_LASSYSMALL."
UD_27_DUTCH_LASSYSMALL_DEV = _UD_27_HOME + "UD_Dutch-LassySmall/nl_lassysmall-ud-dev.conllu"
"UD_27 dev set of DUTCH_LASSYSMALL."
UD_27_DUTCH_LASSYSMALL_TEST = _UD_27_HOME + "UD_Dutch-LassySmall/nl_lassysmall-ud-test.conllu"
"UD_27 test set of DUTCH_LASSYSMALL."
UD_27_ENGLISH_ESL_TRAIN = _UD_27_HOME + "UD_English-ESL/en_esl-ud-train.conllu"
"UD_27 train set of ENGLISH_ESL."
UD_27_ENGLISH_ESL_DEV = _UD_27_HOME + "UD_English-ESL/en_esl-ud-dev.conllu"
"UD_27 dev set of ENGLISH_ESL."
UD_27_ENGLISH_ESL_TEST = _UD_27_HOME + "UD_English-ESL/en_esl-ud-test.conllu"
"UD_27 test set of ENGLISH_ESL."
UD_27_ENGLISH_EWT_TRAIN = _UD_27_HOME + "UD_English-EWT/en_ewt-ud-train.conllu"
"UD_27 train set of ENGLISH_EWT."
UD_27_ENGLISH_EWT_DEV = _UD_27_HOME + "UD_English-EWT/en_ewt-ud-dev.conllu"
"UD_27 dev set of ENGLISH_EWT."
UD_27_ENGLISH_EWT_TEST = _UD_27_HOME + "UD_English-EWT/en_ewt-ud-test.conllu"
"UD_27 test set of ENGLISH_EWT."
UD_27_ENGLISH_GUM_TRAIN = _UD_27_HOME + "UD_English-GUM/en_gum-ud-train.conllu"
"UD_27 train set of ENGLISH_GUM."
UD_27_ENGLISH_GUM_DEV = _UD_27_HOME + "UD_English-GUM/en_gum-ud-dev.conllu"
"UD_27 dev set of ENGLISH_GUM."
UD_27_ENGLISH_GUM_TEST = _UD_27_HOME + "UD_English-GUM/en_gum-ud-test.conllu"
"UD_27 test set of ENGLISH_GUM."
UD_27_ENGLISH_GUMREDDIT_TRAIN = _UD_27_HOME + "UD_English-GUMReddit/en_gumreddit-ud-train.conllu"
"UD_27 train set of ENGLISH_GUMREDDIT."
UD_27_ENGLISH_GUMREDDIT_DEV = _UD_27_HOME + "UD_English-GUMReddit/en_gumreddit-ud-dev.conllu"
"UD_27 dev set of ENGLISH_GUMREDDIT."
UD_27_ENGLISH_GUMREDDIT_TEST = _UD_27_HOME + "UD_English-GUMReddit/en_gumreddit-ud-test.conllu"
"UD_27 test set of ENGLISH_GUMREDDIT."
# UD_27 split paths, English-LinES through Faroese-FarPaHC (train): each constant is
# _UD_27_HOME (defined outside this section) plus a treebank-relative .conllu path,
# and the bare string that follows it is its attribute docstring.
UD_27_ENGLISH_LINES_TRAIN = _UD_27_HOME + "UD_English-LinES/en_lines-ud-train.conllu"
"UD_27 train set of ENGLISH_LINES."
UD_27_ENGLISH_LINES_DEV = _UD_27_HOME + "UD_English-LinES/en_lines-ud-dev.conllu"
"UD_27 dev set of ENGLISH_LINES."
UD_27_ENGLISH_LINES_TEST = _UD_27_HOME + "UD_English-LinES/en_lines-ud-test.conllu"
"UD_27 test set of ENGLISH_LINES."
UD_27_ENGLISH_PUD_TEST = _UD_27_HOME + "UD_English-PUD/en_pud-ud-test.conllu"
"UD_27 test set of ENGLISH_PUD."
UD_27_ENGLISH_PARTUT_TRAIN = _UD_27_HOME + "UD_English-ParTUT/en_partut-ud-train.conllu"
"UD_27 train set of ENGLISH_PARTUT."
UD_27_ENGLISH_PARTUT_DEV = _UD_27_HOME + "UD_English-ParTUT/en_partut-ud-dev.conllu"
"UD_27 dev set of ENGLISH_PARTUT."
UD_27_ENGLISH_PARTUT_TEST = _UD_27_HOME + "UD_English-ParTUT/en_partut-ud-test.conllu"
"UD_27 test set of ENGLISH_PARTUT."
UD_27_ENGLISH_PRONOUNS_TEST = _UD_27_HOME + "UD_English-Pronouns/en_pronouns-ud-test.conllu"
"UD_27 test set of ENGLISH_PRONOUNS."
UD_27_ERZYA_JR_TEST = _UD_27_HOME + "UD_Erzya-JR/myv_jr-ud-test.conllu"
"UD_27 test set of ERZYA_JR."
UD_27_ESTONIAN_EDT_TRAIN = _UD_27_HOME + "UD_Estonian-EDT/et_edt-ud-train.conllu"
"UD_27 train set of ESTONIAN_EDT."
UD_27_ESTONIAN_EDT_DEV = _UD_27_HOME + "UD_Estonian-EDT/et_edt-ud-dev.conllu"
"UD_27 dev set of ESTONIAN_EDT."
UD_27_ESTONIAN_EDT_TEST = _UD_27_HOME + "UD_Estonian-EDT/et_edt-ud-test.conllu"
"UD_27 test set of ESTONIAN_EDT."
UD_27_ESTONIAN_EWT_TRAIN = _UD_27_HOME + "UD_Estonian-EWT/et_ewt-ud-train.conllu"
"UD_27 train set of ESTONIAN_EWT."
UD_27_ESTONIAN_EWT_DEV = _UD_27_HOME + "UD_Estonian-EWT/et_ewt-ud-dev.conllu"
"UD_27 dev set of ESTONIAN_EWT."
UD_27_ESTONIAN_EWT_TEST = _UD_27_HOME + "UD_Estonian-EWT/et_ewt-ud-test.conllu"
"UD_27 test set of ESTONIAN_EWT."
UD_27_FAROESE_FARPAHC_TRAIN = _UD_27_HOME + "UD_Faroese-FarPaHC/fo_farpahc-ud-train.conllu"
"UD_27 train set of FAROESE_FARPAHC."
# UD_27 split paths, Faroese-FarPaHC (dev) through French-GSD (dev): each constant is
# _UD_27_HOME (defined outside this section) plus a treebank-relative .conllu path,
# and the bare string that follows it is its attribute docstring.
UD_27_FAROESE_FARPAHC_DEV = _UD_27_HOME + "UD_Faroese-FarPaHC/fo_farpahc-ud-dev.conllu"
"UD_27 dev set of FAROESE_FARPAHC."
UD_27_FAROESE_FARPAHC_TEST = _UD_27_HOME + "UD_Faroese-FarPaHC/fo_farpahc-ud-test.conllu"
"UD_27 test set of FAROESE_FARPAHC."
UD_27_FAROESE_OFT_TEST = _UD_27_HOME + "UD_Faroese-OFT/fo_oft-ud-test.conllu"
"UD_27 test set of FAROESE_OFT."
UD_27_FINNISH_FTB_TRAIN = _UD_27_HOME + "UD_Finnish-FTB/fi_ftb-ud-train.conllu"
"UD_27 train set of FINNISH_FTB."
UD_27_FINNISH_FTB_DEV = _UD_27_HOME + "UD_Finnish-FTB/fi_ftb-ud-dev.conllu"
"UD_27 dev set of FINNISH_FTB."
UD_27_FINNISH_FTB_TEST = _UD_27_HOME + "UD_Finnish-FTB/fi_ftb-ud-test.conllu"
"UD_27 test set of FINNISH_FTB."
UD_27_FINNISH_OOD_TEST = _UD_27_HOME + "UD_Finnish-OOD/fi_ood-ud-test.conllu"
"UD_27 test set of FINNISH_OOD."
UD_27_FINNISH_PUD_TEST = _UD_27_HOME + "UD_Finnish-PUD/fi_pud-ud-test.conllu"
"UD_27 test set of FINNISH_PUD."
UD_27_FINNISH_TDT_TRAIN = _UD_27_HOME + "UD_Finnish-TDT/fi_tdt-ud-train.conllu"
"UD_27 train set of FINNISH_TDT."
UD_27_FINNISH_TDT_DEV = _UD_27_HOME + "UD_Finnish-TDT/fi_tdt-ud-dev.conllu"
"UD_27 dev set of FINNISH_TDT."
UD_27_FINNISH_TDT_TEST = _UD_27_HOME + "UD_Finnish-TDT/fi_tdt-ud-test.conllu"
"UD_27 test set of FINNISH_TDT."
UD_27_FRENCH_FQB_TEST = _UD_27_HOME + "UD_French-FQB/fr_fqb-ud-test.conllu"
"UD_27 test set of FRENCH_FQB."
UD_27_FRENCH_FTB_TRAIN = _UD_27_HOME + "UD_French-FTB/fr_ftb-ud-train.conllu"
"UD_27 train set of FRENCH_FTB."
UD_27_FRENCH_FTB_DEV = _UD_27_HOME + "UD_French-FTB/fr_ftb-ud-dev.conllu"
"UD_27 dev set of FRENCH_FTB."
UD_27_FRENCH_FTB_TEST = _UD_27_HOME + "UD_French-FTB/fr_ftb-ud-test.conllu"
"UD_27 test set of FRENCH_FTB."
UD_27_FRENCH_GSD_TRAIN = _UD_27_HOME + "UD_French-GSD/fr_gsd-ud-train.conllu"
"UD_27 train set of FRENCH_GSD."
UD_27_FRENCH_GSD_DEV = _UD_27_HOME + "UD_French-GSD/fr_gsd-ud-dev.conllu"
"UD_27 dev set of FRENCH_GSD."
# UD_27 split paths, French-GSD (test) through Galician-TreeGal: each constant is
# _UD_27_HOME (defined outside this section) plus a treebank-relative .conllu path,
# and the bare string that follows it is its attribute docstring.
UD_27_FRENCH_GSD_TEST = _UD_27_HOME + "UD_French-GSD/fr_gsd-ud-test.conllu"
"UD_27 test set of FRENCH_GSD."
UD_27_FRENCH_PUD_TEST = _UD_27_HOME + "UD_French-PUD/fr_pud-ud-test.conllu"
"UD_27 test set of FRENCH_PUD."
UD_27_FRENCH_PARTUT_TRAIN = _UD_27_HOME + "UD_French-ParTUT/fr_partut-ud-train.conllu"
"UD_27 train set of FRENCH_PARTUT."
UD_27_FRENCH_PARTUT_DEV = _UD_27_HOME + "UD_French-ParTUT/fr_partut-ud-dev.conllu"
"UD_27 dev set of FRENCH_PARTUT."
UD_27_FRENCH_PARTUT_TEST = _UD_27_HOME + "UD_French-ParTUT/fr_partut-ud-test.conllu"
"UD_27 test set of FRENCH_PARTUT."
UD_27_FRENCH_SEQUOIA_TRAIN = _UD_27_HOME + "UD_French-Sequoia/fr_sequoia-ud-train.conllu"
"UD_27 train set of FRENCH_SEQUOIA."
UD_27_FRENCH_SEQUOIA_DEV = _UD_27_HOME + "UD_French-Sequoia/fr_sequoia-ud-dev.conllu"
"UD_27 dev set of FRENCH_SEQUOIA."
UD_27_FRENCH_SEQUOIA_TEST = _UD_27_HOME + "UD_French-Sequoia/fr_sequoia-ud-test.conllu"
"UD_27 test set of FRENCH_SEQUOIA."
UD_27_FRENCH_SPOKEN_TRAIN = _UD_27_HOME + "UD_French-Spoken/fr_spoken-ud-train.conllu"
"UD_27 train set of FRENCH_SPOKEN."
UD_27_FRENCH_SPOKEN_DEV = _UD_27_HOME + "UD_French-Spoken/fr_spoken-ud-dev.conllu"
"UD_27 dev set of FRENCH_SPOKEN."
UD_27_FRENCH_SPOKEN_TEST = _UD_27_HOME + "UD_French-Spoken/fr_spoken-ud-test.conllu"
"UD_27 test set of FRENCH_SPOKEN."
UD_27_GALICIAN_CTG_TRAIN = _UD_27_HOME + "UD_Galician-CTG/gl_ctg-ud-train.conllu"
"UD_27 train set of GALICIAN_CTG."
UD_27_GALICIAN_CTG_DEV = _UD_27_HOME + "UD_Galician-CTG/gl_ctg-ud-dev.conllu"
"UD_27 dev set of GALICIAN_CTG."
UD_27_GALICIAN_CTG_TEST = _UD_27_HOME + "UD_Galician-CTG/gl_ctg-ud-test.conllu"
"UD_27 test set of GALICIAN_CTG."
UD_27_GALICIAN_TREEGAL_TRAIN = _UD_27_HOME + "UD_Galician-TreeGal/gl_treegal-ud-train.conllu"
"UD_27 train set of GALICIAN_TREEGAL."
UD_27_GALICIAN_TREEGAL_TEST = _UD_27_HOME + "UD_Galician-TreeGal/gl_treegal-ud-test.conllu"
"UD_27 test set of GALICIAN_TREEGAL."
# UD_27 split paths, German-GSD through Hindi-HDTB (train): each constant is
# _UD_27_HOME (defined outside this section) plus a treebank-relative .conllu path,
# and the bare string that follows it is its attribute docstring.
UD_27_GERMAN_GSD_TRAIN = _UD_27_HOME + "UD_German-GSD/de_gsd-ud-train.conllu"
"UD_27 train set of GERMAN_GSD."
UD_27_GERMAN_GSD_DEV = _UD_27_HOME + "UD_German-GSD/de_gsd-ud-dev.conllu"
"UD_27 dev set of GERMAN_GSD."
UD_27_GERMAN_GSD_TEST = _UD_27_HOME + "UD_German-GSD/de_gsd-ud-test.conllu"
"UD_27 test set of GERMAN_GSD."
UD_27_GERMAN_HDT_TRAIN = _UD_27_HOME + "UD_German-HDT/de_hdt-ud-train.conllu"
"UD_27 train set of GERMAN_HDT."
UD_27_GERMAN_HDT_DEV = _UD_27_HOME + "UD_German-HDT/de_hdt-ud-dev.conllu"
"UD_27 dev set of GERMAN_HDT."
UD_27_GERMAN_HDT_TEST = _UD_27_HOME + "UD_German-HDT/de_hdt-ud-test.conllu"
"UD_27 test set of GERMAN_HDT."
UD_27_GERMAN_LIT_TEST = _UD_27_HOME + "UD_German-LIT/de_lit-ud-test.conllu"
"UD_27 test set of GERMAN_LIT."
UD_27_GERMAN_PUD_TEST = _UD_27_HOME + "UD_German-PUD/de_pud-ud-test.conllu"
"UD_27 test set of GERMAN_PUD."
UD_27_GOTHIC_PROIEL_TRAIN = _UD_27_HOME + "UD_Gothic-PROIEL/got_proiel-ud-train.conllu"
"UD_27 train set of GOTHIC_PROIEL."
UD_27_GOTHIC_PROIEL_DEV = _UD_27_HOME + "UD_Gothic-PROIEL/got_proiel-ud-dev.conllu"
"UD_27 dev set of GOTHIC_PROIEL."
UD_27_GOTHIC_PROIEL_TEST = _UD_27_HOME + "UD_Gothic-PROIEL/got_proiel-ud-test.conllu"
"UD_27 test set of GOTHIC_PROIEL."
UD_27_GREEK_GDT_TRAIN = _UD_27_HOME + "UD_Greek-GDT/el_gdt-ud-train.conllu"
"UD_27 train set of GREEK_GDT."
UD_27_GREEK_GDT_DEV = _UD_27_HOME + "UD_Greek-GDT/el_gdt-ud-dev.conllu"
"UD_27 dev set of GREEK_GDT."
UD_27_GREEK_GDT_TEST = _UD_27_HOME + "UD_Greek-GDT/el_gdt-ud-test.conllu"
"UD_27 test set of GREEK_GDT."
UD_27_HEBREW_HTB_TRAIN = _UD_27_HOME + "UD_Hebrew-HTB/he_htb-ud-train.conllu"
"UD_27 train set of HEBREW_HTB."
UD_27_HEBREW_HTB_DEV = _UD_27_HOME + "UD_Hebrew-HTB/he_htb-ud-dev.conllu"
"UD_27 dev set of HEBREW_HTB."
UD_27_HEBREW_HTB_TEST = _UD_27_HOME + "UD_Hebrew-HTB/he_htb-ud-test.conllu"
"UD_27 test set of HEBREW_HTB."
UD_27_HINDI_HDTB_TRAIN = _UD_27_HOME + "UD_Hindi-HDTB/hi_hdtb-ud-train.conllu"
"UD_27 train set of HINDI_HDTB."
# UD_27 split paths, Hindi-HDTB (dev) through Indonesian-CSUI: each constant is
# _UD_27_HOME (defined outside this section) plus a treebank-relative .conllu path,
# and the bare string that follows it is its attribute docstring.
UD_27_HINDI_HDTB_DEV = _UD_27_HOME + "UD_Hindi-HDTB/hi_hdtb-ud-dev.conllu"
"UD_27 dev set of HINDI_HDTB."
UD_27_HINDI_HDTB_TEST = _UD_27_HOME + "UD_Hindi-HDTB/hi_hdtb-ud-test.conllu"
"UD_27 test set of HINDI_HDTB."
UD_27_HINDI_PUD_TEST = _UD_27_HOME + "UD_Hindi-PUD/hi_pud-ud-test.conllu"
"UD_27 test set of HINDI_PUD."
UD_27_HINDI_ENGLISH_HIENCS_TRAIN = _UD_27_HOME + "UD_Hindi_English-HIENCS/qhe_hiencs-ud-train.conllu"
"UD_27 train set of HINDI_ENGLISH_HIENCS."
UD_27_HINDI_ENGLISH_HIENCS_DEV = _UD_27_HOME + "UD_Hindi_English-HIENCS/qhe_hiencs-ud-dev.conllu"
"UD_27 dev set of HINDI_ENGLISH_HIENCS."
UD_27_HINDI_ENGLISH_HIENCS_TEST = _UD_27_HOME + "UD_Hindi_English-HIENCS/qhe_hiencs-ud-test.conllu"
"UD_27 test set of HINDI_ENGLISH_HIENCS."
UD_27_HUNGARIAN_SZEGED_TRAIN = _UD_27_HOME + "UD_Hungarian-Szeged/hu_szeged-ud-train.conllu"
"UD_27 train set of HUNGARIAN_SZEGED."
UD_27_HUNGARIAN_SZEGED_DEV = _UD_27_HOME + "UD_Hungarian-Szeged/hu_szeged-ud-dev.conllu"
"UD_27 dev set of HUNGARIAN_SZEGED."
UD_27_HUNGARIAN_SZEGED_TEST = _UD_27_HOME + "UD_Hungarian-Szeged/hu_szeged-ud-test.conllu"
"UD_27 test set of HUNGARIAN_SZEGED."
UD_27_ICELANDIC_ICEPAHC_TRAIN = _UD_27_HOME + "UD_Icelandic-IcePaHC/is_icepahc-ud-train.conllu"
"UD_27 train set of ICELANDIC_ICEPAHC."
UD_27_ICELANDIC_ICEPAHC_DEV = _UD_27_HOME + "UD_Icelandic-IcePaHC/is_icepahc-ud-dev.conllu"
"UD_27 dev set of ICELANDIC_ICEPAHC."
UD_27_ICELANDIC_ICEPAHC_TEST = _UD_27_HOME + "UD_Icelandic-IcePaHC/is_icepahc-ud-test.conllu"
"UD_27 test set of ICELANDIC_ICEPAHC."
UD_27_ICELANDIC_PUD_TEST = _UD_27_HOME + "UD_Icelandic-PUD/is_pud-ud-test.conllu"
"UD_27 test set of ICELANDIC_PUD."
UD_27_INDONESIAN_CSUI_TRAIN = _UD_27_HOME + "UD_Indonesian-CSUI/id_csui-ud-train.conllu"
"UD_27 train set of INDONESIAN_CSUI."
UD_27_INDONESIAN_CSUI_TEST = _UD_27_HOME + "UD_Indonesian-CSUI/id_csui-ud-test.conllu"
"UD_27 test set of INDONESIAN_CSUI."
# UD_27 split paths, Indonesian-GSD through Italian-PoSTWITA (dev): each constant is
# _UD_27_HOME (defined outside this section) plus a treebank-relative .conllu path,
# and the bare string that follows it is its attribute docstring.
UD_27_INDONESIAN_GSD_TRAIN = _UD_27_HOME + "UD_Indonesian-GSD/id_gsd-ud-train.conllu"
"UD_27 train set of INDONESIAN_GSD."
UD_27_INDONESIAN_GSD_DEV = _UD_27_HOME + "UD_Indonesian-GSD/id_gsd-ud-dev.conllu"
"UD_27 dev set of INDONESIAN_GSD."
UD_27_INDONESIAN_GSD_TEST = _UD_27_HOME + "UD_Indonesian-GSD/id_gsd-ud-test.conllu"
"UD_27 test set of INDONESIAN_GSD."
UD_27_INDONESIAN_PUD_TEST = _UD_27_HOME + "UD_Indonesian-PUD/id_pud-ud-test.conllu"
"UD_27 test set of INDONESIAN_PUD."
UD_27_IRISH_IDT_TRAIN = _UD_27_HOME + "UD_Irish-IDT/ga_idt-ud-train.conllu"
"UD_27 train set of IRISH_IDT."
UD_27_IRISH_IDT_DEV = _UD_27_HOME + "UD_Irish-IDT/ga_idt-ud-dev.conllu"
"UD_27 dev set of IRISH_IDT."
UD_27_IRISH_IDT_TEST = _UD_27_HOME + "UD_Irish-IDT/ga_idt-ud-test.conllu"
"UD_27 test set of IRISH_IDT."
UD_27_ITALIAN_ISDT_TRAIN = _UD_27_HOME + "UD_Italian-ISDT/it_isdt-ud-train.conllu"
"UD_27 train set of ITALIAN_ISDT."
UD_27_ITALIAN_ISDT_DEV = _UD_27_HOME + "UD_Italian-ISDT/it_isdt-ud-dev.conllu"
"UD_27 dev set of ITALIAN_ISDT."
UD_27_ITALIAN_ISDT_TEST = _UD_27_HOME + "UD_Italian-ISDT/it_isdt-ud-test.conllu"
"UD_27 test set of ITALIAN_ISDT."
UD_27_ITALIAN_PUD_TEST = _UD_27_HOME + "UD_Italian-PUD/it_pud-ud-test.conllu"
"UD_27 test set of ITALIAN_PUD."
UD_27_ITALIAN_PARTUT_TRAIN = _UD_27_HOME + "UD_Italian-ParTUT/it_partut-ud-train.conllu"
"UD_27 train set of ITALIAN_PARTUT."
UD_27_ITALIAN_PARTUT_DEV = _UD_27_HOME + "UD_Italian-ParTUT/it_partut-ud-dev.conllu"
"UD_27 dev set of ITALIAN_PARTUT."
UD_27_ITALIAN_PARTUT_TEST = _UD_27_HOME + "UD_Italian-ParTUT/it_partut-ud-test.conllu"
"UD_27 test set of ITALIAN_PARTUT."
UD_27_ITALIAN_POSTWITA_TRAIN = _UD_27_HOME + "UD_Italian-PoSTWITA/it_postwita-ud-train.conllu"
"UD_27 train set of ITALIAN_POSTWITA."
UD_27_ITALIAN_POSTWITA_DEV = _UD_27_HOME + "UD_Italian-PoSTWITA/it_postwita-ud-dev.conllu"
"UD_27 dev set of ITALIAN_POSTWITA."
# UD_27 split paths, Italian-PoSTWITA (test) through Karelian-KKPP: each constant is
# _UD_27_HOME (defined outside this section) plus a treebank-relative .conllu path,
# and the bare string that follows it is its attribute docstring.
UD_27_ITALIAN_POSTWITA_TEST = _UD_27_HOME + "UD_Italian-PoSTWITA/it_postwita-ud-test.conllu"
"UD_27 test set of ITALIAN_POSTWITA."
UD_27_ITALIAN_TWITTIRO_TRAIN = _UD_27_HOME + "UD_Italian-TWITTIRO/it_twittiro-ud-train.conllu"
"UD_27 train set of ITALIAN_TWITTIRO."
UD_27_ITALIAN_TWITTIRO_DEV = _UD_27_HOME + "UD_Italian-TWITTIRO/it_twittiro-ud-dev.conllu"
"UD_27 dev set of ITALIAN_TWITTIRO."
UD_27_ITALIAN_TWITTIRO_TEST = _UD_27_HOME + "UD_Italian-TWITTIRO/it_twittiro-ud-test.conllu"
"UD_27 test set of ITALIAN_TWITTIRO."
UD_27_ITALIAN_VIT_TRAIN = _UD_27_HOME + "UD_Italian-VIT/it_vit-ud-train.conllu"
"UD_27 train set of ITALIAN_VIT."
UD_27_ITALIAN_VIT_DEV = _UD_27_HOME + "UD_Italian-VIT/it_vit-ud-dev.conllu"
"UD_27 dev set of ITALIAN_VIT."
UD_27_ITALIAN_VIT_TEST = _UD_27_HOME + "UD_Italian-VIT/it_vit-ud-test.conllu"
"UD_27 test set of ITALIAN_VIT."
UD_27_JAPANESE_BCCWJ_TRAIN = _UD_27_HOME + "UD_Japanese-BCCWJ/ja_bccwj-ud-train.conllu"
"UD_27 train set of JAPANESE_BCCWJ."
UD_27_JAPANESE_BCCWJ_DEV = _UD_27_HOME + "UD_Japanese-BCCWJ/ja_bccwj-ud-dev.conllu"
"UD_27 dev set of JAPANESE_BCCWJ."
UD_27_JAPANESE_BCCWJ_TEST = _UD_27_HOME + "UD_Japanese-BCCWJ/ja_bccwj-ud-test.conllu"
"UD_27 test set of JAPANESE_BCCWJ."
UD_27_JAPANESE_GSD_TRAIN = _UD_27_HOME + "UD_Japanese-GSD/ja_gsd-ud-train.conllu"
"UD_27 train set of JAPANESE_GSD."
UD_27_JAPANESE_GSD_DEV = _UD_27_HOME + "UD_Japanese-GSD/ja_gsd-ud-dev.conllu"
"UD_27 dev set of JAPANESE_GSD."
UD_27_JAPANESE_GSD_TEST = _UD_27_HOME + "UD_Japanese-GSD/ja_gsd-ud-test.conllu"
"UD_27 test set of JAPANESE_GSD."
UD_27_JAPANESE_MODERN_TEST = _UD_27_HOME + "UD_Japanese-Modern/ja_modern-ud-test.conllu"
"UD_27 test set of JAPANESE_MODERN."
UD_27_JAPANESE_PUD_TEST = _UD_27_HOME + "UD_Japanese-PUD/ja_pud-ud-test.conllu"
"UD_27 test set of JAPANESE_PUD."
UD_27_KARELIAN_KKPP_TEST = _UD_27_HOME + "UD_Karelian-KKPP/krl_kkpp-ud-test.conllu"
"UD_27 test set of KARELIAN_KKPP."
# UD_27 split paths, Kazakh-KTB through Latin-ITTB (dev): each constant is
# _UD_27_HOME (defined outside this section) plus a treebank-relative .conllu path,
# and the bare string that follows it is its attribute docstring.
UD_27_KAZAKH_KTB_TRAIN = _UD_27_HOME + "UD_Kazakh-KTB/kk_ktb-ud-train.conllu"
"UD_27 train set of KAZAKH_KTB."
UD_27_KAZAKH_KTB_TEST = _UD_27_HOME + "UD_Kazakh-KTB/kk_ktb-ud-test.conllu"
"UD_27 test set of KAZAKH_KTB."
UD_27_KHUNSARI_AHA_TEST = _UD_27_HOME + "UD_Khunsari-AHA/kfm_aha-ud-test.conllu"
"UD_27 test set of KHUNSARI_AHA."
UD_27_KOMI_PERMYAK_UH_TEST = _UD_27_HOME + "UD_Komi_Permyak-UH/koi_uh-ud-test.conllu"
"UD_27 test set of KOMI_PERMYAK_UH."
UD_27_KOMI_ZYRIAN_IKDP_TEST = _UD_27_HOME + "UD_Komi_Zyrian-IKDP/kpv_ikdp-ud-test.conllu"
"UD_27 test set of KOMI_ZYRIAN_IKDP."
UD_27_KOMI_ZYRIAN_LATTICE_TEST = _UD_27_HOME + "UD_Komi_Zyrian-Lattice/kpv_lattice-ud-test.conllu"
"UD_27 test set of KOMI_ZYRIAN_LATTICE."
UD_27_KOREAN_GSD_TRAIN = _UD_27_HOME + "UD_Korean-GSD/ko_gsd-ud-train.conllu"
"UD_27 train set of KOREAN_GSD."
UD_27_KOREAN_GSD_DEV = _UD_27_HOME + "UD_Korean-GSD/ko_gsd-ud-dev.conllu"
"UD_27 dev set of KOREAN_GSD."
UD_27_KOREAN_GSD_TEST = _UD_27_HOME + "UD_Korean-GSD/ko_gsd-ud-test.conllu"
"UD_27 test set of KOREAN_GSD."
UD_27_KOREAN_KAIST_TRAIN = _UD_27_HOME + "UD_Korean-Kaist/ko_kaist-ud-train.conllu"
"UD_27 train set of KOREAN_KAIST."
UD_27_KOREAN_KAIST_DEV = _UD_27_HOME + "UD_Korean-Kaist/ko_kaist-ud-dev.conllu"
"UD_27 dev set of KOREAN_KAIST."
UD_27_KOREAN_KAIST_TEST = _UD_27_HOME + "UD_Korean-Kaist/ko_kaist-ud-test.conllu"
"UD_27 test set of KOREAN_KAIST."
UD_27_KOREAN_PUD_TEST = _UD_27_HOME + "UD_Korean-PUD/ko_pud-ud-test.conllu"
"UD_27 test set of KOREAN_PUD."
UD_27_KURMANJI_MG_TRAIN = _UD_27_HOME + "UD_Kurmanji-MG/kmr_mg-ud-train.conllu"
"UD_27 train set of KURMANJI_MG."
UD_27_KURMANJI_MG_TEST = _UD_27_HOME + "UD_Kurmanji-MG/kmr_mg-ud-test.conllu"
"UD_27 test set of KURMANJI_MG."
UD_27_LATIN_ITTB_TRAIN = _UD_27_HOME + "UD_Latin-ITTB/la_ittb-ud-train.conllu"
"UD_27 train set of LATIN_ITTB."
UD_27_LATIN_ITTB_DEV = _UD_27_HOME + "UD_Latin-ITTB/la_ittb-ud-dev.conllu"
"UD_27 dev set of LATIN_ITTB."
# UD_27 split paths, Latin-ITTB (test) through Lithuanian-HSE (train): each constant is
# _UD_27_HOME (defined outside this section) plus a treebank-relative .conllu path,
# and the bare string that follows it is its attribute docstring.
UD_27_LATIN_ITTB_TEST = _UD_27_HOME + "UD_Latin-ITTB/la_ittb-ud-test.conllu"
"UD_27 test set of LATIN_ITTB."
UD_27_LATIN_LLCT_TRAIN = _UD_27_HOME + "UD_Latin-LLCT/la_llct-ud-train.conllu"
"UD_27 train set of LATIN_LLCT."
UD_27_LATIN_LLCT_DEV = _UD_27_HOME + "UD_Latin-LLCT/la_llct-ud-dev.conllu"
"UD_27 dev set of LATIN_LLCT."
UD_27_LATIN_LLCT_TEST = _UD_27_HOME + "UD_Latin-LLCT/la_llct-ud-test.conllu"
"UD_27 test set of LATIN_LLCT."
UD_27_LATIN_PROIEL_TRAIN = _UD_27_HOME + "UD_Latin-PROIEL/la_proiel-ud-train.conllu"
"UD_27 train set of LATIN_PROIEL."
UD_27_LATIN_PROIEL_DEV = _UD_27_HOME + "UD_Latin-PROIEL/la_proiel-ud-dev.conllu"
"UD_27 dev set of LATIN_PROIEL."
UD_27_LATIN_PROIEL_TEST = _UD_27_HOME + "UD_Latin-PROIEL/la_proiel-ud-test.conllu"
"UD_27 test set of LATIN_PROIEL."
UD_27_LATIN_PERSEUS_TRAIN = _UD_27_HOME + "UD_Latin-Perseus/la_perseus-ud-train.conllu"
"UD_27 train set of LATIN_PERSEUS."
UD_27_LATIN_PERSEUS_TEST = _UD_27_HOME + "UD_Latin-Perseus/la_perseus-ud-test.conllu"
"UD_27 test set of LATIN_PERSEUS."
UD_27_LATVIAN_LVTB_TRAIN = _UD_27_HOME + "UD_Latvian-LVTB/lv_lvtb-ud-train.conllu"
"UD_27 train set of LATVIAN_LVTB."
UD_27_LATVIAN_LVTB_DEV = _UD_27_HOME + "UD_Latvian-LVTB/lv_lvtb-ud-dev.conllu"
"UD_27 dev set of LATVIAN_LVTB."
UD_27_LATVIAN_LVTB_TEST = _UD_27_HOME + "UD_Latvian-LVTB/lv_lvtb-ud-test.conllu"
"UD_27 test set of LATVIAN_LVTB."
UD_27_LITHUANIAN_ALKSNIS_TRAIN = _UD_27_HOME + "UD_Lithuanian-ALKSNIS/lt_alksnis-ud-train.conllu"
"UD_27 train set of LITHUANIAN_ALKSNIS."
UD_27_LITHUANIAN_ALKSNIS_DEV = _UD_27_HOME + "UD_Lithuanian-ALKSNIS/lt_alksnis-ud-dev.conllu"
"UD_27 dev set of LITHUANIAN_ALKSNIS."
UD_27_LITHUANIAN_ALKSNIS_TEST = _UD_27_HOME + "UD_Lithuanian-ALKSNIS/lt_alksnis-ud-test.conllu"
"UD_27 test set of LITHUANIAN_ALKSNIS."
UD_27_LITHUANIAN_HSE_TRAIN = _UD_27_HOME + "UD_Lithuanian-HSE/lt_hse-ud-train.conllu"
"UD_27 train set of LITHUANIAN_HSE."
# UD_27 split paths, Lithuanian-HSE (dev) through Naija-NSC (dev): each constant is
# _UD_27_HOME (defined outside this section) plus a treebank-relative .conllu path,
# and the bare string that follows it is its attribute docstring.
UD_27_LITHUANIAN_HSE_DEV = _UD_27_HOME + "UD_Lithuanian-HSE/lt_hse-ud-dev.conllu"
"UD_27 dev set of LITHUANIAN_HSE."
UD_27_LITHUANIAN_HSE_TEST = _UD_27_HOME + "UD_Lithuanian-HSE/lt_hse-ud-test.conllu"
"UD_27 test set of LITHUANIAN_HSE."
UD_27_LIVVI_KKPP_TRAIN = _UD_27_HOME + "UD_Livvi-KKPP/olo_kkpp-ud-train.conllu"
"UD_27 train set of LIVVI_KKPP."
UD_27_LIVVI_KKPP_TEST = _UD_27_HOME + "UD_Livvi-KKPP/olo_kkpp-ud-test.conllu"
"UD_27 test set of LIVVI_KKPP."
UD_27_MALTESE_MUDT_TRAIN = _UD_27_HOME + "UD_Maltese-MUDT/mt_mudt-ud-train.conllu"
"UD_27 train set of MALTESE_MUDT."
UD_27_MALTESE_MUDT_DEV = _UD_27_HOME + "UD_Maltese-MUDT/mt_mudt-ud-dev.conllu"
"UD_27 dev set of MALTESE_MUDT."
UD_27_MALTESE_MUDT_TEST = _UD_27_HOME + "UD_Maltese-MUDT/mt_mudt-ud-test.conllu"
"UD_27 test set of MALTESE_MUDT."
UD_27_MANX_CADHAN_TEST = _UD_27_HOME + "UD_Manx-Cadhan/gv_cadhan-ud-test.conllu"
"UD_27 test set of MANX_CADHAN."
UD_27_MARATHI_UFAL_TRAIN = _UD_27_HOME + "UD_Marathi-UFAL/mr_ufal-ud-train.conllu"
"UD_27 train set of MARATHI_UFAL."
UD_27_MARATHI_UFAL_DEV = _UD_27_HOME + "UD_Marathi-UFAL/mr_ufal-ud-dev.conllu"
"UD_27 dev set of MARATHI_UFAL."
UD_27_MARATHI_UFAL_TEST = _UD_27_HOME + "UD_Marathi-UFAL/mr_ufal-ud-test.conllu"
"UD_27 test set of MARATHI_UFAL."
UD_27_MBYA_GUARANI_DOOLEY_TEST = _UD_27_HOME + "UD_Mbya_Guarani-Dooley/gun_dooley-ud-test.conllu"
"UD_27 test set of MBYA_GUARANI_DOOLEY."
UD_27_MBYA_GUARANI_THOMAS_TEST = _UD_27_HOME + "UD_Mbya_Guarani-Thomas/gun_thomas-ud-test.conllu"
"UD_27 test set of MBYA_GUARANI_THOMAS."
UD_27_MOKSHA_JR_TEST = _UD_27_HOME + "UD_Moksha-JR/mdf_jr-ud-test.conllu"
"UD_27 test set of MOKSHA_JR."
UD_27_MUNDURUKU_TUDET_TEST = _UD_27_HOME + "UD_Munduruku-TuDeT/myu_tudet-ud-test.conllu"
"UD_27 test set of MUNDURUKU_TUDET."
UD_27_NAIJA_NSC_TRAIN = _UD_27_HOME + "UD_Naija-NSC/pcm_nsc-ud-train.conllu"
"UD_27 train set of NAIJA_NSC."
UD_27_NAIJA_NSC_DEV = _UD_27_HOME + "UD_Naija-NSC/pcm_nsc-ud-dev.conllu"
"UD_27 dev set of NAIJA_NSC."
# UD_27 split paths, Naija-NSC (test) through Old_Church_Slavonic-PROIEL (train): each
# constant is _UD_27_HOME (defined outside this section) plus a treebank-relative
# .conllu path, and the bare string that follows it is its attribute docstring.
UD_27_NAIJA_NSC_TEST = _UD_27_HOME + "UD_Naija-NSC/pcm_nsc-ud-test.conllu"
"UD_27 test set of NAIJA_NSC."
UD_27_NAYINI_AHA_TEST = _UD_27_HOME + "UD_Nayini-AHA/nyq_aha-ud-test.conllu"
"UD_27 test set of NAYINI_AHA."
UD_27_NORTH_SAMI_GIELLA_TRAIN = _UD_27_HOME + "UD_North_Sami-Giella/sme_giella-ud-train.conllu"
"UD_27 train set of NORTH_SAMI_GIELLA."
UD_27_NORTH_SAMI_GIELLA_TEST = _UD_27_HOME + "UD_North_Sami-Giella/sme_giella-ud-test.conllu"
"UD_27 test set of NORTH_SAMI_GIELLA."
UD_27_NORWEGIAN_BOKMAAL_TRAIN = _UD_27_HOME + "UD_Norwegian-Bokmaal/no_bokmaal-ud-train.conllu"
"UD_27 train set of NORWEGIAN_BOKMAAL."
UD_27_NORWEGIAN_BOKMAAL_DEV = _UD_27_HOME + "UD_Norwegian-Bokmaal/no_bokmaal-ud-dev.conllu"
"UD_27 dev set of NORWEGIAN_BOKMAAL."
UD_27_NORWEGIAN_BOKMAAL_TEST = _UD_27_HOME + "UD_Norwegian-Bokmaal/no_bokmaal-ud-test.conllu"
"UD_27 test set of NORWEGIAN_BOKMAAL."
UD_27_NORWEGIAN_NYNORSK_TRAIN = _UD_27_HOME + "UD_Norwegian-Nynorsk/no_nynorsk-ud-train.conllu"
"UD_27 train set of NORWEGIAN_NYNORSK."
UD_27_NORWEGIAN_NYNORSK_DEV = _UD_27_HOME + "UD_Norwegian-Nynorsk/no_nynorsk-ud-dev.conllu"
"UD_27 dev set of NORWEGIAN_NYNORSK."
UD_27_NORWEGIAN_NYNORSK_TEST = _UD_27_HOME + "UD_Norwegian-Nynorsk/no_nynorsk-ud-test.conllu"
"UD_27 test set of NORWEGIAN_NYNORSK."
UD_27_NORWEGIAN_NYNORSKLIA_TRAIN = _UD_27_HOME + "UD_Norwegian-NynorskLIA/no_nynorsklia-ud-train.conllu"
"UD_27 train set of NORWEGIAN_NYNORSKLIA."
UD_27_NORWEGIAN_NYNORSKLIA_DEV = _UD_27_HOME + "UD_Norwegian-NynorskLIA/no_nynorsklia-ud-dev.conllu"
"UD_27 dev set of NORWEGIAN_NYNORSKLIA."
UD_27_NORWEGIAN_NYNORSKLIA_TEST = _UD_27_HOME + "UD_Norwegian-NynorskLIA/no_nynorsklia-ud-test.conllu"
"UD_27 test set of NORWEGIAN_NYNORSKLIA."
UD_27_OLD_CHURCH_SLAVONIC_PROIEL_TRAIN = _UD_27_HOME + "UD_Old_Church_Slavonic-PROIEL/cu_proiel-ud-train.conllu"
"UD_27 train set of OLD_CHURCH_SLAVONIC_PROIEL."
# UD_27 split paths, Old_Church_Slavonic-PROIEL (dev) through Persian-Seraji (train):
# each constant is _UD_27_HOME (defined outside this section) plus a treebank-relative
# .conllu path, and the bare string that follows it is its attribute docstring.
UD_27_OLD_CHURCH_SLAVONIC_PROIEL_DEV = _UD_27_HOME + "UD_Old_Church_Slavonic-PROIEL/cu_proiel-ud-dev.conllu"
"UD_27 dev set of OLD_CHURCH_SLAVONIC_PROIEL."
UD_27_OLD_CHURCH_SLAVONIC_PROIEL_TEST = _UD_27_HOME + "UD_Old_Church_Slavonic-PROIEL/cu_proiel-ud-test.conllu"
"UD_27 test set of OLD_CHURCH_SLAVONIC_PROIEL."
UD_27_OLD_FRENCH_SRCMF_TRAIN = _UD_27_HOME + "UD_Old_French-SRCMF/fro_srcmf-ud-train.conllu"
"UD_27 train set of OLD_FRENCH_SRCMF."
UD_27_OLD_FRENCH_SRCMF_DEV = _UD_27_HOME + "UD_Old_French-SRCMF/fro_srcmf-ud-dev.conllu"
"UD_27 dev set of OLD_FRENCH_SRCMF."
UD_27_OLD_FRENCH_SRCMF_TEST = _UD_27_HOME + "UD_Old_French-SRCMF/fro_srcmf-ud-test.conllu"
"UD_27 test set of OLD_FRENCH_SRCMF."
UD_27_OLD_RUSSIAN_RNC_TRAIN = _UD_27_HOME + "UD_Old_Russian-RNC/orv_rnc-ud-train.conllu"
"UD_27 train set of OLD_RUSSIAN_RNC."
UD_27_OLD_RUSSIAN_RNC_TEST = _UD_27_HOME + "UD_Old_Russian-RNC/orv_rnc-ud-test.conllu"
"UD_27 test set of OLD_RUSSIAN_RNC."
UD_27_OLD_RUSSIAN_TOROT_TRAIN = _UD_27_HOME + "UD_Old_Russian-TOROT/orv_torot-ud-train.conllu"
"UD_27 train set of OLD_RUSSIAN_TOROT."
UD_27_OLD_RUSSIAN_TOROT_DEV = _UD_27_HOME + "UD_Old_Russian-TOROT/orv_torot-ud-dev.conllu"
"UD_27 dev set of OLD_RUSSIAN_TOROT."
UD_27_OLD_RUSSIAN_TOROT_TEST = _UD_27_HOME + "UD_Old_Russian-TOROT/orv_torot-ud-test.conllu"
"UD_27 test set of OLD_RUSSIAN_TOROT."
UD_27_OLD_TURKISH_TONQQ_TEST = _UD_27_HOME + "UD_Old_Turkish-Tonqq/otk_tonqq-ud-test.conllu"
"UD_27 test set of OLD_TURKISH_TONQQ."
UD_27_PERSIAN_PERDT_TRAIN = _UD_27_HOME + "UD_Persian-PerDT/fa_perdt-ud-train.conllu"
"UD_27 train set of PERSIAN_PERDT."
UD_27_PERSIAN_PERDT_DEV = _UD_27_HOME + "UD_Persian-PerDT/fa_perdt-ud-dev.conllu"
"UD_27 dev set of PERSIAN_PERDT."
UD_27_PERSIAN_PERDT_TEST = _UD_27_HOME + "UD_Persian-PerDT/fa_perdt-ud-test.conllu"
"UD_27 test set of PERSIAN_PERDT."
UD_27_PERSIAN_SERAJI_TRAIN = _UD_27_HOME + "UD_Persian-Seraji/fa_seraji-ud-train.conllu"
"UD_27 train set of PERSIAN_SERAJI."
# UD_27 split paths, Persian-Seraji (dev) through Portuguese-PUD: each constant is
# _UD_27_HOME (defined outside this section) plus a treebank-relative .conllu path,
# and the bare string that follows it is its attribute docstring.
UD_27_PERSIAN_SERAJI_DEV = _UD_27_HOME + "UD_Persian-Seraji/fa_seraji-ud-dev.conllu"
"UD_27 dev set of PERSIAN_SERAJI."
UD_27_PERSIAN_SERAJI_TEST = _UD_27_HOME + "UD_Persian-Seraji/fa_seraji-ud-test.conllu"
"UD_27 test set of PERSIAN_SERAJI."
UD_27_POLISH_LFG_TRAIN = _UD_27_HOME + "UD_Polish-LFG/pl_lfg-ud-train.conllu"
"UD_27 train set of POLISH_LFG."
UD_27_POLISH_LFG_DEV = _UD_27_HOME + "UD_Polish-LFG/pl_lfg-ud-dev.conllu"
"UD_27 dev set of POLISH_LFG."
UD_27_POLISH_LFG_TEST = _UD_27_HOME + "UD_Polish-LFG/pl_lfg-ud-test.conllu"
"UD_27 test set of POLISH_LFG."
UD_27_POLISH_PDB_TRAIN = _UD_27_HOME + "UD_Polish-PDB/pl_pdb-ud-train.conllu"
"UD_27 train set of POLISH_PDB."
UD_27_POLISH_PDB_DEV = _UD_27_HOME + "UD_Polish-PDB/pl_pdb-ud-dev.conllu"
"UD_27 dev set of POLISH_PDB."
UD_27_POLISH_PDB_TEST = _UD_27_HOME + "UD_Polish-PDB/pl_pdb-ud-test.conllu"
"UD_27 test set of POLISH_PDB."
UD_27_POLISH_PUD_TEST = _UD_27_HOME + "UD_Polish-PUD/pl_pud-ud-test.conllu"
"UD_27 test set of POLISH_PUD."
UD_27_PORTUGUESE_BOSQUE_TRAIN = _UD_27_HOME + "UD_Portuguese-Bosque/pt_bosque-ud-train.conllu"
"UD_27 train set of PORTUGUESE_BOSQUE."
UD_27_PORTUGUESE_BOSQUE_DEV = _UD_27_HOME + "UD_Portuguese-Bosque/pt_bosque-ud-dev.conllu"
"UD_27 dev set of PORTUGUESE_BOSQUE."
UD_27_PORTUGUESE_BOSQUE_TEST = _UD_27_HOME + "UD_Portuguese-Bosque/pt_bosque-ud-test.conllu"
"UD_27 test set of PORTUGUESE_BOSQUE."
UD_27_PORTUGUESE_GSD_TRAIN = _UD_27_HOME + "UD_Portuguese-GSD/pt_gsd-ud-train.conllu"
"UD_27 train set of PORTUGUESE_GSD."
UD_27_PORTUGUESE_GSD_DEV = _UD_27_HOME + "UD_Portuguese-GSD/pt_gsd-ud-dev.conllu"
"UD_27 dev set of PORTUGUESE_GSD."
UD_27_PORTUGUESE_GSD_TEST = _UD_27_HOME + "UD_Portuguese-GSD/pt_gsd-ud-test.conllu"
"UD_27 test set of PORTUGUESE_GSD."
UD_27_PORTUGUESE_PUD_TEST = _UD_27_HOME + "UD_Portuguese-PUD/pt_pud-ud-test.conllu"
"UD_27 test set of PORTUGUESE_PUD."
# UD_27 split paths, Romanian-Nonstandard through Russian-SynTagRus (dev): each constant
# is _UD_27_HOME (defined outside this section) plus a treebank-relative .conllu path,
# and the bare string that follows it is its attribute docstring.
UD_27_ROMANIAN_NONSTANDARD_TRAIN = _UD_27_HOME + "UD_Romanian-Nonstandard/ro_nonstandard-ud-train.conllu"
"UD_27 train set of ROMANIAN_NONSTANDARD."
UD_27_ROMANIAN_NONSTANDARD_DEV = _UD_27_HOME + "UD_Romanian-Nonstandard/ro_nonstandard-ud-dev.conllu"
"UD_27 dev set of ROMANIAN_NONSTANDARD."
UD_27_ROMANIAN_NONSTANDARD_TEST = _UD_27_HOME + "UD_Romanian-Nonstandard/ro_nonstandard-ud-test.conllu"
"UD_27 test set of ROMANIAN_NONSTANDARD."
UD_27_ROMANIAN_RRT_TRAIN = _UD_27_HOME + "UD_Romanian-RRT/ro_rrt-ud-train.conllu"
"UD_27 train set of ROMANIAN_RRT."
UD_27_ROMANIAN_RRT_DEV = _UD_27_HOME + "UD_Romanian-RRT/ro_rrt-ud-dev.conllu"
"UD_27 dev set of ROMANIAN_RRT."
UD_27_ROMANIAN_RRT_TEST = _UD_27_HOME + "UD_Romanian-RRT/ro_rrt-ud-test.conllu"
"UD_27 test set of ROMANIAN_RRT."
UD_27_ROMANIAN_SIMONERO_TRAIN = _UD_27_HOME + "UD_Romanian-SiMoNERo/ro_simonero-ud-train.conllu"
"UD_27 train set of ROMANIAN_SIMONERO."
UD_27_ROMANIAN_SIMONERO_DEV = _UD_27_HOME + "UD_Romanian-SiMoNERo/ro_simonero-ud-dev.conllu"
"UD_27 dev set of ROMANIAN_SIMONERO."
UD_27_ROMANIAN_SIMONERO_TEST = _UD_27_HOME + "UD_Romanian-SiMoNERo/ro_simonero-ud-test.conllu"
"UD_27 test set of ROMANIAN_SIMONERO."
UD_27_RUSSIAN_GSD_TRAIN = _UD_27_HOME + "UD_Russian-GSD/ru_gsd-ud-train.conllu"
"UD_27 train set of RUSSIAN_GSD."
UD_27_RUSSIAN_GSD_DEV = _UD_27_HOME + "UD_Russian-GSD/ru_gsd-ud-dev.conllu"
"UD_27 dev set of RUSSIAN_GSD."
UD_27_RUSSIAN_GSD_TEST = _UD_27_HOME + "UD_Russian-GSD/ru_gsd-ud-test.conllu"
"UD_27 test set of RUSSIAN_GSD."
UD_27_RUSSIAN_PUD_TEST = _UD_27_HOME + "UD_Russian-PUD/ru_pud-ud-test.conllu"
"UD_27 test set of RUSSIAN_PUD."
UD_27_RUSSIAN_SYNTAGRUS_TRAIN = _UD_27_HOME + "UD_Russian-SynTagRus/ru_syntagrus-ud-train.conllu"
"UD_27 train set of RUSSIAN_SYNTAGRUS."
UD_27_RUSSIAN_SYNTAGRUS_DEV = _UD_27_HOME + "UD_Russian-SynTagRus/ru_syntagrus-ud-dev.conllu"
"UD_27 dev set of RUSSIAN_SYNTAGRUS."
UD_27_RUSSIAN_SYNTAGRUS_TEST = _UD_27_HOME + "UD_Russian-SynTagRus/ru_syntagrus-ud-test.conllu" "UD_27 test set of RUSSIAN_SYNTAGRUS." UD_27_RUSSIAN_TAIGA_TRAIN = _UD_27_HOME + "UD_Russian-Taiga/ru_taiga-ud-train.conllu" "UD_27 train set of RUSSIAN_TAIGA." UD_27_RUSSIAN_TAIGA_DEV = _UD_27_HOME + "UD_Russian-Taiga/ru_taiga-ud-dev.conllu" "UD_27 dev set of RUSSIAN_TAIGA." UD_27_RUSSIAN_TAIGA_TEST = _UD_27_HOME + "UD_Russian-Taiga/ru_taiga-ud-test.conllu" "UD_27 test set of RUSSIAN_TAIGA." UD_27_SANSKRIT_UFAL_TEST = _UD_27_HOME + "UD_Sanskrit-UFAL/sa_ufal-ud-test.conllu" "UD_27 test set of SANSKRIT_UFAL." UD_27_SANSKRIT_VEDIC_TRAIN = _UD_27_HOME + "UD_Sanskrit-Vedic/sa_vedic-ud-train.conllu" "UD_27 train set of SANSKRIT_VEDIC." UD_27_SANSKRIT_VEDIC_TEST = _UD_27_HOME + "UD_Sanskrit-Vedic/sa_vedic-ud-test.conllu" "UD_27 test set of SANSKRIT_VEDIC." UD_27_SCOTTISH_GAELIC_ARCOSG_TRAIN = _UD_27_HOME + "UD_Scottish_Gaelic-ARCOSG/gd_arcosg-ud-train.conllu" "UD_27 train set of SCOTTISH_GAELIC_ARCOSG." UD_27_SCOTTISH_GAELIC_ARCOSG_DEV = _UD_27_HOME + "UD_Scottish_Gaelic-ARCOSG/gd_arcosg-ud-dev.conllu" "UD_27 dev set of SCOTTISH_GAELIC_ARCOSG." UD_27_SCOTTISH_GAELIC_ARCOSG_TEST = _UD_27_HOME + "UD_Scottish_Gaelic-ARCOSG/gd_arcosg-ud-test.conllu" "UD_27 test set of SCOTTISH_GAELIC_ARCOSG." UD_27_SERBIAN_SET_TRAIN = _UD_27_HOME + "UD_Serbian-SET/sr_set-ud-train.conllu" "UD_27 train set of SERBIAN_SET." UD_27_SERBIAN_SET_DEV = _UD_27_HOME + "UD_Serbian-SET/sr_set-ud-dev.conllu" "UD_27 dev set of SERBIAN_SET." UD_27_SERBIAN_SET_TEST = _UD_27_HOME + "UD_Serbian-SET/sr_set-ud-test.conllu" "UD_27 test set of SERBIAN_SET." UD_27_SKOLT_SAMI_GIELLAGAS_TEST = _UD_27_HOME + "UD_Skolt_Sami-Giellagas/sms_giellagas-ud-test.conllu" "UD_27 test set of SKOLT_SAMI_GIELLAGAS." UD_27_SLOVAK_SNK_TRAIN = _UD_27_HOME + "UD_Slovak-SNK/sk_snk-ud-train.conllu" "UD_27 train set of SLOVAK_SNK." UD_27_SLOVAK_SNK_DEV = _UD_27_HOME + "UD_Slovak-SNK/sk_snk-ud-dev.conllu" "UD_27 dev set of SLOVAK_SNK." 
UD_27_SLOVAK_SNK_TEST = _UD_27_HOME + "UD_Slovak-SNK/sk_snk-ud-test.conllu" "UD_27 test set of SLOVAK_SNK." UD_27_SLOVENIAN_SSJ_TRAIN = _UD_27_HOME + "UD_Slovenian-SSJ/sl_ssj-ud-train.conllu" "UD_27 train set of SLOVENIAN_SSJ." UD_27_SLOVENIAN_SSJ_DEV = _UD_27_HOME + "UD_Slovenian-SSJ/sl_ssj-ud-dev.conllu" "UD_27 dev set of SLOVENIAN_SSJ." UD_27_SLOVENIAN_SSJ_TEST = _UD_27_HOME + "UD_Slovenian-SSJ/sl_ssj-ud-test.conllu" "UD_27 test set of SLOVENIAN_SSJ." UD_27_SLOVENIAN_SST_TRAIN = _UD_27_HOME + "UD_Slovenian-SST/sl_sst-ud-train.conllu" "UD_27 train set of SLOVENIAN_SST." UD_27_SLOVENIAN_SST_TEST = _UD_27_HOME + "UD_Slovenian-SST/sl_sst-ud-test.conllu" "UD_27 test set of SLOVENIAN_SST." UD_27_SOI_AHA_TEST = _UD_27_HOME + "UD_Soi-AHA/soj_aha-ud-test.conllu" "UD_27 test set of SOI_AHA." UD_27_SOUTH_LEVANTINE_ARABIC_MADAR_TEST = _UD_27_HOME + "UD_South_Levantine_Arabic-MADAR/ajp_madar-ud-test.conllu" "UD_27 test set of SOUTH_LEVANTINE_ARABIC_MADAR." UD_27_SPANISH_ANCORA_TRAIN = _UD_27_HOME + "UD_Spanish-AnCora/es_ancora-ud-train.conllu" "UD_27 train set of SPANISH_ANCORA." UD_27_SPANISH_ANCORA_DEV = _UD_27_HOME + "UD_Spanish-AnCora/es_ancora-ud-dev.conllu" "UD_27 dev set of SPANISH_ANCORA." UD_27_SPANISH_ANCORA_TEST = _UD_27_HOME + "UD_Spanish-AnCora/es_ancora-ud-test.conllu" "UD_27 test set of SPANISH_ANCORA." UD_27_SPANISH_GSD_TRAIN = _UD_27_HOME + "UD_Spanish-GSD/es_gsd-ud-train.conllu" "UD_27 train set of SPANISH_GSD." UD_27_SPANISH_GSD_DEV = _UD_27_HOME + "UD_Spanish-GSD/es_gsd-ud-dev.conllu" "UD_27 dev set of SPANISH_GSD." UD_27_SPANISH_GSD_TEST = _UD_27_HOME + "UD_Spanish-GSD/es_gsd-ud-test.conllu" "UD_27 test set of SPANISH_GSD." UD_27_SPANISH_PUD_TEST = _UD_27_HOME + "UD_Spanish-PUD/es_pud-ud-test.conllu" "UD_27 test set of SPANISH_PUD." UD_27_SWEDISH_LINES_TRAIN = _UD_27_HOME + "UD_Swedish-LinES/sv_lines-ud-train.conllu" "UD_27 train set of SWEDISH_LINES." 
UD_27_SWEDISH_LINES_DEV = _UD_27_HOME + "UD_Swedish-LinES/sv_lines-ud-dev.conllu" "UD_27 dev set of SWEDISH_LINES." UD_27_SWEDISH_LINES_TEST = _UD_27_HOME + "UD_Swedish-LinES/sv_lines-ud-test.conllu" "UD_27 test set of SWEDISH_LINES." UD_27_SWEDISH_PUD_TEST = _UD_27_HOME + "UD_Swedish-PUD/sv_pud-ud-test.conllu" "UD_27 test set of SWEDISH_PUD." UD_27_SWEDISH_TALBANKEN_TRAIN = _UD_27_HOME + "UD_Swedish-Talbanken/sv_talbanken-ud-train.conllu" "UD_27 train set of SWEDISH_TALBANKEN." UD_27_SWEDISH_TALBANKEN_DEV = _UD_27_HOME + "UD_Swedish-Talbanken/sv_talbanken-ud-dev.conllu" "UD_27 dev set of SWEDISH_TALBANKEN." UD_27_SWEDISH_TALBANKEN_TEST = _UD_27_HOME + "UD_Swedish-Talbanken/sv_talbanken-ud-test.conllu" "UD_27 test set of SWEDISH_TALBANKEN." UD_27_SWEDISH_SIGN_LANGUAGE_SSLC_TRAIN = _UD_27_HOME + "UD_Swedish_Sign_Language-SSLC/swl_sslc-ud-train.conllu" "UD_27 train set of SWEDISH_SIGN_LANGUAGE_SSLC." UD_27_SWEDISH_SIGN_LANGUAGE_SSLC_DEV = _UD_27_HOME + "UD_Swedish_Sign_Language-SSLC/swl_sslc-ud-dev.conllu" "UD_27 dev set of SWEDISH_SIGN_LANGUAGE_SSLC." UD_27_SWEDISH_SIGN_LANGUAGE_SSLC_TEST = _UD_27_HOME + "UD_Swedish_Sign_Language-SSLC/swl_sslc-ud-test.conllu" "UD_27 test set of SWEDISH_SIGN_LANGUAGE_SSLC." UD_27_SWISS_GERMAN_UZH_TEST = _UD_27_HOME + "UD_Swiss_German-UZH/gsw_uzh-ud-test.conllu" "UD_27 test set of SWISS_GERMAN_UZH." UD_27_TAGALOG_TRG_TEST = _UD_27_HOME + "UD_Tagalog-TRG/tl_trg-ud-test.conllu" "UD_27 test set of TAGALOG_TRG." UD_27_TAGALOG_UGNAYAN_TEST = _UD_27_HOME + "UD_Tagalog-Ugnayan/tl_ugnayan-ud-test.conllu" "UD_27 test set of TAGALOG_UGNAYAN." UD_27_TAMIL_MWTT_TEST = _UD_27_HOME + "UD_Tamil-MWTT/ta_mwtt-ud-test.conllu" "UD_27 test set of TAMIL_MWTT." UD_27_TAMIL_TTB_TRAIN = _UD_27_HOME + "UD_Tamil-TTB/ta_ttb-ud-train.conllu" "UD_27 train set of TAMIL_TTB." UD_27_TAMIL_TTB_DEV = _UD_27_HOME + "UD_Tamil-TTB/ta_ttb-ud-dev.conllu" "UD_27 dev set of TAMIL_TTB." 
UD_27_TAMIL_TTB_TEST = _UD_27_HOME + "UD_Tamil-TTB/ta_ttb-ud-test.conllu" "UD_27 test set of TAMIL_TTB." UD_27_TELUGU_MTG_TRAIN = _UD_27_HOME + "UD_Telugu-MTG/te_mtg-ud-train.conllu" "UD_27 train set of TELUGU_MTG." UD_27_TELUGU_MTG_DEV = _UD_27_HOME + "UD_Telugu-MTG/te_mtg-ud-dev.conllu" "UD_27 dev set of TELUGU_MTG." UD_27_TELUGU_MTG_TEST = _UD_27_HOME + "UD_Telugu-MTG/te_mtg-ud-test.conllu" "UD_27 test set of TELUGU_MTG." UD_27_THAI_PUD_TEST = _UD_27_HOME + "UD_Thai-PUD/th_pud-ud-test.conllu" "UD_27 test set of THAI_PUD." UD_27_TUPINAMBA_TUDET_TEST = _UD_27_HOME + "UD_Tupinamba-TuDeT/tpn_tudet-ud-test.conllu" "UD_27 test set of TUPINAMBA_TUDET." UD_27_TURKISH_BOUN_TRAIN = _UD_27_HOME + "UD_Turkish-BOUN/tr_boun-ud-train.conllu" "UD_27 train set of TURKISH_BOUN." UD_27_TURKISH_BOUN_DEV = _UD_27_HOME + "UD_Turkish-BOUN/tr_boun-ud-dev.conllu" "UD_27 dev set of TURKISH_BOUN." UD_27_TURKISH_BOUN_TEST = _UD_27_HOME + "UD_Turkish-BOUN/tr_boun-ud-test.conllu" "UD_27 test set of TURKISH_BOUN." UD_27_TURKISH_GB_TEST = _UD_27_HOME + "UD_Turkish-GB/tr_gb-ud-test.conllu" "UD_27 test set of TURKISH_GB." UD_27_TURKISH_IMST_TRAIN = _UD_27_HOME + "UD_Turkish-IMST/tr_imst-ud-train.conllu" "UD_27 train set of TURKISH_IMST." UD_27_TURKISH_IMST_DEV = _UD_27_HOME + "UD_Turkish-IMST/tr_imst-ud-dev.conllu" "UD_27 dev set of TURKISH_IMST." UD_27_TURKISH_IMST_TEST = _UD_27_HOME + "UD_Turkish-IMST/tr_imst-ud-test.conllu" "UD_27 test set of TURKISH_IMST." UD_27_TURKISH_PUD_TEST = _UD_27_HOME + "UD_Turkish-PUD/tr_pud-ud-test.conllu" "UD_27 test set of TURKISH_PUD." UD_27_TURKISH_GERMAN_SAGT_TRAIN = _UD_27_HOME + "UD_Turkish_German-SAGT/qtd_sagt-ud-train.conllu" "UD_27 train set of TURKISH_GERMAN_SAGT." UD_27_TURKISH_GERMAN_SAGT_DEV = _UD_27_HOME + "UD_Turkish_German-SAGT/qtd_sagt-ud-dev.conllu" "UD_27 dev set of TURKISH_GERMAN_SAGT." UD_27_TURKISH_GERMAN_SAGT_TEST = _UD_27_HOME + "UD_Turkish_German-SAGT/qtd_sagt-ud-test.conllu" "UD_27 test set of TURKISH_GERMAN_SAGT." 
UD_27_UKRAINIAN_IU_TRAIN = _UD_27_HOME + "UD_Ukrainian-IU/uk_iu-ud-train.conllu" "UD_27 train set of UKRAINIAN_IU." UD_27_UKRAINIAN_IU_DEV = _UD_27_HOME + "UD_Ukrainian-IU/uk_iu-ud-dev.conllu" "UD_27 dev set of UKRAINIAN_IU." UD_27_UKRAINIAN_IU_TEST = _UD_27_HOME + "UD_Ukrainian-IU/uk_iu-ud-test.conllu" "UD_27 test set of UKRAINIAN_IU." UD_27_UPPER_SORBIAN_UFAL_TRAIN = _UD_27_HOME + "UD_Upper_Sorbian-UFAL/hsb_ufal-ud-train.conllu" "UD_27 train set of UPPER_SORBIAN_UFAL." UD_27_UPPER_SORBIAN_UFAL_TEST = _UD_27_HOME + "UD_Upper_Sorbian-UFAL/hsb_ufal-ud-test.conllu" "UD_27 test set of UPPER_SORBIAN_UFAL." UD_27_URDU_UDTB_TRAIN = _UD_27_HOME + "UD_Urdu-UDTB/ur_udtb-ud-train.conllu" "UD_27 train set of URDU_UDTB." UD_27_URDU_UDTB_DEV = _UD_27_HOME + "UD_Urdu-UDTB/ur_udtb-ud-dev.conllu" "UD_27 dev set of URDU_UDTB." UD_27_URDU_UDTB_TEST = _UD_27_HOME + "UD_Urdu-UDTB/ur_udtb-ud-test.conllu" "UD_27 test set of URDU_UDTB." UD_27_UYGHUR_UDT_TRAIN = _UD_27_HOME + "UD_Uyghur-UDT/ug_udt-ud-train.conllu" "UD_27 train set of UYGHUR_UDT." UD_27_UYGHUR_UDT_DEV = _UD_27_HOME + "UD_Uyghur-UDT/ug_udt-ud-dev.conllu" "UD_27 dev set of UYGHUR_UDT." UD_27_UYGHUR_UDT_TEST = _UD_27_HOME + "UD_Uyghur-UDT/ug_udt-ud-test.conllu" "UD_27 test set of UYGHUR_UDT." UD_27_VIETNAMESE_VTB_TRAIN = _UD_27_HOME + "UD_Vietnamese-VTB/vi_vtb-ud-train.conllu" "UD_27 train set of VIETNAMESE_VTB." UD_27_VIETNAMESE_VTB_DEV = _UD_27_HOME + "UD_Vietnamese-VTB/vi_vtb-ud-dev.conllu" "UD_27 dev set of VIETNAMESE_VTB." UD_27_VIETNAMESE_VTB_TEST = _UD_27_HOME + "UD_Vietnamese-VTB/vi_vtb-ud-test.conllu" "UD_27 test set of VIETNAMESE_VTB." UD_27_WARLPIRI_UFAL_TEST = _UD_27_HOME + "UD_Warlpiri-UFAL/wbp_ufal-ud-test.conllu" "UD_27 test set of WARLPIRI_UFAL." UD_27_WELSH_CCG_TRAIN = _UD_27_HOME + "UD_Welsh-CCG/cy_ccg-ud-train.conllu" "UD_27 train set of WELSH_CCG." UD_27_WELSH_CCG_TEST = _UD_27_HOME + "UD_Welsh-CCG/cy_ccg-ud-test.conllu" "UD_27 test set of WELSH_CCG." 
UD_27_WOLOF_WTB_TRAIN = _UD_27_HOME + "UD_Wolof-WTB/wo_wtb-ud-train.conllu" "UD_27 train set of WOLOF_WTB." UD_27_WOLOF_WTB_DEV = _UD_27_HOME + "UD_Wolof-WTB/wo_wtb-ud-dev.conllu" "UD_27 dev set of WOLOF_WTB." UD_27_WOLOF_WTB_TEST = _UD_27_HOME + "UD_Wolof-WTB/wo_wtb-ud-test.conllu" "UD_27 test set of WOLOF_WTB." UD_27_YORUBA_YTB_TEST = _UD_27_HOME + "UD_Yoruba-YTB/yo_ytb-ud-test.conllu" "UD_27 test set of YORUBA_YTB." ================================================ FILE: hanlp/datasets/parsing/ud/ud27m.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2020-05-21 20:39 import os from hanlp.datasets.parsing.ud import concat_treebanks from hanlp.datasets.parsing.ud.ud27 import _UD_27_HOME _UD_27_MULTILINGUAL_HOME = concat_treebanks(_UD_27_HOME, '2.7') UD_27_MULTILINGUAL_TRAIN = os.path.join(_UD_27_MULTILINGUAL_HOME, 'train.conllu') "Training set of multilingual UD_27 obtained by concatenating all training sets." UD_27_MULTILINGUAL_DEV = os.path.join(_UD_27_MULTILINGUAL_HOME, 'dev.conllu') "Dev set of multilingual UD_27 obtained by concatenating all dev sets." UD_27_MULTILINGUAL_TEST = os.path.join(_UD_27_MULTILINGUAL_HOME, 'test.conllu') "Test set of multilingual UD_27 obtained by concatenating all test sets." 
================================================ FILE: hanlp/datasets/pos/__init__.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2019-12-28 22:50 ================================================ FILE: hanlp/datasets/pos/ctb5.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2019-12-28 22:51 _CTB5_POS_HOME = 'http://file.hankcs.com/corpus/ctb5.1-pos.zip' CTB5_POS_TRAIN = f'{_CTB5_POS_HOME}#train.tsv' '''PoS training set for CTB5.''' CTB5_POS_DEV = f'{_CTB5_POS_HOME}#dev.tsv' '''PoS dev set for CTB5.''' CTB5_POS_TEST = f'{_CTB5_POS_HOME}#test.tsv' '''PoS test set for CTB5.''' ================================================ FILE: hanlp/datasets/qa/__init__.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2020-03-20 19:17 ================================================ FILE: hanlp/datasets/qa/hotpotqa.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2020-03-20 19:46 from enum import Enum, auto import torch import ujson from torch.nn.utils.rnn import pad_sequence from hanlp.common.dataset import TransformableDataset from hanlp_common.util import merge_list_of_dict HOTPOT_QA_TRAIN = 'http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_train_v1.1.json' HOTPOT_QA_DISTRACTOR_DEV = 'http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_distractor_v1.json' HOTPOT_QA_FULLWIKI_DEV = 'http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_fullwiki_v1.json' class HotpotQADataset(TransformableDataset): def load_file(self, filepath): with open(filepath) as fd: return ujson.load(fd) class BuildGraph(object): def __init__(self, dst='graph') -> None: super().__init__() self.dst = dst def __call__(self, sample: dict): sample[self.dst] = build_graph(sample) return sample def hotpotqa_collate_fn(samples): batch = merge_list_of_dict(samples) max_seq_len = len(max([x['graph'] for x in samples], 
def flat_sentence(sample: dict) -> dict:
    """Store a lower-cased token list per parsed sentence under ``sample['token']``.

    Args:
        sample: A sample with ``parsed_sentences``; the first element of each
            parsed sentence is its token sequence.

    Returns:
        The same ``sample`` object, mutated in place.
    """
    # Every sentence is prefixed with the literal placeholder 'bos'.
    sample['token'] = [['bos'] + [x.lower() for x in sent[0]] for sent in sample['parsed_sentences']]
    return sample


def create_sp_label(sample: dict) -> dict:
    """Create one 0/1 supporting-fact label per context sentence.

    Sentences are visited in ``sample['context']`` order; a sentence is labelled
    ``1`` when its ``(title, index)`` pair occurs in ``sample['supporting_facts']``.

    Args:
        sample: A sample with ``context`` (``(title, sentences)`` pairs) and
            ``supporting_facts`` (``(title, sentence_index)`` pairs).

    Returns:
        The same ``sample`` object with ``sp_label`` added.
    """
    # Build the lookup once instead of rescanning the fact list for every sentence.
    facts = {(t, i) for t, i in sample['supporting_facts']}
    sample['sp_label'] = sp_label = []
    for title, sents in sample['context']:
        for idx, _ in enumerate(sents):
            sp_label.append(1 if (title, idx) in facts else 0)
    # Every listed supporting fact must match exactly one context sentence.
    assert len(sample['supporting_facts']) == sum(sp_label)
    return sample


class Type(Enum):
    """Vertex categories used by the graph built in :func:`build_graph`."""
    Q_ROOT = auto()
    Q_WORD = auto()
    SP_ROOT = auto()
    SP_WORD = auto()
    NON_SP_ROOT = auto()
    NON_SP_WORD = auto()
    DOCUMENT_TITLE = auto()


class Vertex(object):
    """A graph node holding its id, type, text and outgoing labelled edges."""

    def __init__(self, id, type: Type, text=None) -> None:
        """
        Args:
            id: Position of this vertex in the graph list; also used as its hash.
            type: The :class:`Type` of this vertex.
            text: Surface text. When empty, the member name of ``type`` is used.
        """
        super().__init__()
        self.id = id
        self.type = type
        if not text:
            # 'Type.Q_ROOT' -> 'Q_ROOT'
            text = str(type).split('.')[-1]
        self.text = text
        # Parallel lists: self.to[k] is reached through relation self.rel[k].
        self.to = []
        self.rel = []

    def connect(self, to, rel):
        """Add an outgoing edge to vertex ``to`` labelled ``rel``."""
        self.to.append(to)
        self.rel.append(rel)

    def __str__(self) -> str:
        return f'{self.text} {self.id}'

    def __hash__(self) -> int:
        return self.id

    def is_word(self):
        """Whether this vertex is a word (as opposed to a sentence root)."""
        return self.type in {Type.SP_WORD, Type.Q_WORD, Type.NON_SP_WORD}

    def is_question(self):
        """Whether this vertex belongs to the question."""
        return self.type in {Type.Q_ROOT, Type.Q_WORD}

    def is_sp(self):
        """Whether this vertex belongs to a supporting-fact sentence."""
        return self.type in {Type.SP_ROOT, Type.SP_WORD}

    def is_sp_root(self):
        """Whether this vertex is the root of a supporting-fact sentence."""
        return self.type in {Type.SP_ROOT}

    def is_sp_root_candidate(self):
        """Whether this vertex is the root of any context sentence."""
        return self.type in {Type.SP_ROOT, Type.NON_SP_ROOT}


def build_graph(each: dict, debug=False):
    """Build the vertex list for one sample.

    Sentence 0 is the question, followed by every context sentence in order.
    Each sentence contributes one root vertex and one vertex per token; token
    ``i`` is attached to vertex ``offset + heads[i]``, where ``offset`` is the
    sentence's root vertex, so a head of ``0`` attaches the token to the root.
    Finally the question root is connected to every context sentence root.

    Args:
        each: A sample with ``question``, ``supporting_facts``, ``context`` and
            ``parsed_sentences`` (one ``(tokens, heads, deprels)`` triple per
            sentence, question first).
        debug: If ``True``, print the question and supporting facts, and skip
            sentences with index > 1 that are not supporting facts.

    Returns:
        A list of :class:`Vertex` whose positions equal their ids.
    """
    raw_sents = [each['question']]
    # title -> sentence offsets (within that document) that are supporting facts
    sp_sents = {}
    for title, offset in each['supporting_facts']:
        sp_sents.setdefault(title, set()).add(offset)

    # Global sentence indices (question is 0) of the supporting facts.
    sp_idx = set()
    idx = 1
    for title, sents in each['context']:
        raw_sents += sents
        sp_offsets = sp_sents.get(title, ())
        for i, _ in enumerate(sents):
            if i in sp_offsets:
                sp_idx.add(idx)
            idx += 1
    assert idx == len(raw_sents)
    parsed_sents = each['parsed_sentences']
    assert len(raw_sents) == len(parsed_sents)

    graph = []
    for idx, (raw, sent) in enumerate(zip(raw_sents, parsed_sents)):
        is_sp = idx in sp_idx
        if debug and idx > 1 and not is_sp:
            continue
        # Index of this sentence's root vertex; heads are relative to it.
        offset = len(graph)
        if idx == 0:
            if debug:
                print(f'Question: {raw}')
            root_type, word_type = Type.Q_ROOT, Type.Q_WORD
        else:
            if debug and is_sp:
                print(f'Supporting Fact: {raw}')
            root_type = Type.SP_ROOT if is_sp else Type.NON_SP_ROOT
            word_type = Type.SP_WORD if is_sp else Type.NON_SP_WORD
        graph.append(Vertex(offset, root_type))
        tokens, heads, deprels = sent
        # zip keeps the original truncation to the shortest of the three columns.
        for t, _, _ in zip(tokens, heads, deprels):
            graph.append(Vertex(len(graph), word_type, t))
        for i, (h, d) in enumerate(zip(heads, deprels)):
            graph[offset + h].connect(graph[offset + i + 1], d)

    q_root = graph[0]
    for u in graph:
        if u.is_sp_root_candidate():
            q_root.connect(u, 'supporting fact?')
    return graph
class CoNLL2012SRLDataset(TransformableDataset):
    """Dataset yielding one sample per sentence from a ``.jsonlines`` CoNLL12-style corpus."""

    def __init__(self, data: Union[str, List], transform: Union[Callable, List] = None, cache=None,
                 doc_level_offset=True, generate_idx=None) -> None:
        """
        Args:
            data: Forwarded to the parent constructor.
            transform: Forwarded to the parent constructor.
            cache: Forwarded to the parent constructor.
            doc_level_offset: If ``True``, the number of tokens in the preceding sentences
                of the document is subtracted from every SRL offset in :meth:`load_file`.
            generate_idx: Forwarded to the parent constructor.
        """
        # Set before calling the parent constructor.
        # NOTE(review): presumably because the parent may call load_file during construction - confirm.
        self.doc_level_offset = doc_level_offset
        super().__init__(data, transform, cache, generate_idx=generate_idx)

    def load_file(self, filepath: str):
        """Load ``.jsonlines`` CoNLL12-style corpus. Samples of this corpus can be found using the following scripts.

        .. highlight:: python
        .. code-block:: python

            import json
            from hanlp_common.document import Document
            from hanlp.datasets.srl.ontonotes5.chinese import ONTONOTES5_CONLL12_CHINESE_DEV
            from hanlp.utils.io_util import get_resource

            with open(get_resource(ONTONOTES5_CONLL12_CHINESE_DEV)) as src:
                for line in src:
                    doc = json.loads(line)
                    print(Document(doc))
                    break

        Args:
            filepath: ``.jsonlines`` CoNLL12 corpus.

        Yields:
            The result of :meth:`build_sample` for every sentence of every document.

        Raises:
            ValueError: If any of the three offsets of an SRL tuple is negative or not
                smaller than the sentence length.
        """
        filename = os.path.basename(filepath)
        reader = TimingFileIterator(filepath)
        num_docs, num_sentences = 0, 0
        for line in reader:
            # One JSON document per line.
            doc = json.loads(line)
            num_docs += 1
            # Tokens seen so far in this document; used to shift offsets to sentence level.
            num_tokens_in_doc = 0
            for sid, (sentence, srl) in enumerate(zip(doc['sentences'], doc['srl'])):
                if self.doc_level_offset:
                    srl = [(x[0] - num_tokens_in_doc, x[1] - num_tokens_in_doc, x[2] - num_tokens_in_doc, x[3]) for x
                           in srl]
                else:
                    srl = [(x[0], x[1], x[2], x[3]) for x in srl]
                # Sanity-check that all offsets fall inside the current sentence.
                for x in srl:
                    if any([o < 0 for o in x[:3]]):
                        raise ValueError(f'Negative offset occurred, maybe doc_level_offset=False')
                    if any([o >= len(sentence) for o in x[:3]]):
                        raise ValueError('Offset exceeds sentence length, maybe doc_level_offset=True')
                # Keep only the first label seen for each (predicate, begin, end) triple.
                deduplicated_srl = set()
                pa_set = set()
                for p, b, e, l in srl:
                    pa = (p, b, e)
                    if pa in pa_set:
                        continue
                    pa_set.add(pa)
                    deduplicated_srl.add((p, b, e, l))
                yield self.build_sample(sentence, deduplicated_srl, doc, sid)
                num_sentences += 1
                num_tokens_in_doc += len(sentence)
            reader.log(
                f'{filename} {num_docs} documents, {num_sentences} sentences [blink][yellow]...[/yellow][/blink]')
        reader.erase()

    # noinspection PyMethodMayBeStatic
    def build_sample(self, sentence, deduplicated_srl, doc, sid):
        """Build the sample dict for one sentence.

        Args:
            sentence: The tokens of the sentence.
            deduplicated_srl: A set of ``(predicate, begin, end, label)`` tuples.
            doc: The document the sentence belongs to (unused here).
            sid: The index of the sentence in ``doc`` (unused here).

        Returns:
            A dict with ``token`` and ``srl``.
        """
        return {
            'token': sentence,
            'srl': deduplicated_srl
        }


def group_pa_by_p(sample: dict) -> dict:
    """Replace ``sample['srl']``, if present, with its grouping by predicate.

    Args:
        sample: A sample whose optional ``srl`` field holds ``(p, b, e, l)`` tuples.

    Returns:
        The same ``sample`` object.
    """
    if 'srl' in sample:
        srl: list = sample['srl']
        grouped_srl = group_pa_by_p_(srl)
        sample['srl'] = grouped_srl
    return sample


def group_pa_by_p_(srl):
    """Group ``(p, b, e, l)`` tuples by their first element.

    Args:
        srl: An iterable of 4-tuples.

    Returns:
        A dict mapping each ``p`` to the set of its ``(b, e, l)`` tuples.
    """
    grouped_srl = {}
    for p, b, e, l in srl:
        bel = grouped_srl.get(p, None)
        # Missing key: create the set for this predicate on first sight.
        if not bel:
            bel = grouped_srl[p] = set()
        bel.add((b, e, l))
    return grouped_srl


def filter_v_args(sample: dict) -> dict:
    """Drop SRL tuples whose last element is ``"V"`` or ``"C-V"``.

    Args:
        sample: A sample whose optional ``srl`` field holds tuples ending with a label.

    Returns:
        The same ``sample`` object; ``srl`` becomes a list when present.
    """
    if 'srl' in sample:
        sample['srl'] = [t for t in sample['srl'] if t[-1] not in ["V", "C-V"]]
    return sample
class OntonotesSentence:
    """
    A class representing the annotations available for a single CONLL formatted sentence.

    # Parameters

    document_id : `str`
        This is a variation on the document filename
    sentence_id : `int`
        The integer ID of the sentence within a document.
    words : `List[str]`
        This is the tokens as segmented/tokenized in the Treebank.
    pos_tags : `List[str]`
        This is the Penn-Treebank-style part of speech. When parse information is missing,
        all parts of speech except the one for which there is some sense or proposition
        annotation are marked with a XX tag. The verb is marked with just a VERB tag.
    parse_tree : `Optional[phrasetree.tree.Tree]`
        A Tree representing the parse. It includes POS tags as pre-terminal nodes.
        When the parse information is missing, the parse will be `None`.
    predicate_lemmas : `List[Optional[str]]`
        The predicate lemma of the words for which we have semantic role
        information or word sense information. All other indices are `None`.
    predicate_framenet_ids : `List[Optional[str]]`
        The PropBank frameset ID of the lemmas in `predicate_lemmas`, or `None`.
    word_senses : `List[Optional[float]]`
        The word senses for the words in the sentence, or `None`. These are floats
        because the word sense can have values after the decimal, like `1.1`.
    speakers : `List[Optional[str]]`
        The speaker information for the words in the sentence, if present, or `None`
        This is the speaker or author name where available. Mostly in Broadcast Conversation
        and Web Log data. When not available the rows are marked with an "-".
    named_entities : `List[str]`
        The BIO tags for named entities in the sentence.
    srl_frames : `List[Tuple[str, List[str]]]`
        A list of (verb, labels) pairs, one per Propbank frame in the sentence,
        where the labels are in a BIO format.
    coref_spans : `Set[TypedSpan]`
        The spans for entity mentions involved in coreference resolution within the sentence.
        Each element is a tuple composed of (cluster_id, (start_index, end_index)). Indices
        are `inclusive`.
    """

    def __init__(
            self,
            document_id: str,
            sentence_id: int,
            words: List[str],
            pos_tags: List[str],
            parse_tree: Optional[Tree],
            predicate_lemmas: List[Optional[str]],
            predicate_framenet_ids: List[Optional[str]],
            word_senses: List[Optional[float]],
            speakers: List[Optional[str]],
            named_entities: List[str],
            srl_frames: List[Tuple[str, List[str]]],
            coref_spans: Set[TypedSpan],
    ) -> None:
        # Plain value holder: every argument is stored unchanged under the same name.
        self.document_id = document_id
        self.sentence_id = sentence_id
        self.words = words
        self.pos_tags = pos_tags
        self.parse_tree = parse_tree
        self.predicate_lemmas = predicate_lemmas
        self.predicate_framenet_ids = predicate_framenet_ids
        self.word_senses = word_senses
        self.speakers = speakers
        self.named_entities = named_entities
        self.srl_frames = srl_frames
        self.coref_spans = coref_spans
Part number : `int` Some files are divided into multiple parts numbered as 000, 001, 002, ... etc. 3. Word number : `int` This is the word index of the word in that sentence. 4. Word : `str` This is the token as segmented/tokenized in the Treebank. Initially the `*_skel` file contain the placeholder [WORD] which gets replaced by the actual token from the Treebank which is part of the OntoNotes release. 5. POS Tag : `str` This is the Penn Treebank style part of speech. When parse information is missing, all part of speeches except the one for which there is some sense or proposition annotation are marked with a XX tag. The verb is marked with just a VERB tag. 6. Parse bit : `str` This is the bracketed structure broken before the first open parenthesis in the parse, and the word/part-of-speech leaf replaced with a `*`. When the parse information is missing, the first word of a sentence is tagged as `(TOP*` and the last word is tagged as `*)` and all intermediate words are tagged with a `*`. 7. Predicate lemma : `str` The predicate lemma is mentioned for the rows for which we have semantic role information or word sense information. All other rows are marked with a "-". 8. Predicate Frameset ID : `int` The PropBank frameset ID of the predicate in Column 7. 9. Word sense : `float` This is the word sense of the word in Column 3. 10. Speaker/Author : `str` This is the speaker or author name where available. Mostly in Broadcast Conversation and Web Log data. When not available the rows are marked with an "-". 11. Named Entities : `str` These columns identifies the spans representing various named entities. For documents which do not have named entity annotation, each line is represented with an `*`. 12. Predicate Arguments : `str` There is one column each of predicate argument structure information for the predicate mentioned in Column 7. If there are no predicates tagged in a sentence this is a single column with all rows marked with an `*`. -1. 
Co-reference : `str` Co-reference chain information encoded in a parenthesis structure. For documents that do not have co-reference annotations, each line is represented with a "-". """ def dataset_iterator(self, file_path: str) -> Iterator[OntonotesSentence]: """ An iterator over the entire dataset, yielding all sentences processed. """ for conll_file in self.dataset_path_iterator(file_path): yield from self.sentence_iterator(conll_file) @staticmethod def dataset_path_iterator(file_path: str) -> Iterator[str]: """ An iterator returning file_paths in a directory containing CONLL-formatted files. """ logger.info("Reading CONLL sentences from dataset files at: %s", file_path) for root, _, files in list(os.walk(file_path)): for data_file in files: # These are a relic of the dataset pre-processing. Every # file will be duplicated - one file called filename.gold_skel # and one generated from the preprocessing called filename.gold_conll. if not data_file.endswith("gold_conll"): continue yield os.path.join(root, data_file) def dataset_document_iterator(self, file_path: str) -> Iterator[List[OntonotesSentence]]: """ An iterator over CONLL formatted files which yields documents, regardless of the number of document annotations in a particular file. This is useful for conll data which has been preprocessed, such as the preprocessing which takes place for the 2012 CONLL Coreference Resolution task. """ with codecs.open(file_path, "r", encoding="utf8") as open_file: conll_rows = [] document: List[OntonotesSentence] = [] for line in open_file: line = line.strip() if line != "" and not line.startswith("#"): # Non-empty line. Collect the annotation. conll_rows.append(line) else: if conll_rows: document.append(self._conll_rows_to_sentence(conll_rows)) conll_rows = [] if line.startswith("#end document"): yield document document = [] if document: # Collect any stragglers or files which might not # have the '#end document' format for the end of the file. 
def sentence_iterator(self, file_path: str) -> Iterator[OntonotesSentence]:
    """
    Stream every sentence of a single CONLL formatted file, flattening the
    per-document grouping produced by `dataset_document_iterator`.
    """
    for document_sentences in self.dataset_document_iterator(file_path):
        yield from document_sentences
right_brackets = right_hand_side.count(")") * ")" parse_piece = f"{left_brackets} ({pos_tag} {parse_word}) {right_brackets}" else: # There are some bad annotations in the CONLL data. # They contain no information, so to make this explicit, # we just set the parse piece to be None which will result # in the overall parse tree being None. parse_piece = None lemmatised_word = conll_components[6] framenet_id = conll_components[7] word_sense = conll_components[8] speaker = conll_components[9] if not span_labels: # If this is the first word in the sentence, create # empty lists to collect the NER and SRL BIO labels. # We can't do this upfront, because we don't know how many # components we are collecting, as a sentence can have # variable numbers of SRL frames. span_labels = [[] for _ in conll_components[10:-1]] # Create variables representing the current label for each label # sequence we are collecting. current_span_labels = [None for _ in conll_components[10:-1]] self._process_span_annotations_for_word( conll_components[10:-1], span_labels, current_span_labels ) # If any annotation marks this word as a verb predicate, # we need to record its index. This also has the side effect # of ordering the verbal predicates by their location in the # sentence, automatically aligning them with the annotations. 
def _process_coref_span_annotations_for_word(
    label: str,
    word_index: int,
    clusters: DefaultDict[int, List[Tuple[int, int]]],
    coref_stacks: DefaultDict[int, List[int]],
) -> None:
    """
    Update the coreference bookkeeping with the coref label of one word.

    Depending on the label, a span is opened, closed, opened and closed at
    once, or nothing happens (the word is outside every span). Both
    `clusters` and `coref_stacks` are mutated in place.

    # Parameters

    label : `str`
        The coref column of this word; `"-"` means no annotation.
    word_index : `int`
        Position of the word inside the sentence.
    clusters : `DefaultDict[int, List[Tuple[int, int]]]`
        Maps a cluster id to the inclusive `(start, end)` spans found so far.
    coref_stacks : `DefaultDict[int, List[int]]`
        Maps a cluster id to a stack of start indices of spans that are
        still open. A stack is needed because spans sharing an id may nest, e.g:
        [Greg, the baker who referred to [himself]_ID1 as 'the bread man']_ID1
    """
    if label == "-":
        return

    # Several spans may start or end on the same word; the CONLL format
    # separates their markers with "|".
    for piece in label.split("|"):
        opens_here = piece[0] == "("
        closes_here = piece[-1] == ")"
        if opens_here and closes_here:
            # "(id)": a single-word mention.
            mention_id = int(piece[1:-1])
            clusters[mention_id].append((word_index, word_index))
        elif opens_here:
            # "(id": remember where the mention started.
            mention_id = int(piece[1:])
            coref_stacks[mention_id].append(word_index)
        else:
            # "id)": close the most recently opened mention with this id.
            mention_id = int(piece[:-1])
            span_start = coref_stacks[mention_id].pop()
            clusters[mention_id].append((span_start, word_index))
bio_label = "B-" + label span_labels[annotation_index].append(bio_label) current_span_labels[annotation_index] = label elif current_span_labels[annotation_index] is not None: # If there's no '(' token, but the current_span_label is not None, # then we are inside a span. bio_label = "I-" + current_span_labels[annotation_index] span_labels[annotation_index].append(bio_label) else: # We're outside a span. span_labels[annotation_index].append("O") # Exiting a span, so we reset the current span label for this annotation. if ")" in annotation: current_span_labels[annotation_index] = None def make_coref_instance( sentences: List[List[str]], max_span_width: int, gold_clusters: Optional[List[List[Tuple[int, int]]]] = None, max_sentences: int = None, remove_singleton_clusters: bool = True, ) -> dict: """ # Parameters sentences : `List[List[str]]`, required. A list of lists representing the tokenised words and sentences in the document. token_indexers : `Dict[str, TokenIndexer]` This is used to index the words in the document. See :class:`TokenIndexer`. max_span_width : `int`, required. The maximum width of candidate spans to consider. gold_clusters : `Optional[List[List[Tuple[int, int]]]]`, optional (default = None) A list of all clusters in the document, represented as word spans with absolute indices in the entire document. Each cluster contains some number of spans, which can be nested and overlap. If there are exact matches between clusters, they will be resolved using `_canonicalize_clusters`. wordpiece_modeling_tokenizer: `PretrainedTransformerTokenizer`, optional (default = None) If not None, this dataset reader does subword tokenization using the supplied tokenizer and distribute the labels to the resulting wordpieces. All the modeling will be based on wordpieces. If this is set to `False` (default), the user is expected to use `PretrainedTransformerMismatchedIndexer` and `PretrainedTransformerMismatchedEmbedder`, and the modeling will be on the word-level. 
max_sentences: int, optional (default = None) The maximum number of sentences in each document to keep. By default keeps all sentences. remove_singleton_clusters : `bool`, optional (default = True) Some datasets contain clusters that are singletons (i.e. no coreferents). This option allows the removal of them. # Returns An `Instance` containing the following `Fields`: text : `TextField` The text of the full document. spans : `ListField[SpanField]` A ListField containing the spans represented as `SpanFields` with respect to the document text. span_labels : `SequenceLabelField`, optional The id of the cluster which each possible span belongs to, or -1 if it does not belong to a cluster. As these labels have variable length (it depends on how many spans we are considering), we represent this a as a `SequenceLabelField` with respect to the spans `ListField`. """ if max_sentences is not None and len(sentences) > max_sentences: sentences = sentences[:max_sentences] total_length = sum(len(sentence) for sentence in sentences) if gold_clusters is not None: new_gold_clusters = [] for cluster in gold_clusters: new_cluster = [] for mention in cluster: if mention[1] < total_length: new_cluster.append(mention) if new_cluster: new_gold_clusters.append(new_cluster) gold_clusters = new_gold_clusters flattened_sentences = [_normalize_word(word) for sentence in sentences for word in sentence] flat_sentences_tokens = [word for word in flattened_sentences] text_field = flat_sentences_tokens cluster_dict = {} if gold_clusters is not None: gold_clusters = _canonicalize_clusters(gold_clusters) if remove_singleton_clusters: gold_clusters = [cluster for cluster in gold_clusters if len(cluster) > 1] for cluster_id, cluster in enumerate(gold_clusters): for mention in cluster: cluster_dict[tuple(mention)] = cluster_id spans: List = [] span_labels: Optional[List[int]] = [] if gold_clusters is not None else None sentence_offset = 0 for sentence in sentences: for start, end in enumerate_spans( 
def _canonicalize_clusters(clusters: List[List[Tuple[int, int]]]) -> List[List[Tuple[int, int]]]:
    """
    The data might include 2 annotated spans which are identical,
    but have different ids. This checks all clusters for spans which are
    identical, and if it finds any, merges the clusters containing the
    identical spans.

    Merging is transitive: when a cluster shares mentions with several
    previously seen clusters, all of them are fused into one, so that every
    mention ends up in exactly one output cluster.

    # Parameters

    clusters : `List[List[Tuple[int, int]]]`
        Clusters given as lists of `(start, end)` mentions.

    # Returns

    The merged clusters, ordered by the first input cluster that contributed
    to each of them. The order of mentions inside a cluster is unspecified
    because clusters are deduplicated through sets.
    """
    merged_clusters: List[Set[Tuple[int, int]]] = []
    for cluster in clusters:
        mentions = set(cluster)
        # Every already-processed cluster sharing at least one mention
        # with the current one, not just the first such cluster.
        overlapping = [seen for seen in merged_clusters if not mentions.isdisjoint(seen)]
        if not overlapping:
            merged_clusters.append(mentions)
            continue

        # Fold the current cluster, and any further overlapping clusters,
        # into the earliest overlapping one.
        target, absorbed = overlapping[0], overlapping[1:]
        target.update(mentions)
        if absorbed:
            for other in absorbed:
                target.update(other)
            # Sets are unhashable, so drop the absorbed clusters by identity.
            merged_clusters = [
                kept for kept in merged_clusters
                if not any(kept is gone for gone in absorbed)
            ]
    return [list(c) for c in merged_clusters]
assert_empty(self): assert self.doc_key is None assert len(self.text) == 0 assert len(self.text_speakers) == 0 assert len(self.speakers) == 0 assert len(self.sentences) == 0 assert len(self.srl) == 0 assert len(self.predicate_buffer) == 0 assert len(self.argument_buffers) == 0 assert len(self.argument_stacks) == 0 assert len(self.constituents) == 0 assert len(self.const_stack) == 0 assert len(self.const_buffer) == 0 assert len(self.ner) == 0 assert len(self.lemma_buffer) == 0 assert len(self.pos_buffer) == 0 assert len(self.ner_stack) == 0 assert len(self.ner_buffer) == 0 assert len(self.coref_stacks) == 0 assert len(self.clusters) == 0 def assert_finalizable(self): assert self.doc_key is not None assert len(self.text) == 0 assert len(self.text_speakers) == 0 assert len(self.speakers) > 0 assert len(self.sentences) > 0 assert len(self.constituents) > 0 assert len(self.const_stack) == 0 assert len(self.ner_stack) == 0 assert len(self.predicate_buffer) == 0 assert all(len(s) == 0 for s in list(self.coref_stacks.values())) def finalize_sentence(self): self.sentences.append(tuple(self.text)) del self.text[:] self.lemma.append(tuple(self.lemma_buffer)) del self.lemma_buffer[:] self.pos.append(tuple(self.pos_buffer)) del self.pos_buffer[:] self.speakers.append(tuple(self.text_speakers)) del self.text_speakers[:] assert len(self.predicate_buffer) == len(self.argument_buffers) self.srl.append([]) for pred, args in zip(self.predicate_buffer, self.argument_buffers): for start, end, label in args: self.srl[-1].append((pred, start, end, label)) self.predicate_buffer = [] self.argument_buffers = [] self.argument_stacks = [] self.constituents.append([c for c in self.const_buffer]) self.const_buffer = [] self.ner.append([c for c in self.ner_buffer]) self.ner_buffer = [] def finalize(self): merged_clusters = [] for c1 in list(self.clusters.values()): existing = None for m in c1: for c2 in merged_clusters: if m in c2: existing = c2 break if existing is not None: break if existing 
def filter_data(input_json_file, output_json_file, doc_ids_file=None, annotation=None):
    """Filter OntoNotes5 jsonlines documents by CoNLL2012 doc ids and/or by annotation availability.

    Adapted from https://github.com/bcmi220/unisrl/blob/master/scripts/filter_conll2012_data.py

    Args:
        input_json_file: UTF-8 jsonlines file with one document per line. Every document needs the keys
            ``doc_key``, ``sentences``, ``srl``, ``ner`` and ``clusters``. The second dot-separated part of the
            file's base name is used as the language.
        output_json_file: Where the kept documents are written (UTF-8 jsonlines).
        doc_ids_file: Optional id file. For each non-empty line, the text after ``annotations/`` is a doc id;
            only documents whose ``doc_key`` (without its trailing ``_<part>``) matches an id are kept, and they
            are written in the order of the id file.
        annotation: Optional file suffix. When given, a document is kept only if
            ``<root>/data/files/data/<language>/annotations/<doc id><annotation>`` exists, where ``<root>`` is two
            directories above the folder of ``input_json_file``. Missing paths are printed and counted.

    Returns:
        None. Summary statistics are printed to stdout.
    """
    assert doc_ids_file or annotation
    doc_count = 0
    sentence_count = 0
    srl_count = 0
    ner_count = 0
    cluster_count = 0
    word_count = 0
    missing_count = 0
    doc_ids = []
    doc_ids_to_keys = collections.defaultdict(list)
    filtered_examples = {}
    ontonotes_root = os.path.abspath(os.path.join(os.path.dirname(input_json_file), *['..'] * 2))
    language = os.path.basename(input_json_file).split('.')[1]

    if doc_ids_file:
        with open(doc_ids_file, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Tolerate blank (typically trailing) lines instead of raising IndexError.
                    continue
                doc_id = line.split("annotations/")[1]
                doc_ids.append(doc_id)
                doc_ids_to_keys[doc_id] = []

    with codecs.open(input_json_file, "r", "utf8") as f:
        for jsonline in f:
            if not jsonline.strip():
                # Blank lines carry no document and would make json.loads fail.
                continue
            example = json.loads(jsonline)
            doc_key = example["doc_key"]
            # Strip the trailing "_<part>" to recover the document id.
            dk_prefix = "_".join(doc_key.split("_")[:-1])
            if doc_ids_file and dk_prefix not in doc_ids_to_keys:
                continue
            if annotation:
                annotation_path = os.path.join(ontonotes_root, 'data/files/data', language, 'annotations',
                                               dk_prefix) + annotation
                if not os.path.isfile(annotation_path):
                    print(annotation_path)
                    missing_count += 1
                    continue
            doc_ids_to_keys[dk_prefix].append(doc_key)
            filtered_examples[doc_key] = example

            sentences = example["sentences"]
            word_count += sum(len(s) for s in sentences)
            sentence_count += len(sentences)
            srl_count += sum(len(srl) for srl in example["srl"])
            ner_count += sum(len(ner) for ner in example["ner"])
            cluster_count += len(example["clusters"])
            doc_count += 1

    print(("Documents: {}\nSentences: {}\nWords: {}\nNER: {}, PAS: {}, Clusters: {}, No annotations: {}".format(
        doc_count, sentence_count, word_count, ner_count, srl_count, cluster_count, missing_count)))

    if doc_ids_file:
        # Arrange the documents in the order of the id file.
        kept_docs = (filtered_examples[key] for doc_id in doc_ids for key in doc_ids_to_keys[doc_id])
    else:
        kept_docs = filtered_examples.values()
    with codecs.open(output_json_file, "w", "utf8") as f:
        for doc in kept_docs:
            f.write(json.dumps(doc, ensure_ascii=False))
            f.write("\n")
def handle_bit(word_index, bit, stack, spans, label_set):
    """Update a bracket stack and the finished spans with one word's bracket column.

    A bit looks like ``(TOP(NP*``, ``*)`` or ``(V*)``: labels opened at this word come before the ``*`` and the
    closing parentheses come after it. If there is no ``*``, the last character is treated as the closing part
    and everything before it as the opening part.

    Args:
        word_index: Index of the current word; used as the start of spans opened here and the end of spans
            closed here.
        bit: The bracket annotation of the word.
        stack: List of ``(start_index, label)`` for spans that are still open. Mutated in place.
        spans: List receiving ``(start_index, end_index, label)`` for every span closed at this word.
            Mutated in place.
        label_set: Set collecting every label that is opened. Mutated in place.

    Returns:
        None.
    """
    asterisk_idx = bit.find("*")
    if asterisk_idx >= 0:
        open_parens = bit[:asterisk_idx]
        close_parens = bit[asterisk_idx + 1:]
    else:
        open_parens = bit[:-1]
        # Slice rather than index so an empty bit is a no-op instead of an IndexError.
        close_parens = bit[-1:]

    # Each "(" starts a label that runs until the next "(" or the end of the opening part.
    current_idx = open_parens.find("(")
    while current_idx >= 0:
        next_idx = open_parens.find("(", current_idx + 1)
        if next_idx >= 0:
            label = open_parens[current_idx + 1:next_idx]
        else:
            label = open_parens[current_idx + 1:]
        label_set.add(label)
        stack.append((word_index, label))
        current_idx = next_idx

    for c in close_parens:
        if c != ")":
            # Unexpected character: report it and leave the stack untouched. An explicit check is used
            # because an ``assert`` would be stripped under ``python -O`` and the stack would then be popped.
            print(word_index, bit, spans, stack)
            continue
        open_index, label = stack.pop()
        spans.append((open_index, word_index, label))
if len(row) == 0: stats["max_sent_len_{}".format(language)] = max(len(document_state.text), stats["max_sent_len_{}".format(language)]) stats["num_sents_{}".format(language)] += 1 document_state.finalize_sentence() return None assert len(row) >= 12 doc_key = get_doc_key(row[0], row[1]) word = normalize_word(row[3], language) pos = row[4] parse = row[5] lemma = row[6] predicate_sense = row[7] speaker = row[9] ner = row[10] args = row[11:-1] coref = row[-1] word_index = len(document_state.text) + sum(len(s) for s in document_state.sentences) document_state.text.append(word) document_state.text_speakers.append(speaker) document_state.pos_buffer.append(pos) document_state.lemma_buffer.append(lemma) handle_bit(word_index, parse, document_state.const_stack, document_state.const_buffer, labels["categories"]) handle_bit(word_index, ner, document_state.ner_stack, document_state.ner_buffer, labels["ner"]) if len(document_state.argument_stacks) < len(args): document_state.argument_stacks = [[] for _ in args] document_state.argument_buffers = [[] for _ in args] for i, arg in enumerate(args): handle_bit(word_index, arg, document_state.argument_stacks[i], document_state.argument_buffers[i], labels["srl"]) if predicate_sense != "-": document_state.predicate_buffer.append(word_index) if coref != "-": for segment in coref.split("|"): if segment[0] == "(": if segment[-1] == ")": cluster_id = int(segment[1:-1]) document_state.clusters[cluster_id].append((word_index, word_index)) else: cluster_id = int(segment[1:]) document_state.coref_stacks[cluster_id].append(word_index) else: cluster_id = int(segment[:-1]) start = document_state.coref_stacks[cluster_id].pop() document_state.clusters[cluster_id].append((start, word_index)) return None def ontonotes_document_generator(input_path, language, labels, stats): with open(input_path, "r") as input_file: document_state = DocumentState() for line in input_file.readlines(): document = handle_line(line, document_state, language, labels, stats) 
def convert_to_jsonlines(input_path, output_path, language, labels=None, stats=None):
    """Convert a merged CoNLL file into a jsonlines file with one document per line.

    Args:
        input_path: Path of the CoNLL file, passed to ``ontonotes_document_generator``.
        output_path: Path of the jsonlines file to write.
        language: Language name, passed to ``ontonotes_document_generator``.
        labels: Optional mapping collecting label sets; a ``defaultdict(set)`` is created when omitted.
        stats: Optional mapping collecting counters; a ``defaultdict(int)`` is created when omitted.

    Returns:
        The ``(labels, stats)`` pair after all documents have been processed.
    """
    if labels is None:
        labels = collections.defaultdict(set)
    if stats is None:
        stats = collections.defaultdict(int)
    # Documents are dumped with ensure_ascii=False, so the encoding must be pinned: other functions in this
    # module read these jsonlines files back as UTF-8, and the platform default encoding may differ.
    with open(output_path, "w", encoding="utf-8") as output_file:
        for document in ontonotes_document_generator(input_path, language, labels, stats):
            output_file.write(json.dumps(document, ensure_ascii=False))
            output_file.write("\n")
    return labels, stats
def ensure_python_points_to_python2():
    """Check that the ``python`` command reports a Python 2 version.

    Runs ``python --version`` and inspects the third value returned by ``get_exitcode_stdout_stderr``, falling
    back to the second one when the third is empty.

    Raises:
        EnvironmentError: If the reported version does not start with ``Python 2``.
    """
    _, stdout_text, stderr_text = get_exitcode_stdout_stderr('python --version')
    reported = stderr_text or stdout_text
    if reported.startswith('Python 2'):
        return
    raise EnvironmentError(f'Your python command needs to be Python2, not {reported.strip()}. Try:\n\n\t'
                           'ln -sf "$(which python2)" "$(which python)"')
def make_pos_tsv(json_file, output_file):
    """Export the part-of-speech annotation of a jsonlines file as a two-column TSV.

    Each token is written as ``token<TAB>tag`` on its own line and every sentence is followed by an empty line.

    Args:
        json_file: Path of a UTF-8 jsonlines file whose documents have ``sentences`` and ``pos`` keys holding
            parallel lists of sentences. Blank lines are skipped.
        output_file: Path of the TSV file to write (UTF-8).

    Returns:
        None.
    """
    # Read with the same explicit encoding that is used for writing; relying on the platform default breaks
    # non-ASCII tokens on systems whose default encoding is not UTF-8.
    with open(json_file, encoding='utf-8') as src, open(output_file, 'w', encoding='utf-8') as out:
        for line in src:
            if not line.strip():
                continue
            doc = json.loads(line)
            for sent, pos in zip(doc['sentences'], doc['pos']):
                out.writelines(f'{token}\t{tag}\n' for token, tag in zip(sent, pos))
                out.write('\n')
def make_dep_conllx_if_necessary(con_txt_file: str, language='en'):
    """Convert a constituency file to a dependency ``.dep.conllx`` file unless that file already exists.

    Args:
        con_txt_file: Identifier of the constituency file; it is resolved with ``get_resource``.
        language: Language code forwarded to ``make_dep_conllx``.

    Returns:
        The path of the ``.dep.conllx`` file, obtained by replacing the first ``.con.txt`` in the resolved
        path. The path is returned whether or not a conversion was needed, like the other ``*_if_necessary``
        helpers do.
    """
    con_txt_file = get_resource(con_txt_file)
    output_file = con_txt_file.replace('.con.txt', '.dep.conllx', 1)
    if not os.path.isfile(output_file):
        make_dep_conllx(con_txt_file, output_file, language)
    return output_file
def load_raw_text(onf_file) -> List[str]:
    """Extract the plain sentences from an ``.onf`` file.

    A sentence starts two lines after a line reading ``Plain sentence:`` (the line in between is skipped) and
    runs until the next blank line. The stripped lines of a sentence are joined with single spaces.

    Args:
        onf_file: Path of the ``.onf`` file, read as UTF-8.

    Returns:
        The sentences in the order they appear in the file.
    """
    sents = []
    sent_parts = []
    expect_sent = False  # Currently collecting the lines of a sentence.
    expect_sent_line = False  # The next line is the one to skip before the sentence text.
    with open(onf_file, encoding='utf-8') as src:
        for line in src:
            line = line.strip()
            if line == 'Plain sentence:':
                expect_sent_line = True
            elif expect_sent_line:
                expect_sent_line = False
                expect_sent = True
            elif expect_sent:
                if line:
                    sent_parts.append(line)
                else:
                    sents.append(' '.join(sent_parts))
                    expect_sent = False
                    sent_parts = []
    if expect_sent and sent_parts:
        # The file ended without the blank line that normally terminates a sentence;
        # keep what was collected instead of silently dropping it.
        sents.append(' '.join(sent_parts))
    return sents
def main():
    """Command line entry point.

    Expects exactly two command line arguments, ``ontonotes_path`` and ``output_path``, and passes them
    to ``make_ontonotes_jsonlines``. With any other number of arguments a usage message is reported
    through ``eprint`` and the process exits with status 1.
    """
    if len(sys.argv) != 3:
        eprint('2 arguments required: ontonotes_path output_path')
        # The bare ``exit`` builtin is injected by the ``site`` module for interactive use and is missing
        # when that module is not loaded; ``sys.exit`` is always available.
        sys.exit(1)
    ontonotes_path = sys.argv[1]
    output_path = sys.argv[2]
    make_ontonotes_jsonlines(ontonotes_path, output_path)
used in CoNLL12 (:cite:`pradhan-etal-2012-conll`).''' ONTONOTES5_CONLL12_NER_CHINESE_DEV = _ONTONOTES5_CONLL12_CHINESE_HOME + 'development.chinese.conll12.ner.tsv' '''Dev set of OntoNotes5 used in CoNLL12 (:cite:`pradhan-etal-2012-conll`).''' ONTONOTES5_CONLL12_NER_CHINESE_TEST = _ONTONOTES5_CONLL12_CHINESE_HOME + 'test.chinese.conll12.ner.tsv' '''Test set of OntoNotes5 used in CoNLL12 (:cite:`pradhan-etal-2012-conll`).''' ONTONOTES5_CHINESE_TRAIN = _ONTONOTES5_CONLL12_CHINESE_HOME + 'train.chinese.v4.jsonlines' ONTONOTES5_CHINESE_DEV = _ONTONOTES5_CONLL12_CHINESE_HOME + 'development.chinese.v4.jsonlines' ONTONOTES5_CHINESE_TEST = _ONTONOTES5_CONLL12_CHINESE_HOME + 'test.chinese.v4.jsonlines' ONTONOTES5_CONLL_CHINESE_TRAIN = _ONTONOTES5_CONLL12_CHINESE_HOME + 'train.chinese.v4_gold_conll' ONTONOTES5_CONLL_CHINESE_DEV = _ONTONOTES5_CONLL12_CHINESE_HOME + 'development.chinese.v4_gold_conll' ONTONOTES5_CONLL_CHINESE_TEST = _ONTONOTES5_CONLL12_CHINESE_HOME + 'test.chinese.v4_gold_conll' ONTONOTES5_POS_CHINESE_TRAIN = _ONTONOTES5_CONLL12_CHINESE_HOME + 'train.chinese.v4.pos.tsv' ONTONOTES5_POS_CHINESE_DEV = _ONTONOTES5_CONLL12_CHINESE_HOME + 'development.chinese.v4.pos.tsv' ONTONOTES5_POS_CHINESE_TEST = _ONTONOTES5_CONLL12_CHINESE_HOME + 'test.chinese.v4.pos.tsv' ONTONOTES5_CON_CHINESE_TRAIN = _ONTONOTES5_CONLL12_CHINESE_HOME + 'train.chinese.con.txt' ONTONOTES5_CON_CHINESE_DEV = _ONTONOTES5_CONLL12_CHINESE_HOME + 'development.chinese.con.txt' ONTONOTES5_CON_CHINESE_TEST = _ONTONOTES5_CONLL12_CHINESE_HOME + 'test.chinese.con.txt' ONTONOTES5_DEP_CHINESE_TRAIN = _ONTONOTES5_CONLL12_CHINESE_HOME + 'train.chinese.dep.conllx' ONTONOTES5_DEP_CHINESE_DEV = _ONTONOTES5_CONLL12_CHINESE_HOME + 'development.chinese.dep.conllx' ONTONOTES5_DEP_CHINESE_TEST = _ONTONOTES5_CONLL12_CHINESE_HOME + 'test.chinese.dep.conllx' # ONTONOTES5_CON_CHINESE_NOEC_TRAIN = _ONTONOTES5_CONLL12_CHINESE_HOME + 'train.chinese.con.noempty.txt' # ONTONOTES5_CON_CHINESE_NOEC_DEV = 
_ONTONOTES5_CONLL12_CHINESE_HOME + 'development.chinese.con.noempty.txt' # ONTONOTES5_CON_CHINESE_NOEC_TEST = _ONTONOTES5_CONLL12_CHINESE_HOME + 'test.chinese.con.noempty.txt' ONTONOTES5_NER_CHINESE_TRAIN = _ONTONOTES5_CONLL12_CHINESE_HOME + 'train.chinese.v4.ner.tsv' ONTONOTES5_NER_CHINESE_DEV = _ONTONOTES5_CONLL12_CHINESE_HOME + 'development.chinese.v4.ner.tsv' ONTONOTES5_NER_CHINESE_TEST = _ONTONOTES5_CONLL12_CHINESE_HOME + 'test.chinese.v4.ner.tsv' try: get_resource(ONTONOTES5_HOME, verbose=False) except HTTPError: intended_file_path = path_from_url(ONTONOTES5_HOME) cprint('Ontonotes 5.0 is a [red][bold]copyright[/bold][/red] dataset owned by LDC which we cannot re-distribute. ' f'Please apply for a licence from LDC (https://catalog.ldc.upenn.edu/LDC2016T13) ' f'then download it to {intended_file_path}') cprint('Luckily, an [red]unofficial[/red] Chinese version is provided on GitHub ' 'which will be used for demonstration purpose.') unofficial_chinese = get_resource('https://github.com/GuocaiL/Coref_Resolution/archive/master.zip#data/') intended_home, _ = os.path.splitext(intended_file_path) intended_home = os.path.join(os.path.dirname(intended_home), 'ontonotes-release-5.0') intended_chinese = f'{intended_home}/data/files/data/chinese/' # print(os.path.dirname(intended_chinese)) # print(unofficial_chinese) # print(intended_chinese) for folder in ['annotations', 'metadata']: flash(f'Copying {unofficial_chinese}{folder} to {intended_chinese}{folder} [blink][yellow]...[/yellow][/blink]') shutil.copytree(f'{unofficial_chinese}{folder}', f'{intended_chinese}{folder}') flash('') try: get_resource(ONTONOTES5_CONLL12_CHINESE_TRAIN, verbose=False) except HTTPError: make_gold_conll(ONTONOTES5_HOME + '..', 'chinese') make_ontonotes_language_jsonlines(CONLL12_HOME + 'v4', language='chinese') batch_make_ner_tsv_if_necessary( [ONTONOTES5_CONLL12_CHINESE_TRAIN, ONTONOTES5_CONLL12_CHINESE_DEV, ONTONOTES5_CONLL12_CHINESE_TEST]) batch_make_ner_tsv_if_necessary( 
[ONTONOTES5_CONLL12_CHINESE_TRAIN, ONTONOTES5_CONLL12_CHINESE_DEV, ONTONOTES5_CONLL12_CHINESE_TEST]) batch_make_ner_tsv_if_necessary( [ONTONOTES5_CHINESE_TRAIN, ONTONOTES5_CHINESE_DEV, ONTONOTES5_CHINESE_TEST]) batch_make_pos_tsv_if_necessary( [ONTONOTES5_CHINESE_TRAIN, ONTONOTES5_CHINESE_DEV, ONTONOTES5_CHINESE_TEST]) batch_make_con_txt_if_necessary( [ONTONOTES5_CONLL_CHINESE_TRAIN, ONTONOTES5_CONLL_CHINESE_DEV, ONTONOTES5_CONLL_CHINESE_TEST]) batch_make_dep_conllx_if_necessary( [ONTONOTES5_CON_CHINESE_TRAIN, ONTONOTES5_CON_CHINESE_DEV, ONTONOTES5_CON_CHINESE_TEST], language='zh') # batch_remove_empty_category_if_necessary( # [ONTONOTES5_CON_CHINESE_TRAIN, ONTONOTES5_CON_CHINESE_DEV, ONTONOTES5_CON_CHINESE_TEST]) ================================================ FILE: hanlp/datasets/srl/ontonotes5/english.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2020-12-25 18:48 from urllib.error import HTTPError from hanlp.datasets.srl.ontonotes5 import ONTONOTES5_HOME, CONLL12_HOME from hanlp.datasets.srl.ontonotes5._utils import make_gold_conll, make_ontonotes_language_jsonlines, \ batch_make_ner_tsv_if_necessary, batch_make_pos_tsv_if_necessary, batch_make_con_txt_if_necessary, \ batch_make_dep_conllx_if_necessary from hanlp.utils.io_util import get_resource, path_from_url from hanlp.utils.log_util import cprint _ONTONOTES5_ENGLISH_HOME = ONTONOTES5_HOME + 'files/data/english/' _ONTONOTES5_CONLL12_ENGLISH_HOME = CONLL12_HOME + 'english/' ONTONOTES5_CONLL12_ENGLISH_TRAIN = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'train.english.conll12.jsonlines' '''Training set of English OntoNotes5 used in CoNLL12 (:cite:`pradhan-etal-2012-conll`).''' ONTONOTES5_CONLL12_ENGLISH_DEV = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'development.english.conll12.jsonlines' '''Dev set of English OntoNotes5 used in CoNLL12 (:cite:`pradhan-etal-2012-conll`).''' ONTONOTES5_CONLL12_ENGLISH_TEST = _ONTONOTES5_CONLL12_ENGLISH_HOME + 
'test.english.conll12.jsonlines' '''Test set of English OntoNotes5 used in CoNLL12 (:cite:`pradhan-etal-2012-conll`).''' ONTONOTES5_ENGLISH_TRAIN = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'train.english.v4.jsonlines' ONTONOTES5_ENGLISH_DEV = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'development.english.v4.jsonlines' ONTONOTES5_ENGLISH_TEST = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'test.english.v4.jsonlines' ONTONOTES5_CONLL_ENGLISH_TRAIN = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'train.english.v4_gold_conll' ONTONOTES5_CONLL_ENGLISH_DEV = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'development.english.v4_gold_conll' ONTONOTES5_CONLL_ENGLISH_TEST = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'test.english.v4_gold_conll' ONTONOTES5_POS_ENGLISH_TRAIN = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'train.english.v4.pos.tsv' ONTONOTES5_POS_ENGLISH_DEV = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'development.english.v4.pos.tsv' ONTONOTES5_POS_ENGLISH_TEST = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'test.english.v4.pos.tsv' ONTONOTES5_CON_ENGLISH_TRAIN = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'train.english.con.txt' ONTONOTES5_CON_ENGLISH_DEV = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'development.english.con.txt' ONTONOTES5_CON_ENGLISH_TEST = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'test.english.con.txt' ONTONOTES5_DEP_ENGLISH_TRAIN = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'train.english.dep.conllx' ONTONOTES5_DEP_ENGLISH_DEV = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'development.english.dep.conllx' ONTONOTES5_DEP_ENGLISH_TEST = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'test.english.dep.conllx' # ONTONOTES5_CON_ENGLISH_NOEC_TRAIN = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'train.english.con.noempty.txt' # ONTONOTES5_CON_ENGLISH_NOEC_DEV = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'development.english.con.noempty.txt' # ONTONOTES5_CON_ENGLISH_NOEC_TEST = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'test.english.con.noempty.txt' ONTONOTES5_CONLL12_NER_ENGLISH_TRAIN = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'train.english.conll12.ner.tsv' '''Training set of English OntoNotes5 used in 
CoNLL12 (:cite:`pradhan-etal-2012-conll`).''' ONTONOTES5_CONLL12_NER_ENGLISH_DEV = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'development.english.conll12.ner.tsv' '''Dev set of English OntoNotes5 used in CoNLL12 (:cite:`pradhan-etal-2012-conll`).''' ONTONOTES5_CONLL12_NER_ENGLISH_TEST = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'test.english.conll12.ner.tsv' '''Test set of English OntoNotes5 used in CoNLL12 (:cite:`pradhan-etal-2012-conll`).''' ONTONOTES5_NER_ENGLISH_TRAIN = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'train.english.v4.ner.tsv' ONTONOTES5_NER_ENGLISH_DEV = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'development.english.v4.ner.tsv' ONTONOTES5_NER_ENGLISH_TEST = _ONTONOTES5_CONLL12_ENGLISH_HOME + 'test.english.v4.ner.tsv' try: get_resource(ONTONOTES5_HOME, verbose=False) except HTTPError: intended_file_path = path_from_url(ONTONOTES5_HOME) cprint('Ontonotes 5.0 is a [red][bold]copyright[/bold][/red] dataset owned by LDC which we cannot re-distribute. ' f'Please apply for a licence from LDC (https://catalog.ldc.upenn.edu/LDC2016T13) ' f'then download it to {intended_file_path}') exit(1) try: get_resource(ONTONOTES5_CONLL12_ENGLISH_TRAIN, verbose=False) except HTTPError: make_gold_conll(ONTONOTES5_HOME + '..', 'english') make_ontonotes_language_jsonlines(CONLL12_HOME + 'v4', language='english') batch_make_ner_tsv_if_necessary( [ONTONOTES5_CONLL12_ENGLISH_TRAIN, ONTONOTES5_CONLL12_ENGLISH_DEV, ONTONOTES5_CONLL12_ENGLISH_TEST]) batch_make_ner_tsv_if_necessary( [ONTONOTES5_ENGLISH_TRAIN, ONTONOTES5_ENGLISH_DEV, ONTONOTES5_ENGLISH_TEST]) batch_make_pos_tsv_if_necessary( [ONTONOTES5_ENGLISH_TRAIN, ONTONOTES5_ENGLISH_DEV, ONTONOTES5_ENGLISH_TEST]) batch_make_con_txt_if_necessary( [ONTONOTES5_CONLL_ENGLISH_TRAIN, ONTONOTES5_CONLL_ENGLISH_DEV, ONTONOTES5_CONLL_ENGLISH_TEST]) batch_make_dep_conllx_if_necessary( [ONTONOTES5_CON_ENGLISH_TRAIN, ONTONOTES5_CON_ENGLISH_DEV, ONTONOTES5_CON_ENGLISH_TEST]) # batch_remove_empty_category_if_necessary( # [ONTONOTES5_CON_ENGLISH_TRAIN, 
class SemanticTextualSimilarityDataset(TransformableDataset):
    """A dataset of sentence pairs annotated with a similarity score, read from a delimited text file."""

    def __init__(self,
                 data: Union[str, List],
                 sent_a_col,
                 sent_b_col,
                 similarity_col,
                 delimiter='auto',
                 transform: Union[Callable, List] = None,
                 cache=None,
                 generate_idx=None) -> None:
        """
        Args:
            data: Forwarded unchanged to the base class constructor.
            sent_a_col: Index of the cell holding the first sentence.
            sent_b_col: Index of the cell holding the second sentence.
            similarity_col: Index of the cell holding the similarity score.
            delimiter: Forwarded to ``read_cells`` as its ``delimiter`` argument.
            transform: Forwarded unchanged to the base class constructor.
            cache: Forwarded unchanged to the base class constructor.
            generate_idx: Forwarded unchanged to the base class constructor.
        """
        # The column settings are stored before calling the base constructor.
        # NOTE(review): presumably because the base constructor may already call ``load_file`` - confirm.
        self.delimiter = delimiter
        self.similarity_col = similarity_col
        self.sent_b_col = sent_b_col
        self.sent_a_col = sent_a_col
        super().__init__(data, transform, cache, generate_idx)

    def load_file(self, filepath: str):
        """Yield one sample per row of ``filepath``.

        Args:
            filepath: Path passed to ``read_cells``.

        Yields:
            A dict with ``sent_a`` and ``sent_b`` taken from the configured columns and ``similarity``
            converted to ``float``.
        """
        for cells in read_cells(filepath, strip=True, delimiter=self.delimiter):
            yield {
                'sent_a': cells[self.sent_a_col],
                'sent_b': cells[self.sent_b_col],
                'similarity': float(cells[self.similarity_col])
            }
class ChunkingDataset(TransformableDataset):
    """A character-level tagging dataset built from one text per line."""

    def __init__(self, data: Union[str, List], transform: Union[Callable, List] = None, cache=None,
                 generate_idx=None, max_seq_len=None, sent_delimiter=None) -> None:
        """
        Args:
            data: Forwarded unchanged to the base class constructor.
            transform: Forwarded unchanged to the base class constructor.
            cache: Forwarded unchanged to the base class constructor.
            generate_idx: Forwarded unchanged to the base class constructor.
            max_seq_len: Texts with more characters than this are cut into shorter pieces. A falsy value
                disables splitting.
            sent_delimiter: Decides where a long text may be cut. Either a callable taking one character
                and returning a truthy value for delimiters, or a ``str`` whose characters are the
                delimiters. A falsy value selects ``ispunct``.
        """
        if not sent_delimiter:
            sent_delimiter = ispunct
        elif isinstance(sent_delimiter, str):
            # The character set must live under its own name: the previous code rebound
            # ``sent_delimiter`` to a lambda that looked up ``sent_delimiter`` again, so the lambda tested
            # membership in itself and raised ``TypeError`` on first use.
            sent_delimiter = frozenset(sent_delimiter).__contains__
        self.sent_delimiter = sent_delimiter
        self.max_seq_len = max_seq_len
        super().__init__(data, transform, cache, generate_idx)

    def load_file(self, filepath):
        """Yield ``{'char': chars, 'tag': tags}`` samples read from ``filepath``."""
        max_seq_len = self.max_seq_len
        delimiter = self.sent_delimiter
        for chars, tags in self._generate_chars_tags(filepath, delimiter, max_seq_len):
            yield {'char': chars, 'tag': tags}

    @staticmethod
    def _generate_chars_tags(filepath, delimiter, max_seq_len):
        """Yield ``(chars, tags)`` pairs for every line of ``filepath``, splitting over-long lines.

        Args:
            filepath: Resolved through ``get_resource`` and then opened as UTF-8 text.
            delimiter: Callable deciding whether a character is a valid split point.
            max_seq_len: Length above which a line is split. A falsy value disables splitting.
        """
        filepath = get_resource(filepath)
        with open(filepath, encoding='utf8') as src:
            for text in src:
                # NOTE(review): ``bmes_of(text, True)`` presumably returns two parallel lists - confirm.
                chars, tags = bmes_of(text, True)
                if max_seq_len and delimiter and len(chars) > max_seq_len:
                    short_chars, short_tags = [], []
                    for char, tag in zip(chars, tags):
                        short_chars.append(char)
                        short_tags.append(tag)
                        # Only cut at a delimiter, so a piece can exceed max_seq_len when none shows up.
                        if len(short_chars) >= max_seq_len and delimiter(char):
                            yield short_chars, short_tags
                            short_chars, short_tags = [], []
                    if short_chars:
                        yield short_chars, short_tags
                else:
                    yield chars, tags
def append_criteria_token(sample: dict, criteria_tokens: Dict[str, int], criteria_token_map: dict) -> dict:
    """Append the special token id that stands for the sample's criteria to its input ids.

    Args:
        sample: A dict with ``criteria`` and ``token_input_ids``. It is modified in place:
            ``token_token_type_ids`` becomes zeros for the existing ids followed by a single ``1``, and
            the criteria token id is appended to ``token_input_ids``.
        criteria_tokens: Maps each available special token to its id.
        criteria_token_map: Maps criteria to their assigned special token. A criteria seen for the first
            time is assigned the next token of ``criteria_tokens`` (by position) and recorded here.

    Returns:
        The same ``sample`` object.
    """
    criteria = sample['criteria']
    token = criteria_token_map.get(criteria)
    if not token:
        # First occurrence of this criteria: take the token at the position given by the number of
        # criteria assigned so far.
        candidates = list(criteria_tokens)
        assigned = len(criteria_token_map)
        assert assigned + 1 < len(candidates), f'No unused token available for criteria {criteria}. ' \
                                               f'Current criteria_token_map = {criteria_token_map}'
        token = candidates[assigned]
        criteria_token_map[criteria] = token
    input_ids = sample['token_input_ids']
    sample['token_token_type_ids'] = [0] * len(input_ids) + [1]
    sample['token_input_ids'] = input_ids + [criteria_tokens[token]]
    return sample
def load_file(self, filepath: str):
    """Load tokenized corpus. The format is one sentence per line, where each line consists of tokens separated
    by a delimiter (usually space).

    .. highlight:: bash
    .. code-block:: bash

        $ head train.txt
        上海 浦东 开发 与 法制 建设 同步
        新华社 上海 二月 十日 电 ( 记者 谢金虎 、 张持坚 )

    Args:
        filepath: The path to the corpus.

    Yields:
        A dict whose only key ``'token'`` holds the tokens of one sentence, or of one piece of a
        sentence that was longer than ``max_seq_len``.
    """
    f = TimingFileIterator(filepath)
    # longest_sent = 0
    for line in f:
        line = line.rstrip('\n')
        tokens = line.split(self.delimiter)
        # Skip lines that produce no token at all.
        if not tokens:
            continue
        # The length compared against max_seq_len is the total number of characters over all tokens.
        if self.max_seq_len and sum(len(t) for t in tokens) > self.max_seq_len:
            # debug = []
            # NOTE(review): split_long_sentence_into presumably yields shorter token lists - confirm.
            for short_sents in split_long_sentence_into(tokens, self.max_seq_len, self.sent_delimiter,
                                                        char_level=self.char_level,
                                                        hard_constraint=self.hard_constraint):
                # debug.extend(short_sents)
                # longest_sent = max(longest_sent, len(''.join(short_sents)))
                yield {'token': short_sents}
            # assert debug == tokens
        else:
            # longest_sent = max(longest_sent, len(''.join(tokens)))
            yield {'token': tokens}
        # Report progress with the first 20 characters of the line that was just processed.
        f.log(line[:20])
    f.erase()
    # print(f'Longest sent: {longest_sent} in {filepath}')
def subtokens_group_to_subtokens(tokens, subtoken_offsets_group):
    """Flatten per-token subtoken offsets into the list of subtoken strings.

    Args:
        tokens: A sequence of tokens.
        subtoken_offsets_group: For each token, an iterable of ``(begin, end)`` offsets into that token.

    Returns:
        The slices ``token[begin:end]`` in order. Pairing stops at the shorter of the two inputs.
    """
    return [
        token[begin:end]
        for offsets, token in zip(subtoken_offsets_group, tokens)
        for begin, end in offsets
    ]
{train} {valid}') ================================================ FILE: hanlp/datasets/tokenization/sighan2005/as_.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2019-12-21 15:42 from hanlp.datasets.tokenization.sighan2005 import SIGHAN2005, make SIGHAN2005_AS_DICT = SIGHAN2005 + "#" + "gold/as_training_words.utf8" '''Dictionary built on trainings set.''' SIGHAN2005_AS_TRAIN_ALL = SIGHAN2005 + "#" + "training/as_training.utf8" '''Full training set.''' SIGHAN2005_AS_TRAIN = SIGHAN2005 + "#" + "training/as_training_90.txt" '''Training set (first 90% of the full official training set).''' SIGHAN2005_AS_DEV = SIGHAN2005 + "#" + "training/as_training_10.txt" '''Dev set (last 10% of full official training set).''' SIGHAN2005_AS_TEST_INPUT = SIGHAN2005 + "#" + "testing/as_testing.utf8" '''Test input.''' SIGHAN2005_AS_TEST = SIGHAN2005 + "#" + "gold/as_testing_gold.utf8" '''Test set.''' make(SIGHAN2005_AS_TRAIN) ================================================ FILE: hanlp/datasets/tokenization/sighan2005/cityu.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2019-12-21 15:42 from hanlp.datasets.tokenization.sighan2005 import SIGHAN2005, make SIGHAN2005_CITYU_DICT = SIGHAN2005 + "#" + "gold/cityu_training_words.utf8" '''Dictionary built on trainings set.''' SIGHAN2005_CITYU_TRAIN_ALL = SIGHAN2005 + "#" + "training/cityu_training.utf8" '''Full training set.''' SIGHAN2005_CITYU_TRAIN = SIGHAN2005 + "#" + "training/cityu_training_90.txt" '''Training set (first 90% of the full official training set).''' SIGHAN2005_CITYU_DEV = SIGHAN2005 + "#" + "training/cityu_training_10.txt" '''Dev set (last 10% of full official training set).''' SIGHAN2005_CITYU_TEST_INPUT = SIGHAN2005 + "#" + "testing/cityu_test.utf8" '''Test input.''' SIGHAN2005_CITYU_TEST = SIGHAN2005 + "#" + "gold/cityu_test_gold.utf8" '''Test set.''' make(SIGHAN2005_CITYU_TRAIN) 
================================================ FILE: hanlp/datasets/tokenization/sighan2005/msr.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2019-12-21 15:42 from hanlp.datasets.tokenization.sighan2005 import SIGHAN2005, make SIGHAN2005_MSR_DICT = SIGHAN2005 + "#" + "gold/msr_training_words.utf8" '''Dictionary built on trainings set.''' SIGHAN2005_MSR_TRAIN_ALL = SIGHAN2005 + "#" + "training/msr_training.utf8" '''Full training set.''' SIGHAN2005_MSR_TRAIN = SIGHAN2005 + "#" + "training/msr_training_90.txt" '''Training set (first 90% of the full official training set).''' SIGHAN2005_MSR_DEV = SIGHAN2005 + "#" + "training/msr_training_10.txt" '''Dev set (last 10% of full official training set).''' SIGHAN2005_MSR_TEST_INPUT = SIGHAN2005 + "#" + "testing/msr_test.utf8" '''Test input.''' SIGHAN2005_MSR_TEST = SIGHAN2005 + "#" + "gold/msr_test_gold.utf8" '''Test set.''' make(SIGHAN2005_MSR_TRAIN) ================================================ FILE: hanlp/datasets/tokenization/sighan2005/pku.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2019-12-21 15:42 from hanlp.datasets.tokenization.sighan2005 import SIGHAN2005, make SIGHAN2005_PKU_DICT = SIGHAN2005 + "#" + "gold/pku_training_words.utf8" '''Dictionary built on trainings set.''' SIGHAN2005_PKU_TRAIN_ALL = SIGHAN2005 + "#" + "training/pku_training.utf8" '''Full training set.''' SIGHAN2005_PKU_TRAIN = SIGHAN2005 + "#" + "training/pku_training_90.txt" '''Training set (first 90% of the full official training set).''' SIGHAN2005_PKU_DEV = SIGHAN2005 + "#" + "training/pku_training_10.txt" '''Dev set (last 10% of full official training set).''' SIGHAN2005_PKU_TEST_INPUT = SIGHAN2005 + "#" + "testing/pku_test.utf8" '''Test input.''' SIGHAN2005_PKU_TEST = SIGHAN2005 + "#" + "gold/pku_test_gold.utf8" '''Test set.''' make(SIGHAN2005_PKU_TRAIN) ================================================ FILE: 
hanlp/layers/__init__.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2019-10-26 00:50 ================================================ FILE: hanlp/layers/cnn_encoder.py ================================================ from typing import Optional, Tuple import torch from torch.nn import Conv1d, Linear class CnnEncoder(torch.nn.Module): """ A `CnnEncoder` is a combination of multiple convolution layers and max pooling layers. As a [`Seq2VecEncoder`](./seq2vec_encoder.md), the input to this module is of shape `(batch_size, num_tokens, input_dim)`, and the output is of shape `(batch_size, output_dim)`. The CNN has one convolution layer for each ngram filter size. Each convolution operation gives out a vector of size num_filters. The number of times a convolution layer will be used is `num_tokens - ngram_size + 1`. The corresponding maxpooling layer aggregates all these outputs from the convolution layer and outputs the max. This operation is repeated for every ngram size passed, and consequently the dimensionality of the output after maxpooling is `len(ngram_filter_sizes) * num_filters`. This then gets (optionally) projected down to a lower dimensional output, specified by `output_dim`. We then use a fully connected layer to project in back to the desired output_dim. For more details, refer to "A Sensitivity Analysis of (and Practitioners’ Guide to) Convolutional Neural Networks for Sentence Classification", Zhang and Wallace 2016, particularly Figure 1. Registered as a `Seq2VecEncoder` with name "cnn". # Parameters embedding_dim : `int`, required This is the input dimension to the encoder. We need this because we can't do shape inference in pytorch, and we need to know what size filters to construct in the CNN. num_filters : `int`, required This is the output dim for each convolutional layer, which is the number of "filters" learned by that layer. 
ngram_filter_sizes : `Tuple[int]`, optional (default=`(2, 3, 4, 5)`) This specifies both the number of convolutional layers we will create and their sizes. The default of `(2, 3, 4, 5)` will have four convolutional layers, corresponding to encoding ngrams of size 2 to 5 with some number of filters. conv_layer_activation : `Activation`, optional (default=`torch.nn.ReLU`) Activation to use after the convolution layers. output_dim : `Optional[int]`, optional (default=`None`) After doing convolutions and pooling, we'll project the collected features into a vector of this size. If this value is `None`, we will just return the result of the max pooling, giving an output of shape `len(ngram_filter_sizes) * num_filters`. """ def __init__( self, embedding_dim: int, num_filters: int, ngram_filter_sizes: Tuple[int, ...] = (2, 3, 4, 5), conv_layer_activation: str = 'ReLU', output_dim: Optional[int] = None, ) -> None: super().__init__() self._embedding_dim = embedding_dim self._num_filters = num_filters self._ngram_filter_sizes = ngram_filter_sizes self._activation = getattr(torch.nn, conv_layer_activation)() self._output_dim = output_dim self._convolution_layers = [ Conv1d( in_channels=self._embedding_dim, out_channels=self._num_filters, kernel_size=ngram_size, ) for ngram_size in self._ngram_filter_sizes ] for i, conv_layer in enumerate(self._convolution_layers): self.add_module("conv_layer_%d" % i, conv_layer) maxpool_output_dim = self._num_filters * len(self._ngram_filter_sizes) if self._output_dim: self.projection_layer = Linear(maxpool_output_dim, self._output_dim) else: self.projection_layer = None self._output_dim = maxpool_output_dim def get_input_dim(self) -> int: return self._embedding_dim def get_output_dim(self) -> int: return self._output_dim def forward(self, tokens: torch.Tensor, mask: torch.BoolTensor): if mask is not None: tokens = tokens * mask.unsqueeze(-1) # Our input is expected to have shape `(batch_size, num_tokens, embedding_dim)`. 
The # convolution layers expect input of shape `(batch_size, in_channels, sequence_length)`, # where the conv layer `in_channels` is our `embedding_dim`. We thus need to transpose the # tensor first. tokens = torch.transpose(tokens, 1, 2) # Each convolution layer returns output of size `(batch_size, num_filters, pool_length)`, # where `pool_length = num_tokens - ngram_size + 1`. We then do an activation function, # then do max pooling over each filter for the whole input sequence. Because our max # pooling is simple, we just use `torch.max`. The resultant tensor of has shape # `(batch_size, num_conv_layers * num_filters)`, which then gets projected using the # projection layer, if requested. filter_outputs = [] for i in range(len(self._convolution_layers)): convolution_layer = getattr(self, "conv_layer_{}".format(i)) filter_outputs.append(self._activation(convolution_layer(tokens)).max(dim=2)[0]) # Now we have a list of `num_conv_layers` tensors of shape `(batch_size, num_filters)`. # Concatenating them gives us a tensor of shape `(batch_size, num_filters * num_conv_layers)`. 
class CRF(nn.Module):
    """Linear-chain conditional random field [LMP01]_.

    Calling the module (``forward``) computes the log likelihood of a sequence of tags
    given an emission score tensor. `~CRF.decode` finds the best tag sequence for an
    emission score tensor using the `Viterbi algorithm`_.

    Masks may be of any dtype (bool, uint8, integer or float); non-zero entries mark
    valid timesteps. They are converted to ``torch.bool`` internally because
    ``torch.where`` expects a boolean condition.

    Args:
        num_tags: Number of tags.
        batch_first: Whether the first dimension corresponds to the size of a minibatch.

    Attributes:
        start_transitions (`~torch.nn.Parameter`): Start transition score tensor of size
            ``(num_tags,)``.
        end_transitions (`~torch.nn.Parameter`): End transition score tensor of size
            ``(num_tags,)``.
        transitions (`~torch.nn.Parameter`): Transition score tensor of size
            ``(num_tags, num_tags)``.

    .. [LMP01] Lafferty, J., McCallum, A., Pereira, F. (2001).
       "Conditional random fields: Probabilistic models for segmenting and
       labeling sequence data". *Proc. 18th International Conf. on Machine
       Learning*. Morgan Kaufmann. pp. 282–289.

    .. _Viterbi algorithm: https://en.wikipedia.org/wiki/Viterbi_algorithm
    """

    def __init__(self, num_tags: int, batch_first: bool = True) -> None:
        if num_tags <= 0:
            raise ValueError(f'invalid number of tags: {num_tags}')
        super().__init__()
        self.num_tags = num_tags
        self.batch_first = batch_first
        self.start_transitions = nn.Parameter(torch.empty(num_tags))
        self.end_transitions = nn.Parameter(torch.empty(num_tags))
        self.transitions = nn.Parameter(torch.empty(num_tags, num_tags))

        self.reset_parameters()

    def reset_parameters(self) -> None:
        """Initialize the transition parameters.

        The parameters will be initialized randomly from a uniform distribution
        between -0.1 and 0.1.
        """
        nn.init.uniform_(self.start_transitions, -0.1, 0.1)
        nn.init.uniform_(self.end_transitions, -0.1, 0.1)
        nn.init.uniform_(self.transitions, -0.1, 0.1)

    def __repr__(self) -> str:
        return f'{self.__class__.__name__}(num_tags={self.num_tags})'

    @staticmethod
    def _as_bool_mask(mask: Optional[torch.Tensor]) -> Optional[torch.Tensor]:
        """Return ``mask`` as a ``torch.bool`` tensor (``None`` is passed through)."""
        if mask is None or mask.dtype == torch.bool:
            return mask
        # Non-zero means "valid timestep", whatever the incoming dtype is.
        return mask != 0

    def forward(
            self,
            emissions: torch.Tensor,
            tags: torch.LongTensor,
            mask: Optional[torch.Tensor] = None,
            reduction: str = 'sum',
    ) -> torch.Tensor:
        """Compute the conditional log likelihood of a sequence of tags given emission scores.

        Args:
            emissions (`~torch.Tensor`): Emission score tensor of size
                ``(seq_length, batch_size, num_tags)`` if ``batch_first`` is ``False``,
                ``(batch_size, seq_length, num_tags)`` otherwise.
            tags (`~torch.LongTensor`): Sequence of tags tensor of size
                ``(seq_length, batch_size)`` if ``batch_first`` is ``False``,
                ``(batch_size, seq_length)`` otherwise.
            mask (`~torch.Tensor`): Mask tensor of size ``(seq_length, batch_size)``
                if ``batch_first`` is ``False``, ``(batch_size, seq_length)`` otherwise.
                Any dtype is accepted; non-zero entries are valid timesteps.
            reduction: Specifies the reduction to apply to the output:
                ``none|sum|mean|token_mean``. ``none``: no reduction will be applied.
                ``sum``: the output will be summed over batches. ``mean``: the output will be
                averaged over batches. ``token_mean``: the output will be averaged over tokens.

        Returns:
            `~torch.Tensor`: The log likelihood. This will have size ``(batch_size,)`` if
            reduction is ``none``, ``()`` otherwise.

        Raises:
            ValueError: If the shapes are inconsistent, the first timestep is masked out
                for some sample, or ``reduction`` is not one of the supported values.
        """
        mask = self._as_bool_mask(mask)
        self._validate(emissions, tags=tags, mask=mask)
        if reduction not in ('none', 'sum', 'mean', 'token_mean'):
            raise ValueError(f'invalid reduction: {reduction}')
        if mask is None:
            mask = torch.ones_like(tags, dtype=torch.bool)

        if self.batch_first:
            # The private helpers work in (seq_length, batch_size, ...) layout.
            emissions = emissions.transpose(0, 1)
            tags = tags.transpose(0, 1)
            mask = mask.transpose(0, 1)

        # shape: (batch_size,)
        numerator = self._compute_score(emissions, tags, mask)
        # shape: (batch_size,)
        denominator = self._compute_normalizer(emissions, mask)
        # shape: (batch_size,)
        llh = numerator - denominator

        if reduction == 'none':
            return llh
        if reduction == 'sum':
            return llh.sum()
        if reduction == 'mean':
            return llh.mean()
        assert reduction == 'token_mean'
        return llh.sum() / mask.type_as(emissions).sum()

    def decode(self, emissions: torch.Tensor,
               mask: Optional[torch.Tensor] = None) -> List[List[int]]:
        """Find the most likely tag sequence using Viterbi algorithm.

        Args:
            emissions (`~torch.Tensor`): Emission score tensor of size
                ``(seq_length, batch_size, num_tags)`` if ``batch_first`` is ``False``,
                ``(batch_size, seq_length, num_tags)`` otherwise.
            mask (`~torch.Tensor`): Mask tensor of size ``(seq_length, batch_size)``
                if ``batch_first`` is ``False``, ``(batch_size, seq_length)`` otherwise.
                Any dtype is accepted; non-zero entries are valid timesteps.

        Returns:
            List of list containing the best tag sequence for each batch.

        Raises:
            ValueError: If the shapes are inconsistent or the first timestep is masked
                out for some sample.
        """
        mask = self._as_bool_mask(mask)
        self._validate(emissions, mask=mask)
        if mask is None:
            mask = emissions.new_ones(emissions.shape[:2], dtype=torch.bool)

        if self.batch_first:
            emissions = emissions.transpose(0, 1)
            mask = mask.transpose(0, 1)

        return self._viterbi_decode(emissions, mask)

    def _validate(
            self,
            emissions: torch.Tensor,
            tags: Optional[torch.LongTensor] = None,
            mask: Optional[torch.Tensor] = None) -> None:
        """Check tensor shapes and that every sequence starts with a valid timestep."""
        if emissions.dim() != 3:
            raise ValueError(f'emissions must have dimension of 3, got {emissions.dim()}')
        if emissions.size(2) != self.num_tags:
            raise ValueError(
                f'expected last dimension of emissions is {self.num_tags}, '
                f'got {emissions.size(2)}')

        if tags is not None:
            if emissions.shape[:2] != tags.shape:
                raise ValueError(
                    'the first two dimensions of emissions and tags must match, '
                    f'got {tuple(emissions.shape[:2])} and {tuple(tags.shape)}')

        if mask is not None:
            if emissions.shape[:2] != mask.shape:
                raise ValueError(
                    'the first two dimensions of emissions and mask must match, '
                    f'got {tuple(emissions.shape[:2])} and {tuple(mask.shape)}')
            no_empty_seq = not self.batch_first and mask[0].all()
            no_empty_seq_bf = self.batch_first and mask[:, 0].all()
            if not no_empty_seq and not no_empty_seq_bf:
                raise ValueError('mask of the first timestep must all be on')

    def _compute_score(
            self, emissions: torch.Tensor, tags: torch.LongTensor,
            mask: torch.Tensor) -> torch.Tensor:
        """Score of the given tag sequences (the numerator of the likelihood)."""
        # emissions: (seq_length, batch_size, num_tags)
        # tags: (seq_length, batch_size)
        # mask: (seq_length, batch_size)
        assert emissions.dim() == 3 and tags.dim() == 2
        assert emissions.shape[:2] == tags.shape
        assert emissions.size(2) == self.num_tags
        assert mask.shape == tags.shape
        assert mask[0].all()

        seq_length, batch_size = tags.shape
        batch_index = torch.arange(batch_size)
        # Float copy so the mask can gate scores by multiplication.
        weights = mask.type_as(emissions)

        # Start transition score and first emission
        # shape: (batch_size,)
        score = self.start_transitions[tags[0]] + emissions[0, batch_index, tags[0]]

        for i in range(1, seq_length):
            # Transition and emission scores are only added if this timestep is valid
            # shape: (batch_size,)
            score = score + self.transitions[tags[i - 1], tags[i]] * weights[i]
            score = score + emissions[i, batch_index, tags[i]] * weights[i]

        # End transition score, taken at the last valid timestep of each sample
        # shape: (batch_size,)
        seq_ends = mask.long().sum(dim=0) - 1
        # shape: (batch_size,)
        last_tags = tags[seq_ends, batch_index]
        # shape: (batch_size,)
        return score + self.end_transitions[last_tags]

    def _compute_normalizer(
            self, emissions: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
        """Log partition function over all tag sequences (forward algorithm)."""
        # emissions: (seq_length, batch_size, num_tags)
        # mask: (seq_length, batch_size)
        assert emissions.dim() == 3 and mask.dim() == 2
        assert emissions.shape[:2] == mask.shape
        assert emissions.size(2) == self.num_tags
        assert mask[0].all()

        # torch.where below needs a boolean condition, also when called directly.
        mask = self._as_bool_mask(mask)
        seq_length = emissions.size(0)

        # Start transition score and first emission; score has size of
        # (batch_size, num_tags) where for each batch, the j-th column stores
        # the score that the first timestep has tag j
        # shape: (batch_size, num_tags)
        score = self.start_transitions + emissions[0]

        for i in range(1, seq_length):
            # Broadcast score for every possible next tag
            # shape: (batch_size, num_tags, 1)
            broadcast_score = score.unsqueeze(2)

            # Broadcast emission score for every possible current tag
            # shape: (batch_size, 1, num_tags)
            broadcast_emissions = emissions[i].unsqueeze(1)

            # Entry (i, j) is the total score of all tag sequences so far that end with
            # a transition from tag i to tag j and the emission of tag j
            # shape: (batch_size, num_tags, num_tags)
            next_score = broadcast_score + self.transitions + broadcast_emissions

            # Sum over all possible current tags; in log space this is a log-sum-exp
            # shape: (batch_size, num_tags)
            next_score = torch.logsumexp(next_score, dim=1)

            # Only advance samples for which this timestep is valid
            # shape: (batch_size, num_tags)
            score = torch.where(mask[i].unsqueeze(1), next_score, score)

        # End transition score
        # shape: (batch_size, num_tags)
        score = score + self.end_transitions

        # Sum (log-sum-exp) over all possible tags
        # shape: (batch_size,)
        return torch.logsumexp(score, dim=1)

    def _viterbi_decode(self, emissions: torch.FloatTensor,
                        mask: torch.Tensor) -> List[List[int]]:
        """Best tag sequence per sample, truncated to each sample's valid length."""
        # emissions: (seq_length, batch_size, num_tags)
        # mask: (seq_length, batch_size)
        assert emissions.dim() == 3 and mask.dim() == 2
        assert emissions.shape[:2] == mask.shape
        assert emissions.size(2) == self.num_tags
        assert mask[0].all()

        # torch.where below needs a boolean condition, also when called directly.
        mask = self._as_bool_mask(mask)
        seq_length, batch_size = mask.shape

        # Start transition and first emission
        # shape: (batch_size, num_tags)
        score = self.start_transitions + emissions[0]
        # history[t] holds, for every sample and tag j at timestep t + 1, the previous
        # tag that gives the best score; it is used to trace the best path backwards.
        history = []

        for i in range(1, seq_length):
            # Broadcast viterbi score for every possible next tag
            # shape: (batch_size, num_tags, 1)
            broadcast_score = score.unsqueeze(2)

            # Broadcast emission score for every possible current tag
            # shape: (batch_size, 1, num_tags)
            broadcast_emission = emissions[i].unsqueeze(1)

            # Entry (i, j) is the score of the best tag sequence so far that ends with
            # a transition from tag i to tag j and the emission of tag j
            # shape: (batch_size, num_tags, num_tags)
            next_score = broadcast_score + self.transitions + broadcast_emission

            # Find the maximum score over all possible current tags
            # shape: (batch_size, num_tags)
            next_score, indices = next_score.max(dim=1)

            # Only advance samples for which this timestep is valid
            # shape: (batch_size, num_tags)
            score = torch.where(mask[i].unsqueeze(1), next_score, score)
            history.append(indices)

        # End transition score
        # shape: (batch_size, num_tags)
        score = score + self.end_transitions

        # Index of the last valid timestep of every sample, as plain ints for slicing.
        seq_ends = (mask.long().sum(dim=0) - 1).tolist()
        best_tags_list = []

        for idx in range(batch_size):
            # The tag with the best final score is the tag of the last valid timestep
            _, best_last_tag = score[idx].max(dim=0)
            best_tags = [best_last_tag.item()]

            # Follow the backpointers; steps beyond the sample's length are skipped
            for hist in reversed(history[:seq_ends[idx]]):
                best_last_tag = hist[idx][best_tags[-1]]
                best_tags.append(best_last_tag.item())

            # Reverse the order because we start from the last timestep
            best_tags.reverse()
            best_tags_list.append(best_tags)

        return best_tags_list
class CRF(tf.keras.layers.Layer):
    """Conditional Random Field layer (tf.keras).

    `CRF` can be used as the last layer in a network (as a classifier). The last
    dimension of the input (features) must be equal to the number of classes the CRF can
    predict (a linear layer is recommended).

    Note: the loss and accuracy functions of networks using `CRF` must use the provided
    loss and accuracy functions (denoted as loss and viterbi_accuracy) as the
    classification of sequences are used with the layers internal weights.

    Copyright: this is a modified version of
    https://github.com/NervanaSystems/nlp-architect/blob/master/nlp_architect/nn/tensorflow/python/keras/layers/crf.py

    Args:
        num_classes (int): the number of labels to tag each temporal input.

    Input shape:
        nD tensor with shape `(batch_size, sentence length, num_classes)`.

    Output shape:
        nD tensor with shape: `(batch_size, sentence length, num_classes)`.
    """

    def __init__(self, num_classes, **kwargs):
        # The transition matrix is only created in `build`.
        self.transitions = None
        super(CRF, self).__init__(**kwargs)
        # num of output labels
        self.output_dim = int(num_classes)
        self.input_spec = tf.keras.layers.InputSpec(min_ndim=3)
        self.supports_masking = False
        # NOTE(review): this binds a plain name, not an attribute, so it has no visible
        # effect here; possibly `self.sequence_lengths` was intended - confirm.
        sequence_lengths = None

    def get_config(self):
        """Return the base layer config extended with this layer's settings.

        NOTE(review): the keys written here ('output_dim', 'supports_masking',
        'transitions') are not parameters of `__init__`, which takes `num_classes`;
        verify that re-creating the layer from this config works as intended.
        """
        config = {
            'output_dim': self.output_dim,
            'supports_masking': self.supports_masking,
            # Evaluates the transition weights to a concrete value.
            'transitions': tf.keras.backend.eval(self.transitions)
        }
        base_config = super(CRF, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

    def build(self, input_shape):
        """Validate the rank-3 input shape and create the `transitions` weight.

        Raises:
            ValueError: If the last input dimension is undefined or differs from
                `output_dim`.
        """
        assert len(input_shape) == 3
        f_shape = tf.TensorShape(input_shape)
        # Pin the size of the last axis for later input checks.
        input_spec = tf.keras.layers.InputSpec(min_ndim=3, axes={-1: f_shape[-1]})

        if f_shape[-1] is None:
            raise ValueError('The last dimension of the inputs to `CRF` '
                             'should be defined. Found `None`.')
        if f_shape[-1] != self.output_dim:
            raise ValueError('The last dimension of the input shape must be equal to output'
                             ' shape. Use a linear layer if needed.')
        self.input_spec = input_spec
        # Square [output_dim, output_dim] matrix passed to crf_decode as the transition
        # parameters.
        self.transitions = self.add_weight(name='transitions',
                                           shape=[self.output_dim, self.output_dim],
                                           initializer='glorot_uniform',
                                           trainable=True)
        self.built = True

    def compute_mask(self, inputs, mask=None):
        """Return the incoming mask unchanged."""
        # Just pass the received mask from previous layer, to the next layer or
        # manipulate it if this layer changes the shape of the input
        return mask

    # pylint: disable=arguments-differ
    def call(self, inputs, sequence_lengths=None, mask=None, training=None, **kwargs):
        """Decode `inputs` with Viterbi and return one-hot tags outside of training.

        Args:
            inputs: Scores to decode; converted to a tensor of this layer's dtype.
            sequence_lengths: Optional rank-2 int32 tensor whose second dimension is 1
                (enforced by the assertions below); flattened before use.
            mask: Used to derive the lengths (count of non-zero entries along axis 1)
                when `sequence_lengths` is not given.
            training: Not used in the body.
            **kwargs: Not used in the body.

        Returns:
            The result of `in_train_phase(sequences, output)`: the unmodified input
            scores in the training phase, the one-hot encoded Viterbi tags otherwise.
        """
        sequences = tf.convert_to_tensor(inputs, dtype=self.dtype)
        if sequence_lengths is not None:
            assert len(sequence_lengths.shape) == 2
            assert tf.convert_to_tensor(sequence_lengths).dtype == 'int32'
            seq_len_shape = tf.convert_to_tensor(sequence_lengths).get_shape().as_list()
            assert seq_len_shape[1] == 1
            sequence_lengths = tf.keras.backend.flatten(sequence_lengths)
        else:
            # NOTE(review): if `mask` is also None it is passed to count_nonzero as-is;
            # callers presumably always supply one of the two - confirm.
            sequence_lengths = tf.math.count_nonzero(mask, axis=1)

        viterbi_sequence, _ = crf_decode(sequences, self.transitions, sequence_lengths)
        output = tf.keras.backend.one_hot(viterbi_sequence, self.output_dim)
        return tf.keras.backend.in_train_phase(sequences, output)

    # Disabled in-layer loss, kept for reference:
    # def loss(self, y_true, y_pred):
    #     y_pred = tf.convert_to_tensor(y_pred, dtype=self.dtype)
    #     log_likelihood, self.transitions = \
    #         crf_log_likelihood(y_pred,
    #                            tf.cast(y_true, dtype=tf.int32),
    #                            sequence_lengths,
    #                            transition_params=self.transitions)
    #     return tf.reduce_mean(-log_likelihood)

    def compute_output_shape(self, input_shape):
        """Return the first two input dimensions followed by `output_dim`."""
        tf.TensorShape(input_shape).assert_has_rank(3)
        return input_shape[:2] + (self.output_dim,)

    @property
    def viterbi_accuracy(self):
        """Metric function comparing one-hot Viterbi decodings with `y_true`.

        The returned function treats every sequence as full length: the lengths are
        set to the second dimension of `y_pred` for all samples.
        """

        def accuracy(y_true, y_pred):
            shape = tf.shape(y_pred)
            # No masking here: every sample is decoded over all timesteps.
            sequence_lengths = tf.ones(shape[0], dtype=tf.int32) * (shape[1])
            viterbi_sequence, _ = crf_decode(y_pred, self.transitions, sequence_lengths)
            output = tf.keras.backend.one_hot(viterbi_sequence, self.output_dim)
            return tf.keras.metrics.categorical_accuracy(y_true, output)

        accuracy.func_name = 'viterbi_accuracy'
        return accuracy
def crf_sequence_score(inputs, tag_indices, sequence_lengths,
                       transition_params):
    """Computes the unnormalized score for a tag sequence.

    Args:
      inputs: A [batch_size, max_seq_len, num_tags] tensor of unary potentials
        to use as input to the CRF layer.
      tag_indices: A [batch_size, max_seq_len] matrix of tag indices for which
        we compute the unnormalized score.
      sequence_lengths: A [batch_size] vector of true sequence lengths.
      transition_params: The transition scores handed to `crf_binary_score`.

    Returns:
      sequence_scores: A [batch_size] vector of unnormalized sequence scores.
    """
    tag_indices = tf.cast(tag_indices, dtype=tf.int32)
    sequence_lengths = tf.cast(sequence_lengths, dtype=tf.int32)

    if inputs.shape[1] == 1:
        # Only one timestep: there are no transitions, so the score is just the unary
        # potential of the single tag of every example.
        batch_size = tf.shape(inputs, out_type=tag_indices.dtype)[0]
        row_ids = tf.reshape(
            tf.range(batch_size, dtype=tag_indices.dtype), [-1, 1])
        single_step_inputs = tf.squeeze(inputs, [1])
        gather_index = tf.concat([row_ids, tag_indices], axis=1)
        single_scores = tf.gather_nd(single_step_inputs, gather_index)
        # Sequences of length <= 0 contribute a score of zero.
        is_empty = tf.less_equal(sequence_lengths, 0)
        return tf.where(is_empty, tf.zeros_like(single_scores), single_scores)

    # General case: emission (unary) scores plus transition (binary) scores.
    unary = crf_unary_score(tag_indices, sequence_lengths, inputs)
    binary = crf_binary_score(tag_indices, sequence_lengths, transition_params)
    return unary + binary
def crf_log_norm(inputs, sequence_lengths, transition_params):
    """Computes the normalization for a CRF.

    Args:
      inputs: A [batch_size, max_seq_len, num_tags] tensor of unary potentials
        to use as input to the CRF layer.
      sequence_lengths: A [batch_size] vector of true sequence lengths.
      transition_params: The transition scores handed to `crf_forward`.

    Returns:
      log_norm: A [batch_size] vector of normalizers for a CRF.
    """
    sequence_lengths = tf.cast(sequence_lengths, dtype=tf.int32)

    # The first timestep serves as the initial state of the forward algorithm.
    first_input = tf.squeeze(tf.slice(inputs, [0, 0, 0], [-1, 1, -1]), [1])

    if inputs.shape[1] == 1:
        # A single timestep: the "alphas" are the unary potentials themselves.
        final_alphas = first_input
    else:
        # Run the forward algorithm over the remaining timesteps.
        rest_of_input = tf.slice(inputs, [0, 1, 0], [-1, -1, -1])
        final_alphas = crf_forward(rest_of_input, first_input, transition_params,
                                   sequence_lengths)

    log_norm = tf.reduce_logsumexp(final_alphas, [1])
    # Mask `log_norm` of the sequences with length <= zero.
    is_empty = tf.less_equal(sequence_lengths, 0)
    return tf.where(is_empty, tf.zeros_like(log_norm), log_norm)
def crf_unary_score(tag_indices, sequence_lengths, inputs):
    """Computes the unary scores of tag sequences.

    The score of a sequence is the sum, over its valid timesteps, of the potential
    that `inputs` assigns to the tag chosen at that timestep.

    Args:
      tag_indices: A [batch_size, max_seq_len] matrix of tag indices.
      sequence_lengths: A [batch_size] vector of true sequence lengths.
      inputs: The unary potentials; indexed here as a rank-3 tensor of
        [batch_size, max_seq_len, num_tags].

    Returns:
      unary_scores: A [batch_size] vector of unary scores.
    """
    assert len(tag_indices.shape) == 2, 'tag_indices: A [batch_size, max_seq_len] matrix of tag indices.'
    # After this cast the indices are always int32, the same dtype as the offsets
    # computed below from tf.shape/tf.range, so no int64 handling is needed.
    tag_indices = tf.cast(tag_indices, dtype=tf.int32)
    sequence_lengths = tf.cast(sequence_lengths, dtype=tf.int32)

    batch_size = tf.shape(inputs)[0]
    max_seq_len = tf.shape(inputs)[1]
    num_tags = tf.shape(inputs)[2]

    # Gather from the flattened potentials: the flat position of entry
    # (b, t, tag) is b * max_seq_len * num_tags + t * num_tags + tag.
    flattened_inputs = tf.reshape(inputs, [-1])

    offsets = tf.expand_dims(tf.range(batch_size) * max_seq_len * num_tags, 1)
    offsets += tf.expand_dims(tf.range(max_seq_len) * num_tags, 0)

    flattened_tag_indices = tf.reshape(offsets + tag_indices, [-1])

    unary_scores = tf.reshape(
        tf.gather(flattened_inputs, flattened_tag_indices),
        [batch_size, max_seq_len])

    # Zero out the timesteps beyond each sequence's true length before summing.
    masks = tf.sequence_mask(
        sequence_lengths, maxlen=tf.shape(tag_indices)[1], dtype=tf.float32)

    unary_scores = tf.reduce_sum(unary_scores * masks, 1)
    return unary_scores
def viterbi_decode(score, transition_params):
    """Decode the highest scoring sequence of tags outside of TensorFlow.

    This should only be used at test time.

    Args:
      score: A [seq_len, num_tags] matrix of unary potentials.
      transition_params: A [num_tags, num_tags] matrix of binary potentials.

    Returns:
      viterbi: A [seq_len] list of integers containing the highest scoring tag
        indices.
      viterbi_score: A float containing the score for the Viterbi sequence.
    """
    num_steps = score.shape[0]
    # best_scores[t, j]: score of the best path ending in tag j at step t.
    best_scores = np.zeros_like(score)
    # previous_tag[t, j]: tag at step t - 1 on that best path.
    previous_tag = np.zeros_like(score, dtype=np.int32)
    best_scores[0] = score[0]

    for step in range(1, num_steps):
        # candidates[i, j]: best score so far ending in tag i, then moving to tag j.
        candidates = np.expand_dims(best_scores[step - 1], 1) + transition_params
        previous_tag[step] = np.argmax(candidates, 0)
        best_scores[step] = score[step] + np.max(candidates, 0)

    final_scores = best_scores[-1]
    current = np.argmax(final_scores)
    path = [current]
    # Walk the backpointers from the last step down to step 1.
    for step in range(num_steps - 1, 0, -1):
        current = previous_tag[step][current]
        path.append(current)
    path.reverse()

    return path, np.max(final_scores)
""" state = tf.expand_dims(state[0], 2) transition_scores = state + self._transition_params new_state = inputs + tf.reduce_max(transition_scores, [1]) backpointers = tf.argmax(transition_scores, 1) backpointers = tf.cast(backpointers, dtype=tf.int32) return backpointers, new_state def crf_decode_forward(inputs, state, transition_params, sequence_lengths): """Computes forward decoding in a linear-chain CRF. Args: inputs: A [batch_size, num_tags] matrix of unary potentials. state: A [batch_size, num_tags] matrix containing the previous step's score values. transition_params: A [num_tags, num_tags] matrix of binary potentials. sequence_lengths: A [batch_size] vector of true sequence lengths. Returns: backpointers: A [batch_size, num_tags] matrix of backpointers. new_state: A [batch_size, num_tags] matrix of new score values. """ sequence_lengths = tf.cast(sequence_lengths, dtype=tf.int32) mask = tf.sequence_mask(sequence_lengths, tf.shape(inputs)[1]) crf_fwd_cell = CrfDecodeForwardRnnCell(transition_params) crf_fwd_layer = tf.keras.layers.RNN( crf_fwd_cell, return_sequences=True, return_state=True) return crf_fwd_layer(inputs, state, mask=mask) def crf_decode_backward(inputs, state): """Computes backward decoding in a linear-chain CRF. Args: inputs: A [batch_size, num_tags] matrix of backpointer of next step (in time order). state: A [batch_size, 1] matrix of tag index of next step. Returns: new_tags: A [batch_size, num_tags] tensor containing the new tag indices. """ inputs = tf.transpose(inputs, [1, 0, 2]) def _scan_fn(state, inputs): state = tf.squeeze(state, axis=[1]) idxs = tf.stack([tf.range(tf.shape(inputs)[0]), state], axis=1) new_tags = tf.expand_dims(tf.gather_nd(inputs, idxs), axis=-1) return new_tags return tf.transpose(tf.scan(_scan_fn, inputs, state), [1, 0, 2]) def crf_decode(potentials, transition_params, sequence_length): """Decode the highest scoring sequence of tags in TensorFlow. This is a function for tensor. 
Args: potentials: A [batch_size, max_seq_len, num_tags] tensor of unary potentials. transition_params: A [num_tags, num_tags] matrix of binary potentials. sequence_length: A [batch_size] vector of true sequence lengths. Returns: decode_tags: A [batch_size, max_seq_len] matrix, with dtype `tf.int32`. Contains the highest scoring tag indices. best_score: A [batch_size] vector, containing the score of `decode_tags`. """ sequence_length = tf.cast(sequence_length, dtype=tf.int32) # If max_seq_len is 1, we skip the algorithm and simply return the argmax tag # and the max activation. def _single_seq_fn(): squeezed_potentials = tf.squeeze(potentials, [1]) decode_tags = tf.expand_dims(tf.argmax(squeezed_potentials, axis=1), 1) best_score = tf.reduce_max(squeezed_potentials, axis=1) return tf.cast(decode_tags, dtype=tf.int32), best_score def _multi_seq_fn(): """Decoding of highest scoring sequence.""" # Computes forward decoding. Get last score and backpointers. initial_state = tf.slice(potentials, [0, 0, 0], [-1, 1, -1]) initial_state = tf.squeeze(initial_state, axis=[1]) inputs = tf.slice(potentials, [0, 1, 0], [-1, -1, -1]) sequence_length_less_one = tf.maximum( tf.constant(0, dtype=sequence_length.dtype), sequence_length - 1) backpointers, last_score = crf_decode_forward( inputs, initial_state, transition_params, sequence_length_less_one) backpointers = tf.reverse_sequence( backpointers, sequence_length_less_one, seq_axis=1) initial_state = tf.cast(tf.argmax(last_score, axis=1), dtype=tf.int32) initial_state = tf.expand_dims(initial_state, axis=-1) decode_tags = crf_decode_backward(backpointers, initial_state) decode_tags = tf.squeeze(decode_tags, axis=[2]) decode_tags = tf.concat([initial_state, decode_tags], axis=1) decode_tags = tf.reverse_sequence( decode_tags, sequence_length, seq_axis=1) best_score = tf.reduce_max(last_score, axis=1) return decode_tags, best_score if potentials.shape[1] == 1: return _single_seq_fn() else: return _multi_seq_fn() 
================================================ FILE: hanlp/layers/dropout.py ================================================ # -*- coding:utf-8 -*- # Date: 2020-06-05 17:47 from typing import List import torch import torch.nn as nn class WordDropout(nn.Module): def __init__(self, p: float, oov_token: int, exclude_tokens: List[int] = None) -> None: super().__init__() self.oov_token = oov_token self.p = p if not exclude_tokens: exclude_tokens = [0] self.exclude = exclude_tokens @staticmethod def token_dropout(tokens: torch.LongTensor, oov_token: int, exclude_tokens: List[int], p: float = 0.2, training: float = True) -> torch.LongTensor: """During training, randomly replaces some of the non-padding tokens to a mask token with probability ``p`` Adopted from https://github.com/Hyperparticle/udify Args: tokens: The current batch of padded sentences with word ids oov_token: The mask token exclude_tokens: The tokens for padding the input batch p: The probability a word gets mapped to the unknown token training: Applies the dropout if set to ``True`` tokens: torch.LongTensor: oov_token: int: exclude_tokens: List[int]: p: float: (Default value = 0.2) training: float: (Default value = True) Returns: A copy of the input batch with token dropout applied """ if training and p > 0: # This creates a mask that only considers unpadded tokens for mapping to oov padding_mask = tokens.new_ones(tokens.size(), dtype=torch.bool) for pad in exclude_tokens: padding_mask &= (tokens != pad) # Create a uniformly random mask selecting either the original words or OOV tokens dropout_mask = (tokens.new_empty(tokens.size(), dtype=torch.float).uniform_() < p) oov_mask = dropout_mask & padding_mask oov_fill = tokens.new_empty(tokens.size(), dtype=torch.long).fill_(oov_token) result = torch.where(oov_mask, oov_fill, tokens) return result else: return tokens def forward(self, tokens: torch.LongTensor) -> torch.LongTensor: return self.token_dropout(tokens, self.oov_token, self.exclude, self.p, 
class SharedDropout(nn.Module):
    """Dropout whose mask is sampled once and shared along one dimension.

    With ``batch_first=True`` the mask has the shape of ``x[:, 0]`` and is
    broadcast over dimension 1; otherwise it has the shape of ``x[0]`` and is
    broadcast over dimension 0. Kept elements are scaled by ``1 / (1 - p)``.

    Args:
        p: The probability of an element to be zeroed.
        batch_first: Selects which dimension shares the mask (see above).
    """

    def __init__(self, p=0.5, batch_first=True):
        super(SharedDropout, self).__init__()

        self.p = p
        self.batch_first = batch_first

    def extra_repr(self):
        s = f"p={self.p}"
        if self.batch_first:
            s += f", batch_first={self.batch_first}"

        return s

    def forward(self, x):
        """Apply the shared mask in training mode; return ``x`` as is otherwise.

        The result is a new tensor: the input is never modified in place, so
        callers may keep using ``x`` and leaf tensors that require grad are
        accepted.
        """
        if not self.training:
            return x
        if self.batch_first:
            mask = self.get_mask(x[:, 0], self.p).unsqueeze(1)
        else:
            mask = self.get_mask(x[0], self.p)
        # Out-of-place on purpose: ``x *= mask`` would overwrite the caller's
        # tensor and fails on leaf tensors that require grad.
        return x * mask

    @staticmethod
    def get_mask(x, p):
        """Sample a Bernoulli(1 - p) mask shaped like ``x``, scaled by 1 / (1 - p)."""
        mask = x.new_empty(x.shape).bernoulli_(1 - p)
        return mask / (1 - p)
class LockedDropout(nn.Module):
    """Dropout that reuses one mask across dimension 1 of a 3d input.

    For a 3d input the mask is sampled with shape ``(x.size(0), 1, x.size(2))``
    and expanded to the input's shape; for a 2d input every element gets its
    own sample. Kept elements are scaled by ``1 / (1 - dropout_rate)``.
    """

    def __init__(self, dropout_rate=0.5):
        super(LockedDropout, self).__init__()
        self.dropout_rate = dropout_rate

    def forward(self, x):
        """Return ``x`` unchanged in eval mode or when the rate is falsy.

        Raises:
            ValueError: If ``x`` is neither 2d nor 3d.
        """
        rate = self.dropout_rate
        if not (self.training and rate):
            return x

        ndim = x.dim()
        if ndim not in (2, 3):
            raise ValueError(f'Unsupported dim: {ndim}. Only 2d (T,C) or 3d (B,T,C) is supported')

        keep = 1 - rate
        if ndim == 3:
            shared = x.new_empty((x.size(0), 1, x.size(2))).bernoulli_(keep) / keep
            mask = shared.expand_as(x)
        else:
            mask = torch.empty_like(x).bernoulli_(keep) / keep
        return mask * x
class CharCNN(nn.Module):
    def __init__(self,
                 field: str,
                 embed: Union[int, Embedding],
                 num_filters: int,
                 ngram_filter_sizes: Tuple[int, ...] = (2, 3, 4, 5),
                 conv_layer_activation: str = 'ReLU',
                 output_dim: Optional[int] = None,
                 vocab_size=None) -> None:
        """Character-level CNN encoder built from an embedding and a ``CnnEncoder``.

        Character ids are embedded with a ``nn.Embedding`` and then encoded by a
        ``CnnEncoder``; both are wrapped in ``TimeDistributed``. The encoder code
        is adopted from allennlp (Apache 2.0); for the architecture, refer to
        "A Sensitivity Analysis of (and Practitioners' Guide to) Convolutional
        Neural Networks for Sentence Classification", Zhang and Wallace 2016.

        Args:
            field: The field in samples this encoder will work on; characters
                are read from ``batch[f'{field}_char_id']``.
            embed: The feature size of the character embedding to create.
                NOTE(review): the annotation also lists ``Embedding``, but any
                non-int value is rejected with ``ValueError`` below.
            num_filters: The output dim of each convolutional layer.
            ngram_filter_sizes: One convolutional layer is created per entry;
                each entry is the ngram size of that layer.
            conv_layer_activation: Name of the activation applied after the
                convolution layers.
            output_dim: If given, the size the pooled features are projected
                to. If ``None``, the reported size is
                ``len(ngram_filter_sizes) * num_filters``.
            vocab_size: The size of character vocab.

        Raises:
            ValueError: If ``embed`` is not an ``int``.
        """
        super().__init__()
        EmbeddingDim.__init__(self)
        if not isinstance(embed, int):
            raise ValueError(f'Unrecognized type for {embed}')

        char_embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed)
        self.field = field
        self.embed = TimeDistributed(char_embedding)
        cnn = CnnEncoder(char_embedding.embedding_dim,
                         num_filters,
                         ngram_filter_sizes,
                         conv_layer_activation,
                         output_dim)
        self.encoder = TimeDistributed(cnn)
        # Size reported by ``get_output_dim``.
        self.embedding_dim = output_dim or num_filters * len(ngram_filter_sizes)

    def forward(self, batch: dict, **kwargs):
        """Embed and encode the character ids stored under ``{field}_char_id``."""
        char_ids: torch.Tensor = batch[f'{self.field}_char_id']
        # NOTE(review): ``ge(0)`` is true for every non-negative id, so an id of
        # 0 is not masked out here -- confirm this is the intended padding rule.
        valid = char_ids.ge(0)
        embedded = self.embed(char_ids)
        return self.encoder(embedded, valid)

    def get_output_dim(self) -> int:
        """Return the feature size computed in ``__init__``."""
        return self.embedding_dim
= (2, 3, 4, 5), conv_layer_activation: str = 'ReLU', output_dim: Optional[int] = None, min_word_length=None ) -> None: """ Args: field: The character field in samples this encoder will work on. embed: An ``Embedding`` object or the feature size to create an ``Embedding`` object. num_filters: This is the output dim for each convolutional layer, which is the number of "filters" learned by that layer. ngram_filter_sizes: This specifies both the number of convolutional layers we will create and their sizes. The default of `(2, 3, 4, 5)` will have four convolutional layers, corresponding to encoding ngrams of size 2 to 5 with some number of filters. conv_layer_activation: `Activation`, optional (default=`torch.nn.ReLU`) Activation to use after the convolution layers. output_dim: After doing convolutions and pooling, we'll project the collected features into a vector of this size. If this value is `None`, we will just return the result of the max pooling, giving an output of shape `len(ngram_filter_sizes) * num_filters`. min_word_length: For ngram filter with max size, the input (chars) is required to have at least max size chars. 
""" super().__init__() if min_word_length is None: min_word_length = max(ngram_filter_sizes) self.min_word_length = min_word_length self.output_dim = output_dim self.conv_layer_activation = conv_layer_activation self.ngram_filter_sizes = ngram_filter_sizes self.num_filters = num_filters self.embed = embed self.field = field def transform(self, vocabs: VocabDict, **kwargs) -> Optional[Callable]: if isinstance(self.embed, Embedding): self.embed.transform(vocabs=vocabs) vocab_name = self.vocab_name if vocab_name not in vocabs: vocabs[vocab_name] = Vocab() return ToChar(self.field, vocab_name, min_word_length=self.min_word_length, pad=vocabs[vocab_name].safe_pad_token) @property def vocab_name(self): vocab_name = f'{self.field}_char' return vocab_name def module(self, vocabs: VocabDict, **kwargs) -> Optional[nn.Module]: embed = self.embed if isinstance(embed, Embedding): embed = embed.module(vocabs=vocabs) return CharCNN(self.field, embed, self.num_filters, self.ngram_filter_sizes, self.conv_layer_activation, self.output_dim, vocab_size=len(vocabs[self.vocab_name])) ================================================ FILE: hanlp/layers/embeddings/char_cnn_tf.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2019-12-20 21:15 from functools import reduce import tensorflow as tf from hanlp.common.vocab_tf import VocabTF from hanlp.utils.tf_util import hanlp_register @hanlp_register class CharCNNEmbeddingTF(tf.keras.layers.Layer): def __init__(self, word_vocab: VocabTF, char_vocab: VocabTF, char_embedding=100, kernel_size=3, filters=50, dropout=0.5, trainable=True, name=None, dtype=None, dynamic=False, **kwargs): super().__init__(trainable, name, dtype, dynamic, **kwargs) self.char_embedding = char_embedding self.filters = filters self.kernel_size = kernel_size self.char_vocab = char_vocab self.word_vocab = word_vocab self.embedding = tf.keras.layers.Embedding(input_dim=len(self.char_vocab), output_dim=char_embedding, 
trainable=True, mask_zero=True) self.dropout = tf.keras.layers.Dropout(dropout) self.cnn = tf.keras.layers.Conv1D(filters, kernel_size, padding='same') def call(self, inputs: tf.Tensor, **kwargs): mask = tf.not_equal(inputs, self.word_vocab.pad_token) inputs = tf.ragged.boolean_mask(inputs, mask) chars = tf.strings.unicode_split(inputs, input_encoding='UTF-8') chars = chars.to_tensor(default_value=self.char_vocab.pad_token) chars = self.char_vocab.lookup(chars) embed = self.embedding(chars) weights = embed._keras_mask embed = self.dropout(embed) features = masked_conv1d_and_max(embed, weights, self.cnn) features._keras_mask = mask return features def compute_output_shape(self, input_shape): return super().compute_output_shape(input_shape) def get_config(self): config = { 'char_embedding': self.char_embedding, 'kernel_size': self.kernel_size, 'filters': self.filters, 'dropout': self.dropout.rate, } base_config = super(CharCNNEmbeddingTF, self).get_config() return dict(list(base_config.items()) + list(config.items())) def masked_conv1d_and_max(t, weights, conv1d): """Applies 1d convolution and a masked max-pooling https://github.com/guillaumegenthial/tf_ner/blob/master/models/chars_conv_lstm_crf/masked_conv.py Args: t(tf.Tensor): A tensor with at least 3 dimensions [d1, d2, ..., dn-1, dn] weights(tf.Tensor of tf.bool): A Tensor of shape [d1, d2, dn-1] filters(int): number of filters kernel_size(int): kernel size for the temporal convolution conv1d: Returns: """ # Get shape and parameters shape = tf.shape(t) ndims = t.shape.ndims dim1 = reduce(lambda x, y: x * y, [shape[i] for i in range(ndims - 2)]) dim2 = shape[-2] dim3 = t.shape[-1] # Reshape weights weights = tf.reshape(weights, shape=[dim1, dim2, 1]) weights = tf.cast(weights, tf.float32) # Reshape input and apply weights flat_shape = [dim1, dim2, dim3] t = tf.reshape(t, shape=flat_shape) t *= weights # Apply convolution t_conv = conv1d(t) t_conv *= weights # Reduce max -- set to zero if all padded t_conv += (1. 
class CharRNN(nn.Module, EmbeddingDim):
    def __init__(self,
                 field,
                 vocab_size,
                 embed: Union[int, nn.Embedding],
                 hidden_size):
        """Character level RNN embedding module.

        Args:
            field: The field in samples this encoder will work on; characters
                are read from ``batch[f'{field}_char_id']``.
            vocab_size: The size of character vocab.
            embed: An embedding module exposing ``embedding_dim``, or the
                feature size to create an ``nn.Embedding`` with.
            hidden_size: The hidden size of RNNs (per direction).

        Raises:
            ValueError: If ``embed`` is neither an ``int`` nor a ``nn.Module``.
        """
        super(CharRNN, self).__init__()
        self.field = field
        # the embedding layer
        if isinstance(embed, int):
            self.embed = nn.Embedding(num_embeddings=vocab_size,
                                      embedding_dim=embed)
        elif isinstance(embed, nn.Module):
            self.embed = embed
            embed = embed.embedding_dim
        else:
            raise ValueError(f'Unrecognized type for {embed}')
        # the lstm layer; ``embed`` is the character feature size at this point
        self.lstm = nn.LSTM(input_size=embed,
                            hidden_size=hidden_size,
                            batch_first=True,
                            bidirectional=True)

    def forward(self, batch, mask, **kwargs):
        """Encode every non-empty word from its characters.

        Args:
            batch: A dict holding the character ids under ``{field}_char_id``.
            mask: Accepted for interface compatibility; not used, the masks are
                derived from the character ids.
            **kwargs: Ignored.

        Returns:
            A ``[batch_size, seq_len, 2 * hidden_size]`` tensor; rows of words
            without any non-zero character id are all zeros.
        """
        # [batch_size, seq_len, fix_len]
        char_ids = batch[f'{self.field}_char_id']
        # Id 0 is treated as padding.
        # NOTE(review): lens counts non-zero ids, so padding is assumed to be trailing.
        # [batch_size, seq_len]
        lens = char_ids.ne(0).sum(-1)
        char_mask = lens.gt(0)
        # Select the n non-empty words exactly once. The previous code indexed
        # the output of a plain ``nn.Embedding`` with ``char_mask`` a second
        # time, although it had already been reduced to [n, fix_len, n_embed].
        # [n, fix_len, n_embed]
        if isinstance(self.embed, EmbeddingDim):
            x = self.embed(batch)[char_mask]
        else:
            x = self.embed(char_ids[char_mask])
        x = pack_padded_sequence(x, lens[char_mask].cpu(), True, False)
        x, (h, _) = self.lstm(x)
        # Concatenate the final hidden states of both directions.
        # [n, 2 * hidden_size]
        h = torch.cat(torch.unbind(h), -1)
        # [batch_size, seq_len, 2 * hidden_size]
        embed = h.new_zeros(*lens.shape, h.size(-1))
        embed = embed.masked_scatter_(char_mask.unsqueeze(-1), h)
        return embed

    @property
    def embedding_dim(self) -> int:
        return self.lstm.hidden_size * 2
""" super().__init__() self.field = field self.hidden_size = hidden_size self.embed = embed self.max_word_length = max_word_length def transform(self, vocabs: VocabDict, **kwargs) -> Optional[Callable]: if isinstance(self.embed, Embedding): self.embed.transform(vocabs=vocabs) vocab_name = self.vocab_name if vocab_name not in vocabs: vocabs[vocab_name] = Vocab() return ToChar(self.field, vocab_name, max_word_length=self.max_word_length) @property def vocab_name(self): vocab_name = f'{self.field}_char' return vocab_name def module(self, vocabs: VocabDict, **kwargs) -> Optional[nn.Module]: embed = self.embed if isinstance(self.embed, Embedding): embed = self.embed.module(vocabs=vocabs) return CharRNN(self.field, len(vocabs[self.vocab_name]), embed, self.hidden_size) ================================================ FILE: hanlp/layers/embeddings/char_rnn_tf.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2019-12-20 17:02 import tensorflow as tf from hanlp.common.vocab_tf import VocabTF from hanlp.utils.tf_util import hanlp_register @hanlp_register class CharRNNEmbeddingTF(tf.keras.layers.Layer): def __init__(self, word_vocab: VocabTF, char_vocab: VocabTF, char_embedding=100, char_rnn_units=25, dropout=0.5, trainable=True, name=None, dtype=None, dynamic=False, **kwargs): super().__init__(trainable, name, dtype, dynamic, **kwargs) self.char_embedding = char_embedding self.char_rnn_units = char_rnn_units self.char_vocab = char_vocab self.word_vocab = word_vocab self.embedding = tf.keras.layers.Embedding(input_dim=len(self.char_vocab), output_dim=char_embedding, trainable=True, mask_zero=True) self.dropout = tf.keras.layers.Dropout(dropout) self.rnn = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=char_rnn_units, return_state=True), name='bilstm') def call(self, inputs: tf.Tensor, **kwargs): mask = tf.not_equal(inputs, self.word_vocab.pad_token) inputs = tf.ragged.boolean_mask(inputs, mask) chars = 
@hanlp_register
class ConcatEmbedding(tf.keras.layers.Layer):
    """Keras layer that concatenates the outputs of several embedding layers
    along the last axis."""

    def __init__(self, *embeddings, trainable=True, name=None, dtype=None, dynamic=False, **kwargs):
        """Collect the sub-embeddings and derive this layer's flags from them.

        Args:
            *embeddings: Layers, or dicts that are turned into layers through
                ``tf.keras.utils.deserialize_keras_object``.
            trainable: Forced to ``True`` if any sub-embedding is trainable.
            name: Passed to the Keras base initializer.
            dtype: Passed to the Keras base initializer.
            dynamic: Forced to ``True`` if any sub-embedding is dynamic.
            **kwargs: Passed to the Keras base initializer.
        """
        # NOTE(review): attributes are assigned before the Keras base
        # initializer runs; confirm the base __init__ neither rejects this nor
        # resets ``supports_masking`` afterwards.
        self.embeddings = []
        for embed in embeddings:
            embed: tf.keras.layers.Layer = tf.keras.utils.deserialize_keras_object(embed) if isinstance(embed,
                                                                                                      dict) else embed
            self.embeddings.append(embed)
            if embed.trainable:
                trainable = True
            if embed.dynamic:
                dynamic = True
            if embed.supports_masking:
                self.supports_masking = True

        super().__init__(trainable, name, dtype, dynamic, **kwargs)

    def build(self, input_shape):
        """Build every sub-embedding with the same input shape, then this layer."""
        for embed in self.embeddings:
            embed.build(input_shape)
        super().build(input_shape)

    def compute_mask(self, inputs, mask=None):
        """Return the first non-``None`` mask computed by a sub-embedding.

        Each sub-embedding receives the result of the previous one (initially
        the ``mask`` argument); the last result is returned if all are ``None``.
        """
        for embed in self.embeddings:
            mask = embed.compute_mask(inputs, mask)
            if mask is not None:
                return mask
        return mask

    def call(self, inputs, **kwargs):
        """Run every sub-embedding on ``inputs`` and concatenate on the last axis."""
        # ``call`` is invoked directly on each sub-layer instead of ``embed(inputs)``.
        embeds = [embed.call(inputs) for embed in self.embeddings]
        feature = tf.concat(embeds, axis=-1)

        # Stop at the first output for which ``copy_mask`` returns a mask.
        # NOTE(review): presumably ``copy_mask`` copies the Keras mask from its
        # first argument onto ``feature`` -- verify against hanlp.utils.tf_util.
        for embed in embeds:
            mask = copy_mask(embed, feature)
            if mask is not None:
                break
        return feature

    def get_config(self):
        """Return the base config plus the ``get_config()`` of each sub-embedding."""
        # NOTE(review): __init__ deserializes dict arguments with
        # ``deserialize_keras_object``; confirm a bare ``get_config()`` dict is
        # enough for that round trip.
        config = {
            'embeddings': [embed.get_config() for embed in self.embeddings],
        }
        base_config = super(ConcatEmbedding, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

    def compute_output_shape(self, input_shape):
        """Sum the last dimension of every sub-embedding's output shape."""
        dim = 0
        for embed in self.embeddings:
            dim += embed.compute_output_shape(input_shape)[-1]

        # NOTE(review): this adds a number to ``input_shape``; confirm it yields
        # the intended shape for the shape types Keras passes in (it cannot work
        # for a plain tuple).
        return input_shape + dim
class RNNLanguageModel(nn.Module):
    """Container module with an encoder (embedding) and a recurrent module.

    Only the layers needed to produce hidden states are defined here; there is
    no output decoder in this class.
    """

    def __init__(self,
                 n_tokens,
                 is_forward_lm: bool,
                 hidden_size: int,
                 embedding_size: int = 100):
        """
        Args:
            n_tokens: Number of rows of the embedding table.
            is_forward_lm: Stored as an attribute and saved with the model; not
                used by ``forward``.
            hidden_size: The hidden size of the LSTM.
            embedding_size: The size of the token embeddings.
        """
        super(RNNLanguageModel, self).__init__()

        self.is_forward_lm: bool = is_forward_lm
        self.n_tokens = n_tokens
        self.hidden_size = hidden_size
        self.embedding_size = embedding_size
        self.encoder = nn.Embedding(n_tokens, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, batch_first=True)

    def forward(self, ids: torch.LongTensor, lens: torch.LongTensor):
        """Run the LSTM over a padded batch of token ids.

        Args:
            ids: A ``[batch_size, max_len]`` tensor of token ids.
            lens: The true length of each sequence (tensor or list); the batch
                does not have to be sorted by length.

        Returns:
            A ``[batch_size, max(lens), hidden_size]`` tensor of hidden states,
            zero-padded after each sequence's true length.
        """
        emb = self.encoder(ids)
        # ``pack_padded_sequence`` requires the lengths to live on the CPU even
        # when the data is on an accelerator.
        if torch.is_tensor(lens):
            lens = lens.cpu()
        x = pack_padded_sequence(emb, lens, True, False)
        x, _ = self.rnn(x)
        x, _ = pad_packed_sequence(x, True)
        return x

    @classmethod
    def load_language_model(cls, model_file):
        """Build a model from a checkpoint written by :meth:`save`.

        Args:
            model_file: An identifier or path resolved through ``get_resource``.

        Returns:
            A new instance of ``cls`` with the stored weights loaded.
        """
        model_file = get_resource(model_file)
        # SECURITY: ``torch.load`` unpickles the file; only load trusted
        # checkpoints. ``map_location`` lets checkpoints saved from a GPU be
        # read on machines without one; the weights are copied into freshly
        # created CPU parameters either way.
        state = torch.load(model_file, map_location='cpu')
        model = cls(state['n_tokens'],
                    state['is_forward_lm'],
                    state['hidden_size'],
                    state['embedding_size'])
        # strict=False tolerates checkpoint entries this class does not define.
        model.load_state_dict(state['state_dict'], strict=False)
        return model

    def save(self, file):
        """Write the weights and constructor arguments to ``file``."""
        model_state = {
            'state_dict': self.state_dict(),
            'n_tokens': self.n_tokens,
            'is_forward_lm': self.is_forward_lm,
            'hidden_size': self.hidden_size,
            'embedding_size': self.embedding_size,
        }
        torch.save(model_state, file, pickle_protocol=4)
class ContextualStringEmbeddingTransform(Configurable):
    """Sample transform producing the character sequences and per-token offsets
    for a forward and a backward character language model."""

    def __init__(self, src: str) -> None:
        self.src = src

    def __call__(self, sample: dict):
        """Add ``{src}_f_char``, ``{src}_b_char``, ``{src}_f_offset`` and
        ``{src}_b_offset`` to ``sample`` and return it.

        The forward text is ``'\\n'`` + the space-joined tokens + ``' '``; the
        backward text uses the reversed joined string between the same markers.
        Offsets index into these character lists, one pair per token.
        """
        tokens = sample[self.src]
        text = ' '.join(tokens)
        start_marker, end_marker = '\n', ' '

        forward_chars = list(start_marker + text + end_marker)
        backward_chars = list(start_marker + text[::-1] + end_marker)

        text_len = len(text)
        forward_offsets, backward_offsets = [], []
        # Characters (tokens plus one separator each) consumed by the tokens
        # preceding the current one.
        consumed = 0
        for token in tokens:
            # The leading marker shifts every position by one.
            forward_offsets.append(1 + consumed + len(token))
            backward_offsets.append(text_len + 1 - consumed)
            # This language model is tokenized
            consumed += len(token) + 1

        sample[f'{self.src}_f_char'] = forward_chars
        sample[f'{self.src}_b_char'] = backward_chars
        sample[f'{self.src}_f_offset'] = forward_offsets
        sample[f'{self.src}_b_offset'] = backward_offsets
        return sample
@hanlp_register
class ContextualStringEmbeddingTF(tf.keras.layers.Layer):
    """A Keras layer producing contextual string embeddings from RNN language models.

    Either a forward model, a backward model, or both can be supplied. When
    both are present their per-word outputs are concatenated.
    """

    def __init__(self, forward_model_path=None, backward_model_path=None, max_word_len=10,
                 trainable=False, name=None, dtype=None,
                 dynamic=True, **kwargs):
        """
        Args:
            forward_model_path: Resource identifier of the forward language model.
            backward_model_path: Resource identifier of the backward language model.
            max_word_len: Words are truncated to this many characters.
            trainable: ``True`` to mark the loaded language models as trainable.
            name: Name of this layer.
            dtype: dtype of this layer.
            dynamic: Must be truthy; this layer only works in eager mode.
            **kwargs: Passed to ``tf.keras.layers.Layer``.
        """
        assert dynamic, 'ContextualStringEmbedding works only in eager mode'
        super().__init__(trainable, name, dtype, dynamic, **kwargs)
        assert any([forward_model_path, backward_model_path]), 'At least one model is required'
        self.forward_model_path = forward_model_path
        self.backward_model_path = backward_model_path
        self.forward_model = self._load_lm(forward_model_path) if forward_model_path else None
        self.backward_model = self._load_lm(backward_model_path) if backward_model_path else None
        if trainable:
            # Only one of the two models may be present, so guard each access.
            self._fw = self.forward_model.model if self.forward_model else None
            self._bw = self.backward_model.model if self.backward_model else None
            for m in self._fw, self._bw:
                if m is not None:
                    m.trainable = True
        self.supports_masking = True
        self.max_word_len = max_word_len

    def call(self, inputs, **kwargs):
        """Convert a 2D string tensor to lists of words, embed them and copy the mask over."""
        str_inputs = str_tensor_2d_to_list(inputs)
        outputs = self.embed(str_inputs)
        copy_mask(inputs, outputs)
        return outputs

    def _load_lm(self, filepath):
        """Load a language model and cut it right after an LSTM layer.

        Args:
            filepath: Resource identifier of the language model.

        Returns:
            The loaded ``RNNLanguageModel`` whose ``model`` ends at an LSTM layer.
        """
        filepath = get_resource(filepath)
        lm = RNNLanguageModel()
        lm.load(filepath)
        model: tf.keras.Sequential = lm.model
        for idx, layer in enumerate(model.layers):
            if isinstance(layer, tf.keras.layers.LSTM):
                # Discard the dense layer. The loop does not stop here, so if
                # several LSTM layers exist the last one wins.
                lm.model = tf.keras.Sequential(model.layers[:idx + 1])
        return lm

    def embed(self, texts: List[List[str]]):
        """Embed sentences (lists of words) with contextualized string embeddings.

        Args:
            texts: Sentences, each a list of words (not characters).

        Returns:
            The forward and backward embeddings concatenated on the last
            axis, or just the one that is available.
        """
        fw = None
        if self.forward_model:
            fw = self._run_rnn(texts, model=self.forward_model)
        bw = None
        if self.backward_model:
            bw = self._run_rnn(texts, model=self.backward_model)
        if fw is None:
            return bw
        if bw is None:
            return fw
        return tf.concat([fw, bw], axis=-1)

    def _run_rnn(self, texts, model):
        """Run one language model over ``texts`` and pick one hidden state per word.

        Args:
            texts: Sentences, each a list of words.
            model: The language model component to run.

        Returns:
            A stacked tensor with one row of word vectors per sentence,
            zero-padded to the longest sentence.
        """
        embeddings = []
        inputs = []
        offsets = []
        tokenizer = model.transform.tokenize_func()
        backward = not model.config['forward']
        for sent in texts:
            raw, off = self._get_raw_string(sent, tokenizer)
            inputs.append(raw)
            offsets.append(off)
        # NOTE(review): ``model_from_config`` is not defined in this file; confirm it is the
        # intended predictor rather than the truncated ``model.model`` built in ``_load_lm``.
        outputs = model.model_from_config.predict(model.transform.inputs_to_dataset(inputs))
        if backward:
            outputs = tf.reverse(outputs, axis=[1])
        maxlen = len(max(texts, key=len))
        for hidden, off, sent in zip(outputs, offsets, texts):
            embed = []
            for (start, end), word in zip(off, sent):
                # The state at the last character of a word represents the word.
                embed.append(hidden[end - 1, :])
            if len(embed) < maxlen:
                embed += [np.zeros_like(embed[-1])] * (maxlen - len(embed))
            embeddings.append(np.stack(embed))
        return tf.stack(embeddings)

    def _get_raw_string(self, sent: List[str], tokenizer):
        """Flatten a sentence into characters and record the span of every word.

        Args:
            sent: A list of words.
            tokenizer: A function splitting a word into characters.

        Returns:
            A tuple of the character list and a list of ``(start, end)``
            offsets, one per word. A trailing space, where inferred, is
            counted as part of the word.
        """
        raw_string = []
        offsets = []
        whitespace_after = infer_space_after(sent)
        start = 0
        for word, space in zip(sent, whitespace_after):
            chars = tokenizer(word)
            chars = chars[:self.max_word_len]
            if space:
                chars += [' ']
            end = start + len(chars)
            offsets.append((start, end))
            start = end
            raw_string += chars
        return raw_string, offsets

    def get_config(self):
        """Return the base layer config extended with the model paths and ``max_word_len``."""
        config = {
            'forward_model_path': self.forward_model_path,
            'backward_model_path': self.backward_model_path,
            'max_word_len': self.max_word_len,
        }
        base_config = super(ContextualStringEmbeddingTF, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

    @property
    def output_dim(self):
        """Sum of ``rnn_units`` over the language models that are loaded."""
        dim = 0
        for model in self.forward_model, self.backward_model:
            if model:
                dim += model.config['rnn_units']
        return dim

    def compute_output_shape(self, input_shape):
        """Append ``output_dim`` to ``input_shape``.

        The shape is normalized to a ``tf.TensorShape`` first, because adding
        an int to a plain tuple or list raises ``TypeError``.
        """
        return tf.TensorShape(input_shape).concatenate([self.output_dim])

    def compute_mask(self, inputs, mask=None):
        """Mask out positions equal to ``PAD``."""
        return tf.not_equal(inputs, PAD)
class ContextualWordEmbedding(Embedding, AutoConfigurable):
    def __init__(self, field: str, transformer: str, average_subwords=False,
                 scalar_mix: Union[ScalarMixWithDropoutBuilder, int] = None,
                 word_dropout: Optional[Union[float, Tuple[float, str]]] = None,
                 max_sequence_length=None, truncate_long_sequences=False, cls_is_bos=False, sep_is_eos=False,
                 ret_token_span=True, ret_subtokens=False, ret_subtokens_group=False, ret_prefix_mask=False,
                 ret_raw_hidden_states=False,
                 transformer_args: Dict[str, Any] = None,
                 use_fast=True, do_basic_tokenize=True, trainable=True) -> None:
        """A contextual word embedding builder which builds a
        :class:`~hanlp.layers.embeddings.contextual_word_embedding.ContextualWordEmbeddingModule` and a
        :class:`~hanlp.transform.transformer_tokenizer.TransformerSequenceTokenizer`.

        The tokenizer and the tokenizer transform are created eagerly in this constructor.

        Args:
            field: The field to work on. Usually some token fields.
            transformer: An identifier of a ``PreTrainedModel``.
            average_subwords: ``True`` to average subword representations.
            scalar_mix: Layer attention.
            word_dropout: Dropout rate of randomly replacing a subword with MASK.
            max_sequence_length: The maximum sequence length. Sequence longer than this will be handled by sliding
                window.
            truncate_long_sequences: Forwarded to the tokenizer transform. Presumably ``True`` truncates sequences
                longer than ``max_sequence_length`` instead of using a sliding window (to be confirmed against
                ``TransformerSequenceTokenizer``).
            cls_is_bos: ``True`` means the first token of input is treated as [CLS] no matter what its surface form is.
                        ``False`` (default) means the first token is not [CLS], it will have its own embedding other
                        than the embedding of [CLS].
            sep_is_eos: ``True`` means the last token of input is [SEP].
                        ``False`` means it's not but [SEP] will be appended,
                        ``None`` means it depends on `input[-1] == [EOS]`.
            ret_token_span: ``True`` to return span of each token measured by subtoken offsets.
            ret_subtokens: ``True`` to return list of subtokens belonging to each token.
            ret_subtokens_group: ``True`` to return list of offsets of subtokens belonging to each token.
            ret_prefix_mask: ``True`` to generate a mask where each non-zero element corresponds to a prefix of a token.
            ret_raw_hidden_states: ``True`` to return hidden states of each layer.
            transformer_args: Extra arguments passed to the transformer.
            use_fast: Whether or not to try to load the fast version of the tokenizer.
            do_basic_tokenize: Whether to do basic tokenization before wordpiece.
            trainable: ``False`` to use static embeddings.
        """
        super().__init__()
        self.truncate_long_sequences = truncate_long_sequences
        self.transformer_args = transformer_args
        self.trainable = trainable
        self.ret_subtokens_group = ret_subtokens_group
        self.ret_subtokens = ret_subtokens
        self.ret_raw_hidden_states = ret_raw_hidden_states
        self.sep_is_eos = sep_is_eos
        self.cls_is_bos = cls_is_bos
        self.max_sequence_length = max_sequence_length
        self.word_dropout = word_dropout
        self.scalar_mix = scalar_mix
        self.average_subwords = average_subwords
        self.transformer = transformer
        self.field = field
        # ``ret_token_span``, ``ret_prefix_mask``, ``use_fast`` and ``do_basic_tokenize`` are not stored on
        # ``self``; they are only consumed by the two objects built below.
        self._transformer_tokenizer = AutoTokenizer_.from_pretrained(self.transformer,
                                                                     use_fast=use_fast,
                                                                     do_basic_tokenize=do_basic_tokenize)
        self._tokenizer_transform = TransformerSequenceTokenizer(self._transformer_tokenizer,
                                                                 field,
                                                                 truncate_long_sequences=truncate_long_sequences,
                                                                 ret_prefix_mask=ret_prefix_mask,
                                                                 ret_token_span=ret_token_span,
                                                                 cls_is_bos=cls_is_bos,
                                                                 sep_is_eos=sep_is_eos,
                                                                 ret_subtokens=ret_subtokens,
                                                                 ret_subtokens_group=ret_subtokens_group,
                                                                 max_seq_length=self.max_sequence_length
                                                                 )

    def transform(self, **kwargs) -> TransformerSequenceTokenizer:
        """Return the tokenizer transform created in the constructor; ``kwargs`` are ignored."""
        return self._tokenizer_transform

    def module(self, training=True, **kwargs) -> Optional[nn.Module]:
        """Build a new ``ContextualWordEmbeddingModule`` from the stored settings.

        Args:
            training: Forwarded to the module as ``training``.
            **kwargs: Ignored.
        """
        return ContextualWordEmbeddingModule(self.field,
                                             self.transformer,
                                             self._transformer_tokenizer,
                                             self.average_subwords,
                                             self.scalar_mix,
                                             self.word_dropout,
                                             self.max_sequence_length,
                                             self.ret_raw_hidden_states,
                                             self.transformer_args,
                                             self.trainable,
                                             training=training)

    def get_output_dim(self):
        """Return ``hidden_size`` from the configuration loaded for ``self.transformer``."""
        config = AutoConfig_.from_pretrained(self.transformer)
        return config.hidden_size

    def get_tokenizer(self):
        """Return the tokenizer created in the constructor."""
        return self._transformer_tokenizer
class ConcatModuleList(nn.ModuleList, EmbeddingDim):

    def __init__(self, *modules: Optional[Iterable[Module]], dropout=None) -> None:
        """A ``nn.ModuleList`` to bundle several embeddings modules.

        Args:
            *modules: Embedding layers.
            dropout: Dropout applied on the concatenated embedding.
        """
        super().__init__(*modules)
        if dropout:
            dropout = IndependentDropout(p=dropout)
        # Assigning a module as an attribute registers it as a submodule, so
        # iterating ``self`` also yields the dropout. Use ``self.embeddings``
        # whenever only the embedding layers are wanted.
        self.dropout = dropout

    @property
    def embedding_dim(self) -> int:
        """Total ``embedding_dim`` of the embedding layers, excluding the dropout."""
        return sum(embed.embedding_dim for embed in self.embeddings)

    def get_output_dim(self) -> int:
        """Total output dimension of the embedding layers, excluding the dropout."""
        return sum(embed.get_output_dim() for embed in self.embeddings)

    # noinspection PyMethodOverriding
    def forward(self, batch: dict, **kwargs):
        """Run every embedding layer on ``batch`` and concatenate on the last dimension.

        Args:
            batch: The batch passed to each embedding layer.
            **kwargs: Forwarded to each embedding layer.

        Returns:
            The concatenated embeddings, with dropout applied first if configured.
        """
        embeds = [embed(batch, **kwargs) for embed in self.embeddings]
        if self.dropout:
            embeds = self.dropout(*embeds)
        return torch.cat(embeds, -1)

    @property
    def embeddings(self):
        """The registered submodules without the dropout layer."""
        embeddings = [x for x in self]
        if self.dropout:
            embeddings.remove(self.dropout)
        return embeddings
class FastTextTransform(EmbeddingNamedTransform):
    """A transform which attaches fastText vectors to a field of a sample."""

    def __init__(self, filepath: str, src, dst=None, **kwargs) -> None:
        """
        Args:
            filepath: Resource identifier of the fastText model.
            src: Name of the field to read words from.
            dst: Name of the field to write vectors to. Defaults to
                ``src + '_fasttext'``.
            **kwargs: Not used.
        """
        target = dst if dst else src + '_fasttext'
        self.filepath = filepath
        flash(f'Loading fasttext model {filepath} [blink][yellow]...[/yellow][/blink]')
        resolved = get_resource(filepath)
        # The output of the loader is sent to stderr/devnull while loading.
        with stdout_redirected(to=os.devnull, stdout=sys.stderr):
            self._model = fasttext.load_model(resolved)
        flash('')
        # The vector size is probed by looking up an arbitrary word.
        super().__init__(self._model['king'].size, src, target)

    def __call__(self, sample: dict):
        """Embed ``sample[self.src]`` and store the result in ``sample[self.dst]``.

        A single string yields one vector; any other iterable of words
        yields the stacked vectors of its elements.
        """
        word = sample[self.src]
        if isinstance(word, str):
            sample[self.dst] = self.embed(word)
        else:
            sample[self.dst] = torch.stack([self.embed(each) for each in word])
        return sample

    def embed(self, word: str):
        """Look up ``word`` in the fastText model and wrap the vector in a tensor."""
        vector = self._model[word]
        return torch.tensor(vector)
class FastTextEmbeddingComponent(TorchComponent):
    def __init__(self, **kwargs) -> None:
        """ Toy example of a fastText embedding component.
        It simply returns the embedding of a given word. All training
        related hooks are unsupported.

        Args:
            **kwargs: Passed to ``TorchComponent``.
        """
        super().__init__(**kwargs)

    def build_dataloader(self, data, shuffle=False, device=None, logger: logging.Logger = None,
                         **kwargs) -> DataLoader:
        """Wrap ``data`` as a single-sample dataset under the ``token`` field."""
        builder: FastTextEmbedding = self.config.embed
        samples = [{'token': data}]
        return PadSequenceDataLoader(FastTextDataset(samples, transform=builder.transform()), device=device)

    def build_optimizer(self, **kwargs):
        """Unsupported."""
        raise NotImplementedError('Not supported.')

    def build_criterion(self, **kwargs):
        """Unsupported."""
        raise NotImplementedError('Not supported.')

    def build_metric(self, **kwargs):
        """Unsupported."""
        raise NotImplementedError('Not supported.')

    def execute_training_loop(self, trn: DataLoader, dev: DataLoader, epochs, criterion, optimizer, metric, save_dir,
                              logger: logging.Logger, devices, ratio_width=None, **kwargs):
        """Unsupported."""
        raise NotImplementedError('Not supported.')

    def fit_dataloader(self, trn: DataLoader, criterion, optimizer, metric, logger: logging.Logger, **kwargs):
        """Unsupported."""
        raise NotImplementedError('Not supported.')

    def evaluate_dataloader(self, data: DataLoader, criterion: Callable, metric=None, output=False, **kwargs):
        """Unsupported."""
        raise NotImplementedError('Not supported.')

    def load_vocabs(self, save_dir, filename='vocabs.json'):
        """Does nothing."""
        return None

    def load_weights(self, save_dir, filename='model.pt', **kwargs):
        """Does nothing."""
        return None

    def build_model(self, training=True, **kwargs) -> torch.nn.Module:
        """Build the module of the configured embedding."""
        builder: FastTextEmbedding = self.config.embed
        return builder.module()

    def predict(self, data: str, **kwargs):
        """Return the first element of the model output for the first batch built from ``data``."""
        # It's a toy so doesn't really do batching: only the first batch is used.
        for batch in self.build_dataloader(data, device=self.device):
            return self.model(batch)[0]
        return None

    @property
    def devices(self):
        """Always a single CPU device."""
        cpu = torch.device('cpu')
        return [cpu]
hanlp_common.constant import PAD from hanlp.utils.io_util import get_resource, stdout_redirected from hanlp.utils.log_util import logger from hanlp.utils.tf_util import hanlp_register @hanlp_register class FastTextEmbeddingTF(tf.keras.layers.Embedding): def __init__(self, filepath: str, padding=PAD, name=None, **kwargs): import fasttext self.padding = padding.encode('utf-8') self.filepath = filepath filepath = get_resource(filepath) assert os.path.isfile(filepath), f'Resolved path {filepath} is not a file' logger.debug('Loading fasttext model from [{}].'.format(filepath)) # fasttext print a blank line here with stdout_redirected(to=os.devnull, stdout=sys.stderr): self.model = fasttext.load_model(filepath) kwargs.pop('input_dim', None) kwargs.pop('output_dim', None) kwargs.pop('mask_zero', None) if not name: name = os.path.splitext(os.path.basename(filepath))[0] super().__init__(input_dim=len(self.model.words), output_dim=self.model['king'].size, mask_zero=padding is not None, trainable=False, dtype=tf.string, name=name, **kwargs) embed_fn = np.frompyfunc(self.embed, 1, 1) # vf = np.vectorize(self.embed, otypes=[np.ndarray]) self._embed_np = embed_fn def embed(self, word): return self.model[word] def embed_np(self, words: np.ndarray): output = self._embed_np(words) if self.mask_zero: mask = words != self.padding output *= mask output = np.stack(output.reshape(-1)).reshape(list(words.shape) + [self.output_dim]) return output, tf.constant(mask) else: output = np.stack(output.reshape(-1)).reshape(list(words.shape) + [self.output_dim]) return output @tf_utils.shape_type_conversion def build(self, input_shape): self.built = True @tf_utils.shape_type_conversion def compute_output_shape(self, input_shape): return input_shape + (self.output_dim,) def call(self, inputs: tf.Tensor): if isinstance(inputs, list): inputs = inputs[0] if not hasattr(inputs, 'numpy'): # placeholder tensor inputs = tf.expand_dims(inputs, axis=-1) inputs = tf.tile(inputs, [1] * (len(inputs.shape) - 
def index_word2vec_with_vocab(filepath: str,
                              vocab: Vocab,
                              extend_vocab=True,
                              unk=None,
                              lowercase=False,
                              init='uniform',
                              normalize=None) -> torch.Tensor:
    """Build an embedding matrix whose rows follow the indices of ``vocab``.

    Args:
        filepath: The path to pretrained embedding.
        vocab: The vocabulary from training set.
        extend_vocab: Unlock vocabulary of training set to add those tokens in pretrained embedding file.
        unk: UNK token.
        lowercase: Convert words in pretrained embeddings into lowercase.
        init: Indicate which initialization to use for oov tokens: ``'zeros'`` (or any falsy value) or
            ``'uniform'``.
        normalize: ``None`` or ``False`` to skip normalization, otherwise one of ``'norm'``, ``'l2'`` or
            ``'std'``.

    Returns:
        An embedding matrix.

    Raises:
        ValueError: If ``init`` or a truthy ``normalize`` is not one of the supported values.
    """
    pret_vocab, pret_matrix = load_word2vec_as_vocab_tensor(filepath)
    if unk and unk in pret_vocab:
        # Re-key the pretrained UNK vector under the UNK token of ``vocab``.
        pret_vocab[vocab.safe_unk_token] = pret_vocab.pop(unk)
    if extend_vocab:
        vocab.unlock()
        for word in pret_vocab:
            vocab.get_idx(word.lower() if lowercase else word)
    vocab.lock()
    ids = []
    unk_id_offset = 0
    for word in vocab.token_to_idx:
        word_id = pret_vocab.get(word, None)
        # Retry lower case
        if word_id is None:
            word_id = pret_vocab.get(word.lower(), None)
        if word_id is None:
            # Out-of-vocabulary tokens point to fresh rows appended after the pretrained ones.
            word_id = len(pret_vocab) + unk_id_offset
            unk_id_offset += 1
        ids.append(word_id)
    if unk_id_offset:
        unk_embeds = torch.zeros(unk_id_offset, pret_matrix.size(1))
        if init and init != 'zeros':
            if init == 'uniform':
                init = embedding_uniform
            else:
                raise ValueError(f'Unsupported init {init}')
            unk_embeds = init(unk_embeds)
        pret_matrix = torch.cat([pret_matrix, unk_embeds])
    ids = torch.LongTensor(ids)
    embedding = pret_matrix.index_select(0, ids)
    # Normalization is optional. The default ``None`` (and the ``False`` passed by callers)
    # used to fall through to the error branch and raise for every default call.
    if normalize:
        if normalize == 'norm':
            embedding /= (torch.norm(embedding, dim=1, keepdim=True) + 1e-12)
        elif normalize == 'l2':
            embedding = torch.nn.functional.normalize(embedding, p=2, dim=1)
        elif normalize == 'std':
            embedding /= torch.std(embedding)
        else:
            raise ValueError(f'Unsupported normalization method {normalize}')
    return embedding
""" if isinstance(embed, str): embed = index_word2vec_with_vocab(embed, vocab, extend_vocab, unk, lowercase, init, normalize) embed = nn.Embedding.from_pretrained(embed, freeze=not trainable, padding_idx=vocab.pad_idx) return embed elif isinstance(embed, int): embed = nn.Embedding(len(vocab), embed, padding_idx=vocab.pad_idx) return embed else: raise ValueError(f'Unsupported parameter type: {embed}') ================================================ FILE: hanlp/layers/embeddings/util_tf.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2020-05-09 15:46 from typing import Union import tensorflow as tf from hanlp.common.transform_tf import Transform from hanlp.common.vocab_tf import VocabTF from hanlp.layers.embeddings.char_cnn_tf import CharCNNEmbeddingTF from hanlp.layers.embeddings.char_rnn_tf import CharRNNEmbeddingTF from hanlp.layers.embeddings.concat_embedding import ConcatEmbedding from hanlp.layers.embeddings.contextual_string_embedding_tf import ContextualStringEmbeddingTF from hanlp.layers.embeddings.fast_text_tf import FastTextEmbeddingTF from hanlp.layers.embeddings.word2vec_tf import Word2VecEmbeddingTF, StringWord2VecEmbeddingTF, Word2VecEmbeddingV1 _upgrade = tf.keras.utils.get_custom_objects() for k, v in list(_upgrade.items()): if k.startswith('HanLP>') and k.endswith('TF'): _upgrade[k[:-2]] = v def build_embedding(embeddings: Union[str, int, dict], word_vocab: VocabTF, transform: Transform): if not embeddings: return None config = transform.config if isinstance(embeddings, int): embeddings = tf.keras.layers.Embedding(input_dim=len(word_vocab), output_dim=embeddings, trainable=True, mask_zero=True) config.embedding_trainable = True elif isinstance(embeddings, dict): # Upgrade to 2.1 embed_name = embeddings['class_name'].split('>')[-1] if embeddings['class_name'].startswith('HanLP>') and not embeddings['class_name'].endswith('TF'): embed_name += 'TF' # Embeddings need vocab if embed_name in 
def any_embedding_in(embeddings, *cls):
    """Tell whether any embedding config refers to one of the given classes.

    A config matches when it is a dict and the part of its ``class_name``
    after the last ``>`` equals the name of one of ``cls``, or that name
    with a trailing ``TF`` removed.

    Args:
        embeddings: An iterable of embedding configs. Non-dict items are ignored.
        *cls: Classes to look for.

    Returns:
        ``True`` if at least one config matches, ``False`` otherwise.
    """
    accepted = set()
    for klass in cls:
        class_name = klass.__name__
        accepted.add(class_name)
        if class_name.endswith('TF'):
            # Also accept the legacy name without the ``TF`` suffix.
            accepted.add(class_name[:-2])
    return any(isinstance(config, dict) and config['class_name'].split('>')[-1] in accepted
               for config in embeddings)
class Word2VecEmbeddingModule(nn.Module, EmbeddingDim):
    def __init__(self, field: str, embed: nn.Embedding, word_dropout: WordDropout = None, cpu=False,
                 second_channel=False, num_tokens_in_trn=None, unk_idx=1) -> None:
        """A word2vec style embedding module which maps a token to its embedding through looking up a pre-defined
        table.

        Args:
            field: The field to work on. Usually some token fields.
            embed: An ``Embedding`` layer.
            word_dropout: The probability of randomly replacing a token with ``UNK``.
            cpu: Reside on CPU instead of GPU.
            second_channel: A trainable second channel for each token, which will be added to pretrained embeddings.
            num_tokens_in_trn: The number of tokens in training set.
            unk_idx: The index of ``UNK``.
        """
        super().__init__()
        # NOTE: this attribute shadows ``nn.Module.cpu()`` on instances of this class.
        self.cpu = cpu
        self.field = field
        self.embed = embed
        self.word_dropout = word_dropout
        self.num_tokens_in_trn = num_tokens_in_trn
        self.unk_idx = unk_idx
        if second_channel:
            n_words, n_embed = embed.weight.size()
            if num_tokens_in_trn:
                # The second channel only covers tokens seen in the training set.
                n_words = num_tokens_in_trn
            second_channel = nn.Embedding(num_embeddings=n_words, embedding_dim=n_embed)
            nn.init.zeros_(second_channel.weight)
        self.second_channel = second_channel

    def forward(self, batch: dict, **kwargs):
        """Look up the embeddings of ``batch[f'{field}_id']``.

        Args:
            batch: A dict holding the id tensor under ``f'{self.field}_id'``.
            **kwargs: Not used.

        Returns:
            The embeddings, moved back to the device of the input ids when this module resides on CPU.
        """
        x: torch.Tensor = batch[f'{self.field}_id']
        if self.cpu:
            device = x.device
            x = x.cpu()
        if self.word_dropout:
            x = self.word_dropout(x)
        if self.second_channel:
            # Ids beyond the size of the second channel are mapped to UNK for that channel only.
            ext_mask = x.ge(self.second_channel.num_embeddings)
            ext_words = x.masked_fill(ext_mask, self.unk_idx)
            x = self.embed(x) + self.second_channel(ext_words)
        else:
            x = self.embed(x)
        if self.cpu:
            # noinspection PyUnboundLocalVariable
            x = x.to(device)
        return x

    @property
    def embedding_dim(self) -> int:
        """The embedding size of the wrapped ``Embedding`` layer."""
        return self.embed.embedding_dim

    def _apply(self, fn):
        """Apply ``fn`` to parameters and buffers unless this module is pinned to CPU.

        When ``cpu`` is set, ``fn`` is skipped entirely. This might block all ``fn``, not only the ones
        moving tensors between devices. ``self`` is still returned so that ``module.to(...)`` and similar
        calls keep returning the module instead of ``None``.
        """
        if self.cpu:
            return self
        return super(Word2VecEmbeddingModule, self)._apply(fn)
def module(self, vocabs: VocabDict, **kwargs) -> Optional[nn.Module]:
    """Build the lookup module for ``self.field``.

    Args:
        vocabs: Vocabularies keyed by field name; the one for ``self.field`` is used.
        **kwargs: Ignored.

    Returns:
        A ``Word2VecEmbeddingModule`` wrapping the embedding table built for the field's vocabulary.
    """
    field_vocab = vocabs[self.field]
    # Size is recorded before the vocabulary is handed to the table builder.
    trn_size = len(field_vocab)
    table = build_word2vec_with_vocab(self.embed,
                                      field_vocab,
                                      self.extend_vocab,
                                      self.unk,
                                      self.lowercase,
                                      self.trainable,
                                      normalize=self.normalize)
    dropout_layer = None
    if self.word_dropout:
        assert field_vocab.unk_token, f'unk_token of vocab {self.field} has to be set in order to ' \
                                      f'make use of word_dropout'
        # The padding index, when the vocabulary has one, is never replaced by dropout.
        excluded = [field_vocab.pad_idx] if field_vocab.pad_token else []
        dropout_layer = WordDropout(self.word_dropout, field_vocab.unk_idx, exclude_tokens=excluded)
    return Word2VecEmbeddingModule(self.field,
                                   table,
                                   word_dropout=dropout_layer,
                                   cpu=self.cpu,
                                   second_channel=self.second_channel,
                                   num_tokens_in_trn=trn_size,
                                   unk_idx=field_vocab.unk_idx)
class Word2VecEmbeddingComponent(TorchComponent):
    def __init__(self, **kwargs) -> None:
        """Toy example of ``Word2VecEmbedding``. It simply returns the embedding of a given word.

        Args:
            **kwargs: Forwarded to ``TorchComponent``.
        """
        super().__init__(**kwargs)
        # Built lazily by the ``tokenizer`` property; only needed for doc2vec lookups.
        self._tokenizer: Trie = None

    def build_dataloader(self, data: List[str], shuffle=False, device=None, logger: logging.Logger = None,
                         doc2vec=False, batch_size=32, **kwargs) -> DataLoader:
        """Wrap raw strings into a padded dataloader.

        Args:
            data: Words or phrases to look up.
            shuffle: Unused.
            device: Device passed on to the dataloader.
            logger: Unused.
            doc2vec: ``True`` to split each string with the trie tokenizer instead of treating it as
                one vocabulary entry.
            batch_size: Number of samples per batch.
            **kwargs: Ignored.

        Returns:
            A ``PadSequenceDataLoader`` over the samples.
        """
        dataset = Word2VecDataset([{'token': x} for x in data],
                                  transform=self._tokenize if doc2vec else self.vocabs)
        return PadSequenceDataLoader(dataset, device=device, batch_size=batch_size)

    def build_optimizer(self, **kwargs):
        raise NotImplementedError('Not supported.')

    def build_criterion(self, **kwargs):
        raise NotImplementedError('Not supported.')

    def build_metric(self, **kwargs):
        raise NotImplementedError('Not supported.')

    def execute_training_loop(self, trn: DataLoader, dev: DataLoader, epochs, criterion, optimizer, metric,
                              save_dir, logger: logging.Logger, devices, ratio_width=None, **kwargs):
        raise NotImplementedError('Not supported.')

    def fit_dataloader(self, trn: DataLoader, criterion, optimizer, metric, logger: logging.Logger, **kwargs):
        raise NotImplementedError('Not supported.')

    def evaluate_dataloader(self, data: DataLoader, criterion: Callable, metric=None, output=False, **kwargs):
        raise NotImplementedError('Not supported.')

    def load_vocabs(self, save_dir, filename='vocabs.json'):
        """Install a fresh ``Vocab`` for the ``token`` field; nothing is read from ``save_dir``."""
        self.vocabs['token'] = Vocab()

    def load_weights(self, save_dir, filename='model.pt', **kwargs):
        """No-op: the embedding table is loaded from the embedding file when the model is built."""
        pass

    def build_model(self, training=True, **kwargs) -> torch.nn.Module:
        """Build the embedding module from ``config.embed`` and drop any cached tokenizer."""
        self._tokenizer = None
        embed: Word2VecEmbedding = self.config.embed
        model = embed.module(self.vocabs)
        return model

    def predict(self, word: str, doc2vec=False, **kwargs):
        """Return the embedding of a word, or the mean token embedding of a phrase.

        Args:
            word: A word, or a phrase when ``doc2vec`` is enabled.
            doc2vec: ``True`` to tokenize ``word`` and average the embeddings of its tokens.
            **kwargs: Ignored.

        Returns:
            A single embedding vector.
        """
        dataloader = self.build_dataloader([word], device=self.device, doc2vec=doc2vec)
        for batch in dataloader:  # It's a toy so doesn't really do batching
            # First (and only) sample of the batch.
            embeddings = self.model(batch)[0]
            if doc2vec:
                # ``embeddings`` now holds one row per token of the phrase; average over the tokens.
                # (Indexing ``[0]`` once more before the mean collapsed one token vector to a scalar.)
                embeddings = embeddings.mean(dim=0)
            return embeddings

    @torch.no_grad()
    def most_similar(self, words: Union[str, List[str]], topk=10, doc2vec=False, similarity_less_than=None,
                     batch_size=32) -> Union[Dict[str, float], List[Dict[str, float]]]:
        """Find the `topk` most similar words of a given word or phrase.

        Args:
            words: A word or phrase or multiple words/phrases.
            topk: Number of top similar words. Capped at the size of the embedding table.
            doc2vec: Enable doc2vec model for processing OOV and phrases.
            similarity_less_than: Only return words with a similarity less than this value.
            batch_size: Number of words or phrases per batch.

        Returns:
            Similar words and similarities stored in a dict. An unknown query yields an empty dict.
        """
        flat = isinstance(words, str)
        if flat:
            words = [words]
        dataloader = self.build_dataloader(words, device=self.device, doc2vec=doc2vec, batch_size=batch_size)
        results = []
        vocab = self.vocabs['token']
        for batch in dataloader:
            embeddings = self.model(batch)
            token_id = batch['token_id']
            if doc2vec:
                # NOTE(review): counting non-zero ids assumes padding uses index 0 - confirm against the
                # dataloader's padding value.
                lens = token_id.count_nonzero(dim=1)
                embeddings = embeddings.sum(1)
                embeddings = embeddings / lens.unsqueeze(1)
                block_word_id = batch['block_word_id']
                token_is_unk = (lens == 1) & (token_id[:, 0] == vocab.unk_idx)
            else:
                block_word_id = token_id
                token_is_unk = token_id == vocab.unk_idx
            similarities = torch.nn.functional.cosine_similarity(embeddings.unsqueeze(1), self.model.embed.weight,
                                                                 dim=-1)
            if similarity_less_than is not None:
                similarities[similarities > similarity_less_than] = -math.inf
            # Never report the query word itself as its own neighbour.
            similarities[torch.arange(similarities.size(0), device=self.device), block_word_id] = -math.inf
            # ``topk`` larger than the table would make ``Tensor.topk`` raise.
            scores, indices = similarities.topk(min(topk, similarities.size(-1)))
            for sc, idx, unk in zip(scores.tolist(), indices.tolist(), token_is_unk.tolist()):
                results.append(dict() if unk else dict(zip([vocab.idx_to_token[i] for i in idx], sc)))
        if flat:
            results = results[0]
        return results

    def _tokenize(self, sample: dict) -> dict:
        """Split ``sample['token']`` by longest match against the vocabulary and store the ids.

        Sets ``token_id`` to the matched ids (``[unk_idx]`` when nothing matches) and ``block_word_id`` to
        the single matched id, or to ``pad_idx`` when the text maps to several ids.
        """
        tokens = sample['token']
        ids = [idx for b, e, idx in self.tokenizer.parse_longest(tokens)]
        vocab = self.vocabs['token']
        if not ids:
            ids = [vocab.unk_idx]
        sample['token_id'] = ids
        sample['block_word_id'] = ids[0] if len(ids) == 1 else vocab.pad_idx
        return sample

    @property
    def tokenizer(self):
        """A ``Trie`` over the token vocabulary, built on first access and cached afterwards."""
        if not self._tokenizer:
            if HANLP_VERBOSE:
                flash('Building Trie-based tokenizer for Doc2Vec [blink][yellow]...[/yellow][/blink]')
            self._tokenizer = Trie(self.vocabs['token'].token_to_idx)
            if HANLP_VERBOSE:
                flash('')
        return self._tokenizer

    def load_config(self, save_dir, filename='config.json', **kwargs):
        """Load the config, synthesising one when ``save_dir`` is a plain embedding file.

        When ``save_dir`` is a file rather than a directory, it is treated as the embedding file itself
        and a default l2-normalised ``Word2VecEmbedding`` on the ``token`` field is configured.
        """
        if os.path.isfile(save_dir):
            self.config.update({'classpath': 'hanlp.layers.embeddings.word2vec.Word2VecEmbeddingComponent',
                                'embed': Word2VecEmbedding(field='token', embed=save_dir, normalize='l2')})
            return
        super().load_config(save_dir, filename, **kwargs)
class Word2VecEmbeddingV1(tf.keras.layers.Layer):
    """A Keras layer wrapping a pre-trained word2vec table together with its vocabulary."""

    def __init__(self, path: str = None, vocab: VocabTF = None, normalize: bool = False, load_all=True,
                 mask_zero=True, trainable=False, name=None, dtype=None, dynamic=False, **kwargs):
        """Load the embedding file and build the underlying ``tf.keras.layers.Embedding``.

        Args:
            path: Path to the pre-trained embedding file.
            vocab: Vocabulary to align the table with. A new ``VocabTF`` is created when missing.
            normalize: Passed to ``_load``, which currently does not use it.
            load_all: ``True`` to unlock a locked ``vocab`` so that words of the embedding file can be
                added to it.
            mask_zero: ``True`` to mask out padding positions.
            trainable: ``True`` to fine-tune the embedding matrix.
            name: Layer name.
            dtype: Layer dtype.
            dynamic: Passed to the Keras ``Layer`` constructor.
            **kwargs: Passed to the Keras ``Layer`` constructor.
        """
        super().__init__(trainable, name, dtype, dynamic, **kwargs)
        if load_all and vocab and vocab.locked:
            vocab.unlock()
        self.vocab, self.array_np = self._load(path, vocab, normalize)
        self.vocab.lock()
        self.array_ks = tf.keras.layers.Embedding(input_dim=len(self.vocab), output_dim=self.dim,
                                                  trainable=trainable,
                                                  embeddings_initializer=tf.keras.initializers.Constant(
                                                      self.array_np),
                                                  mask_zero=mask_zero)
        self.mask_zero = mask_zero
        self.supports_masking = mask_zero

    def compute_mask(self, inputs, mask=None):
        """Mask positions equal to the vocabulary's ``pad_idx``; ``None`` when masking is disabled."""
        if not self.mask_zero:
            return None
        return math_ops.not_equal(inputs, self.vocab.pad_idx)

    def call(self, inputs, **kwargs):
        """Delegate the lookup to the wrapped Keras ``Embedding``."""
        return self.array_ks(inputs, **kwargs)

    def compute_output_shape(self, input_shape):
        # NOTE(review): this reports (first input dim, embedding dim) and drops any other input
        # dimensions - confirm this is what callers expect.
        return input_shape[0], self.dim

    @staticmethod
    def _load(path, vocab, normalize=False) -> Tuple[VocabTF, Union[np.ndarray, None]]:
        """Read the embedding file and align it with ``vocab``.

        Args:
            path: Path to the embedding file. ``None`` or empty skips loading.
            vocab: Vocabulary to extend with the words of the file. A new one is created when missing.
            normalize: Unused.

        Returns:
            The vocabulary and a ``(len(vocab), dim)`` float32 matrix, or ``None`` in place of the matrix
            when no ``path`` is given. Words without a pre-trained vector keep an all-zero row.
        """
        if not vocab:
            vocab = VocabTF()
        if not path:
            return vocab, None
        assert vocab.unk_idx is not None

        word2vec, dim = load_word2vec(path)
        for word in word2vec:
            vocab.get_idx(word)

        pret_embs = np.zeros(shape=(len(vocab), dim), dtype=np.float32)
        for word, idx in vocab.token_to_idx.items():
            vec = word2vec.get(word, None)
            if vec is None:
                # Retry with the lowercase form before giving up.
                vec = word2vec.get(word.lower(), None)
            if vec is not None:
                pret_embs[idx] = vec
        return vocab, pret_embs

    @property
    def size(self):
        """Number of rows of the embedding matrix, or ``None`` when nothing was loaded."""
        if self.array_np is not None:
            return self.array_np.shape[0]

    @property
    def dim(self):
        """Embedding dimension, or ``None`` when nothing was loaded."""
        if self.array_np is not None:
            return self.array_np.shape[1]

    @property
    def shape(self):
        """Shape of the embedding matrix, or ``None`` when nothing was loaded."""
        if self.array_np is None:
            return None
        return self.array_np.shape

    def get_vector(self, word: str) -> np.ndarray:
        """Return the row of ``word`` without adding it to the vocabulary."""
        assert self.array_np is not None
        return self.array_np[self.vocab.get_idx_without_add(word)]

    def __getitem__(self, word: Union[str, List, tf.Tensor]) -> np.ndarray:
        """Look up vectors by word, list of words, or tensor of words/ids.

        Args:
            word: A string, a list of strings, a ``tf.string`` tensor, or an int32/int64 id tensor.

        Returns:
            One vector for a string, a matrix for a list, or the result of ``tf.nn.embedding_lookup`` on
            the loaded matrix for tensors.

        Raises:
            TypeError: If ``word`` is of an unsupported type or tensor dtype.
        """
        if isinstance(word, str):
            return self.get_vector(word)
        if isinstance(word, list):
            vectors = np.zeros(shape=(len(word), self.dim))
            for idx, token in enumerate(word):
                vectors[idx] = self.get_vector(token)
            return vectors
        if isinstance(word, tf.Tensor):
            # The loaded matrix lives in ``array_np``; there is no ``array_tf`` attribute on this class.
            if word.dtype == tf.string:
                word_ids = self.vocab.token_to_idx_table.lookup(word)
                return tf.nn.embedding_lookup(self.array_np, word_ids)
            if word.dtype in (tf.int32, tf.int64):
                return tf.nn.embedding_lookup(self.array_np, word)
            raise TypeError(f'Unsupported tensor dtype for embedding lookup: {word.dtype}')
        raise TypeError(f'Unsupported key type for embedding lookup: {type(word)}')
embeddings_initializer='VarianceScaling', embeddings_regularizer=None, activity_regularizer=None, embeddings_constraint=None, mask_zero=True, input_length=None, name=None, cpu=True, **kwargs): filepath = get_resource(filepath) word2vec, _output_dim = load_word2vec(filepath) if output_dim: assert output_dim == _output_dim, f'output_dim = {output_dim} does not match {filepath}' output_dim = _output_dim # if the `unk` token exists in the pretrained, # then replace it with a self-defined one, usually the one in word vocab if unk and unk in word2vec: word2vec[vocab.safe_unk_token] = word2vec.pop(unk) if vocab is None: vocab = VocabTF() vocab.update(word2vec.keys()) if expand_vocab and vocab.mutable: for word in word2vec: vocab.get_idx(word.lower() if lowercase else word) if input_dim: assert input_dim == len(vocab), f'input_dim = {input_dim} does not match {filepath}' input_dim = len(vocab) # init matrix self._embeddings_initializer = embeddings_initializer embeddings_initializer = tf.keras.initializers.get(embeddings_initializer) with tf.device('cpu:0') if cpu else DummyContext(): pret_embs = embeddings_initializer(shape=[input_dim, output_dim]).numpy() # insert to pret_embs for word, idx in vocab.token_to_idx.items(): vec = word2vec.get(word, None) # Retry lower case if vec is None and lowercase: vec = word2vec.get(word.lower(), None) if vec is not None: pret_embs[idx] = vec if normalize: pret_embs /= np.std(pret_embs) if not name: name = os.path.splitext(os.path.basename(filepath))[0] super().__init__(input_dim, output_dim, tf.keras.initializers.Constant(pret_embs), embeddings_regularizer, activity_regularizer, embeddings_constraint, mask_zero, input_length, name=name, **kwargs) self.filepath = filepath self.expand_vocab = expand_vocab self.lowercase = lowercase def get_config(self): config = { 'filepath': self.filepath, 'expand_vocab': self.expand_vocab, 'lowercase': self.lowercase, } base_config = super(Word2VecEmbeddingTF, self).get_config() 
@hanlp_register
class StringWord2VecEmbeddingTF(Word2VecEmbeddingTF):
    """A ``Word2VecEmbeddingTF`` that takes string tensors and maps them to ids itself."""

    def __init__(self, filepath: str = None, vocab: VocabTF = None, expand_vocab=True, lowercase=False,
                 input_dim=None, output_dim=None, unk=None, normalize=False,
                 embeddings_initializer='VarianceScaling', embeddings_regularizer=None,
                 activity_regularizer=None, embeddings_constraint=None, mask_zero=True, input_length=None,
                 name=None, **kwargs):
        """Create the layer; all arguments are forwarded to ``Word2VecEmbeddingTF``.

        Args:
            filepath: Path to the pre-trained embedding file.
            vocab: Vocabulary used to map strings to ids. A new ``VocabTF`` is created when missing.
            expand_vocab: Forwarded to the parent class.
            lowercase: Forwarded to the parent class.
            input_dim: Forwarded to the parent class.
            output_dim: Forwarded to the parent class.
            unk: Forwarded to the parent class.
            normalize: Forwarded to the parent class.
            embeddings_initializer: Forwarded to the parent class.
            embeddings_regularizer: Forwarded to the parent class.
            activity_regularizer: Forwarded to the parent class.
            embeddings_constraint: Forwarded to the parent class.
            mask_zero: ``True`` to mask out padding tokens.
            input_length: Forwarded to the parent class.
            name: Layer name.
            **kwargs: Forwarded to the parent class.
        """
        if vocab is None:
            vocab = VocabTF()
        # Kept on the instance because ``call`` and ``compute_mask`` need it for the string lookup.
        self.vocab = vocab
        super().__init__(filepath, vocab, expand_vocab, lowercase, input_dim, output_dim, unk, normalize,
                         embeddings_initializer, embeddings_regularizer, activity_regularizer,
                         embeddings_constraint, mask_zero, input_length, name, **kwargs)

    def call(self, inputs):
        """Convert string tokens to ids with the vocabulary, then embed them.

        Args:
            inputs: A ``tf.string`` tensor of tokens.

        Returns:
            The embedded tensor produced by the parent layer.
        """
        assert inputs.dtype == tf.string, \
            f'Expect tf.string but got tf.{inputs.dtype.name}. {inputs} ' \
            f'Please pass tf.string in.'
        inputs = self.vocab.lookup(inputs)
        return super().call(inputs)

    def compute_mask(self, inputs, mask=None):
        """Mask positions whose string equals the padding token; ``None`` when masking is disabled."""
        if not self.mask_zero:
            return None
        # Inputs are still strings here, so they are compared with the pad token rather than its index.
        return tf.not_equal(inputs, self.vocab.pad_token)
================================================ """ A feed-forward neural network. """ from typing import List, Union import torch from hanlp.utils.torch_util import activation_from_name class FeedForward(torch.nn.Module): """ This `Module` is a feed-forward neural network, just a sequence of `Linear` layers with activation functions in between. # Parameters input_dim : `int`, required The dimensionality of the input. We assume the input has shape `(batch_size, input_dim)`. num_layers : `int`, required The number of `Linear` layers to apply to the input. hidden_dims : `Union[int, List[int]]`, required The output dimension of each of the `Linear` layers. If this is a single `int`, we use it for all `Linear` layers. If it is a `List[int]`, `len(hidden_dims)` must be `num_layers`. activations : `Union[Activation, List[Activation]]`, required The activation function to use after each `Linear` layer. If this is a single function, we use it after all `Linear` layers. If it is a `List[Activation]`, `len(activations)` must be `num_layers`. Activation must have torch.nn.Module type. dropout : `Union[float, List[float]]`, optional (default = `0.0`) If given, we will apply this amount of dropout after each layer. Semantics of `float` versus `List[float]` is the same as with other parameters. 
def __init__(
        self,
        input_dim: int,
        num_layers: int,
        hidden_dims: Union[int, List[int]],
        activations: Union[str, List[str]],
        dropout: Union[float, List[float]] = 0.0,
) -> None:
    """Assemble ``num_layers`` stages of ``Linear`` -> activation -> ``Dropout``.

    Args:
        input_dim: Size of the last dimension of the input.
        num_layers: Number of ``Linear`` layers.
        hidden_dims: Output size of every layer, or one size shared by all layers.
        activations: Activation name(s) resolved through ``activation_from_name``; one per layer, or a
            single name shared by all layers.
        dropout: Dropout probability per layer, or one value shared by all layers.

    Raises:
        ValueError: If a list argument does not have exactly ``num_layers`` entries.
    """
    super().__init__()

    def _per_layer(setting):
        # A value that is not a list is repeated for every layer.
        return setting if isinstance(setting, list) else [setting] * num_layers

    hidden_dims = _per_layer(hidden_dims)
    # Each name is resolved and instantiated right away, before any length check runs.
    activations = [activation_from_name(name)() for name in _per_layer(activations)]
    dropout = _per_layer(dropout)

    if len(hidden_dims) != num_layers:
        raise ValueError(
            "len(hidden_dims) (%d) != num_layers (%d)" % (len(hidden_dims), num_layers)
        )
    if len(activations) != num_layers:
        raise ValueError(
            "len(activations) (%d) != num_layers (%d)" % (len(activations), num_layers)
        )
    if len(dropout) != num_layers:
        raise ValueError(
            "len(dropout) (%d) != num_layers (%d)" % (len(dropout), num_layers)
        )

    self._activations = torch.nn.ModuleList(activations)
    # Layer i consumes the output of layer i - 1; the first one consumes the raw input.
    fan_ins = [input_dim] + hidden_dims[:-1]
    self._linear_layers = torch.nn.ModuleList(
        [torch.nn.Linear(fan_in, fan_out) for fan_in, fan_out in zip(fan_ins, hidden_dims)]
    )
    self._dropout = torch.nn.ModuleList([torch.nn.Dropout(p=rate) for rate in dropout])
    self._output_dim = hidden_dims[-1]
    self.input_dim = input_dim
class ScalarMixWithDropout(torch.nn.Module):
    """Computes a parameterised scalar mixture of N tensors, ``mixture = gamma * sum(s_k * tensor_k)``
    where ``s = softmax(w)``, with ``w`` and ``gamma`` scalar parameters.

    If ``do_layer_norm=True`` then apply layer normalization to each tensor before weighting.

    If ``dropout > 0``, then for each scalar weight, adjust its softmax weight mass to 0 with
    the dropout probability (i.e., setting the unnormalized weight to -inf). This effectively
    should redistribute dropped probability mass to all other weights.
    """

    def __init__(self,
                 mixture_range: Tuple[int, int],
                 do_layer_norm: bool = False,
                 initial_scalar_parameters: List[float] = None,
                 trainable: bool = True,
                 dropout: float = None,
                 dropout_value: float = -1e20, **kwargs) -> None:
        """Create the mixture weights.

        Args:
            mixture_range: ``(start, end)`` slice of layers to mix; ``end - start`` tensors are mixed.
            do_layer_norm: ``True`` to layer-normalize every tensor before weighting.
            initial_scalar_parameters: Initial unnormalized weights, one per mixed tensor. Zeros when
                omitted.
            trainable: ``False`` to freeze both the mixture weights and ``gamma``.
            dropout: Probability of dropping each weight. Falsy values disable dropout.
            dropout_value: Unnormalized weight assigned to a dropped entry.
            **kwargs: Ignored.

        Raises:
            ValueError: If ``initial_scalar_parameters`` does not have one entry per mixed tensor.
        """
        super(ScalarMixWithDropout, self).__init__()
        self.mixture_range = mixture_range
        mixture_size = mixture_range[1] - mixture_range[0]
        self.mixture_size = mixture_size
        self.do_layer_norm = do_layer_norm
        self.dropout = dropout

        if initial_scalar_parameters is None:
            initial_scalar_parameters = [0.0] * mixture_size
        elif len(initial_scalar_parameters) != mixture_size:
            raise ValueError("Length of initial_scalar_parameters {} differs "
                             "from mixture_size {}".format(
                len(initial_scalar_parameters), mixture_size))

        # ``trainable`` governs the mixture weights as well as ``gamma``.
        self.scalar_parameters = Parameter(torch.FloatTensor(initial_scalar_parameters),
                                           requires_grad=trainable)
        self.gamma = Parameter(torch.FloatTensor([1.0]), requires_grad=trainable)

        if self.dropout:
            dropout_mask = torch.zeros(len(self.scalar_parameters))
            dropout_fill = torch.empty(len(self.scalar_parameters)).fill_(dropout_value)
            self.register_buffer("dropout_mask", dropout_mask)
            self.register_buffer("dropout_fill", dropout_fill)

    def forward(self, tensors: List[torch.Tensor],  # pylint: disable=arguments-differ
                mask: torch.Tensor = None) -> torch.Tensor:
        """Compute a weighted average of the ``tensors``.

        The input tensors must all be the same shape. Without layer norm each tensor must have exactly
        three dimensions, as in ``(batch_size, timesteps, dim)``.

        When ``do_layer_norm=True``, the ``mask`` is required input. If the ``tensors`` are
        dimensioned ``(dim_0, ..., dim_{n-1}, dim_n)``, then the ``mask`` is dimensioned
        ``(dim_0, ..., dim_{n-1})``, as in the typical case with ``tensors`` of shape
        ``(batch_size, timesteps, dim)`` and ``mask`` of shape ``(batch_size, timesteps)``.

        When ``do_layer_norm=False`` the ``mask`` is ignored.

        Args:
            tensors: The layers to mix, either as a list/tuple of tensors or stacked along dimension 0.
                When more layers than ``mixture_size`` are given, the ``mixture_range`` slice is taken.
            mask: Mask of valid positions, only used with layer norm.

        Returns:
            The mixed tensor, with the shape of one input tensor.

        Raises:
            ValueError: If the number of tensors does not match, or if ``mask`` is missing while layer
                norm is enabled.
        """
        if len(tensors) != self.mixture_size:
            tensors = tensors[self.mixture_range[0]:self.mixture_range[1]]
        if len(tensors) != self.mixture_size:
            raise ValueError("{} tensors were passed, but the module was initialized to "
                             "mix {} tensors.".format(len(tensors), self.mixture_size))

        def _do_layer_norm(tensor, broadcast_mask, num_elements_not_masked):
            # Mean and variance are computed over the unmasked elements only.
            tensor_masked = tensor * broadcast_mask
            mean = torch.sum(tensor_masked) / num_elements_not_masked
            variance = torch.sum(((tensor_masked - mean) * broadcast_mask) ** 2) / num_elements_not_masked
            return (tensor - mean) / torch.sqrt(variance + 1E-12)

        weights = self.scalar_parameters
        if self.dropout:
            # NOTE(review): the dropout mask is resampled on every call, including in eval mode -
            # confirm whether it should be restricted to ``self.training``.
            weights = torch.where(self.dropout_mask.uniform_() > self.dropout, weights, self.dropout_fill)

        normed_weights = torch.nn.functional.softmax(weights, dim=0)

        if not self.do_layer_norm:
            if not isinstance(tensors, torch.Tensor):
                # einsum needs one stacked tensor; accept the documented list input as well.
                tensors = torch.stack(list(tensors))
            return self.gamma * torch.einsum('i,ijkl->jkl', normed_weights, tensors)

        if mask is None:
            raise ValueError("mask is required when do_layer_norm=True")
        normed_weights = torch.split(normed_weights, split_size_or_sections=1)
        mask_float = mask.float()
        broadcast_mask = mask_float.unsqueeze(-1)
        input_dim = tensors[0].size(-1)
        num_elements_not_masked = torch.sum(mask_float) * input_dim

        pieces = []
        for weight, tensor in zip(normed_weights, tensors):
            pieces.append(weight * _do_layer_norm(tensor,
                                                  broadcast_mask, num_elements_not_masked))
        return self.gamma * sum(pieces)
class TimeDistributed(torch.nn.Module):
    """
    Applies a wrapped `Module` to every time step of its inputs.

    Inputs shaped `(batch_size, time_steps, [rest])` are flattened to
    `(batch_size * time_steps, [rest])`, handed to the wrapped `Module`, and the result is unflattened
    again. Only the first two dimensions are merged and split, so the order of `batch_size` and
    `time_steps` does not matter.

    Tensor keyword arguments are flattened in the same way unless their name is listed in the optional
    `pass_through` iterable; non-tensor keyword arguments are forwarded unchanged.
    """

    def __init__(self, module):
        super().__init__()
        self._module = module

    def forward(self, *inputs, pass_through: List[str] = None, **kwargs):
        untouched = pass_through or []

        flat_args = [self._reshape_tensor(arg) for arg in inputs]

        # One original tensor is kept around to recover batch_size and time_steps afterwards.
        reference = inputs[-1] if inputs else None

        flat_kwargs = {}
        for name, item in kwargs.items():
            distribute = isinstance(item, torch.Tensor) and name not in untouched
            if distribute and reference is None:
                reference = item
            flat_kwargs[name] = self._reshape_tensor(item) if distribute else item

        result = self._module(*flat_args, **flat_kwargs)

        if reference is None:
            raise RuntimeError("No input tensor to time-distribute")

        # Restore the leading (batch_size, time_steps) pair in front of the module's output dimensions.
        restored_shape = reference.size()[:2] + result.size()[1:]
        return result.contiguous().view(restored_shape)

    @staticmethod
    def _reshape_tensor(input_tensor):
        input_size = input_tensor.size()
        if len(input_size) <= 2:
            raise RuntimeError(f"No dimension to distribute: {input_size}")
        # Merge the first two axes into one and leave the remaining axes as they are.
        trailing = list(input_size[2:])
        return input_tensor.contiguous().view(-1, *trailing)
# noinspection PyAbstractClass
class TransformerEncoder(nn.Module):
    def __init__(self,
                 transformer: Union[PreTrainedModel, str],
                 transformer_tokenizer: PreTrainedTokenizer,
                 average_subwords=False,
                 scalar_mix: Union[ScalarMixWithDropoutBuilder, int] = None,
                 word_dropout=None,
                 max_sequence_length=None,
                 ret_raw_hidden_states=False,
                 transformer_args: Dict[str, Any] = None,
                 trainable: Union[bool, Optional[Tuple[int, int]]] = True,
                 training=True) -> None:
        """A pre-trained transformer encoder.

        Args:
            transformer: A ``PreTrainedModel`` or an identifier of a ``PreTrainedModel``.
            transformer_tokenizer: A ``PreTrainedTokenizer``.
            average_subwords: ``True`` to average subword representations.
            scalar_mix: Layer attention. Only a ``ScalarMixWithDropoutBuilder`` is built into a mixing layer;
                any other non-``None`` value merely turns on ``output_hidden_states`` when the transformer is
                loaded by identifier.
            word_dropout: Dropout rate of randomly replacing a subword with MASK. May also be a pair
                ``(rate, replacement)`` where ``replacement`` is ``'unk'``, ``'mask'`` or a token id.
            max_sequence_length: The maximum sequence length. Sequence longer than this will be handled by
                sliding window. If ``None`` and ``transformer`` is an identifier, then the
                ``max_position_embeddings`` of the transformer will be used.
            ret_raw_hidden_states: ``True`` to return hidden states of each layer.
            transformer_args: Extra arguments passed to the transformer. The dict is copied, never modified.
            trainable: ``False`` to use static embeddings (all transformer parameters frozen). A tuple
                ``(start, end)`` keeps only the layers with index in ``[start, end)`` trainable, where the
                layers are the embedding layer (if the model has one) followed by the encoder layers.
                Defaults to ``True`` (everything trainable).
            training: ``False`` to skip loading weights from pre-trained transformers.
        """
        super().__init__()
        self.ret_raw_hidden_states = ret_raw_hidden_states
        self.average_subwords = average_subwords
        if word_dropout:
            oov = transformer_tokenizer.mask_token_id
            if isinstance(word_dropout, Sequence):
                word_dropout, replacement = word_dropout
                if replacement == 'unk':
                    # Electra English has to use unk
                    oov = transformer_tokenizer.unk_token_id
                elif replacement == 'mask':
                    # UDify uses [MASK]
                    oov = transformer_tokenizer.mask_token_id
                else:
                    oov = replacement
            pad = transformer_tokenizer.pad_token_id
            cls = transformer_tokenizer.cls_token_id
            sep = transformer_tokenizer.sep_token_id
            # Special tokens must never be dropped out.
            excludes = [pad, cls, sep]
            self.word_dropout = WordDropout(p=word_dropout, oov_token=oov, exclude_tokens=excludes)
        else:
            self.word_dropout = None
        if isinstance(transformer, str):
            # Work on a private copy so the caller's dict is not mutated.
            transformer_args = dict(transformer_args or {})
            transformer_args['output_hidden_states'] = scalar_mix is not None
            # A frozen encoder still needs its pre-trained weights, hence ``or not trainable``.
            transformer = AutoModel_.from_pretrained(transformer, training=training or not trainable,
                                                     **transformer_args)
            if max_sequence_length is None:
                max_sequence_length = transformer.config.max_position_embeddings
        self.max_sequence_length = max_sequence_length
        if hasattr(transformer, 'encoder') and hasattr(transformer, 'decoder'):
            # For seq2seq model, use its encoder
            transformer = transformer.encoder
        self.transformer = transformer
        if not trainable:
            transformer.requires_grad_(False)
        elif isinstance(trainable, tuple):
            layers = []
            if hasattr(transformer, 'embeddings'):
                layers.append(transformer.embeddings)
            layers.extend(transformer.encoder.layer)
            first_trainable, end_trainable = trainable[0], trainable[1]
            for i, layer in enumerate(layers):
                # Freeze everything outside the half-open range [first_trainable, end_trainable).
                if i < first_trainable or i >= end_trainable:
                    layer.requires_grad_(False)

        if isinstance(scalar_mix, ScalarMixWithDropoutBuilder):
            self.scalar_mix: ScalarMixWithDropout = scalar_mix.build()
        else:
            self.scalar_mix = None

    def forward(self, input_ids: torch.LongTensor, attention_mask=None, token_type_ids=None, token_span=None,
                **kwargs):
        """Encode subword ids and pool them into token representations.

        Args:
            input_ids: Indices of subwords.
            attention_mask: Mask for these subwords.
            token_type_ids: Type ids for each subword.
            token_span: The spans of tokens.
            **kwargs: Ignored.

        Returns:
            The (optionally scalar-mixed) encoder output, or a pair of that output and the raw hidden states
            when ``ret_raw_hidden_states`` is set.
        """
        if self.word_dropout:
            input_ids = self.word_dropout(input_ids)

        x = transformer_encode(self.transformer,
                               input_ids,
                               attention_mask,
                               token_type_ids,
                               token_span,
                               layer_range=self.scalar_mix.mixture_range if self.scalar_mix else 0,
                               max_sequence_length=self.max_sequence_length,
                               average_subwords=self.average_subwords,
                               ret_raw_hidden_states=self.ret_raw_hidden_states)
        if self.ret_raw_hidden_states:
            x, raw_hidden_states = x
        if self.scalar_mix:
            x = self.scalar_mix(x)
        if self.ret_raw_hidden_states:
            # noinspection PyUnboundLocalVariable
            return x, raw_hidden_states
        return x

    @staticmethod
    def build_transformer(config, training=True) -> PreTrainedModel:
        """Build the transformer named by ``config.transformer``.

        Hidden states of all layers are requested when ``config.scalar_mix`` is a positive number.
        """
        kwargs = {}
        if config.scalar_mix and config.scalar_mix > 0:
            kwargs['output_hidden_states'] = True
        transformer = AutoModel_.from_pretrained(config.transformer, training=training, **kwargs)
        return transformer

    @staticmethod
    def build_transformer_tokenizer(config_or_str, use_fast=True, do_basic_tokenize=True) -> PreTrainedTokenizer:
        """Build the tokenizer for a transformer identifier or a config carrying one."""
        return AutoTokenizer_.from_pretrained(config_or_str, use_fast, do_basic_tokenize)
class AutoModel_(AutoModel):
    """``AutoModel`` variant that resolves HanLP mirrors and can skip weight loading."""

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, training=True, **kwargs):
        """Instantiate a model, optionally from its configuration only.

        Args:
            pretrained_model_name_or_path: A model identifier/path, or (when ``training`` is ``False``)
                possibly an already loaded configuration object.
            *model_args: Positional arguments forwarded to the parent ``from_pretrained``.
            training: ``True`` to load pre-trained weights. ``False`` to build the model from its
                configuration via ``from_config`` instead.
            **kwargs: Forwarded to the parent ``from_pretrained`` or to ``AutoConfig.from_pretrained``.

        Returns:
            The instantiated model.
        """
        source = get_model_mirror(pretrained_model_name_or_path)
        if training:
            return super().from_pretrained(source, *model_args, **kwargs)
        if not isinstance(source, str):
            # Not an identifier: hand it to ``from_config`` directly; extra kwargs have nowhere to go.
            assert not kwargs
            return super().from_config(source)
        # Only the configuration is needed, which ships with the tokenizer mirror.
        source = get_tokenizer_mirror(source)
        return super().from_config(AutoConfig.from_pretrained(source, **kwargs))
class RelativeSinusoidalPositionalEmbedding(nn.Module):
    """Sinusoidal embeddings indexed by relative position.

    A table of embeddings for offsets around zero is kept in the ``weights`` buffer and regrown on demand
    when a longer sequence is seen.

    Args:
        embedding_dim: Embedding size of each position.
        padding_idx: Row of the table that is zeroed out; it is also added to the sequence length when
            deciding whether the table has to grow.
        init_size: Initial (even) number of offsets covered by the table.
    """

    def __init__(self, embedding_dim, padding_idx, init_size=1024):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.padding_idx = padding_idx
        assert init_size % 2 == 0
        # One extra row so that the table is centred on offset zero.
        self.register_buffer('weights', self.get_embedding(init_size + 1, embedding_dim, padding_idx))

    def get_embedding(self, num_embeddings, embedding_dim, padding_idx=None):
        """Build sinusoidal embeddings.

        This matches the implementation in tensor2tensor, but differs slightly from the description in
        Section 3.5 of "Attention Is All You Need".

        Args:
            num_embeddings: Number of rows (relative offsets) in the table.
            embedding_dim: Width of each row.
            padding_idx: Row to zero out, if any. (Default value = None)

        Returns:
            A ``num_embeddings x embedding_dim`` float tensor. As a side effect ``self.origin_shift`` is set
            to ``num_embeddings // 2 + 1``.
        """
        half_dim = embedding_dim // 2
        log_step = math.log(10000) / (half_dim - 1)
        inv_freq = torch.exp(torch.arange(half_dim, dtype=torch.float) * -log_step)
        offsets = torch.arange(-num_embeddings // 2, num_embeddings // 2, dtype=torch.float)
        angles = offsets.unsqueeze(1) * inv_freq.unsqueeze(0)
        table = torch.cat([torch.sin(angles), torch.cos(angles)], dim=1).view(num_embeddings, -1)
        if embedding_dim % 2 == 1:
            # Odd width: append a zero column.
            table = torch.cat([table, torch.zeros(num_embeddings, 1)], dim=1)
        if padding_idx is not None:
            table[padding_idx, :] = 0
        self.origin_shift = num_embeddings // 2 + 1
        return table

    def forward(self, inputs: Tensor):
        """Look up embeddings for all relative offsets of a batch.

        Args:
            inputs: A tensor of size ``[bsz x seqlen]``; only its shape and device are used.

        Returns:
            A detached ``2 * seqlen x embedding_dim`` tensor for offsets ``-seqlen .. seqlen - 1``.
        """
        _, seq_len = inputs.size()
        required = self.padding_idx + seq_len
        if required >= self.origin_shift:
            # The table is too small: rebuild it and swap the buffer.
            grown = self.get_embedding(required * 2, self.embedding_dim, self.padding_idx)
            grown = grown.to(self.weights.device)
            del self.weights
            # The regrown table has an even number of rows, so offset zero sits at exactly half.
            self.origin_shift = grown.size(0) // 2
            self.register_buffer('weights', grown)

        rel_index = torch.arange(-seq_len, seq_len).to(inputs.device).long() + self.origin_shift
        return self.weights.index_select(0, rel_index.long()).detach()
def _shift(self, BD):
    """Re-align per-query relative scores so that column ``j`` refers to key ``j``.

    Every query row holds scores for the relative offsets ``-max_len .. max_len - 1``. After the shift,
    row ``i`` keeps only the offsets ``-i .. max_len - 1 - i``. For example, three rows that all read::

        -3 -2 -1 0 1 2

    are turned into::

         0  1  2
        -1  0  1
        -2 -1  0

    Args:
        BD: batch_size x n_head x max_len x 2max_len

    Returns:
        batch_size x n_head x max_len x max_len
    """
    batch, heads, length, _ = BD.size()
    # Appending one zero column makes each row 2*length+1 wide, so re-reading the data in rows of
    # a different width slides every successive query by one position.
    padded = torch.cat([BD, BD.new_zeros(batch, heads, length, 1)], dim=-1)
    skewed = padded.view(batch, heads, -1, length)[:, :, :2 * length]
    realigned = skewed.view(batch, heads, length, -1)
    # The second half of each realigned row holds the scores for keys 0 .. length-1.
    return realigned[..., length:2 * length]
class RelativeTransformerLayer(nn.Module):
    """One transformer block built on relative-position multi-head attention."""

    def __init__(self,
                 in_features,
                 num_heads=4,
                 feedforward_dim=256,
                 dropout=0.2,
                 dropout_attn=None,
                 after_norm=True,
                 k_as_x=True,
                 init_seq_length=1024):
        """
        Args:
            in_features: Hidden size of the input and output.
            num_heads: Number of attention heads.
            feedforward_dim: Inner size of the feed-forward network.
            dropout: Dropout rate inside the feed-forward network.
            dropout_attn: Dropout rate on attention weights; falls back to ``dropout`` when ``None``.
            after_norm: ``True`` to apply layer norm after each residual connection, ``False`` to apply it
                to the input of each sub-layer.
            k_as_x: Forwarded to ``RelativeMultiHeadAttn``.
            init_seq_length: Forwarded to ``RelativeMultiHeadAttn``.
        """
        super().__init__()
        self.after_norm = after_norm
        self.norm1 = nn.LayerNorm(in_features)
        self.norm2 = nn.LayerNorm(in_features)
        attn_dropout = dropout if dropout_attn is None else dropout_attn
        self.self_attn = RelativeMultiHeadAttn(in_features,
                                               num_heads,
                                               dropout=attn_dropout,
                                               init_seq_length=init_seq_length,
                                               k_as_x=k_as_x)
        feedforward = [
            nn.Linear(in_features, feedforward_dim),
            nn.LeakyReLU(),
            nn.Dropout(dropout, inplace=True),
            nn.Linear(feedforward_dim, in_features),
            nn.Dropout(dropout, inplace=True),
        ]
        self.ffn = nn.Sequential(*feedforward)

    def forward(self, x, mask):
        """
        Args:
            x: batch_size x max_len x hidden_size
            mask: batch_size x max_len, zeros mark padding

        Returns:
            batch_size x max_len x hidden_size
        """
        if self.after_norm:
            # Post-norm: sub-layer, residual, then normalise.
            attended = self.norm1(self.self_attn(x, mask) + x)
            return self.norm2(attended + self.ffn(attended))
        # Pre-norm: normalise, sub-layer, then residual.
        attended = self.self_attn(self.norm1(x), mask) + x
        return attended + self.ffn(self.norm2(attended))
class RelativeTransformerEncoder(RelativeTransformer, ConfigTracker):
    """A ``RelativeTransformer`` whose constructor arguments are recorded through ``ConfigTracker``."""

    def __init__(self,
                 in_features,
                 num_layers=2,
                 num_heads=4,
                 feedforward_dim=256,
                 dropout=0.1,
                 dropout_attn=0.1,
                 after_norm=True,
                 k_as_x=True,
                 ):
        """
        Args:
            in_features: Hidden size of the input and output.
            num_layers: Number of stacked layers.
            num_heads: Number of attention heads.
            feedforward_dim: Inner size of the feed-forward network.
            dropout: Dropout rate.
            dropout_attn: Dropout rate on attention weights.
            after_norm: Whether layer norm is applied after the residual connections.
            k_as_x: Recorded in the config but NOT forwarded to the parent constructor, so the parent's own
                default is what takes effect.
        """
        # NOTE(review): ``num_heads`` and ``feedforward_dim`` are passed positionally in the opposite order
        # of ``RelativeTransformer.__init__(in_features, num_layers, feedforward_dim, num_heads, ...)``.
        # ``RelativeTransformer`` in turn hands them to ``RelativeTransformerLayer`` in swapped order as
        # well, so the two swaps cancel out on this path. Changing only one of the two call sites would
        # alter the architecture that gets built.
        super().__init__(in_features, num_layers, num_heads, feedforward_dim, dropout, dropout_attn, after_norm)
        # ``locals()`` must be captured before any other local variable is introduced, otherwise it would
        # leak into the recorded configuration.
        ConfigTracker.__init__(self, locals())

    def get_output_dim(self):
        """Return the ``in_features`` value stored in ``self.config``."""
        # presumably ``self.config`` is populated by ``ConfigTracker.__init__`` -- verify against its source
        return self.config['in_features']
def transformer_encode(transformer: PreTrainedModel,
                       input_ids,
                       attention_mask=None,
                       token_type_ids=None,
                       token_span=None,
                       layer_range: Union[int, Tuple[int, int]] = 0,
                       max_sequence_length=None,
                       average_subwords=False,
                       ret_raw_hidden_states=False):
    """Run transformer and pool its outputs.

    Args:
        transformer: A transformer model.
        input_ids: Indices of subwords.
        attention_mask: Mask for these subwords. Defaults to ``input_ids != 0``.
        token_type_ids: Type ids for each subword.
        token_span: The spans of tokens.
        layer_range: The range of layers to use. An ``int`` keeps every layer from that index on, a pair
            ``(start, end)`` keeps the half-open range of layers. Note that the 0-th layer means embedding
            layer, so the last 3 layers of a 12-layer BERT will be (10, 13).
        max_sequence_length: The maximum sequence length. Sequence longer than this will be handled by
            sliding window.
        average_subwords: ``True`` to average subword representations.
        ret_raw_hidden_states: ``True`` to return hidden states of each layer.

    Returns:
        Pooled outputs, or a pair of pooled outputs and the un-pooled outputs when
        ``ret_raw_hidden_states`` is set.
    """
    use_hidden_states = transformer.config.output_hidden_states
    if max_sequence_length and input_ids.size(-1) > max_sequence_length:
        # TODO: split token type ids in transformer_sliding_window if token type ids are not always 1
        outputs = transformer_sliding_window(transformer, input_ids, max_pieces=max_sequence_length)
    else:
        if attention_mask is None:
            attention_mask = input_ids.ne(0)
        # The hidden states of all layers come last, the final layer's output comes first.
        outputs = transformer(input_ids, attention_mask, token_type_ids)[-1 if use_hidden_states else 0]

    if not use_hidden_states:
        pooled = pick_tensor_for_each_token(outputs, token_span, average_subwords)
        if ret_raw_hidden_states:
            return pooled, outputs
        return pooled

    if isinstance(layer_range, int):
        outputs = outputs[layer_range:]
    else:
        # Slice the layer axis. The former ``outputs[start, end]`` was tuple indexing, which raises
        # ``TypeError`` on a tuple of hidden states and selects a single element on a stacked tensor.
        outputs = outputs[layer_range[0]:layer_range[1]]

    # Pick token representations from all layers at once instead of looping over layers.
    x = outputs if isinstance(outputs, torch.Tensor) else torch.stack(outputs)
    num_layers, batch_size, _, hidden_size = x.size()
    x = x.flatten(end_dim=1)
    # tile token_span as x
    if token_span is not None:
        token_span = token_span.repeat(num_layers, 1, 1)
    hs = pick_tensor_for_each_token(x, token_span, average_subwords).view(num_layers, batch_size, -1, hidden_size)
    if ret_raw_hidden_states:
        return hs, outputs
    return hs
def transformer_sliding_window(transformer: PreTrainedModel,
                               input_ids: torch.LongTensor,
                               input_mask=None,
                               offsets: torch.LongTensor = None,
                               token_type_ids: torch.LongTensor = None,
                               max_pieces=512,
                               start_tokens: int = 1,
                               end_tokens: int = 1,
                               ret_cls=None,
                               ) -> torch.Tensor:
    """Encode sequences longer than ``max_pieces`` by running the transformer on fixed-size windows.

    Args:
        transformer: The transformer to run. Its output must provide ``to_tuple()``.
        input_ids: Subword ids; the last axis is the sequence axis.
        input_mask: Attention mask. Defaults to ``input_ids != 0`` (computed after splitting).
        offsets: Optional indices used to select embeddings along the sequence axis. (Default value = None)
        token_type_ids: Currently unused. (Default value = None)
        max_pieces: Window size. (Default value = 512)
        start_tokens: Number of special tokens at the start of each window. (Default value = 1)
        end_tokens: Number of special tokens at the end of each window. (Default value = 1)
        ret_cls: When not ``None`` and the input had to be split, return pooled first-position vectors of
            the windows instead: ``'max'`` for max pooling, ``'raw'`` for the un-pooled vectors together
            with their mask, anything else for the mean. (Default value = None)

    Returns:
        The recombined embeddings, or the first-position vectors described under ``ret_cls``.
    """
    # pylint: disable=arguments-differ
    batch_size, full_seq_len = input_ids.size(0), input_ids.size(-1)
    initial_dims = list(input_ids.shape[:-1])

    # The embedder may receive an input tensor that has a sequence length longer than can
    # be fit. In that case, we should expect the wordpiece indexer to create padded windows
    # of length `max_pieces` for us, and have them concatenated into one long sequence.
    # E.g., "[CLS] I went to the [SEP] [CLS] to the store to [SEP] ..."
    # We can then split the sequence into sub-sequences of that length, and concatenate them
    # along the batch dimension so we effectively have one huge batch of partial sentences.
    # This can then be fed into BERT without any sentence length issues. Keep in mind
    # that the memory consumption can dramatically increase for large batches with extremely
    # long sentences.
    needs_split = full_seq_len > max_pieces
    if needs_split:
        input_ids = split_to_sliding_window(input_ids, max_pieces)

    if input_mask is None:
        input_mask = (input_ids != 0).long()

    # input_ids may have extra dimensions, so we reshape down to 2-d
    # before calling the BERT model and then reshape back at the end.
    model_output = transformer(input_ids=util.combine_initial_dims_to_1d_or_2d(input_ids),
                               attention_mask=util.combine_initial_dims_to_1d_or_2d(input_mask))
    outputs = model_output.to_tuple()
    if len(outputs) == 3:
        # ``hidden_states`` has to be read from the model output itself: the plain tuple returned by
        # ``to_tuple()`` has no such attribute.
        all_encoder_layers = torch.stack(model_output.hidden_states)
    elif len(outputs) == 2:
        all_encoder_layers, _ = outputs[:2]
    else:
        all_encoder_layers = outputs[0]

    if needs_split:
        if ret_cls is not None:
            # A window starts with the same id as the very first window unless it is padding only.
            cls_mask = input_ids[:, 0] == input_ids[0][0]
            cls_hidden = all_encoder_layers[:, 0, :]
            if ret_cls == 'max':
                cls_hidden[~cls_mask] = -1e20
            else:
                cls_hidden[~cls_mask] = 0
            # Windows were stacked along the batch axis; bring them back to (batch, windows, ...).
            cls_mask = cls_mask.view(-1, batch_size).transpose(0, 1)
            cls_hidden = cls_hidden.reshape(cls_mask.size(1), batch_size, -1).transpose(0, 1)
            if ret_cls == 'max':
                cls_hidden = cls_hidden.max(1)[0]
            elif ret_cls == 'raw':
                return cls_hidden, cls_mask
            else:
                cls_hidden = torch.sum(cls_hidden, dim=1)
                cls_hidden /= torch.sum(cls_mask, dim=1, keepdim=True)
            return cls_hidden
        else:
            recombined_embeddings, select_indices = restore_from_sliding_window(all_encoder_layers, batch_size,
                                                                                max_pieces, full_seq_len,
                                                                                start_tokens, end_tokens)
            initial_dims.append(len(select_indices))
    else:
        recombined_embeddings = all_encoder_layers

    if offsets is None:
        # Resize to (batch_size, d1, ..., dn, sequence_length, embedding_dim)
        dims = initial_dims if needs_split else input_ids.size()
        layers = util.uncombine_initial_dims(recombined_embeddings, dims)
    else:
        # offsets is (batch_size, d1, ..., dn, orig_sequence_length)
        offsets2d = util.combine_initial_dims_to_1d_or_2d(offsets)
        # now offsets is (batch_size * d1 * ... * dn, orig_sequence_length)
        range_vector = util.get_range_vector(offsets2d.size(0),
                                             device=util.get_device_of(recombined_embeddings)).unsqueeze(1)
        # selected embeddings is also (batch_size * d1 * ... * dn, orig_sequence_length)
        selected_embeddings = recombined_embeddings[:, range_vector, offsets2d]

        layers = util.uncombine_initial_dims(selected_embeddings, offsets.size())

    return layers
def build_optimizer_for_pretrained(model: torch.nn.Module,
                                   pretrained: torch.nn.Module,
                                   lr=1e-5,
                                   weight_decay=0.01,
                                   eps=1e-8,
                                   transformer_lr=None,
                                   transformer_weight_decay=None,
                                   no_decay=('bias', 'LayerNorm.bias', 'LayerNorm.weight'),
                                   **kwargs):
    """Build an AdamW optimizer with separate settings for pre-trained and newly added parameters.

    Args:
        model: The full model whose parameters are optimized.
        pretrained: The pre-trained sub-module of ``model``; its parameters use the transformer settings.
        lr: Learning rate of the non-pretrained parameters.
        weight_decay: Weight decay of the non-pretrained parameters.
        eps: Epsilon of AdamW.
        transformer_lr: Learning rate of the pre-trained parameters; defaults to ``lr``.
        transformer_weight_decay: Weight decay of the pre-trained parameters; defaults to ``weight_decay``.
        no_decay: A tuple of substrings, or a callable on the parameter name, telling which parameters are
            exempt from weight decay.
        **kwargs: Forwarded to ``AdamW``.

    Returns:
        The optimizer.
    """
    transformer_lr = lr if transformer_lr is None else transformer_lr
    transformer_weight_decay = weight_decay if transformer_weight_decay is None else transformer_weight_decay
    pretrained_params = set(pretrained.parameters())
    if isinstance(no_decay, tuple):
        def no_decay_fn(name):
            return any(nd in name for nd in no_decay)
    else:
        assert callable(no_decay), 'no_decay has to be callable or a tuple of str'
        no_decay_fn = no_decay

    # buckets[origin][decay kind] -> list of parameters
    buckets = defaultdict(lambda: defaultdict(list))
    for name, param in model.named_parameters():
        origin = 'pretrained' if param in pretrained_params else 'non_pretrained'
        decay_kind = 'no_decay' if no_decay_fn(name) else 'decay'
        buckets[origin][decay_kind].append(param)

    settings = (
        ('pretrained', 'decay', transformer_weight_decay, transformer_lr),
        ('pretrained', 'no_decay', 0.0, transformer_lr),
        ('non_pretrained', 'decay', weight_decay, lr),
        ('non_pretrained', 'no_decay', 0.0, lr),
    )
    grouped_parameters = [
        {'params': buckets[origin][decay_kind], 'weight_decay': group_decay, 'lr': group_lr}
        for origin, decay_kind, group_decay, group_lr in settings
    ]

    from transformers import optimization
    return optimization.AdamW(
        grouped_parameters,
        lr=lr,
        weight_decay=weight_decay,
        eps=eps,
        no_deprecation_warning=True,  # For backwards compatibility
        **kwargs)
def collect_decay_params(model, weight_decay):
    """Split the parameters of ``model`` into a decayed and a non-decayed group.

    Args:
        model: The module whose named parameters are grouped.
        weight_decay: Weight decay assigned to the decayed group.

    Returns:
        A list of two parameter-group dicts: first the parameters subject to ``weight_decay``, then those
        whose name contains ``bias`` or ``LayerNorm.weight``, with a weight decay of ``0.0``.
    """
    exempt_markers = ["bias", "LayerNorm.weight"]
    decayed, exempt = [], []
    for name, param in model.named_parameters():
        target = exempt if any(marker in name for marker in exempt_markers) else decayed
        target.append(param)
    return [
        {"params": decayed, "weight_decay": weight_decay},
        {"params": exempt, "weight_decay": 0.0},
    ]
file into a list of `InputBatch`s `cls_token_at_end` define the location of the CLS token: - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP] - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS] `cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet) Args: words: max_seq_length: tokenizer: labels: (Default value = None) label_map: (Default value = None) cls_token_at_end: (Default value = False) cls_token: (Default value = "[CLS]") cls_token_segment_id: (Default value = 1) sep_token: (Default value = "[SEP]") sep_token_extra: (Default value = False) pad_on_left: (Default value = False) pad_token_id: (Default value = 0) pad_token_segment_id: (Default value = 0) pad_token_label_id: (Default value = 0) sequence_a_segment_id: (Default value = 0) mask_padding_with_zero: (Default value = True) unk_token: (Default value = '[UNK]') do_padding: (Default value = True) Returns: """ args = locals() if not labels: labels = words pad_token_label_id = False tokens = [] label_ids = [] for word, label in zip(words, labels): word_tokens = tokenizer.tokenize(word) if not word_tokens: # some wired chars cause the tagger to return empty list word_tokens = [unk_token] * len(word) tokens.extend(word_tokens) # Use the real label id for the first token of the word, and padding ids for the remaining tokens label_ids.extend([label_map[label] if label_map else True] + [pad_token_label_id] * (len(word_tokens) - 1)) # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa. special_tokens_count = 3 if sep_token_extra else 2 if len(tokens) > max_seq_length - special_tokens_count: logger.warning( f'Input tokens {words} exceed the max sequence length of {max_seq_length - special_tokens_count}. ' f'The exceeded part will be truncated and ignored. 
' f'You are recommended to split your long text into several sentences within ' f'{max_seq_length - special_tokens_count} tokens beforehand.') tokens = tokens[: (max_seq_length - special_tokens_count)] label_ids = label_ids[: (max_seq_length - special_tokens_count)] # The convention in BERT is: # (a) For sequence pairs: # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] # token_type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 # (b) For single sequences: # tokens: [CLS] the dog is hairy . [SEP] # token_type_ids: 0 0 0 0 0 0 0 # # Where "token_type_ids" are used to indicate whether this is the first # sequence or the second sequence. The embedding vectors for `type=0` and # `type=1` were learned during pre-training and are added to the wordpiece # embedding vector (and position vector). This is not *strictly* necessary # since the [SEP] token unambiguously separates the sequences, but it makes # it easier for the model to learn the concept of sequences. # # For classification tasks, the first vector (corresponding to [CLS]) is # used as as the "sentence vector". Note that this only makes sense because # the entire model is fine-tuned. tokens += [sep_token] label_ids += [pad_token_label_id] if sep_token_extra: # roberta uses an extra separator b/w pairs of sentences tokens += [sep_token] label_ids += [pad_token_label_id] segment_ids = [sequence_a_segment_id] * len(tokens) if cls_token_at_end: tokens += [cls_token] label_ids += [pad_token_label_id] segment_ids += [cls_token_segment_id] else: tokens = [cls_token] + tokens label_ids = [pad_token_label_id] + label_ids segment_ids = [cls_token_segment_id] + segment_ids input_ids = tokenizer.convert_tokens_to_ids(tokens) # The mask has 1 for real tokens and 0 for padding tokens. Only real # tokens are attended to. input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) if do_padding: # Zero-pad up to the sequence length. 
padding_length = max_seq_length - len(input_ids) if pad_on_left: input_ids = ([pad_token_id] * padding_length) + input_ids input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids label_ids = ([pad_token_label_id] * padding_length) + label_ids else: input_ids += [pad_token_id] * padding_length input_mask += [0 if mask_padding_with_zero else 1] * padding_length segment_ids += [pad_token_segment_id] * padding_length label_ids += [pad_token_label_id] * padding_length assert len(input_ids) == max_seq_length assert len(input_mask) == max_seq_length assert len(segment_ids) == max_seq_length assert len(label_ids) == max_seq_length, f'failed for:\n {args}' else: assert len(set(len(x) for x in [input_ids, input_mask, segment_ids, label_ids])) == 1 return input_ids, input_mask, segment_ids, label_ids def build_adamw_optimizer(config, learning_rate, epsilon, clipnorm, train_steps, use_amp, warmup_steps, weight_decay_rate): opt = create_optimizer(init_lr=learning_rate, epsilon=epsilon, weight_decay_rate=weight_decay_rate, clipnorm=clipnorm, num_train_steps=train_steps, num_warmup_steps=warmup_steps) # opt = tfa.optimizers.AdamW(learning_rate=3e-5, epsilon=1e-08, weight_decay=0.01) # opt = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08) config.optimizer = tf.keras.utils.serialize_keras_object(opt) lr_config = config.optimizer['config']['learning_rate']['config'] if 'decay_schedule_fn' in lr_config: lr_config['decay_schedule_fn'] = dict( (k, v) for k, v in lr_config['decay_schedule_fn'].items() if not k.startswith('_')) if use_amp: # loss scaling is currently required when using mixed precision opt = tf.keras.mixed_precision.experimental.LossScaleOptimizer(opt, 'dynamic') return opt def adjust_tokens_for_transformers(sentence): """Adjust tokens for BERT See https://github.com/DoodleJZ/HPSG-Neural-Parser/blob/master/src_joint/Zparser.py#L1204 Args: sentence: Returns: """ 
@hanlp_register
class WeightNormalization(tf.keras.layers.Wrapper):
    """This wrapper reparameterizes a layer by decoupling the weight's
    magnitude and direction.

    This speeds up convergence by improving the
    conditioning of the optimization problem.
    Weight Normalization: A Simple Reparameterization to Accelerate
    Training of Deep Neural Networks: https://arxiv.org/abs/1602.07868
    Tim Salimans, Diederik P. Kingma (2016)
    WeightNormalization wrapper works for keras and tf layers.
    ```python
      net = WeightNormalization(
          tf.keras.layers.Conv2D(2, 2, activation='relu'),
          input_shape=(32, 32, 3),
          data_init=True)(x)
      net = WeightNormalization(
          tf.keras.layers.Conv2D(16, 5, activation='relu'),
          data_init=True)(net)
      net = WeightNormalization(
          tf.keras.layers.Dense(120, activation='relu'),
          data_init=True)(net)
      net = WeightNormalization(
          tf.keras.layers.Dense(n_classes),
          data_init=True)(net)
    ```

    Args:
      layer: The layer instance to wrap. For a ``tf.keras.layers.RNN`` the
        kernel of its ``cell`` is normalized, otherwise the kernel of the
        layer itself.
      data_init: If true, ``g`` (and the bias, when the wrapped layer exposes
        one) is initialized from the statistics of the first batch; otherwise
        ``g`` is initialized with the norm of the kernel.

    Raises:
      ValueError: In ``build`` if the wrapped layer (or RNN cell) has no
        ``kernel`` attribute.
    """

    def __init__(self, layer, data_init=True, **kwargs):
        super(WeightNormalization, self).__init__(layer, **kwargs)
        self.data_init = data_init
        self._track_trackable(layer, name='layer')
        # Serializes the one-off initialization of `g` performed in `call`.
        self._init_critical_section = tf.CriticalSection(name='init_mutex')
        self.is_rnn = isinstance(self.layer, tf.keras.layers.RNN)

    def _kernel_layer(self):
        """Return the object owning the kernel: the RNN cell or the wrapped layer itself."""
        return self.layer.cell if self.is_rnn else self.layer

    def build(self, input_shape):
        """Build `Layer`

        Creates the magnitude weight ``g``, keeps the wrapped kernel as the
        direction ``v`` and, for data dependent initialization, clones the
        wrapped layer without activation.

        Args:
          input_shape: Shape of the input; anything ``tf.TensorShape`` accepts.

        Raises:
          ValueError: If the wrapped layer contains no ``kernel``.
        """
        input_shape = tf.TensorShape(input_shape)
        self.input_spec = tf.keras.layers.InputSpec(
            shape=[None] + input_shape[1:])

        if not self.layer.built:
            self.layer.build(input_shape)

        kernel_layer = self._kernel_layer()

        if not hasattr(kernel_layer, 'kernel'):
            raise ValueError('`WeightNormalization` must wrap a layer that'
                             ' contains a `kernel` for weights')

        # The kernel's filter or unit dimension is -1
        self.layer_depth = int(kernel_layer.kernel.shape[-1])
        self.kernel_norm_axes = list(range(kernel_layer.kernel.shape.rank - 1))

        self.g = self.add_weight(
            name='g',
            shape=(self.layer_depth,),
            initializer='ones',
            dtype=kernel_layer.kernel.dtype,
            trainable=True)
        self.v = kernel_layer.kernel

        # Boolean flag flipped to True once `g` has been initialized.
        self._initialized = self.add_weight(
            name='initialized',
            shape=None,
            initializer='zeros',
            dtype=tf.dtypes.bool,
            trainable=False)

        if self.data_init:
            # Used for data initialization in self._data_dep_init.
            with tf.name_scope('data_dep_init'):
                layer_config = tf.keras.layers.serialize(self.layer)
                layer_config['config']['trainable'] = False
                self._naked_clone_layer = tf.keras.layers.deserialize(
                    layer_config)
                self._naked_clone_layer.build(input_shape)
                self._naked_clone_layer.set_weights(self.layer.get_weights())
                # The clone runs without activation.
                if self.is_rnn:
                    self._naked_clone_layer.cell.activation = None
                else:
                    self._naked_clone_layer.activation = None

        self.built = True

    def call(self, inputs):
        """Call `Layer`

        Initializes ``g`` on first use, replaces the kernel by
        ``l2_normalize(v) * g`` and runs the wrapped layer.

        Args:
          inputs: Input tensor(s) passed on to the wrapped layer.

        Returns:
          The output of the wrapped layer.
        """

        def _do_nothing():
            return tf.identity(self.g)

        def _update_weights():
            # Ensure we read `self.g` after _update_weights.
            with tf.control_dependencies(self._initialize_weights(inputs)):
                return tf.identity(self.g)

        g = self._init_critical_section.execute(lambda: tf.cond(
            self._initialized, _do_nothing, _update_weights))

        with tf.name_scope('compute_weights'):
            # Replace kernel by normalized weight variable. The kernel has to be
            # written to the same object `build` took it from; for RNNs that is
            # the cell, otherwise the normalized kernel would never be used.
            kernel_layer = self._kernel_layer()
            kernel_layer.kernel = tf.nn.l2_normalize(
                self.v, axis=self.kernel_norm_axes) * g

            # Ensure we calculate result after updating kernel.
            update_kernel = tf.identity(kernel_layer.kernel)
            with tf.control_dependencies([update_kernel]):
                outputs = self.layer(inputs)
                return outputs

    def compute_output_shape(self, input_shape):
        """Return the output shape of the wrapped layer as a ``tf.TensorShape``."""
        return tf.TensorShape(
            self.layer.compute_output_shape(input_shape).as_list())

    def _initialize_weights(self, inputs):
        """Initialize weight g.

        The initial value of g could either from the initial value in v,
        or by the input value if self.data_init is True.

        Args:
          inputs: The current input batch, used for data dependent initialization.

        Returns:
          The list of assignment ops, ending with the one that marks the layer
          as initialized.
        """
        with tf.control_dependencies([
            tf.debugging.assert_equal(  # pylint: disable=bad-continuation
                self._initialized,
                False,
                message='The layer has been initialized.')
        ]):
            if self.data_init:
                assign_tensors = self._data_dep_init(inputs)
            else:
                assign_tensors = self._init_norm()
            assign_tensors.append(self._initialized.assign(True))
            return assign_tensors

    def _init_norm(self):
        """Set the weight g with the norm of the weight vector."""
        with tf.name_scope('init_norm'):
            v_flat = tf.reshape(self.v, [-1, self.layer_depth])
            v_norm = tf.linalg.norm(v_flat, axis=0)
            g_tensor = self.g.assign(tf.reshape(v_norm, (self.layer_depth,)))
            return [g_tensor]

    def _data_dep_init(self, inputs):
        """Data dependent initialization.

        Args:
          inputs: The current input batch, fed through the activation-free clone.

        Returns:
          A list with the assignment to ``g`` and, if the wrapped layer has a
          bias, the assignment to that bias.
        """
        with tf.name_scope('data_dep_init'):
            # Generate data dependent init values
            x_init = self._naked_clone_layer(inputs)
            data_norm_axes = list(range(x_init.shape.rank - 1))
            m_init, v_init = tf.nn.moments(x_init, data_norm_axes)
            scale_init = 1. / tf.math.sqrt(v_init + 1e-10)

            # Assign data dependent init values
            g_tensor = self.g.assign(self.g * scale_init)
            if hasattr(self.layer, 'bias') and self.layer.bias is not None:
                bias_tensor = self.layer.bias.assign(-m_init * scale_init)
                return [g_tensor, bias_tensor]
            else:
                return [g_tensor]

    def get_config(self):
        """Return the base wrapper config extended with ``data_init``."""
        config = {'data_init': self.data_init}
        base_config = super(WeightNormalization, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
@hanlp_register
class SparseCategoricalCrossentropyOverBatchFirstDim(object):
    """Sparse categorical cross entropy on logits, summed and divided by the size of the first dim of ``y_true``."""

    def __init__(self) -> None:
        super().__init__()
        # Expose a function-like name on the instance.
        self.__name__ = type(self).__name__

    def __call__(self, y_true, y_pred, sample_weight=None, **kwargs):
        """Compute the loss.

        Args:
            y_true: Integer class ids.
            y_pred: Logits (``from_logits=True`` is used).
            sample_weight: Optional weights multiplied onto the per-item loss. (Default value = None)
            **kwargs: Ignored.

        Returns:
            The summed loss divided by ``tf.shape(y_true)[0]``.
        """
        loss = tf.keras.losses.sparse_categorical_crossentropy(y_true, y_pred, from_logits=True)
        if sample_weight is not None:
            loss = loss * sample_weight
        # could use sum of sample_weight[:,0] too
        # Cast the divisor to the dtype of the loss rather than a fixed float32,
        # so that float16/float64 losses do not fail on a dtype mismatch.
        batch_size = tf.cast(tf.shape(y_true)[0], loss.dtype)
        return tf.reduce_sum(loss) / batch_size

    def get_config(self):
        """Return an empty config; the loss has no parameters."""
        return {}


@hanlp_register
class MaskedSparseCategoricalCrossentropyOverBatchFirstDim(object):
    """Sparse categorical cross entropy on logits over the positions where ``y_true != mask_value``,
    summed and divided by the size of the first dim of ``y_true``."""

    def __init__(self, mask_value=0) -> None:
        super().__init__()
        self.mask_value = mask_value
        # Expose a function-like name on the instance.
        self.__name__ = type(self).__name__

    def __call__(self, y_true, y_pred, sample_weight=None, **kwargs):
        """Compute the loss.

        Args:
            y_true: Integer class ids; positions equal to ``mask_value`` are excluded.
            y_pred: Logits (``from_logits=True`` is used).
            sample_weight: Must be ``None``, the mask is derived from ``y_true``. (Default value = None)
            **kwargs: Ignored.

        Returns:
            The summed loss of the unmasked positions divided by ``tf.shape(y_true)[0]``.
        """
        assert sample_weight is None, 'the mask will be computed via y_true != mask_value, ' \
                                      'it might conflict with sample_weight'
        active_loss = tf.not_equal(y_true, self.mask_value)
        active_labels = tf.boolean_mask(y_true, active_loss)
        active_logits = tf.boolean_mask(y_pred, active_loss)
        loss = tf.keras.losses.sparse_categorical_crossentropy(active_labels, active_logits, from_logits=True)
        # Same dtype-safe divisor as in the unmasked variant.
        batch_size = tf.cast(tf.shape(y_true)[0], loss.dtype)
        return tf.reduce_sum(loss) / batch_size
class CategoricalAccuracy(Metric):
    """
    Categorical Top-K accuracy. Assumes integer labels, with
    each item to be classified having a single correct class.
    Tie break enables equal distribution of scores among the
    classes with same maximum predicted scores.

    Copied from AllenNLP and added several methods.
    """

    def __init__(self, top_k: int = 1, tie_break: bool = False) -> None:
        """
        # Parameters

        top_k : `int`, optional (default = `1`).
            A prediction counts as correct if the gold label is among the `top_k` highest scores.
        tie_break : `bool`, optional (default = `False`).
            Share the credit equally among all classes tied for the maximum score.
            Only available for `top_k = 1`.

        # Raises

        ValueError
            If `tie_break` is combined with `top_k > 1`, or if `top_k <= 0`.
        """
        if top_k > 1 and tie_break:
            raise ValueError(
                "Tie break in Categorical Accuracy can be done only for maximum (top_k = 1)"
            )
        if top_k <= 0:
            raise ValueError("top_k passed to Categorical Accuracy must be > 0")
        self._top_k = top_k
        self._tie_break = tie_break
        self.correct_count = 0.0
        self.total_count = 0.0

    def __call__(
            self,
            predictions: torch.Tensor,
            gold_labels: torch.Tensor,
            mask: Optional[torch.BoolTensor] = None,
    ):
        """
        # Parameters

        predictions : `torch.Tensor`, required.
            A tensor of predictions of shape (batch_size, ..., num_classes).
        gold_labels : `torch.Tensor`, required.
            A tensor of integer class label of shape (batch_size, ...). It must be the same
            shape as the `predictions` tensor without the `num_classes` dimension.
        mask : `torch.BoolTensor`, optional (default = `None`).
            A masking tensor the same size as `gold_labels`.

        # Raises

        ValueError
            If `gold_labels` does not have exactly one dimension less than `predictions`,
            or if it contains an id `>= num_classes`.
        """
        predictions, gold_labels, mask = self.detach_tensors(predictions, gold_labels, mask)

        # Some sanity checks.
        num_classes = predictions.size(-1)
        if gold_labels.dim() != predictions.dim() - 1:
            # Report the offending tensor (gold_labels) and the quantity that is actually compared (dim).
            raise ValueError(
                "gold_labels must have dimension == predictions.dim() - 1 but "
                "found tensor of shape: {}".format(gold_labels.size())
            )
        if (gold_labels >= num_classes).any():
            raise ValueError(
                "A gold label passed to Categorical Accuracy contains an id >= {}, "
                "the number of classes.".format(num_classes)
            )

        # Flatten to one row per item to be classified.
        predictions = predictions.view((-1, num_classes))
        gold_labels = gold_labels.view(-1).long()
        if not self._tie_break:
            # Top K indexes of the predictions (or fewer, if there aren't K of them).
            # Special case topk == 1, because it's common and .max() is much faster than .topk().
            if self._top_k == 1:
                top_k = predictions.max(-1)[1].unsqueeze(-1)
            else:
                top_k = predictions.topk(min(self._top_k, predictions.shape[-1]), -1)[1]

            # This is of shape (rows, top_k).
            correct = top_k.eq(gold_labels.unsqueeze(-1)).float()
        else:
            # prediction is correct if gold label falls on any of the max scores. distribute score by tie_counts
            max_predictions = predictions.max(-1)[0]
            max_predictions_mask = predictions.eq(max_predictions.unsqueeze(-1))
            # max_predictions_mask is (rows X num_classes) and gold_labels is (rows)
            # ith entry in gold_labels points to index (0-num_classes) for ith row in max_predictions
            # For each row check if index pointed by gold_label was 1 or not (among max scored classes)
            correct = max_predictions_mask[
                torch.arange(gold_labels.numel(), device=gold_labels.device).long(), gold_labels
            ].float()
            tie_counts = max_predictions_mask.sum(-1)
            correct /= tie_counts.float()
            correct.unsqueeze_(-1)

        if mask is not None:
            # Masked items contribute neither to the numerator nor to the denominator.
            correct *= mask.view(-1, 1)
            self.total_count += mask.sum()
        else:
            self.total_count += gold_labels.numel()
        self.correct_count += correct.sum()

    @property
    def score(self):
        """The accumulated accuracy as a float, or 0.0 if nothing has been counted yet."""
        if self.total_count > 1e-12:
            accuracy = float(self.correct_count) / float(self.total_count)
        else:
            accuracy = 0.0
        return accuracy

    def __repr__(self) -> str:
        return f'Accuracy:{self.score:.2%}'

    @staticmethod
    def detach_tensors(*tensors: torch.Tensor) -> Iterable[torch.Tensor]:
        """
        If you actually passed gradient-tracking Tensors to a Metric, there will be
        a huge memory leak, because it will prevent garbage collection for the computation
        graph. This method ensures the tensors are detached.
        """
        # Check if it's actually a tensor in case something else was passed.
        return (x.detach() if isinstance(x, torch.Tensor) else x for x in tensors)

    def reset(self):
        """Clear the accumulated counts."""
        self.correct_count = 0.0
        self.total_count = 0.0
A tensor of the same shape as `predictions`. """ predictions, gold_labels, mask = self.detach_tensors(predictions, gold_labels, mask) # Some sanity checks. if gold_labels.size() != predictions.size(): raise ValueError( f"gold_labels must have shape == predictions.size() but " f"found tensor of shape: {gold_labels.size()}" ) if mask is not None and mask.size() != predictions.size(): raise ValueError( f"mask must have shape == predictions.size() but " f"found tensor of shape: {mask.size()}" ) batch_size = predictions.size(0) if mask is not None: # We can multiply by the mask up front, because we're just checking equality below, and # this way everything that's masked will be equal. predictions = predictions * mask gold_labels = gold_labels * mask # We want to skip predictions that are completely masked; # so we'll keep predictions that aren't. keep = mask.view(batch_size, -1).max(dim=1)[0] else: keep = torch.ones(batch_size, device=predictions.device).bool() predictions = predictions.view(batch_size, -1) gold_labels = gold_labels.view(batch_size, -1) # At this point, predictions is (batch_size, rest_of_dims_combined), # so .eq -> .prod will be 1 if every element of the instance prediction is correct # and 0 if at least one element of the instance prediction is wrong. # Because of how we're handling masking, masked positions are automatically "correct". correct = predictions.eq(gold_labels).prod(dim=1).float() # Since masked positions are correct, we need to explicitly exclude instance predictions # where the entire prediction is masked (because they look "correct"). self._correct_count += (correct * keep).sum() self._total_count += keep.sum() def get_metric(self, reset: bool = False): """ # Returns The accumulated accuracy. 
""" if self._total_count > 0: accuracy = float(self._correct_count) / float(self._total_count) else: accuracy = 0.0 if reset: self.reset() return accuracy def reset(self): self._correct_count = 0.0 self._total_count = 0.0 @staticmethod def detach_tensors(*tensors: torch.Tensor) -> Iterable[torch.Tensor]: """ If you actually passed gradient-tracking Tensors to a Metric, there will be a huge memory leak, because it will prevent garbage collection for the computation graph. This method ensures the tensors are detached. """ # Check if it's actually a tensor in case something else was passed. return (x.detach() if isinstance(x, torch.Tensor) else x for x in tensors) ================================================ FILE: hanlp/metrics/amr/__init__.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2020-08-24 12:47 ================================================ FILE: hanlp/metrics/amr/smatch_eval.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2020-08-24 12:47 import os import warnings from typing import Union from hanlp.metrics.f1 import F1_ from hanlp.metrics.mtl import MetricDict from hanlp.utils.io_util import get_resource, run_cmd, pushd from hanlp.utils.log_util import flash _SMATCH_SCRIPT = 'https://github.com/ChunchuanLv/amr-evaluation-tool-enhanced/archive/master.zip#evaluation.sh' _FAST_SMATCH_SCRIPT = 'https://github.com/jcyk/AMR-gs/archive/master.zip#tools/fast_smatch/compute_smatch.sh' class SmatchScores(MetricDict): @property def score(self): return self['Smatch'].score def smatch_eval(pred, gold, use_fast=False) -> Union[SmatchScores, F1_]: script = get_resource(_FAST_SMATCH_SCRIPT if use_fast else _SMATCH_SCRIPT) home = os.path.dirname(script) pred = os.path.realpath(pred) gold = os.path.realpath(gold) with pushd(home): flash('Running evaluation script [blink][yellow]...[/yellow][/blink]') cmd = f'bash {script} {pred} {gold}' text = run_cmd(cmd) flash('') 
def post_process(pred, amr_version):
    """Run the AMR-gs post-processing module on a prediction file.

    Args:
        pred: Path of the predicted AMR file; resolved with ``os.path.realpath``.
        amr_version: AMR release whose utilities are used, see :func:`get_amr_utils`.

    Returns:
        The resolved prediction path with ``'.post'`` appended.
        NOTE(review): presumably the file written by the post-processor - confirm.

    Raises:
        ValueError: If ``amr_version`` is not supported.
    """
    pred = os.path.realpath(pred)
    util_dir = get_resource(get_amr_utils(amr_version))
    stog_home = get_resource('https://github.com/jcyk/AMR-gs/archive/master.zip')
    # the module is addressed relative to the AMR-gs checkout, hence the pushd
    with pushd(stog_home):
        run_cmd(
            f'python3 -u -m stog.data.dataset_readers.amr_parsing.postprocess.postprocess '
            f'--amr_path {pred} --util_dir {util_dir} --v 2')
    return pred + '.post'


def get_amr_utils(amr_version):
    """Return the download URL of the utility archive for an AMR release.

    Args:
        amr_version: One of ``'1.0'``, ``'2.0'`` or ``'3.0'`` (strings).

    Raises:
        ValueError: For any other version.
    """
    if amr_version == '1.0':
        return 'https://www.cs.jhu.edu/~s.zhang/data/AMR/amr_1.0_utils.tar.gz'
    if amr_version == '2.0':
        return 'https://www.cs.jhu.edu/~s.zhang/data/AMR/amr_2.0_utils.tar.gz'
    if amr_version == '3.0':
        return 'https://file.hankcs.com/research/amr2020/amr_3.0_utils.tgz'
    raise ValueError(f'Unsupported AMR version {amr_version}')


def format_official_scores(text: str):
    """Parse the output of the official smatch evaluation script.

    Expected lines look like::

        Smatch -> P: 0.136, R: 0.107, F: 0.120
        Unlabeled -> P: 0.229, R: 0.180, F: 0.202
        Named Ent. -> P: 0.222, R: 0.092, F: 0.130

    Args:
        text: The raw text printed by the script.

    Returns:
        A ``SmatchScores`` mapping each metric name to an ``F1_`` built from its
        precision, recall and F-score. A metric whose numbers cannot be parsed is
        stored with NaN values; a line without the ``' -> '`` separator is
        reported with a warning and skipped instead of aborting the parse.
    """
    scores = SmatchScores()
    nan = float("nan")
    for line in text.split('\n'):
        line = line.strip()
        if not line:
            continue
        name, arrow, vs = line.partition(' -> ')
        if not arrow:
            # not a score line (e.g. a log message printed by the script)
            warnings.warn(f'Failed to parse results from smatch: {line}')
            continue
        try:
            p, r, f = [float(x.split(': ')[-1]) for x in vs.split(', ')]
        except ValueError:
            warnings.warn(f'Failed to parse results from smatch: {line}')
            p, r, f = nan, nan, nan
        scores[name] = F1_(p, r, f)
    return scores


def format_fast_scores(text: str):
    """Parse the output of the fast smatch script.

    Expected output::

        using fast smatch
        Precision: 0.137
        Recall: 0.108
        Document F-score: 0.121

    Args:
        text: The raw text printed by the script.

    Returns:
        An ``F1_`` built from the three numbers in the order they were printed.

    Raises:
        ValueError: If the text does not contain exactly three scores.
    """
    scores = []
    for line in text.split('\n'):
        line = line.strip()
        if not line or ':' not in line:
            continue
        _, sep, value = line.rpartition(': ')
        if not sep:
            continue
        try:
            scores.append(float(value))
        except ValueError:
            # a colon-bearing line that is not a score, e.g. a warning message
            continue
    # explicit raise instead of assert: asserts vanish under `python -O`
    if len(scores) != 3:
        raise ValueError(f'Expected 3 scores from fast smatch but found {len(scores)}')
    return F1_(*scores)
class BMES_F1_TF(ChunkingF1_TF):
    """Chunk-level F1 for the TensorFlow backend, counted over entity tuples."""

    def __init__(self, tag_vocab: VocabTF, from_logits=True, suffix=False, name='f1', dtype=None, **kwargs):
        """
        Args:
            tag_vocab: Tag vocabulary forwarded to the base metric.
            from_logits: Forwarded to the base metric.
            suffix: Passed to ``get_entities`` when decoding tag sequences.
            name: Metric name forwarded to the base metric.
            dtype: Forwarded to the base metric.
            **kwargs: Forwarded to the base metric.
        """
        super().__init__(tag_vocab, from_logits, name, dtype, **kwargs)
        self.nb_correct = self.nb_pred = self.nb_true = 0
        self.suffix = suffix

    def update_tags(self, true_tags, pred_tags):
        """Accumulate counts for a batch of tag sequences and return the running F1."""
        for gold_seq, pred_seq in zip(true_tags, pred_tags):
            gold_entities = get_entities(gold_seq, self.suffix)
            pred_entities = get_entities(pred_seq, self.suffix)
            self.update_entities(gold_entities, pred_entities)
        return self.result()

    def update_entities(self, true_entities, pred_entities):
        """Add one sentence's entities to the correct / predicted / gold totals."""
        gold_set = set(true_entities)
        pred_set = set(pred_entities)
        self.nb_correct += len(gold_set & pred_set)
        self.nb_pred += len(pred_set)
        self.nb_true += len(gold_set)

    def result(self):
        """Return the F1 over everything accumulated so far (0 when undefined)."""
        precision = self.nb_correct / self.nb_pred if self.nb_pred > 0 else 0
        recall = self.nb_correct / self.nb_true if self.nb_true > 0 else 0
        if precision + recall > 0:
            return 2 * precision * recall / (precision + recall)
        return 0

    def reset_states(self):
        """Zero the accumulated totals."""
        self.nb_correct = self.nb_pred = self.nb_true = 0
class ChunkingF1(F1):
    """Chunk-level F1 computed from batches of tag sequences."""

    def __call__(self, pred_tags: List[List[str]], gold_tags: List[List[str]]):
        """Accumulate predicted / gold / matching chunk counts for a batch.

        Args:
            pred_tags: One predicted tag sequence per sentence.
            gold_tags: One gold tag sequence per sentence, aligned with ``pred_tags``.
        """
        for p, g in zip(pred_tags, gold_tags):
            pred = set(get_entities(p))
            gold = set(get_entities(g))
            self.nb_pred += len(pred)
            self.nb_true += len(gold)
            self.nb_correct += len(pred & gold)


class DetailedSpanF1(Metric):
    """Span F1 with per-label breakdown, unlabeled score and optional confusion matrix."""

    def __init__(self, do_confusion_matrix=False):
        """
        Args:
            do_confusion_matrix: Also record the labels of spans whose boundaries
                match, so that :meth:`confusion_matrix` can be computed.
        """
        self.correct_chunk = 0  # number of correctly identified chunks
        self.correct_unlabeled = 0  # number of chunks with correct boundaries
        self.total_gold = 0  # number of chunks in corpus
        self.total_pred = 0  # number of identified chunks
        self.token_counter = 0  # token counter (ignores sentence breaks)

        # counts by type
        self.t_correct_chunk = defaultdict(int)
        self.t_total_gold = defaultdict(int)
        self.t_total_pred = defaultdict(int)
        self.do_confusion_matrix = do_confusion_matrix
        if do_confusion_matrix:
            self.pred_labels = []
            self.gold_labels = []

    @property
    def states(self):
        """The three per-label count tables (correct, gold, predicted)."""
        return (self.t_correct_chunk, self.t_total_gold, self.t_total_pred)

    def reset_state(self):
        """Zero every accumulated count, including the unlabeled one."""
        self.correct_chunk = 0  # number of correctly identified chunks
        # was missing: without it the unlabeled score kept accumulating across resets
        self.correct_unlabeled = 0
        self.total_gold = 0  # number of chunks in corpus
        self.total_pred = 0  # number of identified chunks
        self.token_counter = 0  # token counter (ignores sentence breaks)

        for state in self.states:
            state.clear()
        if self.do_confusion_matrix:
            self.pred_labels = []
            self.gold_labels = []

    @property
    def score(self):
        """Overall labeled F-score."""
        overall = calculate_metrics(
            self.correct_chunk, self.total_pred, self.total_gold
        )
        return overall.fscore

    def __call__(self, pred: Set[Tuple[int, int, str]], gold: Set[Tuple[int, int, str]], num_tokens=None):
        """Accumulate the counts of one sentence.

        Args:
            pred: Predicted spans as ``(begin, end, label)`` tuples.
            gold: Gold spans as ``(begin, end, label)`` tuples.
            num_tokens: Optional token count added to the token counter.
        """
        pred_chunks_unlabeled = set((b, e) for b, e, l in pred)
        gold_chunks_unlabeled = set((b, e) for b, e, l in gold)
        self.correct_unlabeled += len(pred_chunks_unlabeled & gold_chunks_unlabeled)
        self.correct_chunk += len(pred & gold)
        self.total_gold += len(gold)
        self.total_pred += len(pred)
        if num_tokens:
            self.token_counter += num_tokens

        def group_by_tag(collection: Set[Tuple[int, int, str]]):
            group = defaultdict(set)
            for b, e, l in collection:
                group[l].add((b, e))
            return group

        pred_tags = group_by_tag(pred)
        gold_tags = group_by_tag(gold)
        for l in pred_tags.keys() | gold_tags.keys():
            self.t_correct_chunk[l] += len(pred_tags[l] & gold_tags[l])
            self.t_total_gold[l] += len(gold_tags[l])
            self.t_total_pred[l] += len(pred_tags[l])

        if self.do_confusion_matrix:
            def group_by_span(collection: Set[Tuple[int, int, str]]):
                group = dict()
                for b, e, l in collection:
                    group[(b, e)] = l
                return group

            pred_spans = group_by_span(pred)
            gold_spans = group_by_span(gold)
            # only spans with matching boundaries contribute a (gold, pred) label pair
            for span in pred_spans.keys() & gold_spans.keys():
                self.pred_labels.append(pred_spans[span])
                self.gold_labels.append(gold_spans[span])

    def reset(self):
        self.reset_state()

    def report(self) -> Tuple[DetailedF1, Dict[str, DetailedF1], str]:
        """Build a CoNLL-style textual report.

        Returns:
            ``(overall, by_type, text)``: the overall labeled scores, the scores
            per label, and the formatted report.
        """
        c = self
        with io.StringIO() as out:
            out.write('processed %d tokens with %d phrases; ' % (c.token_counter, c.total_gold))
            out.write('found: %d phrases; correct: %d.\n' % (c.total_pred, c.correct_chunk))
            overall = calculate_metrics(c.correct_unlabeled, c.total_pred, c.total_gold)
            out.write('%17s: ' % 'unlabeled overall')
            out.write('precision: %6.2f%%; ' % (100. * overall.prec))
            out.write('recall: %6.2f%%; ' % (100. * overall.rec))
            out.write('FB1: %6.2f\n' % (100. * overall.fscore))
            overall, by_type = metrics(self)
            out.write('%17s: ' % 'labeled overall')
            out.write('precision: %6.2f%%; ' % (100. * overall.prec))
            out.write('recall: %6.2f%%; ' % (100. * overall.rec))
            out.write('FB1: %6.2f\n' % (100. * overall.fscore))
            for i, m in sorted(by_type.items()):
                out.write('%17s: ' % i)
                out.write('precision: %6.2f%%; ' % (100. * m.prec))
                out.write('recall: %6.2f%%; ' % (100. * m.rec))
                out.write('FB1: %6.2f %d\n' % (100. * m.fscore, c.t_total_pred[i]))
            text = out.getvalue()
        return overall, by_type, text

    def __str__(self) -> str:
        return self.report()[-1]

    def confusion_matrix(self):
        """Confusion matrix over the labels of boundary-matching spans.

        Only available when the metric was created with ``do_confusion_matrix=True``.

        Returns:
            ``(matrix, labels)`` where ``labels`` are the sorted distinct labels
            indexing both axes of ``matrix``.
        """
        from sklearn.metrics import confusion_matrix
        # deduplicate: every label must appear once, otherwise the matrix gets one
        # row/column per occurrence
        labels = sorted(set(self.gold_labels) | set(self.pred_labels))
        return confusion_matrix(self.gold_labels, self.pred_labels, labels=labels), labels
class EvalCounts(object):
    """Running tallies used by the CoNLL-style chunk evaluation."""

    def __init__(self):
        # correct_chunk: number of correctly identified chunks
        # correct_tags:  number of correct chunk tags
        # total_gold:    number of chunks in corpus
        # total_pred:    number of identified chunks
        # token_counter: token counter (ignores sentence breaks)
        self.correct_chunk = self.correct_tags = 0
        self.total_gold = self.total_pred = 0
        self.token_counter = 0

        # counts by type
        self.t_correct_chunk = defaultdict(int)
        self.t_total_gold = defaultdict(int)
        self.t_total_pred = defaultdict(int)

    @property
    def states(self):
        """The per-type tables as a tuple: (correct, gold, predicted)."""
        return self.t_correct_chunk, self.t_total_gold, self.t_total_pred

    def reset_state(self):
        """Zero the scalar tallies and empty the per-type tables in place."""
        self.correct_chunk = self.correct_tags = 0
        self.total_gold = self.total_pred = 0
        self.token_counter = 0

        for table in self.states:
            table.clear()
def parse_args(argv):
    """Parse the command line options of the evaluation script.

    Args:
        argv: Argument strings without the program name.

    Returns:
        The ``argparse`` namespace with ``boundary``, ``delimiter``, ``otag`` and ``file``.
    """
    import argparse
    parser = argparse.ArgumentParser(
        description='evaluate tagging results using CoNLL criteria',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    arg = parser.add_argument
    arg('-b', '--boundary', metavar='STR', default='-X-',
        help='sentence boundary')
    arg('-d', '--delimiter', metavar='CHAR', default=ANY_SPACE,
        help='character delimiting items in input')
    arg('-o', '--otag', metavar='CHAR', default='O',
        help='alternative outside tag')
    arg('file', nargs='?', default=None)
    return parser.parse_args(argv)


def split_tag(chunk_tag):
    """Split a chunk tag into its IOBES prefix and chunk type.

    e.g. ``B-PER -> (B, PER)`` and ``O -> (O, None)``. A tag without a hyphen
    is returned as ``(tag, None)`` so that callers can always unpack two values.

    Args:
        chunk_tag: The tag string.

    Returns:
        A ``(prefix, chunk_type)`` tuple.
    """
    if chunk_tag == 'O':
        return ('O', None)
    prefix, hyphen, chunk_type = chunk_tag.partition('-')
    return (prefix, chunk_type if hyphen else None)


def evaluate_iobes(true_seqs, pred_seqs):
    """Count chunks and tags for aligned gold / predicted tag sequences.

    Args:
        true_seqs: Iterable of gold tags.
        pred_seqs: Iterable of predicted tags, aligned with ``true_seqs``.

    Returns:
        An ``EvalCounts`` holding the tallies of this pair of sequences.
    """
    counts = EvalCounts()

    in_correct = False  # currently processed chunks is correct until now
    last_correct = 'O'  # previous chunk tag in corpus
    last_correct_type = ''  # type of previous chunk tag in corpus
    last_guessed = 'O'  # previously identified chunk tag
    last_guessed_type = ''  # type of previously identified chunk tag

    for true_tag, pred_tag in zip(true_seqs, pred_seqs):
        guessed, guessed_type = split_tag(pred_tag)
        correct, correct_type = split_tag(true_tag)

        end_correct = end_of_chunk(last_correct, correct,
                                   last_correct_type, correct_type)
        end_guessed = end_of_chunk(last_guessed, guessed,
                                   last_guessed_type, guessed_type)
        start_correct = start_of_chunk(last_correct, correct,
                                       last_correct_type, correct_type)
        start_guessed = start_of_chunk(last_guessed, guessed,
                                       last_guessed_type, guessed_type)

        if in_correct:
            if (end_correct and end_guessed and
                    last_guessed_type == last_correct_type):
                in_correct = False
                counts.correct_chunk += 1
                counts.t_correct_chunk[last_correct_type] += 1
            elif (end_correct != end_guessed or guessed_type != correct_type):
                in_correct = False

        if start_correct and start_guessed and guessed_type == correct_type:
            in_correct = True

        if start_correct:
            counts.total_gold += 1
            counts.t_total_gold[correct_type] += 1
        if start_guessed:
            counts.total_pred += 1
            counts.t_total_pred[guessed_type] += 1
        if correct == guessed and guessed_type == correct_type:
            counts.correct_tags += 1
        counts.token_counter += 1

        last_guessed = guessed
        last_correct = correct
        last_guessed_type = guessed_type
        last_correct_type = correct_type

    # a chunk still open at the end of the sequence is closed here
    if in_correct:
        counts.correct_chunk += 1
        counts.t_correct_chunk[last_correct_type] += 1

    return counts


def evaluate_iob2(true_seqs, pred_seqs):
    """Count chunks for IOB2 / BIO tag sequences via span extraction.

    Only the chunk totals are filled in; token level tallies stay at zero.
    """
    counts = EvalCounts()
    gold = set(bio_tags_to_spans(true_seqs))
    pred = set(bio_tags_to_spans(pred_seqs))
    counts.correct_chunk = len(gold & pred)
    counts.total_pred = len(pred)
    counts.total_gold = len(gold)
    return counts


def uniq(iterable):
    """Return the distinct items of ``iterable`` in first-seen order."""
    seen = set()
    return [i for i in iterable if not (i in seen or seen.add(i))]


def calculate_metrics(correct, guessed, total):
    """Build a ``DetailedF1`` from correct / predicted / gold counts.

    Precision, recall and F-score default to ``0.`` when undefined.
    """
    tp, fp, fn = correct, guessed - correct, total - correct
    p = 0. if tp + fp == 0 else 1. * tp / (tp + fp)
    r = 0. if tp + fn == 0 else 1. * tp / (tp + fn)
    f = 0. if p + r == 0 else 2 * p * r / (p + r)
    return DetailedF1(tp, fp, fn, p, r, f)


def calc_metrics(tp, p, t, percent=True):
    """Compute overall precision, recall and FB1 (default values are 0).

    Args:
        tp: Number of correct chunks.
        p: Number of predicted chunks.
        t: Number of gold chunks.
        percent: If True, return ``100 *`` the decimal values. (Default value = True)

    Returns:
        ``(precision, recall, fb1)``.
    """
    precision = tp / p if p else 0
    recall = tp / t if t else 0
    fb1 = 2 * precision * recall / (precision + recall) if precision + recall else 0
    if percent:
        return 100 * precision, 100 * recall, 100 * fb1
    else:
        return precision, recall, fb1


def metrics(counts):
    """Return ``(overall, by_type)`` scores computed from a counts object."""
    c = counts
    overall = calculate_metrics(
        c.correct_chunk, c.total_pred, c.total_gold
    )
    by_type = {}
    for t in uniq(list(c.t_total_gold.keys()) + list(c.t_total_pred.keys())):
        by_type[t] = calculate_metrics(
            c.t_correct_chunk[t], c.t_total_pred[t], c.t_total_gold[t]
        )
    return overall, by_type


def report(counts, out=None):
    """Write a CoNLL-style report for ``counts`` and return ``(overall, by_type)``.

    Args:
        counts: Object carrying the tallies (e.g. ``EvalCounts``).
        out: Writable text stream; defaults to ``sys.stdout``.
    """
    if out is None:
        out = sys.stdout

    overall, by_type = metrics(counts)

    c = counts
    out.write('processed %d tokens with %d phrases; ' %
              (c.token_counter, c.total_gold))
    out.write('found: %d phrases; correct: %d.\n' %
              (c.total_pred, c.correct_chunk))

    if c.token_counter > 0:
        out.write('accuracy: %6.2f%%; ' %
                  (100. * c.correct_tags / c.token_counter))
    out.write('precision: %6.2f%%; ' % (100. * overall.prec))
    out.write('recall: %6.2f%%; ' % (100. * overall.rec))
    out.write('FB1: %6.2f\n' % (100. * overall.fscore))

    for i, m in sorted(by_type.items()):
        out.write('%17s: ' % i)
        out.write('precision: %6.2f%%; ' % (100. * m.prec))
        out.write('recall: %6.2f%%; ' % (100. * m.rec))
        out.write('FB1: %6.2f %d\n' % (100. * m.fscore, c.t_total_pred[i]))
    return overall, by_type


# (previous tag, current tag) pairs after which a chunk has ended
_CHUNK_END_PAIRS = frozenset([
    ("B", "B"), ("B", "O"), ("I", "B"), ("I", "O"),
    ("E", "E"), ("E", "I"), ("E", "O"),
])

# (previous tag, current tag) pairs at which a chunk starts
_CHUNK_START_PAIRS = frozenset([
    ("B", "B"), ("I", "B"), ("O", "B"), ("O", "I"),
    ("E", "E"), ("E", "I"), ("O", "E"),
])


def end_of_chunk(prev_tag, tag, prev_type, type_):
    """Check if a chunk ended between the previous and current word.

    Args:
        prev_tag: Previous chunk tag.
        tag: Current chunk tag.
        prev_type: Previous chunk type.
        type_: Current chunk type.
    """
    return ((prev_tag, tag) in _CHUNK_END_PAIRS or
            (prev_tag != "O" and prev_tag != "." and prev_type != type_) or
            # bracket tags: these chunks are assumed to have length 1
            prev_tag in ("]", "["))


def start_of_chunk(prev_tag, tag, prev_type, type_):
    """Check if a chunk started between the previous and current word.

    Args:
        prev_tag: Previous chunk tag.
        tag: Current chunk tag.
        prev_type: Previous chunk type.
        type_: Current chunk type.
    """
    return ((prev_tag, tag) in _CHUNK_START_PAIRS or
            (tag != "O" and tag != "." and prev_type != type_) or
            # bracket tags: these chunks are assumed to have length 1
            tag in ("]", "["))


def _read_tagged_sentences(lines, options):
    """Read gold and predicted tags from CoNLL-formatted lines, one list per sentence.

    The last two columns of every line are taken as the gold and the predicted
    tag. Blank lines and lines whose first column equals ``options.boundary``
    separate sentences; ``options.otag`` is mapped to ``'O'``.
    """
    true_seqs, pred_seqs = [], []
    gold_sent, pred_sent = [], []
    for line in lines:
        line = line.rstrip('\r\n')
        if options.delimiter == ANY_SPACE:
            features = line.split()
        else:
            features = line.split(options.delimiter)
        if not line.strip() or features[0] == options.boundary:
            if gold_sent:
                true_seqs.append(gold_sent)
                pred_seqs.append(pred_sent)
                gold_sent, pred_sent = [], []
            continue
        if len(features) < 3:
            raise FormatError('unexpected number of features in line %s' % line)
        gold_tag, pred_tag = features[-2], features[-1]
        gold_sent.append('O' if gold_tag == options.otag else gold_tag)
        pred_sent.append('O' if pred_tag == options.otag else pred_tag)
    if gold_sent:
        true_seqs.append(gold_sent)
        pred_seqs.append(pred_sent)
    return true_seqs, pred_seqs


def main(argv):
    """Command line entry point: evaluate a CoNLL-formatted file (default STDIN).

    Args:
        argv: Full argument vector including the program name.
    """
    args = parse_args(argv[1:])

    # evaluate_iobes takes tag sequences, so the raw lines have to be parsed first
    if args.file is None:
        true_seqs, pred_seqs = _read_tagged_sentences(sys.stdin, args)
    else:
        with open(args.file, encoding='utf-8') as f:
            true_seqs, pred_seqs = _read_tagged_sentences(f, args)
    scorer = SpanF1()
    scorer.batch_update_state(true_seqs, pred_seqs)
    report(scorer.count)


if __name__ == '__main__':
    sys.exit(main(sys.argv))
copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. """Metrics to assess performance on sequence labeling task given prediction Functions named as ``*_score`` return a scalar value to maximize: the higher the better """ from collections import defaultdict import numpy as np def iobes_to_span(words, tags): delimiter = ' ' if all([len(w) == 1 for w in words]): delimiter = '' # might be Chinese entities = [] for tag, start, end in get_entities(tags): entities.append((delimiter.join(words[start:end]), tag, start, end)) yield entities def get_entities(seq, suffix=False): """Gets entities from sequence. Args: seq(list): sequence of labels. suffix: (Default value = False) Returns: list: list of (chunk_type, chunk_start, chunk_end). 
Example: >>> from seqeval.metrics.sequence_labeling import get_entities >>> seq = ['B-PER', 'I-PER', 'O', 'B-LOC'] >>> get_entities(seq) [('PER', 0, 2), ('LOC', 3, 4)] """ # for nested list if any(isinstance(s, list) for s in seq): seq = [item for sublist in seq for item in sublist + ['O']] prev_tag = 'O' prev_type = '' begin_offset = 0 chunks = [] for i, chunk in enumerate(seq + ['O']): if suffix: tag = chunk[-1] type_ = chunk[:-2] else: tag = chunk[0] type_ = chunk[2:] if end_of_chunk(prev_tag, tag, prev_type, type_): chunks.append((prev_type, begin_offset, i)) if start_of_chunk(prev_tag, tag, prev_type, type_): begin_offset = i prev_tag = tag prev_type = type_ return chunks def end_of_chunk(prev_tag, tag, prev_type, type_): """Checks if a chunk ended between the previous and current word. Args: prev_tag: previous chunk tag. tag: current chunk tag. prev_type: previous type. type_: current type. Returns: chunk_end: boolean. """ chunk_end = False if prev_tag == 'E': chunk_end = True if prev_tag == 'S': chunk_end = True if prev_tag == 'B' and tag == 'B': chunk_end = True if prev_tag == 'B' and tag == 'S': chunk_end = True if prev_tag == 'B' and tag == 'O': chunk_end = True if prev_tag == 'I' and tag == 'B': chunk_end = True if prev_tag == 'I' and tag == 'S': chunk_end = True if prev_tag == 'I' and tag == 'O': chunk_end = True if prev_tag != 'O' and prev_tag != '.' and prev_type != type_: chunk_end = True return chunk_end def start_of_chunk(prev_tag, tag, prev_type, type_): """Checks if a chunk started between the previous and current word. Args: prev_tag: previous chunk tag. tag: current chunk tag. prev_type: previous type. type_: current type. Returns: chunk_start: boolean. 
""" chunk_start = False if tag == 'B': chunk_start = True if tag == 'S': chunk_start = True if prev_tag == 'E' and tag == 'E': chunk_start = True if prev_tag == 'E' and tag == 'I': chunk_start = True if prev_tag == 'S' and tag == 'E': chunk_start = True if prev_tag == 'S' and tag == 'I': chunk_start = True if prev_tag == 'O' and tag == 'E': chunk_start = True if prev_tag == 'O' and tag == 'I': chunk_start = True if tag != 'O' and tag != '.' and prev_type != type_: chunk_start = True return chunk_start def f1_score(y_true, y_pred, average='micro', suffix=False): """Compute the F1 score. The F1 score can be interpreted as a weighted average of the precision and recall, where an F1 score reaches its best value at 1 and worst score at 0. The relative contribution of precision and recall to the F1 score are equal. The formula for the F1 score is:: F1 = 2 * (precision * recall) / (precision + recall) Args: y_true: 2d array. Ground truth (correct) target values. y_pred: 2d array. Estimated targets as returned by a tagger. average: (Default value = 'micro') suffix: (Default value = False) Returns: score: float. Example: >>> from seqeval.metrics import f1_score >>> y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']] >>> y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']] >>> f1_score(y_true, y_pred) 0.50 """ true_entities = set(get_entities(y_true, suffix)) pred_entities = set(get_entities(y_pred, suffix)) nb_correct = len(true_entities & pred_entities) nb_pred = len(pred_entities) nb_true = len(true_entities) p = nb_correct / nb_pred if nb_pred > 0 else 0 r = nb_correct / nb_true if nb_true > 0 else 0 score = 2 * p * r / (p + r) if p + r > 0 else 0 return score def accuracy_score(y_true, y_pred): """Accuracy classification score. 
def precision_score(y_true, y_pred, average='micro', suffix=False):
    """Compute the precision.

    The precision is the ratio ``tp / (tp + fp)``: the share of predicted
    entities that also appear, with identical type and offsets, among the gold
    entities. The best value is 1 and the worst value is 0.

    Args:
      y_true: 2d array. Ground truth (correct) target values.
      y_pred: 2d array. Estimated targets as returned by a tagger.
      average: Not used by this implementation. (Default value = 'micro')
      suffix: Forwarded to ``get_entities``. (Default value = False)

    Returns:
      score: float (``0`` when nothing was predicted).

    Example:
        >>> from seqeval.metrics import precision_score
        >>> y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
        >>> y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
        >>> precision_score(y_true, y_pred)
        0.50
    """
    gold = set(get_entities(y_true, suffix))
    predicted = set(get_entities(y_pred, suffix))

    # Guard first: with no predictions the ratio is defined as 0.
    if not predicted:
        return 0
    return len(gold & predicted) / len(predicted)


def recall_score(y_true, y_pred, average='micro', suffix=False):
    """Compute the recall.

    The recall is the ratio ``tp / (tp + fn)``: the share of gold entities
    that were predicted with identical type and offsets. The best value is 1
    and the worst value is 0.

    Args:
      y_true: 2d array. Ground truth (correct) target values.
      y_pred: 2d array. Estimated targets as returned by a tagger.
      average: Not used by this implementation. (Default value = 'micro')
      suffix: Forwarded to ``get_entities``. (Default value = False)

    Returns:
      score: float (``0`` when there are no gold entities).

    Example:
        >>> from seqeval.metrics import recall_score
        >>> y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
        >>> y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
        >>> recall_score(y_true, y_pred)
        0.50
    """
    gold = set(get_entities(y_true, suffix))
    predicted = set(get_entities(y_pred, suffix))

    # Guard first: with no gold entities the ratio is defined as 0.
    if not gold:
        return 0
    return len(gold & predicted) / len(gold)
def classification_report(y_true, y_pred, digits=2, suffix=False):
    """Build a text report showing the main classification metrics.

    Entities are extracted from both inputs with ``get_entities`` and grouped
    by entity type. One row is emitted for every type that occurs in
    ``y_true`` (sorted by type name so the output is reproducible), followed
    by a ``micro avg`` row and a final row labelled ``macro avg``.

    Note that the final row is the *support-weighted* average of the per-type
    scores, despite its ``macro avg`` label.

    Args:
      y_true: 2d array. Ground truth (correct) target values.
      y_pred: 2d array. Estimated targets as returned by a classifier.
      digits: int. Number of digits for formatting output floating point values. (Default value = 2)
      suffix: Forwarded to ``get_entities``. (Default value = False)

    Returns:
      report: string. Text summary of the precision, recall, F1 score for each class.

    Examples:
        >>> from seqeval.metrics import classification_report
        >>> y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
        >>> y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
        >>> print(classification_report(y_true, y_pred))
                     precision    recall  f1-score   support
        <BLANKLINE>
               MISC       0.00      0.00      0.00         1
                PER       1.00      1.00      1.00         1
        <BLANKLINE>
          micro avg       0.50      0.50      0.50         2
          macro avg       0.50      0.50      0.50         2
        <BLANKLINE>
    """
    true_entities = set(get_entities(y_true, suffix))
    pred_entities = set(get_entities(y_pred, suffix))

    # Group the (start, end) spans by entity type.
    gold_spans = defaultdict(set)
    pred_spans = defaultdict(set)
    name_width = 0
    for e in true_entities:
        gold_spans[e[0]].add((e[1], e[2]))
        name_width = max(name_width, len(e[0]))
    for e in pred_entities:
        pred_spans[e[0]].add((e[1], e[2]))

    last_line_heading = 'macro avg'
    width = max(name_width, len(last_line_heading), digits)

    headers = ["precision", "recall", "f1-score", "support"]
    head_fmt = u'{:>{width}s} ' + u' {:>9}' * len(headers)
    report = head_fmt.format(u'', *headers, width=width)
    report += u'\n\n'

    row_fmt = u'{:>{width}s} ' + u' {:>9.{digits}f}' * 3 + u' {:>9}\n'

    ps, rs, f1s, supports = [], [], [], []
    # Sorted iteration: the dict is filled from a set, whose order is not
    # stable between runs, so unsorted rows would shuffle from run to run.
    for type_name in sorted(gold_spans):
        gold = gold_spans[type_name]
        # .get avoids inserting empty entries for types that were never predicted.
        pred = pred_spans.get(type_name, set())
        nb_correct = len(gold & pred)
        nb_pred = len(pred)
        nb_true = len(gold)

        p = nb_correct / nb_pred if nb_pred > 0 else 0
        r = nb_correct / nb_true if nb_true > 0 else 0
        f1 = 2 * p * r / (p + r) if p + r > 0 else 0

        report += row_fmt.format(type_name, p, r, f1, nb_true, width=width, digits=digits)

        ps.append(p)
        rs.append(r)
        f1s.append(f1)
        supports.append(nb_true)

    report += u'\n'

    # compute averages
    total_support = sum(supports)
    report += row_fmt.format('micro avg',
                             precision_score(y_true, y_pred, suffix=suffix),
                             recall_score(y_true, y_pred, suffix=suffix),
                             f1_score(y_true, y_pred, suffix=suffix),
                             total_support,
                             width=width, digits=digits)

    if total_support > 0:
        avg_p = np.average(ps, weights=supports)
        avg_r = np.average(rs, weights=supports)
        avg_f1 = np.average(f1s, weights=supports)
    else:
        # np.average raises ZeroDivisionError when the weights sum to zero,
        # which happens when y_true contains no entity at all.
        avg_p = avg_r = avg_f1 = 0.0
    report += row_fmt.format(last_line_heading, avg_p, avg_r, avg_f1, total_support,
                             width=width, digits=digits)

    return report
class Metric(ABC):
    """Abstract base class of evaluation metrics.

    A metric is compared through its ``score``: every rich comparison
    operator compares ``self.score`` with ``other``.
    Subclasses implement ``score``, ``__call__`` (to accumulate a batch of
    predictions) and ``reset`` (to clear the accumulated state).

    Note: because ``__eq__`` is defined without ``__hash__``, instances are
    not hashable.
    """

    def __lt__(self, other):
        return self.score < other

    def __le__(self, other):
        return self.score <= other

    def __eq__(self, other):
        return self.score == other

    def __ge__(self, other):
        return self.score >= other

    def __gt__(self, other):
        return self.score > other

    def __ne__(self, other):
        return self.score != other

    @property
    @abstractmethod
    def score(self):
        """The single number summarizing this metric."""
        pass

    @abstractmethod
    def __call__(self, pred, gold, mask=None):
        """Accumulate statistics from predictions ``pred`` against ``gold``."""
        pass

    def __repr__(self) -> str:
        # The format spec must sit inside the braces; the previous
        # f'{self.score}:.4f' printed the raw score followed by a literal ':.4f'.
        return f'{self.score:.4f}'

    def __float__(self):
        # float() requires an actual float to be returned, so coerce scores
        # that happen to be ints or other number-like objects.
        return float(self.score)

    @abstractmethod
    def reset(self):
        """Clear all accumulated statistics."""
        pass
class AttachmentScore(Metric):
    """Unlabeled / labeled attachment score (UAS / LAS) accumulator.

    ``total`` counts the evaluated positions, ``correct_arcs`` those whose arc
    is correct and ``correct_rels`` those whose arc and relation are both
    correct. ``score`` is the LAS.
    """

    def __init__(self, eps=1e-12):
        super().__init__()
        # eps is added to the denominator so empty statistics yield 0 instead
        # of a division by zero.
        self.eps = eps
        self.total = self.correct_arcs = self.correct_rels = 0.0

    def __repr__(self):
        return f"UAS: {self.uas:.2%} LAS: {self.las:.2%}"

    # noinspection PyMethodOverriding
    def __call__(self, arc_preds, rel_preds, arc_golds, rel_golds, mask):
        """Accumulate one batch.

        NOTE(review): the ``.eq`` / boolean indexing / ``.sum().item()`` calls
        suggest the arguments are torch tensors - confirm against the callers.
        """
        arc_hits = arc_preds.eq(arc_golds)[mask]
        # A relation only counts when its arc is correct as well.
        rel_hits = rel_preds.eq(rel_golds)[mask] & arc_hits
        self.total += len(arc_hits)
        self.correct_arcs += arc_hits.sum().item()
        self.correct_rels += rel_hits.sum().item()

    def __lt__(self, other):
        return self.score < other

    def __le__(self, other):
        return self.score <= other

    def __ge__(self, other):
        return self.score >= other

    def __gt__(self, other):
        return self.score > other

    @property
    def score(self):
        return self.las

    @property
    def uas(self):
        denominator = self.total + self.eps
        return self.correct_arcs / denominator

    @property
    def las(self):
        denominator = self.total + self.eps
        return self.correct_rels / denominator

    def reset(self):
        """Zero all counters (``eps`` is kept)."""
        self.total = 0.0
        self.correct_arcs = 0.0
        self.correct_rels = 0.0
def copy_cols(gold_file, pred_file, copied_pred_file, keep_comments=True):
    """Copy the first 6 columns from gold file to pred file

    Both files are read in lock-step. For every token line the first six
    tab-separated columns are taken from the gold line and the remaining
    columns from the prediction line; the merged line is written to
    ``copied_pred_file``. All three files are opened as UTF-8.

    Args:
      gold_file: Path of the gold file.
      pred_file: Path of the prediction file.
      copied_pred_file: Path of the merged output file (overwritten).
      keep_comments: Whether gold lines starting with ``#`` and gold lines
        whose first column contains ``-`` are copied to the output. When
        ``False`` such lines are dropped unless the line itself starts with
        ``-``. (Default value = True)

    Raises:
      ValueError: If the gold file ends a sentence (blank line) while the
        prediction file has a non-blank line at the same position.
    """
    # Explicit UTF-8 keeps non-ASCII tokens intact regardless of the platform
    # default encoding; distinct handle names keep the path arguments usable.
    with open(copied_pred_file, 'w', encoding='utf-8') as to_out, \
            open(pred_file, encoding='utf-8') as pred_lines, \
            open(gold_file, encoding='utf-8') as gold_lines:
        for idx, (p, g) in enumerate(zip(pred_lines, gold_lines)):
            # Comment lines in the prediction have no gold counterpart: skip them.
            while p.startswith('#'):
                p = next(pred_lines)
            if not g.strip():
                # Sentence boundary in gold: the prediction must be at one too.
                if p.strip():
                    raise ValueError(
                        f'Prediction file {pred_file} does not end a sentence at line {idx + 1}\n{p.strip()}')
                to_out.write('\n')
                continue
            # Gold comments and lines whose ID column contains '-' have no
            # prediction counterpart; optionally copy them and move on.
            # NOTE(review): an ID such as "1-2" does not start with '-', so the
            # second half of the condition below never keeps it - confirm intent.
            while g.startswith('#') or '-' in g.split('\t')[0]:
                if keep_comments or g.startswith('-'):
                    to_out.write(g)
                g = next(gold_lines)
            to_out.write('\t'.join(g.split('\t')[:6] + p.split('\t')[6:]))
class LabeledF1TF(object):
    """Running unlabeled/labeled precision, recall and F1 over predicted arcs.

    Counts are accumulated by every call and kept until :meth:`reset_states`
    is invoked. Comparison operators compare :attr:`score` (the labeled F1)
    against the other operand.
    """

    def __init__(self):
        super(LabeledF1TF, self).__init__()
        self.reset_states()

    def __repr__(self):
        return f"UF: {self.uf:6.2%} LF: {self.lf:6.2%}"

    def __call__(self, arc_preds, rel_preds, arc_golds, rel_golds, mask):
        """Accumulate counts for one batch.

        Args:
            arc_preds: Predicted arcs.
            rel_preds: Predicted relations.
            arc_golds: Gold arcs.
            rel_golds: Gold relations.
            mask: Token mask, expanded below into a pairwise mask.
        """
        # NOTE(review): ``unsqueeze``/``expand_as``/``transpose(1, 2)`` are used as tensor
        # methods while the counting goes through ``tf.math`` - confirm the inputs passed
        # by callers actually provide these methods.
        pair_mask = mask.unsqueeze(-1).expand_as(arc_preds)
        pair_mask = pair_mask & pair_mask.transpose(1, 2)

        mask_gold = pair_mask & arc_golds
        mask_pred = pair_mask & arc_preds
        # Positions where both a gold and a predicted arc are present.
        both = mask_gold & mask_pred
        arc_hits = (arc_preds == arc_golds)[both]
        rel_hits = (rel_preds == rel_golds)[both] & arc_hits

        self.sum_gold_arcs_wo_punc += float(tf.math.count_nonzero(mask_gold))
        self.sum_pred_arcs_wo_punc += float(tf.math.count_nonzero(mask_pred))
        self.correct_arcs_wo_punc += float(tf.math.count_nonzero(arc_hits))
        self.correct_rels_wo_punc += float(tf.math.count_nonzero(rel_hits))

    def __lt__(self, other):
        return self.score < other

    def __le__(self, other):
        return self.score <= other

    def __ge__(self, other):
        return self.score >= other

    def __gt__(self, other):
        return self.score > other

    @staticmethod
    def _ratio(numerator, denominator):
        # An empty denominator yields 0 instead of dividing by zero.
        return numerator / denominator if denominator else 0

    @staticmethod
    def _f1(recall, precision):
        # Harmonic mean of recall and precision; 0 when both are 0.
        total = recall + precision
        return 2 * recall * precision / total if total else 0

    @property
    def score(self):
        return self.las

    @property
    def uas(self):
        return self.uf

    @property
    def las(self):
        return self.lf

    @property
    def ur(self):
        return self._ratio(self.correct_arcs_wo_punc, self.sum_gold_arcs_wo_punc)

    @property
    def up(self):
        return self._ratio(self.correct_arcs_wo_punc, self.sum_pred_arcs_wo_punc)

    @property
    def lr(self):
        return self._ratio(self.correct_rels_wo_punc, self.sum_gold_arcs_wo_punc)

    @property
    def lp(self):
        return self._ratio(self.correct_rels_wo_punc, self.sum_pred_arcs_wo_punc)

    @property
    def uf(self):
        return self._f1(self.ur, self.up)

    @property
    def lf(self):
        return self._f1(self.lr, self.lp)

    def reset_states(self):
        """Zero all accumulated counts."""
        self.sum_gold_arcs_wo_punc = 0.0
        self.sum_pred_arcs_wo_punc = 0.0
        self.correct_arcs_wo_punc = 0.0
        self.correct_rels_wo_punc = 0.0

    def to_dict(self) -> dict:
        return {'UF': self.uf, 'LF': self.lf}
def _sdp_edges(node, labeled):
    """Parse the edge column of one token into a set of edges.

    Args:
        node: Raw column value: ``_`` for no edges, otherwise ``head:label`` items joined by ``|``.
        labeled: If True keep ``(head, label)`` tuples, otherwise keep only the head.

    Returns:
        The set of edges attached to the token.
    """
    if node == '_':
        return set()
    edges = node.split('|')
    if labeled:
        return set(tuple(edge.split(':', 1)) for edge in edges)
    return set(edge.split(':', 1)[0] for edge in edges)


def sdp_eval(gold_files, sys_files, labeled=False):
    """Modified from https://github.com/tdozat/Parser-v3/blob/2ff4061373e8aac8c962537a6220e1d5b196abf6/scripts/semdep_eval.py

    Dozat claimed "I tested it against the official eval script and it reported identical LF1".

    Gold lines starting with ``#`` open a new sequence. Every other non-blank gold line is a
    token and is paired with the next system line that is not a comment, not blank and whose
    first column is not ``0``. Edges are read from the 9th tab-separated column.

    Args:
        gold_files: Path or list of paths to gold files.
        sys_files: Path or list of paths to system files, aligned with ``gold_files``.
        labeled: Whether edge labels must match too. (Default value = False)

    Returns:
        An ``Accuracy`` namedtuple ``(precision, recall, F1, seq_acc)``. A sequence counts as
        correct when all of its gold edges were predicted. ``seq_acc`` is ``0.0`` when the
        gold files contain no ``#`` lines.

    Raises:
        ValueError: If a system file ends before its gold file.
    """
    correct = 0
    predicted = 0
    actual = 0
    n_sequences = 0
    n_correct_sequences = 0
    current_seq_correct = False
    if isinstance(gold_files, str):
        gold_files = [gold_files]
    if isinstance(sys_files, str):
        sys_files = [sys_files]

    for gold_file, sys_file in zip(gold_files, sys_files):
        with codecs.open(gold_file, encoding='utf-8') as gf, \
                codecs.open(sys_file, encoding='utf-8') as sf:
            gold_line = gf.readline()
            gold_i = 1
            sys_i = 0
            while gold_line:
                while gold_line.startswith('#'):
                    # A comment line closes the previous sequence and opens a new one.
                    n_sequences += 1
                    n_correct_sequences += current_seq_correct
                    current_seq_correct = True
                    gold_line = gf.readline()
                    gold_i += 1
                if gold_line.rstrip() != '':
                    sys_line = sf.readline()
                    sys_i += 1
                    # ``readline`` returns '' at EOF; without the truthiness check the skip
                    # loop would spin forever because ''.rstrip() == ''.
                    while sys_line and (sys_line.startswith('#') or sys_line.rstrip() == ''
                                        or sys_line.split('\t')[0] == '0'):
                        sys_line = sf.readline()
                        sys_i += 1
                    if not sys_line:
                        raise ValueError(
                            'System file {} ended before gold file {} (gold line {}, system line {})'.format(
                                sys_file, gold_file, gold_i, sys_i))

                    gold_edges = _sdp_edges(gold_line.rstrip().split('\t')[8], labeled)
                    sys_edges = _sdp_edges(sys_line.rstrip().split('\t')[8], labeled)

                    correct_edges = gold_edges & sys_edges
                    if len(correct_edges) != len(gold_edges):
                        current_seq_correct = False
                    correct += len(correct_edges)
                    predicted += len(sys_edges)
                    actual += len(gold_edges)
                gold_line = gf.readline()
                gold_i += 1

    # The last sequence is never followed by another '#' line, so close it here.
    # ``current_seq_correct`` is still False when no sequence was opened at all.
    n_correct_sequences += current_seq_correct

    Accuracy = namedtuple('Accuracy', ['precision', 'recall', 'F1', 'seq_acc'])
    precision = correct / (predicted + 1e-12)
    recall = correct / (actual + 1e-12)
    F1 = 2 * precision * recall / (precision + recall + 1e-12)
    seq_acc = n_correct_sequences / n_sequences if n_sequences else 0.0
    return Accuracy(precision, recall, F1, seq_acc)
class SpanMetric(Metric):
    """Unlabeled/labeled span precision, recall, F1 and complete-match rates.

    Each prediction and each gold item is a collection of ``(i, j, label)`` triples.
    All ratios add ``eps`` to the denominator to avoid division by zero.
    """

    def __init__(self, eps=1e-12):
        super().__init__()
        self.reset(eps)

    # noinspection PyAttributeOutsideInit
    def reset(self, eps=1e-12):
        """Zero every counter and set ``eps``.

        Note that calling ``reset()`` without an argument sets ``eps`` back to ``1e-12``.
        """
        self.n = self.n_ucm = self.n_lcm = 0.0
        self.utp = self.ltp = 0.0
        self.pred = self.gold = 0.0
        self.eps = eps

    def __call__(self, preds, golds):
        """Accumulate counts for a batch of predicted and gold span collections.

        Returns:
            ``self``, so that calls can be chained.
        """
        for pred, gold in zip(preds, golds):
            # Multiset intersections: the label is dropped for the unlabeled match.
            unlabeled_pred = Counter((i, j) for i, j, _ in pred)
            unlabeled_gold = Counter((i, j) for i, j, _ in gold)
            n_utp = sum((unlabeled_pred & unlabeled_gold).values())
            n_ltp = sum((Counter(pred) & Counter(gold)).values())
            n_pred, n_gold = len(pred), len(gold)

            self.n += 1
            # A complete match needs every predicted and every gold span to be matched.
            self.n_ucm += n_utp == n_pred == n_gold
            self.n_lcm += n_ltp == n_pred == n_gold
            self.utp += n_utp
            self.ltp += n_ltp
            self.pred += n_pred
            self.gold += n_gold
        return self

    def __repr__(self):
        parts = (
            f"UCM: {self.ucm:.2%} LCM: {self.lcm:.2%} ",
            f"UP: {self.up:.2%} UR: {self.ur:.2%} UF: {self.uf:.2%} ",
            f"LP: {self.lp:.2%} LR: {self.lr:.2%} LF: {self.lf:.2%}",
        )
        return ''.join(parts)

    @property
    def score(self):
        return self.lf

    @property
    def ucm(self):
        return self.n_ucm / (self.n + self.eps)

    @property
    def lcm(self):
        return self.n_lcm / (self.n + self.eps)

    @property
    def up(self):
        return self.utp / (self.pred + self.eps)

    @property
    def ur(self):
        return self.utp / (self.gold + self.eps)

    @property
    def uf(self):
        return 2 * self.utp / (self.pred + self.gold + self.eps)

    @property
    def lp(self):
        return self.ltp / (self.pred + self.eps)

    @property
    def lr(self):
        return self.ltp / (self.gold + self.eps)

    @property
    def lf(self):
        return 2 * self.ltp / (self.pred + self.gold + self.eps)
def _get_ranks(x: torch.Tensor) -> torch.Tensor:
    """Return the 0-based rank of every element of a 1-D tensor.

    Ranks come from a plain ``argsort``, so equal values receive distinct ranks
    (ties are not averaged).
    """
    argsort = x.argsort()
    ranks = torch.zeros_like(argsort, device=x.device)
    ranks[argsort] = torch.arange(len(x), device=x.device)
    return ranks


def spearman_correlation(x: torch.Tensor, y: torch.Tensor):
    """Compute correlation between 2 1-D vectors. Adopted from
    https://discuss.pytorch.org/t/spearmans-correlation/91931/5

    Uses ``1 - 6 * sum(d^2) / (n * (n^2 - 1))`` where ``d`` is the rank difference.

    Args:
        x: Shape (N, )
        y: Shape (N, )

    Returns:
        A scalar tensor holding the correlation coefficient.
    """
    x_rank = _get_ranks(x)
    y_rank = _get_ranks(y)

    n = x.size(0)
    upper = 6 * torch.sum((x_rank - y_rank).pow(2))
    down = n * (n ** 2 - 1.0)
    return 1.0 - (upper / down)


class SpearmanCorrelation(Metric):
    """
    This `Metric` calculates the sample Spearman correlation coefficient (r)
    between two tensors. Each element in the two tensors is assumed to be
    a different observation of the variable (i.e., the input tensors are
    implicitly flattened into vectors and the correlation is calculated
    between the vectors).
    """

    @property
    def score(self):
        return spearman_correlation(self.total_predictions, self.total_gold_labels).item()

    def __init__(self) -> None:
        super().__init__()
        self.total_predictions = torch.zeros(0)
        self.total_gold_labels = torch.zeros(0)

    def __call__(
            self,
            predictions: torch.Tensor,
            gold_labels: torch.Tensor,
            mask=None
    ):
        """
        # Parameters

        predictions : `torch.Tensor`, required.
            A tensor of predictions of shape (batch_size, ...).
        gold_labels : `torch.Tensor`, required.
            A tensor of the same shape as `predictions`.
        mask : optional.
            Not supported; must be `None`.

        # Raises

        NotImplementedError
            If `mask` is not `None`.
        """
        if mask is not None:
            # ``NotImplemented`` is a non-callable constant; the exception type is
            # ``NotImplementedError``.
            raise NotImplementedError('mask not supported in SpearmanCorrelation for now.')
        # Flatten predictions and gold_labels. We calculate the Spearman correlation between
        # the vectors, since each element in the predictions and gold_labels tensor is assumed
        # to be a separate observation.
        predictions = predictions.reshape(-1)
        gold_labels = gold_labels.reshape(-1)

        # Move the accumulated history to the device of the incoming batch before concatenating.
        self.total_predictions = self.total_predictions.to(predictions.device)
        self.total_gold_labels = self.total_gold_labels.to(gold_labels.device)
        self.total_predictions = torch.cat((self.total_predictions, predictions), 0)
        self.total_gold_labels = torch.cat((self.total_gold_labels, gold_labels), 0)

    def reset(self):
        """Discard all accumulated predictions and gold labels."""
        self.total_predictions = torch.zeros(0)
        self.total_gold_labels = torch.zeros(0)

    def __str__(self) -> str:
        return f'spearman: {self.score * 100:.2f}'
def run_perl(script, src, dst=None):
    """Run a perl script on ``src`` and write its standard output to ``dst``.

    ``PERL5LIB`` is overwritten with an empty string in the current process environment
    and ``~/.local/lib/perl5`` is passed to perl through ``-I`` instead.

    Args:
        script: Path to the perl script.
        src: Argument passed to the script.
        dst: Path of the file receiving the captured standard output.
            NOTE(review): the default ``None`` is handed to ``open`` unchanged - confirm
            whether callers are expected to always pass a path.

    Returns:
        ``dst``.

    Raises:
        RuntimeError: With the captured standard error when the exit code is non-zero.
    """
    os.environ['PERL5LIB'] = ''
    local_lib = os.path.expanduser("~/.local/lib/perl5")
    command = f'perl -I{local_lib} {script} {src}'
    exitcode, out, err = get_exitcode_stdout_stderr(command)
    if exitcode:
        # Perl modules that may need installing when the script fails:
        # cpanm -l ~/.local namespace::autoclean
        # cpanm -l ~/.local Moose
        # cpanm -l ~/.local MooseX::SemiAffordanceAccessor module
        raise RuntimeError(err)
    with open(dst, 'w') as ofile:
        ofile.write(out)
    return dst
Args: init_lr: num_train_steps: num_warmup_steps: weight_decay_rate: (Default value = 0.01) epsilon: (Default value = 1e-6) clipnorm: (Default value = None) Returns: """ # Implements linear decay of the learning rate. learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay( initial_learning_rate=init_lr, decay_steps=num_train_steps, end_learning_rate=0.0) if num_warmup_steps: learning_rate_fn = WarmUp(initial_learning_rate=init_lr, decay_schedule_fn=learning_rate_fn, warmup_steps=num_warmup_steps) additional_args = {} if clipnorm: additional_args['clipnorm'] = clipnorm optimizer = AdamWeightDecay( learning_rate=learning_rate_fn, weight_decay_rate=weight_decay_rate, beta_1=0.9, beta_2=0.999, epsilon=epsilon, exclude_from_weight_decay=['LayerNorm', 'bias'], **additional_args ) # {'LayerNorm/gamma:0', 'LayerNorm/beta:0'} return optimizer ================================================ FILE: hanlp/optimizers/adamw/optimization.py ================================================ # Copyright 2019 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
class WarmUp(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Applies a warmup schedule on a given learning rate decay schedule."""

    def __init__(
            self,
            initial_learning_rate,
            decay_schedule_fn,
            warmup_steps,
            power=1.0,
            name=None):
        super(WarmUp, self).__init__()
        self.name = name
        self.power = power
        self.warmup_steps = warmup_steps
        self.decay_schedule_fn = decay_schedule_fn
        self.initial_learning_rate = initial_learning_rate

    def __call__(self, step):
        """Return the learning rate for ``step``.

        While ``step < warmup_steps`` the rate is
        ``initial_learning_rate * (step / warmup_steps) ** power``; afterwards the wrapped
        ``decay_schedule_fn`` is evaluated at ``step``.
        """
        with tf.name_scope(self.name or 'WarmUp') as name:
            step_float = tf.cast(step, tf.float32)
            warmup_float = tf.cast(self.warmup_steps, tf.float32)
            progress = step_float / warmup_float
            warmup_rate = self.initial_learning_rate * tf.math.pow(progress, self.power)
            still_warming_up = step_float < warmup_float
            return tf.cond(still_warming_up,
                           lambda: warmup_rate,
                           lambda: self.decay_schedule_fn(step),
                           name=name)

    def get_config(self):
        """Return the constructor arguments as a dict."""
        config = {}
        config['initial_learning_rate'] = self.initial_learning_rate
        config['decay_schedule_fn'] = self.decay_schedule_fn
        config['warmup_steps'] = self.warmup_steps
        config['power'] = self.power
        config['name'] = self.name
        return config
try:
    AdamTF = tf.keras.optimizers.legacy.Adam  # avoid slowdown when using v2.11+ Keras optimizers on M1/M2 Macs
except (AttributeError, ImportError):
    # No ``legacy`` namespace available: fall back to the regular Adam.
    AdamTF = tf.keras.optimizers.Adam


class AdamWeightDecay(AdamTF):
    """Adam with decoupled L2 weight decay.

    Just adding the square of the weights to the loss function is *not* the
    correct way of using L2 regularization/weight decay with Adam, since that will
    interact with the m and v parameters in strange ways.

    Instead we want to decay the weights in a manner that doesn't interact with
    the m/v parameters. This is equivalent to adding the square of the weights to
    the loss with plain (non-momentum) SGD.

    Gradients are NOT clipped by this class: ``apply_gradients`` forwards them unchanged.
    Clipping options such as ``clipnorm`` can be passed through ``**kwargs`` to the base optimizer.

    Args:
        learning_rate: Learning rate or learning rate schedule. (Default value = 0.001)
        beta_1: (Default value = 0.9)
        beta_2: (Default value = 0.999)
        epsilon: (Default value = 1e-7)
        amsgrad: (Default value = False)
        weight_decay_rate: Decay coefficient; ``0`` disables decay. (Default value = 0.0)
        include_in_weight_decay: Regex patterns of variable names that are always decayed.
        exclude_from_weight_decay: Regex patterns of variable names that are never decayed,
            unless they also match ``include_in_weight_decay``.
        name: (Default value = 'AdamWeightDecay')
        **kwargs: Forwarded to the base optimizer.
    """

    def __init__(self,
                 learning_rate=0.001,
                 beta_1=0.9,
                 beta_2=0.999,
                 epsilon=1e-7,
                 amsgrad=False,
                 weight_decay_rate=0.0,
                 include_in_weight_decay=None,
                 exclude_from_weight_decay=None,
                 name='AdamWeightDecay',
                 **kwargs):
        super(AdamWeightDecay, self).__init__(
            learning_rate, beta_1, beta_2, epsilon, amsgrad, name, **kwargs)
        self.weight_decay_rate = weight_decay_rate
        self._include_in_weight_decay = include_in_weight_decay
        self._exclude_from_weight_decay = exclude_from_weight_decay

    @classmethod
    def from_config(cls, config):
        """Creates an optimizer from its config with WarmUp custom object.

        Args:
            config: Optimizer config.

        Returns:
            The restored optimizer.
        """
        custom_objects = {'WarmUp': WarmUp}
        return super(AdamWeightDecay, cls).from_config(
            config, custom_objects=custom_objects)

    def _prepare_local(self, var_device, var_dtype, apply_state):
        super(AdamWeightDecay, self)._prepare_local(var_device, var_dtype,
                                                    apply_state)
        # Expose the decay rate to ``_decay_weights_op`` through the shared apply state.
        apply_state['weight_decay_rate'] = tf.constant(
            self.weight_decay_rate, name='adam_weight_decay_rate')

    def _decay_weights_op(self, var, learning_rate, apply_state):
        """Return an op subtracting ``learning_rate * var * weight_decay_rate`` from ``var``.

        A no-op is returned for variables excluded by :meth:`_do_use_weight_decay`.
        """
        do_decay = self._do_use_weight_decay(var.name)
        if do_decay:
            return var.assign_sub(
                learning_rate * var *
                apply_state['weight_decay_rate'],
                use_locking=self._use_locking)
        return tf.no_op()

    def _get_lr(self, var_device, var_dtype, apply_state):
        """Retrieves the learning rate with the given state.

        Args:
            var_device: Device of the variable being updated.
            var_dtype: Base dtype of the variable being updated.
            apply_state: Apply state or ``None``.

        Returns:
            A tuple of the learning rate and the keyword arguments to forward to the
            base class update method.
        """
        if apply_state is None:
            return self._decayed_lr_t[var_dtype], {}

        apply_state = apply_state or {}
        coefficients = apply_state.get((var_device, var_dtype))
        if coefficients is None:
            coefficients = self._fallback_apply_state(var_device, var_dtype)
            apply_state[(var_device, var_dtype)] = coefficients

        return coefficients['lr_t'], dict(apply_state=apply_state)

    def _resource_apply_dense(self, grad, var, apply_state=None):
        lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
        decay = self._decay_weights_op(var, lr_t, apply_state)
        # The decay must run before the Adam update of the same variable.
        with tf.control_dependencies([decay]):
            return super(AdamWeightDecay, self)._resource_apply_dense(
                grad, var, **kwargs)

    def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
        lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
        decay = self._decay_weights_op(var, lr_t, apply_state)
        # The decay must run before the Adam update of the same variable.
        with tf.control_dependencies([decay]):
            return super(AdamWeightDecay, self)._resource_apply_sparse(
                grad, var, indices, **kwargs)

    def get_config(self):
        """Return the base config extended with ``weight_decay_rate``."""
        config = super(AdamWeightDecay, self).get_config()
        config.update({
            'weight_decay_rate': self.weight_decay_rate,
        })
        return config

    def _do_use_weight_decay(self, param_name):
        """Whether to use L2 weight decay for `param_name`.

        Args:
            param_name: Name of the variable.

        Returns:
            ``False`` when the decay rate is 0 or the name matches an exclude pattern
            without matching an include pattern, ``True`` otherwise.
        """
        if self.weight_decay_rate == 0:
            return False

        # Include patterns take precedence over exclude patterns.
        if self._include_in_weight_decay:
            for r in self._include_in_weight_decay:
                if re.search(r, param_name) is not None:
                    return True

        if self._exclude_from_weight_decay:
            for r in self._exclude_from_weight_decay:
                if re.search(r, param_name) is not None:
                    return False
        return True

    def apply_gradients(self, grads_and_vars, name=None, **kwargs):
        """Apply gradients unchanged (no global-norm clipping).

        This is the only definition of ``apply_gradients``. An earlier definition in the
        class body that clipped by global norm was always shadowed by this one and has
        been removed as dead code.
        """
        grads, tvars = list(zip(*grads_and_vars))
        return super(AdamWeightDecay, self).apply_gradients(zip(grads, tvars), name=name, **kwargs)
seq2seq (:cite:`bevilacqua-etal-2021-one`) BART (:cite:`lewis-etal-2020-bart`) large parser trained on Abstract Meaning Representation 3.0 (:cite:`knight2014abstract`). Its performance is =================== ========= ========= ========= Metric P R F1 =================== ========= ========= ========= Smatch 84.00 82.60 83.30 Unlabeled 86.40 84.90 85.70 No WSD 84.50 83.10 83.80 Non_sense_frames 91.90 91.30 91.60 Wikification 81.70 80.80 81.20 Named Ent. 89.20 87.00 88.10 Negations 71.70 70.90 71.30 IgnoreVars 73.80 73.10 73.50 Concepts 90.70 89.60 90.10 Frames 88.50 87.90 88.20 Reentrancies 70.40 71.80 71.10 SRL 79.00 79.60 79.30 =================== ========= ========= ========= Note this parser does NOT perform wikification. ''' AMR3_GRAPH_PRETRAIN_PARSER = HANLP_URL + 'amr/amr3_graph_pretrain_parser_20221207_153759.zip' '''A seq2seq (:cite:`bevilacqua-etal-2021-one`) BART (:cite:`lewis-etal-2020-bart`) large parser trained on Abstract Meaning Representation 3.0 (:cite:`knight2014abstract`) with graph pre-training (:cite:`bai-etal-2022-graph`). Its performance is ``84.3`` according to their official repository. Using ``amr-evaluation-enhanced``, the performance is slightly lower: =================== ========= ========= ========= Metric P R F1 =================== ========= ========= ========= Smatch 84.4 83.6 84.0 Unlabeled 86.7 85.8 86.2 No WSD 84.9 84.1 84.5 Non_sense_frames 91.8 91.6 91.7 Wikification 83.6 81.7 82.6 Named Ent. 89.3 87.4 88.4 Negations 71.6 72.2 71.9 IgnoreVars 74.6 74.2 74.4 Concepts 90.7 90.0 90.3 Frames 88.8 88.5 88.7 Reentrancies 72.1 72.9 72.5 SRL 80.1 80.7 80.4 =================== ========= ========= ========= Note this parser does NOT perform wikification. ''' MRP2020_AMR_ENG_ZHO_XLM_BASE = 'http://download.hanlp.com/amr/extra/amr-eng-zho-xlm-roberta-base_20220412_223756.zip' '''A wrapper for the Permutation-invariant Semantic Parser (:cite:`samuel-straka-2020-ufal`) trained on MRP2020 English and Chinese AMR corpus. 
It was ranked the top in the MRP2020 competition, while this release is a base version. See the original paper for the detailed performance. Note this model requires tokens and lemmas (for English) to be provided as inputs. ''' MRP2020_AMR_ZHO_MENGZI_BASE = 'http://download.hanlp.com/amr/extra/amr-zho-mengzi-base_20220415_101941.zip' '''A Chinese Permutation-invariant Semantic Parser (:cite:`samuel-straka-2020-ufal`) trained on MRP2020 Chinese AMR corpus using Mengzi BERT base (:cite:`zhang2021mengzi`). Its performance on dev set is ``{amr-zho [tops F1: 85.43%][anchors F1: 93.41%][labels F1: 87.68%][properties F1: 82.02%][edges F1: 73.17%] [attributes F1: 0.00%][all F1: 84.11%]}``. Test set performance is unknown since the test set is not released to the public. ''' # Will be filled up during runtime ALL = {} ================================================ FILE: hanlp/pretrained/amr2text.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2022-12-07 15:19 from hanlp_common.constant import HANLP_URL AMR3_GRAPH_PRETRAIN_GENERATION = HANLP_URL + 'amr2text/amr3_graph_pretrain_generation_20221207_153535.zip' '''A seq2seq (:cite:`bevilacqua-etal-2021-one`) BART (:cite:`lewis-etal-2020-bart`) large AMR2Text generator trained on Abstract Meaning Representation 3.0 (:cite:`knight2014abstract`) with graph pre-training (:cite:`bai-etal-2022-graph`). Its Sacre-BLEU is ``50.38`` according to their official repository. 
''' # Will be filled up during runtime ALL = {} ================================================ FILE: hanlp/pretrained/classifiers.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2020-01-01 03:51 from hanlp_common.constant import HANLP_URL CHNSENTICORP_BERT_BASE_ZH = HANLP_URL + 'classification/chnsenticorp_bert_base_20211228_163210.zip' SST2_ALBERT_BASE_EN = HANLP_URL + 'classification/sst2_albert_base_20211228_164917.zip' LID_176_FASTTEXT_BASE = 'https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin' ''' 126MB FastText model for language identification trained on data from Wikipedia, Tatoeba and SETimes. ''' LID_176_FASTTEXT_SMALL = 'https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz' ''' 917kB FastText model for language identification trained on data from Wikipedia, Tatoeba and SETimes. ''' ALL = {} ================================================ FILE: hanlp/pretrained/constituency.py ================================================ # -*- coding:utf-8 -*- # Author=hankcs # Date=2022-01-18 10:34 from hanlp_common.constant import HANLP_URL CTB9_CON_ELECTRA_SMALL = HANLP_URL + 'constituency/ctb9_con_electra_small_20220215_230116.zip' 'Electra (:cite:`clark2020electra`) small tree CRF model (:cite:`ijcai2020-560`) trained on CTB9 with major categories. ' \ 'Its performance is UCM=39.06% LCM=34.99% UP=90.05% UR=90.01% UF=90.03% LP=87.02% LR=86.98% LF=87.00%.' CTB9_CON_FULL_TAG_ELECTRA_SMALL = HANLP_URL + 'constituency/ctb9_full_tag_con_electra_small_20220118_103119.zip' 'Electra (:cite:`clark2020electra`) small tree CRF model (:cite:`ijcai2020-560`) trained on CTB9 with full subcategories. ' \ 'Its performance is UCM=38.29% LCM=28.95% UP=90.16% UR=90.13% UF=90.15% LP=83.46% LR=83.43% LF=83.45%.' 
CTB9_CON_FULL_TAG_ERNIE_GRAM = 'http://download.hanlp.com/constituency/extra/ctb9_full_tag_con_ernie_20220331_121430.zip' 'ERNIE-GRAM (:cite:`xiao-etal-2021-ernie`) base tree CRF model (:cite:`ijcai2020-560`) trained on CTB9 with full subcategories. ' \ 'Its performance is UCM=42.04% LCM=31.72% UP=91.33% UR=91.53% UF=91.43% LP=85.31% LR=85.49% LF=85.40%.' # Will be filled up during runtime ALL = {} ================================================ FILE: hanlp/pretrained/dep.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2019-12-29 02:55 from hanlp_common.constant import HANLP_URL CTB5_BIAFFINE_DEP_ZH = HANLP_URL + 'dep/biaffine_ctb5_20191229_025833.zip' 'Biaffine LSTM model (:cite:`dozat:17a`) trained on CTB5.' CTB7_BIAFFINE_DEP_ZH = HANLP_URL + 'dep/biaffine_ctb7_20200109_022431.zip' 'Biaffine LSTM model (:cite:`dozat:17a`) trained on CTB7.' CTB9_DEP_ELECTRA_SMALL = HANLP_URL + 'dep/ctb9_dep_electra_small_20220216_100306.zip' 'Electra small encoder (:cite:`clark2020electra`) with Biaffine decoder (:cite:`dozat:17a`) trained on CTB9-SD330. ' \ 'Performance is UAS=87.68% LAS=83.54%.' PMT1_DEP_ELECTRA_SMALL = HANLP_URL + 'dep/pmt_dep_electra_small_20220218_134518.zip' 'Electra small encoder (:cite:`clark2020electra`) with Biaffine decoder (:cite:`dozat:17a`) trained on PKU ' \ 'Multi-view Chinese Treebank (PMT) 1.0 (:cite:`qiu-etal-2014-multi`). Performance is UAS=91.21% LAS=88.65%.' CTB9_UDC_ELECTRA_SMALL = HANLP_URL + 'dep/udc_dep_electra_small_20220218_095452.zip' 'Electra small encoder (:cite:`clark2020electra`) with Biaffine decoder (:cite:`dozat:17a`) trained on CTB9-UD420. ' \ 'Performance is UAS=85.92% LAS=81.13% .' PTB_BIAFFINE_DEP_EN = HANLP_URL + 'dep/ptb_dep_biaffine_20200101_174624.zip' 'Biaffine LSTM model (:cite:`dozat:17a`) trained on PTB.' 
ALL = {} ================================================ FILE: hanlp/pretrained/eos.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2020-12-22 13:22 from hanlp_common.constant import HANLP_URL UD_CTB_EOS_MUL = HANLP_URL + 'eos/eos_ud_ctb_mul_20201222_133543.zip' 'EOS model (:cite:`Schweter:Ahmed:2019`) trained on concatenated UD2.3 and CTB9.' # Will be filled up during runtime ALL = {} ================================================ FILE: hanlp/pretrained/fasttext.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2019-12-30 18:57 FASTTEXT_DEBUG_EMBEDDING_EN = 'https://elit-models.s3-us-west-2.amazonaws.com/fasttext.debug.bin.zip' FASTTEXT_CC_300_EN = 'https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz' 'FastText (:cite:`bojanowski2017enriching`) embeddings trained on Common Crawl.' FASTTEXT_WIKI_NYT_AMAZON_FRIENDS_200_EN \ = 'https://elit-models.s3-us-west-2.amazonaws.com/fasttext-200-wikipedia-nytimes-amazon-friends-20191107.bin' 'FastText (:cite:`bojanowski2017enriching`) embeddings trained on Wikipedia, NYTimes, Amazon and Friends.' FASTTEXT_WIKI_300_ZH = 'https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.zh.zip#wiki.zh.bin' 'FastText (:cite:`bojanowski2017enriching`) embeddings trained on Chinese Wikipedia.' FASTTEXT_WIKI_300_ZH_CLASSICAL = 'https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.zh_classical.zip#wiki.zh_classical.bin' 'FastText (:cite:`bojanowski2017enriching`) embeddings trained on Classical Chinese Wikipedia.' 
# Empty here; presumably filled up during runtime like ALL in the sibling
# pretrained modules (confirm).
ALL = {}

================================================
FILE: hanlp/pretrained/glove.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-08-27 20:42

# All 6B variants live in one zip archive; each public constant appends '#'
# plus the name of a text file to the archive URL.
# NOTE(review): the '#' suffix presumably selects that member inside the
# archive - confirm against the code that resolves these identifiers.
_GLOVE_6B_ROOT = 'http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip'

# The bare string literal after each assignment is the constant's description.
GLOVE_6B_50D = _GLOVE_6B_ROOT + '#' + 'glove.6B.50d.txt'
'Global Vectors for Word Representation (:cite:`pennington-etal-2014-glove`) 50d trained on 6B tokens.'
GLOVE_6B_100D = _GLOVE_6B_ROOT + '#' + 'glove.6B.100d.txt'
'Global Vectors for Word Representation (:cite:`pennington-etal-2014-glove`) 100d trained on 6B tokens.'
GLOVE_6B_200D = _GLOVE_6B_ROOT + '#' + 'glove.6B.200d.txt'
'Global Vectors for Word Representation (:cite:`pennington-etal-2014-glove`) 200d trained on 6B tokens.'
GLOVE_6B_300D = _GLOVE_6B_ROOT + '#' + 'glove.6B.300d.txt'
'Global Vectors for Word Representation (:cite:`pennington-etal-2014-glove`) 300d trained on 6B tokens.'

GLOVE_840B_300D = 'http://nlp.stanford.edu/data/glove.840B.300d.zip'
'Global Vectors for Word Representation (:cite:`pennington-etal-2014-glove`) 300d trained on 840B tokens.'

# Empty here; presumably filled up during runtime (confirm).
ALL = {}

================================================
FILE: hanlp/pretrained/mtl.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-22 13:16
from hanlp_common.constant import HANLP_URL

# Identifiers of pretrained multi-task (``mtl/``) model archives: HANLP_URL
# joined with the relative path of a zip file. The bare string literal after
# each assignment is that constant's description.
OPEN_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH = HANLP_URL + 'mtl/open_tok_pos_ner_srl_dep_sdp_con_electra_small_20201223_035557.zip'
"Electra (:cite:`clark2020electra`) small version of joint tok, pos, ner, srl, dep, sdp and con model trained on open-source Chinese corpus."
OPEN_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH = HANLP_URL + 'mtl/open_tok_pos_ner_srl_dep_sdp_con_electra_base_20201223_201906.zip'
"Electra (:cite:`clark2020electra`) base version of joint tok, pos, ner, srl, dep, sdp and con model trained on open-source Chinese corpus."
# Multi-task models trained on the close-source corpus. Each constant is
# HANLP_URL joined with the relative path of a zip archive; the bare string
# literal after an assignment is that constant's description.
CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH = HANLP_URL + 'mtl/close_tok_pos_ner_srl_dep_sdp_con_electra_small_20210111_124159.zip'
"Electra (:cite:`clark2020electra`) small version of joint tok, pos, ner, srl, dep (SD Standard), sdp and con model trained on close-source Chinese corpus."
# Differs from the constant above in the dependency standard (UD instead of SD)
# according to the two descriptions.
CLOSE_TOK_POS_NER_SRL_UDEP_SDP_CON_ELECTRA_SMALL_ZH = HANLP_URL + 'mtl/close_tok_pos_ner_srl_dep_sdp_con_electra_small_20220626_175100.zip'
''' Electra (:cite:`clark2020electra`) small version of joint tok, pos, ner, srl, dep (UD Standard), sdp and con model trained on close-source Chinese corpus. Performance: ``{con UCM: 39.33% LCM: 35.69% UP: 90.24% UR: 90.28% UF: 90.26% LP: 87.55% LR: 87.59% LF: 87.57%}{dep UAS: 86.80% LAS: 82.82%}{ner/msra P: 95.45% R: 96.65% F1: 96.05%}{ner/ontonotes P: 75.98% R: 79.09% F1: 77.50%}{ner/pku P: 95.77% R: 96.75% F1: 96.26%}{pos/863 Accuracy:94.83%}{pos/ctb Accuracy:96.57%}{pos/pku Accuracy:97.54%}{sdp UF: 85.55% LF: 73.67%}{srl P: 75.71% R: 74.25% F1: 74.97%}{tok/coarse P: 97.77% R: 97.70% F1: 97.74%}{tok/fine P: 97.44% R: 97.32% F1: 97.38%}``. '''
CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH = HANLP_URL + 'mtl/close_tok_pos_ner_srl_dep_sdp_con_electra_base_20210111_124519.zip'
"Electra (:cite:`clark2020electra`) base version of joint tok, pos, ner, srl, dep, sdp and con model trained on close-source Chinese corpus."
CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ERNIE_GRAM_ZH = HANLP_URL + 'mtl/close_tok_pos_ner_srl_dep_sdp_con_ernie_gram_base_aug_20210904_145403.zip'
"ERNIE (:cite:`xiao-etal-2021-ernie`) base version of joint tok, pos, ner, srl, dep, sdp and con model trained on close-source Chinese corpus."
UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_MMINILMV2L6 = HANLP_URL + 'mtl/ud_ontonotes_tok_pos_lem_fea_ner_srl_dep_sdp_con_mMiniLMv2L6_no_space_20220731_161526.zip' ''' mMiniLMv2 (:cite:`wang-etal-2021-minilmv2`) L6xH384 small version of joint tok, pos, lem, fea, ner, srl, dep, sdp and con model trained on UD 2.10 and OntoNotes5 corpora. The following 130 languages are supported: ``Afrikaans, Akkadian, Akuntsu, Albanian, Amharic, AncientGreek (to 1453), Ancient Hebrew, Apurinã, Arabic, Armenian, AssyrianNeo-Aramaic, Bambara, Basque, Beja, Belarusian, Bengali, Bhojpuri, Breton, Bulgarian, Catalan, Cebuano, Central Siberian Yupik, Chinese, Chukot, ChurchSlavic, Coptic, Croatian, Czech, Danish, Dutch, Emerillon, English, Erzya, Estonian, Faroese, Finnish, French, Galician, German, Gothic, Guajajára, Guarani, Hebrew, Hindi, Hittite, Hungarian, Icelandic, Indonesian, Irish, Italian, Japanese, Javanese, K\'iche\', Kangri, Karelian, Karo(Brazil), Kazakh, Khunsari, Komi-Permyak, Komi-Zyrian, Korean, Latin, Latvian, Ligurian, LiteraryChinese, Lithuanian, Livvi, LowGerman, Madi, Makuráp, Maltese, Manx, Marathi, MbyáGuaraní, Modern Greek (1453-), Moksha, Mundurukú, Nayini, Neapolitan, Nigerian Pidgin, NorthernKurdish, Northern Sami, Norwegian, OldFrench (842-ca. 1400), OldRussian, Old Turkish, Persian, Polish, Portuguese, Romanian, Russia Buriat, Russian, Sanskrit, ScottishGaelic, Serbian, SkoltSami, Slovak, Slovenian, Soi, South Levantine Arabic, Spanish, Swedish, SwedishSign Language, SwissGerman, Tagalog, Tamil, Tatar, Telugu, Thai, Tupinambá, Turkish, Uighur, Ukrainian, Umbrian, UpperSorbian, Urdu, Urubú-Kaapor, Vietnamese, Warlpiri, Welsh, Western Armenian, WesternFrisian, Wolof, Xibe, Yakut, Yoruba, YueChinese``. 
Performance: ``{con UCM: 15.48% LCM: 11.45% UP: 68.92% UR: 66.88% UF: 67.88% LP: 61.19% LR: 59.38% LF: 60.27%}{ner P: 76.06% R: 77.83% F1: 76.93%}{sdp/dm UF: 91.84% LF: 91.00%}{sdp/pas UF: 95.46% LF: 93.90%}{sdp/psd UF: 91.94% LF: 81.26%}{srl [predicate P: 91.71% R: 74.51% F1: 82.22%][e2e P: 77.48% R: 55.28% F1: 64.52%]}{tok P: 93.17% R: 93.53% F1: 93.35%}{ud [lemmas Accuracy:81.74%][upos Accuracy:85.94%][deps UAS: 80.60% LAS: 71.21%][feats Accuracy:77.17%]}``. ''' UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_MMINILMV2L12 = HANLP_URL + 'mtl/ud_ontonotes_tok_pos_lem_fea_ner_srl_dep_sdp_con_mMiniLMv2L12_no_space_20220807_133143.zip' ''' mMiniLMv2 (:cite:`wang-etal-2021-minilmv2`) L6xH384 base version of joint tok, pos, lem, fea, ner, srl, dep, sdp and con model trained on UD 2.10 and OntoNotes5 corpora. The following 130 languages are supported: ``Afrikaans, Akkadian, Akuntsu, Albanian, Amharic, AncientGreek (to 1453), Ancient Hebrew, Apurinã, Arabic, Armenian, AssyrianNeo-Aramaic, Bambara, Basque, Beja, Belarusian, Bengali, Bhojpuri, Breton, Bulgarian, Catalan, Cebuano, Central Siberian Yupik, Chinese, Chukot, ChurchSlavic, Coptic, Croatian, Czech, Danish, Dutch, Emerillon, English, Erzya, Estonian, Faroese, Finnish, French, Galician, German, Gothic, Guajajára, Guarani, Hebrew, Hindi, Hittite, Hungarian, Icelandic, Indonesian, Irish, Italian, Japanese, Javanese, K\'iche\', Kangri, Karelian, Karo(Brazil), Kazakh, Khunsari, Komi-Permyak, Komi-Zyrian, Korean, Latin, Latvian, Ligurian, LiteraryChinese, Lithuanian, Livvi, LowGerman, Madi, Makuráp, Maltese, Manx, Marathi, MbyáGuaraní, Modern Greek (1453-), Moksha, Mundurukú, Nayini, Neapolitan, Nigerian Pidgin, NorthernKurdish, Northern Sami, Norwegian, OldFrench (842-ca. 
1400), OldRussian, Old Turkish, Persian, Polish, Portuguese, Romanian, Russia Buriat, Russian, Sanskrit, ScottishGaelic, Serbian, SkoltSami, Slovak, Slovenian, Soi, South Levantine Arabic, Spanish, Swedish, SwedishSign Language, SwissGerman, Tagalog, Tamil, Tatar, Telugu, Thai, Tupinambá, Turkish, Uighur, Ukrainian, Umbrian, UpperSorbian, Urdu, Urubú-Kaapor, Vietnamese, Warlpiri, Welsh, Western Armenian, WesternFrisian, Wolof, Xibe, Yakut, Yoruba, YueChinese``. Performance: ``{con UCM: 17.32% LCM: 13.28% UP: 70.53% UR: 68.73% UF: 69.62% LP: 63.03% LR: 61.42% LF: 62.22%}{ner P: 76.91% R: 78.72% F1: 77.80%}{sdp/dm UF: 92.78% LF: 92.02%}{sdp/pas UF: 96.43% LF: 95.02%}{sdp/psd UF: 92.75% LF: 81.86%}{srl [predicate P: 91.82% R: 77.57% F1: 84.10%][e2e P: 78.33% R: 59.14% F1: 67.40%]}{tok P: 93.69% R: 94.34% F1: 94.02%}{ud [lemmas Accuracy:82.48%][upos Accuracy:87.09%][deps UAS: 82.41% LAS: 73.69%][feats Accuracy:78.58%]}``. ''' UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_XLMR_BASE = HANLP_URL + 'mtl/ud_ontonotes_tok_pos_lem_fea_ner_srl_dep_sdp_con_xlm_base_20220608_003435.zip' ''' XLM-R (:cite:`conneau-etal-2020-unsupervised`) base version of joint tok, pos, lem, fea, ner, srl, dep, sdp and con model trained on UD 2.10 and OntoNotes5 corpora. 
The following 130 languages are supported: ``Afrikaans, Akkadian, Akuntsu, Albanian, Amharic, AncientGreek (to 1453), Ancient Hebrew, Apurinã, Arabic, Armenian, AssyrianNeo-Aramaic, Bambara, Basque, Beja, Belarusian, Bengali, Bhojpuri, Breton, Bulgarian, Catalan, Cebuano, Central Siberian Yupik, Chinese, Chukot, ChurchSlavic, Coptic, Croatian, Czech, Danish, Dutch, Emerillon, English, Erzya, Estonian, Faroese, Finnish, French, Galician, German, Gothic, Guajajára, Guarani, Hebrew, Hindi, Hittite, Hungarian, Icelandic, Indonesian, Irish, Italian, Japanese, Javanese, K\'iche\', Kangri, Karelian, Karo(Brazil), Kazakh, Khunsari, Komi-Permyak, Komi-Zyrian, Korean, Latin, Latvian, Ligurian, LiteraryChinese, Lithuanian, Livvi, LowGerman, Madi, Makuráp, Maltese, Manx, Marathi, MbyáGuaraní, Modern Greek (1453-), Moksha, Mundurukú, Nayini, Neapolitan, Nigerian Pidgin, NorthernKurdish, Northern Sami, Norwegian, OldFrench (842-ca. 1400), OldRussian, Old Turkish, Persian, Polish, Portuguese, Romanian, Russia Buriat, Russian, Sanskrit, ScottishGaelic, Serbian, SkoltSami, Slovak, Slovenian, Soi, South Levantine Arabic, Spanish, Swedish, SwedishSign Language, SwissGerman, Tagalog, Tamil, Tatar, Telugu, Thai, Tupinambá, Turkish, Uighur, Ukrainian, Umbrian, UpperSorbian, Urdu, Urubú-Kaapor, Vietnamese, Warlpiri, Welsh, Western Armenian, WesternFrisian, Wolof, Xibe, Yakut, Yoruba, YueChinese``. Performance: ``{con UCM: 20.31% LCM: 16.82% UP: 77.50% UR: 76.63% UF: 77.06% LP: 71.25% LR: 70.46% LF: 70.85%}{ner P: 79.93% R: 80.76% F1: 80.34%}{sdp/dm UF: 93.71% LF: 93.00%}{sdp/pas UF: 97.63% LF: 96.37%}{sdp/psd UF: 93.08% LF: 80.95%}{srl [predicate P: 90.95% R: 84.25% F1: 87.47%][e2e P: 78.89% R: 67.32% F1: 72.65%]}{tok P: 98.50% R: 98.70% F1: 98.60%}{ud [lemmas Accuracy:85.95%][upos Accuracy:89.95%][deps UAS: 85.78% LAS: 78.51%][feats Accuracy:82.18%]}``. 
''' NPCMJ_UD_KYOTO_TOK_POS_CON_BERT_BASE_CHAR_JA = HANLP_URL + 'mtl/npcmj_ud_kyoto_tok_pos_ner_dep_con_srl_bert_base_char_ja_20210914_133742.zip' 'BERT (:cite:`devlin-etal-2019-bert`) base char encoder trained on NPCMJ/UD/Kyoto corpora with decoders including tok, pos, ner, dep, con, srl.' # Will be filled up during runtime ALL = {} ================================================ FILE: hanlp/pretrained/ner.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2019-12-30 20:07 from hanlp_common.constant import HANLP_URL MSRA_NER_BERT_BASE_ZH = HANLP_URL + 'ner/ner_bert_base_msra_20211227_114712.zip' 'BERT model (:cite:`devlin-etal-2019-bert`) trained on MSRA with 3 entity types.' MSRA_NER_ALBERT_BASE_ZH = HANLP_URL + 'ner/msra_ner_albert_base_20211228_173323.zip' 'ALBERT model (:cite:`Lan2020ALBERT:`) trained on MSRA with 3 entity types.' MSRA_NER_ELECTRA_SMALL_ZH = HANLP_URL + 'ner/msra_ner_electra_small_20220215_205503.zip' 'Electra small model (:cite:`clark2020electra`) trained on MSRA with 26 entity types. F1 = `95.16`' CONLL03_NER_BERT_BASE_CASED_EN = HANLP_URL + 'ner/ner_conll03_bert_base_cased_en_20211227_121443.zip' 'BERT model (:cite:`devlin-etal-2019-bert`) trained on CoNLL03.' ALL = {} ================================================ FILE: hanlp/pretrained/pos.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2019-12-29 01:57 from hanlp_common.constant import HANLP_URL CTB5_POS_RNN = HANLP_URL + 'pos/ctb5_pos_rnn_20200113_235925.zip' 'An old school BiLSTM tagging model trained on CTB5.' CTB5_POS_RNN_FASTTEXT_ZH = HANLP_URL + 'pos/ctb5_pos_rnn_fasttext_20191230_202639.zip' 'An old school BiLSTM tagging model with FastText (:cite:`bojanowski2017enriching`) embeddings trained on CTB5.' CTB9_POS_ALBERT_BASE = HANLP_URL + 'pos/ctb9_albert_base_20211228_163935.zip' 'ALBERT model (:cite:`Lan2020ALBERT:`) trained on CTB9 (:cite:`https://doi.org/10.35111/gvd0-xk91`). 
This is a TF component.' CTB9_POS_ELECTRA_SMALL_TF = HANLP_URL + 'pos/pos_ctb_electra_small_20211227_121341.zip' 'Electra small model (:cite:`clark2020electra`) trained on CTB9 (:cite:`https://doi.org/10.35111/gvd0-xk91`). Accuracy = `96.75`. This is a TF component.' CTB9_POS_ELECTRA_SMALL = HANLP_URL + 'pos/pos_ctb_electra_small_20220215_111944.zip' 'Electra small model (:cite:`clark2020electra`) trained on CTB9 (:cite:`https://doi.org/10.35111/gvd0-xk91`). Accuracy = `96.26`.' CTB9_POS_RADICAL_ELECTRA_SMALL = HANLP_URL + 'pos/pos_ctb_radical_electra_small_20220215_111932.zip' 'Electra small model (:cite:`clark2020electra`) with radical embeddings (:cite:`he2018dual`) trained on CTB9 (:cite:`https://doi.org/10.35111/gvd0-xk91`). Accuracy = `96.14`.' C863_POS_ELECTRA_SMALL = HANLP_URL + 'pos/pos_863_electra_small_20220217_101958.zip' 'Electra small model (:cite:`clark2020electra`) trained on Chinese 863 corpus. Accuracy = `95.19`.' PKU_POS_ELECTRA_SMALL = HANLP_URL + 'pos/pos_pku_electra_small_20220217_142436.zip' 'Electra small model (:cite:`clark2020electra`) trained on Chinese PKU corpus. Accuracy = `97.55`.' PTB_POS_RNN_FASTTEXT_EN = HANLP_URL + 'pos/ptb_pos_rnn_fasttext_20220418_101708.zip' 'An old school BiLSTM tagging model with FastText (:cite:`bojanowski2017enriching`) embeddings trained on PTB.' ALL = {} ================================================ FILE: hanlp/pretrained/rnnlm.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2019-12-19 03:47 from hanlp_common.constant import HANLP_URL FLAIR_LM_FW_WMT11_EN_TF = HANLP_URL + 'lm/flair_lm_wmt11_en_20200211_091932.zip#flair_lm_fw_wmt11_en' 'The forward LSTM of Contextual String Embedding (:cite:`akbik-etal-2018-contextual`).' FLAIR_LM_BW_WMT11_EN_TF = HANLP_URL + 'lm/flair_lm_wmt11_en_20200211_091932.zip#flair_lm_bw_wmt11_en' 'The backward LSTM of Contextual String Embedding (:cite:`akbik-etal-2018-contextual`).' 
# The bare string literal after the assignment is the constant's description.
FLAIR_LM_WMT11_EN = HANLP_URL + 'lm/flair_lm_wmt11_en_20200601_205350.zip'
'The BiLSTM of Contextual String Embedding (:cite:`akbik-etal-2018-contextual`).'

# Empty here; presumably filled up during runtime like ALL in the sibling
# pretrained modules (confirm).
ALL = {}

================================================
FILE: hanlp/pretrained/sdp.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-31 23:54
from hanlp_common.constant import HANLP_URL

# Identifiers of pretrained semantic dependency parsing (``sdp/``) model
# archives: HANLP_URL joined with the relative path of a zip file. The bare
# string literal after each assignment is that constant's description.
SEMEVAL16_NEWS_BIAFFINE_ZH = HANLP_URL + 'sdp/semeval16-news-biaffine_20191231_235407.zip'
'Biaffine SDP (:cite:`he-choi-2019`) trained on SemEval16 news data.'
SEMEVAL16_TEXT_BIAFFINE_ZH = HANLP_URL + 'sdp/semeval16-text-biaffine_20200101_002257.zip'
'Biaffine SDP (:cite:`he-choi-2019`) trained on SemEval16 text data.'
SEMEVAL16_ALL_ELECTRA_SMALL_ZH = HANLP_URL + 'sdp/semeval16_sdp_electra_small_20220719_171433.zip'
'Biaffine SDP (:cite:`he-choi-2019`) trained on SemEval16 text and news data. Performance: ``UF: 83.03% LF: 72.58%``'
SEMEVAL15_PAS_BIAFFINE_EN = HANLP_URL + 'sdp/semeval15_biaffine_pas_20200103_152405.zip'
'Biaffine SDP (:cite:`he-choi-2019`) trained on SemEval15 PAS data.'
SEMEVAL15_PSD_BIAFFINE_EN = HANLP_URL + 'sdp/semeval15_biaffine_psd_20200106_123009.zip'
'Biaffine SDP (:cite:`he-choi-2019`) trained on SemEval15 PSD data.'
SEMEVAL15_DM_BIAFFINE_EN = HANLP_URL + 'sdp/semeval15_biaffine_dm_20200106_122808.zip'
'Biaffine SDP (:cite:`he-choi-2019`) trained on SemEval15 DM data.'

# Empty here; presumably filled up during runtime (confirm).
ALL = {}

================================================
FILE: hanlp/pretrained/srl.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-08-07 19:07
from hanlp_common.constant import HANLP_URL

# Identifier of a pretrained semantic role labeling (``srl/``) model archive;
# the bare string literal after the assignment is its description.
CPB3_SRL_ELECTRA_SMALL = HANLP_URL + 'srl/cpb3_electra_small_crf_has_transform_20220218_135910.zip'
'Electra small model (:cite:`clark2020electra`) trained on CPB3. P=75.87% R=76.24% F1=76.05%.'
ALL = {} ================================================ FILE: hanlp/pretrained/sts.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2021-05-24 12:51 from hanlp_common.constant import HANLP_URL STS_ELECTRA_BASE_ZH = HANLP_URL + 'sts/sts_electra_base_zh_20210530_200109.zip' 'A naive regression model trained on concatenated STS corpora.' # Will be filled up during runtime ALL = {} ================================================ FILE: hanlp/pretrained/tok.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2019-12-28 21:12 from hanlp_common.constant import HANLP_URL SIGHAN2005_PKU_CONVSEG = HANLP_URL + 'tok/sighan2005-pku-convseg_20200110_153722.zip' 'Conv model (:cite:`wang-xu-2017-convolutional`) trained on sighan2005 pku dataset.' SIGHAN2005_MSR_CONVSEG = HANLP_URL + 'tok/convseg-msr-nocrf-noembed_20200110_153524.zip' 'Conv model (:cite:`wang-xu-2017-convolutional`) trained on sighan2005 msr dataset.' CTB6_CONVSEG = HANLP_URL + 'tok/ctb6_convseg_nowe_nocrf_20200110_004046.zip' 'Conv model (:cite:`wang-xu-2017-convolutional`) trained on CTB6 dataset.' PKU_NAME_MERGED_SIX_MONTHS_CONVSEG = HANLP_URL + 'tok/pku98_6m_conv_ngram_20200110_134736.zip' 'Conv model (:cite:`wang-xu-2017-convolutional`) trained on pku98 six months dataset with familiy name and given name merged into one unit.' LARGE_ALBERT_BASE = HANLP_URL + 'tok/large_corpus_cws_albert_base_20211228_160926.zip' 'ALBERT model (:cite:`Lan2020ALBERT:`) trained on the largest CWS dataset in the world.' SIGHAN2005_PKU_BERT_BASE_ZH = HANLP_URL + 'tok/sighan2005_pku_bert_base_zh_20201231_141130.zip' 'BERT model (:cite:`devlin-etal-2019-bert`) trained on sighan2005 pku dataset.' COARSE_ELECTRA_SMALL_ZH = HANLP_URL + 'tok/coarse_electra_small_20220616_012050.zip' 'Electra (:cite:`clark2020electra`) small model trained on coarse-grained CWS corpora. 
Its performance is ``P: 98.34% R: 98.38% F1: 98.36%`` which is ' \ 'much higher than that of MTL model ' FINE_ELECTRA_SMALL_ZH = HANLP_URL + 'tok/fine_electra_small_20220615_231803.zip' 'Electra (:cite:`clark2020electra`) small model trained on fine-grained CWS corpora. Its performance is ``P: 98.14% R: 98.07% F1: 98.11%`` which is ' \ 'much higher than that of MTL model ' CTB9_TOK_ELECTRA_SMALL = HANLP_URL + 'tok/ctb9_electra_small_20220215_205427.zip' 'Electra (:cite:`clark2020electra`) small model trained on CTB9. Its performance is P=97.15% R=97.36% F1=97.26% which is ' \ 'much higher than that of MTL model ' CTB9_TOK_ELECTRA_BASE = 'http://download.hanlp.com/tok/extra/ctb9_tok_electra_base_20220426_111949.zip' 'Electra (:cite:`clark2020electra`) base model trained on CTB9. Its performance is ``P: 97.62% R: 97.67% F1: 97.65%`` ' \ 'which is much higher than that of MTL model ' CTB9_TOK_ELECTRA_BASE_CRF = 'http://download.hanlp.com/tok/extra/ctb9_tok_electra_base_crf_20220426_161255.zip' 'Electra (:cite:`clark2020electra`) base model trained on CTB9. Its performance is ``P: 97.68% R: 97.71% F1: 97.69%`` ' \ 'which is much higher than that of MTL model ' MSR_TOK_ELECTRA_BASE_CRF = 'http://download.hanlp.com/tok/extra/msra_crf_electra_base_20220507_113936.zip' 'Electra (:cite:`clark2020electra`) base model trained on MSR CWS dataset. Its performance is ``P: 98.71% R: 98.64% F1: 98.68%`` ' \ 'which is much higher than that of MTL model ' UD_TOK_MMINILMV2L6 = HANLP_URL + 'tok/ud_tok_mMiniLMv2L6_no_space_mul_20220619_091824.zip' ''' mMiniLMv2 (:cite:`wang-etal-2021-minilmv2`) L6xH384 based tokenizer trained on UD 2.10. 
The following 130 languages are supported: ``Afrikaans, Akkadian, Akuntsu, Albanian, Amharic, AncientGreek (to 1453), Ancient Hebrew, Apurinã, Arabic, Armenian, AssyrianNeo-Aramaic, Bambara, Basque, Beja, Belarusian, Bengali, Bhojpuri, Breton, Bulgarian, Catalan, Cebuano, Central Siberian Yupik, Chinese, Chukot, ChurchSlavic, Coptic, Croatian, Czech, Danish, Dutch, Emerillon, English, Erzya, Estonian, Faroese, Finnish, French, Galician, German, Gothic, Guajajára, Guarani, Hebrew, Hindi, Hittite, Hungarian, Icelandic, Indonesian, Irish, Italian, Japanese, Javanese, K\'iche\', Kangri, Karelian, Karo(Brazil), Kazakh, Khunsari, Komi-Permyak, Komi-Zyrian, Korean, Latin, Latvian, Ligurian, LiteraryChinese, Lithuanian, Livvi, LowGerman, Madi, Makuráp, Maltese, Manx, Marathi, MbyáGuaraní, Modern Greek (1453-), Moksha, Mundurukú, Nayini, Neapolitan, Nigerian Pidgin, NorthernKurdish, Northern Sami, Norwegian, OldFrench (842-ca. 1400), OldRussian, Old Turkish, Persian, Polish, Portuguese, Romanian, Russia Buriat, Russian, Sanskrit, ScottishGaelic, Serbian, SkoltSami, Slovak, Slovenian, Soi, South Levantine Arabic, Spanish, Swedish, SwedishSign Language, SwissGerman, Tagalog, Tamil, Tatar, Telugu, Thai, Tupinambá, Turkish, Uighur, Ukrainian, Umbrian, UpperSorbian, Urdu, Urubú-Kaapor, Vietnamese, Warlpiri, Welsh, Western Armenian, WesternFrisian, Wolof, Xibe, Yakut, Yoruba, YueChinese``. Performance: ``P: 94.99% R: 94.74% F1: 94.86%``. ''' UD_TOK_MMINILMV2L12 = HANLP_URL + 'tok/ud_tok_mMiniLMv2L12_no_space_mul_20220619_091159.zip' ''' mMiniLMv2 (:cite:`wang-etal-2021-minilmv2`) L12xH384 based tokenizer trained on UD 2.10. 
The following 130 languages are supported: ``Afrikaans, Akkadian, Akuntsu, Albanian, Amharic, AncientGreek (to 1453), Ancient Hebrew, Apurinã, Arabic, Armenian, AssyrianNeo-Aramaic, Bambara, Basque, Beja, Belarusian, Bengali, Bhojpuri, Breton, Bulgarian, Catalan, Cebuano, Central Siberian Yupik, Chinese, Chukot, ChurchSlavic, Coptic, Croatian, Czech, Danish, Dutch, Emerillon, English, Erzya, Estonian, Faroese, Finnish, French, Galician, German, Gothic, Guajajára, Guarani, Hebrew, Hindi, Hittite, Hungarian, Icelandic, Indonesian, Irish, Italian, Japanese, Javanese, K\'iche\', Kangri, Karelian, Karo(Brazil), Kazakh, Khunsari, Komi-Permyak, Komi-Zyrian, Korean, Latin, Latvian, Ligurian, LiteraryChinese, Lithuanian, Livvi, LowGerman, Madi, Makuráp, Maltese, Manx, Marathi, MbyáGuaraní, Modern Greek (1453-), Moksha, Mundurukú, Nayini, Neapolitan, Nigerian Pidgin, NorthernKurdish, Northern Sami, Norwegian, OldFrench (842-ca. 1400), OldRussian, Old Turkish, Persian, Polish, Portuguese, Romanian, Russia Buriat, Russian, Sanskrit, ScottishGaelic, Serbian, SkoltSami, Slovak, Slovenian, Soi, South Levantine Arabic, Spanish, Swedish, SwedishSign Language, SwissGerman, Tagalog, Tamil, Tatar, Telugu, Thai, Tupinambá, Turkish, Uighur, Ukrainian, Umbrian, UpperSorbian, Urdu, Urubú-Kaapor, Vietnamese, Warlpiri, Welsh, Western Armenian, WesternFrisian, Wolof, Xibe, Yakut, Yoruba, YueChinese``. Performance: ``P: 95.41% R: 95.25% F1: 95.33%``. 
''' # Will be filled up during runtime ALL = {} ================================================ FILE: hanlp/pretrained/word2vec.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2019-12-21 18:25 from hanlp_common.constant import HANLP_URL CONVSEG_W2V_NEWS_TENSITE = HANLP_URL + 'embeddings/convseg_embeddings.zip' CONVSEG_W2V_NEWS_TENSITE_WORD_PKU = CONVSEG_W2V_NEWS_TENSITE + '#news_tensite.pku.words.w2v50' CONVSEG_W2V_NEWS_TENSITE_WORD_MSR = CONVSEG_W2V_NEWS_TENSITE + '#news_tensite.msr.words.w2v50' CONVSEG_W2V_NEWS_TENSITE_CHAR = CONVSEG_W2V_NEWS_TENSITE + '#news_tensite.w2v200' SEMEVAL16_EMBEDDINGS_CN = HANLP_URL + 'embeddings/semeval16_embeddings.zip' SEMEVAL16_EMBEDDINGS_300_NEWS_CN = SEMEVAL16_EMBEDDINGS_CN + '#news.fasttext.300.txt' SEMEVAL16_EMBEDDINGS_300_TEXT_CN = SEMEVAL16_EMBEDDINGS_CN + '#text.fasttext.300.txt' CTB5_FASTTEXT_300_CN = HANLP_URL + 'embeddings/ctb.fasttext.300.txt.zip' TENCENT_AILAB_EMBEDDING_SMALL_200 = 'https://ai.tencent.com/ailab/nlp/en/data/tencent-ailab-embedding-zh-d200-v0.2.0-s.tar.gz#tencent-ailab-embedding-zh-d200-v0.2.0-s.txt' 'Chinese word embeddings (:cite:`NIPS2013_9aa42b31`) with small vocabulary size and 200 dimension provided by Tencent AI lab.' TENCENT_AILAB_EMBEDDING_LARGE_200 = 'https://ai.tencent.com/ailab/nlp/en/data/tencent-ailab-embedding-zh-d200-v0.2.0.tar.gz#tencent-ailab-embedding-zh-d200-v0.2.0.txt' 'Chinese word embeddings (:cite:`NIPS2013_9aa42b31`) with large vocabulary size and 200 dimension provided by Tencent AI lab.' TENCENT_AILAB_EMBEDDING_SMALL_100 = 'https://ai.tencent.com/ailab/nlp/en/data/tencent-ailab-embedding-zh-d100-v0.2.0-s.tar.gz#tencent-ailab-embedding-zh-d100-v0.2.0-s.txt' 'Chinese word embeddings (:cite:`NIPS2013_9aa42b31`) with small vocabulary size and 100 dimension provided by Tencent AI lab.' 
# The bare string literal after each assignment is the constant's description.
TENCENT_AILAB_EMBEDDING_LARGE_100 = 'https://ai.tencent.com/ailab/nlp/en/data/tencent-ailab-embedding-zh-d100-v0.2.0.tar.gz#tencent-ailab-embedding-zh-d100-v0.2.0.txt'
'Chinese word embeddings (:cite:`NIPS2013_9aa42b31`) with large vocabulary size and 100 dimension provided by Tencent AI lab.'
MERGE_SGNS_BIGRAM_CHAR_300_ZH = 'http://download.hanlp.com/embeddings/extra/merge_sgns_bigram_char300_20220130_214613.txt.zip'
'Chinese word embeddings trained with context features (word, ngram, character, and more) using Skip-Gram with Negative Sampling (SGNS) (:cite:`li-etal-2018-analogical`).'
RADICAL_CHAR_EMBEDDING_100 = HANLP_URL + 'embeddings/radical_char_vec_20191229_013849.zip#character.vec.txt'
'Chinese character embedding enhanced with rich radical information (:cite:`he2018dual`).'

# One shared archive; each public constant appends '#<file name>' to its URL.
# NOTE(review): the '#' suffix presumably selects that file inside the archive
# - confirm against the code that resolves these identifiers.
_SUBWORD_ENCODING_CWS = 'http://download.hanlp.com/embeddings/extra/subword_encoding_cws_20200524_190636.zip'
SUBWORD_ENCODING_CWS_ZH_WIKI_BPE_50 = _SUBWORD_ENCODING_CWS + '#zh.wiki.bpe.vs200000.d50.w2v.txt'
SUBWORD_ENCODING_CWS_GIGAWORD_UNI = _SUBWORD_ENCODING_CWS + '#gigaword_chn.all.a2b.uni.ite50.vec'
SUBWORD_ENCODING_CWS_GIGAWORD_BI = _SUBWORD_ENCODING_CWS + '#gigaword_chn.all.a2b.bi.ite50.vec'
SUBWORD_ENCODING_CWS_CTB_GAZETTEER_50 = _SUBWORD_ENCODING_CWS + '#ctb.50d.vec'

# Empty here; presumably filled up during runtime like ALL in the sibling
# pretrained modules (confirm).
ALL = {}

================================================
FILE: hanlp/transform/__init__.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-29 22:24

================================================
FILE: hanlp/transform/conll_tf.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-05-08 15:30
from abc import abstractmethod
from collections import Counter
from typing import Union, Tuple, Iterable, Any, Generator
import numpy as np
import tensorflow as tf
from transformers import PreTrainedTokenizer, PretrainedConfig
from hanlp_common.constant import ROOT
from hanlp_common.structure import SerializableDict
from hanlp.common.transform_tf import Transform from hanlp.common.vocab_tf import VocabTF from hanlp.components.parsers.alg_tf import tolist, kmeans, randperm, arange from hanlp.components.parsers.conll import read_conll from hanlp_common.conll import CoNLLWord, CoNLLUWord, CoNLLSentence from hanlp.layers.transformers.utils_tf import config_is, adjust_tokens_for_transformers, convert_examples_to_features from hanlp.utils.log_util import logger from hanlp.utils.string_util import ispunct from hanlp_common.util import merge_locals_kwargs class CoNLLTransform(Transform): def __init__(self, config: SerializableDict = None, map_x=True, map_y=True, lower=True, n_buckets=32, min_freq=2, use_pos=True, **kwargs) -> None: super().__init__(**merge_locals_kwargs(locals(), kwargs)) self.form_vocab: VocabTF = None if use_pos: self.cpos_vocab: VocabTF = None self.rel_vocab: VocabTF = None self.puncts: tf.Tensor = None @property def use_pos(self): return self.config.get('use_pos', True) def x_to_idx(self, x) -> Union[tf.Tensor, Tuple]: form, cpos = x return self.form_vocab.token_to_idx_table.lookup(form), self.cpos_vocab.token_to_idx_table.lookup(cpos) def y_to_idx(self, y): head, rel = y return head, self.rel_vocab.token_to_idx_table.lookup(rel) def X_to_inputs(self, X: Union[tf.Tensor, Tuple[tf.Tensor]]) -> Iterable: if len(X) == 2: form_batch, cposes_batch = X mask = tf.not_equal(form_batch, 0) elif len(X) == 3: form_batch, cposes_batch, mask = X else: raise ValueError(f'Expect X to be 2 or 3 elements but got {repr(X)}') sents = [] for form_sent, cposes_sent, length in zip(form_batch, cposes_batch, tf.math.count_nonzero(mask, axis=-1)): forms = tolist(form_sent)[1:length + 1] cposes = tolist(cposes_sent)[1:length + 1] sents.append([(self.form_vocab.idx_to_token[f], self.cpos_vocab.idx_to_token[c]) for f, c in zip(forms, cposes)]) return sents def lock_vocabs(self): super().lock_vocabs() self.puncts = tf.constant([i for s, i in self.form_vocab.token_to_idx.items() if 
ispunct(s)], dtype=tf.int64) def file_to_inputs(self, filepath: str, gold=True): assert gold, 'only support gold file for now' use_pos = self.use_pos conllu = filepath.endswith('.conllu') for sent in read_conll(filepath): for i, cell in enumerate(sent): form = cell[1] cpos = cell[3] head = cell[6] deprel = cell[7] # if conllu: # deps = cell[8] # deps = [x.split(':', 1) for x in deps.split('|')] # heads = [int(x[0]) for x in deps if '_' not in x[0] and '.' not in x[0]] # rels = [x[1] for x in deps if '_' not in x[0] and '.' not in x[0]] # if head in heads: # offset = heads.index(head) # if not self.rel_vocab or rels[offset] in self.rel_vocab: # deprel = rels[offset] sent[i] = [form, cpos, head, deprel] if use_pos else [form, head, deprel] yield sent @property def bos(self): if self.form_vocab.idx_to_token is None: return ROOT return self.form_vocab.idx_to_token[2] def input_is_single_sample(self, input: Any) -> bool: if self.use_pos: return isinstance(input[0][0], str) if len(input[0]) else False else: return isinstance(input[0], str) if len(input[0]) else False @abstractmethod def batched_inputs_to_batches(self, corpus, indices, shuffle): pass def len_of_sent(self, sent): return 1 + len(sent) # take ROOT into account def samples_to_dataset(self, samples: Generator, map_x=None, map_y=None, batch_size=5000, shuffle=None, repeat=None, drop_remainder=False, prefetch=1, cache=True) -> tf.data.Dataset: if shuffle: def generator(): # custom bucketing, load corpus into memory corpus = list(x for x in (samples() if callable(samples) else samples)) lengths = [self.len_of_sent(i) for i in corpus] if len(corpus) < 32: n_buckets = 1 else: n_buckets = min(self.config.n_buckets, len(corpus)) buckets = dict(zip(*kmeans(lengths, n_buckets))) sizes, buckets = zip(*[ (size, bucket) for size, bucket in buckets.items() ]) # the number of chunks in each bucket, which is clipped by # range [1, len(bucket)] chunks = [min(len(bucket), max(round(size * len(bucket) / batch_size), 1)) for 
size, bucket in zip(sizes, buckets)] range_fn = randperm if shuffle else arange max_samples_per_batch = self.config.get('max_samples_per_batch', None) for i in tolist(range_fn(len(buckets))): split_sizes = [(len(buckets[i]) - j - 1) // chunks[i] + 1 for j in range(chunks[i])] # how many sentences in each batch for batch_indices in tf.split(range_fn(len(buckets[i])), split_sizes): indices = [buckets[i][j] for j in tolist(batch_indices)] if max_samples_per_batch: for j in range(0, len(indices), max_samples_per_batch): yield from self.batched_inputs_to_batches(corpus, indices[j:j + max_samples_per_batch], shuffle) else: yield from self.batched_inputs_to_batches(corpus, indices, shuffle) else: def generator(): # custom bucketing, load corpus into memory corpus = list(x for x in (samples() if callable(samples) else samples)) n_tokens = 0 batch = [] for idx, sent in enumerate(corpus): sent_len = self.len_of_sent(sent) if n_tokens + sent_len > batch_size and batch: yield from self.batched_inputs_to_batches(corpus, batch, shuffle) n_tokens = 0 batch = [] n_tokens += sent_len batch.append(idx) if batch: yield from self.batched_inputs_to_batches(corpus, batch, shuffle) # next(generator()) return Transform.samples_to_dataset(self, generator, False, False, 0, False, repeat, drop_remainder, prefetch, cache) class CoNLL_DEP_Transform(CoNLLTransform): def __init__(self, config: SerializableDict = None, map_x=True, map_y=True, lower=True, n_buckets=32, min_freq=2, **kwargs) -> None: super().__init__(config, map_x, map_y, lower, n_buckets, min_freq, **kwargs) def batched_inputs_to_batches(self, corpus, indices, shuffle): """Convert batched inputs to batches of samples Args: corpus(list): A list of inputs indices(list): A list of indices, each list belongs to a batch shuffle: Returns: """ raw_batch = [[], [], [], []] for idx in indices: for b in raw_batch: b.append([]) for cells in corpus[idx]: for b, c, v in zip(raw_batch, cells, [self.form_vocab, self.cpos_vocab, None, 
self.rel_vocab]): b[-1].append(v.get_idx_without_add(c) if v else c) batch = [] for b, v in zip(raw_batch, [self.form_vocab, self.cpos_vocab, None, self.rel_vocab]): b = tf.keras.preprocessing.sequence.pad_sequences(b, padding='post', value=v.safe_pad_token_idx if v else 0, dtype='int64') batch.append(b) assert len(batch) == 4 yield (batch[0], batch[1]), (batch[2], batch[3]) def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]: types = (tf.int64, tf.int64), (tf.int64, tf.int64) shapes = ([None, None], [None, None]), ([None, None], [None, None]) values = (self.form_vocab.safe_pad_token_idx, self.cpos_vocab.safe_pad_token_idx), ( 0, self.rel_vocab.safe_pad_token_idx) return types, shapes, values def inputs_to_samples(self, inputs, gold=False): token_mapping: dict = self.config.get('token_mapping', None) use_pos = self.config.get('use_pos', True) for sent in inputs: sample = [] for i, cell in enumerate(sent): if isinstance(cell, tuple): cell = list(cell) elif isinstance(cell, str): cell = [cell] if token_mapping: cell[0] = token_mapping.get(cell[0], cell[0]) if self.config['lower']: cell[0] = cell[0].lower() if not gold: cell += [0, self.rel_vocab.safe_pad_token] sample.append(cell) # insert root word with arbitrary fields, anyway it will be masked # form, cpos, head, deprel = sample[0] sample.insert(0, [self.bos, self.bos, 0, self.bos] if use_pos else [self.bos, 0, self.bos]) yield sample def XY_to_inputs_outputs(self, X: Union[tf.Tensor, Tuple[tf.Tensor]], Y: Union[tf.Tensor, Tuple[tf.Tensor]], gold=False, inputs=None, conll=True, arc_scores=None, rel_scores=None) -> Iterable: (words, feats, mask), (arc_preds, rel_preds) = X, Y if inputs is None: inputs = self.X_to_inputs(X) ys = self.Y_to_outputs((arc_preds, rel_preds, mask), inputs=inputs) sents = [] for x, y in zip(inputs, ys): sent = CoNLLSentence() for idx, (cell, (head, deprel)) in enumerate(zip(x, y)): if self.use_pos and not self.config.get('joint_pos', None): form, cpos = cell else: form, cpos 
= cell, None if conll: sent.append( CoNLLWord(id=idx + 1, form=form, cpos=cpos, head=head, deprel=deprel) if conll == '.conll' else CoNLLUWord(id=idx + 1, form=form, upos=cpos, head=head, deprel=deprel)) else: sent.append([head, deprel]) sents.append(sent) return sents def fit(self, trn_path: str, **kwargs) -> int: use_pos = self.config.use_pos self.form_vocab = VocabTF() self.form_vocab.add(ROOT) # make root the 2ed elements while 0th is pad, 1st is unk if self.use_pos: self.cpos_vocab = VocabTF(pad_token=None, unk_token=None) self.rel_vocab = VocabTF(pad_token=None, unk_token=None) num_samples = 0 counter = Counter() for sent in self.file_to_samples(trn_path, gold=True): num_samples += 1 for idx, cell in enumerate(sent): if use_pos: form, cpos, head, deprel = cell else: form, head, deprel = cell if idx == 0: root = form else: counter[form] += 1 if use_pos: self.cpos_vocab.add(cpos) self.rel_vocab.add(deprel) for token in [token for token, freq in counter.items() if freq >= self.config.min_freq]: self.form_vocab.add(token) return num_samples @property def root_rel_idx(self): root_rel_idx = self.config.get('root_rel_idx', None) if root_rel_idx is None: for idx, rel in enumerate(self.rel_vocab.idx_to_token): if 'root' in rel.lower() and rel != self.bos: self.config['root_rel_idx'] = root_rel_idx = idx break return root_rel_idx def Y_to_outputs(self, Y: Union[tf.Tensor, Tuple[tf.Tensor]], gold=False, inputs=None, X=None) -> Iterable: arc_preds, rel_preds, mask = Y sents = [] for arc_sent, rel_sent, length in zip(arc_preds, rel_preds, tf.math.count_nonzero(mask, axis=-1)): arcs = tolist(arc_sent)[1:length + 1] rels = tolist(rel_sent)[1:length + 1] sents.append([(a, self.rel_vocab.idx_to_token[r]) for a, r in zip(arcs, rels)]) return sents class CoNLL_Transformer_Transform(CoNLL_DEP_Transform): def __init__(self, config: SerializableDict = None, map_x=True, map_y=True, lower=True, n_buckets=32, min_freq=0, max_seq_length=256, use_pos=False, mask_p=None, graph=False, 
topk=None, **kwargs) -> None: super().__init__(**merge_locals_kwargs(locals(), kwargs)) self.tokenizer: PreTrainedTokenizer = None self.transformer_config: PretrainedConfig = None if graph: self.orphan_relation = ROOT def lock_vocabs(self): super().lock_vocabs() if self.graph: CoNLL_SDP_Transform._find_orphan_relation(self) def fit(self, trn_path: str, **kwargs) -> int: if self.config.get('joint_pos', None): self.config.use_pos = True if self.graph: # noinspection PyCallByClass num = CoNLL_SDP_Transform.fit(self, trn_path, **kwargs) else: num = super().fit(trn_path, **kwargs) if self.config.get('topk', None): counter = Counter() for sent in self.file_to_samples(trn_path, gold=True): for idx, cell in enumerate(sent): form, head, deprel = cell counter[form] += 1 self.topk_vocab = VocabTF() for k, v in counter.most_common(self.config.topk): self.topk_vocab.add(k) return num def inputs_to_samples(self, inputs, gold=False): if self.graph: yield from CoNLL_SDP_Transform.inputs_to_samples(self, inputs, gold) else: yield from super().inputs_to_samples(inputs, gold) def file_to_inputs(self, filepath: str, gold=True): if self.graph: yield from CoNLL_SDP_Transform.file_to_inputs(self, filepath, gold) else: yield from super().file_to_inputs(filepath, gold) @property def mask_p(self) -> float: return self.config.get('mask_p', None) @property def graph(self): return self.config.get('graph', None) def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]: mask_p = self.mask_p types = (tf.int64, (tf.int64, tf.int64, tf.int64)), (tf.bool if self.graph else tf.int64, tf.int64, tf.int64) if mask_p else ( tf.bool if self.graph else tf.int64, tf.int64) if self.graph: shapes = ([None, None], ([None, None], [None, None], [None, None])), ( [None, None, None], [None, None, None], [None, None]) if mask_p else ( [None, None, None], [None, None, None]) else: shapes = ([None, None], ([None, None], [None, None], [None, None])), ( [None, None], [None, None], [None, None]) if mask_p else 
([None, None], [None, None]) values = (self.form_vocab.safe_pad_token_idx, (0, 0, 0)), \ (0, self.rel_vocab.safe_pad_token_idx, 0) if mask_p else (0, self.rel_vocab.safe_pad_token_idx) types_shapes_values = types, shapes, values if self.use_pos: types_shapes_values = [((shapes[0][0], shapes[0][1] + (shapes[0][0],)), shapes[1]) for shapes in types_shapes_values] return types_shapes_values def X_to_inputs(self, X: Union[tf.Tensor, Tuple[tf.Tensor]]) -> Iterable: form_batch, feat, prefix_mask = X sents = [] for form_sent, length in zip(form_batch, tf.math.count_nonzero(prefix_mask, axis=-1)): forms = tolist(form_sent)[1:length + 1] sents.append([self.form_vocab.idx_to_token[f] for f in forms]) return sents def batched_inputs_to_batches(self, corpus, indices, shuffle): use_pos = self.use_pos if use_pos: raw_batch = [[], [], [], []] else: raw_batch = [[], [], []] if self.graph: max_len = len(max([corpus[i] for i in indices], key=len)) for idx in indices: arc = np.zeros((max_len, max_len), dtype=np.bool) rel = np.zeros((max_len, max_len), dtype=np.int64) for b in raw_batch[:2 if use_pos else 1]: b.append([]) for m, cells in enumerate(corpus[idx]): if use_pos: for b, c, v in zip(raw_batch, cells, [None, self.cpos_vocab]): b[-1].append(v.get_idx_without_add(c) if v else c) else: for b, c, v in zip(raw_batch, cells, [None]): b[-1].append(c) for n, r in zip(cells[-2], cells[-1]): arc[m, n] = True rid = self.rel_vocab.get_idx_without_add(r) if rid is None: logger.warning(f'Relation OOV: {r} not exists in train') continue rel[m, n] = rid raw_batch[-2].append(arc) raw_batch[-1].append(rel) else: for idx in indices: for s in raw_batch: s.append([]) for cells in corpus[idx]: if use_pos: for s, c, v in zip(raw_batch, cells, [None, self.cpos_vocab, None, self.rel_vocab]): s[-1].append(v.get_idx_without_add(c) if v else c) else: for s, c, v in zip(raw_batch, cells, [None, None, self.rel_vocab]): s[-1].append(v.get_idx_without_add(c) if v else c) # Transformer tokenizing config = 
self.transformer_config tokenizer = self.tokenizer xlnet = config_is(config, 'xlnet') roberta = config_is(config, 'roberta') pad_token = tokenizer.pad_token pad_token_id = tokenizer.convert_tokens_to_ids([pad_token])[0] cls_token = tokenizer.cls_token sep_token = tokenizer.sep_token max_seq_length = self.config.max_seq_length batch_forms = [] batch_input_ids = [] batch_input_mask = [] batch_prefix_offset = [] mask_p = self.mask_p if mask_p: batch_masked_offsets = [] mask_token_id = tokenizer.mask_token_id for sent_idx, sent in enumerate(raw_batch[0]): batch_forms.append([self.form_vocab.get_idx_without_add(token) for token in sent]) sent = adjust_tokens_for_transformers(sent) sent = sent[1:] # remove use [CLS] instead pad_label_idx = self.form_vocab.pad_idx input_ids, input_mask, segment_ids, prefix_mask = \ convert_examples_to_features(sent, max_seq_length, tokenizer, cls_token_at_end=xlnet, # xlnet has a cls token at the end cls_token=cls_token, cls_token_segment_id=2 if xlnet else 0, sep_token=sep_token, sep_token_extra=roberta, # roberta uses an extra separator b/w pairs of sentences, cf. 
github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805 pad_on_left=xlnet, # pad on the left for xlnet pad_token_id=pad_token_id, pad_token_segment_id=4 if xlnet else 0, pad_token_label_id=pad_label_idx, do_padding=False) num_masks = sum(prefix_mask) # assert len(sent) == num_masks # each token has a True subtoken if num_masks < len(sent): # long sent gets truncated, +1 for root batch_forms[-1] = batch_forms[-1][:num_masks + 1] # form raw_batch[-1][sent_idx] = raw_batch[-1][sent_idx][:num_masks + 1] # head raw_batch[-2][sent_idx] = raw_batch[-2][sent_idx][:num_masks + 1] # rel raw_batch[-3][sent_idx] = raw_batch[-3][sent_idx][:num_masks + 1] # pos prefix_mask[0] = True # is now [CLS] prefix_offset = [idx for idx, m in enumerate(prefix_mask) if m] batch_input_ids.append(input_ids) batch_input_mask.append(input_mask) batch_prefix_offset.append(prefix_offset) if mask_p: if shuffle: size = int(np.ceil(mask_p * len(prefix_offset[1:]))) # never mask [CLS] mask_offsets = np.random.choice(np.arange(1, len(prefix_offset)), size, replace=False) for offset in sorted(mask_offsets): assert 0 < offset < len(input_ids) # mask_word = raw_batch[0][sent_idx][offset] # mask_prefix = tokenizer.convert_ids_to_tokens([input_ids[prefix_offset[offset]]])[0] # assert mask_word.startswith(mask_prefix) or mask_prefix.startswith( # mask_word) or mask_prefix == "'", \ # f'word {mask_word} prefix {mask_prefix} not match' # could vs couldn # mask_offsets.append(input_ids[offset]) # subword token # mask_offsets.append(offset) # form token input_ids[prefix_offset[offset]] = mask_token_id # mask prefix # whole word masking, mask the rest of the word for i in range(prefix_offset[offset] + 1, len(input_ids) - 1): if prefix_mask[i]: break input_ids[i] = mask_token_id batch_masked_offsets.append(sorted(mask_offsets)) else: batch_masked_offsets.append([0]) # No masking in prediction batch_forms = tf.keras.preprocessing.sequence.pad_sequences(batch_forms, padding='post', 
value=self.form_vocab.safe_pad_token_idx, dtype='int64') batch_input_ids = tf.keras.preprocessing.sequence.pad_sequences(batch_input_ids, padding='post', value=pad_token_id, dtype='int64') batch_input_mask = tf.keras.preprocessing.sequence.pad_sequences(batch_input_mask, padding='post', value=0, dtype='int64') batch_prefix_offset = tf.keras.preprocessing.sequence.pad_sequences(batch_prefix_offset, padding='post', value=0, dtype='int64') batch_heads = tf.keras.preprocessing.sequence.pad_sequences(raw_batch[-2], padding='post', value=0, dtype='int64') batch_rels = tf.keras.preprocessing.sequence.pad_sequences(raw_batch[-1], padding='post', value=self.rel_vocab.safe_pad_token_idx, dtype='int64') if mask_p: batch_masked_offsets = tf.keras.preprocessing.sequence.pad_sequences(batch_masked_offsets, padding='post', value=pad_token_id, dtype='int64') feats = (tf.constant(batch_input_ids, dtype='int64'), tf.constant(batch_input_mask, dtype='int64'), tf.constant(batch_prefix_offset)) if use_pos: batch_pos = tf.keras.preprocessing.sequence.pad_sequences(raw_batch[1], padding='post', value=self.cpos_vocab.safe_pad_token_idx, dtype='int64') feats += (batch_pos,) yield (batch_forms, feats), \ (batch_heads, batch_rels, batch_masked_offsets) if mask_p else (batch_heads, batch_rels) def len_of_sent(self, sent): # Transformer tokenizing config = self.transformer_config tokenizer = self.tokenizer xlnet = config_is(config, 'xlnet') roberta = config_is(config, 'roberta') pad_token = tokenizer.pad_token pad_token_id = tokenizer.convert_tokens_to_ids([pad_token])[0] cls_token = tokenizer.cls_token sep_token = tokenizer.sep_token max_seq_length = self.config.max_seq_length sent = sent[1:] # remove use [CLS] instead pad_label_idx = self.form_vocab.pad_idx sent = [x[0] for x in sent] sent = adjust_tokens_for_transformers(sent) input_ids, input_mask, segment_ids, prefix_mask = \ convert_examples_to_features(sent, max_seq_length, tokenizer, cls_token_at_end=xlnet, # xlnet has a cls token at 
the end cls_token=cls_token, cls_token_segment_id=2 if xlnet else 0, sep_token=sep_token, sep_token_extra=roberta, # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805 pad_on_left=xlnet, # pad on the left for xlnet pad_token_id=pad_token_id, pad_token_segment_id=4 if xlnet else 0, pad_token_label_id=pad_label_idx, do_padding=False) return len(input_ids) def samples_to_dataset(self, samples: Generator, map_x=None, map_y=None, batch_size=5000, shuffle=None, repeat=None, drop_remainder=False, prefetch=1, cache=True) -> tf.data.Dataset: if shuffle: return CoNLL_DEP_Transform.samples_to_dataset(self, samples, map_x, map_y, batch_size, shuffle, repeat, drop_remainder, prefetch, cache) def generator(): # custom bucketing, load corpus into memory corpus = list(x for x in (samples() if callable(samples) else samples)) n_tokens = 0 batch = [] for idx, sent in enumerate(corpus): sent_len = self.len_of_sent(sent) if n_tokens + sent_len > batch_size and batch: yield from self.batched_inputs_to_batches(corpus, batch, shuffle) n_tokens = 0 batch = [] n_tokens += sent_len batch.append(idx) if batch: yield from self.batched_inputs_to_batches(corpus, batch, shuffle) # debug for transformer # next(generator()) return Transform.samples_to_dataset(self, generator, False, False, 0, False, repeat, drop_remainder, prefetch, cache) def Y_to_outputs(self, Y: Union[tf.Tensor, Tuple[tf.Tensor]], gold=False, inputs=None, X=None) -> Iterable: if self.graph: ys = CoNLL_SDP_Transform.Y_to_outputs(self, Y, gold, inputs, X) ys = [[([t[0] for t in l], [t[1] for t in l]) for l in y] for y in ys] return ys return super().Y_to_outputs(Y, gold, inputs, X) class CoNLL_SDP_Transform(CoNLLTransform): def __init__(self, config: SerializableDict = None, map_x=True, map_y=True, lower=True, n_buckets=32, min_freq=2, use_pos=True, **kwargs) -> None: super().__init__(config, map_x, map_y, lower, n_buckets, min_freq, use_pos, 
**kwargs) self.orphan_relation = ROOT def lock_vocabs(self): super().lock_vocabs() # heuristic to find the orphan relation self._find_orphan_relation() def _find_orphan_relation(self): for rel in self.rel_vocab.idx_to_token: if 'root' in rel.lower(): self.orphan_relation = rel break def file_to_inputs(self, filepath: str, gold=True): assert gold, 'only support gold file for now' use_pos = self.use_pos conllu = filepath.endswith('.conllu') enhanced_only = self.config.get('enhanced_only', None) for i, sent in enumerate(read_conll(filepath)): parsed_sent = [] if conllu: for cell in sent: ID = cell[0] form = cell[1] cpos = cell[3] head = cell[6] deprel = cell[7] deps = cell[8] deps = [x.split(':', 1) for x in deps.split('|')] heads = [int(x[0]) for x in deps if x[0].isdigit()] rels = [x[1] for x in deps if x[0].isdigit()] if enhanced_only: if head in heads: offset = heads.index(head) heads.pop(offset) rels.pop(offset) else: if head not in heads: heads.append(head) rels.append(deprel) parsed_sent.append([form, cpos, heads, rels] if use_pos else [form, heads, rels]) else: prev_cells = None heads = [] rels = [] for j, cell in enumerate(sent): ID = cell[0] form = cell[1] cpos = cell[3] head = cell[6] deprel = cell[7] if prev_cells and ID != prev_cells[0]: # found end of token parsed_sent.append( [prev_cells[1], prev_cells[2], heads, rels] if use_pos else [prev_cells[1], heads, rels]) heads = [] rels = [] heads.append(head) rels.append(deprel) prev_cells = [ID, form, cpos, head, deprel] if use_pos else [ID, form, head, deprel] parsed_sent.append( [prev_cells[1], prev_cells[2], heads, rels] if use_pos else [prev_cells[1], heads, rels]) yield parsed_sent def fit(self, trn_path: str, **kwargs) -> int: self.form_vocab = VocabTF() self.form_vocab.add(ROOT) # make root the 2ed elements while 0th is pad, 1st is unk if self.use_pos: self.cpos_vocab = VocabTF(pad_token=None, unk_token=None) self.rel_vocab = VocabTF(pad_token=None, unk_token=None) num_samples = 0 counter = Counter() 
for sent in self.file_to_samples(trn_path, gold=True): num_samples += 1 for idx, cell in enumerate(sent): if len(cell) == 4: form, cpos, head, deprel = cell elif len(cell) == 3: if self.use_pos: form, cpos = cell[0] else: form = cell[0] head, deprel = cell[1:] else: raise ValueError('Unknown data arrangement') if idx == 0: root = form else: counter[form] += 1 if self.use_pos: self.cpos_vocab.add(cpos) self.rel_vocab.update(deprel) for token in [token for token, freq in counter.items() if freq >= self.config.min_freq]: self.form_vocab.add(token) return num_samples def inputs_to_samples(self, inputs, gold=False): use_pos = self.use_pos for sent in inputs: sample = [] for i, cell in enumerate(sent): if isinstance(cell, tuple): cell = list(cell) elif isinstance(cell, str): cell = [cell] if self.config['lower']: cell[0] = cell[0].lower() if not gold: cell += [[0], [self.rel_vocab.safe_pad_token]] sample.append(cell) # insert root word with arbitrary fields, anyway it will be masked if use_pos: form, cpos, head, deprel = sample[0] sample.insert(0, [self.bos, self.bos, [0], deprel]) else: form, head, deprel = sample[0] sample.insert(0, [self.bos, [0], deprel]) yield sample def batched_inputs_to_batches(self, corpus, indices, shuffle): use_pos = self.use_pos raw_batch = [[], [], [], []] if use_pos else [[], [], []] max_len = len(max([corpus[i] for i in indices], key=len)) for idx in indices: arc = np.zeros((max_len, max_len), dtype=bool) rel = np.zeros((max_len, max_len), dtype=np.int64) for b in raw_batch[:2]: b.append([]) for m, cells in enumerate(corpus[idx]): if use_pos: for b, c, v in zip(raw_batch, cells, [self.form_vocab, self.cpos_vocab]): b[-1].append(v.get_idx_without_add(c)) else: for b, c, v in zip(raw_batch, cells, [self.form_vocab]): b[-1].append(v.get_idx_without_add(c)) for n, r in zip(cells[-2], cells[-1]): arc[m, n] = True rid = self.rel_vocab.get_idx_without_add(r) if rid is None: logger.warning(f'Relation OOV: {r} not exists in train') continue rel[m, 
n] = rid raw_batch[-2].append(arc) raw_batch[-1].append(rel) batch = [] for b, v in zip(raw_batch, [self.form_vocab, self.cpos_vocab]): b = tf.keras.preprocessing.sequence.pad_sequences(b, padding='post', value=v.safe_pad_token_idx, dtype='int64') batch.append(b) batch += raw_batch[2:] assert len(batch) == 4 yield (batch[0], batch[1]), (batch[2], batch[3]) def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]: types = (tf.int64, tf.int64), (tf.bool, tf.int64) shapes = ([None, None], [None, None]), ([None, None, None], [None, None, None]) values = (self.form_vocab.safe_pad_token_idx, self.cpos_vocab.safe_pad_token_idx), ( False, self.rel_vocab.safe_pad_token_idx) return types, shapes, values def Y_to_outputs(self, Y: Union[tf.Tensor, Tuple[tf.Tensor]], gold=False, inputs=None, X=None) -> Iterable: arc_preds, rel_preds, mask = Y sents = [] for arc_sent, rel_sent, length in zip(arc_preds, rel_preds, tf.math.count_nonzero(mask, axis=-1)): sent = [] for arc, rel in zip(tolist(arc_sent[1:, 1:]), tolist(rel_sent[1:, 1:])): ar = [] for idx, (a, r) in enumerate(zip(arc, rel)): if a: ar.append((idx + 1, self.rel_vocab.idx_to_token[r])) if not ar: # orphan ar.append((0, self.orphan_relation)) sent.append(ar) sents.append(sent) return sents def XY_to_inputs_outputs(self, X: Union[tf.Tensor, Tuple[tf.Tensor]], Y: Union[tf.Tensor, Tuple[tf.Tensor]], gold=False, inputs=None, conll=True) -> Iterable: (words, feats, mask), (arc_preds, rel_preds) = X, Y xs = inputs ys = self.Y_to_outputs((arc_preds, rel_preds, mask)) sents = [] for x, y in zip(xs, ys): sent = CoNLLSentence() for idx, ((form, cpos), pred) in enumerate(zip(x, y)): head = [p[0] for p in pred] deprel = [p[1] for p in pred] if conll: sent.append(CoNLLWord(id=idx + 1, form=form, cpos=cpos, head=head, deprel=deprel)) else: sent.append([head, deprel]) sents.append(sent) return sents ================================================ FILE: hanlp/transform/glue_tf.py ================================================ 
# -*- coding:utf-8 -*- # Author: hankcs # Date: 2020-05-08 16:34 from hanlp_common.structure import SerializableDict from hanlp.datasets.glu.glue import STANFORD_SENTIMENT_TREEBANK_2_TRAIN, MICROSOFT_RESEARCH_PARAPHRASE_CORPUS_DEV from hanlp.transform.table_tf import TableTransform class StanfordSentimentTreebank2Transorm(TableTransform): pass class MicrosoftResearchParaphraseCorpus(TableTransform): def __init__(self, config: SerializableDict = None, map_x=False, map_y=True, x_columns=(3, 4), y_column=0, skip_header=True, delimiter='auto', **kwargs) -> None: super().__init__(config, map_x, map_y, x_columns, y_column, skip_header, delimiter, **kwargs) def main(): # _test_sst2() _test_mrpc() def _test_sst2(): transform = StanfordSentimentTreebank2Transorm() transform.fit(STANFORD_SENTIMENT_TREEBANK_2_TRAIN) transform.lock_vocabs() transform.label_vocab.summary() transform.build_config() dataset = transform.file_to_dataset(STANFORD_SENTIMENT_TREEBANK_2_TRAIN) for batch in dataset.take(1): print(batch) def _test_mrpc(): transform = MicrosoftResearchParaphraseCorpus() transform.fit(MICROSOFT_RESEARCH_PARAPHRASE_CORPUS_DEV) transform.lock_vocabs() transform.label_vocab.summary() transform.build_config() dataset = transform.file_to_dataset(MICROSOFT_RESEARCH_PARAPHRASE_CORPUS_DEV) for batch in dataset.take(1): print(batch) ================================================ FILE: hanlp/transform/table_tf.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2019-11-10 21:00 from abc import ABC from typing import Tuple, Union import numpy as np import tensorflow as tf from hanlp_common.structure import SerializableDict from hanlp.common.transform_tf import Transform from hanlp_common.constant import PAD from hanlp.common.vocab_tf import create_label_vocab from hanlp.utils.io_util import read_cells from hanlp.utils.log_util import logger class TableTransform(Transform, ABC): def __init__(self, config: SerializableDict = None, 
map_x=False, map_y=True, x_columns=None, y_column=-1, multi_label=False, skip_header=True, delimiter='auto', **kwargs) -> None: super().__init__(config, map_x, map_y, x_columns=x_columns, y_column=y_column, multi_label=multi_label, skip_header=skip_header, delimiter=delimiter, **kwargs) self.label_vocab = create_label_vocab() def file_to_inputs(self, filepath: str, gold=True): x_columns = self.config.x_columns y_column = self.config.y_column num_features = self.config.get('num_features', None) for cells in read_cells(filepath, skip_header=self.config.skip_header, delimiter=self.config.delimiter): #multi-label: Dataset in .tsv format: x_columns: at most 2 columns being a sentence pair while in most # cases just one column being the doc content. y_column being the single label, which shall be modified # to load a list of labels. if x_columns: inputs = tuple(c for i, c in enumerate(cells) if i in x_columns), cells[y_column] else: if y_column != -1: cells[-1], cells[y_column] = cells[y_column], cells[-1] inputs = tuple(cells[:-1]), cells[-1] if num_features is None: num_features = len(inputs[0]) self.config.num_features = num_features # multi-label support if self.config.get('multi_label', None): assert type(inputs[1]) is str, 'Y value has to be string' if inputs[1][0] == '[': # multi-label is in literal form of a list labels = eval(inputs[1]) else: labels = inputs[1].strip().split(',') inputs = inputs[0], labels else: assert num_features == len(inputs[0]), f'Numbers of columns {num_features} ' \ f'inconsistent with current {len(inputs[0])}' yield inputs def inputs_to_samples(self, inputs, gold=False): pad = self.label_vocab.safe_pad_token for cells in inputs: if gold: yield cells else: yield cells, pad def y_to_idx(self, y) -> tf.Tensor: return self.label_vocab.lookup(y) def fit(self, trn_path: str, **kwargs): samples = 0 for t in self.file_to_samples(trn_path, gold=True): if self.config.get('multi_label', None): for l in t[1]: self.label_vocab.add(l) else: 
def get_positions(start_idx, end_idx, length):
    """Get subj/obj position sequence.

    Every index of a sequence is mapped to its signed distance from the span ``[start_idx, end_idx]``:
    negative before the span, ``0`` inside it and positive after it.

    Args:
        start_idx: Index of the first element of the span.
        end_idx: Index of the last element of the span (inclusive).
        length: Length of the whole sequence.

    Returns:
        A list of ``length`` relative positions, e.g. ``[-2, -1, 0, 0, 1, 2]`` for a span ``(2, 3)`` in a
        sequence of length 6.
    """
    before_span = [index - start_idx for index in range(start_idx)]
    inside_span = [0 for _ in range(start_idx, end_idx + 1)]
    after_span = [index - end_idx for index in range(end_idx + 1, length)]
    return before_span + inside_span + after_span
class TextTransform(Transform):
    """Transform turning a plain text file into ``(x, y)`` windows where ``y`` is ``x`` shifted by one token."""

    def __init__(self,
                 forward=True,
                 seq_len=10,
                 tokenizer='char',
                 config: SerializableDict = None, map_x=True, map_y=True, **kwargs) -> None:
        """Store the windowing options in the config.

        Args:
            forward: ``True`` to read the file with ``open``, ``False`` to read it with
                ``FileReadBackwards`` and reverse each sample in :meth:`inputs_to_samples`.
            seq_len: Number of tokens in each window.
            tokenizer: ``'char'`` to split a line into characters, ``'whitespace'`` to split on whitespace,
                any other string is used as the separator passed to ``str.split``.
            config: Passed to the parent constructor.
            map_x: Passed to the parent constructor.
            map_y: Passed to the parent constructor.
            **kwargs: Extra options passed to the parent constructor.
        """
        super().__init__(config, map_x, map_y, seq_len=seq_len, tokenizer=tokenizer, forward=forward, **kwargs)
        # Built by ``fit``.
        self.vocab: VocabTF = None

    def tokenize_func(self):
        """Return the callable that splits a line into tokens according to ``self.config.tokenizer``."""
        if self.config.tokenizer == 'char':
            return list
        elif self.config.tokenizer == 'whitespace':
            return lambda x: x.split()
        else:
            return lambda x: x.split(self.config.tokenizer)

    def fit(self, trn_path: str, **kwargs) -> int:
        """Build a fresh vocabulary from the windows of ``trn_path``.

        Args:
            trn_path: Path of the training file.
            **kwargs: Not used.

        Returns:
            The number of windows read from the file.
        """
        self.vocab = VocabTF()
        num_samples = 0
        for x, _ in self.file_to_inputs(trn_path):
            self.vocab.update(x)
            num_samples += 1
        return num_samples

    def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]:
        """Return the types, shapes and padding values of ``(x, y)``; both are string sequences."""
        types = tf.string, tf.string
        shapes = [None], [None]
        defaults = self.vocab.pad_token, self.vocab.pad_token
        return types, shapes, defaults

    def file_to_inputs(self, filepath: str, gold=True):
        """Slide a window of ``self.config.seq_len`` tokens over the file, one token at a time.

        Tokens of consecutive lines are concatenated, so windows may span line boundaries.

        Args:
            filepath: Path of the text file.
            gold: Not used by this implementation.

        Yields:
            ``(x, y)`` where ``x`` is a window of ``seq_len`` tokens and ``y`` is the window starting one
            token later.
        """
        forward = self.config.forward
        seq_len = self.config.seq_len
        buffer = []
        tokenizer = self.tokenize_func()
        with open(filepath, encoding='utf-8') if forward else FileReadBackwards(filepath, encoding="utf-8") as src:
            for line in src:
                buffer += tokenizer(line)
                # Emit all complete windows by advancing an index and trim the consumed prefix once.
                # Popping the head of the list after every window shifts the whole buffer each time,
                # which becomes quadratic when a line contains many tokens.
                num_windows = len(buffer) - seq_len
                if num_windows <= 0:
                    continue
                for start in range(num_windows):
                    yield buffer[start:start + seq_len], buffer[start + 1:start + 1 + seq_len]
                del buffer[:num_windows]

    def inputs_to_samples(self, inputs, gold=False):
        """Turn inputs into ``(x, y)`` samples.

        Args:
            inputs: An iterable of ``(x, y)`` pairs when ``gold`` is true, otherwise an iterable of token
                sequences which are used as both ``x`` and ``y``.
            gold: Whether each input already carries its ``y``.

        Yields:
            ``(x, y)`` pairs, each reversed when ``self.config.forward`` is false.
        """
        forward = self.config.forward
        for t in inputs:
            if gold:
                x, y = t
            else:
                x, y = t, t
            if not forward:
                x = list(reversed(x))
                y = list(reversed(y))
            yield x, y

    def x_to_idx(self, x) -> Union[tf.Tensor, Tuple]:
        """Look up ``x`` in the vocabulary."""
        return self.vocab.lookup(x)

    def y_to_idx(self, y) -> tf.Tensor:
        """Look up ``y`` in the same vocabulary as ``x``."""
        return self.x_to_idx(y)

    def Y_to_outputs(self, Y: Union[tf.Tensor, Tuple[tf.Tensor]], gold=False, inputs=None,
                     **kwargs) -> Iterable:
        """Convert scores into token sequences by taking the argmax over the last axis.

        Args:
            Y: Scores whose last axis ranges over the vocabulary.
            gold: Not used by this implementation.
            inputs: The inputs that produced ``Y``; only used to bound how many predictions are yielded.
            **kwargs: Not used.

        Yields:
            One list of tokens per input.
        """
        pred = tf.argmax(Y, axis=-1)
        for ys, _ in zip(pred, inputs):
            yield [self.vocab.idx_to_token[int(y)] for y in ys]

    def input_is_single_sample(self, input: Any) -> bool:
        """Return ``True`` if the first element of ``input`` is a string."""
        return isinstance(input[0], str)
class TransformerTokenizer(object):
    """Base class holding the length limit shared by the transformer tokenizers and a sliding-window helper."""

    def __init__(self, max_seq_length=512, truncate_long_sequences=True) -> None:
        """
        Args:
            max_seq_length: Maximum number of wordpieces in one window, special pieces included.
            truncate_long_sequences: Stored on the instance for subclasses to decide between truncation
                and sliding window.
        """
        self.truncate_long_sequences = truncate_long_sequences
        self.max_seq_length = max_seq_length

    def sliding_window(self, flat_wordpiece_ids, same_tail=True):
        """Split a long sequence of wordpiece ids into overlapping windows and concatenate them.

        The first id is repeated at the head of every window and, if ``same_tail`` is true, the last id is
        repeated at the tail of every window. Windows advance by half of the window length. The last window
        is dropped when it is fully contained in the tail of the one before it.

        Args:
            flat_wordpiece_ids: List of wordpiece ids, starting with a special piece (e.g. ``[CLS]``).
            same_tail: ``True`` if the last id is a special piece (e.g. ``[SEP]``) that shall close every
                window.

        Returns:
            A flat list with the ids of all kept windows, one after another.

        Raises:
            ValueError: If ``max_seq_length`` leaves no room for any id between the special pieces.
        """
        start_piece_ids = flat_wordpiece_ids[:1]
        if same_tail:
            inner_ids, end_piece_ids = flat_wordpiece_ids[1:-1], flat_wordpiece_ids[-1:]
        else:
            inner_ids, end_piece_ids = flat_wordpiece_ids[1:], []
        if len(inner_ids) == 0:
            # Nothing between the special pieces, so there is nothing to window.
            return list(flat_wordpiece_ids)

        window_length = self.max_seq_length - len(start_piece_ids) - len(end_piece_ids)
        if window_length < 1:
            raise ValueError(f'max_seq_length = {self.max_seq_length} is too small to fit any wordpiece '
                             f'between the special pieces.')
        # A window of length 1 would otherwise produce a stride of 0, which ``range`` rejects.
        stride = max(window_length // 2, 1)
        wordpiece_windows = [start_piece_ids + inner_ids[i:i + window_length] + end_piece_ids
                             for i in range(0, len(inner_ids), stride)]

        # Check for overlap in the last window. Throw it away if it is redundant. A short input yields
        # a single window, in which case there is no previous window to compare against.
        if len(wordpiece_windows) > 1:
            last_window = wordpiece_windows[-1][len(start_piece_ids):]
            penultimate_window = wordpiece_windows[-2]
            if last_window == penultimate_window[-len(last_window):]:
                wordpiece_windows = wordpiece_windows[:-1]

        return [wordpiece for sequence in wordpiece_windows for wordpiece in sequence]
cls_token_at_end=False, cls_token_segment_id=0, pad_token_segment_id=0, pad_on_left=False, do_padding=False, sep_token_extra=False, ret_mask_and_type=False, ret_prefix_mask=False, ret_token_span=True, ret_subtokens=False, ret_subtokens_group=False, cls_is_bos=False, sep_is_eos=False, do_basic_tokenize=True, use_fast=True, dict_force=None, strip_cls_sep=True, check_space_before=None, ) -> None: """A transformer tokenizer for token-level tasks. It honors the boundary of tokens and tokenize each token into several subtokens then merge them. The information about each subtoken belongs to which token are kept and returned as a new field in the sample. It also provides out-of-box sliding window trick on long sequences. Args: tokenizer: The identifier of a pre-trained tokenizer or a ``PreTrainedTokenizer``. input_key: The token key in samples. output_key: The output keys to store results. max_seq_length: Sentences longer than ``max_seq_len`` will be split into shorter ones if possible. truncate_long_sequences: ``True`` to truncate exceeded parts of long sequences. ``False`` to enable sliding window. config: The ``PretrainedConfig`` to determine the model structure of the transformer, so that special tokenization can be applied. cls_token_at_end: ``True`` to put ``[CLS]`` at the end of input tokens. cls_token_segment_id: The id of ``[CLS]``. pad_token_segment_id: The id of ``[SEP]``. pad_on_left: ``True`` to put ``[PAD]`` at the left side of input tokens. do_padding: ``True`` to pad sequence to the left. sep_token_extra: ``True`` to have two ``[SEP]``. ret_mask_and_type: ``True`` to return masks and type ids. ret_prefix_mask: ``True`` to generate a mask where each non-zero element corresponds to a prefix of a token. ret_token_span: ``True`` to return span of each token measured by subtoken offsets. ret_subtokens: ``True`` to return list of subtokens belonging to each token for tokenization purpose. 
When enabled, the prefix mask for each subtoken is set to True as each subtoken is a token unit in tokenization task. Similarity, the token span for each token will be a continuous integer sequence. ret_subtokens_group: ``True`` to return list of offsets of subtokens belonging to each token. cls_is_bos: ``True`` means the first token of input is treated as [CLS] no matter what its surface form is. ``False`` (default) means the first token is not [CLS], it will have its own embedding other than the embedding of [CLS]. sep_is_eos: ``True`` means the last token of input is [SEP]. ``False`` means it's not but [SEP] will be appended, ``None`` means it dependents on `input[-1] == [EOS]`. do_basic_tokenize: Whether to do basic tokenization before wordpiece. use_fast: Whether or not to try to load the fast version of the tokenizer. dict_force: A dictionary doing longest-prefix-match on input text so that the head and tail of each keyword won't be concatenated to other tokens by transformer tokenizers. strip_cls_sep: ``True`` to strip [CLS] and [SEP] off the input tokens. check_space_before: ``True`` to detect the space before each token to handle underline in sentence piece tokenization. Examples: .. highlight:: python .. code-block:: python transform = TransformerSequenceTokenizer('bert-base-uncased', 'token') sample = {'token': 'HanLP good'.split()} print(transform(sample)) """ super().__init__(max_seq_length, truncate_long_sequences) tokenizer_name = tokenizer if isinstance(tokenizer, str) else tokenizer.name_or_path if check_space_before is None: # These tokenizer is BPE-based which appends a space before each token and tokenizes loving into # ['▁lo', 'ving'], tokenize 商品 into ['▁', '商品']. 
For the later case, the prefix '▁' has to be removed # as there is no space between some languages like Chinese check_space_before = tokenizer_name in ('xlm-roberta-base', 'xlm-roberta-large', 'google/mt5-small', 'google/mt5-base', 'xlm-roberta-base-no-space', 'mMiniLMv2L6-no-space', 'mMiniLMv2L12-no-space') self.check_space_before = check_space_before self.ret_subtokens_group = ret_subtokens_group self.ret_subtokens = ret_subtokens self.sep_is_eos = sep_is_eos self.ret_prefix_mask = ret_prefix_mask self.ret_mask_and_type = ret_mask_and_type self.cls_is_bos = cls_is_bos self.ret_token_span = ret_token_span if not output_key or isinstance(output_key, str): suffixes = ['input_ids'] if ret_mask_and_type: suffixes += 'attention_mask', 'token_type_ids' if ret_prefix_mask: suffixes += ['prefix_mask'] if ret_token_span: suffixes.append('token_span') if output_key is None: output_key = [f'{input_key}_{key}' for key in suffixes] elif output_key == '': output_key = suffixes else: output_key = [f'{output_key}_{key}' for key in suffixes] self.input_key = input_key self.output_key = output_key if config: xlnet = config_is(config, 'xlnet') pad_token_segment_id = 4 if xlnet else 0 cls_token_segment_id = 2 if xlnet else 0 cls_token_at_end = xlnet pad_on_left = xlnet if isinstance(tokenizer, str): tokenizer = AutoTokenizer_.from_pretrained(tokenizer, use_fast=use_fast, do_basic_tokenize=do_basic_tokenize) if use_fast: # Dirty fix upstream bug: https://github.com/hankcs/HanLP/issues/1602 if hasattr(tokenizer, '_tokenizer') and hasattr(tokenizer._tokenizer, 'no_truncation'): _t = tokenizer._tokenizer _t.no_truncation() _t.no_padding() _t.no_truncation = _t.no_padding = lambda: None pad_token = tokenizer.pad_token self.pad_token_id = tokenizer.convert_tokens_to_ids([pad_token])[0] self.pad_token_segment_id = pad_token_segment_id if tokenizer_name in ('google/mt5-small', 'google/mt5-base'): # mt5 doesn't have cls or sep, but we can use something similar self.has_cls = False 
self.cls_token = '▁' self.cls_token_id = tokenizer.convert_tokens_to_ids(self.cls_token) self.sep_token = tokenizer.eos_token self.sep_token_id = tokenizer.eos_token_id else: self.has_cls = True self.cls_token = tokenizer.cls_token self.sep_token = tokenizer.sep_token self.cls_token_segment_id = cls_token_segment_id self.cls_token_id = tokenizer.cls_token_id self.sep_token_id = tokenizer.sep_token_id self.sep_token_extra = sep_token_extra self.cls_token_at_end = cls_token_at_end self.tokenizer = tokenizer self.pad_on_left = pad_on_left self.do_padding = do_padding if self.ret_token_span or not self.truncate_long_sequences: assert not self.cls_token_at_end assert not self.pad_on_left # if self.ret_subtokens: # if not use_fast: # raise NotImplementedError( # 'ret_subtokens is not available when using Python tokenizers. ' # 'To use this feature, set use_fast = True.') self.dict: Optional[DictInterface] = dict_force # For tokenization of raw text self.strip_cls_sep = strip_cls_sep def __call__(self, sample: dict): input_tokens = sample[self.input_key] input_is_str = isinstance(input_tokens, str) tokenizer = self.tokenizer ret_token_span = self.ret_token_span if input_is_str: # This happens in a tokenizer component where the raw sentence is fed. 
# noinspection PyShadowingNames def tokenize_str(input_str, add_special_tokens=True): if tokenizer.is_fast: encoding = tokenizer.encode_plus(input_str, return_offsets_mapping=True, add_special_tokens=add_special_tokens).encodings[0] subtoken_offsets = encoding.offsets input_tokens = encoding.tokens input_ids = encoding.ids # Fill up missing non-blank characters swallowed by HF tokenizer offset = 0 fixed_offsets = [] fixed_tokens = [] fixed_ids = [] for token, id, (b, e) in zip(input_tokens, input_ids, subtoken_offsets): if b > offset: missing_token = input_str[offset: b] if not missing_token.isspace(): # In the future, we may want space back fixed_tokens.append(missing_token) fixed_ids.append(tokenizer.unk_token_id) fixed_offsets.append((offset, b)) if e == offset: # LI™ -> LIT + M if fixed_offsets and fixed_offsets[-1][0] < b: fixed_offsets[-1] = (fixed_offsets[-1][0], b) fixed_tokens.append(token) fixed_ids.append(id) fixed_offsets.append((b, e)) offset = e subtoken_offsets = fixed_offsets input_tokens = fixed_tokens input_ids = fixed_ids if add_special_tokens: subtoken_offsets = subtoken_offsets[1 if self.has_cls else 0:-1] # Edge case that the input_str is swallowed in whole if input_str and not subtoken_offsets and not input_str.isspace(): __index = 1 if add_special_tokens and self.has_cls else 0 input_tokens.insert(__index, input_str) input_ids.insert(__index, tokenizer.unk_token_id) subtoken_offsets.append((0, len(input_str))) if not self.has_cls: input_tokens = [self.cls_token] + input_tokens input_ids = [self.cls_token_id] + input_ids else: input_tokens = tokenizer.tokenize(input_str) subtoken_offsets = [] _o = 0 for each in input_tokens: subtoken_offsets.append((_o, _o + len(each))) _o += len(each) if add_special_tokens: input_tokens = [self.cls_token] + input_tokens + [self.sep_token] input_ids = tokenizer.convert_tokens_to_ids(input_tokens) if self.check_space_before: non_blank_offsets = [i for i in range(len(input_tokens)) if input_tokens[i] != '▁'] if 
add_special_tokens and not self.has_cls: non_blank_offsets.insert(0, 0) input_tokens = [input_tokens[i] for i in non_blank_offsets] input_ids = [input_ids[i] for i in non_blank_offsets] if add_special_tokens: non_blank_offsets = non_blank_offsets[1:-1] subtoken_offsets = [subtoken_offsets[i - 1] for i in non_blank_offsets] else: subtoken_offsets = [subtoken_offsets[i] for i in non_blank_offsets] # MT5 generates tokens like ▁of, which is bad for the tokenizer. So we want to remove the prefix. for i, token in enumerate(input_tokens[1:-1] if add_special_tokens else input_tokens): if input_str[subtoken_offsets[i][0]] == ' ': subtoken_offsets[i] = (subtoken_offsets[i][0] + 1, subtoken_offsets[i][1]) # The following block will tokenize each empty string (space) into an unk token # if add_special_tokens: # if len(input_tokens) == 2: # bos and eos, meaning that the text contains only some spaces # input_tokens.insert(1, input_str) # input_ids.insert(1, tokenizer.unk_token_id) # subtoken_offsets.append((0, len(input_str))) # else: # if not input_ids: # This chunk might be some control chars getting removed by tokenizer # input_tokens = [input_str] # input_ids = [tokenizer.unk_token_id] # subtoken_offsets = [(0, len(input_str))] return input_tokens, input_ids, subtoken_offsets if self.dict: chunks = self.dict.split(sample.get(f'{self.input_key}_', input_tokens)) # Match original text directly _input_tokens, _input_ids, _subtoken_offsets = [self.cls_token], [self.cls_token_id], [] _offset = 0 custom_words = sample['custom_words'] = [] char_offset = 0 for chunk in chunks: if isinstance(chunk, str): # Use transformed text as it's what models are trained on chunk = input_tokens[char_offset:char_offset + len(chunk)] tokens, ids, offsets = tokenize_str(chunk, add_special_tokens=False) char_offset += len(chunk) else: begin, end, label = chunk _offset = begin # chunk offset is on char level, at this moment, there is no concept of tokens, just subtokens if isinstance(label, list): 
tokens, ids, offsets, delta = [], [], [], 0 for token in label: _tokens, _ids, _offsets = tokenize_str(token, add_special_tokens=False) tokens.extend(_tokens) # track the subword offset of this chunk, -1 for [CLS] custom_words.append( (len(_input_ids) + len(ids) - 1, len(_input_ids) + len(ids) - 1 + len(_ids), token)) ids.extend(_ids) offsets.extend((x[0] + delta, x[1] + delta) for x in _offsets) delta = offsets[-1][-1] else: tokens, ids, offsets = tokenize_str(input_tokens[begin:end], add_special_tokens=False) # offsets = [(offsets[0][0], offsets[-1][-1])] custom_words.append((len(_input_ids) - 1, len(_input_ids) + len(ids) - 1, label)) char_offset = end _input_tokens.extend(tokens) _input_ids.extend(ids) _subtoken_offsets.extend((x[0] + _offset, x[1] + _offset) for x in offsets) _offset = _subtoken_offsets[-1][-1] subtoken_offsets = _subtoken_offsets input_tokens = _input_tokens + [self.sep_token] input_ids = _input_ids + [self.sep_token_id] else: input_tokens, input_ids, subtoken_offsets = tokenize_str(input_tokens, add_special_tokens=True) if self.ret_subtokens: sample[f'{self.input_key}_subtoken_offsets'] = subtoken_offsets cls_is_bos = self.cls_is_bos if cls_is_bos is None: cls_is_bos = input_tokens[0] == BOS sep_is_eos = self.sep_is_eos if sep_is_eos is None: sep_is_eos = input_tokens[-1] == EOS if self.strip_cls_sep: if cls_is_bos: input_tokens = input_tokens[1:] if sep_is_eos: input_tokens = input_tokens[:-1] if not self.ret_mask_and_type: # only need input_ids and token_span, use a light version if input_is_str: prefix_mask = self._init_prefix_mask(input_ids) else: if input_tokens: return_offsets_mapping = tokenizer.is_fast and self.ret_subtokens encodings = tokenizer.batch_encode_plus( input_tokens, return_offsets_mapping=return_offsets_mapping, # Many tokenizers do not offer fast version add_special_tokens=False ) subtoken_ids_per_token = encodings.data['input_ids'] if return_offsets_mapping: offsets_mapping = [encoding.offsets for encoding in 
encodings.encodings] else: offsets_mapping = [] for token, subtoken_ids in zip(input_tokens, subtoken_ids_per_token): if len(subtoken_ids) > len(token): # … --> ... del subtoken_ids[len(token):] if not subtoken_ids: subtoken_ids = [tokenizer.unk_token_id] # Since non-fast tok generates no mapping, we have to guess char_per_subtoken = max(len(token) // len(subtoken_ids), 1) bes = [(b, b + char_per_subtoken) for b in range(0, len(token), char_per_subtoken)] if not bes: # the token is an empty string bes = [(0, 0)] if len(bes) != len(subtoken_ids): bes[len(subtoken_ids) - 1] = (bes[len(subtoken_ids) - 1][0], len(token)) del bes[len(subtoken_ids):] offsets_mapping.append(bes) else: encodings = SerializableDict() subtoken_ids_per_token = [] encodings.data = {'input_ids': subtoken_ids_per_token} if self.check_space_before: # noinspection PyUnboundLocalVariable for token, subtokens, mapping, encoding in zip(input_tokens, subtoken_ids_per_token, offsets_mapping, encodings.encodings): # Remove ▁ generated by spm for 2 reasons: # 1. During decoding, mostly no ▁ will be created unless blanks are placed between tokens (which # is true for English but in English it will likely be concatenated to the token following it) # 2. 
For T5, '▁' is used as CLS if len(subtokens) > 1 and encoding.tokens[0] == '▁': subtokens.pop(0) if mapping: mapping.pop(0) # Some tokens get stripped out subtoken_ids_per_token = [ids if ids else [tokenizer.unk_token_id] for ids in subtoken_ids_per_token] input_ids = sum(subtoken_ids_per_token, [self.cls_token_id]) if self.sep_is_eos is None: # None means to check whether sep is at the tail or between tokens if sep_is_eos: input_ids += [self.sep_token_id] elif self.sep_token_id not in input_ids: input_ids += [self.sep_token_id] else: input_ids += [self.sep_token_id] # else self.sep_is_eos == False means sep is between tokens and don't bother to check if self.ret_subtokens: prefix_mask = self._init_prefix_mask(input_ids) # if self.check_space_before: # if offsets_mapping[0] and not input_tokens[0].startswith(' '): # prefix_mask[1] = False else: prefix_mask = [False] * len(input_ids) offset = 1 for _subtokens in subtoken_ids_per_token: prefix_mask[offset] = True offset += len(_subtokens) if self.ret_subtokens: subtoken_offsets = [] for token, offsets in zip(input_tokens, offsets_mapping): if offsets: subtoken_offsets.append(offsets) else: subtoken_offsets.append([(0, len(token))]) if self.ret_subtokens_group: sample[f'{self.input_key}_subtoken_offsets_group'] = subtoken_offsets else: sample[f'{self.input_key}_subtoken_offsets'] = sum(subtoken_offsets, []) else: input_ids, attention_mask, token_type_ids, prefix_mask = \ convert_examples_to_features(input_tokens, None, tokenizer, cls_token_at_end=self.cls_token_at_end, # xlnet has a cls token at the end cls_token=tokenizer.cls_token, cls_token_segment_id=self.cls_token_segment_id, sep_token=self.sep_token, sep_token_extra=self.sep_token_extra, # roberta uses an extra separator b/w pairs of sentences, cf. 
github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805 pad_on_left=self.pad_on_left, # pad on the left for xlnet pad_token_id=self.pad_token_id, pad_token_segment_id=self.pad_token_segment_id, pad_token_label_id=0, do_padding=self.do_padding) if len(input_ids) > self.max_seq_length: if self.truncate_long_sequences: # raise SequenceTooLong( # f'Input tokens {input_tokens} exceed the max sequence length of {self.max_seq_length - 2}. ' # f'For sequence tasks, truncate_long_sequences = True is not supported.' # f'You are recommended to split your long text into several sentences within ' # f'{self.max_seq_length - 2} tokens beforehand. ' # f'Or simply set truncate_long_sequences = False to enable sliding window.') input_ids = input_ids[:self.max_seq_length] prefix_mask = prefix_mask[:self.max_seq_length] warnings.warn( f'Input tokens {input_tokens} exceed the max sequence length of {self.max_seq_length - 2}. ' f'The exceeded part will be truncated and ignored. ' f'You are recommended to split your long text into several sentences within ' f'{self.max_seq_length - 2} tokens beforehand.' f'Or simply set truncate_long_sequences = False to enable sliding window.' 
def convert_examples_to_features(
        words,
        max_seq_length: Optional[int],
        tokenizer,
        labels=None,
        label_map=None,
        cls_token_at_end=False,
        cls_token="[CLS]",
        cls_token_segment_id=1,
        sep_token="[SEP]",
        sep_token_extra=False,
        pad_on_left=False,
        pad_token_id=0,
        pad_token_segment_id=0,
        pad_token_label_id=0,
        sequence_a_segment_id=0,
        mask_padding_with_zero=True,
        unk_token='[UNK]',
        do_padding=True
):
    """Convert the words of one sentence into subtoken-level features.

    Each word is split into subtokens by ``tokenizer.tokenize``. The first subtoken of a word receives the
    label of the word while the remaining ones receive ``pad_token_label_id``. When no ``labels`` are given,
    the returned label ids form a boolean prefix mask instead: ``True`` for the first subtoken of each word
    and ``False`` everywhere else.

    `cls_token_at_end` define the location of the CLS token:

        - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
        - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]

    `cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet)

    Args:
        words: The words of the sentence.
        max_seq_length: Maximum number of subtokens including special tokens. Longer inputs are truncated
            with a warning. ``None`` disables truncation and is only valid with ``do_padding=False``.
        tokenizer: An object offering ``tokenize(word)`` and ``convert_tokens_to_ids(tokens)``.
        labels: One label per word. (Default value = None)
        label_map: Mapping from label to label id. (Default value = None)
        cls_token_at_end: ``True`` to append the CLS token instead of prepending it. (Default value = False)
        cls_token: (Default value = "[CLS]")
        cls_token_segment_id: (Default value = 1)
        sep_token: (Default value = "[SEP]")
        sep_token_extra: ``True`` to add a second separator. (Default value = False)
        pad_on_left: ``True`` to pad on the left side. (Default value = False)
        pad_token_id: (Default value = 0)
        pad_token_segment_id: (Default value = 0)
        pad_token_label_id: Label id of non-first subtokens, special tokens and padding. It is replaced by
            ``False`` when no ``labels`` are given. (Default value = 0)
        sequence_a_segment_id: (Default value = 0)
        mask_padding_with_zero: ``True`` to mark real tokens with 1 and padding with 0 in the mask, ``False``
            for the opposite. (Default value = True)
        unk_token: Substitute for words the tokenizer splits into nothing. (Default value = '[UNK]')
        do_padding: ``True`` to pad every output up to ``max_seq_length``. (Default value = True)

    Returns:
        A tuple of four equally long lists ``(input_ids, input_mask, segment_ids, label_ids)``.

    Raises:
        ValueError: If ``do_padding`` is requested without a ``max_seq_length``.
    """
    args = locals()
    if do_padding and max_seq_length is None:
        # Fail early with a clear message instead of a TypeError on ``None - int`` further below.
        raise ValueError('max_seq_length must be specified when do_padding is True')
    if not labels:
        labels = words
        pad_token_label_id = False

    tokens = []
    label_ids = []
    for word, label in zip(words, labels):
        word_tokens = tokenizer.tokenize(word)
        if not word_tokens:
            # Some weird chars cause the tokenizer to return an empty list. Always keep at least one
            # subtoken, even for an empty word, so that tokens and labels stay aligned.
            word_tokens = [unk_token] * max(len(word), 1)
        tokens.extend(word_tokens)
        # Use the real label id for the first token of the word, and padding ids for the remaining tokens
        label_ids.extend([label_map[label] if label_map else True] + [pad_token_label_id] * (len(word_tokens) - 1))

    # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa.
    special_tokens_count = 3 if sep_token_extra else 2
    if max_seq_length and len(tokens) > max_seq_length - special_tokens_count:
        warnings.warn(
            f'Input tokens {words} exceed the max sequence length of {max_seq_length - special_tokens_count}. '
            f'The exceeded part will be truncated and ignored. '
            f'You are recommended to split your long text into several sentences within '
            f'{max_seq_length - special_tokens_count} tokens beforehand.')
        tokens = tokens[: (max_seq_length - special_tokens_count)]
        label_ids = label_ids[: (max_seq_length - special_tokens_count)]

    # The convention in BERT is:
    # (a) For sequence pairs:
    #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
    #  token_type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1
    # (b) For single sequences:
    #  tokens:   [CLS] the dog is hairy . [SEP]
    #  token_type_ids:   0   0   0   0  0     0   0
    #
    # Where "token_type_ids" are used to indicate whether this is the first
    # sequence or the second sequence. The embedding vectors for `type=0` and
    # `type=1` were learned during pre-training and are added to the wordpiece
    # embedding vector (and position vector). This is not *strictly* necessary
    # since the [SEP] token unambiguously separates the sequences, but it makes
    # it easier for the model to learn the concept of sequences.
    #
    # For classification tasks, the first vector (corresponding to [CLS]) is
    # used as the "sentence vector". Note that this only makes sense because
    # the entire model is fine-tuned.
    tokens += [sep_token]
    label_ids += [pad_token_label_id]
    if sep_token_extra:
        # roberta uses an extra separator b/w pairs of sentences
        tokens += [sep_token]
        label_ids += [pad_token_label_id]
    segment_ids = [sequence_a_segment_id] * len(tokens)

    if cls_token_at_end:
        tokens += [cls_token]
        label_ids += [pad_token_label_id]
        segment_ids += [cls_token_segment_id]
    else:
        tokens = [cls_token] + tokens
        label_ids = [pad_token_label_id] + label_ids
        segment_ids = [cls_token_segment_id] + segment_ids

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

    if do_padding:
        # Zero-pad up to the sequence length.
        padding_length = max_seq_length - len(input_ids)
        if pad_on_left:
            input_ids = ([pad_token_id] * padding_length) + input_ids
            input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask
            segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids
            label_ids = ([pad_token_label_id] * padding_length) + label_ids
        else:
            input_ids += [pad_token_id] * padding_length
            input_mask += [0 if mask_padding_with_zero else 1] * padding_length
            segment_ids += [pad_token_segment_id] * padding_length
            label_ids += [pad_token_label_id] * padding_length

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length
        assert len(label_ids) == max_seq_length, f'failed for:\n {args}'
    else:
        assert len(set(len(x) for x in [input_ids, input_mask, segment_ids, label_ids])) == 1
    return input_ids, input_mask, segment_ids, label_ids
def vocab_from_tsv(tsv_file_path, lower=False, lock_word_vocab=False, lock_char_vocab=True, lock_tag_vocab=True) \
        -> Tuple[VocabTF, VocabTF, VocabTF]:
    """Build word, character and tag vocabularies from a two-column file.

    Every non-blank line is split on whitespace and unpacked into exactly
    ``word, tag``; a line with any other number of cells raises ``ValueError``
    from the unpacking.

    Args:
        tsv_file_path: Path of the file to read (opened as UTF-8).
        lower: Add the lower-cased word to the word vocabulary. Characters are
            always taken from the word as written in the file.
        lock_word_vocab: Lock the word vocabulary before returning.
        lock_char_vocab: Lock the character vocabulary before returning.
        lock_tag_vocab: Lock the tag vocabulary before returning.

    Returns:
        The ``(word_vocab, char_vocab, tag_vocab)`` triple.
    """
    words = VocabTF()
    chars = VocabTF()
    tags = VocabTF(unk_token=None)
    with open(tsv_file_path, encoding='utf-8') as src:
        for raw in src:
            fields = raw.strip().split()
            if not fields:
                # blank separator line between sentences
                continue
            token, label = fields
            words.add(token.lower() if lower else token)
            chars.update(list(token))
            tags.add(label)
    for vocab, should_lock in ((words, lock_word_vocab), (chars, lock_char_vocab), (tags, lock_tag_vocab)):
        if should_lock:
            vocab.lock()
    return words, chars, tags
max_seq_length=self.max_seq_length) @property def max_seq_length(self): return self.config.get('max_seq_length', None) class TSVTaggingTransform(TsvTaggingFormat, Transform): def __init__(self, config: SerializableDict = None, map_x=True, map_y=True, use_char=False, **kwargs) -> None: super().__init__(**merge_locals_kwargs(locals(), kwargs)) self.word_vocab: Optional[VocabTF] = None self.tag_vocab: Optional[VocabTF] = None self.char_vocab: Optional[VocabTF] = None def fit(self, trn_path: str, **kwargs) -> int: self.word_vocab = VocabTF() self.tag_vocab = VocabTF(pad_token=None, unk_token=None) num_samples = 0 for words, tags in self.file_to_inputs(trn_path, True): self.word_vocab.update(words) self.tag_vocab.update(tags) num_samples += 1 if self.char_vocab: self.char_vocab = VocabTF() for word in self.word_vocab.token_to_idx.keys(): if word in (self.word_vocab.pad_token, self.word_vocab.unk_token): continue self.char_vocab.update(list(word)) return num_samples def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]: types = tf.string, tf.string shapes = [None], [None] values = self.word_vocab.pad_token, self.tag_vocab.first_token return types, shapes, values def inputs_to_samples(self, inputs, gold=False): lower = self.config.get('lower', False) if gold: if lower: for x, y in inputs: yield x.lower(), y else: yield from inputs else: for x in inputs: yield x.lower() if lower else x, [self.padding_values[-1]] * len(x) def x_to_idx(self, x) -> Union[tf.Tensor, Tuple]: return self.word_vocab.lookup(x) def y_to_idx(self, y) -> tf.Tensor: return self.tag_vocab.lookup(y) def X_to_inputs(self, X: Union[tf.Tensor, Tuple[tf.Tensor]]) -> Iterable: for xs in X: words = [] for x in xs: words.append(str_tensor_to_str(x) if self.char_vocab else self.word_vocab.idx_to_token[int(x)]) yield words def Y_to_outputs(self, Y: Union[tf.Tensor, Tuple[tf.Tensor]], gold=False, inputs=None, X=None, **kwargs) -> Iterable: if not gold: Y = tf.argmax(Y, axis=2) for ys, xs in zip(Y, 
def words_to_bmes(words):
    """Convert a sequence of words into one flat list of BMES tags.

    A single-character word contributes ``'S'``; a longer word contributes
    ``'B'``, then ``'M'`` for every inner character, then ``'E'``.

    Args:
        words: Iterable of non-empty words.

    Returns:
        A list with one tag per character across all words.

    Raises:
        ValueError: If any word is ``None`` or has zero length.
    """
    labels = []
    for token in words:
        if not token:
            raise ValueError('{} contains None or zero-length word {}'.format(str(words), token))
        size = len(token)
        if size == 1:
            labels.append('S')
            continue
        labels.append('B')
        labels.extend('M' * (size - 2))
        labels.append('E')
    return labels
def extract_ngram_features(chars, bigram_only, window_size):
    """Build per-position n-gram feature columns around each character.

    Every returned column has one entry per input character. An n-gram that
    would reach past either end of the sequence (i.e. touch padding) is the
    empty string.

    Args:
        chars: List of characters; it is concatenated with padding lists and is
            not modified.
        bigram_only: When truthy, return exactly four bigram columns whose
            windows start 2, 1, 0 and -1 positions before the current
            character; ``window_size`` is then ignored.
        window_size: Largest n-gram size to emit (sizes above 4 add nothing).
            For each size ``k`` from 1 up to it, one column is emitted for
            every window of ``k`` characters that contains the current one,
            leftmost window first. Values ``<= 0`` yield an empty list.

    Returns:
        A list of feature columns (lists of strings).
    """

    def grams(padded, count, size, offset):
        # n-gram of `size` items starting `offset` into the padded sequence for
        # each of the `count` real positions; any empty (padding) member blanks it
        column = []
        for i in range(count):
            piece = padded[i + offset:i + offset + size]
            column.append(''.join(piece) if all(piece) else '')
        return column

    if bigram_only:
        padded = ['', ''] + chars + ['', '']
        count = len(padded) - 4
        return [grams(padded, count, 2, offset) for offset in range(4)]

    features = []
    if window_size > 0:
        padded = ['', '', ''] + chars + ['', '', '']
        count = len(padded) - 6
        if window_size >= 1:
            # the characters themselves
            features.append(padded[3:-3])
        for size in (2, 3, 4):
            if window_size >= size:
                # windows of this size that still cover the current character (index 3)
                for offset in range(4 - size, 4):
                    features.append(grams(padded, count, size, offset))
    return features
def vocab_from_txt(txt_file_path, bigram_only=False, window_size=4, **kwargs) -> Tuple[VocabTF, VocabTF, VocabTF]:
    """Collect character, n-gram and tag vocabularies from a text file.

    Samples come from ``generate_ngram_bmes`` called with ``gold=True``. The
    first feature column of each sample feeds the character vocabulary, every
    remaining column feeds the n-gram vocabulary (empty strings are skipped),
    and the tags feed the tag vocabulary.

    Args:
        txt_file_path: Path handed to ``generate_ngram_bmes``.
        bigram_only: Forwarded to ``generate_ngram_bmes``.
        window_size: Forwarded to ``generate_ngram_bmes``.
        **kwargs: Accepted and ignored.

    Returns:
        The ``(char_vocab, ngram_vocab, tag_vocab)`` triple.
    """
    chars = VocabTF()
    ngrams = VocabTF()
    tags = VocabTF(pad_token=None, unk_token=None)
    for features, labels in generate_ngram_bmes(txt_file_path, bigram_only, window_size, gold=True):
        chars.update(features[0])
        for column in features[1:]:
            # '' marks an n-gram that ran into padding; keep it out of the vocab
            ngrams.update(gram for gram in column if gram)
        tags.update(labels)
    return chars, ngrams, tags
class TxtBMESFormat(TxtFormat, ABC):
    """Text format whose samples are ``(chars, tags)`` pairs produced by ``bmes_of``."""

    def file_to_inputs(self, filepath: str, gold=True):
        """Yield ``(chars, tags)`` pairs for every sentence the parent format reads.

        When the config holds a truthy ``max_seq_length``, each sentence is cut
        with ``split_long_sent`` and the tags are sliced to stay aligned with
        every piece.

        Args:
            filepath: Path forwarded to the parent ``file_to_inputs``.
            gold: Forwarded to the parent reader and to ``bmes_of``.
        """
        max_seq_length = self.config.get('max_seq_length', False)
        if max_seq_length:
            if 'transformer' in self.config:
                max_seq_length -= 2  # allow for [CLS] and [SEP]
            # punctuation handed to split_long_sent as delimiters
            # NOTE(review): presumably preferred cut points — confirm against split_long_sent
            delimiter = set()
            delimiter.update('。!?:;、,,;!?、,')
        for text in super().file_to_inputs(filepath, gold):
            chars, tags = bmes_of(text, gold)
            if max_seq_length:
                start = 0
                for short_chars in split_long_sent(chars, delimiter, max_seq_length):
                    # advance a running offset so the tag slice matches this piece
                    end = start + len(short_chars)
                    yield short_chars, tags[start:end]
                    start = end
            else:
                yield chars, tags

    def input_is_single_sample(self, input: Union[List[str], List[List[str]]]) -> bool:
        """Return True when ``input`` is a ``str`` rather than a collection of them."""
        return isinstance(input, str)

    def inputs_to_samples(self, inputs, gold=False):
        """Yield ``(normalized chars, tags)`` samples.

        Gold inputs are already ``(chars, tags)`` pairs. Non-gold inputs carry
        no tags, so each one gets a list of ``tag_vocab.safe_pad_token`` of the
        same length as its characters.
        """
        for chars, tags in (inputs if gold else zip(inputs, [None] * len(inputs))):
            if not gold:
                tags = [self.tag_vocab.safe_pad_token] * len(chars)
            chars = CharTable.normalize_chars(chars)
            yield chars, tags

    def Y_to_outputs(self, Y: Union[tf.Tensor, Tuple[tf.Tensor]], gold=False, inputs=None, X=None,
                     batch=None) -> Iterable:
        """Delegate to ``Y_to_tokens`` using this transform's own tag vocabulary."""
        yield from self.Y_to_tokens(self.tag_vocab, Y, gold, inputs)

    def Y_to_tokens(self, tag_vocab, Y, gold, inputs):
        """Turn tag ids (or scores, when not gold) into words for each input text.

        When ``gold`` is false, ``Y`` is reduced with ``argmax`` over axis 2
        first. Only the first ``len(text)`` tags of each row are used.
        """
        if not gold:
            Y = tf.argmax(Y, axis=2)
        for text, ys in zip(inputs, Y):
            # rows may be longer than the text (assumed batch padding — TODO confirm); trim to the text
            tags = [tag_vocab.idx_to_token[int(y)] for y in ys[:len(text)]]
            yield bmes_to_words(list(text), tags)
def ls_resource_in_module(root, _ancestors=()) -> dict:
    """Recursively collect resource URLs declared as attributes of a module.

    An attribute counts as a resource when its name does not start with ``_``
    and its value is a string that starts with ``http`` and ends with neither
    ``/`` nor ``#``. Sub-modules are visited recursively and their resources
    are merged into the result. If ``root`` has a dict attribute named ``ALL``
    it is updated in place with everything found.

    Args:
        root: The module (any object with a ``__dict__``) to scan.
        _ancestors: Internal. Modules currently being scanned further up the
            recursion; callers should leave the default.

    Returns:
        A dict mapping attribute names to URLs found in ``root`` and the
        modules reachable from it.
    """
    res = dict()
    chain = _ancestors + (root,)
    for k, v in root.__dict__.items():
        if k.startswith('_'):
            continue
        if isinstance(v, str):
            if v.startswith('http') and not v.endswith(('/', '#')):
                res[k] = v
        elif type(v).__name__ == 'module':
            # Skip any module already on the current recursion path, not only
            # `root` itself: two modules importing each other would otherwise
            # recurse forever. Identity is used instead of `==` so no
            # user-defined __eq__ is ever invoked. Modules reachable through
            # several distinct paths are still visited once per path.
            if any(v is seen for seen in chain):
                continue
            res.update(ls_resource_in_module(v, chain))
    registry = root.__dict__.get('ALL')
    if isinstance(registry, dict):
        registry.update(res)
    return res
""" identifier = save_dir load_path = save_dir save_dir = get_resource(save_dir) if save_dir.endswith('.json'): meta_filename = os.path.basename(save_dir) save_dir = os.path.dirname(save_dir) metapath = os.path.join(save_dir, meta_filename) if not os.path.isfile(metapath): tf_model = False metapath = os.path.join(save_dir, 'config.json') else: tf_model = True cls = None if not os.path.isfile(metapath): tips = '' if save_dir.isupper(): from difflib import SequenceMatcher similar_keys = sorted(pretrained.ALL.keys(), key=lambda k: SequenceMatcher(None, k, identifier).ratio(), reverse=True)[:5] tips = f'Check its spelling based on the available keys:\n' + \ f'{sorted(pretrained.ALL.keys())}\n' + \ f'Tips: it might be one of {similar_keys}' # These components are not intended to be loaded in this way, but I'm tired of explaining it again and again if identifier in pretrained.word2vec.ALL.values(): save_dir = os.path.dirname(save_dir) metapath = os.path.join(save_dir, 'config.json') save_json({'classpath': 'hanlp.layers.embeddings.word2vec.Word2VecEmbeddingComponent', 'embed': {'classpath': 'hanlp.layers.embeddings.word2vec.Word2VecEmbedding', 'embed': identifier, 'field': 'token', 'normalize': 'l2'}, 'hanlp_version': version.__version__}, metapath) elif identifier in pretrained.fasttext.ALL.values(): save_dir = os.path.dirname(save_dir) metapath = os.path.join(save_dir, 'config.json') save_json({'classpath': 'hanlp.layers.embeddings.fast_text.FastTextEmbeddingComponent', 'embed': {'classpath': 'hanlp.layers.embeddings.fast_text.FastTextEmbedding', 'filepath': identifier, 'src': 'token'}, 'hanlp_version': version.__version__}, metapath) elif identifier in {pretrained.classifiers.LID_176_FASTTEXT_SMALL, pretrained.classifiers.LID_176_FASTTEXT_BASE}: save_dir = os.path.dirname(save_dir) metapath = os.path.join(save_dir, 'config.json') save_json({'classpath': 'hanlp.components.classifiers.fasttext_classifier.FastTextClassifier', 'model_path': identifier, 'hanlp_version': 
version.__version__}, metapath) else: raise FileNotFoundError(f'The identifier {save_dir} resolves to a nonexistent meta file {metapath}. {tips}') meta: dict = load_json(metapath) cls = meta.get('classpath', cls) if not cls: cls = meta.get('class_path', None) # For older version if tf_model: # tf models are trained with version < 2.1. To migrate them to 2.1, map their classpath to new locations upgrade = { 'hanlp.components.tok_tf.TransformerTokenizerTF': 'hanlp.components.tokenizers.tok_tf.TransformerTokenizerTF', 'hanlp.components.pos.RNNPartOfSpeechTagger': 'hanlp.components.taggers.pos_tf.RNNPartOfSpeechTaggerTF', 'hanlp.components.pos_tf.RNNPartOfSpeechTaggerTF': 'hanlp.components.taggers.pos_tf.RNNPartOfSpeechTaggerTF', 'hanlp.components.pos_tf.CNNPartOfSpeechTaggerTF': 'hanlp.components.taggers.pos_tf.CNNPartOfSpeechTaggerTF', 'hanlp.components.ner_tf.TransformerNamedEntityRecognizerTF': 'hanlp.components.ner.ner_tf.TransformerNamedEntityRecognizerTF', 'hanlp.components.parsers.biaffine_parser.BiaffineDependencyParser': 'hanlp.components.parsers.biaffine_parser_tf.BiaffineDependencyParserTF', 'hanlp.components.parsers.biaffine_parser.BiaffineSemanticDependencyParser': 'hanlp.components.parsers.biaffine_parser_tf.BiaffineSemanticDependencyParserTF', 'hanlp.components.tok_tf.NgramConvTokenizerTF': 'hanlp.components.tokenizers.tok_tf.NgramConvTokenizerTF', 'hanlp.components.classifiers.transformer_classifier.TransformerClassifier': 'hanlp.components.classifiers.transformer_classifier_tf.TransformerClassifierTF', 'hanlp.components.taggers.transformers.transformer_tagger.TransformerTagger': 'hanlp.components.taggers.transformers.transformer_tagger_tf.TransformerTaggerTF', 'hanlp.components.tok.NgramConvTokenizer': 'hanlp.components.tokenizers.tok_tf.NgramConvTokenizerTF', } cls = upgrade.get(cls, cls) assert cls, f'{meta_filename} doesn\'t contain classpath field' try: obj: Component = object_from_classpath(cls) if hasattr(obj, 'load'): if transform_only: # 
noinspection PyUnresolvedReferences obj.load_transform(save_dir) else: if os.path.isfile(os.path.join(save_dir, 'config.json')): obj.load(save_dir, verbose=verbose, **kwargs) else: obj.load(metapath, **kwargs) obj.config['load_path'] = load_path return obj except ModuleNotFoundError as e: if isdebugging(): raise e from None else: raise ModuleNotFoundError( f'Some modules ({e.name} etc.) required by this model are missing. Please install the full version:' '\n\n\tpip install hanlp[full] -U') from None except ValueError as e: if e.args and isinstance(e.args[0], str) and 'Internet connection' in e.args[0]: raise ConnectionError( 'Hugging Face 🤗 Transformers failed to download because your Internet connection is either off or bad.\n' 'See https://hanlp.hankcs.com/docs/install.html#server-without-internet for solutions.') \ from None raise e from None except Exception as e: # Some users often install an incompatible tf and put the blame on HanLP. Teach them the basics. try: you_installed_wrong_versions, extras = check_version_conflicts(extras=('full',) if tf_model else None) except Exception as check_e: you_installed_wrong_versions, extras = None, None if you_installed_wrong_versions: raise version.NotCompatible(you_installed_wrong_versions + '\nPlease reinstall HanLP in the proper way:' + '\n\n\tpip install --upgrade hanlp' + ( f'[{",".join(extras)}]' if extras else '')) from None eprint(f'Failed to load {identifier}') from pkg_resources import parse_version model_version = meta.get("hanlp_version", '2.0.0-alpha.0') if model_version == '2.0.0': # Quick fix: the first version used a wrong string model_version = '2.0.0-alpha.0' model_version = parse_version(model_version) installed_version = parse_version(version.__version__) try: latest_version = get_latest_info_from_pypi() except: latest_version = None if model_version > installed_version: eprint(f'{identifier} was created with hanlp-{model_version}, ' f'while you are running a lower version: {installed_version}. 
def load_from_meta(meta: dict) -> Component:
    """Create a component from an in-memory meta/config dict.

    If ``meta`` has a ``load_path`` key, loading is delegated to
    ``load_from_meta_file`` with that path. Otherwise the class path is read
    from ``class_path`` (tried first) or ``classpath``, resolved with
    ``str_to_type``, and the class's ``from_config`` is called with ``meta``.

    Args:
        meta: The meta dict.

    Returns:
        The loaded or constructed component.

    Raises:
        AssertionError: If neither class-path key holds a truthy value.
    """
    if 'load_path' in meta:
        return load_from_meta_file(meta['load_path'])
    classpath = None
    for key in ('class_path', 'classpath'):
        classpath = meta.get(key, None)
        if classpath:
            break
    assert classpath, f'{meta} doesn\'t contain classpath field'
    component_type = str_to_type(classpath)
    return component_type.from_config(meta)
class BufferWorkSpace:
    """Buffer bookkeeping helper used by FileReadBackwards."""

    def __init__(self, fp, chunk_size):
        """Set up an empty work space positioned at the end of ``fp``.

        Convention for the data: when ``read_buffer`` is not None it holds the
        bytes of the file from ``read_position`` onwards that have not been
        returned yet. ``read_position`` is the file offset the buffer starts
        at; it is initialised to the file size, i.e. just past the end of file.

        Args:
            fp: File object handed to ``_get_file_size`` and ``_get_next_chunk``.
            chunk_size: How many bytes to request per read.
        """
        self.fp = fp
        # nothing has been read yet, so the "previous" position is the end of file
        self.read_position = _get_file_size(self.fp)
        self.read_buffer = None
        self.chunk_size = chunk_size

    def add_to_buffer(self, content, read_position):
        """Prepend ``content`` to the buffer and record where it was read from.

        Args:
            content(bytes): data read from the file.
            read_position(int): file offset at which ``content`` starts.
        """
        self.read_position = read_position
        self.read_buffer = content if self.read_buffer is None else content + self.read_buffer

    def yieldable(self):
        """Return True when a complete line can be handed out.

        That is the case when the buffer, ignoring one trailing newline, still
        contains a newline, or when the start of the file has been reached and
        a buffer exists.
        """
        if self.read_buffer is None:
            return False
        stripped = _remove_trailing_new_line(self.read_buffer)
        if _find_furthest_new_line(stripped) >= 0:
            return True
        # whole file has been read; whatever is buffered is the first line
        return self.read_position == 0

    def return_line(self):
        """Remove the last line from the buffer and return it (as bytes).

        Precondition: self.yieldable() must be True
        """
        assert self.yieldable()
        stripped = _remove_trailing_new_line(self.read_buffer)
        newline_at = _find_furthest_new_line(stripped)
        if newline_at < 0:
            # entire file has been read and this is the "last" line to hand out
            self.read_buffer = None
            return stripped
        cut = newline_at + 1
        # keep everything up to and including that newline for later calls
        self.read_buffer = stripped[:cut]
        return stripped[cut:]

    def read_until_yieldable(self):
        """Read in additional chunks until it is yieldable."""
        while not self.yieldable():
            chunk, position = _get_next_chunk(self.fp, self.read_position, self.chunk_size)
            self.add_to_buffer(chunk, position)

    def has_returned_every_line(self):
        """Return True once the start of the file was reached and the buffer is drained."""
        return self.read_position == 0 and self.read_buffer is None
Args: fp: file previously_read_position: file pointer position that we have read from chunk_size: desired read chunk_size Returns: (bytestring, int): data that has been read in, the file pointer position where the data has been read from """ seek_position, read_size = _get_what_to_read_next(fp, previously_read_position, chunk_size) fp.seek(seek_position) read_content = fp.read(read_size) read_position = seek_position return read_content, read_position def _get_what_to_read_next(fp, previously_read_position, chunk_size): """Return information on which file pointer position to read from and how many bytes. Args: fp: past_read_positon: int chunk_size: int previously_read_position: Returns: (int, int): The next seek position, how many bytes to read next """ seek_position = max(previously_read_position - chunk_size, 0) read_size = chunk_size # examples: say, our new_lines are potentially "\r\n", "\n", "\r" # find a reading point where it is not "\n", rewind further if necessary # if we have "\r\n" and we read in "\n", # the next iteration would treat "\r" as a different new line. # Q: why don't I just check if it is b"\n", but use a function ? # A: so that we can potentially expand this into generic sets of separators, later on. while seek_position > 0: fp.seek(seek_position) if _is_partially_read_new_line(fp.read(1)): seek_position -= 1 read_size += 1 # as we rewind further, let's make sure we read more to compensate else: break # take care of special case when we are back to the beginnin of the file read_size = min(previously_read_position - seek_position, read_size) return seek_position, read_size def _remove_trailing_new_line(l): """Remove a single instance of new line at the end of l if it exists. 
class FileReadBackwards:
    """Read a file line by line starting from its end.

    The object creates one `FileReadBackwardsIterator` on construction and
    keeps the underlying binary file handle open. It can be used as a context
    manager, in which case the handle is closed on exit; ``close()`` may also
    be called directly at any time.
    """

    def __init__(self, path, encoding="utf-8", chunk_size=io.DEFAULT_BUFFER_SIZE):
        """Constructor for FileReadBackwards.

        Args:
            path: Path to the file to be read
            encoding (str): Encoding; compared case-insensitively against
                ``supported_encodings``.
            chunk_size (int): How many bytes to read at a time

        Raises:
            NotImplementedError: If the encoding is not a supported one.
        """
        normalized = encoding.lower()
        if normalized not in supported_encodings:
            raise NotImplementedError(
                "{0} encoding was not supported/tested.".format(encoding)
                + "Supported encodings are '{0}'".format(",".join(supported_encodings)))

        self.path = path
        self.encoding = normalized
        self.chunk_size = chunk_size
        handle = io.open(self.path, mode="rb")
        self.iterator = FileReadBackwardsIterator(handle, self.encoding, self.chunk_size)

    def __iter__(self):
        """Return the single iterator created at construction time."""
        return self.iterator

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Close the file handle; returning False lets any exception propagate."""
        self.close()
        return False

    def close(self):
        """Close the iterator, which owns the file handle."""
        self.iterator.close()

    def readline(self):
        """Return the next line (from the end) plus ``os.linesep``, or ``""`` when exhausted."""
        exhausted = object()
        line = next(self.iterator, exhausted)
        if line is exhausted:
            return ""
        return line + os.linesep
def embedding_uniform(tensor: torch.Tensor, seed=233):
    """Fill ``tensor`` in place with seeded uniform noise and return it.

    Values are drawn from ``U(-b, b)`` with ``b = sqrt(3 / tensor.size(-1))``
    using a fresh ``torch.Generator`` seeded with ``seed``, so the same shape
    and seed always give the same values. The fill runs under
    ``torch.no_grad()``.

    Args:
        tensor: Tensor to initialise; its last dimension sets the bound.
        seed: Seed for the private generator.

    Returns:
        The same ``tensor`` object, now filled.
    """
    rng = torch.Generator()
    rng.manual_seed(seed)
    with torch.no_grad():
        limit = math.sqrt(3.0 / tensor.size(-1))
        return tensor.uniform_(-limit, limit, generator=rng)
def make_debug_corpus(path, delimiter=None, percentage=0.1, max_samples=100):
    """Write a small ``.debug`` copy of a corpus file, or of every file in a folder.

    For each input ``<name><ext>`` a sibling ``<name>.debug<ext>`` is written
    containing the first few samples of the input.

    Args:
        path: A corpus file, or a directory whose regular files are processed
            (hidden files and files with ``.debug`` in their name are skipped).
        delimiter: The separator between samples. When falsy it is inferred per
            file: a blank line for ``.tsv``/``.conll``/``.conllx``/``.conllu``
            files, a single newline otherwise.
        percentage: Fraction of the samples of each file to keep.
        max_samples: Upper bound on the number of samples kept per file.

    Raises:
        FileNotFoundError: If ``path`` is neither a file nor a directory.
    """
    if os.path.isfile(path):
        files = [path]
    elif os.path.isdir(path):
        files = [os.path.join(path, f) for f in os.listdir(path)
                 if os.path.isfile(os.path.join(path, f)) and '.debug' not in f and not f.startswith('.')]
    else:
        raise FileNotFoundError(path)
    for filepath in files:
        filename, file_extension = os.path.splitext(filepath)
        # Resolve the delimiter and the sample budget per file. Re-assigning the
        # arguments here would leak the first file's settings into later files.
        file_delimiter = delimiter
        if not file_delimiter:
            if file_extension in {'.tsv', '.conll', '.conllx', '.conllu'}:
                file_delimiter = '\n\n'
            else:
                file_delimiter = '\n'
        with open(filepath, encoding='utf-8') as src, \
                open(filename + '.debug' + file_extension, 'w', encoding='utf-8') as out:
            samples = src.read().strip().split(file_delimiter)
            num_samples = min(max_samples, int(len(samples) * percentage))
            out.write(file_delimiter.join(samples[:num_samples]))
def parent_dir(path):
    """Return the normalized parent directory of ``path``.

    Args:
        path: A filesystem path.

    Returns:
        ``path`` joined with the parent-directory marker and then normalized.
    """
    one_level_up = os.path.join(path, os.pardir)
    return os.path.normpath(one_level_up)
def parse_url_path(url):
    """Split a URL into its network location and its path.

    Args:
        url: The URL to parse.

    Returns:
        A tuple of ``(netloc, path)`` where ``path`` has its leading and
        trailing slashes removed.
    """
    components = urlparse(url)
    return components.netloc, components.path.strip('/')
""" # assert path.endswith('.zip') prefix, ext = split_if_compressed(path) folder_name = os.path.basename(prefix) file_is_zip = ext == '.zip' root_of_folder = None if ext == '.gz': try: with gzip.open(path, 'rb') as f_in, open(prefix, 'wb') as f_out: shutil.copyfileobj(f_in, f_out) except Exception as e: remove_file(prefix) remove_file(path) raise e else: try: with zipfile.ZipFile(path, "r") if ext == '.zip' else tarfile.open(path, 'r:*') as archive: if not dest: namelist = sorted(archive.namelist() if file_is_zip else archive.getnames()) if namelist[0] == '.': namelist = namelist[1:] namelist = [p[len('./'):] if p.startswith('./') else p for p in namelist] if ext == '.tgz': roots = set(x.split('/')[0] for x in namelist) if len(roots) == 1: root_of_folder = next(iter(roots)) else: # only one file, root_of_folder = '' root_of_folder = namelist[0].strip('/') if len(namelist) > 1 else '' if all(f.split('/')[0] == root_of_folder for f in namelist[1:]) or not root_of_folder: dest = os.path.dirname(path) # only one folder, unzip to the same dir else: root_of_folder = None dest = prefix # assume zip contains more than one file or folder if verbose: eprint('Decompressing {} to {}'.format(path, dest)) archive.extractall(dest) if root_of_folder: if root_of_folder != folder_name: # move root to match folder name os.rename(path_join(dest, root_of_folder), path_join(dest, folder_name)) dest = path_join(dest, folder_name) elif len(namelist) == 1: dest = path_join(dest, namelist[0]) except Exception as e: remove_file(path) if os.path.exists(prefix): if os.path.isfile(prefix): os.remove(prefix) elif os.path.isdir(prefix): shutil.rmtree(prefix) raise e if remove: remove_file(path) return dest def split_if_compressed(path: str, compressed_ext=('.zip', '.tgz', '.gz', 'bz2', '.xz')) -> Tuple[str, Optional[str]]: tar_gz = '.tar.gz' if path.endswith(tar_gz): root, ext = path[:-len(tar_gz)], tar_gz else: root, ext = os.path.splitext(path) if ext in compressed_ext or ext == tar_gz: return 
root, ext return path, None def get_resource(path: str, save_dir=hanlp_home(), extract=True, prefix=HANLP_URL, append_location=True, verbose=HANLP_VERBOSE): """Fetch real (local) path for a resource (model, corpus, whatever) to ``save_dir``. Args: path: A local path (which will returned as is) or a remote URL (which will be downloaded, decompressed then returned). save_dir: Where to store the resource (Default value = :meth:`hanlp.utils.io_util.hanlp_home`) extract: Whether to unzip it if it's a zip file (Default value = True) prefix: A prefix when matched with an URL (path), then that URL is considered to be official. For official resources, they will not go to a folder called ``thirdparty`` under :const:`~hanlp_common.constants.HANLP_HOME`. append_location: Whether to put unofficial files in a ``thirdparty`` folder. verbose: Whether to print log messages. Returns: The real path to the resource. """ _path = path path = hanlp.pretrained.ALL.get(path, path) anchor: str = None compressed = None if os.path.isdir(path): return path elif os.path.isfile(path): pass elif path.startswith('http:') or path.startswith('https:'): url = path if '#' in url: url, anchor = url.split('#', maxsplit=1) realpath = path_from_url(path, save_dir, prefix, append_location) realpath, compressed = split_if_compressed(realpath) # check if resource is there if anchor: if anchor.startswith('/'): # indicates the folder name has to be polished anchor = anchor.lstrip('/') parts = anchor.split('/') renamed_realpath = str(Path(realpath).parent.joinpath(parts[0])) if os.path.isfile(realpath + compressed): os.rename(realpath + compressed, renamed_realpath + compressed) realpath = renamed_realpath anchor = '/'.join(parts[1:]) child = path_join(realpath, anchor) if os.path.exists(child): return child elif os.path.isdir(realpath) or (os.path.isfile(realpath) and (compressed and extract)): return realpath else: if compressed: pattern = realpath + '.*' files = glob.glob(pattern) files = list(filter(lambda 
x: not x.endswith('.downloading') and not x.endswith(compressed), files)) if files: if len(files) > 1: logger.debug(f'Found multiple files with {pattern}, will use the first one.') return files[0] # realpath is where its path after exaction if compressed: realpath += compressed with temp_lock(path): if not os.path.isfile(realpath): path = download(url=path, save_path=realpath, verbose=verbose) else: path = realpath if extract and compressed: with temp_lock(path): if os.path.isfile(path): path = uncompress(path, verbose=verbose) else: # other process must have already decompressed it and deleted it return get_resource(_path, save_dir, extract, prefix, append_location, verbose) if anchor: path = path_join(path, anchor) return path def path_from_url(url, save_dir=hanlp_home(), prefix=HANLP_URL, append_location=True): """Map a URL to a local path. Args: url: Remote URL. save_dir: The root folder to save this file. prefix: The prefix of official website. Any URLs starting with this prefix will be considered official. append_location: Whether to put unofficial files in a ``thirdparty`` folder. Returns: The real path that this URL is mapped to. 
""" if not save_dir: save_dir = hanlp_home() domain, relative_path = parse_url_path(url) if append_location: if not url.startswith(prefix): save_dir = os.path.join(save_dir, 'thirdparty', domain) else: # remove the relative path in prefix middle = prefix.split(domain)[-1].lstrip('/') if relative_path.startswith(middle): relative_path = relative_path[len(middle):] realpath = os.path.join(save_dir, relative_path) else: realpath = os.path.join(save_dir, os.path.basename(relative_path)) return realpath def human_bytes(file_size: int) -> str: file_size /= 1024 # KB if file_size > 1024: file_size /= 1024 # MB if file_size > 1024: file_size /= 1024 # GB return '%.1f GB' % file_size return '%.1f MB' % file_size return '%d KB' % file_size def read_cells(filepath: str, delimiter='auto', strip=True, skip_header=False): filepath = get_resource(filepath) if delimiter == 'auto': if filepath.endswith('.tsv'): delimiter = '\t' elif filepath.endswith('.csv'): delimiter = ',' else: delimiter = None with open(filepath, encoding='utf-8') as src: if skip_header: next(src) for line in src: line = line.strip() if not line: continue cells = line.split(delimiter) if strip: cells = [c.strip() for c in cells] yield cells def replace_ext(filepath, ext) -> str: """ Replace the extension of filepath to ext. Args: filepath: Filepath to be replaced. ext: Extension to replace. Returns: A new path. 
""" file_prefix, _ = os.path.splitext(filepath) return file_prefix + ext def read_tsv_as_sents(tsv_file_path, ignore_prefix=None, delimiter=None): sent = [] tsv_file_path = get_resource(tsv_file_path) with open(tsv_file_path, encoding='utf-8') as tsv_file: for line in tsv_file: if ignore_prefix and line.startswith(ignore_prefix): continue line = line.strip() cells = line.split(delimiter) if line and cells: sent.append(cells) elif sent: yield sent sent = [] if sent: yield sent def generate_words_tags_from_tsv(tsv_file_path, lower=False, gold=True, max_seq_length=None, sent_delimiter=None, char_level=False, hard_constraint=False): for sent in read_tsv_as_sents(tsv_file_path): words = [cells[0] for cells in sent] if max_seq_length: offset = 0 # try to split the sequence to make it fit into max_seq_length for shorter_words in split_long_sentence_into(words, max_seq_length, sent_delimiter, char_level, hard_constraint): if gold: shorter_tags = [cells[1] for cells in sent[offset:offset + len(shorter_words)]] offset += len(shorter_words) else: shorter_tags = None if lower: shorter_words = [word.lower() for word in shorter_words] yield shorter_words, shorter_tags else: if gold: try: tags = [cells[1] for cells in sent] except: raise ValueError(f'Failed to load {tsv_file_path}: {sent}') else: tags = None if lower: words = [word.lower() for word in words] yield words, tags def split_file(filepath, train=0.8, dev=0.1, test=0.1, names=None, shuffle=False): num_samples = 0 if filepath.endswith('.tsv'): for sent in read_tsv_as_sents(filepath): num_samples += 1 else: with open(filepath, encoding='utf-8') as src: for sample in src: num_samples += 1 splits = {'train': train, 'dev': dev, 'test': test} splits = dict((k, v) for k, v in splits.items() if v) splits = dict((k, v / sum(splits.values())) for k, v in splits.items()) accumulated = 0 r = [] for k, v in splits.items(): r.append(accumulated) accumulated += v r.append(accumulated) splits[k] = accumulated if names is None: names = 
{} name, ext = os.path.splitext(filepath) filenames = [names.get(split, name + '.' + split + ext) for split in splits.keys()] outs = [open(f, 'w', encoding='utf-8') for f in filenames] if shuffle: shuffle = list(range(num_samples)) random.shuffle(shuffle) if filepath.endswith('.tsv'): src = read_tsv_as_sents(filepath) else: src = open(filepath, encoding='utf-8') for idx, sample in enumerate(src): if shuffle: idx = shuffle[idx] ratio = idx / num_samples for sid, out in enumerate(outs): if r[2 * sid] <= ratio < r[2 * sid + 1]: if isinstance(sample, list): sample = '\n'.join('\t'.join(x) for x in sample) + '\n\n' out.write(sample) break if not filepath.endswith('.tsv'): src.close() for out in outs: out.close() return filenames def fileno(file_or_fd): try: fd = getattr(file_or_fd, 'fileno', lambda: file_or_fd)() except: return None if not isinstance(fd, int): raise ValueError("Expected a file (`.fileno()`) or a file descriptor") return fd @contextmanager def stdout_redirected(to=os.devnull, stdout=None): """Redirect stdout to else where. Copied from https://stackoverflow.com/questions/4675728/redirect-stdout-to-a-file-in-python/22434262#22434262 Args: to: Target device. stdout: Source device. 
""" if windows(): # This doesn't play well with windows yield None return if stdout is None: stdout = sys.stdout stdout_fd = fileno(stdout) if not stdout_fd: yield None return # copy stdout_fd before it is overwritten # NOTE: `copied` is inheritable on Windows when duplicating a standard stream with os.fdopen(os.dup(stdout_fd), 'wb') as copied: stdout.flush() # flush library buffers that dup2 knows nothing about try: os.dup2(fileno(to), stdout_fd) # $ exec >&to except ValueError: # filename with open(to, 'wb') as to_file: os.dup2(to_file.fileno(), stdout_fd) # $ exec > to try: yield stdout # allow code to be run with the redirected stdout finally: # restore stdout to its previous value # NOTE: dup2 makes stdout_fd inheritable unconditionally try: stdout.flush() os.dup2(copied.fileno(), stdout_fd) # $ exec >&copied except: # This is the best we can do pass def get_exitcode_stdout_stderr(cmd): """Execute the external command and get its exitcode, stdout and stderr. See https://stackoverflow.com/a/21000308/3730690 Args: cmd: Command. Returns: Exit code, stdout, stderr. 
""" args = shlex.split(cmd) proc = Popen(args, stdout=PIPE, stderr=PIPE) out, err = proc.communicate() exitcode = proc.returncode return exitcode, out.decode('utf-8'), err.decode('utf-8') def run_cmd(cmd: str) -> str: exitcode, out, err = get_exitcode_stdout_stderr(cmd) if exitcode: raise RuntimeError(err + '\nThe command is:\n' + cmd) return out @contextlib.contextmanager def pushd(new_dir): previous_dir = os.getcwd() os.chdir(new_dir) try: yield finally: os.chdir(previous_dir) def basename_no_ext(path): basename = os.path.basename(path) no_ext, ext = os.path.splitext(basename) return no_ext def file_cache(path: str, purge=False): cache_name = path + '.cache' cache_time = os.path.getmtime(cache_name) if os.path.isfile(cache_name) and not purge else 0 file_time = os.path.getmtime(path) cache_valid = cache_time > file_time return cache_name, cache_valid def merge_files(files: List[str], dst: str): with open(dst, 'wb') as write: for f in files: with open(f, 'rb') as read: shutil.copyfileobj(read, write) class TimingFileIterator(CountdownTimer): def __init__(self, filepath) -> None: super().__init__(os.path.getsize(filepath)) self.filepath = filepath def __iter__(self): if not os.path.isfile(self.filepath): raise FileNotFoundError(self.filepath) fp = open(self.filepath, encoding='utf-8', errors='ignore') line = fp.readline() while line: yield line self.current = fp.tell() line = fp.readline() fp.close() def log(self, info=None, ratio_percentage=True, ratio=True, step=0, interval=0.5, erase=True, logger: Union[logging.Logger, bool] = None, newline=False, ratio_width=None): assert step == 0 super().log(info, ratio_percentage, ratio, step, interval, erase, logger, newline, ratio_width) @property def ratio(self) -> str: return f'{human_bytes(self.current)}/{human_bytes(self.total)}' @property def ratio_width(self) -> int: return len(f'{human_bytes(self.total)}') * 2 + 1 def close(self): pass def check_outdated(package='hanlp', version=__version__, 
def get_latest_info_from_pypi(package='hanlp', repository_url='https://pypi.python.org/pypi/%s/json'):
    """Fetch the latest released version of a package from a PyPI-style JSON API.

    Args:
        package: Package name.
        repository_url: A ``%`` style format string that is formatted with the
            package name to obtain the URL of the JSON document.

    Returns:
        The ``info.version`` field of the JSON document parsed as a ``Version``.
    """
    url = repository_url % package
    # Close the HTTP response deterministically instead of leaving it to the GC.
    with urllib.request.urlopen(url) as response:
        payload = response.read()
    return Version(json.loads(payload)['info']['version'])
SENTENCE_TERMINALS = '.!?\u203C\u203D\u2047\u2048\u2049\u3002' \
                     '\uFE52\uFE57\uFF01\uFF0E\uFF1F\uFF61'
"The list of valid Unicode sentence terminal characters."

# Note that the Unicode category Pd is NOT a good set for valid word-breaking hyphens,
# because it contains many dashes that should not be considered part of a word.
HYPHENS = '\u00AD\u058A\u05BE\u0F0C\u1400\u1806\u2010-\u2012\u2e17\u30A0-'
"Any valid word-breaking hyphen, including ASCII hyphen minus."

APOSTROPHES = '\'\u00B4\u02B9\u02BC\u2019\u2032'
"""All apostrophe-like marks, including the ASCII "single quote"."""

APOSTROPHE = r"[\u00B4\u02B9\u02BC\u2019\u2032]"
"""Any apostrophe-like marks, including "prime" but not the ASCII "single quote"."""

LINEBREAK = r'(?:\r\n|\n|\r|\u2028)'
"""Any valid linebreak sequence (Windows, Unix, Mac, or U+2028)."""

LETTER = r'[^\W\d_]'
"""Any word character that is neither a decimal digit nor the underscore."""

NUMBER = r'\d'
"""Any Unicode decimal digit."""

POWER = r'\u207B?[\u00B9\u00B2\u00B3]'
"""Superscript 1, 2, and 3, optionally prefixed with a minus sign."""

SUBDIGIT = r'[\u2080-\u2089]'
"""Subscript digits."""

# NOTE: this used to be ``LETTER[:-1] + NUMBER + ']'``, i.e. ``[^\W\d_\d]``.
# Appending ``\d`` to a *negated* class keeps digits excluded, so ALNUM matched
# letters only. Excluding just non-word characters and the underscore yields
# the intended "letter or number" class.
ALNUM = r'[^\W_]'
"""Any alphanumeric Unicode character: letter or number."""

HYPHEN = r'[%s]' % HYPHENS

SPACE = r'\s'
"""Any unicode space character plus the (horizontal) tab."""

APO_MATCHER = compile(APOSTROPHE, UNICODE)
"""Matcher for any apostrophe."""

HYPHENATED_LINEBREAK = compile(
    r'({alnum}{hyphen}){space}*?{linebreak}{space}*?({alnum})'.format(
        alnum=ALNUM, hyphen=HYPHEN, linebreak=LINEBREAK, space=SPACE
    ), UNICODE
)
"""
The pattern matches any alphanumeric Unicode character, followed by a hyphen,
a single line-break surrounded by optional (non-breaking) spaces,
and terminates with a alphanumeric character on this next line.
The opening char and hyphen as well as the terminating char are captured in two groups.
"""

IS_POSSESSIVE = compile(r"{alnum}+(?:{hyphen}{alnum}+)*(?:{apo}[sS]|[sS]{apo})$".format(
    alnum=ALNUM, hyphen=HYPHEN, apo="['" + APOSTROPHE[1:]
), UNICODE
)
"""A pattern that matches English words with a possessive s terminal form."""

IS_CONTRACTION = compile(r"{alnum}+(?:{hyphen}{alnum}+)*{apo}(?:d|ll|m|re|s|t|ve)$".format(
    alnum=ALNUM, hyphen=HYPHEN, apo="['" + APOSTROPHE[1:]
), UNICODE
)
"""A pattern that matches tokens with valid English contractions ``'(d|ll|m|re|s|t|ve)``."""

# Lower-cased concatenated word -> end offsets of the pieces it is split into.
MAP_CONCAT_WORD = {'aint': [2, 4], 'arent': [3, 5], 'cant': [2, 4], 'cannot': [3, 6], 'coulda': [5, 6],
                   'couldnt': [5, 7], 'didnt': [3, 5], 'doncha': [2, 3, 6], 'dont': [2, 4], 'doesnt': [4, 6],
                   'dunno': [2, 3, 5], 'finna': [3, 5], 'gimme': [3, 5], 'gonna': [3, 5], 'gotta': [3, 5],
                   'hadnt': [3, 5], 'hasnt': [3, 5], 'havent': [4, 6], 'isnt': [2, 4], 'itd': [2, 3],
                   'itll': [2, 4], 'lemme': [3, 5], 'lets': [3, 4], 'mightnt': [5, 7], 'mustnt': [4, 6],
                   'shant': [3, 5], 'shoulda': [6, 7], 'shouldnt': [6, 8], 'thatd': [4, 5], 'thatll': [4, 6],
                   'thats': [4, 5], 'theyd': [4, 5], 'theyre': [4, 6], 'theyve': [4, 6], 'wanna': [3, 5],
                   'wasnt': [3, 5], 'weve': [2, 4], 'werent': [4, 6], 'whadya': [3, 4, 6], 'whatcha': [4, 7],
                   'whatre': [4, 6], 'whats': [4, 5], 'whatve': [4, 6], 'whatz': [4, 5], 'whod': [3, 4],
                   'wholl': [3, 5], 'woncha': [2, 3, 6], 'wont': [2, 4], 'woulda': [5, 6], 'wouldnt': [5, 7],
                   'youd': [3, 4], 'youll': [3, 5], 'youve': [3, 5], "'tis": [2, 4], "'twas": [2, 5],
                   "d'ye": [2, 4], "don'cha": [2, 4, 7], "i'mma": [1, 3, 5], "i'mmm": [1, 5], "more'n": [4, 6],
                   '’tis': [2, 4], '’twas': [2, 5], 'd’ye': [2, 4], 'don’cha': [2, 4, 7], 'i’mma': [1, 3, 5],
                   'i’mmm': [1, 5], 'more’n': [4, 6]}

RE_APOSTROPHE = compile(r'(?i)[a-z](n[\'\u2019]t|[\'\u2019](ll|nt|re|ve|[dmstz]))(\W|$)')
def split_contractions(tokens):
    """
    A function to split apostrophe contractions at the end of alphanumeric (and hyphenated) tokens.

    Takes the output of any of the tagger functions and produces and updated list.
    The list is modified in place: a matching token is replaced by its stem followed
    by the contraction, e.g. ``"they're"`` becomes ``"they"``, ``"'re"`` and ``"don't"``
    becomes ``"do"``, ``"n't"``.

    :param tokens: a list of tokens
    :returns: an updated list if a split was made or the original list otherwise
    """
    idx = -1
    # Iterate over a snapshot, because ``tokens`` grows while splitting.
    for token in list(tokens):
        idx += 1
        if IS_CONTRACTION.match(token) is None:
            continue
        length = len(token)
        # The contraction's apostrophe is the right-most apostrophe-like character.
        for pos in range(length - 1, -1, -1):
            if token[pos] not in APOSTROPHES:
                continue
            if 2 < length and pos + 2 == length and token[-1] == 't' and token[pos - 1] == 'n':
                pos -= 1  # keep the "n" with the clitic: "n't"
            if pos > 0:
                tokens.insert(idx, token[:pos])
                idx += 1
                tokens[idx] = token[pos:]
            # else: the token already is the bare clitic; do not emit an empty stem
            # Stop after the first split; scanning further would split the same
            # token again if its stem contains another apostrophe-like letter.
            break
    return tokens
""" return [token for token in space_tokenizer.split(sentence) if token] @_matches(r'(%s+)' % ALNUM) def symbol_tokenizer(sentence): """ The symbol tagger extends the :func:`space_tokenizer` by separating alphanumerics. Separates alphanumeric Unicode character sequences in already space-split tokens. """ return [token for span in space_tokenizer(sentence) for token in symbol_tokenizer.split(span) if token] @_matches(r"""((?: # Dots, except ellipsis {alnum} \. (?!\.\.) | # Comma, surrounded by digits (e.g., chemicals) or letters {alnum} , (?={alnum}) | # Colon, surrounded by digits (e.g., time, references) {number} : (?={number}) | # Hyphen, surrounded by digits (e.g., DNA endings: "5'-ACGT-3'") or letters {alnum} {apo}? {hyphen} (?={alnum}) # incl. optional apostrophe for DNA segments | # Apostophes, non-consecutive {apo} (?!{apo}) | # ASCII single quote, surrounded by digits or letters (no dangling allowed) {alnum} ' (?={alnum}) | # ASCII single quote after an s and at the token's end s ' $ | # Terminal dimensions (superscript minus, 1, 2, and 3) attached to physical units # size-prefix unit-acronym dimension \b [yzafpn\u00B5mcdhkMGTPEZY]? {letter}{{1,3}} {power} $ | # Atom counts (subscript numbers) and ionization states (optional superscript # 2 or 3 followed by a + or -) are attached to valid fragments of a chemical formula \b (?:[A-Z][a-z]?|[\)\]])+ {subdigit}+ (?:[\u00B2\u00B3]?[\u207A\u207B])? | # Any (Unicode) letter, digit, or the underscore {alnum} )+)""".format(alnum=ALNUM, apo=APOSTROPHE, power=POWER, subdigit=SUBDIGIT, hyphen=HYPHEN, letter=LETTER, number=NUMBER)) def tokenize_english(sentence): """ A modified version of the segtok tagger: https://github.com/fnl/segtok This tagger extends the alphanumeric :func:`symbol_tokenizer` by splitting fewer cases: 1. Dots appearing after a letter are maintained as part of the word, except for the last word in a sentence if that dot is the sentence terminal. 
Therefore, abbreviation marks (words containing or ending in a ``.``, like "i.e.") remain intact and URL or ID segments remain complete ("www.ex-ample.com", "EC1.2.3.4.5", etc.). The only dots that never are attached are triple dots (``...``; ellipsis). 2. Commas surrounded by alphanumeric characters are maintained in the word, too, e.g. ``a,b``. Colons surrounded by digits are maintained, e.g., 'at 12:30pm' or 'Isaiah 12:3'. Commas, semi-colons, and colons dangling at the end of a token are always spliced off. 3. Any two alphanumeric letters that are separated by a single hyphen are joined together; Those "inner" hyphens may optionally be followed by a linebreak surrounded by spaces; The spaces will be removed, however. For example, ``Hel- \\r\\n \t lo`` contains a (Windows) linebreak and will be returned as ``Hel-lo``. 4. Apostrophes are always allowed in words as long as they are not repeated; The single quote ASCII letter ``'`` is only allowed as a terminal apostrophe after the letter ``s``, otherwise it must be surrounded by letters. To support DNA and chemicals, a apostrophe (prime) may be located before the hyphen, as in the single token "5'-ACGT-3'" (if any non-ASCII hyphens are used instead of the shown single quote). 5. Superscript 1, 2, and 3, optionally prefixed with a superscript minus, are attached to a word if it is no longer than 3 letters (optionally 4 if the first letter is a power prefix in the range from yocto, y (10^-24) to yotta, Y (10^+24)). 6. Subscript digits are attached if prefixed with letters that look like a chemical formula. 
""" if not sentence: return [] flat = not isinstance(sentence, list) if flat: sents = [sentence] else: sents = sentence results = [] for sentence in sents: pruned = HYPHENATED_LINEBREAK.sub(r'\1\2', sentence) tokens = [token for span in space_tokenizer(pruned) for token in tokenize_english.split(span) if token] # splice the sentence terminal off the last word/token if it has any at its borders # only look for the sentence terminal in the last three tokens for idx, word in enumerate(reversed(tokens[-3:]), 1): if (tokenize_english.match(word) and not APO_MATCHER.match(word)) or \ any(t in word for t in SENTENCE_TERMINALS): last = len(word) - 1 if 0 == last or u'...' == word: # any case of "..." or any single char (last == 0) pass # leave the token as it is elif any(word.rfind(t) == last for t in SENTENCE_TERMINALS): # "stuff." tokens[-idx] = word[:-1] tokens.insert(len(tokens) - idx + 1, word[-1]) elif any(word.find(t) == 0 for t in SENTENCE_TERMINALS): # ".stuff" tokens[-idx] = word[0] tokens.insert(len(tokens) - idx + 1, word[1:]) break # keep splicing off any dangling commas and (semi-) colons dirty = True while dirty: dirty = False for idx, word in enumerate(reversed(tokens), 1): while len(word) > 1 and word[-1] in u',;:': char = word[-1] # the dangling comma/colon word = word[:-1] tokens[-idx] = word tokens.insert(len(tokens) - idx + 1, char) idx += 1 dirty = True if dirty: break # restart check to avoid index errors # split concat words chunks = [] for token in tokens: t = MAP_CONCAT_WORD.get(token.lower(), None) if t: i = 0 for j in t: chunks.append(token[i:j]) i = j else: chunks.append(token) tokens = chunks # split APOSTROPHE chunks = [] for token in tokens: m = RE_APOSTROPHE.search(token) if m: chunks.append(token[:m.start(1)]) chunks.append(token[m.start(1):m.end(1)]) if m.end(1) < len(token): chunks.append(token[m.end(1):]) else: chunks.append(token) tokens = chunks results.append(tokens) return results[0] if flat else results 
================================================ FILE: hanlp/utils/lang/ja/__init__.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2021-05-13 13:24 ================================================ FILE: hanlp/utils/lang/ja/bert_tok.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2021-05-13 13:24 from typing import Union, Optional from transformers import BertTokenizerFast, TensorType, BatchEncoding, BertJapaneseTokenizer as _BertJapaneseTokenizer from transformers.file_utils import PaddingStrategy from transformers.tokenization_utils_base import TextInput, PreTokenizedInput, EncodedInput, TruncationStrategy class BertJapaneseTokenizer(_BertJapaneseTokenizer): # We may need to customize character level tokenization to handle English words and URLs pass class BertJapaneseTokenizerFast(BertTokenizerFast): def encode_plus( self, text: Union[TextInput, PreTokenizedInput, EncodedInput], text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None, add_special_tokens: bool = True, padding: Union[bool, str, PaddingStrategy] = False, truncation: Union[bool, str, TruncationStrategy] = False, max_length: Optional[int] = None, stride: int = 0, is_split_into_words: bool = False, pad_to_multiple_of: Optional[int] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, return_overflowing_tokens: bool = False, return_special_tokens_mask: bool = False, return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, **kwargs ) -> BatchEncoding: """ Tokenize and prepare for the model a sequence or a pair of sequences. .. warning:: This method is deprecated, ``__call__`` should be used instead. Args: text (:obj:`str`, :obj:`List[str]` or :obj:`List[int]` (the latter only for not-fast tokenizers)): The first sequence to be encoded. 
HANLP_CHAR_TABLE_TXT = 'https://file.hankcs.com/corpus/char_table.zip#CharTable.txt'
HANLP_CHAR_TABLE_JSON = 'https://file.hankcs.com/corpus/char_table.json.zip'


class CharTable:
    """Lazily loaded character-to-character normalization table.

    The mapping is fetched by :meth:`load` the first time a conversion is requested and cached on the
    class. All helpers dispatch on ``cls`` so that a subclass overriding :meth:`load` (such as
    :class:`JsonCharTable`) is actually consulted when its own helpers are called.
    """

    # Cached mapping from a source character to its normalized form; empty until first use.
    convert = {}

    @classmethod
    def convert_char(cls, c):
        """Return the normalized form of ``c``, or ``c`` itself when it has no mapping."""
        if not cls.convert:
            cls._init()
        return cls.convert.get(c, c)

    @classmethod
    def normalize_text(cls, text: str) -> str:
        """Normalize every character of ``text`` and return the resulting string."""
        return ''.join(cls.convert_char(c) for c in text)

    @classmethod
    def normalize_chars(cls, chars: List[str]) -> List[str]:
        """Normalize every element of ``chars`` and return them as a new list."""
        return [cls.convert_char(c) for c in chars]

    @classmethod
    def _init(cls):
        """Load the table through ``cls.load`` and cache it on ``cls``."""
        cls.convert = cls.load()

    @staticmethod
    def load():
        """Read the plain-text table: only lines of exactly three characters are used.

        For such a line the first character maps to the third; the middle character is ignored.

        Returns:
            A ``dict`` mapping source characters to normalized characters.
        """
        mapper = {}
        with open(get_resource(HANLP_CHAR_TABLE_TXT), encoding='utf-8') as src:
            for line in src:
                cells = line.rstrip('\n')
                if len(cells) != 3:
                    continue
                a, _, b = cells
                mapper[a] = b
        return mapper


class JsonCharTable(CharTable):
    """Character table backed by the JSON resource instead of the plain-text one."""

    @staticmethod
    def load():
        """Return the mapping stored in the JSON character table resource."""
        return load_json(get_resource(HANLP_CHAR_TABLE_JSON))
# Lookup tables from annotation tag names to Chinese display strings.

# Task name -> Chinese name of the task.
task = {
    'dep': '依存句法树',
    'token': '单词',
    'pos': '词性',
    'ner': '命名实体',
    'srl': '语义角色'
}

# Part-of-speech tag -> Chinese description.
pos = {
    'VA': '表语形容词', 'VC': '系动词', 'VE': '动词有无', 'VV': '其他动词',
    'NR': '专有名词', 'NT': '时间名词', 'NN': '其他名词', 'LC': '方位词',
    'PN': '代词', 'DT': '限定词', 'CD': '概数词', 'OD': '序数词',
    'M': '量词', 'AD': '副词', 'P': '介词', 'CC': '并列连接词',
    'CS': '从属连词', 'DEC': '补语成分“的”', 'DEG': '属格“的”', 'DER': '表结果的“得”',
    'DEV': '表方式的“地”', 'AS': '动态助词', 'SP': '句末助词', 'ETC': '表示省略',
    'MSP': '其他小品词', 'IJ': '句首感叹词', 'ON': '象声词', 'LB': '长句式表被动',
    'SB': '短句式表被动', 'BA': '把字句', 'JJ': '其他名词修饰语', 'FW': '外来语',
    'PU': '标点符号', 'NOI': '噪声', 'URL': '网址'
}

# Named-entity tag -> Chinese description.
ner = {
    'NT': '机构团体',
    'NS': '地名',
    'NR': '人名'
}

# Dependency relation -> Chinese description.
# NOTE(review): 'elf' and 'clf' carry the same description; 'elf' looks like a typo of 'clf' -- confirm.
dep = {
    'nn': '复合名词修饰', 'punct': '标点符号', 'nsubj': '名词性主语', 'conj': '连接性状语',
    'dobj': '直接宾语', 'advmod': '名词性状语', 'prep': '介词性修饰语', 'nummod': '数词修饰语',
    'amod': '形容词修饰语', 'pobj': '介词性宾语', 'rcmod': '相关关系', 'cpm': '补语',
    'assm': '关联标记', 'assmod': '关联修饰', 'cc': '并列关系', 'elf': '类别修饰',
    'ccomp': '从句补充', 'det': '限定语', 'lobj': '时间介词', 'range': '数量词间接宾语',
    'asp': '时态标记', 'tmod': '时间修饰语', 'plmod': '介词性地点修饰', 'attr': '属性',
    'mmod': '情态动词', 'loc': '位置补语', 'top': '主题', 'pccomp': '介词补语',
    'etc': '省略关系', 'lccomp': '位置补语', 'ordmod': '量词修饰', 'xsubj': '控制主语',
    'neg': '否定修饰', 'rcomp': '结果补语', 'comod': '并列联合动词', 'vmod': '动词修饰',
    'prtmod': '小品词', 'ba': '把字关系', 'dvpm': '地字修饰', 'dvpmod': '地字动词短语',
    'prnmod': '插入词修饰', 'cop': '系动词', 'pass': '被动标记', 'nsubjpass': '被动名词主语',
    'clf': '类别修饰', 'dep': '依赖关系', 'root': '核心关系'
}
class ColoredFormatter(logging.Formatter):
    """Formatter that either renders or strips the ``[color]...[/color]`` tags of a message."""

    def __init__(self, fmt=None, datefmt=None, style='%', enable=True):
        super().__init__(fmt, datefmt, style)
        # True: translate color tags into escape codes; False: remove the tags.
        self.enable = enable

    def formatMessage(self, record: LogRecord) -> str:
        """Format ``record`` as usual, then post-process its color tags according to ``self.enable``."""
        formatted = super().formatMessage(record)
        post_process = color_format if self.enable else remove_color_tag
        return post_process(formatted)


def init_logger(name=None, root_dir=None, level=logging.INFO, mode='w',
                fmt="%(asctime)s %(levelname)s %(message)s", datefmt='%Y-%m-%d %H:%M:%S') -> logging.Logger:
    """Create (or fetch) a non-propagating logger that writes to stdout and optionally to a file.

    Args:
        name: Logger name; a timestamp is used when it is empty.
        root_dir: When given, the directory is created and ``<root_dir>/<name>.log`` is attached as a
            file handler (with color tags stripped). It also prefixes the logger's registered name.
        level: Level applied to the logger and to the handlers created by this call.
        mode: Open mode of the log file.
        fmt: Record format string.
        datefmt: Timestamp format string.

    Returns:
        The configured ``logging.Logger``.
    """
    name = name or datetime.datetime.now().strftime("%Y-%m-%d_%H.%M.%S")
    registered_name = os.path.join(root_dir, name) if root_dir else name
    log = logging.getLogger(registered_name)
    log.propagate = False

    # stdout is used because stderr output may be rendered in red.
    console = logging.StreamHandler(sys.stdout)
    console.setFormatter(ColoredFormatter(fmt, datefmt=datefmt))
    # Only attach the console handler when no handler already targets stdout/stderr.
    already_on_std = any(
        isinstance(existing, logging.StreamHandler)
        and (existing.stream == sys.stderr or existing.stream == sys.stdout)
        for existing in log.handlers
    )
    if not already_on_std:
        log.addHandler(console)
    log.setLevel(level)
    console.setLevel(level)

    if root_dir:
        os.makedirs(root_dir, exist_ok=True)
        to_file = logging.FileHandler(f"{root_dir}/{name}.log", mode=mode)
        to_file.setFormatter(ColoredFormatter(fmt, datefmt=datefmt, enable=False))
        log.addHandler(to_file)
        to_file.setLevel(level)
    return log


# Package-wide logger; its level can be set through the HANLP_LOG_LEVEL environment variable.
logger = init_logger(name='hanlp', level=os.environ.get('HANLP_LOG_LEVEL', 'INFO'))


def enable_debug(debug=True):
    """Set the package logger to DEBUG when ``debug`` is true, otherwise to ERROR."""
    if debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.ERROR)
class ErasablePrinter:
    """Writes a message to a stream and remembers its width so the next message can overwrite it."""

    def __init__(self, out=sys.stderr):
        self._last_print_width = 0
        self.out = out

    def erase(self):
        """Blank out the previously printed message, if any, and reset the remembered width."""
        width = self._last_print_width
        if not width:
            return
        write = self.out.write
        if IPYTHON:
            write("\r")
            write(" " * width)
        else:
            write("\b" * width)
            write(" " * width)
            write("\b" * width)
        write("\r")  # \r is essential when multi-lines were printed
        self._last_print_width = 0

    def print(self, msg: str, color=True):
        """Erase the previous message, then write ``msg`` (rendering color tags when ``color`` is true)."""
        self.erase()
        if color:
            msg, width = color_format_len(msg)
            if IPYTHON:
                # Under IPython the full rendered length is remembered instead of the tag-free length.
                width = len(msg)
        else:
            width = len(msg)
        self._last_print_width = width
        self.out.write(msg)
        self.out.flush()


_printer = ErasablePrinter()


def flash(line: str, color=True):
    """Print ``line`` through the shared erasable printer, replacing the previously flashed line."""
    _printer.print(line, color)


def _color_tags():
    """Yield ``(open_tag, close_tag, code)`` for every termcolor color, highlight and attribute."""
    for table in (termcolor.COLORS, termcolor.HIGHLIGHTS, termcolor.ATTRIBUTES):
        for tag_name, code in table.items():
            yield f'[{tag_name}]', f'[/{tag_name}]', code


def color_format(msg: str):
    """Replace ``[name]`` / ``[/name]`` tags in ``msg`` by the matching escape sequences."""
    for opening, closing, code in _color_tags():
        msg = msg.replace(opening, '\033[%dm' % code)
        msg = msg.replace(closing, termcolor.RESET)
    return msg


def remove_color_tag(msg: str):
    """Return ``msg`` with all ``[name]`` / ``[/name]`` tags removed."""
    for opening, closing, _ in _color_tags():
        msg = msg.replace(opening, '')
        msg = msg.replace(closing, '')
    return msg


def color_format_len(msg: str):
    """Render color tags like :func:`color_format` and also return the length of ``msg`` without its tags.

    Returns:
        A ``(rendered_message, tag_free_length)`` tuple.
    """
    visible = len(msg)
    for opening, closing, code in _color_tags():
        msg, removed = _replace_color_offset(msg, opening, '\033[%dm' % code)
        visible -= removed
        msg, removed = _replace_color_offset(msg, closing, termcolor.RESET)
        visible -= removed
    return msg, visible


def _replace_color_offset(msg: str, color: str, ctrl: str):
    """Replace every ``color`` tag in ``msg`` by ``ctrl`` and report how many tag characters were removed."""
    occurrences = msg.count(color)
    return msg.replace(color, ctrl), occurrences * len(color)


def cprint(*args, file=None, **kwargs):
    """``print`` replacement that renders color tags in the produced text before writing it to ``file``."""
    with io.StringIO() as buffer:
        print(*args, file=buffer, **kwargs)
        plain = buffer.getvalue()
    print(color_format(plain), end='', file=file)


def main():
    """Demo entry point: print the name of every available color."""
    show_colors()


def show_colors_and_formats():
    """Log one line demonstrating every color/highlight/attribute combination."""
    samples = ''.join(
        f'[{c}][{h}][{a}] {c}+{h}+{a} [/{a}][/{h}][/{c}]'
        for c in termcolor.COLORS.keys()
        for h in termcolor.HIGHLIGHTS.keys()
        for a in termcolor.ATTRIBUTES.keys()
    )
    logger.info(samples)


def show_colors():
    """Print each color name, rendered in that color."""
    for color_name in termcolor.COLORS.keys():
        cprint(f'[{color_name}]"{color_name}",[/{color_name}]')


if __name__ == '__main__':
    main()
# Placeholder that temporarily replaces the space after an abbreviation so it is not taken as a sentence end.
_SEPARATOR = r'@'
# A sentence is either text ending in . ! ? followed by whitespace/end, or any text running to a newline/end.
_RE_SENTENCE = re.compile(r'(\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$)', re.UNICODE)
# Capitalized abbreviation such as "Mr." followed by whitespace and a word character.
_AB_SENIOR = re.compile(r'([A-Z][a-z]{1,2}\.)\s(\w)', re.UNICODE)
# Tail of a dotted acronym such as ".S." followed by whitespace and a word character.
_AB_ACRONYM = re.compile(r'(\.[a-zA-Z]\.)\s(\w)', re.UNICODE)
_UNDO_AB_SENIOR = re.compile(r'([A-Z][a-z]{1,2}\.)' + _SEPARATOR + r'(\w)', re.UNICODE)
_UNDO_AB_ACRONYM = re.compile(r'(\.[a-zA-Z]\.)' + _SEPARATOR + r'(\w)', re.UNICODE)
# Rules inserting a line break between group 1 and group 2. Both ASCII and full-width forms of
# the terminal punctuation are listed so that Chinese text using full-width marks is split too.
_BREAK_RULES = (
    re.compile(r'([。！？!?])([^”’])'),  # terminal punctuation not followed by a closing quote
    re.compile(r'(\.{6})([^”’])'),  # six-dot ellipsis
    re.compile(r'(…{2})([^”’])'),  # double ellipsis character
    re.compile(r'([。！？!?][”’])([^，,。！？!?])'),  # closing quote after terminal punctuation
)


def _replace_with_separator(text, separator, regexs):
    """Apply each regex in ``regexs`` to ``text``, rewriting a match as ``group1 + separator + group2``."""
    replacement = r"\1" + separator + r"\2"
    result = text
    for regex in regexs:
        result = regex.sub(replacement, result)
    return result


def split_sentence(text, best=True):
    """Split ``text`` into sentences on Chinese and English terminal punctuation.

    Line breaks are first inserted after terminal punctuation, ellipses and closing quotes; each
    resulting non-empty line is then optionally refined with the English sentence pattern, taking
    care not to split after abbreviations such as ``Mr.``.

    Args:
        text: The text to split.
        best: When false, the punctuation-delimited lines are yielded without further refinement.

    Yields:
        The sentences, in order of appearance.
    """
    for rule in _BREAK_RULES:
        text = rule.sub(r'\1\n\2', text)
    for chunk in text.split("\n"):
        chunk = chunk.strip()
        if not chunk:
            continue
        if not best:
            yield chunk
            continue
        # Protect the space after abbreviations before looking for sentence boundaries.
        processed = _replace_with_separator(chunk, _SEPARATOR, [_AB_SENIOR, _AB_ACRONYM])
        sents = list(_RE_SENTENCE.finditer(processed))
        if not sents:
            yield chunk
            continue
        for sentence in sents:
            # Restore the protected spaces.
            sentence = _replace_with_separator(sentence.group(), r" ", [_UNDO_AB_SENIOR, _UNDO_AB_ACRONYM])
            yield sentence
def generate_words_per_line(file_path):
    """Yield the whitespace-separated words of every non-blank line of a UTF-8 text file."""
    with open(file_path, encoding='utf-8') as src:
        for raw_line in src:
            words = raw_line.split()
            if words:
                yield words


def words_to_bmes(words):
    """Convert words to character-level BMES tags.

    A one-character word becomes ``S``; a longer word becomes ``B``, zero or more ``M`` and ``E``.

    Raises:
        ValueError: If ``words`` contains ``None`` or an empty word.
    """
    tags = []
    for word in words:
        if not word:
            raise ValueError('{} contains None or zero-length word {}'.format(str(words), word))
        size = len(word)
        if size == 1:
            tags.append('S')
            continue
        tags.append('B')
        tags.extend('M' * (size - 2))
        tags.append('E')
    return tags


def words_to_bi(words):
    """Convert words to character-level BI tags: ``B`` for the first character, ``I`` for the others.

    Raises:
        ValueError: If ``words`` contains ``None`` or an empty word.
    """
    tags = []
    for word in words:
        if not word:
            raise ValueError('{} contains None or zero-length word {}'.format(str(words), word))
        tags.append('B')
        tags.extend('I' * (len(word) - 1))
    return tags


def bmes_to_words(chars, tags):
    """Join ``chars`` into words, starting a new word wherever the tag is ``B`` or ``S``.

    The tag of the first character is not inspected; it always opens the first word.
    """
    if len(chars) == 0:
        return []
    words = []
    current = chars[0]
    for char, tag in zip(chars[1:], tags[1:]):
        if tag in ('B', 'S'):
            words.append(current)
            current = char
        else:
            current += char
    if len(current) != 0:
        words.append(current)
    return words


def bmes_to_spans(tags):
    """Convert BMES tags to ``(begin, end)`` word spans with exclusive ``end``."""
    spans = []
    begin = 0
    for position, tag in enumerate(tags[1:], 1):
        if tag in ('B', 'S'):
            spans.append((begin, position))
            begin = position
    if len(tags) != 0:
        # Close the last open word.
        spans.append((begin, len(tags)))
    return spans


def bmes_of(sentence, segmented):
    """Return ``(chars, tags)`` for a sentence.

    When ``segmented`` is true the sentence is split on whitespace and tagged with BMES per word;
    otherwise every character of the sentence is tagged ``S``.
    """
    if not segmented:
        chars = list(sentence)
        return chars, ['S'] * len(chars)
    chars = []
    tags = []
    for word in sentence.split():
        chars.extend(word)
        if len(word) == 1:
            tags.append('S')
        else:
            tags.extend(['B'] + ['M'] * (len(word) - 2) + ['E'])
    return chars, tags
def iobes_to_bilou(src, dst):
    """Rewrite a tab-separated ``word<TAB>tag`` file from IOBES tags to BILOU tags.

    ``E-`` prefixes become ``L-`` and ``S-`` prefixes become ``U-``; other tags are copied unchanged
    and blank lines are preserved. Both files are handled as UTF-8 so the result does not depend on
    the platform's default encoding.

    Args:
        src: Path of the IOBES file to read.
        dst: Path of the BILOU file to write.
    """
    with open(src, encoding='utf-8') as fin, open(dst, 'w', encoding='utf-8') as out:
        for line in fin:
            line = line.strip()
            if not line:
                out.write('\n')
                continue
            word, tag = line.split('\t')
            if tag.startswith('E-'):
                tag = 'L-' + tag[2:]
            elif tag.startswith('S-'):
                tag = 'U-' + tag[2:]
            out.write(f'{word}\t{tag}\n')


def allowed_transitions(constraint_type: str, labels: Dict[int, str]) -> List[Tuple[int, int]]:
    """
    Given labels and a constraint type, returns the allowed transitions. It will
    additionally include transitions for the start and end states, which are used
    by the conditional random field.

    # Parameters

    constraint_type : `str`, required
        Indicates which constraint to apply. Current choices are
        "BIO", "IOB1", "BIOUL", and "BMES".
    labels : `Dict[int, str]`, required
        A mapping {label_id -> label}.

    # Returns

    `List[Tuple[int, int]]`
        The allowed transitions (from_label_id, to_label_id). The start state uses the id
        `len(labels)` and the end state uses `len(labels) + 1`.
    """
    num_labels = len(labels)
    start_tag = num_labels
    end_tag = num_labels + 1
    labels_with_boundaries = list(labels.items()) + [(start_tag, "START"), (end_tag, "END")]

    allowed = []
    for from_label_index, from_label in labels_with_boundaries:
        if from_label in ("START", "END"):
            from_tag = from_label
            from_entity = ""
        else:
            # The first character is the tag; the rest (separator included) identifies the entity.
            from_tag = from_label[0]
            from_entity = from_label[1:]
        for to_label_index, to_label in labels_with_boundaries:
            if to_label in ("START", "END"):
                to_tag = to_label
                to_entity = ""
            else:
                to_tag = to_label[0]
                to_entity = to_label[1:]
            if is_transition_allowed(constraint_type, from_tag, from_entity, to_tag, to_entity):
                allowed.append((from_label_index, to_label_index))
    return allowed


def is_transition_allowed(
        constraint_type: str, from_tag: str, from_entity: str, to_tag: str, to_entity: str
):
    """
    Given a constraint type and strings `from_tag` and `to_tag` that
    represent the origin and destination of the transition, return whether
    the transition is allowed under the given constraint type.

    # Parameters

    constraint_type : `str`, required
        Indicates which constraint to apply. Current choices are
        "BIO", "IOB1", "BIOUL", and "BMES".
    from_tag : `str`, required
        The tag that the transition originates from. For example, if the
        label is `I-PER`, the `from_tag` is `I`.
    from_entity : `str`, required
        The entity corresponding to the `from_tag`. For example, if the
        label is `I-PER`, the `from_entity` is `PER`.
    to_tag : `str`, required
        The tag that the transition leads to. For example, if the
        label is `I-PER`, the `to_tag` is `I`.
    to_entity : `str`, required
        The entity corresponding to the `to_tag`. For example, if the
        label is `I-PER`, the `to_entity` is `PER`.

    # Returns

    `bool`
        Whether the transition is allowed under the given `constraint_type`.

    # Raises

    `ValueError`
        If `constraint_type` is unknown and the transition is not already ruled out
        by the START/END check.
    """
    if to_tag == "START" or from_tag == "END":
        # Cannot transition into START or from END
        return False

    if constraint_type == "BIOUL":
        if from_tag == "START":
            return to_tag in ("O", "B", "U")
        if to_tag == "END":
            return from_tag in ("O", "L", "U")
        return any(
            [
                # O can transition to O, B-* or U-*
                # L-x can transition to O, B-*, or U-*
                # U-x can transition to O, B-*, or U-*
                from_tag in ("O", "L", "U") and to_tag in ("O", "B", "U"),
                # B-x can only transition to I-x or L-x
                # I-x can only transition to I-x or L-x
                from_tag in ("B", "I") and to_tag in ("I", "L") and from_entity == to_entity,
            ]
        )
    elif constraint_type == "BIO":
        if from_tag == "START":
            return to_tag in ("O", "B")
        if to_tag == "END":
            return from_tag in ("O", "B", "I")
        return any(
            [
                # Can always transition to O or B-x
                to_tag in ("O", "B"),
                # Can only transition to I-x from B-x or I-x
                to_tag == "I" and from_tag in ("B", "I") and from_entity == to_entity,
            ]
        )
    elif constraint_type == "IOB1":
        if from_tag == "START":
            return to_tag in ("O", "I")
        if to_tag == "END":
            return from_tag in ("O", "B", "I")
        return any(
            [
                # Can always transition to O or I-x
                to_tag in ("O", "I"),
                # Can only transition to B-x from B-x or I-x, where
                # x is the same tag.
                to_tag == "B" and from_tag in ("B", "I") and from_entity == to_entity,
            ]
        )
    elif constraint_type == "BMES":
        if from_tag == "START":
            return to_tag in ("B", "S")
        if to_tag == "END":
            return from_tag in ("E", "S")
        return any(
            [
                # Can only transition to B or S from E or S.
                to_tag in ("B", "S") and from_tag in ("E", "S"),
                # Can only transition to M-x from B-x or M-x, where
                # x is the same tag.
                to_tag == "M" and from_tag in ("B", "M") and from_entity == to_entity,
                # Can only transition to E-x from B-x or M-x, where
                # x is the same tag.
                to_tag == "E" and from_tag in ("B", "M") and from_entity == to_entity,
            ]
        )
    else:
        raise ValueError(f"Unknown constraint type: {constraint_type}")
TypedSpan = Tuple[int, Tuple[int, int]]
TypedStringSpan = Tuple[str, Tuple[int, int]]


class InvalidTagSequence(Exception):
    """Raised when a tag sequence does not follow the expected tagging scheme."""

    def __init__(self, tag_sequence=None):
        super().__init__()
        # The offending sequence of string tags, or None when it was not supplied.
        self.tag_sequence = tag_sequence

    def __str__(self):
        # `tag_sequence` defaults to None; joining None would raise a TypeError while the
        # exception is being reported, so fall back to an empty sequence.
        return " ".join(self.tag_sequence or [])


T = str


def enumerate_spans(
        sentence: List[T],
        offset: int = 0,
        max_span_width: Optional[int] = None,
        min_span_width: int = 1,
        filter_function: Optional[Callable[[List[T]], bool]] = None,
) -> List[Tuple[int, int]]:
    """
    Given a sentence, return all token spans within the sentence. Spans are `inclusive`.
    Additionally, you can provide a maximum and minimum span width, which will be used
    to exclude spans outside of this range.

    Finally, you can provide a function mapping `List[T] -> bool`, which will
    be applied to every span to decide whether that span should be included.

    # Parameters

    sentence : `List[T]`, required.
        The sentence to generate spans for.
    offset : `int`, optional (default = `0`)
        A numeric offset to add to all span start and end indices. This is helpful
        if the sentence is part of a larger structure, such as a document, which
        the indices need to respect.
    max_span_width : `int`, optional (default = `None`)
        The maximum length of spans which should be included. Defaults to len(sentence).
    min_span_width : `int`, optional (default = `1`)
        The minimum length of spans which should be included. Defaults to 1.
    filter_function : `Callable[[List[T]], bool]`, optional (default = `None`)
        A function mapping sequences of the passed type T to a boolean value.
        If `True`, the span is included in the returned spans from the
        sentence, otherwise it is excluded.

    # Returns

    `List[Tuple[int, int]]`
        The `(start, end)` spans, both inclusive, ordered by start and then by end.
    """
    max_span_width = max_span_width or len(sentence)
    filter_function = filter_function or (lambda x: True)
    spans: List[Tuple[int, int]] = []

    for start_index in range(len(sentence)):
        last_end_index = min(start_index + max_span_width, len(sentence))
        first_end_index = min(start_index + min_span_width - 1, len(sentence))
        for end_index in range(first_end_index, last_end_index):
            start = offset + start_index
            end = offset + end_index
            # add 1 to end index because span indices are inclusive.
            if filter_function(sentence[slice(start_index, end_index + 1)]):
                spans.append((start, end))
    return spans


def bio_tags_to_spans(
        tag_sequence: List[str], classes_to_ignore: Optional[List[str]] = None
) -> List[TypedStringSpan]:
    """
    Given a sequence corresponding to BIO tags, extracts spans.
    Spans are inclusive and can be of zero length, representing a single word span.
    Ill-formed spans are also included (i.e those which do not start with a "B-LABEL"),
    as otherwise it is possible to get a perfect precision score whilst still predicting
    ill-formed spans in addition to the correct spans. This function works properly when
    the spans are unlabeled (i.e., your labels are simply "B", "I", and "O").

    # Parameters

    tag_sequence : `List[str]`, required.
        The string tags of a sequence, e.g. ["B-PER", "I-PER", "O"].
    classes_to_ignore : `List[str]`, optional (default = `None`).
        A list of string class labels `excluding` the bio tag
        which should be ignored when extracting spans.

    # Returns

    spans : `List[TypedStringSpan]`
        The typed, extracted spans from the sequence, in the format (label, (span_start, span_end)).
        Note that the label `does not` contain any BIO tag prefixes. The spans are collected in a
        set, so their order in the returned list is not defined.

    # Raises

    `InvalidTagSequence`
        If a tag does not start with "B", "I" or "O".
    """
    classes_to_ignore = classes_to_ignore or []
    spans: Set[Tuple[str, Tuple[int, int]]] = set()
    span_start = 0
    span_end = 0
    active_conll_tag = None
    for index, string_tag in enumerate(tag_sequence):
        # Actual BIO tag.
        bio_tag = string_tag[0]
        if bio_tag not in ["B", "I", "O"]:
            raise InvalidTagSequence(tag_sequence)
        conll_tag = string_tag[2:]
        if bio_tag == "O" or conll_tag in classes_to_ignore:
            # The span has ended.
            if active_conll_tag is not None:
                spans.add((active_conll_tag, (span_start, span_end)))
            active_conll_tag = None
            # We don't care about tags we are
            # told to ignore, so we do nothing.
            continue
        elif bio_tag == "B":
            # We are entering a new span; reset indices
            # and active tag to new span.
            if active_conll_tag is not None:
                spans.add((active_conll_tag, (span_start, span_end)))
            active_conll_tag = conll_tag
            span_start = index
            span_end = index
        elif bio_tag == "I" and conll_tag == active_conll_tag:
            # We're inside a span.
            span_end += 1
        else:
            # This is the case the bio label is an "I", but either:
            # 1) the span hasn't started - i.e. an ill formed span.
            # 2) The span is an I tag for a different conll annotation.
            # We'll process the previous span if it exists, but also
            # include this span. This is important, because otherwise,
            # a model may get a perfect F1 score whilst still including
            # false positive ill-formed spans.
            if active_conll_tag is not None:
                spans.add((active_conll_tag, (span_start, span_end)))
            active_conll_tag = conll_tag
            span_start = index
            span_end = index
    # Last token might have been a part of a valid span.
    if active_conll_tag is not None:
        spans.add((active_conll_tag, (span_start, span_end)))
    return list(spans)
def iob1_tags_to_spans(
        tag_sequence: List[str], classes_to_ignore: List[str] = None
) -> List[TypedStringSpan]:
    """
    Extract spans from a sequence of IOB1 tags. Span boundaries are inclusive, so a
    single-token span has equal start and end. Ill-formed spans (a "B-LABEL" that is not
    preceded by "I-LABEL" or "B-LABEL") are extracted as well.

    # Parameters

    tag_sequence : `List[str]`, required.
        The string tags of a sequence, e.g. ["I-PER", "I-PER", "O"].
    classes_to_ignore : `List[str]`, optional (default = `None`).
        Class labels, without the tag prefix, whose spans are not extracted.

    # Returns

    spans : `List[TypedStringSpan]`
        The extracted (label, (span_start, span_end)) spans, labels without tag prefix.
        They are collected in a set, so their order is not defined.
    """
    ignored = classes_to_ignore or []
    found: Set[Tuple[str, Tuple[int, int]]] = set()
    first = 0
    last = 0
    open_label = None
    previous_tag = None
    previous_label = None

    for position, full_tag in enumerate(tag_sequence):
        tag, label = full_tag[0], full_tag[2:]
        if tag not in ["B", "I", "O"]:
            raise InvalidTagSequence(tag_sequence)
        if tag == "O" or label in ignored:
            # Whatever span was open ends here.
            if open_label is not None:
                found.add((open_label, (first, last)))
            open_label = None
        elif _iob1_start_of_chunk(previous_tag, previous_label, tag, label):
            # Close the open span, if any, and open a new one at this token.
            if open_label is not None:
                found.add((open_label, (first, last)))
            open_label = label
            first = last = position
        else:
            # An "I" tag continuing the current span.
            last += 1
        previous_tag, previous_label = tag, label

    # The sequence may end while a span is still open.
    if open_label is not None:
        found.add((open_label, (first, last)))
    return list(found)


def _iob1_start_of_chunk(
        prev_bio_tag: Optional[str],
        prev_conll_tag: Optional[str],
        curr_bio_tag: str,
        curr_conll_tag: str,
) -> bool:
    """Tell whether the current IOB1 tag opens a new chunk, given the previous tag."""
    return (
        curr_bio_tag == "B"
        or (curr_bio_tag == "I" and prev_bio_tag == "O")
        or (curr_bio_tag != "O" and prev_conll_tag != curr_conll_tag)
    )


def _boundary_tags_to_spans(tag_sequence, classes_to_ignore, single_tag, end_tag):
    """Shared decoder for schemes with an explicit single-token tag and an explicit end tag.

    A span is either one `single_tag` token, or "B" followed by any number of "I" and closed by
    `end_tag`; the label of a multi-token span is taken from its closing tag.
    """
    ignored = classes_to_ignore or []
    collected = []
    position = 0
    total = len(tag_sequence)
    while position < total:
        tag = tag_sequence[position]
        head = tag[0]
        if head == single_tag:
            collected.append((tag.partition("-")[2], (position, position)))
        elif head == "B":
            begin = position
            while tag[0] != end_tag:
                position += 1
                if position >= total:
                    # The sequence ended inside an open span.
                    raise InvalidTagSequence(tag_sequence)
                tag = tag_sequence[position]
                if tag[0] != "I" and tag[0] != end_tag:
                    raise InvalidTagSequence(tag_sequence)
            collected.append((tag.partition("-")[2], (begin, position)))
        elif tag != "O":
            raise InvalidTagSequence(tag_sequence)
        position += 1
    return [span for span in collected if span[0] not in ignored]


def bioul_tags_to_spans(
        tag_sequence: List[str], classes_to_ignore: List[str] = None
) -> List[TypedStringSpan]:
    """
    Extract spans from a sequence of BIOUL tags. Span boundaries are inclusive.
    Ill-formed sequences are rejected with `InvalidTagSequence`. Unlabeled tags
    ("B", "I", "O", "U", "L") are supported.

    # Parameters

    tag_sequence : `List[str]`, required.
        The tag sequence encoded in BIOUL, e.g. ["B-PER", "L-PER", "O"].
    classes_to_ignore : `List[str]`, optional (default = `None`).
        Class labels, without the tag prefix, whose spans are dropped from the result.

    # Returns

    spans : `List[TypedStringSpan]`
        The extracted (label, (span_start, span_end)) spans in order of appearance.
    """
    return _boundary_tags_to_spans(tag_sequence, classes_to_ignore, "U", "L")


def iobes_tags_to_spans(
        tag_sequence: List[str], classes_to_ignore: List[str] = None
) -> List[TypedStringSpan]:
    """
    Extract spans from a sequence of IOBES tags. Span boundaries are inclusive.
    Ill-formed sequences are rejected with `InvalidTagSequence`. Unlabeled tags
    ("B", "I", "O", "E", "S") are supported.

    # Parameters

    tag_sequence : `List[str]`, required.
        The tag sequence encoded in IOBES, e.g. ["B-PER", "E-PER", "O"].
    classes_to_ignore : `List[str]`, optional (default = `None`).
        Class labels, without the tag prefix, whose spans are dropped from the result.

    # Returns

    spans : `List[TypedStringSpan]`
        The extracted (label, (span_start, span_end)) spans in order of appearance.
    """
    return _boundary_tags_to_spans(tag_sequence, classes_to_ignore, "S", "E")
def iob1_to_bioul(tag_sequence: List[str]) -> List[str]:
    """Deprecated alias of `to_bioul` with the default IOB1 encoding; emits a `FutureWarning`."""
    warnings.warn(
        "iob1_to_bioul has been replaced with 'to_bioul' to allow more encoding options.",
        FutureWarning,
    )
    return to_bioul(tag_sequence)


def to_bioul(tag_sequence: List[str], encoding: str = "IOB1") -> List[str]:
    """
    Recode a tag sequence from IOB1 or BIO into BIOUL.

    In IOB1, "I" marks a token inside a span, "O" a token outside any span, and "B" the start
    of a span that immediately follows another span of the same type. In BIO, "B" marks the
    start of every span.

    # Parameters

    tag_sequence : `List[str]`, required.
        The tag sequence in the source encoding, e.g. ["I-PER", "I-PER", "O"].
    encoding : `str`, optional, (default = `"IOB1"`).
        The encoding type to convert from. Must be either "IOB1" or "BIO".

    # Returns

    bioul_sequence : `List[str]`
        The tag sequence encoded in BIOUL, e.g. ["B-PER", "L-PER", "O"].

    # Raises

    `ValueError`
        If `encoding` is neither "IOB1" nor "BIO".
    `InvalidTagSequence`
        If a tag starts with something other than "B", "I" or "O", or, for "BIO", if an
        "I" tag does not continue a span of the same type.
    """
    if encoding not in {"IOB1", "BIO"}:
        raise ValueError(f"Invalid encoding {encoding} passed to 'to_bioul'.")

    def recode(tag, prefix):
        # 'I-PER' recoded with prefix 'U' gives 'U-PER'; an unlabeled 'I' gives 'U'.
        _, separator, entity = tag.partition("-")
        return prefix + separator + entity

    def flush(pending, output):
        # Emit the pending span as U (one token) or B, I..., L (several tokens), then empty it.
        if len(pending) == 1:
            output.append(recode(pending[0], "U"))
        else:
            output.append(recode(pending[0], "B"))
            output.extend(recode(tag, "I") for tag in pending[1:-1])
            output.append(recode(pending[-1], "L"))
        pending.clear()

    recoded: List[str] = []
    pending: List[str] = []  # tags of the span currently being read

    for tag in tag_sequence:
        if tag == "O":
            if pending:
                flush(pending, recoded)
            recoded.append(tag)
        elif tag[0] == "I":
            if not pending:
                # In BIO a span cannot start with "I".
                if encoding == "BIO":
                    raise InvalidTagSequence(tag_sequence)
                pending.append(tag)
            elif tag.partition("-")[2] == pending[-1].partition("-")[2]:
                # Same entity type: the span continues.
                pending.append(tag)
            else:
                # A different entity type starts a new span, which BIO does not allow for "I".
                if encoding == "BIO":
                    raise InvalidTagSequence(tag_sequence)
                flush(pending, recoded)
                pending.append(tag)
        elif tag[0] == "B":
            if pending:
                flush(pending, recoded)
            pending.append(tag)
        else:
            raise InvalidTagSequence(tag_sequence)

    if pending:
        flush(pending, recoded)
    return recoded
def bmes_tags_to_spans(
        tag_sequence: List[str], classes_to_ignore: List[str] = None
) -> List[TypedStringSpan]:
    """
    Extract spans from a sequence of BMES tags. Span boundaries are inclusive, so a
    single-token span has equal start and end. Ill-formed spans (those not opened by a
    "B-LABEL") are kept on a best-effort basis rather than rejected. Unlabeled tags
    ("B", "M", "E", "S") are supported.

    # Parameters

    tag_sequence : `List[str]`, required.
        The string tags of a sequence, e.g. ["B-PER", "E-PER", "S-LOC"].
    classes_to_ignore : `List[str]`, optional (default = `None`).
        Class labels, without the tag prefix, whose spans are dropped from the result.

    # Returns

    spans : `List[TypedStringSpan]`
        The extracted (label, (span_start, span_end)) spans in order of appearance,
        labels without tag prefix.
    """
    # Each entry is a mutable [label, start, end] triple; the last one may still grow.
    open_spans: List[list] = []
    previous_prefix: Optional[str] = None

    for position, tag in enumerate(tag_sequence):
        prefix = tag[0]
        label = tag[2:]
        continues_span = (
            prefix in ("M", "E")
            and previous_prefix in ("B", "M")
            and open_spans[-1][0] == label
        )
        if continues_span:
            # Valid B/M -> M/E transition with a matching label: stretch the current span.
            open_spans[-1][2] = position
        else:
            # "B" and "S" always open a span; an invalid continuation is split off as its own span.
            open_spans.append([label, position, position])
        previous_prefix = prefix

    ignored = classes_to_ignore or []
    return [
        (label, (start, end))
        for label, start, end in open_spans
        if label not in ignored
    ]
(span[0], (span[1][0], span[1][1])) for span in spans if span[0] not in classes_to_ignore ] ================================================ FILE: hanlp/utils/string_util.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2019-08-25 00:19 import unicodedata from typing import List, Dict, Tuple def format_scores(results: Dict[str, float]) -> str: return ' - '.join(f'{k}: {v:.4f}' for (k, v) in results.items()) def ispunct(token): return all(unicodedata.category(char).startswith('P') for char in token) def split_long_sentence_into(tokens: List[str], max_seq_length, sent_delimiter=None, char_level=False, hard_constraint=False): punct_offset = [i for i, x in enumerate(tokens) if ((sent_delimiter and x in sent_delimiter) or (not sent_delimiter and ispunct(x)))] if not punct_offset: # treat every token as punct punct_offset = [i for i in range(len(tokens))] punct_offset += [len(tokens)] token_to_char_offset = [] if char_level: offset = 0 for token in tokens: token_to_char_offset.append(offset) offset += len(token) token_to_char_offset.append(offset) start = 0 for i, offset in enumerate(punct_offset[:-1]): end = punct_offset[i + 1] length_at_next_punct = _len(start, end, token_to_char_offset, char_level) if length_at_next_punct >= max_seq_length: if hard_constraint: yield from _gen_short_sent(tokens, start, offset, max_seq_length, token_to_char_offset, char_level) else: yield tokens[start: offset + 1] start = offset + 1 offset = punct_offset[-1] if start < offset: offset -= 1 length_at_next_punct = _len(start, offset, token_to_char_offset, char_level) if length_at_next_punct >= max_seq_length and hard_constraint: yield from _gen_short_sent(tokens, start, offset, max_seq_length, token_to_char_offset, char_level) else: yield tokens[start:] def _gen_short_sent(tokens, start, offset, max_seq_length, token_to_char_offset, char_level): while start <= offset: for j in range(offset + 1, start, -1): if _len(start, j, 
def split_long_sent(sent, delimiters, max_seq_length):
    """Split a sentence into shorter chunks, cutting only right after a delimiter.

    The sentence is first cut into parts that each end with a delimiter; any
    text left after the last delimiter forms a final part. Consecutive parts
    are then merged greedily: a chunk is emitted as soon as adding the next
    part would make it longer than ``max_seq_length``. A single part that is
    itself longer than the limit is emitted as-is, it is never cut in the
    middle.

    Args:
        sent: A sliceable sequence (a string or a list of tokens).
        delimiters: Container of items after which a cut is allowed.
        max_seq_length: Length limit used when merging parts.

    Yields:
        ``sent`` itself, unchanged, when it contains no delimiter at all;
        otherwise lists holding the items of each chunk.
    """
    parts = []
    offset = 0
    for idx, char in enumerate(sent):
        if char in delimiters:
            parts.append(sent[offset:idx + 1])
            offset = idx + 1
    if not parts:
        # No delimiter found: hand the input back untouched.
        yield sent
        return
    if offset < len(sent):
        # Keep the tail that follows the last delimiter. It used to be
        # dropped silently, losing the end of the sentence.
        parts.append(sent[offset:])
    short = []
    last = len(parts) - 1
    for idx, part in enumerate(parts):
        short += part
        if idx == last:
            yield short
        elif len(short) + len(parts[idx + 1]) > max_seq_length:
            yield short
            short = []
def set_tf_loglevel(level=logging.ERROR):
    """Silence TensorFlow's native and Python logging below ``level``.

    The Python logging level is translated into the value stored in the
    ``TF_CPP_MIN_LOG_LEVEL`` / ``TF_CPP_MIN_VLOG_LEVEL`` environment variables:
    ``'3'`` for FATAL and above, ``'2'`` for ERROR, ``'1'`` for WARNING and
    ``'0'`` otherwise. The branches are mutually exclusive; previously they
    were independent ``if`` statements, so FATAL and ERROR were always
    overwritten with ``'1'``.

    NOTE(review): the variables are presumably read by TensorFlow's native
    runtime when it is first loaded, so this should run before
    ``import tensorflow`` - confirm against the TensorFlow documentation.

    Args:
        level: A ``logging`` level (Default value = logging.ERROR)
    """
    if level >= logging.FATAL:
        cpp_level = '3'
    elif level >= logging.ERROR:
        cpp_level = '2'
    elif level >= logging.WARNING:
        cpp_level = '1'
    else:
        cpp_level = '0'
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = cpp_level
    os.environ['TF_CPP_MIN_VLOG_LEVEL'] = cpp_level
    shut_up_python_logging()
    logging.getLogger('tensorflow').setLevel(level)
def str_tensor_to_list(pred):
    """Decode every item of ``pred`` from UTF-8 bytes into a ``str``.

    The previous implementation called ``tag.predict('utf-8')``, a method that
    byte strings do not have, so every call raised ``AttributeError``.

    Args:
        pred: An iterable of byte strings (presumably the ``.numpy()`` view of
            a 1-D string tensor - verify against callers).

    Returns:
        A list with the decoded strings, in the same order.
    """
    return [tag.decode('utf-8') for tag in pred]
def human_time_delta(days, hours, minutes, seconds, delimiter=' ') -> str:
    """Render a duration as text such as ``1 h 0 m 5 s``.

    Leading zero units are skipped. Once the first non-zero unit has been
    seen, every smaller unit is printed, even when it is zero. A duration
    whose units are all zero is rendered as ``0 s``.

    Args:
        days: Number of days.
        hours: Number of hours.
        minutes: Number of minutes.
        seconds: Number of seconds.
        delimiter: Separator placed between units (Default value = ' ')

    Returns:
        The formatted duration.
    """
    pieces = []
    started = False
    # Units are listed from largest to smallest, each with its one-letter suffix.
    for amount, suffix in ((days, 'd'), (hours, 'h'), (minutes, 'm'), (seconds, 's')):
        started = started or bool(amount)
        if started:
            pieces.append('{} {}'.format(amount, suffix))
    return delimiter.join(pieces) if started else '0 s'
def update(self, n=1):
    """Advance the progress counter by ``n`` steps.

    The counter never exceeds ``self.total``. Whenever it sits at the total
    after the update, the time elapsed since ``self.start`` is stored in
    ``self.finished_in``.

    Args:
        n: Number of steps completed since the last call (Default value = 1)
    """
    progressed = self.current + n
    # Clamp so that overshooting steps cannot push the counter past the total.
    self.current = progressed if progressed < self.total else self.total
    if self.current != self.total:
        return
    self.finished_in = time.time() - self.start
def now_filename(fmt="%y%m%d_%H%M%S"):
    """Format the current local time so it can be used inside a file name.

    Args:
        fmt: A ``strftime`` pattern (Default value = "%y%m%d_%H%M%S"). The
            default uses a two-digit year and yields e.g. ``180102_030405``.

    Returns:
        The current time rendered with ``fmt``.
    """
    return datetime.datetime.now().strftime(fmt)
def gpus_available() -> Dict[int, float]:
    """Report the fraction of free memory on every GPU visible to this process.

    Device ids in the result are logical ids, i.e. positions in
    ``CUDA_VISIBLE_DEVICES`` when that variable is set, while memory is
    queried through NVML using the ids listed there.

    Returns:
        A dict mapping logical device id to ``free / total`` memory, ordered
        from the most free to the least free device. An empty dict is returned
        when CUDA is unavailable. If the NVML query fails for any reason, every
        device reported by ``torch.cuda.device_count()`` is assumed to be
        completely free (ratio ``1.0``).
    """
    if not torch.cuda.is_available():
        return dict()
    try:
        nvmlInit()
        try:
            gpus = {}
            visible_devices = os.environ.get('CUDA_VISIBLE_DEVICES', None)
            if visible_devices is None:
                visible_devices = list(range(nvmlDeviceGetCount()))
            else:
                # Keep the listed order: the position of an id is the logical
                # id of that device. A set would iterate in arbitrary order
                # and pair logical ids with the wrong devices (e.g. "1,0").
                visible_devices = [int(x.strip()) for x in visible_devices.split(',')]
            for i, real_id in enumerate(visible_devices):
                info = nvmlDeviceGetMemoryInfo(nvmlDeviceGetHandleByIndex(real_id))
                gpus[i] = info.free / info.total
        finally:
            # Release NVML even when a query above raised.
            nvmlShutdown()
        return dict(sorted(gpus.items(), key=lambda x: x[1], reverse=True))
    except Exception as e:
        # Best effort: fall back to treating all devices as completely free.
        logger.debug(f'Failed to get gpu info due to {e}')
        return dict((i, 1.0) for i in range(torch.cuda.device_count()))
def dtype_of(e: Union[int, bool, float]):
    """Pick the torch dtype that corresponds to a Python scalar.

    Args:
        e: A ``bool``, ``int`` or ``float`` value.

    Returns:
        ``torch.bool``, ``torch.long`` or ``torch.float`` respectively.

    Raises:
        ValueError: If ``e`` is none of the supported types.
    """
    # bool comes first because it is a subclass of int.
    candidates = ((bool, torch.bool), (int, torch.long), (float, torch.float))
    for python_type, torch_dtype in candidates:
        if isinstance(e, python_type):
            return torch_dtype
    raise ValueError(f'Unsupported type of {repr(e)}')
def clip_grad_norm(model: nn.Module, grad_norm, transformer: nn.Module = None, transformer_grad_norm=None):
    """Clip the gradients of ``model``, optionally with a separate norm for a transformer.

    Only parameters with ``requires_grad`` set are considered.

    Args:
        model: The module whose gradients are clipped in place.
        grad_norm: Maximum gradient norm. ``None`` disables clipping for the
            parameters it would apply to.
        transformer: Sub-module whose parameters are clipped separately. It is
            only used when ``transformer_grad_norm`` is given.
        transformer_grad_norm: Maximum gradient norm of the transformer
            parameters. When ``None``, all parameters are clipped together
            with ``grad_norm``.
    """
    if transformer_grad_norm is None:
        if grad_norm is not None:
            nn.utils.clip_grad_norm_([p for p in model.parameters() if p.requires_grad], grad_norm)
        return

    transformer_params = set(transformer.parameters())
    is_transformer = []
    non_transformer = []
    for p in model.parameters():
        if not p.requires_grad:
            continue
        if p in transformer_params:
            is_transformer.append(p)
        else:
            non_transformer.append(p)
    # Same guard as the branch above: a missing grad_norm means "do not clip"
    # rather than passing None to clip_grad_norm_, which raises.
    if grad_norm is not None:
        nn.utils.clip_grad_norm_(non_transformer, grad_norm)
    nn.utils.clip_grad_norm_(is_transformer, transformer_grad_norm)
def lengths_to_mask(seq_len, max_len=None):
    r"""Turn a 1-D tensor of sequence lengths into a boolean mask.

    Position ``j`` of row ``i`` is ``True`` iff ``j < seq_len[i]``.

    .. code-block::

        >>> seq_len = torch.arange(2, 16)
        >>> mask = lengths_to_mask(seq_len)
        >>> print(mask.size())
        torch.Size([14, 15])
        >>> seq_len = torch.arange(2, 16)
        >>> mask = lengths_to_mask(seq_len, max_len=100)
        >>> print(mask.size())
        torch.Size([14, 100])

    :param torch.LongTensor seq_len: (B,)
    :param int max_len: Width of the mask. Defaults to the largest value in ``seq_len``.
    :return: torch.Tensor (B, max_len)
    """
    # Report the offending number of dimensions (the message used to print a boolean).
    assert seq_len.dim() == 1, f"seq_len can only have one dimension, got {seq_len.dim()}."
    batch_size = seq_len.size(0)
    max_len = int(max_len) if max_len else seq_len.max().long()
    # One row of positions 0..max_len-1 per sequence, on the device and dtype of seq_len.
    broad_cast_seq_len = torch.arange(max_len).expand(batch_size, -1).to(seq_len)
    mask = broad_cast_seq_len.lt(seq_len.unsqueeze(1))
    return mask
Currently, HanLP 2.0 is in alpha stage with more killer features on the roadmap. Discussions are welcomed on our [forum](https://bbs.hankcs.com/), while bug reports and feature requests are reserved for GitHub issues. For Java users, please checkout the [1.x](https://github.com/hankcs/HanLP/tree/1.x) branch. ## Installation ```bash pip install hanlp ``` ## License HanLP is licensed under **Apache License 2.0**. You can use HanLP in your commercial products for free. We would appreciate it if you add a link to HanLP on your website. ================================================ FILE: plugins/hanlp_common/__init__.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2020-12-16 22:20 ================================================ FILE: plugins/hanlp_common/hanlp_common/__init__.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2020-12-16 22:21 ================================================ FILE: plugins/hanlp_common/hanlp_common/amr.py ================================================ # MIT License # # Copyright (c) 2019 Sheng Zhang # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
def is_english_punct(c):
    """Tell whether ``c`` is exactly one English punctuation character.

    Recognised characters are ``, . ? ! : ; " ' - ( ) { } [ ]``.

    Args:
        c: The string to test.

    Returns:
        A match object (truthy) when ``c`` is one of the characters above,
        otherwise ``None``.
    """
    # The hyphen must be escaped: written as '-( it formed a character range
    # between the quote and the parenthesis, so '-' itself was never matched.
    return re.search(r'^[,.?!:;"\'\-(){}\[\]]$', c)
def is_named_entity(self, index): return self.ner_tags[index] not in ('0', 'O') def get_named_entity_span(self, index): if self.ner_tags is None or not self.is_named_entity(index): return [] span = [index] tag = self.ner_tags[index] prev = index - 1 while prev > 0 and self.ner_tags[prev] == tag: span.append(prev) prev -= 1 next = index + 1 while next < len(self.ner_tags) and self.ner_tags[next] == tag: span.append(next) next += 1 return span def find_span_indexes(self, span): for i, token in enumerate(self.tokens): if token == span[0]: _span = self.tokens[i: i + len(span)] if len(_span) == len(span) and all(x == y for x, y in zip(span, _span)): return list(range(i, i + len(span))) return None def replace_span(self, indexes, new, pos=None, ner=None): self.tokens = self.tokens[:indexes[0]] + new + self.tokens[indexes[-1] + 1:] self.lemmas = self.lemmas[:indexes[0]] + new + self.lemmas[indexes[-1] + 1:] if pos is None: pos = [self.pos_tags[indexes[0]]] self.pos_tags = self.pos_tags[:indexes[0]] + pos + self.pos_tags[indexes[-1] + 1:] if ner is None: ner = [self.ner_tags[indexes[0]]] self.ner_tags = self.ner_tags[:indexes[0]] + ner + self.ner_tags[indexes[-1] + 1:] def remove_span(self, indexes): self.replace_span(indexes, [], [], []) def __repr__(self): fields = [] for k, v in dict( id=self.id, snt=self.sentence, tokens=self.tokens, lemmas=self.lemmas, pos_tags=self.pos_tags, ner_tags=self.ner_tags, abstract_map=self.abstract_map, misc=self.misc, graph=self.graph ).items(): if v is None: continue if k == 'misc': fields += v elif k == 'graph': fields.append(str(v)) else: if not isinstance(v, str): v = json.dumps(v) fields.append('# ::{} {}'.format(k, v)) return '\n'.join(fields) def get_src_tokens(self): return self.lemmas if self.lemmas else self.sentence.split() class AMRNode: attribute_priority = [ 'instance', 'quant', 'mode', 'value', 'name', 'li', 'mod', 'frequency', 'month', 'day', 'year', 'time', 'unit', 'decade', 'poss' ] def __init__(self, identifier, 
attributes=None, copy_of=None): self.identifier = identifier if attributes is None: self.attributes = [] else: self.attributes = attributes # self._sort_attributes() self._num_copies = 0 self.copy_of = copy_of def _sort_attributes(self): def get_attr_priority(attr): if attr in self.attribute_priority: return self.attribute_priority.index(attr), attr if not re.search(r'^(ARG|op|snt)', attr): return len(self.attribute_priority), attr else: return len(self.attribute_priority) + 1, attr self.attributes.sort(key=lambda x: get_attr_priority(x[0])) def __hash__(self): return hash(self.identifier) def __eq__(self, other): if not isinstance(other, AMRNode): return False return self.identifier == other.identifier def __repr__(self): ret = str(self.identifier) for k, v in self.attributes: if k == 'instance': ret += ' / ' + v break return ret def __str__(self): ret = repr(self) for key, value in self.attributes: if key == 'instance': continue ret += '\n\t:{} {}'.format(key, value) return ret @property def instance(self): for key, value in self.attributes: if key == 'instance': return value else: return None @property def ops(self): ops = [] for key, value in self.attributes: if re.search(r'op\d+', key): ops.append((int(key[2:]), value)) if len(ops): ops.sort(key=lambda x: x[0]) return [v for k, v in ops] def copy(self): attributes = None if self.attributes is not None: attributes = self.attributes[:] self._num_copies += 1 copy = AMRNode(self.identifier + '_copy_{}'.format(self._num_copies), attributes, self) return copy def remove_attribute(self, attr, value): self.attributes.remove((attr, value)) def add_attribute(self, attr, value): self.attributes.append((attr, value)) def replace_attribute(self, attr, old, new): index = self.attributes.index((attr, old)) self.attributes[index] = (attr, new) def get_frame_attributes(self): for k, v in self.attributes: if isinstance(v, str) and re.search(r'-\d\d$', v): yield k, v def get_senseless_attributes(self): for k, v in 
self.attributes: if isinstance(v, str) and not re.search(r'-\d\d$', v): yield k, v class AMRGraph(penman.Graph): edge_label_priority = ( 'mod name time location degree poss domain quant manner unit purpose topic condition part-of compared-to ' 'duration source ord beneficiary concession direction frequency consist-of example medium location-of ' 'manner-of quant-of time-of instrument prep-in destination accompanier prep-with extent instrument-of age ' 'path concession-of subevent-of prep-as prep-to prep-against prep-on prep-for degree-of prep-under part ' 'condition-of prep-without topic-of season duration-of poss-of prep-from prep-at range purpose-of source-of ' 'subevent example-of value path-of scale conj-as-if prep-into prep-by prep-on-behalf-of medium-of prep-among ' 'calendar beneficiary-of prep-along-with extent-of age-of frequency-of dayperiod accompanier-of ' 'destination-of prep-amid prep-toward prep-in-addition-to ord-of name-of weekday direction-of prep-out-of ' 'timezone subset-of'.split()) def __init__(self, penman_graph): super(AMRGraph, self).__init__() self._triples = penman_graph._triples self._top = penman_graph._top self._build_extras() self._src_tokens = [] def __str__(self): self._triples = penman.alphanum_order(self._triples) return amr_codec.encode(self) def _build_extras(self): G = nx.DiGraph() self.variable_to_node = {} for v in self.variables(): if type(v) is not str: continue attributes = [(t.relation, t.target) for t in self.attributes(source=v)] node = AMRNode(v, attributes) G.add_node(node) self.variable_to_node[v] = node edge_set = set() for edge in self.edges(): if type(edge.source) is not str: continue source = self.variable_to_node[edge.source] target = self.variable_to_node[edge.target] relation = edge.relation if relation == 'instance': continue if source == target: continue if edge.inverted: source, target, relation = target, source, amr_codec.invert_relation(edge.relation) if (source, target) in edge_set: target = 
target.copy() edge_set.add((source, target)) G.add_edge(source, target, label=relation) self._G = G def attributes(self, source=None, relation=None, target=None): # Refine attributes because there's a bug in penman.attributes() # See https://github.com/goodmami/penman/issues/29 attrmatch = lambda a: ( (source is None or source == a.source) and (relation is None or relation == a.relation) and (target is None or target == a.target) ) variables = self.variables() attrs = [t for t in self.triples() if t.target not in variables or t.relation == 'instance'] return list(filter(attrmatch, attrs)) def _update_penman_graph(self, triples): self._triples = triples if self._top not in self.variables(): self._top = None def is_name_node(self, node): edges = list(self._G.in_edges(node)) return any(self._G[source][target].get('label', None) == 'name' for source, target in edges) def get_name_node_type(self, node): edges = list(self._G.in_edges(node)) for source, target in edges: if self._G[source][target].get('label', None) == 'name': return source.instance raise KeyError def get_name_node_wiki(self, node): edges = list(self._G.in_edges(node)) for source, target in edges: if self._G[source][target].get('label', None) == 'name': for attr, value in source.attributes: if attr == 'wiki': if value != '-': value = value[1:-1] # remove quotes return value return None def set_name_node_wiki(self, node, wiki): edges = list(self._G.in_edges(node)) parent = None for source, target in edges: if self._G[source][target].get('label', None) == 'name': parent = source break if parent: if wiki != '-': wiki = '"{}"'.format(wiki) self.add_node_attribute(parent, 'wiki', wiki) def is_date_node(self, node): return node.instance == 'date-entity' def add_edge(self, source, target, label): self._G.add_edge(source, target, label=label) t = penman.Triple(source=source.identifier, relation=label, target=target.identifier) triples = self._triples + [t] triples = penman.alphanum_order(triples) 
self._update_penman_graph(triples) def remove_edge(self, x, y): if isinstance(x, AMRNode) and isinstance(y, AMRNode): self._G.remove_edge(x, y) if isinstance(x, AMRNode): x = x.identifier if isinstance(y, AMRNode): y = y.identifier triples = [t for t in self._triples if not (t.source == x and t.target == y)] self._update_penman_graph(triples) def update_edge_label(self, x, y, old, new): self._G[x][y]['label'] = new triples = [] for t in self._triples: if t.source == x.identifier and t.target == y.identifier and t.relation == old: t = Triple(x.identifier, new, y.identifier) triples.append(t) self._update_penman_graph(triples) def add_node(self, instance): identifier = instance[0] assert identifier.isalpha() if identifier in self.variables(): i = 2 while identifier + str(i) in self.variables(): i += 1 identifier += str(i) triples = self._triples + [Triple(identifier, 'instance', instance)] self._triples = penman.alphanum_order(triples) node = AMRNode(identifier, [('instance', instance)]) self._G.add_node(node) return node def remove_node(self, node): self._G.remove_node(node) triples = [t for t in self._triples if t.source != node.identifier] self._update_penman_graph(triples) def replace_node_attribute(self, node, attr, old, new): node.replace_attribute(attr, old, new) triples = [] found = False for t in self._triples: if t.source == node.identifier and t.relation == attr and t.target == old: found = True t = penman.Triple(source=node.identifier, relation=attr, target=new) triples.append(t) if not found: raise KeyError self._triples = penman.alphanum_order(triples) def remove_node_attribute(self, node, attr, value): node.remove_attribute(attr, value) triples = [t for t in self._triples if not (t.source == node.identifier and t.relation == attr and t.target == value)] self._update_penman_graph(triples) def add_node_attribute(self, node, attr, value): node.add_attribute(attr, value) t = penman.Triple(source=node.identifier, relation=attr, target=value) self._triples = 
penman.alphanum_order(self._triples + [t]) def remove_node_ops(self, node): ops = [] for attr, value in node.attributes: if re.search(r'^op\d+$', attr): ops.append((attr, value)) for attr, value in ops: self.remove_node_attribute(node, attr, value) def remove_subtree(self, root): children = [] removed_nodes = set() for _, child in list(self._G.edges(root)): self.remove_edge(root, child) children.append(child) for child in children: if len(list(self._G.in_edges(child))) == 0: removed_nodes.update(self.remove_subtree(child)) if len(list(self._G.in_edges(root))) == 0: self.remove_node(root) removed_nodes.add(root) return removed_nodes def get_subtree(self, root, max_depth): if max_depth == 0: return [] nodes = [root] children = [child for _, child in self._G.edges(root)] nodes += children for child in children: if len(list(self._G.in_edges(child))) == 1: nodes = nodes + self.get_subtree(child, max_depth - 1) return nodes def get_nodes(self): return self._G.nodes def get_edges(self): return self._G.edges def set_src_tokens(self, sentence): if type(sentence) is not list: sentence = sentence.split(" ") self._src_tokens = sentence def get_src_tokens(self): return self._src_tokens def get_list_node(self, replace_copy=True): visited = defaultdict(int) node_list = [] def dfs(node, relation, parent): node_list.append(( node if node.copy_of is None or not replace_copy else node.copy_of, relation, parent if parent.copy_of is None or not replace_copy else parent.copy_of)) if len(self._G[node]) > 0 and visited[node] == 0: visited[node] = 1 for child_node, child_relation in self.sort_edges(self._G[node].items()): dfs(child_node, child_relation["label"], node) dfs( self.variable_to_node[self._top], 'root', self.variable_to_node[self._top] ) return node_list def sort_edges(self, edges): return edges def get_tgt_tokens(self): node_list = self.get_list_node() tgt_token = [] visited = defaultdict(int) for node, relation, parent_node in node_list: instance = [attr[1] for attr in 
node.attributes if attr[0] == "instance"] assert len(instance) == 1 tgt_token.append(str(instance[0])) if len(node.attributes) > 1 and visited[node] == 0: for attr in node.attributes: if attr[0] != "instance": tgt_token.append(str(attr[1])) visited[node] = 1 return tgt_token def get_list_data(self, amr, bos=None, eos=None, bert_tokenizer=None, max_tgt_length=None): node_list = self.get_list_node() tgt_tokens = [] head_tags = [] head_indices = [] node_to_idx = defaultdict(list) visited = defaultdict(int) def update_info(node, relation, parent, token): head_indices.append(1 + node_to_idx[parent][-1]) head_tags.append(relation) tgt_tokens.append(str(token)) for node, relation, parent_node in node_list: node_to_idx[node].append(len(tgt_tokens)) instance = [attr[1] for attr in node.attributes if attr[0] == "instance"] assert len(instance) == 1 instance = instance[0] update_info(node, relation, parent_node, instance) if len(node.attributes) > 1 and visited[node] == 0: for attr in node.attributes: if attr[0] != "instance": update_info(node, attr[0], node, attr[1]) visited[node] = 1 def trim_very_long_tgt_tokens(tgt_tokens, head_tags, head_indices, node_to_idx): tgt_tokens = tgt_tokens[:max_tgt_length] head_tags = head_tags[:max_tgt_length] head_indices = head_indices[:max_tgt_length] for node, indices in node_to_idx.items(): invalid_indices = [index for index in indices if index >= max_tgt_length] for index in invalid_indices: indices.remove(index) return tgt_tokens, head_tags, head_indices, node_to_idx if max_tgt_length is not None: tgt_tokens, head_tags, head_indices, node_to_idx = trim_very_long_tgt_tokens( tgt_tokens, head_tags, head_indices, node_to_idx) copy_offset = 0 if bos: tgt_tokens = [bos] + tgt_tokens copy_offset += 1 if eos: tgt_tokens = tgt_tokens + [eos] head_indices[node_to_idx[self.variable_to_node[self.top]][0]] = 0 # Target side Coreference tgt_copy_indices = [i for i in range(len(tgt_tokens))] for node, indices in node_to_idx.items(): if len(indices) 
> 1: copy_idx = indices[0] + copy_offset for token_idx in indices[1:]: tgt_copy_indices[token_idx + copy_offset] = copy_idx tgt_copy_map = [(token_idx, copy_idx) for token_idx, copy_idx in enumerate(tgt_copy_indices)] for i, copy_index in enumerate(tgt_copy_indices): # Set the coreferred target to 0 if no coref is available. if i == copy_index: tgt_copy_indices[i] = 0 tgt_token_counter = Counter(tgt_tokens) tgt_copy_mask = [0] * len(tgt_tokens) for i, token in enumerate(tgt_tokens): if tgt_token_counter[token] > 1: tgt_copy_mask[i] = 1 def add_source_side_tags_to_target_side(_src_tokens, _src_tags): assert len(_src_tags) == len(_src_tokens) tag_counter = defaultdict(lambda: defaultdict(int)) for src_token, src_tag in zip(_src_tokens, _src_tags): tag_counter[src_token][src_tag] += 1 tag_lut = {DEFAULT_OOV_TOKEN: DEFAULT_OOV_TOKEN, DEFAULT_PADDING_TOKEN: DEFAULT_OOV_TOKEN} for src_token in set(_src_tokens): tag = max(tag_counter[src_token].keys(), key=lambda x: tag_counter[src_token][x]) tag_lut[src_token] = tag tgt_tags = [] for tgt_token in tgt_tokens: sim_token = find_similar_token(tgt_token, _src_tokens) if sim_token is not None: index = _src_tokens.index(sim_token) tag = _src_tags[index] else: tag = DEFAULT_OOV_TOKEN tgt_tags.append(tag) return tgt_tags, tag_lut # Source Copy src_tokens = self.get_src_tokens() src_token_ids = None src_token_subword_index = None src_pos_tags = amr.pos_tags src_copy_vocab = SourceCopyVocabulary(src_tokens) src_copy_indices = src_copy_vocab.index_sequence(tgt_tokens) src_copy_map = src_copy_vocab.get_copy_map(src_tokens) tgt_pos_tags, pos_tag_lut = add_source_side_tags_to_target_side(src_tokens, src_pos_tags) if bert_tokenizer is not None: src_token_ids, src_token_subword_index = bert_tokenizer.tokenize(src_tokens, True) src_must_copy_tags = [1 if is_abstract_token(t) else 0 for t in src_tokens] src_copy_invalid_ids = set(src_copy_vocab.index_sequence( [t for t in src_tokens if is_english_punct(t)])) return { "tgt_tokens": 
tgt_tokens, "tgt_pos_tags": tgt_pos_tags, "tgt_copy_indices": tgt_copy_indices, "tgt_copy_map": tgt_copy_map, "tgt_copy_mask": tgt_copy_mask, "src_tokens": src_tokens, "src_token_ids": src_token_ids, "src_token_subword_index": src_token_subword_index, "src_must_copy_tags": src_must_copy_tags, "src_pos_tags": src_pos_tags, "src_copy_vocab": src_copy_vocab, "src_copy_indices": src_copy_indices, "src_copy_map": src_copy_map, "pos_tag_lut": pos_tag_lut, "head_tags": head_tags, "head_indices": head_indices, "src_copy_invalid_ids": src_copy_invalid_ids } @classmethod def decode(cls, raw_graph_string): _graph = amr_codec.decode(raw_graph_string) return cls(_graph) @classmethod def from_lists(cls, all_list): head_tags = all_list['head_tags'] head_indices = all_list['head_indices'] tgt_tokens = all_list['tokens'] tgt_copy_indices = all_list['coref'] variables = [] variables_count = defaultdict(int) for i, token in enumerate(tgt_tokens): if tgt_copy_indices[i] != i: variables.append(variables[tgt_copy_indices[i]]) else: if token[0] in variables_count: variables.append(token[0] + str(variables_count[token[0]])) else: variables.append(token[0]) variables_count[token[0]] += 1 Triples = [] for variable, token in zip(variables, tgt_tokens): Triples.append(Triple(variable, "instance", token)) Triples.append( Triple( head_indices[variable], head_tags[variable], variable ) ) @classmethod def from_prediction(cls, prediction): def is_attribute_value(value): return re.search(r'(^".*"$|^[^a-zA-Z]+$)', value) is not None def is_attribute_edge(label): return label in ('instance', 'mode', 'li', 'value', 'month', 'year', 'day', 'decade', 'ARG6') def normalize_number(text): if re.search(r'^\d+,\d+$', text): text = text.replace(',', '') return text def abstract_node(value): return re.search(r'^([A-Z]+|DATE_ATTRS|SCORE_ENTITY|ORDINAL_ENTITY)_\d+$', value) def abstract_attribute(value): return re.search(r'^_QUANTITY_\d+$', value) def correct_multiroot(heads): for i in range(1, len(heads)): if 
heads[i] == 0: heads[i] = 1 return heads nodes = [normalize_number(n) for n in prediction['nodes']] heads = correct_multiroot(prediction['heads']) corefs = [int(x) for x in prediction['corefs']] head_labels = prediction['head_labels'] triples = [] top = None # Build the variable map from variable to instance. variable_map = {} for coref_index in corefs: node = nodes[coref_index - 1] head_label = head_labels[coref_index - 1] if (re.search(r'[/:\\()]', node) or is_attribute_value(node) or is_attribute_edge(head_label) or abstract_attribute(node)): continue variable_map['vv{}'.format(coref_index)] = node for head_index in heads: if head_index == 0: continue node = nodes[head_index - 1] coref_index = corefs[head_index - 1] variable_map['vv{}'.format(coref_index)] = node # Build edge triples and other attribute triples. for i, head_index in enumerate(heads): if head_index == 0: top_variable = 'vv{}'.format(corefs[i]) if top_variable not in variable_map: variable_map[top_variable] = nodes[i] top = top_variable continue head_variable = 'vv{}'.format(corefs[head_index - 1]) modifier = nodes[i] modifier_variable = 'vv{}'.format(corefs[i]) label = head_labels[i] assert head_variable in variable_map if modifier_variable in variable_map: triples.append((head_variable, label, modifier_variable)) else: # Add quotes if there's a backslash. 
if re.search(r'[/:\\()]', modifier) and not re.search(r'^".*"$', modifier): modifier = '"{}"'.format(modifier) triples.append((head_variable, label, modifier)) for var, node in variable_map.items(): if re.search(r'^".*"$', node): node = node[1:-1] if re.search(r'[/:\\()]', node): parts = re.split(r'[/:\\()]', node) for part in parts[::-1]: if len(part): node = part break else: node = re.sub(r'[/:\\()]', '_', node) triples.append((var, 'instance', node)) if len(triples) == 0: triples.append(('vv1', 'instance', 'string-entity')) top = 'vv1' triples.sort(key=lambda x: int(x[0].replace('vv', ''))) graph = penman.Graph() graph._top = top graph._triples = [penman.Triple(*t) for t in triples] graph = cls(graph) try: GraphRepair.do(graph, nodes) amr_codec.encode(graph) except Exception as e: graph._top = top graph._triples = [penman.Triple(*t) for t in triples] graph = cls(graph) return graph class SourceCopyVocabulary: def __init__(self, sentence, pad_token=DEFAULT_PADDING_TOKEN, unk_token=DEFAULT_OOV_TOKEN): if type(sentence) is not list: sentence = sentence.split(" ") self.src_tokens = sentence self.pad_token = pad_token self.unk_token = unk_token self.token_to_idx = {self.pad_token: 0, self.unk_token: 1} self.idx_to_token = {0: self.pad_token, 1: self.unk_token} self.vocab_size = 2 for token in sentence: if token not in self.token_to_idx: self.token_to_idx[token] = self.vocab_size self.idx_to_token[self.vocab_size] = token self.vocab_size += 1 def get_token_from_idx(self, idx): return self.idx_to_token[idx] def get_token_idx(self, token): return self.token_to_idx.get(token, self.token_to_idx[self.unk_token]) def index_sequence(self, list_tokens): return [self.get_token_idx(token) for token in list_tokens] def get_copy_map(self, list_tokens): src_indices = [self.get_token_idx(self.unk_token)] + self.index_sequence(list_tokens) return [ (src_idx, src_token_idx) for src_idx, src_token_idx in enumerate(src_indices) ] def get_special_tok_list(self): return [self.pad_token, 
self.unk_token] def __repr__(self): return json.dumps(self.idx_to_token) def is_similar(instances1, instances2): if len(instances1) < len(instances2): small = instances1 large = instances2 else: small = instances2 large = instances1 coverage1 = sum(1 for x in small if x in large) / len(small) coverage2 = sum(1 for x in large if x in small) / len(large) return coverage1 > .8 and coverage2 > .8 class GraphRepair: def __init__(self, graph, nodes): self.graph = graph self.nodes = nodes self.repaired_items = set() @staticmethod def do(graph, nodes): gr = GraphRepair(graph, nodes) gr.remove_redundant_edges() gr.remove_unknown_nodes() def remove_unknown_nodes(self): graph = self.graph nodes = [node for node in graph.get_nodes()] for node in nodes: for attr, value in node.attributes: if value == '@@UNKNOWN@@' and attr != 'instance': graph.remove_node_attribute(node, attr, value) if node.instance == '@@UNKNOWN@@': if len(list(graph._G.edges(node))) == 0: for source, target in list(graph._G.in_edges(node)): graph.remove_edge(source, target) graph.remove_node(node) self.repaired_items.add('remove-unknown-node') def remove_redundant_edges(self): """ Edge labels such as ARGx, ARGx-of, and 'opx' should only appear at most once in each node's outgoing edges. """ graph = self.graph nodes = [node for node in graph.get_nodes()] removed_nodes = set() for node in nodes: if node in removed_nodes: continue edges = list(graph._G.edges(node)) edge_counter = defaultdict(list) for source, target in edges: label = graph._G[source][target]['label'] # `name`, `ARGx`, and `ARGx-of` should only appear once. if label == 'name': # or label.startswith('ARG'): edge_counter[label].append(target) # the target of `opx' should only appear once. 
elif label.startswith('op') or label.startswith('snt'): edge_counter[str(target.instance)].append(target) else: edge_counter[label + str(target.instance)].append(target) for label, children in edge_counter.items(): if len(children) == 1: continue if label == 'name': # remove redundant edges. for target in children[1:]: if len(list(graph._G.in_edges(target))) == 1 and len(list(graph._G.edges(target))) == 0: graph.remove_edge(node, target) graph.remove_node(target) removed_nodes.add(target) self.repaired_items.add('remove-redundant-edge') continue visited_children = set() groups = [] for i, target in enumerate(children): if target in visited_children: continue subtree_instances1 = [n.instance for n in graph.get_subtree(target, 5)] group = [(target, subtree_instances1)] visited_children.add(target) for _t in children[i + 1:]: if _t in visited_children or target.instance != _t.instance: continue subtree_instances2 = [n.instance for n in graph.get_subtree(_t, 5)] if is_similar(subtree_instances1, subtree_instances2): group.append((_t, subtree_instances2)) visited_children.add(_t) groups.append(group) for group in groups: if len(group) == 1: continue kept_target, _ = max(group, key=lambda x: len(x[1])) for target, _ in group: if target == kept_target: continue graph.remove_edge(node, target) removed_nodes.update(graph.remove_subtree(target)) ================================================ FILE: plugins/hanlp_common/hanlp_common/configurable.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2020-12-16 22:24 from hanlp_common.reflection import str_to_type, classpath_of class Configurable(object): @staticmethod def from_config(config: dict, **kwargs): """Build an object from config. Args: config: A ``dict`` holding parameters for its constructor. It has to contain a `classpath` key, which has a classpath str as its value. ``classpath`` will determine the type of object being deserialized. kwargs: Arguments not used. 
Returns: A deserialized object. """ cls = config.get('classpath', None) assert cls, f'{config} doesn\'t contain classpath field' cls = str_to_type(cls) deserialized_config = dict(config) for k, v in config.items(): if isinstance(v, dict) and 'classpath' in v: deserialized_config[k] = Configurable.from_config(v) if cls.from_config == Configurable.from_config: deserialized_config.pop('classpath') return cls(**deserialized_config) else: return cls.from_config(deserialized_config) class AutoConfigurable(Configurable): @property def config(self) -> dict: """ The config of this object, which are public properties. If any properties needs to be excluded from this config, simply declare it with prefix ``_``. """ return dict([('classpath', classpath_of(self))] + [(k, v.config if hasattr(v, 'config') else v) for k, v in self.__dict__.items() if not k.startswith('_')]) def __repr__(self) -> str: return repr(self.config) ================================================ FILE: plugins/hanlp_common/hanlp_common/conll.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2020-12-19 20:50 from typing import Union, List from hanlp_common.structure import SerializableDict from hanlp_common.visualization import pretty_tree_horizontal, make_table, markdown_table class CoNLLWord(SerializableDict): def __init__(self, id, form, lemma=None, cpos=None, pos=None, feats=None, head=None, deprel=None, phead=None, pdeprel=None): """CoNLL (:cite:`buchholz-marsi-2006-conll`) format template, see http://anthology.aclweb.org/W/W06/W06-2920.pdf Args: id (int): Token counter, starting at 1 for each new sentence. form (str): Word form or punctuation symbol. lemma (str): Lemma or stem (depending on the particular treebank) of word form, or an underscore if not available. cpos (str): Coarse-grained part-of-speech tag, where the tagset depends on the treebank. pos (str): Fine-grained part-of-speech tag, where the tagset depends on the treebank. 
feats (str): Unordered set of syntactic and/or morphological features (depending on the particular treebank), or an underscore if not available. head (Union[int, List[int]]): Head of the current token, which is either a value of ID, or zero (’0’) if the token links to the virtual root node of the sentence. deprel (Union[str, List[str]]): Dependency relation to the HEAD. phead (int): Projective head of current token, which is either a value of ID or zero (’0’), or an underscore if not available. pdeprel (str): Dependency relation to the PHEAD, or an underscore if not available. """ self.id = sanitize_conll_int_value(id) self.form = form self.cpos = cpos self.pos = pos self.head = sanitize_conll_int_value(head) self.deprel = deprel self.lemma = lemma self.feats = feats self.phead = phead self.pdeprel = pdeprel def __str__(self): if isinstance(self.head, list): return '\n'.join('\t'.join(['_' if v is None else v for v in values]) for values in [ [str(self.id), self.form, self.lemma, self.cpos, self.pos, self.feats, None if head is None else str(head), deprel, self.phead, self.pdeprel] for head, deprel in zip(self.head, self.deprel) ]) values = [str(self.id), self.form, self.lemma, self.cpos, self.pos, self.feats, None if self.head is None else str(self.head), self.deprel, self.phead, self.pdeprel] return '\t'.join(['_' if v is None else v for v in values]) @property def nonempty_fields(self): """ Get the values of nonempty fields as a list. """ return list(f for f in [self.form, self.lemma, self.cpos, self.pos, self.feats, self.head, self.deprel, self.phead, self.pdeprel] if f) def get_pos(self): """ Get the precisest pos for this word. Returns: ``self.pos`` or ``self.cpos``. 
""" return self.pos or self.cpos class CoNLLUWord(SerializableDict): def __init__(self, id: Union[int, str], form, lemma=None, upos=None, xpos=None, feats=None, head=None, deprel=None, deps=None, misc=None): """CoNLL-U format template, see https://universaldependencies.org/format.html Args: id (Union[int, str]): Token counter, starting at 1 for each new sentence. form (Union[str, None]): Word form or punctuation symbol. lemma (str): Lemma or stem (depending on the particular treebank) of word form, or an underscore if not available. upos (str): Universal part-of-speech tag. xpos (str): Language-specific part-of-speech tag; underscore if not available. feats (str): List of morphological features from the universal feature inventory or from a defined language-specific extension; underscore if not available. head (int): Head of the current token, which is either a value of ID, or zero (’0’) if the token links to the virtual root node of the sentence. deprel (str): Dependency relation to the HEAD. deps (Union[List[Tuple[int, str], str]): Projective head of current token, which is either a value of ID or zero (’0’), or an underscore if not available. misc (str): Dependency relation to the PHEAD, or an underscore if not available. 
""" self.id = sanitize_conll_int_value(id) self.form = form self.upos = upos self.xpos = xpos if isinstance(head, list): assert deps is None, 'When head is a list, deps has to be None' assert isinstance(deprel, list), 'When head is a list, deprel has to be a list' assert len(deprel) == len(head), 'When head is a list, deprel has to match its length' deps = list(zip(head, deprel)) head = None deprel = None self.head = sanitize_conll_int_value(head) self.deprel = deprel self.lemma = lemma self.feats = feats if deps == '_': deps = None if isinstance(deps, str): self.deps = [] for pair in deps.split('|'): h, r = pair.split(':') h = int(h) self.deps.append((h, r)) else: self.deps = deps self.misc = misc def __str__(self): deps = self.deps if not deps: deps = None else: deps = '|'.join(f'{h}:{r}' for h, r in deps) values = [str(self.id), self.form, self.lemma, self.upos, self.xpos, self.feats, str(self.head) if self.head is not None else None, self.deprel, deps, self.misc] return '\t'.join(['_' if v is None else v for v in values]) @property def nonempty_fields(self): """ Get the values of nonempty fields as a list. """ return list(f for f in [self.form, self.lemma, self.upos, self.xpos, self.feats, self.head, self.deprel, self.deps, self.misc] if f) def get_pos(self): """ Get the precisest pos for this word. Returns: ``self.xpos`` or ``self.upos`` """ return self.xpos or self.upos class CoNLLSentence(list): def __init__(self, words=None): """ A list of :class:`~hanlp_common.conll.CoNLLWord` or :class:`~hanlp_common.conll.CoNLLUWord`. It is a sub-class of :class:`list` and its words can be accessed in the same way as accessing list elements. Args: words (list[Union[CoNLLWord, CoNLLUWord]]): A list of words. 
""" super().__init__() if words: self.extend(words) def __str__(self): return '\n'.join([word.__str__() for word in self]) @staticmethod def from_str(conll: str, conllu=False): """Build a CoNLLSentence from CoNLL-X format str Args: conll (str): CoNLL-X or CoNLL-U format string conllu: ``True`` to build :class:`~hanlp_common.conll.CoNLLUWord` for each token. Returns: A :class:`~hanlp_common.conll.CoNLLSentence`. """ words: List[CoNLLWord] = [] prev_id = None for line in conll.strip().split('\n'): if line.startswith('#'): continue cells = line.split('\t') cells = [None if c == '_' else c for c in cells] if '-' in cells[0]: continue cells[0] = int(cells[0]) cells[6] = int(cells[6]) if cells[0] != prev_id: words.append(CoNLLUWord(*cells) if conllu else CoNLLWord(*cells)) else: if isinstance(words[-1].head, list): words[-1].head.append(cells[6]) words[-1].deprel.append(cells[7]) else: words[-1].head = [words[-1].head] + [cells[6]] words[-1].deprel = [words[-1].deprel] + [cells[7]] prev_id = cells[0] if conllu: for word in words: # type: CoNLLUWord if isinstance(word.head, list): assert not word.deps word.deps = list(zip(word.head, word.deprel)) word.head = None word.deprel = None return CoNLLSentence(words) @staticmethod def from_file(path: str, conllu=False): """Build a CoNLLSentence from ``.conllx`` or ``.conllu`` file Args: path: Path to the file. conllu: ``True`` to build :class:`~hanlp_common.conll.CoNLLUWord` for each token. Returns: A :class:`~hanlp_common.conll.CoNLLSentence`. """ with open(path) as src: return [CoNLLSentence.from_str(x, conllu) for x in src.read().split('\n\n') if x.strip()] @staticmethod def from_dict(d: dict, conllu=False): """Build a CoNLLSentence from a dict. Args: d: A dict storing a list for each field, where each index corresponds to a token. conllu: ``True`` to build :class:`~hanlp_common.conll.CoNLLUWord` for each token. Returns: A :class:`~hanlp_common.conll.CoNLLSentence`. 
def to_markdown(self, headings: Union[str, List[str]] = 'auto') -> str:
    r"""Render this sentence as a markdown table with one row per word.

    Args:
        headings: ``auto`` to pick the column headings from the type of the first word
            (CoNLL-X headings for :class:`CoNLLWord`, CoNLL-U headings otherwise). A list of
            strings is used verbatim as the column headings.

    Returns:
        The markdown table as a string.
    """
    conllx_headings = ['ID', 'FORM', 'LEMMA', 'CPOS', 'POS', 'FEATS', 'HEAD', 'DEPREL', 'PHEAD', 'PDEPREL']
    conllu_headings = ['ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC']
    table_rows = [str(token).split('\t') for token in self]
    if headings == 'auto':
        if isinstance(self[0], CoNLLWord):
            headings = conllx_headings
        else:
            headings = conllu_headings
            # Column 8 (DEPS) joins its pairs with '|'; swap it for a look-alike character,
            # presumably so it is not read as a markdown column separator.
            for row in table_rows:
                row[8] = row[8].replace('|', '⎮')
    # (header alignment, cell alignment) per column: ID and HEAD right-aligned, the rest left-aligned.
    column_alignment = [('^', '>')] + [('^', '<')] * 5 + [('^', '>')] + [('^', '<')] * 3
    return markdown_table(headings, table_rows, alignment=column_alignment)
def sanitize_conll_int_value(value: Union[str, int]):
    """Normalize an integer-valued CoNLL field.

    Args:
        value: The raw field value.

    Returns:
        ``None`` and ``int`` values unchanged, ``None`` for the placeholder ``'_'``, ``int(value)``
        for any other string, and every other object unchanged.
    """
    already_clean = value is None or isinstance(value, int)
    if not already_clean:
        if value == '_':
            value = None
        elif isinstance(value, str):
            value = int(value)
    return value
def __init__(self, *args, **kwargs) -> None:
    r"""A dict structure holding parsed annotations.

    A document is a subclass of ``dict`` and it supports every interface of ``dict``\. Additionally,
    it supports interfaces to deal with various linguistic structures. Its ``str`` and ``dict``
    representations are made to be compatible with JSON serialization.

    On construction, a ``con`` value given in nested-list form is converted into ``Tree`` objects
    and an ``amr`` value given as triples is converted into ``AMRGraph`` objects; values that
    already hold those objects, and empty values, are left untouched.

    Args:
        *args: An iterator of key-value pairs.
        **kwargs: Arguments from ``**`` operator.

    Examples::

        # Create a document
        doc = Document(
            tok=[["晓美焰", "来到", "北京", "立方庭", "参观", "自然", "语义", "科技", "公司"]],
            pos=[["NR", "VV", "NR", "NR", "VV", "NN", "NN", "NN", "NN"]],
            ner=[[["晓美焰", "PERSON", 0, 1], ["北京立方庭", "LOCATION", 2, 4],
                  ["自然语义科技公司", "ORGANIZATION", 5, 9]]],
            dep=[[[2, "nsubj"], [0, "root"], [4, "name"], [2, "dobj"], [2, "conj"],
                  [9, "compound"], [9, "compound"], [9, "compound"], [5, "dobj"]]]
        )

        # print(doc) or str(doc) to get its JSON representation
        print(doc)

        # Access an annotation by its task name
        print(doc['tok'])

        # Get number of sentences
        print(f'It has {doc.count_sentences()} sentence(s)')

        # Access the n-th sentence
        print(doc.squeeze(0)['tok'])

        # Pretty print it right in your console or notebook
        doc.pretty_print()

        # To save the pretty prints in a str
        pretty_text: str = '\n\n'.join(doc.to_pretty())

    """
    super().__init__(*args, **kwargs)
    # Iterate over a snapshot because entries are reassigned inside the loop.
    for key, value in list(self.items()):
        if not value:
            continue
        if key == 'con':
            if isinstance(value, Tree) or isinstance(value[0], Tree):
                continue
            # A single tree in list form starts with its label string; a batch starts with a list.
            single = isinstance(value[0], str)
            bracketed = [value] if single else value
            trees = [list_to_tree(item) for item in bracketed if not isinstance(item, Tree)]
            self[key] = trees[0] if single else trees
        elif key == 'amr':
            from hanlp_common.amr import AMRGraph
            import penman
            if isinstance(value, AMRGraph) or isinstance(value[0], AMRGraph):
                continue
            # Treated as one graph when the first element of the first item is a string
            # (presumably a single list of triples - confirm against the AMR component's output).
            single = isinstance(value[0][0], str)
            triple_lists = [value] if single else value
            graphs = [AMRGraph(penman.Graph(triples)) for triples in triple_lists]
            self[key] = graphs[0] if single else graphs
def to_dict(self):
    """Build a JSON-compatible shallow copy of this document.

    Only the ``con`` field is transformed: every ``Tree`` it holds (a single tree or a list of
    trees) is converted with ``tree_to_list``. All other fields are copied as they are.

    Returns:
        A new ``dict``; the document itself is not modified.
    """
    result = dict(self)
    for key, value in self.items():
        if value == [] or value is None or key != 'con':
            continue
        single = isinstance(value, Tree)
        # Nothing to convert when the field holds neither a tree nor a list starting with one.
        if not single and not isinstance(value[0], Tree):
            continue
        trees = [value] if single else value
        converted = [tree_to_list(tree) for tree in trees if isinstance(tree, Tree)]
        result[key] = converted[0] if single else converted
    return result
""" tok = prefix_match(tok, self) lem = prefix_match(lem, self) pos = prefix_match(pos, self) fea = prefix_match(fea, self) dep = prefix_match(dep, self) sdp = prefix_match(sdp, self) results = CoNLLSentenceList() if not tok or not self[tok]: return results self = self._to_doc_without_spans(tok) flat = isinstance(self[tok][0], str) if flat: d = Document((k, [v]) for k, v in self.items()) else: d = self for sample in [dict(zip(d, t)) for t in zip(*d.values())]: def get(_k, _i): _v = sample.get(_k, None) if not _v: return None return _v[_i] sent = CoNLLSentence() for i, _tok in enumerate(sample[tok]): _dep = get(dep, i) if not _dep: _dep = (None, None) sent.append( CoNLLUWord(i + 1, form=_tok, lemma=get(lem, i), upos=get(pos, i), feats=get(fea, i), head=_dep[0], deprel=_dep[1], deps=None if not get(sdp, i) else '|'.join(f'{x[0]}:{x[1]}' for x in get(sdp, i)))) results.append(sent) if flat: return results[0] return results def to_pretty(self, tok='tok', lem='lem', pos='pos', dep='dep', sdp='sdp', ner='ner', srl='srl', con='con', show_header=True, html=False) -> Union[str, List[str]]: """ Convert to a pretty text representation which can be printed to visualize linguistic structures. Args: tok: Token key. lem: Lemma key. pos: Part-of-speech key. dep: Dependency parse tree key. sdp: Semantic dependency tree/graph key. SDP visualization has not been implemented yet. ner: Named entity key. srl: Semantic role labeling key. con: Constituency parsing key. show_header: ``True`` to include a header which indicates each field with its name. html: ``True`` to output HTML format so that non-ASCII characters can align correctly. Returns: A pretty string. 
""" results = [] tok = prefix_match(tok, self) pos = prefix_match(pos, self) ner = prefix_match(ner, self) conlls = self.to_conll(tok=tok, lem=lem, pos=pos, dep=dep, sdp=sdp) flat = isinstance(conlls, CoNLLSentence) if flat: conlls: List[CoNLLSentence] = [conlls] def condense(block_, extras_=None): text_ = make_table(block_, insert_header=False) text_ = [x.split('\t', 1) for x in text_.split('\n')] text_ = [[x[0], x[1].replace('\t', '')] for x in text_] if extras_: for r, s in zip(extras_, text_): r.extend(s) return text_ for i, conll in enumerate(conlls): conll: CoNLLSentence = conll tokens = [x.form for x in conll] length = len(conll) extras = [[] for j in range(length + 1)] if ner in self: ner_samples = self[ner] if flat: ner_samples = [ner_samples] ner_per_sample = ner_samples[i] # For nested NER, use the longest span start_offsets = [None for i in range(length)] for ent, label, b, e in ner_per_sample: if not start_offsets[b] or e > start_offsets[b][-1]: start_offsets[b] = (ent, label, b, e) ner_per_sample = [y for y in start_offsets if y] header = ['Token', 'NER', 'Type'] block = [[] for _ in range(length + 1)] _ner = [] _type = [] offset = 0 for ent, label, b, e in ner_per_sample: render_labeled_span(b, e, _ner, _type, label, offset) offset = e if offset != length: _ner.extend([''] * (length - offset)) _type.extend([''] * (length - offset)) if any(_type): block[0].extend(header) for j, (_s, _t) in enumerate(zip(_ner, _type)): block[j + 1].extend((tokens[j], _s, _t)) text = condense(block, extras) if srl in self: srl_samples = self[srl] if flat: srl_samples = [srl_samples] srl_per_sample = srl_samples[i] for k, pas in enumerate(srl_per_sample): if not pas: continue block = [[] for _ in range(length + 1)] header = ['Token', 'SRL', f'PA{k + 1}'] _srl = [] _type = [] offset = 0 p_index = None for _, label, b, e in pas: render_labeled_span(b, e, _srl, _type, label, offset) offset = e if label == PRED: p_index = b if len(_srl) != length: _srl.extend([''] * (length 
- offset)) _type.extend([''] * (length - offset)) if p_index is not None: _srl[p_index] = '╟──►' # _type[j] = 'V' if len(block) != len(_srl) + 1: # warnings.warn(f'Unable to visualize overlapped spans: {pas}') continue block[0].extend(header) while len(_srl) < length: _srl.append('') while len(_type) < length: _type.append('') for j, (_s, _t) in enumerate(zip(_srl, _type)): block[j + 1].extend((tokens[j], _s, _t)) text = condense(block, extras) if con in self: con_samples: Tree = self[con] if flat: con_samples: List[Tree] = [con_samples] tree = con_samples[i] block = [[] for _ in range(length + 1)] block[0].extend(('Token', 'PoS')) for j, t in enumerate(tree.pos()): block[j + 1].extend(t) for height in range(2, tree.height() + (0 if len(tree) == 1 else 1)): offset = 0 spans = [] labels = [] for k, subtree in enumerate(tree.subtrees(lambda x: x.height() == height)): subtree: Tree = subtree b, e = offset, offset + len(subtree.leaves()) if height >= 3: b, e = subtree[0].center, subtree[-1].center + 1 subtree.center = b + (e - b) // 2 render_labeled_span(b, e, spans, labels, subtree.label(), offset, unidirectional=True) offset = e if len(spans) != length: spans.extend([''] * (length - len(spans))) if len(labels) != length: labels.extend([''] * (length - len(labels))) if height < 3: continue block[0].extend(['', f'{height}']) for j, (_s, _t) in enumerate(zip(spans, labels)): block[j + 1].extend((_s, _t)) # check short arrows and increase their length for j, arrow in enumerate(spans): if not arrow: # -1 current tag ; -2 arrow to current tag ; -3 = prev tag ; -4 = arrow to prev tag if block[j + 1][-3] or block[j + 1][-4] == '───►': if height > 3: if block[j + 1][-3]: block[j + 1][-1] = block[j + 1][-3] block[j + 1][-2] = '───►' else: block[j + 1][-1] = '────' block[j + 1][-2] = '────' block[j + 1][-3] = '────' if block[j + 1][-4] == '───►': block[j + 1][-4] = '────' else: block[j + 1][-1] = '────' if block[j + 1][-1] == '────': block[j + 1][-2] = '────' if not block[j + 
1][-4]: block[j + 1][-4] = '────' # If the root label is shorter than the level number, extend it to the same length level_len = len(block[0][-1]) for row in block[1:]: if row[-1] and len(row[-1]) < level_len: row[-1] = row[-1] + ' ' * (level_len - len(row[-1])) text = condense(block) # Cosmetic issues for row in text[1:]: while ' ─' in row[1]: row[1] = row[1].replace(' ─', ' ──') row[1] = row[1].replace('─ ─', '───') row[1] = re.sub(r'([►─])([\w-]*)(\s+)([│├])', lambda m: f'{m.group(1)}{m.group(2)}{"─" * len(m.group(3))}{"┤" if m.group(4) == "│" else "┼"}', row[1]) row[1] = re.sub(r'►(─+)►', r'─\1►', row[1]) for r, s in zip(extras, text): r.extend(s) # warnings.warn('Unable to visualize non-projective trees.') if dep in self and conll.projective: text = conll.to_tree(extras) if not show_header: text = text.split('\n') text = '\n'.join(text[2:]) results.append(text) elif any(extras): results.append(make_table(extras, insert_header=True)) else: results.append(' '.join(['/'.join(str(f) for f in x.nonempty_fields) for x in conll])) if html: def to_html(pretty_text: str) -> str: lines = [x for x in pretty_text.split('\n') if x] cells = [] for line in lines: cells.append(line.split('\t')) num_cols = len(cells[0]) cols = [] for i in range(num_cols): cols.append([]) for row in cells: cols[-1].append(row[i]) html = '
' for i, each in enumerate(cols): html += '
'
                    if i != len(cols) - 1:
                        each = [x + ' ' for x in each]
                    html += '
'.join([x.replace(' ', ' ') for x in each]) html += '
' html += '
' return html results = [to_html(x) for x in results] if flat: return results[0] return results def pretty_print(self, tok='tok', lem='lem', pos='pos', dep='dep', sdp='sdp', ner='ner', srl='srl', con='con', show_header=True, html=IPYTHON): """ Print a pretty text representation which visualizes linguistic structures. Args: tok: Token key. lem: Lemma key. pos: Part-of-speech key. dep: Dependency parse tree key. sdp: Semantic dependency tree/graph key. SDP visualization has not been implemented yet. ner: Named entity key. srl: Semantic role labeling key. con: Constituency parsing key. show_header: ``True`` to print a header which indicates each field with its name. html: ``True`` to output HTML format so that non-ASCII characters can align correctly. """ results = self.to_pretty(tok, lem, pos, dep, sdp, ner, srl, con, show_header, html=html) if isinstance(results, str): results = [results] if html and IPYTHON: from IPython.core.display import display, HTML display(HTML('
def translate(self, lang, tok='tok', pos='pos', dep='dep', sdp='sdp', ner='ner', srl='srl'):
    """
    Translate tags for each annotation. This is an inplace operation.

    .. Attention:: Note that the translated document might not print well in terminal due to
        non-ASCII characters.

    Args:
        lang: Target language to be translated to. Only ``'zh'`` is supported.
        tok: Token key.
        pos: Part-of-speech key.
        dep: Dependency parse tree key.
        sdp: Semantic dependency tree/graph key.
        ner: Named entity key.
        srl: Semantic role labeling key.

    Returns:
        The translated document (``self``).

    Raises:
        NotImplementedError: If no translation exists for ``lang``.
    """
    if lang == 'zh':
        from hanlp.utils.lang.zh import localization
    else:
        raise NotImplementedError(f'No translation for {lang}. '
                                  f'Please contribute to our translation at https://github.com/hankcs/HanLP')
    flat = isinstance(self[tok][0], str)
    for task, name in zip(['pos', 'ner', 'dep', 'sdp', 'srl'], [pos, ner, dep, sdp, srl]):
        annotations = self.get(name, None)
        if not annotations:
            continue
        if flat:
            annotations = [annotations]
        # Look the table up by the canonical task name: ``name`` is only the key under which this
        # document stores the annotation and may differ from the task (e.g. a custom field name).
        table: dict = getattr(localization, task, None)
        if not table:
            continue
        for anno_per_sent in annotations:
            for i, v in enumerate(anno_per_sent):
                if task in ('ner', 'dep'):
                    # One labeled item per element; the label sits at index 1.
                    v[1] = table.get(v[1], v[1])
                elif task in ('sdp', 'srl'):
                    # Each element is itself a list of labeled items (several heads per token for
                    # sdp, several arguments per predicate for srl), so a list cannot be used as a
                    # lookup key directly.
                    for item in v or ():
                        item[1] = table.get(item[1], item[1])
                else:
                    anno_per_sent[i] = table.get(v, v)
    return self
def count_sentences(self) -> int:
    """
    Count number of sentences in this document.

    The count is derived from the first field whose key matches the prefix ``tok``: a flat list of
    strings is one sentence, a nested list has one entry per sentence.

    Returns:
        Number of sentences, or ``0`` when the document has no token field or it is empty.
    """
    tok = self.get_by_prefix('tok')
    # A missing or empty token field means there is nothing to count (and nothing to index).
    if not tok:
        return 0
    if isinstance(tok[0], str):
        return 1
    return len(tok)
def str_to_type(classpath):
    """Resolve a dotted path such as ``package.module.Name`` to the object it names.

    Args:
        classpath: Dotted path whose last component is an attribute of the module named by
            everything before the last dot.

    Returns:
        The attribute looked up on the imported module.
    """
    parts = classpath.rsplit(".", 1)
    module_name, attribute_name = parts
    module = importlib.import_module(module_name)
    return getattr(module, attribute_name)
def powerset(iterable, descending=False):
    """Lazily enumerate every subset of ``iterable`` as a tuple, grouped by subset size.

    powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)

    Args:
        iterable: The elements to combine.
        descending: ``True`` to yield the largest subsets first.

    Returns:
        An iterator over tuples.
    """
    pool = list(iterable)
    subset_sizes = list(range(len(pool) + 1))
    if descending:
        subset_sizes.reverse()
    return chain.from_iterable(map(lambda size: combinations(pool, size), subset_sizes))
def merge_dict(d: dict, overwrite=False, inplace=False, **kwargs):
    """Merge extra key-value pairs into a dict.

    Args:
        d: The base dict.
        overwrite: ``True`` to let ``kwargs`` replace values of keys already in ``d``.
            (Default value = False)
        inplace: ``True`` to update and return ``d`` itself instead of a new dict.
            (Default value = False)
        **kwargs: The key-value pairs to merge in.

    Returns:
        The merged dict: ``d`` itself when ``inplace`` is set, otherwise a new dict.
    """
    merged = dict(d)
    for key, value in kwargs.items():
        if overwrite or key not in d:
            merged[key] = value
    if not inplace:
        return merged
    d.update(merged)
    return d
not a multiple of the json indent level Args: text: indent: (Default value = 12) Returns: """ initial = " " * indent out = [] # final json output sublevel = [] # accumulation list for sublevel entries pending = None # holder for consecutive entries at exact indent level for line in text.splitlines(): if line.startswith(initial): if line[indent] == " ": # found a line indented further than the indent level, so add # it to the sublevel list if pending: # the first item in the sublevel will be the pending item # that was the previous line in the json sublevel.append(pending) pending = None item = line.strip() sublevel.append(item) if item.endswith(","): sublevel.append(" ") elif sublevel: # found a line at the exact indent level *and* we have sublevel # items. This means the sublevel items have come to an end sublevel.append(line.strip()) out.append("".join(sublevel)) sublevel = [] else: # found a line at the exact indent level but no items indented # further, so possibly start a new sub-level if pending: # if there is already a pending item, it means that # consecutive entries in the json had the exact same # indentation and that last pending item was not the start # of a new sublevel. out.append(pending) pending = line.rstrip() else: if pending: # it's possible that an item will be pending but not added to # the output yet, so make sure it's not forgotten. 
def split_dict(batch: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Split a batch (a dict of lists) into one dict per sample.

    Only list-valued fields are kept; fields holding any other type are dropped. The number of
    samples is the length of the longest list, so a shorter list raises ``IndexError``.

    Args:
        batch: Field name mapped to a list holding one value per sample.

    Returns:
        A list of per-sample dicts, empty when ``batch`` has no list-valued field.
    """
    columns = {k: v for k, v in batch.items() if isinstance(v, list)}
    # max() over an empty sequence raises ValueError, so bail out early.
    if not columns:
        return []
    num_samples = max(len(v) for v in columns.values())
    return [{k: v[i] for k, v in columns.items()} for i in range(num_samples)]
def _start_end(arrow):
    """Unpack an arrow dict into its endpoints and their ordered bounds.

    Args:
        arrow: A dict with the keys ``'from'`` and ``'to'``.

    Returns:
        A tuple ``(from, to, smaller endpoint, larger endpoint)``.
    """
    src, dst = arrow['from'], arrow['to']
    return src, dst, min(src, dst), max(src, dst)
height = 3 if arrow['underset']: height = max(arrows[i]['height'] for i in arrow['underset']) + 1 height = max(height, 3, len(lines[dst]) + 3) arrow['height'] = height if _do_print_debug_info: print('') print('Rendering arrow %d: "%s" -> "%s"' % (arrow_index, arrow['from'], arrow['to'])) print(' height = %d' % height) goes_up = src > dst # Draw the outgoing src line. if lines[src] and len(lines[src]) < height: lines[src][-1].add('w') while len(lines[src]) < height - 1: lines[src].append(set(['e', 'w'])) if len(lines[src]) < height: lines[src].append({'e'}) lines[src][height - 1].add('n' if goes_up else 's') # Draw the incoming dst line. lines[dst].append(u'►') while len(lines[dst]) < height: lines[dst].append(set(['e', 'w'])) lines[dst][-1] = set(['e', 's']) if goes_up else set(['e', 'n']) # Draw the adjoining vertical line. for i in range(mn + 1, mx): while len(lines[i]) < height - 1: lines[i].append(' ') lines[i].append(set(['n', 's'])) # Update arrows_with_deps. for arr_i, arr in enumerate(arrows): if arrow_index in arr['underset']: arrows_with_deps[arr['num_deps_left']].remove(arr_i) arr['num_deps_left'] -= 1 arrows_with_deps[arr['num_deps_left']].add(arr_i) num_arrows_left -= 1 return render_arrows(lines) def render_arrows(lines): arr_chars = {'ew': u'─', 'ns': u'│', 'en': u'└', 'es': u'┌', 'enw': u'┴', 'ensw': u'┼', 'ens': u'├', 'esw': u'┬'} # Convert the character lists into strings. 
max_len = max(len(line) for line in lines) for i in range(len(lines)): lines[i] = [arr_chars[''.join(sorted(ch))] if type(ch) is set else ch for ch in lines[i]] lines[i] = ''.join(reversed(lines[i])) lines[i] = ' ' * (max_len - len(lines[i])) + lines[i] return lines def render_span(begin, end, unidirectional=False): if end - begin == 1: return ['───►'] elif end - begin == 2: return [ '──┐', '──┴►', ] if unidirectional else [ '◄─┐', '◄─┴►', ] rows = [] for i in range(begin, end): if i == (end - begin) // 2 + begin: rows.append(' ├►') elif i == begin: rows.append('──┐' if unidirectional else '◄─┐') elif i == end - 1: rows.append('──┘' if unidirectional else '◄─┘') else: rows.append(' │') return rows def tree_to_list(T): return [T.label(), [tree_to_list(t) if isinstance(t, Tree) else t for t in T]] def list_to_tree(L): if isinstance(L, str): return L return Tree(L[0], [list_to_tree(child) for child in L[1]]) def render_labeled_span(b, e, spans, labels, label, offset, unidirectional=False): spans.extend([''] * (b - offset)) spans.extend(render_span(b, e, unidirectional)) center = b + (e - b) // 2 labels.extend([''] * (center - offset)) labels.append(label) labels.extend([''] * (e - center - 1)) def main(): # arrows = [{'from': 1, 'to': 0}, {'from': 2, 'to': 1}, {'from': 2, 'to': 4}, {'from': 2, 'to': 5}, # {'from': 4, 'to': 3}] # lines = pretty_tree_horizontal(arrows) # print('\n'.join(lines)) # print('\n'.join([ # '◄─┐', # ' │', # ' ├►', # ' │', # '◄─┘', # ])) print('\n'.join(render_span(7, 12))) if __name__ == '__main__': main() left_rule = {'<': ':', '^': ':', '>': '-'} right_rule = {'<': '-', '^': ':', '>': ':'} def evalute_field(record, field_spec): """Evalute a field of a record using the type of the field_spec as a guide. 
Args: record: field_spec: Returns: """ if type(field_spec) is int: return str(record[field_spec]) elif type(field_spec) is str: return str(getattr(record, field_spec)) else: return str(field_spec(record)) def markdown_table(headings, records, fields=None, alignment=None, file=None): """Generate a Doxygen-flavor Markdown table from records. See https://stackoverflow.com/questions/13394140/generate-markdown-tables file -- Any object with a 'write' method that takes a single string parameter. records -- Iterable. Rows will be generated from this. fields -- List of fields for each row. Each entry may be an integer, string or a function. If the entry is an integer, it is assumed to be an index of each record. If the entry is a string, it is assumed to be a field of each record. If the entry is a function, it is called with the record and its return value is taken as the value of the field. headings -- List of column headings. alignment - List of pairs alignment characters. The first of the pair specifies the alignment of the header, (Doxygen won't respect this, but it might look good, the second specifies the alignment of the cells in the column. Possible alignment characters are: '<' = Left align '>' = Right align (default for cells) '^' = Center (default for column headings) Args: headings: records: fields: (Default value = None) alignment: (Default value = None) file: (Default value = None) Returns: """ if not file: file = io.StringIO() num_columns = len(headings) if not fields: fields = list(range(num_columns)) assert len(headings) == num_columns # Compute the table cell data columns = [[] for i in range(num_columns)] for record in records: for i, field in enumerate(fields): columns[i].append(evalute_field(record, field)) # Fill out any missing alignment characters. 
extended_align = alignment if alignment is not None else [('^', '<')] if len(extended_align) > num_columns: extended_align = extended_align[0:num_columns] elif len(extended_align) < num_columns: extended_align += [('^', '>') for i in range(num_columns - len(extended_align))] heading_align, cell_align = [x for x in zip(*extended_align)] field_widths = [len(max(column, key=len)) if len(column) > 0 else 0 for column in columns] heading_widths = [max(len(head), 2) for head in headings] column_widths = [max(x) for x in zip(field_widths, heading_widths)] _ = ' | '.join(['{:' + a + str(w) + '}' for a, w in zip(heading_align, column_widths)]) heading_template = '| ' + _ + ' |' _ = ' | '.join(['{:' + a + str(w) + '}' for a, w in zip(cell_align, column_widths)]) row_template = '| ' + _ + ' |' _ = ' | '.join([left_rule[a] + '-' * (w - 2) + right_rule[a] for a, w in zip(cell_align, column_widths)]) ruling = '| ' + _ + ' |' file.write(heading_template.format(*headings).rstrip() + '\n') file.write(ruling.rstrip() + '\n') for row in zip(*columns): file.write(row_template.format(*row).rstrip() + '\n') if isinstance(file, io.StringIO): text = file.getvalue() file.close() return text ================================================ FILE: plugins/hanlp_common/setup.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2019-12-28 19:26 from os.path import abspath, join, dirname from setuptools import find_packages, setup this_dir = abspath(dirname(__file__)) with open(join(this_dir, 'README.md'), encoding='utf-8') as file: long_description = file.read() setup( name='hanlp_common', version='0.0.22', description='HanLP: Han Language Processing', long_description=long_description, long_description_content_type="text/markdown", url='https://github.com/hankcs/HanLP', author='hankcs', author_email='hankcshe@gmail.com', license='Apache License 2.0', classifiers=[ 'Intended Audience :: Science/Research', 'Intended Audience :: Developers', 
"Development Status :: 3 - Alpha", 'Operating System :: OS Independent', "License :: OSI Approved :: Apache Software License", 'Programming Language :: Python :: 3 :: Only', 'Topic :: Scientific/Engineering :: Artificial Intelligence', "Topic :: Text Processing :: Linguistic" ], keywords='corpus,machine-learning,NLU,NLP', packages=find_packages(exclude=['docs', 'tests*']), include_package_data=True, install_requires=[ 'phrasetree>=0.0.9', ], extras_require={ # These AMR dependencies might not be necessary for most people. 'full': [ 'networkx', 'penman==0.6.2', ], }, python_requires='>=3.6', ) ================================================ FILE: plugins/hanlp_demo/README.md ================================================ # Demos and examples for HanLP This package is intended for demonstration purposes and won't be released to PyPI. **Training requires a fair understanding of Linux and Python which might not be the case for everybody.** You need a Linux/macOS system with Internet access because some corpora and bash scripts will be downloaded during training. Training on Windows might work if you are an expert but we believe it's very rare. Your `python` command needs to be Python2 while `python3` needs to be Python3. You need to install this package and run it from the **root** folder of HanLP. ```bash pip install -e plugins/hanlp_demo python3 plugins/hanlp_demo/hanlp_demo/zh/train/open_small.py ``` ================================================ FILE: plugins/hanlp_demo/hanlp_demo/__init__.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2020-11-29 17:48 ================================================ FILE: plugins/hanlp_demo/hanlp_demo/block_windows.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2021-07-28 21:38 from hanlp.utils.io_util import windows assert not windows(), 'Windows is not supported for this script. Please run it on Linux systems.'
================================================ FILE: plugins/hanlp_demo/hanlp_demo/en/__init__.py ================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-01-01 17:55
================================================ FILE: plugins/hanlp_demo/hanlp_demo/en/demo_amr.py ================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2022-01-25 19:09
# Demo: load the pretrained English AMR parser and parse one raw sentence.
import hanlp

amr_parser = hanlp.load(hanlp.pretrained.amr.AMR3_SEQ2SEQ_BART_LARGE)
amr = amr_parser('The boy wants the girl to believe him.')
print(amr)
================================================ FILE: plugins/hanlp_demo/hanlp_demo/en/demo_dep.py ================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-01-01 17:55
# Demo: dependency-parse one sentence given as (token, POS tag) pairs.
import hanlp

syntactic_parser = hanlp.load(hanlp.pretrained.dep.PTB_BIAFFINE_DEP_EN)
sent = [('Is', 'VBZ'), ('this', 'DT'), ('the', 'DT'), ('future', 'NN'), ('of', 'IN'), ('chamber', 'NN'), ('music', 'NN'), ('?', '.')]
tree = syntactic_parser(sent)
print(tree)
================================================ FILE: plugins/hanlp_demo/hanlp_demo/en/demo_lm.py ================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-02-11 09:14
# Demo: generate text with the pretrained language model, seeded with the characters of 'hello'.
import hanlp

lm = hanlp.load(hanlp.pretrained.rnnlm.FLAIR_LM_FW_WMT11_EN_TF)
print(''.join(lm.generate_text(list('hello'))))
================================================ FILE: plugins/hanlp_demo/hanlp_demo/en/demo_ner.py ================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-01-03 22:50
# Demo: run the pretrained NER model on one pre-tokenized sentence.
import hanlp

recognizer = hanlp.load(hanlp.pretrained.ner.CONLL03_NER_BERT_BASE_CASED_EN)
print(recognizer(["President", "Obama", "is", "speaking", "at", "the", "White", "House", "."]))
================================================ FILE: plugins/hanlp_demo/hanlp_demo/en/demo_pipeline.py ================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-01-04 21:05
# Demo: chain sentence splitting, tokenization, tagging and two parsers into one pipeline,
# then save its config and load it back.
import hanlp
from hanlp.utils.lang.en.english_tokenizer import tokenize_english

tokenizer = tokenize_english
tagger = hanlp.load(hanlp.pretrained.pos.PTB_POS_RNN_FASTTEXT_EN)
syntactic_parser = hanlp.load(hanlp.pretrained.dep.PTB_BIAFFINE_DEP_EN)
semantic_parser = hanlp.load(hanlp.pretrained.sdp.SEMEVAL15_PAS_BIAFFINE_EN)

# Each stage writes its result under output_key; the parsers read two earlier outputs via input_key.
pipeline = hanlp.pipeline() \
    .append(hanlp.utils.rules.split_sentence, output_key='sentences') \
    .append(tokenizer, output_key='tokens') \
    .append(tagger, output_key='part_of_speech_tags') \
    .append(syntactic_parser, input_key=('tokens', 'part_of_speech_tags'), output_key='syntactic_dependencies', conll=False) \
    .append(semantic_parser, input_key=('tokens', 'part_of_speech_tags'), output_key='semantic_dependencies', conll=False)
print(pipeline)

text = '''Jobs and Wozniak co-founded Apple in 1976 to sell Wozniak's Apple I personal computer. Together the duo gained fame and wealth a year later with the Apple II. '''

doc = pipeline(text)
print(doc)
# You can save the config to disk for deploying or sharing.
pipeline.save('en.json')
# Then load it smoothly.
deployed = hanlp.load('en.json')
print(deployed)
print(deployed(text))
================================================ FILE: plugins/hanlp_demo/hanlp_demo/en/demo_pos.py ================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-01-03 22:16
# Demo: POS-tag a batch of two pre-tokenized sentences.
import hanlp

tagger = hanlp.load(hanlp.pretrained.pos.PTB_POS_RNN_FASTTEXT_EN)
print(tagger([['I', 'banked', '2', 'dollars', 'in', 'a', 'bank', '.'],
              ['Is', 'this', 'the', 'future', 'of', 'chamber', 'music', '?']]))
================================================ FILE: plugins/hanlp_demo/hanlp_demo/en/demo_sdp.py ================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-01-03 15:26
# Demo: semantic dependency parsing of one sentence given as (token, POS tag) pairs.
import hanlp
from hanlp_common.conll import CoNLLSentence

# semeval15 offers three independent annotations over the Penn Treebank (PTB)
semantic_parser = hanlp.load(hanlp.pretrained.sdp.SEMEVAL15_PAS_BIAFFINE_EN)
# semantic_parser = hanlp.load(hanlp.pretrained.sdp.SEMEVAL15_DM_BIAFFINE_EN)
# semantic_parser = hanlp.load(hanlp.pretrained.sdp.SEMEVAL15_PSD_BIAFFINE_EN)
sent = [('Is', 'VBZ'), ('this', 'DT'), ('the', 'DT'), ('future', 'NN'), ('of', 'IN'), ('chamber', 'NN'), ('music', 'NN'), ('?', '.')]
tree = semantic_parser(sent)  # type:CoNLLSentence
print(tree)
================================================ FILE: plugins/hanlp_demo/hanlp_demo/en/demo_sentiment_analysis.py ================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-01-01 03:52
# Demo: classify one sentence with the model loaded by its identifier string.
import hanlp

classifier = hanlp.load('SST2_ALBERT_BASE_EN')
print(classifier.predict('I feel lucky'))
================================================ FILE: plugins/hanlp_demo/hanlp_demo/en/demo_tok.py ================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-01-02 19:41
# Demo: tokenize English text with the rule-based tokenizer (no model is loaded).
from hanlp.utils.lang.en.english_tokenizer import tokenize_english

text = """\ Don't go gentle into that good night.
"""
print(tokenize_english(text))
================================================ FILE: plugins/hanlp_demo/hanlp_demo/en/train_sst2_albert_base.py ================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-11-10 17:41
# Training script: fit a transformer classifier on SST-2, reload it, predict one sample and evaluate on the test set.
import os

from hanlp.components.classifiers.transformer_classifier_tf import TransformerClassifierTF

from tests import cdroot

from hanlp.datasets.glu.glue import STANFORD_SENTIMENT_TREEBANK_2_DEV, STANFORD_SENTIMENT_TREEBANK_2_TRAIN, \
    STANFORD_SENTIMENT_TREEBANK_2_TEST

# NOTE(review): presumably changes the working directory to the repository root so that
# the relative 'data/...' path below resolves — confirm against tests.cdroot.
cdroot()
save_dir = os.path.join('data', 'model', 'sst', 'sst2_albert_base')
classifier = TransformerClassifierTF()
classifier.fit(STANFORD_SENTIMENT_TREEBANK_2_TRAIN, STANFORD_SENTIMENT_TREEBANK_2_DEV, save_dir,
               transformer='albert-base-v2')
classifier.load(save_dir)
print(classifier('it\' s a charming and often affecting journey'))
classifier.evaluate(STANFORD_SENTIMENT_TREEBANK_2_TEST, save_dir=save_dir)
print(f'Model saved in {save_dir}')
================================================ FILE: plugins/hanlp_demo/hanlp_demo/ja/__init__.py ================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-05-17 22:30
================================================ FILE: plugins/hanlp_demo/hanlp_demo/ja/demo_mtl.py ================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2021-05-17 22:30
# Demo: run the Japanese multi-task model on two raw sentences and pretty-print the result.
import hanlp
from hanlp_common.document import Document

HanLP = hanlp.load(hanlp.pretrained.mtl.NPCMJ_UD_KYOTO_TOK_POS_CON_BERT_BASE_CHAR_JA)
doc: Document = HanLP([
    '2021年、HanLPv2.1は次世代の最先端多言語NLP技術を本番環境に導入します。',
    '奈須きのこは1973年11月28日に千葉県円空山で生まれ、ゲーム制作会社「ノーツ」の設立者だ。',
])
print(doc)
doc.pretty_print()
================================================ FILE: plugins/hanlp_demo/hanlp_demo/mul/__init__.py ================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-31 22:25
================================================ FILE: plugins/hanlp_demo/hanlp_demo/mul/demo_lid.py ================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2022-09-28 16:49
# Demo: language identification, with the optional prob and topk arguments.
import hanlp

lid = hanlp.load(hanlp.pretrained.classifiers.LID_176_FASTTEXT_BASE)

print(lid('In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environments.'))
lang, prob = lid('2021年、HanLPv2.1は次世代の最先端多言語NLP技術を本番環境に導入します。', prob=True)
print(f'{lang} language identified with probability {prob:.3%}')
print(lid('2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', topk=2))

# For a combination of languages, predict top-k languages with probabilities:
text = ''' 2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。 In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environments. '''

print(lid(text, topk=3, prob=True))
================================================ FILE: plugins/hanlp_demo/hanlp_demo/mul/demo_lid_restful.py ================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2022-09-28 16:49
# Demo: language identification through the RESTful client (auth=None, language='mul').
from hanlp_restful import HanLPClient

HanLP = HanLPClient('https://hanlp.hankcs.com/api', auth=None, language='mul')

print(HanLP.language_identification([
    'In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environment.',
    '2021年、HanLPv2.1は次世代の最先端多言語NLP技術を本番環境に導入します。',
    '2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。',
]))
================================================ FILE: plugins/hanlp_demo/hanlp_demo/mul/demo_mtl.py ================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-31 13:51
# Demo: run the multilingual multi-task model on English, Japanese and Chinese sentences.
import hanlp
from hanlp_common.document import Document

HanLP = hanlp.load(hanlp.pretrained.mtl.UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_XLMR_BASE)
doc: Document = HanLP([
    'In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environment.',
    '2021年、HanLPv2.1は次世代の最先端多言語NLP技術を本番環境に導入します。',
    '2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。',
])
print(doc)
doc.pretty_print()
================================================ FILE: plugins/hanlp_demo/hanlp_demo/mul/train/__init__.py ================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2023-02-21 19:40
================================================ FILE: plugins/hanlp_demo/hanlp_demo/mul/train/mul_base.py ================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-03 14:24
# Training script: multilingual multi-task model with a tokenization task and a UD parsing task.
from hanlp.common.dataset import SortingSamplerBuilder
from hanlp.common.transform import NormalizeToken
from hanlp.components.mtl.multi_task_learning import MultiTaskLearning
from hanlp.components.mtl.tasks.tok.tag_tok import TaggingTokenization
from hanlp.components.mtl.tasks.ud import UniversalDependenciesParsing
from hanlp.datasets.parsing.ptb import PTB_TOKEN_MAPPING
from hanlp.datasets.parsing.ud.ud210m import UD_210_MULTILINGUAL_TRAIN, UD_210_MULTILINGUAL_DEV, \
    UD_210_MULTILINGUAL_TEST
from hanlp.layers.embeddings.contextual_word_embedding import ContextualWordEmbedding
from hanlp.utils.log_util import cprint
from tests import cdroot


def main():
    """Train the two-task model, reload it, evaluate it and print a sample analysis."""
    cdroot()
    transformer = "nreimers/mMiniLMv2-L12-H384-distilled-from-XLMR-Large"
    tasks = {
        'tok': TaggingTokenization(
            'data/mtl/mul/tok/train.tsv',
            'data/mtl/mul/tok/dev.tsv',
            'data/mtl/mul/tok/test.tsv',
            SortingSamplerBuilder(batch_size=128, batch_max_tokens=12800),
            hard_constraint=True,
            tagging_scheme='BMES',
            delimiter='\t',
            max_seq_len=256,
            char_level=True,
            lr=1e-3,
        ),
        'ud': UniversalDependenciesParsing(
            UD_210_MULTILINGUAL_TRAIN,
            UD_210_MULTILINGUAL_DEV,
            UD_210_MULTILINGUAL_TEST,
            SortingSamplerBuilder(batch_size=128, batch_max_tokens=12800),
            lr=1e-3,
            dependencies='tok',
            max_seq_len=256,
        ),
    }
    mtl = MultiTaskLearning()
    save_dir = 'data/model/mtl/ud_ontonotes_tok_pos_lem_fea_ner_srl_dep_sdp_con_mMiniLMv2L12'
    cprint(f'Model will be saved in [cyan]{save_dir}[/cyan]')
    mtl.fit(
        ContextualWordEmbedding(
            'token',
            transformer,
            average_subwords=True,
            max_sequence_length=512,
            word_dropout=.2,
        ),
        tasks,
        save_dir,
        30,
        lr=1e-3,
        encoder_lr=5e-5,
        grad_norm=1,
        gradient_accumulation=8,
        eval_trn=False,
        transform=NormalizeToken(PTB_TOKEN_MAPPING, 'token'),
        tau=0.5,
        cache='data/cache/ud/mtl',
    )
    cprint(f'Model saved in [cyan]{save_dir}[/cyan]')
    mtl.load(save_dir)
    # Adjust settings on the reloaded model and write them back to the saved config.
    mtl['tok'].dict_force = {"'s", "n't", "'ll", "'m", "'d", "'ve", "'re"}
    mtl['ud'].config.tree = True
    mtl.save_config(save_dir)
    # Point the reloaded tasks at the same train/dev/test data as the tasks defined above.
    for k, v in mtl.tasks.items():
        v.trn = tasks[k].trn
        v.dev = tasks[k].dev
        v.tst = tasks[k].tst
    mtl.evaluate(save_dir)
    doc = mtl(['In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environments.',
               '2021年、HanLPv2.1は次世代の最先端多言語NLP技術を本番環境に導入します。',
               '2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。'])
    doc.pretty_print()


if __name__ == '__main__':
    main()
================================================ FILE: plugins/hanlp_demo/hanlp_demo/sent_split.py ================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-31 14:23
# Demo: split mixed English/Chinese/Japanese text into sentences and print one per line.
import hanlp

split_sent = hanlp.load(hanlp.pretrained.eos.UD_CTB_EOS_MUL)
output = split_sent('3.14 is pi. “你好!!!”——他说。劇場版「Fate/stay night [HF]」最終章公開カウントダウン!')
print('\n'.join(output))
# See also https://hanlp.hankcs.com/docs/api/hanlp/components/eos.html
================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/__init__.py ================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-31 13:51
================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/abstractive_summarization_restful.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": { "id": "WfGpInivS0fG" }, "source": [ "

点击下列图标在线运行HanLP

\n", "
\n", "\t\"Open\n", "\t\"Open\n", "
\n", "\n", "## 安装" ] }, { "cell_type": "markdown", "metadata": { "id": "IYwV-UkNNzFp" }, "source": [ "无论是Windows、Linux还是macOS,HanLP的安装只需一句话搞定:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "1Uf_u7ddMhUt" }, "outputs": [], "source": [ "!pip install hanlp_restful -U" ] }, { "cell_type": "markdown", "metadata": { "id": "pp-1KqEOOJ4t" }, "source": [ "## 创建客户端" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "4M7ka0K5OMWU", "outputId": "d74f0749-0587-454a-d7c9-7418d45ce534" }, "outputs": [], "source": [ "from hanlp_restful import HanLPClient\n", "HanLP = HanLPClient('https://www.hanlp.com/api', auth=None, language='zh') # auth不填则匿名,zh中文,mul多语种" ] }, { "cell_type": "markdown", "metadata": { "id": "BMW528wGNulM" }, "source": [ "#### 申请秘钥\n", "由于服务器算力有限,匿名用户每分钟限2次调用。如果你需要更多调用次数,[建议申请免费公益API秘钥auth](https://bbs.hanlp.com/t/hanlp2-1-restful-api/53)。" ] }, { "cell_type": "markdown", "metadata": { "id": "elA_UyssOut_" }, "source": [ "## 生成式自动摘要\n", "生成式自动摘要(Abstractive Summarization)任务的目标是为文章生成一段简短的概括性摘要。 生成的摘要有可能出现原文中不存在的新短语或新句子,并且整体流畅性较高。\n", "### 中文\n", "生成式自动摘要任务的输入为一段文本:" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "BqEmDMGGOtk3", "outputId": "936d439a-e1ff-4308-d2aa-775955558594" }, "outputs": [ { "data": { "text/plain": [ "'长江证券:看好大金属品种中的铜铝钢'" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "HanLP.abstractive_summarization('''\n", "每经AI快讯,2月4日,长江证券研究所金属行业首席分析师王鹤涛表示,2023年海外经济衰退,美债现处于历史高位,\n", "黄金的趋势是值得关注的;在国内需求修复的过程中,看好大金属品种中的铜铝钢。\n", "此外,在细分的小品种里,建议关注两条主线,一是新能源,比如锂、钴、镍、稀土,二是专精特新主线。(央视财经)\n", "''')" ] }, { "cell_type": "markdown", "metadata": { "id": "jj1Jk-2sPHYx" }, "source": [ "返回值为一段摘要。" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 英文\n", "按照HanLP一贯的多语种设计,任何语言都支持。由于服务器GPU资源限制,目前英文接口暂未上线。如果你有相应需求,欢迎前往论坛发起请愿。" ] } ], "metadata": { 
"accelerator": "GPU", "colab": { "collapsed_sections": [], "name": "absum_restful.ipynb", "provenance": [] }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.12" } }, "nbformat": 4, "nbformat_minor": 1 } ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/amr_restful.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": { "id": "WfGpInivS0fG" }, "source": [ "

点击下列图标在线运行HanLP

\n", "
\n", "\t\"Open\n", "\t\"Open\n", "
\n", "\n", "## 安装" ] }, { "cell_type": "markdown", "metadata": { "id": "IYwV-UkNNzFp" }, "source": [ "无论是Windows、Linux还是macOS,HanLP的安装只需一句话搞定:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "1Uf_u7ddMhUt" }, "outputs": [], "source": [ "!pip install hanlp_restful -U" ] }, { "cell_type": "markdown", "metadata": { "id": "pp-1KqEOOJ4t" }, "source": [ "## 创建客户端" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "4M7ka0K5OMWU", "outputId": "d74f0749-0587-454a-d7c9-7418d45ce534" }, "outputs": [], "source": [ "from hanlp_restful import HanLPClient\n", "HanLP = HanLPClient('https://www.hanlp.com/api', auth=None, language='zh') # auth不填则匿名,zh中文,mul多语种" ] }, { "cell_type": "markdown", "metadata": { "id": "BMW528wGNulM" }, "source": [ "#### 申请秘钥\n", "由于服务器算力有限,匿名用户每分钟限2次调用。如果你需要更多调用次数,[建议申请免费公益API秘钥auth](https://bbs.hanlp.com/t/hanlp2-1-restful-api/53)。" ] }, { "cell_type": "markdown", "metadata": { "id": "elA_UyssOut_" }, "source": [ "## 抽象意义表示\n", "### 中文\n", "抽象意义表示任务的输入为一段文本或已分词完毕的句子:" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "BqEmDMGGOtk3", "outputId": "936d439a-e1ff-4308-d2aa-775955558594" }, "outputs": [ { "data": { "text/plain": [ "1" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "graphs = HanLP.abstract_meaning_representation('男孩希望女孩相信他。')\n", "len(graphs)" ] }, { "cell_type": "markdown", "metadata": { "id": "jj1Jk-2sPHYx" }, "source": [ "返回值为每个句子相应的AMR图的Meaning Representation格式:" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'id': '0',\n", " 'input': '男孩 希望 女孩 相信 他 。',\n", " 'nodes': [{'id': 0,\n", " 'label': '男孩',\n", " 'anchors': [{'from': 0, 'to': 2}, {'from': 12, 'to': 13}]},\n", " {'id': 1, 'label': '希望-01', 'anchors': [{'from': 3, 'to': 5}]},\n", " {'id': 2, 'label': '女孩', 
'anchors': [{'from': 6, 'to': 8}]},\n", " {'id': 3, 'label': '相信-01', 'anchors': [{'from': 9, 'to': 11}]}],\n", " 'edges': [{'source': 1, 'target': 3, 'label': 'arg1'},\n", " {'source': 1, 'target': 0, 'label': 'arg0'},\n", " {'source': 3, 'target': 2, 'label': 'arg0'},\n", " {'source': 3, 'target': 0, 'label': 'arg1'}],\n", " 'tops': [1],\n", " 'framework': 'amr'}" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "graph = graphs[0]\n", "graph" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "注意上面“男孩”有2个anchor,分别对应“男孩”和“他”。也就是说,MR格式其实包含了指代消解的结果。" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 可视化\n", "指定`visualization='svg'`即可得到矢量图可视化。" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "0\n", "\n", "\n", "\n", "\n", "1\n", "\n", "希望-01\n", "\n", "\n", "\n", "top->1\n", "\n", "\n", "\n", "\n", "\n", "0\n", "\n", "男孩\n", "\n", "\n", "\n", "1->0\n", "\n", "\n", "arg0\n", "\n", "\n", "\n", "3\n", "\n", "相信-01\n", "\n", "\n", "\n", "1->3\n", "\n", "\n", "arg1\n", "\n", "\n", "\n", "3->0\n", "\n", "\n", "arg1\n", "\n", "\n", "\n", "2\n", "\n", "女孩\n", "\n", "\n", "\n", "3->2\n", "\n", "\n", "arg0\n", "\n", "\n", "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from IPython.display import SVG, display\n", "\n", "def show_svg(g):\n", " display(SVG(data=g['svg']))\n", " \n", "graph = HanLP.abstract_meaning_representation('男孩希望女孩相信他。', visualization='svg')[0]\n", "show_svg(graph)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 多语种支持\n", "除了中文外,支持的语言列表:" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### 英文\n", "目前,HanLP服务器还支持英文AMR:" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "0\n", "\n", "\n", "\n", "\n", "1\n", "\n", "want-01\n", "\n", "\n", "\n", "top->1\n", "\n", "\n", "\n", 
"\n", "\n", "0\n", "\n", "boy\n", "\n", "\n", "\n", "1->0\n", "\n", "\n", "arg0\n", "\n", "\n", "\n", "3\n", "\n", "believe-01\n", "\n", "\n", "\n", "1->3\n", "\n", "\n", "arg1\n", "\n", "\n", "\n", "3->0\n", "\n", "\n", "arg1\n", "\n", "\n", "\n", "2\n", "\n", "girl\n", "\n", "\n", "\n", "3->2\n", "\n", "\n", "arg0\n", "\n", "\n", "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "graph = HanLP.abstract_meaning_representation('The boy wants the girl to believe him.',\n", " language='en', visualization='svg')[0]\n", "show_svg(graph)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "用户可以通过指定`language`参数来实现英文抽象意义表示的分析:" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'id': '0',\n", " 'input': 'The boy wants the girl to believe him .',\n", " 'nodes': [{'id': 0, 'label': 'boy'},\n", " {'id': 1, 'label': 'wants-01'},\n", " {'id': 2, 'label': 'girl'},\n", " {'id': 3, 'label': 'believe-01'}],\n", " 'edges': [{'source': 3, 'target': 0, 'label': 'arg1'},\n", " {'source': 1, 'target': 3, 'label': 'arg1'},\n", " {'source': 3, 'target': 2, 'label': 'arg0'},\n", " {'source': 1, 'target': 0, 'label': 'arg0'}],\n", " 'tops': [1],\n", " 'framework': 'amr'}" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "HanLP.abstract_meaning_representation(tokens=[['The', 'boy', 'wants', 'the', 'girl', 'to', 'believe', 'him', '.']], \n", " language='en')[0]" ] } ], "metadata": { "accelerator": "GPU", "colab": { "collapsed_sections": [], "name": "amr_stl.ipynb", "provenance": [] }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.12" } }, "nbformat": 4, 
"nbformat_minor": 1 } ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/amr_stl.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": { "id": "WfGpInivS0fG" }, "source": [ "

点击下列图标在线运行HanLP

\n", "
\n", "\t\"Open\n", "\t\"Open\n", "
\n", "\n", "## 安装" ] }, { "cell_type": "markdown", "metadata": { "id": "IYwV-UkNNzFp" }, "source": [ "无论是Windows、Linux还是macOS,HanLP的安装只需一句话搞定:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "1Uf_u7ddMhUt" }, "outputs": [], "source": [ "!pip install hanlp[amr] -U" ] }, { "cell_type": "markdown", "metadata": { "id": "pp-1KqEOOJ4t" }, "source": [ "## 加载模型\n", "HanLP的工作流程是先加载模型,模型的标示符存储在`hanlp.pretrained`这个包中,按照NLP任务归类。" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "4M7ka0K5OMWU", "outputId": "d74f0749-0587-454a-d7c9-7418d45ce534" }, "outputs": [ { "data": { "text/plain": [ "{'AMR3_SEQ2SEQ_BART_LARGE': 'https://file.hankcs.com/hanlp/amr/amr3_seq2seq_bart_large_83.30_20220125_114450.zip',\n", " 'MRP2020_AMR_ENG_ZHO_XLM_BASE': 'http://download.hanlp.com/amr/extra/amr-eng-zho-xlm-roberta-base_20220412_223756.zip',\n", " 'MRP2020_AMR_ZHO_MENGZI_BASE': 'http://download.hanlp.com/amr/extra/amr-zho-mengzi-base_20220415_101941.zip'}" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import hanlp\n", "hanlp.pretrained.amr.ALL # 语种见名称最后一个字段或相应语料库" ] }, { "cell_type": "markdown", "metadata": { "id": "BMW528wGNulM" }, "source": [ "调用`hanlp.load`进行加载,模型会自动下载到本地缓存。" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "0tmKBu7sNAXX", "outputId": "df2de87b-27f5-4c72-8eb2-25ceefdd8270" }, "outputs": [], "source": [ "amr = hanlp.load('MRP2020_AMR_ENG_ZHO_XLM_BASE')" ] }, { "cell_type": "markdown", "metadata": { "id": "elA_UyssOut_" }, "source": [ "## 抽象意义表示\n", "抽象意义表示任务的输入为一个或多个句子,`MRP2020_AMR_ENG_ZHO_XLM_BASE`要求提供分词完毕的句子:" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "BqEmDMGGOtk3", "outputId": "936d439a-e1ff-4308-d2aa-775955558594" }, "outputs": [], "source": [ "graph = amr([\"男孩\", \"希望\", 
\"女孩\", \"相信\", \"他\", \"。\"])" ] }, { "cell_type": "markdown", "metadata": { "id": "jj1Jk-2sPHYx" }, "source": [ "返回对象为[penman.Graph](https://penman.readthedocs.io/en/latest/api/penman.graph.html)类型:" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "graph" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "打印时为友好格式:" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(x2 / 希望-01\n", " :arg1 (x4 / 相信-01\n", " :arg0 (x3 / 女孩)\n", " :arg1 x1)\n", " :arg0 (x1 / 男孩))\n" ] } ], "source": [ "print(graph)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "该AMR的可视化结果为:\n", "\n", "![amr-zh](https://hanlp.hankcs.com/backend/v2/amr_svg?tokens=%E7%94%B7%E5%AD%A9%20%E5%B8%8C%E6%9C%9B%20%E5%A5%B3%E5%AD%A9%20%E7%9B%B8%E4%BF%A1%20%E4%BB%96%20%E3%80%82&language=zh&scale=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "`MRP2020_AMR_ENG_ZHO_XLM_BASE`其实是一个Meaning Representation Parsing模型,支持输出Meaning Representation(MR)格式,该格式比AMR的表达力更强:" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'id': '0',\n", " 'input': '男孩 希望 女孩 相信 他 。',\n", " 'nodes': [{'id': 0,\n", " 'label': '男孩',\n", " 'anchors': [{'from': 0, 'to': 2}, {'from': 12, 'to': 13}]},\n", " {'id': 1, 'label': '希望-01', 'anchors': [{'from': 3, 'to': 5}]},\n", " {'id': 2, 'label': '女孩', 'anchors': [{'from': 6, 'to': 8}]},\n", " {'id': 3, 'label': '相信-01', 'anchors': [{'from': 9, 'to': 11}]}],\n", " 'edges': [{'source': 1, 'target': 3, 'label': 'arg1'},\n", " {'source': 1, 'target': 0, 'label': 'arg0'},\n", " {'source': 3, 'target': 2, 'label': 'arg0'},\n", " {'source': 3, 'target': 0, 'label': 'arg1'}],\n", " 'tops': [1],\n", " 'framework': 'amr'}" ] }, "execution_count": 6, "metadata": {}, "output_type": 
"execute_result" } ], "source": [ "amr([\"男孩\", \"希望\", \"女孩\", \"相信\", \"他\", \"。\"], output_amr=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "注意上面“男孩”有2个anchor,分别对应“男孩”和“他”。也就是说,MR格式其实包含了指代消解的结果。" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 多语种支持\n", "`MRP2020_AMR_ENG_ZHO_XLM_BASE`同时还是一个Cross-Lingual模型,支持的语言列表:" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[['amr', 'eng'], ['amr', 'zho']]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "amr.config.frameworks" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "用户可以通过指定language参数来实现英文抽象意义表示的分析:" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(w1 / wants-01\n", " :arg1 (b2 / believe-01\n", " :arg0 (g1 / girl)\n", " :arg1 b1)\n", " :arg0 (b1 / boy))\n" ] } ], "source": [ "print(amr(['The', 'boy', 'wants', 'the', 'girl', 'to', 'believe', 'him', '.'], language='eng'))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "为了达到最佳效果,建议同时提供每个词的词干:" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(w1 / want-01\n", " :arg1 (b2 / believe-01\n", " :arg0 (g1 / girl)\n", " :arg1 b1)\n", " :arg0 (b1 / boy))\n" ] } ], "source": [ "print(amr([('The', 'the'), ('boy', 'boy'), ('wants', 'want'), ('the', 'the'), ('girl', 'girl'), ('to', 'to'),\n", " ('believe', 'believe'), ('him', 'he'), ('.', '.')], language='eng'))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "该AMR的可视化结果为:\n", "\n", "![amr-en](https://hanlp.hankcs.com/backend/v2/amr_svg?tokens=The%20boy%20wants%20the%20girl%20to%20believe%20him%20.&language=en&scale=1)" ] } ], "metadata": { "accelerator": "GPU", "colab": { "collapsed_sections": [], "name": "amr_stl.ipynb", "provenance": [] }, "kernelspec": { "display_name": 
"Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.12" } }, "nbformat": 4, "nbformat_minor": 1 } ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/classification_restful.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": { "id": "WfGpInivS0fG" }, "source": [ "

点击下列图标在线运行HanLP

\n", "
\n", "\t\"Open\n", "\t\"Open\n", "
\n", "\n", "## 安装" ] }, { "cell_type": "markdown", "metadata": { "id": "nf9TgeCTC0OT" }, "source": [ "无论是Windows、Linux还是macOS,HanLP的安装只需一句话搞定:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "jaW4eu6kC0OU", "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "!pip install hanlp_restful -U" ] }, { "cell_type": "markdown", "metadata": { "id": "_xI_bLAaC0OU" }, "source": [ "## 创建客户端" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "IYwV-UkNNzFp", "outputId": "54065443-9b0a-444c-f6c0-c701bc86400b", "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "from hanlp_restful import HanLPClient\n", "HanLP = HanLPClient('https://www.hanlp.com/api', auth=None, language='zh') # auth不填则匿名,zh中文,mul多语种" ] }, { "cell_type": "markdown", "metadata": { "id": "1Uf_u7ddMhUt", "pycharm": { "name": "#%% md\n" } }, "source": [ "#### 申请密钥\n", "由于服务器算力有限,匿名用户每分钟限2次调用。如果你需要更多调用次数,[建议申请免费公益API密钥auth](https://bbs.hanlp.com/t/hanlp2-1-restful-api/53)。" ] }, { "cell_type": "markdown", "metadata": { "id": "elA_UyssOut_" }, "source": [ "## 文本分类\n", "文本分类任务的输入为文档以及分类模型,以新闻领域的`news_zh`为例:" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "id": "BqEmDMGGOtk3" }, "outputs": [ { "data": { "text/plain": [ "'科技'" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "HanLP.text_classification('2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', model='news_zh')" ] }, { "cell_type": "markdown", "metadata": { "id": "SwaPn1hjC0OW" }, "source": [ "返回值为文档最可能的类目。HanLP支持返回类目对应的概率(置信度):" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "egpWwHKxC0OX", "outputId": "f7c77687-dd75-4fa2-dbd2-be6bda8a3fff" }, "outputs": [ { "data": { "text/plain": [ "['科技', 0.999642014503479]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ 
"HanLP.text_classification('2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', model='news_zh', prob=True)" ] }, { "cell_type": "markdown", "metadata": { "id": "kq_j5TLFC0OX" }, "source": [ "HanLP也支持返回概率最高的`topk`个类目:" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "isJhzYyIC0OX", "outputId": "683c8489-dffc-426e-f95b-e91dfb373260" }, "outputs": [ { "data": { "text/plain": [ "['科技', '家居']" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "HanLP.text_classification('2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', model='news_zh', topk=2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "该功能对于混合了多个主题的文档而言特别实用:" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "{'时尚': 0.6342714428901672,\n", " '家居': 0.359315425157547,\n", " '科技': 0.0013340614968910813,\n", " '体育': 0.001275017624720931,\n", " '房产': 0.0010209722677245736,\n", " '娱乐': 0.0006360886618494987,\n", " '财经': 0.0005668793455697596,\n", " '游戏': 0.00037119409535080194,\n", " '教育': 0.00029694309341721237,\n", " '股票': 0.0002858955995179713,\n", " '星座': 0.0002288677787873894,\n", " '彩票': 0.00022682634880766273,\n", " '时政': 0.0001005345256999135,\n", " '社会': 6.985480285948142e-05}" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "text = '''\n", "改了好几次,感觉终于可以确定了。\n", "这次的真丝是做了古董感的米金色染色,法蕾也做了同样的颜色。\n", "真丝软糯的手感和温柔的光泽感,在即将结束的冬天,显得格外的美好。\n", "'''\n", "\n", "HanLP.text_classification(text, model='news_zh', topk=True, prob=True)" ] } ], "metadata": { "colab": { "collapsed_sections": [], "name": "classification_restful.ipynb", "provenance": [] }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", 
"nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.12" } }, "nbformat": 4, "nbformat_minor": 1 } ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/con_mtl.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": { "id": "WfGpInivS0fG" }, "source": [ "

点击下列图标在线运行HanLP

\n", "
\n", "\t\"Open\n", "\t\"Open\n", "
\n", "\n", "## 安装" ] }, { "cell_type": "markdown", "metadata": { "id": "IYwV-UkNNzFp" }, "source": [ "无论是Windows、Linux还是macOS,HanLP的安装只需一句话搞定:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "1Uf_u7ddMhUt" }, "outputs": [], "source": [ "!pip install hanlp -U" ] }, { "cell_type": "markdown", "metadata": { "id": "pp-1KqEOOJ4t" }, "source": [ "## 加载模型\n", "HanLP的工作流程是先加载模型,模型的标识符存储在`hanlp.pretrained`这个包中,按照NLP任务归类。" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "id": "0tmKBu7sNAXX" }, "outputs": [ { "data": { "text/plain": [ "{'OPEN_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH': 'https://file.hankcs.com/hanlp/mtl/open_tok_pos_ner_srl_dep_sdp_con_electra_small_20201223_035557.zip',\n", " 'OPEN_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH': 'https://file.hankcs.com/hanlp/mtl/open_tok_pos_ner_srl_dep_sdp_con_electra_base_20201223_201906.zip',\n", " 'CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH': 'https://file.hankcs.com/hanlp/mtl/close_tok_pos_ner_srl_dep_sdp_con_electra_small_20210111_124159.zip',\n", " 'CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH': 'https://file.hankcs.com/hanlp/mtl/close_tok_pos_ner_srl_dep_sdp_con_electra_base_20210111_124519.zip',\n", " 'CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ERNIE_GRAM_ZH': 'https://file.hankcs.com/hanlp/mtl/close_tok_pos_ner_srl_dep_sdp_con_ernie_gram_base_aug_20210904_145403.zip',\n", " 'UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_MT5_SMALL': 'https://file.hankcs.com/hanlp/mtl/ud_ontonotes_tok_pos_lem_fea_ner_srl_dep_sdp_con_mt5_small_20210228_123458.zip',\n", " 'UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_XLMR_BASE': 'https://file.hankcs.com/hanlp/mtl/ud_ontonotes_tok_pos_lem_fea_ner_srl_dep_sdp_con_xlm_base_20210602_211620.zip',\n", " 'NPCMJ_UD_KYOTO_TOK_POS_CON_BERT_BASE_CHAR_JA': 'https://file.hankcs.com/hanlp/mtl/npcmj_ud_kyoto_tok_pos_ner_dep_con_srl_bert_base_char_ja_20210914_133742.zip'}" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" 
} ], "source": [ "import hanlp\n", "hanlp.pretrained.mtl.ALL # MTL多任务,具体任务见模型名称,语种见名称最后一个字段或相应语料库" ] }, { "cell_type": "markdown", "metadata": { "id": "EmZDmLn9aGxG" }, "source": [ "调用`hanlp.load`进行加载,模型会自动下载到本地缓存。自然语言处理分为许多任务,分词只是最初级的一个。与其每个任务单独创建一个模型,不如利用HanLP的联合模型一次性完成多个任务:" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "HanLP = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH)" ] }, { "cell_type": "markdown", "metadata": { "id": "elA_UyssOut_" }, "source": [ "## 短语句法分析\n", "任务越少,速度越快。如指定仅执行短语句法分析:" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 70 }, "id": "BqEmDMGGOtk3", "outputId": "2a0d392f-b99a-4a18-fc7f-754e2abe2e34" }, "outputs": [], "source": [ "doc = HanLP(['2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', '阿婆主来到北京立方庭参观自然语义科技公司。'], tasks='con')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "返回值为一个[Document](https://hanlp.hankcs.com/docs/api/common/document.html):" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\n", " \"tok/fine\": [\n", " [\"2021年\", \"HanLPv2.1\", \"为\", \"生产\", \"环境\", \"带来\", \"次\", \"世代\", \"最\", \"先进\", \"的\", \"多\", \"语种\", \"NLP\", \"技术\", \"。\"],\n", " [\"阿婆主\", \"来到\", \"北京\", \"立方庭\", \"参观\", \"自然\", \"语义\", \"科技\", \"公司\", \"。\"]\n", " ],\n", " \"con\": [\n", " [\"TOP\", [[\"IP\", [[\"NP\", [[\"_\", [\"2021年\"]]]], [\"NP\", [[\"_\", [\"HanLPv2.1\"]]]], [\"VP\", [[\"PP\", [[\"_\", [\"为\"]], [\"NP\", [[\"_\", [\"生产\"]], [\"_\", [\"环境\"]]]]]], [\"VP\", [[\"_\", [\"带来\"]], [\"NP\", [[\"ADJP\", [[\"NP\", [[\"ADJP\", [[\"_\", [\"次\"]]]], [\"NP\", [[\"_\", [\"世代\"]]]]]], [\"ADVP\", [[\"_\", [\"最\"]]]], [\"VP\", [[\"_\", [\"先进\"]]]]]], [\"_\", [\"的\"]], [\"NP\", [[\"QP\", [[\"_\", [\"多\"]]]], [\"NP\", [[\"_\", [\"语种\"]]]]]], [\"NP\", [[\"_\", [\"NLP\"]], 
[\"_\", [\"技术\"]]]]]]]]]], [\"_\", [\"。\"]]]]]],\n", " [\"TOP\", [[\"IP\", [[\"NP\", [[\"_\", [\"阿婆主\"]]]], [\"VP\", [[\"VP\", [[\"_\", [\"来到\"]], [\"NP\", [[\"_\", [\"北京\"]], [\"_\", [\"立方庭\"]]]]]], [\"VP\", [[\"_\", [\"参观\"]], [\"NP\", [[\"_\", [\"自然\"]], [\"_\", [\"语义\"]], [\"_\", [\"科技\"]], [\"_\", [\"公司\"]]]]]]]], [\"_\", [\"。\"]]]]]]\n", " ]\n", "}\n" ] } ], "source": [ "print(doc)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "`doc['con']`为Tree类型,是list的子类。" ] }, { "cell_type": "markdown", "metadata": { "id": "wxctCigrTKu-" }, "source": [ "可视化短语句法树:" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Zo08uquCTFSk", "outputId": "c6077f2d-7084-4f4b-a3bc-9aa9951704ea" }, "outputs": [ { "data": { "text/html": [ "
Token     
───────── 
2021年     
HanLPv2.1 
为         
生产        
环境        
带来        
次         
世代        
最         
先进        
的         
多         
语种        
NLP       
技术        
。         
P    3       4       5       6       7       8       9 
───────────────────────────────────────────────────────
_───────────────────────────────────────────►NP ───┐   
_───────────────────────────────────────────►NP────┤   
_──────────┐                                       │   
_──┐       ├────────────────────────►PP ───┐       │   
_──┴►NP ───┘                               │       │   
_──────────────────────────────────┐       │       │   
_───►ADJP──┐                       │       ├►VP────┤   
_───►NP ───┴►NP ───┐               │       │       │   
_───────────►ADVP──┼►ADJP──┐       ├►VP ───┘       ├►IP
_───────────►VP ───┘       │       │               │   
_──────────────────────────┤       │               │   
_───►QP ───┐               ├►NP ───┘               │   
_───►NP ───┴────────►NP────┤                       │   
_──┐                       │                       │   
_──┴────────────────►NP ───┘                       │   
_──────────────────────────────────────────────────┘   

Tok 
─── 
阿婆主 
来到  
北京  
立方庭 
参观  
自然  
语义  
科技  
公司  
。   
P    3       4       5       6 
───────────────────────────────
_───────────────────►NP ───┐   
_──────────┐               │   
_──┐       ├►VP ───┐       │   
_──┴►NP ───┘       │       │   
_──────────┐       ├►VP────┤   
_──┐       │       │       ├►IP
_  │       ├►VP ───┘       │   
_  ├►NP ───┘               │   
_──┘                       │   
_──────────────────────────┘   
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "doc.pretty_print()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "将第一个短语树转换为bracketed格式:" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(TOP\n", " (IP\n", " (NP (_ 2021年))\n", " (NP (_ HanLPv2.1))\n", " (VP\n", " (PP (_ 为) (NP (_ 生产) (_ 环境)))\n", " (VP\n", " (_ 带来)\n", " (NP\n", " (ADJP\n", " (NP (ADJP (_ 次)) (NP (_ 世代)))\n", " (ADVP (_ 最))\n", " (VP (_ 先进)))\n", " (_ 的)\n", " (NP (QP (_ 多)) (NP (_ 语种)))\n", " (NP (_ NLP) (_ 技术)))))\n", " (_ 。)))\n" ] } ], "source": [ "print(doc['con'][0])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "将第一个短语树转换为list格式:" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['TOP',\n", " [['IP',\n", " [['NP', [['_', ['2021年']]]],\n", " ['NP', [['_', ['HanLPv2.1']]]],\n", " ['VP',\n", " [['PP', [['_', ['为']], ['NP', [['_', ['生产']], ['_', ['环境']]]]]],\n", " ['VP',\n", " [['_', ['带来']],\n", " ['NP',\n", " [['ADJP',\n", " [['NP', [['ADJP', [['_', ['次']]]], ['NP', [['_', ['世代']]]]]],\n", " ['ADVP', [['_', ['最']]]],\n", " ['VP', [['_', ['先进']]]]]],\n", " ['_', ['的']],\n", " ['NP', [['QP', [['_', ['多']]]], ['NP', [['_', ['语种']]]]]],\n", " ['NP', [['_', ['NLP']], ['_', ['技术']]]]]]]]]],\n", " ['_', ['。']]]]]]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "doc['con'][0].to_list()" ] }, { "cell_type": "markdown", "metadata": { "id": "XOsWkOqQfzlr" }, "source": [ "为已分词的句子执行短语句法分析:" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 70 }, "id": "bLZSTbv_f3OA", "outputId": "111c0be9-bac6-4eee-d5bd-a972ffc34844" }, "outputs": [ { "data": { "text/html": [ "
Token 
───── 
hanlp 
为     
生产    
环境    
带来    
次世代   
最     
先进    
的     
多语种   
nlp   
技术    
。     
P    3       4       5       6       7       8       9 
───────────────────────────────────────────────────────
_───────────────────────────────────────────►NP ───┐   
_──────────┐                                       │   
_──┐       ├────────────────────────►PP ───┐       │   
_──┴►NP ───┘                               │       │   
_──────────────────────────────────┐       │       │   
_───►NP ───┐                       │       ├►VP────┤   
_───►ADVP──┼►VP ────►IP ───┐       │       │       ├►IP
_───►VP ───┘               │       ├►VP ───┘       │   
_──────────────────────────┤       │               │   
_───────────────────►NP────┼►NP ───┘               │   
_───────────────────►NP────┤                       │   
_───────────────────►NP ───┘                       │   
_──────────────────────────────────────────────────┘   

Tok 
─── 
我   
的   
希望  
是   
希望  
张晚霞 
的   
背影  
被   
晚霞  
映红  
。   
P    3       4       5       6       7       8       9       10      11
───────────────────────────────────────────────────────────────────────
_───►NP ───┐                                                           
_──────────┴►DNP ──┐                                                   
_───────────►NP ───┴────────────────────────────────────────►NP ───┐   
_──────────────────────────────────────────────────────────┐       │   
_──────────────────────────────────────────┐               │       │   
_───►NP ───┐                               │               ├►VP────┤   
_──────────┴►DNP ──┐                       ├►VP ────►IP ───┘       │   
_───────────►NP ───┴────────►NP ───┐       │                       ├►IP
_──────────────────────────┐       ├►IP ───┘                       │   
_───►NP ───┐               ├►VP ───┘                               │   
_───►VP ───┴►IP ────►CP ───┘                                       │   
_──────────────────────────────────────────────────────────────────┘   
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "HanLP([\n", " [\"HanLP\", \"为\", \"生产\", \"环境\", \"带来\", \"次世代\", \"最\", \"先进\", \"的\", \"多语种\", \"NLP\", \"技术\", \"。\"],\n", " [\"我\", \"的\", \"希望\", \"是\", \"希望\", \"张晚霞\", \"的\", \"背影\", \"被\", \"晚霞\", \"映红\", \"。\"]\n", " ], tasks='con', skip_tasks='tok*').pretty_print()" ] } ], "metadata": { "accelerator": "GPU", "colab": { "collapsed_sections": [], "name": "con_mtl.ipynb", "provenance": [] }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.12" } }, "nbformat": 4, "nbformat_minor": 1 } ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/con_restful.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": { "id": "WfGpInivS0fG" }, "source": [ "

点击下列图标在线运行HanLP

\n", "
\n", "\t\"Open\n", "\t\"Open\n", "
\n", "\n", "## 安装" ] }, { "cell_type": "markdown", "metadata": { "id": "IYwV-UkNNzFp" }, "source": [ "无论是Windows、Linux还是macOS,HanLP的安装只需一句话搞定:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "1Uf_u7ddMhUt" }, "outputs": [], "source": [ "!pip install hanlp_restful -U" ] }, { "cell_type": "markdown", "metadata": { "id": "pp-1KqEOOJ4t" }, "source": [ "## 创建客户端" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "id": "0tmKBu7sNAXX" }, "outputs": [], "source": [ "from hanlp_restful import HanLPClient\n", "HanLP = HanLPClient('https://www.hanlp.com/api', auth=None, language='zh') # auth不填则匿名,zh中文,mul多语种" ] }, { "cell_type": "markdown", "metadata": { "id": "EmZDmLn9aGxG" }, "source": [ "#### 申请密钥\n", "由于服务器算力有限,匿名用户每分钟限2次调用。如果你需要更多调用次数,[建议申请免费公益API密钥auth](https://bbs.hanlp.com/t/hanlp2-1-restful-api/53)。" ] }, { "cell_type": "markdown", "metadata": { "id": "elA_UyssOut_" }, "source": [ "## 短语句法分析\n", "任务越少,速度越快。如指定仅执行短语句法分析:" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 70 }, "id": "BqEmDMGGOtk3", "outputId": "2a0d392f-b99a-4a18-fc7f-754e2abe2e34" }, "outputs": [], "source": [ "doc = HanLP('2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', tasks='con')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "返回值为一个[Document](https://hanlp.hankcs.com/docs/api/common/document.html):" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\n", " \"tok/fine\": [\n", " [\"2021年\", \"HanLPv2.1\", \"为\", \"生产\", \"环境\", \"带来\", \"次\", \"世代\", \"最\", \"先进\", \"的\", \"多\", \"语种\", \"NLP\", \"技术\", \"。\"]\n", " ],\n", " \"con\": [\n", " [\"TOP\", [[\"IP\", [[\"NP\", [[\"_\", [\"2021年\"]]]], [\"NP\", [[\"_\", [\"HanLPv2.1\"]]]], [\"VP\", [[\"PP\", [[\"_\", [\"为\"]], [\"NP\", [[\"_\", [\"生产\"]], [\"_\", [\"环境\"]]]]]], [\"VP\", [[\"_\", [\"带来\"]], [\"NP\", [[\"IP\", [[\"VP\", [[\"NP\", 
[[\"QP\", [[\"CLP\", [[\"_\", [\"次\"]]]]]], [\"NP\", [[\"_\", [\"世代\"]]]]]], [\"ADVP\", [[\"_\", [\"最\"]]]], [\"VP\", [[\"_\", [\"先进\"]]]]]]]], [\"_\", [\"的\"]], [\"NP\", [[\"QP\", [[\"_\", [\"多\"]]]], [\"NP\", [[\"_\", [\"语种\"]]]]]], [\"NP\", [[\"_\", [\"NLP\"]], [\"_\", [\"技术\"]]]]]]]]]], [\"_\", [\"。\"]]]]]]\n", " ]\n", "}\n" ] } ], "source": [ "print(doc)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "`doc['con']`为Tree类型,是list的子类。" ] }, { "cell_type": "markdown", "metadata": { "id": "wxctCigrTKu-" }, "source": [ "可视化短语句法树:" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Zo08uquCTFSk", "outputId": "c6077f2d-7084-4f4b-a3bc-9aa9951704ea" }, "outputs": [ { "data": { "text/html": [ "
Token     
───────── 
2021年     
HanLPv2.1 
为         
生产        
环境        
带来        
次         
世代        
最         
先进        
的         
多         
语种        
NLP       
技术        
。         
P    3       4       5       6       7       8       9       10      11
───────────────────────────────────────────────────────────────────────
_───────────────────────────────────────────────────────────►NP ───┐   
_───────────────────────────────────────────────────────────►NP────┤   
_──────────┐                                                       │   
_──┐       ├────────────────────────────────────────►PP ───┐       │   
_──┴►NP ───┘                                               │       │   
_──────────────────────────────────────────────────┐       │       │   
_───►CLP ───►QP ───┐                               │       ├►VP────┤   
_───────────►NP ───┴►NP ───┐                       │       │       │   
_───────────────────►ADVP──┼►VP ────►IP ───┐       ├►VP ───┘       ├►IP
_───────────────────►VP ───┘               │       │               │   
_──────────────────────────────────────────┤       │               │   
_───►QP ───┐                               ├►NP ───┘               │   
_───►NP ───┴────────────────────────►NP────┤                       │   
_──┐                                       │                       │   
_──┴────────────────────────────────►NP ───┘                       │   
_──────────────────────────────────────────────────────────────────┘   
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "doc.pretty_print()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "转换为bracketed格式:" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(TOP\n", " (IP\n", " (NP (_ 2021年))\n", " (NP (_ HanLPv2.1))\n", " (VP\n", " (PP (_ 为) (NP (_ 生产) (_ 环境)))\n", " (VP\n", " (_ 带来)\n", " (NP\n", " (IP\n", " (VP\n", " (NP (QP (CLP (_ 次))) (NP (_ 世代)))\n", " (ADVP (_ 最))\n", " (VP (_ 先进))))\n", " (_ 的)\n", " (NP (QP (_ 多)) (NP (_ 语种)))\n", " (NP (_ NLP) (_ 技术)))))\n", " (_ 。)))\n" ] } ], "source": [ "print(doc['con'][0])" ] }, { "cell_type": "markdown", "metadata": { "id": "XOsWkOqQfzlr" }, "source": [ "为已分词的句子执行短语句法分析:" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 70 }, "id": "bLZSTbv_f3OA", "outputId": "111c0be9-bac6-4eee-d5bd-a972ffc34844" }, "outputs": [ { "data": { "text/html": [ "
Token 
───── 
hanlp 
为     
生产    
环境    
带来    
次世代   
最     
先进    
的     
多语种   
nlp   
技术    
。     
P    3       4       5       6       7       8       9       10      11      12
───────────────────────────────────────────────────────────────────────────────
_───────────────────────────────────────────────────────────────────►NP ───┐   
_──────────┐                                                               │   
_──┐       ├────────────────────────────────────────────────►PP ───┐       │   
_──┴►NP ───┘                                                       │       │   
_──────────────────────────────────────────────────────────┐       │       │   
_───────────►NP ───┐                                       │       ├►VP────┤   
_───►ADVP──┐       ├►VP ────►IP ───┐                       │       │       ├►IP
_───►VP ───┴►VP ───┘               ├►CP ────►CP ───┐       ├►VP ───┘       │   
_──────────────────────────────────┘               │       │               │   
_──────────────────────────────────────────────────┼►NP ───┘               │   
_───►NP ───┐                                       │                       │   
_───►NP ───┴────────────────────────────────►NP ───┘                       │   
_──────────────────────────────────────────────────────────────────────────┘   

Tok 
─── 
我   
的   
希望  
是   
希望  
张晚霞 
的   
背影  
被   
晚霞  
映红  
。   
P    3       4       5       6       7       8       9       10      11
───────────────────────────────────────────────────────────────────────
_───►NP ───┐                                                           
_──────────┴►DNP ──┐                                                   
_───────────►NP ───┴────────────────────────────────────────►NP ───┐   
_──────────────────────────────────────────────────────────┐       │   
_──────────────────────────────────────────┐               │       │   
_───►NP ───┐                               │               ├►VP────┤   
_──────────┴►DNP ──┐                       ├►VP ────►IP ───┘       │   
_───────────►NP ───┴────────►NP ───┐       │                       ├►IP
_──────────────────────────┐       ├►IP ───┘                       │   
_───►NP ───┐               ├►VP ───┘                               │   
_───►VP ───┴►IP ────►CP ───┘                                       │   
_──────────────────────────────────────────────────────────────────┘   
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "HanLP(tokens=[\n", " [\"HanLP\", \"为\", \"生产\", \"环境\", \"带来\", \"次世代\", \"最\", \"先进\", \"的\", \"多语种\", \"NLP\", \"技术\", \"。\"],\n", " [\"我\", \"的\", \"希望\", \"是\", \"希望\", \"张晚霞\", \"的\", \"背影\", \"被\", \"晚霞\", \"映红\", \"。\"]\n", " ], tasks='con').pretty_print()" ] } ], "metadata": { "accelerator": "GPU", "colab": { "collapsed_sections": [], "name": "con_restful.ipynb", "provenance": [] }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.12" } }, "nbformat": 4, "nbformat_minor": 1 } ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/con_stl.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": { "id": "WfGpInivS0fG" }, "source": [ "

点击下列图标在线运行HanLP

\n", "
\n", "\t\"Open\n", "\t\"Open\n", "
\n", "\n", "## 安装" ] }, { "cell_type": "markdown", "metadata": { "id": "IYwV-UkNNzFp" }, "source": [ "无论是Windows、Linux还是macOS,HanLP的安装只需一句话搞定:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "1Uf_u7ddMhUt" }, "outputs": [], "source": [ "!pip install hanlp -U" ] }, { "cell_type": "markdown", "metadata": { "id": "pp-1KqEOOJ4t" }, "source": [ "## 加载模型\n", "HanLP的工作流程是先加载模型,模型的标识符存储在`hanlp.pretrained`这个包中,按照NLP任务归类。" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "id": "0tmKBu7sNAXX" }, "outputs": [ { "data": { "text/plain": [ "{'CTB9_CON_ELECTRA_SMALL': 'https://file.hankcs.com/hanlp/constituency/ctb9_con_electra_small_20220215_230116.zip',\n", " 'CTB9_CON_FULL_TAG_ELECTRA_SMALL': 'https://file.hankcs.com/hanlp/constituency/ctb9_full_tag_con_electra_small_20220118_103119.zip'}" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import hanlp\n", "hanlp.pretrained.constituency.ALL # 语种见名称最后一个字段或相应语料库" ] }, { "cell_type": "markdown", "metadata": { "id": "EmZDmLn9aGxG" }, "source": [ "调用`hanlp.load`进行加载,模型会自动下载到本地缓存。" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "con = hanlp.load('CTB9_CON_FULL_TAG_ELECTRA_SMALL')" ] }, { "cell_type": "markdown", "metadata": { "id": "elA_UyssOut_" }, "source": [ "## 短语句法分析\n", "输入为已分词的一个或多个句子:" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 70 }, "id": "BqEmDMGGOtk3", "outputId": "2a0d392f-b99a-4a18-fc7f-754e2abe2e34" }, "outputs": [], "source": [ "trees = con([[\"2021年\", \"HanLPv2.1\", \"为\", \"生产\", \"环境\", \"带来\", \"次\", \"世代\", \"最\", \"先进\", \"的\", \"多\", \"语种\", \"NLP\", \"技术\", \"。\"], [\"阿婆主\", \"来到\", \"北京\", \"立方庭\", \"参观\", \"自然\", \"语义\", \"科技\", \"公司\", \"。\"]], tasks='con')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "返回值为一个`Tree`的数组:" ] }, { "cell_type": "code", 
"execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[['TOP', [['IP', [['NP-TMP', [['_', ['2021年']]]], ['NP-PN-SBJ', [['_', ['HanLPv2.1']]]], ['VP', [['PP-BNF', [['_', ['为']], ['NP', [['_', ['生产']], ['_', ['环境']]]]]], ['VP', [['_', ['带来']], ['NP-OBJ', [['CP', [['CP', [['IP', [['VP', [['NP', [['DP', [['_', ['次']]]], ['NP', [['_', ['世代']]]]]], ['ADVP', [['_', ['最']]]], ['VP', [['_', ['先进']]]]]]]], ['_', ['的']]]]]], ['NP', [['QP', [['_', ['多']]]], ['NP', [['_', ['语种']]]]]], ['NP', [['_', ['NLP']], ['_', ['技术']]]]]]]]]], ['_', ['。']]]]]], ['TOP', [['IP', [['NP-SBJ', [['_', ['阿婆主']]]], ['VP', [['VP', [['_', ['来到']], ['NP-OBJ', [['_', ['北京']], ['NP-PN', [['_', ['立方庭']]]]]]]], ['VP', [['_', ['参观']], ['NP-OBJ', [['_', ['自然']], ['_', ['语义']], ['_', ['科技']], ['_', ['公司']]]]]]]], ['_', ['。']]]]]]]\n" ] } ], "source": [ "print(trees)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "转换为bracketed格式:" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(TOP\n", " (IP\n", " (NP-TMP (_ 2021年))\n", " (NP-PN-SBJ (_ HanLPv2.1))\n", " (VP\n", " (PP-BNF (_ 为) (NP (_ 生产) (_ 环境)))\n", " (VP\n", " (_ 带来)\n", " (NP-OBJ\n", " (CP\n", " (CP\n", " (IP\n", " (VP\n", " (NP (DP (_ 次)) (NP (_ 世代)))\n", " (ADVP (_ 最))\n", " (VP (_ 先进))))\n", " (_ 的)))\n", " (NP (QP (_ 多)) (NP (_ 语种)))\n", " (NP (_ NLP) (_ 技术)))))\n", " (_ 。)))\n" ] } ], "source": [ "print(trees[0])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 组装流水线" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "短语成分树的第一层non-terminal一般是词性标签,所以经常与词性标注一起使用。为此,先加载一个词性标注器:" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "pos = hanlp.load(hanlp.pretrained.pos.CTB9_POS_ELECTRA_SMALL)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "然后创建一个函数将词性标签和句法树组装起来:" ] }, { "cell_type": "code", "execution_count": 7, "metadata": 
{}, "outputs": [], "source": [ "from hanlp_common.document import Document\n", "def merge_pos_into_con(doc:Document):\n", " flat = isinstance(doc['pos'][0], str)\n", " if flat:\n", " doc = Document((k, [v]) for k, v in doc.items())\n", " for tree, tags in zip(doc['con'], doc['pos']):\n", " offset = 0\n", " for subtree in tree.subtrees(lambda t: t.height() == 2):\n", " tag = subtree.label()\n", " if tag == '_':\n", " subtree.set_label(tags[offset])\n", " offset += 1\n", " if flat:\n", " doc = doc.squeeze()\n", " return doc" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "之后就可以用一个流水线将三者组装起来了:" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "nlp = hanlp.pipeline() \\\n", " .append(pos, input_key='tok', output_key='pos') \\\n", " .append(con, input_key='tok', output_key='con') \\\n", " .append(merge_pos_into_con, input_key='*')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "该流水线的结构如下:" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[tok->TransformerTagger->pos, tok->CRFConstituencyParser->con, None->merge_pos_into_con->None]\n" ] } ], "source": [ "print(nlp)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "传入一个已分词的句子试试:" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\n", " \"tok\": [\n", " \"2021年\",\n", " \"HanLPv2.1\",\n", " \"带来\",\n", " \"最\",\n", " \"先进\",\n", " \"的\",\n", " \"多\",\n", " \"语种\",\n", " \"NLP\",\n", " \"技术\",\n", " \"。\"\n", " ],\n", " \"pos\": [\n", " \"NT\",\n", " \"NR\",\n", " \"VV\",\n", " \"AD\",\n", " \"VA\",\n", " \"DEC\",\n", " \"CD\",\n", " \"NN\",\n", " \"NR\",\n", " \"NN\",\n", " \"PU\"\n", " ],\n", " \"con\": [\n", " \"TOP\",\n", " [[\"IP\", [[\"NP-TMP\", [[\"NT\", [\"2021年\"]]]], [\"NP-PN-SBJ\", [[\"NR\", [\"HanLPv2.1\"]]]], [\"VP\", [[\"VV\", [\"带来\"]], [\"NP-OBJ\", 
[[\"CP\", [[\"CP\", [[\"IP\", [[\"VP\", [[\"ADVP\", [[\"AD\", [\"最\"]]]], [\"VP\", [[\"VA\", [\"先进\"]]]]]]]], [\"DEC\", [\"的\"]]]]]], [\"NP\", [[\"QP\", [[\"CD\", [\"多\"]]]], [\"NP\", [[\"NN\", [\"语种\"]]]]]], [\"NP\", [[\"NR\", [\"NLP\"]], [\"NN\", [\"技术\"]]]]]]]], [\"PU\", [\"。\"]]]]]\n", " ]\n", "}\n" ] } ], "source": [ "doc = nlp(tok=[\"2021年\", \"HanLPv2.1\", \"带来\", \"最\", \"先进\", \"的\", \"多\", \"语种\", \"NLP\", \"技术\", \"。\"])\n", "print(doc)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "流水线的输出也是一个Document,所以支持可视化:" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
Token     
───────── 
2021年     
HanLPv2.1 
带来        
最         
先进        
的         
多         
语种        
NLP       
技术        
。         
PoS    3       4       5       6       7       8         9            10
────────────────────────────────────────────────────────────────────────
NT ─────────────────────────────────────────────────────►NP-TMP ────┐   
NR ─────────────────────────────────────────────────────►NP-PN-SBJ──┤   
VV ────────────────────────────────────────────────────┐            │   
AD ───►ADVP──┐                                         │            │   
VA ───►VP ───┴►VP ────►IP ───┐                         │            │   
DEC──────────────────────────┴►CP ────►CP ───┐         ├►VP─────────┼►IP
CD ───►QP ───┐                               │         │            │   
NN ───►NP ───┴────────────────────────►NP────┼►NP-OBJ──┘            │   
NR ──┐                                       │                      │   
NN ──┴────────────────────────────────►NP ───┘                      │   
PU ─────────────────────────────────────────────────────────────────┘   
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "doc.pretty_print()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "如果要分析原始文本的话,分词是第一步,所以先加载一个分词器:" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "tok = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "然后将分词器插入到流水线的第一级:" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[None->TransformerTaggingTokenizer->tok,\n", " tok->TransformerTagger->pos,\n", " tok->CRFConstituencyParser->con,\n", " None->merge_pos_into_con->None]" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "nlp.insert(0, tok, output_key='tok')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "然后就可以直接分析原始文本了:" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(TOP\n", " (IP\n", " (NT 2021)\n", " (M 年)\n", " (NP-PN-SBJ (NR HanLPv2.1))\n", " (VP\n", " (VV 带来)\n", " (NP-OBJ\n", " (CP (CP (IP (VP (ADVP (AD 最)) (VP (VA 先进)))) (DEC 的)))\n", " (NP (QP (CD 多)) (NP (NN 语种)))\n", " (NP (NR NLP) (NN 技术))))\n", " (PU 。)))\n" ] } ], "source": [ "print(nlp('2021年HanLPv2.1带来最先进的多语种NLP技术。')['con'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "你明白吗?HanLP是为聪明人设计的,只要你足够聪明,你就可以优雅地实现各种功能。" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 操作短语树的技巧" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "短语结构树的类型为`phrasetree.tree.Tree`,提供了许多接口,此处列举其中一些常用的接口。" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(TOP\n", " (IP\n", " (NP-TMP (NT 2021年))\n", " (NP-PN-SBJ (NR HanLPv2.1))\n", " (VP\n", " (VV 带来)\n", " (NP-OBJ\n", " (CP (CP (IP (VP (ADVP (AD 最)) (VP (VA 先进)))) (DEC 的)))\n", " (NP 
(QP (CD 多)) (NP (NN 语种)))\n", " (NP (NR NLP) (NN 技术))))\n", " (PU 。)))\n" ] } ], "source": [ "tree = doc['con'] # tree数组的话则需要doc['con'][0]\n", "print(tree)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 按高度枚举子树" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "子树:(VP (ADVP (AD 最)) (VP (VA 先进)))\t标签:VP\t短语:['最', '先进']\n", "子树:(NP (QP (CD 多)) (NP (NN 语种)))\t标签:NP\t短语:['多', '语种']\n" ] } ], "source": [ "for subtree in tree.subtrees(lambda t: t.height() == 4):\n", " print(f'子树:{subtree}\\t标签:{subtree.label()}\\t短语:{subtree.leaves()}')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 按标签枚举子树" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(NP (QP (CD 多)) (NP (NN 语种)))\n", "(NP (NN 语种))\n", "(NP (NR NLP) (NN 技术))\n" ] } ], "source": [ "for subtree in tree.subtrees(lambda t: t.label() == 'NP'):\n", " print(subtree)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 遍历子节点" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "父节点(NP (NR NLP) (NN 技术))的子节点有:\n", "(NR NLP)\n", "(NN 技术)\n" ] } ], "source": [ "print(f'父节点{subtree}的子节点有:')\n", "for child in subtree:\n", " print(child)" ] } ], "metadata": { "accelerator": "GPU", "colab": { "collapsed_sections": [], "name": "con_stl.ipynb", "provenance": [] }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.12" } }, "nbformat": 4, "nbformat_minor": 1 } ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/cor_restful.ipynb 
================================================ { "cells": [ { "cell_type": "markdown", "metadata": { "id": "WfGpInivS0fG" }, "source": [ "

点击下列图标在线运行HanLP

\n", "
\n", "\t\"Open\n", "\t\"Open\n", "
\n", "\n", "## 安装" ] }, { "cell_type": "markdown", "metadata": { "id": "IYwV-UkNNzFp" }, "source": [ "无论是Windows、Linux还是macOS,HanLP的安装只需一句话搞定:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "1Uf_u7ddMhUt" }, "outputs": [], "source": [ "!pip install hanlp_restful -U" ] }, { "cell_type": "markdown", "metadata": { "id": "pp-1KqEOOJ4t" }, "source": [ "## 创建客户端" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "id": "0tmKBu7sNAXX" }, "outputs": [], "source": [ "from hanlp_restful import HanLPClient\n", "HanLP = HanLPClient('https://www.hanlp.com/api', auth=None, language='zh') # auth不填则匿名,zh中文,mul多语种" ] }, { "cell_type": "markdown", "metadata": { "id": "EmZDmLn9aGxG" }, "source": [ "#### 申请秘钥\n", "由于服务器算力有限,匿名用户每分钟限2次调用。如果你需要更多调用次数,[建议申请免费公益API秘钥auth](https://bbs.hanlp.com/t/hanlp2-1-restful-api/53)。" ] }, { "cell_type": "markdown", "metadata": { "id": "elA_UyssOut_" }, "source": [ "## 指代消解\n", "任务越少,速度越快。如指定仅执行指代消解:" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 70 }, "id": "BqEmDMGGOtk3", "outputId": "2a0d392f-b99a-4a18-fc7f-754e2abe2e34" }, "outputs": [], "source": [ "ret = HanLP.coreference_resolution('我姐送我她的猫。我很喜欢它。')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "返回值为一个包含分词结果与簇的dict:" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ret == {'clusters': [\n", " [['我', 0, 1], ['我', 3, 4], ['我', 8, 9]], # 指代说话人\n", " [['我姐', 0, 2], ['她', 4, 5]], # 指代说话人的姐姐\n", " [['她的猫', 4, 7], ['它', 11, 12]]], # 指代说话人的姐姐的猫\n", " 'tokens': ['我', '姐', '送', '我', '她', '的', '猫', '。', '我', '很', '喜欢', '它', '。']}" ] }, { "cell_type": "markdown", "metadata": { "id": "wxctCigrTKu-" }, "source": [ "对应如下结构:\n", "![cor](https://file.hankcs.com/img/coref_demo_small.png)" ] }, { "cell_type": "markdown", 
"metadata": { "id": "XOsWkOqQfzlr" }, "source": [ "为已分词的句子执行指代消解:" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 70 }, "id": "bLZSTbv_f3OA", "outputId": "111c0be9-bac6-4eee-d5bd-a972ffc34844" }, "outputs": [], "source": [ "clusters = HanLP.coreference_resolution(tokens=[['我', '姐', '送', '我', '她', '的', '猫', '。'],\n", " ['我', '很', '喜欢', '它', '。']])\n", " " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "返回值为簇的list:" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "clusters == [\n", " [['我', 0, 1], ['我', 3, 4], ['我', 8, 9]], # 指代说话人\n", " [['我姐', 0, 2], ['她', 4, 5]], # 指代说话人的姐姐\n", " [['她的猫', 4, 7], ['它', 11, 12]]] # 指代说话人的姐姐的猫" ] } ], "metadata": { "accelerator": "GPU", "colab": { "collapsed_sections": [], "name": "cor_restful.ipynb", "provenance": [] }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.12" } }, "nbformat": 4, "nbformat_minor": 1 } ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/demo_amr.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2022-04-12 22:19 import hanlp parser = hanlp.load(hanlp.pretrained.amr.MRP2020_AMR_ENG_ZHO_XLM_BASE) # For Chinese: print(parser(["男孩", "希望", "女孩", "相信", "他", "。"])) print(parser(["男孩", "希望", "女孩", "相信", "他", "。"], output_amr=False)) # For English: print(parser(['The', 'boy', 'wants', 'the', 'girl', 'to', 'believe', 'him', '.'], language='eng')) # It's suggested to also feed the lemma for stabler performance. 
print(parser([('The', 'the'), ('boy', 'boy'), ('wants', 'want'), ('the', 'the'), ('girl', 'girl'), ('to', 'to'), ('believe', 'believe'), ('him', 'he'), ('.', '.')], language='eng')) ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/demo_custom_dict.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2020-12-15 22:26 import hanlp from hanlp.components.mtl.multi_task_learning import MultiTaskLearning from hanlp.components.mtl.tasks.tok.tag_tok import TaggingTokenization # 加载多任务模型 HanLP: MultiTaskLearning = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH) # 获取分词任务(以tok开头的任务都是分词任务,以细分标准为例) tok: TaggingTokenization = HanLP['tok/fine'] tok.dict_force = tok.dict_combine = None print(f'不挂词典:\n{HanLP("商品和服务项目")["tok/fine"]}') tok.dict_force = {'和服', '服务项目'} print(f'强制模式:\n{HanLP("商品和服务项目")["tok/fine"]}') # 慎用,详见《自然语言处理入门》第二章 tok.dict_force = {'和服务': ['和', '服务']} print(f'强制校正:\n{HanLP("正向匹配商品和服务、任何和服务必按上述切分")["tok/fine"]}') tok.dict_force = None tok.dict_combine = {'和服', '服务项目'} print(f'合并模式:\n{HanLP("商品和服务项目")["tok/fine"]}') # 需要算法基础才能理解,初学者可参考 http://nlp.hankcs.com/book.php # See also https://hanlp.hankcs.com/docs/api/hanlp/components/tokenizers/transformer.html # 含有空格、制表符等(Transformer tokenizer去掉的字符)的词语需要用tuple的形式提供 tok.dict_combine = {('iPad', 'Pro'), '2个空格'} print(f'空格匹配:\n{HanLP("如何评价iPad Pro ?iPad Pro有2个空格", tasks="tok/fine")["tok/fine"]}') # 聪明的用户请继续阅读:tuple词典中的字符串其实等价于该字符串的所有可能的切分方式 print(f'词典内容:\n{dict(tok.dict_combine.config["dictionary"]).keys()}') ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/demo_custom_dict_stl.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2020-12-15 22:26 import hanlp from hanlp.components.tokenizers.transformer import TransformerTaggingTokenizer # 加载一个旧版本单任务模型演示分词错误(最新版已经修复): tok: TransformerTaggingTokenizer = 
hanlp.load('https://file.hankcs.com/hanlp/tok/coarse_electra_small_20220220_013548.zip') tok.dict_force = tok.dict_combine = None print(f'不挂词典:\n{tok("首相和川普通电话")}') tok.dict_force = {'川普'} print(f'强制模式:\n{tok(["首相和川普通电话", "银川普通人与川普通电话讲四川普通话"])}') # 慎用,详见《自然语言处理入门》第二章 tok.dict_force = {'川普通电话': ['川普', '通', '电话']} print(f'强制校正:\n{tok(["首相和川普通电话", "银川普通人与川普通电话讲四川普通话"])}') tok.dict_force = None tok.dict_combine = {'美国总统'} print(f'合并模式:\n{tok("首相和川普通电话,川普是美国总统。")}') # 需要算法基础才能理解,初学者可参考 http://nlp.hankcs.com/book.php # See also https://hanlp.hankcs.com/docs/api/hanlp/components/tokenizers/transformer.html ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/demo_del_tasks.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2021-02-03 13:28 import hanlp from hanlp.components.mtl.multi_task_learning import MultiTaskLearning from hanlp_common.document import Document HanLP: MultiTaskLearning = hanlp.load(hanlp.pretrained.mtl.OPEN_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH) tasks = list(HanLP.tasks.keys()) print(tasks) # Pick what you need from what we have for task in tasks: if task not in ('tok', 'pos'): del HanLP[task] # You can save it as a new component # HanLP.save('path/to/new/component') # HanLP.load('path/to/new/component') print(HanLP.tasks.keys()) doc: Document = HanLP(['2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', 'up主来到北京立方庭参观自然语义科技公司。']) print(doc) doc.pretty_print() ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/demo_document.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2022-10-26 23:40 from hanlp_common.document import Document # Create a document or get a document from HanLP.parse doc = Document( tok=[["晓美焰", "来到", "北京", "立方庭", "参观", "自然", "语义", "科技", "公司"]], pos=[["NR", "VV", "NR", "NR", "VV", "NN", "NN", "NN", "NN"]], ner=[[["晓美焰", "PERSON", 0, 1], ["北京立方庭", "LOCATION", 2, 4], 
["自然语义科技公司", "ORGANIZATION", 5, 9]]], dep=[[[2, "nsubj"], [0, "root"], [4, "name"], [2, "dobj"], [2, "conj"], [9, "compound"], [9, "compound"], [9, "compound"], [5, "dobj"]]] ) # print(doc) or str(doc) to get its JSON representation print(doc) # Access an annotation by its task name print(doc['tok']) # Get number of sentences print(f'It has {doc.count_sentences()} sentence(s)') # Access the n-th sentence print(doc.squeeze(0)['tok']) # Pretty print it right in your console or notebook doc.pretty_print() # To save the pretty prints in a str pretty_text: str = '\n\n'.join(doc.to_pretty()) # Create a document from a dict doc = Document({ "tok/fine": [ ["晓美焰", "来到", "北京", "立方庭", "参观", "自然", "语义", "科技", "公司", "。"] ], "tok/coarse": [ ["晓美焰", "来到", "北京立方庭", "参观", "自然语义科技公司", "。"] ], "pos/ctb": [ ["NR", "VV", "NR", "NR", "VV", "NN", "NN", "NN", "NN", "PU"] ], "pos/pku": [ ["nr", "v", "ns", "nz", "v", "n", "n", "n", "n", "w"] ], "ner/msra": [ [["晓美焰", "PERSON", 0, 1], ["北京立方庭", "LOCATION", 2, 4], ["自然语义科技公司", "ORGANIZATION", 5, 9]] ], "ner/ontonotes": [ [["晓美焰", "PERSON", 0, 1], ["北京", "GPE", 2, 3], ["立方庭", "FAC", 3, 4], ["自然语义科技公司", "ORG", 5, 9]] ], "srl": [ [[["晓美焰", "ARG0", 0, 1], ["来到", "PRED", 1, 2], ["北京立方庭", "ARG1", 2, 4]], [["晓美焰", "ARG0", 0, 1], ["参观", "PRED", 4, 5], ["自然语义科技公司", "ARG1", 5, 9]]] ], "dep": [ [[2, "nsubj"], [0, "root"], [4, "name"], [2, "dobj"], [2, "conj"], [9, "compound"], [9, "compound"], [9, "compound"], [5, "dobj"], [2, "punct"]] ] }) # Pretty print using a different NER annotation doc.pretty_print(ner='ner/ontonotes') # Get the first annotation for NER print(doc.get_by_prefix('ner')) ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/demo_mlm.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2022-01-29 21:11 from hanlp.components.lm.mlm import MaskedLanguageModel mlm = MaskedLanguageModel() mlm.load('bert-base-chinese') print(mlm('生活的真谛是[MASK]。')) # Batching 
is always faster print(mlm(['生活的真谛是[MASK]。', '巴黎是[MASK][MASK]的首都。'])) ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/demo_mtl.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2020-12-31 13:51 import hanlp from hanlp_common.document import Document # CLOSE是自然语义标注的闭源语料库,BASE是中号模型,ZH中文 HanLP = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH) # 默认执行全部任务 doc: Document = HanLP(['2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', '阿婆主来到北京立方庭参观自然语义科技公司。']) # 返回类型Document是dict的子类,打印出来兼容JSON print(doc) # 即时可视化,防止换行请最大化窗口,推荐在Jupyter Notebook里调用 doc.pretty_print() # 指定可视化OntoNotes标准的NER # doc.pretty_print(ner='ner/ontonotes', pos='pku') ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/demo_ner_dict.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2021-04-29 11:06 import hanlp from hanlp.components.mtl.tasks.ner.tag_ner import TaggingNamedEntityRecognition from hanlp.utils.io_util import get_resource HanLP = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ERNIE_GRAM_ZH) ner: TaggingNamedEntityRecognition = HanLP['ner/msra'] ner.dict_whitelist = {'午饭后': 'TIME'} doc = HanLP('2021年测试高血压是138,时间是午饭后2点45,低血压是44', tasks='ner/msra') doc.pretty_print() print(doc['ner/msra']) ner.dict_tags = {('名字', '叫', '金华'): ('O', 'O', 'S-PERSON')} HanLP('他在浙江金华出生,他的名字叫金华。', tasks='ner/msra').pretty_print() # HanLP.save(get_resource(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ERNIE_GRAM_ZH)) # 需要算法基础才能理解,初学者可参考 http://nlp.hankcs.com/book.php # See https://hanlp.hankcs.com/docs/api/hanlp/components/mtl/tasks/ner/tag_ner.html ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/demo_parse_constituency.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2022-01-18 11:09 from 
hanlp_common.document import Document import hanlp con = hanlp.load(hanlp.pretrained.constituency.CTB9_CON_FULL_TAG_ELECTRA_SMALL) # To speed up, parse multiple sentences at once, and use a GPU. print(con(["2021年", "HanLPv2.1", "带来", "最", "先进", "的", "多", "语种", "NLP", "技术", "。"])) # The rest of this tutorial is written for clever users. # The first level of non-terminals are PoS tags. So usually a PoS model is piped. def merge_pos_into_con(doc: Document): flat = isinstance(doc['pos'][0], str) if flat: doc = Document((k, [v]) for k, v in doc.items()) for tree, tags in zip(doc['con'], doc['pos']): offset = 0 for subtree in tree.subtrees(lambda t: t.height() == 2): tag = subtree.label() if tag == '_': subtree.set_label(tags[offset]) offset += 1 if flat: doc = doc.squeeze() return doc pos = hanlp.load(hanlp.pretrained.pos.CTB9_POS_ELECTRA_SMALL) nlp = hanlp.pipeline() \ .append(pos, input_key='tok', output_key='pos') \ .append(con, input_key='tok', output_key='con') \ .append(merge_pos_into_con, input_key='*') print(f'The pipeline looks like this: {nlp}') doc = nlp(tok=["2021年", "HanLPv2.1", "带来", "最", "先进", "的", "多", "语种", "NLP", "技术", "。"]) print(doc) doc.pretty_print() # If you need to parse raw text, simply add a tokenizer into this pipeline. tok = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH) nlp.insert(0, tok, output_key='tok') print(f'The pipeline looks like this: {nlp}') doc = nlp('2021年HanLPv2.1带来最先进的多语种NLP技术。') print(doc) doc.pretty_print() # ATTENTION: Pipelines are usually slower than MTL but they are more flexible. ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/demo_pipeline.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2021-12-28 20:47 import hanlp # Pipeline allows blending multiple callable functions no matter they are a rule, a TensorFlow component or a PyTorch # one. However, it's slower than the MTL framework. 
# pos = hanlp.load(hanlp.pretrained.pos.CTB9_POS_ALBERT_BASE) # In case both tf and torch are used, load tf first. HanLP = hanlp.pipeline() \ .append(hanlp.utils.rules.split_sentence, output_key='sentences') \ .append(hanlp.load('CTB9_TOK_ELECTRA_SMALL'), output_key='tok') \ .append(hanlp.load('CTB9_POS_ELECTRA_SMALL'), output_key='pos') \ .append(hanlp.load('MSRA_NER_ELECTRA_SMALL_ZH'), output_key='ner', input_key='tok') \ .append(hanlp.load('CTB9_DEP_ELECTRA_SMALL', conll=False), output_key='dep', input_key='tok') \ .append(hanlp.load('CTB9_CON_ELECTRA_SMALL'), output_key='con', input_key='tok') doc = HanLP('2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。阿婆主来到北京立方庭参观自然语义科技公司。') print(doc) doc.pretty_print() ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/demo_pos_dict.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2020-12-15 22:26 import hanlp from hanlp.components.mtl.multi_task_learning import MultiTaskLearning from hanlp.components.mtl.tasks.pos import TransformerTagging from hanlp.components.mtl.tasks.tok.tag_tok import TaggingTokenization from tests import cdroot cdroot() HanLP: MultiTaskLearning = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH) # Demonstrates custom dict in part-of-speech tagging pos: TransformerTagging = HanLP['pos/ctb'] print(f'自定义单个词性:') pos.dict_tags = {'HanLP': 'state-of-the-art-tool'} HanLP("HanLP为生产环境带来次世代最先进的多语种NLP技术。", tasks='pos/ctb').pretty_print() print(f'根据上下文自定义词性:') pos.dict_tags = {('的', '希望'): ('补语成分', '名词'), '希望': '动词'} HanLP("我的希望是希望张晚霞的背影被晚霞映红。", tasks='pos/ctb').pretty_print() # 需要算法基础才能理解,初学者可参考 http://nlp.hankcs.com/book.php # See also https://hanlp.hankcs.com/docs/api/hanlp/components/taggers/transformer_tagger.html ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/demo_sts.py ================================================ # -*- coding:utf-8 -*- # Author: 
hankcs # Date: 2021-05-24 13:15 import hanlp sim = hanlp.load(hanlp.pretrained.sts.STS_ELECTRA_BASE_ZH) print(sim([ ['看图猜一电影名', '看图猜电影'], ['无线路由器怎么无线上网', '无线上网卡和无线路由器怎么用'], ['北京到上海的动车票', '上海到北京的动车票'], ])) ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/demo_word2vec.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2021-12-12 18:33 import hanlp import torch word2vec = hanlp.load(hanlp.pretrained.word2vec.CONVSEG_W2V_NEWS_TENSITE_WORD_PKU) vec = word2vec('先进') print(vec) print(torch.nn.functional.cosine_similarity(word2vec('先进'), word2vec('优秀'), dim=0)) print(torch.nn.functional.cosine_similarity(word2vec('先进'), word2vec('水果'), dim=0)) print('获取语义最相似的词语:') print(word2vec.most_similar('上海')) # print(word2vec.most_similar(['上海', '寒冷'])) # batching更快 print('非常寒冷是OOV所以无法获取:') print(word2vec.most_similar('非常寒冷')) print('但是在doc2vec模式下OOV也可以进行相似度计算:') print(word2vec.most_similar('非常寒冷', doc2vec=True)) print('甚至可以处理短文本:') print(word2vec.most_similar('国家图书馆推出2022年春节主题活动', doc2vec=True)) ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/dep_mtl.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": { "id": "WfGpInivS0fG" }, "source": [ "

点击下列图标在线运行HanLP

\n", "
\n", "\t\"Open\n", "\t\"Open\n", "
\n", "\n", "## 安装" ] }, { "cell_type": "markdown", "metadata": { "id": "IYwV-UkNNzFp" }, "source": [ "无论是Windows、Linux还是macOS,HanLP的安装只需一句话搞定:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "1Uf_u7ddMhUt" }, "outputs": [], "source": [ "!pip install hanlp -U" ] }, { "cell_type": "markdown", "metadata": { "id": "pp-1KqEOOJ4t" }, "source": [ "## 加载模型\n", "HanLP的工作流程是先加载模型,模型的标识符存储在`hanlp.pretrained`这个包中,按照NLP任务归类。" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "id": "0tmKBu7sNAXX" }, "outputs": [ { "data": { "text/plain": [ "{'OPEN_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH': 'https://file.hankcs.com/hanlp/mtl/open_tok_pos_ner_srl_dep_sdp_con_electra_small_20201223_035557.zip',\n", " 'OPEN_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH': 'https://file.hankcs.com/hanlp/mtl/open_tok_pos_ner_srl_dep_sdp_con_electra_base_20201223_201906.zip',\n", " 'CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH': 'https://file.hankcs.com/hanlp/mtl/close_tok_pos_ner_srl_dep_sdp_con_electra_small_20210111_124159.zip',\n", " 'CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH': 'https://file.hankcs.com/hanlp/mtl/close_tok_pos_ner_srl_dep_sdp_con_electra_base_20210111_124519.zip',\n", " 'CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ERNIE_GRAM_ZH': 'https://file.hankcs.com/hanlp/mtl/close_tok_pos_ner_srl_dep_sdp_con_ernie_gram_base_aug_20210904_145403.zip',\n", " 'UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_MT5_SMALL': 'https://file.hankcs.com/hanlp/mtl/ud_ontonotes_tok_pos_lem_fea_ner_srl_dep_sdp_con_mt5_small_20210228_123458.zip',\n", " 'UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_XLMR_BASE': 'https://file.hankcs.com/hanlp/mtl/ud_ontonotes_tok_pos_lem_fea_ner_srl_dep_sdp_con_xlm_base_20210602_211620.zip',\n", " 'NPCMJ_UD_KYOTO_TOK_POS_CON_BERT_BASE_CHAR_JA': 'https://file.hankcs.com/hanlp/mtl/npcmj_ud_kyoto_tok_pos_ner_dep_con_srl_bert_base_char_ja_20210914_133742.zip'}" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" 
} ], "source": [ "import hanlp\n", "hanlp.pretrained.mtl.ALL # MTL多任务,具体任务见模型名称,语种见名称最后一个字段或相应语料库" ] }, { "cell_type": "markdown", "metadata": { "id": "EmZDmLn9aGxG" }, "source": [ "调用`hanlp.load`进行加载,模型会自动下载到本地缓存。自然语言处理分为许多任务,分词只是最初级的一个。与其每个任务单独创建一个模型,不如利用HanLP的联合模型一次性完成多个任务:" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "HanLP = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH)" ] }, { "cell_type": "markdown", "metadata": { "id": "elA_UyssOut_" }, "source": [ "## 依存句法分析\n", "任务越少,速度越快。如指定仅执行依存句法分析:" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 70 }, "id": "BqEmDMGGOtk3", "outputId": "2a0d392f-b99a-4a18-fc7f-754e2abe2e34" }, "outputs": [], "source": [ "doc = HanLP(['2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', '阿婆主来到北京立方庭参观自然语义科技公司。'], tasks='dep')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "返回值为一个[Document](https://hanlp.hankcs.com/docs/api/common/document.html):" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\n", " \"tok/fine\": [\n", " [\"2021年\", \"HanLPv2.1\", \"为\", \"生产\", \"环境\", \"带来\", \"次\", \"世代\", \"最\", \"先进\", \"的\", \"多\", \"语种\", \"NLP\", \"技术\", \"。\"],\n", " [\"阿婆主\", \"来到\", \"北京\", \"立方庭\", \"参观\", \"自然\", \"语义\", \"科技\", \"公司\", \"。\"]\n", " ],\n", " \"dep\": [\n", " [[6, \"tmod\"], [6, \"nsubj\"], [6, \"prep\"], [5, \"nn\"], [3, \"pobj\"], [0, \"root\"], [8, \"amod\"], [15, \"nn\"], [10, \"advmod\"], [15, \"rcmod\"], [10, \"assm\"], [13, \"nummod\"], [15, \"nn\"], [15, \"nn\"], [6, \"dobj\"], [6, \"punct\"]],\n", " [[2, \"nsubj\"], [0, \"root\"], [4, \"nn\"], [2, \"dobj\"], [2, \"conj\"], [9, \"nn\"], [9, \"nn\"], [9, \"nn\"], [5, \"dobj\"], [2, \"punct\"]]\n", " ]\n", "}\n" ] } ], "source": [ "print(doc)" ] }, { "cell_type": "markdown", "metadata": 
{}, "source": [ "`doc['dep']`为句子们的依存句法树列表,第`i`个二元组表示第`i`个单词的`[中心词的下标, 与中心词的依存关系]`。" ] }, { "cell_type": "markdown", "metadata": { "id": "wxctCigrTKu-" }, "source": [ "可视化依存句法树:" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Zo08uquCTFSk", "outputId": "c6077f2d-7084-4f4b-a3bc-9aa9951704ea" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Dep Tree \tToken \tRelati\n", "────────────\t─────────\t──────\n", " ┌─────────►\t2021年 \ttmod \n", " │┌────────►\tHanLPv2.1\tnsubj \n", " ││┌─►┌─────\t为 \tprep \n", " │││ │ ┌─►\t生产 \tnn \n", " │││ └─►└──\t环境 \tpobj \n", "┌┼┴┴────────\t带来 \troot \n", "││ ┌─►\t次 \tamod \n", "││ ┌───►└──\t世代 \tnn \n", "││ │ ┌─►\t最 \tadvmod\n", "││ │┌──►├──\t先进 \trcmod \n", "││ ││ └─►\t的 \tassm \n", "││ ││ ┌─►\t多 \tnummod\n", "││ ││┌─►└──\t语种 \tnn \n", "││ │││ ┌─►\tNLP \tnn \n", "│└─►└┴┴──┴──\t技术 \tdobj \n", "└──────────►\t。 \tpunct \n", "\n", "Dep Tree \tTok\tRelat\n", "────────────\t───\t─────\n", " ┌─►\t阿婆主\tnsubj\n", "┌┬────┬──┴──\t来到 \troot \n", "││ │ ┌─►\t北京 \tnn \n", "││ └─►└──\t立方庭\tdobj \n", "│└─►┌───────\t参观 \tconj \n", "│ │ ┌───►\t自然 \tnn \n", "│ │ │┌──►\t语义 \tnn \n", "│ │ ││┌─►\t科技 \tnn \n", "│ └─►└┴┴──\t公司 \tdobj \n", "└──────────►\t。 \tpunct\n" ] } ], "source": [ "doc.pretty_print()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "转换为CoNLL格式:" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1\t2021年\t_\t_\t_\t_\t6\ttmod\t_\t_\n", "2\tHanLPv2.1\t_\t_\t_\t_\t6\tnsubj\t_\t_\n", "3\t为\t_\t_\t_\t_\t6\tprep\t_\t_\n", "4\t生产\t_\t_\t_\t_\t5\tnn\t_\t_\n", "5\t环境\t_\t_\t_\t_\t3\tpobj\t_\t_\n", "6\t带来\t_\t_\t_\t_\t0\troot\t_\t_\n", "7\t次\t_\t_\t_\t_\t8\tamod\t_\t_\n", "8\t世代\t_\t_\t_\t_\t15\tnn\t_\t_\n", "9\t最\t_\t_\t_\t_\t10\tadvmod\t_\t_\n", "10\t先进\t_\t_\t_\t_\t15\trcmod\t_\t_\n", "11\t的\t_\t_\t_\t_\t10\tassm\t_\t_\n", 
"12\t多\t_\t_\t_\t_\t13\tnummod\t_\t_\n", "13\t语种\t_\t_\t_\t_\t15\tnn\t_\t_\n", "14\tNLP\t_\t_\t_\t_\t15\tnn\t_\t_\n", "15\t技术\t_\t_\t_\t_\t6\tdobj\t_\t_\n", "16\t。\t_\t_\t_\t_\t6\tpunct\t_\t_\n", "\n", "1\t阿婆主\t_\t_\t_\t_\t2\tnsubj\t_\t_\n", "2\t来到\t_\t_\t_\t_\t0\troot\t_\t_\n", "3\t北京\t_\t_\t_\t_\t4\tnn\t_\t_\n", "4\t立方庭\t_\t_\t_\t_\t2\tdobj\t_\t_\n", "5\t参观\t_\t_\t_\t_\t2\tconj\t_\t_\n", "6\t自然\t_\t_\t_\t_\t9\tnn\t_\t_\n", "7\t语义\t_\t_\t_\t_\t9\tnn\t_\t_\n", "8\t科技\t_\t_\t_\t_\t9\tnn\t_\t_\n", "9\t公司\t_\t_\t_\t_\t5\tdobj\t_\t_\n", "10\t。\t_\t_\t_\t_\t2\tpunct\t_\t_\n" ] } ], "source": [ "print(doc.to_conll())" ] }, { "cell_type": "markdown", "metadata": { "id": "XOsWkOqQfzlr" }, "source": [ "为已分词的句子执行依存句法分析:" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 70 }, "id": "bLZSTbv_f3OA", "outputId": "111c0be9-bac6-4eee-d5bd-a972ffc34844" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Dep Tree \tToken\tRelati\n", "───────────\t─────\t──────\n", " ┌────────►\tHanLP\tnsubj \n", " │┌─►┌─────\t为 \tprep \n", " ││ │ ┌─►\t生产 \tnn \n", " ││ └─►└──\t环境 \tpobj \n", "┌┼┴────────\t带来 \troot \n", "││ ┌─────►\t次世代 \tnn \n", "││ │ ┌─►\t最 \tadvmod\n", "││ │┌─►├──\t先进 \trcmod \n", "││ ││ └─►\t的 \tassm \n", "││ ││ ┌──►\t多语种 \tnn \n", "││ ││ │┌─►\tNLP \tnn \n", "│└─►└┴─┴┴──\t技术 \tdobj \n", "└─────────►\t。 \tpunct \n", "\n", "Dep Tree \tTok\tRelation \n", "────────────────\t───\t─────────\n", " ┌─►┌──\t我 \tassmod \n", " │ └─►\t的 \tassm \n", " ┌─►└─────\t希望 \ttop \n", "┌┬─────┴────────\t是 \troot \n", "│└─►┌───────────\t希望 \tccomp \n", "│ │ ┌─►┌──\t张晚霞\tassmod \n", "│ │ │ └─►\t的 \tassm \n", "│ │ ┌─►└─────\t背影 \tnsubjpass\n", "│ └─►└──┬─────\t被 \tccomp \n", "│ │ ┌─►\t晚霞 \tnsubj \n", "│ └─►└──\t映红 \tdep \n", "└──────────────►\t。 \tpunct \n" ] } ], "source": [ "HanLP([\n", " [\"HanLP\", \"为\", \"生产\", \"环境\", \"带来\", \"次世代\", \"最\", \"先进\", \"的\", \"多语种\", \"NLP\", \"技术\", \"。\"],\n", " 
[\"我\", \"的\", \"希望\", \"是\", \"希望\", \"张晚霞\", \"的\", \"背影\", \"被\", \"晚霞\", \"映红\", \"。\"]\n", " ], tasks='dep', skip_tasks='tok*').pretty_print()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### 注意\n", "Native API的输入单位限定为句子,需使用[多语种分句模型](https://github.com/hankcs/HanLP/blob/master/plugins/hanlp_demo/hanlp_demo/sent_split.py)或[基于规则的分句函数](https://github.com/hankcs/HanLP/blob/master/hanlp/utils/rules.py#L19)先行分句。RESTful同时支持全文、句子、已分词的句子。除此之外,RESTful和native两种API的语义设计完全一致,用户可以无缝互换。" ] } ], "metadata": { "accelerator": "GPU", "colab": { "collapsed_sections": [], "name": "dep_mtl.ipynb", "provenance": [] }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.12" } }, "nbformat": 4, "nbformat_minor": 1 } ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/dep_restful.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": { "id": "WfGpInivS0fG" }, "source": [ "

点击下列图标在线运行HanLP

\n", "
\n", "\t\"Open\n", "\t\"Open\n", "
\n", "\n", "## 安装" ] }, { "cell_type": "markdown", "metadata": { "id": "IYwV-UkNNzFp" }, "source": [ "无论是Windows、Linux还是macOS,HanLP的安装只需一句话搞定:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "1Uf_u7ddMhUt" }, "outputs": [], "source": [ "!pip install hanlp_restful -U" ] }, { "cell_type": "markdown", "metadata": { "id": "pp-1KqEOOJ4t" }, "source": [ "## 创建客户端" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "id": "0tmKBu7sNAXX" }, "outputs": [], "source": [ "from hanlp_restful import HanLPClient\n", "HanLP = HanLPClient('https://www.hanlp.com/api', auth=None, language='zh') # auth不填则匿名,zh中文,mul多语种" ] }, { "cell_type": "markdown", "metadata": { "id": "EmZDmLn9aGxG" }, "source": [ "#### 申请秘钥\n", "由于服务器算力有限,匿名用户每分钟限2次调用。如果你需要更多调用次数,[建议申请免费公益API秘钥auth](https://bbs.hanlp.com/t/hanlp2-1-restful-api/53)。" ] }, { "cell_type": "markdown", "metadata": { "id": "elA_UyssOut_" }, "source": [ "## 依存句法分析\n", "任务越少,速度越快。如指定仅执行依存句法分析:" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 70 }, "id": "BqEmDMGGOtk3", "outputId": "2a0d392f-b99a-4a18-fc7f-754e2abe2e34" }, "outputs": [], "source": [ "doc = HanLP('2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', tasks='dep')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "返回值为一个[Document](https://hanlp.hankcs.com/docs/api/common/document.html):" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\n", " \"tok/fine\": [\n", " [\"2021年\", \"HanLPv2.1\", \"为\", \"生产\", \"环境\", \"带来\", \"次\", \"世代\", \"最\", \"先进\", \"的\", \"多\", \"语种\", \"NLP\", \"技术\", \"。\"]\n", " ],\n", " \"dep\": [\n", " [[6, \"tmod\"], [6, \"nsubj\"], [6, \"prep\"], [5, \"nn\"], [3, \"pobj\"], [0, \"root\"], [8, \"clf\"], [10, \"dep\"], [10, \"advmod\"], [15, \"rcmod\"], [10, \"cpm\"], [13, \"nummod\"], [15, \"nn\"], [15, \"nn\"], [6, \"dobj\"], [6, \"punct\"]]\n", " ]\n", 
"}\n" ] } ], "source": [ "print(doc)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "`doc['dep']`为句子们的依存句法树列表,第`i`个二元组表示第`i`个单词的`[中心词的下标, 与中心词的依存关系]`。" ] }, { "cell_type": "markdown", "metadata": { "id": "wxctCigrTKu-" }, "source": [ "可视化依存句法树:" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Zo08uquCTFSk", "outputId": "c6077f2d-7084-4f4b-a3bc-9aa9951704ea" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Dep Tree \tToken \tRelati\n", "─────────────\t─────────\t──────\n", " ┌─────────►\t2021年 \ttmod \n", " │┌────────►\tHanLPv2.1\tnsubj \n", " ││┌─►┌─────\t为 \tprep \n", " │││ │ ┌─►\t生产 \tnn \n", " │││ └─►└──\t环境 \tpobj \n", "┌┬┴┴┴────────\t带来 \troot \n", "││ ┌─►\t次 \tclf \n", "││ ┌─►└──\t世代 \tdep \n", "││ │ ┌─►\t最 \tadvmod\n", "││ ┌─►└──┼──\t先进 \trcmod \n", "││ │ └─►\t的 \tcpm \n", "││ │ ┌─►\t多 \tnummod\n", "││ │ ┌─►└──\t语种 \tnn \n", "││ │ │ ┌─►\tNLP \tnn \n", "│└─►└──┴──┴──\t技术 \tdobj \n", "└───────────►\t。 \tpunct \n" ] } ], "source": [ "doc.pretty_print()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "转换为CoNLL格式:" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1\t2021年\t_\t_\t_\t_\t6\ttmod\t_\t_\n", "2\tHanLPv2.1\t_\t_\t_\t_\t6\tnsubj\t_\t_\n", "3\t为\t_\t_\t_\t_\t6\tprep\t_\t_\n", "4\t生产\t_\t_\t_\t_\t5\tnn\t_\t_\n", "5\t环境\t_\t_\t_\t_\t3\tpobj\t_\t_\n", "6\t带来\t_\t_\t_\t_\t0\troot\t_\t_\n", "7\t次\t_\t_\t_\t_\t8\tclf\t_\t_\n", "8\t世代\t_\t_\t_\t_\t10\tdep\t_\t_\n", "9\t最\t_\t_\t_\t_\t10\tadvmod\t_\t_\n", "10\t先进\t_\t_\t_\t_\t15\trcmod\t_\t_\n", "11\t的\t_\t_\t_\t_\t10\tcpm\t_\t_\n", "12\t多\t_\t_\t_\t_\t13\tnummod\t_\t_\n", "13\t语种\t_\t_\t_\t_\t15\tnn\t_\t_\n", "14\tNLP\t_\t_\t_\t_\t15\tnn\t_\t_\n", "15\t技术\t_\t_\t_\t_\t6\tdobj\t_\t_\n", "16\t。\t_\t_\t_\t_\t6\tpunct\t_\t_\n" ] } ], "source": [ "print(doc.to_conll())" ] }, { "cell_type": "markdown", 
"metadata": { "id": "XOsWkOqQfzlr" }, "source": [ "为已分词的句子执行依存句法分析:" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 70 }, "id": "bLZSTbv_f3OA", "outputId": "111c0be9-bac6-4eee-d5bd-a972ffc34844" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Dep Tree \tToken\tRelati\n", "───────────\t─────\t──────\n", " ┌────────►\tHanLP\tnsubj \n", " │┌─►┌─────\t为 \tprep \n", " ││ │ ┌─►\t生产 \tnn \n", " ││ └─►└──\t环境 \tpobj \n", "┌┼┴────────\t带来 \troot \n", "││ ┌──►\t次世代 \tdep \n", "││ │┌─►\t最 \tadvmod\n", "││ ┌─►└┼──\t先进 \trcmod \n", "││ │ └─►\t的 \tcpm \n", "││ │ ┌──►\t多语种 \tnn \n", "││ │ │┌─►\tNLP \tnn \n", "│└─►└──┴┴──\t技术 \tdobj \n", "└─────────►\t。 \tpunct \n", "\n", "Dep Tree \tTok\tRelation \n", "────────────────\t───\t─────────\n", " ┌─►┌──\t我 \tassmod \n", " │ └─►\t的 \tassm \n", " ┌─►└─────\t希望 \ttop \n", "┌┬─────┴────────\t是 \troot \n", "│└─►┌───────────\t希望 \tccomp \n", "│ │ ┌─►┌──\t张晚霞\tassmod \n", "│ │ │ └─►\t的 \tassm \n", "│ │ ┌─►└─────\t背影 \tnsubjpass\n", "│ └─►└──┬─────\t被 \tccomp \n", "│ │ ┌─►\t晚霞 \tnsubj \n", "│ └─►└──\t映红 \tdep \n", "└──────────────►\t。 \tpunct \n" ] } ], "source": [ "HanLP(tokens=[\n", " [\"HanLP\", \"为\", \"生产\", \"环境\", \"带来\", \"次世代\", \"最\", \"先进\", \"的\", \"多语种\", \"NLP\", \"技术\", \"。\"],\n", " [\"我\", \"的\", \"希望\", \"是\", \"希望\", \"张晚霞\", \"的\", \"背影\", \"被\", \"晚霞\", \"映红\", \"。\"]\n", " ], tasks='dep').pretty_print()" ] } ], "metadata": { "accelerator": "GPU", "colab": { "collapsed_sections": [], "name": "dep_restful.ipynb", "provenance": [] }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.12" } }, "nbformat": 4, "nbformat_minor": 1 } 
================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/dep_stl.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": { "id": "WfGpInivS0fG" }, "source": [ "

点击下列图标在线运行HanLP

\n", "
\n", "\t\"Open\n", "\t\"Open\n", "
\n", "\n", "## 安装" ] }, { "cell_type": "markdown", "metadata": { "id": "IYwV-UkNNzFp" }, "source": [ "无论是Windows、Linux还是macOS,HanLP的安装只需一句话搞定:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "1Uf_u7ddMhUt" }, "outputs": [], "source": [ "!pip install hanlp -U" ] }, { "cell_type": "markdown", "metadata": { "id": "pp-1KqEOOJ4t" }, "source": [ "## 加载模型\n", "HanLP的工作流程是先加载模型,模型的标识符存储在`hanlp.pretrained`这个包中,按照NLP任务归类。" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "4M7ka0K5OMWU", "outputId": "69cdad22-d94d-41fb-9591-1c29515a3da9" }, "outputs": [ { "data": { "text/plain": [ "{'CTB5_BIAFFINE_DEP_ZH': 'https://file.hankcs.com/hanlp/dep/biaffine_ctb5_20191229_025833.zip',\n", " 'CTB7_BIAFFINE_DEP_ZH': 'https://file.hankcs.com/hanlp/dep/biaffine_ctb7_20200109_022431.zip',\n", " 'CTB9_DEP_ELECTRA_SMALL': 'https://file.hankcs.com/hanlp/dep/ctb9_dep_electra_small_20220216_100306.zip',\n", " 'PMT1_DEP_ELECTRA_SMALL': 'https://file.hankcs.com/hanlp/dep/pmt_dep_electra_small_20220218_134518.zip',\n", " 'CTB9_UDC_ELECTRA_SMALL': 'https://file.hankcs.com/hanlp/dep/udc_dep_electra_small_20220218_095452.zip',\n", " 'PTB_BIAFFINE_DEP_EN': 'https://file.hankcs.com/hanlp/dep/ptb_dep_biaffine_20200101_174624.zip'}" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import hanlp\n", "hanlp.pretrained.dep.ALL # 语种见名称最后一个字段或相应语料库" ] }, { "cell_type": "markdown", "metadata": { "id": "BMW528wGNulM" }, "source": [ "调用`hanlp.load`进行加载,模型会自动下载到本地缓存:" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "id": "0tmKBu7sNAXX" }, "outputs": [], "source": [ "dep = hanlp.load(hanlp.pretrained.dep.CTB9_DEP_ELECTRA_SMALL)" ] }, { "cell_type": "markdown", "metadata": { "id": "elA_UyssOut_" }, "source": [ "## 依存句法分析\n", "依存句法分析任务的输入为已分词的一个或多个句子:" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "id": "BqEmDMGGOtk3" }, "outputs": [], 
"source": [ "tree = dep([\"2021年\", \"HanLPv2.1\", \"带来\", \"次\", \"世代\", \"最\", \"先进\", \"的\", \"多\", \"语种\", \"NLP\", \"技术\", \"。\"])" ] }, { "cell_type": "markdown", "metadata": { "id": "jj1Jk-2sPHYx" }, "source": [ "返回对象为[CoNLLSentence](https://hanlp.hankcs.com/docs/api/common/conll.html#hanlp_common.conll.CoNLLSentence)类型:" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "U_PGm06m6K20", "outputId": "a25c6452-5032-42b3-d501-99158380c487" }, "outputs": [ { "data": { "text/plain": [ "[{'id': 1,\n", " 'form': '2021年',\n", " 'cpos': None,\n", " 'pos': None,\n", " 'head': 3,\n", " 'deprel': 'tmod',\n", " 'lemma': None,\n", " 'feats': None,\n", " 'phead': None,\n", " 'pdeprel': None},\n", " {'id': 2,\n", " 'form': 'HanLPv2.1',\n", " 'cpos': None,\n", " 'pos': None,\n", " 'head': 3,\n", " 'deprel': 'nsubj',\n", " 'lemma': None,\n", " 'feats': None,\n", " 'phead': None,\n", " 'pdeprel': None},\n", " {'id': 3,\n", " 'form': '带来',\n", " 'cpos': None,\n", " 'pos': None,\n", " 'head': 0,\n", " 'deprel': 'root',\n", " 'lemma': None,\n", " 'feats': None,\n", " 'phead': None,\n", " 'pdeprel': None},\n", " {'id': 4,\n", " 'form': '次',\n", " 'cpos': None,\n", " 'pos': None,\n", " 'head': 5,\n", " 'deprel': 'det',\n", " 'lemma': None,\n", " 'feats': None,\n", " 'phead': None,\n", " 'pdeprel': None},\n", " {'id': 5,\n", " 'form': '世代',\n", " 'cpos': None,\n", " 'pos': None,\n", " 'head': 7,\n", " 'deprel': 'dep',\n", " 'lemma': None,\n", " 'feats': None,\n", " 'phead': None,\n", " 'pdeprel': None},\n", " {'id': 6,\n", " 'form': '最',\n", " 'cpos': None,\n", " 'pos': None,\n", " 'head': 7,\n", " 'deprel': 'advmod',\n", " 'lemma': None,\n", " 'feats': None,\n", " 'phead': None,\n", " 'pdeprel': None},\n", " {'id': 7,\n", " 'form': '先进',\n", " 'cpos': None,\n", " 'pos': None,\n", " 'head': 12,\n", " 'deprel': 'rcmod',\n", " 'lemma': None,\n", " 'feats': None,\n", " 'phead': None,\n", " 'pdeprel': None},\n", " 
{'id': 8,\n", " 'form': '的',\n", " 'cpos': None,\n", " 'pos': None,\n", " 'head': 7,\n", " 'deprel': 'cpm',\n", " 'lemma': None,\n", " 'feats': None,\n", " 'phead': None,\n", " 'pdeprel': None},\n", " {'id': 9,\n", " 'form': '多',\n", " 'cpos': None,\n", " 'pos': None,\n", " 'head': 10,\n", " 'deprel': 'nummod',\n", " 'lemma': None,\n", " 'feats': None,\n", " 'phead': None,\n", " 'pdeprel': None},\n", " {'id': 10,\n", " 'form': '语种',\n", " 'cpos': None,\n", " 'pos': None,\n", " 'head': 12,\n", " 'deprel': 'nn',\n", " 'lemma': None,\n", " 'feats': None,\n", " 'phead': None,\n", " 'pdeprel': None},\n", " {'id': 11,\n", " 'form': 'NLP',\n", " 'cpos': None,\n", " 'pos': None,\n", " 'head': 12,\n", " 'deprel': 'nn',\n", " 'lemma': None,\n", " 'feats': None,\n", " 'phead': None,\n", " 'pdeprel': None},\n", " {'id': 12,\n", " 'form': '技术',\n", " 'cpos': None,\n", " 'pos': None,\n", " 'head': 3,\n", " 'deprel': 'dobj',\n", " 'lemma': None,\n", " 'feats': None,\n", " 'phead': None,\n", " 'pdeprel': None},\n", " {'id': 13,\n", " 'form': '。',\n", " 'cpos': None,\n", " 'pos': None,\n", " 'head': 3,\n", " 'deprel': 'punct',\n", " 'lemma': None,\n", " 'feats': None,\n", " 'phead': None,\n", " 'pdeprel': None}]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tree" ] }, { "cell_type": "markdown", "metadata": { "id": "Gn_RQa_Z6K20" }, "source": [ "打印时为CoNLL格式:" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "26P1LGzv6K20", "outputId": "c78ffdb0-3cd7-492d-f55e-0d50120faffb" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1\t2021年\t_\t_\t_\t_\t3\ttmod\t_\t_\n", "2\tHanLPv2.1\t_\t_\t_\t_\t3\tnsubj\t_\t_\n", "3\t带来\t_\t_\t_\t_\t0\troot\t_\t_\n", "4\t次\t_\t_\t_\t_\t5\tdet\t_\t_\n", "5\t世代\t_\t_\t_\t_\t7\tdep\t_\t_\n", "6\t最\t_\t_\t_\t_\t7\tadvmod\t_\t_\n", "7\t先进\t_\t_\t_\t_\t12\trcmod\t_\t_\n", "8\t的\t_\t_\t_\t_\t7\tcpm\t_\t_\n", 
"9\t多\t_\t_\t_\t_\t10\tnummod\t_\t_\n", "10\t语种\t_\t_\t_\t_\t12\tnn\t_\t_\n", "11\tNLP\t_\t_\t_\t_\t12\tnn\t_\t_\n", "12\t技术\t_\t_\t_\t_\t3\tdobj\t_\t_\n", "13\t。\t_\t_\t_\t_\t3\tpunct\t_\t_\n" ] } ], "source": [ "print(tree)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "如果不需要CoNLL格式的话,也许`conll=False`时的输出更加简洁:" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[(3, 'tmod'),\n", " (3, 'nsubj'),\n", " (0, 'root'),\n", " (5, 'det'),\n", " (7, 'dep'),\n", " (7, 'advmod'),\n", " (12, 'rcmod'),\n", " (7, 'cpm'),\n", " (10, 'nummod'),\n", " (12, 'nn'),\n", " (12, 'nn'),\n", " (3, 'dobj'),\n", " (3, 'punct')]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dep([\"2021年\", \"HanLPv2.1\", \"带来\", \"次\", \"世代\", \"最\", \"先进\", \"的\", \"多\", \"语种\", \"NLP\", \"技术\", \"。\"], conll=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 可视化\n", "你可以构造一个`Document`实现漂亮的可视化:" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
Dep Tree      
───────────── 
         ┌──► 
         │┌─► 
┌┬───────┴┴── 
││        ┌─► 
││     ┌─►└── 
││     │  ┌─► 
││  ┌─►└──┼── 
││  │     └─► 
││  │     ┌─► 
││  │  ┌─►└── 
││  │  │  ┌─► 
│└─►└──┴──┴── 
└───────────► 
Token     
───────── 
2021年     
HanLPv2.1 
带来        
次         
世代        
最         
先进        
的         
多         
语种        
NLP       
技术        
。         
Relati
──────
tmod  
nsubj 
root  
det   
dep   
advmod
rcmod 
cpm   
nummod
nn    
nn    
dobj  
punct 
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from hanlp_common.document import Document\n", "doc = Document(\n", " tok=[\"2021年\", \"HanLPv2.1\", \"带来\", \"次\", \"世代\", \"最\", \"先进\", \"的\", \"多\", \"语种\", \"NLP\", \"技术\", \"。\"],\n", " dep=[(3, 'tmod'), (3, 'nsubj'), (0, 'root'), (5, 'det'), (7, 'dep'), (7, 'advmod'), (12, 'rcmod'), (7, 'cpm'), (10, 'nummod'), (12, 'nn'), (12, 'nn'), (3, 'dobj'), (3, 'punct')]\n", ")\n", "doc.pretty_print()" ] } ], "metadata": { "colab": { "collapsed_sections": [], "name": "dep_stl.ipynb", "provenance": [] }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.12" } }, "nbformat": 4, "nbformat_minor": 1 } ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/extractive_summarization_restful.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": { "id": "WfGpInivS0fG" }, "source": [ "

点击下列图标在线运行HanLP

\n", "
\n", "\t\"Open\n", "\t\"Open\n", "
\n", "\n", "## 安装" ] }, { "cell_type": "markdown", "metadata": { "id": "IYwV-UkNNzFp" }, "source": [ "无论是Windows、Linux还是macOS,HanLP的安装只需一句话搞定:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "1Uf_u7ddMhUt" }, "outputs": [], "source": [ "!pip install hanlp_restful -U" ] }, { "cell_type": "markdown", "metadata": { "id": "pp-1KqEOOJ4t" }, "source": [ "## 创建客户端" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "4M7ka0K5OMWU", "outputId": "d74f0749-0587-454a-d7c9-7418d45ce534" }, "outputs": [], "source": [ "from hanlp_restful import HanLPClient\n", "HanLP = HanLPClient('https://www.hanlp.com/api', auth=None, language='zh') # auth不填则匿名,zh中文,mul多语种" ] }, { "cell_type": "markdown", "metadata": { "id": "BMW528wGNulM" }, "source": [ "#### 申请秘钥\n", "由于服务器算力有限,匿名用户每分钟限2次调用。如果你需要更多调用次数,[建议申请免费公益API秘钥auth](https://bbs.hanlp.com/t/hanlp2-1-restful-api/53)。" ] }, { "cell_type": "markdown", "metadata": { "id": "elA_UyssOut_" }, "source": [ "## 抽取式自动摘要\n", "抽取式自动摘要的目标是从文章中筛选出一些作为摘要的中心句子:既要紧扣要点,又要避免赘语。\n", "### 中文\n", "抽取式自动摘要任务的输入为一段文本和所需的摘要句子数量的最大值`topk`:" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "BqEmDMGGOtk3", "outputId": "936d439a-e1ff-4308-d2aa-775955558594" }, "outputs": [ { "data": { "text/plain": [ "{'据DigiTimes报道,在上海疫情趋缓,防疫管控开始放松后,苹果供应商广达正在逐步恢复其中国工厂的MacBook产品生产。': 0.9999685883522034,\n", " '仍有许多苹果笔记本用户在等待3月和4月订购的MacBook Pro机型到货,由于苹果的供应问题,他们的发货时间被大大推迟了。': 0.5798477530479431,\n", " '尽管MacBook Pro的生产逐渐恢复,但供应问题预计依然影响2022年第三季度的产品销售。': 0.5435440540313721}" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "text = '''\n", "据DigiTimes报道,在上海疫情趋缓,防疫管控开始放松后,苹果供应商广达正在逐步恢复其中国工厂的MacBook产品生产。\n", "据供应链消息人士称,生产厂的订单拉动情况正在慢慢转强,这会提高MacBook Pro机型的供应量,并缩短苹果客户在过去几周所经历的延长交货时间。\n", "仍有许多苹果笔记本用户在等待3月和4月订购的MacBook Pro机型到货,由于苹果的供应问题,他们的发货时间被大大推迟了。\n", "据分析师郭明錤表示,广达是高端MacBook 
Pro的唯一供应商,自防疫封控依赖,MacBook Pro大部分型号交货时间增加了三到五周,\n", "一些高端定制型号的MacBook Pro配置要到6月底到7月初才能交货。\n", "尽管MacBook Pro的生产逐渐恢复,但供应问题预计依然影响2022年第三季度的产品销售。\n", "苹果上周表示,防疫措施和元部件短缺将继续使其难以生产足够的产品来满足消费者的强劲需求,这最终将影响苹果6月份的收入。\n", "'''\n", "HanLP.extractive_summarization(text, topk=3)" ] }, { "cell_type": "markdown", "metadata": { "id": "jj1Jk-2sPHYx" }, "source": [ "返回值为最多`topk`个摘要句子以及相应的权重,权重取值区间为$[0, 1]$。由于Trigram Blocking技巧,实际返回的摘要句数量可能小于`topk`。" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### 可视化" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "据DigiTimes报道,在上海疫情趋缓,防疫管控开始放松后,苹果供应商广达正在逐步恢复其中国工厂的MacBook产品生产。\n", "据供应链消息人士称,生产厂的订单拉动情况正在慢慢转强,这会提高MacBook Pro机型的供应量,并缩短苹果客户在过去几周所经历的延长交货时间。\n", "仍有许多苹果笔记本用户在等待3月和4月订购的MacBook Pro机型到货,由于苹果的供应问题,他们的发货时间被大大推迟了。\n", "据分析师郭明錤表示,广达是高端MacBook Pro的唯一供应商,自防疫封控依赖,MacBook Pro大部分型号交货时间增加了三到五周,\n", "一些高端定制型号的MacBook Pro配置要到6月底到7月初才能交货。\n", "尽管MacBook Pro的生产逐渐恢复,但供应问题预计依然影响2022年第三季度的产品销售。\n", "苹果上周表示,防疫措施和元部件短缺将继续使其难以生产足够的产品来满足消费者的强劲需求,这最终将影响苹果6月份的收入。\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "def highlight(text, scores):\n", " for k, v in scores.items():\n", " text = text.replace(k, f'{k}')\n", " from IPython.display import display, HTML\n", " display(HTML(text))\n", "\n", "scores = HanLP.extractive_summarization(text, topk=100)\n", "highlight(text, scores)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### 繁体中文\n", "HanLP的抽取式自动摘要接口支持繁体中文:" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'華爾街日報周二(3日)報導,根據知情人透露,日前已宣布將以440億美元買下推特(Twitter)並下市的馬斯克,曾經跟一些潛在投資人說,他可以在短短幾年後,再將這家社群媒體公司重新上市。': 0.9999818205833435,\n", " '消息來源說,特斯拉創辦人兼執行長馬斯克表示,他計劃在買下推特後最短三年內,就展開推特的首次公開發行股票。': 0.503434419631958,\n", " '根據之前華爾街日報的報導,馬斯克為購買推特籌現金時,與私募股權公司等投資人討論出資事宜,Apollo Global Management有興趣參與。': 0.2688594460487366}" ] }, "execution_count": 4, "metadata": {}, "output_type": 
"execute_result" } ], "source": [ "text = '''\n", "華爾街日報周二(3日)報導,根據知情人透露,日前已宣布將以440億美元買下推特(Twitter)並下市的馬斯克,曾經跟一些潛在投資人說,他可以在短短幾年後,再將這家社群媒體公司重新上市。\n", "消息來源說,特斯拉創辦人兼執行長馬斯克表示,他計劃在買下推特後最短三年內,就展開推特的首次公開發行股票。\n", "馬斯克買推特的交易案預期在今年稍後走完程序,包括獲得股東同意以及監管機關核准等步驟。\n", "根據之前華爾街日報的報導,馬斯克為購買推特籌現金時,與私募股權公司等投資人討論出資事宜,Apollo Global Management有興趣參與。\n", "私募股權公司通常都先買下公司將之私有化,把公司移出眾人注目的焦點之外以後,整頓公司,接著再把公司上市,時間常是五年左右。\n", "華爾街日報指出,馬斯克暗示他對推特有類似的規劃的話,有助說服潛在投資人,他會很快行動,改善推特的營運和獲利。\n", "'''\n", "scores = HanLP.extractive_summarization(text)\n", "scores" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "華爾街日報周二(3日)報導,根據知情人透露,日前已宣布將以440億美元買下推特(Twitter)並下市的馬斯克,曾經跟一些潛在投資人說,他可以在短短幾年後,再將這家社群媒體公司重新上市。\n", "消息來源說,特斯拉創辦人兼執行長馬斯克表示,他計劃在買下推特後最短三年內,就展開推特的首次公開發行股票。\n", "馬斯克買推特的交易案預期在今年稍後走完程序,包括獲得股東同意以及監管機關核准等步驟。\n", "根據之前華爾街日報的報導,馬斯克為購買推特籌現金時,與私募股權公司等投資人討論出資事宜,Apollo Global Management有興趣參與。\n", "私募股權公司通常都先買下公司將之私有化,把公司移出眾人注目的焦點之外以後,整頓公司,接著再把公司上市,時間常是五年左右。\n", "華爾街日報指出,馬斯克暗示他對推特有類似的規劃的話,有助說服潛在投資人,他會很快行動,改善推特的營運和獲利。\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "highlight(text, scores)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 英文\n", "按照HanLP一贯的多语种设计,任何语言都支持。由于服务器GPU资源限制,目前英文接口暂未上线。如果你有相应需求,欢迎前往论坛发起请愿。" ] } ], "metadata": { "accelerator": "GPU", "colab": { "collapsed_sections": [], "name": "extractive_summarization_restful.ipynb", "provenance": [] }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.12" } }, "nbformat": 4, "nbformat_minor": 1 } ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/gec_restful.ipynb ================================================ { "cells": [ 
{ "cell_type": "markdown", "metadata": { "id": "WfGpInivS0fG" }, "source": [ "

点击下列图标在线运行HanLP

\n", "
\n", "\t\"Open\n", "\t\"Open\n", "
\n", "\n", "## 安装" ] }, { "cell_type": "markdown", "metadata": { "id": "IYwV-UkNNzFp" }, "source": [ "无论是Windows、Linux还是macOS,HanLP的安装只需一句话搞定:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "1Uf_u7ddMhUt" }, "outputs": [], "source": [ "!pip install hanlp_restful -U" ] }, { "cell_type": "markdown", "metadata": { "id": "pp-1KqEOOJ4t" }, "source": [ "## 创建客户端" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "id": "0tmKBu7sNAXX" }, "outputs": [], "source": [ "from hanlp_restful import HanLPClient\n", "HanLP = HanLPClient('https://www.hanlp.com/api', auth=None, language='zh') # auth不填则匿名,zh中文,mul多语种" ] }, { "cell_type": "markdown", "metadata": { "id": "EmZDmLn9aGxG" }, "source": [ "#### 申请秘钥\n", "由于服务器算力有限,匿名用户每分钟限2次调用。如果你需要更多调用次数,[建议申请免费公益API秘钥auth](https://bbs.hanlp.com/t/hanlp2-1-restful-api/53)。" ] }, { "cell_type": "markdown", "metadata": { "id": "elA_UyssOut_" }, "source": [ "## 语法纠错\n", "输入短文本,执行语法纠错:" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 70 }, "id": "BqEmDMGGOtk3", "outputId": "2a0d392f-b99a-4a18-fc7f-754e2abe2e34" }, "outputs": [ { "data": { "text/plain": [ "['每个青年都应当有远大的抱负。', '有的同学对语言很有兴趣。', '我市本地居民约占全市人口的70%。']" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "HanLP.grammatical_error_correction(['每个青年都应当有远大的报复。', '有的同学对语言很兴趣。', '我市本地居民约占全市人口的70%多。'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "返回值是每段短文本的修改结果列表。" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 测试版\n", "当前版本为测试版,暂时仅支持拼写、标点和简单的语法错误,HanLP的线上模型和语料库仍然在迭代发展中。欢迎广大用户将测试版的问题反馈到[论坛](https://bbs.hankcs.com/c/text-generation/gec/30),我们将在下一个版本中,将HanLP的文本纠错能力提升到高考语文水平。" ] } ], "metadata": { "accelerator": "GPU", "colab": { "collapsed_sections": [], "name": "gec_restful.ipynb", "provenance": [] }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, 
"language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.12" } }, "nbformat": 4, "nbformat_minor": 1 } ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/keyphrase_restful.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": { "id": "WfGpInivS0fG" }, "source": [ "

点击下列图标在线运行HanLP

\n", "
\n", "\t\"Open\n", "\t\"Open\n", "
\n", "\n", "## 安装" ] }, { "cell_type": "markdown", "metadata": { "id": "IYwV-UkNNzFp" }, "source": [ "无论是Windows、Linux还是macOS,HanLP的安装只需一句话搞定:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "1Uf_u7ddMhUt" }, "outputs": [], "source": [ "!pip install hanlp_restful -U" ] }, { "cell_type": "markdown", "metadata": { "id": "pp-1KqEOOJ4t" }, "source": [ "## 创建客户端" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "4M7ka0K5OMWU", "outputId": "d74f0749-0587-454a-d7c9-7418d45ce534" }, "outputs": [], "source": [ "from hanlp_restful import HanLPClient\n", "HanLP = HanLPClient('https://www.hanlp.com/api', auth=None, language='zh') # auth不填则匿名,zh中文,mul多语种" ] }, { "cell_type": "markdown", "metadata": { "id": "BMW528wGNulM" }, "source": [ "#### 申请秘钥\n", "由于服务器算力有限,匿名用户每分钟限2次调用。如果你需要更多调用次数,[建议申请免费公益API秘钥auth](https://bbs.hanlp.com/t/hanlp2-1-restful-api/53)。" ] }, { "cell_type": "markdown", "metadata": { "id": "elA_UyssOut_" }, "source": [ "## 关键词提取\n", "关键词(短语)提取的目标是提取文本中最具有代表性的关键词以及短语。\n", "### 中文\n", "关键词提取任务的输入为一段文本和所需的关键词数量`topk`:" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "BqEmDMGGOtk3", "outputId": "936d439a-e1ff-4308-d2aa-775955558594" }, "outputs": [ { "data": { "text/plain": [ "{'自然语言处理': 0.800000011920929,\n", " 'HanLP的全部性能': 0.5256577134132385,\n", " '一门博大精深的学科': 0.42154020071029663}" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "HanLP.keyphrase_extraction('自然语言处理是一门博大精深的学科,掌握理论才能发挥出HanLP的全部性能。 '\n", " '《自然语言处理入门》是一本配套HanLP的NLP入门书,助你零起点上手自然语言处理。', topk=3)" ] }, { "cell_type": "markdown", "metadata": { "id": "jj1Jk-2sPHYx" }, "source": [ "返回值为`topk`个关键词以及相应的权重,权重取值区间为$[0, 1]$。" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "关键词提取并不仅限于短文本,长文章也一样支持:" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { 
"text/plain": [ "{'新冠病毒核酸阳性感染': 0.888239324092865,\n", " '确诊病例': 0.8868124485015869,\n", " '本土无症状感染者': 0.8557102680206299,\n", " '属地社区(村屯)': 0.8164600133895874,\n", " '疫情防控工作': 0.7749382853507996,\n", " '我市疫情防控要求': 0.7502512335777283,\n", " '症状': 0.669366180896759,\n", " '我市疫情形势': 0.6673010587692261,\n", " '感染': 0.6663177013397217,\n", " '本土确诊病例': 0.6464788317680359}" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "doc = '''\n", "4月15日0-24时,长春市新增本土确诊病例157例(含57例无症状感染者转为确诊病例),新增本土无症状感染者407例。\n", "以上人员均为隔离管控期间筛查新冠病毒核酸阳性感染者。\n", "当前我市疫情形势严峻,为做好全市疫情防控工作,尽快恢复正常社会秩序和经济社会发展,长春市新冠肺炎疫情防控工作领导小组办公室提醒广大市民,\n", "请严格遵守我市疫情防控要求,配合各部门落实好防控措施,进一步提高防范意识,坚持规范戴口罩、勤洗手、常通风、保持社交距离、不聚餐、不聚集,\n", "减少疾病感染风险。一旦出现发热、干咳、乏力、咽痛、嗅味觉减退或丧失等不适症状,应及时向属地社区(村屯)或疾控机构报告。\n", "'''\n", "HanLP.keyphrase_extraction(doc)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### 可视化" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "4月15日0-24时,长春市新增本土确诊病例157例(含57例无症状感染者转为确诊病例),新增本土无症状感染407例。\n", "以上人员均为隔离管控期间筛查新冠病毒核酸阳性感染者。\n", "当前我市疫情形势严峻,为做好全市疫情防控工作,尽快恢复正常社会秩序和经济社会发展,长春市新冠肺炎疫情防控工作领导小组办公室提醒广大市民,\n", "请严格遵守我市疫情防控要求,配合各部门落实好防控措施,进一步提高防范意识,坚持规范戴口罩、勤洗手、常通风、保持社交距离、不聚餐、不聚集,\n", "减少疾病感染风险。一旦出现发热、干咳、乏力、咽痛、嗅味觉减退或丧失等不适症状,应及时向属地社区(村屯)或疾控机构报告。\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "def highlight(text, scores):\n", " for k, v in scores.items():\n", " text = text.replace(k, f'{k}')\n", " from IPython.display import display, HTML\n", " display(HTML(text))\n", "\n", "scores = HanLP.keyphrase_extraction(doc)\n", "highlight(doc, scores)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 英文\n", "按照HanLP一贯的多语种设计,任何语言都支持。由于服务器GPU资源限制,目前英文接口暂未上线。如果你有相应需求,欢迎前往论坛发起请愿。" ] } ], "metadata": { "accelerator": "GPU", "colab": { "collapsed_sections": [], "name": "keyphrase_restful.ipynb", "provenance": [] }, "kernelspec": { "display_name": "Python 3 
(ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.12" } }, "nbformat": 4, "nbformat_minor": 1 } ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/lid_restful.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": { "id": "WfGpInivS0fG" }, "source": [ "

点击下列图标在线运行HanLP

\n", "
\n", "\t\"Open\n", "\t\"Open\n", "
\n", "\n", "## 安装" ] }, { "cell_type": "markdown", "metadata": { "id": "nf9TgeCTC0OT" }, "source": [ "无论是Windows、Linux还是macOS,HanLP的安装只需一句话搞定:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "jaW4eu6kC0OU", "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "!pip install hanlp_restful -U" ] }, { "cell_type": "markdown", "metadata": { "id": "_xI_bLAaC0OU" }, "source": [ "## 创建客户端" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "IYwV-UkNNzFp", "outputId": "54065443-9b0a-444c-f6c0-c701bc86400b", "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "from hanlp_restful import HanLPClient\n", "HanLP = HanLPClient('https://www.hanlp.com/api', auth=None, language='zh') # auth不填则匿名,zh中文,mul多语种" ] }, { "cell_type": "markdown", "metadata": { "id": "1Uf_u7ddMhUt", "pycharm": { "name": "#%% md\n" } }, "source": [ "#### 申请秘钥\n", "由于服务器算力有限,匿名用户每分钟限2次调用。如果你需要更多调用次数,[建议申请免费公益API秘钥auth](https://bbs.hanlp.com/t/hanlp2-1-restful-api/53)。" ] }, { "cell_type": "markdown", "metadata": { "id": "elA_UyssOut_" }, "source": [ "## 语种识别\n", "语种识别任务的输入为一个或多个文档:" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "id": "BqEmDMGGOtk3" }, "outputs": [ { "data": { "text/plain": [ "'en'" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "HanLP.language_identification('In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environments.')" ] }, { "cell_type": "markdown", "metadata": { "id": "SwaPn1hjC0OW" }, "source": [ "返回对象为[ISO 639-1编码](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes)。HanLP支持返回语种对应的概率(置信度):" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "egpWwHKxC0OX", "outputId": "f7c77687-dd75-4fa2-dbd2-be6bda8a3fff" }, "outputs": [ { "data": { "text/plain": [ "['ja', 0.9976244568824768]" ] }, "execution_count": 
3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "HanLP.language_identification('2021年、HanLPv2.1は次世代の最先端多言語NLP技術を本番環境に導入します。', prob=True)" ] }, { "cell_type": "markdown", "metadata": { "id": "kq_j5TLFC0OX" }, "source": [ "HanLP也支持返回概率最高的`topk`个语种:" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "isJhzYyIC0OX", "outputId": "683c8489-dffc-426e-f95b-e91dfb373260" }, "outputs": [ { "data": { "text/plain": [ "['zh', 'ja']" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "HanLP.language_identification('2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', topk=2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "该功能对于混合了多个语种的文档而言特别实用:" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'zh': 0.3952908217906952,\n", " 'en': 0.37189167737960815,\n", " 'ja': 0.056213412433862686}" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "text = '''\n", "2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。\n", "In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environments.\n", "'''\n", "\n", "HanLP.language_identification(text, topk=3, prob=True)" ] } ], "metadata": { "colab": { "collapsed_sections": [], "name": "lid_restful.ipynb", "provenance": [] }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.12" } }, "nbformat": 4, "nbformat_minor": 1 } ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/lid_stl.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": { "id": 
"WfGpInivS0fG" }, "source": [ "

点击下列图标在线运行HanLP

\n", "
\n", "\t\"Open\n", "\t\"Open\n", "
\n", "\n", "## 安装" ] }, { "cell_type": "markdown", "metadata": { "id": "nf9TgeCTC0OT" }, "source": [ "无论是Windows、Linux还是macOS,HanLP的安装只需一句话搞定:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "jaW4eu6kC0OU", "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "!pip install hanlp[fasttext] -U" ] }, { "cell_type": "markdown", "metadata": { "id": "_xI_bLAaC0OU" }, "source": [ "## 加载模型\n", "HanLP的工作流程是先加载模型,模型的标识符存储在`hanlp.pretrained`这个包中,按照NLP任务归类。" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "IYwV-UkNNzFp", "outputId": "54065443-9b0a-444c-f6c0-c701bc86400b", "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/plain": [ "{'CHNSENTICORP_BERT_BASE_ZH': 'https://file.hankcs.com/hanlp/classification/chnsenticorp_bert_base_20211228_163210.zip',\n", " 'SST2_ALBERT_BASE_EN': 'https://file.hankcs.com/hanlp/classification/sst2_albert_base_20211228_164917.zip',\n", " 'LID_176_FASTTEXT_BASE': 'https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin',\n", " 'LID_176_FASTTEXT_SMALL': 'https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz'}" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import hanlp\n", "hanlp.pretrained.classifiers.ALL # 任务见第一个字段" ] }, { "cell_type": "markdown", "metadata": { "id": "1Uf_u7ddMhUt", "pycharm": { "name": "#%% md\n" } }, "source": [ "调用`hanlp.load`进行加载,模型会自动下载到本地缓存。" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "id": "pp-1KqEOOJ4t", "pycharm": { "name": "#%%\n" } }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.\n" ] } ], "source": [ "lid = hanlp.load('LID_176_FASTTEXT_BASE')" ] }, { "cell_type": "markdown", "metadata": { "id": "elA_UyssOut_" }, "source": [ "## 语种识别\n", 
"语种识别任务的输入为一个或多个文档:" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "id": "BqEmDMGGOtk3" }, "outputs": [ { "data": { "text/plain": [ "'en'" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "lid('In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environments.')" ] }, { "cell_type": "markdown", "metadata": { "id": "SwaPn1hjC0OW" }, "source": [ "返回对象为[ISO 639-1编码](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes)。HanLP支持返回语种对应的概率(置信度):" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "egpWwHKxC0OX", "outputId": "f7c77687-dd75-4fa2-dbd2-be6bda8a3fff" }, "outputs": [ { "data": { "text/plain": [ "('ja', 0.9976244568824768)" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "lid('2021年、HanLPv2.1は次世代の最先端多言語NLP技術を本番環境に導入します。', prob=True)" ] }, { "cell_type": "markdown", "metadata": { "id": "kq_j5TLFC0OX" }, "source": [ "HanLP也支持返回概率最高的`topk`个语种:" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "isJhzYyIC0OX", "outputId": "683c8489-dffc-426e-f95b-e91dfb373260" }, "outputs": [ { "data": { "text/plain": [ "['zh', 'ja']" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "lid('2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', topk=2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "该功能对于混合了多个语种的文档而言特别实用:" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'zh': 0.3952908217906952,\n", " 'en': 0.37189167737960815,\n", " 'ja': 0.056213412433862686}" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "text = '''\n", "2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。\n", "In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production 
environments.\n", "'''\n", "\n", "lid(text, topk=3, prob=True)" ] } ], "metadata": { "colab": { "collapsed_sections": [], "name": "lid_stl.ipynb", "provenance": [] }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.12" } }, "nbformat": 4, "nbformat_minor": 1 } ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/ner_mtl.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": { "id": "WfGpInivS0fG" }, "source": [ "

点击下列图标在线运行HanLP

\n", "
\n", "\t\"Open\n", "\t\"Open\n", "
" ] }, { "cell_type": "markdown", "metadata": { "id": "IYwV-UkNNzFp" }, "source": [ "## 安装" ] }, { "cell_type": "markdown", "metadata": { "id": "1Uf_u7ddMhUt", "pycharm": { "name": "#%% md\n" } }, "source": [ "无论是Windows、Linux还是macOS,HanLP的安装只需一句话搞定:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "pp-1KqEOOJ4t", "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "!pip install hanlp -U" ] }, { "cell_type": "markdown", "metadata": { "id": "0tmKBu7sNAXX", "pycharm": { "name": "#%% md\n" } }, "source": [ "## 加载模型\n", "HanLP的工作流程是先加载模型,模型的标识符存储在`hanlp.pretrained`这个包中,按照NLP任务归类。" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "EmZDmLn9aGxG", "outputId": "38469cbe-d56c-4648-b103-b67e6d22aeff", "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/plain": [ "{'OPEN_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH': 'https://file.hankcs.com/hanlp/mtl/open_tok_pos_ner_srl_dep_sdp_con_electra_small_20201223_035557.zip',\n", " 'OPEN_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH': 'https://file.hankcs.com/hanlp/mtl/open_tok_pos_ner_srl_dep_sdp_con_electra_base_20201223_201906.zip',\n", " 'CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH': 'https://file.hankcs.com/hanlp/mtl/close_tok_pos_ner_srl_dep_sdp_con_electra_small_20210111_124159.zip',\n", " 'CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH': 'https://file.hankcs.com/hanlp/mtl/close_tok_pos_ner_srl_dep_sdp_con_electra_base_20210111_124519.zip',\n", " 'CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ERNIE_GRAM_ZH': 'https://file.hankcs.com/hanlp/mtl/close_tok_pos_ner_srl_dep_sdp_con_ernie_gram_base_aug_20210904_145403.zip',\n", " 'UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_MT5_SMALL': 'https://file.hankcs.com/hanlp/mtl/ud_ontonotes_tok_pos_lem_fea_ner_srl_dep_sdp_con_mt5_small_20210228_123458.zip',\n", " 'UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_XLMR_BASE': 
'https://file.hankcs.com/hanlp/mtl/ud_ontonotes_tok_pos_lem_fea_ner_srl_dep_sdp_con_xlm_base_20210602_211620.zip',\n", " 'NPCMJ_UD_KYOTO_TOK_POS_CON_BERT_BASE_CHAR_JA': 'https://file.hankcs.com/hanlp/mtl/npcmj_ud_kyoto_tok_pos_ner_dep_con_srl_bert_base_char_ja_20210914_133742.zip'}" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import hanlp\n", "hanlp.pretrained.mtl.ALL # MTL多任务,具体任务见模型名称,语种见名称最后一个字段或相应语料库" ] }, { "cell_type": "markdown", "metadata": { "id": "w0lm87NUsMwW" }, "source": [ "调用`hanlp.load`进行加载,模型会自动下载到本地缓存。自然语言处理分为许多任务,分词只是最初级的一个。与其每个任务单独创建一个模型,不如利用HanLP的联合模型一次性完成多个任务:" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "id": "6Evnxsa0sMwW", "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "HanLP = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH)" ] }, { "cell_type": "markdown", "metadata": { "id": "bPUHdNJ-sMwW" }, "source": [ "## 命名实体识别" ] }, { "cell_type": "markdown", "metadata": { "id": "wxctCigrTKu-" }, "source": [ "同时执行所有标准的命名实体识别:" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Zo08uquCTFSk", "outputId": "21be671b-ead0-43c9-cc3a-32c305d8be29" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\n", " \"tok/fine\": [\n", " [\"2021年\", \"HanLPv2.1\", \"为\", \"生产\", \"环境\", \"带来\", \"次\", \"世代\", \"最\", \"先进\", \"的\", \"多\", \"语种\", \"NLP\", \"技术\", \"。\"],\n", " [\"阿婆主\", \"来到\", \"北京\", \"立方庭\", \"参观\", \"自然\", \"语义\", \"科技\", \"公司\", \"。\"]\n", " ],\n", " \"ner/msra\": [\n", " [[\"2021年\", \"DATE\", 0, 1], [\"HanLPv2.1\", \"WWW\", 1, 2]],\n", " [[\"北京\", \"LOCATION\", 2, 3], [\"立方庭\", \"LOCATION\", 3, 4], [\"自然语义科技公司\", \"ORGANIZATION\", 5, 9]]\n", " ],\n", " \"ner/pku\": [\n", " [],\n", " [[\"北京立方庭\", \"ns\", 2, 4], [\"自然语义科技公司\", \"nt\", 5, 9]]\n", " ],\n", " \"ner/ontonotes\": [\n", " [[\"2021年\", \"DATE\", 0, 1], [\"HanLPv2.1\", \"ORG\", 1, 
2]],\n", " [[\"北京立方庭\", \"FAC\", 2, 4], [\"自然语义科技公司\", \"ORG\", 5, 9]]\n", " ]\n", "}\n" ] } ], "source": [ "print(HanLP(['2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', '阿婆主来到北京立方庭参观自然语义科技公司。'], tasks='ner*'))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "每个四元组表示`[命名实体, 类型标签, 起始下标, 终止下标]`,下标指的是命名实体在单词数组中的下标,单词数组默认为第一个以`tok`开头的数组。" ] }, { "cell_type": "markdown", "metadata": { "id": "cqEWnj_7p2Lf" }, "source": [ "任务越少,速度越快。如指定仅执行命名实体识别,默认MSRA标准:" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 572 }, "id": "BqEmDMGGOtk3", "outputId": "33790ca9-7013-456f-c1cb-e5ddce90a457" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Token \tNER Type \n", "─────────\t────────────────\n", "2021年 \t───►DATE \n", "HanLPv2.1\t───►WWW \n", "为 \t \n", "生产 \t \n", "环境 \t \n", "带来 \t \n", "次世代 \t───►DATE \n", "最 \t \n", "先进 \t \n", "的 \t \n", "多 \t \n", "语种 \t \n", "NLP \t \n", "技术 \t \n", "。 \t \n", "阿婆主 \t \n", "来到 \t \n", "北京 \t◄─┐ \n", "立方庭 \t◄─┴►ORGANIZATION\n", "参观 \t \n", "自然 \t◄─┐ \n", "语义 \t │ \n", "科技 \t ├►ORGANIZATION\n", "公司 \t◄─┘ \n", "。 \t \n" ] } ], "source": [ "HanLP('2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。阿婆主来到北京立方庭参观自然语义科技公司。', tasks='ner').pretty_print()" ] }, { "cell_type": "markdown", "metadata": { "id": "jj1Jk-2sPHYx" }, "source": [ "执行OntoNotes命名实体识别:" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 572 }, "id": "1goEC7znPNkI", "outputId": "2a97331c-a5fb-4d3c-ccf2-ce2186616c57", "pycharm": { "name": "#%%\n" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Token \tNER Type\n", "─────────\t────────\n", "2021年 \t───►DATE\n", "HanLPv2.1\t───►ORG \n", "为 \t \n", "生产 \t \n", "环境 \t \n", "带来 \t \n", "次世代 \t \n", "最 \t \n", "先进 \t \n", "的 \t \n", "多 \t \n", "语种 \t \n", "NLP \t \n", "技术 \t \n", "。 \t \n", "阿婆主 \t \n", "来到 \t \n", "北京 \t◄─┐ \n", "立方庭 \t◄─┴►ORG \n", "参观 
\t \n", "自然 \t◄─┐ \n", "语义 \t │ \n", "科技 \t ├►ORG \n", "公司 \t◄─┘ \n", "。 \t \n" ] } ], "source": [ "HanLP('2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。阿婆主来到北京立方庭参观自然语义科技公司。', tasks='ner/ontonotes').pretty_print()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### 注意\n", "Native API的输入单位限定为句子,需使用[多语种分句模型](https://github.com/hankcs/HanLP/blob/master/plugins/hanlp_demo/hanlp_demo/sent_split.py)或[基于规则的分句函数](https://github.com/hankcs/HanLP/blob/master/hanlp/utils/rules.py#L19)先行分句。RESTful同时支持全文、句子、已分词的句子。除此之外,RESTful和native两种API的语义设计完全一致,用户可以无缝互换。" ] }, { "cell_type": "markdown", "metadata": { "id": "P7CNTDBRsiYa" }, "source": [ "## 自定义词典" ] }, { "cell_type": "markdown", "metadata": { "id": "ZXtRTXlBsmtw" }, "source": [ "自定义词典是NER任务的成员变量,要操作自定义词典,先获取一个NER任务。以MSRA为例:" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "id": "QgY22h0AszsA" }, "outputs": [], "source": [ "ner = HanLP['ner/msra']" ] }, { "cell_type": "markdown", "metadata": { "id": "_6fPzuyps98H" }, "source": [ "### 白名单词典\n", "白名单词典中的词语会尽量被输出。当然,HanLP以统计为主,词典的优先级很低。" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 321 }, "id": "plNDyWhws5qg", "outputId": "7120d400-022c-42e9-fca9-febe3745d2c9" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Token\tNER Type \n", "─────\t───────────\n", "2021年\t───►DATE \n", "测试 \t \n", "高血压 \t \n", "是 \t \n", "138 \t───►INTEGER\n", ", \t \n", "时间 \t \n", "是 \t \n", "午饭 \t◄─┐ \n", "后 \t◄─┴►TIME \n", "2点45 \t───►TIME \n", ", \t \n", "低血压 \t \n", "是 \t \n", "44 \t───►INTEGER\n" ] } ], "source": [ "ner.dict_whitelist = {'午饭后': 'TIME'}\n", "doc = HanLP('2021年测试高血压是138,时间是午饭后2点45,低血压是44', tasks='ner/msra')\n", "doc.pretty_print()" ] }, { "cell_type": "markdown", "metadata": { "id": "aR_8TICmtw_E" }, "source": [ "### 强制词典\n", "如果你读过[《自然语言处理入门》](http://nlp.hankcs.com/book.php),你就会理解BMESO标注集,于是你可以直接干预统计模型预测的标签,拿到最高优先级的权限。" ] }, { "cell_type": "code", 
"execution_count": 10, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 268 }, "id": "sWPljj3stsEA", "outputId": "99c4c281-a5b6-46bb-dffd-c1722fee7aee" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "To\tNER Type \n", "──\t────────────\n", "他 \t \n", "在 \t \n", "浙江\t───►LOCATION\n", "金华\t───►LOCATION\n", "出生\t \n", ", \t \n", "他 \t \n", "的 \t \n", "名字\t \n", "叫 \t \n", "金华\t───►PERSON \n", "。 \t \n" ] } ], "source": [ "ner.dict_tags = {('名字', '叫', '金华'): ('O', 'O', 'S-PERSON')}\n", "HanLP('他在浙江金华出生,他的名字叫金华。', tasks='ner/msra').pretty_print()" ] }, { "cell_type": "markdown", "metadata": { "id": "fkTC0GFxtinZ" }, "source": [ "### 黑名单词典\n", "黑名单中的词语绝对不会被当做命名实体。" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 268 }, "id": "bIJpgdGauLJK", "outputId": "e74ec7ba-00fd-4958-d772-a1d1c40d1033" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "To\tNER Type \n", "──\t────────────\n", "他 \t \n", "在 \t \n", "浙江\t───►LOCATION\n", "金华\t \n", "出生\t \n", ", \t \n", "他 \t \n", "的 \t \n", "名字\t \n", "叫 \t \n", "金华\t \n", "。 \t \n" ] } ], "source": [ "ner.dict_blacklist = {'金华'}\n", "HanLP('他在浙江金华出生,他的名字叫金华。', tasks='ner/msra').pretty_print()" ] } ], "metadata": { "accelerator": "GPU", "colab": { "collapsed_sections": [], "name": "ner_mtl.ipynb", "provenance": [] }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.12" } }, "nbformat": 4, "nbformat_minor": 1 } ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/ner_restful.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": { "id": 
"WfGpInivS0fG" }, "source": [ "

点击下列图标在线运行HanLP

\n", "
\n", "\t\"Open\n", "\t\"Open\n", "
\n", "\n", "## 安装" ] }, { "cell_type": "markdown", "metadata": { "id": "IYwV-UkNNzFp" }, "source": [ "无论是Windows、Linux还是macOS,HanLP的安装只需一句话搞定:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "1Uf_u7ddMhUt" }, "outputs": [], "source": [ "pip install hanlp_restful -U" ] }, { "cell_type": "markdown", "metadata": { "id": "pp-1KqEOOJ4t" }, "source": [ "## 创建客户端" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "id": "0tmKBu7sNAXX" }, "outputs": [], "source": [ "from hanlp_restful import HanLPClient\n", "HanLP = HanLPClient('https://www.hanlp.com/api', auth=None, language='zh') # auth不填则匿名,zh中文,mul多语种" ] }, { "cell_type": "markdown", "metadata": { "id": "EmZDmLn9aGxG" }, "source": [ "#### 申请密钥\n", "由于服务器算力有限,匿名用户每分钟限2次调用。如果你需要更多调用次数,[建议申请免费公益API密钥auth](https://bbs.hanlp.com/t/hanlp2-1-restful-api/53)。" ] }, { "cell_type": "markdown", "metadata": { "id": "elA_UyssOut_" }, "source": [ "## 命名实体识别" ] }, { "cell_type": "markdown", "metadata": { "id": "wxctCigrTKu-" }, "source": [ "同时执行所有标准的命名实体识别:" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Zo08uquCTFSk", "outputId": "21be671b-ead0-43c9-cc3a-32c305d8be29" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\n", " \"tok/fine\": [\n", " [\"2021年\", \"HanLPv2.1\", \"为\", \"生产\", \"环境\", \"带来\", \"次\", \"世代\", \"最\", \"先进\", \"的\", \"多\", \"语种\", \"NLP\", \"技术\", \"。\"],\n", " [\"阿婆主\", \"来到\", \"北京\", \"立方庭\", \"参观\", \"自然\", \"语义\", \"科技\", \"公司\", \"。\"]\n", " ],\n", " \"ner/msra\": [\n", " [[\"2021年\", \"DATE\", 0, 1], [\"HanLPv2.1\", \"ORGANIZATION\", 1, 2]],\n", " [[\"北京立方庭\", \"LOCATION\", 2, 4], [\"自然语义科技公司\", \"ORGANIZATION\", 5, 9]]\n", " ],\n", " \"ner/pku\": [\n", " [],\n", " [[\"北京\", \"ns\", 2, 3], [\"立方庭\", \"ns\", 3, 4], [\"自然语义科技公司\", \"nt\", 5, 9]]\n", " ],\n", " \"ner/ontonotes\": [\n", " [[\"2021年\", \"DATE\", 0, 1], [\"次世代\", \"DATE\", 6, 8]],\n", " [[\"北京\", \"FAC\", 
2, 3], [\"立方庭\", \"LOC\", 3, 4], [\"自然语义科技公司\", \"ORG\", 5, 9]]\n", " ]\n", "}\n" ] } ], "source": [ "print(HanLP('2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。阿婆主来到北京立方庭参观自然语义科技公司。', tasks='ner*'))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "每个四元组表示`[命名实体, 类型标签, 起始下标, 终止下标]`,下标指的是命名实体在单词数组中的下标,单词数组默认为第一个以`tok`开头的数组。" ] }, { "cell_type": "markdown", "metadata": { "id": "cqEWnj_7p2Lf" }, "source": [ "任务越少,速度越快。如指定仅执行命名实体识别,默认MSRA标准:" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 572 }, "id": "BqEmDMGGOtk3", "outputId": "33790ca9-7013-456f-c1cb-e5ddce90a457" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Token \tNER Type \n", "─────────\t────────────────\n", "2021年 \t───►DATE \n", "HanLPv2.1\t───►ORGANIZATION\n", "为 \t \n", "生产 \t \n", "环境 \t \n", "带来 \t \n", "次 \t \n", "世代 \t \n", "最 \t \n", "先进 \t \n", "的 \t \n", "多 \t \n", "语种 \t \n", "NLP \t \n", "技术 \t \n", "。 \t \n", "\n", "Tok\tNER Type \n", "───\t────────────────\n", "阿婆主\t \n", "来到 \t \n", "北京 \t◄─┐ \n", "立方庭\t◄─┴►LOCATION \n", "参观 \t \n", "自然 \t◄─┐ \n", "语义 \t │ \n", "科技 \t ├►ORGANIZATION\n", "公司 \t◄─┘ \n", "。 \t \n" ] } ], "source": [ "HanLP('2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。阿婆主来到北京立方庭参观自然语义科技公司。', tasks='ner').pretty_print()" ] }, { "cell_type": "markdown", "metadata": { "id": "jj1Jk-2sPHYx" }, "source": [ "执行OntoNotes命名实体识别:" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 572 }, "id": "1goEC7znPNkI", "outputId": "2a97331c-a5fb-4d3c-ccf2-ce2186616c57" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Token \tNER Type\n", "─────────\t────────\n", "2021年 \t───►DATE\n", "HanLPv2.1\t \n", "为 \t \n", "生产 \t \n", "环境 \t \n", "带来 \t \n", "次 \t◄─┐ \n", "世代 \t◄─┴►DATE\n", "最 \t \n", "先进 \t \n", "的 \t \n", "多 \t \n", "语种 \t \n", "NLP \t \n", "技术 \t \n", "。 \t \n", "\n", "Tok\tNER Typ\n", 
"───\t───────\n", "阿婆主\t \n", "来到 \t \n", "北京 \t───►FAC\n", "立方庭\t───►LOC\n", "参观 \t \n", "自然 \t◄─┐ \n", "语义 \t │ \n", "科技 \t ├►ORG\n", "公司 \t◄─┘ \n", "。 \t \n" ] } ], "source": [ "HanLP('2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。阿婆主来到北京立方庭参观自然语义科技公司。', tasks='ner/ontonotes').pretty_print()" ] }, { "cell_type": "markdown", "metadata": { "id": "XOsWkOqQfzlr" }, "source": [ "为已分词的句子执行命名实体识别:" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 161 }, "id": "bLZSTbv_f3OA", "outputId": "6a0e1e76-f581-4fd1-8a78-ef97d9429e87" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Token \tNER Type \n", "────────\t────────────────\n", "阿婆主 \t \n", "来到 \t \n", "北京立方庭 \t───►LOCATION \n", "参观 \t \n", "自然语义科技公司\t───►ORGANIZATION\n", "。 \t \n" ] } ], "source": [ "HanLP(tokens=[[\"阿婆主\", \"来到\", \"北京立方庭\", \"参观\", \"自然语义科技公司\", \"。\"]], tasks='ner').pretty_print()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "accelerator": "GPU", "colab": { "collapsed_sections": [], "name": "ner_restful.ipynb", "provenance": [] }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.12" } }, "nbformat": 4, "nbformat_minor": 1 } ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/ner_stl.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": { "id": "WfGpInivS0fG" }, "source": [ "

点击下列图标在线运行HanLP

\n", "
\n", "\t\"Open\n", "\t\"Open\n", "
" ] }, { "cell_type": "markdown", "metadata": { "id": "IYwV-UkNNzFp" }, "source": [ "## 安装" ] }, { "cell_type": "markdown", "metadata": { "id": "1Uf_u7ddMhUt", "pycharm": { "name": "#%% md\n" } }, "source": [ "无论是Windows、Linux还是macOS,HanLP的安装只需一句话搞定:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "pp-1KqEOOJ4t", "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "!pip install hanlp -U" ] }, { "cell_type": "markdown", "metadata": { "id": "0tmKBu7sNAXX", "pycharm": { "name": "#%% md\n" } }, "source": [ "## 加载模型\n", "HanLP的工作流程是先加载模型,模型的标识符存储在`hanlp.pretrained`这个包中,按照NLP任务归类。" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "EmZDmLn9aGxG", "outputId": "0d55f7a1-3a4c-4170-e60f-da7473208e3f", "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/plain": [ "{'MSRA_NER_BERT_BASE_ZH': 'https://file.hankcs.com/hanlp/ner/ner_bert_base_msra_20211227_114712.zip',\n", " 'MSRA_NER_ALBERT_BASE_ZH': 'https://file.hankcs.com/hanlp/ner/msra_ner_albert_base_20211228_173323.zip',\n", " 'MSRA_NER_ELECTRA_SMALL_ZH': 'https://file.hankcs.com/hanlp/ner/msra_ner_electra_small_20210807_154832.zip',\n", " 'CONLL03_NER_BERT_BASE_CASED_EN': 'https://file.hankcs.com/hanlp/ner/ner_conll03_bert_base_cased_en_20211227_121443.zip'}" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import hanlp\n", "hanlp.pretrained.ner.ALL # 语种见名称最后一个字段或相应语料库" ] }, { "cell_type": "markdown", "metadata": { "id": "VDT-qmLyvDST" }, "source": [ "调用`hanlp.load`进行加载,模型会自动下载到本地缓存。" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "id": "Tzu5Qi-xvDST", "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "ner = hanlp.load(hanlp.pretrained.ner.MSRA_NER_ELECTRA_SMALL_ZH)" ] }, { "cell_type": "markdown", "metadata": { "id": "elA_UyssOut_" }, "source": [ "## 命名实体识别" ] }, { "cell_type": "markdown", "metadata": { "id": "wxctCigrTKu-" }, 
"source": [ "命名实体识别任务的输入为已分词的句子:" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Zo08uquCTFSk", "outputId": "864da076-7113-4685-e27a-1856e69bdd2a" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[[('2021年', 'DATE', 0, 1)], [('北京', 'LOCATION', 2, 3), ('立方庭', 'LOCATION', 3, 4), ('自然语义科技公司', 'ORGANIZATION', 5, 9)]]\n" ] } ], "source": [ "print(ner([[\"2021年\", \"HanLPv2.1\", \"为\", \"生产\", \"环境\", \"带来\", \"次\", \"世代\", \"最\", \"先进\", \"的\", \"多\", \"语种\", \"NLP\", \"技术\", \"。\"], [\"阿婆主\", \"来到\", \"北京\", \"立方庭\", \"参观\", \"自然\", \"语义\", \"科技\", \"公司\", \"。\"]], tasks='ner*'))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "每个四元组表示`[命名实体, 类型标签, 起始下标, 终止下标]`,下标指的是命名实体在单词数组中的下标。" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 自定义词典" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "自定义词典是NER任务的成员变量:" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "None\n" ] } ], "source": [ "print(ner.dict_whitelist)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 白名单词典\n", "白名单词典中的词语会尽量被输出。当然,HanLP以统计为主,词典的优先级很低。" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('2021年', 'DATE', 0, 1),\n", " ('138', 'INTEGER', 4, 5),\n", " ('午饭后', 'TIME', 8, 10),\n", " ('2点45', 'TIME', 10, 11),\n", " ('44', 'INTEGER', 14, 15)]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ner.dict_whitelist = {'午饭后': 'TIME'}\n", "ner(['2021年', '测试', '高血压', '是', '138', ',', '时间', '是', '午饭', '后', '2点45', ',', '低血压', '是', '44'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 强制词典\n", "如果你读过[《自然语言处理入门》](http://nlp.hankcs.com/book.php),你就会理解BMESO标注集,于是你可以直接干预统计模型预测的标签,拿到最高优先级的权限。" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, 
"outputs": [ { "data": { "text/plain": [ "[('浙江', 'LOCATION', 2, 3), ('金华', 'LOCATION', 3, 4), ('金华', 'PERSON', 10, 11)]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ner.dict_tags = {('名字', '叫', '金华'): ('O', 'O', 'S-PERSON')}\n", "ner(['他', '在', '浙江', '金华', '出生', ',', '他', '的', '名字', '叫', '金华', '。'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 黑名单词典\n", "黑名单中的词语绝对不会被当做命名实体。" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('浙江', 'LOCATION', 2, 3)]" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ner.dict_blacklist = {'金华'}\n", "ner(['他', '在', '浙江', '金华', '出生', ',', '他', '的', '名字', '叫', '金华', '。'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "accelerator": "GPU", "colab": { "collapsed_sections": [], "name": "ner_stl.ipynb", "provenance": [] }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.12" } }, "nbformat": 4, "nbformat_minor": 1 } ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/pos_mtl.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "view-in-github" }, "source": [ "

点击下列图标在线运行HanLP

\n", "
\n", "\t\"Open\n", "\t\"Open\n", "
" ] }, { "cell_type": "markdown", "metadata": { "id": "WfGpInivS0fG" }, "source": [ "## 安装" ] }, { "cell_type": "markdown", "metadata": { "id": "IYwV-UkNNzFp" }, "source": [ "无论是Windows、Linux还是macOS,HanLP的安装只需一句话搞定:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "1Uf_u7ddMhUt" }, "outputs": [], "source": [ "!pip install hanlp -U" ] }, { "cell_type": "markdown", "metadata": { "id": "pp-1KqEOOJ4t" }, "source": [ "## 加载模型\n", "HanLP的工作流程是先加载模型,模型的标识符存储在`hanlp.pretrained`这个包中,按照NLP任务归类。" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "4M7ka0K5OMWU", "outputId": "50ad002e-4363-46cd-8f5d-b6d6aad3e957" }, "outputs": [ { "data": { "text/plain": [ "{'OPEN_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH': 'https://file.hankcs.com/hanlp/mtl/open_tok_pos_ner_srl_dep_sdp_con_electra_small_20201223_035557.zip',\n", " 'OPEN_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH': 'https://file.hankcs.com/hanlp/mtl/open_tok_pos_ner_srl_dep_sdp_con_electra_base_20201223_201906.zip',\n", " 'CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH': 'https://file.hankcs.com/hanlp/mtl/close_tok_pos_ner_srl_dep_sdp_con_electra_small_20210111_124159.zip',\n", " 'CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH': 'https://file.hankcs.com/hanlp/mtl/close_tok_pos_ner_srl_dep_sdp_con_electra_base_20210111_124519.zip',\n", " 'CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ERNIE_GRAM_ZH': 'https://file.hankcs.com/hanlp/mtl/close_tok_pos_ner_srl_dep_sdp_con_ernie_gram_base_aug_20210904_145403.zip',\n", " 'UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_MT5_SMALL': 'https://file.hankcs.com/hanlp/mtl/ud_ontonotes_tok_pos_lem_fea_ner_srl_dep_sdp_con_mt5_small_20210228_123458.zip',\n", " 'UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_XLMR_BASE': 'https://file.hankcs.com/hanlp/mtl/ud_ontonotes_tok_pos_lem_fea_ner_srl_dep_sdp_con_xlm_base_20210602_211620.zip',\n", " 'NPCMJ_UD_KYOTO_TOK_POS_CON_BERT_BASE_CHAR_JA': 
'https://file.hankcs.com/hanlp/mtl/npcmj_ud_kyoto_tok_pos_ner_dep_con_srl_bert_base_char_ja_20210914_133742.zip'}" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import hanlp\n", "hanlp.pretrained.mtl.ALL # MTL多任务,具体任务见模型名称,语种见名称最后一个字段或相应语料库" ] }, { "cell_type": "markdown", "metadata": { "id": "BMW528wGNulM" }, "source": [ "调用`hanlp.load`进行加载,模型会自动下载到本地缓存。自然语言处理分为许多任务,分词只是最初级的一个。与其每个任务单独创建一个模型,不如利用HanLP的联合模型一次性完成多个任务:" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "id": "0tmKBu7sNAXX" }, "outputs": [], "source": [ "HanLP = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH)" ] }, { "cell_type": "markdown", "metadata": { "id": "elA_UyssOut_" }, "source": [ "## 词性标注\n", "任务越少,速度越快。如指定仅执行词性标注,默认CTB标准:" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 70 }, "id": "BqEmDMGGOtk3", "outputId": "5ad7fd22-651a-4403-d897-a9492eb15854" }, "outputs": [ { "data": { "text/html": [ "
HanLP/NR 为/P 生产/NN 环境/NN 带来/VV 次/JJ 世代/NN 最/AD 先进/JJ 的/DEG 多语种/NN NLP/NR 技术/NN 。/PU

我/PN 的/DEG 希望/NN 是/VC 希望/VV 张晚霞/NR 的/DEG 背影/NN 被/LB 晚霞/NN 映红/VV 。/PU
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "HanLP(['HanLP为生产环境带来次世代最先进的多语种NLP技术。', '我的希望是希望张晚霞的背影被晚霞映红。'], tasks='pos').pretty_print()" ] }, { "cell_type": "markdown", "metadata": { "id": "jj1Jk-2sPHYx" }, "source": [ "注意上面两个“希望”的词性各不相同,一个是名词另一个是动词。\n", "执行PKU词性标注:" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 70 }, "id": "1goEC7znPNkI", "outputId": "586afd5d-db0d-41bd-f7de-411f37062a8c" }, "outputs": [ { "data": { "text/html": [ "
HanLP/nx 为/p 生产/vn 环境/n 带来/v 次/b 世代/n 最/d 先进/a 的/u 多语种/n NLP/nx 技术/n 。/w

我/r 的/u 希望/n 是/v 希望/v 张晚霞/nr 的/u 背影/n 被/p 晚霞/n 映红/v 。/w
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "HanLP(['HanLP为生产环境带来次世代最先进的多语种NLP技术。', '我的希望是希望张晚霞的背影被晚霞映红。'], tasks='pos/pku').pretty_print()\n" ] }, { "cell_type": "markdown", "metadata": { "id": "wxctCigrTKu-" }, "source": [ "同时执行所有标准的词性标注:" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Zo08uquCTFSk", "outputId": "d2b3eb65-06e6-47a6-d954-04cae27d6c51" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\n", " \"tok/fine\": [\n", " [\"HanLP\", \"为\", \"生产\", \"环境\", \"带来\", \"次\", \"世代\", \"最\", \"先进\", \"的\", \"多语种\", \"NLP\", \"技术\", \"。\"],\n", " [\"我\", \"的\", \"希望\", \"是\", \"希望\", \"张晚霞\", \"的\", \"背影\", \"被\", \"晚霞\", \"映红\", \"。\"]\n", " ],\n", " \"pos/ctb\": [\n", " [\"NR\", \"P\", \"NN\", \"NN\", \"VV\", \"JJ\", \"NN\", \"AD\", \"JJ\", \"DEG\", \"NN\", \"NR\", \"NN\", \"PU\"],\n", " [\"PN\", \"DEG\", \"NN\", \"VC\", \"VV\", \"NR\", \"DEG\", \"NN\", \"LB\", \"NN\", \"VV\", \"PU\"]\n", " ],\n", " \"pos/pku\": [\n", " [\"nx\", \"p\", \"vn\", \"n\", \"v\", \"b\", \"n\", \"d\", \"a\", \"u\", \"n\", \"nx\", \"n\", \"w\"],\n", " [\"r\", \"u\", \"n\", \"v\", \"v\", \"nr\", \"u\", \"n\", \"p\", \"n\", \"v\", \"w\"]\n", " ],\n", " \"pos/863\": [\n", " [\"w\", \"p\", \"v\", \"n\", \"v\", \"a\", \"nt\", \"d\", \"a\", \"u\", \"n\", \"ws\", \"n\", \"w\"],\n", " [\"r\", \"u\", \"n\", \"vl\", \"v\", \"nh\", \"u\", \"n\", \"p\", \"n\", \"v\", \"w\"]\n", " ]\n", "}\n" ] } ], "source": [ "print(HanLP(['HanLP为生产环境带来次世代最先进的多语种NLP技术。', '我的希望是希望张晚霞的背影被晚霞映红。'], tasks='pos*'))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "以`pos`开头的字段为词性,以`tok`开头的第一个数组为单词,两者按下标一一对应。" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### 注意\n", "Native 
API的输入单位限定为句子,需使用[多语种分句模型](https://github.com/hankcs/HanLP/blob/master/plugins/hanlp_demo/hanlp_demo/sent_split.py)或[基于规则的分句函数](https://github.com/hankcs/HanLP/blob/master/hanlp/utils/rules.py#L19)先行分句。RESTful同时支持全文、句子、已分词的句子。除此之外,RESTful和native两种API的语义设计完全一致,用户可以无缝互换。" ] }, { "cell_type": "markdown", "metadata": { "id": "suUL042zPpLj" }, "source": [ "## 自定义词典\n", "自定义词典为词性标注任务的成员变量,要操作自定义词典,先获取一个词性标注任务,以CTB标准为例:" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "AzYShIssP6kq", "outputId": "640cefa5-1d6d-464b-81d2-83c66e2081f2" }, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pos = HanLP['pos/ctb']\n", "pos" ] }, { "cell_type": "markdown", "metadata": { "id": "1q4MUpgVQNlu" }, "source": [ "自定义单个词性:" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 35 }, "id": "2zZkH9tRQOoi", "outputId": "ed0bb8fe-2e68-4c58-e11e-ff6a0cc69ae4" }, "outputs": [ { "data": { "text/html": [ "
HanLP/state-of-the-art-tool 为/P 生产/NN 环境/NN 带来/VV 次/JJ 世代/NN 最/AD 先进/JJ 的/DEG 多语种/NN NLP/NR 技术/NN 。/PU
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "pos.dict_tags = {'HanLP': 'state-of-the-art-tool'}\n", "HanLP(\"HanLP为生产环境带来次世代最先进的多语种NLP技术。\", tasks='pos/ctb').pretty_print()" ] }, { "cell_type": "markdown", "metadata": { "id": "F-9gAeIVQUFG" }, "source": [ "根据上下文自定义词性:" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 35 }, "id": "F8M8cyBrQduw", "outputId": "16ef7f82-50ff-478f-c3ea-8e768b0cea31" }, "outputs": [ { "data": { "text/html": [ "
我/PN 的/补语成分 希望/名词 是/VC 希望/动词 张晚霞/NR 的/DEG 背影/NN 被/LB 晚霞/NN 映红/VV 。/PU
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "pos.dict_tags = {('的', '希望'): ('补语成分', '名词'), '希望': '动词'}\n", "HanLP(\"我的希望是希望张晚霞的背影被晚霞映红。\", tasks='pos/ctb').pretty_print()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "需要算法基础才能理解,初学者可参考[《自然语言处理入门》](http://nlp.hankcs.com/book.php)。" ] } ], "metadata": { "accelerator": "GPU", "colab": { "collapsed_sections": [], "include_colab_link": true, "name": "pos_mtl.ipynb", "provenance": [] }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.12" } }, "nbformat": 4, "nbformat_minor": 1 } ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/pos_restful.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": { "id": "WfGpInivS0fG" }, "source": [ "

点击下列图标在线运行HanLP

\n", "
\n", "\t\"Open\n", "\t\"Open\n", "
\n", "\n", "## 安装" ] }, { "cell_type": "markdown", "metadata": { "id": "IYwV-UkNNzFp" }, "source": [ "无论是Windows、Linux还是macOS,HanLP的安装只需一句话搞定:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "1Uf_u7ddMhUt" }, "outputs": [], "source": [ "pip install hanlp_restful -U" ] }, { "cell_type": "markdown", "metadata": { "id": "pp-1KqEOOJ4t" }, "source": [ "## 创建客户端" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "id": "0tmKBu7sNAXX" }, "outputs": [], "source": [ "from hanlp_restful import HanLPClient\n", "HanLP = HanLPClient('https://www.hanlp.com/api', auth=None, language='zh') # auth不填则匿名,zh中文,mul多语种" ] }, { "cell_type": "markdown", "metadata": { "id": "EmZDmLn9aGxG" }, "source": [ "#### 申请秘钥\n", "由于服务器算力有限,匿名用户每分钟限2次调用。如果你需要更多调用次数,[建议申请免费公益API秘钥auth](https://bbs.hanlp.com/t/hanlp2-1-restful-api/53)。" ] }, { "cell_type": "markdown", "metadata": { "id": "elA_UyssOut_" }, "source": [ "## 词性标注\n", "任务越少,速度越快。如指定仅执行词性标注,默认CTB标准:" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 70 }, "id": "BqEmDMGGOtk3", "outputId": "2a0d392f-b99a-4a18-fc7f-754e2abe2e34" }, "outputs": [ { "data": { "text/html": [ "
HanLP/NR 为/P 生产/NN 环境/NN 带来/VV 次世代/NN 最/AD 先进/JJ 的/DEG 多/CD 语种/NN NLP/NN 技术/NN 。/PU

我/PN 的/DEG 希望/NN 是/VC 希望/VV 张晚霞/NR 的/DEG 背影/NN 被/LB 晚霞/NN 映红/VV 。/PU
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "HanLP('HanLP为生产环境带来次世代最先进的多语种NLP技术。我的希望是希望张晚霞的背影被晚霞映红。', tasks='pos').pretty_print()" ] }, { "cell_type": "markdown", "metadata": { "id": "jj1Jk-2sPHYx" }, "source": [ "注意上面两个“希望”的词性各不相同,一个是名词另一个是动词。\n", "\n", "### 执行PKU词性标注" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 70 }, "id": "1goEC7znPNkI", "outputId": "7a3fde55-7577-49eb-92c8-48146aaa89d3" }, "outputs": [ { "data": { "text/html": [ "
HanLP/nx 为/p 生产/vn 环境/n 带来/v 次世代/n 最/d 先进/a 的/u 多/a 语种/n NLP/nx 技术/n 。/w

我/r 的/u 希望/n 是/v 希望/v 张晚霞/nr 的/u 背影/n 被/p 晚霞/n 映红/v 。/w
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "HanLP('HanLP为生产环境带来次世代最先进的多语种NLP技术。我的希望是希望张晚霞的背影被晚霞映红。', tasks='pos/pku').pretty_print()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 执行粗颗粒度分词和PKU词性标注" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
阿婆主/n 来到/v 北京立方庭/ns 参观/v 自然语义科技公司/n 。/w
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "HanLP('阿婆主来到北京立方庭参观自然语义科技公司。', tasks=['tok/coarse', 'pos/pku'], skip_tasks='tok/fine').pretty_print()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "举一反三,你可以指定其他pos标注集(ctb、863等)。用户有多聪明,HanLP就有多强大。" ] }, { "cell_type": "markdown", "metadata": { "id": "wxctCigrTKu-" }, "source": [ "### 同时执行所有标准的词性标注" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Zo08uquCTFSk", "outputId": "c6077f2d-7084-4f4b-a3bc-9aa9951704ea" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\n", " \"tok/fine\": [\n", " [\"HanLP\", \"为\", \"生产\", \"环境\", \"带来\", \"次世代\", \"最\", \"先进\", \"的\", \"多\", \"语种\", \"NLP\", \"技术\", \"。\"],\n", " [\"我\", \"的\", \"希望\", \"是\", \"希望\", \"张晚霞\", \"的\", \"背影\", \"被\", \"晚霞\", \"映红\", \"。\"]\n", " ],\n", " \"pos/ctb\": [\n", " [\"NR\", \"P\", \"NN\", \"NN\", \"VV\", \"NN\", \"AD\", \"JJ\", \"DEG\", \"CD\", \"NN\", \"NN\", \"NN\", \"PU\"],\n", " [\"PN\", \"DEG\", \"NN\", \"VC\", \"VV\", \"NR\", \"DEG\", \"NN\", \"LB\", \"NN\", \"VV\", \"PU\"]\n", " ],\n", " \"pos/pku\": [\n", " [\"nx\", \"p\", \"vn\", \"n\", \"v\", \"n\", \"d\", \"a\", \"u\", \"a\", \"n\", \"nx\", \"n\", \"w\"],\n", " [\"r\", \"u\", \"n\", \"v\", \"v\", \"nr\", \"u\", \"n\", \"p\", \"n\", \"v\", \"w\"]\n", " ],\n", " \"pos/863\": [\n", " [\"w\", \"p\", \"v\", \"n\", \"v\", \"n\", \"d\", \"a\", \"u\", \"a\", \"n\", \"w\", \"n\", \"w\"],\n", " [\"r\", \"u\", \"v\", \"vl\", \"v\", \"nh\", \"u\", \"n\", \"p\", \"n\", \"v\", \"w\"]\n", " ]\n", "}\n" ] } ], "source": [ "print(HanLP('HanLP为生产环境带来次世代最先进的多语种NLP技术。我的希望是希望张晚霞的背影被晚霞映红。', tasks='pos*'))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "以`pos`开头的字段为词性,以`tok`开头的第一个数组为单词,两者按下标一一对应。" ] }, { "cell_type": "markdown", "metadata": { "id": "XOsWkOqQfzlr" }, "source": [ "### 为已分词的句子执行词性标注" ] }, { "cell_type": "code", "execution_count": 
6, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 70 }, "id": "bLZSTbv_f3OA", "outputId": "111c0be9-bac6-4eee-d5bd-a972ffc34844" }, "outputs": [ { "data": { "text/html": [ "
HanLP/NR 为/P 生产环境/NN 带来/VV 次世代/NN 最/AD 先进/JJ 的/DEG 多语种/NN NLP/NN 技术/NN 。/PU

我/PN 的/DEG 希望/NN 是/VC 希望/VV 张晚霞/NR 的/DEG 背影/NN 被/LB 晚霞/NN 映红/VV 。/PU
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "HanLP(tokens=[\n", " [\"HanLP\", \"为\", \"生产环境\", \"带来\", \"次世代\", \"最\", \"先进\", \"的\", \"多语种\", \"NLP\", \"技术\", \"。\"],\n", " [\"我\", \"的\", \"希望\", \"是\", \"希望\", \"张晚霞\", \"的\", \"背影\", \"被\", \"晚霞\", \"映红\", \"。\"]\n", " ], tasks='pos').pretty_print()" ] } ], "metadata": { "accelerator": "GPU", "colab": { "collapsed_sections": [], "name": "pos_restful.ipynb", "provenance": [] }, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.13" } }, "nbformat": 4, "nbformat_minor": 1 } ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/pos_stl.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": { "id": "WfGpInivS0fG" }, "source": [ "

点击下列图标在线运行HanLP

\n", "
\n", "\t\"Open\n", "\t\"Open\n", "
\n", "\n", "## 安装" ] }, { "cell_type": "markdown", "metadata": { "id": "IYwV-UkNNzFp" }, "source": [ "无论是Windows、Linux还是macOS,HanLP的安装只需一句话搞定:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "1Uf_u7ddMhUt" }, "outputs": [], "source": [ "!pip install hanlp -U" ] }, { "cell_type": "markdown", "metadata": { "id": "pp-1KqEOOJ4t" }, "source": [ "## 加载模型\n", "HanLP的工作流程是先加载模型,模型的标识符存储在`hanlp.pretrained`这个包中,按照NLP任务归类。" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "4M7ka0K5OMWU", "outputId": "d74f0749-0587-454a-d7c9-7418d45ce534" }, "outputs": [ { "data": { "text/plain": [ "{'CTB5_POS_RNN': 'https://file.hankcs.com/hanlp/pos/ctb5_pos_rnn_20200113_235925.zip',\n", " 'CTB5_POS_RNN_FASTTEXT_ZH': 'https://file.hankcs.com/hanlp/pos/ctb5_pos_rnn_fasttext_20191230_202639.zip',\n", " 'CTB9_POS_ALBERT_BASE': 'https://file.hankcs.com/hanlp/pos/ctb9_albert_base_20211228_163935.zip',\n", " 'CTB9_POS_ELECTRA_SMALL_TF': 'https://file.hankcs.com/hanlp/pos/pos_ctb_electra_small_20211227_121341.zip',\n", " 'CTB9_POS_ELECTRA_SMALL': 'https://file.hankcs.com/hanlp/pos/pos_ctb_electra_small_20220215_111944.zip',\n", " 'CTB9_POS_RADICAL_ELECTRA_SMALL': 'https://file.hankcs.com/hanlp/pos/pos_ctb_radical_electra_small_20220215_111932.zip',\n", " 'C863_POS_ELECTRA_SMALL': 'https://file.hankcs.com/hanlp/pos/pos_863_electra_small_20220217_101958.zip',\n", " 'PKU_POS_ELECTRA_SMALL': 'https://file.hankcs.com/hanlp/pos/pos_pku_electra_small_20220217_142436.zip',\n", " 'PKU98_POS_ELECTRA_SMALL': 'https://file.hankcs.com/hanlp/pos/pos_pku_electra_small_20210808_125158.zip',\n", " 'PTB_POS_RNN_FASTTEXT_EN': 'https://file.hankcs.com/hanlp/pos/ptb_pos_rnn_fasttext_20200103_145337.zip'}" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import hanlp\n", "hanlp.pretrained.pos.ALL # 语种见名称最后一个字段或相应语料库" ] }, { "cell_type": "markdown", "metadata": { "id": 
"BMW528wGNulM" }, "source": [ "调用`hanlp.load`进行加载,模型会自动下载到本地缓存:" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "0tmKBu7sNAXX", "outputId": "df2de87b-27f5-4c72-8eb2-25ceefdd8270" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Downloading https://file.hankcs.com/hanlp/pos/ctb9_pos_electra_small_20220118_164341.zip to /root/.hanlp/pos/ctb9_pos_electra_small_20220118_164341.zip\n", "100% 43.6 MiB 21.2 MiB/s ETA: 0 s [=========================================]\n", "Decompressing /root/.hanlp/pos/ctb9_pos_electra_small_20220118_164341.zip to /root/.hanlp/pos\n", "Downloading https://file.hankcs.com/hanlp/transformers/electra_zh_small_20210706_125427.zip to /root/.hanlp/transformers/electra_zh_small_20210706_125427.zip\n", "100% 41.2 KiB 41.2 KiB/s ETA: 0 s [=========================================]\n", "Decompressing /root/.hanlp/transformers/electra_zh_small_20210706_125427.zip to /root/.hanlp/transformers\n" ] } ], "source": [ "pos = hanlp.load(hanlp.pretrained.pos.CTB9_POS_ELECTRA_SMALL)" ] }, { "cell_type": "markdown", "metadata": { "id": "elA_UyssOut_" }, "source": [ "## 词性标注\n", "词性标注任务的输入为已分词的一个或多个句子:" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "BqEmDMGGOtk3", "outputId": "936d439a-e1ff-4308-d2aa-775955558594" }, "outputs": [ { "data": { "text/plain": [ "['PN', 'DEG', 'NN', 'VC', 'VV', 'NR', 'DEG', 'NN', 'LB', 'NR', 'VV', 'PU']" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pos([\"我\", \"的\", \"希望\", \"是\", \"希望\", \"张晚霞\", \"的\", \"背影\", \"被\", \"晚霞\", \"映红\", \"。\"])" ] }, { "cell_type": "markdown", "metadata": { "id": "jj1Jk-2sPHYx" }, "source": [ "注意上面两个“希望”的词性各不相同,一个是名词另一个是动词。" ] }, { "cell_type": "markdown", "metadata": { "id": "suUL042zPpLj" }, "source": [ "## 自定义词典\n", "自定义词典为词性标注任务的成员变量,以CTB标准为例:" ] }, { "cell_type": "code", 
"execution_count": 8, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "AzYShIssP6kq", "outputId": "99b2607b-b618-4876-bbea-9f8c24859a85" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "None\n" ] } ], "source": [ "print(pos.dict_tags)" ] }, { "cell_type": "markdown", "metadata": { "id": "1q4MUpgVQNlu" }, "source": [ "自定义单个词性:" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "2zZkH9tRQOoi", "outputId": "4f92a907-10c3-4798-e7b9-914b8f577b2c" }, "outputs": [ { "data": { "text/plain": [ "['state-of-the-art-tool',\n", " 'P',\n", " 'NN',\n", " 'NN',\n", " 'VV',\n", " 'JJ',\n", " 'NN',\n", " 'AD',\n", " 'VA',\n", " 'DEC',\n", " 'NN',\n", " 'NN',\n", " 'NN',\n", " 'PU']" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pos.dict_tags = {'HanLP': 'state-of-the-art-tool'}\n", "pos([\"HanLP\", \"为\", \"生产\", \"环境\", \"带来\", \"次\", \"世代\", \"最\", \"先进\", \"的\", \"多语种\", \"NLP\", \"技术\", \"。\"])" ] }, { "cell_type": "markdown", "metadata": { "id": "F-9gAeIVQUFG" }, "source": [ "根据上下文自定义词性:" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "F8M8cyBrQduw", "outputId": "24fa7ff0-305d-4d71-925e-f369b1c50e96" }, "outputs": [ { "data": { "text/plain": [ "['PN', '补语成分', '名词', 'VC', '动词', 'NR', 'DEG', 'NN', 'LB', 'NR', 'VV', 'PU']" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pos.dict_tags = {('的', '希望'): ('补语成分', '名词'), '希望': '动词'}\n", "pos([\"我\", \"的\", \"希望\", \"是\", \"希望\", \"张晚霞\", \"的\", \"背影\", \"被\", \"晚霞\", \"映红\", \"。\"])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "需要算法基础才能理解,初学者可参考[《自然语言处理入门》](http://nlp.hankcs.com/book.php)。" ] } ], "metadata": { "accelerator": "GPU", "colab": { "collapsed_sections": [], "name": "pos_stl.ipynb", "provenance": [] }, "kernelspec": { 
"display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.12" } }, "nbformat": 4, "nbformat_minor": 1 } ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/sdp_mtl.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": { "id": "WfGpInivS0fG" }, "source": [ "

点击下列图标在线运行HanLP

\n", "
\n", "\t\"Open\n", "\t\"Open\n", "
\n", "\n", "## 安装" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "无论是Windows、Linux还是macOS,HanLP的安装只需一句话搞定:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "!pip install hanlp -U" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 加载模型\n", "HanLP的工作流程是先加载模型,模型的标识符存储在`hanlp.pretrained`这个包中,按照NLP任务归类。" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "id": "IYwV-UkNNzFp", "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/plain": [ "{'OPEN_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH': 'https://file.hankcs.com/hanlp/mtl/open_tok_pos_ner_srl_dep_sdp_con_electra_small_20201223_035557.zip',\n", " 'OPEN_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH': 'https://file.hankcs.com/hanlp/mtl/open_tok_pos_ner_srl_dep_sdp_con_electra_base_20201223_201906.zip',\n", " 'CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH': 'https://file.hankcs.com/hanlp/mtl/close_tok_pos_ner_srl_dep_sdp_con_electra_small_20210111_124159.zip',\n", " 'CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH': 'https://file.hankcs.com/hanlp/mtl/close_tok_pos_ner_srl_dep_sdp_con_electra_base_20210111_124519.zip',\n", " 'CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ERNIE_GRAM_ZH': 'https://file.hankcs.com/hanlp/mtl/close_tok_pos_ner_srl_dep_sdp_con_ernie_gram_base_aug_20210904_145403.zip',\n", " 'UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_MT5_SMALL': 'https://file.hankcs.com/hanlp/mtl/ud_ontonotes_tok_pos_lem_fea_ner_srl_dep_sdp_con_mt5_small_20210228_123458.zip',\n", " 'UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_XLMR_BASE': 'https://file.hankcs.com/hanlp/mtl/ud_ontonotes_tok_pos_lem_fea_ner_srl_dep_sdp_con_xlm_base_20210602_211620.zip',\n", " 'NPCMJ_UD_KYOTO_TOK_POS_CON_BERT_BASE_CHAR_JA': 'https://file.hankcs.com/hanlp/mtl/npcmj_ud_kyoto_tok_pos_ner_dep_con_srl_bert_base_char_ja_20210914_133742.zip'}" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } 
], "source": [ "import hanlp\n", "hanlp.pretrained.mtl.ALL # MTL多任务,具体任务见模型名称,语种见名称最后一个字段或相应语料库" ] }, { "cell_type": "markdown", "metadata": { "id": "1Uf_u7ddMhUt", "pycharm": { "name": "#%% md\n" } }, "source": [ "调用`hanlp.load`进行加载,模型会自动下载到本地缓存。自然语言处理分为许多任务,分词只是最初级的一个。与其每个任务单独创建一个模型,不如利用HanLP的联合模型一次性完成多个任务:" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "id": "pp-1KqEOOJ4t", "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "HanLP = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH)" ] }, { "cell_type": "markdown", "metadata": { "id": "elA_UyssOut_" }, "source": [ "## 语义依存分析\n", "任务越少,速度越快。如指定仅执行语义依存分析:" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 70 }, "id": "BqEmDMGGOtk3", "outputId": "2a0d392f-b99a-4a18-fc7f-754e2abe2e34" }, "outputs": [], "source": [ "doc = HanLP('2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', tasks='sdp')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "返回值为一个[Document](https://hanlp.hankcs.com/docs/api/common/document.html):" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\n", " \"tok/fine\": [\n", " \"2021年\",\n", " \"HanLPv2.1\",\n", " \"为\",\n", " \"生产\",\n", " \"环境\",\n", " \"带来\",\n", " \"次\",\n", " \"世代\",\n", " \"最\",\n", " \"先进\",\n", " \"的\",\n", " \"多\",\n", " \"语种\",\n", " \"NLP\",\n", " \"技术\",\n", " \"。\"\n", " ],\n", " \"sdp\": [\n", " [[6, \"Time\"]],\n", " [[6, \"Exp\"]],\n", " [[5, \"mPrep\"]],\n", " [[5, \"Desc\"]],\n", " [[6, \"Datv\"]],\n", " [[13, \"dDesc\"]],\n", " [[0, \"Root\"], [8, \"Desc\"], [13, \"Desc\"]],\n", " [[15, \"Time\"]],\n", " [[10, \"mDegr\"]],\n", " [[15, \"Desc\"]],\n", " [[10, \"mAux\"]],\n", " [[8, \"Quan\"], [13, \"Quan\"]],\n", " [[15, \"Desc\"]],\n", " [[15, \"Nmod\"]],\n", " [[6, \"Pat\"]],\n", " [[6, \"mPunc\"]]\n", " ]\n", "}\n" ] } ], "source": [ "print(doc)" ] 
}, { "cell_type": "markdown", "metadata": {}, "source": [ "`doc['sdp']`字段代表语义依存图的数组格式,数组中第`i`个子数组代表第`i`个单词的语义依存关系,子数组中每个二元组的格式为`[中心词的下标, 与中心词的语义依存关系]`。每个单词的语义依存关系可能有零个、一个或多个(任意数量)。" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "转换为[CoNLLSentence](https://hanlp.hankcs.com/docs/api/common/conll.html#hanlp_common.conll.CoNLLSentence)格式更容易观察:" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1\t2021年\t_\t_\t_\t_\t_\t_\t6:Time\t_\n", "2\tHanLPv2.1\t_\t_\t_\t_\t_\t_\t6:Exp\t_\n", "3\t为\t_\t_\t_\t_\t_\t_\t5:mPrep\t_\n", "4\t生产\t_\t_\t_\t_\t_\t_\t5:Desc\t_\n", "5\t环境\t_\t_\t_\t_\t_\t_\t6:Datv\t_\n", "6\t带来\t_\t_\t_\t_\t_\t_\t13:dDesc\t_\n", "7\t次\t_\t_\t_\t_\t_\t_\t0:Root|8:Desc|13:Desc\t_\n", "8\t世代\t_\t_\t_\t_\t_\t_\t15:Time\t_\n", "9\t最\t_\t_\t_\t_\t_\t_\t10:mDegr\t_\n", "10\t先进\t_\t_\t_\t_\t_\t_\t15:Desc\t_\n", "11\t的\t_\t_\t_\t_\t_\t_\t10:mAux\t_\n", "12\t多\t_\t_\t_\t_\t_\t_\t8:Quan|13:Quan\t_\n", "13\t语种\t_\t_\t_\t_\t_\t_\t15:Desc\t_\n", "14\tNLP\t_\t_\t_\t_\t_\t_\t15:Nmod\t_\n", "15\t技术\t_\t_\t_\t_\t_\t_\t6:Pat\t_\n", "16\t。\t_\t_\t_\t_\t_\t_\t6:mPunc\t_\n" ] } ], "source": [ "print(doc.to_conll())" ] }, { "cell_type": "markdown", "metadata": { "id": "XOsWkOqQfzlr" }, "source": [ "为已分词的句子执行语义依存分析:" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 70 }, "id": "bLZSTbv_f3OA", "outputId": "111c0be9-bac6-4eee-d5bd-a972ffc34844" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1\tHanLP\t_\t_\t_\t_\t_\t_\t5:Exp\t_\n", "2\t为\t_\t_\t_\t_\t_\t_\t4:mPrep\t_\n", "3\t生产\t_\t_\t_\t_\t_\t_\t4:Desc\t_\n", "4\t环境\t_\t_\t_\t_\t_\t_\t5:Datv\t_\n", "5\t带来\t_\t_\t_\t_\t_\t_\t0:Root\t_\n", "6\t次世代\t_\t_\t_\t_\t_\t_\t12:Time\t_\n", "7\t最\t_\t_\t_\t_\t_\t_\t8:mDegr\t_\n", "8\t先进\t_\t_\t_\t_\t_\t_\t12:Desc\t_\n", "9\t的\t_\t_\t_\t_\t_\t_\t8:mAux\t_\n", "10\t多语种\t_\t_\t_\t_\t_\t_\t12:Desc\t_\n", 
"11\tNLP\t_\t_\t_\t_\t_\t_\t12:Nmod\t_\n", "12\t技术\t_\t_\t_\t_\t_\t_\t5:Pat\t_\n", "13\t。\t_\t_\t_\t_\t_\t_\t5:mPunc\t_\n", "\n", "1\t我\t_\t_\t_\t_\t_\t_\t3:Poss\t_\n", "2\t的\t_\t_\t_\t_\t_\t_\t1:mAux\t_\n", "3\t希望\t_\t_\t_\t_\t_\t_\t4:Exp\t_\n", "4\t是\t_\t_\t_\t_\t_\t_\t11:mMod\t_\n", "5\t希望\t_\t_\t_\t_\t_\t_\t4:dClas\t_\n", "6\t张晚霞\t_\t_\t_\t_\t_\t_\t8:Poss\t_\n", "7\t的\t_\t_\t_\t_\t_\t_\t6:mAux\t_\n", "8\t背影\t_\t_\t_\t_\t_\t_\t11:Pat\t_\n", "9\t被\t_\t_\t_\t_\t_\t_\t10:mPrep\t_\n", "10\t晚霞\t_\t_\t_\t_\t_\t_\t11:Exp\t_\n", "11\t映红\t_\t_\t_\t_\t_\t_\t5:dCont\t_\n", "12\t。\t_\t_\t_\t_\t_\t_\t4:mPunc\t_\n" ] } ], "source": [ "print(HanLP([\n", " [\"HanLP\", \"为\", \"生产\", \"环境\", \"带来\", \"次世代\", \"最\", \"先进\", \"的\", \"多语种\", \"NLP\", \"技术\", \"。\"],\n", " [\"我\", \"的\", \"希望\", \"是\", \"希望\", \"张晚霞\", \"的\", \"背影\", \"被\", \"晚霞\", \"映红\", \"。\"]\n", " ], tasks='sdp', skip_tasks='tok*').to_conll())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### 注意\n", "Native API的输入单位限定为句子,需使用[多语种分句模型](https://github.com/hankcs/HanLP/blob/master/plugins/hanlp_demo/hanlp_demo/sent_split.py)或[基于规则的分句函数](https://github.com/hankcs/HanLP/blob/master/hanlp/utils/rules.py#L19)先行分句。RESTful同时支持全文、句子、已分词的句子。除此之外,RESTful和native两种API的语义设计完全一致,用户可以无缝互换。" ] } ], "metadata": { "accelerator": "GPU", "colab": { "collapsed_sections": [], "name": "sdp_mtl.ipynb", "provenance": [] }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.12" } }, "nbformat": 4, "nbformat_minor": 1 } ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/sdp_restful.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": { "id": "WfGpInivS0fG" }, 
"source": [ "

点击下列图标在线运行HanLP

\n", "
\n", "\t\"Open\n", "\t\"Open\n", "
\n", "\n", "## 安装" ] }, { "cell_type": "markdown", "metadata": { "id": "IYwV-UkNNzFp" }, "source": [ "无论是Windows、Linux还是macOS,HanLP的安装只需一句话搞定:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "1Uf_u7ddMhUt" }, "outputs": [], "source": [ "!pip install hanlp_restful -U" ] }, { "cell_type": "markdown", "metadata": { "id": "pp-1KqEOOJ4t" }, "source": [ "## 创建客户端" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "id": "0tmKBu7sNAXX" }, "outputs": [], "source": [ "from hanlp_restful import HanLPClient\n", "HanLP = HanLPClient('https://www.hanlp.com/api', auth=None, language='zh') # auth不填则匿名,zh中文,mul多语种" ] }, { "cell_type": "markdown", "metadata": { "id": "EmZDmLn9aGxG" }, "source": [ "#### 申请秘钥\n", "由于服务器算力有限,匿名用户每分钟限2次调用。如果你需要更多调用次数,[建议申请免费公益API秘钥auth](https://bbs.hanlp.com/t/hanlp2-1-restful-api/53)。" ] }, { "cell_type": "markdown", "metadata": { "id": "elA_UyssOut_" }, "source": [ "## 语义依存分析\n", "任务越少,速度越快。如指定仅执行语义依存分析:" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 70 }, "id": "BqEmDMGGOtk3", "outputId": "2a0d392f-b99a-4a18-fc7f-754e2abe2e34" }, "outputs": [], "source": [ "doc = HanLP('2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', tasks='sdp')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "返回值为一个[Document](https://hanlp.hankcs.com/docs/api/common/document.html):" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\n", " \"tok/fine\": [\n", " [\"2021年\", \"HanLPv2.1\", \"为\", \"生产\", \"环境\", \"带来\", \"次\", \"世代\", \"最\", \"先进\", \"的\", \"多\", \"语种\", \"NLP\", \"技术\", \"。\"]\n", " ],\n", " \"sdp\": [\n", " [[[6, \"Time\"]], [[6, \"Agt\"]], [[5, \"mPrep\"]], [[5, \"Desc\"]], [[6, \"Datv\"]], [[0, \"Root\"]], [[8, \"Qp\"]], [[15, \"TDur\"]], [[10, \"mDegr\"]], [[15, \"Desc\"]], [[10, \"mAux\"]], [[13, \"Quan\"]], [[15, \"Desc\"]], [[15, \"Nmod\"]], [[6, 
\"Cont\"]], [[6, \"mPunc\"]]]\n", " ]\n", "}\n" ] } ], "source": [ "print(doc)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "`doc['sdp']`字段代表语义依存图的数组格式,数组中第`i`个子数组代表第`i`个单词的语义依存关系,子数组中每个二元组的格式为`[中心词的下标, 与中心词的语义依存关系]`。每个单词的语义依存关系可能有零个、一个或多个(任意数量)。" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "转换为[CoNLLSentence](https://hanlp.hankcs.com/docs/api/common/conll.html#hanlp_common.conll.CoNLLSentence)格式更容易观察:" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1\t2021年\t_\t_\t_\t_\t_\t_\t6:Time\t_\n", "2\tHanLPv2.1\t_\t_\t_\t_\t_\t_\t6:Agt\t_\n", "3\t为\t_\t_\t_\t_\t_\t_\t5:mPrep\t_\n", "4\t生产\t_\t_\t_\t_\t_\t_\t5:Desc\t_\n", "5\t环境\t_\t_\t_\t_\t_\t_\t6:Datv\t_\n", "6\t带来\t_\t_\t_\t_\t_\t_\t0:Root\t_\n", "7\t次\t_\t_\t_\t_\t_\t_\t8:Qp\t_\n", "8\t世代\t_\t_\t_\t_\t_\t_\t15:TDur\t_\n", "9\t最\t_\t_\t_\t_\t_\t_\t10:mDegr\t_\n", "10\t先进\t_\t_\t_\t_\t_\t_\t15:Desc\t_\n", "11\t的\t_\t_\t_\t_\t_\t_\t10:mAux\t_\n", "12\t多\t_\t_\t_\t_\t_\t_\t13:Quan\t_\n", "13\t语种\t_\t_\t_\t_\t_\t_\t15:Desc\t_\n", "14\tNLP\t_\t_\t_\t_\t_\t_\t15:Nmod\t_\n", "15\t技术\t_\t_\t_\t_\t_\t_\t6:Cont\t_\n", "16\t。\t_\t_\t_\t_\t_\t_\t6:mPunc\t_\n" ] } ], "source": [ "print(doc.to_conll())" ] }, { "cell_type": "markdown", "metadata": { "id": "XOsWkOqQfzlr" }, "source": [ "为已分词的句子执行语义依存分析:" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 70 }, "id": "bLZSTbv_f3OA", "outputId": "111c0be9-bac6-4eee-d5bd-a972ffc34844" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1\tHanLP\t_\t_\t_\t_\t_\t_\t5:Agt\t_\n", "2\t为\t_\t_\t_\t_\t_\t_\t4:mPrep\t_\n", "3\t生产\t_\t_\t_\t_\t_\t_\t4:Desc\t_\n", "4\t环境\t_\t_\t_\t_\t_\t_\t5:Datv\t_\n", "5\t带来\t_\t_\t_\t_\t_\t_\t0:Root\t_\n", "6\t次世代\t_\t_\t_\t_\t_\t_\t12:Time\t_\n", "7\t最\t_\t_\t_\t_\t_\t_\t8:mDegr\t_\n", "8\t先进\t_\t_\t_\t_\t_\t_\t12:Desc\t_\n", 
"9\t的\t_\t_\t_\t_\t_\t_\t8:mAux\t_\n", "10\t多语种\t_\t_\t_\t_\t_\t_\t12:Desc\t_\n", "11\tNLP\t_\t_\t_\t_\t_\t_\t12:Nmod\t_\n", "12\t技术\t_\t_\t_\t_\t_\t_\t5:Cont\t_\n", "13\t。\t_\t_\t_\t_\t_\t_\t5:mPunc\t_\n", "\n", "1\t我\t_\t_\t_\t_\t_\t_\t3:Poss\t_\n", "2\t的\t_\t_\t_\t_\t_\t_\t1:mAux\t_\n", "3\t希望\t_\t_\t_\t_\t_\t_\t0:Root|4:Exp\t_\n", "4\t是\t_\t_\t_\t_\t_\t_\t5:mMod\t_\n", "5\t希望\t_\t_\t_\t_\t_\t_\t4:dClas\t_\n", "6\t张晚霞\t_\t_\t_\t_\t_\t_\t8:Poss\t_\n", "7\t的\t_\t_\t_\t_\t_\t_\t6:mAux\t_\n", "8\t背影\t_\t_\t_\t_\t_\t_\t11:Pat\t_\n", "9\t被\t_\t_\t_\t_\t_\t_\t10:mPrep\t_\n", "10\t晚霞\t_\t_\t_\t_\t_\t_\t11:Exp\t_\n", "11\t映红\t_\t_\t_\t_\t_\t_\t5:dCont\t_\n", "12\t。\t_\t_\t_\t_\t_\t_\t5:mPunc\t_\n" ] } ], "source": [ "print(HanLP(tokens=[\n", " [\"HanLP\", \"为\", \"生产\", \"环境\", \"带来\", \"次世代\", \"最\", \"先进\", \"的\", \"多语种\", \"NLP\", \"技术\", \"。\"],\n", " [\"我\", \"的\", \"希望\", \"是\", \"希望\", \"张晚霞\", \"的\", \"背影\", \"被\", \"晚霞\", \"映红\", \"。\"]\n", " ], tasks='sdp').to_conll())" ] } ], "metadata": { "accelerator": "GPU", "colab": { "collapsed_sections": [], "name": "sdp_restful.ipynb", "provenance": [] }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.12" } }, "nbformat": 4, "nbformat_minor": 1 } ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/sdp_stl.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": { "id": "WfGpInivS0fG" }, "source": [ "

点击下列图标在线运行HanLP

\n", "
\n", "\t\"Open\n", "\t\"Open\n", "
\n", "\n", "## 安装" ] }, { "cell_type": "markdown", "metadata": { "id": "nf9TgeCTC0OT" }, "source": [ "无论是Windows、Linux还是macOS,HanLP的安装只需一句话搞定:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "jaW4eu6kC0OU", "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "!pip install hanlp -U" ] }, { "cell_type": "markdown", "metadata": { "id": "_xI_bLAaC0OU" }, "source": [ "## 加载模型\n", "HanLP的工作流程是先加载模型,模型的标识符存储在`hanlp.pretrained`这个包中,按照NLP任务归类。" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "IYwV-UkNNzFp", "outputId": "54065443-9b0a-444c-f6c0-c701bc86400b", "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/plain": [ "{'SEMEVAL16_NEWS_BIAFFINE_ZH': 'https://file.hankcs.com/hanlp/sdp/semeval16-news-biaffine_20191231_235407.zip',\n", " 'SEMEVAL16_TEXT_BIAFFINE_ZH': 'https://file.hankcs.com/hanlp/sdp/semeval16-text-biaffine_20200101_002257.zip',\n", " 'SEMEVAL16_ALL_ELECTRA_SMALL_ZH': 'https://file.hankcs.com/hanlp/sdp/semeval16_sdp_electra_small_20220208_122026.zip',\n", " 'SEMEVAL15_PAS_BIAFFINE_EN': 'https://file.hankcs.com/hanlp/sdp/semeval15_biaffine_pas_20200103_152405.zip',\n", " 'SEMEVAL15_PSD_BIAFFINE_EN': 'https://file.hankcs.com/hanlp/sdp/semeval15_biaffine_psd_20200106_123009.zip',\n", " 'SEMEVAL15_DM_BIAFFINE_EN': 'https://file.hankcs.com/hanlp/sdp/semeval15_biaffine_dm_20200106_122808.zip'}" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import hanlp\n", "hanlp.pretrained.sdp.ALL # 语种见名称最后一个字段或相应语料库" ] }, { "cell_type": "markdown", "metadata": { "id": "1Uf_u7ddMhUt", "pycharm": { "name": "#%% md\n" } }, "source": [ "调用`hanlp.load`进行加载,模型会自动下载到本地缓存。" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "id": "pp-1KqEOOJ4t", "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "sdp = hanlp.load('SEMEVAL16_ALL_ELECTRA_SMALL_ZH')" ] }, { "cell_type": "markdown", "metadata": { "id": 
"elA_UyssOut_" }, "source": [ "## 语义依存分析\n", "语义依存分析的输入为已分词的一个或多个句子:" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "id": "BqEmDMGGOtk3" }, "outputs": [], "source": [ "graph = sdp([\"2021年\", \"HanLPv2.1\", \"为\", \"生产\", \"环境\", \"带来\", \"次\", \"世代\", \"最\", \"先进\", \"的\", \"多\", \"语种\", \"NLP\", \"技术\", \"。\"])" ] }, { "cell_type": "markdown", "metadata": { "id": "SwaPn1hjC0OW" }, "source": [ "返回对象为[CoNLLSentence](https://hanlp.hankcs.com/docs/api/common/conll.html#hanlp_common.conll.CoNLLSentence)类型:" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "egpWwHKxC0OX", "outputId": "f7c77687-dd75-4fa2-dbd2-be6bda8a3fff" }, "outputs": [ { "data": { "text/plain": [ "[{'id': 1,\n", " 'form': '2021年',\n", " 'upos': None,\n", " 'xpos': None,\n", " 'head': None,\n", " 'deprel': None,\n", " 'lemma': None,\n", " 'feats': None,\n", " 'deps': [(6, 'Time')],\n", " 'misc': None},\n", " {'id': 2,\n", " 'form': 'HanLPv2.1',\n", " 'upos': None,\n", " 'xpos': None,\n", " 'head': None,\n", " 'deprel': None,\n", " 'lemma': None,\n", " 'feats': None,\n", " 'deps': [(6, 'Exp')],\n", " 'misc': None},\n", " {'id': 3,\n", " 'form': '为',\n", " 'upos': None,\n", " 'xpos': None,\n", " 'head': None,\n", " 'deprel': None,\n", " 'lemma': None,\n", " 'feats': None,\n", " 'deps': [(5, 'mPrep')],\n", " 'misc': None},\n", " {'id': 4,\n", " 'form': '生产',\n", " 'upos': None,\n", " 'xpos': None,\n", " 'head': None,\n", " 'deprel': None,\n", " 'lemma': None,\n", " 'feats': None,\n", " 'deps': [(5, 'Desc')],\n", " 'misc': None},\n", " {'id': 5,\n", " 'form': '环境',\n", " 'upos': None,\n", " 'xpos': None,\n", " 'head': None,\n", " 'deprel': None,\n", " 'lemma': None,\n", " 'feats': None,\n", " 'deps': [(6, 'Datv')],\n", " 'misc': None},\n", " {'id': 6,\n", " 'form': '带来',\n", " 'upos': None,\n", " 'xpos': None,\n", " 'head': None,\n", " 'deprel': None,\n", " 'lemma': None,\n", " 'feats': None,\n", " 'deps': [(2, 
'eSucc')],\n", " 'misc': None},\n", " {'id': 7,\n", " 'form': '次',\n", " 'upos': None,\n", " 'xpos': None,\n", " 'head': None,\n", " 'deprel': None,\n", " 'lemma': None,\n", " 'feats': None,\n", " 'deps': [(8, 'Desc'), (13, 'Desc')],\n", " 'misc': None},\n", " {'id': 8,\n", " 'form': '世代',\n", " 'upos': None,\n", " 'xpos': None,\n", " 'head': None,\n", " 'deprel': None,\n", " 'lemma': None,\n", " 'feats': None,\n", " 'deps': [(0, 'Root'), (15, 'Time')],\n", " 'misc': None},\n", " {'id': 9,\n", " 'form': '最',\n", " 'upos': None,\n", " 'xpos': None,\n", " 'head': None,\n", " 'deprel': None,\n", " 'lemma': None,\n", " 'feats': None,\n", " 'deps': [(10, 'mDegr')],\n", " 'misc': None},\n", " {'id': 10,\n", " 'form': '先进',\n", " 'upos': None,\n", " 'xpos': None,\n", " 'head': None,\n", " 'deprel': None,\n", " 'lemma': None,\n", " 'feats': None,\n", " 'deps': [(15, 'Desc')],\n", " 'misc': None},\n", " {'id': 11,\n", " 'form': '的',\n", " 'upos': None,\n", " 'xpos': None,\n", " 'head': None,\n", " 'deprel': None,\n", " 'lemma': None,\n", " 'feats': None,\n", " 'deps': [(10, 'mAux')],\n", " 'misc': None},\n", " {'id': 12,\n", " 'form': '多',\n", " 'upos': None,\n", " 'xpos': None,\n", " 'head': None,\n", " 'deprel': None,\n", " 'lemma': None,\n", " 'feats': None,\n", " 'deps': [(10, 'mDegr'), (13, 'Quan')],\n", " 'misc': None},\n", " {'id': 13,\n", " 'form': '语种',\n", " 'upos': None,\n", " 'xpos': None,\n", " 'head': None,\n", " 'deprel': None,\n", " 'lemma': None,\n", " 'feats': None,\n", " 'deps': [(15, 'Desc')],\n", " 'misc': None},\n", " {'id': 14,\n", " 'form': 'NLP',\n", " 'upos': None,\n", " 'xpos': None,\n", " 'head': None,\n", " 'deprel': None,\n", " 'lemma': None,\n", " 'feats': None,\n", " 'deps': [(15, 'Desc')],\n", " 'misc': None},\n", " {'id': 15,\n", " 'form': '技术',\n", " 'upos': None,\n", " 'xpos': None,\n", " 'head': None,\n", " 'deprel': None,\n", " 'lemma': None,\n", " 'feats': None,\n", " 'deps': [(6, 'Pat')],\n", " 'misc': None},\n", " {'id': 16,\n", " 
'form': '。',\n", " 'upos': None,\n", " 'xpos': None,\n", " 'head': None,\n", " 'deprel': None,\n", " 'lemma': None,\n", " 'feats': None,\n", " 'deps': [(6, 'mPunc')],\n", " 'misc': None}]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "graph" ] }, { "cell_type": "markdown", "metadata": { "id": "kq_j5TLFC0OX" }, "source": [ "打印为CoNLL格式:" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "isJhzYyIC0OX", "outputId": "683c8489-dffc-426e-f95b-e91dfb373260" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1\t2021年\t_\t_\t_\t_\t_\t_\t6:Time\t_\n", "2\tHanLPv2.1\t_\t_\t_\t_\t_\t_\t6:Exp\t_\n", "3\t为\t_\t_\t_\t_\t_\t_\t5:mPrep\t_\n", "4\t生产\t_\t_\t_\t_\t_\t_\t5:Desc\t_\n", "5\t环境\t_\t_\t_\t_\t_\t_\t6:Datv\t_\n", "6\t带来\t_\t_\t_\t_\t_\t_\t2:eSucc\t_\n", "7\t次\t_\t_\t_\t_\t_\t_\t8:Desc|13:Desc\t_\n", "8\t世代\t_\t_\t_\t_\t_\t_\t0:Root|15:Time\t_\n", "9\t最\t_\t_\t_\t_\t_\t_\t10:mDegr\t_\n", "10\t先进\t_\t_\t_\t_\t_\t_\t15:Desc\t_\n", "11\t的\t_\t_\t_\t_\t_\t_\t10:mAux\t_\n", "12\t多\t_\t_\t_\t_\t_\t_\t10:mDegr|13:Quan\t_\n", "13\t语种\t_\t_\t_\t_\t_\t_\t15:Desc\t_\n", "14\tNLP\t_\t_\t_\t_\t_\t_\t15:Desc\t_\n", "15\t技术\t_\t_\t_\t_\t_\t_\t6:Pat\t_\n", "16\t。\t_\t_\t_\t_\t_\t_\t6:mPunc\t_\n" ] } ], "source": [ "print(graph)" ] } ], "metadata": { "colab": { "collapsed_sections": [], "name": "sdp_stl.ipynb", "provenance": [] }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.12" } }, "nbformat": 4, "nbformat_minor": 1 } ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/sentiment_restful.ipynb ================================================ { 
"cells": [ { "cell_type": "markdown", "metadata": { "id": "WfGpInivS0fG" }, "source": [ "

点击下列图标在线运行HanLP

\n", "
\n", "\t\"Open\n", "\t\"Open\n", "
\n", "\n", "## 安装" ] }, { "cell_type": "markdown", "metadata": { "id": "nf9TgeCTC0OT" }, "source": [ "无论是Windows、Linux还是macOS,HanLP的安装只需一句话搞定:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "jaW4eu6kC0OU", "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "!pip install hanlp_restful -U" ] }, { "cell_type": "markdown", "metadata": { "id": "_xI_bLAaC0OU" }, "source": [ "## 创建客户端" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "IYwV-UkNNzFp", "outputId": "54065443-9b0a-444c-f6c0-c701bc86400b", "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "from hanlp_restful import HanLPClient\n", "HanLP = HanLPClient('https://www.hanlp.com/api', auth=None, language='zh') # auth不填则匿名,zh中文,mul多语种" ] }, { "cell_type": "markdown", "metadata": { "id": "1Uf_u7ddMhUt", "pycharm": { "name": "#%% md\n" } }, "source": [ "#### 申请秘钥\n", "由于服务器算力有限,匿名用户每分钟限2次调用。如果你需要更多调用次数,[建议申请免费公益API秘钥auth](https://bbs.hanlp.com/t/hanlp2-1-restful-api/53)。" ] }, { "cell_type": "markdown", "metadata": { "id": "elA_UyssOut_" }, "source": [ "## 情感分析\n", "情感分析任务的输入为文档:" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "id": "BqEmDMGGOtk3" }, "outputs": [ { "data": { "text/plain": [ "0.8418035507202148" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "HanLP.sentiment_analysis('2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。')" ] }, { "cell_type": "markdown", "metadata": { "id": "SwaPn1hjC0OW" }, "source": [ "返回值为文档的情感极性,表示为$[-1, +1]$之间的数值,数值的正负代表正负面情绪,数值的绝对值代表情感的强烈程度。" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "egpWwHKxC0OX", "outputId": "f7c77687-dd75-4fa2-dbd2-be6bda8a3fff" }, "outputs": [ { "data": { "text/plain": [ "0.8327275514602661" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ 
"HanLP.sentiment_analysis('看哭了。感人肺腑。')" ] }, { "cell_type": "markdown", "metadata": { "id": "kq_j5TLFC0OX" }, "source": [ "注意返回值的符号代表正负情感:" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "isJhzYyIC0OX", "outputId": "683c8489-dffc-426e-f95b-e91dfb373260" }, "outputs": [ { "data": { "text/plain": [ "-0.8850911855697632" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "HanLP.sentiment_analysis('看哭了。难看哭了。')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "绝对值的大小代表情感的强烈程度:" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "-0.9190718531608582" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "HanLP.sentiment_analysis('看哭了。难看哭了!!!')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "长文档一样支持:" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "0.9505730271339417" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "text = '''“这是一部男人必看的电影。”人人都这么说。但单纯从性别区分,就会让这电影变狭隘。\n", "《肖申克的救赎》突破了男人电影的局限,通篇几乎充满令人难以置信的温馨基调,而电影里最伟大的主题是“希望”。\n", "当我们无奈地遇到了如同肖申克一般囚禁了心灵自由的那种囹圄,我们是无奈的老布鲁克,灰心的瑞德,还是智慧的安迪?\n", "运用智慧,信任希望,并且勇敢面对恐惧心理,去打败它?\n", "经典的电影之所以经典,因为他们都在做同一件事——让你从不同的角度来欣赏希望的美好。'''\n", "HanLP.sentiment_analysis(text)" ] } ], "metadata": { "colab": { "collapsed_sections": [], "name": "sentiment_restful.ipynb", "provenance": [] }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.12" } }, "nbformat": 4, "nbformat_minor": 1 } ================================================ 
FILE: plugins/hanlp_demo/hanlp_demo/zh/srl_mtl.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": { "id": "WfGpInivS0fG" }, "source": [ "

点击下列图标在线运行HanLP

\n", "
\n", "\t\"Open\n", "\t\"Open\n", "
\n", "\n", "## 安装" ] }, { "cell_type": "markdown", "metadata": { "id": "IYwV-UkNNzFp" }, "source": [ "无论是Windows、Linux还是macOS,HanLP的安装只需一句话搞定:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "1Uf_u7ddMhUt" }, "outputs": [], "source": [ "!pip install hanlp -U" ] }, { "cell_type": "markdown", "metadata": { "id": "pp-1KqEOOJ4t" }, "source": [ "## 加载模型\n", "HanLP的工作流程是先加载模型,模型的标示符存储在`hanlp.pretrained`这个包中,按照NLP任务归类。" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "id": "0tmKBu7sNAXX" }, "outputs": [ { "data": { "text/plain": [ "{'OPEN_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH': 'https://file.hankcs.com/hanlp/mtl/open_tok_pos_ner_srl_dep_sdp_con_electra_small_20201223_035557.zip',\n", " 'OPEN_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH': 'https://file.hankcs.com/hanlp/mtl/open_tok_pos_ner_srl_dep_sdp_con_electra_base_20201223_201906.zip',\n", " 'CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH': 'https://file.hankcs.com/hanlp/mtl/close_tok_pos_ner_srl_dep_sdp_con_electra_small_20210111_124159.zip',\n", " 'CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH': 'https://file.hankcs.com/hanlp/mtl/close_tok_pos_ner_srl_dep_sdp_con_electra_base_20210111_124519.zip',\n", " 'CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ERNIE_GRAM_ZH': 'https://file.hankcs.com/hanlp/mtl/close_tok_pos_ner_srl_dep_sdp_con_ernie_gram_base_aug_20210904_145403.zip',\n", " 'UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_MT5_SMALL': 'https://file.hankcs.com/hanlp/mtl/ud_ontonotes_tok_pos_lem_fea_ner_srl_dep_sdp_con_mt5_small_20210228_123458.zip',\n", " 'UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_XLMR_BASE': 'https://file.hankcs.com/hanlp/mtl/ud_ontonotes_tok_pos_lem_fea_ner_srl_dep_sdp_con_xlm_base_20210602_211620.zip',\n", " 'NPCMJ_UD_KYOTO_TOK_POS_CON_BERT_BASE_CHAR_JA': 'https://file.hankcs.com/hanlp/mtl/npcmj_ud_kyoto_tok_pos_ner_dep_con_srl_bert_base_char_ja_20210914_133742.zip'}" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" 
} ], "source": [ "import hanlp\n", "hanlp.pretrained.mtl.ALL # MTL多任务,具体任务见模型名称,语种见名称最后一个字段或相应语料库" ] }, { "cell_type": "markdown", "metadata": { "id": "EmZDmLn9aGxG" }, "source": [ "调用`hanlp.load`进行加载,模型会自动下载到本地缓存。自然语言处理分为许多任务,分词只是最初级的一个。与其每个任务单独创建一个模型,不如利用HanLP的联合模型一次性完成多个任务:" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "HanLP = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH)" ] }, { "cell_type": "markdown", "metadata": { "id": "elA_UyssOut_" }, "source": [ "## 语义角色分析\n", "任务越少,速度越快。如指定仅执行语义角色分析:" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 70 }, "id": "BqEmDMGGOtk3", "outputId": "2a0d392f-b99a-4a18-fc7f-754e2abe2e34" }, "outputs": [], "source": [ "doc = HanLP('2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', tasks='srl')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "返回值为一个[Document](https://hanlp.hankcs.com/docs/api/common/document.html):" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\n", " \"tok/fine\": [\n", " \"2021年\",\n", " \"HanLPv2.1\",\n", " \"为\",\n", " \"生产\",\n", " \"环境\",\n", " \"带来\",\n", " \"次\",\n", " \"世代\",\n", " \"最\",\n", " \"先进\",\n", " \"的\",\n", " \"多\",\n", " \"语种\",\n", " \"NLP\",\n", " \"技术\",\n", " \"。\"\n", " ],\n", " \"srl\": [\n", " [[\"2021年\", \"ARGM-TMP\", 0, 1], [\"HanLPv2.1\", \"ARG0\", 1, 2], [\"为生产环境\", \"ARG2\", 2, 5], [\"带来\", \"PRED\", 5, 6], [\"次世代最先进的多语种NLP技术\", \"ARG1\", 6, 15]],\n", " [[\"最\", \"ARGM-ADV\", 8, 9], [\"先进\", \"PRED\", 9, 10], [\"技术\", \"ARG0\", 14, 15]]\n", " ]\n", "}\n" ] } ], "source": [ "print(doc)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "`doc['srl']`字段为语义角色标注结果,每个四元组的格式为`[论元或谓词, 语义角色标签, 起始下标, 终止下标]`。其中,谓词的语义角色标签为`PRED`,起止下标对应以`tok`开头的第一个单词数组。" ] }, { "cell_type": "markdown", "metadata": { 
"id": "wxctCigrTKu-" }, "source": [ "可视化谓词论元结构:" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Zo08uquCTFSk", "outputId": "c6077f2d-7084-4f4b-a3bc-9aa9951704ea" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Token \tSRL PA1 \tToken \tSRL PA2 \n", "─────────\t────────────\t─────────\t────────────\n", "2021年 \t───►ARGM-TMP\t2021年 \t \n", "HanLPv2.1\t───►ARG0 \tHanLPv2.1\t \n", "为 \t◄─┐ \t为 \t \n", "生产 \t ├►ARG2 \t生产 \t \n", "环境 \t◄─┘ \t环境 \t \n", "带来 \t╟──►PRED \t带来 \t \n", "次 \t◄─┐ \t次 \t \n", "世代 \t │ \t世代 \t \n", "最 \t │ \t最 \t───►ARGM-ADV\n", "先进 \t │ \t先进 \t╟──►PRED \n", "的 \t ├►ARG1 \t的 \t \n", "多 \t │ \t多 \t \n", "语种 \t │ \t语种 \t \n", "NLP \t │ \tNLP \t \n", "技术 \t◄─┘ \t技术 \t───►ARG0 \n", "。 \t \t。 \t \n" ] } ], "source": [ "doc.pretty_print()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "遍历谓词论元结构:" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "第1个谓词论元结构:\n", "2021年 = ARGM-TMP at [0, 1]\n", "HanLPv2.1 = ARG0 at [1, 2]\n", "为生产环境 = ARG2 at [2, 5]\n", "带来 = PRED at [5, 6]\n", "次世代最先进的多语种NLP技术 = ARG1 at [6, 15]\n", "第2个谓词论元结构:\n", "最 = ARGM-ADV at [8, 9]\n", "先进 = PRED at [9, 10]\n", "技术 = ARG0 at [14, 15]\n" ] } ], "source": [ "for i, pas in enumerate(doc['srl']):\n", " print(f'第{i+1}个谓词论元结构:')\n", " for form, role, begin, end in pas:\n", " print(f'{form} = {role} at [{begin}, {end}]')" ] }, { "cell_type": "markdown", "metadata": { "id": "XOsWkOqQfzlr" }, "source": [ "为已分词的句子执行语义角色分析:" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 70 }, "id": "bLZSTbv_f3OA", "outputId": "111c0be9-bac6-4eee-d5bd-a972ffc34844" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Token\tSRL PA1 \tToken\tSRL PA2 \n", "─────\t────────\t─────\t────────────\n", "HanLP\t───►ARG0\tHanLP\t 
\n", "为 \t◄─┐ \t为 \t \n", "生产 \t ├►ARG2\t生产 \t \n", "环境 \t◄─┘ \t环境 \t \n", "带来 \t╟──►PRED\t带来 \t \n", "次世代 \t◄─┐ \t次世代 \t \n", "最 \t │ \t最 \t───►ARGM-ADV\n", "先进 \t │ \t先进 \t╟──►PRED \n", "的 \t ├►ARG1\t的 \t \n", "多语种 \t │ \t多语种 \t \n", "NLP \t │ \tNLP \t \n", "技术 \t◄─┘ \t技术 \t───►ARG0 \n", "。 \t \t。 \t \n", "\n", "Tok\tSRL PA1 \tTok\tSRL PA2 \tTok\tSRL PA3 \n", "───\t────────\t───\t────────\t───\t────────\n", "我 \t◄─┐ \t我 \t \t我 \t \n", "的 \t ├►ARG0\t的 \t \t的 \t \n", "希望 \t◄─┘ \t希望 \t \t希望 \t \n", "是 \t╟──►PRED\t是 \t \t是 \t \n", "希望 \t◄─┐ \t希望 \t╟──►PRED\t希望 \t \n", "张晚霞\t │ \t张晚霞\t◄─┐ \t张晚霞\t \n", "的 \t │ \t的 \t │ \t的 \t \n", "背影 \t ├►ARG1\t背影 \t │ \t背影 \t \n", "被 \t │ \t被 \t ├►ARG1\t被 \t \n", "晚霞 \t │ \t晚霞 \t │ \t晚霞 \t───►ARG0\n", "映红 \t◄─┘ \t映红 \t◄─┘ \t映红 \t╟──►PRED\n", "。 \t \t。 \t \t。 \t \n" ] } ], "source": [ "HanLP([\n", " [\"HanLP\", \"为\", \"生产\", \"环境\", \"带来\", \"次世代\", \"最\", \"先进\", \"的\", \"多语种\", \"NLP\", \"技术\", \"。\"],\n", " [\"我\", \"的\", \"希望\", \"是\", \"希望\", \"张晚霞\", \"的\", \"背影\", \"被\", \"晚霞\", \"映红\", \"。\"]\n", " ], tasks='srl', skip_tasks='tok*').pretty_print()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### 注意\n", "Native API的输入单位限定为句子,需使用[多语种分句模型](https://github.com/hankcs/HanLP/blob/master/plugins/hanlp_demo/hanlp_demo/sent_split.py)或[基于规则的分句函数](https://github.com/hankcs/HanLP/blob/master/hanlp/utils/rules.py#L19)先行分句。RESTful同时支持全文、句子、已分词的句子。除此之外,RESTful和native两种API的语义设计完全一致,用户可以无缝互换。" ] } ], "metadata": { "accelerator": "GPU", "colab": { "collapsed_sections": [], "name": "srl_mtl.ipynb", "provenance": [] }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.12" } }, "nbformat": 4, "nbformat_minor": 1 } 
================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/srl_restful.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": { "id": "WfGpInivS0fG" }, "source": [ "

点击下列图标在线运行HanLP

\n", "
\n", "\t\"Open\n", "\t\"Open\n", "
\n", "\n", "## 安装" ] }, { "cell_type": "markdown", "metadata": { "id": "IYwV-UkNNzFp" }, "source": [ "无论是Windows、Linux还是macOS,HanLP的安装只需一句话搞定:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "1Uf_u7ddMhUt" }, "outputs": [], "source": [ "!pip install hanlp_restful -U" ] }, { "cell_type": "markdown", "metadata": { "id": "pp-1KqEOOJ4t" }, "source": [ "## 创建客户端" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "id": "0tmKBu7sNAXX" }, "outputs": [], "source": [ "from hanlp_restful import HanLPClient\n", "HanLP = HanLPClient('https://www.hanlp.com/api', auth=None, language='zh') # auth不填则匿名,zh中文,mul多语种" ] }, { "cell_type": "markdown", "metadata": { "id": "EmZDmLn9aGxG" }, "source": [ "#### 申请秘钥\n", "由于服务器算力有限,匿名用户每分钟限2次调用。如果你需要更多调用次数,[建议申请免费公益API秘钥auth](https://bbs.hanlp.com/t/hanlp2-1-restful-api/53)。" ] }, { "cell_type": "markdown", "metadata": { "id": "elA_UyssOut_" }, "source": [ "## 语义角色分析\n", "任务越少,速度越快。如指定仅执行语义角色分析:" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 70 }, "id": "BqEmDMGGOtk3", "outputId": "2a0d392f-b99a-4a18-fc7f-754e2abe2e34" }, "outputs": [], "source": [ "doc = HanLP('2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', tasks='srl')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "返回值为一个[Document](https://hanlp.hankcs.com/docs/api/common/document.html):" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\n", " \"tok/fine\": [\n", " [\"2021年\", \"HanLPv2.1\", \"为\", \"生产\", \"环境\", \"带来\", \"次\", \"世代\", \"最\", \"先进\", \"的\", \"多\", \"语种\", \"NLP\", \"技术\", \"。\"]\n", " ],\n", " \"srl\": [\n", " [[[\"2021年\", \"ARGM-TMP\", 0, 1], [\"HanLPv2.1\", \"ARG0\", 1, 2], [\"为生产环境\", \"ARG2\", 2, 5], [\"带来\", \"PRED\", 5, 6], [\"次世代最先进的多语种NLP技术\", \"ARG1\", 6, 15]], [[\"次世代\", \"ARGM-TMP\", 6, 8], [\"最\", \"ARGM-ADV\", 8, 9], [\"先进\", \"PRED\", 9, 10], 
[\"NLP技术\", \"ARG0\", 13, 15]]]\n", " ]\n", "}\n" ] } ], "source": [ "print(doc)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "`doc['srl']`字段为语义角色标注结果,每个四元组的格式为`[论元或谓词, 语义角色标签, 起始下标, 终止下标]`。其中,谓词的语义角色标签为`PRED`,起止下标对应以`tok`开头的第一个单词数组。" ] }, { "cell_type": "markdown", "metadata": { "id": "wxctCigrTKu-" }, "source": [ "可视化谓词论元结构:" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Zo08uquCTFSk", "outputId": "c6077f2d-7084-4f4b-a3bc-9aa9951704ea" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Token \tSRL PA1 \tToken \tSRL PA2 \n", "─────────\t────────────\t─────────\t────────────\n", "2021年 \t───►ARGM-TMP\t2021年 \t \n", "HanLPv2.1\t───►ARG0 \tHanLPv2.1\t \n", "为 \t◄─┐ \t为 \t \n", "生产 \t ├►ARG2 \t生产 \t \n", "环境 \t◄─┘ \t环境 \t \n", "带来 \t╟──►PRED \t带来 \t \n", "次 \t◄─┐ \t次 \t◄─┐ \n", "世代 \t │ \t世代 \t◄─┴►ARGM-TMP\n", "最 \t │ \t最 \t───►ARGM-ADV\n", "先进 \t │ \t先进 \t╟──►PRED \n", "的 \t ├►ARG1 \t的 \t \n", "多 \t │ \t多 \t \n", "语种 \t │ \t语种 \t \n", "NLP \t │ \tNLP \t◄─┐ \n", "技术 \t◄─┘ \t技术 \t◄─┴►ARG0 \n", "。 \t \t。 \t \n" ] } ], "source": [ "doc.pretty_print()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "遍历谓词论元结构:" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "第1个谓词论元结构:\n", "2021年 = ARGM-TMP at [0, 1]\n", "HanLPv2.1 = ARG0 at [1, 2]\n", "为生产环境 = ARG2 at [2, 5]\n", "带来 = PRED at [5, 6]\n", "次世代最先进的多语种NLP技术 = ARG1 at [6, 15]\n", "第2个谓词论元结构:\n", "次世代 = ARGM-TMP at [6, 8]\n", "最 = ARGM-ADV at [8, 9]\n", "先进 = PRED at [9, 10]\n", "NLP技术 = ARG0 at [13, 15]\n" ] } ], "source": [ "for i, pas in enumerate(doc['srl'][0]):\n", " print(f'第{i+1}个谓词论元结构:')\n", " for form, role, begin, end in pas:\n", " print(f'{form} = {role} at [{begin}, {end}]')" ] }, { "cell_type": "markdown", "metadata": { "id": "XOsWkOqQfzlr" }, "source": [ "为已分词的句子执行语义角色分析:" ] }, { "cell_type": "code", 
"execution_count": 11, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 70 }, "id": "bLZSTbv_f3OA", "outputId": "111c0be9-bac6-4eee-d5bd-a972ffc34844" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Token\tSRL PA1 \tToken\tSRL PA2 \n", "─────\t────────\t─────\t────────────\n", "HanLP\t───►ARG0\tHanLP\t \n", "为 \t◄─┐ \t为 \t \n", "生产 \t ├►ARG2\t生产 \t \n", "环境 \t◄─┘ \t环境 \t \n", "带来 \t╟──►PRED\t带来 \t \n", "次世代 \t◄─┐ \t次世代 \t───►ARGM-TMP\n", "最 \t │ \t最 \t───►ARGM-ADV\n", "先进 \t │ \t先进 \t╟──►PRED \n", "的 \t ├►ARG1\t的 \t \n", "多语种 \t │ \t多语种 \t \n", "NLP \t │ \tNLP \t \n", "技术 \t◄─┘ \t技术 \t───►ARG0 \n", "。 \t \t。 \t \n", "\n", "Tok\tSRL PA1 \tTok\tSRL PA2 \tTok\tSRL PA3 \n", "───\t────────\t───\t────────\t───\t────────\n", "我 \t◄─┐ \t我 \t \t我 \t \n", "的 \t ├►ARG0\t的 \t \t的 \t \n", "希望 \t◄─┘ \t希望 \t \t希望 \t \n", "是 \t╟──►PRED\t是 \t \t是 \t \n", "希望 \t◄─┐ \t希望 \t╟──►PRED\t希望 \t \n", "张晚霞\t │ \t张晚霞\t◄─┐ \t张晚霞\t◄─┐ \n", "的 \t │ \t的 \t │ \t的 \t ├►ARG1\n", "背影 \t ├►ARG1\t背影 \t │ \t背影 \t◄─┘ \n", "被 \t │ \t被 \t ├►ARG1\t被 \t \n", "晚霞 \t │ \t晚霞 \t │ \t晚霞 \t───►ARG0\n", "映红 \t◄─┘ \t映红 \t◄─┘ \t映红 \t╟──►PRED\n", "。 \t \t。 \t \t。 \t \n" ] } ], "source": [ "HanLP(tokens=[\n", " [\"HanLP\", \"为\", \"生产\", \"环境\", \"带来\", \"次世代\", \"最\", \"先进\", \"的\", \"多语种\", \"NLP\", \"技术\", \"。\"],\n", " [\"我\", \"的\", \"希望\", \"是\", \"希望\", \"张晚霞\", \"的\", \"背影\", \"被\", \"晚霞\", \"映红\", \"。\"]\n", " ], tasks='srl', skip_tasks='tok*').pretty_print()" ] } ], "metadata": { "accelerator": "GPU", "colab": { "collapsed_sections": [], "name": "srl_restful.ipynb", "provenance": [] }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.12" } }, "nbformat": 4, "nbformat_minor": 1 } 
================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/srl_stl.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": { "id": "WfGpInivS0fG" }, "source": [ "

点击下列图标在线运行HanLP

\n", "
\n", "\t\"Open\n", "\t\"Open\n", "
\n", "\n", "## 安装" ] }, { "cell_type": "markdown", "metadata": { "id": "IYwV-UkNNzFp" }, "source": [ "无论是Windows、Linux还是macOS,HanLP的安装只需一句话搞定:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "1Uf_u7ddMhUt" }, "outputs": [], "source": [ "!pip install hanlp -U" ] }, { "cell_type": "markdown", "metadata": { "id": "pp-1KqEOOJ4t" }, "source": [ "## 加载模型\n", "HanLP的工作流程是先加载模型,模型的标示符存储在`hanlp.pretrained`这个包中,按照NLP任务归类。" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "id": "0tmKBu7sNAXX" }, "outputs": [ { "data": { "text/plain": [ "{'CPB3_SRL_ELECTRA_SMALL': 'https://file.hankcs.com/hanlp/srl/cpb3_electra_small_crf_has_transform_20220218_135910.zip'}" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import hanlp\n", "hanlp.pretrained.srl.ALL # 语种见名称最后一个字段或相应语料库" ] }, { "cell_type": "markdown", "metadata": { "id": "EmZDmLn9aGxG" }, "source": [ "调用`hanlp.load`进行加载,模型会自动下载到本地缓存:" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "srl = hanlp.load('CPB3_SRL_ELECTRA_SMALL')" ] }, { "cell_type": "markdown", "metadata": { "id": "elA_UyssOut_" }, "source": [ "## 语义角色分析\n", "为已分词的句子执行语义角色分析:" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 70 }, "id": "BqEmDMGGOtk3", "outputId": "2a0d392f-b99a-4a18-fc7f-754e2abe2e34" }, "outputs": [ { "data": { "text/plain": [ "[[('2021年', 'ARGM-TMP', 0, 1),\n", " ('HanLPv2.1', 'ARG0', 1, 2),\n", " ('为生产环境', 'ARG2', 2, 5),\n", " ('带来', 'PRED', 5, 6),\n", " ('次世代最先进的多语种NLP技术', 'ARG1', 6, 15)],\n", " [('次世代', 'ARGM-TMP', 6, 8),\n", " ('最', 'ARGM-ADV', 8, 9),\n", " ('先进', 'PRED', 9, 10),\n", " ('技术', 'ARG0', 14, 15)]]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "srl(['2021年', 'HanLPv2.1', '为', '生产', '环境', '带来', '次', '世代', '最', '先进', '的', '多', '语种', 'NLP', '技术', '。'])" ] 
}, { "cell_type": "markdown", "metadata": {}, "source": [ "语义角色标注结果中每个四元组的格式为`[论元或谓词, 语义角色标签, 起始下标, 终止下标]`。其中,谓词的语义角色标签为`PRED`,起止下标对应单词数组。" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "遍历谓词论元结构:" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "第1个谓词论元结构:\n", "2021年 = ARGM-TMP at [0, 1]\n", "HanLPv2.1 = ARG0 at [1, 2]\n", "为生产环境 = ARG2 at [2, 5]\n", "带来 = PRED at [5, 6]\n", "次世代最先进的多语种NLP技术 = ARG1 at [6, 15]\n", "第2个谓词论元结构:\n", "次世代 = ARGM-TMP at [6, 8]\n", "最 = ARGM-ADV at [8, 9]\n", "先进 = PRED at [9, 10]\n", "技术 = ARG0 at [14, 15]\n" ] } ], "source": [ "for i, pas in enumerate(srl(['2021年', 'HanLPv2.1', '为', '生产', '环境', '带来', '次', '世代', '最', '先进', '的', '多', '语种', 'NLP', '技术', '。'])):\n", " print(f'第{i+1}个谓词论元结构:')\n", " for form, role, begin, end in pas:\n", " print(f'{form} = {role} at [{begin}, {end}]')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### 注意\n", "Native API的输入单位限定为句子,需使用[多语种分句模型](https://github.com/hankcs/HanLP/blob/master/plugins/hanlp_demo/hanlp_demo/sent_split.py)或[基于规则的分句函数](https://github.com/hankcs/HanLP/blob/master/hanlp/utils/rules.py#L19)先行分句。RESTful同时支持全文、句子、已分词的句子。除此之外,RESTful和native两种API的语义设计完全一致,用户可以无缝互换。" ] } ], "metadata": { "accelerator": "GPU", "colab": { "collapsed_sections": [], "name": "srl_mtl.ipynb", "provenance": [] }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.12" } }, "nbformat": 4, "nbformat_minor": 1 } ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/sts_restful.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": { "id": "WfGpInivS0fG" 
}, "source": [ "

点击下列图标在线运行HanLP

\n", "
\n", "\t\"Open\n", "\t\"Open\n", "
\n", "\n", "## 安装" ] }, { "cell_type": "markdown", "metadata": { "id": "IYwV-UkNNzFp" }, "source": [ "无论是Windows、Linux还是macOS,HanLP的安装只需一句话搞定:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "1Uf_u7ddMhUt" }, "outputs": [], "source": [ "!pip install hanlp_restful -U" ] }, { "cell_type": "markdown", "metadata": { "id": "pp-1KqEOOJ4t" }, "source": [ "## 创建客户端" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "id": "0tmKBu7sNAXX" }, "outputs": [], "source": [ "from hanlp_restful import HanLPClient\n", "HanLP = HanLPClient('https://www.hanlp.com/api', auth=None, language='zh') # auth不填则匿名,zh中文,mul多语种" ] }, { "cell_type": "markdown", "metadata": { "id": "EmZDmLn9aGxG" }, "source": [ "#### 申请秘钥\n", "由于服务器算力有限,匿名用户每分钟限2次调用。如果你需要更多调用次数,[建议申请免费公益API秘钥auth](https://bbs.hanlp.com/t/hanlp2-1-restful-api/53)。" ] }, { "cell_type": "markdown", "metadata": { "id": "elA_UyssOut_" }, "source": [ "## 语义文本相似度\n", "输入两段短文本组成的二元组列表,执行语义文本相似度:" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 70 }, "id": "BqEmDMGGOtk3", "outputId": "2a0d392f-b99a-4a18-fc7f-754e2abe2e34" }, "outputs": [ { "data": { "text/plain": [ "[0.9764469861984253, 0.0, 0.003458738327026367]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "HanLP.semantic_textual_similarity([\n", " ('看图猜一电影名', '看图猜电影'),\n", " ('无线路由器怎么无线上网', '无线上网卡和无线路由器怎么用'),\n", " ('北京到上海的动车票', '上海到北京的动车票'),\n", "])" ] } ], "metadata": { "accelerator": "GPU", "colab": { "collapsed_sections": [], "name": "sts_restful.ipynb", "provenance": [] }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.12" } }, "nbformat": 4, 
"nbformat_minor": 1 } ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/sts_stl.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": { "id": "WfGpInivS0fG" }, "source": [ "

点击下列图标在线运行HanLP

\n", "
\n", "\t\"Open\n", "\t\"Open\n", "
\n", "\n", "## 安装" ] }, { "cell_type": "markdown", "metadata": { "id": "IYwV-UkNNzFp" }, "source": [ "无论是Windows、Linux还是macOS,HanLP的安装只需一句话搞定:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "1Uf_u7ddMhUt" }, "outputs": [], "source": [ "!pip install hanlp -U" ] }, { "cell_type": "markdown", "metadata": { "id": "pp-1KqEOOJ4t" }, "source": [ "## 加载模型\n", "HanLP的工作流程是先加载模型,模型的标示符存储在`hanlp.pretrained`这个包中,按照NLP任务归类。" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "id": "0tmKBu7sNAXX" }, "outputs": [], "source": [ "import hanlp\n", "hanlp.pretrained.sts.ALL # 语种见名称最后一个字段或相应语料库" ] }, { "cell_type": "markdown", "metadata": { "id": "EmZDmLn9aGxG" }, "source": [ "调用`hanlp.load`进行加载,模型会自动下载到本地缓存:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "sts = hanlp.load(hanlp.pretrained.sts.STS_ELECTRA_BASE_ZH)" ] }, { "cell_type": "markdown", "metadata": { "id": "elA_UyssOut_" }, "source": [ "## 语义文本相似度\n", "输入两段短文本组成的二元组列表,执行语义文本相似度:" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 70 }, "id": "BqEmDMGGOtk3", "outputId": "2a0d392f-b99a-4a18-fc7f-754e2abe2e34" }, "outputs": [ { "data": { "text/plain": [ "[0.9764469861984253, 0.0, 0.003458738327026367]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sts([\n", " ('看图猜一电影名', '看图猜电影'),\n", " ('无线路由器怎么无线上网', '无线上网卡和无线路由器怎么用'),\n", " ('北京到上海的动车票', '上海到北京的动车票'),\n", "])" ] } ], "metadata": { "accelerator": "GPU", "colab": { "collapsed_sections": [], "name": "sts_stl.ipynb", "provenance": [] }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", 
"version": "3.8.12" } }, "nbformat": 4, "nbformat_minor": 1 } ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/__init__.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2020-12-31 20:36 ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/demo_classifier.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2020-01-01 03:52 from hanlp.datasets.classification.sentiment import CHNSENTICORP_ERNIE_TEST import hanlp classifier = hanlp.load('CHNSENTICORP_BERT_BASE_ZH') print(classifier.predict('前台客房服务态度非常好!早餐很丰富,房价很干净。再接再厉!')) # predict a whole file in batch mode outputs = classifier.predict(classifier.transform.file_to_inputs(CHNSENTICORP_ERNIE_TEST), gold=True) print(outputs[:5]) ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/demo_client.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2020-01-08 04:43 # pip3 install tensorflow-serving-api-gpu import grpc import tensorflow as tf from tensorflow_core.python.framework import tensor_util from tensorflow_serving.apis import predict_pb2, prediction_service_pb2_grpc import hanlp from hanlp.common.keras_component import KerasComponent tagger: KerasComponent = hanlp.load(hanlp.pretrained.pos.CTB5_POS_RNN, transform_only=True) transform = tagger.transform del tagger inputs = [['商品', '和', '服务'], ['我', '的', '希望', '是', '希望', '和平']] samples = next(iter(transform.inputs_to_dataset(inputs)))[0] print(samples) channel = grpc.insecure_channel('{host}:{port}'.format(host='localhost', port=8500)) stub = prediction_service_pb2_grpc.PredictionServiceStub(channel) request = predict_pb2.PredictRequest() request.model_spec.name = 'ctb5_pos_rnn_20191229_015325' request.model_spec.signature_name = 'serving_default' request.inputs['embedding_input'].CopyFrom( 
def split_by_dic(text: str):
    """Tokenize ``text`` while keeping every word found in ``dic`` as one unit.

    The text is cut at each occurrence of a key of the module-level ``dic``.
    The stretches between occurrences are sent to the module-level
    ``tokenizer`` in a single batch call, and the results are stitched back
    together in text order.

    Args:
        text: The raw string to segment.

    Returns:
        A flat list in text order. Stretches handled by ``tokenizer``
        contribute whatever items it returns for them; every dictionary hit
        contributes one ``(word, dic[word])`` tuple.
    """
    # We use regular expression for the sake of simplicity.
    # However, you should use some trie trees for production
    import re

    # Longer keys go first because regex alternation takes the first branch
    # that matches, which would otherwise let a short key shadow a longer one.
    # Keys are escaped so metacharacters inside a word are matched literally,
    # and empty keys are dropped since they would match at every position.
    keys = sorted((key for key in dic if key), key=len, reverse=True)
    pattern = re.compile('|'.join(re.escape(key) for key in keys)) if keys else None

    segments = []  # free-text stretches, tokenized together in one batch
    layout = []  # text-order plan: an int indexes ``segments``, a tuple is a hit
    offset = 0
    for m in (pattern.finditer(text) if pattern else ()):
        if offset < m.start():
            layout.append(len(segments))
            segments.append(text[offset: m.start()])
        # Record the hit even when no free text precedes it (start of text or
        # two dictionary words in a row), and always move past the match.
        layout.append((m.group(), dic[m.group()]))
        offset = m.end()
    if offset < len(text):
        layout.append(len(segments))
        segments.append(text[offset:])

    # Skip the model call entirely when there is nothing left to tokenize.
    preds = list(tokenizer(segments)) if segments else []
    flat = []
    for part in layout:
        if isinstance(part, int):
            flat.extend(preds[part])
        else:
            flat.append(part)
    return flat
def merge_parts(parts, offsets, words):
    """Interleave tokenized parts with dictionary values, ordered by offset.

    Args:
        parts: One iterable of output items per free-text span.
        offsets: The sort position of each entry in ``parts``, paired by index.
        words: ``(start, end, value)`` triples; only ``start`` (the sort
            position) and ``value`` (emitted as a single item) are used.

    Returns:
        A flat list holding the items of every part and every ``value``,
        arranged by ascending position.
    """
    anchored = list(zip(offsets, parts))
    for begin, _end, payload in words:
        # Wrap the value in a list so it flattens the same way a part does.
        anchored.append((begin, [payload]))
    anchored.sort()

    merged = []
    for _position, chunk in anchored:
        merged.extend(chunk)
    return merged
def worker(job):
    """Print ``job``, then print what the module-level ``tokenizer`` returns for it."""
    print(job)
    segmented = tokenizer(job)
    print(segmented)
hanlp.load('CTB9_POS_ALBERT_BASE') syntactic_parser = hanlp.load('CTB7_BIAFFINE_DEP_ZH') semantic_parser = hanlp.load('SEMEVAL16_TEXT_BIAFFINE_ZH') pipeline = hanlp.pipeline() \ .append(hanlp.utils.rules.split_sentence, output_key='sentences') \ .append(tokenizer, output_key='tokens') \ .append(tagger, output_key='part_of_speech_tags') \ .append(syntactic_parser, input_key=('tokens', 'part_of_speech_tags'), output_key='syntactic_dependencies', conll=False) \ .append(semantic_parser, input_key=('tokens', 'part_of_speech_tags'), output_key='semantic_dependencies', conll=False) print(pipeline) text = '''HanLP是一系列模型与算法组成的自然语言处理工具包,目标是普及自然语言处理在生产环境中的应用。 HanLP具备功能完善、性能高效、架构清晰、语料时新、可自定义的特点。 内部算法经过工业界和学术界考验,配套书籍《自然语言处理入门》已经出版。 ''' doc = pipeline(text) print(doc) # By default the doc is json serializable, it holds true if your pipes output json serializable object too. # print(json.dumps(doc, ensure_ascii=False, indent=2)) # You can save the config to disk for deploying or sharing. pipeline.save('zh.json') # Then load it smoothly. 
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 21:25
# Demo: load the CTB9 ALBERT part-of-speech tagger and call ``predict`` twice,
# first with a flat list of tokens and then with a list of token lists.
import hanlp
from hanlp.pretrained.pos import CTB9_POS_ALBERT_BASE

tagger = hanlp.load(CTB9_POS_ALBERT_BASE)

# Input 1: a flat list of tokens.
flat_tokens = ['我', '的', '希望', '是', '希望', '世界', '和平']
print(tagger.predict(flat_tokens))

# Input 2: a list of token lists.
nested_tokens = [
    ['支持', '批处理', '地', '预测'],
    ['速度', '更', '快'],
]
print(tagger.predict(nested_tokens))
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 22:22
# Fits a TransformerTokenizerTF on the CTB6 train/dev splits, reloads it from
# ``save_dir``, prints a sample prediction and evaluates on the CTB6 test split.
from hanlp.components.tokenizers.tok_tf import TransformerTokenizerTF
from hanlp.datasets.tokenization.ctb6 import CTB6_CWS_TRAIN, CTB6_CWS_DEV, CTB6_CWS_TEST
from tests import cdroot

cdroot()

tokenizer = TransformerTokenizerTF()
save_dir = 'data/model/cws_bert_base_ctb6'

# Keyword arguments forwarded unchanged to ``fit``.
fit_options = {
    'transformer': 'chinese_L-12_H-768_A-12',
    'metrics': 'f1',
}
tokenizer.fit(CTB6_CWS_TRAIN, CTB6_CWS_DEV, save_dir, **fit_options)
tokenizer.load(save_dir)

samples = ['中央民族乐团离开北京前往维也纳', '商品和服务']
print(tokenizer.predict(samples))

tokenizer.evaluate(CTB6_CWS_TEST, save_dir=save_dir)
print(f'Model saved in {save_dir}')
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-29 21:58
# Fits an NgramConvTokenizerTF on the CTB6 train/dev splits using the
# CONVSEG character embeddings, evaluates on the CTB6 test split and prints a
# sample prediction.
# NOTE(review): despite the "large" file name this script trains on CTB6 and
# writes to the same save_dir as train_ctb6_cws_convseg.py — confirm intended.
import tensorflow as tf

from hanlp.components.tokenizers.tok_tf import NgramConvTokenizerTF
# The CTB6 splits are imported from the same module the sibling CTB6 training
# scripts use, so every script resolves the corpus through one path.
from hanlp.datasets.tokenization.ctb6 import CTB6_CWS_TRAIN, CTB6_CWS_DEV, CTB6_CWS_TEST
from hanlp.pretrained.word2vec import CONVSEG_W2V_NEWS_TENSITE_CHAR
from tests import cdroot

cdroot()
tokenizer = NgramConvTokenizerTF()
save_dir = 'data/model/cws/ctb6_cws'
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001,
                                     epsilon=1e-8, clipnorm=5)
tokenizer.fit(CTB6_CWS_TRAIN,
              CTB6_CWS_DEV,
              save_dir,
              word_embed={'class_name': 'HanLP>Word2VecEmbedding',
                          'config': {
                              'trainable': True,
                              'filepath': CONVSEG_W2V_NEWS_TENSITE_CHAR,
                              'expand_vocab': False,
                              'lowercase': False,
                          }},
              optimizer=optimizer,
              window_size=0,
              weight_norm=True)
tokenizer.evaluate(CTB6_CWS_TEST, save_dir=save_dir, output=False)
print(tokenizer.predict(['中央民族乐团离开北京前往维也纳', '商品和服务']))
print(f'Model saved in {save_dir}')
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 22:22
# Fits a TransformerTokenizerTF on the SIGHAN2005 MSR train/dev splits, reloads
# it from ``save_dir``, prints a sample prediction and evaluates on the MSR
# test split.
# The tokenizer class and the MSR split names follow the sibling script
# train_msr_cws_bert.py.
from hanlp.components.tokenizers.tok_tf import TransformerTokenizerTF
from hanlp.datasets.tokenization.sighan2005.msr import SIGHAN2005_MSR_TRAIN, SIGHAN2005_MSR_DEV, SIGHAN2005_MSR_TEST
from tests import cdroot

cdroot()
tokenizer = TransformerTokenizerTF()
save_dir = 'data/model/msr_cws_albert_base'
tokenizer.fit(SIGHAN2005_MSR_TRAIN,
              SIGHAN2005_MSR_DEV,
              save_dir,
              transformer='albert_base_zh',
              max_seq_length=150,
              metrics='f1', learning_rate=5e-5, epochs=10)
tokenizer.load(save_dir)
print(tokenizer.predict(['中央民族乐团离开北京前往维也纳', '商品和服务']))
# Evaluate on the test split of the corpus the model was trained on, not CTB6.
tokenizer.evaluate(SIGHAN2005_MSR_TEST, save_dir=save_dir)
print(f'Model saved in {save_dir}')
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-21 15:39
# Fits an NgramConvTokenizerTF on the SIGHAN2005 MSR train/dev splits with both
# character and n-gram pretrained embeddings, prints a sample prediction,
# reloads the model from ``save_dir`` and evaluates on the MSR test split.
# The tokenizer class and the MSR split names follow the sibling script
# train_msr_cws_ngram_conv.py.
import tensorflow as tf

from hanlp.components.tokenizers.tok_tf import NgramConvTokenizerTF
from hanlp.datasets.tokenization.sighan2005.msr import SIGHAN2005_MSR_TRAIN, SIGHAN2005_MSR_DEV, SIGHAN2005_MSR_TEST
from hanlp.pretrained.word2vec import CONVSEG_W2V_NEWS_TENSITE_CHAR, CONVSEG_W2V_NEWS_TENSITE_WORD_MSR
from tests import cdroot

cdroot()
tokenizer = NgramConvTokenizerTF()
# NOTE(review): train_msr_cws_ngram_conv.py writes to this same directory, so
# running both scripts would overwrite one model with the other — confirm.
save_dir = 'data/model/cws/convseg-msr-nocrf-noembed'
tokenizer.fit(SIGHAN2005_MSR_TRAIN,
              SIGHAN2005_MSR_DEV,
              save_dir,
              word_embed={'class_name': 'HanLP>Word2VecEmbedding',
                          'config': {
                              'trainable': True,
                              'filepath': CONVSEG_W2V_NEWS_TENSITE_CHAR,
                              'expand_vocab': False,
                              'lowercase': False,
                          }},
              ngram_embed={'class_name': 'HanLP>Word2VecEmbedding',
                           'config': {
                               'trainable': True,
                               'filepath': CONVSEG_W2V_NEWS_TENSITE_WORD_MSR,
                               'expand_vocab': True,
                               'lowercase': False,
                           }},
              optimizer=tf.keras.optimizers.Adam(learning_rate=0.001,
                                                 epsilon=1e-8, clipnorm=5),
              epochs=3,
              window_size=4,
              metrics='f1',
              weight_norm=True)
print(tokenizer.predict(['中央民族乐团离开北京前往维也纳', '商品和服务']))
tokenizer.load(save_dir, metrics='f1')
tokenizer.evaluate(SIGHAN2005_MSR_TEST, save_dir=save_dir)
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-21 15:39
# Fits an RNNTokenizerTF on the PKU98 January-June segmentation file with the
# radical character embeddings, evaluates it and prints a sample prediction.
import tensorflow as tf

from hanlp.components.tokenizers.tok_tf import RNNTokenizerTF
from hanlp.pretrained.word2vec import RADICAL_CHAR_EMBEDDING_100
from tests import cdroot

cdroot()
tokenizer = RNNTokenizerTF()
save_dir = 'data/model/cws/pku_6m_rnn_cws'
# NOTE(review): this optimizer is built but never passed to ``fit`` below, so
# it has no effect on training as written. The conv-based sibling scripts pass
# ``optimizer=optimizer`` — confirm whether that was intended here too.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001,
                                     epsilon=1e-8, clipnorm=5)
# NOTE(review): the same file is used as the dev set here and as the
# evaluation set below — confirm this is deliberate.
tokenizer.fit('data/cws/pku98/199801-06-seg.txt',
              'data/cws/pku98/pku98_test.txt',
              save_dir,
              embeddings={'class_name': 'HanLP>Word2VecEmbedding',
                          'config': {
                              'trainable': False,
                              'filepath': RADICAL_CHAR_EMBEDDING_100,
                              'expand_vocab': True,
                              'lowercase': False,
                          }}
              )
tokenizer.evaluate('data/cws/pku98/pku98_test.txt', save_dir=save_dir, output=False)
print(tokenizer.predict(['中央民族乐团离开北京前往维也纳', '商品和服务']))
print(f'Model saved in {save_dir}')
================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2019-12-21 15:39 from hanlp.datasets.tokenization.sighan2005 import SIGHAN2005_PKU_TRAIN, SIGHAN2005_PKU_DEV, SIGHAN2005_PKU_TEST from hanlp.pretrained.word2vec import CONVSEG_W2V_NEWS_TENSITE_CHAR from hanlp.utils.tf_util import nice from tests import cdroot import tensorflow as tf nice() cdroot() from hanlp.components.tokenizers.tok_tf import NgramConvTokenizerTF tokenizer = NgramConvTokenizerTF() save_dir = 'data/model/cws/sighan2005-pku-convseg' optimizer = tf.keras.optimizers.Adam(learning_rate=0.001, epsilon=1e-8, clipnorm=5) tokenizer.fit(SIGHAN2005_PKU_TRAIN, SIGHAN2005_PKU_DEV, save_dir, word_embed={'class_name': 'HanLP>Word2VecEmbedding', 'config': { 'trainable': True, 'filepath': CONVSEG_W2V_NEWS_TENSITE_CHAR, 'expand_vocab': False, 'lowercase': False, }}, optimizer=optimizer, window_size=0, weight_norm=True) tokenizer.evaluate(SIGHAN2005_PKU_TEST, save_dir=save_dir, output=False) # print(tagger.tag(list('中央民族乐团离开北京前往维也纳'))) # print(tagger.predict('中央民族乐团离开北京前往维也纳')) print(tokenizer.predict(['中央民族乐团离开北京前往维也纳', '商品和服务'])) print(f'Model saved in {save_dir}') ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/train/finetune_msra_ner_albert.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2019-12-28 23:15 import hanlp from hanlp.components.ner.ner_tf import TransformerNamedEntityRecognizerTF from hanlp.datasets.ner.msra import MSRA_NER_CHAR_LEVEL_TRAIN, MSRA_NER_CHAR_LEVEL_DEV, MSRA_NER_CHAR_LEVEL_TEST from tests import cdroot cdroot() recognizer = TransformerNamedEntityRecognizerTF() save_dir = 'data/model/ner/finetune_ner_albert_base_zh_msra' recognizer.fit(MSRA_NER_CHAR_LEVEL_TRAIN, MSRA_NER_CHAR_LEVEL_DEV, save_dir, transformer='albert_base_zh', finetune=hanlp.pretrained.ner.MSRA_NER_ALBERT_BASE_ZH) recognizer.load(save_dir) 
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-10-25 21:34
# Fits a TransformerNamedEntityRecognizerTF on the CoNLL03 English train/dev
# splits, reloads it from ``save_dir`` with the f1 metric, prints a sample
# prediction and evaluates on the CoNLL03 English test split.
from hanlp.components.ner.ner_tf import TransformerNamedEntityRecognizerTF
from hanlp.datasets.ner.conll03 import CONLL03_EN_TRAIN, CONLL03_EN_DEV, CONLL03_EN_TEST
from tests import cdroot

cdroot()

tagger = TransformerNamedEntityRecognizerTF()
save_dir = 'data/model/ner/ner_conll03_bert_base_cased_en'

# Keyword arguments forwarded unchanged to ``fit``.
fit_options = {
    'transformer': 'bert-base-cased',
    'metrics': 'accuracy',
}
tagger.fit(CONLL03_EN_TRAIN, CONLL03_EN_DEV, save_dir, **fit_options)
tagger.load(save_dir, metrics='f1')

sample_tokens = 'West Indian all-rounder Phil Simmons eats apple .'.split()
print(tagger.predict(sample_tokens))

# Keyword arguments forwarded unchanged to ``evaluate``.
eval_options = {
    'save_dir': save_dir,
    'output': False,
    'batch_size': 32,
}
tagger.evaluate(CONLL03_EN_TEST, **eval_options)
print(f'Model saved in {save_dir}')
================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_conll03_ner_flair.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2019-10-25 21:34 import tensorflow as tf from hanlp.components.ner.ner_tf import RNNNamedEntityRecognizerTF from hanlp.datasets.ner.conll03 import CONLL03_EN_TRAIN, CONLL03_EN_TEST from hanlp.pretrained.glove import GLOVE_6B_100D from hanlp.pretrained.rnnlm import FLAIR_LM_FW_WMT11_EN_TF, FLAIR_LM_BW_WMT11_EN_TF from tests import cdroot cdroot() tagger = RNNNamedEntityRecognizerTF() save_dir = 'data/model/conll03-ner-rnn-flair' tagger.fit(CONLL03_EN_TRAIN, CONLL03_EN_TEST, save_dir, epochs=100, optimizer=tf.keras.optimizers.Adam(learning_rate=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-8), loss='crf', rnn_units=256, embeddings=[ {'class_name': 'HanLP>Word2VecEmbedding', 'config': { 'trainable': False, 'embeddings_initializer': 'zero', 'filepath': GLOVE_6B_100D, 'expand_vocab': True, 'lowercase': False }}, {'class_name': 'HanLP>ContextualStringEmbedding', 'config': { 'trainable': False, 'forward_model_path': FLAIR_LM_FW_WMT11_EN_TF, 'backward_model_path': FLAIR_LM_BW_WMT11_EN_TF }} ], rnn_output_dropout=0.5, rnn_input_dropout=0.5, batch_size=32, metrics='f1', anneal_factor=0.5, patience=2, ) print(tagger.predict('West Indian all-rounder Phil Simmons eats apple .'.split())) # print(tagger.predict([['This', 'is', 'an', 'old', 'story'], # ['Not', 'this', 'year', '.']])) # [['DT', 'VBZ', 'DT', 'JJ', 'NN'], ['RB', 'DT', 'NN', '.']] # tagger.load(save_dir) tagger.evaluate(CONLL03_EN_TEST, save_dir=save_dir, output=False) ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_ctb5_dep.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2019-12-28 18:33 from hanlp.components.parsers.biaffine_parser_tf import BiaffineDependencyParserTF from 
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 22:46
# Fits an RNNPartOfSpeechTaggerTF on the CTB5 train/dev splits with a FastText
# embedding configuration and evaluates it on the CTB5 test split.
from hanlp.components.taggers.pos_tf import RNNPartOfSpeechTaggerTF
from hanlp.datasets.pos.ctb5 import CTB5_POS_TRAIN, CTB5_POS_DEV, CTB5_POS_TEST
from hanlp.pretrained.fasttext import FASTTEXT_WIKI_300_ZH
from tests import cdroot

cdroot()

tagger = RNNPartOfSpeechTaggerTF()
save_dir = 'data/model/pos/ctb5_pos_rnn_fasttext'

# Embedding specification handed to ``fit`` through its ``embeddings`` keyword.
embedding_spec = {
    'class_name': 'HanLP>FastTextEmbedding',
    'config': {'filepath': FASTTEXT_WIKI_300_ZH},
}
tagger.fit(CTB5_POS_TRAIN, CTB5_POS_DEV, save_dir, embeddings=embedding_spec)

tagger.evaluate(CTB5_POS_TEST, save_dir=save_dir)
print(f'Model saved in {save_dir}')
CTB7_DEP_TEST from tests import cdroot cdroot() save_dir = 'data/model/dep/biaffine_ctb7' parser = BiaffineDependencyParserTF() parser.fit(CTB7_DEP_TRAIN, CTB7_DEP_DEV, save_dir, pretrained_embed={'class_name': 'HanLP>Word2VecEmbedding', 'config': { 'trainable': False, 'embeddings_initializer': 'zero', 'filepath': CIP_W2V_100_CN, 'expand_vocab': True, 'lowercase': True, 'normalize': True, }}, ) parser.load(save_dir) sentence = [('中国', 'NR'), ('批准', 'VV'), ('设立', 'VV'), ('外商', 'NN'), ('投资', 'NN'), ('企业', 'NN'), ('逾', 'VV'), ('三十万', 'CD'), ('家', 'M')] print(parser.predict(sentence)) parser.evaluate(CTB7_DEP_TEST, save_dir) ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_ctb9_pos_albert.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2019-12-28 23:15 from hanlp.components.taggers.transformers.transformer_tagger_tf import TransformerTaggerTF from tests import cdroot cdroot() tagger = TransformerTaggerTF() save_dir = 'data/model/pos/ctb9_albert_base' tagger.fit('data/pos/ctb9/train.tsv', 'data/pos/ctb9/test.tsv', save_dir, transformer='uer/albert-base-chinese-cluecorpussmall', max_seq_length=130, warmup_steps_ratio=0.1, epochs=20, learning_rate=5e-5) tagger.load(save_dir) print(tagger(['我', '的', '希望', '是', '希望', '和平'])) tagger.evaluate('data/pos/ctb9/test.tsv', save_dir=save_dir) print(f'Model saved in {save_dir}') ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_ctb9_pos_electra.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2019-12-28 23:15 from hanlp.components.taggers.transformers.transformer_tagger_tf import TransformerTaggerTF from tests import cdroot cdroot() tagger = TransformerTaggerTF() save_dir = 'data/model/pos/ctb9_electra_small_zh_epoch_20' tagger.fit('data/pos/ctb9/train.tsv', 'data/pos/ctb9/test.tsv', save_dir, 
transformer='hfl/chinese-electra-small-discriminator', max_seq_length=130, warmup_steps_ratio=0.1, epochs=20, learning_rate=5e-5) tagger.load(save_dir) print(tagger(['我', '的', '希望', '是', '希望', '和平'])) tagger.evaluate('data/pos/ctb9/test.tsv', save_dir=save_dir) print(f'Model saved in {save_dir}') ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_msra_ner_albert.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2019-12-28 23:15 from hanlp.components.ner.ner_tf import TransformerNamedEntityRecognizerTF from hanlp.datasets.ner.msra import MSRA_NER_CHAR_LEVEL_TRAIN, MSRA_NER_CHAR_LEVEL_DEV, MSRA_NER_CHAR_LEVEL_TEST from tests import cdroot cdroot() recognizer = TransformerNamedEntityRecognizerTF() save_dir = 'data/model/ner/msra_ner_albert_base' recognizer.fit(MSRA_NER_CHAR_LEVEL_TRAIN, MSRA_NER_CHAR_LEVEL_DEV, save_dir, transformer='uer/albert-base-chinese-cluecorpussmall', learning_rate=5e-5, metrics='accuracy') # Use accuracy to speed up training recognizer.load(save_dir, metrics='f1') print(recognizer.predict(list('上海华安工业(集团)公司董事长谭旭光和秘书张晚霞来到美国纽约现代艺术博物馆参观。'))) recognizer.evaluate(MSRA_NER_CHAR_LEVEL_TEST, save_dir=save_dir) print(f'Model saved in {save_dir}') ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_msra_ner_bert.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2019-12-28 23:15 from hanlp.components.ner.ner_tf import TransformerNamedEntityRecognizerTF from hanlp.datasets.ner.msra import MSRA_NER_CHAR_LEVEL_TRAIN, MSRA_NER_CHAR_LEVEL_DEV, MSRA_NER_CHAR_LEVEL_TEST from tests import cdroot cdroot() recognizer = TransformerNamedEntityRecognizerTF() save_dir = 'data/model/ner/ner_bert_base_msra_1' recognizer.fit(MSRA_NER_CHAR_LEVEL_TRAIN, MSRA_NER_CHAR_LEVEL_DEV, save_dir, transformer='bert-base-chinese', metrics='accuracy') # accuracy is faster 
recognizer.load(save_dir, metrics='f1') print(recognizer.predict(list('上海华安工业(集团)公司董事长谭旭光和秘书张晚霞来到美国纽约现代艺术博物馆参观。'))) recognizer.evaluate(MSRA_NER_CHAR_LEVEL_TEST, save_dir=save_dir) print(f'Model saved in {save_dir}') ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_msra_ner_electra.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2019-12-28 23:15 from hanlp.components.ner.ner_tf import TransformerNamedEntityRecognizerTF from hanlp.datasets.ner.msra import MSRA_NER_CHAR_LEVEL_TRAIN, MSRA_NER_CHAR_LEVEL_DEV, MSRA_NER_CHAR_LEVEL_TEST from tests import cdroot cdroot() recognizer = TransformerNamedEntityRecognizerTF() save_dir = 'data/model/ner/ner_electra_small_zh_msra_sparse_categorical_crossentropy' recognizer.fit(MSRA_NER_CHAR_LEVEL_TRAIN, MSRA_NER_CHAR_LEVEL_DEV, save_dir, transformer='hfl/chinese-electra-small-discriminator', learning_rate=5e-5, metrics='accuracy') # Use accuracy to speed up training recognizer.load(save_dir, metrics='f1') print(recognizer.predict(list('上海华安工业(集团)公司董事长谭旭光和秘书张晚霞来到美国纽约现代艺术博物馆参观。'))) recognizer.evaluate(MSRA_NER_CHAR_LEVEL_TEST, save_dir=save_dir) print(f'Model saved in {save_dir}') ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_msra_ner_ngram_conv.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2019-12-28 23:15 from hanlp.components.ner.ner_tf import NgramConvNamedEntityRecognizerTF from hanlp.datasets.ner.msra import MSRA_NER_CHAR_LEVEL_TRAIN, MSRA_NER_CHAR_LEVEL_DEV, MSRA_NER_CHAR_LEVEL_TEST from hanlp.pretrained.word2vec import CONVSEG_W2V_NEWS_TENSITE_CHAR, \ CONVSEG_W2V_NEWS_TENSITE_WORD_MSR from tests import cdroot cdroot() recognizer = NgramConvNamedEntityRecognizerTF() save_dir = 'data/model/ner/msra_ner_ngram_conv' recognizer.fit(MSRA_NER_CHAR_LEVEL_TRAIN, MSRA_NER_CHAR_LEVEL_DEV, save_dir, 
word_embed={'class_name': 'HanLP>Word2VecEmbedding', 'config': { 'trainable': True, 'filepath': CONVSEG_W2V_NEWS_TENSITE_CHAR, 'expand_vocab': False, 'lowercase': False, }}, ngram_embed={'class_name': 'HanLP>Word2VecEmbedding', 'config': { 'trainable': True, 'filepath': CONVSEG_W2V_NEWS_TENSITE_WORD_MSR, 'expand_vocab': True, 'lowercase': False, }}, weight_norm=True) recognizer.evaluate(MSRA_NER_CHAR_LEVEL_TEST, save_dir) ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_msra_ner_rnn.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2019-12-28 23:15 from hanlp.components.ner.ner_tf import RNNNamedEntityRecognizerTF from hanlp.datasets.ner.msra import MSRA_NER_CHAR_LEVEL_TRAIN, MSRA_NER_CHAR_LEVEL_DEV, MSRA_NER_CHAR_LEVEL_TEST from hanlp.pretrained.word2vec import RADICAL_CHAR_EMBEDDING_100 from tests import cdroot cdroot() recognizer = RNNNamedEntityRecognizerTF() save_dir = 'data/model/ner/msra_ner_rnn' recognizer.fit(MSRA_NER_CHAR_LEVEL_TRAIN, MSRA_NER_CHAR_LEVEL_DEV, save_dir, embeddings=RADICAL_CHAR_EMBEDDING_100, embedding_trainable=True, epochs=100) recognizer.evaluate(MSRA_NER_CHAR_LEVEL_TEST, save_dir) ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_ptb_dep_biaffine_albert.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2020-03-07 23:48 from hanlp.metrics.parsing import conllx_eval from hanlp.datasets.parsing.ptb import PTB_SD330_DEV, PTB_SD330_TRAIN, PTB_SD330_TEST, PTB_TOKEN_MAPPING from hanlp.components.parsers.biaffine_parser_tf import BiaffineTransformerDependencyParserTF from tests import cdroot cdroot() save_dir = 'data/model/dep/ptb_albert3' parser = BiaffineTransformerDependencyParserTF() parser.fit(PTB_SD330_TRAIN, PTB_SD330_DEV, save_dir, 'albert-xxlarge-v2', batch_size=256, warmup_steps_ratio=.1, token_mapping=PTB_TOKEN_MAPPING, 
samples_per_batch=150, transformer_dropout=.33, learning_rate=2e-3, learning_rate_transformer=1e-5, # early_stopping_patience=10, ) parser.load(save_dir) # output = f'{save_dir}/test.predict.conll' parser.evaluate(PTB_SD330_TEST, save_dir, warm_up=False) # uas, las = conllx_eval.evaluate(PTB_SD330_TEST, output) # print(f'Official UAS: {uas:.4f} LAS: {las:.4f}') print(f'Model saved in {save_dir}') ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_ptb_dep_biaffine_bert.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2020-03-07 23:48 from hanlp.metrics.parsing import conllx_eval from hanlp.datasets.parsing.ptb import PTB_SD330_DEV, PTB_SD330_TRAIN, PTB_SD330_TEST, PTB_TOKEN_MAPPING from hanlp.components.parsers.biaffine_parser_tf import BiaffineTransformerDependencyParserTF from tests import cdroot cdroot() save_dir = 'data/model/dep/ptb_bert_1e-5' parser = BiaffineTransformerDependencyParserTF() # parser.fit(PTB_SD330_TRAIN, PTB_SD330_DEV, save_dir, 'bert-base-uncased', # batch_size=3000, # warmup_steps_ratio=.1, # token_mapping=PTB_TOKEN_MAPPING, # samples_per_batch=150, # transformer_dropout=.33, # learning_rate=2e-3, # learning_rate_transformer=1e-5, # # early_stopping_patience=10, # ) parser.load(save_dir, tree='tarjan') # output = f'{save_dir}/test.predict.conll' parser.evaluate(PTB_SD330_TEST, save_dir, warm_up=False) # uas, las = conllx_eval.evaluate(PTB_SD330_TEST, output) # print(f'Official UAS: {uas:.4f} LAS: {las:.4f}') print(f'Model saved in {save_dir}') ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_ptb_dep_biaffine_bert_96.6.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2020-03-07 23:48 from hanlp.datasets.parsing.ptb import PTB_SD330_DEV, PTB_SD330_TRAIN, PTB_SD330_TEST, PTB_TOKEN_MAPPING from hanlp.components.parsers.biaffine_parser_tf 
import BiaffineTransformerDependencyParserTF from tests import cdroot from hanlp.metrics.parsing import conllx_eval cdroot() save_dir = 'data/model/dep/ptb_bert_96.61' parser = BiaffineTransformerDependencyParserTF() # parser.fit(PTB_SD330_TRAIN, PTB_SD330_DEV, save_dir, 'bert-base-uncased', # batch_size=3000, # warmup_steps_ratio=.1, # token_mapping=PTB_TOKEN_MAPPING, # samples_per_batch=150, # ) parser.load(save_dir) output = f'{save_dir}/test.predict.conll' parser.evaluate(PTB_SD330_TEST, save_dir, warm_up=False, output=output) uas, las = conllx_eval.evaluate(PTB_SD330_TEST, output) print(f'Official UAS: {uas:.4f} LAS: {las:.4f}') print(f'Model saved in {save_dir}') ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_ptb_dep_biaffine_bert_positional.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2020-03-07 23:48 from hanlp.metrics.parsing import conllx_eval from hanlp.datasets.parsing.ptb import PTB_SD330_DEV, PTB_SD330_TRAIN, PTB_SD330_TEST, PTB_TOKEN_MAPPING from hanlp.components.parsers.biaffine_parser_tf import BiaffineTransformerDependencyParserTF from tests import cdroot cdroot() save_dir = 'data/model/dep/ptb_bert_positional_diff_lr' parser = BiaffineTransformerDependencyParserTF() parser.fit(PTB_SD330_TRAIN, PTB_SD330_DEV, save_dir, 'bert-base-uncased', batch_size=3000, warmup_steps_ratio=.1, token_mapping=PTB_TOKEN_MAPPING, samples_per_batch=150, transformer_dropout=.33, learning_rate=1e-4, learning_rate_transformer=1e-5, d_positional=128, # early_stopping_patience=10, ) # parser.load(save_dir) # output = f'{save_dir}/test.predict.conll' parser.evaluate(PTB_SD330_TEST, save_dir, warm_up=False) # uas, las = conllx_eval.evaluate(PTB_SD330_TEST, output) # print(f'Official UAS: {uas:.4f} LAS: {las:.4f}') # print(f'Model saved in {save_dir}') ================================================ FILE: 
plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_ptb_dep_sa_albert.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2020-03-07 23:48 from hanlp.metrics.parsing import conllx_eval from hanlp.datasets.parsing.ptb import PTB_SD330_DEV, PTB_SD330_TRAIN, PTB_SD330_TEST, PTB_TOKEN_MAPPING from hanlp.components.parsers.biaffine_parser_tf import BiaffineTransformerDependencyParserTF, \ StructuralAttentionDependencyParserTF from hanlp.pretrained.glove import GLOVE_840B_300D from tests import cdroot cdroot() save_dir = 'data/model/dep/ptb_sa_glove' parser = StructuralAttentionDependencyParserTF() # parser.fit(PTB_SD330_TRAIN, PTB_SD330_DEV, save_dir, 'bert-base-uncased', # batch_size=3000, # warmup_steps_ratio=.1, # token_mapping=PTB_TOKEN_MAPPING, # samples_per_batch=150, # transformer_dropout=.33, # masked_lm_dropout=.33, # # learning_rate=2e-3, # # learning_rate_transformer=1e-5, # masked_lm_embed={'class_name': 'HanLP>Word2VecEmbedding', # 'config': { # 'trainable': False, # # 'embeddings_initializer': 'zero', # 'filepath': GLOVE_840B_300D, # 'expand_vocab': False, # 'lowercase': True, # 'cpu': False # }} # # alpha=1, # # early_stopping_patience=10, # # num_decoder_layers=2, # ) parser.load(save_dir) # output = f'{save_dir}/test.predict.conll' parser.evaluate(PTB_SD330_TEST, save_dir, warm_up=False) # uas, las = conllx_eval.evaluate(PTB_SD330_TEST, output) # print(f'Official UAS: {uas:.4f} LAS: {las:.4f}') print(f'Model saved in {save_dir}') ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_ptb_dep_sa_albert_topk.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2020-03-07 23:48 from hanlp.metrics.parsing import conllx_eval from hanlp.datasets.parsing.ptb import PTB_SD330_DEV, PTB_SD330_TRAIN, PTB_SD330_TEST, PTB_TOKEN_MAPPING from hanlp.components.parsers.biaffine_parser_tf import BiaffineTransformerDependencyParserTF, 
\ StructuralAttentionDependencyParserTF from hanlp.pretrained.glove import GLOVE_840B_300D from tests import cdroot cdroot() save_dir = 'data/model/dep/ptb_sa_topk' parser = StructuralAttentionDependencyParserTF() parser.fit(PTB_SD330_TRAIN, PTB_SD330_DEV, save_dir, 'bert-base-uncased', batch_size=3000, warmup_steps_ratio=.1, token_mapping=PTB_TOKEN_MAPPING, samples_per_batch=150, transformer_dropout=.33, masked_lm_dropout=.33, learning_rate=2e-3, learning_rate_transformer=1e-5, # alpha=1, # early_stopping_patience=10, # num_decoder_layers=2, ) parser.load(save_dir) # output = f'{save_dir}/test.predict.conll' parser.evaluate(PTB_SD330_TEST, save_dir, warm_up=False) # uas, las = conllx_eval.evaluate(PTB_SD330_TEST, output) # print(f'Official UAS: {uas:.4f} LAS: {las:.4f}') print(f'Model saved in {save_dir}') ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_ptb_dep_sa_bert.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2020-03-07 23:48 from hanlp.metrics.parsing import conllx_eval from hanlp.datasets.parsing.ptb import PTB_SD330_DEV, PTB_SD330_TRAIN, PTB_SD330_TEST, PTB_TOKEN_MAPPING from hanlp.components.parsers.biaffine_parser_tf import BiaffineTransformerDependencyParserTF, \ StructuralAttentionDependencyParserTF from hanlp.pretrained.glove import GLOVE_840B_300D from tests import cdroot cdroot() save_dir = 'data/model/dep/ptb_sa_glove' parser = StructuralAttentionDependencyParserTF() # parser.fit(PTB_SD330_TRAIN, PTB_SD330_DEV, save_dir, 'bert-base-uncased', # batch_size=3000, # warmup_steps_ratio=.1, # token_mapping=PTB_TOKEN_MAPPING, # samples_per_batch=150, # transformer_dropout=.33, # masked_lm_dropout=.33, # # learning_rate=2e-3, # # learning_rate_transformer=1e-5, # masked_lm_embed={'class_name': 'HanLP>Word2VecEmbedding', # 'config': { # 'trainable': False, # # 'embeddings_initializer': 'zero', # 'filepath': GLOVE_840B_300D, # 'expand_vocab': False, # 
'lowercase': True, # 'cpu': False # }} # # alpha=1, # # early_stopping_patience=10, # # num_decoder_layers=2, # ) parser.load(save_dir) # output = f'{save_dir}/test.predict.conll' parser.evaluate(PTB_SD330_TEST, save_dir, warm_up=False) # uas, las = conllx_eval.evaluate(PTB_SD330_TEST, output) # print(f'Official UAS: {uas:.4f} LAS: {las:.4f}') print(f'Model saved in {save_dir}') ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_ptb_dep_sa_pos_bert.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2020-03-07 23:48 from hanlp.metrics.parsing import conllx_eval from hanlp.datasets.parsing.ptb import PTB_SD330_DEV, PTB_SD330_TRAIN, PTB_SD330_TEST, PTB_TOKEN_MAPPING from hanlp.components.parsers.biaffine_parser_tf import BiaffineTransformerDependencyParserTF, \ StructuralAttentionDependencyParserTF from hanlp.pretrained.glove import GLOVE_840B_300D from tests import cdroot cdroot() save_dir = 'data/model/dep/ptb_sa_bert_joint_pos' parser = StructuralAttentionDependencyParserTF() parser.fit('data/ptb-dep/train.conllx', 'data/ptb-dep/dev.conllx', save_dir, 'bert-base-uncased', batch_size=256, warmup_steps_ratio=.1, token_mapping=PTB_TOKEN_MAPPING, samples_per_batch=150, transformer_dropout=.33, masked_lm_dropout=.33, learning_rate=2e-3, learning_rate_transformer=1e-5, joint_pos=True # alpha=1, # early_stopping_patience=10, # num_decoder_layers=2, ) # parser.load(save_dir) # output = f'{save_dir}/test.predict.conll' parser.evaluate('data/ptb-dep/test.conllx', save_dir, warm_up=False) # uas, las = conllx_eval.evaluate(PTB_SD330_TEST, output) # print(f'Official UAS: {uas:.4f} LAS: {las:.4f}') print(f'Model saved in {save_dir}') ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_ptb_pos_rnn_fasttext.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2019-10-25 21:34 import 
tensorflow as tf from hanlp.components.taggers.pos_tf import RNNPartOfSpeechTaggerTF from hanlp.pretrained.fasttext import FASTTEXT_CC_300_EN from tests import cdroot cdroot() tagger = RNNPartOfSpeechTaggerTF() save_dir = 'data/model/pos/ptb_pos_rnn_fasttext' optimizer = tf.keras.optimizers.SGD(lr=0.015) # optimizer = 'adam' tagger.fit('data/ptb-pos/train.tsv', 'data/ptb-pos/dev.tsv', batch_size=10, save_dir=save_dir, embeddings={'class_name': 'HanLP>FastTextEmbedding', 'config': {'filepath': FASTTEXT_CC_300_EN}}, optimizer=optimizer, lr_decay_per_epoch=0.05, rnn_units=100, rnn_input_dropout=0.5, rnn_output_dropout=0.5, epochs=100, verbose=True) tagger.load(save_dir) tagger.evaluate('data/ptb-pos/test.tsv', save_dir=save_dir, output=False) print(tagger.predict(['This' 'time', 'is', 'for', 'dinner'])) print(tagger.predict([['This', 'is', 'an', 'old', 'story'], ['Not', 'this', 'year', '.']])) print(f'Model saved in {save_dir}') ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_semeval15_dm.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2020-01-01 18:26 from hanlp.components.parsers.biaffine_parser_tf import BiaffineSemanticDependencyParserTF from hanlp.pretrained.glove import GLOVE_6B_100D from tests import cdroot cdroot() save_dir = 'data/model/sdp/semeval15_biaffine_dm' parser = BiaffineSemanticDependencyParserTF() parser.fit('data/semeval15/en.dm.train.conll', 'data/semeval15/en.dm.dev.conll', save_dir, pretrained_embed={'class_name': 'HanLP>Word2VecEmbedding', 'config': { 'trainable': False, 'embeddings_initializer': 'zero', 'filepath': GLOVE_6B_100D, 'expand_vocab': True, 'lowercase': True, 'normalize': True, }}, ) parser.load(save_dir) # disable variational dropout during evaluation so as to use CudaLSTM sentence = [('Is', 'VBZ'), ('this', 'DT'), ('the', 'DT'), ('future', 'NN'), ('of', 'IN'), ('chamber', 'NN'), ('music', 'NN'), ('?', '.')] 
print(parser.predict(sentence)) parser.evaluate('data/semeval15/en.id.dm.auto.conllu', save_dir) parser.evaluate('data/semeval15/en.ood.dm.auto.conllu', save_dir) ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_semeval15_pas.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2020-01-01 18:26 from hanlp.components.parsers.biaffine_parser_tf import BiaffineSemanticDependencyParserTF from hanlp.pretrained.glove import GLOVE_6B_100D from tests import cdroot cdroot() save_dir = 'data/model/sdp/semeval15_biaffine_pas' parser = BiaffineSemanticDependencyParserTF() parser.fit('data/semeval15/en.pas.train.conll', 'data/semeval15/en.pas.dev.conll', save_dir, pretrained_embed={'class_name': 'HanLP>Word2VecEmbedding', 'config': { 'trainable': False, 'embeddings_initializer': 'zero', 'filepath': GLOVE_6B_100D, 'expand_vocab': True, 'lowercase': True, 'normalize': True, }}, ) parser.load(save_dir) # disable variational dropout during evaluation so as to use CudaLSTM sentence = [('Is', 'VBZ'), ('this', 'DT'), ('the', 'DT'), ('future', 'NN'), ('of', 'IN'), ('chamber', 'NN'), ('music', 'NN'), ('?', '.')] print(parser.predict(sentence)) parser.evaluate('data/semeval15/en.id.pas.conll', save_dir) parser.evaluate('data/semeval15/en.ood.pas.conll', save_dir) ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_semeval15_psd.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2020-01-01 18:26 from hanlp.components.parsers.biaffine_parser_tf import BiaffineSemanticDependencyParserTF from hanlp.pretrained.glove import GLOVE_6B_100D from tests import cdroot cdroot() save_dir = 'data/model/sdp/semeval15_biaffine_psd' parser = BiaffineSemanticDependencyParserTF() parser.fit('data/semeval15/en.psd.train.conll', 'data/semeval15/en.psd.dev.conll', save_dir, pretrained_embed={'class_name': 
'HanLP>Word2VecEmbedding', 'config': { 'trainable': False, 'embeddings_initializer': 'zero', 'filepath': GLOVE_6B_100D, 'expand_vocab': True, 'lowercase': True, 'normalize': True, }}, ) parser.load(save_dir) # disable variational dropout during evaluation so as to use CudaLSTM sentence = [('Is', 'VBZ'), ('this', 'DT'), ('the', 'DT'), ('future', 'NN'), ('of', 'IN'), ('chamber', 'NN'), ('music', 'NN'), ('?', '.')] print(parser.predict(sentence)) parser.evaluate('data/semeval15/en.id.psd.conll', save_dir) parser.evaluate('data/semeval15/en.ood.psd.conll', save_dir) ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_semeval16_news.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2019-12-26 23:20 from hanlp.datasets.parsing.semeval16 import SEMEVAL2016_NEWS_TRAIN, SEMEVAL2016_NEWS_DEV, SEMEVAL2016_NEWS_TEST from hanlp.pretrained.word2vec import SEMEVAL16_EMBEDDINGS_300_NEWS_CN from hanlp.utils.tf_util import nice nice() from hanlp.components.parsers.biaffine_parser_tf import BiaffineSemanticDependencyParserTF from tests import cdroot cdroot() save_dir = 'data/model/sdp/semeval16-news' parser = BiaffineSemanticDependencyParserTF() parser.fit(SEMEVAL2016_NEWS_TRAIN, SEMEVAL2016_NEWS_DEV, save_dir, pretrained_embed={'class_name': 'HanLP>Word2VecEmbedding', 'config': { 'trainable': False, 'embeddings_initializer': 'zero', 'filepath': SEMEVAL16_EMBEDDINGS_300_NEWS_CN, 'expand_vocab': True, 'lowercase': True, 'normalize': True, }}, ) parser.load(save_dir) sentence = [('中国', 'NR'), ('批准', 'VV'), ('设立', 'VV'), ('外商', 'NN'), ('投资', 'NN'), ('企业', 'NN'), ('逾', 'VV'), ('三十万', 'CD'), ('家', 'M')] print(parser.predict(sentence)) parser.evaluate(SEMEVAL2016_NEWS_TEST, save_dir) ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_semeval16_text.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs 
# Date: 2019-12-26 23:20 from hanlp.datasets.parsing.semeval16 import SEMEVAL2016_TEXT_TRAIN, SEMEVAL2016_TEXT_DEV, SEMEVAL2016_TEXT_TEST from hanlp.pretrained.word2vec import SEMEVAL16_EMBEDDINGS_300_TEXT_CN from hanlp.utils.tf_util import nice nice() from hanlp.components.parsers.biaffine_parser_tf import BiaffineSemanticDependencyParserTF from tests import cdroot cdroot() save_dir = 'data/model/sdp/semeval16-text' parser = BiaffineSemanticDependencyParserTF() parser.fit(SEMEVAL2016_TEXT_TRAIN, SEMEVAL2016_TEXT_DEV, save_dir, pretrained_embed={'class_name': 'HanLP>Word2VecEmbedding', 'config': { 'trainable': False, 'embeddings_initializer': 'zero', 'filepath': SEMEVAL16_EMBEDDINGS_300_TEXT_CN, 'expand_vocab': True, 'lowercase': True, 'normalize': True, }}, ) parser.load(save_dir) sentence = [('中国', 'NR'), ('批准', 'VV'), ('设立', 'VV'), ('外商', 'NN'), ('投资', 'NN'), ('企业', 'NN'), ('逾', 'VV'), ('三十万', 'CD'), ('家', 'M')] print(parser.predict(sentence)) parser.evaluate(SEMEVAL2016_TEXT_TEST, save_dir) ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/tok_mtl.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "view-in-github" }, "source": [ "

点击下列图标在线运行HanLP

\n", "
\n", "\t\"Open\n", "\t\"Open\n", "
" ] }, { "cell_type": "markdown", "metadata": { "id": "WfGpInivS0fG" }, "source": [ "## 安装" ] }, { "cell_type": "markdown", "metadata": { "id": "IYwV-UkNNzFp" }, "source": [ "无论是Windows、Linux还是macOS,HanLP的安装只需一句话搞定:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "1Uf_u7ddMhUt" }, "outputs": [], "source": [ "!pip install hanlp -U" ] }, { "cell_type": "markdown", "metadata": { "id": "pp-1KqEOOJ4t" }, "source": [ "## 加载模型\n", "HanLP的工作流程是先加载模型,模型的标识符存储在`hanlp.pretrained`这个包中,按照NLP任务归类。" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "4M7ka0K5OMWU", "outputId": "9a1dc26a-786a-4dce-c013-7ae5017a8805" }, "outputs": [ { "data": { "text/plain": [ "{'OPEN_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH': 'https://file.hankcs.com/hanlp/mtl/open_tok_pos_ner_srl_dep_sdp_con_electra_small_20201223_035557.zip',\n", " 'OPEN_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH': 'https://file.hankcs.com/hanlp/mtl/open_tok_pos_ner_srl_dep_sdp_con_electra_base_20201223_201906.zip',\n", " 'CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH': 'https://file.hankcs.com/hanlp/mtl/close_tok_pos_ner_srl_dep_sdp_con_electra_small_20210111_124159.zip',\n", " 'CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH': 'https://file.hankcs.com/hanlp/mtl/close_tok_pos_ner_srl_dep_sdp_con_electra_base_20210111_124519.zip',\n", " 'CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ERNIE_GRAM_ZH': 'https://file.hankcs.com/hanlp/mtl/close_tok_pos_ner_srl_dep_sdp_con_ernie_gram_base_aug_20210904_145403.zip',\n", " 'UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_MT5_SMALL': 'https://file.hankcs.com/hanlp/mtl/ud_ontonotes_tok_pos_lem_fea_ner_srl_dep_sdp_con_mt5_small_20210228_123458.zip',\n", " 'UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_XLMR_BASE': 'https://file.hankcs.com/hanlp/mtl/ud_ontonotes_tok_pos_lem_fea_ner_srl_dep_sdp_con_xlm_base_20210602_211620.zip',\n", " 'NPCMJ_UD_KYOTO_TOK_POS_CON_BERT_BASE_CHAR_JA': 
'https://file.hankcs.com/hanlp/mtl/npcmj_ud_kyoto_tok_pos_ner_dep_con_srl_bert_base_char_ja_20210914_133742.zip'}" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import hanlp\n", "hanlp.pretrained.mtl.ALL # MTL多任务,具体任务见模型名称,语种见名称最后一个字段或相应语料库" ] }, { "cell_type": "markdown", "metadata": { "id": "BMW528wGNulM" }, "source": [ "调用`hanlp.load`进行加载,模型会自动下载到本地缓存。自然语言处理分为许多任务,分词只是最初级的一个。与其每个任务单独创建一个模型,不如利用HanLP的联合模型一次性完成多个任务:" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "0tmKBu7sNAXX", "outputId": "e0187328-c6d2-47fe-cf84-c5b44703940b" }, "outputs": [], "source": [ "HanLP = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH)" ] }, { "cell_type": "markdown", "metadata": { "id": "elA_UyssOut_" }, "source": [ "## 分词\n", "任务越少,速度越快。如指定仅执行分词,默认细粒度:" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 35 }, "id": "BqEmDMGGOtk3", "outputId": "387cbf30-4d70-44b1-d64b-b7a5c22ae31e" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "阿婆主 来到 北京 立方庭 参观 自然 语义 科技 公司 。\n" ] } ], "source": [ "HanLP('阿婆主来到北京立方庭参观自然语义科技公司。', tasks='tok').pretty_print()" ] }, { "cell_type": "markdown", "metadata": { "id": "jj1Jk-2sPHYx" }, "source": [ "执行粗颗粒度分词:" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 35 }, "id": "1goEC7znPNkI", "outputId": "ddf15a17-2f5d-4bc3-d145-908fb6176552" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "阿婆主 来到 北京立方庭 参观 自然语义科技公司 。\n" ] } ], "source": [ "HanLP('阿婆主来到北京立方庭参观自然语义科技公司。', tasks='tok/coarse').pretty_print()" ] }, { "cell_type": "markdown", "metadata": { "id": "wxctCigrTKu-" }, "source": [ "同时执行细粒度和粗粒度分词:" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": 
"Zo08uquCTFSk", "outputId": "bf24a01a-a09b-4b78-fdec-2bb705b4becb" }, "outputs": [ { "data": { "text/plain": [ "{'tok/fine': ['阿婆主', '来到', '北京', '立方庭', '参观', '自然', '语义', '科技', '公司', '。'],\n", " 'tok/coarse': ['阿婆主', '来到', '北京立方庭', '参观', '自然语义科技公司', '。']}" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "HanLP('阿婆主来到北京立方庭参观自然语义科技公司。', tasks='tok*')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "`coarse`为粗分,`fine`为细分。" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### 注意\n", "Native API的输入单位限定为句子,需使用[多语种分句模型](https://github.com/hankcs/HanLP/blob/master/plugins/hanlp_demo/hanlp_demo/sent_split.py)或[基于规则的分句函数](https://github.com/hankcs/HanLP/blob/master/hanlp/utils/rules.py#L19)先行分句。RESTful同时支持全文、句子、已分词的句子。除此之外,RESTful和native两种API的语义设计完全一致,用户可以无缝互换。" ] }, { "cell_type": "markdown", "metadata": { "id": "suUL042zPpLj" }, "source": [ "## 自定义词典\n", "自定义词典为分词任务的成员变量,要操作自定义词典,先获取分词任务,以细分标准为例:" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "AzYShIssP6kq", "outputId": "7f07897c-8a97-4193-855d-d9e296581d0c" }, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tok = HanLP['tok/fine']\n", "tok" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "自定义词典为分词任务的成员变量:" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "id": "1q4MUpgVQNlu", "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/plain": [ "(None, None)" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tok.dict_combine, tok.dict_force" ] }, { "cell_type": "markdown", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "2zZkH9tRQOoi", "outputId": "c231c35b-1a5f-4b54-e5c3-8680d2cc1515", "pycharm": { "name": "#%% md\n" } }, "source": [ "HanLP支持合并和强制两种优先级的自定义词典,以满足不同场景的需求。" ] }, { "cell_type": 
"markdown", "metadata": { "id": "F-9gAeIVQUFG", "pycharm": { "name": "#%% md\n" } }, "source": [ "不挂词典:" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "F8M8cyBrQduw", "outputId": "c3bf7ec5-b1d4-4207-a979-2c85754c7cd7", "pycharm": { "name": "#%%\n" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "商品 和 服务 项目\n" ] } ], "source": [ "tok.dict_force = tok.dict_combine = None\n", "HanLP(\"商品和服务项目\", tasks='tok/fine').pretty_print()" ] }, { "cell_type": "markdown", "metadata": { "id": "DDqQxqQaTayv", "pycharm": { "name": "#%% md\n" } }, "source": [ "### 强制模式\n", "强制模式优先输出正向最长匹配到的自定义词条(慎用,详见[《自然语言处理入门》](http://nlp.hankcs.com/book.php)第二章):" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "bjnEqDaATdVr", "outputId": "3a282acc-5716-45e4-e1e2-96eefb8ee342", "pycharm": { "name": "#%%\n" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "商品 和服 务 项目\n" ] } ], "source": [ "tok.dict_force = {'和服', '服务项目'}\n", "HanLP(\"商品和服务项目\", tasks='tok/fine').pretty_print()" ] }, { "cell_type": "markdown", "metadata": { "id": "ldKAnVoSTgxb", "pycharm": { "name": "#%% md\n" } }, "source": [ "与大众的朴素认知不同,词典优先级最高未必是好事,极有可能匹配到不该分出来的自定义词语,导致歧义。自定义词语越长,越不容易发生歧义。这启发我们将强制模式拓展为强制校正功能。\n", "\n", "强制校正原理相似,但会将匹配到的自定义词条替换为相应的分词结果:" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "bwIu0f6wTgbF", "outputId": "b941b079-5202-420a-e7f3-8f1617a2545c", "pycharm": { "name": "#%%\n" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "商品 和 服务 项目\n" ] } ], "source": [ "tok.dict_force = {'和服务': ['和', '服务']}\n", "HanLP(\"商品和服务项目\", tasks='tok/fine').pretty_print()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 合并模式\n", "合并模式的优先级低于统计模型,即`dict_combine`会在统计模型的分词结果上执行最长匹配并合并匹配到的词条。一般情况下,推荐使用该模式。" ] }, { "cell_type": 
"code", "execution_count": 11, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "商品 和 服务项目\n" ] } ], "source": [ "tok.dict_force = None\n", "tok.dict_combine = {'和服', '服务项目'}\n", "HanLP(\"商品和服务项目\", tasks='tok/fine').pretty_print()" ] }, { "cell_type": "markdown", "metadata": { "id": "9aRzEeRvTlRr" }, "source": [ "需要算法基础才能理解,初学者可参考[《自然语言处理入门》](http://nlp.hankcs.com/book.php)。\n", "#### 空格单词" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "含有空格、制表符等(Transformer tokenizer去掉的字符)的词语需要用`tuple`的形式提供:" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/plain": [ "['如何', '评价', 'iPad Pro', '?', 'iPad Pro', '有', '2个空格']" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tok.dict_combine = {('iPad', 'Pro'), '2个空格'}\n", "HanLP(\"如何评价iPad Pro ?iPad Pro有2个空格\", tasks='tok/fine')['tok/fine']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "聪明的用户请继续阅读,`tuple`词典中的字符串其实等价于该字符串的所有可能的切分方式:" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/plain": [ "dict_keys([('2', '个', '空格'), ('2', '个', '空', '格'), ('2', '个空', '格'), ('2', '个空格'), ('2个', '空', '格'), ('2个', '空格'), ('2个空格',), ('iPad', 'Pro'), ('2个空', '格')])" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dict(tok.dict_combine.config[\"dictionary\"]).keys()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 单词位置" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "HanLP支持输出每个单词在文本中的原始位置,以便用于搜索引擎等场景。在词法分析中,非语素字符(空格、换行、制表符等)会被剔除,此时需要额外的位置信息才能定位每个单词:" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[['2021 年', 0, 6], ['HanLPv2.1', 7, 16], ['为', 17, 18], ['生产', 18, 
20], ['环境', 20, 22], ['带来', 22, 24], ['次', 24, 25], ['世代', 25, 27], ['最', 27, 28], ['先进', 28, 30], ['的', 30, 31], ['多', 31, 32], ['语种', 32, 34], ['NLP', 34, 37], ['技术', 37, 39], ['。', 39, 40]]\n" ] } ], "source": [ "tok.config.output_spans = True\n", "sent = '2021 年\\nHanLPv2.1 为生产环境带来次世代最先进的多语种NLP技术。'\n", "word_offsets = HanLP(sent, tasks='tok/fine')['tok/fine']\n", "print(word_offsets)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "返回格式为三元组(单词,单词的起始下标,单词的终止下标),下标以字符级别计量。" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "for word, begin, end in word_offsets:\n", " assert word == sent[begin:end]" ] } ], "metadata": { "accelerator": "GPU", "colab": { "authorship_tag": "ABX9TyNRpO7rdchCK1UmB0nQmPrG", "collapsed_sections": [], "include_colab_link": true, "name": "tok_mtl.ipynb", "provenance": [] }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.12" } }, "nbformat": 4, "nbformat_minor": 1 } ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/tok_restful.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": { "id": "WfGpInivS0fG" }, "source": [ "

点击下列图标在线运行HanLP

\n", "
\n", "\t\"Open\n", "\t\"Open\n", "
\n", "\n", "## 安装" ] }, { "cell_type": "markdown", "metadata": { "id": "IYwV-UkNNzFp" }, "source": [ "无论是Windows、Linux还是macOS,HanLP的安装只需一句话搞定:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "1Uf_u7ddMhUt" }, "outputs": [], "source": [ "pip install hanlp_restful -U" ] }, { "cell_type": "markdown", "metadata": { "id": "pp-1KqEOOJ4t" }, "source": [ "## 创建客户端" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "id": "0tmKBu7sNAXX" }, "outputs": [], "source": [ "from hanlp_restful import HanLPClient\n", "HanLP = HanLPClient('https://www.hanlp.com/api', auth=None, language='zh') # auth不填则匿名,zh中文,mul多语种" ] }, { "cell_type": "markdown", "metadata": { "id": "EmZDmLn9aGxG" }, "source": [ "#### 申请秘钥\n", "由于服务器算力有限,匿名用户每分钟限2次调用。如果你需要更多调用次数,[建议申请免费公益API秘钥auth](https://bbs.hanlp.com/t/hanlp2-1-restful-api/53)。" ] }, { "cell_type": "markdown", "metadata": { "id": "elA_UyssOut_" }, "source": [ "## 分词\n", "HanLP线上模型训练自`9970`万字的大型综合语料库,覆盖新闻、社交媒体、金融、法律等多个领域,是已知范围内**全世界最大**的中文分词语料库。语料库规模决定实际效果,面向生产环境的语料库应当在千万字量级。自然语义的语言学专家一直在持续标注该语料库,与时俱进保持最先进的分词质量。\n", "在分词标准上,HanLP提供细粒度和粗粒度两种颗粒度,细粒度适合搜索引擎业务,粗粒度适合文本挖掘业务。\n", "### 细粒度分词\n", "默认细粒度:" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[['商品', '和', '服务', '。'],\n", " ['阿婆主', '来到', '北京', '立方庭', '参观', '自然', '语义', '科技', '公司', '。']]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "HanLP.tokenize('商品和服务。阿婆主来到北京立方庭参观自然语义科技公司。')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "用户也可以直接将`HanLP`当作函数调用,并且打印漂亮的分词结果:" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 35 }, "id": "BqEmDMGGOtk3", "outputId": "6fbb3eac-df26-4a55-8ba9-975d6cede227" }, "outputs": [ { "data": { "text/html": [ "
商品 和 服务 。

阿婆主 来到 北京 立方庭 参观 自然 语义 科技 公司 。
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "HanLP('商品和服务。阿婆主来到北京立方庭参观自然语义科技公司。', tasks='tok').pretty_print()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "返回类型为[Document](https://hanlp.hankcs.com/docs/api/common/document.html),是`dict`的子类,拓展了很多操作各种语言学结构的方法。\n", "\n", "两个接口都会对文本进行分句,所以返回的结果一定是句子的列表。推荐在不超过服务器允许的最大长度的前提下,尽量传入整篇文章,以提高分词速度。" ] }, { "cell_type": "markdown", "metadata": { "id": "jj1Jk-2sPHYx" }, "source": [ "### 粗粒度分词\n", "执行粗颗粒度分词:" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[['商品', '和', '服务', '。'], ['阿婆主', '来到', '北京', '立方庭', '参观', '自然语义科技公司']]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "HanLP.tokenize('商品和服务。阿婆主来到北京立方庭参观自然语义科技公司', coarse=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "或者直接当函数调用:" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 35 }, "id": "1goEC7znPNkI", "outputId": "ddf15a17-2f5d-4bc3-d145-908fb6176552" }, "outputs": [ { "data": { "text/html": [ "
阿婆主 来到 北京 立方庭 参观 自然语义科技公司 。
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "HanLP('阿婆主来到北京立方庭参观自然语义科技公司。', tasks='tok/coarse').pretty_print()" ] }, { "cell_type": "markdown", "metadata": { "id": "wxctCigrTKu-" }, "source": [ "### 同时执行细粒度和粗粒度分词" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Zo08uquCTFSk", "outputId": "bf24a01a-a09b-4b78-fdec-2bb705b4becb" }, "outputs": [ { "data": { "text/plain": [ "{'tok/fine': [['阿婆主', '来到', '北京', '立方庭', '参观', '自然', '语义', '科技', '公司', '。']],\n", " 'tok/coarse': [['阿婆主', '来到', '北京', '立方庭', '参观', '自然语义科技公司', '。']]}" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "HanLP('阿婆主来到北京立方庭参观自然语义科技公司。', tasks='tok*')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "`fine`为细分,`coarse`为粗分。" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 多语种分词\n", "得益于语言无关的设计,HanLP支持包括简繁中英日俄法德在内的104种语言上的分词。这一切,只需指定`language='mul'`即可实现。" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
In 2021 , HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environments .

2021 年 、 HanLPv2.1 は 次 世代 の 最 先端 多 言語 NLP 技術 を 本番 環境 に 導入 します 。

2021 年 HanLPv2.1 为 生产 环境 带来 次世代 最 先进的 多 语种 NLP 技术 。
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "HanLP(['In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environments.',\n", " '2021年、HanLPv2.1は次世代の最先端多言語NLP技術を本番環境に導入します。',\n", " '2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。'], tasks='tok', language='mul').pretty_print()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "自然语言处理分为许多任务,分词只是最初级的一个。也许大家只听说过中文分词,但HanLP并不局限于分词。HanLP的使命是普及最前沿的自然语言处理技术到生产环境,所以在其他教程中你会见到许多更高级的NLP任务以及相应的API用法。" ] } ], "metadata": { "accelerator": "GPU", "colab": { "collapsed_sections": [], "name": "tok_restful.ipynb", "provenance": [] }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.12" } }, "nbformat": 4, "nbformat_minor": 1 } ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/tok_stl.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": { "id": "WfGpInivS0fG" }, "source": [ "

点击下列图标在线运行HanLP

\n", "
\n", "\t\"Open\n", "\t\"Open\n", "
\n", "\n", "## 安装" ] }, { "cell_type": "markdown", "metadata": { "id": "IYwV-UkNNzFp" }, "source": [ "无论是Windows、Linux还是macOS,HanLP的安装只需一句话搞定:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "1Uf_u7ddMhUt" }, "outputs": [], "source": [ "!pip install hanlp -U" ] }, { "cell_type": "markdown", "metadata": { "id": "pp-1KqEOOJ4t" }, "source": [ "## 加载模型\n", "HanLP的工作流程是先加载模型,模型的标识符存储在`hanlp.pretrained`这个包中,按照NLP任务归类。" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "4M7ka0K5OMWU", "outputId": "f931579a-f5a8-487a-a89e-33d5477584c3" }, "outputs": [ { "data": { "text/plain": [ "{'SIGHAN2005_PKU_CONVSEG': 'https://file.hankcs.com/hanlp/tok/sighan2005-pku-convseg_20200110_153722.zip',\n", " 'SIGHAN2005_MSR_CONVSEG': 'https://file.hankcs.com/hanlp/tok/convseg-msr-nocrf-noembed_20200110_153524.zip',\n", " 'CTB6_CONVSEG': 'https://file.hankcs.com/hanlp/tok/ctb6_convseg_nowe_nocrf_20200110_004046.zip',\n", " 'PKU_NAME_MERGED_SIX_MONTHS_CONVSEG': 'https://file.hankcs.com/hanlp/tok/pku98_6m_conv_ngram_20200110_134736.zip',\n", " 'LARGE_ALBERT_BASE': 'https://file.hankcs.com/hanlp/tok/large_corpus_cws_albert_base_20211228_160926.zip',\n", " 'SIGHAN2005_PKU_BERT_BASE_ZH': 'https://file.hankcs.com/hanlp/tok/sighan2005_pku_bert_base_zh_20201231_141130.zip',\n", " 'COARSE_ELECTRA_SMALL_ZH': 'https://file.hankcs.com/hanlp/tok/coarse_electra_small_20220616_012050.zip',\n", " 'FINE_ELECTRA_SMALL_ZH': 'https://file.hankcs.com/hanlp/tok/fine_electra_small_20220615_231803.zip',\n", " 'CTB9_TOK_ELECTRA_SMALL': 'https://file.hankcs.com/hanlp/tok/ctb9_electra_small_20220215_205427.zip',\n", " 'CTB9_TOK_ELECTRA_BASE': 'http://download.hanlp.com/tok/extra/ctb9_tok_electra_base_20220426_111949.zip',\n", " 'CTB9_TOK_ELECTRA_BASE_CRF': 'http://download.hanlp.com/tok/extra/ctb9_tok_electra_base_crf_20220426_161255.zip',\n", " 'MSR_TOK_ELECTRA_BASE_CRF':
'http://download.hanlp.com/tok/extra/msra_crf_electra_base_20220507_113936.zip',\n", " 'UD_TOK_MMINILMV2L6': 'https://file.hankcs.com/hanlp/tok/ud_tok_mMiniLMv2L6_no_space_mul_20220619_091824.zip',\n", " 'UD_TOK_MMINILMV2L12': 'https://file.hankcs.com/hanlp/tok/ud_tok_mMiniLMv2L12_no_space_mul_20220619_091159.zip'}" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import hanlp\n", "hanlp.pretrained.tok.ALL # 语种见名称最后一个字段或相应语料库" ] }, { "cell_type": "markdown", "metadata": { "id": "BMW528wGNulM" }, "source": [ "调用`hanlp.load`进行加载,模型会自动下载到本地缓存。自然语言处理分为许多任务,分词只是最初级的一个。" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "0tmKBu7sNAXX", "outputId": "8977891f-9e64-4e39-8ce6-264a791541a3" }, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tok = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH)\n", "tok" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 进阶知识\n", "你可以通过加载不同的模型实现各种颗粒度、各种分词标准、各种领域的中文分词。其中,coarse和fine模型训练自`9970`万字的大型综合语料库,覆盖新闻、社交媒体、金融、法律等多个领域,是已知范围内**全世界最大**的中文分词语料库。语料库规模决定实际效果,面向生产环境的语料库应当在千万字量级。欢迎用户在自己的语料上[训练或微调模型](https://github.com/hankcs/HanLP/tree/master/plugins/hanlp_demo/hanlp_demo/zh/train)以适应新领域。语料库标注标准决定最终的分词标准,模型的准确率决定多大程度上再现该分词标准。更多背景知识请参考[《自然语言处理入门》](http://nlp.hankcs.com/book.php)。" ] }, { "cell_type": "markdown", "metadata": { "id": "KYH1oEKkctuy" }, "source": [ "## 执行分词" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "uzex--zFcqKB", "outputId": "a4db6808-1039-4803-84af-2687cce0fa7b" }, "outputs": [ { "data": { "text/plain": [ "[['商品', '和', '服务', '。'], ['晓美焰', '来到', '北京立方庭', '参观', '自然语义科技公司']]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tok(['商品和服务。', '晓美焰来到北京立方庭参观自然语义科技公司'])" ] }, { "cell_type": 
"markdown", "metadata": {}, "source": [ "### 细分标准" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "你可以通过加载`FINE_ELECTRA_SMALL_ZH`模型实现细粒度中文分词:" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "tok_fine = hanlp.load(hanlp.pretrained.tok.FINE_ELECTRA_SMALL_ZH)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "无论哪个模型,分词器的接口是完全一致的:" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['晓美焰', '来到', '北京', '立方庭', '参观', '自然', '语义', '科技', '公司']" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tok_fine('晓美焰来到北京立方庭参观自然语义科技公司')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 无限长度\n", "众所周知,Transformer的输入有长度限制(通常是512)。幸运地是,HanLP的滑动窗口技巧完美地突破了该限制。只要你的内存(显存)足够,HanLP就可以处理无限长的句子。" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 并行分词\n", "无论是CPU还是GPU,同时传入多个句子都将并行分词。也就是说,仅花费1个句子的时间可以处理多个句子。然而工作研究中的文本通常是一篇文档,而不是许多句子。此时可以利用HanLP提供的分句功能和流水线模式优雅应对,既能处理长文本又能并行化。只需创建一个流水线`pipeline`,第一级管道分句,第二级管道分词:" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[['量体裁衣', ',', 'HanLP', '提供', 'RESTful', '和', 'native', '两种', 'API', '。'],\n", " ['两者', '在', '语义', '上', '保持', '一致', ',', '在', '代码', '上', '坚持', '开源', '。']]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "HanLP = hanlp.pipeline() \\\n", " .append(hanlp.utils.rules.split_sentence) \\\n", " .append(tok)\n", "HanLP('量体裁衣,HanLP提供RESTful和native两种API。两者在语义上保持一致,在代码上坚持开源。')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "返回结果是每个句子的分词`list`,如果要将它们合并到一个`list`里该怎么办呢?聪明的用户可能已经想到了,再加一级`lambda`管道:" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['量体裁衣', ',', 'HanLP', '提供', 'RESTful', '和', 'native', '两种', 'API', '。', '两者', '在', '语义', '上', '保持', '一致', ',', 
'在', '代码', '上', '坚持', '开源', '。']\n" ] } ], "source": [ "HanLP.append(lambda sents: sum(sents, []))\n", "print(HanLP('量体裁衣,HanLP提供RESTful和native两种API。两者在语义上保持一致,在代码上坚持开源。'))" ] }, { "cell_type": "markdown", "metadata": { "id": "suUL042zPpLj" }, "source": [ "## 自定义词典" ] }, { "cell_type": "markdown", "metadata": { "id": "1q4MUpgVQNlu" }, "source": [ "智者千虑,必有一失。模型偶尔也会犯错误,比如某个旧版本模型在不挂词典时会犯以下错误(最新版已经修复):" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "2zZkH9tRQOoi", "outputId": "a74db6c6-0a71-411c-de78-60621a43eded", "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "['首相', '和', '川', '普通', '电话']" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tok = hanlp.load('https://file.hankcs.com/hanlp/tok/coarse_electra_small_20220220_013548.zip')\n", "tok.dict_force = tok.dict_combine = None\n", "tok(\"首相和川普通电话\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "上面分词任务两个成员变量`dict_force`和`dict_combine`为自定义词典:" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 35 }, "id": "AzYShIssP6kq", "outputId": "ce3bb1aa-5042-47d7-8ac9-7ed0fd478c77" }, "outputs": [ { "data": { "text/plain": [ "(None, None)" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tok.dict_combine, tok.dict_force" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "HanLP支持合并和强制两种优先级的自定义词典,以满足不同场景的需求。" ] }, { "cell_type": "markdown", "metadata": { "id": "F-9gAeIVQUFG" }, "source": [ "### 强制模式\n", "强制模式`dict_force`优先输出正向最长匹配到的自定义词条,在这个案例中,用户的第一反应也许是将`川普`加入到`dict_force`中,强制分词器输出`川普`:" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "F8M8cyBrQduw", "outputId": "c156513c-d13c-47f1-bc3a-c73a8649ddb1" }, "outputs": [ { "data": { "text/plain": [ "[['首相', '和', '川普', '通', '电话'],\n", " 
['银', '川普', '通人', '与', '川普', '通', '电话', '讲', '四', '川普', '通话']]" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tok.dict_force = {'川普'}\n", "tok([\"首相和川普通电话\", \"银川普通人与川普通电话讲四川普通话\"])" ] }, { "cell_type": "markdown", "metadata": { "id": "DDqQxqQaTayv" }, "source": [ "然而与大众的朴素认知不同,词典优先级最高未必是好事。极有可能匹配到不该分出来的自定义词语,导致歧义。即便是将`普通人`或`普通话`加入到词典中也无济于事,因为在正向最长匹配第二个句子的过程中,会匹配到`川普`而不会匹配后两者。这也解释了为什么自定义词典中存在的词可能分不出来:当歧义发生时,两个词语发生交叉冲突,自然有所取舍,无法同时输出两者。那种同时输出句子或长单词中所有可能的单词,并且允许单词交叉的算法,并非分词,而是多模式字符串匹配。你需要基本的算法知识才能理解这一点,总之一般情况下应当慎用强制模式,详见[《自然语言处理入门》](http://nlp.hankcs.com/book.php)第二章。" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "自定义词语越长,越不容易发生歧义。这启发我们将强制模式拓展为强制校正功能。强制校正原理相似,但会将匹配到的自定义词条替换为相应的分词结果:" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "bjnEqDaATdVr", "outputId": "2e694aed-a71f-4a28-d981-0767d9e263e9" }, "outputs": [ { "data": { "text/plain": [ "[['首相', '和', '川普', '通', '电话'],\n", " ['银川', '普通人', '与', '川普', '通', '电话', '讲', '四川', '普通话']]" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tok.dict_force = {'川普通电话': ['川普', '通', '电话']}\n", "tok([\"首相和川普通电话\", \"银川普通人与川普通电话讲四川普通话\"])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "强制校正是一种短平快的规则补丁,需要针对每种可能产生歧义的语境,截取一个片段执行校正。当你积累了很多歧义片段与相应的校正补丁后,其实就应该考虑微调模型。微调可以让模型增量式学习这些歧义语境,摆脱对补丁规则的依赖,同时举一反三应对新的语境。从错误中积累经验,用经验预测未来,这就是机器学习与人工智能的魅力。" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "事实上,“川普通电话”这种例子不需要词典即可分对。只需提供给神经网络足够的上下文线索(这也是真实文本所具备的),告诉神经网络“川普是美国总统”:" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[['首相', '和', '川普', '通', '电话', ',', '川普', '是', '美国', '总统', '。'], ['银川', '普通人', '与', '川普', '通', '电话', '讲', '四川', '普通话', ',', '川普', '是', '美国', '总统', '。']]\n" ] } ], "source": [ "tok.dict_force = tok.dict_combine = None\n", 
"print(tok([\"首相和川普通电话,川普是美国总统。\", \"银川普通人与川普通电话讲四川普通话,川普是美国总统。\"]))" ] }, { "cell_type": "markdown", "metadata": { "id": "9aRzEeRvTlRr" }, "source": [ "在上面的例子中,虽然词典对“川普”没有施加任何影响,但是更丰富的上下文促进了神经网络对语境的理解,使其得出了正确的结果。深度学习中的神经网络似乎展示了些许智能,感兴趣的初学者可参考[《自然语言处理入门》](http://nlp.hankcs.com/book.php)。" ] }, { "cell_type": "markdown", "metadata": { "id": "ldKAnVoSTgxb" }, "source": [ "### 合并模式\n", "合并模式的优先级低于统计模型,即`dict_combine`会在统计模型的分词结果上执行最长匹配并合并匹配到的词条。一般情况下,推荐使用该模式。比如,将“美国总统”加入`dict_combine`后会合并`['美国', '总统']`,而不会合并`['美国', '总', '统筹部']`为`['美国总统', '筹部']`:" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "bwIu0f6wTgbF", "outputId": "22807b6a-3472-431b-d1e3-95f6b761c84c" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[['首相', '和', '川普', '通', '电话', ',', '川普', '是', '美国总统', '。'], ['银川', '普通人', '与', '川普', '通', '电话', '讲', '四川', '普通话', ',', '川普', '是', '美国总统', '。'], ['美国', '总统筹部', '部长', '是', '谁', '?']]\n" ] } ], "source": [ "tok.dict_force = None\n", "tok.dict_combine = {'美国总统'}\n", "print(tok([\"首相和川普通电话,川普是美国总统。\", \"银川普通人与川普通电话讲四川普通话,川普是美国总统。\", \"美国总统筹部部长是谁?\"]))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### 空格单词" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "含有空格、制表符等(Transformer tokenizer去掉的字符)的词语需要用`tuple`的形式提供:" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['如何', '评价', 'iPad Pro', '?', 'iPad Pro', '有', '2个空格']" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tok.dict_combine = {('iPad', 'Pro'), '2个空格'}\n", "tok(\"如何评价iPad Pro ?iPad Pro有2个空格\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "聪明的用户请继续阅读,`tuple`词典中的字符串其实等价于该字符串的所有可能的切分方式:" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "dict_keys([('iPad', 'Pro'), ('2个空格',), ('2', '个', '空格'), ('2', '个', '空', '格'), 
('2', '个空格'), ('2', '个空', '格'), ('2个', '空', '格'), ('2个', '空格'), ('2个空', '格')])" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dict(tok.dict_combine.config[\"dictionary\"]).keys()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 单词位置" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "HanLP支持输出每个单词在文本中的原始位置,以便用于搜索引擎等场景。在词法分析中,非语素字符(空格、换行、制表符等)会被剔除,此时需要额外的位置信息才能定位每个单词:" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[['2021', 0, 4], ['年', 5, 6], ['HanLPv2.1', 7, 16], ['为', 17, 18], ['生产', 18, 20], ['环境', 20, 22], ['带来', 22, 24], ['次', 24, 25], ['世代', 25, 27], ['最', 27, 28], ['先进', 28, 30], ['的', 30, 31], ['多', 31, 32], ['语种', 32, 34], ['NLP', 34, 37], ['技术', 37, 39], ['。', 39, 40]]\n" ] } ], "source": [ "tok.config.output_spans = True\n", "sent = '2021 年\\nHanLPv2.1 为生产环境带来次世代最先进的多语种NLP技术。'\n", "word_offsets = tok(sent)\n", "print(word_offsets)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "返回格式为三元组(单词,单词的起始下标,单词的终止下标),下标以字符级别计量。" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "for word, begin, end in word_offsets:\n", " assert word == sent[begin:end]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 多语种支持" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "得益于语言无关的设计,以及大规模多语种语料库,最近HanLP发布了支持[130种语言](https://hanlp.hankcs.com/docs/api/hanlp/pretrained/tok.html#hanlp.pretrained.tok.UD_TOK_MMINILMV2L12)的单任务分词器。用法与中文分词器相同:" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "mul = hanlp.load(hanlp.pretrained.tok.UD_TOK_MMINILMV2L6)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[['In', '2021', ',', 'HanLPv2.1', 'delivers', 'state-of-the-art', 'multilingual', 'NLP', 'techniques', 'to', 
'production', 'environments', '.'], ['2021年', '、', 'HanLPv2.1', 'は', '次世代', 'の', '最', '先端', '多', '言語', 'NLP', '技術', 'を', '本番', '環境', 'に', '導入', 'し', 'ます', '。'], ['2021年', 'HanLPv2.1', '为', '生产', '环境', '带来', '次', '世代', '最', '先进', '的', '多语种', 'NLP', '技术', '。'], ['奈須きのこ', 'は', '1973年', '11月', '28日', 'に', '千葉', '県', '円空山', 'で', '生まれ', '、', 'ゲーム', '制作', '会社', '「', 'ノーツ', '」', 'の', '設立', '者', 'だ', '。']]\n" ] } ], "source": [ "print(mul([\n", " 'In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environments.',\n", " '2021年、HanLPv2.1は次世代の最先端多言語NLP技術を本番環境に導入します。',\n", " '2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。',\n", " '奈須きのこは1973年11月28日に千葉県円空山で生まれ、ゲーム制作会社「ノーツ」の設立者だ。'\n", "]))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "目前,多语种分词器的效果并不如单语种好。欢迎在你自己的单语种语料上自行训练新模型,也欢迎开源你的语料和模型。" ] } ], "metadata": { "accelerator": "GPU", "colab": { "authorship_tag": "ABX9TyPxXzYAXgLUW5uKV7v0/2iP", "collapsed_sections": [], "name": "tok_stl.ipynb", "provenance": [] }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.12" } }, "nbformat": 4, "nbformat_minor": 1 } ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/train/__init__.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2020-12-31 20:12 ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/train/finetune_ner.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2023-10-18 18:49 import os import hanlp from hanlp.components.ner.transformer_ner import TransformerNamedEntityRecognizer from tests import cdroot cdroot() your_training_corpus = 
'data/ner/finetune/word_to_iobes.tsv' your_development_corpus = your_training_corpus # Use a different one in reality save_dir = 'data/ner/finetune/model' if not os.path.exists(your_training_corpus): os.makedirs(os.path.dirname(your_training_corpus), exist_ok=True) with open(your_training_corpus, 'w') as out: out.write( '''训练\tB-NLP 语料\tE-NLP 为\tO IOBES\tO 格式\tO ''' ) ner = TransformerNamedEntityRecognizer() if not os.path.exists(save_dir): print('Start fine-tuning ') ner.fit( trn_data=your_training_corpus, dev_data=your_development_corpus, save_dir=save_dir, epochs=50, # Since the corpus is small, overfit it finetune=hanlp.pretrained.ner.MSRA_NER_ELECTRA_SMALL_ZH, # You MUST set the same parameters with the fine-tuning model: average_subwords=True, transformer='hfl/chinese-electra-180g-small-discriminator', ) else: print('Load fine-tuned model') ner = hanlp.load(save_dir) HanLP = hanlp.pipeline()\ .append(hanlp.load(hanlp.pretrained.tok.FINE_ELECTRA_SMALL_ZH), output_key='tok')\ .append(ner, output_key='ner') HanLP(['训练语料为IOBES格式', '晓美焰来到北京立方庭参观自然语义科技公司。']).pretty_print() ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/train/open_base.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2020-12-03 14:24 from hanlp_demo import block_windows from hanlp.common.dataset import SortingSamplerBuilder from hanlp.common.transform import NormalizeCharacter from hanlp.components.mtl.multi_task_learning import MultiTaskLearning from hanlp.components.mtl.tasks.constituency import CRFConstituencyParsing from hanlp.components.mtl.tasks.dep import BiaffineDependencyParsing from hanlp.components.mtl.tasks.ner.tag_ner import TaggingNamedEntityRecognition from hanlp.components.mtl.tasks.pos import TransformerTagging from hanlp.components.mtl.tasks.sdp import BiaffineSemanticDependencyParsing from hanlp.components.mtl.tasks.srl.bio_srl import SpanBIOSemanticRoleLabeling from 
hanlp.components.mtl.tasks.tok.tag_tok import TaggingTokenization
from hanlp.datasets.ner.msra import MSRA_NER_TOKEN_LEVEL_SHORT_IOBES_TRAIN, MSRA_NER_TOKEN_LEVEL_SHORT_IOBES_DEV, \
    MSRA_NER_TOKEN_LEVEL_SHORT_IOBES_TEST
from hanlp.datasets.parsing.ctb8 import CTB8_POS_TRAIN, CTB8_POS_DEV, CTB8_POS_TEST, CTB8_SD330_TEST, CTB8_SD330_DEV, \
    CTB8_SD330_TRAIN, CTB8_CWS_TRAIN, CTB8_CWS_DEV, CTB8_CWS_TEST, CTB8_BRACKET_LINE_NOEC_TRAIN, \
    CTB8_BRACKET_LINE_NOEC_DEV, CTB8_BRACKET_LINE_NOEC_TEST
from hanlp.datasets.parsing.semeval16 import SEMEVAL2016_TEXT_TRAIN_CONLLU, SEMEVAL2016_TEXT_TEST_CONLLU, \
    SEMEVAL2016_TEXT_DEV_CONLLU
from hanlp.datasets.srl.ontonotes5.chinese import ONTONOTES5_CONLL12_CHINESE_TEST, ONTONOTES5_CONLL12_CHINESE_DEV, \
    ONTONOTES5_CONLL12_CHINESE_TRAIN
from hanlp.layers.embeddings.contextual_word_embedding import ContextualWordEmbedding
from hanlp.layers.transformers.relative_transformer import RelativeTransformerEncoder
from hanlp.utils.lang.zh.char_table import HANLP_CHAR_TABLE_JSON
from hanlp.utils.log_util import cprint
from tests import cdroot

# NOTE(review): cdroot comes from the `tests` package; presumably it changes the working
# directory to the repository root so the relative `data/...` paths resolve - confirm.
cdroot()
# Task table handed to MultiTaskLearning.fit below, keyed by task name. Each task receives
# its train/dev/test corpora, a SortingSamplerBuilder and task-specific options; every
# task except 'tok' declares dependencies='tok'.
tasks = {
    # Character-level BMES tagging on the CTB8 CWS splits; the 'token' field goes
    # through NormalizeCharacter with HANLP_CHAR_TABLE_JSON.
    'tok': TaggingTokenization(
        CTB8_CWS_TRAIN,
        CTB8_CWS_DEV,
        CTB8_CWS_TEST,
        SortingSamplerBuilder(batch_size=32),
        max_seq_len=510,
        hard_constraint=True,
        char_level=True,
        tagging_scheme='BMES',
        lr=1e-3,
        transform=NormalizeCharacter(HANLP_CHAR_TABLE_JSON, 'token'),
    ),
    # Part-of-speech tagging on the CTB8 POS splits.
    'pos': TransformerTagging(
        CTB8_POS_TRAIN,
        CTB8_POS_DEV,
        CTB8_POS_TEST,
        SortingSamplerBuilder(batch_size=32),
        hard_constraint=True,
        max_seq_len=510,
        char_level=True,
        dependencies='tok',
        lr=1e-3,
    ),
    # Named entity recognition on the MSRA token-level IOBES splits, with an extra
    # RelativeTransformerEncoder as secondary encoder.
    # NOTE(review): 768 presumably matches the hidden size of the base encoder passed
    # to fit() below - confirm.
    'ner': TaggingNamedEntityRecognition(
        MSRA_NER_TOKEN_LEVEL_SHORT_IOBES_TRAIN,
        MSRA_NER_TOKEN_LEVEL_SHORT_IOBES_DEV,
        MSRA_NER_TOKEN_LEVEL_SHORT_IOBES_TEST,
        SortingSamplerBuilder(batch_size=32),
        lr=1e-3,
        secondary_encoder=RelativeTransformerEncoder(768, k_as_x=True),
        dependencies='tok',
    ),
    # Semantic role labeling on the OntoNotes 5 (CoNLL-2012) Chinese splits, CRF enabled;
    # the sampler is additionally given batch_max_tokens=2048.
    'srl': SpanBIOSemanticRoleLabeling(
        ONTONOTES5_CONLL12_CHINESE_TRAIN,
        ONTONOTES5_CONLL12_CHINESE_DEV,
        ONTONOTES5_CONLL12_CHINESE_TEST,
        SortingSamplerBuilder(batch_size=32, batch_max_tokens=2048),
        lr=1e-3,
        crf=True,
        dependencies='tok',
    ),
    # Dependency parsing on the CTB8 SD330 splits.
    'dep': BiaffineDependencyParsing(
        CTB8_SD330_TRAIN,
        CTB8_SD330_DEV,
        CTB8_SD330_TEST,
        SortingSamplerBuilder(batch_size=32),
        lr=1e-3,
        tree=True,
        punct=True,
        dependencies='tok',
    ),
    # Semantic dependency parsing on the SemEval-2016 text splits.
    'sdp': BiaffineSemanticDependencyParsing(
        SEMEVAL2016_TEXT_TRAIN_CONLLU,
        SEMEVAL2016_TEXT_DEV_CONLLU,
        SEMEVAL2016_TEXT_TEST_CONLLU,
        SortingSamplerBuilder(batch_size=32),
        lr=1e-3,
        apply_constraint=True,
        punct=True,
        dependencies='tok',
    ),
    # Constituency parsing on the CTB8 bracketed (NOEC) splits.
    'con': CRFConstituencyParsing(
        CTB8_BRACKET_LINE_NOEC_TRAIN,
        CTB8_BRACKET_LINE_NOEC_DEV,
        CTB8_BRACKET_LINE_NOEC_TEST,
        SortingSamplerBuilder(batch_size=32),
        lr=1e-3,
        dependencies='tok',
    )
}
mtl = MultiTaskLearning()
save_dir = 'data/model/mtl/open_tok_pos_ner_srl_dep_sdp_con_electra_base'
# Positional arguments: the shared contextual embedding over the 'token' field, the task
# table, the output directory, and 30.
# NOTE(review): 30 is presumably the number of epochs - confirm against
# MultiTaskLearning.fit.
mtl.fit(
    ContextualWordEmbedding('token',
                            "hfl/chinese-electra-180g-base-discriminator",
                            average_subwords=True,
                            max_sequence_length=512,
                            word_dropout=.1),
    tasks,
    save_dir,
    30,
    lr=1e-3,
    encoder_lr=5e-5,
    grad_norm=1,
    gradient_accumulation=2,
    eval_trn=False,
)
cprint(f'Model saved in [cyan]{save_dir}[/cyan]')
# Call evaluate() and load() on the saved directory, then print the output for one
# sample sentence.
mtl.evaluate(save_dir)
mtl.load(save_dir)
print(mtl('华纳音乐旗下的新垣结衣在12月21日于日本武道馆举办歌手出道活动'))

================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/train/open_small.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-03 14:24
from hanlp_demo import block_windows
from hanlp.common.dataset import SortingSamplerBuilder
from hanlp.common.transform import NormalizeCharacter
from hanlp.components.mtl.multi_task_learning import MultiTaskLearning
from hanlp.components.mtl.tasks.constituency import CRFConstituencyParsing
from hanlp.components.mtl.tasks.dep import BiaffineDependencyParsing
from hanlp.components.mtl.tasks.ner.tag_ner import TaggingNamedEntityRecognition
from hanlp.components.mtl.tasks.pos import TransformerTagging
from hanlp.components.mtl.tasks.sdp import
BiaffineSemanticDependencyParsing
from hanlp.components.mtl.tasks.srl.bio_srl import SpanBIOSemanticRoleLabeling
from hanlp.components.mtl.tasks.tok.tag_tok import TaggingTokenization
from hanlp.datasets.ner.msra import MSRA_NER_TOKEN_LEVEL_SHORT_IOBES_TEST, MSRA_NER_TOKEN_LEVEL_SHORT_IOBES_DEV, \
    MSRA_NER_TOKEN_LEVEL_SHORT_IOBES_TRAIN
from hanlp.datasets.parsing.ctb8 import CTB8_POS_TRAIN, CTB8_POS_DEV, CTB8_POS_TEST, CTB8_SD330_TEST, CTB8_SD330_DEV, \
    CTB8_SD330_TRAIN, CTB8_CWS_TRAIN, CTB8_CWS_DEV, CTB8_CWS_TEST, CTB8_BRACKET_LINE_NOEC_TEST, \
    CTB8_BRACKET_LINE_NOEC_DEV, CTB8_BRACKET_LINE_NOEC_TRAIN
from hanlp.datasets.parsing.semeval16 import SEMEVAL2016_TEXT_TRAIN_CONLLU, SEMEVAL2016_TEXT_TEST_CONLLU, \
    SEMEVAL2016_TEXT_DEV_CONLLU
from hanlp.datasets.srl.ontonotes5.chinese import ONTONOTES5_CONLL12_CHINESE_TEST, ONTONOTES5_CONLL12_CHINESE_DEV, \
    ONTONOTES5_CONLL12_CHINESE_TRAIN
from hanlp.layers.embeddings.contextual_word_embedding import ContextualWordEmbedding
from hanlp.layers.transformers.relative_transformer import RelativeTransformerEncoder
from hanlp.utils.lang.zh.char_table import HANLP_CHAR_TABLE_JSON
from hanlp.utils.log_util import cprint
from tests import cdroot

# NOTE(review): cdroot comes from the `tests` package; presumably it changes the working
# directory to the repository root so the relative `data/...` paths resolve - confirm.
cdroot()
# Task table handed to MultiTaskLearning.fit below, keyed by task name. Each task receives
# its train/dev/test corpora, a SortingSamplerBuilder and task-specific options; every
# task except 'tok' declares dependencies='tok'.
tasks = {
    # Character-level BMES tagging on the CTB8 CWS splits; the 'token' field goes
    # through NormalizeCharacter with HANLP_CHAR_TABLE_JSON.
    'tok': TaggingTokenization(
        CTB8_CWS_TRAIN,
        CTB8_CWS_DEV,
        CTB8_CWS_TEST,
        SortingSamplerBuilder(batch_size=32),
        max_seq_len=510,
        hard_constraint=True,
        char_level=True,
        tagging_scheme='BMES',
        lr=1e-3,
        transform=NormalizeCharacter(HANLP_CHAR_TABLE_JSON, 'token'),
    ),
    # Part-of-speech tagging on the CTB8 POS splits.
    'pos': TransformerTagging(
        CTB8_POS_TRAIN,
        CTB8_POS_DEV,
        CTB8_POS_TEST,
        SortingSamplerBuilder(batch_size=32),
        hard_constraint=True,
        max_seq_len=510,
        char_level=True,
        dependencies='tok',
        lr=1e-3,
    ),
    # Named entity recognition on the MSRA token-level IOBES splits, with an extra
    # RelativeTransformerEncoder as secondary encoder.
    # NOTE(review): 256 presumably matches the hidden size of the small encoder passed
    # to fit() below - confirm.
    'ner': TaggingNamedEntityRecognition(
        MSRA_NER_TOKEN_LEVEL_SHORT_IOBES_TRAIN,
        MSRA_NER_TOKEN_LEVEL_SHORT_IOBES_DEV,
        MSRA_NER_TOKEN_LEVEL_SHORT_IOBES_TEST,
        SortingSamplerBuilder(batch_size=32),
        max_seq_len=510,
        hard_constraint=True,
        char_level=True,
        lr=1e-3,
        secondary_encoder=RelativeTransformerEncoder(256, k_as_x=True,
                                                     feedforward_dim=128),
        dependencies='tok',
    ),
    # Semantic role labeling on the OntoNotes 5 (CoNLL-2012) Chinese splits, CRF enabled;
    # the sampler is additionally given batch_max_tokens=1280.
    'srl': SpanBIOSemanticRoleLabeling(
        ONTONOTES5_CONLL12_CHINESE_TRAIN,
        ONTONOTES5_CONLL12_CHINESE_DEV,
        ONTONOTES5_CONLL12_CHINESE_TEST,
        SortingSamplerBuilder(batch_size=32, batch_max_tokens=1280),
        lr=1e-3,
        crf=True,
        dependencies='tok',
    ),
    # Dependency parsing on the CTB8 SD330 splits (tree=True and proj=True are both set).
    'dep': BiaffineDependencyParsing(
        CTB8_SD330_TRAIN,
        CTB8_SD330_DEV,
        CTB8_SD330_TEST,
        SortingSamplerBuilder(batch_size=32),
        lr=1e-3,
        tree=True,
        proj=True,
        punct=True,
        dependencies='tok',
    ),
    # Semantic dependency parsing on the SemEval-2016 text splits.
    'sdp': BiaffineSemanticDependencyParsing(
        SEMEVAL2016_TEXT_TRAIN_CONLLU,
        SEMEVAL2016_TEXT_DEV_CONLLU,
        SEMEVAL2016_TEXT_TEST_CONLLU,
        SortingSamplerBuilder(batch_size=32),
        lr=1e-3,
        apply_constraint=True,
        punct=True,
        dependencies='tok',
    ),
    # Constituency parsing on the CTB8 bracketed (NOEC) splits.
    'con': CRFConstituencyParsing(
        CTB8_BRACKET_LINE_NOEC_TRAIN,
        CTB8_BRACKET_LINE_NOEC_DEV,
        CTB8_BRACKET_LINE_NOEC_TEST,
        SortingSamplerBuilder(batch_size=32),
        lr=1e-3,
        dependencies='tok',
    )
}
mtl = MultiTaskLearning()
save_dir = 'data/model/mtl/open_tok_pos_ner_srl_dep_sdp_con_electra_small'
cprint(f'Model will be saved in [cyan]{save_dir}[/cyan]')
# Positional arguments: the shared contextual embedding over the 'token' field, the task
# table, the output directory, and 30.
# NOTE(review): 30 is presumably the number of epochs - confirm against
# MultiTaskLearning.fit.
mtl.fit(
    ContextualWordEmbedding('token',
                            "hfl/chinese-electra-180g-small-discriminator",
                            average_subwords=True,
                            max_sequence_length=512,
                            word_dropout=.1),
    tasks,
    save_dir,
    30,
    lr=1e-3,
    encoder_lr=5e-5,
    grad_norm=1,
    gradient_accumulation=1,
    eval_trn=False,
)
cprint(f'Model saved in [cyan]{save_dir}[/cyan]')
# Call evaluate() and load() on the saved directory, then pretty-print the output for
# one sample sentence.
mtl.evaluate(save_dir)
mtl.load(save_dir)
mtl('华纳音乐旗下的新垣结衣在12月21日于日本武道馆举办歌手出道活动').pretty_print()

================================================
FILE: plugins/hanlp_demo/hanlp_demo/zh/train_sota_bert_pku.py
================================================
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-08-11 02:47
from hanlp.common.dataset import SortingSamplerBuilder
from hanlp.components.tokenizers.transformer import TransformerTaggingTokenizer
from hanlp.datasets.tokenization.sighan2005.pku import SIGHAN2005_PKU_TRAIN_ALL, SIGHAN2005_PKU_TEST
from tests import cdroot

# NOTE(review): presumably changes the working directory to the repository root so the
# relative `data/...` path below resolves - confirm.
cdroot()
tokenizer = TransformerTaggingTokenizer()
save_dir = 'data/model/cws/sighan2005_pku_bert_base_96.7' tokenizer.fit( SIGHAN2005_PKU_TRAIN_ALL, SIGHAN2005_PKU_TEST, # Conventionally, no devset is used. See Tian et al. (2020). save_dir, 'bert-base-chinese', max_seq_len=300, char_level=True, hard_constraint=True, sampler_builder=SortingSamplerBuilder(batch_size=32), epochs=3, adam_epsilon=1e-6, warmup_steps=0.1, weight_decay=0.01, word_dropout=0.1, seed=1660853059, ) tokenizer.evaluate(SIGHAN2005_PKU_TEST, save_dir) print(f'Model saved in {save_dir}') ================================================ FILE: plugins/hanlp_demo/hanlp_demo/zh/tst_restful.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": { "id": "WfGpInivS0fG" }, "source": [ "

点击下列图标在线运行HanLP

\n", "
\n", "\t\"Open\n", "\t\"Open\n", "
\n", "\n", "## 安装" ] }, { "cell_type": "markdown", "metadata": { "id": "IYwV-UkNNzFp" }, "source": [ "无论是Windows、Linux还是macOS,HanLP的安装只需一句话搞定:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "1Uf_u7ddMhUt" }, "outputs": [], "source": [ "!pip install hanlp_restful -U" ] }, { "cell_type": "markdown", "metadata": { "id": "pp-1KqEOOJ4t" }, "source": [ "## 创建客户端" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "id": "0tmKBu7sNAXX" }, "outputs": [], "source": [ "from hanlp_restful import HanLPClient\n", "HanLP = HanLPClient('https://www.hanlp.com/api', auth=None, language='zh') # auth不填则匿名,zh中文,mul多语种" ] }, { "cell_type": "markdown", "metadata": { "id": "EmZDmLn9aGxG" }, "source": [ "#### 申请秘钥\n", "由于服务器算力有限,匿名用户每分钟限2次调用。如果你需要更多调用次数,[建议申请免费公益API秘钥auth](https://bbs.hanlp.com/t/hanlp2-1-restful-api/53)。" ] }, { "cell_type": "markdown", "metadata": { "id": "elA_UyssOut_" }, "source": [ "## 文本风格转换\n", "输入短文本以及目标风格,执行文本风格转换:" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 70 }, "id": "BqEmDMGGOtk3", "outputId": "2a0d392f-b99a-4a18-fc7f-754e2abe2e34" }, "outputs": [ { "data": { "text/plain": [ "['国家对中石油寄予巨大期望。', '要用创新推动高质量发展。']" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "HanLP.text_style_transfer(['国家对中石油抱有很大的期望.', '要用创新去推动高质量的发展。'],\n", " target_style='gov_doc')" ] } ], "metadata": { "accelerator": "GPU", "colab": { "collapsed_sections": [], "name": "tst_restful.ipynb", "provenance": [] }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.12" } }, "nbformat": 4, "nbformat_minor": 1 } ================================================ FILE: 
plugins/hanlp_demo/hanlp_demo/zh/tutorial.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": { "id": "BZPSH4VkK7J2" }, "source": [ "欢迎来到HanLP在线交互环境,这是一个Jupyter记事本,可以输入任意Python代码并在线执行。请点击左上角【Run】来运行这篇NLP教程。\n", "\n", "\n", "\n" ] }, { "cell_type": "markdown", "metadata": { "id": "XxPAiNwSK7J4" }, "source": [ "## 安装\n", "量体裁衣,HanLP提供**RESTful**(云端)和**native**(本地)两种API,分别面向轻量级和海量级两种场景。无论何种API何种语言,HanLP接口在语义上保持一致,你可以**任选一种**API来运行本教程。\n", "\n", "### 轻量级RESTful API\n", "\n", "仅数KB,适合敏捷开发、移动APP等场景。简单易用,无需GPU配环境,**强烈推荐**,秒速安装:\n" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "lgMa4kbfK7J5", "outputId": "5bb662d8-1665-4bcc-c517-70d1c4bc4837" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: hanlp_restful in /usr/local/lib/python3.7/dist-packages (0.0.7)\n", "Requirement already satisfied: hanlp-common in /usr/local/lib/python3.7/dist-packages (from hanlp_restful) (0.0.9)\n", "Requirement already satisfied: phrasetree in /usr/local/lib/python3.7/dist-packages (from hanlp-common->hanlp_restful) (0.0.8)\n" ] } ], "source": [ "!pip install hanlp_restful" ] }, { "cell_type": "markdown", "metadata": { "id": "N4G6GbNmK7J6" }, "source": [ "创建客户端,填入服务器地址:" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "id": "3XM9-3-oK7J6" }, "outputs": [], "source": [ "from hanlp_restful import HanLPClient\n", "HanLP = HanLPClient('https://www.hanlp.com/api', auth=None, language='zh') # auth不填则匿名,zh中文,mul多语种" ] }, { "cell_type": "markdown", "metadata": { "id": "pbeFH9jmK7J7" }, "source": [ "调用`parse`接口,传入一篇文章,得到HanLP精准的分析结果。" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "mNJPvZ_3K7J7", "outputId": "4048d0d6-2dad-4582-e327-f99338f8f72b" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\n", " 
\"tok/fine\": [\n", " [\"2021年\", \"HanLPv2.1\", \"为\", \"生产\", \"环境\", \"带来\", \"次\", \"世代\", \"最\", \"先进\", \"的\", \"多\", \"语种\", \"NLP\", \"技术\", \"。\"],\n", " [\"阿婆主\", \"来到\", \"北京\", \"立方庭\", \"参观\", \"自然\", \"语义\", \"科技\", \"公司\", \"。\"]\n", " ],\n", " \"tok/coarse\": [\n", " [\"2021年\", \"HanLPv2.1\", \"为\", \"生产环境\", \"带来\", \"次世代\", \"最\", \"先进\", \"的\", \"多语种\", \"NLP\", \"技术\", \"。\"],\n", " [\"阿婆主\", \"来到\", \"北京立方庭\", \"参观\", \"自然语义科技公司\", \"。\"]\n", " ],\n", " \"pos/ctb\": [\n", " [\"NT\", \"NR\", \"P\", \"NN\", \"NN\", \"VV\", \"JJ\", \"NN\", \"AD\", \"JJ\", \"DEG\", \"CD\", \"NN\", \"NR\", \"NN\", \"PU\"],\n", " [\"NN\", \"VV\", \"NR\", \"NR\", \"VV\", \"NN\", \"NN\", \"NN\", \"NN\", \"PU\"]\n", " ],\n", " \"pos/pku\": [\n", " [\"t\", \"nx\", \"p\", \"vn\", \"n\", \"v\", \"b\", \"n\", \"d\", \"a\", \"u\", \"a\", \"n\", \"nx\", \"n\", \"w\"],\n", " [\"n\", \"v\", \"ns\", \"ns\", \"v\", \"n\", \"n\", \"n\", \"n\", \"w\"]\n", " ],\n", " \"pos/863\": [\n", " [\"nt\", \"w\", \"p\", \"v\", \"n\", \"v\", \"a\", \"nt\", \"d\", \"a\", \"u\", \"a\", \"n\", \"ws\", \"n\", \"w\"],\n", " [\"n\", \"v\", \"ns\", \"n\", \"v\", \"n\", \"n\", \"n\", \"n\", \"w\"]\n", " ],\n", " \"ner/msra\": [\n", " [[\"2021年\", \"DATE\", 0, 1], [\"HanLPv2.1\", \"ORGANIZATION\", 1, 2]],\n", " [[\"北京立方庭\", \"LOCATION\", 2, 4], [\"自然语义科技公司\", \"ORGANIZATION\", 5, 9]]\n", " ],\n", " \"ner/pku\": [\n", " [],\n", " [[\"北京立方庭\", \"ns\", 2, 4], [\"自然语义科技公司\", \"nt\", 5, 9]]\n", " ],\n", " \"ner/ontonotes\": [\n", " [[\"2021年\", \"DATE\", 0, 1], [\"HanLPv2.1\", \"ORG\", 1, 2]],\n", " [[\"北京立方庭\", \"FAC\", 2, 4], [\"自然语义科技公司\", \"ORG\", 5, 9]]\n", " ],\n", " \"srl\": [\n", " [[[\"2021年\", \"ARGM-TMP\", 0, 1], [\"HanLPv2.1\", \"ARG0\", 1, 2], [\"为生产环境\", \"ARG2\", 2, 5], [\"带来\", \"PRED\", 5, 6], [\"次世代最先进的多语种NLP技术\", \"ARG1\", 6, 15]], [[\"最\", \"ARGM-ADV\", 8, 9], [\"先进\", \"PRED\", 9, 10], [\"技术\", \"ARG0\", 14, 15]]],\n", " [[[\"阿婆主\", \"ARG0\", 0, 1], [\"来到\", \"PRED\", 1, 2], 
[\"北京立方庭\", \"ARG1\", 2, 4]], [[\"阿婆主\", \"ARG0\", 0, 1], [\"参观\", \"PRED\", 4, 5], [\"自然语义科技公司\", \"ARG1\", 5, 9]]]\n", " ],\n", " \"dep\": [\n", " [[6, \"tmod\"], [6, \"nsubj\"], [6, \"prep\"], [5, \"nn\"], [3, \"pobj\"], [0, \"root\"], [8, \"amod\"], [15, \"nn\"], [10, \"advmod\"], [15, \"rcmod\"], [10, \"assm\"], [13, \"nummod\"], [15, \"nn\"], [15, \"nn\"], [6, \"dobj\"], [6, \"punct\"]],\n", " [[2, \"nsubj\"], [0, \"root\"], [4, \"nn\"], [2, \"dobj\"], [2, \"conj\"], [9, \"nn\"], [9, \"nn\"], [9, \"nn\"], [5, \"dobj\"], [2, \"punct\"]]\n", " ],\n", " \"sdp\": [\n", " [[[6, \"Time\"]], [[6, \"Exp\"]], [[5, \"mPrep\"]], [[5, \"Desc\"]], [[6, \"Datv\"]], [[13, \"dDesc\"]], [[0, \"Root\"], [8, \"Desc\"], [13, \"Desc\"]], [[15, \"Time\"]], [[10, \"mDegr\"]], [[15, \"Desc\"]], [[10, \"mAux\"]], [[8, \"Quan\"], [13, \"Quan\"]], [[15, \"Desc\"]], [[15, \"Nmod\"]], [[6, \"Pat\"]], [[6, \"mPunc\"]]],\n", " [[[2, \"Agt\"], [5, \"Agt\"]], [[0, \"Root\"]], [[4, \"Loc\"]], [[2, \"Lfin\"]], [[2, \"ePurp\"]], [[8, \"Nmod\"]], [[9, \"Nmod\"]], [[9, \"Nmod\"]], [[5, \"Datv\"]], [[5, \"mPunc\"]]]\n", " ],\n", " \"con\": [\n", " [\"TOP\", [[\"IP\", [[\"NP\", [[\"NT\", [\"2021年\"]]]], [\"NP\", [[\"NR\", [\"HanLPv2.1\"]]]], [\"VP\", [[\"PP\", [[\"P\", [\"为\"]], [\"NP\", [[\"NN\", [\"生产\"]], [\"NN\", [\"环境\"]]]]]], [\"VP\", [[\"VV\", [\"带来\"]], [\"NP\", [[\"ADJP\", [[\"NP\", [[\"ADJP\", [[\"JJ\", [\"次\"]]]], [\"NP\", [[\"NN\", [\"世代\"]]]]]], [\"ADVP\", [[\"AD\", [\"最\"]]]], [\"VP\", [[\"JJ\", [\"先进\"]]]]]], [\"DEG\", [\"的\"]], [\"NP\", [[\"QP\", [[\"CD\", [\"多\"]]]], [\"NP\", [[\"NN\", [\"语种\"]]]]]], [\"NP\", [[\"NR\", [\"NLP\"]], [\"NN\", [\"技术\"]]]]]]]]]], [\"PU\", [\"。\"]]]]]],\n", " [\"TOP\", [[\"IP\", [[\"NP\", [[\"NN\", [\"阿婆主\"]]]], [\"VP\", [[\"VP\", [[\"VV\", [\"来到\"]], [\"NP\", [[\"NR\", [\"北京\"]], [\"NR\", [\"立方庭\"]]]]]], [\"VP\", [[\"VV\", [\"参观\"]], [\"NP\", [[\"NN\", [\"自然\"]], [\"NN\", [\"语义\"]], [\"NN\", [\"科技\"]], [\"NN\", [\"公司\"]]]]]]]], [\"PU\", [\"。\"]]]]]]\n", 
" ]\n", "}\n" ] } ], "source": [ "doc = HanLP.parse(\"2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。阿婆主来到北京立方庭参观自然语义科技公司。\")\n", "print(doc)" ] }, { "cell_type": "markdown", "metadata": { "id": "w4E8Kn_nK7J8" }, "source": [ "#### 可视化\n", "输出结果是一个可以`json`化的`dict`,键为[NLP任务名](https://hanlp.hankcs.com/docs/data_format.html#naming-convention),值为分析结果。关于标注集含义,请参考[《语言学标注规范》](https://hanlp.hankcs.com/docs/annotations/index.html)及[《格式规范》](https://hanlp.hankcs.com/docs/data_format.html)。我们购买、标注或采用了世界上量级最大、种类最多的语料库用于联合多语种多任务学习,所以HanLP的标注集也是覆盖面最广的。通过`doc.pretty_print`,可以在等宽字体环境中得到可视化,你需要取消换行才能对齐可视化结果。我们已经发布HTML环境的可视化,在Jupyter Notebook中自动对齐中文。" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 575 }, "id": "GZ79la4LK7J8", "outputId": "b9bd5dc0-52f9-4b42-93fd-7c4e49214ace" }, "outputs": [ { "data": { "text/html": [ "
Dep Tree     
──────────── 
 ┌─────────► 
 │┌────────► 
 ││┌─►┌───── 
 │││  │  ┌─► 
 │││  └─►└── 
┌┼┴┴──────── 
││       ┌─► 
││  ┌───►└── 
││  │    ┌─► 
││  │┌──►├── 
││  ││   └─► 
││  ││   ┌─► 
││  ││┌─►└── 
││  │││  ┌─► 
│└─►└┴┴──┴── 
└──────────► 
Token     
───────── 
2021年     
HanLPv2.1 
为         
生产        
环境        
带来        
次         
世代        
最         
先进        
的         
多         
语种        
NLP       
技术        
。         
Relati 
────── 
tmod   
nsubj  
prep   
nn     
pobj   
root   
amod   
nn     
advmod 
rcmod  
assm   
nummod 
nn     
nn     
dobj   
punct  
PoS 
─── 
NT  
NR  
P   
NN  
NN  
VV  
JJ  
NN  
AD  
JJ  
DEG 
CD  
NN  
NR  
NN  
PU  
Tok       
───────── 
2021年     
HanLPv2.1 
为         
生产        
环境        
带来        
次         
世代        
最         
先进        
的         
多         
语种        
NLP       
技术        
。         
NER Type         
──────────────── 
───►DATE         
───►ORGANIZATION 
                 
                 
                 
                 
                 
                 
                 
                 
                 
                 
                 
                 
                 
                 
Tok       
───────── 
2021年     
HanLPv2.1 
为         
生产        
环境        
带来        
次         
世代        
最         
先进        
的         
多         
语种        
NLP       
技术        
。         
SRL PA1      
──────────── 
───►ARGM-TMP 
───►ARG0     
◄─┐          
  ├►ARG2     
◄─┘          
╟──►PRED     
◄─┐          
  │          
  │          
  │          
  ├►ARG1     
  │          
  │          
  │          
◄─┘          
             
Tok       
───────── 
2021年     
HanLPv2.1 
为         
生产        
环境        
带来        
次         
世代        
最         
先进        
的         
多         
语种        
NLP       
技术        
。         
SRL PA2      
──────────── 
             
             
             
             
             
             
             
             
───►ARGM-ADV 
╟──►PRED     
             
             
             
             
───►ARG0     
             
Tok       
───────── 
2021年     
HanLPv2.1 
为         
生产        
环境        
带来        
次         
世代        
最         
先进        
的         
多         
语种        
NLP       
技术        
。         
PoS    3       4       5       6       7       8       9 
─────────────────────────────────────────────────────────
NT ───────────────────────────────────────────►NP ───┐   
NR ───────────────────────────────────────────►NP────┤   
P ───────────┐                                       │   
NN ──┐       ├────────────────────────►PP ───┐       │   
NN ──┴►NP ───┘                               │       │   
VV ──────────────────────────────────┐       │       │   
JJ ───►ADJP──┐                       │       ├►VP────┤   
NN ───►NP ───┴►NP ───┐               │       │       │   
AD ───────────►ADVP──┼►ADJP──┐       ├►VP ───┘       ├►IP
JJ ───────────►VP ───┘       │       │               │   
DEG──────────────────────────┤       │               │   
CD ───►QP ───┐               ├►NP ───┘               │   
NN ───►NP ───┴────────►NP────┤                       │   
NR ──┐                       │                       │   
NN ──┴────────────────►NP ───┘                       │   
PU ──────────────────────────────────────────────────┘   

Dep Tree     
──────────── 
         ┌─► 
┌┬────┬──┴── 
││    │  ┌─► 
││    └─►└── 
│└─►┌─────── 
│   │  ┌───► 
│   │  │┌──► 
│   │  ││┌─► 
│   └─►└┴┴── 
└──────────► 
Tok 
─── 
阿婆主 
来到  
北京  
立方庭 
参观  
自然  
语义  
科技  
公司  
。   
Relat 
───── 
nsubj 
root  
nn    
dobj  
conj  
nn    
nn    
nn    
dobj  
punct 
Po 
── 
NN 
VV 
NR 
NR 
VV 
NN 
NN 
NN 
NN 
PU 
Tok 
─── 
阿婆主 
来到  
北京  
立方庭 
参观  
自然  
语义  
科技  
公司  
。   
NER Type         
──────────────── 
                 
                 
◄─┐              
◄─┴►LOCATION     
                 
◄─┐              
  │              
  ├►ORGANIZATION 
◄─┘              
                 
Tok 
─── 
阿婆主 
来到  
北京  
立方庭 
参观  
自然  
语义  
科技  
公司  
。   
SRL PA1  
──────── 
───►ARG0 
╟──►PRED 
◄─┐      
◄─┴►ARG1 
         
         
         
         
         
         
Tok 
─── 
阿婆主 
来到  
北京  
立方庭 
参观  
自然  
语义  
科技  
公司  
。   
SRL PA2  
──────── 
───►ARG0 
         
         
         
╟──►PRED 
◄─┐      
  │      
  ├►ARG1 
◄─┘      
         
Tok 
─── 
阿婆主 
来到  
北京  
立方庭 
参观  
自然  
语义  
科技  
公司  
。   
Po    3       4       5       6 
────────────────────────────────
NN───────────────────►NP ───┐   
VV──────────┐               │   
NR──┐       ├►VP ───┐       │   
NR──┴►NP ───┘       │       │   
VV──────────┐       ├►VP────┤   
NN──┐       │       │       ├►IP
NN  │       ├►VP ───┘       │   
NN  ├►NP ───┘               │   
NN──┘                       │   
PU──────────────────────────┘   
" ], "text/plain": [ "" ] }, "metadata": { "tags": [] }, "output_type": "display_data" } ], "source": [ "doc.pretty_print()" ] }, { "cell_type": "markdown", "metadata": { "id": "WIKyCLQJK7J9" }, "source": [ "#### 申请秘钥\n", "由于服务器算力有限,匿名用户每分钟限2次调用。如果你需要更多调用次数,[建议申请免费公益API秘钥auth](https://bbs.hanlp.com/t/hanlp2-1-restful-api/53)。" ] }, { "cell_type": "markdown", "metadata": { "id": "PcZAZopQK7J9" }, "source": [ "### 海量级native API\n", "\n", "依赖PyTorch、TensorFlow等深度学习技术,适合**专业**NLP工程师、研究者以及本地海量数据场景。要求Python 3.6以上,支持Windows,推荐*nix。可以在CPU上运行,推荐GPU/TPU。\n", "\n", "无论是Windows、Linux还是macOS,HanLP的安装只需一句话搞定。" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "bjRdHxl1K7J-", "outputId": "659d7920-c857-4eb8-f45f-dba84366688a" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: hanlp in /usr/local/lib/python3.7/dist-packages (2.1.0a54)\n", "Requirement already satisfied: sentencepiece>=0.1.91torch>=1.6.0 in /usr/local/lib/python3.7/dist-packages (from hanlp) (0.1.96)\n", "Requirement already satisfied: toposort==1.5 in /usr/local/lib/python3.7/dist-packages (from hanlp) (1.5)\n", "Requirement already satisfied: alnlp in /usr/local/lib/python3.7/dist-packages (from hanlp) (1.0.0rc27)\n", "Requirement already satisfied: hanlp-common>=0.0.9 in /usr/local/lib/python3.7/dist-packages (from hanlp) (0.0.9)\n", "Requirement already satisfied: hanlp-downloader in /usr/local/lib/python3.7/dist-packages (from hanlp) (0.0.23)\n", "Requirement already satisfied: hanlp-trie>=0.0.2 in /usr/local/lib/python3.7/dist-packages (from hanlp) (0.0.2)\n", "Requirement already satisfied: transformers>=4.1.1 in /usr/local/lib/python3.7/dist-packages (from hanlp) (4.9.1)\n", "Requirement already satisfied: termcolor in /usr/local/lib/python3.7/dist-packages (from hanlp) (1.1.0)\n", "Requirement already satisfied: pynvml in /usr/local/lib/python3.7/dist-packages (from hanlp) 
(11.0.0)\n", "Requirement already satisfied: phrasetree in /usr/local/lib/python3.7/dist-packages (from hanlp-common>=0.0.9->hanlp) (0.0.8)\n", "Requirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from transformers>=4.1.1->hanlp) (3.0.12)\n", "Requirement already satisfied: sacremoses in /usr/local/lib/python3.7/dist-packages (from transformers>=4.1.1->hanlp) (0.0.45)\n", "Requirement already satisfied: tokenizers<0.11,>=0.10.1 in /usr/local/lib/python3.7/dist-packages (from transformers>=4.1.1->hanlp) (0.10.3)\n", "Requirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from transformers>=4.1.1->hanlp) (21.0)\n", "Requirement already satisfied: huggingface-hub==0.0.12 in /usr/local/lib/python3.7/dist-packages (from transformers>=4.1.1->hanlp) (0.0.12)\n", "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.7/dist-packages (from transformers>=4.1.1->hanlp) (5.4.1)\n", "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.7/dist-packages (from transformers>=4.1.1->hanlp) (2019.12.20)\n", "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.7/dist-packages (from transformers>=4.1.1->hanlp) (4.41.1)\n", "Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from transformers>=4.1.1->hanlp) (2.23.0)\n", "Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/dist-packages (from transformers>=4.1.1->hanlp) (4.6.1)\n", "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from transformers>=4.1.1->hanlp) (1.19.5)\n", "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from huggingface-hub==0.0.12->transformers>=4.1.1->hanlp) (3.7.4.3)\n", "Requirement already satisfied: pyparsing>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging->transformers>=4.1.1->hanlp) (2.4.7)\n", "Requirement already satisfied: torch in 
/usr/local/lib/python3.7/dist-packages (from alnlp->hanlp) (1.9.0+cu102)\n", "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata->transformers>=4.1.1->hanlp) (3.5.0)\n", "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->transformers>=4.1.1->hanlp) (1.24.3)\n", "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->transformers>=4.1.1->hanlp) (3.0.4)\n", "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->transformers>=4.1.1->hanlp) (2.10)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->transformers>=4.1.1->hanlp) (2021.5.30)\n", "Requirement already satisfied: joblib in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers>=4.1.1->hanlp) (1.0.1)\n", "Requirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers>=4.1.1->hanlp) (7.1.2)\n", "Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers>=4.1.1->hanlp) (1.15.0)\n" ] } ], "source": [ "!pip install hanlp -U" ] }, { "cell_type": "markdown", "metadata": { "id": "dHhIRwgqK7J-" }, "source": [ "#### 加载模型\n", "HanLP的工作流程是先加载模型,模型的标识符存储在`hanlp.pretrained`这个包中,按照NLP任务归类。" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "KHY6bsG_K7J-", "outputId": "208c12b6-2702-4ee7-a03a-f053b7ad3479" }, "outputs": [ { "data": { "text/plain": [ "{'CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH': 'https://file.hankcs.com/hanlp/mtl/close_tok_pos_ner_srl_dep_sdp_con_electra_base_20210111_124519.zip',\n", " 'CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH': 
'https://file.hankcs.com/hanlp/mtl/close_tok_pos_ner_srl_dep_sdp_con_electra_small_20210111_124159.zip',\n", " 'NPCMJ_UD_KYOTO_TOK_POS_CON_BERT_BASE_CHAR_JA': 'https://file.hankcs.com/hanlp/mtl/npcmj_ud_kyoto_tok_pos_ner_dep_con_srl_bert_base_char_ja_20210517_225654.zip',\n", " 'OPEN_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH': 'https://file.hankcs.com/hanlp/mtl/open_tok_pos_ner_srl_dep_sdp_con_electra_base_20201223_201906.zip',\n", " 'OPEN_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH': 'https://file.hankcs.com/hanlp/mtl/open_tok_pos_ner_srl_dep_sdp_con_electra_small_20201223_035557.zip',\n", " 'UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_MT5_SMALL': 'https://file.hankcs.com/hanlp/mtl/ud_ontonotes_tok_pos_lem_fea_ner_srl_dep_sdp_con_mt5_small_20210228_123458.zip',\n", " 'UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_XLMR_BASE': 'https://file.hankcs.com/hanlp/mtl/ud_ontonotes_tok_pos_lem_fea_ner_srl_dep_sdp_con_xlm_base_20210602_211620.zip'}" ] }, "execution_count": 6, "metadata": { "tags": [] }, "output_type": "execute_result" } ], "source": [ "import hanlp\n", "hanlp.pretrained.mtl.ALL # MTL多任务,具体任务见模型名称,语种见名称最后一个字段或相应语料库" ] }, { "cell_type": "markdown", "metadata": { "id": "WDT3Hks0K7J_" }, "source": [ "调用`hanlp.load`进行加载,模型会自动下载到本地缓存。自然语言处理分为许多任务,分词只是最初级的一个。与其每个任务单独创建一个模型,不如利用HanLP的联合模型一次性完成多个任务:" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "4Cj8a73rK7J_", "outputId": "a92ac736-6e61-4949-8d35-56c773faf950" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [] } ], "source": [ "HanLP = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH)" ] }, { "cell_type": "markdown", "metadata": { "id": "pBqH_My8K7J_" }, "source": [ "## 多任务批量分析\n", "客户端创建完毕,或者模型加载完毕后,就可以传入一个或多个句子进行分析了:" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "B58npfkHK7J_", "outputId": 
"69fed02d-39cb-4b4c-d2c8-d0edc25970ea" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\n", " \"tok/fine\": [\n", " [\"2021年\", \"HanLPv2.1\", \"为\", \"生产\", \"环境\", \"带来\", \"次\", \"世代\", \"最\", \"先进\", \"的\", \"多\", \"语种\", \"NLP\", \"技术\", \"。\"],\n", " [\"阿婆主\", \"来到\", \"北京\", \"立方庭\", \"参观\", \"自然\", \"语义\", \"科技\", \"公司\", \"。\"]\n", " ],\n", " \"tok/coarse\": [\n", " [\"2021年\", \"HanLPv2.1\", \"为\", \"生产\", \"环境\", \"带来\", \"次世代\", \"最\", \"先进\", \"的\", \"多语种\", \"NLP\", \"技术\", \"。\"],\n", " [\"阿婆主\", \"来到\", \"北京立方庭\", \"参观\", \"自然语义科技公司\", \"。\"]\n", " ],\n", " \"pos/ctb\": [\n", " [\"NT\", \"NR\", \"P\", \"NN\", \"NN\", \"VV\", \"JJ\", \"NN\", \"AD\", \"JJ\", \"DEG\", \"CD\", \"NN\", \"NR\", \"NN\", \"PU\"],\n", " [\"NN\", \"VV\", \"NR\", \"NR\", \"VV\", \"NN\", \"NN\", \"NN\", \"NN\", \"PU\"]\n", " ],\n", " \"pos/pku\": [\n", " [\"t\", \"nx\", \"p\", \"vn\", \"n\", \"v\", \"b\", \"n\", \"d\", \"a\", \"u\", \"a\", \"n\", \"nx\", \"n\", \"w\"],\n", " [\"n\", \"v\", \"ns\", \"ns\", \"v\", \"n\", \"n\", \"n\", \"n\", \"w\"]\n", " ],\n", " \"pos/863\": [\n", " [\"nt\", \"w\", \"p\", \"v\", \"n\", \"v\", \"a\", \"nt\", \"d\", \"a\", \"u\", \"a\", \"n\", \"ws\", \"n\", \"w\"],\n", " [\"n\", \"v\", \"ns\", \"n\", \"v\", \"n\", \"n\", \"n\", \"n\", \"w\"]\n", " ],\n", " \"ner/msra\": [\n", " [[\"2021年\", \"DATE\", 0, 1], [\"HanLPv2.1\", \"WWW\", 1, 2]],\n", " [[\"北京\", \"LOCATION\", 2, 3], [\"立方庭\", \"LOCATION\", 3, 4], [\"自然语义科技公司\", \"ORGANIZATION\", 5, 9]]\n", " ],\n", " \"ner/pku\": [\n", " [],\n", " [[\"北京立方庭\", \"ns\", 2, 4], [\"自然语义科技公司\", \"nt\", 5, 9]]\n", " ],\n", " \"ner/ontonotes\": [\n", " [[\"2021年\", \"DATE\", 0, 1], [\"HanLPv2.1\", \"ORG\", 1, 2]],\n", " [[\"北京立方庭\", \"FAC\", 2, 4], [\"自然语义科技公司\", \"ORG\", 5, 9]]\n", " ],\n", " \"srl\": [\n", " [[[\"2021年\", \"ARGM-TMP\", 0, 1], [\"HanLPv2.1\", \"ARG0\", 1, 2], [\"为生产环境\", \"ARG2\", 2, 5], [\"带来\", \"PRED\", 5, 6], [\"次世代最先进的多语种NLP技术\", \"ARG1\", 6, 15]], [[\"最\", 
\"ARGM-ADV\", 8, 9], [\"先进\", \"PRED\", 9, 10], [\"技术\", \"ARG0\", 14, 15]]],\n", " [[[\"阿婆主\", \"ARG0\", 0, 1], [\"来到\", \"PRED\", 1, 2], [\"北京立方庭\", \"ARG1\", 2, 4]], [[\"阿婆主\", \"ARG0\", 0, 1], [\"参观\", \"PRED\", 4, 5], [\"自然语义科技公司\", \"ARG1\", 5, 9]]]\n", " ],\n", " \"dep\": [\n", " [[6, \"tmod\"], [6, \"nsubj\"], [6, \"prep\"], [5, \"nn\"], [3, \"pobj\"], [0, \"root\"], [8, \"amod\"], [15, \"nn\"], [10, \"advmod\"], [15, \"rcmod\"], [10, \"assm\"], [13, \"nummod\"], [15, \"nn\"], [15, \"nn\"], [6, \"dobj\"], [6, \"punct\"]],\n", " [[2, \"nsubj\"], [0, \"root\"], [4, \"nn\"], [2, \"dobj\"], [2, \"conj\"], [9, \"nn\"], [9, \"nn\"], [9, \"nn\"], [5, \"dobj\"], [2, \"punct\"]]\n", " ],\n", " \"sdp\": [\n", " [[[6, \"Time\"]], [[6, \"Exp\"]], [[5, \"mPrep\"]], [[5, \"Desc\"]], [[6, \"Datv\"]], [[13, \"dDesc\"]], [[0, \"Root\"], [8, \"Desc\"], [13, \"Desc\"]], [[15, \"Time\"]], [[10, \"mDegr\"]], [[15, \"Desc\"]], [[10, \"mAux\"]], [[8, \"Quan\"], [13, \"Quan\"]], [[15, \"Desc\"]], [[15, \"Nmod\"]], [[6, \"Pat\"]], [[6, \"mPunc\"]]],\n", " [[[2, \"Agt\"], [5, \"Agt\"]], [[0, \"Root\"]], [[4, \"Loc\"]], [[2, \"Lfin\"]], [[2, \"ePurp\"]], [[8, \"Nmod\"]], [[9, \"Nmod\"]], [[9, \"Nmod\"]], [[5, \"Datv\"]], [[5, \"mPunc\"]]]\n", " ],\n", " \"con\": [\n", " [\"TOP\", [[\"IP\", [[\"NP\", [[\"NT\", [\"2021年\"]]]], [\"NP\", [[\"NR\", [\"HanLPv2.1\"]]]], [\"VP\", [[\"PP\", [[\"P\", [\"为\"]], [\"NP\", [[\"NN\", [\"生产\"]], [\"NN\", [\"环境\"]]]]]], [\"VP\", [[\"VV\", [\"带来\"]], [\"NP\", [[\"ADJP\", [[\"NP\", [[\"ADJP\", [[\"JJ\", [\"次\"]]]], [\"NP\", [[\"NN\", [\"世代\"]]]]]], [\"ADVP\", [[\"AD\", [\"最\"]]]], [\"VP\", [[\"JJ\", [\"先进\"]]]]]], [\"DEG\", [\"的\"]], [\"NP\", [[\"QP\", [[\"CD\", [\"多\"]]]], [\"NP\", [[\"NN\", [\"语种\"]]]]]], [\"NP\", [[\"NR\", [\"NLP\"]], [\"NN\", [\"技术\"]]]]]]]]]], [\"PU\", [\"。\"]]]]]],\n", " [\"TOP\", [[\"IP\", [[\"NP\", [[\"NN\", [\"阿婆主\"]]]], [\"VP\", [[\"VP\", [[\"VV\", [\"来到\"]], [\"NP\", [[\"NR\", [\"北京\"]], [\"NR\", [\"立方庭\"]]]]]], [\"VP\", 
[[\"VV\", [\"参观\"]], [\"NP\", [[\"NN\", [\"自然\"]], [\"NN\", [\"语义\"]], [\"NN\", [\"科技\"]], [\"NN\", [\"公司\"]]]]]]]], [\"PU\", [\"。\"]]]]]]\n", " ]\n", "}\n" ] } ], "source": [ "doc = HanLP(['2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', '阿婆主来到北京立方庭参观自然语义科技公司。'])\n", "print(doc)" ] }, { "cell_type": "markdown", "metadata": { "id": "tvuxfWPYK7J_" }, "source": [ "## 可视化\n", "输出结果是一个可以`json`化的`dict`,键为[NLP任务名](https://hanlp.hankcs.com/docs/data_format.html#naming-convention),值为分析结果。关于标注集含义,请参考[《语言学标注规范》](https://hanlp.hankcs.com/docs/annotations/index.html)及[《格式规范》](https://hanlp.hankcs.com/docs/data_format.html)。我们购买、标注或采用了世界上量级最大、种类最多的语料库用于联合多语种多任务学习,所以HanLP的标注集也是覆盖面最广的。通过`doc.pretty_print`,可以在等宽字体环境中得到可视化,你需要取消换行才能对齐可视化结果。我们已经发布HTML环境的可视化,在Jupyter Notebook中自动对齐中文。" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 575 }, "id": "M8WxTdlAK7KA", "outputId": "a027a302-74d8-48c9-b30d-45ebf8741c1e" }, "outputs": [ { "data": { "text/html": [ "
Dep Tree     
──────────── 
 ┌─────────► 
 │┌────────► 
 ││┌─►┌───── 
 │││  │  ┌─► 
 │││  └─►└── 
┌┼┴┴──────── 
││       ┌─► 
││  ┌───►└── 
││  │    ┌─► 
││  │┌──►├── 
││  ││   └─► 
││  ││   ┌─► 
││  ││┌─►└── 
││  │││  ┌─► 
│└─►└┴┴──┴── 
└──────────► 
Token     
───────── 
2021年     
HanLPv2.1 
为         
生产        
环境        
带来        
次         
世代        
最         
先进        
的         
多         
语种        
NLP       
技术        
。         
Relati 
────── 
tmod   
nsubj  
prep   
nn     
pobj   
root   
amod   
nn     
advmod 
rcmod  
assm   
nummod 
nn     
nn     
dobj   
punct  
PoS 
─── 
NT  
NR  
P   
NN  
NN  
VV  
JJ  
NN  
AD  
JJ  
DEG 
CD  
NN  
NR  
NN  
PU  
Tok       
───────── 
2021年     
HanLPv2.1 
为         
生产        
环境        
带来        
次         
世代        
最         
先进        
的         
多         
语种        
NLP       
技术        
。         
NER Type 
──────── 
───►DATE 
───►WWW  
         
         
         
         
         
         
         
         
         
         
         
         
         
         
Tok       
───────── 
2021年     
HanLPv2.1 
为         
生产        
环境        
带来        
次         
世代        
最         
先进        
的         
多         
语种        
NLP       
技术        
。         
SRL PA1      
──────────── 
───►ARGM-TMP 
───►ARG0     
◄─┐          
  ├►ARG2     
◄─┘          
╟──►PRED     
◄─┐          
  │          
  │          
  │          
  ├►ARG1     
  │          
  │          
  │          
◄─┘          
             
Tok       
───────── 
2021年     
HanLPv2.1 
为         
生产        
环境        
带来        
次         
世代        
最         
先进        
的         
多         
语种        
NLP       
技术        
。         
SRL PA2      
──────────── 
             
             
             
             
             
             
             
             
───►ARGM-ADV 
╟──►PRED     
             
             
             
             
───►ARG0     
             
Tok       
───────── 
2021年     
HanLPv2.1 
为         
生产        
环境        
带来        
次         
世代        
最         
先进        
的         
多         
语种        
NLP       
技术        
。         
PoS    3       4       5       6       7       8       9 
─────────────────────────────────────────────────────────
NT ───────────────────────────────────────────►NP ───┐   
NR ───────────────────────────────────────────►NP────┤   
P ───────────┐                                       │   
NN ──┐       ├────────────────────────►PP ───┐       │   
NN ──┴►NP ───┘                               │       │   
VV ──────────────────────────────────┐       │       │   
JJ ───►ADJP──┐                       │       ├►VP────┤   
NN ───►NP ───┴►NP ───┐               │       │       │   
AD ───────────►ADVP──┼►ADJP──┐       ├►VP ───┘       ├►IP
JJ ───────────►VP ───┘       │       │               │   
DEG──────────────────────────┤       │               │   
CD ───►QP ───┐               ├►NP ───┘               │   
NN ───►NP ───┴────────►NP────┤                       │   
NR ──┐                       │                       │   
NN ──┴────────────────►NP ───┘                       │   
PU ──────────────────────────────────────────────────┘   

Dep Tree     
──────────── 
         ┌─► 
┌┬────┬──┴── 
││    │  ┌─► 
││    └─►└── 
│└─►┌─────── 
│   │  ┌───► 
│   │  │┌──► 
│   │  ││┌─► 
│   └─►└┴┴── 
└──────────► 
Tok 
─── 
阿婆主 
来到  
北京  
立方庭 
参观  
自然  
语义  
科技  
公司  
。   
Relat 
───── 
nsubj 
root  
nn    
dobj  
conj  
nn    
nn    
nn    
dobj  
punct 
Po 
── 
NN 
VV 
NR 
NR 
VV 
NN 
NN 
NN 
NN 
PU 
Tok 
─── 
阿婆主 
来到  
北京  
立方庭 
参观  
自然  
语义  
科技  
公司  
。   
NER Type         
──────────────── 
                 
                 
───►LOCATION     
───►LOCATION     
                 
◄─┐              
  │              
  ├►ORGANIZATION 
◄─┘              
                 
Tok 
─── 
阿婆主 
来到  
北京  
立方庭 
参观  
自然  
语义  
科技  
公司  
。   
SRL PA1  
──────── 
───►ARG0 
╟──►PRED 
◄─┐      
◄─┴►ARG1 
         
         
         
         
         
         
Tok 
─── 
阿婆主 
来到  
北京  
立方庭 
参观  
自然  
语义  
科技  
公司  
。   
SRL PA2  
──────── 
───►ARG0 
         
         
         
╟──►PRED 
◄─┐      
  │      
  ├►ARG1 
◄─┘      
         
Tok 
─── 
阿婆主 
来到  
北京  
立方庭 
参观  
自然  
语义  
科技  
公司  
。   
Po    3       4       5       6 
────────────────────────────────
NN───────────────────►NP ───┐   
VV──────────┐               │   
NR──┐       ├►VP ───┐       │   
NR──┴►NP ───┘       │       │   
VV──────────┐       ├►VP────┤   
NN──┐       │       │       ├►IP
NN  │       ├►VP ───┘       │   
NN  ├►NP ───┘               │   
NN──┘                       │   
PU──────────────────────────┘   
" ], "text/plain": [ "" ] }, "metadata": { "tags": [] }, "output_type": "display_data" } ], "source": [ "doc.pretty_print()" ] }, { "cell_type": "markdown", "metadata": { "id": "_B2HDiZgK7KA" }, "source": [ "## 指定任务\n", "简洁的接口也支持灵活的参数,任务越少,速度越快。如指定仅执行分词:" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 35 }, "id": "9Mnys4t2K7KA", "outputId": "88d72a72-c095-4f6d-df0b-d881887087ce" }, "outputs": [ { "data": { "text/html": [ "
阿婆主 来到 北京 立方庭 参观 自然 语义 科技 公司 。
" ], "text/plain": [ "" ] }, "metadata": { "tags": [] }, "output_type": "display_data" } ], "source": [ "HanLP('阿婆主来到北京立方庭参观自然语义科技公司。', tasks='tok').pretty_print()" ] }, { "cell_type": "markdown", "metadata": { "id": "s5RkVkVkK7KA" }, "source": [ "### 执行粗颗粒度分词" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 35 }, "id": "5R_PwELlK7KA", "outputId": "5ce2c037-eb44-481f-9de2-dc0d4122e7c4" }, "outputs": [ { "data": { "text/html": [ "
阿婆主 来到 北京立方庭 参观 自然语义科技公司 。
" ], "text/plain": [ "" ] }, "metadata": { "tags": [] }, "output_type": "display_data" } ], "source": [ "HanLP('阿婆主来到北京立方庭参观自然语义科技公司。', tasks='tok/coarse').pretty_print()" ] }, { "cell_type": "markdown", "metadata": { "id": "pTrajkHEK7KB" }, "source": [ "### 执行分词和PKU词性标注" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 35 }, "id": "kkkgVKFqK7KB", "outputId": "e9f9879b-47ce-459a-e089-923de1c6436c" }, "outputs": [ { "data": { "text/html": [ "
阿婆主/n 来到/v 北京/ns 立方庭/ns 参观/v 自然/n 语义/n 科技/n 公司/n 。/w
" ], "text/plain": [ "" ] }, "metadata": { "tags": [] }, "output_type": "display_data" } ], "source": [ "HanLP('阿婆主来到北京立方庭参观自然语义科技公司。', tasks='pos/pku').pretty_print()" ] }, { "cell_type": "markdown", "metadata": { "id": "YLLTVY0RK7KB" }, "source": [ "### 执行粗颗粒度分词和PKU词性标注" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 35 }, "id": "5qSlqbcfK7KB", "outputId": "66944459-bc22-4bd9-e4af-4d2aba9316f3" }, "outputs": [ { "data": { "text/html": [ "
阿婆主/n 来到/v 北京立方庭/ns 参观/v 自然语义科技公司/n 。/w
" ], "text/plain": [ "" ] }, "metadata": { "tags": [] }, "output_type": "display_data" } ], "source": [ "HanLP('阿婆主来到北京立方庭参观自然语义科技公司。', tasks=['tok/coarse', 'pos/pku'], skip_tasks='tok/fine').pretty_print()" ] }, { "cell_type": "markdown", "metadata": { "id": "3nNojvHiK7KB" }, "source": [ "### 执行分词和MSRA标准NER" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 225 }, "id": "tTVoEPiAK7KB", "outputId": "b8dc8c24-3392-4712-d1b6-e2dc8b7710e8" }, "outputs": [ { "data": { "text/html": [ "
Tok 
─── 
阿婆主 
来到  
北京  
立方庭 
参观  
自然  
语义  
科技  
公司  
。   
NER Type        
────────────────
                
                
───►LOCATION    
───►LOCATION    
                
◄─┐             
  │             
  ├►ORGANIZATION
◄─┘             
                
" ], "text/plain": [ "" ] }, "metadata": { "tags": [] }, "output_type": "display_data" } ], "source": [ "HanLP('阿婆主来到北京立方庭参观自然语义科技公司。', tasks='ner/msra').pretty_print()" ] }, { "cell_type": "markdown", "metadata": { "id": "uG2wYTfmK7KB" }, "source": [ "### 执行分词、词性标注和依存句法分析" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 225 }, "id": "WXl6f7zyK7KC", "outputId": "8671e0e4-d0c3-40f4-a4db-ba9aaec225ab" }, "outputs": [ { "data": { "text/html": [ "
Dep Tree     
──────────── 
         ┌─► 
┌┬────┬──┴── 
││    │  ┌─► 
││    └─►└── 
│└─►┌─────── 
│   │  ┌───► 
│   │  │┌──► 
│   │  ││┌─► 
│   └─►└┴┴── 
└──────────► 
Tok 
─── 
阿婆主 
来到  
北京  
立方庭 
参观  
自然  
语义  
科技  
公司  
。   
Relat 
───── 
nsubj 
root  
nn    
dobj  
conj  
nn    
nn    
nn    
dobj  
punct 
Po
──
NN
VV
NR
NR
VV
NN
NN
NN
NN
PU
" ], "text/plain": [ "" ] }, "metadata": { "tags": [] }, "output_type": "display_data" } ], "source": [ "doc = HanLP('阿婆主来到北京立方庭参观自然语义科技公司。', tasks=['pos', 'dep'])\n", "doc.pretty_print()" ] }, { "cell_type": "markdown", "metadata": { "id": "ocxM3LsGK7KC" }, "source": [ "转换为CoNLL格式:" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "NtKmSB_0K7KC", "outputId": "cc9245b3-32c2-4d35-88a8-a7d91127eca7" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1\t阿婆主\t_\tNN\t_\t_\t2\tnsubj\t_\t_\n", "2\t来到\t_\tVV\t_\t_\t0\troot\t_\t_\n", "3\t北京\t_\tNR\t_\t_\t4\tnn\t_\t_\n", "4\t立方庭\t_\tNR\t_\t_\t2\tdobj\t_\t_\n", "5\t参观\t_\tVV\t_\t_\t2\tconj\t_\t_\n", "6\t自然\t_\tNN\t_\t_\t9\tnn\t_\t_\n", "7\t语义\t_\tNN\t_\t_\t9\tnn\t_\t_\n", "8\t科技\t_\tNN\t_\t_\t9\tnn\t_\t_\n", "9\t公司\t_\tNN\t_\t_\t5\tdobj\t_\t_\n", "10\t。\t_\tPU\t_\t_\t2\tpunct\t_\t_\n" ] } ], "source": [ "print(doc.to_conll())" ] }, { "cell_type": "markdown", "metadata": { "id": "PNBo-kETK7KC" }, "source": [ "### 执行分词、词性标注和短语成分分析" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 225 }, "id": "Ja8dib6XK7KC", "outputId": "a972f5bb-ae23-47a9-cd9f-6070a5b39f50" }, "outputs": [ { "data": { "text/html": [ "
Tok 
─── 
阿婆主 
来到  
北京  
立方庭 
参观  
自然  
语义  
科技  
公司  
。   
Po    3       4       5       6 
────────────────────────────────
NN───────────────────►NP ───┐   
VV──────────┐               │   
NR──┐       ├►VP ───┐       │   
NR──┴►NP ───┘       │       │   
VV──────────┐       ├►VP────┤   
NN──┐       │       │       ├►IP
NN  │       ├►VP ───┘       │   
NN  ├►NP ───┘               │   
NN──┘                       │   
PU──────────────────────────┘   
" ], "text/plain": [ "" ] }, "metadata": { "tags": [] }, "output_type": "display_data" } ], "source": [ "doc = HanLP('阿婆主来到北京立方庭参观自然语义科技公司。', tasks=['pos', 'con'])\n", "doc.pretty_print()" ] }, { "cell_type": "markdown", "metadata": { "id": "Mg3DhvjhK7KC" }, "source": [ "#### 将短语结构树以bracketed形式打印" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "kE8iBZNUK7KC", "outputId": "79e2a72d-e473-41ca-c054-9595a4dd5971" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(TOP\n", " (IP\n", " (NP (NN 阿婆主))\n", " (VP\n", " (VP (VV 来到) (NP (NR 北京) (NR 立方庭)))\n", " (VP (VV 参观) (NP (NN 自然) (NN 语义) (NN 科技) (NN 公司))))\n", " (PU 。)))\n" ] } ], "source": [ "print(doc['con']) # str(doc['con'])会将短语结构列表转换为括号形式" ] }, { "cell_type": "markdown", "metadata": { "id": "MfleaY_pK7KC" }, "source": [ "关于标注集含义,请参考[《语言学标注规范》](https://hanlp.hankcs.com/docs/annotations/index.html)及[《格式规范》](https://hanlp.hankcs.com/docs/data_format.html)。我们购买、标注或采用了世界上量级最大、种类最多的语料库用于联合多语种多任务学习,所以HanLP的标注集也是覆盖面最广的。\n", "\n", "## 多语种支持\n", "总之,可以通过tasks参数灵活调用各种NLP任务。除了中文联合模型之外,你可以在文档中通过找到许多其他语种的模型,比如日语:" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "oJP8dvfvK7KD", "outputId": "2262ccdb-7cf5-4859-8d6c-18300e54c22e" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [] } ], "source": [ "ja = hanlp.load(hanlp.pretrained.mtl.NPCMJ_UD_KYOTO_TOK_POS_CON_BERT_BASE_CHAR_JA)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 991 }, "id": "3WPvCbH2K7KD", "outputId": "46a9435d-ed5b-47ef-99c6-71d7ee0fc6e8" }, "outputs": [ { "data": { "text/html": [ "
Dep Tree       
────────────── 
           ┌─► 
┌─────────►├── 
│          └─► 
│   ┌────────► 
│   │┌───────► 
│   ││     ┌─► 
│   ││┌───►├── 
│   │││    └─► 
│   │││┌─────► 
│   ││││┌────► 
│   │││││┌───► 
│   ││││││┌──► 
│   │││││││┌─► 
│┌─►└┴┴┴┴┴┴┼── 
││         └─► 
││         ┌─► 
││      ┌─►├── 
││      │  └─► 
└┴──────┴┬┬┬── 
         ││└─► 
         │└──► 
         └───► 
Token     
───────── 
2021      
年         
、         
HanLPv2.1 
は         
次         
世代        
の         
最         
先端        
多         
言語        
NLP       
技術        
を         
本番        
環境        
に         
導入        
し         
ます        
。         
Relation 
──────── 
nummod   
obl      
punct    
compound 
case     
compound 
nmod     
case     
compound 
compound 
compound 
compound 
compound 
obj      
case     
compound 
obl      
case     
root     
aux      
aux      
punct    
PoS 
─── 
NUM 
CL  
PU  
NPR 
P   
N   
N   
P   
N   
N   
NUM 
N   
N   
N   
P   
N   
N   
P   
VB  
VB0 
AX  
PU  
Tok       
───────── 
2021      
年         
、         
HanLPv2.1 
は         
次         
世代        
の         
最         
先端        
多         
言語        
NLP       
技術        
を         
本番        
環境        
に         
導入        
し         
ます        
。         
NER Type     
──────────── 
◄─┐          
◄─┴►DATE     
             
───►ARTIFACT 
             
             
             
             
             
             
             
             
             
             
             
             
             
             
             
             
             
             
Tok       
───────── 
2021      
年         
、         
HanLPv2.1 
は         
次         
世代        
の         
最         
先端        
多         
言語        
NLP       
技術        
を         
本番        
環境        
に         
導入        
し         
ます        
。         
SRL PA1  
──────── 
         
         
         
         
         
───►修飾   
╟──►PRED 
         
         
         
         
         
         
         
         
         
         
         
         
         
         
         
Tok       
───────── 
2021      
年         
、         
HanLPv2.1 
は         
次         
世代        
の         
最         
先端        
多         
言語        
NLP       
技術        
を         
本番        
環境        
に         
導入        
し         
ます        
。         
SRL PA3  
──────── 
         
         
         
         
         
         
         
         
◄─┐      
◄─┴►修飾   
         
╟──►PRED 
         
         
         
         
         
         
         
         
         
         
Tok       
───────── 
2021      
年         
、         
HanLPv2.1 
は         
次         
世代        
の         
最         
先端        
多         
言語        
NLP       
技術        
を         
本番        
環境        
に         
導入        
し         
ます        
。         
SRL PA4  
──────── 
         
         
         
         
         
◄─┐      
  │      
  │      
  ├►修飾   
  │      
◄─┘      
◄─┐      
◄─┴►ノ    
╟──►PRED 
         
         
         
         
         
         
         
         
Tok       
───────── 
2021      
年         
、         
HanLPv2.1 
は         
次         
世代        
の         
最         
先端        
多         
言語        
NLP       
技術        
を         
本番        
環境        
に         
導入        
し         
ます        
。         
SRL PA5  
──────── 
         
         
         
         
         
         
         
         
         
         
         
         
         
         
         
───►修飾   
╟──►PRED 
         
         
         
         
         
Tok       
───────── 
2021      
年         
、         
HanLPv2.1 
は         
次         
世代        
の         
最         
先端        
多         
言語        
NLP       
技術        
を         
本番        
環境        
に         
導入        
し         
ます        
。         
SRL PA6  
──────── 
◄─┐      
  ├►時間   
◄─┘      
◄─┐      
◄─┴►ガ    
◄─┐      
  │      
  │      
  │      
  │      
  ├►ヲ    
  │      
  │      
  │      
◄─┘      
◄─┐      
  ├►ニ    
◄─┘      
╟──►PRED 
         
         
         
Tok       
───────── 
2021      
年         
、         
HanLPv2.1 
は         
次         
世代        
の         
最         
先端        
多         
言語        
NLP       
技術        
を         
本番        
環境        
に         
導入        
し         
ます        
。         
PoS    3         4        5       6       7       8 
────────────────────────────────────────────────────
NUM──┐                                              
CL ──┴►NUMCLP──────── ───────────────────►NP ───┐   
PU ──────── ───────── ──────────────────────────┤   
NPR───►NP ─────┐                                │   
P ───────── ───┴►──── ───────────────────►PP────┤   
N ───┐                                          │   
N ───┴►NP ─────┐                                │   
P ───────── ───┴►PP ────┐                       │   
N ───────── ─────────   │                       │   
N ────►NP ──────►CONJP──┤                       │   
NUM──────── ─────────   ├►NML ──┐               │   
N ───────── ─────────   │       │               ├►IP
N ───────── ───────── ──┘       ├►NP ───┐       │   
N ───────── ───────── ──────────┘       ├►PP────┤   
P ───────── ───────── ──────────────────┘       │   
N ───┐                                          │   
N ───┴►NP ─────┐                                │   
P ───────── ───┴►──── ───────────────────►PP────┤   
VB ──────── ───────── ──────────────────────────┤   
VB0──────── ───────── ──────────────────────────┤   
AX ──────── ───────── ──────────────────────────┤   
PU ──────── ───────── ──────────────────────────┘   

Dep Tree       
────────────── 
           ┌─► 
┌─────────►├── 
│          └─► 
│      ┌─────► 
│      │┌────► 
│      ││┌───► 
│      │││┌──► 
│      ││││┌─► 
│   ┌─►└┴┴┴┼── 
│   │      └─► 
│   │      ┌─► 
│   │   ┌─►└── 
│   │   │  ┌─► 
│   │┌─►└──┼── 
│   ││     └─► 
│┌─►└┴─────┬── 
││         └─► 
││        ┌──► 
││        │┌─► 
││   ┌─►┌┬┼┼── 
││   │  │││└─► 
││   │  ││└──► 
││   │  │└───► 
││   │  └────► 
││   │     ┌─► 
└┴───┴────┬┼── 
          │└─► 
          └──► 
Toke 
──── 
奈須   
きのこ  
は    
1973 
年    
11   
月    
28   
日    
に    
千葉   
県    
円空   
山    
で    
生まれ  
、    
ゲーム  
制作   
会社   
「    
ノーツ  
」    
の    
設立   
者    
だ    
。    
Relation 
──────── 
compound 
nsubj    
case     
compound 
compound 
compound 
compound 
nummod   
obl      
case     
compound 
nmod     
compound 
obl      
case     
acl      
punct    
compound 
compound 
nmod     
punct    
compound 
punct    
case     
compound 
root     
cop      
punct    
PoS 
─── 
NPR 
NPR 
P   
NUM 
CL  
NUM 
CL  
NUM 
CL  
P   
NPR 
NPR 
NPR 
NPR 
P   
VB  
PU  
N   
N   
N   
PUL 
NPR 
PUR 
P   
N   
N   
AX  
PU  
Tok  
──── 
奈須   
きのこ  
は    
1973 
年    
11   
月    
28   
日    
に    
千葉   
県    
円空   
山    
で    
生まれ  
、    
ゲーム  
制作   
会社   
「    
ノーツ  
」    
の    
設立   
者    
だ    
。    
NER Type         
──────────────── 
◄─┐              
◄─┴►PERSON       
                 
◄─┐              
  │              
  │              
  ├►DATE         
  │              
◄─┘              
                 
◄─┐              
  │              
  ├►LOCATION     
◄─┘              
                 
                 
                 
                 
                 
                 
                 
───►ORGANIZATION 
                 
                 
                 
                 
                 
                 
Tok  
──── 
奈須   
きのこ  
は    
1973 
年    
11   
月    
28   
日    
に    
千葉   
県    
円空   
山    
で    
生まれ  
、    
ゲーム  
制作   
会社   
「    
ノーツ  
」    
の    
設立   
者    
だ    
。    
SRL PA1  
──────── 
         
         
         
         
         
         
         
         
         
         
◄─┐      
◄─┴►ノ?   
         
╟──►PRED 
         
         
         
         
         
         
         
         
         
         
         
         
         
         
Tok  
──── 
奈須   
きのこ  
は    
1973 
年    
11   
月    
28   
日    
に    
千葉   
県    
円空   
山    
で    
生まれ  
、    
ゲーム  
制作   
会社   
「    
ノーツ  
」    
の    
設立   
者    
だ    
。    
SRL PA2  
──────── 
◄─┐      
  ├►ガ    
◄─┘      
◄─┐      
  │      
  │      
  ├►時間   
  │      
  │      
◄─┘      
◄─┐      
  │      
  ├►デ    
  │      
◄─┘      
╟──►PRED 
         
         
         
         
         
         
         
         
         
         
         
         
Tok  
──── 
奈須   
きのこ  
は    
1973 
年    
11   
月    
28   
日    
に    
千葉   
県    
円空   
山    
で    
生まれ  
、    
ゲーム  
制作   
会社   
「    
ノーツ  
」    
の    
設立   
者    
だ    
。    
SRL PA3  
──────── 
         
         
         
         
         
         
         
         
         
         
         
         
         
         
         
         
         
◄─┐      
◄─┴►ノ    
╟──►PRED 
         
         
         
         
         
         
         
         
Tok  
──── 
奈須   
きのこ  
は    
1973 
年    
11   
月    
28   
日    
に    
千葉   
県    
円空   
山    
で    
生まれ  
、    
ゲーム  
制作   
会社   
「    
ノーツ  
」    
の    
設立   
者    
だ    
。    
SRL PA4  
──────── 
◄─┐      
  ├►ガ    
◄─┘      
         
         
         
         
         
         
         
         
         
         
         
         
         
         
◄─┐      
  │      
  │      
  ├►ヲ    
  │      
  │      
◄─┘      
╟──►PRED 
         
         
         
Tok  
──── 
奈須   
きのこ  
は    
1973 
年    
11   
月    
28   
日    
に    
千葉   
県    
円空   
山    
で    
生まれ  
、    
ゲーム  
制作   
会社   
「    
ノーツ  
」    
の    
設立   
者    
だ    
。    
SRL PA5  
──────── 
◄─┐      
  ├►ガ    
◄─┘      
         
         
         
         
         
         
         
         
         
         
         
         
         
         
         
         
         
         
         
         
         
         
╟──►PRED 
         
         
Tok  
──── 
奈須   
きのこ  
は    
1973 
年    
11   
月    
28   
日    
に    
千葉   
県    
円空   
山    
で    
生まれ  
、    
ゲーム  
制作   
会社   
「    
ノーツ  
」    
の    
設立   
者    
だ    
。    
PoS    3         4       5       6       7       8       9       10      11
───────────────────────────────────────────────────────────────────────────
NPR──┐                                                                     
NPR──┴►NP ─────┐                                                           
P ───────── ───┴────────────────────────────────────────────────►PP ───┐   
NUM──┐                                                                 │   
CL ──┴►NUMCLP──┐                                                       │   
NUM──┐         │                                                       │   
CL ──┴►NUMCLP──┼►NP ───┐                                               │   
NUM──┐         │       │                                               │   
CL ──┴►NUMCLP──┘       ├►PP ───┐                                       │   
P ───────── ───────────┘       │                                       │   
NPR──┐                         │                                       │   
NPR──┴►PP ─────┐               │                                       │   
NPR────────    ├►NP ───┐       ├────────────────────────────────►IP────┤   
NPR──────── ───┘       ├►PP────┤                                       │   
P ───────── ───────────┘       │                                       │   
VB ──────── ───────────────────┘                                       ├►IP
PU ──────── ───────────────────────────────────────────────────────────┤   
N ───┐                                                                 │   
N ───┴►NP ──────►PRN ──┐                                               │   
N ───────── ───────────┴►NP ────►PRN ──┐                               │   
PUL──────── ───────────────────────────┤                               │   
NPR──────── ───────────────────────────┼►NP ───┐                       │   
PUR──────── ───────────────────────────┘       ├►PP ───┐               │   
P ───────── ───────────────────────────────────┘       ├►IP ───┐       │   
N ───────── ───────────────────────────────────────────┘       ├►NP────┤   
N ───────── ───────────────────────────────────────────────────┘       │   
AX ──────── ───────────────────────────────────────────────────────────┤   
PU ──────── ───────────────────────────────────────────────────────────┘   
" ], "text/plain": [ "" ] }, "metadata": { "tags": [] }, "output_type": "display_data" } ], "source": [ "ja(['2021年、HanLPv2.1は次世代の最先端多言語NLP技術を本番環境に導入します。',\n", " '奈須きのこは1973年11月28日に千葉県円空山で生まれ、ゲーム制作会社「ノーツ」の設立者だ。',]).pretty_print()" ] }, { "cell_type": "markdown", "metadata": { "id": "NifrOGlNK7KD" }, "source": [ "以及支持[130种语言](https://hanlp.hankcs.com/docs/api/hanlp/pretrained/mtl.html#hanlp.pretrained.mtl.UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_MMINILMV2L6)的多语种联合模型:" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 1000 }, "id": "ae-4j5sbK7KD", "outputId": "2777cc5d-c1c5-4091-b754-0c220dafea8a" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [] }, { "data": { "text/html": [ "
Dep Tree   
────────── 
       ┌─► 
    ┌─►├── 
    │  └─► 
    │  ┌─► 
┌┬┬─┴──┴── 
│││  ┌───► 
│││  │┌──► 
│││  ││┌─► 
││└─►└┴┴── 
││    ┌──► 
││    │┌─► 
│└───►└┴── 
└────────► 
Token            
──────────────── 
In               
2021             
,                
HanLPv2.1        
delivers         
state-of-the-art 
multilingual     
NLP              
techniques       
to               
production       
environments     
.                
Relation 
──────── 
case     
obl      
punct    
nsubj    
root     
amod     
amod     
compound 
obj      
case     
compound 
obl      
punct    
Lemma            
──────────────── 
in               
2021             
,                
HANlpv2.1        
deliver          
state-of-the-art 
multilingual     
NLP              
technique        
to               
production       
environment      
.                
PoS   
───── 
ADP   
NUM   
PUNCT 
PROPN 
VERB  
ADJ   
ADJ   
PROPN 
NOUN  
ADP   
NOUN  
NOUN  
PUNCT 
Tok              
──────────────── 
In               
2021             
,                
HanLPv2.1        
delivers         
state-of-the-art 
multilingual     
NLP              
techniques       
to               
production       
environments     
.                
NER Type        
─────────────── 
                
───►DATE        
                
───►WORK_OF_ART 
                
                
                
                
                
                
                
                
                
Tok              
──────────────── 
In               
2021             
,                
HanLPv2.1        
delivers         
state-of-the-art 
multilingual     
NLP              
techniques       
to               
production       
environments     
.                
SRL PA1      
──────────── 
◄─┐          
◄─┴►ARGM-TMP 
             
───►ARG0     
╟──►PRED     
             
             
             
             
◄─┐          
  ├►ARG2     
◄─┘          
             
Tok              
──────────────── 
In               
2021             
,                
HanLPv2.1        
delivers         
state-of-the-art 
multilingual     
NLP              
techniques       
to               
production       
environments     
.                
PoS      3       4       5       6
──────────────────────────────────
ADP ───────────┐                  
NUM ────►NP ───┴────────►PP ───┐  
PUNCT──────────────────────────┤  
PROPN───────────────────►NP────┤  
VERB ──────────────────┐       │  
ADJ ───┐               │       │  
ADJ    │               │       │  
PROPN  ├────────►NP────┼►VP────┼►S
NOUN ──┘               │       │  
ADP ───────────┐       │       │  
NOUN ──┐       ├►PP ───┘       │  
NOUN ──┴►NP ───┘               │  
PUNCT──────────────────────────┘  

Dep Tree      
───────────── 
          ┌─► 
┌────────►├── 
│         └─► 
│┌───────►┌── 
││        └─► 
││        ┌─► 
││   ┌───►├── 
││   │    └─► 
││   │┌─────► 
││   ││┌────► 
││   │││┌───► 
││   ││││┌──► 
││   │││││┌─► 
││┌─►└┴┴┴┴┼── 
│││       └─► 
│││       ┌─► 
│││    ┌─►├── 
│││    │  └─► 
└┴┴────┴─┬┬── 
         │└─► 
         └──► 
Token     
───────── 
2021      
年         
、         
HanLPv2.1 
は         
次         
世代        
の         
最         
先端        
多         
言語        
NLP       
技術        
を         
本番        
環境        
に         
導入        
します       
。         
Relation 
──────── 
nummod   
obl      
punct    
nsubj    
case     
compound 
nmod     
case     
compound 
compound 
compound 
compound 
compound 
obj      
case     
compound 
obl      
case     
root     
aux      
punct    
Lemma     
───────── 
2021      
年         
、         
HANLPV2.1 
は         
次         
世代        
の         
最         
先端        
多         
言語        
NLP       
技術        
を         
本番        
環境        
に         
導入        
します       
。         
PoS   
───── 
NUM   
NOUN  
PUNCT 
NOUN  
ADP   
NOUN  
NOUN  
ADP   
NOUN  
NOUN  
NOUN  
NOUN  
NOUN  
NOUN  
ADP   
NOUN  
NOUN  
ADP   
VERB  
AUX   
PUNCT 
Tok       
───────── 
2021      
年         
、         
HanLPv2.1 
は         
次         
世代        
の         
最         
先端        
多         
言語        
NLP       
技術        
を         
本番        
環境        
に         
導入        
します       
。         
NER Type 
──────── 
◄─┐      
◄─┴►DATE 
         
         
         
         
         
         
         
         
         
         
         
         
         
         
         
         
         
         
         
Tok       
───────── 
2021      
年         
、         
HanLPv2.1 
は         
次         
世代        
の         
最         
先端        
多         
言語        
NLP       
技術        
を         
本番        
環境        
に         
導入        
します       
。         
PoS      3       4       5       6       7       8       9 
───────────────────────────────────────────────────────────
NUM ───────────────────────────────────────────────────┐   
NOUN ──────────────────────────────────────────────────┤   
PUNCT──────────────────────────────────────────────────┤   
NOUN ──────────────────────────────────────────────────┤   
ADP ───────────────────────────┐                       │   
NOUN ──────────────────────────┤                       │   
NOUN ──────────────────────────┤                       │   
ADP ───────────────────────────┼►VP ────►VP ────►IP────┤   
NOUN ───►ADJP──┐               │                       │   
NOUN ───►ADJP──┴►ADJP──┐       │                       │   
NOUN ───────────►ADJP──┴►ADJP──┘                       ├►IP
NOUN ──┐                                               │   
NOUN   ├►NP ───┐                                       │   
NOUN ──┘       ├►NP ───┐                               │   
ADP ───────────┘       │                               │   
NOUN ──────────────────┼►NP ───┐                       │   
NOUN ──────────────────┘       ├►NP ───┐               │   
ADP ────────────────────►PP ───┘       │               │   
VERB ──┐                               ├────────►NP────┤   
AUX ───┴────────────────────────►VP ───┘               │   
PUNCT──────────────────────────────────────────────────┘   

Dep Tree     
──────────── 
         ┌─► 
   ┌────►└── 
   │┌──────► 
   ││   ┌──► 
   ││   │┌─► 
   ││┌─►└┴── 
┌┬─┴┴┴────── 
││  ┌──────► 
││  │    ┌─► 
││  │┌──►└── 
││  ││   ┌─► 
││  ││┌─►└── 
││  │││  ┌─► 
│└─►└┴┴──┴── 
└──────────► 
Token     
───────── 
2021      
年         
HanLPv2.1 
为         
生产        
环境        
带来        
次世代       
最         
先进的       
多         
语种        
NLP       
技术        
。         
Relation  
───────── 
nummod    
nmod:tmod 
nsubj     
case      
nmod      
obl       
root      
nmod      
advmod    
amod      
nummod    
nmod      
nmod      
obj       
punct     
Lemma     
───────── 
2021      
年         
HANlpv2.1 
为         
生产        
环境        
带来        
次世代       
最         
先进的       
多         
语种        
NLP       
技术        
。         
PoS   
───── 
NUM   
NOUN  
X     
ADP   
NOUN  
NOUN  
VERB  
NOUN  
ADV   
ADJ   
NUM   
NOUN  
X     
NOUN  
PUNCT 
Tok       
───────── 
2021      
年         
HanLPv2.1 
为         
生产        
环境        
带来        
次世代       
最         
先进的       
多         
语种        
NLP       
技术        
。         
NER Type   
────────── 
◄─┐        
◄─┴►DATE   
───►PERSON 
           
           
           
           
           
           
           
           
           
           
           
           
Tok       
───────── 
2021      
年         
HanLPv2.1 
为         
生产        
环境        
带来        
次世代       
最         
先进的       
多         
语种        
NLP       
技术        
。         
SRL PA1      
──────────── 
◄─┐          
◄─┴►ARGM-TMP 
             
             
             
             
╟──►PRED     
             
             
             
             
             
             
             
             
Tok       
───────── 
2021      
年         
HanLPv2.1 
为         
生产        
环境        
带来        
次世代       
最         
先进的       
多         
语种        
NLP       
技术        
。         
PoS      3       4       5       6       7       8 
───────────────────────────────────────────────────
NUM ───┐                                           
NOUN ──┴────────────────────────────────►NP ───┐   
X ──────────────────────────────────────►NP────┤   
ADP ───────────┐                               │   
NOUN ──┐       ├────────────────►PP ───┐       │   
NOUN ──┴►NP ───┘                       │       │   
VERB ──────────────────────────┐       ├►VP────┤   
NOUN ───────────►ADJP──┐       │       │       │   
ADV ────►ADVP──┐       │       ├►VP ───┘       ├►IP
ADJ ────►ADJP──┴►ADJP──┤       │               │   
NUM ────►QP ───┐       ├►NP ───┘               │   
NOUN ───►NP ───┴►NP────┤                       │   
X ─────┐               │                       │   
NOUN ──┴────────►NP ───┘                       │   
PUNCT──────────────────────────────────────────┘   
" ], "text/plain": [ "" ] }, "metadata": { "tags": [] }, "output_type": "display_data" } ], "source": [ "from hanlp.utils.torch_util import gpus_available\n", "if gpus_available(): # 建议在GPU上运行XLMR_BASE,否则运行mini模型\n", " mul = hanlp.load(hanlp.pretrained.mtl.UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_XLMR_BASE)\n", "else:\n", " if 'ja' in globals(): # Binder内存只有2G,释放已加载的模型\n", " del ja\n", " mul = hanlp.load(hanlp.pretrained.mtl.UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_MMINILMV2L6)\n", "mul(['In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environments.',\n", " '2021年、HanLPv2.1は次世代の最先端多言語NLP技術を本番環境に導入します。',\n", " '2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。']).pretty_print()" ] }, { "cell_type": "markdown", "metadata": { "id": "0QV_93CjK7KD" }, "source": [ "你可以在下面输入你想执行的代码~" ] } ], "metadata": { "accelerator": "GPU", "colab": { "collapsed_sections": [], "name": "tutorial.ipynb", "provenance": [] }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.12" } }, "nbformat": 4, "nbformat_minor": 1 } ================================================ FILE: plugins/hanlp_demo/setup.py ================================================ # -*- coding:utf-8 -*- # Author: hankcs # Date: 2019-12-28 19:26 from os.path import abspath, join, dirname from setuptools import find_packages, setup this_dir = abspath(dirname(__file__)) with open(join(this_dir, 'README.md'), encoding='utf-8') as file: long_description = file.read() setup( name='hanlp_demo', version='0.0.1', description='HanLP: Han Language Processing', long_description=long_description, long_description_content_type="text/markdown", url='https://github.com/hankcs/HanLP', author='hankcs', 
author_email='hankcshe@gmail.com', license='Apache License 2.0', classifiers=[ 'Intended Audience :: Science/Research', 'Intended Audience :: Developers', "Development Status :: 3 - Alpha", 'Operating System :: OS Independent', "License :: OSI Approved :: Apache Software License", 'Programming Language :: Python :: 3 :: Only', 'Topic :: Scientific/Engineering :: Artificial Intelligence', "Topic :: Text Processing :: Linguistic" ], keywords='corpus,machine-learning,NLU,NLP', packages=find_packages(exclude=['docs', 'tests*']), include_package_data=True, install_requires=[ 'hanlp_common' ], python_requires='>=3.6', ) ================================================ FILE: plugins/hanlp_restful/README.md ================================================ # RESTful API Client for HanLP [中文](https://github.com/hankcs/HanLP/tree/doc-zh) | [1.x](https://github.com/hankcs/HanLP/tree/1.x) | [forum](https://bbs.hankcs.com/) | [docker](https://github.com/WalterInSH/hanlp-jupyter-docker) The multilingual NLP library for researchers and companies, built on PyTorch and TensorFlow 2.x, for advancing state-of-the-art deep learning techniques in both academia and industry. HanLP was designed from day one to be efficient, user-friendly and extendable. It comes with pretrained models for various human languages including English, Chinese and many others. Currently, HanLP 2.0 is in alpha stage with more killer features on the roadmap. Discussions are welcome on our [forum](https://bbs.hankcs.com/), while bug reports and feature requests are reserved for GitHub issues. For Java users, please check out the [1.x](https://github.com/hankcs/HanLP/tree/1.x) branch. ## Installation ```bash pip install hanlp-restful ``` ## License HanLP is licensed under **Apache License 2.0**. You can use HanLP in your commercial products for free. We would appreciate it if you add a link to HanLP on your website. 
class HanLPClient(object):
    """A thin HTTP client for a HanLP RESTful service.

    POST endpoints are sent through the module-level ``_post`` helper, while GET endpoints
    (currently ``/about``) go through :mod:`urllib`. Both honour ``timeout`` and ``verify``.
    """

    def __init__(self, url: str, auth: str = None, language=None, timeout=60, verify=True) -> None:
        """
        Args:
            url (str): An API endpoint to a service provider.
            auth (str): An auth key licenced from a service provider. When ``None``, the value of the
                ``HANLP_AUTH`` environment variable is used (which may itself be unset).
            language (str): The default language for each :func:`~hanlp_restful.HanLPClient.parse` call.
                Contact the service provider for the list of languages supported.
                Conventionally, ``zh`` is used for Chinese and ``mul`` for multilingual.
                Leave ``None`` to use the default language on server.
            timeout (int): Maximum waiting time in seconds for a request.
            verify (bool): ``True`` to enable SSL cert verification. You can also pass ``verify`` the path to a
                CA_BUNDLE file or directory with certificates of trusted CAs (``requests`` required for
                POST requests).
        """
        super().__init__()
        self._language = language
        self._timeout = timeout
        self._url = url
        if auth is None:
            import os
            auth = os.getenv('HANLP_AUTH', None)
        self._auth = auth
        self._verify = verify

    def parse(self,
              text: Union[str, List[str]] = None,
              tokens: List[List[str]] = None,
              tasks: Optional[Union[str, List[str]]] = None,
              skip_tasks: Optional[Union[str, List[str]]] = None,
              language: str = None,
              ) -> 'Document':
        """
        Parse a piece of text.

        Args:
            text: A document (str), or a list of sentences (List[str]).
            tokens: A list of sentences where each sentence is a list of tokens.
            tasks: The tasks to predict. Use ``tasks=[...]`` to run selected tasks only. Dependent tasks will be
                automatically selected.
            skip_tasks: The tasks to skip. Use ``skip_tasks='tok/fine'`` to enable coarse tokenization for all tasks.
                Use ``tasks=['tok/coarse', ...]`` and ``skip_tasks='tok/fine'`` to enable coarse tokenization for
                selected tasks.
            language: The language of input text or tokens. ``None`` to use the default language on server.

        Returns:
            A :class:`~hanlp_common.document.Document`.

        Examples::

            # Use tasks=[...] to run selected tasks only
            HanLP('晓美焰来到自然语义科技公司', tasks=['pos', 'ner'])

            # Use skip_tasks='tok/fine' to enable coarse tokenization for all tasks
            HanLP('晓美焰来到自然语义科技公司', skip_tasks='tok/fine')

            # Use tasks=['tok/coarse', ...] and skip_tasks='tok/fine' to enable
            # coarse tokenization for selected tasks
            HanLP('晓美焰来到自然语义科技公司', tasks=['tok/coarse','pos'],skip_tasks='tok/fine')

        Raises:
            HTTPError: Any errors happening on the Internet side or the server side. Refer to the ``code`` and
                ``msg`` of the exception for more details. A list of common errors:

        - ``400 Bad Request`` indicates that the server cannot process the request due to a client
          fault (e.g., text too long, language unsupported).
        - ``401 Unauthorized`` indicates that the request lacks **valid** ``auth`` credentials for the API.
        - ``422 Unprocessable Entity`` indicates that the content type of the request entity is not in
          proper json format.
        - ``429 Too Many Requests`` indicates the user has sent too many requests in a given
          amount of time ("rate limiting").

        """
        assert text or tokens, 'At least one of text or tokens has to be specified.'
        response = self._send_post_json(self._url + '/parse', {
            'text': text,
            'tokens': tokens,
            'tasks': tasks,
            'skip_tasks': skip_tasks,
            'language': language or self._language
        })
        return Document(response)

    def __call__(self,
                 text: Union[str, List[str]] = None,
                 tokens: List[List[str]] = None,
                 tasks: Optional[Union[str, List[str]]] = None,
                 skip_tasks: Optional[Union[str, List[str]]] = None,
                 language: str = None,
                 ) -> 'Document':
        """
        A shortcut of :meth:`~hanlp_restful.HanLPClient.parse`.
        """
        return self.parse(text, tokens, tasks, skip_tasks, language)

    def about(self) -> Dict[str, Any]:
        """Get the information about server and your client.

        Returns:
            A dict containing your rate limit and server version etc.

        """
        info = self._send_get_json(self._url + '/about', {})
        return Document(info)

    def _send_post(self, url, form: Dict[str, Any]):
        """POST ``form`` as a JSON body through :mod:`urllib` and return the decoded response text."""
        request = Request(url, json.dumps(form).encode())
        self._add_headers(request)
        return self._fire_request(request)

    def _ssl_context(self):
        """Build the SSL context matching ``self._verify`` for :mod:`urllib` requests.

        Returns:
            ``None`` when default certificate verification applies, a context with hostname checking
            and certificate verification disabled when ``verify`` is falsy, or a context trusting the
            CA bundle file/directory when ``verify`` is a path.
        """
        verify = self._verify
        if not verify:
            import ssl
            # Same opt-out the urllib-based ``_post`` fallback uses.
            ctx = ssl.create_default_context()
            ctx.check_hostname = False
            ctx.verify_mode = ssl.CERT_NONE
            return ctx
        if isinstance(verify, str):
            import os
            import ssl
            if os.path.isdir(verify):
                return ssl.create_default_context(capath=verify)
            return ssl.create_default_context(cafile=verify)
        return None

    def _fire_request(self, request) -> str:
        """Send a prepared :class:`urllib.request.Request` and return the decoded body.

        The ``verify`` setting is honoured here so that GET endpoints behave like POST ones,
        and the response is closed deterministically.
        """
        with urlopen(request, timeout=self._timeout, context=self._ssl_context()) as response:
            return response.read().decode()

    def _send_post_json(self, url, form: Dict[str, Any]):
        """POST ``form`` via ``_post`` with the auth header (if any) and decode the JSON response."""
        headers = dict()
        if self._auth:
            headers['Authorization'] = f'Basic {self._auth}'
        return json.loads(_post(url, form, headers, self._timeout, verify=self._verify))

    def _send_get(self, url, form: Dict[str, Any]):
        """GET ``url`` with ``form`` url-encoded as the query string and return the response text."""
        request = Request(url + '?' + urlencode(form))
        self._add_headers(request)
        return self._fire_request(request)

    def _add_headers(self, request):
        """Attach the ``Authorization`` header to ``request`` when an auth key is configured."""
        if self._auth:
            request.add_header('Authorization', f'Basic {self._auth}')

    def _send_get_json(self, url, form: Dict[str, Any]):
        """GET ``url`` and decode the JSON response."""
        return json.loads(self._send_get(url, form))

    def text_style_transfer(self, text: Union[str, List[str]], target_style: str, language: str = None) \
            -> Union[str, List[str]]:
        """ Text style transfer aims to change the style of the input text to the target style while preserving its
        content.

        Args:
            text: Source text.
            target_style: Target style.
            language: The language of input text. ``None`` to use the default language.

        Returns:
            Text or a list of text of the target style.

        Examples::

            HanLP.text_style_transfer(['国家对中石油抱有很大的期望.', '要用创新去推动高质量的发展。'],
                                      target_style='gov_doc')
            # Output:
            [
                '国家对中石油寄予厚望。',
                '要以创新驱动高质量发展。'
            ]

            HanLP.text_style_transfer('我看到了窗户外面有白色的云和绿色的森林',
                                      target_style='modern_poetry')
            # Output:
            '我看见窗外的白云绿林'
        """
        response = self._send_post_json(self._url + '/text_style_transfer',
                                        {'text': text, 'target_style': target_style,
                                         'language': language or self._language})
        return response

    def semantic_textual_similarity(self, text: Union[Tuple[str, str], List[Tuple[str, str]]], language: str = None) \
            -> Union[float, List[float]]:
        """ Semantic textual similarity deals with determining how similar two pieces of texts are.

        Args:
            text: A pair or pairs of text.
            language: The language of input text. ``None`` to use the default language.

        Returns:
            Similarities.

        Examples::

            HanLP.semantic_textual_similarity([
                ('看图猜一电影名', '看图猜电影'),
                ('无线路由器怎么无线上网', '无线上网卡和无线路由器怎么用'),
                ('北京到上海的动车票', '上海到北京的动车票'),
            ])
            # Output:
            [
                0.9764469,  # Similarity of ('看图猜一电影名', '看图猜电影')
                0.0,        # Similarity of ('无线路由器怎么无线上网', '无线上网卡和无线路由器怎么用')
                0.0034587   # Similarity of ('北京到上海的动车票', '上海到北京的动车票')
            ]
        """
        response = self._send_post_json(self._url + '/semantic_textual_similarity',
                                        {'text': text, 'language': language or self._language})
        return response

    def coreference_resolution(self, text: Optional[str] = None, tokens: Optional[List[List[str]]] = None,
                               speakers: Optional[List[str]] = None, language: Optional[str] = None) -> Union[
        Dict[str, Union[List[str], List[List[Tuple[str, int, int]]]]], List[List[Tuple[str, int, int]]]]:
        r""" Coreference resolution is the task of clustering mentions in text that refer to the same underlying
        real world entities.

        Args:
            text: A piece of text, usually a document without tokenization.
            tokens: A list of sentences where each sentence is a list of tokens.
            speakers: A list of speakers where each speaker is a ``str`` representing the speaker's ID, e.g., ``Tom``.
            language: The language of input text. ``None`` to use the default language.

        Returns:
            When ``text`` is specified, return the clusters and tokens. Otherwise just the clusters. In this case,
            you need to ``sum(tokens, [])`` in order to match the span indices with tokens.

        Examples::

            HanLP.coreference_resolution('我姐送我她的猫。我很喜欢它。')
            # Output:
            {'clusters': [
                          [['我', 0, 1], ['我', 3, 4], ['我', 8, 9]],  # refers to the speaker
                          [['我姐', 0, 2], ['她', 4, 5]],              # refers to the speaker's sister
                          [['她的猫', 4, 7], ['它', 11, 12]]],         # refers to the cat of the speaker's sister
             'tokens': ['我', '姐', '送', '我', '她', '的', '猫', '。',
                        '我', '很', '喜欢', '它', '。']}

            HanLP.coreference_resolution(
                tokens=[['我', '姐', '送', '我', '她', '的', '猫', '。'],
                        ['我', '很', '喜欢', '它', '。']])
            # Output:
            [
                [['我', 0, 1], ['我', 3, 4], ['我', 8, 9]],  # refers to the speaker
                [['我姐', 0, 2], ['她', 4, 5]],              # refers to the speaker's sister
                [['她的猫', 4, 7], ['它', 11, 12]]],         # refers to the cat of the speaker's sister

        .. image:: https://file.hankcs.com/img/coref_demo_small.png
            :alt: Coreference resolution visualization
        """
        response = self._send_post_json(self._url + '/coreference_resolution',
                                        {'text': text, 'tokens': tokens, 'speakers': speakers,
                                         'language': language or self._language})
        return response

    def tokenize(self, text: Union[str, List[str]], coarse: Optional[bool] = None, language=None) -> List[List[str]]:
        """ Split a document into sentences and tokenize them. Note that it is always faster to tokenize a whole
        document than to tokenize each sentence one by one. So avoid calling this method sentence by sentence but put
        sentences into a ``list`` and pass them to the ``text`` argument.

        Args:
            text: A document (``str``), or a list of sentences (``List[str]``).
            coarse: Whether to perform coarse-grained or fine-grained tokenization. Any truthy value requests
                the ``tok/coarse`` task; ``None`` or a falsy value requests the ``tok`` task.
            language: The language of input text. ``None`` to use the default language.

        Returns:
            A list of tokenized sentences.

        Raises:
            NotImplementedError: If coarse tokenization is requested for a language other than ``zh``.

        Examples::

            # Avoid tokenizing sentence by sentence, it is expensive:
            HanLP.tokenize('商品和服务。')
            [['商品', '和', '服务', '。']]
            HanLP.tokenize('阿婆主来到北京立方庭参观自然语义科技公司')
            [['阿婆主', '来到', '北京', '立方庭', '参观', '自然', '语义', '科技', '公司']]

            # Instead, the following codes are much faster:
            HanLP.tokenize('商品和服务。阿婆主来到北京立方庭参观自然语义科技公司')
            [['商品', '和', '服务', '。'],
             ['阿婆主', '来到', '北京', '立方庭', '参观', '自然', '语义', '科技', '公司']]

            # To tokenize with coarse-grained standard:
            HanLP.tokenize('商品和服务。阿婆主来到北京立方庭参观自然语义科技公司', coarse=True)
            [['商品', '和', '服务', '。'],
             ['阿婆主', '来到', '北京', '立方庭', '参观', '自然语义科技公司']]

            # To tokenize pre-segmented sentences:
            HanLP.tokenize(['商品和服务。', '当下雨天地面积水分外严重'])
            [['商品', '和', '服务', '。'],
             ['当', '下雨天', '地面', '积水', '分', '外', '严重']]

            # Multilingual tokenization by specifying language='mul':
            HanLP.tokenize(
                ['In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques '
                 'to production environment.',
                 '2021年、HanLPv2.1は次世代の最先端多言語NLP技術を本番環境に導入します。',
                 '2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。'], language='mul')
            [['In', '2021', ',', 'HanLPv2.1', 'delivers', 'state-of-the-art', 'multilingual',
              'NLP', 'techniques', 'to', 'production', 'environment', '.'],
             ['2021', '年', '、', 'HanLPv2.1', 'は', '次', '世代', 'の', '最', '先端', '多',
              '言語', 'NLP', '技術', 'を', '本番', '環境', 'に', '導入', 'します', '。'],
             ['2021', '年', 'HanLPv2.1', '为', '生产', '环境', '带来', '次世代', '最', '先进的',
              '多', '语种', 'NLP', '技术', '。']]
        """
        language = language or self._language
        if coarse and language and language != 'zh':
            raise NotImplementedError(f'Coarse tokenization not supported for {language}. Please set language="zh".')
        # Use the same truthiness test as the guard above so that a truthy ``coarse`` is never
        # rejected for one language yet silently tokenized with the default task for another.
        doc = self.parse(text=text, tasks='tok/coarse' if coarse else 'tok', language=language)
        # Only one task was requested, so the first value holds its output.
        return next(iter(doc.values()))

    def abstract_meaning_representation(self,
                                        text: Union[str, List[str]] = None,
                                        tokens: List[List[str]] = None,
                                        language: str = None,
                                        visualization: str = None,
                                        ) -> List[Dict]:
        """Abstract Meaning Representation (AMR) captures “who is doing what to whom” in a sentence. Each sentence is
        represented as a rooted, directed, acyclic graph consisting of nodes (concepts) and edges (relations).

        Args:
            text: A document (str), or a list of sentences (List[str]).
            tokens: A list of sentences where each sentence is a list of tokens.
            language: The language of input text or tokens. ``None`` to use the default language on server.
            visualization: Set to `dot` or `svg` to obtain corresponding visualization.

        Returns:
            Graphs in meaning representation format.

        Examples::

            HanLP.abstract_meaning_representation('男孩希望女孩相信他。')
            HanLP.abstract_meaning_representation('The boy wants the girl to believe him.',
                                                  language='en')

        .. image:: https://hanlp.hankcs.com/backend/v2/amr_svg?tokens=%E7%94%B7%E5%AD%A9%20%E5%B8%8C%E6%9C%9B%20%E5%A5%B3%E5%AD%A9%20%E7%9B%B8%E4%BF%A1%20%E4%BB%96%20%E3%80%82&language=zh&scale=1
            :alt: Abstract Meaning Representation

        .. image:: https://hanlp.hankcs.com/backend/v2/amr_svg?tokens=The%20boy%20wants%20the%20girl%20to%20believe%20him%20.&language=en&scale=1
            :alt: Abstract Meaning Representation

        """
        assert text or tokens, 'At least one of text or tokens has to be specified.'
        return self._send_post_json(self._url + '/abstract_meaning_representation', {
            'text': text,
            'tokens': tokens,
            'language': language or self._language,
            'visualization': visualization,
        })

    def keyphrase_extraction(
            self,
            text: str,
            topk: int = 10,
            language: str = None,
    ) -> Dict[str, float]:
        r""" Keyphrase extraction aims to identify keywords or phrases reflecting the main topics of a document.

        Args:
            text: The text content of the document. Preferably the concatenation of the title and the content.
            topk: The number of top-K ranked keywords or keyphrases.
            language: The language of input text or tokens. ``None`` to use the default language on server.

        Returns:
            A dictionary containing each keyword or keyphrase and its ranking score :math:`s`, :math:`s \in [0, 1]`.

        Examples::

            HanLP.keyphrase_extraction(
                '自然语言处理是一门博大精深的学科,掌握理论才能发挥出HanLP的全部性能。 '
                '《自然语言处理入门》是一本配套HanLP的NLP入门书,助你零起点上手自然语言处理。', topk=3)
            # Output:
            {'自然语言处理': 0.800000011920929,
             'HanLP的全部性能': 0.5258446335792542,
             '一门博大精深的学科': 0.421421080827713}
        """
        assert text, 'Text has to be specified.'
        return self._send_post_json(self._url + '/keyphrase_extraction', {
            'text': text,
            'language': language or self._language,
            'topk': topk,
        })

    def extractive_summarization(
            self,
            text: str,
            topk: int = 3,
            language: str = None,
    ) -> Dict[str, float]:
        r""" Single document summarization is the task of selecting a subset of the sentences which best
        represents a summary of the document, with a balance of salience and redundancy.

        Args:
            text: The text content of the document.
            topk: The maximum number of top-K ranked sentences. Note that due to Trigram Blocking tricks, the actual
                number of returned sentences could be less than ``topk``.
            language: The language of input text or tokens. ``None`` to use the default language on server.

        Returns:
            A dictionary containing each sentence and its ranking score :math:`s \in [0, 1]`.

        Examples::

            HanLP.extractive_summarization('''
            据DigiTimes报道,在上海疫情趋缓,防疫管控开始放松后,苹果供应商广达正在逐步恢复其中国工厂的MacBook产品生产。
            据供应链消息人士称,生产厂的订单拉动情况正在慢慢转强,这会提高MacBook Pro机型的供应量,并缩短苹果客户在过去几周所经历的延长交货时间。
            仍有许多苹果笔记本用户在等待3月和4月订购的MacBook Pro机型到货,由于苹果的供应问题,他们的发货时间被大大推迟了。
            据分析师郭明錤表示,广达是高端MacBook Pro的唯一供应商,自防疫封控依赖,MacBook Pro大部分型号交货时间增加了三到五周,
            一些高端定制型号的MacBook Pro配置要到6月底到7月初才能交货。
            尽管MacBook Pro的生产逐渐恢复,但供应问题预计依然影响2022年第三季度的产品销售。
            苹果上周表示,防疫措施和元部件短缺将继续使其难以生产足够的产品来满足消费者的强劲需求,这最终将影响苹果6月份的收入。
            ''')
            # Output:
            {'据DigiTimes报道,在上海疫情趋缓,防疫管控开始放松后,苹果供应商广达正在逐步恢复其中国工厂的MacBook产品生产。': 0.9999,
             '仍有许多苹果笔记本用户在等待3月和4月订购的MacBook Pro机型到货,由于苹果的供应问题,他们的发货时间被大大推迟了。': 0.5800,
             '尽管MacBook Pro的生产逐渐恢复,但供应问题预计依然影响2022年第三季度的产品销售。': 0.5422}
        """
        assert text, 'Text has to be non-empty.'
        return self._send_post_json(self._url + '/extractive_summarization', {
            'text': text,
            'language': language or self._language,
            'topk': topk,
        })

    def abstractive_summarization(
            self,
            text: str,
            language: str = None,
    ) -> str:
        r""" Abstractive Summarization is the task of generating a short and concise summary that captures the
        salient ideas of the source text. The generated summaries potentially contain new phrases and sentences that
        may not appear in the source text.

        Args:
            text: The text content of the document.
            language: The language of input text or tokens. ``None`` to use the default language on server.

        Returns:
            Summarization.

        Examples::

            HanLP.abstractive_summarization('''
            每经AI快讯,2月4日,长江证券研究所金属行业首席分析师王鹤涛表示,2023年海外经济衰退,美债现处于历史高位,
            黄金的趋势是值得关注的;在国内需求修复的过程中,看好大金属品种中的铜铝钢。
            此外,在细分的小品种里,建议关注两条主线,一是新能源,比如锂、钴、镍、稀土,二是专精特新主线。(央视财经)
            ''')
            # Output:
            '长江证券:看好大金属品种中的铜铝钢'
        """
        assert text, 'Text has to be non-empty.'
        return self._send_post_json(self._url + '/abstractive_summarization', {
            'text': text,
            'language': language or self._language,
        })

    def grammatical_error_correction(self, text: Union[str, List[str]], language: str = None) \
            -> Union[str, List[str]]:
        """ Grammatical Error Correction (GEC) is the task of correcting different kinds of errors in text such as
        spelling, punctuation, grammatical, and word choice errors.

        Args:
            text: Text potentially containing different kinds of errors such as spelling, punctuation,
                grammatical, and word choice errors.
            language: The language of input text. ``None`` to use the default language.

        Returns:
            Corrected text.

        Examples::

            HanLP.grammatical_error_correction(['每个青年都应当有远大的报复。',
                                                '有的同学对语言很兴趣。'])
            # Output:
            [
                '每个青年都应当有远大的抱负。',
                '有的同学对语言很有兴趣。'
            ]
        """
        response = self._send_post_json(self._url + '/grammatical_error_correction',
                                        {'text': text,
                                         'language': language or self._language})
        return response

    def text_classification(self, text: Union[str, List[str]], model, topk=False, prob=False) -> Union[
        str, Dict[str, float], List[Union[str, Dict[str, float]]]]:
        """ Text classification is the task of assigning a sentence or document an appropriate category.
        The categories depend on the chosen dataset and can range from topics.

        Args:
            text: A document or a list of documents.
            model: The model to use for prediction.
            topk: ``True`` or ``int`` to return the top-k labels.
            prob: Return also probabilities.

        Returns:
            Classification results.
        """
        response = self._send_post_json(self._url + '/text_classification',
                                        {'text': text, 'model': model, 'topk': topk, 'prob': prob})
        return response

    def sentiment_analysis(self, text: Union[str, List[str]], language=None) -> Union[float, List[float]]:
        r""" Sentiment analysis is the task of classifying the polarity of a given text. For instance,
        a text-based tweet can be categorized into either "positive", "negative", or "neutral".

        Args:
            text: A document or a list of documents.
            language: The language of input text. ``None`` to use the default language.

        Returns:
            Sentiment polarity as a numerical value which measures how positive the sentiment is.

        Examples::

            HanLP.sentiment_analysis('''“这是一部男人必看的电影。”人人都这么说。但单纯从性别区分,就会让这电影变狭隘。
            《肖申克的救赎》突破了男人电影的局限,通篇几乎充满令人难以置信的温馨基调,而电影里最伟大的主题是“希望”。
            当我们无奈地遇到了如同肖申克一般囚禁了心灵自由的那种囹圄,我们是无奈的老布鲁克,灰心的瑞德,还是智慧的安迪?
            运用智慧,信任希望,并且勇敢面对恐惧心理,去打败它?
            经典的电影之所以经典,因为他们都在做同一件事——让你从不同的角度来欣赏希望的美好。''')
            0.9505730271339417
        """
        response = self._send_post_json(self._url + '/sentiment_analysis',
                                        {'text': text, 'language': language or self._language})
        return response

    def language_identification(self, text: Union[str, List[str]], topk=False, prob=False) -> Union[
        str, Dict[str, float], List[Union[str, Dict[str, float]]]]:
        """
        Identify the language of a given text.

        Args:
            text: A document or a list of documents.
            topk: ``True`` or ``int`` to return the top-k languages.
            prob: Return also probabilities.

        Returns:
            Identified language in `ISO 639-1 codes`_.

        Examples::

            HanLP.language_identification(
            'In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques.')
            'en'
            lang, prob = HanLP.language_identification(
            '2021年、HanLPv2.1は次世代の最先端多言語NLP技術を本番環境に導入します。', prob=True)
            ('ja', 0.9976244568824768)
            HanLP.language_identification(
            '2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', topk=2)
            ['zh', 'ja']
            HanLP.language_identification(
            '2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', topk=3, prob=True)
            {'zh': 0.3952908217906952, 'en': 0.37189167737960815, 'ja': 0.056213412433862686}

        .. _ISO 639-1 codes:
           https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
        """
        # Language identification is served by the text classification endpoint with the 'lid' model.
        return self.text_classification(text, 'lid', topk, prob)
def test_tokens(self):
    """Send two pre-tokenized sentences, requesting the 'ner*', 'srl' and 'dep' tasks."""
    first_sentence = ["2021年", "HanLPv2.1", "为", "生产", "环境", "带来", "次", "世代", "最", "先进", "的",
                      "多语种", "NLP", "技术", "。"]
    second_sentence = ["英", "首相", "与", "特朗普", "通", "电话", "讨论", "华为", "与", "苹果", "公司", "。"]
    # The test passes as long as the request completes without raising; the result is not inspected.
    self.HanLP(tokens=[first_sentence, second_sentence], tasks=['ner*', 'srl', 'dep'])
def test_keyphrase_extraction(self):
    """Request the top-3 keyphrases of a short two-sentence document and print them."""
    document = (
        '自然语言处理是一门博大精深的学科,掌握理论才能发挥出HanLP的全部性能。 '
        '《自然语言处理入门》是一本配套HanLP的NLP入门书,助你零起点上手自然语言处理。'
    )
    keyphrases = self.HanLP.keyphrase_extraction(document, topk=3)
    print(keyphrases)
/**
 * Common request fields shared by the input classes of the client: the tasks to run,
 * the tasks to skip and the language of the input.
 *
 * @author hankcs
 */
public class BaseInput
{
    /** Tasks to perform; may be null. */
    public String[] tasks;
    /** Tasks to skip; may be null. NOTE(review): snake_case presumably mirrors the request key - confirm. */
    public String[] skip_tasks;
    /** Language of the input; may be null. */
    public String language;

    /**
     * Stores the given values as-is, without copying or validation.
     *
     * @param tasks         Tasks to perform.
     * @param tasksToSkip   Tasks to skip.
     * @param inputLanguage Language of the input.
     */
    public BaseInput(String[] tasks, String[] tasksToSkip, String inputLanguage)
    {
        this.language = inputLanguage;
        this.skip_tasks = tasksToSkip;
        this.tasks = tasks;
    }
}
All Rights Reserved, http://www.hankcs.com/ * See LICENSE file in the project root for full license information. * */ package com.hankcs.hanlp.restful; /** * @author hankcs */ public class DocumentInput extends BaseInput { public String text; public DocumentInput(String text, String[] tasks, String[] skipTasks, String language) { super(tasks, skipTasks, language); this.text = text; } } ================================================ FILE: plugins/hanlp_restful_java/src/main/java/com/hankcs/hanlp/restful/HanLPClient.java ================================================ /* * Han He * me@hankcs.com * 2020-12-26 11:54 PM * * * Copyright (c) 2020, Han He. All Rights Reserved, http://www.hankcs.com/ * See LICENSE file in the project root for full license information. * */ package com.hankcs.hanlp.restful; import com.fasterxml.jackson.databind.ObjectMapper; import com.hankcs.hanlp.restful.mrp.MeaningRepresentation; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; import java.io.OutputStream; import java.net.HttpURLConnection; import java.net.URL; import java.nio.charset.StandardCharsets; import java.util.*; /** * A RESTful client implementing the data format specification of HanLP. * * @author hankcs * @see Data Format */ public class HanLPClient { private String url; private String auth; private String language; private int timeout; private ObjectMapper mapper; /** * @param url An API endpoint to a service provider. * @param auth An auth key licenced by a service provider. * @param language The language this client will be expecting. Contact the service provider for the list of * languages supported. Conventionally, zh is used for Chinese and mul for multilingual. * Leave null to use the default language on server. * @param timeout Maximum waiting time in seconds for a request. 
*/ public HanLPClient(String url, String auth, String language, int timeout) { if (auth == null) { auth = System.getenv().getOrDefault("HANLP_AUTH", null); } this.url = url; this.auth = auth; this.language = language; this.timeout = timeout * 1000; this.mapper = new ObjectMapper(); } /** * @param url An API endpoint to a service provider. * @param auth An auth key licenced by a service provider. */ public HanLPClient(String url, String auth) { this(url, auth, null, 5); } /** * Parse a raw document. * * @param text Document content which can have multiple sentences. * @param tasks Tasks to perform. * @param skipTasks Tasks to skip. * @return Parsed annotations. * @throws IOException HTTP exception. * @see Data Format */ public Map parse(String text, String[] tasks, String[] skipTasks) throws IOException { //noinspection unchecked return mapper.readValue(post("/parse", new DocumentInput(text, tasks, skipTasks, language)), Map.class); } /** * Parse a raw document. * * @param text Document content which can have multiple sentences. * @return Parsed annotations. * @throws IOException HTTP exception. * @see Data Format */ public Map parse(String text) throws IOException { return parse(text, null, null); } /** * Parse an array of sentences. * * @param sentences Multiple sentences to parse. * @param tasks Tasks to perform. * @param skipTasks Tasks to skip. * @return Parsed annotations. * @throws IOException HTTP exception. * @see Data Format */ public Map parse(String[] sentences, String[] tasks, String[] skipTasks) throws IOException { //noinspection unchecked return mapper.readValue(post("/parse", new SentenceInput(sentences, tasks, skipTasks, language)), Map.class); } /** * Parse an array of sentences. * * @param sentences Multiple sentences to parse. * @return Parsed annotations. * @throws IOException HTTP exception. 
* @see Data Format */ public Map parse(String[] sentences) throws IOException { return parse(sentences, null, null); } /** * Parse an array of pre-tokenized sentences. * * @param tokens Multiple pre-tokenized sentences to parse. * @param tasks Tasks to perform. * @param skipTasks Tasks to skip. * @return Parsed annotations. * @throws IOException HTTP exception. * @see Data Format */ public Map parse(String[][] tokens, String[] tasks, String[] skipTasks) throws IOException { //noinspection unchecked return mapper.readValue(post("/parse", new TokenInput(tokens, tasks, skipTasks, language)), Map.class); } /** * Parse an array of pre-tokenized sentences. * * @param tokens Multiple pre-tokenized sentences to parse. * @return Parsed annotations. * @throws IOException HTTP exception. * @see Data Format */ public Map parse(String[][] tokens) throws IOException { return parse(tokens, null, null); } /** * Split a document into sentences and tokenize them. * * @param text A document. * @param coarse Whether to perform coarse-grained or fine-grained tokenization. * @return A list of tokenized sentences. * @throws IOException HTTP exception. */ public List> tokenize(String text, Boolean coarse) throws IOException { String[] tasks; if (coarse != null) { if (coarse) tasks = new String[]{"tok/coarse"}; else tasks = new String[]{"tok/fine"}; } else tasks = new String[]{"tok"}; Map doc = parse(text, tasks, null); //noinspection unchecked return doc.values().iterator().next(); } /** * Split a document into sentences and tokenize them using fine-grained standard. * * @param text A document. * @return A list of tokenized sentences. * @throws IOException HTTP exception. */ public List> tokenize(String text) throws IOException { return tokenize(text, null); } /** * Text style transfer aims to change the style of the input text to the target style while preserving its content. * * @param text Source text. * @param targetStyle Target style. * @return Text of the target style. 
*/ public List textStyleTransfer(List text, String targetStyle) throws IOException { Map input = new HashMap<>(); input.put("text", text); input.put("target_style", targetStyle); input.put("language", language); //noinspection unchecked return mapper.readValue(post("/text_style_transfer", input), List.class); } /** * Text style transfer aims to change the style of the input text to the target style while preserving its content. * * @param text Source text. * @param targetStyle Target style. * @return Text of the target style. */ public String textStyleTransfer(String text, String targetStyle) throws IOException { Map input = new HashMap<>(); input.put("text", text); input.put("target_style", targetStyle); input.put("language", language); return mapper.readValue(post("/text_style_transfer", input), String.class); } /** * Grammatical Error Correction (GEC) is the task of correcting different kinds of errors in text such as * spelling, punctuation, grammatical, and word choice errors. * * @param text Text potentially containing different kinds of errors such as spelling, punctuation, * grammatical, and word choice errors. * @return Corrected text. */ public List grammaticalErrorCorrection(List text) throws IOException { Map input = new HashMap<>(); input.put("text", text); input.put("language", language); //noinspection unchecked return mapper.readValue(post("/grammatical_error_correction", input), List.class); } /** * Grammatical Error Correction (GEC) is the task of correcting different kinds of errors in text such as * spelling, punctuation, grammatical, and word choice errors. * * @param text Text potentially containing different kinds of errors such as spelling, punctuation, * grammatical, and word choice errors. * @return Corrected text. 
*/ public String[] grammaticalErrorCorrection(String[] text) throws IOException { Map input = new HashMap<>(); input.put("text", text); input.put("language", language); //noinspection unchecked return mapper.readValue(post("/grammatical_error_correction", input), String[].class); } /** * Grammatical Error Correction (GEC) is the task of correcting different kinds of errors in text such as * spelling, punctuation, grammatical, and word choice errors. * * @param text Text potentially containing different kinds of errors such as spelling, punctuation, * grammatical, and word choice errors. * @return Corrected text. */ public String grammaticalErrorCorrection(String text) throws IOException { Map input = new HashMap<>(); input.put("text", text); input.put("language", language); return mapper.readValue(post("/grammatical_error_correction", input), String.class); } /** * Semantic textual similarity deals with determining how similar two pieces of texts are. * * @param textA The first text. * @param textB The second text. * @return Their similarity. * @throws IOException HTTP errors. */ public Double semanticTextualSimilarity(String textA, String textB) throws IOException { Map input = new HashMap<>(); input.put("text", new String[]{textA, textB}); input.put("language", language); return mapper.readValue(post("/semantic_textual_similarity", input), Double.class); } /** * Semantic textual similarity deals with determining how similar two pieces of texts are. * * @param text The pairs of text. * @return Their similarities. * @throws IOException HTTP errors. */ public List semanticTextualSimilarity(String[][] text) throws IOException { Map input = new HashMap<>(); input.put("text", text); input.put("language", language); //noinspection unchecked return mapper.readValue(post("/semantic_textual_similarity", input), List.class); } /** * Coreference resolution is the task of clustering mentions in text that refer to the same underlying real world entities. 
* * @param text A piece of text, usually a document without tokenization. * @return Coreference resolution clusters and tokens. * @throws IOException HTTP errors. */ public CoreferenceResolutionOutput coreferenceResolution(String text) throws IOException { Map input = new HashMap<>(); input.put("text", text); input.put("language", language); //noinspection unchecked Map response = mapper.readValue(post("/coreference_resolution", input), Map.class); //noinspection unchecked List> clusters = response.get("clusters"); return new CoreferenceResolutionOutput(_convert_clusters(clusters), (ArrayList) response.get("tokens")); } /** * Coreference resolution is the task of clustering mentions in text that refer to the same underlying real world entities. * * @param tokens A list of sentences where each sentence is a list of tokens. * @param speakers A list of speakers where each speaker is a String representing the speaker's ID, e.g., "Tom". * @return Coreference resolution clusters. * @throws IOException HTTP errors. */ public List> coreferenceResolution(String[][] tokens, String[] speakers) throws IOException { Map input = new HashMap<>(); input.put("tokens", tokens); input.put("speakers", speakers); input.put("language", language); //noinspection unchecked List> clusters = mapper.readValue(post("/coreference_resolution", input), List.class); return _convert_clusters(clusters); } /** * Coreference resolution is the task of clustering mentions in text that refer to the same underlying real world entities. * * @param tokens A list of sentences where each sentence is a list of tokens. * @return Coreference resolution clusters. * @throws IOException HTTP errors. 
*/ public List> coreferenceResolution(String[][] tokens) throws IOException { Map input = new HashMap<>(); input.put("tokens", tokens); input.put("language", language); //noinspection unchecked List> clusters = mapper.readValue(post("/coreference_resolution", input), List.class); return _convert_clusters(clusters); } private static List> _convert_clusters(List> clusters) { List> results = new ArrayList<>(clusters.size()); for (List cluster : clusters) { Set spans = new LinkedHashSet<>(); for (List span : cluster) { spans.add(new Span((String) span.get(0), (Integer) span.get(1), (Integer) span.get(2))); } results.add(spans); } return results; } /** * Abstract Meaning Representation (AMR) captures “who is doing what to whom” in a sentence. Each sentence is * represented as a rooted, directed, acyclic graph consisting of nodes (concepts) and edges (relations). * * @param text A piece of text, usually a document without tokenization. * @return AMR graphs. * @throws IOException HTTP errors. */ public MeaningRepresentation[] abstractMeaningRepresentation(String text) throws IOException { Map input = new HashMap<>(); input.put("text", text); input.put("language", language); return mapper.readValue(post("/abstract_meaning_representation", input), MeaningRepresentation[].class); } /** * Abstract Meaning Representation (AMR) captures “who is doing what to whom” in a sentence. Each sentence is * represented as a rooted, directed, acyclic graph consisting of nodes (concepts) and edges (relations). * * @param tokens A list of sentences where each sentence is a list of tokens. * @return AMR graphs. * @throws IOException HTTP errors. 
*/ public MeaningRepresentation[] abstractMeaningRepresentation(String[][] tokens) throws IOException { Map input = new HashMap<>(); input.put("tokens", tokens); input.put("language", language); return mapper.readValue(post("/abstract_meaning_representation", input), MeaningRepresentation[].class); } /** * Keyphrase extraction aims to identify keywords or phrases reflecting the main topics of a document. * * @param text The text content of the document. Preferably the concatenation of the title and the content. * @param topk The number of top-K ranked keywords or keyphrases. * @return A dictionary containing each keyphrase and its ranking score s between 0 and 1. * @throws IOException HTTP errors. */ public Map keyphraseExtraction(String text, int topk) throws IOException { Map input = new HashMap<>(); input.put("text", text); input.put("topk", topk); input.put("language", language); //noinspection unchecked return mapper.readValue(post("/keyphrase_extraction", input), LinkedHashMap.class); } /** * Single document summarization is the task of selecting a subset of the sentences which best * represents a summary of the document, with a balance of salience and redundancy. * * @param text The text content of the document. * @return A dictionary containing each sentence and its ranking score s between 0 and 1. * @throws IOException HTTP errors. */ public Map extractiveSummarization(String text) throws IOException { return extractiveSummarization(text, 3); } /** * Single document summarization is the task of selecting a subset of the sentences which best * represents a summary of the document, with a balance of salience and redundancy. * * @param text The text content of the document. * @param topk The maximum number of top-K ranked sentences. Note that due to Trigram Blocking tricks, the actual * number of returned sentences could be less than ``topk``. * @return A dictionary containing each sentence and its ranking score s between 0 and 1. 
* @throws IOException HTTP errors. */ public Map extractiveSummarization(String text, int topk) throws IOException { Map input = new HashMap<>(); input.put("text", text); input.put("topk", topk); input.put("language", language); //noinspection unchecked return mapper.readValue(post("/extractive_summarization", input), LinkedHashMap.class); } /** * Abstractive Summarization is the task of generating a short and concise summary that captures the * salient ideas of the source text. The generated summaries potentially contain new phrases and sentences that * may not appear in the source text. * * @param text The text content of the document. * @return Summarization. * @throws IOException HTTP errors. */ public String abstractiveSummarization(String text) throws IOException { Map input = new HashMap<>(); input.put("text", text); input.put("language", language); //noinspection unchecked return mapper.readValue(post("/abstractive_summarization", input), String.class); } /** * Text classification is the task of assigning a sentence or document an appropriate category. * The categories depend on the chosen dataset and can range from topics. * * @param text The text content of the document. * @param model The model to use for prediction. * @return Classification results. * @throws IOException HTTP errors. */ public String textClassification(String text, String model) throws IOException { return (String) textClassification(text, model, false, false); } /** * Sentiment analysis is the task of classifying the polarity of a given text. For instance, * a text-based tweet can be categorized into either "positive", "negative", or "neutral". * * @param text The text content of the document. * @return Sentiment polarity as a numerical value which measures how positive the sentiment is. * @throws IOException HTTP errors. 
*/ public Double sentimentAnalysis(String text) throws IOException { Map input = new HashMap<>(); input.put("text", text); input.put("language", language); //noinspection unchecked return mapper.readValue(post("/sentiment_analysis", input), Double.class); } /** * Text classification is the task of assigning a sentence or document an appropriate category. * The categories depend on the chosen dataset and can range from topics. * * @param text A document or a list of documents. * @param model The model to use for prediction. * @param topk `true` or `int` to return the top-k languages. * @param prob Return also probabilities. * @return Classification results. * @throws IOException HTTP errors. */ public Object textClassification(Object text, String model, Object topk, boolean prob) throws IOException { Map input = new HashMap<>(); input.put("text", text); input.put("model", model); input.put("topk", topk); input.put("prob", prob); //noinspection unchecked return mapper.readValue(post("/text_classification", input), Object.class); } /** * Recognize the language of a given text. * * @param text The text content of the document. * @return Identified language in ISO 639-1 codes. * @throws IOException HTTP errors. */ public String languageIdentification(String text) throws IOException { return textClassification(text, "lid"); } /** * Recognize the language of a given text. * * @param text The text content of the document. * @return Identified language in ISO 639-1 codes. * @throws IOException HTTP errors. */ public List languageIdentification(String[] text) throws IOException { return (List) textClassification(text, "lid", false, false); } /** * Keyphrase extraction aims to identify keywords or phrases reflecting the main topics of a document. * * @param text The text content of the document. Preferably the concatenation of the title and the content. * @return A dictionary containing 10 keyphrases and their ranking scores s between 0 and 1. 
* @throws IOException HTTP errors. */ public Map keyphraseExtraction(String text) throws IOException { return keyphraseExtraction(text, 10); } private String post(String api, Object input_) throws IOException { URL url = new URL(this.url + api); HttpURLConnection con = (HttpURLConnection) url.openConnection(); con.setRequestMethod("POST"); if (auth != null) con.setRequestProperty("Authorization", "Basic " + auth); con.setRequestProperty("Content-Type", "application/json; utf-8"); con.setRequestProperty("Accept", "application/json"); con.setDoOutput(true); con.setConnectTimeout(timeout); con.setReadTimeout(timeout); String jsonInputString = mapper.writeValueAsString(input_); try (OutputStream os = con.getOutputStream()) { byte[] input = jsonInputString.getBytes(StandardCharsets.UTF_8); os.write(input, 0, input.length); } int code = con.getResponseCode(); if (code != 200) { StringBuilder response = new StringBuilder(); try (BufferedReader br = new BufferedReader(new InputStreamReader(con.getErrorStream(), StandardCharsets.UTF_8))) { String responseLine; while ((responseLine = br.readLine()) != null) { response.append(responseLine.trim()); } } String error = String.format("Request failed, status code = %d, error = %s", code, con.getResponseMessage()); try { Map detail = mapper.readValue(response.toString(), Map.class); error = (String) detail.get("detail"); } catch (Exception ignored) { } throw new IOException(error); } StringBuilder response = new StringBuilder(); try (BufferedReader br = new BufferedReader(new InputStreamReader(con.getInputStream(), StandardCharsets.UTF_8))) { String responseLine; while ((responseLine = br.readLine()) != null) { response.append(responseLine.trim()); } } return response.toString(); } } ================================================ FILE: plugins/hanlp_restful_java/src/main/java/com/hankcs/hanlp/restful/SentenceInput.java ================================================ /* * Han He * me@hankcs.com * 2020-12-27 12:09 AM * * * 
Copyright (c) 2020, Han He. All Rights Reserved, http://www.hankcs.com/ * See LICENSE file in the project root for full license information. * */ package com.hankcs.hanlp.restful; /** * @author hankcs */ public class SentenceInput extends BaseInput { public String[] text; public SentenceInput(String[] text, String[] tasks, String[] skipTasks, String language) { super(tasks, skipTasks, language); this.text = text; } } ================================================ FILE: plugins/hanlp_restful_java/src/main/java/com/hankcs/hanlp/restful/Span.java ================================================ /* * Han He * me@hankcs.com * 2021-10-16 4:26 PM * * * Copyright (c) 2021, Han He. All Rights Reserved, http://www.hankcs.com/ * See LICENSE file in the project root for full license information. * */ package com.hankcs.hanlp.restful; import java.util.Objects; /** * A common data format to represent a span. * * @author hankcs */ public class Span { /** * The raw form of a span, which can be either a token, an entity or a mention etc. */ public String form; /** * The inclusive beginning offset of a span. */ public int begin; /** * The exclusive ending offset of a span. */ public int end; public Span(String form, int begin, int end) { this.form = form; this.begin = begin; this.end = end; } @Override public boolean equals(Object o) { if (this == o) return true; if (o == null || getClass() != o.getClass()) return false; Span span = (Span) o; return begin == span.begin && end == span.end && form.equals(span.form); } @Override public int hashCode() { return Objects.hash(form, begin, end); } @Override public String toString() { return String.format("[%d, %d) = %s", begin, end, form); } } ================================================ FILE: plugins/hanlp_restful_java/src/main/java/com/hankcs/hanlp/restful/TokenInput.java ================================================ /* * Han He * me@hankcs.com * 2020-12-27 12:09 AM * * * Copyright (c) 2020, Han He. 
All Rights Reserved, http://www.hankcs.com/
 * See LICENSE file in the project root for full license information.
 *
 */
package com.hankcs.hanlp.restful;

/**
 * Request payload for parsing pre-tokenized sentences: adds a {@code tokens} field to the
 * task/language settings forwarded to {@link BaseInput}.
 *
 * @author hankcs
 */
public class TokenInput extends BaseInput {
    /**
     * The pre-tokenized sentences, one inner array of tokens per sentence.
     */
    public String[][] tokens;

    public TokenInput(String[][] tokens, String[] tasks, String[] skipTasks, String language) {
        super(tasks, skipTasks, language);
        this.tokens = tokens;
    }
}

================================================
FILE: plugins/hanlp_restful_java/src/main/java/com/hankcs/hanlp/restful/mrp/Anchor.java
================================================
/*
 * Han He
 * me@hankcs.com
 * 2022-04-13 8:58 AM
 *
 *
 * Copyright (c) 2022, Han He. All Rights Reserved, http://www.hankcs.com/
 * See LICENSE file in the project root for full license information.
 *
 */
package com.hankcs.hanlp.restful.mrp;

/**
 * The {@code from}/{@code to} pair attached to a node of a meaning representation graph.
 * NOTE(review): in the sample payload of MeaningRepresentationTest these look like character offsets into
 * the graph's {@code input} string and arrive as JSON numbers although the fields are Strings - confirm.
 *
 * @author hankcs
 */
public class Anchor {
    public String from;
    public String to;
}

================================================
FILE: plugins/hanlp_restful_java/src/main/java/com/hankcs/hanlp/restful/mrp/Edge.java
================================================
/*
 * Han He
 * me@hankcs.com
 * 2022-04-13 9:01 AM
 *
 *
 * Copyright (c) 2022, Han He. All Rights Reserved, http://www.hankcs.com/
 * See LICENSE file in the project root for full license information.
 *
 */
package com.hankcs.hanlp.restful.mrp;

/**
 * A labeled, directed edge of a meaning representation graph.
 * NOTE(review): {@code source} and {@code target} presumably hold node ids, as in the sample payload of
 * MeaningRepresentationTest - confirm.
 *
 * @author hankcs
 */
public class Edge {
    public int source;
    public int target;
    public String label;
}

================================================
FILE: plugins/hanlp_restful_java/src/main/java/com/hankcs/hanlp/restful/mrp/MeaningRepresentation.java
================================================
/*
 * Han He
 * me@hankcs.com
 * 2022-04-13 8:57 AM
 *
 *
 * Copyright (c) 2022, Han He. All Rights Reserved, http://www.hankcs.com/
 * See LICENSE file in the project root for full license information.
 *
 */
package com.hankcs.hanlp.restful.mrp;

/**
 * Graph-based meaning representation.
* * @author hankcs */ public class MeaningRepresentation { public String id; public String input; public Node[] nodes; public Edge[] edges; public String[] tops; public String framework; } ================================================ FILE: plugins/hanlp_restful_java/src/main/java/com/hankcs/hanlp/restful/mrp/Node.java ================================================ /* * Han He * me@hankcs.com * 2022-04-13 8:57 AM * * * Copyright (c) 2022, Han He. All Rights Reserved, http://www.hankcs.com/ * See LICENSE file in the project root for full license information. * */ package com.hankcs.hanlp.restful.mrp; /** * @author hankcs */ public class Node { public int id; public String label; public String[] properties; public String[] values; public Anchor[] anchors; } ================================================ FILE: plugins/hanlp_restful_java/src/test/java/com/hankcs/hanlp/restful/HanLPClientTest.java ================================================ package com.hankcs.hanlp.restful; import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import java.io.IOException; import java.util.List; import java.util.Map; import java.util.Set; class HanLPClientTest { HanLPClient client; @BeforeEach void setUp() { client = new HanLPClient("https://hanlp.hankcs.com/api", null); } @org.junit.jupiter.api.Test void parseText() throws IOException { Map doc = client.parse("2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。英首相与特朗普通电话讨论华为与苹果公司。"); prettyPrint(doc); } @org.junit.jupiter.api.Test void parseSentences() throws IOException { Map doc = client.parse(new String[]{ "2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。", "英首相与特朗普通电话讨论华为与苹果公司。" }); prettyPrint(doc); } @org.junit.jupiter.api.Test void parseTokens() throws IOException { Map doc = client.parse(new String[][]{ new String[]{"2021年", "HanLPv2.1", "为", "生产", "环境", "带来", "次", "世代", "最", "先进", "的", "多语种", "NLP", "技术", "。"}, 
new String[]{"英", "首相", "与", "特朗普", "通", "电话", "讨论", "华为", "与", "苹果", "公司", "。"}, }); prettyPrint(doc); } @Test void parseCoarse() throws IOException { Map doc = client.parse( "阿婆主来到北京立方庭参观自然语义科技公司。", new String[]{"tok/coarse", "pos", "dep"}, new String[]{"tok/fine"}); prettyPrint(doc); } @Test void tokenize() throws IOException { List> fine = client.tokenize("2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。阿婆主来到北京立方庭参观自然语义科技公司。"); System.out.println(fine); List> coarse = client.tokenize("2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。阿婆主来到北京立方庭参观自然语义科技公司。", true); System.out.println(coarse); } @Test void textStyleTransfer() throws IOException { String doc = client.textStyleTransfer("国家对中石油抱有很大的期望.", "gov_doc"); prettyPrint(doc); } @Test void semanticTextualSimilarity() throws IOException { Double similarity = client.semanticTextualSimilarity("看图猜一电影名", "看图猜电影"); prettyPrint(similarity); List similarities = client.semanticTextualSimilarity(new String[][]{ new String[]{"看图猜一电影名", "看图猜电影"}, new String[]{"北京到上海的动车票", "上海到北京的动车票"} }); for (Double similarityPerPair : similarities) { prettyPrint(similarityPerPair); } } @Test void coreferenceResolutionText() throws IOException { CoreferenceResolutionOutput clusters = client.coreferenceResolution("我姐送我她的猫。我很喜欢它。"); prettyPrint(clusters); } @Test void coreferenceResolutionTokens() throws IOException { List> clusters = client.coreferenceResolution( new String[][]{ new String[]{"我", "姐", "送", "我", "她", "的", "猫", "。"}, new String[]{"我", "很", "喜欢", "它", "。"}}); prettyPrint(clusters); } @Test void coreferenceResolutionTokensWithSpeakers() throws IOException { List> clusters = client.coreferenceResolution( new String[][]{ new String[]{"我", "姐", "送", "我", "她", "的", "猫", "。"}, new String[]{"我", "很", "喜欢", "它", "。"}}, new String[]{"张三", "张三"}); prettyPrint(clusters); } @Test void keyphraseExtraction() throws IOException { prettyPrint(client.keyphraseExtraction( "自然语言处理是一门博大精深的学科,掌握理论才能发挥出HanLP的全部性能。" + "《自然语言处理入门》是一本配套HanLP的NLP入门书,助你零起点上手自然语言处理。", 3)); 
} @Test void extractiveSummarization() throws IOException { prettyPrint(client.extractiveSummarization( "据DigiTimes报道,在上海疫情趋缓,防疫管控开始放松后,苹果供应商广达正在逐步恢复其中国工厂的MacBook产品生产。\n" + "据供应链消息人士称,生产厂的订单拉动情况正在慢慢转强,这会提高MacBook Pro机型的供应量,并缩短苹果客户在过去几周所经历的延长交货时间。\n" + "仍有许多苹果笔记本用户在等待3月和4月订购的MacBook Pro机型到货,由于苹果的供应问题,他们的发货时间被大大推迟了。\n" + "据分析师郭明錤表示,广达是高端MacBook Pro的唯一供应商,自防疫封控依赖,MacBook Pro大部分型号交货时间增加了三到五周,\n" + "一些高端定制型号的MacBook Pro配置要到6月底到7月初才能交货。\n" + "尽管MacBook Pro的生产逐渐恢复,但供应问题预计依然影响2022年第三季度的产品销售。\n" + "苹果上周表示,防疫措施和元部件短缺将继续使其难以生产足够的产品来满足消费者的强劲需求,这最终将影响苹果6月份的收入。")); } @Test void abstractiveSummarization() throws IOException { prettyPrint(client.abstractiveSummarization( "每经AI快讯,2月4日,长江证券研究所金属行业首席分析师王鹤涛表示,2023年海外经济衰退,美债现处于历史高位,\n" + "黄金的趋势是值得关注的;在国内需求修复的过程中,看好大金属品种中的铜铝钢。\n" + "此外,在细分的小品种里,建议关注两条主线,一是新能源,比如锂、钴、镍、稀土,二是专精特新主线。(央视财经)")); } @Test void abstractMeaningRepresentationText() throws IOException { prettyPrint(client.abstractMeaningRepresentation("男孩希望女孩相信他。阿婆主来到北京立方庭参观自然语义科技公司。")); } @Test void abstractMeaningRepresentationTokens() throws IOException { prettyPrint(client.abstractMeaningRepresentation(new String[][]{ new String[]{"2021年", "HanLPv2.1", "为", "生产", "环境", "带来", "次", "世代", "最", "先进", "的", "多语种", "NLP", "技术", "。"}, new String[]{"英", "首相", "与", "特朗普", "通", "电话", "讨论", "华为", "与", "苹果", "公司", "。"}})); } @Test void grammaticalErrorCorrection() throws IOException { prettyPrint(client.grammaticalErrorCorrection(new String[]{"每个青年都应当有远大的报复。", "有的同学对语言很兴趣。"})); } @Test void languageIdentification() throws IOException { prettyPrint(client.languageIdentification(new String[]{ "In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environment.", "2021年、HanLPv2.1は次世代の最先端多言語NLP技術を本番環境に導入します。", "2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。", })); } @Test void sentimentAnalysis() throws IOException { prettyPrint(client.sentimentAnalysis( "“这是一部男人必看的电影。”人人都这么说。但单纯从性别区分,就会让这电影变狭隘。《肖申克的救赎》突破了男人电影的局限,通篇几乎充满令人难以置信的温馨基调,而电影里最伟大的主题是“希望”。 
当我们无奈地遇到了如同肖申克一般囚禁了心灵自由的那种囹圄,我们是无奈的老布鲁克,灰心的瑞德,还是智慧的安迪?运用智慧,信任希望,并且勇敢面对恐惧心理,去打败它? 经典的电影之所以经典,因为他们都在做同一件事——让你从不同的角度来欣赏希望的美好。" )); } void prettyPrint(Object object) throws JsonProcessingException { ObjectMapper mapper = new ObjectMapper(); System.out.println(mapper.writerWithDefaultPrettyPrinter().writeValueAsString(object)); } } ================================================ FILE: plugins/hanlp_restful_java/src/test/java/com/hankcs/hanlp/restful/MeaningRepresentationTest.java ================================================ package com.hankcs.hanlp.restful; import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; import com.hankcs.hanlp.restful.mrp.MeaningRepresentation; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import java.io.IOException; import java.util.List; import java.util.Map; import java.util.Set; class MeaningRepresentationTest { @Test void parseText() throws IOException { String json = "[{\"id\": \"0\", \"input\": \"北京 大学 计算 语言学 研究所 和 富士通 研究 开发 中心 有限公司 , 得到 了 人民日报社 新闻 信息 中心 的 语料库 。\", \"nodes\": [{\"id\": 0, \"label\": \"name\", \"properties\": [\"op1\", \"op2\"], \"values\": [\"北京\", \"大学\"], \"anchors\": [{\"from\": 0, \"to\": 2}, {\"from\": 3, \"to\": 5}]}, {\"id\": 1, \"label\": \"university\", \"anchors\": []}, {\"id\": 2, \"label\": \"name\", \"properties\": [\"op1\", \"op2\", \"op4\"], \"values\": [\"计算\", \"语言学\", \"\"], \"anchors\": [{\"from\": 6, \"to\": 8}, {\"from\": 9, \"to\": 12}, {\"from\": 13, \"to\": 16}]}, {\"id\": 3, \"label\": \"research-institute\", \"anchors\": []}, {\"id\": 4, \"label\": \"and\", \"anchors\": []}, {\"id\": 5, \"label\": \"name\", \"properties\": [\"op1\", \"op2\", \"op3\", \"op4\", \"op5\"], \"values\": [\"富士通\", \"研究\", \"开发\", \"中心\", \"有限公司\"], \"anchors\": [{\"from\": 19, \"to\": 22}, {\"from\": 23, \"to\": 25}, {\"from\": 26, \"to\": 28}, {\"from\": 29, \"to\": 31}, {\"from\": 32, \"to\": 36}]}, {\"id\": 6, \"label\": 
\"company\", \"anchors\": []}, {\"id\": 7, \"label\": \"得到-01\", \"anchors\": [{\"from\": 39, \"to\": 41}]}, {\"id\": 8, \"label\": \"了\", \"anchors\": [{\"from\": 42, \"to\": 43}]}, {\"id\": 9, \"label\": \"name\", \"properties\": [\"op1\"], \"values\": [\"人民日报社\"], \"anchors\": [{\"from\": 44, \"to\": 49}]}, {\"id\": 10, \"label\": \"organization\", \"anchors\": []}, {\"id\": 11, \"label\": \"name\", \"properties\": [\"op1\", \"op2\", \"op3\"], \"values\": [\"新闻\", \"信息\", \"中心\"], \"anchors\": [{\"from\": 50, \"to\": 52}, {\"from\": 53, \"to\": 55}, {\"from\": 56, \"to\": 58}]}, {\"id\": 12, \"label\": \"organization\", \"anchors\": []}, {\"id\": 13, \"label\": \"语料库\", \"anchors\": [{\"from\": 61, \"to\": 64}]}], \"edges\": [{\"source\": 7, \"target\": 8, \"label\": \"aspect\"}, {\"source\": 7, \"target\": 4, \"label\": \"arg0\"}, {\"source\": 10, \"target\": 9, \"label\": \"name\"}, {\"source\": 4, \"target\": 6, \"label\": \"op2\"}, {\"source\": 7, \"target\": 13, \"label\": \"arg1\"}, {\"source\": 6, \"target\": 5, \"label\": \"name\"}, {\"source\": 12, \"target\": 11, \"label\": \"name\"}, {\"source\": 3, \"target\": 2, \"label\": \"name\"}, {\"source\": 1, \"target\": 0, \"label\": \"name\"}, {\"source\": 13, \"target\": 12, \"label\": \"poss\"}, {\"source\": 4, \"target\": 3, \"label\": \"op1\"}, {\"source\": 12, \"target\": 9, \"label\": \"name\"}, {\"source\": 1, \"target\": 3, \"label\": \"part\"}], \"tops\": [7], \"framework\": \"amr\"}]"; ObjectMapper mapper = new ObjectMapper(); MeaningRepresentation[] graphs = mapper.readValue(json, MeaningRepresentation[].class); prettyPrint(graphs); } void prettyPrint(Object object) throws JsonProcessingException { ObjectMapper mapper = new ObjectMapper(); System.out.println(mapper.writerWithDefaultPrettyPrinter().writeValueAsString(object)); } } ================================================ FILE: plugins/hanlp_trie/README.md ================================================ # Trie interface and implementation 
for HanLP

[中文](https://github.com/hankcs/HanLP/tree/doc-zh) | [1.x](https://github.com/hankcs/HanLP/tree/1.x) | [forum](https://bbs.hankcs.com/) | [docker](https://github.com/WalterInSH/hanlp-jupyter-docker)

The multilingual NLP library for researchers and companies, built on PyTorch and TensorFlow 2.x, for advancing state-of-the-art deep learning techniques in both academia and industry. HanLP was designed from day one to be efficient, user-friendly and extendable. It comes with pretrained models for various human languages including English, Chinese and many others. Currently, HanLP 2.0 is in alpha stage with more killer features on the roadmap. Discussions are welcome on our [forum](https://bbs.hankcs.com/), while bug reports and feature requests are reserved for GitHub issues. For Java users, please check out the [1.x](https://github.com/hankcs/HanLP/tree/1.x) branch.

## Installation

```bash
pip install hanlp
```

## License

HanLP is licensed under **Apache License 2.0**. You can use HanLP in your commercial products for free. We would appreciate it if you add a link to HanLP on your website.
class DictInterface(ABC):
    """Interface of a dictionary which can locate its keys inside a piece of text."""

    @abstractmethod
    def tokenize(self, text: Union[str, Sequence[str]]) -> List[Tuple[int, int, Any]]:
        """Implement this method to tokenize a piece of text into a list of non-intersect spans, each span is a tuple
        of ``(begin_offset, end_offset, label)``, where label is some properties related to this span and downstream
        tasks have the freedom to define what kind of labels they want.

        Args:
            text: The text to be tokenized.

        Returns:
              A list of tokens.

        """
        pass

    def split(self, text: Union[str, Sequence[str]]) -> List[Union[str, Sequence[str], Tuple[int, int, Any]]]:
        """Like the :meth:`str.split`, this method splits a piece of text into chunks by taking the keys in this
        dictionary as delimiters. The spans produced by :meth:`tokenize` are used as split points. Unlike
        the :meth:`str.split`, it inserts matched keys into the results list right after where they are found, so
        the results cover the whole text in its original order.

        Args:
            text: A piece of text.

        Returns:
            A list mixing two kinds of chunks: every matched key is a span of ``(begin_offset, end_offset, label)``,
            while every stretch of text between matches is the raw slice ``text[begin:end]`` itself.

        """
        offset = 0
        spans = []
        for begin, end, label in self.tokenize(text):
            # Emit the unmatched text lying between the previous match and this one.
            if begin > offset:
                spans.append(text[offset:begin])
            spans.append((begin, end, label))
            offset = end
        # Trailing text after the last match (or the whole text if nothing matched).
        if offset < len(text):
            spans.append(text[offset:])
        return spans


class TrieDict(Trie, DictInterface, Configurable):
    def __init__(self, dictionary: Optional[Union[Dict[Iterable[str], Any], Iterable[str]]] = None) -> None:
        r"""
        A dict-like structure for fast custom dictionary strategies in tokenization and tagging. It is built with
        a dict of key-value pairs or a set of strings. When a set is passed in, it will be turned into a dict where each
        key is assigned with a boolean value ``True``.

        Args:
            dictionary: A custom dictionary of string-value pairs.
        """
        super().__init__(dictionary)

    def tokenize(self, text: Union[str, Sequence[str]]) -> List[Tuple[int, int, Any]]:
        """Tokenize ``text`` by longest-prefix-matching against the keys of this dictionary.

        Args:
            text: The text to be tokenized.

        Returns:
            The result of :meth:`parse_longest`, i.e. a list of ``(begin_offset, end_offset, value)``.
        """
        return self.parse_longest(text)

    def split_batch(self, data: List[str]) -> Tuple[List[str], List[int], List[List[Tuple[int, int, Any]]]]:
        """ A handy method to perform longest-prefix-matching on a batch of sentences. It tokenize each sentence, record
        the chunks being either a key in the dict or a span outside of the dict. The spans are then packed into a new
        batch and returned along with the following information:

            - which sentence a span belongs to
            - the matched keys along with their spans and values.

        This method bridges the gap between statistical models and rule-based gazetteers.
        It's used in conjunction with :meth:`~hanlp_trie.dictionary.TrieDict.merge_batch`.

        Args:
            data: A batch of sentences.

        Returns:
            A tuple of the new batch, the belonging information and the keys.
        """
        new_data, new_data_belongs, parts = [], [], []
        for idx, sent in enumerate(data):
            parts.append([])
            found = self.tokenize(sent)
            if found:
                pre_start = 0
                for start, end, info in found:
                    # Only the text outside of matched keys goes into the new batch.
                    if start > pre_start:
                        new_data.append(sent[pre_start:start])
                        new_data_belongs.append(idx)
                    pre_start = end
                    parts[idx].append((start, end, info))
                if pre_start != len(sent):
                    new_data.append(sent[pre_start:])
                    new_data_belongs.append(idx)
            else:
                # No key found: the sentence is passed through untouched.
                new_data.append(sent)
                new_data_belongs.append(idx)
        return new_data, new_data_belongs, parts

    @staticmethod
    def merge_batch(data, new_outputs, new_data_belongs, parts):
        """ A helper method to merge the outputs of split batch back by concatenating the output per span with the key
        used to split it. It's used in conjunction with :meth:`~hanlp_trie.dictionary.TrieDict.split_batch`.

        For every matched key, what gets inserted depends on its value: a ``list`` value is spliced in, a ``str``
        value is appended as one token, and any other value makes the matched slice of the sentence the token.

        Args:
            data: Split batch.
            new_outputs: Outputs of the split batch.
            new_data_belongs: Belonging information.
            parts: The keys.

        Returns:
            Merged outputs.
        """
        # Group the per-span outputs by the sentence they came from.
        segments = [[] for _ in range(len(data))]
        for output, owner in zip(new_outputs, new_data_belongs):
            segments[owner].append(output)

        outputs = []
        for chunks, keys, sent in zip(segments, parts, data):
            if not keys:
                # The sentence was never split, so it has exactly one output.
                outputs.append(chunks[0])
                continue
            merged = []
            cursor = 0  # index of the next chunk which has not been merged yet
            offset = 0  # how far into the sentence the merged tokens reach
            for start, end, info in keys:
                # Consume model outputs until we arrive at the beginning of this key.
                while offset < start:
                    head = chunks[cursor]
                    cursor += 1
                    offset += sum(len(token) for token in head)
                    merged += head
                if isinstance(info, list):
                    merged += info
                elif isinstance(info, str):
                    merged.append(info)
                else:
                    merged.append(sent[start:end])
                offset = end
            remaining = chunks[cursor:]
            if remaining:
                # At most one span can follow the last key.
                assert len(remaining) == 1
                merged += remaining[0]
            outputs.append(merged)
        return outputs

    @property
    def config(self):
        """A dict holding the classpath of this object and its key-value pairs."""
        return {
            'classpath': classpath_of(self),
            'dictionary': dict(self.items())
        }
class Node(object):
    def __init__(self, value=None) -> None:
        """A node in a trie tree.

        Args:
            value: The value associated with this node. ``None`` means that no key ends at this node.
        """
        self._children = {}
        self._value = value

    def _get_or_add_child(self, char):
        """Return the child reached by ``char``, creating an empty one when it does not exist yet."""
        child = self._children.get(char)
        if child is None:
            child = Node(None)
            self._children[char] = child
        return child

    def transit(self, key):
        """Transit the state of a Deterministic Finite Automata (DFA) with key.

        Args:
            key: A sequence of criterion (tokens or characters) used to transit to a new state.

        Returns:
            A new state if the transition succeeded, otherwise ``None``.

        """
        state = self
        for char in key:
            state = state._children.get(char)
            if state is None:
                break
        return state

    def _walk(self, prefix: Union[str, tuple], ordered=False):
        """Yield ``(key, value)`` for every key stored below this node in depth-first order.

        Args:
            prefix: The key fragment leading to this node. A ``str`` prefix is extended by concatenation while a
                ``tuple`` prefix is extended by one element per level.
            ordered: ``True`` to visit children sorted by their transition criterion on every level.
        """
        children = self._children.items()
        if ordered:
            # Sort by criterion only, so nodes themselves never get compared.
            children = sorted(children, key=lambda pair: pair[0])
        for char, child in children:
            prefix_new = prefix + (char if isinstance(prefix, str) else (char,))
            # ``None`` is the only marker of "no value"; falsy values such as 0 or '' are legitimate.
            if child._value is not None:
                yield prefix_new, child._value
            yield from child._walk(prefix_new, ordered)


class Trie(Node):
    def __init__(self, tokens: Optional[Union[Dict[str, Any], Iterable[str]]] = None) -> None:
        """A referential implementation of the trie (:cite:`10.1145/1457838.1457895`) structure. It stores a dict by
        assigning each key/value pair a :class:`~hanlp_trie.trie.Node` in a trie tree. It provides get/set/del/items
        methods just like a :class:`dict` does. Additionally, it also provides longest-prefix-matching and keywords
        lookup against a piece of text, which are very helpful in rule-based Natural Language Processing.

        Args:
            tokens: A set of keys or a dict mapping. Keys from a non-dict iterable are given the value ``True``.
        """
        super().__init__()
        self._size = 0
        if tokens:
            if isinstance(tokens, dict):
                for k, v in tokens.items():
                    self[k] = v
            else:
                for k in tokens:
                    self[k] = True

    def __contains__(self, key):
        return self[key] is not None

    def __getitem__(self, key):
        """Return the value of ``key``, or ``None`` when the key is absent (no ``KeyError`` is raised)."""
        state = self.transit(key)
        if state is None:
            return None
        return state._value

    def __setitem__(self, key, value):
        """Associate ``value`` with ``key``. An empty key is not supported and raises ``IndexError``."""
        state = self
        for char in key[:-1]:
            state = state._get_or_add_child(char)
        leaf = state._get_or_add_child(key[-1])
        # Keep the size in sync with the number of nodes holding a value. Since ``None`` stands for
        # "no value", assigning ``None`` removes an entry instead of adding one.
        if leaf._value is None and value is not None:
            self._size += 1
        elif leaf._value is not None and value is None:
            self._size -= 1
        leaf._value = value

    def __delitem__(self, key):
        """Remove ``key`` if it is present. Deleting an absent key is silently ignored."""
        state = self.transit(key)
        # A reachable node may be a mere prefix of longer keys, which must not affect the size.
        if state is not None and state._value is not None:
            state._value = None
            self._size -= 1

    def update(self, dic: Dict[str, Any]):
        """Insert every key-value pair of ``dic`` and return this trie."""
        for k, v in dic.items():
            self[k] = v
        return self

    def parse(self, text: Sequence[str]) -> List[Tuple[int, int, Any]]:
        """Keywords lookup which takes a piece of text as input, and lookup all occurrences of keywords in it. These
        occurrences can overlap with each other.

        Args:
            text: A piece of text. In HanLP's design, it doesn't really matter whether this is a str or a list of str.
                The trie will transit on either types properly, which means a list of str simply defines a list of
                transition criteria while a str defines each criterion as a character.

        Returns:
            A tuple of ``(begin, end, value)``.
        """
        found = []
        for i in range(len(text)):
            state = self
            for j in range(i, len(text)):
                state = state.transit(text[j])
                if state is None:
                    break
                if state._value is not None:
                    found.append((i, j + 1, state._value))
        return found

    def parse_longest(self, text: Sequence[str]) -> List[Tuple[int, int, Any]]:
        """Longest-prefix-matching which tries to match the longest keyword sequentially from the head of the text till
        its tail. By definition, the matches won't overlap with each other.

        Args:
            text: A piece of text. In HanLP's design, it doesn't really matter whether this is a str or a list of str.
                The trie will transit on either types properly, which means a list of str simply defines a list of
                transition criteria while a str defines each criterion as a character.

        Returns:
            A tuple of ``(begin, end, value)``.

        """
        found = []
        i = 0
        while i < len(text):
            state = self.transit(text[i])
            if state is not None:
                end = i + 1
                value = state._value
                # Keep walking and remember the last position where a key ended.
                for to in range(i + 1, len(text)):
                    state = state.transit(text[to])
                    if state is None:
                        break
                    if state._value is not None:
                        value = state._value
                        end = to + 1
                if value is not None:
                    found.append((i, end, value))
                    # Resume right after the match; the increment below completes the jump.
                    i = end - 1
            i += 1
        return found

    def items(self, ordered=False, prefix=''):
        """Iterate over the key-value pairs.

        Args:
            ordered: ``True`` to yield keys in sorted order of their criteria.
            prefix: The empty key to build keys upon, ``''`` for str keys and ``()`` for tuple keys.
        """
        yield from self._walk(prefix, ordered)

    def __len__(self):
        return self._size

    def __bool__(self):
        return bool(len(self))
class TestTrie(unittest.TestCase):
    """Checks keyword lookup, longest matching, iteration and size bookkeeping of ``Trie``."""

    def build_small_trie(self):
        """Create the five-entry trie shared by all tests."""
        entries = {'商品': 'goods', '和': 'and', '和服': 'kimono', '服务': 'service', '务': 'business'}
        return Trie(entries)

    def assert_results_valid(self, text, results, trie):
        """Every reported span must map back to the value stored for that slice of ``text``."""
        for span in results:
            begin, end, value = span
            key = text[begin:end]
            self.assertEqual(value, trie[key])

    def test_parse(self):
        text = '商品和服务'
        trie = self.build_small_trie()
        expected = [
            (0, 2, 'goods'),
            (2, 3, 'and'),
            (2, 4, 'kimono'),
            (3, 5, 'service'),
            (4, 5, 'business'),
        ]
        parse_result = trie.parse(text)
        self.assert_results_valid(text, parse_result, trie)
        self.assertEqual(expected, parse_result)

    def test_parse_longest(self):
        text = '商品和服务'
        trie = self.build_small_trie()
        expected = [(0, 2, 'goods'), (2, 4, 'kimono'), (4, 5, 'business')]
        parse_longest_result = trie.parse_longest(text)
        self.assert_results_valid(text, parse_longest_result, trie)
        self.assertEqual(expected, parse_longest_result)

    def test_items(self):
        expected = [
            ('商品', 'goods'),
            ('和', 'and'),
            ('和服', 'kimono'),
            ('服务', 'service'),
            ('务', 'business'),
        ]
        items = list(self.build_small_trie().items())
        self.assertEqual(expected, items)

    def test_len(self):
        trie = self.build_small_trie()
        self.assertEqual(len(trie), 5)
        # Overwriting an existing key must not change the size.
        trie['和'] = '&'
        self.assertEqual(len(trie), 5)
        # Removing it shrinks the trie, re-inserting it grows it again.
        del trie['和']
        self.assertEqual(len(trie), 4)
        trie['和'] = '&'
        self.assertEqual(len(trie), 5)
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 19:26
import sys
from os.path import abspath, join, dirname

from setuptools import find_packages, setup

this_dir = abspath(dirname(__file__))
with open(join(this_dir, 'README.md'), encoding='utf-8') as file:
    long_description = file.read()

# Execute hanlp/version.py into a scratch namespace to obtain ``__version__``.
# The encoding is given explicitly so the read does not depend on the locale of the build machine.
version = {}
with open(join(this_dir, "hanlp", "version.py"), encoding='utf-8') as fp:
    exec(fp.read(), version)

FASTTEXT = 'fasttext-wheel==0.9.2'
sys_version_info = sys.version_info
# tokenizers is pinned only on Python 3.6 for macOS and Windows.
# NOTE(review): presumably no newer compatible wheel exists for that combination; confirm before changing.
TOKENIZERS = []
if (sys_version_info.major, sys_version_info.minor) == (3, 6) and sys.platform in {'darwin', 'win32'}:
    TOKENIZERS = ['tokenizers==0.10.3']

extras_require = {
    'amr': [
        'penman==1.2.1',
        'networkx>=2.5.1',
        'perin-parser>=0.0.12',
    ],
    'fasttext': [FASTTEXT],
    'tf': [FASTTEXT, 'tensorflow>=2.6.0,<2.14']
}
# 'full' is the de-duplicated union of all extras, sorted so the generated metadata is reproducible.
extras_require['full'] = sorted(set(sum(extras_require.values(), [])))

setup(
    name='hanlp',
    version=version['__version__'],
    description='HanLP: Han Language Processing',
    long_description=long_description,
    long_description_content_type="text/markdown",
    url='https://github.com/hankcs/HanLP',
    author='hankcs',
    author_email='hankcshe@gmail.com',
    license='Apache License 2.0',
    classifiers=[
        'Intended Audience :: Science/Research',
        'Intended Audience :: Developers',
        "Development Status :: 4 - Beta",
        'Operating System :: OS Independent',
        "License :: OSI Approved :: Apache Software License",
        'Programming Language :: Python :: 3.6',
        'Programming Language :: Python :: 3.7',
        'Programming Language :: Python :: 3.8',
        'Programming Language :: Python :: 3.9',
        'Programming Language :: Python :: 3.10',
        'Topic :: Scientific/Engineering :: Artificial Intelligence',
        "Topic :: Text Processing :: Linguistic"
    ],
    keywords='corpus,machine-learning,NLU,NLP',
    packages=find_packages(exclude=['docs', 'tests*']),
    include_package_data=True,
    install_requires=[
        'termcolor',
        'pynvml',
        'toposort==1.5',
        'transformers>=4.1.1',
        'sentencepiece>=0.1.91',  # Essential for tokenization_bert_japanese
        'torch>=1.6.0',
        'hanlp-common>=0.0.22',
        'hanlp-trie>=0.0.4',
        'hanlp-downloader',
        *TOKENIZERS,
    ],
    extras_require=extras_require,
    python_requires='>=3.6',
)
def tokenize(mtl, text):
    """Call ``mtl`` on ``text`` restricted to the ``'tok/fine'`` task and return that task's output.

    Args:
        mtl: A callable accepting ``(text, tasks=...)`` and returning a mapping indexed by task name.
        text: The input forwarded to ``mtl`` unchanged.

    Returns:
        The entry stored under ``'tok/fine'`` in the result of ``mtl``.
    """
    task = 'tok/fine'
    doc = mtl(text, tasks=task)
    return doc[task]
class TestPipeLine(unittest.TestCase):
    """Checks that a copied pipeline is independent of the pipeline it was copied from."""

    def test_copy(self):
        pipe = hanlp.pipeline().append(hanlp.utils.rules.split_sentence)
        copied_pipe = pipe.copy()
        test_text = "今天天气真好。我要去散步。"
        # unittest assertions are used instead of bare ``assert`` so the checks survive ``python -O``
        # and report both operands on failure.
        self.assertIsNot(pipe, copied_pipe)
        # Extending the copy must leave the original untouched, hence the two outputs have to differ.
        copied_pipe.append(lambda sent: "".join(sent))
        self.assertNotEqual(pipe(test_text), copied_pipe(test_text))
class TestStringUtility(unittest.TestCase):
    """Checks the enumeration of all possible tokenizations of a text."""

    def test_enumerate_tokenization(self):
        text = '商品和服务'
        # The result is consumed twice below, so it is materialized once up front.
        # NOTE(review): possible_tokenization may already return a list; the copy is harmless then.
        toks = list(possible_tokenization(text))
        # Each of the len(text) - 1 gaps between characters is either a boundary or not.
        self.assertEqual(len(set(toks)), 2 ** (len(text) - 1))
        # Every tokenization has to spell the original text when joined back.
        for each in toks:
            self.assertEqual(''.join(each), text)