Repository: ncbi-nlp/BLUE_Benchmark Branch: master Commit: c497c4196cd5 Files: 40 Total size: 104.1 KB Directory structure: gitextract_5xef4m7i/ ├── .gitignore ├── LICENSE.txt ├── README.md ├── blue/ │ ├── __init__.py │ ├── bert/ │ │ ├── __init__.py │ │ ├── create_cdr_bert.py │ │ ├── create_chemprot_bert.py │ │ ├── create_clefe_bert.py │ │ ├── create_ddi_bert.py │ │ ├── create_i2b2_bert.py │ │ └── create_mednli_bert.py │ ├── create_bert.sh │ ├── create_gs.sh │ ├── eval_hoc.py │ ├── eval_mednli.py │ ├── eval_ner.py │ ├── eval_rel.py │ ├── eval_sts.py │ ├── ext/ │ │ ├── __init__.py │ │ ├── data_structure.py │ │ ├── pmetrics.py │ │ ├── preprocessing.py │ │ ├── pstring.py │ │ └── pubtator.py │ └── gs/ │ ├── __init__.py │ ├── create_cdr_test_gs.py │ ├── create_chemprot_test_gs.py │ ├── create_clefe_test_gs.py │ ├── create_ddi_test_gs.py │ ├── create_hoc.py │ ├── create_i2b2_test_gs.py │ └── create_mednli_test_gs.py ├── blue_plus/ │ ├── README.md │ ├── __init__.py │ ├── dataset.py │ └── example_dataset/ │ ├── __init__.py │ ├── biosses.yml │ ├── biosses_dataset.py │ └── requirements.txt └── requirements.txt ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ data .idea data.zip bert_data blue_plus_data ### JetBrains template # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 # User-specific stuff .idea/**/workspace.xml .idea/**/tasks.xml .idea/**/dictionaries .idea/**/shelf # Sensitive or high-churn files .idea/**/dataSources/ .idea/**/dataSources.ids .idea/**/dataSources.local.xml .idea/**/sqlDataSources.xml .idea/**/dynamic.xml .idea/**/uiDesigner.xml # Gradle .idea/**/gradle.xml .idea/**/libraries # CMake cmake-build-debug/ cmake-build-release/ # Mongo Explorer 
plugin .idea/**/mongoSettings.xml # File-based project format *.iws # IntelliJ out/ # mpeltonen/sbt-idea plugin .idea_modules/ # JIRA plugin atlassian-ide-plugin.xml # Cursive Clojure plugin .idea/replstate.xml # Crashlytics plugin (for Android Studio and IntelliJ) com_crashlytics_export_strings.xml crashlytics.properties crashlytics-build.properties fabric.properties # Editor-based Rest Client .idea/httpRequests ### Python template # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover .hypothesis/ .pytest_cache/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # pyenv .python-version # celery beat schedule file celerybeat-schedule # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ ================================================ FILE: LICENSE.txt ================================================ PUBLIC DOMAIN NOTICE National Center for Biotechnology Information This software/database is a "United States Government Work" under the terms of the United States Copyright Act. 
It was written as part of the author's official duties as a United States Government employee and thus cannot be copyrighted. This software/database is freely available to the public for use. The National Library of Medicine and the U.S. Government have not placed any restriction on its use or reproduction. Although all reasonable efforts have been taken to ensure the accuracy and reliability of the software and data, the NLM and the U.S. Government do not and cannot warrant the performance or results that may be obtained by using this software or data. The NLM and the U.S. Government disclaim all warranties, express or implied, including warranties of performance, merchantability or fitness for any particular purpose. Please cite the author in any work or product based on this material: Peng Y, Yan S, Lu Z. Transfer Learning in Biomedical Natural Language Processing: An Evaluation of BERT and ELMo on Ten Benchmarking Datasets. In Proceedings of the 2019 Workshop on Biomedical Natural Language Processing (BioNLP 2019). 2019:58-65. ================================================ FILE: README.md ================================================ # BLUE, the Biomedical Language Understanding Evaluation benchmark **\*\*\*\*\* New Aug 13th, 2019: Change DDI metric from micro-F1 to macro-F1 \*\*\*\*\*** **\*\*\*\*\* New July 11th, 2019: preprocessed PubMed texts \*\*\*\*\*** We uploaded the [preprocessed PubMed texts](https://github.com/ncbi-nlp/ncbi_bluebert/blob/master/README.md#pubmed) that were used to pre-train the NCBI_BERT models. **\*\*\*\*\* New June 17th, 2019: data in BERT format \*\*\*\*\*** We uploaded some [datasets](https://github.com/ncbi-nlp/BLUE_Benchmark/releases/tag/0.1) that are ready to be used with the [NCBI BlueBERT codes](https://github.com/ncbi-nlp/ncbi_bluebert). ## Introduction BLUE benchmark consists of five different biomedicine text-mining tasks with ten corpora. 
Here, we rely on preexisting datasets because they have been widely used by the BioNLP community as shared tasks. These tasks cover a diverse range of text genres (biomedical literature and clinical notes), dataset sizes, and degrees of difficulty and, more importantly, highlight common biomedical text-mining challenges. ## Tasks | Corpus | Train | Dev | Test | Task | Metrics | Domain | |-----------------|------:|-----:|-----:|-------------------------|---------------------|------------| | MedSTS | 675 | 75 | 318 | Sentence similarity | Pearson | Clinical | | BIOSSES | 64 | 16 | 20 | Sentence similarity | Pearson | Biomedical | | BC5CDR-disease | 4182 | 4244 | 4424 | NER | F1 | Biomedical | | BC5CDR-chemical | 5203 | 5347 | 5385 | NER | F1 | Biomedical | | ShARe/CLEFE | 4628 | 1075 | 5195 | NER | F1 | Clinical | | DDI | 2937 | 1004 | 979 | Relation extraction | macro F1 | Biomedical | | ChemProt | 4154 | 2416 | 3458 | Relation extraction | micro F1 | Biomedical | | i2b2-2010 | 3110 | 11 | 6293 | Relation extraction | F1 | Clinical | | HoC | 1108 | 157 | 315 | Document classification | F1 | Biomedical | | MedNLI | 11232 | 1395 | 1422 | Inference | accuracy | Clinical | ### Sentence similarity [BIOSSES](http://tabilab.cmpe.boun.edu.tr/BIOSSES/) is a corpus of sentence pairs selected from the Biomedical Summarization Track Training Dataset in the biomedical domain. Here, we randomly select 80% for training and 20% for testing because there are no standard splits in the released data. [MedSTS](https://mayoclinic.pure.elsevier.com/en/publications/medsts-a-resource-for-clinical-semantic-textual-similarity) is a corpus of sentence pairs selected from Mayo Clinic's clinical data warehouse. Please visit the website to obtain a copy of the dataset. We use the standard training and testing sets in the shared task.
### Named entity recognition [BC5CDR](https://biocreative.bioinformatics.udel.edu/tasks/biocreative-v/track-3-cdr/) is a collection of 1,500 PubMed titles and abstracts selected from the CTD-Pfizer corpus and was used in the BioCreative V chemical-disease relation task. We use the standard training and test set in the BC5CDR shared task. [ShARe/CLEF](https://physionet.org/works/ShAReCLEFeHealth2013/) eHealth Task 1 Corpus is a collection of 299 deidentified clinical free-text notes from the MIMIC II database. Please visit the website to obtain a copy of the dataset. We use the standard training and test set in the ShARe/CLEF eHealth Tasks 1. ### Relation extraction [DDI](http://labda.inf.uc3m.es/ddicorpus) extraction 2013 corpus is a collection of 792 texts selected from the DrugBank database and other 233 Medline abstracts. In our benchmark, we use 624 train files and 191 test files to evaluate the performance and report the macro-average F1-score of the four DDI types. [ChemProt](https://biocreative.bioinformatics.udel.edu/news/corpora/) consists of 1,820 PubMed abstracts with chemical-protein interactions and was used in the BioCreative VI text mining chemical-protein interactions shared task. We use the standard training and test sets in the ChemProt shared task and evaluate the same five classes: CPR:3, CPR:4, CPR:5, CPR:6, and CPR:9. [i2b2 2010](https://www.i2b2.org/NLP/DataSets/) shared task collection consists of 170 documents for training and 256 documents for testing, which is the subset of the original dataset. The dataset was collected from three different hospitals and was annotated by medical practitioners for eight types of relations between problems and treatments. ### Document multilabel classification [HoC](https://www.cl.cam.ac.uk/~sb895/HoC.html) (the Hallmarks of Cancers corpus) consists of 1,580 PubMed abstracts annotated with ten currently known hallmarks of cancer. We use 315 (~20%) abstracts for testing and the remaining abstracts for training.
For the HoC task, we followed the common practice and reported the example-based F1-score on the abstract level ### Inference task [MedNLI](https://physionet.org/physiotools/mimic-code/mednli/) is a collection of sentence pairs selected from MIMIC-III. We use the same training, development, and test sets in [Romanov and Shivade](https://www.aclweb.org/anthology/D18-1187) ### Datasets Some datasets can be downloaded at [https://github.com/ncbi-nlp/BLUE_Benchmark/releases/tag/0.1](https://github.com/ncbi-nlp/BLUE_Benchmark/releases/tag/0.1) ## Baselines | Corpus | Metrics | SOTA* | ELMo | BioBERT | NCBI_BERT(base) (P) | NCBI_BERT(base) (P+M) | NCBI_BERT(large) (P) | NCBI_BERT(large) (P+M) | |-----------------|--------:|------:|-----:|--------:|--------------------:|----------------------:|---------------------:|-----------------------:| | MedSTS | Pearson | 83.6 | 68.6 | 84.5 | 84.5 | 84.8 | 84.6 | 83.2 | | BIOSSES | Pearson | 84.8 | 60.2 | 82.7 | 89.3 | 91.6 | 86.3 | 75.1 | | BC5CDR-disease | F | 84.1 | 83.9 | 85.9 | 86.6 | 85.4 | 82.9 | 83.8 | | BC5CDR-chemical | F | 93.3 | 91.5 | 93.0 | 93.5 | 92.4 | 91.7 | 91.1 | | ShARe/CLEFE | F | 70.0 | 75.6 | 72.8 | 75.4 | 77.1 | 72.7 | 74.4 | | DDI | F | 72.9 | 62.0 | 78.8 | 78.1 | 79.4 | 79.9 | 76.3 | | ChemProt | F | 64.1 | 66.6 | 71.3 | 72.5 | 69.2 | 74.4 | 65.1 | | i2b2 2010 | F | 73.7 | 71.2 | 72.2 | 74.4 | 76.4 | 73.3 | 73.9 | | HoC | F | 81.5 | 80.0 | 82.9 | 85.3 | 83.1 | 87.3 | 85.3 | | MedNLI | acc | 73.5 | 71.4 | 80.5 | 82.2 | 84.0 | 81.5 | 83.8 | **P**: PubMed, **P+M**: PubMed + MIMIC-III SOTA, state-of-the-art as of April 2019, to the best of our knowledge * **MedSTS, BIOSSES**: Chen et al. 2019. [BioSentVec: creating sentence embeddings for biomedical texts](https://arxiv.org/abs/1810.09302v2). In Proceedings of the 7th IEEE International Conference on Healthcare Informatics. * **BC5CDR-disease, BC5CDR-chem**: Yoon et al. 2018. 
[CollaboNet: collaboration of deep neural networks for biomedical named entity recognition](https://arxiv.org/abs/1809.07950v1). arXiv preprint arXiv:1809.07950. * **ShARe/CLEFE**: Leaman et al. 2015. [Challenges in clinical natural language processing for automated disorder normalization](https://www.sciencedirect.com/science/article/pii/S1532046415001501?via%3Dihub). Journal of biomedical informatics, 57:28–37. * **DDI**: Zhang et al. 2018. [Drug-drug interaction extraction via hierarchical RNNs on sequence and shortest dependency paths](https://academic.oup.com/bioinformatics/article/34/5/828/4565590). Bioinformatics (Oxford, England), 34:828–835. * **Chem-Prot**: Peng et al. 2018. [Extracting chemical-protein relations with ensembles of SVM and deep learning models](https://academic.oup.com/database/article/doi/10.1093/database/bay073/5055578). Database: the journal of biological databases and curation, 2018. * **i2b2 2010**: Rink et al. 2011. [Automatic extraction of relations between medical concepts in clinical texts](https://academic.oup.com/jamia/article/18/5/594/833364). Journal of the American Medical Informatics Association, 18:594–600. * **HoC**: Du et al. 2019. [ML-Net: multilabel classification of biomedical texts with deep neural networks](https://arxiv.org/abs/1811.05475v2). Journal of the American Medical Informatics Association (JAMIA). * **MedNLI**: Romanov et al. 2018. [Lessons from natural language inference in the clinical domain](https://www.aclweb.org/anthology/D18-1187). In Proceedings of EMNLP, pages 1586–1596. ### Fine-tuning with ELMo We adopted the ELMo model pre-trained on PubMed abstracts to accomplish the BLUE tasks. The output of ELMo embeddings of each token is used as input for the fine-tuning model. We retrieved the output states of both layers in ELMo and concatenated them into one vector for each word. We used the maximum sequence length 128 for padding. The learning rate was set to 0.001 with an Adam optimizer. 
We iterated the training process for 20 epochs with batch size 64 and early stopped if the training loss did not decrease. ### Fine-tuning with BERT Please see [https://github.com/ncbi-nlp/ncbi_bluebert](https://github.com/ncbi-nlp/ncbi_bluebert). ## Citing BLUE * Peng Y, Yan S, Lu Z. [Transfer Learning in Biomedical Natural Language Processing: An Evaluation of BERT and ELMo on Ten Benchmarking Datasets](https://arxiv.org/abs/1906.05474). In *Proceedings of the Workshop on Biomedical Natural Language Processing (BioNLP)*. 2019. ``` @InProceedings{peng2019transfer, author = {Yifan Peng and Shankai Yan and Zhiyong Lu}, title = {Transfer Learning in Biomedical Natural Language Processing: An Evaluation of BERT and ELMo on Ten Benchmarking Datasets}, booktitle = {Proceedings of the 2019 Workshop on Biomedical Natural Language Processing (BioNLP 2019)}, year = {2019}, } ``` ## Acknowledgments This work was supported by the Intramural Research Programs of the National Institutes of Health, National Library of Medicine and Clinical Center. This work was supported by the National Library of Medicine of the National Institutes of Health under award number K99LM013001-01. We are also grateful to the authors of BERT and ELMo to make the data and codes publicly available. We would like to thank Geeticka Chauhan for providing thoughtful comments. ## Disclaimer This tool shows the results of research conducted in the Computational Biology Branch, NCBI. The information produced on this website is not intended for direct diagnostic use or medical decision-making without review and oversight by a clinical professional. Individuals should not change their health behavior solely on the basis of information produced on this website. NIH does not independently verify the validity or utility of the information produced by this tool. If you have questions about the information produced on this website, please see a health care professional. 
More information about NCBI's disclaimer policy is available. ================================================ FILE: blue/__init__.py ================================================ ================================================ FILE: blue/bert/__init__.py ================================================ ================================================ FILE: blue/bert/create_cdr_bert.py ================================================ import fire import tqdm from blue.ext import pubtator from blue.ext.preprocessing import tokenize_text, print_ner_debug, write_bert_ner_file def _find_toks(sentences, start, end): toks = [] for sentence in sentences: for ann in sentence.annotations: span = ann.total_span if start <= span.offset and span.offset + span.length <= end: toks.append(ann) elif span.offset <= start and end <= span.offset + span.length: toks.append(ann) return toks def convert(src, dest, entity_type, validate_mentions=None): with open(src) as fp: docs = pubtator.load(fp) total_sentences = [] for doc in tqdm.tqdm(docs): text = doc.title + ' ' + doc.abstract sentences = tokenize_text(text, doc.pmid) total_sentences.extend(sentences) for ann in doc.annotations: if ann.type == entity_type: anns = _find_toks(sentences, ann.start, ann.end) if len(anns) == 0: print(f'Cannot find {doc.pmid}: {ann}') print_ner_debug(sentences, ann.start, ann.end) exit(1) has_first = False for ann in anns: if not has_first: ann.infons['NE_label'] = 'B' has_first = True else: ann.infons['NE_label'] = 'I' cnt = write_bert_ner_file(dest, total_sentences) if validate_mentions is not None and validate_mentions != cnt: print(f'Should have {validate_mentions}, but have {cnt} {entity_type} mentions') else: print(f'Have {cnt} mentions') class Commend: def chemical(self, input, output): convert(input, output, 'Chemical') def disease(self, input, output): convert(input, output, 'Disease') if __name__ == '__main__': fire.Fire(Commend) ================================================ FILE: 
blue/bert/create_chemprot_bert.py ================================================ import collections import csv import itertools from pathlib import Path import fire import tqdm from blue.ext import pstring from blue.ext.preprocessing import tokenize_text def find_entities(sentence, entities, entity_type): es = [] for e in entities: if e['type'] != entity_type: continue if sentence.offset <= e['start'] and e['end'] <= sentence.offset + len(sentence.text): es.append(e) return es def find_relations(relations, chem, prot): labels = [] for i in range(len(relations) - 1, -1, -1): r = relations[i] if r['Arg1'] == chem['id'] and r['Arg2'] == prot['id']: del relations[i] labels.append(r['label']) return labels def replace_text(text, offset, ann1, ann2): ann1_start = ann1['start'] - offset ann2_start = ann2['start'] - offset ann1_end = ann1['end'] - offset ann2_end = ann2['end'] - offset if ann1_start <= ann2_start <= ann1_end \ or ann1_start <= ann2_end <= ann1_end \ or ann2_start <= ann1_start <= ann2_end \ or ann2_start <= ann1_end <= ann2_end: start = min(ann1_start, ann2_start) end = max(ann1_end, ann2_end) before = text[:start] after = text[end:] return before + '@CHEM-GENE$' + after if ann1_start > ann2_start: ann1_start, ann1_end, ann2_start, ann2_end = ann2_start, ann2_end, ann1_start, ann1_end before = text[:ann1_start] middle = text[ann1_end:ann2_start] after = text[ann2_end:] if ann1['type'] in ('GENE-N', 'GENE-Y'): ann1['type'] = 'GENE' if ann2['type'] in ('GENE-N', 'GENE-Y'): ann2['type'] = 'GENE' return before + f'@{ann1["type"]}$' + middle + f'@{ann2["type"]}$' + after def print_rel_debug(sentences, entities, id1, id2): e1 = None e2 = None for e in entities: if e['id'] == id1: e1 = e if e['id'] == id2: e2 = e assert e1 is not None and e2 is not None ss = [s for s in sentences if s.offset <= e1['start'] <= s.offset + len(s.text) or s.offset <= e2['start'] <= s.offset + len(s.text)] if len(ss) != 0: for s in ss: print(s.offset, s.text) else: for s in 
sentences: print(s.offset, s.text) def merge_sentences(sentences): if len(sentences) == 0: return sentences new_sentences = [] last_one = sentences[0] for s in sentences[1:]: if last_one.text[-1] in """.?!""" and last_one.text[-4:] != 'i.v.' and s.text[0].isupper(): new_sentences.append(last_one) last_one = s else: last_one.text += ' ' * (s.offset - len(last_one.text) - last_one.offset) last_one.text += s.text new_sentences.append(last_one) return new_sentences def convert(abstract_file, entities_file, relation_file, output): # abstract total_sentences = collections.OrderedDict() with open(abstract_file, encoding='utf8') as fp: for line in tqdm.tqdm(fp, desc=abstract_file.stem): toks = line.strip().split('\t') text = toks[1] + ' ' + toks[2] text = pstring.printable(text, greeklish=True) sentences = tokenize_text(text, toks[0]) sentences = merge_sentences(sentences) total_sentences[toks[0]] = sentences # entities entities = collections.defaultdict(list) with open(entities_file, encoding='utf8') as fp: for line in tqdm.tqdm(fp, desc=entities_file.stem): toks = line.strip().split('\t') entities[toks[0]].append({ 'docid': toks[0], 'start': int(toks[3]), 'end': int(toks[4]), 'type': toks[2], 'id': toks[1], 'text': toks[5]}) # relations relations = collections.defaultdict(list) with open(relation_file, encoding='utf8') as fp: for line in tqdm.tqdm(fp, desc=relation_file.stem): toks = line.strip().split('\t') relations[toks[0]].append({ 'docid': toks[0], 'label': toks[1], 'Arg1': toks[2][toks[2].find(':') + 1:], 'Arg2': toks[3][toks[3].find(':') + 1:], 'toks': toks }) with open(output, 'w') as fp: writer = csv.writer(fp, delimiter='\t', lineterminator='\n') writer.writerow(['index', 'sentence', 'label']) cnt = 0 for docid, sentences in tqdm.tqdm(total_sentences.items(), total=len(total_sentences)): for sentence in sentences: # find chemical chemicals = find_entities(sentence, entities[docid], 'CHEMICAL') # find prot genes = find_entities(sentence, entities[docid], 
'GENE-N') \ + find_entities(sentence, entities[docid], 'GENE-Y') for i, (chem, gene) in enumerate(itertools.product(chemicals, genes)): text = replace_text(sentence.text, sentence.offset, chem, gene) labels = find_relations(relations[docid], chem, gene) if len(labels) == 0: writer.writerow([f'{docid}.{chem["id"]}.{gene["id"]}', text, 'false']) else: for l in labels: writer.writerow([f'{docid}.{chem["id"]}.{gene["id"]}', text, l]) cnt += 1 # print('-' * 80) # for docid, rs in relations.items(): # if len(rs) > 0: # for r in rs: # print('\t'.join(r['toks'])) # print_rel_debug(total_sentences[r['docid']], entities[r['docid']], # r['Arg1'], r['Arg2']) # print('-' * 80) def create_chemprot_bert(data_dir, output_dir): data_dir = Path(data_dir) output_dir = Path(output_dir) convert(data_dir / 'chemprot_training/chemprot_training_abstracts.tsv', data_dir / 'chemprot_training/chemprot_training_entities.tsv', data_dir / 'chemprot_training/chemprot_training_gold_standard.tsv', output_dir / 'train.tsv') # convert(data_dir / 'chemprot_development/chemprot_development_abstracts.tsv', # data_dir / 'chemprot_development/chemprot_development_entities.tsv', # data_dir / 'chemprot_development/chemprot_development_gold_standard.tsv', # output_dir / 'dev.tsv') # convert(data_dir / 'chemprot_test_gs/chemprot_test_abstracts_gs.tsv', # data_dir / 'chemprot_test_gs/chemprot_test_entities_gs.tsv', # data_dir / 'chemprot_test_gs/chemprot_test_gold_standard.tsv', # output_dir / 'train.tsv') if __name__ == '__main__': fire.Fire(create_chemprot_bert) ================================================ FILE: blue/bert/create_clefe_bert.py ================================================ import functools import os import re import shutil from pathlib import Path import fire import tqdm from lxml import etree from blue.ext.preprocessing import tokenize_text, print_ner_debug, write_bert_ner_file def pattern_repl(matchobj, prefix): """ Replace [**Patterns**] with prefix+spaces. 
""" s = matchobj.group(0).lower() return prefix.rjust(len(s)) def _find_toks(sentences, start, end): toks = [] for sentence in sentences: for ann in sentence.annotations: span = ann.total_span if start <= span.offset and span.offset + span.length <= end: toks.append(ann) elif span.offset <= start and end <= span.offset + span.length: toks.append(ann) return toks def read_text(pathname): with open(pathname) as fp: text = fp.read() text = re.sub(r'\[\*\*.*?\*\*\]', functools.partial(pattern_repl, prefix='PATTERN'), text) text = re.sub(r'(\|{4})|___|~~', functools.partial(pattern_repl, prefix=''), text) sentences = tokenize_text(text, pathname.stem) # sentences = _cleanupSentences2(sentences) # sentences = _cleanupSentences1(sentences) # sentences = _normalize_sentences(sentences) # sentences = _tokenize_sentences(sentences) return sentences def map_anns(sentences, ann_file): with open(ann_file) as fp: for line in fp: line = line.strip() toks = line.split('||') has_first = False for i in range(3, len(toks), 2): start = int(toks[i]) end = int(toks[i + 1]) anns = _find_toks(sentences, start, end) if len(anns) == 0: print(f'Cannot find {ann_file}: {line}') print_ner_debug(sentences, start, end) exit(1) for ann in anns: if not has_first: ann.infons['NE_label'] = 'B' has_first = True else: ann.infons['NE_label'] = 'I' return sentences def convert(text_dir, ann_dir, dest, validate_mentions=None): total_sentences = [] with os.scandir(text_dir) as it: for entry in tqdm.tqdm(it): text_file = Path(entry) ann_file = ann_dir / text_file.name if not ann_file.exists(): print('Cannot find ann file:', ann_file) continue sentences = read_text(text_file) sentences = map_anns(sentences, ann_file) total_sentences.extend(sentences) # print(len(total_sentences)) cnt = write_bert_ner_file(dest, total_sentences) if validate_mentions is not None and validate_mentions != cnt: print(f'Should have {validate_mentions}, but have {cnt} mentions') else: print(f'Have {cnt} mentions') def 
convert_train_gs_to_text(src_dir, dest_dir): def _one_file(src_file, dest_file): # annotation with open(src_file) as fp: tree = etree.parse(fp) stringSlotMentions = {} for atag in tree.xpath('stringSlotMention'): stringSlotMentions[atag.get('id')] = atag.xpath('stringSlotMentionValue')[0].get( 'value') classMentions = {} for atag in tree.xpath('classMention'): classMentions[atag.get('id')] = (atag.xpath('hasSlotMention')[0].get('id'), atag.xpath('mentionClass')[0].get('id')) with open(dest_file, 'w') as fp: for atag in tree.xpath('annotation'): id = atag.xpath('mention')[0].get('id') mentionClass = classMentions[id][1] try: stringSlotMentionValue = stringSlotMentions[classMentions[id][0]] except: stringSlotMentionValue = 'CUI-less' fp.write(f'{dest_file.name}||{mentionClass}||{stringSlotMentionValue}') for stag in atag.xpath('span'): start = stag.get('start') end = stag.get('end') fp.write(f'||{start}||{end}') fp.write('\n') with os.scandir(src_dir) as it: for entry in tqdm.tqdm(it): path = Path(entry) basename = path.stem[:path.stem.find('.')] _one_file(path, dest_dir / f'{basename}.txt') def split_development(data_path, devel_docids_pathname): with open(devel_docids_pathname) as fp: devel_docids = set(line.strip() for line in fp) os.mkdir(data_path / 'TRAIN_REPORTS') os.mkdir(data_path / 'DEV_REPORTS') with os.scandir(data_path / 'ALLREPORTS') as it: for entry in tqdm.tqdm(it): text_file = Path(entry) if text_file.stem in devel_docids: dest = data_path / 'DEV_REPORTS' / text_file.name else: dest = data_path / 'TRAIN_REPORTS' / text_file.name shutil.copy(text_file, dest) def create_clefe_bert(gold_directory, output_directory): data_path = Path(gold_directory) dest_path = Path(output_directory) convert(data_path / 'Task1TrainSetCorpus199/TRAIN_REPORTS', data_path / 'Task1TrainSetGOLD199knowtatorehost/Task1Gold', dest_path / 'Training.tsv') convert(data_path / 'Task1TrainSetCorpus199/DEV_REPORTS', data_path / 'Task1TrainSetGOLD199knowtatorehost/Task1Gold', dest_path 
/ 'Development.tsv') convert(data_path / 'Task1TestSetCorpus100/ALLREPORTS', data_path / 'Task1Gold_SN2012/Gold_SN2012', dest_path / 'Test.tsv') if __name__ == '__main__': fire.Fire(create_clefe_bert) ================================================ FILE: blue/bert/create_ddi_bert.py ================================================ import csv import logging import os import re import bioc import fire from lxml import etree def get_ann(arg, obj): for ann in obj['annotations']: if ann['id'] == arg: return ann raise ValueError def replace_text(text, offset, ann1, ann2): ann1_start = ann1['start'] - offset ann2_start = ann2['start'] - offset ann1_end = ann1['end'] - offset ann2_end = ann2['end'] - offset if ann1_start <= ann2_start <= ann1_end \ or ann1_start <= ann2_end <= ann1_end \ or ann2_start <= ann1_start <= ann2_end \ or ann2_start <= ann1_end <= ann2_end: start = min(ann1_start, ann2_start) end = max(ann1_end, ann2_end) before = text[:start] after = text[end:] return before + f'@{ann1["type"]}-{ann2["type"]}$' + after if ann1_start > ann2_start: ann1_start, ann1_end, ann2_start, ann2_end = ann2_start, ann2_end, ann1_start, ann1_end before = text[:ann1_start] middle = text[ann1_end:ann2_start] after = text[ann2_end:] return before + f'@{ann1["type"]}$' + middle + f'@{ann2["type"]}$' + after def create_ddi_bert(gold_directory, output): fp = open(output, 'w') writer = csv.writer(fp, delimiter='\t', lineterminator='\n') writer.writerow(['index', 'sentence', 'label']) cnt = 0 for root, dirs, files in os.walk(gold_directory): for name in files: pathname = os.path.join(root, name) tree = etree.parse(pathname) for stag in tree.xpath('/document/sentence'): sentence = bioc.BioCSentence() sentence.offset = 0 sentence.text = stag.get('text') entities = {} for etag in stag.xpath('entity'): id = etag.get('id') m = re.match('(\d+)-(\d+)', etag.get('charOffset')) if m is None: logging.warning('{}:{}: charOffset does not match. 
{}'.format( output, id, etag.get('charOffset'))) continue start = int(m.group(1)) end = int(m.group(2)) + 1 expected_text = etag.get('text') actual_text = sentence.text[start:end] if expected_text != actual_text: logging.warning('{}:{}: Text does not match. Expected {}. Actual {}'.format( output, id, repr(expected_text), repr(actual_text))) entities[id] = { 'start': start, 'end': end, 'type': etag.get('type'), 'id': id, 'text': actual_text } for rtag in stag.xpath('pair'): if rtag.get('ddi') == 'false': label = 'DDI-false' else: label = 'DDI-{}'.format(rtag.get('type')) cnt += 1 e1 = entities.get(rtag.get('e1')) e2 = entities.get(rtag.get('e2')) text = replace_text(sentence.text, sentence.offset, e1, e2) writer.writerow([f'{rtag.get("id")}', text, label]) print(f'Have {cnt} relations') if __name__ == '__main__': fire.Fire(create_ddi_bert) ================================================ FILE: blue/bert/create_i2b2_bert.py ================================================ import csv import itertools import os import re from pathlib import Path from typing import Match import bioc import fire import pandas as pd import tqdm from blue.bert.create_chemprot_bert import print_rel_debug from blue.bert.create_ddi_bert import replace_text labels = ['PIP', 'TeCP', 'TeRP', 'TrAP', 'TrCP', 'TrIP', 'TrNAP', 'TrWP', 'false'] def read_text(pathname): with open(pathname) as fp: text = fp.read() sentences = [] offset = 0 for sent in text.split('\n'): sentence = bioc.BioCSentence() sentence.infons['filename'] = pathname.stem sentence.offset = offset sentence.text = sent sentences.append(sentence) i = 0 for m in re.finditer('\S+', sent): if i == 0 and m.start() != 0: # add fake ann = bioc.BioCAnnotation() ann.id = f'a{i}' ann.text = '' ann.add_location(bioc.BioCLocation(offset, 0)) sentence.add_annotation(ann) i += 1 ann = bioc.BioCAnnotation() ann.id = f'a{i}' ann.text = m.group() ann.add_location(bioc.BioCLocation(m.start() + offset, len(m.group()))) sentence.add_annotation(ann) i 
+= 1 offset += len(sent) + 1 return sentences def _get_ann_offset(sentences, match_obj: Match, start_line_group, start_token_group, end_line_group, end_token_group, text_group): assert match_obj.group(start_line_group) == match_obj.group(end_line_group) sentence = sentences[int(match_obj.group(start_line_group)) - 1] start_token_idx = int(match_obj.group(start_token_group)) end_token_idx = int(match_obj.group(end_token_group)) start = sentence.annotations[start_token_idx].total_span.offset end = sentence.annotations[end_token_idx].total_span.end text = match_obj.group(text_group) actual = sentence.text[start - sentence.offset:end - sentence.offset].lower() expected = text.lower() assert actual == expected, 'Cannot match at %s:\n%s\n%s\nFind: %r, Matched: %r' \ % ( sentence.infons['filename'], sentence.text, match_obj.string, actual, expected) return start, end, text def read_annotations(pathname, sentences): anns = [] pattern = re.compile(r'c="(.*?)" (\d+):(\d+) (\d+):(\d+)\|\|t="(.*?)"(\|\|a="(.*?)")?') with open(pathname) as fp: for i, line in enumerate(fp): line = line.strip() m = pattern.match(line) assert m is not None start, end, text = _get_ann_offset(sentences, m, 2, 3, 4, 5, 1) ann = { 'start': start, 'end': end, 'type': m.group(6), 'a': m.group(7), 'text': text, 'line': int(m.group(2)) - 1, 'id': f'{pathname.name}.l{i}' } if len(m.groups()) == 9: ann['a'] = m.group(8) anns.append(ann) return anns def _find_anns(anns, start, end): for ann in anns: if ann['start'] == start and ann['end'] == end: return ann raise ValueError def read_relations(pathname, sentences, cons): pattern = re.compile( r'c="(.*?)" (\d+):(\d+) (\d+):(\d+)\|\|r="(.*?)"\|\|c="(.*?)" (\d+):(\d+) (\d+):(\d+)') relations = [] with open(pathname) as fp: for line in fp: line = line.strip() m = pattern.match(line) assert m is not None start, end, text = _get_ann_offset(sentences, m, 2, 3, 4, 5, 1) ann1 = _find_anns(cons, start, end) start, end, text = _get_ann_offset(sentences, m, 8, 9, 10, 11, 
def find_relations(relations, ann1, ann2):
    """Remove and return the labels of all relations linking ann1 and ann2.

    Matches either argument order; mutates `relations` in place so that
    leftover relations can be reported by the caller. Labels come back in
    reverse file order, matching the original backward scan.
    """
    id1, id2 = ann1['id'], ann2['id']
    matched = [r for r in relations
               if (r['Arg1'] == id1 and r['Arg2'] == id2)
               or (r['Arg1'] == id2 and r['Arg2'] == id1)]
    for r in matched:
        relations.remove(r)
    return [r['label'] for r in reversed(matched)]


def convert(top_dir, dest):
    """Convert one i2b2 split (txt/concept/rel directories) to a BLUE TSV.

    Every pair of concepts on the same line becomes one row; pairs without a
    gold relation are labelled 'false'. Relations that were never consumed
    are printed for debugging.
    """
    # BUG FIX: the output file was opened without `with`, so the handle
    # leaked if any reader below raised.
    with open(dest, 'w') as fp:
        writer = csv.writer(fp, delimiter='\t', lineterminator='\n')
        writer.writerow(['index', 'sentence', 'label'])
        with os.scandir(top_dir / 'txt') as it:
            for entry in tqdm.tqdm(it):
                if not entry.name.endswith('.txt'):
                    continue
                text_pathname = Path(entry.path)
                docid = text_pathname.stem
                sentences = read_text(text_pathname)
                # read concepts
                cons = read_annotations(top_dir / 'concept' / f'{text_pathname.stem}.con',
                                        sentences)
                # read relations
                relations = read_relations(top_dir / 'rel' / f'{text_pathname.stem}.rel',
                                           sentences, cons)
                for i, (con1, con2) in enumerate(itertools.combinations(cons, 2)):
                    if con1['line'] != con2['line']:
                        continue
                    sentence = sentences[con1['line']]
                    text = replace_text(sentence.text, sentence.offset, con1, con2)
                    labels = find_relations(relations, con1, con2)
                    if len(labels) == 0:
                        writer.writerow([f'{docid}.{con1["id"]}.{con2["id"]}', text, 'false'])
                    else:
                        for l in labels:
                            writer.writerow([f'{docid}.{con1["id"]}.{con2["id"]}', text, l])
                if len(relations) != 0:
                    for r in relations:
                        print(r['string'])
                        print_rel_debug(sentences, cons, r['Arg1'], r['Arg2'])
                        print('-' * 80)
def split_doc(train1, train2, dev_docids, dest_dir):
    """Merge two training TSVs and split rows into train.tsv / dev.tsv.

    `dev_docids` is a text file listing one document id per line; a row goes
    to dev.tsv when the prefix of its index (before the first '.') appears
    in that list.
    """
    train1_df = pd.read_csv(train1, sep='\t')
    train2_df = pd.read_csv(train2, sep='\t')
    train_df = pd.concat([train1_df, train2_df])
    with open(dev_docids) as fp:
        # BUG FIX: readlines() keeps the trailing '\n', so the membership
        # test below never matched and dev.tsv was always empty. Strip each
        # line; a set also gives O(1) lookups.
        dev_ids = {line.strip() for line in fp if line.strip()}
    with open(dest_dir / 'train.tsv', 'w') as tfp, open(dest_dir / 'dev.tsv', 'w') as dfp:
        twriter = csv.writer(tfp, delimiter='\t', lineterminator='\n')
        twriter.writerow(['index', 'sentence', 'label'])
        dwriter = csv.writer(dfp, delimiter='\t', lineterminator='\n')
        dwriter.writerow(['index', 'sentence', 'label'])
        for _, row in train_df.iterrows():
            # Label-based access instead of positional row[0]: the first
            # column is named 'index', and positional Series indexing is
            # deprecated in pandas >= 2.
            index = row['index']
            docid = index[:index.find('.')]
            if docid in dev_ids:
                dwriter.writerow(row)
            else:
                twriter.writerow(row)


def create_i2b2_bert(gold_directory, output_directory):
    """Build the BLUE i2b2-2010 BERT files: test.tsv, train-*.tsv, train/dev."""
    data_path = Path(gold_directory)
    dest_path = Path(output_directory)
    convert(data_path / 'original/reference_standard_for_test_data',
            dest_path / 'test.tsv')
    convert(data_path / 'original/concept_assertion_relation_training_data/beth',
            dest_path / 'train-beth.tsv')
    convert(data_path / 'original/concept_assertion_relation_training_data/partners',
            dest_path / 'train-partners.tsv')
    split_doc(dest_path / 'train-beth.tsv', dest_path / 'train-partners.tsv',
              data_path / 'dev-docids.txt', dest_path)
#!/usr/bin/env bash
# ================================ FILE: blue/create_bert.sh ================================
# BUG FIX: the paths used Windows-style backslashes in unquoted bash words;
# bash strips a backslash before an ordinary character, so `data\mednli`
# became `datamednli`. Use POSIX forward slashes throughout.
python blue/bert/create_mednli_bert.py data/mednli/Original data/mednli
python blue/bert/create_chemprot_bert.py data/ChemProt/original data/ChemProt/
python blue/bert/create_ddi_bert.py data/ddi2013-type/original/Test data/ddi2013-type/test.tsv

# ================================ FILE: blue/create_gs.sh ================================
#!/usr/bin/env bash
python blue/gs/create_cdr_test_gs.py \
    --input data/BC5CDR/Original/CDR_TestSet.PubTator.txt \
    --output data/BC5CDR/CDR_TestSet.chem.jsonl \
    --type Chemical
python blue/gs/create_cdr_test_gs.py \
    --input data/BC5CDR/Original/CDR_TestSet.PubTator.txt \
    --output data/BC5CDR/CDR_TestSet.disease.jsonl \
    --type Disease
python blue/gs/create_clefe_test_gs.py \
    --reports_dir data/ShAReCLEFEHealthCorpus/Origin/Task1TestSetCorpus100/ALLREPORTS \
    --anns_dir data/ShAReCLEFEHealthCorpus/Origin/Task1Gold_SN2012/Gold_SN2012 \
    --output data/ShAReCLEFEHealthCorpus/Task1TestSetCorpus100_test_gs.jsonl
python blue/gs/create_chemprot_test_gs.py \
    --entities data/ChemProt/original/chemprot_test_gs/chemprot_test_entities_gs.tsv \
    --relations data/ChemProt/original/chemprot_test_gs/chemprot_test_gold_standard.tsv \
    --output data/ChemProt/chemprot_test_gs.tsv
python blue/gs/create_ddi_test_gs.py \
    --input_dir data/ddi2013-type/original/Test \
    --output data/ddi2013-type/test_gs.tsv
python blue/gs/create_i2b2_test_gs.py \
    --input_dir data/i2b2-2010/Original/reference_standard_for_test_data \
    --output_dir data/i2b2-2010/
# NOTE(review): original said "Orignial"; assuming it was a typo for the
# "Original" directory used by every other command — confirm on disk.
python blue/gs/create_mednli_test_gs.py \
    --input data/mednli/Original/mli_test_v1.jsonl --output data/mednli/test_gs.tsv

# eval
python blue/eval_rel.py data/ChemProt/chemprot_test_gs.tsv data/ChemProt/chemprot_test_gs.tsv
LABELS = ['activating invasion and metastasis', 'avoiding immune destruction',
          'cellular energetics', 'enabling replicative immortality',
          'evading growth suppressors', 'genomic instability and mutation',
          'inducing angiogenesis', 'resisting cell death',
          'sustaining proliferative signaling', 'tumor promoting inflammation']


def get_p_r_f_arrary(test_predict_label, test_true_label):
    """Example-based precision/recall/F1 over multi-label 0/1 matrices.

    Args:
        test_predict_label: (num_docs, num_labels) 0/1 prediction matrix.
        test_true_label: (num_docs, num_labels) 0/1 gold matrix.

    Returns:
        (mean_precision, mean_recall, f_score); the F-score is the harmonic
        mean of the two per-document means.
    """
    num, cat = test_predict_label.shape
    prc_list = []
    rec_list = []
    for i in range(num):
        pred_set = {j for j in range(cat) if test_predict_label[i, j] == 1}
        gold_set = {j for j in range(cat) if test_true_label[i, j] == 1}
        tt = len(gold_set & pred_set)
        prc_list.append(tt / len(pred_set) if pred_set else 0)
        # BUG FIX: the original divided by len(gold_set) unconditionally and
        # raised ZeroDivisionError for documents with no gold labels.
        rec_list.append(tt / len(gold_set) if gold_set else 0)
    mean_prc = np.mean(prc_list)
    mean_rec = np.mean(rec_list)
    denom = mean_prc + mean_rec
    # Inlined safe division (was blue.ext.pmetrics.divide): 0 when undefined.
    f_score = 2 * mean_prc * mean_rec / denom if denom else 0.0
    return mean_prc, mean_rec, f_score


def eval_hoc(true_file, pred_file):
    """Evaluate HoC multi-label predictions at the document level.

    Both TSVs need columns 'index' (docid_sentenceid) and 'labels'
    (comma-separated label names, possibly empty). Sentence labels are
    unioned per document before scoring; prints P/R/F1 percentages.
    """
    data = {}
    true_df = pd.read_csv(true_file, sep='\t')
    pred_df = pd.read_csv(pred_file, sep='\t')
    assert len(true_df) == len(pred_df), \
        f'Gold line no {len(true_df)} vs Prediction line no {len(pred_df)}'
    for i in range(len(true_df)):
        true_row = true_df.iloc[i]
        pred_row = pred_df.iloc[i]
        assert true_row['index'] == pred_row['index'], \
            'Index does not match @{}: {} vs {}'.format(i, true_row['index'], pred_row['index'])
        key = true_row['index'][:true_row['index'].find('_')]
        if key not in data:
            data[key] = (set(), set())
        if not pd.isna(true_row['labels']):
            for l in true_row['labels'].split(','):
                data[key][0].add(LABELS.index(l))
        if not pd.isna(pred_row['labels']):
            for l in pred_row['labels'].split(','):
                data[key][1].add(LABELS.index(l))
    assert len(data) == 315, 'There are 315 documents in the test set: %d' % len(data)
    y_test = []
    y_pred = []
    for k, (true, pred) in data.items():
        t = [0] * len(LABELS)
        for i in true:
            t[i] = 1
        p = [0] * len(LABELS)
        for i in pred:
            p[i] = 1
        y_test.append(t)
        y_pred.append(p)
    y_test = np.array(y_test)
    y_pred = np.array(y_pred)
    # BUG FIX: the helper returns (precision, recall, f1) but the original
    # unpacked `r, p, f1 = ...`, so Precision and Recall were printed swapped.
    p, r, f1 = get_p_r_f_arrary(y_pred, y_test)
    print('Precision: {:.1f}'.format(p * 100))
    print('Recall : {:.1f}'.format(r * 100))
    print('F1 : {:.1f}'.format(f1 * 100))


labels = ['contradiction', 'entailment', 'neutral']


def eval_mednli(gold_file, pred_file):
    """Print a micro-averaged classification report for MedNLI predictions.

    Both TSVs need 'index' and 'label' columns in the same row order.
    """
    true_df = pd.read_csv(gold_file, sep='\t')
    pred_df = pd.read_csv(pred_file, sep='\t')
    assert len(true_df) == len(pred_df), \
        f'Gold line no {len(true_df)} vs Prediction line no {len(pred_df)}'
    y_test = []
    y_pred = []
    for i in range(len(true_df)):
        true_row = true_df.iloc[i]
        pred_row = pred_df.iloc[i]
        assert true_row['index'] == pred_row['index'], \
            'Index does not match @{}: {} vs {}'.format(i, true_row['index'], pred_row['index'])
        y_test.append(labels.index(true_row['label']))
        y_pred.append(labels.index(pred_row['label']))
    # pmetrics comes from the module-level `from blue.ext import pmetrics`.
    result = pmetrics.classification_report(y_test, y_pred, classes_=labels,
                                            macro=False, micro=True)
    print(result.report)
def has_strict(target: 'Annotation', lst: 'List[Annotation]'):
    """Return True when some annotation in lst matches target span-for-span."""
    return any(target.strict_equal(candidate) for candidate in lst)


def eval_cdr(gold_file, pred_file):
    """Strict-span NER evaluation: precision/recall/F1 of pred against gold.

    Gold annotations matched by a prediction count as TP, unmatched gold as
    FN, unmatched predictions as FP; a mismatch between the two TP counts
    (duplicate spans) is reported but the gold-side count is used.
    """
    golds = read_annotations(gold_file)
    preds = read_annotations(pred_file)
    # Gold side: matched -> TP, unmatched -> FN.
    tps = [g for g in golds if has_strict(g, preds)]
    fns = [g for g in golds if not has_strict(g, preds)]
    # Prediction side: matched -> TP recount, unmatched -> FP.
    tps2 = [q for q in preds if has_strict(q, golds)]
    fps = [q for q in preds if not has_strict(q, golds)]
    tp, fp, fn, tp2 = len(tps), len(fps), len(fns), len(tps2)
    if tp != tp2:
        print(f'TP: {tp} vs TPs: {tp2}')
    TPR = pmetrics.tpr(tp, 0, fp, fn)
    PPV = pmetrics.ppv(tp, 0, fp, fn)
    F1 = pmetrics.f1(PPV, TPR)
    print('tp: {}'.format(tp))
    print('fp: {}'.format(fp))
    print('fn: {}'.format(fn))
    print('pre: {:.1f}'.format(PPV * 100))
    print('rec: {:.1f}'.format(TPR * 100))
    print('f1: {:.1f}'.format(F1 * 100))
    print('support: {}'.format(tp + fn))
def eval_sts(true_file, pred_file):
    """Print the Pearson correlation between gold and predicted STS scores.

    Both TSVs need 'index' and 'score' columns in the same row order; the
    indexes are asserted to line up pairwise before correlating.
    """
    gold = pd.read_csv(true_file, sep='\t')
    pred = pd.read_csv(pred_file, sep='\t')
    assert len(gold) == len(pred), \
        f'Gold line no {len(gold)} vs Prediction line no {len(pred)}'
    for pos, (gold_idx, pred_idx) in enumerate(zip(gold['index'], pred['index'])):
        assert gold_idx == pred_idx, \
            'Index does not match @{}: {} vs {}'.format(pos, gold_idx, pred_idx)
    print('Pearson correlation: {}'.format(gold['score'].corr(pred['score'])))
def read_annotations(pathname) -> 'List[Annotation]':
    """Load one Annotation per JSON line from `pathname`."""
    with open(pathname) as fp:
        return [Annotation.from_obj(json.loads(line)) for line in fp]
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. """ import numpy as np import pandas as pd import tabulate from sklearn import metrics from sklearn.metrics import precision_recall_fscore_support class Report(object): def __init__(self): self.report = None self.table = None self.overall_acc = None self.kappa = None self.weighted_acc = None self.confusion = None def sensitivity(self, targetclass): return self.table.iloc[targetclass, 9] def specificity(self, targetclass): return self.table.iloc[targetclass, 10] def precision(self, targetclass): return self.table.iloc[targetclass, 5] def recall(self, targetclass): return self.table.iloc[targetclass, 6] def f1(self, targetclass): return self.table.iloc[targetclass, 7] def sub_report(self, targetclasses, *_, **kwargs) -> 'Report': digits = kwargs.pop('digits', 3) macro = kwargs.pop('macro', False) has_micro = kwargs.pop('micro', False) TP = np.zeros(len(targetclasses)) TN = np.zeros(len(targetclasses)) FP = np.zeros(len(targetclasses)) FN = np.zeros(len(targetclasses)) for i, targetclass in enumerate(targetclasses): TP[i] = self.table.iloc[targetclass, 1] TN[i] = self.table.iloc[targetclass, 2] FP[i] = self.table.iloc[targetclass, 3] FN[i] = self.table.iloc[targetclass, 4] TPR = tpr(TP, TN, FP, FN) TNR = tnr(TP, TN, FP, FN) PPV = ppv(TP, TN, FP, FN) NPV = npv(TP, TN, FP, FN) ACC = accuracy(TP, TN, FP, FN) F1 = f1(PPV, TPR) headings = ['Class', 'TP', 'TN', 'FP', 'FN', 'Precision', 'Recall', 'F-score', 'Accuracy', 'Sensitivity', 'Specificity', 'PPV', 'NPV', 
def divide(x, y):
    """Elementwise x / y that yields 0 wherever y == 0.

    BUG FIX: `dtype=np.float` was removed in NumPy 1.24 and raised
    AttributeError on any modern NumPy; the builtin `float` is the
    documented replacement.
    """
    return np.true_divide(x, y, out=np.zeros_like(x, dtype=float), where=y != 0)


def tpr(tp, tn, fp, fn):
    """Sensitivity, hit rate, recall, or true positive rate."""
    return divide(tp, tp + fn)


def tnr(tp, tn, fp, fn):
    """Specificity or true negative rate."""
    return divide(tn, tn + fp)


def tp_tn_fp_fn(confusion_matrix):
    """Per-class TP/TN/FP/FN vectors from a square confusion matrix."""
    FP = np.sum(confusion_matrix, axis=0) - np.diag(confusion_matrix)
    FN = np.sum(confusion_matrix, axis=1) - np.diag(confusion_matrix)
    TP = np.diag(confusion_matrix)
    TN = np.sum(confusion_matrix) - (FP + FN + TP)
    return TP, TN, FP, FN


def ppv(tp, tn, fp, fn):
    """Precision or positive predictive value."""
    return divide(tp, tp + fp)


def npv(tp, tn, fp, fn):
    """Negative predictive value."""
    return divide(tn, tn + fn)


def fpr(tp, tn, fp, fn):
    """Fall out or false positive rate."""
    return divide(fp, fp + tn)


def fnr(tp, tn, fp, fn):
    """False negative rate."""
    return divide(fn, tp + fn)


def fdr(tp, tn, fp, fn):
    """False discovery rate."""
    return divide(fp, tp + fp)


def accuracy(tp, tn, fp, fn):
    """tp / N, same as micro recall per class."""
    return divide(tp, tp + fn)


def f1(precision, recall):
    """Harmonic mean of precision and recall (0 when both are 0)."""
    return divide(2 * precision * recall, precision + recall)


def cohen_kappa(confusion, weights=None):
    """Cohen's kappa from a confusion matrix.

    Args:
        confusion: square confusion matrix.
        weights: None (unweighted), 'linear', or 'quadratic'.

    Raises:
        ValueError: on an unknown weighting type.

    BUG FIX: `dtype=np.int` was removed in NumPy 1.24; use the builtin int.
    """
    n_classes = confusion.shape[0]
    sum0 = np.sum(confusion, axis=0)
    sum1 = np.sum(confusion, axis=1)
    expected = divide(np.outer(sum0, sum1), np.sum(sum0))
    if weights is None:
        w_mat = np.ones([n_classes, n_classes], dtype=int)
        w_mat.flat[:: n_classes + 1] = 0
    elif weights == "linear" or weights == "quadratic":
        w_mat = np.zeros([n_classes, n_classes], dtype=int)
        w_mat += np.arange(n_classes)
        if weights == "linear":
            w_mat = np.abs(w_mat - w_mat.T)
        else:
            w_mat = (w_mat - w_mat.T) ** 2
    else:
        raise ValueError("Unknown kappa weighting type.")
    k = divide(np.sum(w_mat * confusion), np.sum(w_mat * expected))
    return 1 - k


def micro(tp, tn, fp, fn):
    """Micro-averaged row.

    Returns tp, tn, fp, fn, ppv, tpr, f1, acc, tpr, tnr, ppv, npv, support.
    """
    TP, TN, FP, FN = [np.sum(t) for t in [tp, tn, fp, fn]]
    TPR = tpr(TP, TN, FP, FN)
    TNR = tnr(TP, TN, FP, FN)
    PPV = ppv(TP, TN, FP, FN)
    NPV = npv(TP, TN, FP, FN)
    F1 = f1(PPV, TPR)
    return TP, TN, FP, FN, PPV, TPR, F1, np.nan, TPR, TNR, PPV, NPV, TP + FN


def overall_acc(tp, tn, fp, fn):
    """Same as micro recall."""
    return divide(np.sum(tp), np.sum(tp + fn))


def weighted_acc(tp, tn, fp, fn):
    """Per-class accuracy averaged with support-proportional weights."""
    weights = tp + fn
    portion = divide(weights, np.sum(weights))
    acc = accuracy(tp, tn, fp, fn)
    return np.average(acc, weights=portion)


def micro_weighted(tp, tn, fp, fn):
    """Support-weighted micro row; same column layout as `micro`."""
    weights = tp + fn
    portion = divide(weights, np.sum(weights))
    TP, TN, FP, FN = [np.average(t, weights=portion) for t in [tp, tn, fp, fn]]
    TPR = tpr(TP, TN, FP, FN)
    TNR = tnr(TP, TN, FP, FN)
    PPV = ppv(TP, TN, FP, FN)
    NPV = npv(TP, TN, FP, FN)
    F1 = f1(PPV, TPR)
    return TP, TN, FP, FN, PPV, TPR, F1, np.nan, TPR, TNR, PPV, NPV, TP + FN
def auc(y_true, y_score, y_column: int = 1):
    """Compute the ROC Area Under the Curve for one score column.

    Args:
        y_true: (n_sample, ) binary labels.
        y_score: (n_sample, n_classes) score matrix.
        y_column: column of y_score to evaluate.

    Returns:
        (roc_auc, fpr, tpr) as produced by sklearn's roc_curve/auc.
    """
    false_pos, true_pos, _ = metrics.roc_curve(y_true, y_score[:, y_column], pos_label=1)
    return metrics.auc(false_pos, true_pos), false_pos, true_pos
Args: y_true: (n_sample, n_classes) y_score: (n_sample, n_classes) """ assert y_score.shape[1] == y_true.shape[1] fpr = dict() tpr = dict() roc_auc = dict() n_classes = y_score.shape[1] for i in range(n_classes): fpr[i], tpr[i], _ = metrics.roc_curve(y_true[:, i], y_score[:, i]) roc_auc[i] = metrics.auc(fpr[i], tpr[i]) fpr["micro"], tpr["micro"], _ = metrics.roc_curve(y_true.ravel(), y_score.ravel()) roc_auc["micro"] = metrics.auc(fpr["micro"], tpr["micro"]) return roc_auc, fpr, tpr def classification_report(y_true, y_pred, *_, **kwargs) -> 'Report': """ Args: y_true: (n_sample, ) y_pred: (n_sample, ) """ m = metrics.confusion_matrix(y_true, y_pred) report = confusion_matrix_report(m, **kwargs) confusion = pd.DataFrame(m) if 'classes_' in kwargs: confusion.index = kwargs['classes_'] confusion.columns = kwargs['classes_'] report.confusion = confusion return report def precision_recall_fscore_multilabel(y_true, y_pred, *_, **kwargs): """ Args: y_true: (n_sample, n_classes) y_pred: (n_sample, n_classes) """ example_based = kwargs.pop('example_based', False) if example_based: rs = [] ps = [] for yt, yp in zip(y_true, y_pred): p, r, _, _ = precision_recall_fscore_support(y_true=yt, y_pred=yp, pos_label=1, average='binary') rs.append(r) ps.append(p) r = np.average(rs) p = np.average(ps) f1 = divide(2 * r * p, r + p) else: raise NotImplementedError return r, p, f1 """ Test cases """ def test_cm1(): cm = np.asarray([[20, 5], [10, 15]]) k = cohen_kappa(cm) assert np.math.isclose(k, 0.4, rel_tol=1e-01) k = cohen_kappa(cm, weights='linear') assert np.math.isclose(k, 0.4, rel_tol=1e-01) def test_cm2(): cm = np.array([ [236, 29, 7, 4, 8, 5, 3, 3, 1, 0, 5, 6, 1], [45, 3724, 547, 101, 102, 16, 0, 0, 2, 0, 0, 11, 0], [5, 251, 520, 132, 158, 11, 2, 1, 4, 0, 0, 4, 0], [0, 9, 71, 78, 63, 14, 2, 0, 0, 0, 0, 1, 0], [8, 37, 152, 144, 501, 200, 71, 11, 30, 3, 0, 18, 0], [5, 6, 6, 24, 144, 178, 136, 34, 30, 1, 0, 20, 0], [5, 2, 2, 3, 53, 115, 333, 106, 69, 4, 0, 36, 0], [2, 0, 0, 0, 1, 9, 
def split_punct(text, start):
    """Yield (token, abs_start, abs_end) for each word or punctuation mark.

    Words (including apostrophes) and single punctuation characters are
    emitted separately; offsets are shifted by `start`.
    """
    pattern = r"""[\w']+|[!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~]"""
    for match in re.finditer(pattern, text):
        yield match.group(), start + match.start(), start + match.end()
def write_bert_ner_file(dest, total_sentences):
    """Write sentences as a BERT NER TSV: token, docid-or-'-', offset, label.

    Tokens lacking an 'NE_label' infon are labelled 'O' (the infon is also
    set as a side effect); only the first token of a sentence carries the
    filename. Sentences are separated by a blank line. Returns the number
    of 'B' labels encountered.
    """
    begin_count = 0
    with open(dest, 'w') as out:
        writer = csv.writer(out, delimiter='\t', lineterminator='\n')
        for sentence in total_sentences:
            for idx, ann in enumerate(sentence.annotations):
                if 'NE_label' not in ann.infons:
                    ann.infons['NE_label'] = 'O'
                elif ann.infons['NE_label'] == 'B':
                    begin_count += 1
                doc_col = sentence.infons['filename'] if idx == 0 else '-'
                writer.writerow([ann.text, doc_col, ann.total_span.offset,
                                 ann.infons['NE_label']])
            out.write('\n')
    return begin_count
* Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. """ import logging import string import sympy ACCENTS = { u'ά': u'a', u'Ά': u'Α', u'έ': u'e', u'Έ': u'Ε', u'ή': u'h', u'Ή': u'H', u'ί': u'e', u'Ί': u'Ι', u'ύ': u'u', u'Ύ': u'Y', u'ό': u'o', u'Ό': u'O', u'ώ': u'w', u'Ώ': u'w', u'Ã': u'A', u'Å': u'A', u'ç': u'c', u'ï': 'i', } # The possible string conversions for each case. 
GREEK_CONVERT_STRINGS = { u"αι": [u"ai", u"e"], u"Αι": [u"Ai", u"E"], u"ΑΙ": [u"AI", u"E"], u"ει": [u"ei", u"i"], u"Ει": [u"Ei", u"I"], u"ΕΙ": [u"EI", u"I"], u"οι": [u"oi", u"i"], u"Οι": [u"Oi", u"I"], u"ΟΙ": [u"OI", u"I"], u"ου": [u"ou", u"oy", u"u"], u"Ου": [u"Ou", u"Oy", u"U"], u"ΟΥ": [u"OU", u"OY", u"U"], u"ευ": [u"eu", u"ef", u"ev", u"ey"], u"Ευ": [u"Eu", u"Ef", u"Ev", u"Ey"], u"ΕΥ": [u"EU", u"EF", u"EV", u"EY"], u"αυ": [u"au", u"af", u"av", u"ay"], u"Αυ": [u"Au", u"Af", u"Av", u"Ay"], u"ΑΥ": [u"AU", u"AF", u"av", u"AY"], u"μπ": [u"mp", u"b"], u"Μπ": [u"Mp", u"B"], u"ΜΠ": [u"MP", u"B"], u"γγ": [u"gg", u"g"], u"Γγ": [u"Gg", u"G"], u"ΓΓ": [u"GG", u"G"], u"γκ": [u"gk", u"g"], u"Γκ": [u"Gk", u"G"], u"ΓΚ": [u"GK", u"G"], u"ντ": [u"nt", u"d"], u"Ντ": [u"Nt", u"D"], u"ΝΤ": [u"NT", u"D"], u"α": [u"a"], u"Α": [u"A"], u"β": [u"b", u"v"], u"Β": [u"B", u"V"], u"γ": [u"g"], u"Γ": [u"G"], u"δ": [u"d"], u"Δ": [u"D"], u"ε": [u"e"], u"Ε": [u"E"], u"ζ": [u"z"], u"Ζ": [u"Z"], u"η": [u"h", u"i"], u"Η": [u"H", u"I"], u"θ": [u"th", u"8"], u"Θ": [u"TH", u"8"], u"ι": [u"i"], u"Ι": [u"I"], u"κ": [u"k"], u"Κ": [u"K"], u"λ": [u"l"], u"Λ": [u"L"], u"μ": [u"m"], u"Μ": [u"M"], u"ν": [u"n"], u"Ν": [u"N"], u"ξ": [u"x", u"ks"], u"Ξ": [u"X", u"KS"], u"ο": [u"o"], u"Ο": [u"O"], u"π": [u"p"], u"Π": [u"P"], u"ρ": [u"r"], u"Ρ": [u"R"], u"σ": [u"s"], u"Σ": [u"S"], u"ς": [u"s"], u"τ": [u"t"], u"Τ": [u"T"], u"υ": [u"y", u"u", u"i"], u"Υ": [u"Y", u"U", u"I"], u"φ": [u"f", u"ph"], u"Φ": [u"F", u"PH"], u"χ": [u"x", u"h", u"ch"], u"Χ": [u"X", u"H", u"CH"], u"ψ": [u"ps"], u"Ψ": [u"PS"], u"ω": [u"w", u"o", u"v"], u"Ω": [u"w", u"O", u"V"], } OTHERS = { u'\xb7': '*', # MIDDLE DOT u'\xb1': '+', # PLUS-MINUS SIGN u'\xae': 'r', # REGISTERED SIGN u'\u2002': ' ', # EN SPACE u'\xa9': 'c', # COPYRIGHT SIGN u'\xa0': ' ', # NO-BREAK SPACE u'\u2009': ' ', # THIN SPACE u'\u025b': 'e', # LATIN SMALL LETTER OPEN E u'\u0303': '~', # COMBINING TILDE u'\u043a': 'k', # CYRILLIC SMALL LETTER KA u'\u2005': ' ', # FOUR-PER-EM 
SPACE u'\u200a': ' ', # HAIR SPACE u'\u2026': '.', # HORIZONTAL ELLIPSIS u'\u2033': '"', # DOUBLE PRIME u'\u2034': '"', # TRIPLE PRIME u'\u2075': '5', # SUPERSCRIPT FIVE u'\u2077': '7', # SUPERSCRIPT SEVEN u'\u2079': '9', # SUPERSCRIPT NINE u'\u207a': '+', # SUPERSCRIPT PLUS SIGN u'\u207b': '-', # SUPERSCRIPT MINUS u'\u2080': '0', # SUBSCRIPT ZERO u'\u2081': '1', # SUBSCRIPT ONE u'\u2082': '2', # SUBSCRIPT TWO u'\u2083': '3', # SUBSCRIPT THREE u'\u2084': '4', # SUBSCRIPT FOUR u'\u2085': '5', # SUBSCRIPT FIVE u'\u2122': 'T', # TRADE MARK SIGN u'\u2192': '>', # RIGHTWARDS ARROW u'\u2217': '*', # STERISK OPERATOR u'\u223c': '~', # TILDE OPERATOR u'\u2248': '=', # ALMOST EQUAL TO u'\u2264': '<', # LESS-THAN OR EQUAL TO u'\u2265': '>', # GREATER-THAN OR EQUAL TO u'\u22c5': '*', # DOT OPERATOR u'\ue232': 'x', # u'\ue2f6': 'x', # Chinese character u'\xb0': '*', # DEGREE SIGN u'\xb2': '2', # SUPERSCRIPT TWO u'\xb3': '3', # SUPERSCRIPT THREE u'\xb4': '\'', # ACUTE ACCENT u'\xb5': 'm', # MICRO SIGN u'\xb9': '1', # SUPERSCRIPT ONE u'\xc3': 'A', # LATIN CAPITAL LETTER A WITH TILDE u'\xc5': 'A', # LATIN CAPITAL LETTER A WITH RING ABOVE u'\xd7': '*', # MULTIPLICATION SIGN u'\xe7': 'c', # LATIN SMALL LETTER C WITH CEDILLA u'\xef': 'i', # LATIN SMALL LETTER I WITH DIAERESIS u'\xf8': 'm', # LATIN SMALL LETTER O WITH STROKE u'\xfc': 'u', # LATIN SMALL LETTER U WITH DIAERESIS u'\xf6': 'o', # LATIN SMALL LETTER O WITH DIAERESIS u'\u2194': '<', # LEFT RIGHT ARROW u'\xe1': 'a', # LATIN SMALL LETTER A WITH ACUTE u'\u221e': '~', # INFINITY u'\u2193': '<', # DOWNWARDS ARROW u'\u2022': '*', # BULLET u'\u2211': 'E', # N-ARY SUMMATION u'\xdf': 'b', # LATIN SMALL LETTER SHARP S u'\xff': 'y', # LATIN SMALL LETTER Y WITH DIAERESIS u'\u2550': '=', # BOX DRAWINGS DOUBLE HORIZONTAL u'\u208b': '-', # SUBSCRIPT MINUS u'\u226b': '>', # MUCH GREATER-THAN u'\u2a7e': '>', # GREATER-THAN OR SLANTED EQUAL TO u'\uf8ff': '*', # Private Use, Last u'\xe9': 'e', # LATIN SMALL LETTER E WITH ACUTE u'\u0192': 'f', 
def printable(s: str, greeklish=False, verbose=False, replacement=' ') -> str:
    """Return a copy of *s* containing only ASCII-printable characters.

    Non-printable characters are transliterated via the module-level
    ACCENTS / GREEK_CONVERT_STRINGS / OTHERS tables when ``greeklish`` is
    True; anything that still cannot be converted is substituted with
    ``replacement``.

    Args:
        s: input string.
        greeklish: if True, try transliterating Greek and other known
            non-ASCII characters before falling back to ``replacement``.
        verbose: if True, log a warning for each unconvertible character.
        replacement: substitute used for unconvertible characters.

    Returns:
        An ASCII-printable string.
    """
    out = ''
    for c in s:
        if c in string.printable:
            out += c
            continue
        if greeklish:
            if c in ACCENTS:
                out += ACCENTS[c]
            elif c in GREEK_CONVERT_STRINGS:
                # Use the first (most conventional) transliteration variant.
                out += GREEK_CONVERT_STRINGS[c][0]
            elif c in OTHERS:
                out += OTHERS[c]
            else:
                # BUG FIX: previously ``replacement`` was appended only when
                # ``verbose`` was set, so unknown characters were silently
                # dropped in the default case. Always substitute, matching
                # the non-greeklish branch below.
                if verbose:
                    logging.warning('Unknown char: %r', sympy.pretty(c))
                out += replacement
        else:
            if verbose:
                logging.warning('Cannot convert char: %s', c)
            out += replacement
    return out
class PubtatorRel:
    """A PubTator relation line: pmid TAB type TAB id1 TAB id2."""

    def __init__(self, pmid, type, id1, id2):
        self.pmid = pmid
        self.type = type
        self.id1 = id1
        self.id2 = id2
        self.line = None  # source line number; filled in by the parser

    def __str__(self):
        # Consistency fix: PubtatorAnn.__str__ uses an f-string; use the
        # same style here instead of str.format.
        return f'{self.pmid}\t{self.type}\t{self.id1}\t{self.id2}'

    def __iter__(self):
        yield 'pmid', self.pmid
        yield 'type', self.type
        yield 'id1', self.id1
        yield 'id2', self.id2


# '<pmid>|a|<abstract>' and '<pmid>|t|<title>' header lines.
ABSTRACT_PATTERN = re.compile(r'(.*?)\|a\|(.*)')
TITLE_PATTERN = re.compile(r'(.*?)\|t\|(.*)')


def loads(s: str) -> 'List[Pubtator]':
    """
    Parse s (a str) to a list of Pubtator documents

    Returns:
        list: a list of PubTator documents
    """
    return list(__iterparse(s.splitlines()))


def load(fp) -> 'List[Pubtator]':
    """
    Parse file-like object to a list of Pubtator documents

    Args:
        fp: file-like object

    Returns:
        list: a list of PubTator documents
    """
    return loads(fp.read())


def __iterparse(line_iterator):
    """Iteratively parse each line, yielding completed Pubtator documents."""
    doc = Pubtator()
    i = 0
    for i, line in enumerate(line_iterator, 1):
        if i % 100000 == 0:
            logging.debug('Read %d lines', i)
        line = line.strip()
        if not line:
            # A blank line terminates the current document.
            if doc.pmid and (doc.title or doc.abstract):
                yield doc
            doc = Pubtator()
            continue
        matcher = TITLE_PATTERN.match(line)
        if matcher:
            doc.pmid = matcher.group(1)
            doc.title = matcher.group(2)
            continue
        matcher = ABSTRACT_PATTERN.match(line)
        if matcher:
            doc.pmid = matcher.group(1)
            doc.abstract = matcher.group(2)
            continue
        toks = line.split('\t')
        if len(toks) >= 6:
            # pmid, start, end, text, type, id (extra columns are ignored).
            annotation = PubtatorAnn(toks[0], int(toks[1]), int(toks[2]),
                                     toks[3], toks[4], toks[5])
            annotation.line = i
            doc.annotations.append(annotation)
        if len(toks) == 4:
            # pmid, type, id1, id2.
            relation = PubtatorRel(toks[0], toks[1], toks[2], toks[3])
            relation.line = i
            doc.relations.append(relation)
    # Flush the final document when the input does not end with a blank line.
    if doc.pmid and (doc.title or doc.abstract):
        yield doc
    logging.debug('Read %d lines', i)
def create_test_gs(entities, relations, output):
    """Write the ChemProt test gold standard as a TSV of chemical-gene pairs.

    For every (chemical, gene) pair inside a document, emit one row per
    annotated relation, or a single 'false' row when no relation exists,
    then print per-label counts.
    """
    entities = _read_entities(entities)
    relations = _read_relations(relations)
    counter = collections.Counter()
    with open(output, 'w') as fp:
        writer = csv.writer(fp, delimiter='\t', lineterminator='\n')
        writer.writerow(['id', 'docid', 'arg1', 'arg2', 'label'])
        for docid, ents in entities.items():
            # Partition the document's entities into chemicals vs genes.
            chemicals = [e for e in ents if e[2] == 'CHEMICAL']
            genes = [e for e in ents if e[2] != 'CHEMICAL']
            pair_idx = 0
            for chem in chemicals:
                for gene in genes:
                    key = (docid, chem[1], gene[1])
                    matched = relations.get(key)
                    if matched:
                        for rel in matched:
                            label = rel[1]
                            writer.writerow(
                                [f'{docid}.R{pair_idx}', docid, key[1], key[2], label])
                            counter[label] += 1
                            pair_idx += 1
                    else:
                        writer.writerow(
                            [f'{docid}.R{pair_idx}', docid, key[1], key[2], 'false'])
                        pair_idx += 1
    for k, v in counter.items():
        print(k, v)
""" s = matchobj.group(0).lower() return prefix.rjust(len(s)) def _proprocess_text(text): # noinspection PyTypeChecker text = re.sub(r'\[\*\*.*?\*\*\]', functools.partial(pattern_repl, prefix='PATTERN'), text) # noinspection PyTypeChecker text = re.sub(r'(\|{4})|___|~~', functools.partial(pattern_repl, prefix=''), text) return text def create_test_gs(reports_dir, anns_dir, output): anns_dir = Path(anns_dir) with jsonlines.open(output, 'w') as writer: with os.scandir(reports_dir) as it: for entry in tqdm.tqdm(it): text_file = Path(entry) with open(text_file) as fp: text = fp.read() text = _proprocess_text(text) ann_file = anns_dir / text_file.name if not ann_file.exists(): logging.warning(f'{text_file.stem}: Cannot find ann file {ann_file}') continue with open(ann_file) as fp: for i, line in enumerate(fp): line = line.strip() toks = line.split('||') type = toks[1] spans = [] for i in range(3, len(toks), 2): start = int(toks[i]) end = int(toks[i + 1]) spans.append(Span(start, end, text[start:end])) a = Annotation(text_file.stem + f'.T{i}', text_file.stem, spans, type) writer.write(a.to_obj()) if __name__ == '__main__': fire.Fire(create_test_gs) ================================================ FILE: blue/gs/create_ddi_test_gs.py ================================================ import collections import csv import os import re import fire from lxml import etree def create_test_gs(input_dir, output): counter = collections.Counter() with open(output, 'w') as fp: writer = csv.writer(fp, delimiter='\t', lineterminator='\n') writer.writerow(['id', 'docid', 'arg1', 'arg2', 'label']) for root, dirs, files in os.walk(input_dir): for name in files: pathname = os.path.join(root, name) tree = etree.parse(pathname) docid = tree.xpath('/document')[0].get('id') for stag in tree.xpath('/document/sentence'): entities = {} for etag in stag.xpath('entity'): m = re.match('(\d+)-(\d+)', etag.get('charOffset')) assert m is not None entities[etag.get('id')] = { 'start': int(m.group(1)), 
def split_doc(docid_file, data_dir, dest):
    """Write one TSV of HoC sentences for the doc ids listed in *docid_file*.

    Each data row is (index, sentence, comma-joined sorted label set).
    """
    with open(docid_file) as fp:
        docids = [line.strip() for line in fp]
    with open(dest, 'w', encoding='utf8') as fout:
        writer = csv.writer(fout, delimiter='\t', lineterminator='\n')
        # BUG FIX: the original header had four columns
        # ('index', 'sentence1', 'sentence2', 'label') but every data row
        # written below has exactly three values.
        writer.writerow(['index', 'sentence', 'label'])
        for docid in tqdm.tqdm(docids):
            with open(data_dir / f'{docid}.txt', encoding='utf8') as fp:
                for i, line in enumerate(fp):
                    idx = f'{docid}_s{i}'
                    toks = line.strip().split('\t')
                    text = toks[0]
                    # Labels are serialized like "['l1', 'l2']"; strip the
                    # brackets and quotes, then deduplicate.
                    labels = set(l[1:-1] for l in toks[1][1:-1].split(', '))
                    labels = ','.join(sorted(labels))
                    writer.writerow([idx, text, labels])


def create_hoc(hoc_dir):
    """Create train/dev/test TSVs for the Hallmarks-of-Cancer corpus."""
    hoc_dir = Path(hoc_dir)
    text_dir = hoc_dir / 'HoCCorpus'
    for name in ['train', 'dev', 'test']:
        print('Creating', name)
        split_doc(hoc_dir / f'{name}_docid.txt', text_dir, hoc_dir / f'{name}.tsv')
def _get_ann_offset(sentences, match_obj: Match, start_line_group, start_token_group,
                    end_line_group, end_token_group, text_group):
    """Convert i2b2 line:token coordinates in *match_obj* to character offsets.

    Returns (start, end, text) where start/end are document-level character
    offsets looked up in the pre-tokenized *sentences*.
    """
    # Concepts are assumed not to cross line boundaries.
    assert match_obj.group(start_line_group) == match_obj.group(end_line_group)
    sentence = sentences[int(match_obj.group(start_line_group)) - 1]
    start_token_idx = int(match_obj.group(start_token_group))
    end_token_idx = int(match_obj.group(end_token_group))
    start = sentence.annotations[start_token_idx].total_span.offset
    end = sentence.annotations[end_token_idx].total_span.end
    text = match_obj.group(text_group)
    actual = sentence.text[start - sentence.offset:end - sentence.offset].lower()
    expected = text.lower()
    assert actual == expected, 'Cannot match at %s:\n%s\n%s\nFind: %r, Matched: %r' \
                               % (sentence.infons['filename'], sentence.text,
                                  match_obj.string, actual, expected)
    return start, end, text


def read_annotations(pathname, sentences):
    """Parse an i2b2 .con concept file into a list of annotation dicts."""
    anns = []
    pattern = re.compile(r'c="(.*?)" (\d+):(\d+) (\d+):(\d+)\|\|t="(.*?)"(\|\|a="(.*?)")?')
    with open(pathname) as fp:
        for i, line in enumerate(fp):
            line = line.strip()
            m = pattern.match(line)
            assert m is not None
            start, end, text = _get_ann_offset(sentences, m, 2, 3, 4, 5, 1)
            ann = {
                'start': start,
                'end': end,
                'type': m.group(6),
                # BUG FIX: group(7) is the whole optional '||a="..."' wrapper,
                # and the original 'len(m.groups()) == 9' guard could never be
                # true (the pattern has exactly 8 groups). group(8) is the
                # assertion value itself, or None when absent.
                'a': m.group(8),
                'text': text,
                'line': int(m.group(2)) - 1,
                'id': f'{pathname.name}.l{i}'
            }
            anns.append(ann)
    return anns


def _find_anns(anns, start, end):
    """Return the annotation with exactly this (start, end) span, else raise ValueError."""
    for ann in anns:
        if ann['start'] == start and ann['end'] == end:
            return ann
    raise ValueError


def read_relations(pathname, sentences, cons):
    """Parse an i2b2 .rel relation file, resolving both concepts in *cons*."""
    pattern = re.compile(
        r'c="(.*?)" (\d+):(\d+) (\d+):(\d+)\|\|r="(.*?)"\|\|c="(.*?)" (\d+):(\d+) (\d+):(\d+)')
    relations = []
    with open(pathname) as fp:
        for line in fp:
            line = line.strip()
            m = pattern.match(line)
            assert m is not None
            start, end, text = _get_ann_offset(sentences, m, 2, 3, 4, 5, 1)
            ann1 = _find_anns(cons, start, end)
            start, end, text = _get_ann_offset(sentences, m, 8, 9, 10, 11, 7)
            ann2 = _find_anns(cons, start, end)
            relations.append({
                'docid': pathname.stem,
                'label': m.group(6),
                'Arg1': ann1['id'],
                'Arg2': ann2['id'],
                'string': line
            })
    return relations


def find_relations(relations, ann1, ann2):
    """Pop and return the labels of all relations linking ann1 and ann2 (either direction)."""
    labels = []
    # Iterate backwards so deleting by index is safe.
    for i in range(len(relations) - 1, -1, -1):
        r = relations[i]
        if (r['Arg1'] == ann1['id'] and r['Arg2'] == ann2['id']) \
                or (r['Arg1'] == ann2['id'] and r['Arg2'] == ann1['id']):
            del relations[i]
            labels.append(r['label'])
    return labels
def create_mednli_test_gs(input, output):
    """Convert a MedNLI jsonl file into a TSV gold standard.

    Each data row is (pair ID, sentence1, sentence2, gold label).
    """
    with open(input, encoding='utf8') as fin, open(output, 'w', encoding='utf8') as fout:
        writer = csv.writer(fout, delimiter='\t', lineterminator='\n')
        # BUG FIX: the original header ['index', 'label'] had two columns,
        # but every data row written below has four values.
        writer.writerow(['index', 'sentence1', 'sentence2', 'label'])
        for line in tqdm.tqdm(fin):
            # Transliterate/strip non-ASCII characters before JSON parsing.
            line = pstring.printable(line, greeklish=True)
            obj = json.loads(line)
            writer.writerow([obj['pairID'], obj['sentence1'], obj['sentence2'],
                             obj['gold_label']])
Please provide your evaluation scripts so that they can be included in BLUE+.
class BaseDataset(object):
    """Abstract dataset class backed by a YAML configuration file."""

    def __init__(self, config_file):
        print(config_file)
        with open(config_file, encoding='utf8') as fp:
            # FIX: yaml.load without an explicit Loader is deprecated (and
            # unsafe on untrusted input); safe_load is sufficient for plain
            # configuration files.
            self.config = yaml.safe_load(fp)
        # dict.get replaces the repetitive "x if k in config else ''" lookups.
        self.name = self.config.get('name', '')
        self.description = self.config.get('description', '')
        self.version = self.config.get('version', '')
        self.citation = self.config.get('citation', '')
        self.links = self.config.get('links', '')

    @property
    def full_name(self):
        """Full canonical name: <name>_<version>."""
        return '{}_{}'.format(self.name, self.version)

    def download(self, download_dir='blue_plus_data', override=False):
        """Downloads and prepares dataset for reading.

        Args:
            download_dir: string
                directory where downloaded files are stored.
                Defaults to "blue_plus_data/".
            override: bool
                True to override the data

        Raises:
            IOError: if there is not enough disk space available.

        Returns:
            successful: bool
                True if download complete
        """
        raise NotImplementedError

    def evaluate(self, test_file, prediction_file, output_file):
        """Evaluate the predictions.

        Args:
            test_file: string
                location of the file containing the gold standards.
            prediction_file: string
                location of the file containing the predictions.
            output_file: string
                location of the file to store the evaluation results.

        Returns:
            results: string or pandas DataFrame containing the evaluation
                results.
        """
        raise NotImplementedError
class BIOSSES_Dataset(BaseDataset):
    """BIOSSES sentence-similarity dataset: download + Pearson evaluation."""

    def download(self, download_dir='blue_plus_data', override=False):
        """Download every file listed in ``self.links`` into download_dir/<full_name>/."""
        download_dir = Path(download_dir)
        for local_name, url in self.links.items():
            local_data_path = download_dir / self.full_name / local_name
            data_exists = local_data_path.exists()
            if data_exists and not override:
                logging.info("Reusing dataset %s (%s)", self.name, local_data_path)
                continue
            # FIX: urlretrieve fails when the destination directory does not
            # exist yet; create it first.
            local_data_path.parent.mkdir(parents=True, exist_ok=True)
            logging.info('Downloading dataset %s (%s) to %s',
                         self.name, url, local_data_path)
            urllib.request.urlretrieve(url, local_data_path)

    def evaluate(self, test_file, prediction_file, results_file):
        """Report the Pearson correlation between gold and predicted scores."""
        true_df = pd.read_csv(test_file, sep='\t')
        pred_df = pd.read_csv(prediction_file, sep='\t')
        assert len(true_df) == len(pred_df), \
            f'Gold line no {len(true_df)} vs Prediction line no {len(pred_df)}'
        p, _ = pearsonr(true_df['score'], pred_df['score'])
        print('Pearson: {:.3f}'.format(p))
        with open(results_file, 'w') as fp:
            fp.write('Pearson: {:.3f}'.format(p))

    def evaluate_bert(self, test_file, prediction_file, results_file):
        """BERT results share the plain TSV format; reuse evaluate()."""
        return self.evaluate(test_file, prediction_file, results_file)

    def prepare_bert_format(self, input_file, output_file):
        """Optional: project the raw TSV onto the columns BERT needs."""
        df = pd.read_csv(input_file, sep='\t')
        # BUG FIX: df['sentence1', 'sentence2', 'score'] indexes with a tuple
        # and raises KeyError; column selection requires a list of labels.
        df = df[['sentence1', 'sentence2', 'score']]
        df.to_csv(output_file, sep='\t', index=None)


def main():
    """Demo: load the config, download BIOSSES, and evaluate the bundled predictions."""
    logging.basicConfig(level=logging.INFO)
    dir = os.path.dirname(os.path.abspath(__file__))
    d = BIOSSES_Dataset(os.path.join(dir, 'biosses.yml'))
    print('Name:       ', d.full_name)
    print('Description:', d.description)
    print('Citation:   ', d.citation)
    dir = Path('blue_plus_data') / d.full_name
    dir.mkdir(parents=True, exist_ok=True)
    d.download(override=True)
    d.evaluate(dir / 'test.tsv', dir / 'test_results.tsv', dir / 'test_results.txt')


if __name__ == '__main__':
    main()