Repository: Pandora-Intelligence/concise-concepts Branch: main Commit: f31d1c3aa5a9 Files: 20 Total size: 49.2 KB Directory structure: gitextract_05q4s6rh/ ├── .github/ │ └── workflows/ │ ├── python-package.yml │ └── python-publish.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CITATION.cff ├── LICENSE ├── README.md ├── concise_concepts/ │ ├── __init__.py │ ├── conceptualizer/ │ │ ├── Conceptualizer.py │ │ └── __init__.py │ └── examples/ │ ├── __init__.py │ ├── data.py │ ├── example_gensim_custom_model.py │ ├── example_gensim_custom_path.py │ ├── example_gensim_default.py │ └── example_spacy.py ├── pyproject.toml ├── setup.cfg └── tests/ ├── __init__.py └── test_model_import.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .github/workflows/python-package.yml ================================================ # This workflow will install Python dependencies, run tests and lint with a variety of Python versions # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions name: Python package on: push: branches: [main] pull_request: branches: [main] jobs: build: runs-on: ubuntu-latest strategy: fail-fast: false matrix: python-version: ["3.8", "3.9", "3.10"] steps: - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v3 with: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | python -m pip install --upgrade pip python -m pip install flake8 pytest pytest-cov python -m pip install poetry poetry export -f requirements.txt -o requirements.txt --without-hashes if [ -f requirements.txt ]; then pip install -r requirements.txt; fi python -m spacy download en_core_web_md - name: Lint with flake8 run: | # stop the build if there are Python syntax errors or undefined names flake8 . 
--count --max-complexity=18 --enable=W0614 --select=C,E,F,W,B,B950 --ignore=E203,E266,E501,W503 --exclude=.git,__pycache__,build,dist --max-line-length=119 --show-source --statistics - name: Test with pytest run: | pytest --doctest-modules --junitxml=junit/test-results.xml --cov=com --cov-report=xml --cov-report=html ================================================ FILE: .github/workflows/python-publish.yml ================================================ # This workflow will upload a Python Package using Twine when a release is created # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries # This workflow uses actions that are not certified by GitHub. # They are provided by a third-party and are governed by # separate terms of service, privacy policy, and support # documentation. name: Upload Python Package on: release: types: [created] permissions: contents: read jobs: deploy: runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - name: Set up Python uses: actions/setup-python@v3 with: python-version: '3.x' - name: Install dependencies run: | python -m pip install --upgrade pip pip install build - name: Build package run: python -m build - name: Publish package uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 with: user: __token__ password: ${{ secrets.PYPI_API_TOKEN }} ================================================ FILE: .gitignore ================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ pip-wheel-metadata/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos 
into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # PEP 582; used by e.g. github.com/David-OConnor/pyflow __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ /test_spacy.py .model /concise_concepts/word2vec.model.vectors.npy /test.html # Downloaded models *.model *.model.* *.json test.py s2v_old ================================================ FILE: .pre-commit-config.yaml ================================================ repos: - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.0.1 hooks: - id: check-added-large-files - id: end-of-file-fixer - id: check-ast - id: check-case-conflict - id: check-docstring-first - id: check-merge-conflict - id: check-symlinks - id: check-toml - id: check-xml - id: check-yaml - id: destroyed-symlinks - id: detect-private-key - id: fix-encoding-pragma - repo: https://github.com/psf/black rev: 
22.3.0 hooks: - id: black - id: black-jupyter # Execute isort on all changed files (make sure the version is the same as in pyproject) - repo: https://github.com/pycqa/isort rev: 5.10.1 hooks: - id: isort # Execute flake8 on all changed files (make sure the version is the same as in pyproject) - repo: https://github.com/pycqa/flake8 rev: 4.0.1 hooks: - id: flake8 additional_dependencies: ["flake8-docstrings", "flake8-bugbear", "pep8-naming"] ================================================ FILE: CITATION.cff ================================================ cff-version: 1.0.0 message: "If you use this software, please cite it as below." authors: - family-names: David given-names: Berenstein title: "Concise Concepts - an easy and intuitive approach to few-shot NER using most similar expansion over spaCy embeddings." version: 0.7.3 date-released: 2022-12-31 ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2022 Pandora Intelligence Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================ # Concise Concepts When wanting to apply NER to concise concepts, it is really easy to come up with examples, but pretty difficult to train an entire pipeline. Concise Concepts uses few-shot NER based on word embedding similarity to get you going with ease! Now with entity scoring! [![Python package](https://github.com/Pandora-Intelligence/concise-concepts/actions/workflows/python-package.yml/badge.svg?branch=main)](https://github.com/Pandora-Intelligence/concise-concepts/actions/workflows/python-package.yml) [![Current Release Version](https://img.shields.io/github/release/pandora-intelligence/concise-concepts.svg?style=flat-square&logo=github)](https://github.com/pandora-intelligence/concise-concepts/releases) [![pypi Version](https://img.shields.io/pypi/v/concise-concepts.svg?style=flat-square&logo=pypi&logoColor=white)](https://pypi.org/project/concise-concepts/) [![PyPi downloads](https://static.pepy.tech/personalized-badge/concise-concepts?period=total&units=international_system&left_color=grey&right_color=orange&left_text=pip%20downloads)](https://pypi.org/project/concise-concepts/) [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg?style=flat-square)](https://github.com/ambv/black) ## Usage This library defines matching patterns based on the most similar words found in each group, which are used to fill a [spaCy EntityRuler](https://spacy.io/api/entityruler). To better understand the rule definition, I recommend playing around with the [spaCy Rule-based Matcher Explorer](https://demos.explosion.ai/matcher). 
### Tutorials - [TechVizTheDataScienceGuy](https://www.youtube.com/c/TechVizTheDataScienceGuy) created a [nice tutorial](https://prakhar-mishra.medium.com/few-shot-named-entity-recognition-in-natural-language-processing-92d31f0d1143) on how to use it. - [I](https://www.linkedin.com/in/david-berenstein-1bab11105/) created a [tutorial](https://www.rubrix.ml/blog/concise-concepts-rubrix/) in collaboration with Rubrix. The section [Matching Pattern Rules](#matching-pattern-rules) expands on the construction, analysis and customization of these matching patterns. # Install ``` pip install concise-concepts ``` # Quickstart Take a look at the [configuration section](#configuration) for more info. ## Spacy Pipeline Component Note that, [custom embedding models](#custom-embedding-models) are passed via `model_path`. ```python import spacy from spacy import displacy data = { "fruit": ["apple", "pear", "orange"], "vegetable": ["broccoli", "spinach", "tomato"], "meat": ['beef', 'pork', 'turkey', 'duck'] } text = """ Heat the oil in a large pan and add the Onion, celery and carrots. Then, cook over a medium–low heat for 10 minutes, or until softened. Add the courgette, garlic, red peppers and oregano and cook for 2–3 minutes. Later, add some oranges and chickens. 
""" nlp = spacy.load("en_core_web_md", disable=["ner"]) nlp.add_pipe( "concise_concepts", config={ "data": data, "ent_score": True, # Entity Scoring section "verbose": True, "exclude_pos": ["VERB", "AUX"], "exclude_dep": ["DOBJ", "PCOMP"], "include_compound_words": False, "json_path": "./fruitful_patterns.json", "topn": (100,500,300) }, ) doc = nlp(text) options = { "colors": {"fruit": "darkorange", "vegetable": "limegreen", "meat": "salmon"}, "ents": ["fruit", "vegetable", "meat"], } ents = doc.ents for ent in ents: new_label = f"{ent.label_} ({ent._.ent_score:.0%})" options["colors"][new_label] = options["colors"].get(ent.label_.lower(), None) options["ents"].append(new_label) ent.label_ = new_label doc.ents = ents displacy.render(doc, style="ent", options=options) ``` ![](https://raw.githubusercontent.com/Pandora-Intelligence/concise-concepts/master/img/example.png) ## Standalone This might be useful when iterating over few_shot training data when not wanting to reload larger models continuously. Note that, [custom embedding models](#custom-embedding-models) are passed via `model`. ```python import gensim import spacy from concise_concepts import Conceptualizer model = gensim.downloader.load("fasttext-wiki-news-subwords-300") nlp = spacy.load("en_core_web_sm") data = { "disease": ["cancer", "diabetes", "heart disease", "influenza", "pneumonia"], "symptom": ["headache", "fever", "cough", "nausea", "vomiting", "diarrhea"], } conceptualizer = Conceptualizer(nlp, data, model) conceptualizer.nlp("I have a headache and a fever.").ents data = { "disease": ["cancer", "diabetes"], "symptom": ["headache", "fever"], } conceptualizer = Conceptualizer(nlp, data, model) conceptualizer.nlp("I have a headache and a fever.").ents ``` # Configuration ## Matching Pattern Rules A general introduction about the usage of matching patterns in the [usage section](#usage). 
### Customizing Matching Pattern Rules Even though the baseline parameters provide a decent result, the construction of these matching rules can be customized via the config passed to the spaCy pipeline. - `exclude_pos`: A list of POS tags to be excluded from the rule-based match. - `exclude_dep`: A list of dependencies to be excluded from the rule-based match. - `include_compound_words`: If True, it will include compound words in the entity. For example, if the entity is "New York", it will also include "New York City" as an entity. - `case_sensitive`: Whether to match the case of the words in the text. ### Analyze Matching Pattern Rules To motivate actually looking at the data and support interpretability, the matching patterns that have been generated are stored as `./main_patterns.json`. This behavior can be changed by using the `json_path` variable via the config passed to the spaCy pipeline. ## Fuzzy matching using `spaczz` - `fuzzy`: A boolean value that determines whether to use fuzzy matching ```python data = { "fruit": ["apple", "pear", "orange"], "vegetable": ["broccoli", "spinach", "tomato"], "meat": ["beef", "pork", "fish", "lamb"] } nlp.add_pipe("concise_concepts", config={"data": data, "fuzzy": True}) ``` ## Most Similar Word Expansion - `topn`: Use a specific number of words to expand over. 
```python data = { "fruit": ["apple", "pear", "orange"], "vegetable": ["broccoli", "spinach", "tomato"], "meat": ["beef", "pork", "fish", "lamb"] } topn = [50, 50, 150] assert len(topn) == len(data) nlp.add_pipe("concise_concepts", config={"data": data, "topn": topn}) ``` ## Entity Scoring - `ent_score`: Use embedding based word similarity to score entities against their groups ```python import spacy data = { "ORG": ["Google", "Apple", "Amazon"], "GPE": ["Netherlands", "France", "China"], } text = """Sony was founded in Japan.""" nlp = spacy.load("en_core_web_lg") nlp.add_pipe("concise_concepts", config={"data": data, "ent_score": True, "case_sensitive": True}) doc = nlp(text) print([(ent.text, ent.label_, ent._.ent_score) for ent in doc.ents]) # output # # [('Sony', 'ORG', 0.5207586), ('Japan', 'GPE', 0.7371268)] ``` ## Custom Embedding Models - `model_path`: Use custom `sense2vec.Sense2Vec`, `gensim.Word2vec`, `gensim.FastText`, or `gensim.KeyedVectors`, or a pretrained model from [gensim](https://radimrehurek.com/gensim/downloader.html) library or a custom model path. For using a `sense2vec.Sense2Vec` take a look [here](https://github.com/explosion/sense2vec#pretrained-vectors). - `model`: within [standalone usage](#standalone), it is possible to pass these models directly. 
```python data = { "fruit": ["apple", "pear", "orange"], "vegetable": ["broccoli", "spinach", "tomato"], "meat": ["beef", "pork", "fish", "lamb"] } # model from https://radimrehurek.com/gensim/downloader.html or path to local file model_path = "glove-wiki-gigaword-300" nlp.add_pipe("concise_concepts", config={"data": data, "model_path": model_path}) ```` ================================================ FILE: concise_concepts/__init__.py ================================================ # -*- coding: utf-8 -*- from typing import List, Union from gensim.models import FastText, Word2Vec from gensim.models.keyedvectors import KeyedVectors from spacy.language import Language from .conceptualizer import Conceptualizer @Language.factory( "concise_concepts", default_config={ "data": None, "topn": None, "model_path": None, "word_delimiter": "_", "ent_score": False, "exclude_pos": [ "VERB", "AUX", "ADP", "DET", "CCONJ", "PUNCT", "ADV", "ADJ", "PART", "PRON", ], "exclude_dep": [], "include_compound_words": False, "fuzzy": False, "case_sensitive": False, "json_path": "./matching_patterns.json", "verbose": True, }, ) def make_concise_concepts( nlp: Language, name: str, data: Union[dict, list], topn: Union[list, None], model_path: Union[str, FastText, Word2Vec, KeyedVectors, None], word_delimiter: str, ent_score: bool, exclude_pos: List[str], exclude_dep: List[str], include_compound_words: bool, fuzzy: bool, case_sensitive: bool, json_path: str, verbose: bool, ): return Conceptualizer( nlp=nlp, data=data, topn=topn, model=model_path, word_delimiter=word_delimiter, ent_score=ent_score, exclude_pos=exclude_pos, exclude_dep=exclude_dep, include_compound_words=include_compound_words, fuzzy=fuzzy, case_sensitive=case_sensitive, json_path=json_path, verbose=verbose, name=name, ) ================================================ FILE: concise_concepts/conceptualizer/Conceptualizer.py ================================================ # -*- coding: utf-8 -*- import json import logging 
import re import types from copy import deepcopy from pathlib import Path from typing import List, Union import gensim.downloader import spaczz # noqa: F401 from gensim import matutils # utility fnc for pickling, common scipy operations etc from gensim.models import FastText, Word2Vec from gensim.models.keyedvectors import KeyedVectors from numpy import argmax, dot from sense2vec import Sense2Vec from spacy import Language, util from spacy.tokens import Doc, Span logger = logging.getLogger(__name__) POS_LIST = [ "ADJ", "ADP", "ADV", "AUX", "CONJ", "CCONJ", "DET", "INTJ", "NOUN", "NUM", "PART", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM", "VERB", "X", "SPACE", ] class Conceptualizer: def __init__( self, nlp: Language, data: dict = {}, model: Union[str, FastText, KeyedVectors, Word2Vec] = None, topn: list = None, word_delimiter: str = "_", ent_score: bool = False, exclude_pos: list = None, exclude_dep: list = None, include_compound_words: bool = False, case_sensitive: bool = False, fuzzy: bool = False, json_path: str = "./matching_patterns.json", verbose: bool = True, name: str = "concise_concepts", ): """ The function takes in a dictionary of words and their synonyms, and then creates a new dictionary of words and their synonyms, but with the words in the new dictionary all in uppercase :param nlp: The spaCy model to use. :type nlp: Language :param name: The name of the entity. :type name: str :param data: A dictionary of the words you want to match. The keys are the classes you want to match, and the values are the words you want to expand over. :type data: dict :param topn: The number of words to be returned for each class. :type topn: list :param model_path: The path to the model you want to use. If you don't have a model, you can use the spaCy one. :param word_delimiter: The delimiter used to separate words in model the dictionary, defaults to _ (optional) :param ent_score: If True, the extension "ent_score" will be added to the Span object. 
This will be the score of the entity, defaults to False (optional) :param exclude_pos: A list of POS tags to exclude from the rule based match :param exclude_dep: list of dependencies to exclude from the rule based match :param include_compound_words: If True, it will include compound words in the entity. For example, if the entity is "New York", it will also include "New York City" as an entity, defaults to False (optional) :param case_sensitive: Whether to match the case of the words in the text, defaults to False (optional) """ assert data, ValueError("You must provide a dictionary of words to match") self.verbose = verbose self.log_cache = {"key": list(), "word": list(), "key_word": list()} if Span.has_extension("ent_score"): Span.remove_extension("ent_score") if ent_score: Span.set_extension("ent_score", default=None) self.ent_score = ent_score self.data = data self.name = name self.nlp = nlp self.fuzzy = fuzzy self.topn = topn self.model = model self.match_rule = {} self.set_exclude_pos(exclude_pos) self.set_exclude_dep(exclude_dep) self.json_path = json_path self.include_compound_words = include_compound_words self.case_sensitive = case_sensitive self.word_delimiter = word_delimiter if "lemmatizer" not in self.nlp.component_names: logger.warning( "No lemmatizer found in spacy pipeline. Consider adding it for matching" " on LEMMA instead of exact text." ) self.match_key = "TEXT" else: self.match_key = "LEMMA" for ruler in ["entity_ruler", "spaczz_ruler"]: if ruler in self.nlp.component_names: logger.warning( f"{ruler} already exists in the pipeline. 
Removing old rulers" ) self.nlp.remove_pipe(ruler) self.run() def set_exclude_dep(self, exclude_dep: list): if exclude_dep is None: exclude_dep = [] if exclude_dep: self.match_rule["DEP"] = {"NOT_IN": exclude_dep} def set_exclude_pos(self, exclude_pos: list): if exclude_pos is None: exclude_pos = [ "VERB", "AUX", "ADP", "DET", "CCONJ", "PUNCT", "ADV", "ADJ", "PART", "PRON", ] if exclude_pos: self.match_rule["POS"] = {"NOT_IN": exclude_pos} self.exclude_pos = exclude_pos else: self.exclude_pos = [] def run(self) -> None: self.check_validity_path() self.set_gensim_model() self.verify_data(self.verbose) self.determine_topn() self.expand_concepts() # settle words around overlapping concepts for _ in range(5): self.expand_concepts() self.infer_original_data() self.resolve_overlapping_concepts() self.infer_original_data() self.create_conceptual_patterns() self.set_concept_dict() if not self.ent_score: del self.kv self.data_upper = {k.upper(): v for k, v in self.data.items()} def check_validity_path(self) -> None: """ If the path is a file, create the parent directory if it doesn't exist. If the path is a directory, create the directory and set the path to the default file name """ if self.json_path: if Path(self.json_path).suffix: Path(self.json_path).parents[0].mkdir(parents=True, exist_ok=True) else: Path(self.json_path).mkdir(parents=True, exist_ok=True) old_path = str(self.json_path) self.json_path = Path(self.json_path) / "matching_patterns.json" logger.warning( f"Path ´{old_path} is a directory, not a file. Setting" f" ´json_path´to {self.json_path}" ) def determine_topn(self) -> None: """ If the user doesn't specify a topn value for each class, then the topn value for each class is set to 100 """ if self.topn is None: self.topn_dict = {key: 100 for key in self.data} else: num_classes = len(self.data) assert ( len(self.topn) == num_classes ), f"Provide a topn integer for each of the {num_classes} classes." 
self.topn_dict = dict(zip(self.data, self.topn)) def set_gensim_model(self) -> None: """ If the model_path is not None, then we try to load the model from the path. If it's not a valid path, then we raise an exception. If the model_path is None, then we load the model from the internal embeddings of the spacy model """ if isinstance(self.model, str): if self.model: available_models = gensim.downloader.info()["models"] if self.model in available_models: self.kv = gensim.downloader.load(self.model) else: try: self.kv = Sense2Vec().from_disk(self.model) except Exception as e0: try: self.kv = FastText.load(self.model).wv except Exception as e1: try: self.kv = Word2Vec.load(self.model).wv except Exception as e2: try: self.kv = KeyedVectors.load(self.model) except Exception as e3: try: self.kv = KeyedVectors.load_word2vec_format( self.model, binary=True ) except Exception as e4: raise Exception( "Not a valid model.Sense2Vec, FastText," f" Word2Vec, KeyedVectors.\n {e0}\n {e1}\n" f" {e2}\n {e3}\n {e4}" ) elif isinstance(self.model, (FastText, Word2Vec)): self.kv = self.model.wv elif isinstance(self.model, KeyedVectors): self.kv = self.model elif isinstance(self.model, Sense2Vec): self.kv = self.model else: wordList = [] vectorList = [] assert len( self.nlp.vocab.vectors ), "Choose a spaCy model with internal embeddings, e.g. md or lg." 
for key, vector in self.nlp.vocab.vectors.items(): wordList.append(self.nlp.vocab.strings[key]) vectorList.append(vector) self.kv = KeyedVectors(self.nlp.vocab.vectors_length) self.kv.add_vectors(wordList, vectorList) def verify_data(self, verbose: bool = True) -> None: """ It takes a dictionary of lists of words, and returns a dictionary of lists of words, where each word in the list is present in the word2vec model """ verified_data: dict[str, list[str]] = dict() for key, value in self.data.items(): verified_values = [] present_key = self._check_presence_vocab(key) if present_key: key = present_key if not present_key and verbose and key not in self.log_cache["key"]: logger.warning(f"key ´{key}´ not present in vector model") self.log_cache["key"].append(key) for word in value: present_word = self._check_presence_vocab(word) if present_word: verified_values.append(present_word) elif verbose and word not in self.log_cache["word"]: logger.warning( f"word ´{word}´ from key ´{key}´ not present in vector model" ) self.log_cache["word"].append(word) verified_data[key] = verified_values if not len(verified_values): msg = ( f"None of the entries for key {key} are present in the vector" " model. " ) if present_key: logger.warning( msg + f"Using {present_key} as word to expand over instead." 
) verified_data[key] = present_key else: raise Exception(msg) self.data = deepcopy(verified_data) self.original_data = deepcopy(verified_data) def expand_concepts(self) -> None: """ For each key in the data dictionary, find the topn most similar words to the key and the values in the data dictionary, and add those words to the values in the data dictionary """ for key in self.data: present_key = self._check_presence_vocab(key) if present_key: key_list = [present_key] else: key_list = [] if isinstance(self.kv, Sense2Vec): similar = self.kv.most_similar( self.data[key] + key_list, n=self.topn_dict[key], ) else: similar = self.kv.most_similar( self.data[key] + key_list, topn=self.topn_dict[key], ) self.data[key] = list({word for word, _ratio in similar}) def resolve_overlapping_concepts(self) -> None: """ It removes words from the data that are in other concepts, and then removes words that are not closest to the centroid of the concept """ for key in self.data: self.data[key] = [ word for word in self.data[key] if key == self.most_similar_to_given(word, list(self.data.keys())) ] def most_similar_to_given(self, key1, keys_list): """Get the `key` from `keys_list` most similar to `key1`.""" return keys_list[argmax([self.similarity(key1, key) for key in keys_list])] def similarity(self, w1, w2): """Compute cosine similarity between two keys. Parameters ---------- w1 : str Input key. w2 : str Input key. Returns ------- float Cosine similarity between `w1` and `w2`. """ return dot(matutils.unitvec(self.kv[w1]), matutils.unitvec(self.kv[w2])) def infer_original_data(self) -> None: """ It takes the original data and adds the new data to it, then removes the new data from the original data. 
""" for key in self.data: self.data[key] = list(set(self.data[key] + self.original_data[key])) for key_x in self.data: for key_y in self.data: if key_x != key_y: self.data[key_x] = [ word for word in self.data[key_x] if word not in self.original_data[key_y] ] def lemmatize_concepts(self) -> None: """ For each key in the data dictionary, the function takes the list of concepts associated with that key, and lemmatizes each concept. """ for key in self.data: self.data[key] = list( set([doc[0].lemma_ for doc in self.nlp.pipe(self.data[key])]) ) def create_conceptual_patterns(self) -> None: """ For each key in the data dictionary, create a pattern for each word in the list of words associated with that key. The pattern is a dictionary with three keys: 1. "lemma" 2. "POS" 3. "DEP" The value for each key is another dictionary with one key and one value. The key is either "regex" or "NOT_IN" or "IN". The value is either a regular expression or a list of strings. The regular expression is the word associated with the key in the data dictionary. The list of strings is either ["VERB"] or ["nsubjpass"] or ["amod", "compound"]. The regular expression is case insensitive. 
The pattern is """ lemma_patterns = [] fuzzy_patterns = [] def add_patterns(input_dict: dict) -> None: """ It creates a list of dictionaries that can be used for a spaCy entity ruler :param input_dict: a dictionary :type input_dict: dict """ if isinstance(self.kv, Sense2Vec): input_dict = { key.split("|")[0]: [word.split("|")[0] for word in value] for key, value in input_dict.items() } for key in input_dict: words = input_dict[key] for word in words: if word != key: word_parts = self._split_word(word) op_pattern = { "TEXT": { "REGEX": "|".join([" ", "-", "_", "/"]), "OP": "*", } } partial_pattern_parts = [] lemma_pattern_parts = [] for partial_pattern in word_parts: word_part = partial_pattern if self.fuzzy: partial_pattern = { "FUZZY": word_part, } partial_pattern = {"TEXT": partial_pattern} lemma_pattern_parts.append({self.match_key: word_part}) lemma_pattern_parts.append(op_pattern) partial_pattern_parts.append(partial_pattern) partial_pattern_parts.append(op_pattern) pattern = { "label": key.upper(), "pattern": partial_pattern_parts[:-1], "id": f"{word}_individual", } # add fuzzy matching formatting if fuzzy matching is enabled fuzzy_patterns.append(pattern) # add lemmma matching if lemma_pattern_parts: lemma_pattern = { "label": key.upper(), "pattern": lemma_pattern_parts[:-1], "id": f"{word}_lemma_individual", } lemma_patterns.append(lemma_pattern) if self.include_compound_words: compound_rule = [ { "DEP": {"IN": ["amod", "compound"]}, "OP": "*", } ] partial_pattern_parts.append( { "label": key.upper(), "pattern": compound_rule + partial_pattern_parts[:-1] + compound_rule, "id": f"{word}_compound", } ) if lemma_pattern_parts: lemma_patterns.append( { "label": key.upper(), "pattern": compound_rule + lemma_pattern_parts[:-1] + compound_rule, "id": f"{word}_lemma_compound", } ) add_patterns(self.data) if self.json_path: with open(self.json_path, "w") as f: json.dump(lemma_patterns + fuzzy_patterns, f) config = {"overwrite_ents": True} if self.case_sensitive: 
config["phrase_matcher_attr"] = "LOWER" self.ruler = self.nlp.add_pipe("entity_ruler", config=config) self.ruler.add_patterns(lemma_patterns) # Add spaczz entity ruler if fuzzy if self.fuzzy: for pattern in fuzzy_patterns: pattern["type"] = "token" self.fuzzy_ruler = self.nlp.add_pipe("spaczz_ruler", config=config) self.fuzzy_ruler.add_patterns(fuzzy_patterns) def __call__(self, doc: Doc) -> Doc: """ It takes a doc object and assigns a score to each entity in the doc object :param doc: Doc :type doc: Doc """ if isinstance(doc, str): doc = self.nlp(doc) elif isinstance(doc, Doc): if self.ent_score: doc = self.assign_score_to_entities(doc) return doc def pipe(self, stream, batch_size=128) -> Doc: """ It takes a stream of documents, and for each document, it assigns a score to each entity in the document :param stream: a generator of documents :param batch_size: The number of documents to be processed at a time, defaults to 128 (optional) """ if isinstance(stream, str): stream = [stream] if not isinstance(stream, types.GeneratorType): stream = self.nlp.pipe(stream, batch_size=batch_size) for docs in util.minibatch(stream, size=batch_size): for doc in docs: if self.ent_score: doc = self.assign_score_to_entities(doc) yield doc def assign_score_to_entities(self, doc: Doc) -> Doc: """ The function takes a spaCy document as input and assigns a score to each entity in the document. The score is calculated using the word embeddings of the entity and the concept. The score is assigned to the entity using the `._.ent_score` attribute :param doc: Doc :type doc: Doc :return: The doc object with the entities and their scores. 
        """
        ents = doc.ents
        for ent in ents:
            # Only score entities whose label belongs to this component's data.
            if ent.label_ in self.data_upper:
                ent_text = ent.text
                # get word part representations
                if self._check_presence_vocab(ent_text):
                    entity = [self._check_presence_vocab(ent_text)]
                else:
                    # Fall back to the individual word parts present in the vocab.
                    entity = []
                    for part in self._split_word(ent_text):
                        present_part = self._check_presence_vocab(part)
                        if present_part:
                            entity.append(present_part)
                # get concepts to match
                concept = self.concept_data.get(ent.label_, None)
                # compare set similarities
                if entity and concept:
                    ent._.ent_score = self.kv.n_similarity(entity, concept)
                else:
                    ent._.ent_score = 0
                    # Warn only once per (entity, concept) pair.
                    if self.verbose:
                        if f"{ent_text}_{concept}" not in self.log_cache["key_word"]:
                            logger.warning(
                                f"Entity ´{ent.text}´ and/or label ´{concept}´ not"
                                " found in vector model. Nothing to compare to, so"
                                " setting ent._.ent_score to 0."
                            )
                            self.log_cache["key_word"].append(f"{ent_text}_{concept}")
            else:
                ent._.ent_score = 0
                # Warn only once per entity text.
                if self.verbose:
                    if ent.text not in self.log_cache["word"]:
                        logger.warning(
                            f"Entity ´{ent.text}´ not found in vector model. Nothing to"
                            " compare to, so setting ent._.ent_score to 0."
                        )
                        self.log_cache["word"].append(ent.text)
        doc.ents = ents
        return doc

    def set_concept_dict(self):
        """
        Build ``self.concept_data``: an upper-cased copy of ``self.data`` in which
        every example word is replaced by its in-vocabulary form (words missing
        from the vector model are dropped).
        """
        self.concept_data = {k.upper(): v for k, v in self.data.items()}
        for ent_label in self.concept_data:
            concept = []
            for word in self.concept_data[ent_label]:
                present_word = self._check_presence_vocab(word)
                if present_word:
                    concept.append(present_word)
            self.concept_data[ent_label] = concept

    def _split_word(self, word: str) -> List[str]:
        """
        It splits a word into a list of subwords, using the word delimiter

        :param word: str
        :type word: str
        :return: A list of strings or any.
        """
        return re.split(f"[{re.escape(self.word_delimiter)}]+", word)

    def _check_presence_vocab(self, word: str) -> str:
        """
        If the word is not lowercase and the case_sensitive flag is set to False, then
        check if the lowercase version of the word is in the vocabulary. If it is,
        return the lowercase version of the word.
        Otherwise, return the word itself

        :param word: The word to check for presence in the vocabulary
        :type word: str
        :return: The word itself if it is present in the vocabulary, otherwise the word
            with the highest probability of being the word that was intended.
        """
        # Multi-word entries are stored with underscores in the vector models.
        word = word.replace(" ", "_")
        if not word.islower() and not self.case_sensitive:
            present_word = self.__check_presence_vocab(word.lower())
            if present_word:
                return present_word
        return self.__check_presence_vocab(word)

    def __check_presence_vocab(self, word: str) -> str:
        """
        If the word is in the vocabulary, return the word. If not, replace spaces and
        dashes with the word delimiter and check if the new word is in the vocabulary.
        If so, return the new word

        NOTE(review): despite the ``-> str`` annotation this implicitly returns
        ``None`` when a gensim vocabulary does not contain the word.

        :param word: str - the word to check
        :type word: str
        :return: The word or the check_word
        """
        if isinstance(self.kv, Sense2Vec):
            # Sense2Vec lookups resolve to the best sense over the allowed POS tags.
            return self.kv.get_best_sense(word, (set(POS_LIST) - set(self.exclude_pos)))
        else:
            if word in self.kv:
                return word


================================================
FILE: concise_concepts/conceptualizer/__init__.py
================================================
# -*- coding: utf-8 -*-
# Re-export Conceptualizer as the subpackage's public API.
from .Conceptualizer import Conceptualizer

__all__ = ["Conceptualizer"]


================================================
FILE: concise_concepts/examples/__init__.py
================================================


================================================
FILE: concise_concepts/examples/data.py
================================================
# -*- coding: utf-8 -*-
# Shared example text and concept data used by the example scripts and tests.
text = """
Heat the oil in a large pan and add the Onion, celery and carrots.
Then, cook over a medium–low heat for 10 minutes, or until softened.
Add the courgette, garlic, red peppers and oregano and cook for 2–3 minutes.
Later, add some oranges, chickens.
"""

# Same text with a deliberate misspelling ("carots") to exercise fuzzy matching.
text_fuzzy = """
Heat the oil in a large pan and add the Onion, celery and carots.
Then, cook over a medium–low heat for 10 minutes, or until softened.
Add the courgette, garlic, red peppers and oregano and cook for 2–3 minutes.
Later, add some oranges, chickens.
"""

# Few-shot concept examples: {label: [seed words]}.
data = {
    "fruit": ["apple", "pear", "orange"],
    "vegetable": ["broccoli", "spinach", "tomato"],
    "meat": ["chicken", "beef", "pork", "fish", "lamb"],
}


================================================
FILE: concise_concepts/examples/example_gensim_custom_model.py
================================================
# -*- coding: utf-8 -*-
# Example: train a tiny Word2Vec model, save it, and plug it into the
# concise_concepts pipe via ``model_path``.
import spacy
from gensim.models import Word2Vec
from gensim.test.utils import common_texts

import concise_concepts  # noqa: F401

data = {"human": ["trees"], "interface": ["computer"]}
text = (
    "believe me, it's the slowest mobile I saw. Don't go on screen and Battery, it is"
    " an extremely slow mobile phone and takes ages to open and navigate. Forget about"
    " heavy use, it can't handle normal regular use. I made a huge mistake but pls"
    " don't buy this mobile. It's only a few months and I am thinking to change it. Its"
    " dam SLOW SLOW SLOW."
)

model = Word2Vec(
    sentences=common_texts, vector_size=100, window=5, min_count=1, workers=4
)
model.save("word2vec.model")
model_path = "word2vec.model"

nlp = spacy.blank("en")
nlp.add_pipe("concise_concepts", config={"data": data, "model_path": model_path})


================================================
FILE: concise_concepts/examples/example_gensim_custom_path.py
================================================
# -*- coding: utf-8 -*-
# Example: download pretrained glove-twitter-25 vectors, save them to disk,
# and load them into the pipe from a custom path.
import gensim.downloader as api
import spacy

import concise_concepts  # noqa: F401

from .data import data, text

model_path = "word2vec.model"
model = api.load("glove-twitter-25")
model.save(model_path)

nlp = spacy.blank("en")
nlp.add_pipe("concise_concepts", config={"data": data, "model_path": model_path})

doc = nlp(text)
print([(ent.text, ent.label_) for ent in doc.ents])


================================================
FILE: concise_concepts/examples/example_gensim_default.py
================================================
# -*- coding: utf-8 -*-
# Example: refer to a gensim-downloader model by name ("glove-twitter-25")
# and let the pipe fetch it on first use.
import spacy

import concise_concepts  # noqa: F401

from .data import data, text

model_path = "glove-twitter-25"

nlp = spacy.blank("en")
nlp.add_pipe("concise_concepts", config={"data": data, "model_path": model_path})

doc = nlp(text)
print([(ent.text, ent.label_) for ent in doc.ents])


================================================
FILE: concise_concepts/examples/example_spacy.py
================================================
# -*- coding: utf-8 -*-
# Example: use the embeddings bundled with a spaCy pipeline (en_core_web_md)
# instead of a gensim model.
import spacy

import concise_concepts  # noqa: F401

from .data import data, text

nlp = spacy.load("en_core_web_md")
nlp.add_pipe("concise_concepts", config={"data": data})

doc = nlp(text)
print([(ent.text, ent.label_) for ent in doc.ents])


================================================
FILE: pyproject.toml
================================================
# Poetry project metadata and dependency pins for concise-concepts.
[tool.poetry]
name = "concise-concepts"
version = "0.8.1"
description = "This repository contains an easy and intuitive approach to few-shot NER using most similar expansion over spaCy embeddings. Now with entity confidence scores!"
# NOTE(review): the author e-mail appears stripped by extraction — restore from upstream.
authors = ["David Berenstein "]
license = "MIT"
readme = "README.md"
homepage = "https://github.com/pandora-intelligence/concise-concepts"
repository = "https://github.com/pandora-intelligence/concise-concepts"
documentation = "https://github.com/pandora-intelligence/concise-concepts"
keywords = ["spacy", "NER", "few-shot classification", "nlu"]
classifiers = [
    "Intended Audience :: Developers",
    "Intended Audience :: Science/Research",
    "License :: OSI Approved :: MIT License",
    "Operating System :: OS Independent",
    "Programming Language :: Python :: 3.8",
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Topic :: Scientific/Engineering",
    "Topic :: Software Development"
]
packages = [{include = "concise_concepts"}]

# Runtime dependencies.
[tool.poetry.dependencies]
python = ">=3.8,<3.12"
spacy = "^3"
scipy = "^1.7"
gensim = "^4"
spaczz = "^0.5.4"
sense2vec = "^2.0.1"

# Register the spaCy component factory as an entry point.
[tool.poetry.plugins]

[tool.poetry.plugins."spacy_factories"]
"spacy" = "concise_concepts.__init__:make_concise_concepts"

# Development / lint / test tooling.
[tool.poetry.group.dev.dependencies]
black = "^22"
flake8 = "^5"
pytest = "^7.1"
pre-commit = "^2.20"
pep8-naming = "^0.13"
flake8-bugbear = "^22.9"
flake8-docstrings = "^1.6"
ipython = "^8.7.0"
ipykernel = "^6.17.1"

[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"

[tool.pytest.ini_options]
testpaths = "tests"

[tool.black]
preview = true

[tool.isort]
profile = "black"
src_paths = ["concise_concepts"]


================================================
FILE: setup.cfg
================================================
# flake8 configuration (mirrors the CI invocation in python-package.yml).
[flake8]
max-line-length = 119
max-complexity = 18
docstring-convention=google
exclude = .git,__pycache__,build,dist
select = C,E,F,W,B,B950
ignore = E203,E266,E501,W503
enable = W0614
per-file-ignores =
    test_*.py: D


================================================
FILE: tests/__init__.py
================================================


================================================
FILE: tests/test_model_import.py ================================================ # -*- coding: utf-8 -*- def test_spacy_embeddings(): from concise_concepts.examples import example_spacy # noqa: F401 def test_gensim_default(): from concise_concepts.examples import example_gensim_default # noqa: F401 def test_gensim_custom_path(): from concise_concepts.examples import example_gensim_custom_path # noqa: F401 def test_gensim_custom_model(): from concise_concepts.examples import example_gensim_custom_model # noqa: F401 def test_standalone_spacy(): import spacy from concise_concepts import Conceptualizer nlp = spacy.load("en_core_web_md") data = { "disease": ["cancer", "diabetes", "heart disease", "influenza", "pneumonia"], "symptom": ["headache", "fever", "cough", "nausea", "vomiting", "diarrhea"], } conceptualizer = Conceptualizer(nlp, data) assert ( list(conceptualizer.pipe(["I have a headache and a fever."]))[0].to_json() == list(conceptualizer.nlp.pipe(["I have a headache and a fever."]))[ 0 ].to_json() ) assert ( conceptualizer("I have a headache and a fever.").to_json() == conceptualizer.nlp("I have a headache and a fever.").to_json() ) data = { "disease": ["cancer", "diabetes"], "symptom": ["headache", "fever"], } conceptualizer = Conceptualizer(nlp, data) def test_standalone_gensim(): import gensim import spacy from concise_concepts import Conceptualizer model_path = "glove-twitter-25" model = gensim.downloader.load(model_path) nlp = spacy.load("en_core_web_md") data = { "disease": ["cancer", "diabetes", "heart disease", "influenza", "pneumonia"], "symptom": ["headache", "fever", "cough", "nausea", "vomiting", "diarrhea"], } conceptualizer = Conceptualizer(nlp, data, model=model) print(list(conceptualizer.pipe(["I have a headache and a fever."]))[0].ents) print(list(conceptualizer.nlp.pipe(["I have a headache and a fever."]))[0].ents) print(conceptualizer("I have a headache and a fever.").ents) print(conceptualizer.nlp("I have a headache and a fever.").ents) 
def test_spaczz():
    """Fuzzy matching via spaczz must recover misspelled entities.

    The fuzzy pipeline should find the same number of entities in the
    misspelled ``text_fuzzy`` as in the clean ``text``.
    """
    import spacy

    import concise_concepts  # noqa: F401
    from concise_concepts.examples.data import data, text, text_fuzzy

    nlp = spacy.load("en_core_web_md")
    nlp.add_pipe("concise_concepts", config={"data": data, "fuzzy": True})
    assert len(nlp(text).ents) == len(nlp(text_fuzzy).ents)


def test_sense2vec():
    """A sense2vec-backed pipeline must detect entities in the example text.

    Downloads the pretrained ``s2v_reddit_2015_md`` archive from the sense2vec
    releases, extracts it (the extracted folder is named ``s2v_old``), and
    loads it as the vector model for the concise_concepts pipe.
    """
    import tarfile

    import requests
    import spacy

    import concise_concepts  # noqa: F401
    from concise_concepts.examples.data import data, text

    model_path = "s2v_old"
    # download .tar.gz file from an URL and extract it to a folder
    url = (
        "https://github.com/explosion/sense2vec/releases/download/v1.0.0/"
        "s2v_reddit_2015_md.tar.gz"
    )
    filename = "s2v_reddit_2015_md.tar.gz"
    r = requests.get(url, allow_redirects=True)
    # Fix: close the file handle deterministically; the previous
    # ``open(...).write(...)`` leaked it until garbage collection.
    with open(filename, "wb") as f:
        f.write(r.content)
    # Fix: close the tar handle even if extraction raises.
    # NOTE(review): extractall() on a downloaded archive is vulnerable to path
    # traversal on older Pythons; acceptable here only because the release URL
    # is pinned and trusted.
    with tarfile.open(filename, "r:gz") as tar:
        tar.extractall()

    nlp = spacy.load("en_core_web_md")
    nlp.add_pipe("concise_concepts", config={"data": data, "model_path": model_path})
    assert len(nlp(text).ents)