[
  {
    "path": ".github/workflows/python-package.yml",
    "content": "# This workflow will install Python dependencies, run tests and lint with a variety of Python versions\n# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions\n\nname: Python package\n\non:\n  push:\n    branches: [main]\n  pull_request:\n    branches: [main]\n\njobs:\n  build:\n    runs-on: ubuntu-latest\n    strategy:\n      fail-fast: false\n      matrix:\n        python-version: [\"3.8\", \"3.9\", \"3.10\"]\n\n    steps:\n      - uses: actions/checkout@v3\n      - name: Set up Python ${{ matrix.python-version }}\n        uses: actions/setup-python@v3\n        with:\n          python-version: ${{ matrix.python-version }}\n      - name: Install dependencies\n        run: |\n          python -m pip install --upgrade pip\n          python -m pip install flake8 pytest pytest-cov\n          python -m pip install poetry\n          poetry export -f requirements.txt -o requirements.txt --without-hashes\n          if [ -f requirements.txt ]; then pip install -r requirements.txt; fi\n          python -m spacy download en_core_web_md\n      - name: Lint with flake8\n        run: |\n          # stop the build if there are Python syntax errors or undefined names\n          flake8 . --count --max-complexity=18 --select=C,E,F,W,B,B950 --ignore=E203,E266,E501,W503 --exclude=.git,__pycache__,build,dist --max-line-length=119 --show-source --statistics\n      - name: Test with pytest\n        run: |\n          pytest --doctest-modules --junitxml=junit/test-results.xml --cov=concise_concepts --cov-report=xml --cov-report=html\n"
  },
  {
    "path": ".github/workflows/python-publish.yml",
    "content": "# This workflow will upload a Python Package using Twine when a release is created\n# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries\n\n# This workflow uses actions that are not certified by GitHub.\n# They are provided by a third-party and are governed by\n# separate terms of service, privacy policy, and support\n# documentation.\n\nname: Upload Python Package\n\non:\n  release:\n    types: [created]\n\npermissions:\n  contents: read\n\njobs:\n  deploy:\n\n    runs-on: ubuntu-latest\n\n    steps:\n    - uses: actions/checkout@v3\n    - name: Set up Python\n      uses: actions/setup-python@v3\n      with:\n        python-version: '3.x'\n    - name: Install dependencies\n      run: |\n        python -m pip install --upgrade pip\n        pip install build\n    - name: Build package\n      run: python -m build\n    - name: Publish package\n      uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29\n      with:\n        user: __token__\n        password: ${{ secrets.PYPI_API_TOKEN }}\n"
  },
  {
    "path": ".gitignore",
    "content": "# Byte-compiled / optimized / DLL files\n__pycache__/\n*.py[cod]\n*$py.class\n\n# C extensions\n*.so\n\n# Distribution / packaging\n.Python\nbuild/\ndevelop-eggs/\ndist/\ndownloads/\neggs/\n.eggs/\nlib/\nlib64/\nparts/\nsdist/\nvar/\nwheels/\npip-wheel-metadata/\nshare/python-wheels/\n*.egg-info/\n.installed.cfg\n*.egg\nMANIFEST\n\n# PyInstaller\n#  Usually these files are written by a python script from a template\n#  before PyInstaller builds the exe, so as to inject date/other infos into it.\n*.manifest\n*.spec\n\n# Installer logs\npip-log.txt\npip-delete-this-directory.txt\n\n# Unit test / coverage reports\nhtmlcov/\n.tox/\n.nox/\n.coverage\n.coverage.*\n.cache\nnosetests.xml\ncoverage.xml\n*.cover\n*.py,cover\n.hypothesis/\n.pytest_cache/\n\n# Translations\n*.mo\n*.pot\n\n# Django stuff:\n*.log\nlocal_settings.py\ndb.sqlite3\ndb.sqlite3-journal\n\n# Flask stuff:\ninstance/\n.webassets-cache\n\n# Scrapy stuff:\n.scrapy\n\n# Sphinx documentation\ndocs/_build/\n\n# PyBuilder\ntarget/\n\n# Jupyter Notebook\n.ipynb_checkpoints\n\n# IPython\nprofile_default/\nipython_config.py\n\n# pyenv\n.python-version\n\n# pipenv\n#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.\n#   However, in case of collaboration, if having platform-specific dependencies or dependencies\n#   having no cross-platform support, pipenv may install dependencies that don't work, or not\n#   install all needed dependencies.\n#Pipfile.lock\n\n# PEP 582; used by e.g. github.com/David-OConnor/pyflow\n__pypackages__/\n\n# Celery stuff\ncelerybeat-schedule\ncelerybeat.pid\n\n# SageMath parsed files\n*.sage.py\n\n# Environments\n.env\n.venv\nenv/\nvenv/\nENV/\nenv.bak/\nvenv.bak/\n\n# Spyder project settings\n.spyderproject\n.spyproject\n\n# Rope project settings\n.ropeproject\n\n# mkdocs documentation\n/site\n\n# mypy\n.mypy_cache/\n.dmypy.json\ndmypy.json\n\n# Pyre type checker\n.pyre/\n/test_spacy.py\n.model\n/concise_concepts/word2vec.model.vectors.npy\n/test.html\n\n# Downloaded models\n*.model\n*.model.*\n*.json\ntest.py\ns2v_old"
  },
  {
    "path": ".pre-commit-config.yaml",
    "content": "repos:\n  - repo: https://github.com/pre-commit/pre-commit-hooks\n    rev: v4.0.1\n    hooks:\n      - id: check-added-large-files\n      - id: end-of-file-fixer\n      - id: check-ast\n      - id: check-case-conflict\n      - id: check-docstring-first\n      - id: check-merge-conflict\n      - id: check-symlinks\n      - id: check-toml\n      - id: check-xml\n      - id: check-yaml\n      - id: destroyed-symlinks\n      - id: detect-private-key\n      - id: fix-encoding-pragma\n  - repo: https://github.com/psf/black\n    rev: 22.3.0\n    hooks:\n      - id: black\n      - id: black-jupyter\n  # Execute isort on all changed files (make sure the version is the same as in pyproject)\n  - repo: https://github.com/pycqa/isort\n    rev: 5.10.1\n    hooks:\n      - id: isort\n  # Execute flake8 on all changed files (make sure the version is the same as in pyproject)\n  - repo: https://github.com/pycqa/flake8\n    rev: 4.0.1\n    hooks:\n      - id: flake8\n        additional_dependencies:\n          [\"flake8-docstrings\", \"flake8-bugbear\", \"pep8-naming\"]\n"
  },
  {
    "path": "CITATION.cff",
    "content": "cff-version: 1.2.0\nmessage: \"If you use this software, please cite it as below.\"\nauthors:\n  - family-names: Berenstein\n    given-names: David\ntitle: \"Concise Concepts - an easy and intuitive approach to few-shot NER using most similar expansion over spaCy embeddings.\"\nversion: 0.8.1\ndate-released: 2022-12-31\n"
  },
  {
    "path": "LICENSE",
    "content": "MIT License\n\nCopyright (c) 2022 Pandora Intelligence\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n"
  },
  {
    "path": "README.md",
    "content": "# Concise Concepts\nWhen you want to apply NER to concise concepts, it is really easy to come up with examples, but pretty difficult to train an entire pipeline. Concise Concepts uses few-shot NER based on word embedding similarity to get you going with ease! Now with entity scoring!\n\n\n[![Python package](https://github.com/Pandora-Intelligence/concise-concepts/actions/workflows/python-package.yml/badge.svg?branch=main)](https://github.com/Pandora-Intelligence/concise-concepts/actions/workflows/python-package.yml)\n[![Current Release Version](https://img.shields.io/github/release/pandora-intelligence/concise-concepts.svg?style=flat-square&logo=github)](https://github.com/pandora-intelligence/concise-concepts/releases)\n[![pypi Version](https://img.shields.io/pypi/v/concise-concepts.svg?style=flat-square&logo=pypi&logoColor=white)](https://pypi.org/project/concise-concepts/)\n[![PyPi downloads](https://static.pepy.tech/personalized-badge/concise-concepts?period=total&units=international_system&left_color=grey&right_color=orange&left_text=pip%20downloads)](https://pypi.org/project/concise-concepts/)\n[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg?style=flat-square)](https://github.com/ambv/black)\n\n\n## Usage\nThis library defines matching patterns based on the most similar words found in each group, which are used to fill a [spaCy EntityRuler](https://spacy.io/api/entityruler). To better understand the rule definition, I recommend playing around with the [spaCy Rule-based Matcher Explorer](https://demos.explosion.ai/matcher).\n\n### Tutorials\n- [TechVizTheDataScienceGuy](https://www.youtube.com/c/TechVizTheDataScienceGuy) created a [nice tutorial](https://prakhar-mishra.medium.com/few-shot-named-entity-recognition-in-natural-language-processing-92d31f0d1143) on how to use it.\n\n- [I](https://www.linkedin.com/in/david-berenstein-1bab11105/) created a [tutorial](https://www.rubrix.ml/blog/concise-concepts-rubrix/) in collaboration with Rubrix.\n\nThe section [Matching Pattern Rules](#matching-pattern-rules) expands on the construction, analysis and customization of these matching patterns.\n\n\n# Install\n\n```\npip install concise-concepts\n```\n\n# Quickstart\n\nTake a look at the [configuration section](#configuration) for more info.\n\n## Spacy Pipeline Component\n\nNote that [custom embedding models](#custom-embedding-models) are passed via `model_path`.\n\n```python\nimport spacy\nfrom spacy import displacy\n\ndata = {\n    \"fruit\": [\"apple\", \"pear\", \"orange\"],\n    \"vegetable\": [\"broccoli\", \"spinach\", \"tomato\"],\n    \"meat\": [\"beef\", \"pork\", \"turkey\", \"duck\"]\n}\n\ntext = \"\"\"\n    Heat the oil in a large pan and add the Onion, celery and carrots.\n    Then, cook over a medium–low heat for 10 minutes, or until softened.\n    Add the courgette, garlic, red peppers and oregano and cook for 2–3 minutes.\n    Later, add some oranges and chickens.
\"\"\"\n\nnlp = spacy.load(\"en_core_web_md\", disable=[\"ner\"])\n\nnlp.add_pipe(\n    \"concise_concepts\",\n    config={\n        \"data\": data,\n        \"ent_score\": True,  # Entity Scoring section\n        \"verbose\": True,\n        \"exclude_pos\": [\"VERB\", \"AUX\"],\n        \"exclude_dep\": [\"DOBJ\", \"PCOMP\"],\n        \"include_compound_words\": False,\n        \"json_path\": \"./fruitful_patterns.json\",\n        \"topn\": [100, 500, 300]\n    },\n)\ndoc = nlp(text)\n\noptions = {\n    \"colors\": {\"fruit\": \"darkorange\", \"vegetable\": \"limegreen\", \"meat\": \"salmon\"},\n    \"ents\": [\"fruit\", \"vegetable\", \"meat\"],\n}\n\nents = doc.ents\nfor ent in ents:\n    new_label = f\"{ent.label_} ({ent._.ent_score:.0%})\"\n    options[\"colors\"][new_label] = options[\"colors\"].get(ent.label_.lower(), None)\n    options[\"ents\"].append(new_label)\n    ent.label_ = new_label\ndoc.ents = ents\n\ndisplacy.render(doc, style=\"ent\", options=options)\n```\n![](https://raw.githubusercontent.com/Pandora-Intelligence/concise-concepts/master/img/example.png)\n\n## Standalone\n\nThis can be useful when iterating over few-shot training data without continuously reloading larger models.\nNote that [custom embedding models](#custom-embedding-models) are passed via `model`.\n\n```python\nimport gensim.downloader\nimport spacy\n\nfrom concise_concepts import Conceptualizer\n\nmodel = gensim.downloader.load(\"fasttext-wiki-news-subwords-300\")\nnlp = spacy.load(\"en_core_web_sm\")\ndata = {\n    \"disease\": [\"cancer\", \"diabetes\", \"heart disease\", \"influenza\", \"pneumonia\"],\n    \"symptom\": [\"headache\", \"fever\", \"cough\", \"nausea\", \"vomiting\", \"diarrhea\"],\n}\nconceptualizer = Conceptualizer(nlp, data, model)\nconceptualizer.nlp(\"I have a headache and a fever.\").ents\n\ndata = {\n    \"disease\": [\"cancer\", \"diabetes\"],\n    \"symptom\": [\"headache\", \"fever\"],\n}\nconceptualizer = Conceptualizer(nlp, data, model)\nconceptualizer.nlp(\"I have a headache and a fever.\").ents\n```\n\n# Configuration\n## Matching Pattern Rules\nA general introduction to the usage of matching patterns is given in the [usage section](#usage).\n### Customizing Matching Pattern Rules\nEven though the baseline parameters provide a decent result, the construction of these matching rules can be customized via the config passed to the spaCy pipeline, as shown in the sketch after this list.\n\n - `exclude_pos`: A list of POS tags to be excluded from the rule-based match.\n - `exclude_dep`: A list of dependencies to be excluded from the rule-based match.\n - `include_compound_words`: If True, it will include compound words in the entity. For example, if the entity is \"New York\", it will also include \"New York City\" as an entity.\n - `case_sensitive`: Whether to match the case of the words in the text.\n
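\nA minimal configuration sketch combining these options; the values are illustrative, not recommendations:\n\n```python\nimport spacy\n\nimport concise_concepts  # noqa: F401\n\ndata = {\n    \"fruit\": [\"apple\", \"pear\", \"orange\"],\n    \"vegetable\": [\"broccoli\", \"spinach\", \"tomato\"],\n}\n\nnlp = spacy.load(\"en_core_web_md\", disable=[\"ner\"])\nnlp.add_pipe(\n    \"concise_concepts\",\n    config={\n        \"data\": data,\n        \"exclude_pos\": [\"VERB\", \"AUX\"],\n        \"exclude_dep\": [],\n        \"include_compound_words\": True,\n        \"case_sensitive\": False,\n    },\n)\n```\n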
\n### Analyze Matching Pattern Rules\nTo motivate actually looking at the data and support interpretability, the matching patterns that have been generated are stored as `./matching_patterns.json`. This behavior can be changed by using the `json_path` variable via the config passed to the spaCy pipeline.\n\n\n## Fuzzy matching using `spaczz`\n\n - `fuzzy`: A boolean value that determines whether to use fuzzy matching\n\n```python\ndata = {\n    \"fruit\": [\"apple\", \"pear\", \"orange\"],\n    \"vegetable\": [\"broccoli\", \"spinach\", \"tomato\"],\n    \"meat\": [\"beef\", \"pork\", \"fish\", \"lamb\"]\n}\n\nnlp.add_pipe(\"concise_concepts\", config={\"data\": data, \"fuzzy\": True})\n```\n\n## Most Similar Word Expansion\n\n- `topn`: Use a specific number of words to expand over.\n\n```python\ndata = {\n    \"fruit\": [\"apple\", \"pear\", \"orange\"],\n    \"vegetable\": [\"broccoli\", \"spinach\", \"tomato\"],\n    \"meat\": [\"beef\", \"pork\", \"fish\", \"lamb\"]\n}\n\ntopn = [50, 50, 150]\n\nassert len(topn) == len(data)\n\nnlp.add_pipe(\"concise_concepts\", config={\"data\": data, \"topn\": topn})\n```\n\n## Entity Scoring\n\n- `ent_score`: Use embedding-based word similarity to score entities against their groups\n\n```python\nimport spacy\n\ndata = {\n    \"ORG\": [\"Google\", \"Apple\", \"Amazon\"],\n    \"GPE\": [\"Netherlands\", \"France\", \"China\"],\n}\n\ntext = \"\"\"Sony was founded in Japan.\"\"\"\n\nnlp = spacy.load(\"en_core_web_lg\")\nnlp.add_pipe(\"concise_concepts\", config={\"data\": data, \"ent_score\": True, \"case_sensitive\": True})\ndoc = nlp(text)\n\nprint([(ent.text, ent.label_, ent._.ent_score) for ent in doc.ents])\n# output\n#\n# [('Sony', 'ORG', 0.5207586), ('Japan', 'GPE', 0.7371268)]\n```\n\n## Custom Embedding Models\n\n- `model_path`: Use a custom `sense2vec.Sense2Vec`, `gensim.Word2Vec`, `gensim.FastText`, or `gensim.KeyedVectors` model, a pretrained model from the [gensim](https://radimrehurek.com/gensim/downloader.html) library, or a custom model path. To use a `sense2vec.Sense2Vec` model, take a look [here](https://github.com/explosion/sense2vec#pretrained-vectors).\n- `model`: within [standalone usage](#standalone), it is possible to pass these models directly.\n\n```python\ndata = {\n    \"fruit\": [\"apple\", \"pear\", \"orange\"],\n    \"vegetable\": [\"broccoli\", \"spinach\", \"tomato\"],\n    \"meat\": [\"beef\", \"pork\", \"fish\", \"lamb\"]\n}\n\n# model from https://radimrehurek.com/gensim/downloader.html or path to local file\nmodel_path = \"glove-wiki-gigaword-300\"\n\nnlp.add_pipe(\"concise_concepts\", config={\"data\": data, \"model_path\": model_path})\n```\n"
  },
  {
    "path": "concise_concepts/__init__.py",
    "content": "# -*- coding: utf-8 -*-\nfrom typing import List, Union\n\nfrom gensim.models import FastText, Word2Vec\nfrom gensim.models.keyedvectors import KeyedVectors\nfrom spacy.language import Language\n\nfrom .conceptualizer import Conceptualizer\n\n\n@Language.factory(\n    \"concise_concepts\",\n    default_config={\n        \"data\": None,\n        \"topn\": None,\n        \"model_path\": None,\n        \"word_delimiter\": \"_\",\n        \"ent_score\": False,\n        \"exclude_pos\": [\n            \"VERB\",\n            \"AUX\",\n            \"ADP\",\n            \"DET\",\n            \"CCONJ\",\n            \"PUNCT\",\n            \"ADV\",\n            \"ADJ\",\n            \"PART\",\n            \"PRON\",\n        ],\n        \"exclude_dep\": [],\n        \"include_compound_words\": False,\n        \"fuzzy\": False,\n        \"case_sensitive\": False,\n        \"json_path\": \"./matching_patterns.json\",\n        \"verbose\": True,\n    },\n)\ndef make_concise_concepts(\n    nlp: Language,\n    name: str,\n    data: Union[dict, list],\n    topn: Union[list, None],\n    model_path: Union[str, FastText, Word2Vec, KeyedVectors, None],\n    word_delimiter: str,\n    ent_score: bool,\n    exclude_pos: List[str],\n    exclude_dep: List[str],\n    include_compound_words: bool,\n    fuzzy: bool,\n    case_sensitive: bool,\n    json_path: str,\n    verbose: bool,\n):\n    return Conceptualizer(\n        nlp=nlp,\n        data=data,\n        topn=topn,\n        model=model_path,\n        word_delimiter=word_delimiter,\n        ent_score=ent_score,\n        exclude_pos=exclude_pos,\n        exclude_dep=exclude_dep,\n        include_compound_words=include_compound_words,\n        fuzzy=fuzzy,\n        case_sensitive=case_sensitive,\n        json_path=json_path,\n        verbose=verbose,\n        name=name,\n    )\n"
  },
  {
    "path": "concise_concepts/conceptualizer/Conceptualizer.py",
    "content": "# -*- coding: utf-8 -*-\nimport json\nimport logging\nimport re\nimport types\nfrom copy import deepcopy\nfrom pathlib import Path\nfrom typing import List, Union\n\nimport gensim.downloader\nimport spaczz  # noqa: F401\nfrom gensim import matutils  # utility fnc for pickling, common scipy operations etc\nfrom gensim.models import FastText, Word2Vec\nfrom gensim.models.keyedvectors import KeyedVectors\nfrom numpy import argmax, dot\nfrom sense2vec import Sense2Vec\nfrom spacy import Language, util\nfrom spacy.tokens import Doc, Span\n\nlogger = logging.getLogger(__name__)\n\nPOS_LIST = [\n    \"ADJ\",\n    \"ADP\",\n    \"ADV\",\n    \"AUX\",\n    \"CONJ\",\n    \"CCONJ\",\n    \"DET\",\n    \"INTJ\",\n    \"NOUN\",\n    \"NUM\",\n    \"PART\",\n    \"PRON\",\n    \"PROPN\",\n    \"PUNCT\",\n    \"SCONJ\",\n    \"SYM\",\n    \"VERB\",\n    \"X\",\n    \"SPACE\",\n]\n\n\nclass Conceptualizer:\n    def __init__(\n        self,\n        nlp: Language,\n        data: dict = None,\n        model: Union[str, FastText, KeyedVectors, Word2Vec, Sense2Vec] = None,\n        topn: list = None,\n        word_delimiter: str = \"_\",\n        ent_score: bool = False,\n        exclude_pos: list = None,\n        exclude_dep: list = None,\n        include_compound_words: bool = False,\n        case_sensitive: bool = False,\n        fuzzy: bool = False,\n        json_path: str = \"./matching_patterns.json\",\n        verbose: bool = True,\n        name: str = \"concise_concepts\",\n    ):\n        \"\"\"\n        Expand each concept group in ``data`` with the most similar words from the\n        embedding model and build the matching patterns for a spaCy entity ruler.\n\n        :param nlp: The spaCy model to use.\n        :type nlp: Language\n        :param name: The name of the pipeline component.\n        :type name: str\n        :param data: A dictionary of the words you want to match. The keys are the classes you want to match,\n            and the values are the words you want to expand over.\n        :type data: dict\n        :param topn: The number of words to be returned for each class.\n        :type topn: list\n        :param model: A loaded gensim or sense2vec model, or the name or path of a model to load.\n            If None, the internal embeddings of the spaCy model are used.\n        :param word_delimiter: The delimiter used to separate words in the model dictionary, defaults to _ (optional)\n        :param ent_score: If True, the extension \"ent_score\" will be added to the Span object. This will be the score of\n            the entity, defaults to False (optional)\n        :param exclude_pos: A list of POS tags to exclude from the rule based match\n        :param exclude_dep: A list of dependencies to exclude from the rule based match\n        :param include_compound_words: If True, it will include compound words in the entity. For example,
            if the entity is \"New York\", it will also include \"New York City\" as an entity, defaults to False (optional)\n        :param case_sensitive: Whether to match the case of the words in the text, defaults to False (optional)\n        :param fuzzy: If True, additionally register fuzzy matching patterns via spaczz, defaults to False (optional)\n        :param json_path: Where the generated matching patterns are stored, defaults to ./matching_patterns.json (optional)\n        :param verbose: Whether to log warnings about words missing from the vector model, defaults to True (optional)\n
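\n        Example (an illustrative sketch; assumes a spaCy model with internal\n        vectors, e.g. ``en_core_web_md``, is installed)::\n\n            import spacy\n\n            from concise_concepts import Conceptualizer\n\n            nlp = spacy.load(\"en_core_web_md\")\n            data = {\"fruit\": [\"apple\", \"pear\"], \"meat\": [\"beef\", \"pork\"]}\n            conceptualizer = Conceptualizer(nlp, data)\n            doc = conceptualizer.nlp(\"I like apples and beef.\")\n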
        \"\"\"\n        if not data:\n            raise ValueError(\"You must provide a dictionary of words to match\")\n        self.verbose = verbose\n        self.log_cache = {\"key\": list(), \"word\": list(), \"key_word\": list()}\n        if Span.has_extension(\"ent_score\"):\n            Span.remove_extension(\"ent_score\")\n        if ent_score:\n            Span.set_extension(\"ent_score\", default=None)\n        self.ent_score = ent_score\n        self.data = data\n        self.name = name\n        self.nlp = nlp\n        self.fuzzy = fuzzy\n        self.topn = topn\n        self.model = model\n        self.match_rule = {}\n        self.set_exclude_pos(exclude_pos)\n        self.set_exclude_dep(exclude_dep)\n        self.json_path = json_path\n        self.include_compound_words = include_compound_words\n        self.case_sensitive = case_sensitive\n        self.word_delimiter = word_delimiter\n        if \"lemmatizer\" not in self.nlp.component_names:\n            logger.warning(\n                \"No lemmatizer found in spacy pipeline. Consider adding it for matching\"\n                \" on LEMMA instead of exact text.\"\n            )\n            self.match_key = \"TEXT\"\n        else:\n            self.match_key = \"LEMMA\"\n\n        for ruler in [\"entity_ruler\", \"spaczz_ruler\"]:\n            if ruler in self.nlp.component_names:\n                logger.warning(\n                    f\"{ruler} already exists in the pipeline. Removing old rulers\"\n                )\n                self.nlp.remove_pipe(ruler)\n        self.run()\n\n    def set_exclude_dep(self, exclude_dep: list):\n        if exclude_dep is None:\n            exclude_dep = []\n        if exclude_dep:\n            self.match_rule[\"DEP\"] = {\"NOT_IN\": exclude_dep}\n\n    def set_exclude_pos(self, exclude_pos: list):\n        if exclude_pos is None:\n            exclude_pos = [\n                \"VERB\",\n                \"AUX\",\n                \"ADP\",\n                \"DET\",\n                \"CCONJ\",\n                \"PUNCT\",\n                \"ADV\",\n                \"ADJ\",\n                \"PART\",\n                \"PRON\",\n            ]\n        if exclude_pos:\n            self.match_rule[\"POS\"] = {\"NOT_IN\": exclude_pos}\n            self.exclude_pos = exclude_pos\n        else:\n            self.exclude_pos = []\n\n    def run(self) -> None:\n        self.check_validity_path()\n        self.set_gensim_model()\n        self.verify_data(self.verbose)\n        self.determine_topn()\n        self.expand_concepts()\n        # settle words around overlapping concepts\n        for _ in range(5):\n            self.expand_concepts()\n            self.infer_original_data()\n            self.resolve_overlapping_concepts()\n        self.infer_original_data()\n        self.create_conceptual_patterns()\n        self.set_concept_dict()\n\n        if not self.ent_score:\n            del self.kv\n\n        self.data_upper = {k.upper(): v for k, v in self.data.items()}\n\n    def check_validity_path(self) -> None:\n        \"\"\"\n        If the path is a file, create the parent directory if it doesn't exist.\n        If the path is a directory, create the directory and set the path to the\n        default file name.\n        \"\"\"\n        if self.json_path:\n            if Path(self.json_path).suffix:\n                Path(self.json_path).parents[0].mkdir(parents=True, exist_ok=True)\n            else:\n                Path(self.json_path).mkdir(parents=True, exist_ok=True)\n                old_path = str(self.json_path)\n                self.json_path = Path(self.json_path) / \"matching_patterns.json\"\n                logger.warning(\n                    f\"Path ´{old_path}´ is a directory, not a file. Setting\"\n                    f\" ´json_path´ to {self.json_path}\"\n                )\n\n    def determine_topn(self) -> None:\n        \"\"\"\n        If the user doesn't specify a topn value for each class,\n        then the topn value for each class is set to 100\n        \"\"\"\n        if self.topn is None:\n            self.topn_dict = {key: 100 for key in self.data}\n        else:\n            num_classes = len(self.data)\n            assert (\n                len(self.topn) == num_classes\n            ), f\"Provide a topn integer for each of the {num_classes} classes.\"\n            self.topn_dict = dict(zip(self.data, self.topn))\n\n    def set_gensim_model(self) -> None:\n        \"\"\"\n        If the model_path is not None, then we try to load the model from the path.\n        If it's not a valid path, then we raise an exception.\n        If the model_path is None, then we load the model from the internal embeddings of the spacy model\n        \"\"\"\n        if isinstance(self.model, str):\n            if self.model:\n                available_models = gensim.downloader.info()[\"models\"]\n                if self.model in available_models:\n                    self.kv = gensim.downloader.load(self.model)\n                else:\n                    try:\n                        self.kv = Sense2Vec().from_disk(self.model)\n                    except Exception as e0:\n                        try:\n                            self.kv = FastText.load(self.model).wv\n                        except Exception as e1:\n                            try:\n                                self.kv = Word2Vec.load(self.model).wv\n                            except Exception as e2:\n                                try:\n                                    self.kv = KeyedVectors.load(self.model)\n                                except Exception as e3:\n                                    try:\n                                        self.kv = KeyedVectors.load_word2vec_format(\n                                            self.model, binary=True\n                                        )\n                                    except Exception as e4:\n                                        raise Exception(\n                                            \"Not a valid model. Sense2Vec, FastText,\"\n                                            f\" Word2Vec, KeyedVectors.\\n {e0}\\n {e1}\\n\"\n                                            f\" {e2}\\n {e3}\\n {e4}\"\n                                        )\n        elif isinstance(self.model, (FastText, Word2Vec)):\n            self.kv = self.model.wv\n        elif isinstance(self.model, KeyedVectors):\n            self.kv = self.model\n        elif isinstance(self.model, Sense2Vec):\n            self.kv = self.model\n        else:\n            wordList = []\n            vectorList = []\n\n            assert len(\n                self.nlp.vocab.vectors\n            ), \"Choose a spaCy model with internal embeddings, e.g. md or lg.\"\n\n            for key, vector in self.nlp.vocab.vectors.items():\n                wordList.append(self.nlp.vocab.strings[key])\n                vectorList.append(vector)\n\n            self.kv = KeyedVectors(self.nlp.vocab.vectors_length)\n\n            self.kv.add_vectors(wordList, vectorList)\n\n    def verify_data(self, verbose: bool = True) -> None:\n        \"\"\"\n        It takes a dictionary of lists of words, and returns a dictionary of lists of words,\n        where each word in the list is present in the word2vec model\n        \"\"\"\n        verified_data: dict[str, list[str]] = dict()\n        for key, value in self.data.items():\n            verified_values = []\n            present_key = self._check_presence_vocab(key)\n            if present_key:\n                key = present_key\n            if not present_key and verbose and key not in self.log_cache[\"key\"]:\n                logger.warning(f\"key ´{key}´ not present in vector model\")\n                self.log_cache[\"key\"].append(key)\n            for word in value:\n                present_word = self._check_presence_vocab(word)\n                if present_word:\n                    verified_values.append(present_word)\n                elif verbose and word not in self.log_cache[\"word\"]:\n                    logger.warning(\n                        f\"word ´{word}´ from key ´{key}´ not present in vector model\"\n                    )\n                    self.log_cache[\"word\"].append(word)\n            verified_data[key] = verified_values\n            if not len(verified_values):\n                msg = (\n                    f\"None of the entries for key {key} are present in the vector\"\n                    \" model. \"\n                )\n                if present_key:\n                    logger.warning(\n                        msg + f\"Using {present_key} as word to expand over instead.\"\n                    )\n                    verified_data[key] = [present_key]\n                else:\n                    raise Exception(msg)\n        self.data = deepcopy(verified_data)\n        self.original_data = deepcopy(verified_data)\n\n    def expand_concepts(self) -> None:\n        \"\"\"\n        For each key in the data dictionary, find the topn most similar words to the key and the values in the data\n        dictionary, and add those words to the values in the data dictionary\n        \"\"\"\n        for key in self.data:\n            present_key = self._check_presence_vocab(key)\n            if present_key:\n                key_list = [present_key]\n            else:\n                key_list = []\n            if isinstance(self.kv, Sense2Vec):\n                similar = self.kv.most_similar(\n                    self.data[key] + key_list,\n                    n=self.topn_dict[key],\n                )\n            else:\n                similar = self.kv.most_similar(\n                    self.data[key] + key_list,\n                    topn=self.topn_dict[key],\n                )\n            self.data[key] = list({word for word, _ratio in similar})\n\n    def resolve_overlapping_concepts(self) -> None:\n        \"\"\"\n        Keep only the words in each concept that are more similar to their own\n        concept key than to any of the other concept keys.\n        \"\"\"\n        for key in self.data:\n            self.data[key] = [\n                word\n                for word in self.data[key]\n                if key == self.most_similar_to_given(word, list(self.data.keys()))\n            ]\n\n    def most_similar_to_given(self, key1, keys_list):\n        \"\"\"Get the `key` from `keys_list` most similar to `key1`.\"\"\"\n        return keys_list[argmax([self.similarity(key1, key) for key in keys_list])]\n\n    def similarity(self, w1, w2):\n        \"\"\"Compute cosine similarity between two keys.\n\n        Parameters\n        ----------\n        w1 : str\n            Input key.\n        w2 : str\n            Input key.\n\n        Returns\n        -------\n        float\n            Cosine similarity between `w1` and `w2`.\n\n        \"\"\"\n        return dot(matutils.unitvec(self.kv[w1]), matutils.unitvec(self.kv[w2]))\n\n    def infer_original_data(self) -> None:\n        \"\"\"\n        Merge the original entries back into the expanded data, then remove from\n        each concept any word that belongs to another concept's original entries.\n        \"\"\"\n        for key in self.data:\n            self.data[key] = list(set(self.data[key] + self.original_data[key]))\n\n        for key_x in self.data:\n            for key_y in self.data:\n                if key_x != key_y:\n                    self.data[key_x] = [\n                        word\n                        for word in self.data[key_x]\n                        if word not in self.original_data[key_y]\n                    ]\n\n    def lemmatize_concepts(self) -> None:\n        \"\"\"\n        For each key in the data dictionary,\n        the function takes the list of concepts associated with that key, and lemmatizes\n        each concept.\n        \"\"\"\n        for key in self.data:\n            self.data[key] = list(\n                set([doc[0].lemma_ for doc in self.nlp.pipe(self.data[key])])\n            )\n\n    def create_conceptual_patterns(self) -> None:\n
        \"\"\"\n        For each key in the data dictionary, create token patterns for every word\n        associated with that key. Patterns match on LEMMA when a lemmatizer is\n        available (TEXT otherwise), optionally with fuzzy matching and\n        compound-word expansion, and are registered on an entity ruler.\n        \"\"\"\n        lemma_patterns = []\n        fuzzy_patterns = []\n\n        def add_patterns(input_dict: dict) -> None:\n            \"\"\"\n            It creates a list of dictionaries that can be used for a spaCy entity ruler\n\n            :param input_dict: a dictionary\n            :type input_dict: dict\n            \"\"\"\n\n            if isinstance(self.kv, Sense2Vec):\n                input_dict = {\n                    key.split(\"|\")[0]: [word.split(\"|\")[0] for word in value]\n                    for key, value in input_dict.items()\n                }\n            for key in input_dict:\n                words = input_dict[key]\n                for word in words:\n                    if word != key:\n                        word_parts = self._split_word(word)\n                        op_pattern = {\n                            \"TEXT\": {\"REGEX\": \"|\".join([\" \", \"-\", \"_\", \"/\"])},\n                            \"OP\": \"*\",\n                        }\n                        partial_pattern_parts = []\n                        lemma_pattern_parts = []\n                        for partial_pattern in word_parts:\n                            word_part = partial_pattern\n                            if self.fuzzy:\n                                partial_pattern = {\n                                    \"FUZZY\": word_part,\n                                }\n                            partial_pattern = {\"TEXT\": partial_pattern}\n                            lemma_pattern_parts.append({self.match_key: word_part})\n                            lemma_pattern_parts.append(op_pattern)\n                            partial_pattern_parts.append(partial_pattern)\n                            partial_pattern_parts.append(op_pattern)\n\n                        pattern = {\n                            \"label\": key.upper(),\n                            \"pattern\": partial_pattern_parts[:-1],\n                            \"id\": f\"{word}_individual\",\n                        }\n\n                        # add the text-based pattern (formatted for fuzzy matching when enabled)\n                        fuzzy_patterns.append(pattern)\n\n                        # add lemma matching\n                        if lemma_pattern_parts:\n                            lemma_pattern = {\n                                \"label\": key.upper(),\n                                \"pattern\": lemma_pattern_parts[:-1],\n                                \"id\": f\"{word}_lemma_individual\",\n                            }\n                            lemma_patterns.append(lemma_pattern)\n\n                        if self.include_compound_words:\n                            compound_rule = [\n                                {\n                                    \"DEP\": {\"IN\": [\"amod\", \"compound\"]},\n                                    \"OP\": \"*\",\n                                }\n                            ]\n                            fuzzy_patterns.append(\n                                {\n                                    \"label\": key.upper(),\n                                    \"pattern\": compound_rule\n                                    + partial_pattern_parts[:-1]\n                                    + compound_rule,\n                                    \"id\": f\"{word}_compound\",\n                                }\n                            )\n                            if lemma_pattern_parts:\n                                lemma_patterns.append(\n                                    {\n                                        \"label\": key.upper(),\n                                        \"pattern\": compound_rule\n                                        + lemma_pattern_parts[:-1]\n                                        + compound_rule,\n                                        \"id\": f\"{word}_lemma_compound\",\n                                    }\n                                )\n
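\n        # Illustrative shape of a generated pattern, for a hypothetical word\n        # \"apple\" in the group \"fruit\"; the real output depends on the expanded\n        # vocabulary and is written to the configured json_path:\n        #   {\"label\": \"FRUIT\", \"pattern\": [{\"LEMMA\": \"apple\"}], \"id\": \"apple_lemma_individual\"}\n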
        add_patterns(self.data)\n\n        if self.json_path:\n            with open(self.json_path, \"w\") as f:\n                json.dump(lemma_patterns + fuzzy_patterns, f)\n\n        config = {\"overwrite_ents\": True}\n        ruler_config = dict(config)\n        if not self.case_sensitive:\n            # LOWER makes the entity ruler match case-insensitively\n            ruler_config[\"phrase_matcher_attr\"] = \"LOWER\"\n\n        self.ruler = self.nlp.add_pipe(\"entity_ruler\", config=ruler_config)\n        self.ruler.add_patterns(lemma_patterns)\n\n        # Add spaczz entity ruler if fuzzy\n        if self.fuzzy:\n            for pattern in fuzzy_patterns:\n                pattern[\"type\"] = \"token\"\n            self.fuzzy_ruler = self.nlp.add_pipe(\"spaczz_ruler\", config=config)\n            self.fuzzy_ruler.add_patterns(fuzzy_patterns)\n\n    def __call__(self, doc: Doc) -> Doc:\n        \"\"\"\n        It takes a doc object and assigns a score to each entity in the doc object\n\n        :param doc: Doc\n        :type doc: Doc\n        :return: The processed Doc\n        \"\"\"\n        if isinstance(doc, str):\n            doc = self.nlp(doc)\n        elif isinstance(doc, Doc):\n            if self.ent_score:\n                doc = self.assign_score_to_entities(doc)\n\n        return doc\n\n    def pipe(self, stream, batch_size=128) -> Doc:\n        \"\"\"\n        It takes a stream of documents, and for each document,\n        it assigns a score to each entity in the document\n\n        :param stream: a generator of documents\n        :param batch_size: The number of documents to be processed at a time, defaults to 128 (optional)\n        \"\"\"\n        if isinstance(stream, str):\n            stream = [stream]\n\n        if not isinstance(stream, types.GeneratorType):\n            stream = self.nlp.pipe(stream, batch_size=batch_size)\n\n        for docs in util.minibatch(stream, size=batch_size):\n            for doc in docs:\n                if self.ent_score:\n                    doc = self.assign_score_to_entities(doc)\n                yield doc\n\n    def assign_score_to_entities(self, doc: Doc) -> Doc:\n        \"\"\"\n        The function takes a spaCy document as input and assigns a score to each entity in the document.
The score is\n        calculated using the word embeddings of the entity and the concept.\n        The score is assigned to the entity using the\n        `._.ent_score` attribute\n\n        :param doc: Doc\n        :type doc: Doc\n        :return: The doc object with the entities and their scores.\n        \"\"\"\n        ents = doc.ents\n        for ent in ents:\n            if ent.label_ in self.data_upper:\n                ent_text = ent.text\n\n                # get word part representations\n                if self._check_presence_vocab(ent_text):\n                    entity = [self._check_presence_vocab(ent_text)]\n                else:\n                    entity = []\n                    for part in self._split_word(ent_text):\n                        present_part = self._check_presence_vocab(part)\n                        if present_part:\n                            entity.append(present_part)\n\n                # get concepts to match\n                concept = self.concept_data.get(ent.label_, None)\n\n                # compare set similarities\n                if entity and concept:\n                    ent._.ent_score = self.kv.n_similarity(entity, concept)\n                else:\n                    ent._.ent_score = 0\n                    if self.verbose:\n                        if f\"{ent_text}_{concept}\" not in self.log_cache[\"key_word\"]:\n                            logger.warning(\n                                f\"Entity ´{ent.text}´ and/or label ´{concept}´ not\"\n                                \" found in vector model. Nothing to compare to, so\"\n                                \" setting ent._.ent_score to 0.\"\n                            )\n                            self.log_cache[\"key_word\"].append(f\"{ent_text}_{concept}\")\n            else:\n                ent._.ent_score = 0\n                if self.verbose:\n                    if ent.text not in self.log_cache[\"word\"]:\n                        logger.warning(\n                            f\"Entity ´{ent.text}´ not found in vector model. Nothing to\"\n                            \" compare to, so setting ent._.ent_score to 0.\"\n                        )\n                        self.log_cache[\"word\"].append(ent.text)\n        doc.ents = ents\n        return doc\n\n    def set_concept_dict(self):\n        self.concept_data = {k.upper(): v for k, v in self.data.items()}\n        for ent_label in self.concept_data:\n            concept = []\n            for word in self.concept_data[ent_label]:\n                present_word = self._check_presence_vocab(word)\n                if present_word:\n                    concept.append(present_word)\n            self.concept_data[ent_label] = concept\n\n    def _split_word(self, word: str) -> List[str]:\n        \"\"\"\n        It splits a word into a list of subwords, using the word delimiter\n\n        :param word: str\n        :type word: str\n        :return: A list of strings or any.\n        \"\"\"\n        return re.split(f\"[{re.escape(self.word_delimiter)}]+\", word)\n\n    def _check_presence_vocab(self, word: str) -> str:\n        \"\"\"\n        If the word is not lowercase and the case_sensitive flag is set to False, then check if the lowercase version of\n        the word is in the vocabulary. If it is, return the lowercase version of the word. 
Otherwise, check the word\n        as-is\n\n        :param word: The word to check for presence in the vocabulary\n        :type word: str\n        :return: The matching vocabulary entry if the word is present, otherwise None\n        \"\"\"\n        word = word.replace(\" \", \"_\")\n        if not word.islower() and not self.case_sensitive:\n            present_word = self.__check_presence_vocab(word.lower())\n            if present_word:\n                return present_word\n        return self.__check_presence_vocab(word)\n\n    def __check_presence_vocab(self, word: str) -> str:\n        \"\"\"\n        Return the word if it is present in the vector model's vocabulary; for\n        Sense2Vec, return the best matching sense. None is returned when the\n        word is absent.\n\n        :param word: str - the word to check\n        :type word: str\n        :return: The word, the best sense, or None\n        \"\"\"\n        if isinstance(self.kv, Sense2Vec):\n            return self.kv.get_best_sense(word, (set(POS_LIST) - set(self.exclude_pos)))\n        if word in self.kv:\n            return word\n        return None\n"
  },
  {
    "path": "concise_concepts/conceptualizer/__init__.py",
    "content": "# -*- coding: utf-8 -*-\nfrom .Conceptualizer import Conceptualizer\n\n__all__ = [\"Conceptualizer\"]\n"
  },
  {
    "path": "concise_concepts/examples/__init__.py",
    "content": ""
  },
  {
    "path": "concise_concepts/examples/data.py",
    "content": "# -*- coding: utf-8 -*-\ntext = \"\"\"\n    Heat the oil in a large pan and add the Onion, celery and carrots.\n    Then, cook over a medium–low heat for 10 minutes, or until softened.\n    Add the courgette, garlic, red peppers and oregano and cook for 2–3 minutes.\n    Later, add some oranges, chickens. \"\"\"\n\ntext_fuzzy = \"\"\"\n    Heat the oil in a large pan and add the Onion, celery and carots.\n    Then, cook over a medium–low heat for 10 minutes, or until softened.\n    Add the courgette, garlic, red peppers and oregano and cook for 2–3 minutes.\n    Later, add some oranges, chickens. \"\"\"\n\ndata = {\n    \"fruit\": [\"apple\", \"pear\", \"orange\"],\n    \"vegetable\": [\"broccoli\", \"spinach\", \"tomato\"],\n    \"meat\": [\"chicken\", \"beef\", \"pork\", \"fish\", \"lamb\"],\n}\n"
  },
  {
    "path": "concise_concepts/examples/example_gensim_custom_model.py",
    "content": "# -*- coding: utf-8 -*-\nimport spacy\nfrom gensim.models import Word2Vec\nfrom gensim.test.utils import common_texts\n\nimport concise_concepts  # noqa: F401\n\ndata = {\"human\": [\"trees\"], \"interface\": [\"computer\"]}\n\ntext = (\n    \"believe me, it's the slowest mobile I saw. Don't go on screen and Battery, it is\"\n    \" an extremely slow mobile phone and takes ages to open and navigate. Forget about\"\n    \" heavy use, it can't handle normal regular use. I made a huge mistake but pls\"\n    \" don't buy this mobile. It's only a few months and I am thinking to change it. Its\"\n    \" dam SLOW SLOW SLOW.\"\n)\n\nmodel = Word2Vec(\n    sentences=common_texts, vector_size=100, window=5, min_count=1, workers=4\n)\nmodel.save(\"word2vec.model\")\nmodel_path = \"word2vec.model\"\n\nnlp = spacy.blank(\"en\")\nnlp.add_pipe(\"concise_concepts\", config={\"data\": data, \"model_path\": model_path})\n\ndoc = nlp(text)\nprint([(ent.text, ent.label_) for ent in doc.ents])\n"
  },
  {
    "path": "concise_concepts/examples/example_gensim_custom_path.py",
    "content": "# -*- coding: utf-8 -*-\nimport gensim.downloader as api\nimport spacy\n\nimport concise_concepts  # noqa: F401\n\nfrom .data import data, text\n\nmodel_path = \"word2vec.model\"\nmodel = api.load(\"glove-twitter-25\")\nmodel.save(model_path)\nnlp = spacy.blank(\"en\")\n\nnlp.add_pipe(\"concise_concepts\", config={\"data\": data, \"model_path\": model_path})\n\ndoc = nlp(text)\nprint([(ent.text, ent.label_) for ent in doc.ents])\n"
  },
  {
    "path": "concise_concepts/examples/example_gensim_default.py",
    "content": "# -*- coding: utf-8 -*-\nimport spacy\n\nimport concise_concepts  # noqa: F401\n\nfrom .data import data, text\n\nmodel_path = \"glove-twitter-25\"\n\nnlp = spacy.blank(\"en\")\n\nnlp.add_pipe(\"concise_concepts\", config={\"data\": data, \"model_path\": model_path})\n\ndoc = nlp(text)\nprint([(ent.text, ent.label_) for ent in doc.ents])\n"
  },
  {
    "path": "concise_concepts/examples/example_spacy.py",
    "content": "# -*- coding: utf-8 -*-\nimport spacy\n\nimport concise_concepts  # noqa: F401\n\nfrom .data import data, text\n\nnlp = spacy.load(\"en_core_web_md\")\n\nnlp.add_pipe(\"concise_concepts\", config={\"data\": data})\n\ndoc = nlp(text)\nprint([(ent.text, ent.label_) for ent in doc.ents])\n"
  },
  {
    "path": "pyproject.toml",
    "content": "[tool.poetry]\nname = \"concise-concepts\"\nversion = \"0.8.1\"\ndescription = \"This repository contains an easy and intuitive approach to few-shot NER using most similar expansion over spaCy embeddings. Now with entity confidence scores!\"\nauthors = [\"David Berenstein <david.m.berenstein@gmail.com>\"]\nlicense = \"MIT\"\nreadme = \"README.md\"\nhomepage = \"https://github.com/pandora-intelligence/concise-concepts\"\nrepository = \"https://github.com/pandora-intelligence/concise-concepts\"\ndocumentation = \"https://github.com/pandora-intelligence/concise-concepts\"\nkeywords = [\"spacy\", \"NER\", \"few-shot classification\", \"nlu\"]\nclassifiers = [\n    \"Intended Audience :: Developers\",\n    \"Intended Audience :: Science/Research\",\n    \"License :: OSI Approved :: MIT License\",\n    \"Operating System :: OS Independent\",\n    \"Programming Language :: Python :: 3.8\",\n    \"Programming Language :: Python :: 3.9\",\n    \"Programming Language :: Python :: 3.10\",\n    \"Programming Language :: Python :: 3.11\",\n    \"Topic :: Scientific/Engineering\",\n    \"Topic :: Software Development\"\n]\npackages = [{include = \"concise_concepts\"}]\n\n\n[tool.poetry.dependencies]\npython = \">=3.8,<3.12\"\nspacy = \"^3\"\nscipy = \"^1.7\"\ngensim = \"^4\"\nspaczz = \"^0.5.4\"\nsense2vec = \"^2.0.1\"\n\n[tool.poetry.plugins.\"spacy_factories\"]\nconcise_concepts = \"concise_concepts:make_concise_concepts\"\n\n[tool.poetry.group.dev.dependencies]\nblack = \"^22\"\nflake8 = \"^5\"\npytest = \"^7.1\"\npre-commit = \"^2.20\"\npep8-naming = \"^0.13\"\nflake8-bugbear = \"^22.9\"\nflake8-docstrings = \"^1.6\"\nipython = \"^8.7.0\"\nipykernel = \"^6.17.1\"\n\n[build-system]\nrequires = [\"poetry-core>=1.0.0\"]\nbuild-backend = \"poetry.core.masonry.api\"\n\n[tool.pytest.ini_options]\ntestpaths = \"tests\"\n\n[tool.black]\npreview = true\n\n[tool.isort]\nprofile = \"black\"\nsrc_paths = [\"concise_concepts\"]\n"
  },
  {
    "path": "setup.cfg",
    "content": "[flake8]\nmax-line-length = 119\nmax-complexity = 18\ndocstring-convention = google\nexclude = .git,__pycache__,build,dist\nselect = C,E,F,W,B,B950\nignore =\n    E203,E266,E501,W503\nper-file-ignores =\n    test_*.py: D\n"
  },
  {
    "path": "tests/__init__.py",
    "content": ""
  },
  {
    "path": "tests/test_model_import.py",
    "content": "# -*- coding: utf-8 -*-\ndef test_spacy_embeddings():\n    from concise_concepts.examples import example_spacy  # noqa: F401\n\n\ndef test_gensim_default():\n    from concise_concepts.examples import example_gensim_default  # noqa: F401\n\n\ndef test_gensim_custom_path():\n    from concise_concepts.examples import example_gensim_custom_path  # noqa: F401\n\n\ndef test_gensim_custom_model():\n    from concise_concepts.examples import example_gensim_custom_model  # noqa: F401\n\n\ndef test_standalone_spacy():\n    import spacy\n\n    from concise_concepts import Conceptualizer\n\n    nlp = spacy.load(\"en_core_web_md\")\n    data = {\n        \"disease\": [\"cancer\", \"diabetes\", \"heart disease\", \"influenza\", \"pneumonia\"],\n        \"symptom\": [\"headache\", \"fever\", \"cough\", \"nausea\", \"vomiting\", \"diarrhea\"],\n    }\n    conceptualizer = Conceptualizer(nlp, data)\n    assert (\n        list(conceptualizer.pipe([\"I have a headache and a fever.\"]))[0].to_json()\n        == list(conceptualizer.nlp.pipe([\"I have a headache and a fever.\"]))[\n            0\n        ].to_json()\n    )\n    assert (\n        conceptualizer(\"I have a headache and a fever.\").to_json()\n        == conceptualizer.nlp(\"I have a headache and a fever.\").to_json()\n    )\n\n    data = {\n        \"disease\": [\"cancer\", \"diabetes\"],\n        \"symptom\": [\"headache\", \"fever\"],\n    }\n    conceptualizer = Conceptualizer(nlp, data)\n\n\ndef test_standalone_gensim():\n    import gensim.downloader\n    import spacy\n\n    from concise_concepts import Conceptualizer\n\n    model_path = \"glove-twitter-25\"\n    model = gensim.downloader.load(model_path)\n    nlp = spacy.load(\"en_core_web_md\")\n    data = {\n        \"disease\": [\"cancer\", \"diabetes\", \"heart disease\", \"influenza\", \"pneumonia\"],\n        \"symptom\": [\"headache\", \"fever\", \"cough\", \"nausea\", \"vomiting\", \"diarrhea\"],\n    }\n    conceptualizer = Conceptualizer(nlp, data, model=model)\n    print(list(conceptualizer.pipe([\"I have a headache and a fever.\"]))[0].ents)\n    print(list(conceptualizer.nlp.pipe([\"I have a headache and a fever.\"]))[0].ents)\n    print(conceptualizer(\"I have a headache and a fever.\").ents)\n    print(conceptualizer.nlp(\"I have a headache and a fever.\").ents)\n\n\ndef test_spaczz():\n    import spacy\n\n    import concise_concepts  # noqa: F401\n    from concise_concepts.examples.data import data, text, text_fuzzy\n\n    nlp = spacy.load(\"en_core_web_md\")\n\n    nlp.add_pipe(\"concise_concepts\", config={\"data\": data, \"fuzzy\": True})\n\n    assert len(nlp(text).ents) == len(nlp(text_fuzzy).ents)\n\n\ndef test_sense2vec():\n    import tarfile\n\n    import requests\n    import spacy\n\n    import concise_concepts  # noqa: F401\n    from concise_concepts.examples.data import data, text\n\n    model_path = \"s2v_old\"\n    # download the pretrained sense2vec .tar.gz archive from a URL\n    # and extract it to a folder\n    url = \"https://github.com/explosion/sense2vec/releases/download/v1.0.0/s2v_reddit_2015_md.tar.gz\"\n    filename = \"s2v_reddit_2015_md.tar.gz\"\n    r = requests.get(url, allow_redirects=True)\n    with open(filename, \"wb\") as f:\n        f.write(r.content)\n    with tarfile.open(filename, \"r:gz\") as tar:\n        tar.extractall()\n\n    nlp = spacy.load(\"en_core_web_md\")\n\n    nlp.add_pipe(\"concise_concepts\", config={\"data\": data, \"model_path\": model_path})\n\n    assert len(nlp(text).ents)\n"
  }
]