Repository: MaartenGr/BERTopic Branch: master Commit: b2ce08422250 Files: 184 Total size: 12.3 MB Directory structure: gitextract_s6becou6/ ├── .git-blame-ignore-revs ├── .gitattributes ├── .github/ │ ├── ISSUE_TEMPLATE/ │ │ ├── bug_report.yml │ │ ├── config.yml │ │ └── feature_request.yml │ ├── PULL_REQUEST_TEMPLATE.md │ └── workflows/ │ └── testing.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CONTRIBUTING.md ├── LICENSE ├── Makefile ├── README.md ├── bertopic/ │ ├── __init__.py │ ├── _bertopic.py │ ├── _save_utils.py │ ├── _utils.py │ ├── backend/ │ │ ├── __init__.py │ │ ├── _base.py │ │ ├── _cohere.py │ │ ├── _fastembed.py │ │ ├── _flair.py │ │ ├── _gensim.py │ │ ├── _hftransformers.py │ │ ├── _langchain.py │ │ ├── _model2vec.py │ │ ├── _multimodal.py │ │ ├── _openai.py │ │ ├── _sentencetransformers.py │ │ ├── _sklearn.py │ │ ├── _spacy.py │ │ ├── _use.py │ │ ├── _utils.py │ │ └── _word_doc.py │ ├── cluster/ │ │ ├── __init__.py │ │ ├── _base.py │ │ └── _utils.py │ ├── dimensionality/ │ │ ├── __init__.py │ │ └── _base.py │ ├── plotting/ │ │ ├── __init__.py │ │ ├── _approximate_distribution.py │ │ ├── _barchart.py │ │ ├── _datamap.py │ │ ├── _distribution.py │ │ ├── _documents.py │ │ ├── _heatmap.py │ │ ├── _hierarchical_documents.py │ │ ├── _hierarchy.py │ │ ├── _term_rank.py │ │ ├── _topics.py │ │ ├── _topics_over_time.py │ │ └── _topics_per_class.py │ ├── representation/ │ │ ├── __init__.py │ │ ├── _base.py │ │ ├── _cohere.py │ │ ├── _keybert.py │ │ ├── _langchain.py │ │ ├── _litellm.py │ │ ├── _llamacpp.py │ │ ├── _mmr.py │ │ ├── _openai.py │ │ ├── _pos.py │ │ ├── _textgeneration.py │ │ ├── _utils.py │ │ ├── _visual.py │ │ └── _zeroshot.py │ └── vectorizers/ │ ├── __init__.py │ ├── _ctfidf.py │ └── _online_cv.py ├── docs/ │ ├── algorithm/ │ │ └── algorithm.md │ ├── api/ │ │ ├── backends.md │ │ ├── bertopic.md │ │ ├── cluster copy.md │ │ ├── cluster.md │ │ ├── ctfidf.md │ │ ├── dimensionality.md │ │ ├── plotting/ │ │ │ ├── barchart.md │ │ │ ├── 
distribution.md │ │ │ ├── document_datamap.md │ │ │ ├── documents.md │ │ │ ├── dtm.md │ │ │ ├── heatmap.md │ │ │ ├── hierarchical_documents.md │ │ │ ├── hierarchy.md │ │ │ ├── term.md │ │ │ ├── topics.md │ │ │ └── topics_per_class.md │ │ ├── plotting.md │ │ ├── representations.md │ │ └── vectorizers.md │ ├── changelog.md │ ├── faq.md │ ├── getting_started/ │ │ ├── best_practices/ │ │ │ └── best_practices.md │ │ ├── clustering/ │ │ │ └── clustering.md │ │ ├── ctfidf/ │ │ │ └── ctfidf.md │ │ ├── dim_reduction/ │ │ │ └── dim_reduction.md │ │ ├── distribution/ │ │ │ ├── distribution.md │ │ │ └── distribution_viz.html │ │ ├── embeddings/ │ │ │ └── embeddings.md │ │ ├── guided/ │ │ │ └── guided.md │ │ ├── hierarchicaltopics/ │ │ │ ├── hierarchical_topics.html │ │ │ └── hierarchicaltopics.md │ │ ├── manual/ │ │ │ └── manual.md │ │ ├── merge/ │ │ │ └── merge.md │ │ ├── multiaspect/ │ │ │ └── multiaspect.md │ │ ├── multimodal/ │ │ │ └── multimodal.md │ │ ├── online/ │ │ │ └── online.md │ │ ├── outlier_reduction/ │ │ │ ├── fig_base.html │ │ │ ├── fig_reduced.html │ │ │ └── outlier_reduction.md │ │ ├── parameter tuning/ │ │ │ └── parametertuning.md │ │ ├── quickstart/ │ │ │ ├── quickstart.md │ │ │ └── viz.html │ │ ├── representation/ │ │ │ ├── llm.md │ │ │ └── representation.md │ │ ├── search/ │ │ │ └── search.md │ │ ├── seed_words/ │ │ │ └── seed_words.md │ │ ├── semisupervised/ │ │ │ └── semisupervised.md │ │ ├── serialization/ │ │ │ └── serialization.md │ │ ├── supervised/ │ │ │ └── supervised.md │ │ ├── tips_and_tricks/ │ │ │ └── tips_and_tricks.md │ │ ├── topicreduction/ │ │ │ └── topicreduction.md │ │ ├── topicrepresentation/ │ │ │ └── topicrepresentation.md │ │ ├── topicsovertime/ │ │ │ ├── topicsovertime.md │ │ │ └── trump.html │ │ ├── topicsperclass/ │ │ │ ├── topics_per_class.html │ │ │ └── topicsperclass.md │ │ ├── vectorizers/ │ │ │ └── vectorizers.md │ │ ├── visualization/ │ │ │ ├── bar_chart.html │ │ │ ├── datamapplot.html │ │ │ ├── documents.html │ │ │ ├── 
heatmap.html │ │ │ ├── hierarchical_documents.html │ │ │ ├── hierarchical_topics.html │ │ │ ├── hierarchy.html │ │ │ ├── probabilities.html │ │ │ ├── term_rank.html │ │ │ ├── term_rank_log.html │ │ │ ├── topics_per_class.html │ │ │ ├── trump.html │ │ │ ├── visualization.md │ │ │ ├── visualize_documents.md │ │ │ ├── visualize_hierarchy.md │ │ │ ├── visualize_terms.md │ │ │ ├── visualize_topics.md │ │ │ └── viz.html │ │ └── zeroshot/ │ │ └── zeroshot.md │ ├── img/ │ │ └── probabilities.html │ ├── index.md │ ├── stylesheets/ │ │ └── extra.css │ └── usecases.md ├── mkdocs.yml ├── pyproject.toml └── tests/ ├── __init__.py ├── conftest.py ├── test_bertopic.py ├── test_other.py ├── test_plotting/ │ ├── __init__.py │ ├── test_approximate.py │ ├── test_bar.py │ ├── test_documents.py │ ├── test_dynamic.py │ ├── test_heatmap.py │ ├── test_term_rank.py │ └── test_topics.py ├── test_reduction/ │ ├── __init__.py │ ├── test_delete.py │ └── test_merge.py ├── test_representation/ │ ├── __init__.py │ ├── test_get.py │ ├── test_labels.py │ └── test_representations.py ├── test_sub_models/ │ ├── __init__.py │ ├── test_cluster.py │ ├── test_dim_reduction.py │ └── test_embeddings.py ├── test_utils.py ├── test_variations/ │ ├── __init__.py │ ├── test_class.py │ ├── test_dynamic.py │ └── test_hierarchy.py └── test_vectorizers/ ├── __init__.py ├── test_ctfidf.py └── test_online_cv.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .git-blame-ignore-revs ================================================ # Initial Ruff Linting 70d7725e5c89bccfe7d4e5a3ccd87e05c642d74b # Change line-length and ruff format 39bbfdb8298b5faa32e4bc052080d240f6140bea # pre-commit hooks and ruff 6ed123ecc4aec9da26bd48748df670cd5b42b3cd ================================================ FILE: .gitattributes ================================================ *.ipynb linguist-documentation 
================================================ FILE: .github/ISSUE_TEMPLATE/bug_report.yml ================================================ name: "\U0001F41B Bug Report" description: Report your bug here. labels: ["bug"] body: - type: markdown attributes: value: | Thanks for taking the time to fill out this bug report! Any information you can provide about your system and the issue you encountered will help to resolve it faster. - type: checkboxes attributes: label: Have you searched existing issues? 🔎 description: Please search to see if an [issue](https://github.com/MaartenGr/BERTopic/issues) already exists for the issue you encountered. options: - label: I have searched and found no existing issues required: true - type: textarea id: describe_the_bug attributes: label: Describe the bug description: Please provide a concise description of the bug. If there is an error, make sure to provide the **full** error log. placeholder: Describe the bug validations: required: true - type: textarea id: reproduction attributes: label: Reproduction description: Please provide a minimal example, with code, that can be run to reproduce the issue. value: | ```python from bertopic import BERTopic ``` - type: input id: bertopic_version attributes: label: BERTopic Version description: What version of BERTopic are you using? validations: required: true ================================================ FILE: .github/ISSUE_TEMPLATE/config.yml ================================================ blank_issues_enabled: true contact_links: - name: 💡 General questions url: https://github.com/MaartenGr/BERTopic/discussions about: Ask a question there! - name: Want to contribute? 
url: https://github.com/MaartenGr/BERTopic/blob/master/CONTRIBUTING.md about: Head to the contributing guidelines ================================================ FILE: .github/ISSUE_TEMPLATE/feature_request.yml ================================================ name: "\U0001F680 Feature request" description: Submit a proposal/request for a new BERTopic feature labels: ["Feature request"] body: - type: textarea id: feature-request validations: required: true attributes: label: Feature request description: | A clear and concise description of the feature proposal. - type: textarea id: motivation validations: required: true attributes: label: Motivation description: | Please outline the motivation for the proposal. If this is related to another GitHub issue, please link here too. - type: textarea id: contribution validations: required: true attributes: label: Your contribution description: | Any help on the implementation of this feature would be greatly appreciated. If you are interested in working on this, make sure to read the [CONTRIBUTING.MD guide](https://github.com/MaartenGr/BERTopic/blob/master/CONTRIBUTING.md) ================================================ FILE: .github/PULL_REQUEST_TEMPLATE.md ================================================ # What does this PR do? Fixes # (issue) ## Before submitting - [ ] This PR fixes a typo or improves the docs (if yes, ignore all other checks!). - [ ] Did you read the [contributor guideline](https://github.com/MaartenGr/BERTopic/blob/master/CONTRIBUTING.md)? - [ ] Was this discussed/approved via a Github issue? Please add a link to it if that's the case. - [ ] Did you make sure to update the documentation with your changes (if applicable)? - [ ] Did you write any new necessary tests? 
================================================ FILE: .github/workflows/testing.yml ================================================ name: Code Checks on: push: branches: - master - dev pull_request: branches: - master - dev jobs: lint: runs-on: ubuntu-latest steps: - uses: actions/checkout@v5 - uses: actions/setup-python@v6 with: python-version: "3.13" # Ref: https://github.com/tox-dev/action-pre-commit-uv - uses: tox-dev/action-pre-commit-uv@246b66536e366bb885f52d61983bf32f7c95e8b1 # v1.0.3 build: runs-on: ubuntu-latest strategy: matrix: python-version: ["3.10", "3.11", "3.12", "3.13"] steps: - uses: actions/checkout@v5 - name: Install latest version of uv and set up Python ${{ matrix.python-version }} uses: astral-sh/setup-uv@v7 with: python-version: ${{ matrix.python-version }} activate-environment: "true" - name: Install dependencies (including test group) run: uv sync --extra test - name: Display the project's dependency tree (including information on outdated packages) run: uv tree --outdated - name: Run Checking Mechanisms run: make check ================================================ FILE: .gitignore ================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ pip-wheel-metadata/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST model_dir model_dir/ test # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. 
*.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover .hypothesis/ .pytest_cache/ # Sphinx documentation docs/_build/ # Jupyter Notebook .ipynb_checkpoints notebooks/ # IPython profile_default/ ipython_config.py # pyenv .python-version # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ *.lock # Artifacts .idea .idea/ .vscode .DS_Store # mkdocs site/ ================================================ FILE: .pre-commit-config.yaml ================================================ repos: - repo: https://github.com/pre-commit/pre-commit-hooks rev: v6.0.0 hooks: - id: trailing-whitespace exclude: | (?x)^( README.md| docs/ )$ - id: end-of-file-fixer exclude_types: [html, svg] - id: check-yaml - id: check-added-large-files - repo: https://github.com/astral-sh/ruff-pre-commit rev: v0.14.0 hooks: - id: ruff-check args: [--fix, --show-fixes, --exit-non-zero-on-fix] - id: ruff-format ================================================ FILE: CONTRIBUTING.md ================================================ # Contributing to BERTopic Hi! Thank you for considering contributing to BERTopic. With the modular nature of BERTopic, many new add-ons, backends, representation models, sub-models, and LLMs can quickly be added to keep up with the incredibly fast-paced field. Whether contributions are new features, better documentation, bug fixes, or improvement on the repository itself, anything is appreciated! ## 📚 Guidelines ### 🤖 Contributing Code To contribute to this project, we follow an `issue -> pull request` approach for main features and bug fixes. This means that any new feature, bug fix, or anything else that touches on code directly needs to start from an issue first. That way, the main discussion about what needs to be added/fixed can be done in the issue before creating a pull request. 
This makes sure that we are on the same page before you start coding your pull request. If you start working on an issue, please assign it to yourself but do so after there is an agreement with the maintainer, [@MaartenGr](https://github.com/MaartenGr). When there is agreement on the assigned approach, a pull request can be created in which the fix/feature can be added. This follows a ["fork and pull request"](https://docs.github.com/en/get-started/quickstart/contributing-to-projects) workflow. Please do not try to push directly to this repo unless you are a maintainer. There are exceptions to the `issue -> pull request` approach that are typically small changes that do not need agreements, such as: * Documentation * Spelling/grammar issues * Docstrings * etc. There is a large focus on documentation in this repository, so please make sure to add extensive descriptions of features when creating the pull request. Note that the main focus of pull requests and code should be: * Easy readability * Clear communication * Sufficient documentation ## 🚀 Quick Start To start contributing, make sure to first start from a fresh environment. Using an environment manager, such as `conda` or `pyenv` helps in making sure that your code is reproducible and tracks the versions you have in your environment. If you are using conda, you can approach it as follows: 1. Create and activate a new conda environment (e.g., `conda create -n bertopic python=3.10`) 2. Install requirements (e.g., `pip install .[dev]`) * This makes sure to also install documentation and testing packages 3. (Optional) Run `make docs` to build your documentation 4. (Optional) Run `make test` to run the unit tests and `make coverage` to check the coverage of unit tests ❗Note: Unit testing the package can take quite some time since it needs to run several variants of the BERTopic pipeline. 
## 🧹 Linting and Formatting We use [Ruff](https://docs.astral.sh/ruff/) to ensure code is uniformly formatted and to avoid common mistakes and bad practices. * To automatically re-format code, run `make format` * To check for linting issues, run `make lint` - some issues may be automatically fixed, some will not be When a pull request is made, the CI will automatically check for linting and formatting issues. However, it will not automatically apply any fixes, so it is easiest to run locally. If you believe an error is incorrectly flagged, use a [`# noqa:` comment to suppress](https://docs.astral.sh/ruff/linter/#error-suppression), but this is discouraged unless strictly necessary. ## 🤓 Collaborative Efforts When you run into any issue with the above or need help to start with a pull request, feel free to reach out in the issues! As with all repositories, this one has its particularities as a result of the maintainer's view. Each repository is quite different and so are their processes. ## 🏆 Recognition If your contribution has made its way into a new release of BERTopic, you will be given credit in the changelog of the new release! Regardless of the size of the contribution, any help is greatly appreciated. ## 🎈 Release BERTopic tries to mostly follow [semantic versioning](https://semver.org/) for its new releases. Even though BERTopic has been around for a few years now, it is still pre-1.0 software. With the rapid changes in the field and as a way to keep up, this versioning is on purpose. Backwards-compatibility is taken into account but integrating new features and thereby keeping up with the field takes priority. Especially since BERTopic focuses on modularity, flexibility is necessary. ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2024, Maarten P. 
Grootendorst Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: Makefile ================================================ test: pytest test-no-plotly: uv sync --extra test uv pip uninstall plotly pytest tests/test_other.py -k plotly --pdb uv sync --extra test pytest tests/test_other.py -k plotly coverage: pytest --cov format: ruff format lint: ruff check --fix install: python -m pip install -e . 
install-test: python -m pip install -e ".[dev]" docs: mkdocs serve pypi: python -m build twine upload dist/* clean: rm -rf **/.ipynb_checkpoints **/.pytest_cache **/__pycache__ **/**/__pycache__ .ipynb_checkpoints .pytest_cache check: test clean ================================================ FILE: README.md ================================================ [![PyPI Downloads](https://static.pepy.tech/badge/bertopic)](https://pepy.tech/projects/bertopic) [![PyPI - Python](https://img.shields.io/badge/python-v3.10+-blue.svg)](https://pypi.org/project/bertopic/) [![Build](https://img.shields.io/github/actions/workflow/status/MaartenGr/BERTopic/testing.yml?branch=master)](https://github.com/MaartenGr/BERTopic/actions) [![docs](https://img.shields.io/badge/docs-Passing-green.svg)](https://maartengr.github.io/BERTopic/) [![PyPI - PyPi](https://img.shields.io/pypi/v/BERTopic)](https://pypi.org/project/bertopic/) [![PyPI - License](https://img.shields.io/badge/license-MIT-green.svg)](https://github.com/MaartenGr/BERTopic/blob/master/LICENSE) [![arXiv](https://img.shields.io/badge/arXiv-2203.05794-.svg)](https://arxiv.org/abs/2203.05794) # BERTopic BERTopic is a topic modeling technique that leverages 🤗 transformers and c-TF-IDF to create dense clusters allowing for easily interpretable topics whilst keeping important words in the topic descriptions. BERTopic supports all kinds of topic modeling techniques:
Guided Supervised Semi-supervised
Manual Multi-topic distributions Hierarchical
Class-based Dynamic Online/Incremental
Multimodal Multi-aspect Text Generation/LLM
Zero-shot (new!) Merge Models (new!) Seed Words (new!)
Corresponding medium posts can be found [here](https://medium.com/data-science/topic-modeling-with-bert-779f7db187e6?sk=0b5a470c006d1842ad4c8a3057063a99 ), [here](https://medium.com/data-science/using-whisper-and-bertopic-to-model-kurzgesagts-videos-7d8a63139bdf?sk=b1e0fd46f70cb15e8422b4794a81161d ) and [here](https://medium.com/data-science/interactive-topic-modeling-with-bertopic-1ea55e7d73d8?sk=03c2168e9e74b6bda2a1f3ed953427e4 ). For a more detailed overview, you can read the [paper](https://arxiv.org/abs/2203.05794) or see a [brief overview](https://maartengr.github.io/BERTopic/algorithm/algorithm.html). ## Installation Installation, with sentence-transformers, can be done using [uv](https://docs.astral.sh/uv/): ```bash uv add bertopic ``` or with [pip](https://github.com/pypa/pip): ```bash pip install bertopic ``` If you want to install BERTopic with other embedding models, you can choose one of the following: ```bash # Choose an embedding backend pip install bertopic[flair,gensim,spacy,use] # Topic modeling with images pip install bertopic[vision] ``` For a *light-weight installation* without transformers, UMAP and/or HDBSCAN (for training with Model2Vec or inference), see [this tutorial](https://maartengr.github.io/BERTopic/getting_started/tips_and_tricks/tips_and_tricks.html#lightweight-installation). 
## Getting Started For an in-depth overview of the features of BERTopic you can check the [**full documentation**](https://maartengr.github.io/BERTopic/) or you can follow along with one of the examples below: | Name | Link | |---|---| | Start Here - **Best Practices in BERTopic** | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1BoQ_vakEVtojsd2x_U6-_x52OOuqruj2?usp=sharing) | | **🆕 New!** - Topic Modeling on Large Data (GPU Acceleration) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1W7aEdDPxC29jP99GGZphUlqjMFFVKtBC?usp=sharing) | | **🆕 New!** - Topic Modeling with Llama 2 🦙 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1QCERSMUjqGetGGujdrvv_6_EeoIcd_9M?usp=sharing) | | **🆕 New!** - Topic Modeling with Quantized LLMs | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1DdSHvVPJA3rmNfBWjCo2P1E9686xfxFx?usp=sharing) | | Topic Modeling with BERTopic | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1FieRA9fLdkQEGDIMYl0I3MCjSUKVF8C-?usp=sharing) | | (Custom) Embedding Models in BERTopic | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/18arPPe50szvcCp_Y6xS56H2tY0m-RLqv?usp=sharing) | | Advanced Customization in BERTopic | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1ClTYut039t-LDtlcd-oQAdXWgcsSGTw9?usp=sharing) | | (semi-)Supervised Topic Modeling with BERTopic | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1bxizKzv5vfxJEB29sntU__ZC7PBSIPaQ?usp=sharing) | | Dynamic Topic Modeling with Trump's Tweets | [![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1un8ooI-7ZNlRoK0maVkYhmNRl0XGK88f?usp=sharing) | | Topic Modeling arXiv Abstracts | [![Kaggle](https://img.shields.io/static/v1?style=for-the-badge&message=Kaggle&color=222222&logo=Kaggle&logoColor=20BEFF&label=)](https://www.kaggle.com/maartengr/topic-modeling-arxiv-abstract-with-bertopic) | ## Quick Start We start by extracting topics from the well-known 20 newsgroups dataset containing English documents: ```python from bertopic import BERTopic from sklearn.datasets import fetch_20newsgroups docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))['data'] topic_model = BERTopic() topics, probs = topic_model.fit_transform(docs) ``` After generating topics and their probabilities, we can access all of the topics together with their topic representations: ```python >>> topic_model.get_topic_info() Topic Count Name -1 4630 -1_can_your_will_any 0 693 49_windows_drive_dos_file 1 466 32_jesus_bible_christian_faith 2 441 2_space_launch_orbit_lunar 3 381 22_key_encryption_keys_encrypted ... ``` The `-1` topic refers to all outlier documents and is typically ignored. Each word in a topic describes the underlying theme of that topic and can be used for interpreting that topic. 
Next, let's take a look at the most frequent topic that was generated: ```python >>> topic_model.get_topic(0) [('windows', 0.006152228076250982), ('drive', 0.004982897610645755), ('dos', 0.004845038866360651), ('file', 0.004140142872194834), ('disk', 0.004131678774810884), ('mac', 0.003624848635985097), ('memory', 0.0034840976976789903), ('software', 0.0034415334250699077), ('email', 0.0034239554442333257), ('pc', 0.003047105930670237)] ``` Using `.get_document_info`, we can also extract information on a document level, such as their corresponding topics, probabilities, whether they are representative documents for a topic, etc.: ```python >>> topic_model.get_document_info(docs) Document Topic Name Top_n_words Probability ... I am sure some bashers of Pens... 0 0_game_team_games_season game - team - games... 0.200010 ... My brother is in the market for... -1 -1_can_your_will_any can - your - will... 0.420668 ... Finally you said what you dream... -1 -1_can_your_will_any can - your - will... 0.807259 ... Think! It's the SCSI card doing... 49 49_windows_drive_dos_file windows - drive - docs... 0.071746 ... 1) I have an old Jasmine drive... 49 49_windows_drive_dos_file windows - drive - docs... 0.038983 ... ``` **`🔥 Tip`**: Use `BERTopic(language="multilingual")` to select a model that supports 50+ languages. ## Fine-tune Topic Representations In BERTopic, there are a number of different [topic representations](https://maartengr.github.io/BERTopic/getting_started/representation/representation.html) that we can choose from. They are all quite different from one another and give interesting perspectives and variations of topic representations. 
A great start is `KeyBERTInspired`, which for many users increases the coherence and reduces stopwords from the resulting topic representations: ```python from bertopic.representation import KeyBERTInspired # Fine-tune your topic representations representation_model = KeyBERTInspired() topic_model = BERTopic(representation_model=representation_model) ``` However, you might want to use something more powerful to describe your clusters. You can even use ChatGPT or other models from OpenAI to generate labels, summaries, phrases, keywords, and more: ```python import openai from bertopic.representation import OpenAI # Fine-tune topic representations with GPT client = openai.OpenAI(api_key="sk-...") representation_model = OpenAI(client, model="gpt-4o-mini", chat=True) topic_model = BERTopic(representation_model=representation_model) ``` **`🔥 Tip`**: Instead of iterating over all of these different topic representations, you can model them simultaneously with [multi-aspect topic representations](https://maartengr.github.io/BERTopic/getting_started/multiaspect/multiaspect.html) in BERTopic. ## Visualizations After having trained our BERTopic model, we can iteratively go through hundreds of topics to get a good understanding of the topics that were extracted. However, that takes quite some time and lacks a global representation. Instead, we can use one of the [many visualization options](https://maartengr.github.io/BERTopic/getting_started/visualization/visualization.html) in BERTopic. For example, we can visualize the topics that were generated in a way very similar to [LDAvis](https://github.com/cpsievert/LDAvis): ```python topic_model.visualize_topics() ``` ## Modularity By default, the [main steps](https://maartengr.github.io/BERTopic/algorithm/algorithm.html) for topic modeling with BERTopic are sentence-transformers, UMAP, HDBSCAN, and c-TF-IDF run in sequence. However, it assumes some independence between these steps which makes BERTopic quite modular. 
In other words, BERTopic not only allows you to build your own topic model but to explore several topic modeling techniques on top of your customized topic model: https://user-images.githubusercontent.com/25746895/218420473-4b2bb539-9dbe-407a-9674-a8317c7fb3bf.mp4 You can swap out any of these models or even remove them entirely. The following steps are completely modular: 1. [Embedding](https://maartengr.github.io/BERTopic/getting_started/embeddings/embeddings.html) documents 2. [Reducing dimensionality](https://maartengr.github.io/BERTopic/getting_started/dim_reduction/dim_reduction.html) of embeddings 3. [Clustering](https://maartengr.github.io/BERTopic/getting_started/clustering/clustering.html) reduced embeddings into topics 4. [Tokenization](https://maartengr.github.io/BERTopic/getting_started/vectorizers/vectorizers.html) of topics 5. [Weight](https://maartengr.github.io/BERTopic/getting_started/ctfidf/ctfidf.html) tokens 6. [Represent topics](https://maartengr.github.io/BERTopic/getting_started/representation/representation.html) with one or [multiple](https://maartengr.github.io/BERTopic/getting_started/multiaspect/multiaspect.html) representations ## Functionality BERTopic has many functions that can quickly become overwhelming. To alleviate this issue, you will find an overview of all methods and a short description of their purpose. ### Common Below, you will find an overview of common functions in BERTopic. 
| Method | Code | |-----------------------|---| | Fit the model | `.fit(docs)` | | Fit the model and predict documents | `.fit_transform(docs)` | | Predict new documents | `.transform([new_doc])` | | Access single topic | `.get_topic(topic=12)` | | Access all topics | `.get_topics()` | | Get topic freq | `.get_topic_freq()` | | Get all topic information| `.get_topic_info()` | | Get all document information| `.get_document_info(docs)` | | Get representative docs per topic | `.get_representative_docs()` | | Update topic representation | `.update_topics(docs, n_gram_range=(1, 3))` | | Generate topic labels | `.generate_topic_labels()` | | Set topic labels | `.set_topic_labels(my_custom_labels)` | | Merge topics | `.merge_topics(docs, topics_to_merge)` | | Reduce nr of topics | `.reduce_topics(docs, nr_topics=30)` | | Reduce outliers | `.reduce_outliers(docs, topics)` | | Find topics | `.find_topics("vehicle")` | | Save model | `.save("my_model", serialization="safetensors")` | | Load model | `BERTopic.load("my_model")` | | Get parameters | `.get_params()` | ### Attributes After having trained your BERTopic model, several attributes are saved within your model. These attributes, in part, refer to how model information is stored on an estimator during fitting. The attributes that you see below all end in `_` and are public attributes that can be used to access model information. | Attribute | Description | |------------------------|---------------------------------------------------------------------------------------------| | `.topics_` | The topics that are generated for each document after training or updating the topic model. | | `.probabilities_` | The probabilities that are generated for each document if HDBSCAN is used. | | `.topic_sizes_` | The size of each topic | | `.topic_mapper_` | A class for tracking topics and their mappings anytime they are merged/reduced. | | `.topic_representations_` | The top *n* terms per topic and their respective c-TF-IDF values. 
| | `.c_tf_idf_` | The topic-term matrix as calculated through c-TF-IDF. | | `.topic_aspects_` | The different aspects, or representations, of each topic. | | `.topic_labels_` | The default labels for each topic. | | `.custom_labels_` | Custom labels for each topic as generated through `.set_topic_labels`. | | `.topic_embeddings_` | The embeddings for each topic if `embedding_model` was used. | | `.representative_docs_` | The representative documents for each topic if HDBSCAN is used. | ### Variations There are many different use cases in which topic modeling can be used. As such, several variations of BERTopic have been developed such that one package can be used across many use cases. | Method | Code | |-----------------------|---| | [Topic Distribution Approximation](https://maartengr.github.io/BERTopic/getting_started/distribution/distribution.html) | `.approximate_distribution(docs)` | | [Online Topic Modeling](https://maartengr.github.io/BERTopic/getting_started/online/online.html) | `.partial_fit(doc)` | | [Semi-supervised Topic Modeling](https://maartengr.github.io/BERTopic/getting_started/semisupervised/semisupervised.html) | `.fit(docs, y=y)` | | [Supervised Topic Modeling](https://maartengr.github.io/BERTopic/getting_started/supervised/supervised.html) | `.fit(docs, y=y)` | | [Manual Topic Modeling](https://maartengr.github.io/BERTopic/getting_started/manual/manual.html) | `.fit(docs, y=y)` | | [Multimodal Topic Modeling](https://maartengr.github.io/BERTopic/getting_started/multimodal/multimodal.html) | ``.fit(docs, images=images)`` | | [Topic Modeling per Class](https://maartengr.github.io/BERTopic/getting_started/topicsperclass/topicsperclass.html) | `.topics_per_class(docs, classes)` | | [Dynamic Topic Modeling](https://maartengr.github.io/BERTopic/getting_started/topicsovertime/topicsovertime.html) | `.topics_over_time(docs, timestamps)` | | [Hierarchical Topic 
Modeling](https://maartengr.github.io/BERTopic/getting_started/hierarchicaltopics/hierarchicaltopics.html) | `.hierarchical_topics(docs)` | | [Guided Topic Modeling](https://maartengr.github.io/BERTopic/getting_started/guided/guided.html) | `BERTopic(seed_topic_list=seed_topic_list)` | | [Zero-shot Topic Modeling](https://maartengr.github.io/BERTopic/getting_started/zeroshot/zeroshot.html) | `BERTopic(zeroshot_topic_list=zeroshot_topic_list)` | | [Merge Multiple Models](https://maartengr.github.io/BERTopic/getting_started/merge/merge.html) | `BERTopic.merge_models([topic_model_1, topic_model_2])` | ### Visualizations Evaluating topic models can be rather difficult due to the somewhat subjective nature of evaluation. Visualizing different aspects of the topic model helps in understanding the model and makes it easier to tweak the model to your liking. | Method | Code | |-----------------------|---| | Visualize Topics | `.visualize_topics()` | | Visualize Documents | `.visualize_documents()` | | Visualize Document Hierarchy | `.visualize_hierarchical_documents()` | | Visualize Topic Hierarchy | `.visualize_hierarchy()` | | Visualize Topic Tree | `.get_topic_tree(hierarchical_topics)` | | Visualize Topic Terms | `.visualize_barchart()` | | Visualize Topic Similarity | `.visualize_heatmap()` | | Visualize Term Score Decline | `.visualize_term_rank()` | | Visualize Topic Probability Distribution | `.visualize_distribution(probs[0])` | | Visualize Topics over Time | `.visualize_topics_over_time(topics_over_time)` | | Visualize Topics per Class | `.visualize_topics_per_class(topics_per_class)` | ## Citation To cite the [BERTopic paper](https://arxiv.org/abs/2203.05794), please use the following BibTeX reference: ```bibtex @article{grootendorst2022bertopic, title={BERTopic: Neural topic modeling with a class-based TF-IDF procedure}, author={Grootendorst, Maarten}, journal={arXiv preprint arXiv:2203.05794}, year={2022} } ``` ================================================
FILE: bertopic/__init__.py ================================================ from importlib.metadata import version from bertopic._bertopic import BERTopic __version__ = version("bertopic") __all__ = [ "BERTopic", ] ================================================ FILE: bertopic/_bertopic.py ================================================ # ruff: noqa: E402 import yaml import warnings warnings.filterwarnings("ignore", category=FutureWarning) warnings.filterwarnings("ignore", category=UserWarning) try: yaml._warnings_enabled["YAMLLoadWarning"] = False except (KeyError, AttributeError, TypeError): pass import re import math import joblib import inspect import collections import numpy as np import pandas as pd import scipy.sparse as sp from copy import deepcopy from tqdm import tqdm from pathlib import Path from packaging import version from tempfile import TemporaryDirectory from collections import defaultdict, Counter from scipy.sparse import csr_matrix from scipy.cluster import hierarchy as sch from importlib.util import find_spec from typing import List, Tuple, Union, Mapping, Any, Callable, Iterable, TYPE_CHECKING, Literal # Plotting if find_spec("plotly") is None: from bertopic._utils import MockPlotlyModule plotting = MockPlotlyModule() else: from bertopic import plotting if TYPE_CHECKING: import plotly.graph_objs as go import matplotlib.figure as fig # Models try: from hdbscan import HDBSCAN HAS_HDBSCAN = True except (ImportError, ModuleNotFoundError): HAS_HDBSCAN = False from sklearn.cluster import HDBSCAN as SK_HDBSCAN from sklearn.preprocessing import normalize from sklearn import __version__ as sklearn_version from sklearn.cluster import AgglomerativeClustering from sklearn.decomposition import PCA from sklearn.metrics.pairwise import cosine_similarity from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer # BERTopic from bertopic.cluster import BaseCluster from bertopic.backend import BaseEmbedder from bertopic.representation._mmr 
import mmr from bertopic.backend._utils import select_backend from bertopic.vectorizers import ClassTfidfTransformer from bertopic.representation import BaseRepresentation, KeyBERTInspired from bertopic.dimensionality import BaseDimensionalityReduction from bertopic.cluster._utils import hdbscan_delegator, is_supported_hdbscan from bertopic._utils import ( MyLogger, check_documents_type, check_embeddings_shape, check_is_fitted, validate_distance_matrix, select_topic_representation, get_unique_distances, ) import bertopic._save_utils as save_utils logger = MyLogger() logger.configure("WARNING") class BERTopic: """BERTopic is a topic modeling technique that leverages BERT embeddings and c-TF-IDF to create dense clusters allowing for easily interpretable topics whilst keeping important words in the topic descriptions. The default embedding model is `all-MiniLM-L6-v2` when selecting `language="english"` and `paraphrase-multilingual-MiniLM-L12-v2` when selecting `language="multilingual"`. Attributes: topics_ (List[int]) : The topics that are generated for each document after training or updating the topic model. The most recent topics are tracked. probabilities_ (List[float]): The probability of the assigned topic per document. These are only calculated if a HDBSCAN model is used for the clustering step. When `calculate_probabilities=True`, then it is the probabilities of all topics per document. topic_sizes_ (Mapping[int, int]) : The size of each topic. topic_mapper_ (TopicMapper) : A class for tracking topics and their mappings anytime they are merged, reduced, added, or removed. topic_representations_ (Mapping[int, Tuple[int, float]]) : The top n terms per topic and their respective c-TF-IDF values. c_tf_idf_ (csr_matrix) : The topic-term matrix as calculated through c-TF-IDF. To access its respective words, run `.vectorizer_model.get_feature_names()` or `.vectorizer_model.get_feature_names_out()` topic_labels_ (Mapping[int, str]) : The default labels for each topic. 
custom_labels_ (List[str]) : Custom labels for each topic. topic_embeddings_ (np.ndarray) : The embeddings for each topic. They are calculated by taking the centroid embedding of each cluster. representative_docs_ (Mapping[int, str]) : The representative documents for each topic. Examples: ```python from bertopic import BERTopic from sklearn.datasets import fetch_20newsgroups docs = fetch_20newsgroups(subset='all')['data'] topic_model = BERTopic() topics, probabilities = topic_model.fit_transform(docs) ``` If you want to use your own embedding model, use it as follows: ```python from bertopic import BERTopic from sklearn.datasets import fetch_20newsgroups from sentence_transformers import SentenceTransformer docs = fetch_20newsgroups(subset='all')['data'] sentence_model = SentenceTransformer("all-MiniLM-L6-v2") topic_model = BERTopic(embedding_model=sentence_model) ``` Due to the stochastic nature of UMAP, the results from BERTopic might differ and the quality can degrade. Using your own embeddings allows you to try out BERTopic several times until you find the topics that suit you best. """ def __init__( self, language: str = "english", top_n_words: int = 10, n_gram_range: Tuple[int, int] = (1, 1), min_topic_size: int = 10, nr_topics: Union[int, str] | None = None, low_memory: bool = False, calculate_probabilities: bool = False, seed_topic_list: List[List[str]] | None = None, zeroshot_topic_list: List[str] | None = None, zeroshot_min_similarity: float = 0.7, embedding_model=None, umap_model=None, hdbscan_model=None, vectorizer_model: CountVectorizer = None, ctfidf_model: TfidfTransformer = None, representation_model: BaseRepresentation = None, verbose: bool = False, ): """BERTopic initialization. Arguments: language: The main language used in your documents. The default sentence-transformers model for "english" is `all-MiniLM-L6-v2`. For a full overview of supported languages see bertopic.backend.languages. 
Select "multilingual" to load in the `paraphrase-multilingual-MiniLM-L12-v2` sentence-transformers model that supports 50+ languages. NOTE: This is not used if `embedding_model` is used. top_n_words: The number of words per topic to extract. Setting this too high can negatively impact topic embeddings as topics are typically best represented by at most 10 words. n_gram_range: The n-gram range for the CountVectorizer. Advised to keep high values between 1 and 3. More would likely lead to memory issues. NOTE: This param will not be used if you pass in your own CountVectorizer. min_topic_size: The minimum size of the topic. Increasing this value will lead to a lower number of clusters/topics and vice versa. It is the same parameter as `min_cluster_size` in HDBSCAN. NOTE: This param will not be used if you are using `hdbscan_model`. nr_topics: Specifying the number of topics will reduce the initial number of topics to the value specified. This reduction can take a while as each reduction in topics (-1) activates a c-TF-IDF calculation. If this is set to None, no reduction is applied. Use "auto" to automatically reduce topics using HDBSCAN. NOTE: Controlling the number of topics is best done by adjusting `min_topic_size` first before adjusting this parameter. low_memory: Sets UMAP low memory to True to make sure less memory is used. NOTE: This is only used in UMAP. For example, if you use PCA instead of UMAP this parameter will not be used. calculate_probabilities: Calculate the probabilities of all topics per document instead of the probability of the assigned topic per document. This could slow down the extraction of topics if you have many documents (> 100_000). NOTE: If false you cannot use the corresponding visualization method `visualize_probabilities`. NOTE: This is an approximation of topic probabilities as used in HDBSCAN and not an exact representation. 
seed_topic_list: A list of seed words per topic to converge around zeroshot_topic_list: A list of topic names to use for zero-shot classification zeroshot_min_similarity: The minimum similarity between a zero-shot topic and a document for assignment. The higher this value, the more confident the model needs to be to assign a zero-shot topic to a document. verbose: Changes the verbosity of the model, Set to True if you want to track the stages of the model. embedding_model: Use a custom embedding model. The following backends are currently supported * SentenceTransformers * Flair * Spacy * Gensim * USE (TF-Hub) You can also pass in a string that points to one of the following sentence-transformers models: * https://www.sbert.net/docs/pretrained_models.html umap_model: Pass in a UMAP model to be used instead of the default. NOTE: You can also pass in any dimensionality reduction algorithm as long as it has `.fit` and `.transform` functions. hdbscan_model: Pass in a hdbscan.HDBSCAN model to be used instead of the default NOTE: You can also pass in any clustering algorithm as long as it has `.fit` and `.predict` functions along with the `.labels_` variable. vectorizer_model: Pass in a custom `CountVectorizer` instead of the default model. ctfidf_model: Pass in a custom ClassTfidfTransformer instead of the default model. representation_model: Pass in a model that fine-tunes the topic representations calculated through c-TF-IDF. Models from `bertopic.representation` are supported. """ # Topic-based parameters if top_n_words > 100: logger.warning( "Note that extracting more than 100 words from a sparse can slow down computation quite a bit." 
) self.top_n_words = top_n_words self.min_topic_size = min_topic_size self.nr_topics = nr_topics self.low_memory = low_memory self.calculate_probabilities = calculate_probabilities self.verbose = verbose self.seed_topic_list = seed_topic_list self.zeroshot_topic_list = zeroshot_topic_list self.zeroshot_min_similarity = zeroshot_min_similarity # Embedding model self.language = language if not embedding_model else None self.embedding_model = embedding_model # Vectorizer self.n_gram_range = n_gram_range self.vectorizer_model = vectorizer_model or CountVectorizer(ngram_range=self.n_gram_range) self.ctfidf_model = ctfidf_model or ClassTfidfTransformer() # Representation model self.representation_model = representation_model # UMAP or another algorithm that has .fit and .transform functions if umap_model is not None: self.umap_model = umap_model else: try: from umap import UMAP self.umap_model = UMAP( n_neighbors=15, n_components=5, min_dist=0.0, metric="cosine", low_memory=self.low_memory, ) except (ImportError, ModuleNotFoundError): self.umap_model = PCA(n_components=5) # HDBSCAN or another clustering algorithm that has .fit and .predict functions and # the .labels_ variable to extract the labels if hdbscan_model is not None: self.hdbscan_model = hdbscan_model elif HAS_HDBSCAN: self.hdbscan_model = HDBSCAN( min_cluster_size=self.min_topic_size, metric="euclidean", cluster_selection_method="eom", prediction_data=True, ) else: self.hdbscan_model = SK_HDBSCAN( min_cluster_size=self.min_topic_size, metric="euclidean", cluster_selection_method="eom", n_jobs=-1 ) # Public attributes self.topics_ = None self.probabilities_ = None self.topic_sizes_ = None self.topic_mapper_ = None self.topic_representations_ = None self.topic_embeddings_ = None self._topic_id_to_zeroshot_topic_idx = {} self.custom_labels_ = None self.c_tf_idf_ = None self.representative_images_ = None self.representative_docs_ = {} self.topic_aspects_ = {} # Private attributes for internal tracking purposes 
self._merged_topics = None if verbose: logger.set_level("DEBUG") else: logger.set_level("WARNING") @property def _outliers(self): """Some algorithms have outlier labels (-1) that can be tricky to work with if you are slicing data based on that labels. Therefore, we track if there are outlier labels and act accordingly when slicing. Returns: An integer indicating whether outliers are present in the topic model """ return 1 if -1 in self.topic_sizes_ else 0 @property def topic_labels_(self): """Map topic IDs to their labels. A label is the topic ID, along with the first four words of the topic representation, joined using '_'. Zeroshot topic labels come from self.zeroshot_topic_list rather than the calculated representation. Returns: topic_labels: a dict mapping a topic ID (int) to its label (str) """ topic_labels = { key: f"{key}_" + "_".join([word[0] for word in values[:4]]) for key, values in self.topic_representations_.items() } if self._is_zeroshot(): # Need to correct labels from zero-shot topics topic_id_to_zeroshot_label = { topic_id: self.zeroshot_topic_list[zeroshot_topic_idx] for topic_id, zeroshot_topic_idx in self._topic_id_to_zeroshot_topic_idx.items() } topic_labels.update(topic_id_to_zeroshot_label) return topic_labels def fit( self, documents: List[str], embeddings: np.ndarray = None, images: List[str] | None = None, y: Union[List[int], np.ndarray] = None, ): """Fit the models on a collection of documents and generate topics. Arguments: documents: A list of documents to fit on embeddings: Pre-trained document embeddings. These can be used instead of the sentence-transformer model images: A list of paths to the images to fit on or the images themselves y: The target class for (semi)-supervised modeling. Use -1 if no class for a specific instance is specified. 
Examples: ```python from bertopic import BERTopic from sklearn.datasets import fetch_20newsgroups docs = fetch_20newsgroups(subset='all')['data'] topic_model = BERTopic().fit(docs) ``` If you want to use your own embeddings, use it as follows: ```python from bertopic import BERTopic from sklearn.datasets import fetch_20newsgroups from sentence_transformers import SentenceTransformer # Create embeddings docs = fetch_20newsgroups(subset='all')['data'] sentence_model = SentenceTransformer("all-MiniLM-L6-v2") embeddings = sentence_model.encode(docs, show_progress_bar=True) # Create topic model topic_model = BERTopic().fit(docs, embeddings) ``` """ self.fit_transform(documents=documents, embeddings=embeddings, y=y, images=images) return self def fit_transform( self, documents: List[str], embeddings: np.ndarray = None, images: List[str] | None = None, y: Union[List[int], np.ndarray] = None, ) -> Tuple[List[int], Union[np.ndarray, None]]: """Fit the models on a collection of documents, generate topics, and return the probabilities and topic per document. Arguments: documents: A list of documents to fit on embeddings: Pre-trained document embeddings. These can be used instead of the sentence-transformer model images: A list of paths to the images to fit on or the images themselves y: The target class for (semi)-supervised modeling. Use -1 if no class for a specific instance is specified. Returns: predictions: Topic predictions for each documents probabilities: The probability of the assigned topic per document. If `calculate_probabilities` in BERTopic is set to True, then it calculates the probabilities of all topics across all documents instead of only the assigned topic. This, however, slows down computation and may increase memory usage. 
Examples: ```python from bertopic import BERTopic from sklearn.datasets import fetch_20newsgroups docs = fetch_20newsgroups(subset='all')['data'] topic_model = BERTopic() topics, probs = topic_model.fit_transform(docs) ``` If you want to use your own embeddings, use it as follows: ```python from bertopic import BERTopic from sklearn.datasets import fetch_20newsgroups from sentence_transformers import SentenceTransformer # Create embeddings docs = fetch_20newsgroups(subset='all')['data'] sentence_model = SentenceTransformer("all-MiniLM-L6-v2") embeddings = sentence_model.encode(docs, show_progress_bar=True) # Create topic model topic_model = BERTopic() topics, probs = topic_model.fit_transform(docs, embeddings) ``` """ if documents is not None: check_documents_type(documents) check_embeddings_shape(embeddings, documents) doc_ids = range(len(documents)) if documents is not None else range(len(images)) documents = pd.DataFrame({"Document": documents, "ID": doc_ids, "Topic": None, "Image": images}) # Extract embeddings if embeddings is None: logger.info("Embedding - Transforming documents to embeddings.") self.embedding_model = select_backend(self.embedding_model, language=self.language, verbose=self.verbose) embeddings = self._extract_embeddings( documents.Document.to_numpy().tolist(), images=images, method="document", verbose=self.verbose, ) logger.info("Embedding - Completed \u2713") else: if self.embedding_model is not None: self.embedding_model = select_backend( self.embedding_model, language=self.language, verbose=self.verbose ) # Guided Topic Modeling if self.seed_topic_list is not None and self.embedding_model is not None: y, embeddings = self._guided_topic_modeling(embeddings) # Reduce dimensionality and fit UMAP model umap_embeddings = self._reduce_dimensionality(embeddings, y) # Zero-shot Topic Modeling if self._is_zeroshot(): documents, embeddings, assigned_documents, assigned_embeddings = self._zeroshot_topic_modeling( documents, embeddings ) # Filter UMAP 
embeddings to only non-assigned embeddings to be used for clustering if len(documents) > 0: umap_embeddings = self.umap_model.transform(embeddings) if len(documents) > 0: # Cluster reduced embeddings documents, probabilities = self._cluster_embeddings(umap_embeddings, documents, y=y) if self._is_zeroshot() and len(assigned_documents) > 0: documents, embeddings = self._combine_zeroshot_topics( documents, embeddings, assigned_documents, assigned_embeddings ) else: # All documents matches zero-shot topics documents = assigned_documents embeddings = assigned_embeddings # Sort and Map Topic IDs by their frequency if not self.nr_topics: documents = self._sort_mappings_by_frequency(documents) # Create documents from images if we have images only if documents.Document.to_numpy()[0] is None: custom_documents = self._images_to_text(documents, embeddings) # Extract topics by calculating c-TF-IDF, reduce topics if needed, and get representations. self._extract_topics(custom_documents, embeddings=embeddings, fine_tune_representation=not self.nr_topics) if self.nr_topics: custom_documents = self._reduce_topics(custom_documents) self._create_topic_vectors(documents=documents, embeddings=embeddings) # Save the top 3 most representative documents per topic self._save_representative_docs(custom_documents) else: # Extract topics by calculating c-TF-IDF, reduce topics if needed, and get representations. 
def transform(
    self,
    documents: Union[str, List[str]],
    embeddings: np.ndarray = None,
    images: List[str] | None = None,
) -> Tuple[List[int], np.ndarray]:
    """After having fit a model, use transform to predict new instances.

    Arguments:
        documents: A single document or a list of documents to predict on
        embeddings: Pre-trained document embeddings. These can be used
                    instead of the sentence-transformer model.
        images: A list of paths to the images to predict on or the images themselves

    Returns:
        predictions: Topic predictions for each documents
        probabilities: If `calculate_probabilities` in BERTopic is set to True, the
                       document-topic matrix (the HDBSCAN membership vectors, or the
                       document-topic cosine similarities when the model has no
                       clustering step). Otherwise a single value per document (the
                       value returned by HDBSCAN's approximate prediction, or the
                       highest cosine similarity). With a cluster model that is not a
                       supported HDBSCAN implementation, `None` is handed to
                       `_map_probabilities`.

    Raises:
        ValueError: If no embeddings were passed and none could be extracted
                    from the documents.

    Examples:
    ```python
    from bertopic import BERTopic
    from sklearn.datasets import fetch_20newsgroups

    docs = fetch_20newsgroups(subset='all')['data']
    topic_model = BERTopic().fit(docs)
    topics, probs = topic_model.transform(docs)
    ```

    If you want to use your own embeddings:

    ```python
    from bertopic import BERTopic
    from sklearn.datasets import fetch_20newsgroups
    from sentence_transformers import SentenceTransformer

    # Create embeddings
    docs = fetch_20newsgroups(subset='all')['data']
    sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = sentence_model.encode(docs, show_progress_bar=True)

    # Create topic model
    topic_model = BERTopic().fit(docs, embeddings)
    topics, probs = topic_model.transform(docs, embeddings)
    ```
    """
    check_is_fitted(self)
    check_embeddings_shape(embeddings, documents)

    # A bare string (or None, e.g. when only images are given) is wrapped in a list
    if isinstance(documents, str) or documents is None:
        documents = [documents]

    if embeddings is None:
        embeddings = self._extract_embeddings(documents, images=images, method="document", verbose=self.verbose)

    # Check if an embedding model was found
    if embeddings is None:
        # Adjacent string literals are concatenated verbatim, so each fragment
        # carries its own trailing space to keep the message readable.
        raise ValueError(
            "No embedding model was found to embed the documents. "
            "Make sure when loading in the model using BERTopic.load() "
            "to also specify the embedding model."
        )

    # Transform without hdbscan_model and umap_model using only cosine similarity.
    # The exact type check is deliberate: only the plain `BaseCluster` placeholder
    # (e.g. installed by `fit_transform` for zero-shot models) takes this path.
    elif type(self.hdbscan_model) is BaseCluster:
        logger.info("Predicting topic assignments through cosine similarity of topic and document embeddings.")
        sim_matrix = cosine_similarity(embeddings, np.array(self.topic_embeddings_))
        # Shift the column index by the outlier flag so that column 0 becomes topic -1
        predictions = np.argmax(sim_matrix, axis=1) - self._outliers

        if self.calculate_probabilities:
            probabilities = sim_matrix
        else:
            probabilities = np.max(sim_matrix, axis=1)

    # Transform with full pipeline
    else:
        logger.info("Dimensionality - Reducing dimensionality of input embeddings.")
        umap_embeddings = self.umap_model.transform(embeddings)
        logger.info("Dimensionality - Completed \u2713")

        # Extract predictions and probabilities if it is a HDBSCAN-like model
        logger.info("Clustering - Approximating new points with `hdbscan_model`")
        if is_supported_hdbscan(self.hdbscan_model):
            predictions, probabilities = hdbscan_delegator(
                self.hdbscan_model, "approximate_predict", umap_embeddings
            )

            # Calculate probabilities
            if self.calculate_probabilities:
                logger.info("Probabilities - Start calculation of probabilities with HDBSCAN")
                probabilities = hdbscan_delegator(self.hdbscan_model, "membership_vector", umap_embeddings)
                logger.info("Probabilities - Completed \u2713")
        else:
            predictions = self.hdbscan_model.predict(umap_embeddings)
            probabilities = None
        logger.info("Cluster - Completed \u2713")

        # Map probabilities and predictions
        # NOTE(review): assumed to apply only to the raw cluster labels produced by
        # the full pipeline, not to the cosine-similarity branch — confirm nesting.
        probabilities = self._map_probabilities(probabilities, original_topics=True)
        predictions = self._map_predictions(predictions)
    return predictions, probabilities
Likewise, the `bertopic.vectorizers.OnlineCountVectorizer` is used to dynamically update its vocabulary when presented with new data. It has several parameters for modeling decay and updating the representations. In other words, although the main algorithm stays the same, the training procedure now works as follows: For each subset of the data: 1. Generate embeddings with a pre-trained language model 2. Incrementally update the dimensionality reduction algorithm with `partial_fit` 3. Incrementally update the cluster algorithm with `partial_fit` 4. Incrementally update the OnlineCountVectorizer and apply some form of decay Note that it is advised to use `partial_fit` with batches and not single documents for the best performance. Arguments: documents: A list of documents to fit on embeddings: Pre-trained document embeddings. These can be used instead of the sentence-transformer model y: The target class for (semi)-supervised modeling. Use -1 if no class for a specific instance is specified. 
Examples: ```python from sklearn.datasets import fetch_20newsgroups from sklearn.cluster import MiniBatchKMeans from sklearn.decomposition import IncrementalPCA from bertopic.vectorizers import OnlineCountVectorizer from bertopic import BERTopic # Prepare documents docs = fetch_20newsgroups(subset=subset, remove=('headers', 'footers', 'quotes'))["data"] # Prepare sub-models that support online learning umap_model = IncrementalPCA(n_components=5) cluster_model = MiniBatchKMeans(n_clusters=50, random_state=0) vectorizer_model = OnlineCountVectorizer(stop_words="english", decay=.01) topic_model = BERTopic(umap_model=umap_model, hdbscan_model=cluster_model, vectorizer_model=vectorizer_model) # Incrementally fit the topic model by training on 1000 documents at a time for index in range(0, len(docs), 1000): topic_model.partial_fit(docs[index: index+1000]) ``` """ # Checks check_embeddings_shape(embeddings, documents) if not hasattr(self.hdbscan_model, "partial_fit"): raise ValueError("In order to use `.partial_fit`, the cluster model should have a `.partial_fit` function.") # Prepare documents if isinstance(documents, str): documents = [documents] documents = pd.DataFrame({"Document": documents, "ID": range(len(documents)), "Topic": None}) # Extract embeddings if embeddings is None: if self.topic_representations_ is None: self.embedding_model = select_backend( self.embedding_model, language=self.language, verbose=self.verbose ) embeddings = self._extract_embeddings( documents.Document.to_numpy().tolist(), method="document", verbose=self.verbose, ) else: if self.embedding_model is not None and self.topic_representations_ is None: self.embedding_model = select_backend( self.embedding_model, language=self.language, verbose=self.verbose ) # Reduce dimensionality if self.seed_topic_list is not None and self.embedding_model is not None: y, embeddings = self._guided_topic_modeling(embeddings) umap_embeddings = self._reduce_dimensionality(embeddings, y, partial_fit=True) # 
Cluster reduced embeddings documents, self.probabilities_ = self._cluster_embeddings(umap_embeddings, documents, partial_fit=True) topics = documents.Topic.to_list() # Map and find new topics if not self.topic_mapper_: self.topic_mapper_ = TopicMapper(topics) mappings = self.topic_mapper_.get_mappings() new_topics = set(topics).difference(set(mappings.keys())) new_topic_ids = {topic: max(mappings.values()) + index + 1 for index, topic in enumerate(new_topics)} self.topic_mapper_.add_new_topics(new_topic_ids) updated_mappings = self.topic_mapper_.get_mappings() updated_topics = [updated_mappings[topic] for topic in topics] documents["Topic"] = updated_topics # Add missing topics (topics that were originally created but are now missing) if self.topic_representations_: missing_topics = set(self.topic_representations_.keys()).difference(set(updated_topics)) for missing_topic in missing_topics: documents.loc[len(documents), :] = [" ", len(documents), missing_topic] else: missing_topics = {} # Prepare documents documents_per_topic = documents.sort_values("Topic").groupby(["Topic"], as_index=False) updated_topics = documents_per_topic.first().Topic.astype(int) documents_per_topic = documents_per_topic.agg({"Document": " ".join}) # Update topic representations self.c_tf_idf_, updated_words = self._c_tf_idf(documents_per_topic, partial_fit=True) self.topic_representations_ = self._extract_words_per_topic( updated_words, documents, self.c_tf_idf_, calculate_aspects=False ) self._create_topic_vectors() # Update topic sizes if len(missing_topics) > 0: documents = documents.iloc[: -len(missing_topics)] if self.topic_sizes_ is None: self._update_topic_size(documents) else: sizes = documents.groupby(["Topic"], as_index=False).count() for _, row in sizes.iterrows(): topic = int(row.Topic) if self.topic_sizes_.get(topic) is not None and topic not in missing_topics: self.topic_sizes_[topic] += int(row.Document) elif self.topic_sizes_.get(topic) is None: self.topic_sizes_[topic] = 
def topics_over_time(
    self,
    docs: List[str],
    timestamps: Union[List[str], List[int]],
    topics: List[int] | None = None,
    nr_bins: int | None = None,
    datetime_format: str | None = None,
    evolution_tuning: bool = True,
    global_tuning: bool = True,
) -> pd.DataFrame:
    """Create topics over time.

    To create the topics over time, BERTopic needs to be already fitted once.
    From the fitted models, the c-TF-IDF representations are calculated at
    each timestamp t. Then, the c-TF-IDF representations at timestamp t are
    averaged with the global c-TF-IDF representations in order to fine-tune the
    local representations.

    Note:
        Make sure to use a limited number of unique timestamps (<100) as the
        c-TF-IDF representation will be calculated at each single unique timestamp.
        Having a large number of unique timestamps can take some time to be calculated.
        Moreover, there aren't many use-cases where you would like to see the difference
        in topic representations over more than 100 different timestamps.

    Arguments:
        docs: The documents you used when calling either `fit` or `fit_transform`
        timestamps: The timestamp of each document. This can be either a list of strings or ints.
                    If it is a list of strings, then the datetime format will be automatically
                    inferred. If it is a list of ints, then the documents will be ordered in
                    ascending order.
        topics: A list of topics where each topic is related to a document in `docs` and
                a timestamp in `timestamps`. You can use this to apply topics_over_time on
                a subset of the data. Make sure that `docs`, `timestamps`, and `topics`
                all correspond to one another and have the same size.
                If None or empty, the topics stored in `self.topics_` are used.
        nr_bins: The number of bins you want to create for the timestamps. The left interval will
                 be chosen as the timestamp. An additional column will be created with the
                 entire interval.
        datetime_format: The datetime format of the timestamps if they are strings, eg "%d/%m/%Y".
                         Set this to None if you want to have it automatically detect the format.
                         See strftime documentation for more information on choices:
                         https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior.
        evolution_tuning: Fine-tune each topic representation at timestamp *t* by averaging its
                          c-TF-IDF matrix with the c-TF-IDF matrix at timestamp *t-1*. This creates
                          evolutionary topic representations.
        global_tuning: Fine-tune each topic representation at timestamp *t* by averaging its c-TF-IDF matrix
                       with the global c-TF-IDF matrix. Turn this off if you want to prevent words in
                       topic representations that could not be found in the documents at timestamp *t*.

    Returns:
        topics_over_time: A dataframe that contains the topic, words, and frequency of topic
                          at timestamp *t*.

    Examples:
    The timestamps variable represents the timestamp of each document. If you have over
    100 unique timestamps, it is advised to bin the timestamps as shown below:

    ```python
    from bertopic import BERTopic
    topic_model = BERTopic()
    topics, probs = topic_model.fit_transform(docs)
    topics_over_time = topic_model.topics_over_time(docs, timestamps, nr_bins=20)
    ```
    """
    check_is_fitted(self)
    check_documents_type(docs)

    # Explicit None/empty test instead of plain truthiness so that array-like
    # `topics` (e.g., a numpy array) do not raise "truth value is ambiguous".
    use_fitted_topics = topics is None or len(topics) == 0
    document_topics = self.topics_ if use_fitted_topics else topics
    documents = pd.DataFrame({"Document": docs, "Topic": document_topics, "Timestamps": timestamps})
    global_c_tf_idf = normalize(self.c_tf_idf_, axis=1, norm="l1", copy=False)

    # Row of the global c-TF-IDF matrix per topic, taken as the topic's rank among
    # the topics that occur in `documents`.
    # NOTE(review): this presumes every fitted topic occurs in `documents`; confirm for subsets.
    all_topics = sorted(documents.Topic.unique())
    all_topics_indices = {topic: index for index, topic in enumerate(all_topics)}

    if isinstance(timestamps[0], str):
        # `infer_datetime_format` is deprecated in pandas 2.x (passing it has no effect there),
        # so only the optional explicit format is forwarded.
        documents["Timestamps"] = pd.to_datetime(documents["Timestamps"], format=datetime_format)

    if nr_bins:
        documents["Bins"] = pd.cut(documents.Timestamps, bins=nr_bins)
        documents["Timestamps"] = documents.apply(lambda row: row.Bins.left, 1)

    # Sort documents in chronological order
    documents = documents.sort_values("Timestamps")
    unique_timestamps = documents.Timestamps.unique()
    if len(unique_timestamps) > 100:
        logger.warning(
            f"There are more than 100 unique timestamps (i.e., {len(unique_timestamps)}) "
            "which significantly slows down the application. Consider setting `nr_bins` "
            "to a value lower than 100 to speed up calculation. "
        )

    # State of the previous timestamp, only used for evolution tuning
    previous_topics = []
    previous_c_tf_idf = None

    # For each unique timestamp, create topic representations
    topics_over_time = []
    for index, timestamp in tqdm(
        enumerate(unique_timestamps), total=len(unique_timestamps), disable=not self.verbose
    ):
        # Calculate c-TF-IDF representation for a specific timestamp
        selection = documents.loc[documents.Timestamps == timestamp, :]
        documents_per_topic = selection.groupby(["Topic"], as_index=False).agg(
            {"Document": " ".join, "Timestamps": "count"}
        )
        c_tf_idf, words = self._c_tf_idf(documents_per_topic, fit=False)
        current_topics = sorted(documents_per_topic.Topic.values)

        if global_tuning or evolution_tuning:
            c_tf_idf = normalize(c_tf_idf, axis=1, norm="l1", copy=False)

        # Fine-tune the c-TF-IDF matrix at timestamp t by averaging it with the c-TF-IDF
        # matrix at timestamp t-1
        if evolution_tuning and index != 0:
            overlapping_topics = sorted(set(previous_topics).intersection(current_topics))
            if overlapping_topics:
                current_overlap_idx = [current_topics.index(topic) for topic in overlapping_topics]
                previous_overlap_idx = [previous_topics.index(topic) for topic in overlapping_topics]

                # `tolil()` returns a new matrix, so the averaged rows have to be written
                # into a named LIL copy that is then assigned back; assigning into the
                # temporary result of `tolil()` would silently discard the update.
                tuned_c_tf_idf = c_tf_idf.tolil()
                tuned_c_tf_idf[current_overlap_idx] = (
                    (c_tf_idf[current_overlap_idx] + previous_c_tf_idf[previous_overlap_idx]) / 2.0
                ).tolil()
                c_tf_idf = tuned_c_tf_idf.tocsr()

        # Fine-tune the timestamp c-TF-IDF representation based on the global c-TF-IDF representation
        # by simply taking the average of the two
        if global_tuning:
            global_rows = [all_topics_indices[topic] for topic in documents_per_topic.Topic.to_numpy()]
            c_tf_idf = (global_c_tf_idf[global_rows] + c_tf_idf) / 2.0

        # Extract the words per topic
        words_per_topic = self._extract_words_per_topic(words, selection, c_tf_idf, calculate_aspects=False)
        topic_frequency = pd.Series(
            documents_per_topic.Timestamps.values, index=documents_per_topic.Topic
        ).to_dict()

        # Fill dataframe with results
        topics_at_timestamp = [
            (
                topic,
                ", ".join([words[0] for words in values][:5]),
                topic_frequency[topic],
                timestamp,
            )
            for topic, values in words_per_topic.items()
        ]
        topics_over_time.extend(topics_at_timestamp)

        if evolution_tuning:
            previous_topics = current_topics
            previous_c_tf_idf = c_tf_idf.copy()

    return pd.DataFrame(topics_over_time, columns=["Topic", "Words", "Frequency", "Timestamp"])
def hierarchical_topics(
    self,
    docs: List[str],
    use_ctfidf: bool = True,
    linkage_function: Callable[[csr_matrix], np.ndarray] | None = None,
    distance_function: Callable[[csr_matrix], csr_matrix] | None = None,
) -> pd.DataFrame:
    """Create a hierarchy of topics.

    To create this hierarchy, BERTopic needs to be already fitted once.
    Then, a hierarchy is calculated on the distance matrix of the c-TF-IDF or topic embeddings
    representation using `scipy.cluster.hierarchy.linkage`.

    Based on that hierarchy, we calculate the topic representation at each
    merged step. This is a local representation, as we only assume that the
    chosen step is merged and not all others which typically improves the
    topic representation.

    Arguments:
        docs: The documents you used when calling either `fit` or `fit_transform`
        use_ctfidf: Whether to calculate distances between topics based on c-TF-IDF embeddings. If False, the
                    embeddings from the embedding model are used.
        linkage_function: The linkage function to use. Default is:
                          `lambda x: sch.linkage(x, 'ward', optimal_ordering=True)`
        distance_function: The distance function to use on the c-TF-IDF matrix. Default is:
                           `lambda x: 1 - cosine_similarity(x)`.
                           You can pass any function that returns either a square matrix of
                           shape (n_samples, n_samples) with zeros on the diagonal and
                           non-negative values or condensed distance matrix of shape
                           (n_samples * (n_samples - 1) / 2,) containing the upper
                           triangular of the distance matrix.

    Returns:
        hierarchical_topics: A dataframe that contains a hierarchy of topics
                             represented by their parents and their children

    Examples:
    ```python
    from bertopic import BERTopic
    topic_model = BERTopic()
    topics, probs = topic_model.fit_transform(docs)
    hierarchical_topics = topic_model.hierarchical_topics(docs)
    ```

    A custom linkage function can be used as follows:

    ```python
    from scipy.cluster import hierarchy as sch
    from bertopic import BERTopic
    topic_model = BERTopic()
    topics, probs = topic_model.fit_transform(docs)

    # Hierarchical topics
    linkage_function = lambda x: sch.linkage(x, 'ward', optimal_ordering=True)
    hierarchical_topics = topic_model.hierarchical_topics(docs, linkage_function=linkage_function)
    ```
    """
    check_is_fitted(self)
    check_documents_type(docs)
    if distance_function is None:
        distance_function = lambda x: 1 - cosine_similarity(x)

    if linkage_function is None:
        linkage_function = lambda x: sch.linkage(x, "ward", optimal_ordering=True)

    # Calculate distance (the first `self._outliers` rows are skipped)
    embeddings = select_topic_representation(self.c_tf_idf_, self.topic_embeddings_, use_ctfidf)[0][
        self._outliers :
    ]
    X = distance_function(embeddings)
    X = validate_distance_matrix(X, embeddings.shape[0])

    # Use the 1-D condensed distance matrix as an input instead of the raw distance matrix
    Z = linkage_function(X)

    # Ensuring that the distances between clusters are unique otherwise the flatting of the hierarchy with
    # `sch.fcluster(...)` would produce incorrect values for "Topics" for these clusters
    if len(Z[:, 2]) != len(np.unique(Z[:, 2])):
        Z[:, 2] = get_unique_distances(Z[:, 2])

    # Calculate basic bag-of-words to be iteratively merged later
    documents = pd.DataFrame({"Document": docs, "ID": range(len(docs)), "Topic": self.topics_})
    documents_per_topic = documents.groupby(["Topic"], as_index=False).agg({"Document": " ".join})
    documents_per_topic = documents_per_topic.loc[documents_per_topic.Topic != -1, :]
    clean_documents = self._preprocess_text(documents_per_topic.Document.values)

    # Scikit-Learn Deprecation: get_feature_names is deprecated in 1.0
    # and will be removed in 1.2. Please use get_feature_names_out instead.
    if version.parse(sklearn_version) >= version.parse("1.0.0"):
        words = self.vectorizer_model.get_feature_names_out()
    else:
        words = self.vectorizer_model.get_feature_names()

    bow = self.vectorizer_model.transform(clean_documents)

    # Extract clusters
    hier_topics = pd.DataFrame(
        columns=[
            "Parent_ID",
            "Parent_Name",
            "Topics",
            "Child_Left_ID",
            "Child_Left_Name",
            "Child_Right_ID",
            "Child_Right_Name",
        ]
    )
    # Respect `self.verbose` for the progress bar, like the other methods of the model
    for index in tqdm(range(len(Z)), disable=not self.verbose):
        # Find clustered documents
        clusters = sch.fcluster(Z, t=Z[index][2], criterion="distance") - self._outliers
        nr_clusters = len(clusters)

        # Extract first topic we find to get the set of topics in a merged topic:
        # follow the left child until an index below `nr_clusters` is reached
        topic = None
        val = Z[index][0]
        while topic is None:
            if val - nr_clusters < 0:
                topic = int(val)
            else:
                val = Z[int(val - nr_clusters)][0]
        clustered_topics = [i for i, x in enumerate(clusters) if x == clusters[topic]]

        # Group bow per cluster, calculate c-TF-IDF and extract words
        grouped = csr_matrix(bow[clustered_topics].sum(axis=0))
        c_tf_idf = self.ctfidf_model.transform(grouped)

        # Work on an explicit copy so that relabelling the merged topic as 0
        # neither touches `documents` nor triggers a SettingWithCopy warning
        selection = documents.loc[documents.Topic.isin(clustered_topics), :].copy()
        selection["Topic"] = 0
        words_per_topic = self._extract_words_per_topic(words, selection, c_tf_idf, calculate_aspects=False)

        # Extract parent's name and ID
        parent_id = index + nr_clusters
        parent_name = "_".join([x[0] for x in words_per_topic[0]][:5])

        # Extract the names of the left and right child
        child_names = []
        for Z_id in (Z[index][0], Z[index][1]):
            if Z_id - nr_clusters < 0:
                # IDs below `nr_clusters` are looked up as topics of the fitted model
                child_name = "_".join([x[0] for x in self.get_topic(Z_id)][:5])
            else:
                # Otherwise the child is a merge created in an earlier iteration
                child_name = hier_topics.iloc[int(Z_id - nr_clusters)].Parent_Name
            child_names.append(child_name)
        child_left_name, child_right_name = child_names

        # Save results
        hier_topics.loc[len(hier_topics), :] = [
            parent_id,
            parent_name,
            clustered_topics,
            int(Z[index][0]),
            child_left_name,
            int(Z[index][1]),
            child_right_name,
        ]

    hier_topics["Distance"] = Z[:, 2]
    hier_topics = hier_topics.sort_values("Parent_ID", ascending=False)
    id_columns = ["Parent_ID", "Child_Left_ID", "Child_Right_ID"]
    hier_topics[id_columns] = hier_topics[id_columns].astype(str)

    return hier_topics
The main output, `topic_distributions`, can also be used directly in `.visualize_distribution(topic_distributions[index])` by simply selecting a single distribution. Arguments: documents: A single document or a list of documents for which we approximate their topic distributions window: Size of the moving window which indicates the number of tokens being considered. stride: How far the window should move at each step. min_similarity: The minimum similarity of a document's tokenset with respect to the topics. batch_size: The number of documents to process at a time. If None, then all documents are processed at once. NOTE: With a large number of documents, it is not advised to process all documents at once. padding: Whether to pad the beginning and ending of a document with empty tokens. use_embedding_model: Whether to use the topic model's embedding model to calculate the similarity between tokensets and topics instead of using c-TF-IDF. calculate_tokens: Calculate the similarity of tokens with all topics. NOTE: This is computation-wise more expensive and can require more memory. Using this over batches of documents might be preferred. separator: The separator used to merge tokens into tokensets. Returns: topic_distributions: A `n` x `m` matrix containing the topic distributions for all input documents with `n` being the documents and `m` the topics. topic_token_distributions: A list of `t` x `m` arrays with `t` being the number of tokens for the respective document and `m` the topics. Examples: After fitting the model, the topic distributions can be calculated regardless of the clustering model and regardless of whether the documents were previously seen or not: ```python topic_distr, _ = topic_model.approximate_distribution(docs) ``` As a result, the topic distributions are calculated in `topic_distr` for the entire document based on a token set with a specific window size and stride. 
If you want to calculate the topic distributions on a token-level: ```python topic_distr, topic_token_distr = topic_model.approximate_distribution(docs, calculate_tokens=True) ``` The `topic_token_distr` then contains, for each token, the best fitting topics. As with `topic_distr`, it can contain multiple topics for a single token. """ if isinstance(documents, str): documents = [documents] if batch_size is None: batch_size = len(documents) batches = 1 else: batches = math.ceil(len(documents) / batch_size) topic_distributions = [] topic_token_distributions = [] for i in tqdm(range(batches), disable=not self.verbose): doc_set = documents[i * batch_size : (i + 1) * batch_size] # Extract tokens analyzer = self.vectorizer_model.build_tokenizer() tokens = [analyzer(document) for document in doc_set] # Extract token sets all_sentences = [] all_indices = [0] all_token_sets_ids = [] for tokenset in tokens: if len(tokenset) < window: token_sets = [tokenset] token_sets_ids = [list(range(len(tokenset)))] else: # Extract tokensets using window and stride parameters stride_indices = list(range(len(tokenset)))[::stride] token_sets = [] token_sets_ids = [] for stride_index in stride_indices: selected_tokens = tokenset[stride_index : stride_index + window] if padding or len(selected_tokens) == window: token_sets.append(selected_tokens) token_sets_ids.append( list( range( stride_index, stride_index + len(selected_tokens), ) ) ) # Add empty tokens at the beginning and end of a document if padding: padded = [] padded_ids = [] t = math.ceil(window / stride) - 1 for i in range(math.ceil(window / stride) - 1): padded.append(tokenset[: window - ((t - i) * stride)]) padded_ids.append(list(range(0, window - ((t - i) * stride)))) token_sets = padded + token_sets token_sets_ids = padded_ids + token_sets_ids # Join the tokens sentences = [separator.join(token) for token in token_sets] all_sentences.extend(sentences) all_token_sets_ids.extend(token_sets_ids) all_indices.append(all_indices[-1] + 
len(sentences)) # Calculate similarity between embeddings of token sets and the topics if use_embedding_model: embeddings = self._extract_embeddings(all_sentences, method="document", verbose=True) similarity = cosine_similarity(embeddings, self.topic_embeddings_[self._outliers :]) # Calculate similarity between c-TF-IDF of token sets and the topics else: bow_doc = self.vectorizer_model.transform(all_sentences) c_tf_idf_doc = self.ctfidf_model.transform(bow_doc) similarity = cosine_similarity(c_tf_idf_doc, self.c_tf_idf_[self._outliers :]) # Only keep similarities that exceed the minimum similarity[similarity < min_similarity] = 0 # Aggregate results on an individual token level if calculate_tokens: topic_distribution = [] topic_token_distribution = [] for index, token in enumerate(tokens): start = all_indices[index] end = all_indices[index + 1] if start == end: end = end + 1 # Assign topics to individual tokens token_id = [i for i in range(len(token))] token_val = {index: [] for index in token_id} for sim, token_set in zip(similarity[start:end], all_token_sets_ids[start:end]): for token in token_set: if token in token_val: token_val[token].append(sim) matrix = [] for _, value in token_val.items(): matrix.append(np.add.reduce(value)) # Take empty documents into account matrix = np.array(matrix) if len(matrix.shape) == 1: matrix = np.zeros((1, len(self.topic_labels_) - self._outliers)) topic_token_distribution.append(np.array(matrix)) topic_distribution.append(np.add.reduce(matrix)) topic_distribution = normalize(topic_distribution, norm="l1", axis=1) # Aggregate on a tokenset level indicated by the window and stride else: topic_distribution = [] for index in range(len(all_indices) - 1): start = all_indices[index] end = all_indices[index + 1] if start == end: end = end + 1 group = similarity[start:end].sum(axis=0) topic_distribution.append(group) topic_distribution = normalize(np.array(topic_distribution), norm="l1", axis=1) topic_token_distribution = None # Combine 
results topic_distributions.append(topic_distribution) if topic_token_distribution is None: topic_token_distributions = None else: topic_token_distributions.extend(topic_token_distribution) topic_distributions = np.vstack(topic_distributions) return topic_distributions, topic_token_distributions def find_topics( self, search_term: str | None = None, image: str | None = None, top_n: int = 5 ) -> Tuple[List[int], List[float]]: """Find topics most similar to a search_term. Creates an embedding for a search query and compares that with the topic embeddings. The most similar topics are returned along with their similarity values. The query is specified using search_term for text queries or image for image queries. The search_term can be of any size but since it is compared with the topic representation it is advised to keep it below 5 words. Arguments: search_term: the term you want to use to search for topics. image: path to the image you want to use to search for topics. top_n: the number of topics to return Returns: similar_topics: the most similar topics from high to low similarity: the similarity scores from high to low Examples: You can use the underlying embedding model to find topics that best represent the search term: ```python topics, similarity = topic_model.find_topics("sports", top_n=5) ``` Note that the search query is typically more accurate if the search_term consists of a phrase or multiple words. 
""" if self.embedding_model is None: raise Exception("This method can only be used if you did not use custom embeddings.") topic_list = list(self.topic_representations_.keys()) topic_list.sort() # Extract search_term embeddings and compare with topic embeddings if search_term is not None: search_embedding = self._extract_embeddings([search_term], method="word", verbose=False).flatten() elif image is not None: search_embedding = self._extract_embeddings( [None], images=[image], method="document", verbose=False ).flatten() sims = cosine_similarity(search_embedding.reshape(1, -1), self.topic_embeddings_).flatten() # Extract topics most similar to search_term ids = np.argsort(sims)[-top_n:] similarity = [sims[i] for i in ids][::-1] similar_topics = [topic_list[index] for index in ids][::-1] return similar_topics, similarity def update_topics( self, docs: List[str], images: List[str] | None = None, topics: List[int] | None = None, top_n_words: int = 10, n_gram_range: Tuple[int, int] | None = None, vectorizer_model: CountVectorizer = None, ctfidf_model: ClassTfidfTransformer = None, representation_model: BaseRepresentation = None, ): """Updates the topic representation by recalculating c-TF-IDF with the new parameters as defined in this function. When you have trained a model and viewed the topics and the words that represent them, you might not be satisfied with the representation. Perhaps you forgot to remove stop_words or you want to try out a different n_gram_range. This function allows you to update the topic representation after they have been formed. Arguments: docs: The documents you used when calling either `fit` or `fit_transform` images: The images you used when calling either `fit` or `fit_transform` topics: A list of topics where each topic is related to a document in `docs`. Use this variable to change or map the topics. NOTE: Using a custom list of topic assignments may lead to errors if topic reduction techniques are used afterwards. 
Make sure that manually assigning topics is the last step in the pipeline top_n_words: The number of words per topic to extract. Setting this too high can negatively impact topic embeddings as topics are typically best represented by at most 10 words. n_gram_range: The n-gram range for the CountVectorizer. vectorizer_model: Pass in your own CountVectorizer from scikit-learn ctfidf_model: Pass in your own c-TF-IDF model to update the representations representation_model: Pass in a model that fine-tunes the topic representations calculated through c-TF-IDF. Models from `bertopic.representation` are supported. Examples: In order to update the topic representation, you will need to first fit the topic model and extract topics from them. Based on these, you can update the representation: ```python topic_model.update_topics(docs, n_gram_range=(2, 3)) ``` You can also use a custom vectorizer to update the representation: ```python from sklearn.feature_extraction.text import CountVectorizer vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english") topic_model.update_topics(docs, vectorizer_model=vectorizer_model) ``` You can also use this function to change or map the topics to something else. You can update them as follows: ```python topic_model.update_topics(docs, my_updated_topics) ``` """ check_documents_type(docs) check_is_fitted(self) if not n_gram_range: n_gram_range = self.n_gram_range if top_n_words > 100: logger.warning( "Note that extracting more than 100 words from a sparse can slow down computation quite a bit." ) self.top_n_words = top_n_words self.vectorizer_model = vectorizer_model or CountVectorizer(ngram_range=n_gram_range) self.ctfidf_model = ctfidf_model or ClassTfidfTransformer() self.representation_model = representation_model if topics is None: topics = self.topics_ else: logger.warning( "Using a custom list of topic assignments may lead to errors if " "topic reduction techniques are used afterwards. 
def get_topics(self, full: bool = False) -> Mapping[str, Tuple[str, float]]:
    """Return the topics with their top n words and c-TF-IDF scores.

    Arguments:
        full: If True, returns all different forms of topic representations
              for each topic, including aspects

    Returns:
        self.topic_representations_: The top n words per topic and the corresponding c-TF-IDF score.
                                     With `full=True`, a new dictionary holding that mapping under
                                     the key "Main" together with all entries of `self.topic_aspects_`.

    Examples:
    ```python
    all_topics = topic_model.get_topics()
    ```
    """
    check_is_fitted(self)

    # Default case: hand back the main representation as stored on the model
    if not full:
        return self.topic_representations_

    # Full case: the main representation first, followed by every aspect
    all_representations = {"Main": self.topic_representations_}
    all_representations.update(self.topic_aspects_)
    return all_representations
def get_topic_info(self, topic: int | None = None) -> pd.DataFrame:
    """Get information about each topic including its ID, frequency, and name.

    Besides `Topic`, `Count` and `Name`, the dataframe contains a `Representation`
    column with the main keywords, one column per topic aspect, a `CustomName` column
    when custom labels are set for every topic, and the representative documents and
    images when they are available.

    Arguments:
        topic: A specific topic for which you want the frequency

    Returns:
        info: The information relating to either a single topic or all topics

    Examples:
    ```python
    info_df = topic_model.get_topic_info()
    ```
    """
    check_is_fitted(self)

    def _first_elements(entries):
        # Keep only the leading element (e.g., the word) of each entry; unlike
        # `next(zip(*entries))` this also works for an empty representation.
        return [entry[0] for entry in entries]

    info = pd.DataFrame(self.topic_sizes_.items(), columns=["Topic", "Count"]).sort_values("Topic")
    info["Name"] = info.Topic.map(self.topic_labels_)

    # Custom label, only added when there is exactly one label per topic
    if self.custom_labels_ is not None and len(self.custom_labels_) == len(info):
        labels = {index - self._outliers: label for index, label in enumerate(self.custom_labels_)}
        info["CustomName"] = info["Topic"].map(labels)

    # Main Keywords
    keywords = {
        topic_id: _first_elements(representation)
        for topic_id, representation in self.topic_representations_.items()
    }
    info["Representation"] = info["Topic"].map(keywords)

    # Extract all topic aspects
    if self.topic_aspects_:
        for aspect, values in self.topic_aspects_.items():
            # The last topic's value decides how the whole aspect is formatted
            aspect_values = list(values.values())
            last_value = aspect_values[-1] if aspect_values else None
            if isinstance(last_value, list) and last_value:
                sample = last_value[0]
                if isinstance(sample, (tuple, list)):
                    # Entries such as (word, score): keep the words only
                    values = {topic_id: _first_elements(value) for topic_id, value in values.items()}
                elif isinstance(sample, str):
                    # Lists of strings are merged into a single string
                    values = {topic_id: " ".join(value).strip() for topic_id, value in values.items()}
            info[aspect] = info["Topic"].map(values)

    # Representative Docs / Images
    if self.representative_docs_ is not None:
        info["Representative_Docs"] = info["Topic"].map(self.representative_docs_)
    if self.representative_images_ is not None:
        info["Representative_Images"] = info["Topic"].map(self.representative_images_)

    # Select specific topic to return
    if topic is not None:
        info = info.loc[info.Topic == topic, :]

    return info.reset_index(drop=True)
def get_document_info(
    self,
    docs: List[str],
    df: pd.DataFrame | None = None,
    metadata: Mapping[str, Any] | None = None,
) -> pd.DataFrame:
    """Get per-document information for the documents the model was trained on.

    The result holds the documents, their topics, the columns of
    `.get_topic_info()` (except `Count`), the top words of each topic, a flat
    probability when `probabilities_` is set, and whether the document is one
    of the stored representative documents. Extra columns can be supplied
    through `metadata`.

    Arguments:
        docs: The documents on which the topic model was trained.
        df: A dataframe containing the metadata and the documents on which
            the topic model was originally trained. It is copied, not modified.
        metadata: A dictionary with meta data for each document in the form of
                  column name (key) and the respective values (value).

    Returns:
        document_info: A dataframe with several statistics regarding
                       the documents on which the topic model was trained.

    Usage:
    To get the document info, you will only need to pass the documents on which
    the topic model was trained:

    ```python
    document_info = topic_model.get_document_info(docs)
    ```

    There are additionally options to include meta data, such as the topic
    distributions. Moreover, we can pass the original dataframe that contains
    the documents and extend it with the information retrieved from BERTopic:

    ```python
    from sklearn.datasets import fetch_20newsgroups

    # The original data in a dataframe format to include the target variable
    data = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
    df = pd.DataFrame({"Document": data['data'], "Class": data['target']})

    # Add information about the percentage of the document that relates to the topic
    topic_distr, _ = topic_model.approximate_distribution(docs, batch_size=1000)
    distributions = [distr[topic] if topic != -1 else 0 for topic, distr in zip(topics, topic_distr)]

    # Create our documents dataframe using the original dataframe and meta data about
    # the topic distributions
    document_info = topic_model.get_document_info(docs, df=df, metadata={"Topic_distribution": distributions})
    ```
    """
    # Same fitted-model guard as the sibling accessors.
    check_is_fitted(self)
    check_documents_type(docs)

    if df is not None:
        document_info = df.copy()
        document_info["Document"] = docs
        document_info["Topic"] = self.topics_
    else:
        document_info = pd.DataFrame({"Document": docs, "Topic": self.topics_})

    # Add topic info through `.get_topic_info()`
    topic_info = self.get_topic_info().drop("Count", axis=1)
    document_info = document_info.merge(topic_info, on="Topic", how="left")

    # Add top n words
    top_n_words = {topic: " - ".join(next(zip(*self.get_topic(topic)))) for topic in set(self.topics_)}
    document_info["Top_n_words"] = document_info.Topic.map(top_n_words)

    # Add flat probabilities
    if self.probabilities_ is not None:
        if len(self.probabilities_.shape) == 1:
            document_info["Probability"] = self.probabilities_
        else:
            # One row of topic probabilities per document: take the best topic,
            # or for outliers whatever mass is left over.
            document_info["Probability"] = [
                max(probs) if topic != -1 else 1 - sum(probs)
                for topic, probs in zip(self.topics_, self.probabilities_)
            ]

    # Add representative document labels. `representative_docs_` may be None
    # (`get_topic_info` guards against that too); in that case no document is
    # flagged instead of failing on `.values()`.
    if self.representative_docs_ is not None:
        repr_docs = [repr_doc for topic_docs in self.representative_docs_.values() for repr_doc in topic_docs]
    else:
        repr_docs = []
    document_info["Representative_document"] = False
    document_info.loc[document_info.Document.isin(repr_docs), "Representative_document"] = True

    # Add custom meta data provided by the user
    if metadata is not None:
        for column, values in metadata.items():
            document_info[column] = values

    return document_info
def get_representative_docs(self, topic: int | None = None) -> List[str]:
    """Return the stored representative documents.

    Note:
        Only the representative documents are kept on the model, not every
        document of a topic. To get all documents per topic, build the frame
        yourself:

        ```python
        # When you used `.fit_transform`:
        df = pd.DataFrame({"Document": docs, "Topic": topic})

        # When you used `.fit`:
        df = pd.DataFrame({"Document": docs, "Topic": topic_model.topics_})
        ```

    Arguments:
        topic: A specific topic for which you want the representative documents.

    Returns:
        For an `int` topic: its representative documents, or `None` when the
        topic is unknown or has none stored. Otherwise the complete
        `representative_docs_` mapping is returned as-is.

    Examples:
    All topics:

    ```python
    representative_docs = topic_model.get_representative_docs()
    ```

    A single topic:

    ```python
    representative_docs = topic_model.get_representative_docs(12)
    ```
    """
    check_is_fitted(self)

    if not isinstance(topic, int):
        return self.representative_docs_

    # Missing and empty entries are both reported as None.
    return self.representative_docs_.get(topic) or None
Arguments: hier_topics: A dataframe containing the structure of the topic tree. This is the output of `topic_model.hierarchical_topics()` max_distance: The maximum distance between two topics. This value is based on the Distance column in `hier_topics`. tight_layout: Whether to use a tight layout (narrow width) for easier readability if you have hundreds of topics. Returns: A tree that has the following structure when printed: . . └─health_medical_disease_patients_hiv ├─patients_medical_disease_candida_health │ ├─■──candida_yeast_infection_gonorrhea_infections ── Topic: 48 │ └─patients_disease_cancer_medical_doctor │ ├─■──hiv_medical_cancer_patients_doctor ── Topic: 34 │ └─■──pain_drug_patients_disease_diet ── Topic: 26 └─■──health_newsgroup_tobacco_vote_votes ── Topic: 9 The blocks (■) indicate that the topic is one you can directly access from `topic_model.get_topic`. In other words, they are the original un-grouped topics. Examples: ```python # Train model from bertopic import BERTopic topic_model = BERTopic() topics, probs = topic_model.fit_transform(docs) hierarchical_topics = topic_model.hierarchical_topics(docs) # Print topic tree tree = topic_model.get_topic_tree(hierarchical_topics) print(tree) ``` """ width = 1 if tight_layout else 4 if max_distance is None: max_distance = hier_topics.Distance.max() + 1 max_original_topic = hier_topics.Parent_ID.astype(int).min() - 1 # Extract mapping from ID to name topic_to_name = dict(zip(hier_topics.Child_Left_ID, hier_topics.Child_Left_Name)) topic_to_name.update(dict(zip(hier_topics.Child_Right_ID, hier_topics.Child_Right_Name))) topic_to_name = {topic: name[:100] for topic, name in topic_to_name.items()} # Create tree tree = { str(row[1].Parent_ID): [ str(row[1].Child_Left_ID), str(row[1].Child_Right_ID), ] for row in hier_topics.iterrows() } def get_tree(start, tree): """Based on: https://stackoverflow.com/a/51920869/10532563.""" def _tree(to_print, start, parent, tree, grandpa=None, indent=""): # Get distance 
between merged topics distance = hier_topics.loc[ (hier_topics.Child_Left_ID == parent) | (hier_topics.Child_Right_ID == parent), "Distance", ] distance = distance.to_numpy()[0] if len(distance) > 0 else 10 if parent != start: if grandpa is None: to_print += topic_to_name[parent] else: if int(parent) <= max_original_topic: # Do not append topic ID if they are not merged if distance < max_distance: to_print += "■──" + topic_to_name[parent] + f" ── Topic: {parent}" + "\n" else: to_print += "O \n" else: to_print += topic_to_name[parent] + "\n" if parent not in tree: return to_print for child in tree[parent][:-1]: to_print += indent + "├" + "─" to_print = _tree(to_print, start, child, tree, parent, indent + "│" + " " * width) child = tree[parent][-1] to_print += indent + "└" + "─" to_print = _tree(to_print, start, child, tree, parent, indent + " " * (width + 1)) return to_print to_print = "." + "\n" to_print = _tree(to_print, start, start, tree) return to_print start = str(hier_topics.Parent_ID.astype(int).max()) return get_tree(start, tree) def set_topic_labels(self, topic_labels: Union[List[str], Mapping[int, str]]) -> None: """Set custom topic labels in your fitted BERTopic model. Arguments: topic_labels: If a list of topic labels, it should contain the same number of labels as there are topics. This must be ordered from the topic with the lowest ID to the highest ID, including topic -1 if it exists. If a dictionary of `topic ID`: `topic_label`, it can have any number of topics as it will only map the topics found in the dictionary. 
def set_topic_labels(self, topic_labels: Union[List[str], Mapping[int, str]]) -> None:
    """Set custom topic labels in your fitted BERTopic model.

    Arguments:
        topic_labels: If a list of topic labels, it should contain the same number
                      of labels as there are topics. This must be ordered
                      from the topic with the lowest ID to the highest ID,
                      including topic -1 if it exists.
                      If a dictionary of `topic ID`: `topic_label`, it can have
                      any number of topics as it will only map the topics found
                      in the dictionary. Topics that are missing from it, or
                      mapped to an empty label, keep their current custom label,
                      or their `Name` from `.get_topic_info()` when no custom
                      labels were set before.

    Raises:
        ValueError: If a list is passed whose length differs from the number of topics.
        TypeError: If `topic_labels` is neither a mapping nor a list/tuple.

    Examples:
    First, we define our topic labels with `.generate_topic_labels` in which
    we can customize our topic labels:

    ```python
    topic_labels = topic_model.generate_topic_labels(nr_words=2,
                                                topic_prefix=True,
                                                word_length=10,
                                                separator=", ")
    ```

    Then, we pass these `topic_labels` to our topic model which
    can be accessed at any time with `.custom_labels_`:

    ```python
    topic_model.set_topic_labels(topic_labels)
    topic_model.custom_labels_
    ```

    You might want to change only a few topic labels instead of all of them.
    To do so, you can pass a dictionary where the keys are the topic IDs and
    its values the topic labels:

    ```python
    topic_model.set_topic_labels({0: "Space", 1: "Sports", 2: "Medicine"})
    topic_model.custom_labels_
    ```
    """
    unique_topics = sorted(set(self.topics_))

    if isinstance(topic_labels, Mapping):
        # Fallback labels for every topic the mapping does not cover.
        if self.custom_labels_ is not None:
            original_labels = dict(zip(unique_topics, self.custom_labels_))
        else:
            info = self.get_topic_info()
            original_labels = dict(zip(info.Topic, info.Name))
        custom_labels = [topic_labels.get(topic) or original_labels[topic] for topic in unique_topics]
    elif isinstance(topic_labels, (list, tuple)):
        if len(topic_labels) != len(unique_topics):
            raise ValueError(
                "Make sure that `topic_labels` contains the same number of labels as there are topics."
            )
        # Copy so that later changes to the caller's list do not leak into the model.
        custom_labels = list(topic_labels)
    else:
        # Previously this fell through to an UnboundLocalError on assignment.
        raise TypeError(
            "`topic_labels` must be a list of labels or a dictionary of `topic ID`: `topic_label`, "
            f"got {type(topic_labels).__name__}."
        )

    self.custom_labels_ = custom_labels
Some words might be relatively long and setting this value helps to make sure that all labels have relatively similar lengths. separator: The string with which the words and topic prefix will be separated. Underscores are the default but a nice alternative is `", "`. aspect: The aspect from which to generate topic labels Returns: topic_labels: A list of topic labels sorted from the lowest topic ID to the highest. If the topic model was trained using HDBSCAN, the lowest topic ID is -1, otherwise it is 0. Examples: To create our custom topic labels, usage is rather straightforward: ```python topic_labels = topic_model.generate_topic_labels(nr_words=2, separator=", ") ``` """ unique_topics = sorted(set(self.topics_)) topic_labels = [] for topic in unique_topics: if aspect: words, _ = zip(*self.topic_aspects_[aspect][topic]) else: words, _ = zip(*self.get_topic(topic)) if word_length: words = [word[:word_length] for word in words][:nr_words] else: words = list(words)[:nr_words] if topic_prefix: topic_label = f"{topic}{separator}" + separator.join(words) else: topic_label = separator.join(words) topic_labels.append(topic_label) return topic_labels def merge_topics( self, docs: List[str], topics_to_merge: List[Union[Iterable[int], int]], images: List[str] | None = None, ) -> None: """Arguments: docs: The documents you used when calling either `fit` or `fit_transform` topics_to_merge: Either a list of topics or a list of list of topics to merge. For example: [1, 2, 3] will merge topics 1, 2 and 3 [[1, 2], [3, 4]] will merge topics 1 and 2, and separately merge topics 3 and 4. images: A list of paths to the images used when calling either `fit` or `fit_transform`. 
def merge_topics(
    self,
    docs: List[str],
    topics_to_merge: List[Union[Iterable[int], int]],
    images: List[str] | None = None,
) -> None:
    """Merge topics and refresh the model's topic state.

    Every topic in a group is mapped onto the first topic listed in that
    group. The mapping is registered with `topic_mapper_`, after which the
    topics are re-sorted by frequency and the topic representations, topic
    sizes, representative documents and probabilities are recomputed.

    Arguments:
        docs: The documents you used when calling either `fit` or `fit_transform`
        topics_to_merge: Either a list of topics or a list of list of topics
                         to merge. For example:
                            [1, 2, 3] will merge topics 1, 2 and 3
                            [[1, 2], [3, 4]] will merge topics 1 and 2, and
                            separately merge topics 3 and 4.
        images: A list of paths to the images used when calling either
                `fit` or `fit_transform`.

    Raises:
        ValueError: If `topics_to_merge` is empty, or is neither a list of
                    topics nor a list of lists of topics.

    Examples:
    If you want to merge topics 1, 2, and 3:

    ```python
    topics_to_merge = [1, 2, 3]
    topic_model.merge_topics(docs, topics_to_merge)
    ```

    or if you want to merge topics 1 and 2, and separately
    merge topics 3 and 4:

    ```python
    topics_to_merge = [[1, 2],
                        [3, 4]]
    topic_model.merge_topics(docs, topics_to_merge)
    ```
    """
    check_is_fitted(self)
    check_documents_type(docs)

    # The shape of the first element decides how the input is read, so an
    # empty list gets an explicit error instead of a bare IndexError.
    if len(topics_to_merge) == 0:
        raise ValueError("Make sure that `topics_to_merge` contains at least one topic to merge.")

    documents = pd.DataFrame(
        {
            "Document": docs,
            "Topic": self.topics_,
            "Image": images,
            "ID": range(len(docs)),
        }
    )

    # Start from the identity mapping and redirect merged topics.
    mapping = {topic: topic for topic in set(self.topics_)}
    if isinstance(topics_to_merge[0], int):
        for topic in sorted(topics_to_merge):
            mapping[topic] = topics_to_merge[0]
    elif isinstance(topics_to_merge[0], Iterable):
        for topic_group in sorted(topics_to_merge):
            for topic in topic_group:
                mapping[topic] = topic_group[0]
    else:
        raise ValueError(
            "Make sure that `topics_to_merge` is either a list of topics or a list of list of topics."
        )

    # Track mappings and sizes of topics for merging topic embeddings
    mappings = defaultdict(list)
    for key, val in sorted(mapping.items()):
        mappings[val].append(key)
    mappings = {
        topic_to: {
            "topics_from": topics_from,
            "topic_sizes": [self.topic_sizes_[topic] for topic in topics_from],
        }
        for topic_to, topics_from in mappings.items()
    }

    # Update topics
    documents.Topic = documents.Topic.map(mapping)
    self.topic_mapper_.add_mappings(mapping, topic_model=self)
    documents = self._sort_mappings_by_frequency(documents)
    self._extract_topics(documents, mappings=mappings)
    self._update_topic_size(documents)
    self._save_representative_docs(documents)
    self.probabilities_ = self._map_probabilities(self.probabilities_)
def delete_topics(
    self,
    topics_to_delete: List[int],
) -> None:
    """Delete topics from the topic model.

    The deleted topics will be mapped to -1 (outlier topic). Core topic attributes like
    topic embeddings and c-TF-IDF will be automatically updated.

    The method works in three steps:

    1. If the model has no -1 topic yet and at least one document belongs to a
       deleted topic, placeholder entries for topic -1 are prepended/added to
       the per-topic attributes.
    2. Deleted topics are mapped to -1, the mapping is registered with
       `topic_mapper_`, and topic sizes and probabilities are refreshed.
    3. Entries of deleted topics are dropped from the dictionary- and
       array-based attributes, and the remaining dictionary keys are
       renumbered through the mapper's final mapping.

    Arguments:
        topics_to_delete: List of topics to delete
    """
    check_is_fitted(self)

    topics_df = pd.DataFrame({"Topic": self.topics_})

    # Check if -1 exists in the current topics
    had_outliers = -1 in set(self.topics_)

    # If adding -1 for the first time, initialize its attributes
    if not had_outliers and any(topic in topics_to_delete for topic in self.topics_):
        # Initialize c-TF-IDF for -1 topic (zeros), prepended as the first row
        outlier_row = np.zeros((1, self.c_tf_idf_.shape[1]))
        outlier_row = sp.csr_matrix(outlier_row)
        self.c_tf_idf_ = sp.vstack([outlier_row, self.c_tf_idf_])

        # Initialize topic embeddings for -1 topic (zeros), prepended as the first row
        outlier_embedding = np.zeros((1, self.topic_embeddings_.shape[1]))
        self.topic_embeddings_ = np.vstack([outlier_embedding, self.topic_embeddings_])

        # Initialize topic representations for -1 topic: ("", 1e-05)
        self.topic_representations_[-1] = [("", 1e-05)]

        # Initialize representative docs for -1 topic (empty list)
        self.representative_docs_[-1] = []

        # Initialize representative images for -1 topic if images are being used
        # NOTE(review): this treats `representative_images_` as a 2D array, while
        # `get_topic_info` passes it to `Series.map` like a mapping — confirm the type.
        if self.representative_images_ is not None:
            outlier_image = np.zeros((1, self.representative_images_.shape[1]))
            self.representative_images_ = np.vstack([outlier_image, self.representative_images_])

        # Initialize custom labels for -1 topic if they exist
        # NOTE(review): `set_topic_labels` stores `custom_labels_` as a list. On a list,
        # `[-1] = ""` overwrites the last label rather than adding an entry for
        # topic -1 — confirm the intended container type.
        if hasattr(self, "custom_labels_") and self.custom_labels_ is not None:
            self.custom_labels_[-1] = ""

        # Initialize ctfidf model diagonal for -1 topic if it exists
        # (a single 1.0 at position (0, 0) of a 1 x n_features row)
        if hasattr(self, "ctfidf_model") and self.ctfidf_model is not None:
            n_features = self.ctfidf_model._idf_diag.shape[1]
            outlier_diag = sp.csr_matrix(([1.0], ([0], [0])), shape=(1, n_features))
            self.ctfidf_model._idf_diag = sp.vstack([outlier_diag, self.ctfidf_model._idf_diag])

        # Initialize topic aspects for -1 topic (empty dict for each aspect) if they exist
        if hasattr(self, "topic_aspects_") and self.topic_aspects_ is not None:
            for aspect in self.topic_aspects_:
                self.topic_aspects_[aspect][-1] = {}

    # First map deleted topics to -1
    mapping = {topic: -1 if topic in topics_to_delete else topic for topic in set(self.topics_)}
    mapping[-1] = -1

    # Track mappings and sizes of topics for merging topic embeddings
    # NOTE(review): when -1 was not a topic before, `mapping[-1]` above makes -1 a
    # "topics_from" entry and `self.topic_sizes_[-1]` is looked up — confirm that
    # `topic_sizes_` tolerates the missing key.
    mappings = defaultdict(list)
    for key, val in sorted(mapping.items()):
        mappings[val].append(key)
    mappings = {
        topic_to: {
            "topics_from": topics_from,
            "topic_sizes": [self.topic_sizes_[topic] for topic in topics_from],
        }
        for topic_to, topics_from in mappings.items()
    }

    # remove deleted topics and update attributes
    topics_df.Topic = topics_df.Topic.map(mapping)
    # The mapper receives a deep copy of the model rather than the model itself.
    self.topic_mapper_.add_mappings(mapping, topic_model=deepcopy(self))
    topics_df = self._sort_mappings_by_frequency(topics_df)
    self._update_topic_size(topics_df)
    self.probabilities_ = self._map_probabilities(self.probabilities_)

    # Old topic ID -> new topic ID, used to renumber the surviving dictionary keys
    final_mapping = self.topic_mapper_.get_mappings(original_topics=False)

    # Update dictionary-based attributes to remove deleted topics
    # Handle topic_aspects_ if it exists
    if hasattr(self, "topic_aspects_") and self.topic_aspects_ is not None:
        new_aspects = {
            aspect: {
                (final_mapping[old_topic] if old_topic != -1 else -1): content
                for old_topic, content in topics.items()
                if old_topic not in topics_to_delete
            }
            for aspect, topics in self.topic_aspects_.items()
        }
        self.topic_aspects_ = new_aspects

    # Update custom labels if they exist
    # NOTE(review): `.items()` requires a dict here; a list as stored by
    # `set_topic_labels` would raise AttributeError — confirm.
    if hasattr(self, "custom_labels_") and self.custom_labels_ is not None:
        new_labels = {
            (final_mapping[old_topic] if old_topic != -1 else -1): label
            for old_topic, label in self.custom_labels_.items()
            if old_topic not in topics_to_delete
        }
        self.custom_labels_ = new_labels

    # Update topic representations
    new_representations = {
        (final_mapping[old_topic] if old_topic != -1 else -1): content
        for old_topic, content in self.topic_representations_.items()
        if old_topic not in topics_to_delete
    }
    self.topic_representations_ = new_representations

    # Update representative docs if they exist
    new_representative_docs = {
        (final_mapping[old_topic] if old_topic != -1 else -1): docs
        for old_topic, docs in self.representative_docs_.items()
        if old_topic not in topics_to_delete
    }
    self.representative_docs_ = new_representative_docs

    # Update representative images if they exist
    if self.representative_images_ is not None:
        # Create a mask for non-deleted topics
        mask = np.array([topic not in topics_to_delete for topic in range(len(self.representative_images_))])
        self.representative_images_ = self.representative_images_[mask] if mask.any() else None

    # Update array-based attributes using masks to remove deleted topics
    # NOTE(review): the masks compare the row *position* with the topic IDs in
    # `topics_to_delete`. Row 0 is the -1 topic when outliers exist (see the
    # prepended rows above), so position and topic ID may differ by one — confirm.
    for attr in ["topic_embeddings_", "c_tf_idf_"]:
        matrix = getattr(self, attr)
        mask = np.array([topic not in topics_to_delete for topic in range(matrix.shape[0])])
        setattr(self, attr, matrix[mask])

    # Update ctfidf model to remove deleted topics if it exists
    if hasattr(self, "ctfidf_model") and self.ctfidf_model is not None:
        mask = np.array([topic not in topics_to_delete for topic in range(self.ctfidf_model._idf_diag.shape[0])])
        self.ctfidf_model._idf_diag = self.ctfidf_model._idf_diag[mask]
def reduce_topics(
    self,
    docs: List[str],
    nr_topics: Union[int, str] = 20,
    images: List[str] | None = None,
    use_ctfidf: bool = False,
) -> "BERTopic":
    """Reduce the number of topics to a fixed number of topics or automatically.

    If nr_topics is an integer, then the number of topics is reduced
    to nr_topics using `AgglomerativeClustering` on the cosine distance matrix
    of the topic c-TF-IDF or semantic embeddings.

    If nr_topics is `"auto"`, then HDBSCAN is used to automatically
    reduce the number of topics by running it on the topic embeddings.

    The topics, their sizes, and representations are updated.

    Arguments:
        docs: The docs you used when calling either `fit` or `fit_transform`
        nr_topics: The number of topics you want reduced to. The value is also
                   stored on the model as `nr_topics`.
        images: A list of paths to the images used when calling either
                `fit` or `fit_transform`
        use_ctfidf: Whether to calculate distances between topics based on c-TF-IDF embeddings. If False, the
                    embeddings from the embedding model are used.

    Updates:
        topics_ : Assigns topics to their merged representations.
        probabilities_ : Assigns probabilities to their merged representations.

    Returns:
        The model itself, so that calls can be chained.

    Examples:
    You can further reduce the topics by passing the documents with their
    topics and probabilities (if they were calculated):

    ```python
    topic_model.reduce_topics(docs, nr_topics=30)
    ```

    You can then access the updated topics and probabilities with:

    ```python
    topics = topic_model.topics_
    probabilities = topic_model.probabilities_
    ```
    """
    check_is_fitted(self)
    check_documents_type(docs)

    self.nr_topics = nr_topics
    documents = pd.DataFrame(
        {
            "Document": docs,
            "Topic": self.topics_,
            "Image": images,
            "ID": range(len(docs)),
        }
    )

    # Reduce number of topics
    documents = self._reduce_topics(documents, use_ctfidf)
    self._merged_topics = None
    self._save_representative_docs(documents)
    self.probabilities_ = self._map_probabilities(self.probabilities_)

    return self
* "distributions" Use the topic distributions, as calculated with `.approximate_distribution` to find the most frequent topic in each outlier document. You can use the `distributions_params` variable to tweak the parameters of `.approximate_distribution`. * "c-tf-idf" Calculate the c-TF-IDF representation for each outlier document and find the best matching c-TF-IDF topic representation using cosine similarity. * "embeddings" Using the embeddings of each outlier documents, find the best matching topic embedding using cosine similarity. Arguments: documents: A list of documents for which we reduce or remove the outliers. topics: The topics that correspond to the documents images: A list of paths to the images used when calling either `fit` or `fit_transform` strategy: The strategy used for reducing outliers. Options: * "probabilities" This uses the soft-clustering as performed by HDBSCAN to find the best matching topic for each outlier document. * "distributions" Use the topic distributions, as calculated with `.approximate_distribution` to find the most frequent topic in each outlier document. * "c-tf-idf" Calculate the c-TF-IDF representation for outlier documents and find the best matching c-TF-IDF topic representation. * "embeddings" Calculate the embeddings for outlier documents and find the best matching topic embedding. probabilities: Probabilities generated by HDBSCAN for each document when using the strategy `"probabilities"`. threshold: The threshold for assigning topics to outlier documents. This value represents the minimum probability when `strategy="probabilities"`. For all other strategies, it represents the minimum similarity. embeddings: The pre-computed embeddings to be used when `strategy="embeddings"`. If this is None, then it will compute the embeddings for the outlier documents. distributions_params: The parameters used in `.approximate_distribution` when using the strategy `"distributions"`. 
Returns: new_topics: The updated topics Usage: The default settings uses the `"distributions"` strategy: ```python new_topics = topic_model.reduce_outliers(docs, topics) ``` When you use the `"probabilities"` strategy, make sure to also pass the probabilities as generated through HDBSCAN: ```python from bertopic import BERTopic topic_model = BERTopic(calculate_probabilities=True) topics, probs = topic_model.fit_transform(docs) new_topics = topic_model.reduce_outliers(docs, topics, probabilities=probs, strategy="probabilities") ``` """ if not self._outliers: raise ValueError("No outliers to reduce.") if images is not None: strategy = "embeddings" # Check correct use of parameters if strategy.lower() == "probabilities" and probabilities is None: raise ValueError("Make sure to pass in `probabilities` in order to use the probabilities strategy") # Reduce outliers by extracting most likely topics through the topic-term probability matrix if strategy.lower() == "probabilities": new_topics = [ np.argmax(prob) if np.max(prob) >= threshold and topic == -1 else topic for topic, prob in zip(topics, probabilities) ] # Reduce outliers by extracting most frequent topics through calculating of Topic Distributions elif strategy.lower() == "distributions": outlier_ids = [index for index, topic in enumerate(topics) if topic == -1] outlier_docs = [documents[index] for index in outlier_ids] topic_distr, _ = self.approximate_distribution( outlier_docs, min_similarity=threshold, **distributions_params ) outlier_topics = iter([np.argmax(prob) if sum(prob) > 0 else -1 for prob in topic_distr]) new_topics = [topic if topic != -1 else next(outlier_topics) for topic in topics] # Reduce outliers by finding the most similar c-TF-IDF representations elif strategy.lower() == "c-tf-idf": outlier_ids = [index for index, topic in enumerate(topics) if topic == -1] outlier_docs = [documents[index] for index in outlier_ids] # Calculate c-TF-IDF of outlier documents with all topics bow_doc = 
self.vectorizer_model.transform(outlier_docs) c_tf_idf_doc = self.ctfidf_model.transform(bow_doc) similarity = cosine_similarity(c_tf_idf_doc, self.c_tf_idf_[self._outliers :]) # Update topics similarity[similarity < threshold] = 0 outlier_topics = iter([np.argmax(sim) if sum(sim) > 0 else -1 for sim in similarity]) new_topics = [topic if topic != -1 else next(outlier_topics) for topic in topics] # Reduce outliers by finding the most similar topic embeddings elif strategy.lower() == "embeddings": if self.embedding_model is None and embeddings is None: raise ValueError( "To use this strategy, you will need to pass a model to `embedding_model`" "when instantiating BERTopic." ) outlier_ids = [index for index, topic in enumerate(topics) if topic == -1] if images is not None: outlier_docs = [images[index] for index in outlier_ids] else: outlier_docs = [documents[index] for index in outlier_ids] # Extract or calculate embeddings for outlier documents if embeddings is not None: outlier_embeddings = np.array([embeddings[index] for index in outlier_ids]) elif images is not None: outlier_images = [images[index] for index in outlier_ids] outlier_embeddings = self.embedding_model.embed_images(outlier_images, verbose=self.verbose) else: outlier_embeddings = self.embedding_model.embed_documents(outlier_docs) similarity = cosine_similarity(outlier_embeddings, self.topic_embeddings_[self._outliers :]) # Update topics similarity[similarity < threshold] = 0 outlier_topics = iter([np.argmax(sim) if sum(sim) > 0 else -1 for sim in similarity]) new_topics = [topic if topic != -1 else next(outlier_topics) for topic in topics] return new_topics def visualize_topics( self, topics: List[int] | None = None, top_n_topics: int | None = None, use_ctfidf: bool = False, custom_labels: bool = False, title: str = "Intertopic Distance Map", width: int = 650, height: int = 650, ) -> "go.Figure": """Visualize topics, their sizes, and their corresponding words. 
def visualize_topics(
    self,
    topics: List[int] | None = None,
    top_n_topics: int | None = None,
    use_ctfidf: bool = False,
    custom_labels: bool = False,
    title: str = "Intertopic Distance Map",
    width: int = 650,
    height: int = 650,
) -> "go.Figure":
    """Visualize topics, their sizes, and their corresponding words.

    This visualization is highly inspired by LDAvis, a great visualization
    technique typically reserved for LDA. The method checks that the model is
    fitted and forwards every argument to `plotting.visualize_topics`.

    Arguments:
        topics: A selection of topics to visualize.
                Not to be confused with the topics that you get from `.fit_transform`.
                For example, if you want to visualize only topics 1 through 5:
                `topics = [1, 2, 3, 4, 5]`.
        top_n_topics: Only select the top n most frequent topics
        use_ctfidf: Whether to use c-TF-IDF representations instead of the embeddings from the embedding model.
        custom_labels: Whether to use custom topic labels that were defined using
                   `topic_model.set_topic_labels`.
        title: Title of the plot.
        width: The width of the figure.
        height: The height of the figure.

    Returns:
        The figure produced by `plotting.visualize_topics`.

    Examples:
    To visualize the topics simply run:

    ```python
    topic_model.visualize_topics()
    ```

    Or if you want to save the resulting figure:

    ```python
    fig = topic_model.visualize_topics()
    fig.write_html("path/to/file.html")
    ```
    """
    check_is_fitted(self)

    figure_options = {
        "topics": topics,
        "top_n_topics": top_n_topics,
        "use_ctfidf": use_ctfidf,
        "custom_labels": custom_labels,
        "title": title,
        "width": width,
        "height": height,
    }
    return plotting.visualize_topics(self, **figure_options)
def visualize_documents(
    self,
    docs: List[str],
    topics: List[int] | None = None,
    embeddings: np.ndarray = None,
    reduced_embeddings: np.ndarray = None,
    sample: float | None = None,
    hide_annotations: bool = False,
    hide_document_hover: bool = False,
    custom_labels: bool = False,
    title: str = "Documents and Topics",
    width: int = 1200,
    height: int = 750,
) -> "go.Figure":
    """Visualize documents and their topics in 2D.

    The method checks that the model is fitted and that `docs` has a valid
    type, then forwards every argument to `plotting.visualize_documents`.

    Arguments:
        docs: The documents you used when calling either `fit` or `fit_transform`
        topics: A selection of topics to visualize.
                Not to be confused with the topics that you get from `.fit_transform`.
                For example, if you want to visualize only topics 1 through 5:
                `topics = [1, 2, 3, 4, 5]`.
        embeddings: The embeddings of all documents in `docs`.
        reduced_embeddings: The 2D reduced embeddings of all documents in `docs`.
        sample: The percentage of documents in each topic that you would like to keep.
                Value can be between 0 and 1. Setting this value to, for example,
                0.1 (10% of documents in each topic) makes it easier to visualize
                millions of documents as a subset is chosen.
        hide_annotations: Hide the names of the traces on top of each cluster.
        hide_document_hover: Hide the content of the documents when hovering over
                             specific points. Helps to speed up generation of visualization.
        custom_labels: Whether to use custom topic labels that were defined using
                   `topic_model.set_topic_labels`.
        title: Title of the plot.
        width: The width of the figure.
        height: The height of the figure.

    Returns:
        The figure produced by `plotting.visualize_documents`.

    Examples:
    To visualize the topics simply run:

    ```python
    topic_model.visualize_documents(docs)
    ```

    Do note that this re-calculates the embeddings and reduces them to 2D.
    The advised and preferred pipeline for using this function is as follows:

    ```python
    from sklearn.datasets import fetch_20newsgroups
    from sentence_transformers import SentenceTransformer
    from bertopic import BERTopic
    from umap import UMAP

    # Prepare embeddings
    docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))['data']
    sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = sentence_model.encode(docs, show_progress_bar=False)

    # Train BERTopic
    topic_model = BERTopic().fit(docs, embeddings)

    # Reduce dimensionality of embeddings, this step is optional
    # reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)

    # Run the visualization with the original embeddings
    topic_model.visualize_documents(docs, embeddings=embeddings)

    # Or, if you have reduced the original embeddings already:
    topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)
    ```

    Or if you want to save the resulting figure:

    ```python
    fig = topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)
    fig.write_html("path/to/file.html")
    ```
    """
    check_is_fitted(self)
    check_documents_type(docs)

    figure_options = {
        "docs": docs,
        "topics": topics,
        "embeddings": embeddings,
        "reduced_embeddings": reduced_embeddings,
        "sample": sample,
        "hide_annotations": hide_annotations,
        "hide_document_hover": hide_document_hover,
        "custom_labels": custom_labels,
        "title": title,
        "width": width,
        "height": height,
    }
    return plotting.visualize_documents(self, **figure_options)
interactive: Whether to create an interactive plot using DataMapPlot's `create_interactive_plot`. enable_search: Whether to enable search in the interactive plot. Only works if `interactive=True`. topic_prefix: Prefix to add to the topic number when displaying the topic name. datamap_kwds: Keyword args be passed on to DataMapPlot's `create_plot` function if you are not using the interactive version. See the DataMapPlot documentation for more details. int_datamap_kwds: Keyword args be passed on to DataMapPlot's `create_interactive_plot` function if you are using the interactive version. See the DataMapPlot documentation for more details. Returns: figure: A Matplotlib Figure object. Examples: To visualize the topics simply run: ```python topic_model.visualize_document_datamap(docs) ``` Do note that this re-calculates the embeddings and reduces them to 2D. The advised and preferred pipeline for using this function is as follows: ```python from sklearn.datasets import fetch_20newsgroups from sentence_transformers import SentenceTransformer from bertopic import BERTopic from umap import UMAP # Prepare embeddings docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))['data'] sentence_model = SentenceTransformer("all-MiniLM-L6-v2") embeddings = sentence_model.encode(docs, show_progress_bar=False) # Train BERTopic topic_model = BERTopic(min_topic_size=36).fit(docs, embeddings) # Reduce dimensionality of embeddings, this step is optional # reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings) # Run the visualization with the original embeddings topic_model.visualize_document_datamap(docs, embeddings=embeddings) # Or, if you have reduced the original embeddings already: topic_model.visualize_document_datamap(docs, reduced_embeddings=reduced_embeddings) ``` Or if you want to save the resulting figure: ```python fig = topic_model.visualize_document_datamap(docs, 
reduced_embeddings=reduced_embeddings) fig.savefig("path/to/file.png", bbox_inches="tight") ``` """ check_is_fitted(self) return plotting.visualize_document_datamap( self, docs, topics, embeddings, reduced_embeddings, custom_labels, title, sub_title, width, height, interactive, enable_search, topic_prefix, datamap_kwds, int_datamap_kwds, ) def visualize_hierarchical_documents( self, docs: List[str], hierarchical_topics: pd.DataFrame, topics: List[int] | None = None, embeddings: np.ndarray = None, reduced_embeddings: np.ndarray = None, sample: Union[float, int] | None = None, hide_annotations: bool = False, hide_document_hover: bool = True, nr_levels: int = 10, level_scale: str = "linear", custom_labels: bool = False, title: str = "Hierarchical Documents and Topics", width: int = 1200, height: int = 750, ) -> "go.Figure": """Visualize documents and their topics in 2D at different levels of hierarchy. Arguments: docs: The documents you used when calling either `fit` or `fit_transform` hierarchical_topics: A dataframe that contains a hierarchy of topics represented by their parents and their children topics: A selection of topics to visualize. Not to be confused with the topics that you get from `.fit_transform`. For example, if you want to visualize only topics 1 through 5: `topics = [1, 2, 3, 4, 5]`. embeddings: The embeddings of all documents in `docs`. reduced_embeddings: The 2D reduced embeddings of all documents in `docs`. sample: The percentage of documents in each topic that you would like to keep. Value can be between 0 and 1. Setting this value to, for example, 0.1 (10% of documents in each topic) makes it easier to visualize millions of documents as a subset is chosen. hide_annotations: Hide the names of the traces on top of each cluster. hide_document_hover: Hide the content of the documents when hovering over specific points. Helps to speed up generation of visualizations. nr_levels: The number of levels to be visualized in the hierarchy. 
First, the distances in `hierarchical_topics.Distance` are split in `nr_levels` lists of distances with equal length. Then, for each list of distances, the merged topics, that have a distance less or equal to the maximum distance of the selected list of distances, are selected. NOTE: To get all possible merged steps, make sure that `nr_levels` is equal to the length of `hierarchical_topics`. level_scale: Whether to apply a linear or logarithmic ('log') scale levels of the distance vector. Linear scaling will perform an equal number of merges at each level while logarithmic scaling will perform more mergers in earlier levels to provide more resolution at higher levels (this can be used for when the number of topics is large). custom_labels: Whether to use custom topic labels that were defined using `topic_model.set_topic_labels`. NOTE: Custom labels are only generated for the original un-merged topics. title: Title of the plot. width: The width of the figure. height: The height of the figure. Examples: To visualize the topics simply run: ```python topic_model.visualize_hierarchical_documents(docs, hierarchical_topics) ``` Do note that this re-calculates the embeddings and reduces them to 2D. 
def visualize_term_rank(
    self,
    topics: List[int] | None = None,
    log_scale: bool = False,
    custom_labels: bool = False,
    title: str = "Term score decline per Topic",
    width: int = 800,
    height: int = 500,
) -> "go.Figure":
    """Plot how term scores decline by rank within every topic.

    A topic is described by a set of words, but those words do not contribute
    equally. The resulting figure makes it visible how many words a topic needs
    before adding more of them stops paying off.

    The model must be fitted; the drawing itself is delegated to
    `plotting.visualize_term_rank`.

    Arguments:
        topics: A selection of topics to visualize. These will be colored
                red where all others will be colored black.
        log_scale: Whether to represent the ranking on a log scale
        custom_labels: Whether to use custom topic labels that were defined using
                       `topic_model.set_topic_labels`.
        title: Title of the plot.
        width: The width of the figure.
        height: The height of the figure.

    Returns:
        fig: A plotly figure

    Examples:
    To visualize the ranks of all words across
    all topics simply run:

    ```python
    topic_model.visualize_term_rank()
    ```

    Or if you want to save the resulting figure:

    ```python
    fig = topic_model.visualize_term_rank()
    fig.write_html("path/to/file.html")
    ```

    Reference:
    This visualization was heavily inspired by the
    "Term Probability Decline" visualization found in an
    analysis by the amazing [tmtoolkit](https://tmtoolkit.readthedocs.io/).
    Reference to that specific analysis can be found
    [here](https://wzbsocialsciencecenter.github.io/tm_corona/tm_analysis.html).
    """
    check_is_fitted(self)

    # Collect the figure options once and forward them by keyword.
    figure_options = {
        "topics": topics,
        "log_scale": log_scale,
        "custom_labels": custom_labels,
        "title": title,
        "width": width,
        "height": height,
    }
    return plotting.visualize_term_rank(self, **figure_options)
def visualize_topics_per_class(
    self,
    topics_per_class: pd.DataFrame,
    top_n_topics: int = 10,
    topics: List[int] | None = None,
    normalize_frequency: bool = False,
    custom_labels: bool = False,
    title: str = "Topics per Class",
    width: int = 1250,
    height: int = 900,
) -> "go.Figure":
    """Plot the topic frequencies for each class.

    The model must be fitted; the drawing itself is delegated to
    `plotting.visualize_topics_per_class`.

    Arguments:
        topics_per_class: The topics you would like to be visualized with the
                          corresponding topic representation
        top_n_topics: To visualize the most frequent topics instead of all
        topics: Select which topics you would like to be visualized
        normalize_frequency: Whether to normalize each topic's frequency individually
        custom_labels: Whether to use custom topic labels that were defined using
                       `topic_model.set_topic_labels`.
        title: Title of the plot.
        width: The width of the figure.
        height: The height of the figure.

    Returns:
        A plotly.graph_objects.Figure including all traces

    Examples:
    To visualize the topics per class, simply run:

    ```python
    topics_per_class = topic_model.topics_per_class(docs, classes)
    topic_model.visualize_topics_per_class(topics_per_class)
    ```

    Or if you want to save the resulting figure:

    ```python
    fig = topic_model.visualize_topics_per_class(topics_per_class)
    fig.write_html("path/to/file.html")
    ```
    """
    check_is_fitted(self)

    # Collect the figure options once and forward them by keyword.
    figure_options = {
        "topics_per_class": topics_per_class,
        "top_n_topics": top_n_topics,
        "topics": topics,
        "normalize_frequency": normalize_frequency,
        "custom_labels": custom_labels,
        "title": title,
        "width": width,
        "height": height,
    }
    return plotting.visualize_topics_per_class(self, **figure_options)
def visualize_approximate_distribution(
    self,
    document: str,
    topic_token_distribution: np.ndarray,
    normalize: bool = False,
):
    """Show, per token, the topic distribution produced by `.approximate_topic_distribution`.

    The output indicates the extent to which a certain word or phrase belongs
    to a specific topic. The assumption here is that a single word can belong
    to multiple similar topics and as such can give information about the
    broader set of topics within a single document.

    The model must be fitted; the rendering itself is delegated to
    `plotting.visualize_approximate_distribution`.

    Arguments:
        document: The document for which you want to visualize
                  the approximated topic distribution.
        topic_token_distribution: The topic-token distribution of the document as
                                  extracted by `.approximate_topic_distribution`
        normalize: Whether to normalize, between 0 and 1 (summing up to 1), the
                   topic distribution values.

    Returns:
        df: A stylized dataframe indicating the best fitting topics
            for each token.

    Examples:
    ```python
    # Calculate the topic distributions on a token level
    # Note that we need to have `calculate_token_level=True`
    topic_distr, topic_token_distr = topic_model.approximate_distribution(
            docs, calculate_token_level=True
    )

    # Visualize the approximated topic distributions
    df = topic_model.visualize_approximate_distribution(docs[0], topic_token_distr[0])
    df
    ```

    To revert this stylized dataframe back to a regular dataframe,
    you can run the following:

    ```python
    df.data.columns = [column.strip() for column in df.data.columns]
    df = df.data
    ```
    """
    check_is_fitted(self)

    # Collect the rendering options once and forward them by keyword.
    render_options = {
        "document": document,
        "topic_token_distribution": topic_token_distribution,
        "normalize": normalize,
    }
    return plotting.visualize_approximate_distribution(self, **render_options)
title: Title of the plot. width: The width of the figure. Only works if orientation is set to 'left' height: The height of the figure. Only works if orientation is set to 'bottom' hierarchical_topics: A dataframe that contains a hierarchy of topics represented by their parents and their children. NOTE: The hierarchical topic names are only visualized if both `topics` and `top_n_topics` are not set. linkage_function: The linkage function to use. Default is: `lambda x: sch.linkage(x, 'ward', optimal_ordering=True)` NOTE: Make sure to use the same `linkage_function` as used in `topic_model.hierarchical_topics`. distance_function: The distance function to use on the c-TF-IDF matrix. Default is: `lambda x: 1 - cosine_similarity(x)` NOTE: Make sure to use the same `distance_function` as used in `topic_model.hierarchical_topics`. color_threshold: Value at which the separation of clusters will be made which will result in different colors for different clusters. A higher value will typically lead to less colored clusters. 
Returns: fig: A plotly figure Examples: To visualize the hierarchical structure of topics simply run: ```python topic_model.visualize_hierarchy() ``` If you also want the labels of hierarchical topics visualized, run the following: ```python # Extract hierarchical topics and their representations hierarchical_topics = topic_model.hierarchical_topics(docs) # Visualize these representations topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics) ``` If you want to save the resulting figure: ```python fig = topic_model.visualize_hierarchy() fig.write_html("path/to/file.html") ``` """ check_is_fitted(self) return plotting.visualize_hierarchy( self, orientation=orientation, topics=topics, top_n_topics=top_n_topics, use_ctfidf=use_ctfidf, custom_labels=custom_labels, title=title, width=width, height=height, hierarchical_topics=hierarchical_topics, linkage_function=linkage_function, distance_function=distance_function, color_threshold=color_threshold, ) def visualize_heatmap( self, topics: List[int] | None = None, top_n_topics: int | None = None, n_clusters: int | None = None, use_ctfidf: bool = False, custom_labels: bool = False, title: str = "Similarity Matrix", width: int = 800, height: int = 800, ) -> "go.Figure": """Visualize a heatmap of the topic's similarity matrix. Based on the cosine similarity matrix between c-TF-IDFs or semantic embeddings of the topics, a heatmap is created showing the similarity between topics. Arguments: topics: A selection of topics to visualize. top_n_topics: Only select the top n most frequent topics. n_clusters: Create n clusters and order the similarity matrix by those clusters. use_ctfidf: Whether to calculate distances between topics based on c-TF-IDF embeddings. If False, the embeddings from the embedding model are used. custom_labels: Whether to use custom topic labels that were defined using `topic_model.set_topic_labels`. title: Title of the plot. width: The width of the figure. height: The height of the figure. 
def save(
    self,
    path,
    serialization: Literal["safetensors", "pickle", "pytorch"] = "pickle",
    save_embedding_model: Union[bool, str] = True,
    save_ctfidf: bool = False,
):
    """Saves the model to the specified path or folder.

    When saving the model, make sure to also keep track of the versions
    of dependencies and Python used. Loading and saving the model should
    be done using the same dependencies and Python. Moreover, models
    saved in one version of BERTopic should not be loaded in other versions.

    Arguments:
        path: If `serialization` is 'safetensors' or `pytorch`, this is a directory.
              If `serialization` is `pickle`, then this is a file.
        serialization: If `pickle`, the entire model will be pickled. If `safetensors`
                       or `pytorch` the model will be saved without the embedding,
                       dimensionality reduction, and clustering algorithms.
                       This is a very efficient format and typically advised.
        save_embedding_model: If serialization is `pickle`, then you can choose to skip
                              saving the embedding model. If serialization is `safetensors`
                              or `pytorch`, this variable can be used as a string pointing
                              towards a huggingface model.
        save_ctfidf: Whether to save c-TF-IDF information if serialization is `safetensors`
                     or `pytorch`

    Raises:
        ValueError: If `serialization` is not one of `safetensors`, `pickle`,
                    or `pytorch`. Nothing is written in that case.

    Examples:
    To save the model in an efficient and safe format (safetensors) with c-TF-IDF information:

    ```python
    topic_model.save("model_dir", serialization="safetensors", save_ctfidf=True)
    ```

    If you wish to also add a pointer to the embedding model, which will be downloaded from
    HuggingFace upon loading:

    ```python
    embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
    topic_model.save("model_dir", serialization="safetensors", save_embedding_model=embedding_model)
    ```

    or if you want save the full model with pickle:

    ```python
    topic_model.save("my_model")
    ```

    NOTE: Pickle can run arbitrary code and is generally considered to be less safe than
    safetensors.
    """
    # Fail loudly on an unknown format. Without this check neither branch
    # below runs and the call returns without having saved anything.
    if serialization not in ("safetensors", "pickle", "pytorch"):
        raise ValueError(
            f"Unknown serialization '{serialization}'. "
            "Please choose one of 'safetensors', 'pickle', or 'pytorch'."
        )

    if serialization == "pickle":
        logger.warning(
            "When you use `pickle` to save/load a BERTopic model, "
            "please make sure that the environments in which you save "
            "and load the model are **exactly** the same. The version of BERTopic, "
            "its dependencies, and python need to remain the same."
        )

        with open(path, "wb") as file:
            # This prevents the vectorizer from being too large in size if `min_df` was
            # set to a value higher than 1
            self.vectorizer_model.stop_words_ = None

            if save_embedding_model:
                joblib.dump(self, file)
            else:
                # Detach the embedding model only for the duration of the dump.
                # `finally` guarantees it is put back even if dumping fails.
                embedding_model = self.embedding_model
                self.embedding_model = None
                try:
                    joblib.dump(self, file)
                finally:
                    self.embedding_model = embedding_model
    else:
        # `safetensors` or `pytorch`: the model is written as a directory of files
        save_directory = Path(path)
        save_directory.mkdir(exist_ok=True, parents=True)

        # Check embedding model: prefer the pointer stored on the backend
        # unless the caller passed an explicit string pointer
        if (
            save_embedding_model
            and hasattr(self.embedding_model, "_hf_model")
            and not isinstance(save_embedding_model, str)
        ):
            save_embedding_model = self.embedding_model._hf_model
        elif not save_embedding_model:
            logger.warning(
                "You are saving a BERTopic model without explicitly defining an embedding model. "
                "If you are using a sentence-transformers model or a HuggingFace model supported "
                "by sentence-transformers, please save the model by using a pointer towards that model. "
                "For example, `save_embedding_model='sentence-transformers/all-mpnet-base-v2'`"
            )

        # Minimal
        save_utils.save_hf(model=self, save_directory=save_directory, serialization=serialization)
        save_utils.save_topics(model=self, path=save_directory / "topics.json")
        save_utils.save_images(model=self, path=save_directory / "images")
        save_utils.save_config(
            model=self,
            path=save_directory / "config.json",
            embedding_model=save_embedding_model,
        )

        # Additional
        if save_ctfidf:
            if self.c_tf_idf_ is None:
                logger.warning(
                    "The c-TF-IDF matrix could not be saved as it was not found. "
                    "This typically occurs when merging BERTopic models with `BERTopic.merge_models`."
                )
            else:
                save_utils.save_ctfidf(
                    model=self,
                    save_directory=save_directory,
                    serialization=serialization,
                )
                save_utils.save_ctfidf_config(model=self, path=save_directory / "ctfidf_config.json")
@classmethod
def merge_models(cls, models, min_similarity: float = 0.7, embedding_model=None):
    """Merge multiple pre-trained BERTopic models into a single model.

    The models are merged as if they were all saved using pytorch or
    safetensors, so a minimal version without c-TF-IDF.

    To do this, we choose the first model in the list of
    models as a baseline. Then, we check each model whether
    they contain topics that are not in the baseline.
    This check is based on the cosine similarity between
    topics embeddings. If topic embeddings between two models
    are similar, then the topic of the second model is re-assigned
    to the matching topic of the first. If they are dissimilar, the topic
    of the second model is added to the first as a new topic.

    In essence, we simply check whether sufficiently "new"
    topics emerge and add them.

    Arguments:
        models: A list of fitted BERTopic models
        min_similarity: The minimum similarity for when topics are merged.
        embedding_model: Additionally load in an embedding model if necessary.

    Returns:
        A new BERTopic model that was created as if you were
        loading a model from the HuggingFace Hub without c-TF-IDF

    Raises:
        ValueError: If `models` is empty.
        ImportError: If neither pytorch nor safetensors is installed.

    Examples:
    ```python
    from bertopic import BERTopic
    from sklearn.datasets import fetch_20newsgroups

    docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))['data']

    # Create three separate models
    topic_model_1 = BERTopic(min_topic_size=5).fit(docs[:4000])
    topic_model_2 = BERTopic(min_topic_size=5).fit(docs[4000:8000])
    topic_model_3 = BERTopic(min_topic_size=5).fit(docs[8000:])

    # Combine all models into one
    merged_model = BERTopic.merge_models([topic_model_1, topic_model_2, topic_model_3])
    ```
    """
    # Without a first model there is no baseline to merge into. Fail with a
    # clear message here rather than on an unbound local further down.
    models = list(models)
    if not models:
        raise ValueError("Make sure to pass at least one fitted BERTopic model to merge.")

    def choose_backend():
        """Choose the backend to use for saving the model."""
        try:
            import torch  # noqa: F401

            return "pytorch"
        except (ModuleNotFoundError, ImportError):
            try:
                import safetensors  # noqa: F401

                return "safetensors"
            except (ModuleNotFoundError, ImportError):
                raise ImportError(
                    "Neither pytorch nor safetensors is installed. "
                    "Please install at least one of these packages:\n"
                    "  pip install torch\n"
                    "  pip install safetensors"
                )

    # The available backend cannot change between models, so resolve it once
    serialization = choose_backend()

    # Round-trip every model through the minimal on-disk format so that all of
    # them are represented by the same plain topics/params/tensors structures
    with TemporaryDirectory() as tmpdir:
        # Save model weights and config.
        all_topics, all_params, all_tensors = [], [], []
        for index, model in enumerate(models):
            model.save(tmpdir, serialization=serialization)
            topics, params, tensors, _, _, _ = save_utils.load_local_files(Path(tmpdir))
            all_topics.append(topics)
            all_params.append(params)
            all_tensors.append(np.array(tensors["topic_embeddings"]))

            # Create a base set of parameters
            if index == 0:
                merged_topics = topics
                merged_params = params
                merged_tensors = np.array(tensors["topic_embeddings"])
                merged_topics["custom_labels"] = None

    for tensors, selected_topics in zip(all_tensors[1:], all_topics[1:]):
        # Calculate similarity matrix
        sim_matrix = cosine_similarity(tensors, merged_tensors)
        sims = np.max(sim_matrix, axis=1)

        # Extract new topics
        new_topics = sorted(
            [index - selected_topics["_outliers"] for index, sim in enumerate(sims) if sim < min_similarity]
        )
        max_topic = max(set(merged_topics["topics"]))

        # Merge Topic Representations
        new_topics_dict = {}
        for new_topic in new_topics:
            if new_topic != -1:
                max_topic += 1
                new_topics_dict[new_topic] = max_topic
                merged_topics["topic_representations"][str(max_topic)] = selected_topics["topic_representations"][
                    str(new_topic)
                ]
                merged_topics["topic_labels"][str(max_topic)] = selected_topics["topic_labels"][str(new_topic)]

                # Add new aspects
                if selected_topics["topic_aspects"]:
                    aspects_1 = set(merged_topics["topic_aspects"].keys())
                    aspects_2 = set(selected_topics["topic_aspects"].keys())
                    aspects_diff = aspects_2.difference(aspects_1)
                    if aspects_diff:
                        for aspect in aspects_diff:
                            merged_topics["topic_aspects"][aspect] = {}

                    # If the original model does not have topic aspects but the to be added model does
                    if not merged_topics.get("topic_aspects"):
                        merged_topics["topic_aspects"] = selected_topics["topic_aspects"]
                    # If they both contain topic aspects, add to the existing set of aspects
                    else:
                        for aspect, values in selected_topics["topic_aspects"].items():
                            merged_topics["topic_aspects"][aspect][str(max_topic)] = values[str(new_topic)]

                # Add new embeddings
                new_tensors = tensors[new_topic + selected_topics["_outliers"]]
                merged_tensors = np.vstack([merged_tensors, new_tensors])

        # Topic Mapper
        merged_topics["topic_mapper"] = TopicMapper(list(range(-1, max_topic + 1, 1))).mappings_

        # Find similar topics and re-assign those from the new models
        sims_idx = np.argmax(sim_matrix, axis=1)
        sims = np.max(sim_matrix, axis=1)
        to_merge = {
            a - selected_topics["_outliers"]: b - merged_topics["_outliers"]
            for a, (b, val) in enumerate(zip(sims_idx, sims))
            if val >= min_similarity
        }
        to_merge.update(new_topics_dict)
        to_merge[-1] = -1
        topics = [to_merge[topic] for topic in selected_topics["topics"]]
        merged_topics["topics"].extend(topics)
        merged_topics["topic_sizes"] = dict(Counter(merged_topics["topics"]))

    # Create a new model from the merged parameters
    merged_tensors = {"topic_embeddings": merged_tensors}
    merged_model = _create_model_from_files(
        merged_topics,
        merged_params,
        merged_tensors,
        None,
        None,
        None,
        warn_no_backend=False,
    )
    merged_model.embedding_model = models[0].embedding_model

    # Replace embedding model if one is specifically chosen
    verbose = any(model.verbose for model in models)
    if embedding_model is not None and type(merged_model.embedding_model) is BaseEmbedder:
        merged_model.embedding_model = select_backend(embedding_model, verbose=verbose)
    return merged_model
Whenever you want to upload files to the Hub, you need to log in to your HuggingFace account: * Log in to your HuggingFace account with the following command: ```bash huggingface-cli login # or using an environment variable huggingface-cli login --token $HUGGINGFACE_TOKEN ``` * Alternatively, you can programmatically login using login() in a notebook or a script: ```python from huggingface_hub import login login() ``` * Or you can give a token with the `token` variable Arguments: repo_id: The name of your HuggingFace repository commit_message: A commit message token: Token to add if not already logged in revision: Repository revision private: Whether to create a private repository create_pr: Whether to upload the model as a Pull Request model_card: Whether to automatically create a modelcard serialization: The type of serialization. Either `safetensors` or `pytorch` save_embedding_model: A pointer towards a HuggingFace model to be loaded in with SentenceTransformers. E.g., `sentence-transformers/all-MiniLM-L6-v2` save_ctfidf: Whether to save c-TF-IDF information Examples: ```python topic_model.push_to_hf_hub( repo_id="ArXiv", save_ctfidf=True, save_embedding_model="sentence-transformers/all-MiniLM-L6-v2" ) ``` """ return save_utils.push_to_hf_hub( model=self, repo_id=repo_id, commit_message=commit_message, token=token, revision=revision, private=private, create_pr=create_pr, model_card=model_card, serialization=serialization, save_embedding_model=save_embedding_model, save_ctfidf=save_ctfidf, ) def get_params(self, deep: bool = False) -> Mapping[str, Any]: """Get parameters for this estimator. Adapted from: https://github.com/scikit-learn/scikit-learn/blob/b3ea3ed6a/sklearn/base.py#L178 Arguments: deep: bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. Returns: out: Parameter names mapped to their values. 
""" out = dict() for key in self._get_param_names(): value = getattr(self, key) if deep and hasattr(value, "get_params"): deep_items = value.get_params().items() out.update((key + "__" + k, val) for k, val in deep_items) out[key] = value return out def _extract_embeddings( self, documents: Union[List[str], str], images: List[str] | None = None, method: str = "document", verbose: bool | None = None, ) -> np.ndarray: """Extract sentence/document embeddings through pre-trained embeddings For an overview of pre-trained models: https://www.sbert.net/docs/pretrained_models.html. Arguments: documents: Dataframe with documents and their corresponding IDs images: A list of paths to the images to fit on or the images themselves method: Whether to extract document or word-embeddings, options are "document" and "word" verbose: Whether to show a progressbar demonstrating the time to extract embeddings Returns: embeddings: The extracted embeddings. """ if isinstance(documents, str): documents = [documents] if images is not None and hasattr(self.embedding_model, "embed_images"): embeddings = self.embedding_model.embed(documents=documents, images=images, verbose=verbose) elif method == "word": embeddings = self.embedding_model.embed_words(words=documents, verbose=verbose) elif method == "document": embeddings = self.embedding_model.embed_documents(documents, verbose=verbose) elif documents[0] is None and images is None: raise ValueError( "Make sure to use an embedding model that can either embed documents" "or images depending on which you want to embed." ) else: raise ValueError( "Wrong method for extracting document/word embeddings. " "Either choose 'word' or 'document' as the method. " ) return embeddings def _images_to_text(self, documents: pd.DataFrame, embeddings: np.ndarray) -> pd.DataFrame: """Convert images to text.""" logger.info("Images - Converting images to text. 
This might take a while.") if isinstance(self.representation_model, dict): for tuner in self.representation_model.values(): if getattr(tuner, "image_to_text_model", False): documents = tuner.image_to_text(documents, embeddings) elif isinstance(self.representation_model, list): for tuner in self.representation_model: if getattr(tuner, "image_to_text_model", False): documents = tuner.image_to_text(documents, embeddings) elif isinstance(self.representation_model, BaseRepresentation): if getattr(self.representation_model, "image_to_text_model", False): documents = self.representation_model.image_to_text(documents, embeddings) logger.info("Images - Completed \u2713") return documents def _map_predictions(self, predictions: List[int]) -> List[int]: """Map predictions to the correct topics if topics were reduced.""" mappings = self.topic_mapper_.get_mappings(original_topics=True) mapped_predictions = [mappings[prediction] if prediction in mappings else -1 for prediction in predictions] return mapped_predictions def _reduce_dimensionality( self, embeddings: Union[np.ndarray, csr_matrix], y: Union[List[int], np.ndarray] = None, partial_fit: bool = False, ) -> np.ndarray: """Reduce dimensionality of embeddings using UMAP and train a UMAP model. Arguments: embeddings: The extracted embeddings using the sentence transformer module. 
y: The target class for (semi)-supervised dimensionality reduction partial_fit: Whether to run `partial_fit` for online learning Returns: umap_embeddings: The reduced embeddings """ logger.info("Dimensionality - Fitting the dimensionality reduction algorithm") # Partial fit if partial_fit: if hasattr(self.umap_model, "partial_fit"): self.umap_model = self.umap_model.partial_fit(embeddings) umap_embeddings = self.umap_model.transform(embeddings) elif self.topic_representations_ is None: if hasattr(self.umap_model, "fit_transform"): umap_embeddings = self.umap_model.fit_transform(embeddings) else: self.umap_model.fit(embeddings) umap_embeddings = self.umap_model.transform(embeddings) else: umap_embeddings = self.umap_model.transform(embeddings) # Regular fit else: try: # cuml umap needs y to be an numpy array y = np.array(y) if y is not None else None if hasattr(self.umap_model, "fit_transform"): umap_embeddings = self.umap_model.fit_transform(embeddings, y=y) else: self.umap_model.fit(embeddings, y=y) umap_embeddings = self.umap_model.transform(embeddings) except TypeError: if hasattr(self.umap_model, "fit_transform"): umap_embeddings = self.umap_model.fit_transform(embeddings, y=y) else: self.umap_model.fit(embeddings, y=y) umap_embeddings = self.umap_model.transform(embeddings) logger.info("Dimensionality - Completed \u2713") return np.nan_to_num(umap_embeddings) def _cluster_embeddings( self, umap_embeddings: np.ndarray, documents: pd.DataFrame, partial_fit: bool = False, y: np.ndarray = None, ) -> Tuple[pd.DataFrame, np.ndarray]: """Cluster UMAP reduced embeddings with HDBSCAN. 
def _zeroshot_topic_modeling(
    self, documents: pd.DataFrame, embeddings: np.ndarray
) -> Tuple[pd.DataFrame, np.array, pd.DataFrame, np.array]:
    """Find documents that could be assigned to either one of the topics in self.zeroshot_topic_list.

    We transform the topics in `self.zeroshot_topic_list` to embeddings and
    compare them through cosine similarity with the document embeddings.
    If they pass the `self.zeroshot_min_similarity` threshold, they are assigned.

    Arguments:
        documents: Dataframe with documents and their corresponding IDs
        embeddings: The document embeddings

    Returns:
        documents: The leftover documents that were not assigned to any topic
        embeddings: The leftover embeddings that were not assigned to any topic
        assigned_documents: The documents that were assigned to a zero-shot topic,
                            with their `Topic` set to the index of that topic
        assigned_embeddings: The embeddings of the assigned documents

    Raises:
        ValueError: If `self.nr_topics` is a number that does not exceed the
                    number of matched zero-shot topics.
    """
    logger.info("Zeroshot Step 1 - Finding documents that could be assigned to either one of the zero-shot topics")

    # Similarity between document and zero-shot topic embeddings
    zeroshot_embeddings = self._extract_embeddings(self.zeroshot_topic_list)
    cosine_similarities = cosine_similarity(embeddings, zeroshot_embeddings)
    assignment = np.argmax(cosine_similarities, 1)
    assignment_vals = np.max(cosine_similarities, 1)
    assigned_ids = [index for index, value in enumerate(assignment_vals) if value >= self.zeroshot_min_similarity]
    non_assigned_ids = [
        index for index, value in enumerate(assignment_vals) if value < self.zeroshot_min_similarity
    ]

    # Assign topics. The slice is copied explicitly so that adding columns
    # below operates on an independent frame instead of a chained selection.
    assigned_documents = documents.iloc[assigned_ids].copy()
    assigned_documents["Topic"] = list(assignment[assigned_ids])
    assigned_documents["Old_ID"] = assigned_documents["ID"].copy()
    assigned_documents["ID"] = range(len(assigned_documents))
    assigned_embeddings = embeddings[assigned_ids]

    # Check that if a number of topics was specified, it exceeds the number of zeroshot topics matched
    num_zeroshot_topics = len(assigned_documents["Topic"].unique())
    if self.nr_topics != "auto" and self.nr_topics and not self.nr_topics > num_zeroshot_topics:
        raise ValueError(
            f"The set nr_topics ({self.nr_topics}) must exceed the number of matched zero-shot topics "
            f"({num_zeroshot_topics}). Consider raising nr_topics or raising the "
            f"zeroshot_min_similarity ({self.zeroshot_min_similarity})."
        )

    # Select non-assigned topics to be clustered (again on an explicit copy)
    documents = documents.iloc[non_assigned_ids].copy()
    documents["Old_ID"] = documents["ID"].copy()
    documents["ID"] = range(len(documents))
    embeddings = embeddings[non_assigned_ids]

    # Every document matched a zero-shot topic: nothing is left to cluster,
    # so the topics and the topic mapper are derived from the assignments
    if len(documents) == 0:
        self.topics_ = assigned_documents["Topic"].to_numpy().tolist()
        self.topic_mapper_ = TopicMapper(self.topics_)

    logger.info("Zeroshot Step 1 - Completed \u2713")
    return documents, embeddings, assigned_documents, assigned_embeddings
def _guided_topic_modeling(self, embeddings: np.ndarray) -> Tuple[List[int], np.array]:
    """Apply Guided Topic Modeling.

    The seeded topics are embedded with the same embedder that produced the
    document embeddings. Each document is then compared, through cosine
    similarity, against every seeded topic and against the average document
    embedding. A document receives the label of the seeded topic it is most
    similar to; if it is most similar to the average document instead, it
    gets the -1 label and is thereby not included in UMAP.

    Arguments:
        embeddings: The document embeddings. Rows belonging to a seeded topic
                    are updated in place.

    Returns:
        y: The labels for each seeded topic
        embeddings: Updated embeddings
    """
    logger.info("Guided - Find embeddings highly related to seeded topics.")

    # Embed one sentence per seeded topic and append the average document
    # embedding as the final reference vector
    seed_sentences = [" ".join(seed_words) for seed_words in self.seed_topic_list]
    nr_seeds = len(seed_sentences)
    seed_vectors = self._extract_embeddings(seed_sentences, verbose=self.verbose)
    reference_vectors = np.vstack([seed_vectors, embeddings.mean(axis=0)])

    # The closest reference vector decides the label; the last one (the
    # average document) translates to the -1 label
    similarities = cosine_similarity(embeddings, reference_vectors)
    y = [np.argmax(row) for row in similarities]
    y = [-1 if label == nr_seeds else label for label in y]

    # Pull the labelled documents towards their seeded topic to force them
    # into a cluster
    for seed_id in range(nr_seeds):
        members = [position for position, label in enumerate(y) if label == seed_id]
        embeddings[members] = 0.75 * embeddings[members] + 0.25 * reference_vectors[seed_id]

    logger.info("Guided - Completed \u2713")
    return y, embeddings
Arguments: documents: Dataframe with documents and their corresponding IDs embeddings: The document embeddings mappings: The mappings from topic to word verbose: Whether to log the process of extracting topics fine_tune_representation: If True, the topic representation will be fine-tuned using representation models. If False, the topic representation will remain as the base c-TF-IDF representation. Returns: c_tf_idf: The resulting matrix giving a value (importance score) for each word per topic """ if verbose: action = "Fine-tuning" if fine_tune_representation else "Extracting" method = "representation models" if fine_tune_representation else "c-TF-IDF for topic reduction" logger.info(f"Representation - {action} topics using {method}.") documents_per_topic = documents.groupby(["Topic"], as_index=False).agg({"Document": " ".join}) self.c_tf_idf_, words = self._c_tf_idf(documents_per_topic) self.topic_representations_ = self._extract_words_per_topic( words, documents, fine_tune_representation=fine_tune_representation, calculate_aspects=fine_tune_representation, embeddings=embeddings, ) self._create_topic_vectors(documents=documents, embeddings=embeddings, mappings=mappings) if verbose: logger.info("Representation - Completed \u2713") def _save_representative_docs(self, documents: pd.DataFrame): """Save the 3 most representative docs per topic. 
Arguments: documents: Dataframe with documents and their corresponding IDs Updates: self.representative_docs_: Populate each topic with 3 representative docs """ repr_docs, _, _, _ = self._extract_representative_docs( self.c_tf_idf_, documents, self.topic_representations_, nr_samples=500, nr_repr_docs=3, ) self.representative_docs_ = repr_docs def _extract_representative_docs( self, c_tf_idf: csr_matrix, documents: pd.DataFrame, topics: Mapping[str, List[Tuple[str, float]]], nr_samples: int = 500, nr_repr_docs: int = 5, diversity: float | None = None, ) -> Union[List[str], List[List[int]]]: """Approximate most representative documents per topic by sampling a subset of the documents in each topic and calculating which are most representative to their topic based on the cosine similarity between c-TF-IDF representations. Arguments: c_tf_idf: The topic c-TF-IDF representation documents: All input documents topics: The candidate topics as calculated with c-TF-IDF nr_samples: The number of candidate documents to extract per topic nr_repr_docs: The number of representative documents to extract per topic diversity: The diversity between the most representative documents. If None, no MMR is used. Otherwise, accepts values between 0 and 1. 
def _create_topic_vectors(
    self,
    documents: pd.DataFrame = None,
    embeddings: np.ndarray = None,
    mappings=None,
):
    """Creates embeddings per topics based on their topic representation.

    As a default, topic vectors (topic embeddings) are created by taking
    the average of all document embeddings within a topic. If topics are
    merged, then a weighted average of topic embeddings is taken based on
    the initial topic sizes.

    For the `.partial_fit` and `.update_topics` method, the average
    of all document embeddings is not taken since those are not known.
    Instead, the weighted average of the embeddings of the top n words
    is taken for each topic. The weighting is done based on the c-TF-IDF
    score. This will put more emphasis to words that represent a topic best.

    Exactly one of the three strategies below is applied, checked in this
    order; if none of their conditions hold, `self.topic_embeddings_` is
    left untouched.

    Arguments:
        documents: Dataframe with a `Topic` and an `ID` column. The `ID`
                   values are used as row indices into `embeddings`.
        embeddings: The document embeddings. Used together with `documents`
                    to average the embeddings of all documents in a topic.
        mappings: Mapping from a merged topic to a dictionary with the keys
                  "topics_from" (the topics that were merged into it) and
                  "topic_sizes" (the weights used when averaging them).

    Updates:
        self.topic_embeddings_: One embedding per topic
    """
    # Topic embeddings based on input embeddings
    if embeddings is not None and documents is not None:
        topic_embeddings = []
        # Unique topics in ascending order, so the rows of the result follow
        # the sorted topic ids
        topics = documents.sort_values("Topic").Topic.unique()
        topic_ids = documents["Topic"].to_numpy()
        doc_ids = documents["ID"].to_numpy().astype(int)
        for topic in topics:
            # Select the documents of this topic and average their embeddings
            mask = topic_ids == topic
            topic_embeddings.append(embeddings[doc_ids[mask]].mean(axis=0))
        self.topic_embeddings_ = np.array(topic_embeddings)

    # Topic embeddings when merging topics
    elif self.topic_embeddings_ is not None and mappings is not None:
        topic_embeddings_dict = {}
        for topic_to, topics_from in mappings.items():
            topic_ids = topics_from["topics_from"]
            topic_sizes = topics_from["topic_sizes"]
            if topic_ids:
                # Topic ids are shifted by `self._outliers` to obtain row positions
                # NOTE(review): presumably this offset accounts for an outlier
                # topic (-1) stored in the first row - confirm against `_outliers`
                embds = np.array(self.topic_embeddings_)[np.array(topic_ids) + self._outliers]
                # Merged embedding: average of the source embeddings weighted by topic size
                topic_embedding = np.average(embds, axis=0, weights=topic_sizes)
                topic_embeddings_dict[topic_to] = topic_embedding

        # Re-order topic embeddings
        # The last two columns of the topic mapper's mappings are read as
        # (previous topic id -> current topic id) pairs
        topics_to_map = {
            topic_mapping[0]: topic_mapping[1]
            for topic_mapping in np.array(self.topic_mapper_.mappings_)[:, -2:]
        }
        topic_embeddings = {}
        for topic, embds in topic_embeddings_dict.items():
            topic_embeddings[topics_to_map[topic]] = embds
        # Store the embeddings in ascending order of the re-mapped topic ids
        unique_topics = sorted(list(topic_embeddings.keys()))
        self.topic_embeddings_ = np.array([topic_embeddings[topic] for topic in unique_topics])

    # Topic embeddings based on keyword representations
    elif self.embedding_model is not None and type(self.embedding_model) is not BaseEmbedder:
        topic_list = list(self.topic_representations_.keys())
        topic_list.sort()

        # Only extract top n words
        # n is derived from the first topic only and capped at `self.top_n_words`
        n = len(self.topic_representations_[topic_list[0]])
        if self.top_n_words < n:
            n = self.top_n_words

        # Extract embeddings for all words in all topics
        topic_words = [self.get_topic(topic) for topic in topic_list]
        topic_words = [word[0] for topic in topic_words for word in topic]
        word_embeddings = self._extract_embeddings(topic_words, method="word", verbose=False)

        # Take the weighted average of word embeddings in a topic based on their c-TF-IDF value
        # The embeddings var is a single numpy matrix and therefore slicing is necessary to
        # access the words per topic
        # NOTE(review): the slice below assumes every topic contributed exactly
        # n words to `topic_words` and that `get_topic` returns n entries, so
        # that the weights line up with the slice - confirm against `get_topic`
        topic_embeddings = []
        for i, topic in enumerate(topic_list):
            word_importance = [val[1] for val in self.get_topic(topic)]
            # Fall back to uniform weights when all scores sum to zero
            if sum(word_importance) == 0:
                word_importance = [1 for _ in range(len(self.get_topic(topic)))]
            topic_embedding = np.average(
                word_embeddings[i * n : n + (i * n)],
                weights=word_importance,
                axis=0,
            )
            topic_embeddings.append(topic_embedding)
        self.topic_embeddings_ = np.array(topic_embeddings)
Arguments: documents_per_topic: The joined documents per topic such that each topic has a single string made out of multiple documents m: The total number of documents (unjoined) fit: Whether to fit a new vectorizer or use the fitted self.vectorizer_model partial_fit: Whether to run `partial_fit` for online learning Returns: tf_idf: The resulting matrix giving a value (importance score) for each word per topic words: The names of the words to which values were given """ documents = self._preprocess_text(documents_per_topic.Document.values) if partial_fit: X = self.vectorizer_model.partial_fit(documents).update_bow(documents) elif fit: X = self.vectorizer_model.fit_transform(documents) else: X = self.vectorizer_model.transform(documents) # Scikit-Learn Deprecation: get_feature_names is deprecated in 1.0 # and will be removed in 1.2. Please use get_feature_names_out instead. if version.parse(sklearn_version) >= version.parse("1.0.0"): words = self.vectorizer_model.get_feature_names_out() else: words = self.vectorizer_model.get_feature_names() multiplier = None if self.ctfidf_model.seed_words and self.seed_topic_list: seed_topic_list = [seed for seeds in self.seed_topic_list for seed in seeds] multiplier = np.array( [self.ctfidf_model.seed_multiplier if word in self.ctfidf_model.seed_words else 1 for word in words] ) multiplier = np.array([1.2 if word in seed_topic_list else value for value, word in zip(multiplier, words)]) elif self.ctfidf_model.seed_words: multiplier = np.array( [self.ctfidf_model.seed_multiplier if word in self.ctfidf_model.seed_words else 1 for word in words] ) elif self.seed_topic_list: seed_topic_list = [seed for seeds in self.seed_topic_list for seed in seeds] multiplier = np.array([1.2 if word in seed_topic_list else 1 for word in words]) if fit: self.ctfidf_model = self.ctfidf_model.fit(X, multiplier=multiplier) c_tf_idf = self.ctfidf_model.transform(X) return c_tf_idf, words def _update_topic_size(self, documents: pd.DataFrame): """Calculate 
the topic sizes. Arguments: documents: Updated dataframe with documents and their corresponding IDs and newly added Topics """ self.topic_sizes_ = collections.Counter(documents.Topic.to_numpy().tolist()) self.topics_ = documents.Topic.astype(int).tolist() def _extract_words_per_topic( self, words: List[str], documents: pd.DataFrame, c_tf_idf: csr_matrix = None, fine_tune_representation: bool = True, calculate_aspects: bool = True, embeddings: np.ndarray = None, ) -> Mapping[str, List[Tuple[str, float]]]: """Based on tf_idf scores per topic, extract the top n words per topic. If the top words per topic need to be extracted, then only the `words` parameter needs to be passed. If the top words per topic in a specific timestamp, then it is important to pass the timestamp-based c-TF-IDF matrix and its corresponding labels. Arguments: words: List of all words (sorted according to tf_idf matrix position) documents: DataFrame with documents and their topic IDs c_tf_idf: A c-TF-IDF matrix from which to calculate the top words fine_tune_representation: If True, the topic representation will be fine-tuned using representation models. If False, the topic representation will remain as the base c-TF-IDF representation. calculate_aspects: Whether to calculate additional topic aspects embeddings: Pre-trained document embeddings. 
These can be used instead of an embedding model Returns: topics: The top words per topic """ if c_tf_idf is None: c_tf_idf = self.c_tf_idf_ labels = sorted(list(documents.Topic.unique())) labels = [int(label) for label in labels] # Get at least the top 30 indices and values per row in a sparse c-TF-IDF matrix top_n_words = max(self.top_n_words, 30) indices = self._top_n_idx_sparse(c_tf_idf, top_n_words) scores = self._top_n_values_sparse(c_tf_idf, indices) sorted_indices = np.argsort(scores, 1) indices = np.take_along_axis(indices, sorted_indices, axis=1) scores = np.take_along_axis(scores, sorted_indices, axis=1) # Get top 30 words per topic based on c-TF-IDF score base_topics = { label: [ (words[word_index], score) if word_index is not None and score > 0 else ("", 0.00001) for word_index, score in zip(indices[index][::-1], scores[index][::-1]) ] for index, label in enumerate(labels) } # Fine-tune the topic representations topics = base_topics.copy() if not self.representation_model or not fine_tune_representation: # Default representation: c_tf_idf + top_n_words topics = {label: values[: self.top_n_words] for label, values in topics.items()} elif fine_tune_representation and isinstance(self.representation_model, list): for tuner in self.representation_model: topics = tuner.extract_topics(self, documents, c_tf_idf, topics) elif fine_tune_representation and isinstance(self.representation_model, KeyBERTInspired): topics = self.representation_model.extract_topics(self, documents, c_tf_idf, topics, embeddings) elif fine_tune_representation and isinstance(self.representation_model, BaseRepresentation): topics = self.representation_model.extract_topics(self, documents, c_tf_idf, topics) elif fine_tune_representation and isinstance(self.representation_model, dict): if self.representation_model.get("Main"): main_model = self.representation_model["Main"] if isinstance(main_model, BaseRepresentation): topics = main_model.extract_topics(self, documents, c_tf_idf, topics) 
elif isinstance(main_model, list): for tuner in main_model: topics = tuner.extract_topics(self, documents, c_tf_idf, topics) else: raise TypeError(f"unsupported type {type(main_model).__name__} for representation_model['Main']") else: # Default representation: c_tf_idf + top_n_words topics = {label: values[: self.top_n_words] for label, values in topics.items()} else: raise TypeError(f"unsupported type {type(self.representation_model).__name__} for representation_model") # Extract additional topic aspects if calculate_aspects and isinstance(self.representation_model, dict): for aspect, aspect_model in self.representation_model.items(): if aspect != "Main": aspects = base_topics.copy() if not aspect_model: # Default representation: c_tf_idf + top_n_words aspects = {label: values[: self.top_n_words] for label, values in aspects.items()} if isinstance(aspect_model, list): for tuner in aspect_model: aspects = tuner.extract_topics(self, documents, c_tf_idf, aspects) elif isinstance(aspect_model, BaseRepresentation): aspects = aspect_model.extract_topics(self, documents, c_tf_idf, aspects) else: raise TypeError( f"unsupported type {type(aspect_model).__name__} for representation_model[{aspect!r}]" ) self.topic_aspects_[aspect] = aspects return topics def _reduce_topics(self, documents: pd.DataFrame, use_ctfidf: bool = False) -> pd.DataFrame: """Reduce topics to self.nr_topics. Arguments: documents: Dataframe with documents and their corresponding IDs and Topics use_ctfidf: Whether to calculate distances between topics based on c-TF-IDF embeddings. If False, semantic embeddings are used. 
Returns: documents: Updated dataframe with documents and the reduced number of Topics """ logger.info("Topic reduction - Reducing number of topics") initial_nr_topics = len(self.get_topics()) if isinstance(self.nr_topics, int): if self.nr_topics < initial_nr_topics: documents = self._reduce_to_n_topics(documents, use_ctfidf) else: logger.info( f"Topic reduction - Number of topics ({self.nr_topics}) is equal or higher than the clustered topics({len(self.get_topics())})." ) self._extract_topics(documents, verbose=self.verbose) return documents elif isinstance(self.nr_topics, str): documents = self._auto_reduce_topics(documents, use_ctfidf) else: raise ValueError("nr_topics needs to be an int or 'auto'! ") logger.info( f"Topic reduction - Reduced number of topics from {initial_nr_topics} to {len(self.get_topic_freq())}" ) return documents def _reduce_to_n_topics(self, documents: pd.DataFrame, use_ctfidf: bool = False) -> pd.DataFrame: """Reduce topics to self.nr_topics. Arguments: documents: Dataframe with documents and their corresponding IDs and Topics use_ctfidf: Whether to calculate distances between topics based on c-TF-IDF embeddings. If False, semantic embedding are used. 
Returns: documents: Updated dataframe with documents and the reduced number of Topics """ topics = documents.Topic.tolist().copy() # Create topic distance matrix topic_embeddings = select_topic_representation( self.c_tf_idf_, self.topic_embeddings_, use_ctfidf, output_ndarray=True )[0][self._outliers :] distance_matrix = 1 - cosine_similarity(topic_embeddings) np.fill_diagonal(distance_matrix, 0) # Cluster the topic embeddings using AgglomerativeClustering if version.parse(sklearn_version) >= version.parse("1.4.0"): cluster = AgglomerativeClustering(self.nr_topics - self._outliers, metric="precomputed", linkage="average") else: cluster = AgglomerativeClustering( self.nr_topics - self._outliers, affinity="precomputed", linkage="average", ) cluster.fit(distance_matrix) new_topics = [cluster.labels_[topic] if topic != -1 else -1 for topic in topics] # Track mappings and sizes of topics for merging topic embeddings mapped_topics = {from_topic: to_topic for from_topic, to_topic in zip(topics, new_topics)} basic_mappings = defaultdict(list) for key, val in sorted(mapped_topics.items()): basic_mappings[val].append(key) mappings = { topic_to: { "topics_from": topics_from, "topic_sizes": [self.topic_sizes_[topic] for topic in topics_from], } for topic_to, topics_from in basic_mappings.items() } # Map topics documents.Topic = new_topics self._update_topic_size(documents) self.topic_mapper_.add_mappings(mapped_topics, topic_model=self) # Update representations documents = self._sort_mappings_by_frequency(documents) self._extract_topics(documents, mappings=mappings, verbose=self.verbose) self._update_topic_size(documents) return documents def _auto_reduce_topics(self, documents: pd.DataFrame, use_ctfidf: bool = False) -> pd.DataFrame: """Reduce the number of topics automatically using HDBSCAN. Arguments: documents: Dataframe with documents and their corresponding IDs and Topics use_ctfidf: Whether to calculate distances between topics based on c-TF-IDF embeddings. 
If False, the embeddings from the embedding model are used. Returns: documents: Updated dataframe with documents and the reduced number of Topics """ topics = documents.Topic.tolist().copy() unique_topics = sorted(list(documents.Topic.unique()))[self._outliers :] # Find similar topics embeddings = select_topic_representation( self.c_tf_idf_, self.topic_embeddings_, use_ctfidf, output_ndarray=True )[0] norm_data = normalize(embeddings, norm="l2") if HAS_HDBSCAN: predictions = HDBSCAN( min_cluster_size=2, metric="euclidean", cluster_selection_method="eom", prediction_data=True, ).fit_predict(norm_data[self._outliers :]) else: predictions = SK_HDBSCAN( min_cluster_size=2, metric="euclidean", cluster_selection_method="eom", n_jobs=-1 ).fit_predict(norm_data[self._outliers :]) # Map clusters to their lowest topic_id cluster_to_lowest = {} for cluster, topic_id in zip(predictions, unique_topics): if cluster != -1: # Ignore unclustered items if cluster not in cluster_to_lowest: cluster_to_lowest[cluster] = topic_id else: cluster_to_lowest[cluster] = min(cluster_to_lowest[cluster], topic_id) # Map each topic_id to the lowest topic_id in its cluster mapped_topics = {} for cluster, topic_id in zip(predictions, unique_topics): if cluster == -1: mapped_topics[topic_id] = topic_id # No clustering, stays the same else: mapped_topics[topic_id] = cluster_to_lowest[cluster] documents.Topic = documents.Topic.map(mapped_topics).fillna(documents.Topic).astype(int) mapped_topics = {from_topic: to_topic for from_topic, to_topic in zip(topics, documents.Topic.tolist())} # Track mappings and sizes of topics for merging topic embeddings mappings = defaultdict(list) for key, val in sorted(mapped_topics.items()): mappings[val].append(key) mappings = { topic_to: { "topics_from": topics_from, "topic_sizes": [self.topic_sizes_[topic] for topic in topics_from], } for topic_to, topics_from in mappings.items() } # Update documents and topics self.topic_mapper_.add_mappings(mapped_topics, 
def _sort_mappings_by_frequency(self, documents: pd.DataFrame) -> pd.DataFrame:
    """Reorder mappings by their frequency.

    For example, if topic 88 was mapped to topic
    5 and topic 5 turns out to be the largest topic,
    then topic 5 will be topic 0. The second largest
    will be topic 1, etc.

    If there are no mappings since no reduction of topics
    took place, then the topics will simply be ordered
    by their frequency and will get the topic ids based
    on that order.

    This means that -1 will remain the outlier class, and
    that the rest of the topics will be in descending order
    of ids and frequency.

    Arguments:
        documents: Dataframe with documents and their corresponding IDs and Topics

    Returns:
        documents: Updated dataframe with documents and the mapped
                   and re-ordered topic ids
    """
    # No need to sort if it's the first pass of zero-shot topic modeling.
    # `_is_zeroshot` is a method: it has to be *called*, otherwise the bound
    # method object is always truthy and the check is never really performed.
    nr_zeroshot = len(self._topic_id_to_zeroshot_topic_idx)
    if self._is_zeroshot() and not self.nr_topics and nr_zeroshot > 0:
        return documents

    # Map topics based on frequency; the outlier topic always stays -1
    self._update_topic_size(documents)
    df = pd.DataFrame(self.topic_sizes_.items(), columns=["Old_Topic", "Size"]).sort_values("Size", ascending=False)
    df = df[df.Old_Topic != -1]
    sorted_topics = {**{-1: -1}, **dict(zip(df.Old_Topic, range(len(df))))}
    self.topic_mapper_.add_mappings(sorted_topics, topic_model=self)

    # Map documents; topics without a mapping keep their current id
    documents.Topic = documents.Topic.map(sorted_topics).fillna(documents.Topic).astype(int)
    self._update_topic_size(documents)
    return documents
def _preprocess_text(self, documents: np.ndarray) -> List[str]:
    r"""Apply light clean-up to every document.

    Steps:
        * Replace \n and \t with whitespace
        * For English models, drop every character that is not a letter,
          a digit or a space
        * Substitute documents that end up empty with the token "emptydoc"
    """
    strip_symbols = self.language == "english"

    cleaned_documents = []
    for document in documents:
        text = document.replace("\n", " ").replace("\t", " ")
        if strip_symbols:
            text = re.sub(r"[^A-Za-z0-9 ]+", "", text)
        if text == "":
            text = "emptydoc"
        cleaned_documents.append(text)
    return cleaned_documents
def __str__(self):
    """Get a string representation of the current object.

    Returns:
        str: Human readable representation of the model parameters. Values
             whose text looks like a call, e.g. `UMAP(n_neighbors=15)`, are
             shortened to `UMAP(...)`; values that start with a parenthesis,
             such as tuples, are kept as they are.
    """
    rendered = []
    for name, raw_value in self.get_params().items():
        text = str(raw_value)
        if "(" in text and not text.startswith("("):
            text = text.split("(")[0] + "(...)"
        rendered.append(f"{name}={text}")
    joined = ", ".join(rendered)
    return f"BERTopic({joined})"
Returns: mappings: The mappings from old topics to new topics Examples: To get mappings, simply call: ```python mapper = TopicMapper(topics) mappings = mapper.get_mappings(original_topics=False) ``` """ if original_topics: mappings = np.array(self.mappings_)[:, [0, -1]] mappings = dict(zip(mappings[:, 0], mappings[:, 1])) else: mappings = np.array(self.mappings_)[:, [-3, -1]] mappings = dict(zip(mappings[:, 0], mappings[:, 1])) return mappings def add_mappings(self, mappings: Mapping[int, int], topic_model: BERTopic): """Add new column(s) of topic mappings. Arguments: mappings: The mappings to add topic_model: The topic model this TopicMapper belongs to """ for topics in self.mappings_: topic = topics[-1] if topic in mappings: topics.append(mappings[topic]) else: topics.append(-1) # When zero-shot topic(s) are present in the topics to merge, # determine whether to take one of the zero-shot topic labels # or use a calculated representation. if topic_model._is_zeroshot() and len(topic_model._topic_id_to_zeroshot_topic_idx) > 0: new_topic_id_to_zeroshot_topic_idx = {} topics_to_map = { topic_mapping[0]: topic_mapping[1] for topic_mapping in np.array(topic_model.topic_mapper_.mappings_)[:, -2:] } # Map topic_to to topics_from mapping = defaultdict(list) for key, value in topics_to_map.items(): mapping[value].append(key) for topic_to, topics_from in mapping.items(): # which of the original topics are zero-shot zeroshot_topic_ids = [ topic_id for topic_id in topics_from if topic_id in topic_model._topic_id_to_zeroshot_topic_idx ] if len(zeroshot_topic_ids) == 0: continue # If any of the original topics are zero-shot, take the best fitting zero-shot label # if the cosine similarity with the new topic exceeds the zero-shot threshold zeroshot_labels = [ topic_model.zeroshot_topic_list[topic_model._topic_id_to_zeroshot_topic_idx[topic_id]] for topic_id in zeroshot_topic_ids ] zeroshot_embeddings = topic_model._extract_embeddings(zeroshot_labels) cosine_similarities = 
def _create_model_from_files(
    topics: Mapping[str, Any],
    params: Mapping[str, Any],
    tensors: Mapping[str, np.array],
    ctfidf_tensors: Mapping[str, Any] | None = None,
    ctfidf_config: Mapping[str, Any] | None = None,
    images: Mapping[int, Any] | None = None,
    warn_no_backend: bool = True,
):
    """Create a BERTopic model from a variety of inputs.

    Arguments:
        topics: A dictionary containing topic metadata, including:
                   - Topic representations, labels, sizes, custom labels, etc.
        params: BERTopic-specific hyperparams, including HF embedding_model ID
                if given. The mapping itself is left untouched.
        tensors: The topic embeddings
        ctfidf_tensors: The c-TF-IDF representations
        ctfidf_config: The config for CountVectorizer and c-TF-IDF
        images: The images per topic
        warn_no_backend: Whether to warn the user if no backend is given

    Returns:
        topic_model: A BERTopic instance with the stored topic information
                     attached and placeholder dimensionality reduction and
                     cluster models.
    """
    # Work on a shallow copy so that the caller's mapping is not modified
    params = dict(params)
    params["n_gram_range"] = tuple(params["n_gram_range"])

    # The embedding model is passed to BERTopic explicitly below, so it must
    # never stay in `params` (not even as `None`), or it would be given twice
    embedding_model_id = params.pop("embedding_model", None)

    if ctfidf_config is not None:
        ngram_range = ctfidf_config["vectorizer_model"]["params"]["ngram_range"]
        ctfidf_config["vectorizer_model"]["params"]["ngram_range"] = tuple(ngram_range)

    # Select HF model through SentenceTransformers
    embedding_model = None
    if embedding_model_id is not None:
        try:
            from sentence_transformers import SentenceTransformer

            embedding_model = select_backend(SentenceTransformer(embedding_model_id))
        except Exception:
            # Best effort: a missing package or an unloadable model falls back
            # to the empty backend below
            embedding_model = None

    if embedding_model is None:
        embedding_model = BaseEmbedder()

        if warn_no_backend:
            logger.warning(
                "You are loading a BERTopic model without explicitly defining an embedding model."
                " If you want to also load in an embedding model, make sure to use"
                " `BERTopic.load(my_model, embedding_model=my_embedding_model)`."
            )

    # Prepare our empty sub-models
    empty_dimensionality_model = BaseDimensionalityReduction()
    empty_cluster_model = BaseCluster()

    # Fit BERTopic without actually performing any clustering
    topic_model = BERTopic(
        embedding_model=embedding_model,
        umap_model=empty_dimensionality_model,
        hdbscan_model=empty_cluster_model,
        **params,
    )
    topic_model.topic_embeddings_ = tensors["topic_embeddings"]
    # JSON stores dictionary keys as strings; topic ids are integers
    topic_model.topic_representations_ = {int(key): val for key, val in topics["topic_representations"].items()}
    topic_model.topics_ = topics["topics"]
    topic_model.topic_sizes_ = {int(key): val for key, val in topics["topic_sizes"].items()}
    topic_model.custom_labels_ = topics["custom_labels"]

    if topics.get("topic_aspects"):
        topic_aspects = {}
        for aspect, values in topics["topic_aspects"].items():
            # The visual aspect is restored from `images` instead
            if aspect != "Visual_Aspect":
                topic_aspects[aspect] = {int(topic): value for topic, value in values.items()}
        topic_model.topic_aspects_ = topic_aspects

        if images is not None:
            topic_model.topic_aspects_["Visual_Aspect"] = images

    # Topic Mapper
    topic_model.topic_mapper_ = TopicMapper([0])
    topic_model.topic_mapper_.mappings_ = topics["topic_mapper"]

    if ctfidf_tensors is not None:
        topic_model.c_tf_idf_ = csr_matrix(
            (
                ctfidf_tensors["data"],
                ctfidf_tensors["indices"],
                ctfidf_tensors["indptr"],
            ),
            shape=ctfidf_tensors["shape"],
        )

        # CountVectorizer
        topic_model.vectorizer_model = CountVectorizer(**ctfidf_config["vectorizer_model"]["params"])
        topic_model.vectorizer_model.vocabulary_ = ctfidf_config["vectorizer_model"]["vocab"]

        # ClassTfidfTransformer
        topic_model.ctfidf_model.reduce_frequent_words = ctfidf_config["ctfidf_model"]["reduce_frequent_words"]
        topic_model.ctfidf_model.bm25_weighting = ctfidf_config["ctfidf_model"]["bm25_weighting"]
        idf = ctfidf_tensors["diag"]
        topic_model.ctfidf_model._idf_diag = sp.diags(
            idf, offsets=0, shape=(len(idf), len(idf)), format="csr", dtype=np.float64
        )
    return topic_model
================================================ FILE: bertopic/_save_utils.py ================================================ import os import json import numpy as np from pathlib import Path from tempfile import TemporaryDirectory # HuggingFace Hub try: from huggingface_hub import ( create_repo, get_hf_file_metadata, hf_hub_download, hf_hub_url, repo_type_and_id_from_hf_id, upload_folder, ) _has_hf_hub = True except ImportError: _has_hf_hub = False # Typing from typing import Union # Pytorch check try: import torch _has_torch = True except ImportError: _has_torch = False # Image check try: from PIL import Image _has_vision = True except ImportError: _has_vision = False TOPICS_NAME = "topics.json" CONFIG_NAME = "config.json" HF_WEIGHTS_NAME = "topic_embeddings.bin" # default pytorch pkl HF_SAFE_WEIGHTS_NAME = "topic_embeddings.safetensors" # safetensors version CTFIDF_WEIGHTS_NAME = "ctfidf.bin" # default pytorch pkl CTFIDF_SAFE_WEIGHTS_NAME = "ctfidf.safetensors" # safetensors version CTFIDF_CFG_NAME = "ctfidf_config.json" MODEL_CARD_TEMPLATE = """ --- tags: - bertopic library_name: bertopic pipeline_tag: {PIPELINE_TAG} --- # {MODEL_NAME} This is a [BERTopic](https://github.com/MaartenGr/BERTopic) model. BERTopic is a flexible and modular topic modeling framework that allows for the generation of easily interpretable topics from large datasets. ## Usage To use this model, please install BERTopic: ``` pip install -U bertopic ``` You can use the model as follows: ```python from bertopic import BERTopic topic_model = BERTopic.load("{PATH}") topic_model.get_topic_info() ``` ## Topic overview * Number of topics: {NR_TOPICS} * Number of training documents: {NR_DOCUMENTS}
Click here for an overview of all topics. {TOPICS}
## Training hyperparameters {HYPERPARAMS} ## Framework versions {FRAMEWORKS} """ def push_to_hf_hub( model, repo_id: str, commit_message: str = "Add BERTopic model", token: str | None = None, revision: str | None = None, private: bool = False, create_pr: bool = False, model_card: bool = True, serialization: str = "safetensors", save_embedding_model: Union[str, bool] = True, save_ctfidf: bool = False, ): """Push your BERTopic model to a HuggingFace Hub. Arguments: model: The BERTopic model to push repo_id: The name of your HuggingFace repository commit_message: A commit message token: Token to add if not already logged in revision: Repository revision private: Whether to create a private repository create_pr: Whether to upload the model as a Pull Request model_card: Whether to automatically create a modelcard serialization: The type of serialization. Either `safetensors` or `pytorch` save_embedding_model: A pointer towards a HuggingFace model to be loaded in with SentenceTransformers. E.g., `sentence-transformers/all-MiniLM-L6-v2` save_ctfidf: Whether to save c-TF-IDF information """ if not _has_hf_hub: raise ValueError("Make sure you have the huggingface hub installed via `pip install --upgrade huggingface_hub`") # Create repo if it doesn't exist yet and infer complete repo_id repo_url = create_repo(repo_id, token=token, private=private, exist_ok=True) _, repo_owner, repo_name = repo_type_and_id_from_hf_id(repo_url) repo_id = f"{repo_owner}/{repo_name}" # Temporarily save model and push to HF with TemporaryDirectory() as tmpdir: # Save model weights and config. 
def _load_local_tensors(path, safetensors_name, pytorch_name):
    """Load a tensor file from `path`, preferring safetensors over pytorch; return None if neither exists."""
    safetensor_path = path / safetensors_name
    if safetensor_path.is_file():
        return load_safetensors(safetensor_path)

    torch_path = path / pytorch_name
    if torch_path.is_file():
        if not _has_torch:
            raise ValueError("`pip install torch` to load .bin")
        tensors = torch.load(torch_path, map_location="cpu")
        return {k: v.numpy() for k, v in tensors.items()}
    return None


def load_local_files(path):
    """Load local BERTopic files.

    Arguments:
        path: Directory (a `pathlib.Path`-like object supporting `/`) that
              holds the saved model files.

    Returns:
        topics, params, tensors, ctfidf_tensors, ctfidf_config, images:
        The c-TF-IDF entries are None when they cannot be loaded and `images`
        is None when no images were saved or Pillow is not installed.

    Raises:
        FileNotFoundError: If no topic embeddings file exists in `path`.
        ValueError: If only pytorch weights exist but torch is not installed.
    """
    # Load json configs
    topics = load_cfg_from_json(path / TOPICS_NAME)
    params = load_cfg_from_json(path / CONFIG_NAME)

    # Load Topic Embeddings
    tensors = _load_local_tensors(path, HF_SAFE_WEIGHTS_NAME, HF_WEIGHTS_NAME)
    if tensors is None:
        raise FileNotFoundError(
            f"Could not find topic embeddings ({HF_SAFE_WEIGHTS_NAME} or {HF_WEIGHTS_NAME}) in {path}"
        )

    # c-TF-IDF is optional: any problem while loading it disables it entirely
    try:
        ctfidf_tensors = _load_local_tensors(path, CTFIDF_SAFE_WEIGHTS_NAME, CTFIDF_WEIGHTS_NAME)
        ctfidf_config = load_cfg_from_json(path / CTFIDF_CFG_NAME)
    except Exception:
        ctfidf_config, ctfidf_tensors = None, None

    # Load images
    images = None
    if _has_vision:
        # Probe for the first image; close the probe handle right away
        try:
            with Image.open(path / "images/0.jpg"):
                _has_images = True
        except Exception:
            _has_images = False

        if _has_images:
            topic_list = list(topics["topic_representations"].keys())
            images = {}
            for topic in topic_list:
                image = Image.open(path / f"images/{topic}.jpg")
                images[int(topic)] = image

    return topics, params, tensors, ctfidf_tensors, ctfidf_config, images
def generate_readme(model, repo_id: str):
    """Generate README for HuggingFace model card.

    Arguments:
        model: The BERTopic model to describe
        repo_id: The HuggingFace repository id; its last path component is
                 used as the model name

    Returns:
        model_card: The model card template with all placeholders filled in
    """
    model_card = MODEL_CARD_TEMPLATE
    topic_table_head = "| Topic ID | Topic Keywords | Topic Frequency | Label | \n|----------|----------------|-----------------|-------| \n"

    # Get Statistics
    model_name = repo_id.split("/")[-1]
    params = {param: value for param, value in model.get_params().items() if "model" not in param}
    params = "\n".join([f"* {param}: {value}" for param, value in params.items()])
    topics = sorted(set(model.topics_))
    nr_topics = str(len(topics))

    if model.topic_sizes_ is not None:
        nr_documents = str(sum(model.topic_sizes_.values()))
    else:
        nr_documents = ""

    # Topic information
    topic_keywords = [" - ".join(next(zip(*model.get_topic(topic)))[:5]) for topic in topics]
    topic_freq = [model.get_topic_freq(topic) for topic in topics]
    topic_labels = model.custom_labels_ if model.custom_labels_ else [model.topic_labels_[topic] for topic in topics]
    # All three lists are positional, so they are indexed by position and not
    # by topic id (the two differ as soon as the outlier topic -1 exists)
    topics = [
        f"| {topic} | {topic_keywords[index]} | {topic_freq[index]} | {topic_labels[index]} | \n"
        for index, topic in enumerate(topics)
    ]
    topics = topic_table_head + "".join(topics)

    # `get_package_versions` hands back the exception object when it fails;
    # leave the section empty in that case rather than crashing
    versions = get_package_versions()
    if not isinstance(versions, dict):
        versions = {}
    frameworks = "\n".join([f"* {param}: {value}" for param, value in versions.items()])

    # Fill Statistics into model card
    model_card = model_card.replace("{MODEL_NAME}", model_name)
    model_card = model_card.replace("{PATH}", repo_id)
    model_card = model_card.replace("{NR_TOPICS}", nr_topics)
    model_card = model_card.replace("{TOPICS}", topics.strip())
    model_card = model_card.replace("{NR_DOCUMENTS}", nr_documents)
    model_card = model_card.replace("{HYPERPARAMS}", params)
    model_card = model_card.replace("{FRAMEWORKS}", frameworks)

    # Fill Pipeline tag
    has_visual_aspect = check_has_visual_aspect(model)
    if not has_visual_aspect:
        model_card = model_card.replace("{PIPELINE_TAG}", "text-classification")
    else:
        model_card = model_card.replace("pipeline_tag: {PIPELINE_TAG}\n", "")  # TODO add proper tag for this instance

    return model_card
def save_config(model, path: str, embedding_model):
    """Write the BERTopic hyperparameters to a JSON file.

    Arguments:
        model: The BERTopic model whose `get_params()` are saved
        path: Location of the JSON file to write
        embedding_model: Stored under "embedding_model" only when it is a
                         string (a model tag for sentence-transformers)

    Returns:
        config: The dictionary that was written to disk
    """
    target = Path(path)

    # Sub-models cannot be serialized, so every "*model*" parameter is left out
    config = {}
    for name, value in model.get_params().items():
        if "model" in name:
            continue
        config[name] = value

    if isinstance(embedding_model, str):
        config["embedding_model"] = embedding_model

    with target.open("w") as handle:
        json.dump(config, handle, indent=2)
    return config
class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy values to built-in Python types.

    NumPy integers, floats, booleans and arrays become `int`, `float`, `bool`
    and (nested) lists respectively. Everything else is deferred to the base
    class, which raises `TypeError` for unsupported objects.
    """

    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.ndarray):
            # Elements come back as Python scalars, so no further hook is needed
            return obj.tolist()
        return super().default(obj)
def load_safetensors(path):
    """Load safetensors and check whether it is installed.

    Arguments:
        path: Location of the `.safetensors` file

    Returns:
        The result of `safetensors.numpy.load_file(path)`

    Raises:
        ValueError: If the `safetensors` package is not installed.
    """
    # Only the import is guarded, so errors raised while reading the file are
    # never mistaken for a missing package
    try:
        import safetensors.numpy
    except ImportError as err:
        raise ValueError("`pip install safetensors` to load .safetensors") from err

    return safetensors.numpy.load_file(path)
def check_documents_type(documents):
    """Validate the type of the `documents` input.

    Raises:
        TypeError: If `documents` is a dataframe, a plain string, not an
                   iterable at all, or an iterable in which not a single
                   element is a string.
    """
    if isinstance(documents, pd.DataFrame):
        raise TypeError("Make sure to supply a list of strings, not a dataframe.")

    if isinstance(documents, str) or not isinstance(documents, Iterable):
        raise TypeError("Make sure that the documents variable is an iterable containing strings only.")

    # The whole iterable is inspected; one string element is enough to pass
    is_string = [isinstance(doc, str) for doc in documents]
    if True not in is_string:
        raise TypeError("Make sure that the iterable only contains strings.")
def validate_distance_matrix(X, n_samples):
    """Validate a distance matrix and return it as a condensed distance matrix.

    A valid distance matrix is either a square matrix of shape (n_samples, n_samples)
    with non-negative values, or a condensed distance matrix of shape
    (n_samples * (n_samples - 1) / 2,) containing the upper triangular of the
    distance matrix. A square matrix gets its diagonal forced to zero and is then
    converted to the condensed form; this happens on a copy, so the array passed
    in by the caller is never modified.

    Arguments:
        X: Distance matrix to validate (anything exposing `.shape`, e.g. a numpy array).
        n_samples: Number of samples in the dataset.

    Returns:
        X: The validated distance matrix in condensed (1-D) form.

    Raises:
        ValueError: If the matrix has the wrong number of dimensions, the wrong size
                    for `n_samples`, or contains negative values. The square-to-condensed
                    conversion is done by `scipy.spatial.distance.squareform`, which
                    applies its own checks and may raise as well.
    """
    shape = X.shape
    if len(shape) == 1:
        # Condensed form: must hold exactly one entry per unordered pair of samples
        if shape[0] != (n_samples * (n_samples - 1) / 2):
            raise ValueError("The condensed distance matrix must have shape (n*(n-1)/2,).")
    elif len(shape) == 2:
        # Square form: must be n_samples x n_samples
        if (shape[0] != n_samples) or (shape[1] != n_samples):
            raise ValueError("The distance matrix must be of shape (n, n) where n is the number of samples.")
        # Force a zero diagonal on a copy (the caller's matrix stays untouched)
        # and convert to the condensed form.
        X = X.copy()
        np.fill_diagonal(X, 0)
        X = squareform(X)
    else:
        raise ValueError(
            "The distance matrix must be either a 1-D condensed "
            "distance matrix of shape (n*(n-1)/2,) or a "
            "2-D square distance matrix of shape (n, n), "
            "where n is the number of documents. "
            f"Got a distance matrix of shape {shape!s}"
        )

    # Make sure its entries are non-negative
    if np.any(X < 0):
        raise ValueError("Distance matrix cannot contain negative values.")

    return X
class MockPlotlyModule:
    """Stand-in for the plotly module whose attributes fail only when they are called."""

    def __getattr__(self, name: str) -> Any:
        """Return a callable for any missing attribute; calling it raises ImportError.

        Arguments:
            name: The attribute that was looked up on the mock module.

        Returns:
            A function accepting any arguments that raises ImportError mentioning `name`.
        """
        # Attribute access itself succeeds; the error is deferred until the
        # returned function is actually invoked.
        message = f"Plotly is required to use '{name}'. Install it with uv pip install plotly"

        def mock_function(*args, **kwargs):
            raise ImportError(message)

        return mock_function
def embed_words(self, words: List[str], verbose: bool = False) -> np.ndarray:
    """Embed a list of n words by forwarding them to `embed`.

    Arguments:
        words: A list of words to be embedded
        verbose: Controls the verbosity of the process

    Returns:
        Whatever `self.embed` returns for `words`; per the `embed` contract these are
        word embeddings with shape (n, m) with `n` words that each have an
        embeddings size of `m`
    """
    # `verbose` is forwarded positionally, exactly as the second argument of `embed`
    word_embeddings = self.embed(words, verbose)
    return word_embeddings
class CohereBackend(BaseEmbedder):
    """Cohere Embedding Model.

    Arguments:
        client: A `cohere` client.
        embedding_model: A Cohere model. Default is "large".
                         For an overview of models see:
                         https://docs.cohere.ai/docs/generation-card
        delay_in_seconds: If a `batch_size` is given, use this set
                          the delay in seconds between batches.
        batch_size: The size of each batch.
        embed_kwargs: Kwargs passed to `cohere.Client.embed`.
                      Can be used to define additional parameters
                      such as `input_type`. A "model" entry takes
                      precedence over `embedding_model`. The mapping
                      is copied, so the object passed in is never modified.

    Examples:
    ```python
    import cohere
    from bertopic.backend import CohereBackend

    client = cohere.Client("APIKEY")
    cohere_model = CohereBackend(client)
    ```

    If you want to specify `input_type`:

    ```python
    cohere_model = CohereBackend(
        client,
        embedding_model="embed-english-v3.0",
        embed_kwargs={"input_type": "clustering"}
    )
    ```
    """

    def __init__(
        self,
        client,
        embedding_model: str = "large",
        delay_in_seconds: float | None = None,
        batch_size: int | None = None,
        embed_kwargs: Mapping[str, Any] | None = None,
    ):
        super().__init__()
        self.client = client
        self.embedding_model = embedding_model
        self.delay_in_seconds = delay_in_seconds
        self.batch_size = batch_size

        # Keep a private copy: "model" is written into these kwargs below, and
        # writing into a shared default (or into the caller's mapping) would leak
        # the model of one backend instance into the next one.
        self.embed_kwargs = dict(embed_kwargs) if embed_kwargs else {}

        # An explicit "model" kwarg wins; otherwise `embedding_model` is sent as the model
        if self.embed_kwargs.get("model"):
            self.embedding_model = self.embed_kwargs["model"]
        else:
            self.embed_kwargs["model"] = self.embedding_model

    def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
        """Embed a list of n documents/words into an n-dimensional
        matrix of embeddings.

        Arguments:
            documents: A list of documents or words to be embedded
            verbose: Controls the verbosity of the process

        Returns:
            Document/words embeddings with shape (n, m) with `n` documents/words
            that each have an embeddings size of `m`
        """
        # Batch-wise embedding extraction
        if self.batch_size is not None:
            embeddings = []
            for batch in tqdm(self._chunks(documents), disable=not verbose):
                response = self.client.embed(texts=batch, **self.embed_kwargs)
                embeddings.extend(response.embeddings)

                # Delay subsequent calls
                if self.delay_in_seconds:
                    time.sleep(self.delay_in_seconds)

        # Extract embeddings all at once
        else:
            response = self.client.embed(texts=documents, **self.embed_kwargs)
            embeddings = response.embeddings
        return np.array(embeddings)

    def _chunks(self, documents):
        """Yield consecutive slices of `documents` holding at most `batch_size` items."""
        for i in range(0, len(documents), self.batch_size):
            yield documents[i : i + self.batch_size]
def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
    """Embed a list of n documents/words into an n-dimensional
    matrix of embeddings.

    Arguments:
        documents: A list of documents or words to be embedded
        verbose: Controls the verbosity of the process; forwarded to the
                 model as `show_progress_bar`

    Returns:
        Document/words embeddings with shape (n, m) with `n` documents/words
        that each have an embeddings size of `m`
    """
    # The model's result is consumed as an iterable of per-document vectors
    # and stacked into a single array.
    vectors = self.embedding_model.embed(documents, show_progress_bar=verbose)
    return np.array(list(vectors))
def __init__(self, embedding_model: Union[TokenEmbeddings, DocumentEmbeddings]):
    """Store a Flair model, wrapping token embeddings so that they yield document embeddings.

    Arguments:
        embedding_model: A Flair token or document embedding model

    Raises:
        ValueError: If `embedding_model` is neither a `TokenEmbeddings` nor a
                    `DocumentEmbeddings` instance.
    """
    super().__init__()

    # Flair word embeddings: pool them into document embeddings
    if isinstance(embedding_model, TokenEmbeddings):
        self.embedding_model = DocumentPoolEmbeddings([embedding_model])
        return

    if not isinstance(embedding_model, DocumentEmbeddings):
        raise ValueError(
            "Please select a correct Flair model by either using preparing a token or document "
            "embedding model: \n"
            "`from flair.embeddings import TransformerDocumentEmbeddings` \n"
            "`roberta = TransformerDocumentEmbeddings('roberta-base')`"
        )

    # Flair document embeddings + disable fine tune to prevent CUDA OOM
    # https://github.com/flairNLP/flair/issues/1719
    if "fine_tune" in embedding_model.__dict__:
        embedding_model.fine_tune = False
    self.embedding_model = embedding_model
class GensimBackend(BaseEmbedder):
    """Gensim Embedding Model.

    The Gensim embedding model is typically used for word embeddings with
    GloVe, Word2Vec or FastText.

    Arguments:
        embedding_model: A Gensim embedding model

    Examples:
    ```python
    from bertopic.backend import GensimBackend
    import gensim.downloader as api

    ft = api.load('fasttext-wiki-news-subwords-300')
    ft_embedder = GensimBackend(ft)
    ```
    """

    def __init__(self, embedding_model: Word2VecKeyedVectors):
        super().__init__()

        # Only keyed-vector models are accepted
        if not isinstance(embedding_model, Word2VecKeyedVectors):
            raise ValueError(
                "Please select a correct Gensim model: \n"
                "`import gensim.downloader as api` \n"
                "`ft = api.load('fasttext-wiki-news-subwords-300')`"
            )
        self.embedding_model = embedding_model

    def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
        """Embed a list of n documents/words into an n-dimensional
        matrix of embeddings.

        Each document is split on whitespace and represented by the mean of the
        vectors of its in-vocabulary words; a document without any known word
        is represented by a zero vector.

        Arguments:
            documents: A list of documents or words to be embedded
            verbose: Controls the verbosity of the process

        Returns:
            Document/words embeddings with shape (n, m) with `n` documents/words
            that each have an embeddings size of `m`
        """
        model = self.embedding_model

        # The vector size is read off the first key of the model
        first_key = next(iter(model.index_to_key))
        empty_vector = np.zeros(model.get_vector(first_key).shape[0])

        # Extract word embeddings and pool to document-level
        pooled = []
        for doc in tqdm(documents, disable=not verbose, position=0, leave=True):
            word_vectors = [model.get_vector(word) for word in doc.split() if word in model.key_to_index]
            pooled.append(np.mean(word_vectors, axis=0) if word_vectors else empty_vector)

        return np.array(pooled)
def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
    """Embed a list of n documents/words into an n-dimensional
    matrix of embeddings.

    The documents are run through the pipeline (with truncation and padding
    enabled) and the features of every document are pooled by `_embed`.

    Arguments:
        documents: A list of documents or words to be embedded
        verbose: Controls the verbosity of the process

    Returns:
        Document/words embeddings with shape (n, m) with `n` documents/words
        that each have an embeddings size of `m`
    """
    dataset = MyDataset(documents)
    token_features = self.embedding_model(dataset, truncation=True, padding=True)

    # Pair every document with its features so that `_embed` can pool them
    progress = tqdm(zip(documents, token_features), total=len(dataset), disable=not verbose)
    pooled = [self._embed(document, features) for document, features in progress]
    return np.array(pooled)
class LangChainBackend(BaseEmbedder):
    """LangChain Embedding Model.

    This class uses the LangChain Embedding class to embed the documents.

    Arguments:
        embedding_model: A LangChain Embedding Instance.

    Examples:
    ```python
    from langchain_community.embeddings import HuggingFaceInstructEmbeddings
    from bertopic.backend import LangChainBackend

    hf_embedding = HuggingFaceInstructEmbeddings()
    langchain_embedder = LangChainBackend(hf_embedding)
    ```
    """

    def __init__(self, embedding_model: Embeddings):
        # Run the base initialiser so this backend carries the same attributes as the
        # base class defines; previously it was skipped here and the instance ended
        # up without the base attributes.
        super().__init__()
        self.embedding_model = embedding_model

    def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
        """Embed a list of n documents/words into an n-dimensional
        matrix of embeddings.

        Arguments:
            documents: A list of documents or words to be embedded
            verbose: Controls the verbosity of the process (not used by this backend)

        Returns:
            Document/words embeddings with shape (n, m) with `n` documents/words
            that each have an embeddings size of `m`
        """
        # Prepare documents, replacing empty strings with a single space
        prepared_documents = [" " if doc == "" else doc for doc in documents]
        response = self.embedding_model.embed_documents(prepared_documents)
        return np.array(response)
def __init__(
    self,
    embedding_model: Union[str, StaticModel],
    distill: bool = False,
    distill_kwargs: dict | None = None,
    distill_vectorizer: CountVectorizer | None = None,
):
    """Set up the backend, either for direct use or for distillation at embed time.

    Arguments:
        embedding_model: Either a model2vec model or a string pointing to a model2vec model.
                         When distilling, this must be a string.
        distill: Indicates whether to distill a sentence-transformers compatible model.
        distill_kwargs: Keyword arguments for the distillation. The dictionary is copied,
                        so the object passed in is never modified.
        distill_vectorizer: A CountVectorizer used for creating a custom vocabulary.
                            A default `CountVectorizer()` is created when distilling
                            without one.

    Raises:
        ValueError: If `embedding_model` has an unsupported type, or is not a string
                    while `distill` is True.
    """
    super().__init__()

    self.distill = distill
    # Keep a private copy: a "vocabulary" entry may be written into these kwargs after
    # construction, and with a shared default dictionary that entry would leak from
    # one backend instance into every instance created afterwards.
    self.distill_kwargs = dict(distill_kwargs) if distill_kwargs else {}
    self.distill_vectorizer = distill_vectorizer
    self._has_distilled = False

    # When we distill, we need a string pointing to a sentence-transformer model
    if self.distill:
        self._check_model2vec_installation()
        if not self.distill_vectorizer:
            self.distill_vectorizer = CountVectorizer()
        if isinstance(embedding_model, str):
            self.embedding_model = embedding_model
        else:
            raise ValueError("Please pass a string pointing to a sentence-transformer model when distilling.")

    # If we don't distill, we can pass a model2vec model directly or load from a string
    elif isinstance(embedding_model, StaticModel):
        self.embedding_model = embedding_model
    elif isinstance(embedding_model, str):
        self.embedding_model = StaticModel.from_pretrained(embedding_model)
    else:
        raise ValueError(
            "Please select a correct Model2Vec model: \n"
            "`from model2vec import StaticModel` \n"
            "`model = StaticModel.from_pretrained('minishlab/potion-base-8M')`"
        )
class MultiModalBackend(BaseEmbedder):
    """Multimodal backend using Sentence-transformers.

    The sentence-transformers embedding model used for
    generating word, document, and image embeddings.

    Arguments:
        embedding_model: A sentence-transformers embedding model that
                         can either embed both images and text or only text.
                         If it only embeds text, then `image_model` needs
                         to be used to embed the images.
        image_model: A sentence-transformers embedding model that is used
                     to embed only images.
        batch_size: The sizes of image batches to pass

    Examples:
    To create a model, you can load in a string pointing to a
    sentence-transformers model:

    ```python
    from bertopic.backend import MultiModalBackend

    sentence_model = MultiModalBackend("clip-ViT-B-32")
    ```

    or you can instantiate a model yourself:
    ```python
    from bertopic.backend import MultiModalBackend
    from sentence_transformers import SentenceTransformer

    embedding_model = SentenceTransformer("clip-ViT-B-32")
    sentence_model = MultiModalBackend(embedding_model)
    ```
    """

    def __init__(
        self,
        embedding_model: Union[str, SentenceTransformer],
        image_model: Union[str, SentenceTransformer, None] = None,
        batch_size: int = 32,
    ):
        super().__init__()
        self.batch_size = batch_size

        # Text or Text+Image model
        if isinstance(embedding_model, SentenceTransformer):
            self.embedding_model = embedding_model
        elif isinstance(embedding_model, str):
            self.embedding_model = SentenceTransformer(embedding_model)
        else:
            raise ValueError(
                "Please select a correct SentenceTransformers model: \n"
                "`from sentence_transformers import SentenceTransformer` \n"
                "`model = SentenceTransformer('clip-ViT-B-32')`"
            )

        # Image Model
        self.image_model = None
        if image_model is not None:
            if isinstance(image_model, SentenceTransformer):
                self.image_model = image_model
            elif isinstance(image_model, str):
                self.image_model = SentenceTransformer(image_model)
            else:
                raise ValueError(
                    "Please select a correct SentenceTransformers model: \n"
                    "`from sentence_transformers import SentenceTransformer` \n"
                    "`model = SentenceTransformer('clip-ViT-B-32')`"
                )

        # Finding a tokenizer is best effort: prefer the processor's tokenizer, fall
        # back to the model's own `tokenizer` attribute and otherwise go without one
        # (documents are then not truncated). `getattr` with a default keeps a missing
        # fallback attribute from escaping as an AttributeError.
        try:
            self.tokenizer = self.embedding_model._first_module().processor.tokenizer
        except AttributeError:
            self.tokenizer = getattr(self.embedding_model, "tokenizer", None)
        except Exception:
            self.tokenizer = None

    def embed(self, documents: List[str], images: List[str] | None = None, verbose: bool = False) -> np.ndarray:
        """Embed a list of n documents/words or images into an n-dimensional
        matrix of embeddings.

        Either documents, images, or both can be provided. If both are provided,
        then the embeddings are averaged.

        Arguments:
            documents: A list of documents or words to be embedded. Skipped when it is
                       None, empty, or when its first entry is None.
            images: A list of image paths to be embedded
            verbose: Controls the verbosity of the process

        Returns:
            Document/words embeddings with shape (n, m) with `n` documents/words
            that each have an embeddings size of `m`. None if neither documents
            nor images could be embedded.
        """
        # Embed documents; a first entry of None marks the documents as absent
        doc_embeddings = None
        if documents is not None and len(documents) > 0 and documents[0] is not None:
            doc_embeddings = self.embed_documents(documents, verbose)

        # Embed images
        image_embeddings = None
        if isinstance(images, list):
            image_embeddings = self.embed_images(images, verbose)

        # Average embeddings when both modalities are available
        if doc_embeddings is not None and image_embeddings is not None:
            return np.mean([doc_embeddings, image_embeddings], axis=0)
        if doc_embeddings is not None:
            return doc_embeddings
        return image_embeddings

    def embed_documents(self, documents: List[str], verbose: bool = False) -> np.ndarray:
        """Embed a list of n documents/words into an n-dimensional
        matrix of embeddings.

        Arguments:
            documents: A list of documents or words to be embedded
            verbose: Controls the verbosity of the process

        Returns:
            Document/words embeddings with shape (n, m) with `n` documents/words
            that each have an embeddings size of `m`
        """
        truncated_docs = [self._truncate_document(doc) for doc in documents]
        embeddings = self.embedding_model.encode(truncated_docs, show_progress_bar=verbose)
        return embeddings

    def embed_words(self, words: List[str], verbose: bool = False) -> np.ndarray:
        """Embed a list of n words into an n-dimensional
        matrix of embeddings.

        Arguments:
            words: A list of words to be embedded
            verbose: Controls the verbosity of the process

        Returns:
            Document/words embeddings with shape (n, m) with `n` documents/words
            that each have an embeddings size of `m`

        """
        embeddings = self.embedding_model.encode(words, show_progress_bar=verbose)
        return embeddings

    def embed_images(self, images, verbose):
        """Embed images, opening those given as paths and closing them again afterwards.

        Arguments:
            images: A list of images. With a truthy `batch_size`, `str` entries are opened
                    as files and all other entries are passed to the model as they are.
                    Without a batch size, every entry that is not already a PIL image is opened.
            verbose: Controls the verbosity of the process

        Returns:
            The image embeddings as an array.
        """
        if self.batch_size:
            nr_iterations = int(np.ceil(len(images) / self.batch_size))

            # Embed images per batch
            embeddings = []
            for i in tqdm(range(nr_iterations), disable=not verbose):
                start_index = i * self.batch_size
                end_index = start_index + self.batch_size

                # Only handles opened here are closed, image by image, instead of
                # deciding for the whole batch from the type of `images[0]`.
                opened = []
                try:
                    images_to_embed = []
                    for image in images[start_index:end_index]:
                        if isinstance(image, str):
                            image = Image.open(image)
                            opened.append(image)
                        images_to_embed.append(image)
                    img_emb = self._encode_images(images_to_embed)
                finally:
                    for handle in opened:
                        handle.close()
                embeddings.extend(img_emb.tolist())
            embeddings = np.array(embeddings)
        else:
            opened = []
            try:
                images_to_embed = []
                for image in images:
                    # Already loaded images are used directly; everything else is opened
                    if not isinstance(image, Image.Image):
                        image = Image.open(image)
                        opened.append(image)
                    images_to_embed.append(image)
                embeddings = self._encode_images(images_to_embed)
            finally:
                for handle in opened:
                    handle.close()
        return embeddings

    def _encode_images(self, loaded_images):
        """Encode loaded images with `image_model` if one was given, else with `embedding_model`."""
        if self.image_model is not None:
            return self.image_model.encode(loaded_images)
        return self.embedding_model.encode(loaded_images, show_progress_bar=False)

    def _truncate_document(self, document):
        """Shorten `document` until its tokenization has at most 77 tokens.

        Without a tokenizer the document is returned unchanged.
        """
        if self.tokenizer:
            tokens = self.tokenizer.encode(document)

            if len(tokens) > 77:
                # Skip the starting token, only include 75 tokens
                truncated_tokens = tokens[1:76]
                document = self.tokenizer.decode(truncated_tokens)

                # Recursive call here, because the encode(decode()) can have different result
                return self._truncate_document(document)

        return document
class OpenAIBackend(BaseEmbedder):
    """OpenAI Embedding Model.

    Arguments:
        client: A `openai.OpenAI` client.
        embedding_model: An OpenAI model. Default is "text-embedding-ada-002".
                         For an overview of models see:
                         https://platform.openai.com/docs/models/embeddings
        delay_in_seconds: If a `batch_size` is given, use this to set
                          the delay in seconds after each batch.
        batch_size: The size of each batch.
        generator_kwargs: Kwargs passed to `client.embeddings.create`.
                          Can be used to define custom engines or
                          deployment_ids. The mapping is copied, the
                          object passed in is never modified.

    Examples:
    ```python
    import openai
    from bertopic.backend import OpenAIBackend

    client = openai.OpenAI(api_key="sk-...")
    openai_embedder = OpenAIBackend(client, "text-embedding-ada-002")
    ```
    """

    def __init__(
        self,
        client: openai.OpenAI,
        embedding_model: str = "text-embedding-ada-002",
        delay_in_seconds: float | None = None,
        batch_size: int | None = None,
        generator_kwargs: Mapping[str, Any] | None = None,
    ):
        super().__init__()
        self.client = client
        self.embedding_model = embedding_model
        self.delay_in_seconds = delay_in_seconds
        self.batch_size = batch_size

        # Work on a private copy: "model" may be inserted below, which must not
        # leak into the caller's mapping or into other instances
        self.generator_kwargs = dict(generator_kwargs or {})

        if self.generator_kwargs.get("model"):
            # An explicit model in the kwargs takes precedence
            self.embedding_model = self.generator_kwargs["model"]
        elif not self.generator_kwargs.get("engine"):
            self.generator_kwargs["model"] = self.embedding_model

    def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
        """Embed a list of n documents/words into an n-dimensional
        matrix of embeddings.

        Arguments:
            documents: A list of documents or words to be embedded
            verbose: Controls the verbosity of the process

        Returns:
            Document/words embeddings with shape (n, m) with `n` documents/words
            that each have an embeddings size of `m`
        """
        # Prepare documents, replacing empty strings with a single space
        prepared_documents = [" " if doc == "" else doc for doc in documents]

        # Extract embeddings all at once
        if self.batch_size is None:
            response = self.client.embeddings.create(input=prepared_documents, **self.generator_kwargs)
            return np.array([r.embedding for r in response.data])

        # Batch-wise embedding extraction
        embeddings = []
        for batch in tqdm(self._chunks(prepared_documents), disable=not verbose):
            response = self.client.embeddings.create(input=batch, **self.generator_kwargs)
            embeddings.extend([r.embedding for r in response.data])

            # Delay subsequent calls
            if self.delay_in_seconds:
                time.sleep(self.delay_in_seconds)
        return np.array(embeddings)

    def _chunks(self, documents):
        """Yield consecutive slices of `documents` of at most `batch_size` items."""
        for i in range(0, len(documents), self.batch_size):
            yield documents[i : i + self.batch_size]
Examples: To create a model, you can load in a string pointing to a sentence-transformers model: ```python from bertopic.backend import SentenceTransformerBackend sentence_model = SentenceTransformerBackend("all-MiniLM-L6-v2") ``` or you can instantiate a model yourself: ```python from bertopic.backend import SentenceTransformerBackend from sentence_transformers import SentenceTransformer embedding_model = SentenceTransformer("all-MiniLM-L6-v2") sentence_model = SentenceTransformerBackend(embedding_model) ``` If you want to use a model2vec model without having to install model2vec, you can pass the model2vec model as a string: ```python from bertopic.backend import SentenceTransformerBackend from sentence_transformers import SentenceTransformer embedding_model = SentenceTransformer("minishlab/potion-base-8M", model2vec=True) sentence_model = SentenceTransformerBackend(embedding_model) ``` """ def __init__(self, embedding_model: Union[str, SentenceTransformer], model2vec: bool = False): super().__init__() self._hf_model = None if model2vec and isinstance(embedding_model, str): static_embedding = StaticEmbedding.from_model2vec(embedding_model) self.embedding_model = SentenceTransformer(modules=[static_embedding]) elif isinstance(embedding_model, SentenceTransformer): self.embedding_model = embedding_model elif isinstance(embedding_model, str): self.embedding_model = SentenceTransformer(embedding_model) self._hf_model = embedding_model else: raise ValueError( "Please select a correct SentenceTransformers model: \n" "`from sentence_transformers import SentenceTransformer` \n" "`model = SentenceTransformer('all-MiniLM-L6-v2')`" ) def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray: """Embed a list of n documents/words into an n-dimensional matrix of embeddings. 
class SklearnEmbedder(BaseEmbedder):
    """Scikit-Learn based embedding model.

    Wraps a scikit-learn pipeline so that it can be used to generate
    document and word embeddings.

    Arguments:
        pipe: A scikit-learn pipeline that can `.transform()` text.

    Examples:
    Scikit-Learn is very flexible and it allows for many representations.
    A relatively simple pipeline is shown below.

    ```python
    from sklearn.pipeline import make_pipeline
    from sklearn.decomposition import TruncatedSVD
    from sklearn.feature_extraction.text import TfidfVectorizer

    from bertopic.backend import SklearnEmbedder

    pipe = make_pipeline(
        TfidfVectorizer(),
        TruncatedSVD(100)
    )

    sklearn_embedder = SklearnEmbedder(pipe)
    topic_model = BERTopic(embedding_model=sklearn_embedder)
    ```

    This pipeline first constructs a sparse representation based on TF/idf and then
    makes it dense by applying SVD. Alternatively, you might also construct something
    more elaborate. As long as you construct a scikit-learn compatible pipeline, you
    should be able to pass it to Bertopic.

    !!! Warning
        One caveat to be aware of is that scikit-learns base `Pipeline` class does not
        support the `.partial_fit()`-API. If you have a pipeline that theoretically should
        be able to support online learning then you might want to explore
        the [scikit-partial](https://github.com/koaning/scikit-partial) project.
    """

    def __init__(self, pipe):
        super().__init__()
        self.pipe = pipe

    def embed(self, documents, verbose=False):
        """Embed a list of n documents/words into an n-dimensional
        matrix of embeddings.

        Arguments:
            documents: A list of documents or words to be embedded
            verbose: No-op variable that's kept around to keep the API consistent.
                     If you want to get feedback on training times, you should use
                     the sklearn API.

        Returns:
            Document/words embeddings with shape (n, m) with `n` documents/words
            that each have an embeddings size of `m`
        """
        try:
            # A fitted pipeline only needs to transform the documents
            check_is_fitted(self.pipe)
            return self.pipe.transform(documents)
        except NotFittedError:
            # Not fitted yet: fit on these documents and embed them in one go
            return self.pipe.fit_transform(documents)
Arguments: embedding_model: A spacy embedding model Examples: To create a Spacy backend, you need to create an nlp object and pass it through this backend: ```python import spacy from bertopic.backend import SpacyBackend nlp = spacy.load("en_core_web_md", exclude=['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']) spacy_model = SpacyBackend(nlp) ``` To load in a transformer model use the following: ```python import spacy from thinc.api import set_gpu_allocator, require_gpu from bertopic.backend import SpacyBackend nlp = spacy.load("en_core_web_trf", exclude=['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']) set_gpu_allocator("pytorch") require_gpu(0) spacy_model = SpacyBackend(nlp) ``` If you run into gpu/memory-issues, please use: ```python import spacy from bertopic.backend import SpacyBackend spacy.prefer_gpu() nlp = spacy.load("en_core_web_trf", exclude=['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']) spacy_model = SpacyBackend(nlp) ``` """ def __init__(self, embedding_model): super().__init__() if "spacy" in str(type(embedding_model)): self.embedding_model = embedding_model else: raise ValueError( "Please select a correct Spacy model by either using a string such as 'en_core_web_md' " "or create a nlp model using: `nlp = spacy.load('en_core_web_md')" ) def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray: """Embed a list of n documents/words into an n-dimensional matrix of embeddings. 
Arguments: documents: A list of documents or words to be embedded verbose: Controls the verbosity of the process Returns: Document/words embeddings with shape (n, m) with `n` documents/words that each have an embeddings size of `m` """ # Handle empty documents, spaCy models automatically map # empty strings to the zero vector empty_document = " " # Extract embeddings embeddings = [] for doc in tqdm(documents, position=0, leave=True, disable=not verbose): embedding = self.embedding_model(doc or empty_document) if embedding.has_vector: embedding = embedding.vector else: embedding = embedding._.trf_data.tensors[-1][0] if not isinstance(embedding, np.ndarray) and hasattr(embedding, "get"): # Convert cupy array to numpy array embedding = embedding.get() embeddings.append(embedding) return np.array(embeddings) ================================================ FILE: bertopic/backend/_use.py ================================================ import numpy as np from tqdm import tqdm from typing import List from bertopic.backend import BaseEmbedder class USEBackend(BaseEmbedder): """Universal Sentence Encoder. USE encodes text into high-dimensional vectors that are used for semantic similarity in BERTopic. Arguments: embedding_model: An USE embedding model Examples: ```python import tensorflow_hub from bertopic.backend import USEBackend embedding_model = tensorflow_hub.load("https://tfhub.dev/google/universal-sentence-encoder/4") use_embedder = USEBackend(embedding_model) ``` """ def __init__(self, embedding_model): super().__init__() try: embedding_model(["test sentence"]) self.embedding_model = embedding_model except TypeError: raise ValueError( "Please select a correct USE model: \n" "`import tensorflow_hub` \n" "`embedding_model = tensorflow_hub.load(path_to_model)`" ) def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray: """Embed a list of n documents/words into an n-dimensional matrix of embeddings. 
def select_backend(embedding_model, language: str | None = None, verbose: bool = False) -> BaseEmbedder:
    """Select an embedding model based on language or a specific provided model.

    When selecting a language, we choose all-MiniLM-L6-v2 for English and
    paraphrase-multilingual-MiniLM-L12-v2 for all other languages as it support 100+ languages.
    If sentence-transformers is not installed, in the case of a lightweight installation,
    a scikit-learn backend is default.

    Arguments:
        embedding_model: An existing `BaseEmbedder`, a scikit-learn pipeline, a model
                         object of one of the supported libraries (recognised through
                         its type name) or a string naming a sentence-transformers model.
        language: The language used to pick a default model when `embedding_model`
                  is not recognised. Matching is case-insensitive.
        verbose: Whether to log on INFO level.

    Returns:
        model: The selected model backend.
    """
    logger.set_level("INFO" if verbose else "WARNING")

    # BERTopic language backend
    if isinstance(embedding_model, BaseEmbedder):
        return embedding_model

    # Scikit-learn backend
    if isinstance(embedding_model, ScikitPipeline):
        return SklearnEmbedder(embedding_model)

    # The remaining libraries are optional, so they are detected through the type name
    type_name = str(type(embedding_model))

    # Flair word embeddings
    if "flair" in type_name:
        from bertopic.backend._flair import FlairBackend

        return FlairBackend(embedding_model)

    # Spacy embeddings
    if "spacy" in type_name:
        from bertopic.backend._spacy import SpacyBackend

        return SpacyBackend(embedding_model)

    # Gensim embeddings
    if "gensim" in type_name:
        from bertopic.backend._gensim import GensimBackend

        return GensimBackend(embedding_model)

    # USE embeddings; both markers have to be tested explicitly
    if "tensorflow" in type_name and "saved_model" in type_name:
        from bertopic.backend._use import USEBackend

        return USEBackend(embedding_model)

    # Sentence Transformer embeddings
    if "sentence_transformers" in type_name or isinstance(embedding_model, str):
        from ._sentencetransformers import SentenceTransformerBackend

        return SentenceTransformerBackend(embedding_model)

    # Hugging Face embeddings; both markers have to be tested explicitly
    if "transformers" in type_name and "pipeline" in type_name:
        from ._hftransformers import HFTransformerBackend

        return HFTransformerBackend(embedding_model)

    # Model2Vec embeddings
    if "model2vec" in type_name:
        from ._model2vec import Model2VecBackend

        return Model2VecBackend(embedding_model)

    # FastEmbed word embeddings
    if "fastembed" in type_name:
        from bertopic.backend._fastembed import FastEmbedBackend

        return FastEmbedBackend(embedding_model)

    # Select embedding model based on language
    if language:
        normalized_language = language.lower()
        try:
            from ._sentencetransformers import SentenceTransformerBackend

            if normalized_language in ("english", "en"):
                return SentenceTransformerBackend("sentence-transformers/all-MiniLM-L6-v2")
            if normalized_language in languages or normalized_language == "multilingual":
                return SentenceTransformerBackend("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
            raise ValueError(
                f"{language} is currently not supported. However, you can "
                f"create any embeddings yourself and pass it through fit_transform(docs, embeddings)\n"
                "Else, please select a language from the following list:\n"
                f"{languages}"
            )

        # A ModuleNotFoundError might be a lightweight installation
        except ModuleNotFoundError as e:
            if e.name != "sentence_transformers":
                # Error occurred in a downstream module, probably not a lightweight install
                raise
            # Whole sentence_transformers module is missing, probably a lightweight install
            if verbose:
                logger.info(
                    "Automatically selecting lightweight scikit-learn embedding backend as sentence-transformers appears to not be installed."
                )
            pipe = make_pipeline(TfidfVectorizer(), TruncatedSVD(100))
            return SklearnEmbedder(pipe)

    from ._sentencetransformers import SentenceTransformerBackend

    return SentenceTransformerBackend("sentence-transformers/all-MiniLM-L6-v2")
Arguments: words: A list of words to be embedded verbose: Controls the verbosity of the process Returns: Word embeddings with shape (n, m) with `n` words that each have an embeddings size of `m` """ return self.word_embedding_model.embed(words, verbose) def embed_documents(self, document: List[str], verbose: bool = False) -> np.ndarray: """Embed a list of n words into an n-dimensional matrix of embeddings. Arguments: document: A list of documents to be embedded verbose: Controls the verbosity of the process Returns: Document embeddings with shape (n, m) with `n` documents that each have an embeddings size of `m` """ return self.embedding_model.embed(document, verbose) ================================================ FILE: bertopic/cluster/__init__.py ================================================ from ._base import BaseCluster __all__ = [ "BaseCluster", ] ================================================ FILE: bertopic/cluster/_base.py ================================================ import numpy as np class BaseCluster: """The Base Cluster class. Using this class directly in BERTopic will make it skip over the cluster step. As a result, topics need to be passed to BERTopic in the form of its `y` parameter in order to create topic representations. Examples: This will skip over the cluster step in BERTopic: ```python from bertopic import BERTopic from bertopic.cluster import BaseCluster empty_cluster_model = BaseCluster() topic_model = BERTopic(hdbscan_model=empty_cluster_model) ``` Then, this class can be used to perform manual topic modeling. 
def _hdbscan_flavor(model):
    """Return "hdbscan" or "cuml" for a supported HDBSCAN-like model, otherwise None."""
    try:
        from hdbscan import HDBSCAN
    except ImportError:
        # hdbscan is optional; `isinstance` cannot be called with None as class
        HDBSCAN = None

    if HDBSCAN is not None and isinstance(model, HDBSCAN):
        return "hdbscan"

    # cuML is recognised through the type name so that it does not have to be imported
    str_type_model = str(type(model)).lower()
    if "cuml" in str_type_model and "hdbscan" in str_type_model:
        return "cuml"
    return None


def hdbscan_delegator(model, func: str, embeddings: np.ndarray = None):
    """Function used to select the HDBSCAN-like model for generating
    predictions and probabilities.

    Arguments:
        model: The cluster model.
        func: The function to use. Options:
                - "approximate_predict"
                - "all_points_membership_vectors"
                - "membership_vector"
        embeddings: Input embeddings for "approximate_predict"
                    and "membership_vector"

    Returns:
        For "approximate_predict" a tuple `(predictions, probabilities)`, where
        `probabilities` is None if the model is not HDBSCAN-like and its own
        `.predict` is used. For the other two options the membership vectors,
        or None if the model is not HDBSCAN-like. None for any other `func`.
    """
    if func not in ("approximate_predict", "all_points_membership_vectors", "membership_vector"):
        return None

    # Pick the module that implements the prediction helpers for this model
    flavor = _hdbscan_flavor(model)
    if flavor == "hdbscan":
        import hdbscan as backend
    elif flavor == "cuml":
        from cuml.cluster import hdbscan as backend
    else:
        backend = None

    # Approximate predict
    if func == "approximate_predict":
        if backend is None:
            return model.predict(embeddings), None
        predictions, probabilities = backend.approximate_predict(model, embeddings)
        return predictions, probabilities

    if backend is None:
        return None

    # All points membership
    if func == "all_points_membership_vectors":
        return backend.all_points_membership_vectors(model)

    # membership_vector
    return backend.membership_vector(model, embeddings)


def is_supported_hdbscan(model):
    """Check whether the input model is a supported HDBSCAN-like model."""
    return _hdbscan_flavor(model) is not None
def visualize_approximate_distribution(
    topic_model,
    document: str,
    topic_token_distribution: np.ndarray,
    normalize: bool = False,
):
    """Visualize the topic distribution calculated by `.approximate_topic_distribution`
    on a token level. Thereby indicating the extent to which a certain word or phrase belongs
    to a specific topic. The assumption here is that a single word can belong to multiple
    similar topics and as such give information about the broader set of topics within
    a single document.

    Note:
    This function will return a stylized pandas dataframe if Jinja2 is installed. If not,
    it will only return a pandas dataframe without color highlighting. To install jinja:

    `pip install jinja2`

    Arguments:
        topic_model: A fitted BERTopic instance.
        document: The document for which you want to visualize
                  the approximated topic distribution.
        topic_token_distribution: The topic-token distribution of the document as
                                  extracted by `.approximate_topic_distribution`
        normalize: Whether to normalize, between 0 and 1 (summing to 1), the
                   topic distribution values.

    Returns:
        df: A stylized dataframe indicating the best fitting topics
            for each token.

    Raises:
        ValueError: If the tokenizer finds no tokens in `document`.

    Examples:
    ```python
    # Calculate the topic distributions on a token level
    # Note that we need to have `calculate_token_level=True`
    topic_distr, topic_token_distr = topic_model.approximate_distribution(
            docs, calculate_token_level=True
    )

    # Visualize the approximated topic distributions
    df = topic_model.visualize_approximate_distribution(docs[0], topic_token_distr[0])
    df
    ```

    To revert this stylized dataframe back to a regular dataframe,
    you can run the following:

    ```python
    df.data.columns = [column.strip() for column in df.data.columns]
    df = df.data
    ```
    """
    # Tokenize document
    analyzer = topic_model.vectorizer_model.build_tokenizer()
    tokens = analyzer(document)

    if len(tokens) == 0:
        raise ValueError("Make sure that your document contains at least 1 token.")

    # Prepare dataframe with results
    if normalize:
        total = topic_token_distribution.sum()
        # An all-zero distribution is left untouched instead of dividing 0 by 0
        if total != 0:
            topic_token_distribution = topic_token_distribution / total
    df = pd.DataFrame(topic_token_distribution).T

    # Pad each token with a different number of spaces so that repeated tokens
    # still result in unique column names
    df.columns = [f"{token}{' ' * i}" for i, token in enumerate(tokens)]
    df.index = list(topic_model.topic_labels_.values())[topic_model._outliers :]

    # Drop topics that do not contribute to any token
    df = df.loc[(df.sum(axis=1) != 0), :]

    # Without any remaining topic, or without Jinja2, the plain dataframe is returned
    if len(df) == 0 or not HAS_JINJA:
        return df

    # Style the resulting dataframe
    def text_color(val):
        color = "white" if val == 0 else "black"
        return "color: %s" % color

    def highlight_zero_cells(data, color="white"):
        attr = "background-color: {}".format(color)
        return pd.DataFrame(np.where(data == 0, attr, ""), index=data.index, columns=data.columns)

    styler = df.style.format("{:.3f}").background_gradient(cmap="Blues", axis=None)

    # `Styler.applymap` was renamed to `Styler.map` in pandas 2.1, use whichever exists
    elementwise = getattr(styler, "map", None) or styler.applymap
    return elementwise(text_color).apply(highlight_zero_cells, axis=None)
autoscale: Whether to automatically calculate the height of the figures to fit the whole bar text Returns: fig: A plotly figure Examples: To visualize the barchart of selected topics simply run: ```python topic_model.visualize_barchart() ``` Or if you want to save the resulting figure: ```python fig = topic_model.visualize_barchart() fig.write_html("path/to/file.html") ``` """ colors = itertools.cycle(["#D55E00", "#0072B2", "#CC79A7", "#E69F00", "#56B4E9", "#009E73", "#F0E442"]) # Select topics based on top_n and topics args freq_df = topic_model.get_topic_freq() freq_df = freq_df.loc[freq_df.Topic != -1, :] if topics is not None: topics = list(topics) elif top_n_topics is not None: topics = sorted(freq_df.Topic.to_list()[:top_n_topics]) else: topics = sorted(freq_df.Topic.to_list()[0:6]) # Initialize figure if isinstance(custom_labels, str): subplot_titles = [[[str(topic), None]] + topic_model.topic_aspects_[custom_labels][topic] for topic in topics] subplot_titles = ["_".join([label[0] for label in labels[:4]]) for labels in subplot_titles] subplot_titles = [label if len(label) < 30 else label[:27] + "..." 
def visualize_document_datamap(
    topic_model,
    docs: List[str] | None = None,
    topics: List[int] | None = None,
    embeddings: np.ndarray = None,
    reduced_embeddings: np.ndarray = None,
    custom_labels: Union[bool, str] = False,
    title: str = "Documents and Topics",
    sub_title: Union[str, None] = None,
    width: int = 1200,
    height: int = 750,
    interactive: bool = False,
    enable_search: bool = False,
    topic_prefix: bool = False,
    datamap_kwds: dict | None = None,
    int_datamap_kwds: dict | None = None,
) -> Figure:
    """Visualize documents and their topics in 2D as a static plot for publication using
    DataMapPlot.

    Arguments:
        topic_model:  A fitted BERTopic instance.
        docs: The documents you used when calling either `fit` or `fit_transform`.
        topics: A selection of topics to visualize.
                Not to be confused with the topics that you get from `.fit_transform`.
                For example, if you want to visualize only topics 1 through 5:
                `topics = [1, 2, 3, 4, 5]`. Documents not in these topics will be shown
                as noise points.
        embeddings:  The embeddings of all documents in `docs`.
        reduced_embeddings:  The 2D reduced embeddings of all documents in `docs`.
        custom_labels:  If bool, whether to use custom topic labels that were defined using
                       `topic_model.set_topic_labels`.
                       If `str`, it uses labels from other aspects, e.g., "Aspect1".
        title: Title of the plot.
        sub_title: Sub-title of the plot.
        width: The width of the figure.
        height: The height of the figure.
        interactive: Whether to create an interactive plot using DataMapPlot's `create_interactive_plot`.
        enable_search: Whether to enable search in the interactive plot. Only works if `interactive=True`.
        topic_prefix: Prefix to add to the topic number when displaying the topic name.
        datamap_kwds:  Keyword args be passed on to DataMapPlot's `create_plot` function
                       if you are not using the interactive version.
                       See the DataMapPlot documentation for more details.
        int_datamap_kwds:  Keyword args be passed on to DataMapPlot's `create_interactive_plot` function
                           if you are using the interactive version.
                           See the DataMapPlot documentation for more details.

    Returns:
        figure: The figure created by DataMapPlot. A Matplotlib Figure object
                for the static plot.

    Raises:
        ModuleNotFoundError: If datamapplot is not installed, or if UMAP is
                             needed to reduce the embeddings but is not installed.
        ValueError: If `docs` does not have one entry per fitted document, or
                    if neither `docs` nor any embeddings are given.

    Examples:
    To visualize the topics simply run:

    ```python
    topic_model.visualize_document_datamap(docs)
    ```

    Do note that this re-calculates the embeddings and reduces them to 2D.
    The advised and preferred pipeline for using this function is as follows:

    ```python
    from sklearn.datasets import fetch_20newsgroups
    from sentence_transformers import SentenceTransformer
    from bertopic import BERTopic
    from umap import UMAP

    # Prepare embeddings
    docs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']
    sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = sentence_model.encode(docs, show_progress_bar=False)

    # Train BERTopic
    topic_model = BERTopic().fit(docs, embeddings)

    # Reduce dimensionality of embeddings, this step is optional
    # reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)

    # Run the visualization with the original embeddings
    topic_model.visualize_document_datamap(docs, embeddings=embeddings)

    # Or, if you have reduced the original embeddings already:
    topic_model.visualize_document_datamap(docs, reduced_embeddings=reduced_embeddings)
    ```

    Or if you want to save the resulting figure:

    ```python
    fig = topic_model.visualize_document_datamap(docs, reduced_embeddings=reduced_embeddings)
    fig.savefig("path/to/file.png", bbox_inches="tight")
    ```
    """
    # Fail early and with an actionable message if the plotting library is missing
    try:
        import datamapplot
    except ImportError as err:
        raise ModuleNotFoundError(
            "Data map plotting is unavailable unless datamapplot is installed. "
            "Please install it using `pip install datamapplot`."
        ) from err

    topic_per_doc = topic_model.topics_
    if docs is not None and len(docs) != len(topic_per_doc):
        raise ValueError(
            f"`docs` contains {len(docs)} documents whereas the model was fitted on {len(topic_per_doc)} documents."
        )

    # Reduce input embeddings, extracting them first if not already done
    if reduced_embeddings is not None:
        embeddings_2d = reduced_embeddings
    else:
        if embeddings is None:
            if docs is None:
                raise ValueError("Please provide `docs`, `embeddings` or `reduced_embeddings`.")
            embeddings = topic_model._extract_embeddings(list(docs), method="document")

        # Only the import is guarded, errors raised while fitting are not related to installation
        try:
            from umap import UMAP
        except ImportError as err:
            raise ModuleNotFoundError(
                "UMAP is required if the embeddings are not yet reduced in dimensionality. Please install it using `pip install umap-learn`."
            ) from err
        umap_model = UMAP(n_neighbors=15, n_components=2, min_dist=0.15, metric="cosine").fit(embeddings)
        embeddings_2d = umap_model.embedding_

    unique_topics = set(topic_per_doc)

    # Prepare text and names
    if isinstance(custom_labels, str):
        names = [[[str(topic), None]] + topic_model.topic_aspects_[custom_labels][topic] for topic in unique_topics]
        names = [" ".join([label[0] for label in labels[:4]]) for labels in names]
        names = [label if len(label) < 30 else label[:27] + "..." for label in names]
    elif topic_model.custom_labels_ is not None and custom_labels:
        names = [topic_model.custom_labels_[topic + topic_model._outliers] for topic in unique_topics]
    else:
        names = []
        for topic in unique_topics:
            top_words = " ".join([word for word, _ in topic_model.get_topic(topic)][:3])
            names.append(f"Topic-{topic}: " + top_words if topic_prefix else top_words)

    topic_name_mapping = dict(zip(unique_topics, names))
    topic_name_mapping[-1] = "Unlabelled"

    # If a set of topics is chosen, set everything else to "Unlabelled"
    if topics is not None:
        selected_topics = set(topics)
        topic_name_mapping = {
            topic_num: topic_name if topic_num in selected_topics else "Unlabelled"
            for topic_num, topic_name in topic_name_mapping.items()
        }

    # Map in topic names and plot
    named_topic_per_doc = pd.Series(topic_per_doc).map(topic_name_mapping).to_numpy()

    if interactive:
        return datamapplot.create_interactive_plot(
            embeddings_2d,
            named_topic_per_doc,
            hover_text=docs,
            enable_search=enable_search,
            width=width,
            height=height,
            **(int_datamap_kwds or {}),
        )

    figure, _ = datamapplot.create_plot(
        embeddings_2d,
        named_topic_per_doc,
        figsize=(width / 100, height / 100),
        dpi=100,
        title=title,
        sub_title=sub_title,
        **(datamap_kwds or {}),
    )
    return figure
def visualize_distribution(
    topic_model,
    probabilities: np.ndarray,
    min_probability: float = 0.015,
    custom_labels: Union[bool, str] = False,
    title: str = "Topic Probability Distribution",
    width: int = 800,
    height: int = 600,
) -> go.Figure:
    """Visualize the distribution of topic probabilities.

    Arguments:
        topic_model: A fitted BERTopic instance.
        probabilities: A 1-dimensional array of probability scores of a single document
        min_probability: The minimum probability score to visualize.
                         All others are ignored.
        custom_labels: If bool, whether to use custom topic labels that were defined using
                       `topic_model.set_topic_labels`.
                       If `str`, it uses labels from other aspects, e.g., "Aspect1".
        title: Title of the plot.
        width: The width of the figure.
        height: The height of the figure.

    Returns:
        fig: A plotly figure with one horizontal bar per selected topic

    Raises:
        ValueError: If `probabilities` is not 1-dimensional or if no probability
                    exceeds `min_probability`.

    Examples:
    Make sure to fit the model before and only input the
    probabilities of a single document:

    ```python
    topic_model.visualize_distribution(probabilities[0])
    ```

    Or if you want to save the resulting figure:

    ```python
    fig = topic_model.visualize_distribution(probabilities[0])
    fig.write_html("path/to/file.html")
    ```
    """
    if len(probabilities.shape) != 1:
        raise ValueError(
            "This visualization cannot be used if you have set `calculate_probabilities` to False "
            "as it uses the topic probabilities of all topics. "
        )
    if len(probabilities[probabilities > min_probability]) == 0:
        raise ValueError(
            "There are no values where `min_probability` is higher than the "
            "probabilities that were supplied. Lower `min_probability` to prevent this error."
        )

    # Get values and indices equal or exceed the minimum probability
    labels_idx = np.argwhere(probabilities >= min_probability).flatten()
    vals = probabilities[labels_idx].tolist()

    # Create labels
    if isinstance(custom_labels, str):
        labels = [[[str(topic), None]] + topic_model.topic_aspects_[custom_labels][topic] for topic in labels_idx]
        labels = ["_".join([label[0] for label in parts[:4]]) for parts in labels]
        labels = [label if len(label) < 30 else label[:27] + "..." for label in labels]
    elif topic_model.custom_labels_ is not None and custom_labels:
        labels = [topic_model.custom_labels_[idx + topic_model._outliers] for idx in labels_idx]
    else:
        # Build labels and values in lockstep. Removing a skipped topic's value
        # with `list.remove` would drop the first *equal* probability, which may
        # belong to another topic and misalign bars and labels.
        labels = []
        vals = []
        for idx in labels_idx:
            words = topic_model.get_topic(idx)
            if not words:
                # Topic without a representation: leave it out of the chart
                continue
            label = f"Topic {idx}: {'_'.join([word[0] for word in words[:5]])}"
            label = label[:40] + "..." if len(label) > 40 else label
            labels.append(label)
            vals.append(float(probabilities[idx]))

    # Create Figure
    fig = go.Figure(
        go.Bar(
            x=vals,
            y=labels,
            marker=dict(
                color="#C8D2D7",
                line=dict(color="#6E8484", width=1),
            ),
            orientation="h",
        )
    )

    fig.update_layout(
        xaxis_title="Probability",
        title={
            "text": f"{title}",
            "y": 0.95,
            "x": 0.5,
            "xanchor": "center",
            "yanchor": "top",
            "font": dict(size=22, color="Black"),
        },
        template="simple_white",
        width=width,
        height=height,
        hoverlabel=dict(bgcolor="white", font_size=16, font_family="Rockwell"),
    )

    return fig
def _prepare_trace_frame(selection: pd.DataFrame, label: str, annotate: bool) -> pd.DataFrame:
    """Return a copy of `selection` with a `text` column and, optionally, a centroid annotation row."""
    # Re-number the rows first: the selection keeps the index labels of the full
    # frame, so `.loc[len(selection)]` could otherwise overwrite an existing
    # document instead of appending a new row.
    selection = selection.reset_index(drop=True)
    selection["text"] = ""
    if annotate:
        selection.loc[len(selection), :] = [
            None,
            None,
            selection.x.mean(),
            selection.y.mean(),
            label,
        ]
    return selection


def visualize_documents(
    topic_model,
    docs: List[str],
    topics: List[int] | None = None,
    embeddings: np.ndarray = None,
    reduced_embeddings: np.ndarray = None,
    sample: float | None = None,
    hide_annotations: bool = False,
    hide_document_hover: bool = False,
    custom_labels: Union[bool, str] = False,
    title: str = "Documents and Topics",
    width: int = 1200,
    height: int = 750,
) -> go.Figure:
    """Visualize documents and their topics in 2D.

    Arguments:
        topic_model: A fitted BERTopic instance.
        docs: The documents you used when calling either `fit` or `fit_transform`
        topics: A selection of topics to visualize.
                Not to be confused with the topics that you get from `.fit_transform`.
                For example, if you want to visualize only topics 1 through 5:
                `topics = [1, 2, 3, 4, 5]`.
        embeddings: The embeddings of all documents in `docs`.
        reduced_embeddings: The 2D reduced embeddings of all documents in `docs`.
        sample: The percentage of documents in each topic that you would like to keep.
                Value can be between 0 and 1. Setting this value to, for example,
                0.1 (10% of documents in each topic) makes it easier to visualize
                millions of documents as a subset is chosen. Topics with fewer than
                100 documents are always kept in full.
        hide_annotations: Hide the names of the traces on top of each cluster.
        hide_document_hover: Hide the content of the documents when hovering over
                             specific points. Helps to speed up generation of visualization.
        custom_labels: If bool, whether to use custom topic labels that were defined using
                       `topic_model.set_topic_labels`.
                       If `str`, it uses labels from other aspects, e.g., "Aspect1".
        title: Title of the plot.
        width: The width of the figure.
        height: The height of the figure.

    Returns:
        fig: A plotly figure

    Raises:
        ModuleNotFoundError: If the embeddings must be reduced but UMAP is not installed.

    Examples:
    To visualize the topics simply run:

    ```python
    topic_model.visualize_documents(docs)
    ```

    Do note that this re-calculates the embeddings and reduces them to 2D.
    The advised and preferred pipeline for using this function is as follows:

    ```python
    from sklearn.datasets import fetch_20newsgroups
    from sentence_transformers import SentenceTransformer
    from bertopic import BERTopic
    from umap import UMAP

    # Prepare embeddings
    docs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']
    sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = sentence_model.encode(docs, show_progress_bar=False)

    # Train BERTopic
    topic_model = BERTopic().fit(docs, embeddings)

    # Reduce dimensionality of embeddings, this step is optional
    # reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)

    # Run the visualization with the original embeddings
    topic_model.visualize_documents(docs, embeddings=embeddings)

    # Or, if you have reduced the original embeddings already:
    topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)
    ```

    Or if you want to save the resulting figure:

    ```python
    fig = topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)
    fig.write_html("path/to/file.html")
    ```
    """
    topic_per_doc = topic_model.topics_
    topic_array = np.array(topic_per_doc)
    unique_topics = set(topic_per_doc)

    # Sample the data to optimize for visualization and dimensionality reduction
    if sample is None or sample > 1:
        sample = 1

    indices = []
    for topic in unique_topics:
        members = np.where(topic_array == topic)[0]
        size = len(members) if len(members) < 100 else int(len(members) * sample)
        indices.extend(np.random.choice(members, size=size, replace=False))
    indices = np.array(indices)

    df = pd.DataFrame({"topic": [topic_per_doc[index] for index in indices]})
    df["doc"] = [docs[index] for index in indices]

    # `indices` also re-orders the documents (grouped per topic), so any
    # user-supplied embeddings are always aligned through it.
    if reduced_embeddings is not None:
        embeddings_2d = reduced_embeddings[indices]
    else:
        if embeddings is not None:
            embeddings_to_reduce = embeddings[indices]
        else:
            embeddings_to_reduce = topic_model._extract_embeddings(df.doc.to_list(), method="document")

        # Only the import is guarded so that genuine UMAP failures are not
        # misreported as a missing dependency.
        try:
            from umap import UMAP
        except ImportError as err:
            raise ModuleNotFoundError(
                "UMAP is required if the embeddings are not yet reduced in dimensionality. Please install it using `pip install umap-learn`."
            ) from err

        umap_model = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric="cosine").fit(embeddings_to_reduce)
        embeddings_2d = umap_model.embedding_

    if topics is None:
        topics = unique_topics
    selected_topics = set(topics)

    # Combine data
    df["x"] = embeddings_2d[:, 0]
    df["y"] = embeddings_2d[:, 1]

    # Prepare text and names
    if isinstance(custom_labels, str):
        names = [[[str(topic), None]] + topic_model.topic_aspects_[custom_labels][topic] for topic in unique_topics]
        names = ["_".join([label[0] for label in labels[:4]]) for labels in names]
        names = [label if len(label) < 30 else label[:27] + "..." for label in names]
    elif topic_model.custom_labels_ is not None and custom_labels:
        names = [topic_model.custom_labels_[topic + topic_model._outliers] for topic in unique_topics]
    else:
        names = [
            f"{topic}_" + "_".join([word for word, value in topic_model.get_topic(topic)][:3])
            for topic in unique_topics
        ]

    # Visualize
    fig = go.Figure()

    # Outliers and non-selected topics
    non_selected_topics = unique_topics.difference(selected_topics)
    if len(non_selected_topics) == 0:
        non_selected_topics = [-1]

    selection = _prepare_trace_frame(
        df.loc[df.topic.isin(non_selected_topics), :], "Other documents", annotate=True
    )

    fig.add_trace(
        go.Scattergl(
            x=selection.x,
            y=selection.y,
            hovertext=selection.doc if not hide_document_hover else None,
            hoverinfo="text",
            mode="markers+text",
            name="other",
            showlegend=False,
            marker=dict(color="#CFD8DC", size=5, opacity=0.5),
        )
    )

    # Selected topics
    for name, topic in zip(names, unique_topics):
        if topic not in selected_topics or topic == -1:
            continue
        selection = _prepare_trace_frame(df.loc[df.topic == topic, :], name, annotate=not hide_annotations)
        fig.add_trace(
            go.Scattergl(
                x=selection.x,
                y=selection.y,
                hovertext=selection.doc if not hide_document_hover else None,
                hoverinfo="text",
                text=selection.text,
                mode="markers+text",
                name=name,
                textfont=dict(
                    size=12,
                ),
                marker=dict(size=5, opacity=0.5),
            )
        )

    # Add grid in a 'plus' shape
    x_range = (
        df.x.min() - abs((df.x.min()) * 0.15),
        df.x.max() + abs((df.x.max()) * 0.15),
    )
    y_range = (
        df.y.min() - abs((df.y.min()) * 0.15),
        df.y.max() + abs((df.y.max()) * 0.15),
    )
    fig.add_shape(
        type="line",
        x0=sum(x_range) / 2,
        y0=y_range[0],
        x1=sum(x_range) / 2,
        y1=y_range[1],
        line=dict(color="#CFD8DC", width=2),
    )
    fig.add_shape(
        type="line",
        x0=x_range[0],
        y0=sum(y_range) / 2,
        x1=x_range[1],
        y1=sum(y_range) / 2,
        line=dict(color="#9E9E9E", width=2),
    )
    fig.add_annotation(x=x_range[0], y=sum(y_range) / 2, text="D1", showarrow=False, yshift=10)
    fig.add_annotation(y=y_range[1], x=sum(x_range) / 2, text="D2", showarrow=False, xshift=10)

    # Stylize layout
    fig.update_layout(
        template="simple_white",
        title={
            "text": f"{title}",
            "x": 0.5,
            "xanchor": "center",
            "yanchor": "top",
            "font": dict(size=22, color="Black"),
        },
        width=width,
        height=height,
    )

    fig.update_xaxes(visible=False)
    fig.update_yaxes(visible=False)
    return fig
def visualize_heatmap(
    topic_model,
    topics: List[int] | None = None,
    top_n_topics: int | None = None,
    n_clusters: int | None = None,
    use_ctfidf: bool = False,
    custom_labels: Union[bool, str] = False,
    title: str = "Similarity Matrix",
    width: int = 800,
    height: int = 800,
) -> go.Figure:
    """Visualize a heatmap of the topic's similarity matrix.

    Based on the cosine similarity matrix between topic embeddings (either c-TF-IDF or the embeddings from the embedding
    model), a heatmap is created showing the similarity between topics.

    Arguments:
        topic_model: A fitted BERTopic instance.
        topics: A selection of topics to visualize.
        top_n_topics: Only select the top n most frequent topics.
        n_clusters: Create n clusters and order the similarity
                    matrix by those clusters.
        use_ctfidf: Whether to calculate distances between topics based on c-TF-IDF embeddings. If False, the embeddings
                    from the embedding model are used.
        custom_labels: If bool, whether to use custom topic labels that were defined using
                       `topic_model.set_topic_labels`.
                       If `str`, it uses labels from other aspects, e.g., "Aspect1".
        title: Title of the plot.
        width: The width of the figure.
        height: The height of the figure.

    Returns:
        fig: A plotly figure

    Raises:
        ValueError: If `n_clusters` is not lower than the number of unique topics.

    Examples:
    To visualize the similarity matrix of
    topics simply run:

    ```python
    topic_model.visualize_heatmap()
    ```

    Or if you want to save the resulting figure:

    ```python
    fig = topic_model.visualize_heatmap()
    fig.write_html("path/to/file.html")
    ```
    """
    # Drop the outlier row (if any) so that row `i` is indexed by topic id `i`,
    # which is how the rows are addressed below.
    embeddings = select_topic_representation(topic_model.c_tf_idf_, topic_model.topic_embeddings_, use_ctfidf)[0][
        topic_model._outliers :
    ]

    # Select topics based on top_n and topics args
    freq_df = topic_model.get_topic_freq()
    freq_df = freq_df.loc[freq_df.Topic != -1, :]
    if topics is not None:
        topics = list(topics)
    elif top_n_topics is not None:
        topics = sorted(freq_df.Topic.to_list()[:top_n_topics])
    else:
        topics = sorted(freq_df.Topic.to_list())

    # Order heatmap by similar clusters of topics
    sorted_topics = topics
    if n_clusters:
        if n_clusters >= len(set(topics)):
            raise ValueError("Make sure to set `n_clusters` lower than the total number of unique topics.")

        # Each topic is clustered on its row of similarities to the other topics
        similarity_matrix = cosine_similarity(embeddings[topics])
        Z = linkage(similarity_matrix, "ward")
        clusters = fcluster(Z, t=n_clusters, criterion="maxclust")

        # Extract new order of topics: clusters in order of first appearance
        mapping = {cluster: [] for cluster in clusters}
        for topic, cluster in zip(topics, clusters):
            mapping[cluster].append(topic)
        sorted_topics = [topic for cluster in mapping.values() for topic in cluster]

    # Select embeddings by topic id. Using positions within `topics` here would
    # pick the rows of topics 0..k-1 whenever a subset of topics is requested.
    embeddings = embeddings[np.array(sorted_topics, dtype=int)]
    similarity_matrix = cosine_similarity(embeddings)

    # Create labels
    if isinstance(custom_labels, str):
        new_labels = [
            [[str(topic), None]] + topic_model.topic_aspects_[custom_labels][topic] for topic in sorted_topics
        ]
        new_labels = ["_".join([label[0] for label in labels[:4]]) for labels in new_labels]
        new_labels = [label if len(label) < 30 else label[:27] + "..." for label in new_labels]
    elif topic_model.custom_labels_ is not None and custom_labels:
        new_labels = [topic_model.custom_labels_[topic + topic_model._outliers] for topic in sorted_topics]
    else:
        new_labels = [[[str(topic), None], *topic_model.get_topic(topic)] for topic in sorted_topics]
        new_labels = ["_".join([label[0] for label in labels[:4]]) for labels in new_labels]
        new_labels = [label if len(label) < 30 else label[:27] + "..." for label in new_labels]

    fig = px.imshow(
        similarity_matrix,
        labels=dict(color="Similarity Score"),
        x=new_labels,
        y=new_labels,
        color_continuous_scale="GnBu",
    )

    fig.update_layout(
        title={
            "text": f"{title}",
            "y": 0.95,
            "x": 0.55,
            "xanchor": "center",
            "yanchor": "top",
            "font": dict(size=22, color="Black"),
        },
        width=width,
        height=height,
        hoverlabel=dict(bgcolor="white", font_size=16, font_family="Rockwell"),
    )
    fig.update_layout(showlegend=True)
    fig.update_layout(legend_title_text="Trend")

    return fig
def visualize_hierarchical_documents(
    topic_model,
    docs: List[str],
    hierarchical_topics: pd.DataFrame,
    topics: List[int] | None = None,
    embeddings: np.ndarray = None,
    reduced_embeddings: np.ndarray = None,
    sample: Union[float, int] | None = None,
    hide_annotations: bool = False,
    hide_document_hover: bool = True,
    nr_levels: int = 10,
    level_scale: str = "linear",
    custom_labels: Union[bool, str] = False,
    title: str = "Hierarchical Documents and Topics",
    width: int = 1200,
    height: int = 750,
) -> go.Figure:
    """Visualize documents and their topics in 2D at different levels of hierarchy.

    Arguments:
        topic_model: A fitted BERTopic instance.
        docs: The documents you used when calling either `fit` or `fit_transform`
        hierarchical_topics: A dataframe that contains a hierarchy of topics
                             represented by their parents and their children
        topics: A selection of topics to visualize.
                Not to be confused with the topics that you get from `.fit_transform`.
                For example, if you want to visualize only topics 1 through 5:
                `topics = [1, 2, 3, 4, 5]`.
        embeddings: The embeddings of all documents in `docs`.
        reduced_embeddings: The 2D reduced embeddings of all documents in `docs`.
        sample: The percentage of documents in each topic that you would like to keep.
                Value can be between 0 and 1. Setting this value to, for example,
                0.1 (10% of documents in each topic) makes it easier to visualize
                millions of documents as a subset is chosen. Topics with fewer than
                100 documents are always kept in full.
        hide_annotations: Hide the names of the traces on top of each cluster.
        hide_document_hover: Hide the content of the documents when hovering over
                             specific points. Helps to speed up generation of visualizations.
        nr_levels: The number of levels to be visualized in the hierarchy. First, the distances
                   in `hierarchical_topics.Distance` are split in `nr_levels` lists of distances.
                   Then, for each list of distances, the merged topics are selected that have a
                   distance less or equal to the maximum distance of the selected list of distances.
                   NOTE: To get all possible merged steps, make sure that `nr_levels` is equal to
                   the length of `hierarchical_topics`.
        level_scale: Whether to apply a linear or logarithmic (log) scale levels of the distance
                     vector. Linear scaling will perform an equal number of merges at each level
                     while logarithmic scaling will perform more mergers in earlier levels to
                     provide more resolution at higher levels (this can be used for when the number
                     of topics is large).
        custom_labels: If bool, whether to use custom topic labels that were defined using
                       `topic_model.set_topic_labels`.
                       If `str`, it uses labels from other aspects, e.g., "Aspect1".
                       NOTE: Custom labels are only generated for the original
                       un-merged topics.
        title: Title of the plot.
        width: The width of the figure.
        height: The height of the figure.

    Returns:
        fig: A plotly figure with a slider to switch between hierarchy levels

    Raises:
        ValueError: If `level_scale` is not one of the supported scales.
        ModuleNotFoundError: If the embeddings must be reduced but UMAP is not installed.

    Examples:
    To visualize the topics simply run:

    ```python
    topic_model.visualize_hierarchical_documents(docs, hierarchical_topics)
    ```

    Do note that this re-calculates the embeddings and reduces them to 2D.
    The advised and preferred pipeline for using this function is as follows:

    ```python
    from sklearn.datasets import fetch_20newsgroups
    from sentence_transformers import SentenceTransformer
    from bertopic import BERTopic
    from umap import UMAP

    # Prepare embeddings
    docs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']
    sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = sentence_model.encode(docs, show_progress_bar=False)

    # Train BERTopic and extract hierarchical topics
    topic_model = BERTopic().fit(docs, embeddings)
    hierarchical_topics = topic_model.hierarchical_topics(docs)

    # Reduce dimensionality of embeddings, this step is optional
    # reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)

    # Run the visualization with the original embeddings
    topic_model.visualize_hierarchical_documents(docs, hierarchical_topics, embeddings=embeddings)

    # Or, if you have reduced the original embeddings already:
    topic_model.visualize_hierarchical_documents(docs, hierarchical_topics, reduced_embeddings=reduced_embeddings)
    ```

    Or if you want to save the resulting figure:

    ```python
    fig = topic_model.visualize_hierarchical_documents(docs, hierarchical_topics, reduced_embeddings=reduced_embeddings)
    fig.write_html("path/to/file.html")
    ```

    Note:
        This visualization was inspired by the scatter plot representation of Doc2Map:
        https://github.com/louisgeisler/Doc2Map
    """
    topic_per_doc = topic_model.topics_
    topic_array = np.array(topic_per_doc)

    # Sample the data to optimize for visualization and dimensionality reduction
    if sample is None or sample > 1:
        sample = 1

    indices = []
    for topic in set(topic_per_doc):
        members = np.where(topic_array == topic)[0]
        size = len(members) if len(members) < 100 else int(len(members) * sample)
        indices.extend(np.random.choice(members, size=size, replace=False))
    indices = np.array(indices)

    df = pd.DataFrame({"topic": [topic_per_doc[index] for index in indices]})
    df["doc"] = [docs[index] for index in indices]

    # `indices` also re-orders the documents (grouped per topic), so any
    # user-supplied embeddings are always aligned through it.
    if reduced_embeddings is not None:
        embeddings_2d = reduced_embeddings[indices]
    else:
        if embeddings is not None:
            embeddings_to_reduce = embeddings[indices]
        else:
            embeddings_to_reduce = topic_model._extract_embeddings(df.doc.to_list(), method="document")

        # Only the import is guarded so that genuine UMAP failures are not
        # misreported as a missing dependency.
        try:
            from umap import UMAP
        except ImportError as err:
            raise ModuleNotFoundError(
                "UMAP is required if the embeddings are not yet reduced in dimensionality. Please install it using `pip install umap-learn`."
            ) from err

        umap_model = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric="cosine").fit(embeddings_to_reduce)
        embeddings_2d = umap_model.embedding_

    # Combine data
    df["x"] = embeddings_2d[:, 0]
    df["y"] = embeddings_2d[:, 1]

    # Create topic list for each level, levels are created by calculating the distance
    distances = hierarchical_topics.Distance.to_list()
    if level_scale == "log" or level_scale == "logarithmic":
        log_indices = (
            np.round(
                np.logspace(
                    start=math.log(1, 10),
                    stop=math.log(len(distances) - 1, 10),
                    num=nr_levels,
                )
            )
            .astype(int)
            .tolist()
        )
        log_indices.reverse()
        max_distances = [distances[i] for i in log_indices]
    elif level_scale == "lin" or level_scale == "linear":
        max_distances = [
            distances[chunk[-1]] for chunk in np.array_split(range(len(hierarchical_topics)), nr_levels)
        ][::-1]
    else:
        raise ValueError("level_scale needs to be one of 'log' or 'linear'")

    for index, max_distance in enumerate(max_distances):
        # Get topics below `max_distance`
        mapping = {topic: topic for topic in df.topic.unique()}
        selection = hierarchical_topics.loc[hierarchical_topics.Distance <= max_distance, :]
        # `assign` returns a new frame, so the caller's dataframe is never written to
        selection = selection.assign(Parent_ID=selection.Parent_ID.astype(int)).sort_values("Parent_ID")

        for row in selection.iterrows():
            for topic in row[1].Topics:
                mapping[topic] = row[1].Parent_ID

        # Make sure the mappings are mapped 1:1
        mappings = [True for _ in mapping]
        while any(mappings):
            for i, (key, value) in enumerate(mapping.items()):
                if value in mapping.keys() and key != value:
                    mapping[key] = mapping[value]
                else:
                    mappings[i] = False

        # Create new column
        df[f"level_{index + 1}"] = df.topic.map(mapping)
        df[f"level_{index + 1}"] = df[f"level_{index + 1}"].astype(int)

    # Prepare topic names of original and merged topics
    trace_names = []
    topic_names = {}
    parent_ids = hierarchical_topics.Parent_ID.astype(int)
    min_parent_id = parent_ids.min()
    max_parent_id = parent_ids.max()
    # `+ 1` so the top-most merged topic is named as well; it is displayed when
    # `nr_levels` equals the number of rows in `hierarchical_topics`.
    for topic in range(max_parent_id + 1):
        if topic < min_parent_id:
            # Original, un-merged topic
            if topic_model.get_topic(topic):
                if isinstance(custom_labels, str):
                    trace_name = f"{topic}_" + "_".join(
                        next(zip(*topic_model.topic_aspects_[custom_labels][topic]))[:3]
                    )
                elif topic_model.custom_labels_ is not None and custom_labels:
                    trace_name = topic_model.custom_labels_[topic + topic_model._outliers]
                else:
                    trace_name = f"{topic}_" + "_".join([word[:20] for word, _ in topic_model.get_topic(topic)][:3])
                topic_names[topic] = {
                    "trace_name": trace_name[:40],
                    "plot_text": trace_name[:40],
                }
                trace_names.append(trace_name)
        else:
            # Merged topic: named after the parent in the hierarchy
            trace_name = (
                f"{topic}_"
                + hierarchical_topics.loc[hierarchical_topics.Parent_ID == str(topic), "Parent_Name"].to_numpy()[0]
            )
            plot_text = "_".join([name[:20] for name in trace_name.split("_")[:3]])
            topic_names[topic] = {
                "trace_name": trace_name[:40],
                "plot_text": plot_text[:40],
            }
            trace_names.append(trace_name)

    # Prepare traces
    all_traces = []
    for level in range(len(max_distances)):
        level_column = f"level_{level + 1}"
        traces = []

        # Outliers
        if topic_model._outliers:
            is_outlier = df[level_column] == -1
            traces.append(
                go.Scattergl(
                    x=df.loc[is_outlier, "x"],
                    y=df.loc[is_outlier, "y"],
                    mode="markers+text",
                    name="other",
                    hoverinfo="text",
                    hovertext=df.loc[is_outlier, "doc"] if not hide_document_hover else None,
                    showlegend=False,
                    marker=dict(color="#CFD8DC", size=5, opacity=0.5),
                )
            )

        # Selected topics
        if topics:
            selection = df.loc[(df.topic.isin(topics)), :]
            unique_topics = sorted([int(topic) for topic in selection[level_column].unique()])
        else:
            unique_topics = sorted([int(topic) for topic in df[level_column].unique()])

        for topic in unique_topics:
            if topic == -1:
                continue

            if topics:
                selection = df.loc[(df[level_column] == topic) & (df.topic.isin(topics)), :]
            else:
                selection = df.loc[df[level_column] == topic, :]

            if not hide_annotations:
                # Re-number the rows first: the selection keeps the index labels of
                # `df`, so writing to label `len(selection)` could otherwise overwrite
                # an existing document instead of appending the annotation row.
                selection = selection.reset_index(drop=True)
                annotation_row = len(selection)
                selection.loc[annotation_row, :] = None
                selection["text"] = ""
                selection.loc[annotation_row, "x"] = selection.x.mean()
                selection.loc[annotation_row, "y"] = selection.y.mean()
                selection.loc[annotation_row, "text"] = topic_names[int(topic)]["plot_text"]

            traces.append(
                go.Scattergl(
                    x=selection.x,
                    y=selection.y,
                    text=selection.text if not hide_annotations else None,
                    hovertext=selection.doc if not hide_document_hover else None,
                    hoverinfo="text",
                    name=topic_names[int(topic)]["trace_name"],
                    mode="markers+text",
                    marker=dict(size=5, opacity=0.5),
                )
            )

        all_traces.append(traces)

    # Track and count traces
    nr_traces_per_set = [len(traces) for traces in all_traces]
    trace_indices = [(0, nr_traces_per_set[0])]
    for index, nr_traces in enumerate(nr_traces_per_set[1:]):
        start = trace_indices[index][1]
        end = nr_traces + start
        trace_indices.append((start, end))

    # Visualization
    fig = go.Figure()
    for traces in all_traces:
        for trace in traces:
            fig.add_trace(trace)

    # Only the traces of the first level are visible initially
    for index in range(len(fig.data)):
        if index >= nr_traces_per_set[0]:
            fig.data[index].visible = False

    # Create and add slider
    steps = []
    for level, (start, end) in enumerate(trace_indices):
        step = dict(
            method="update",
            label=str(level),
            args=[{"visible": [False] * len(fig.data)}],
        )
        for trace_position in range(start, end):
            step["args"][0]["visible"][trace_position] = True
        steps.append(step)

    sliders = [dict(currentvalue={"prefix": "Level: "}, pad={"t": 20}, steps=steps)]

    # Add grid in a 'plus' shape
    x_range = (
        df.x.min() - abs((df.x.min()) * 0.15),
        df.x.max() + abs((df.x.max()) * 0.15),
    )
    y_range = (
        df.y.min() - abs((df.y.min()) * 0.15),
        df.y.max() + abs((df.y.max()) * 0.15),
    )
    fig.add_shape(
        type="line",
        x0=sum(x_range) / 2,
        y0=y_range[0],
        x1=sum(x_range) / 2,
        y1=y_range[1],
        line=dict(color="#CFD8DC", width=2),
    )
    fig.add_shape(
        type="line",
        x0=x_range[0],
        y0=sum(y_range) / 2,
        x1=x_range[1],
        y1=sum(y_range) / 2,
        line=dict(color="#9E9E9E", width=2),
    )
    fig.add_annotation(x=x_range[0], y=sum(y_range) / 2, text="D1", showarrow=False, yshift=10)
    fig.add_annotation(y=y_range[1], x=sum(x_range) / 2, text="D2", showarrow=False, xshift=10)

    # Stylize layout
    fig.update_layout(
        sliders=sliders,
        template="simple_white",
        title={
            "text": f"{title}",
            "x": 0.5,
            "xanchor": "center",
            "yanchor": "top",
            "font": dict(size=22, color="Black"),
        },
        width=width,
        height=height,
    )

    fig.update_xaxes(visible=False)
    fig.update_yaxes(visible=False)
    return fig
def visualize_hierarchy(
    topic_model,
    orientation: str = "left",
    topics: List[int] | None = None,
    top_n_topics: int | None = None,
    use_ctfidf: bool = True,
    custom_labels: Union[bool, str] = False,
    title: str = "Hierarchical Clustering",
    width: int = 1000,
    height: int = 600,
    hierarchical_topics: pd.DataFrame = None,
    linkage_function: Callable[[csr_matrix], np.ndarray] | None = None,
    distance_function: Callable[[csr_matrix], csr_matrix] | None = None,
    color_threshold: int = 1,
) -> go.Figure:
    """Visualize a hierarchical structure of the topics.

    A ward linkage function is used to perform the
    hierarchical clustering based on the cosine distance
    matrix between topic embeddings (either c-TF-IDF or the embeddings from the embedding model).

    Arguments:
        topic_model: A fitted BERTopic instance.
        orientation: The orientation of the figure.
                     Either 'left' or 'bottom'
        topics: A selection of topics to visualize
        top_n_topics: Only select the top n most frequent topics
        use_ctfidf: Whether to calculate distances between topics based on c-TF-IDF embeddings. If False, the
                    embeddings from the embedding model are used.
        custom_labels: If bool, whether to use custom topic labels that were defined using
                       `topic_model.set_topic_labels`.
                       If `str`, it uses labels from other aspects, e.g., "Aspect1".
                       NOTE: Custom labels are only generated for the original
                       un-merged topics.
        title: Title of the plot.
        width: The width of the figure. Only works if orientation is set to 'left'
        height: The height of the figure. Only works if orientation is set to 'bottom'
        hierarchical_topics: A dataframe that contains a hierarchy of topics
                             represented by their parents and their children.
                             NOTE: The hierarchical topic names are only visualized
                             if both `topics` and `top_n_topics` are not set.
        linkage_function: The linkage function to use. Default is:
                          `lambda x: sch.linkage(x, 'ward', optimal_ordering=True)`
                          NOTE: Make sure to use the same `linkage_function` as used
                          in `topic_model.hierarchical_topics`.
        distance_function: The distance function to use on the c-TF-IDF matrix. Default is:
                           `lambda x: 1 - cosine_similarity(x)`.
                           You can pass any function that returns either a square matrix of
                           shape (n_samples, n_samples) with zeros on the diagonal and
                           non-negative values or condensed distance matrix of shape
                           (n_samples * (n_samples - 1) / 2,) containing the upper
                           triangular of the distance matrix.
                           NOTE: Make sure to use the same `distance_function` as used
                           in `topic_model.hierarchical_topics`.
        color_threshold: Value at which the separation of clusters will be made which
                         will result in different colors for different clusters.
                         A higher value will typically lead in less colored clusters.

    Returns:
        fig: A plotly figure

    Examples:
    To visualize the hierarchical structure of
    topics simply run:

    ```python
    topic_model.visualize_hierarchy()
    ```

    If you also want the labels visualized of hierarchical topics,
    run the following:

    ```python
    # Extract hierarchical topics and their representations
    hierarchical_topics = topic_model.hierarchical_topics(docs)

    # Visualize these representations
    topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)
    ```

    If you want to save the resulting figure:

    ```python
    fig = topic_model.visualize_hierarchy()
    fig.write_html("path/to/file.html")
    ```
    """
    if distance_function is None:
        distance_function = lambda x: 1 - cosine_similarity(x)

    if linkage_function is None:
        linkage_function = lambda x: sch.linkage(x, "ward", optimal_ordering=True)

    # Select topics based on top_n and topics args (the outlier topic -1 is never drawn)
    freq_df = topic_model.get_topic_freq()
    freq_df = freq_df.loc[freq_df.Topic != -1, :]
    if topics is not None:
        topics = list(topics)
    elif top_n_topics is not None:
        topics = sorted(freq_df.Topic.to_list()[:top_n_topics])
    else:
        topics = sorted(freq_df.Topic.to_list())

    # Rows of the representation matrix follow the sorted topic ids
    all_topics = sorted(topic_model.get_topics().keys())
    indices = np.array([all_topics.index(topic) for topic in topics])

    # Select topic embeddings
    embeddings = select_topic_representation(topic_model.c_tf_idf_, topic_model.topic_embeddings_, use_ctfidf)[0][
        indices
    ]

    # Annotations are only generated when every non-outlier topic is shown
    if hierarchical_topics is not None and len(topics) == len(freq_df.Topic.to_list()):
        annotations = _get_annotations(
            topic_model=topic_model,
            hierarchical_topics=hierarchical_topics,
            embeddings=embeddings,
            distance_function=distance_function,
            linkage_function=linkage_function,
            orientation=orientation,
            custom_labels=custom_labels,
        )
    else:
        annotations = None

    # Wrap distance function to validate input and return a condensed distance matrix
    distance_function_viz = lambda x: validate_distance_matrix(distance_function(x), embeddings.shape[0])

    # Create dendrogram
    fig = ff.create_dendrogram(
        embeddings,
        orientation=orientation,
        distfun=distance_function_viz,
        linkagefun=linkage_function,
        hovertext=annotations,
        color_threshold=color_threshold,
    )

    # Create nicer labels. The tick texts are positions into `topics` (as strings),
    # so translate them to real topic ids once and use those in every branch.
    axis = "yaxis" if orientation == "left" else "xaxis"
    tick_topics = [topics[int(tick)] for tick in fig.layout[axis]["ticktext"]]
    if isinstance(custom_labels, str):
        new_labels = [
            [[str(topic), None], *topic_model.topic_aspects_[custom_labels][topic]] for topic in tick_topics
        ]
        new_labels = ["_".join([label[0] for label in labels[:4]]) for labels in new_labels]
        new_labels = [label if len(label) < 30 else label[:27] + "..." for label in new_labels]
    elif topic_model.custom_labels_ is not None and custom_labels:
        new_labels = [topic_model.custom_labels_[topic + topic_model._outliers] for topic in tick_topics]
    else:
        new_labels = [[[str(topic), None], *topic_model.get_topic(topic)] for topic in tick_topics]
        new_labels = ["_".join([label[0] for label in labels[:4]]) for labels in new_labels]
        new_labels = [label if len(label) < 30 else label[:27] + "..." for label in new_labels]

    # Stylize layout
    fig.update_layout(
        plot_bgcolor="#ECEFF1",
        template="plotly_white",
        title={
            "text": f"{title}",
            "x": 0.5,
            "xanchor": "center",
            "yanchor": "top",
            "font": dict(size=22, color="Black"),
        },
        hoverlabel=dict(bgcolor="white", font_size=16, font_family="Rockwell"),
    )

    # Stylize orientation
    if orientation == "left":
        fig.update_layout(
            height=200 + (15 * len(topics)),
            width=width,
            yaxis=dict(tickmode="array", ticktext=new_labels),
        )

        # Fix empty space on the bottom of the graph
        y_max = max([trace["y"].max() + 5 for trace in fig["data"]])
        y_min = min([trace["y"].min() - 5 for trace in fig["data"]])
        fig.update_layout(yaxis=dict(range=[y_min, y_max]))

    else:
        fig.update_layout(
            width=200 + (15 * len(topics)),
            height=height,
            xaxis=dict(tickmode="array", ticktext=new_labels),
        )

    if hierarchical_topics is not None:
        # Put a hoverable marker on both ends (points 0 and 3) of every annotated link
        axis = "x" if orientation == "left" else "y"
        for index in [0, 3]:
            annotated = [data for data in fig.data if (data["text"] and data[axis][index] > 0)]
            fig.add_trace(
                go.Scatter(
                    x=[data["x"][index] for data in annotated],
                    y=[data["y"][index] for data in annotated],
                    marker_color="black",
                    hovertext=[data["text"][index] for data in annotated],
                    hoverinfo="text",
                    mode="markers",
                    showlegend=False,
                )
            )
    return fig
embeddings: The c-TF-IDF matrix on which to model the hierarchy linkage_function: The linkage function to use. Default is: `lambda x: sch.linkage(x, 'ward', optimal_ordering=True)` NOTE: Make sure to use the same `linkage_function` as used in `topic_model.hierarchical_topics`. distance_function: The distance function to use on the c-TF-IDF matrix. Default is: `lambda x: 1 - cosine_similarity(x)`. You can pass any function that returns either a square matrix of shape (n_samples, n_samples) with zeros on the diagonal and non-negative values or condensed distance matrix of shape (n_samples * (n_samples - 1) / 2,) containing the upper triangular of the distance matrix. NOTE: Make sure to use the same `distance_function` as used in `topic_model.hierarchical_topics`. orientation: The orientation of the figure. Either 'left' or 'bottom' custom_labels: Whether to use custom topic labels that were defined using `topic_model.set_topic_labels`. NOTE: Custom labels are only generated for the original un-merged topics. 
Returns: text_annotations: Annotations to be used within Plotly's `ff.create_dendogram` """ df = hierarchical_topics.loc[hierarchical_topics.Parent_Name != "Top", :] # Calculate distance X = distance_function(embeddings) X = validate_distance_matrix(X, embeddings.shape[0]) # Calculate linkage and generate dendrogram Z = linkage_function(X) P = sch.dendrogram(Z, orientation=orientation, no_plot=True) # store topic no.(leaves) corresponding to the x-ticks in dendrogram x_ticks = np.arange(5, len(P["leaves"]) * 10 + 5, 10) x_topic = dict(zip(P["leaves"], x_ticks)) topic_vals = dict() for key, val in x_topic.items(): topic_vals[val] = [key] parent_topic = dict(zip(df.Parent_ID, df.Topics)) # loop through every trace (scatter plot) in dendrogram text_annotations = [] for index, trace in enumerate(P["icoord"]): fst_topic = topic_vals[trace[0]] scnd_topic = topic_vals[trace[2]] if len(fst_topic) == 1: if isinstance(custom_labels, str): fst_name = f"{fst_topic[0]}_" + "_".join( next(zip(*topic_model.topic_aspects_[custom_labels][fst_topic[0]]))[:3] ) elif topic_model.custom_labels_ is not None and custom_labels: fst_name = topic_model.custom_labels_[fst_topic[0] + topic_model._outliers] else: fst_name = "_".join([word for word, _ in topic_model.get_topic(fst_topic[0])][:5]) else: for key, value in parent_topic.items(): if set(value) == set(fst_topic): fst_name = df.loc[df.Parent_ID == key, "Parent_Name"].to_numpy()[0] if len(scnd_topic) == 1: if isinstance(custom_labels, str): scnd_name = f"{scnd_topic[0]}_" + "_".join( next(zip(*topic_model.topic_aspects_[custom_labels][scnd_topic[0]]))[:3] ) elif topic_model.custom_labels_ is not None and custom_labels: scnd_name = topic_model.custom_labels_[scnd_topic[0] + topic_model._outliers] else: scnd_name = "_".join([word for word, _ in topic_model.get_topic(scnd_topic[0])][:5]) else: for key, value in parent_topic.items(): if set(value) == set(scnd_topic): scnd_name = df.loc[df.Parent_ID == key, "Parent_Name"].to_numpy()[0] 
def visualize_term_rank(
    topic_model,
    topics: List[int] | None = None,
    log_scale: bool = False,
    custom_labels: Union[bool, str] = False,
    title: str = "Term score decline per Topic",
    width: int = 800,
    height: int = 500,
) -> go.Figure:
    """Visualize the ranks of all terms across all topics.

    Each topic is represented by a set of words. These words, however,
    do not all equally represent the topic. This visualization shows
    how many words are needed to represent a topic and at which point
    the beneficial effect of adding words starts to decline.

    Arguments:
        topic_model: A fitted BERTopic instance.
        topics: A selection of topics to visualize. These will be colored
                red where all others will be colored black.
        log_scale: Whether to represent the ranking on a log scale
        custom_labels: If bool, whether to use custom topic labels that were defined using
                       `topic_model.set_topic_labels`.
                       If `str`, it uses labels from other aspects, e.g., "Aspect1".
        title: Title of the plot.
        width: The width of the figure.
        height: The height of the figure.

    Returns:
        fig: A plotly figure

    Examples:
    To visualize the ranks of all words across
    all topics simply run:

    ```python
    topic_model.visualize_term_rank()
    ```

    Or if you want to save the resulting figure:

    ```python
    fig = topic_model.visualize_term_rank()
    fig.write_html("path/to/file.html")
    ```

    Reference:

    This visualization was heavily inspired by the
    "Term Probability Decline" visualization found in an
    analysis by the amazing [tmtoolkit](https://tmtoolkit.readthedocs.io/).
    Reference to that specific analysis can be found
    [here](https://wzbsocialsciencecenter.github.io/tm_corona/tm_analysis.html).
    """
    highlighted = set() if topics is None else set(topics)

    topic_ids = topic_model.get_topic_info().Topic.unique().tolist()
    topic_words = [topic_model.get_topic(topic) for topic in topic_ids]
    values = np.array([[value[1] for value in words] for words in topic_words])
    indices = np.array([list(range(1, len(words) + 1)) for words in topic_words])

    # Smallest strictly positive score over all topics; zeros are raised to it so
    # that they can be drawn on a log scale. It is computed once, up front, from
    # the untouched scores so that the log transform below cannot influence it.
    positive_scores = values[values > 0]
    score_floor = positive_scores.min() if positive_scores.size else None

    # Create figure
    lines = []
    for topic, x, scores in zip(topic_ids, indices, values):
        # Topics with any score above 1.5 are left out of the plot
        if np.any(scores > 1.5):
            continue

        # labels
        if isinstance(custom_labels, str):
            label = f"{topic}_" + "_".join(next(zip(*topic_model.topic_aspects_[custom_labels][topic]))[:3])
        elif topic_model.custom_labels_ is not None and custom_labels:
            label = topic_model.custom_labels_[topic + topic_model._outliers]
        else:
            label = f"Topic {topic}:" + "_".join([word[0] for word in topic_model.get_topic(topic)])
            label = label[:50]

        # line parameters
        is_highlighted = topic in highlighted
        color = "red" if is_highlighted else "black"
        opacity = 1 if is_highlighted else 0.1

        # Work on a copy: `scores` is a view into `values`
        y = scores.copy()
        if score_floor is not None:
            y[y == 0] = score_floor
        if log_scale:
            np.log10(y, out=y, where=y > 0)

        lines.append(
            go.Scatter(
                x=x,
                y=y,
                name="",
                hovertext=label,
                mode="lines",
                opacity=opacity,
                line=dict(color=color, width=1.5),
            )
        )

    fig = go.Figure(data=lines)

    # Stylize layout
    max_rank = len(indices[0]) if len(indices) else 0
    fig.update_xaxes(range=[0, max_rank], tick0=1, dtick=2)
    fig.update_layout(
        showlegend=False,
        template="plotly_white",
        title={
            "text": f"{title}",
            "y": 0.9,
            "x": 0.5,
            "xanchor": "center",
            "yanchor": "top",
            "font": dict(size=22, color="Black"),
        },
        width=width,
        height=height,
        hoverlabel=dict(bgcolor="white", font_size=16, font_family="Rockwell"),
    )

    fig.update_xaxes(title_text="Term Rank")
    if log_scale:
        fig.update_yaxes(title_text="c-TF-IDF score (log scale)")
    else:
        fig.update_yaxes(title_text="c-TF-IDF score")

    return fig
def visualize_topics(
    topic_model,
    topics: List[int] | None = None,
    top_n_topics: int | None = None,
    use_ctfidf: bool = False,
    custom_labels: Union[bool, str] = False,
    title: str = "Intertopic Distance Map",
    width: int = 650,
    height: int = 650,
) -> go.Figure:
    """Visualize topics, their sizes, and their corresponding words.

    This visualization is highly inspired by LDAvis, a great visualization
    technique typically reserved for LDA.

    Arguments:
        topic_model: A fitted BERTopic instance.
        topics: A selection of topics to visualize
        top_n_topics: Only select the top n most frequent topics
        use_ctfidf: Whether to use c-TF-IDF representations instead of the embeddings from the embedding model.
        custom_labels: If bool, whether to use custom topic labels that were defined using
                       `topic_model.set_topic_labels`.
                       If `str`, it uses labels from other aspects, e.g., "Aspect1".
        title: Title of the plot.
        width: The width of the figure.
        height: The height of the figure.

    Returns:
        fig: A plotly figure

    Raises:
        ModuleNotFoundError: If UMAP is not installed.

    Examples:
    To visualize the topics simply run:

    ```python
    topic_model.visualize_topics()
    ```

    Or if you want to save the resulting figure:

    ```python
    fig = topic_model.visualize_topics()
    fig.write_html("path/to/file.html")
    ```
    """
    # UMAP is needed for the 2D projection; fail before doing any other work
    if not HAS_UMAP:
        raise ModuleNotFoundError(
            "UMAP is required to reduce the embeddings.. Please install it using `pip install umap-learn`."
        )

    # Select topics based on top_n and topics args
    freq_df = topic_model.get_topic_freq()
    freq_df = freq_df.loc[freq_df.Topic != -1, :]
    if topics is not None:
        topics = list(topics)
    elif top_n_topics is not None:
        topics = sorted(freq_df.Topic.to_list()[:top_n_topics])
    else:
        topics = sorted(freq_df.Topic.to_list())

    # Extract topic words and their frequencies
    topic_list = sorted(topics)
    frequencies = [topic_model.topic_sizes_[topic] for topic in topic_list]
    if isinstance(custom_labels, str):
        words = [[[str(topic), None], *topic_model.topic_aspects_[custom_labels][topic]] for topic in topic_list]
        words = ["_".join([label[0] for label in labels[:4]]) for labels in words]
        words = [label if len(label) < 30 else label[:27] + "..." for label in words]
    elif custom_labels and topic_model.custom_labels_ is not None:
        words = [topic_model.custom_labels_[topic + topic_model._outliers] for topic in topic_list]
    else:
        words = [" | ".join([word[0] for word in topic_model.get_topic(topic)[:5]]) for topic in topic_list]

    # Pick the representation rows in the same (sorted) order as `topic_list`, so that
    # every 2D point lines up with its Topic/Words/Size even if `topics` was unsorted.
    all_topics = sorted(topic_model.get_topics().keys())
    indices = np.array([all_topics.index(topic) for topic in topic_list])

    embeddings, c_tfidf_used = select_topic_representation(
        topic_model.c_tf_idf_,
        topic_model.topic_embeddings_,
        use_ctfidf=use_ctfidf,
        output_ndarray=True,
    )
    embeddings = embeddings[indices]

    # Reduce to 2D: c-TF-IDF rows are min-max scaled and compared with the
    # hellinger metric, dense embeddings with the cosine metric.
    if c_tfidf_used:
        embeddings = MinMaxScaler().fit_transform(embeddings)
        metric = "hellinger"
    else:
        metric = "cosine"
    embeddings = UMAP(n_neighbors=2, n_components=2, metric=metric, init="random", random_state=42).fit_transform(
        embeddings
    )

    # Visualize with plotly
    df = pd.DataFrame(
        {
            "x": embeddings[:, 0],
            "y": embeddings[:, 1],
            "Topic": topic_list,
            "Words": words,
            "Size": frequencies,
        }
    )
    return _plotly_topic_visualization(df, topic_list, title, width, height)


def _plotly_topic_visualization(
    df: pd.DataFrame, topic_list: List[int], title: str, width: int, height: int
) -> go.Figure:
    """Create plotly-based visualization of topics with a slider for topic selection.

    Arguments:
        df: A dataframe with the columns "x", "y", "Topic", "Words" and "Size",
            one row per topic.
        topic_list: The topics in `df`, in row order; one slider step is created per topic.
        title: Title of the plot.
        width: The width of the figure.
        height: The height of the figure.

    Returns:
        fig: A plotly figure
    """

    def get_color(topic_selected):
        # The selected topic is drawn red, every other topic grey; -1 selects nothing
        if topic_selected == -1:
            marker_color = ["#B0BEC5" for _ in topic_list]
        else:
            marker_color = ["red" if topic == topic_selected else "#B0BEC5" for topic in topic_list]
        return [{"marker.color": [marker_color]}]

    # Prepare figure range: pad the data extent by 15% of its outermost values
    x_min, x_max = df.x.min(), df.x.max()
    y_min, y_max = df.y.min(), df.y.max()
    x_range = (x_min - abs(x_min * 0.15), x_max + abs(x_max * 0.15))
    y_range = (y_min - abs(y_min * 0.15), y_max + abs(y_max * 0.15))
    x_center = sum(x_range) / 2
    y_center = sum(y_range) / 2

    # Plot topics
    fig = px.scatter(
        df,
        x="x",
        y="y",
        size="Size",
        size_max=40,
        template="simple_white",
        labels={"x": "", "y": ""},
        hover_data={"Topic": True, "Words": True, "Size": True, "x": False, "y": False},
    )
    fig.update_traces(marker=dict(color="#B0BEC5", line=dict(width=2, color="DarkSlateGrey")))

    # Update hover order
    fig.update_traces(
        hovertemplate="<br>".join(
            [
                "<b>Topic %{customdata[0]}</b>",
                "%{customdata[1]}",
                "Size: %{customdata[2]}",
            ]
        )
    )

    # Create a slider for topic selection
    steps = [dict(label=f"Topic {topic}", method="update", args=get_color(topic)) for topic in topic_list]
    sliders = [dict(active=0, pad={"t": 50}, steps=steps)]

    # Stylize layout
    fig.update_layout(
        title={
            "text": f"{title}",
            "y": 0.95,
            "x": 0.5,
            "xanchor": "center",
            "yanchor": "top",
            "font": dict(size=22, color="Black"),
        },
        width=width,
        height=height,
        hoverlabel=dict(bgcolor="white", font_size=16, font_family="Rockwell"),
        xaxis={"visible": False},
        yaxis={"visible": False},
        sliders=sliders,
    )

    # Update axes ranges
    fig.update_xaxes(range=x_range)
    fig.update_yaxes(range=y_range)

    # Add grid in a 'plus' shape
    fig.add_shape(
        type="line",
        x0=x_center,
        y0=y_range[0],
        x1=x_center,
        y1=y_range[1],
        line=dict(color="#CFD8DC", width=2),
    )
    fig.add_shape(
        type="line",
        x0=x_range[0],
        y0=y_center,
        x1=x_range[1],
        y1=y_center,
        line=dict(color="#9E9E9E", width=2),
    )
    fig.add_annotation(x=x_range[0], y=y_center, text="D1", showarrow=False, yshift=10)
    fig.add_annotation(y=y_range[1], x=x_center, text="D2", showarrow=False, xshift=10)
    fig.data = fig.data[::-1]

    return fig
topics_over_time: The topics you would like to be visualized with the corresponding topic representation top_n_topics: To visualize the most frequent topics instead of all topics: Select which topics you would like to be visualized normalize_frequency: Whether to normalize each topic's frequency individually custom_labels: If bool, whether to use custom topic labels that were defined using `topic_model.set_topic_labels`. If `str`, it uses labels from other aspects, e.g., "Aspect1". title: Title of the plot. width: The width of the figure. height: The height of the figure. Returns: A plotly.graph_objects.Figure including all traces Examples: To visualize the topics over time, simply run: ```python topics_over_time = topic_model.topics_over_time(docs, timestamps) topic_model.visualize_topics_over_time(topics_over_time) ``` Or if you want to save the resulting figure: ```python fig = topic_model.visualize_topics_over_time(topics_over_time) fig.write_html("path/to/file.html") ``` """ colors = [ "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#D55E00", "#0072B2", "#CC79A7", ] # Select topics based on top_n and topics args freq_df = topic_model.get_topic_freq() freq_df = freq_df.loc[freq_df.Topic != -1, :] if topics is not None: selected_topics = list(topics) elif top_n_topics is not None: selected_topics = sorted(freq_df.Topic.to_list()[:top_n_topics]) else: selected_topics = sorted(freq_df.Topic.to_list()) # Prepare data if isinstance(custom_labels, str): topic_names = [[[str(topic), None]] + topic_model.topic_aspects_[custom_labels][topic] for topic in topics] topic_names = ["_".join([label[0] for label in labels[:4]]) for labels in topic_names] topic_names = [label if len(label) < 30 else label[:27] + "..." 
for label in topic_names] topic_names = {key: topic_names[index] for index, key in enumerate(topic_model.topic_labels_.keys())} elif topic_model.custom_labels_ is not None and custom_labels: topic_names = { key: topic_model.custom_labels_[key + topic_model._outliers] for key, _ in topic_model.topic_labels_.items() } else: topic_names = { key: value[:40] + "..." if len(value) > 40 else value for key, value in topic_model.topic_labels_.items() } topics_over_time["Name"] = topics_over_time.Topic.map(topic_names) data = topics_over_time.loc[topics_over_time.Topic.isin(selected_topics), :].sort_values(["Topic", "Timestamp"]) # Add traces fig = go.Figure() for index, topic in enumerate(data.Topic.unique()): trace_data = data.loc[data.Topic == topic, :] topic_name = trace_data.Name.to_numpy()[0] words = trace_data.Words.to_numpy() if normalize_frequency: y = normalize(trace_data.Frequency.to_numpy().reshape(1, -1))[0] else: y = trace_data.Frequency fig.add_trace( go.Scatter( x=trace_data.Timestamp, y=y, mode="lines", marker_color=colors[index % 7], hoverinfo="text", name=topic_name, hovertext=[f"Topic {topic}
def visualize_topics_per_class(
    topic_model,
    topics_per_class: pd.DataFrame,
    top_n_topics: int = 10,
    topics: List[int] | None = None,
    normalize_frequency: bool = False,
    custom_labels: Union[bool, str] = False,
    title: str = "<b>Topics per Class</b>",
    width: int = 1250,
    height: int = 900,
) -> go.Figure:
    """Visualize topics per class.

    Arguments:
        topic_model: A fitted BERTopic instance.
        topics_per_class: The topics you would like to be visualized with the
                          corresponding topic representation.
                          NOTE: A "Name" column is added to this dataframe in place.
        top_n_topics: To visualize the most frequent topics instead of all
        topics: Select which topics you would like to be visualized
        normalize_frequency: Whether to normalize each topic's frequency individually
        custom_labels: If bool, whether to use custom topic labels that were defined using
                       `topic_model.set_topic_labels`.
                       If `str`, it uses labels from other aspects, e.g., "Aspect1".
        title: Title of the plot.
        width: The width of the figure.
        height: The height of the figure.

    Returns:
        A plotly.graph_objects.Figure including all traces

    Examples:
    To visualize the topics per class, simply run:

    ```python
    topics_per_class = topic_model.topics_per_class(docs, classes)
    topic_model.visualize_topics_per_class(topics_per_class)
    ```

    Or if you want to save the resulting figure:

    ```python
    fig = topic_model.visualize_topics_per_class(topics_per_class)
    fig.write_html("path/to/file.html")
    ```
    """
    colors = [
        "#E69F00",
        "#56B4E9",
        "#009E73",
        "#F0E442",
        "#D55E00",
        "#0072B2",
        "#CC79A7",
    ]

    # Select topics based on top_n and topics args
    freq_df = topic_model.get_topic_freq()
    freq_df = freq_df.loc[freq_df.Topic != -1, :]
    if topics is not None:
        selected_topics = list(topics)
    elif top_n_topics is not None:
        selected_topics = sorted(freq_df.Topic.to_list()[:top_n_topics])
    else:
        selected_topics = sorted(freq_df.Topic.to_list())

    # Prepare data
    if isinstance(custom_labels, str):
        # Key every label by its own topic id (never by position) and build it from
        # the selected topics, which are always defined even when `topics` is None.
        aspect = topic_model.topic_aspects_[custom_labels]
        topic_names = {}
        for topic in selected_topics:
            label = "_".join([str(topic), *[entry[0] for entry in aspect[topic][:3]]])
            topic_names[topic] = label if len(label) < 30 else label[:27] + "..."
    elif topic_model.custom_labels_ is not None and custom_labels:
        topic_names = {
            key: topic_model.custom_labels_[key + topic_model._outliers] for key in topic_model.topic_labels_
        }
    else:
        topic_names = {
            key: value[:40] + "..." if len(value) > 40 else value for key, value in topic_model.topic_labels_.items()
        }
    topics_per_class["Name"] = topics_per_class.Topic.map(topic_names)
    data = topics_per_class.loc[topics_per_class.Topic.isin(selected_topics), :]

    # Add traces
    fig = go.Figure()
    for index, topic in enumerate(selected_topics):
        trace_data = data.loc[data.Topic == topic, :]
        if trace_data.empty:
            # A selected topic may not occur in `topics_per_class`; nothing to draw
            continue

        # Only the first drawn topic is shown initially, the rest is toggled via the legend
        visible = True if not fig.data else "legendonly"
        topic_name = trace_data.Name.to_numpy()[0]
        words = trace_data.Words.to_numpy()
        if normalize_frequency:
            x = normalize(trace_data.Frequency.to_numpy().reshape(1, -1))[0]
        else:
            x = trace_data.Frequency
        fig.add_trace(
            go.Bar(
                y=trace_data.Class,
                x=x,
                visible=visible,
                marker_color=colors[index % len(colors)],
                hoverinfo="text",
                name=topic_name,
                orientation="h",
                hovertext=[f"<b>Topic {topic}</b><br>Words: {word}" for word in words],
            )
        )

    # Styling of the visualization
    fig.update_xaxes(showgrid=True)
    fig.update_yaxes(showgrid=True)
    fig.update_layout(
        xaxis_title="Normalized Frequency" if normalize_frequency else "Frequency",
        yaxis_title="Class",
        title={
            "text": f"{title}",
            "y": 0.95,
            "x": 0.40,
            "xanchor": "center",
            "yanchor": "top",
            "font": dict(size=22, color="Black"),
        },
        template="simple_white",
        width=width,
        height=height,
        hoverlabel=dict(bgcolor="white", font_size=16, font_family="Rockwell"),
        legend=dict(
            title="<b>Global Topic Representation",
        ),
    )
    return fig
class BaseRepresentation(BaseEstimator):
    """The base representation model for fine-tuning topic representations."""

    def extract_topics(
        self,
        topic_model,
        documents: pd.DataFrame,
        c_tf_idf: csr_matrix,
        topics: Mapping[str, List[Tuple[str, float]]],
    ) -> Mapping[str, List[Tuple[str, float]]]:
        """Extract topics.

        Every representation model that inherits from this class gets the
        arguments (topic_model, documents, c_tf_idf, topics) passed in
        automatically, so a representation model only has access to the
        topic information carried by those arguments.

        This base implementation does not use `documents`, `c_tf_idf` or
        `topics`; it hands back the representations stored on the model.

        Arguments:
            topic_model: The BERTopic model that is fitted until topic
                         representations are calculated.
            documents: A dataframe with columns "Document" and "Topic"
                       that contains all documents with each corresponding
                       topic.
            c_tf_idf: A c-TF-IDF representation that is typically
                      identical to `topic_model.c_tf_idf_` except for
                      dynamic, class-based, and hierarchical topic modeling
                      where it is calculated on a subset of the documents.
            topics: A dictionary with topic (key) and tuple of word and
                    weight (value) as calculated by c-TF-IDF. This is the
                    default topics that are returned if no representation
                    model is used.

        Returns:
            The `topic_representations_` attribute of `topic_model`.
        """
        default_representations = topic_model.topic_representations_
        return default_representations
class Cohere(BaseRepresentation):
    """Generate topic labels with a generative model served through the Cohere API.

    Find more about their models here:
    https://docs.cohere.ai/docs

    Arguments:
        client: A `cohere.Client`
        model: Model to use within Cohere, defaults to `"command-r"`.
        prompt: The prompt to be used in the model. If no prompt is given,
                `self.default_prompt_` is used instead.
                NOTE: Use `"[KEYWORDS]"` and `"[DOCUMENTS]"` in the prompt
                to decide where the keywords and documents need to be
                inserted.
        system_prompt: The system prompt to be used in the model. If no system prompt is given,
                       `self.default_system_prompt_` is used instead.
        delay_in_seconds: The delay in seconds between consecutive prompts
                          in order to prevent RateLimitErrors.
        nr_docs: The number of representative documents per topic that are
                 inserted at the `"[DOCUMENTS]"` tag of the prompt.
        diversity: The diversity of documents to pass to Cohere.
                   Accepts values between 0 and 1. A higher value
                   results in passing more diverse documents
                   whereas lower values pass more similar documents.
        doc_length: The maximum length of each document. If a document is longer,
                    it will be truncated. If None, the entire document is passed.
        tokenizer: The tokenizer used to split the document into segments
                   that are counted to determine the length of a document.
                       * If tokenizer is 'char', then the document is split up
                         into characters which are counted to adhere to `doc_length`
                       * If tokenizer is 'whitespace', the document is split up
                         into words separated by whitespaces. These words are counted
                         and truncated depending on `doc_length`
                       * If tokenizer is 'vectorizer', then the internal CountVectorizer
                         is used to tokenize the document. These tokens are counted
                         and truncated depending on `doc_length`
                       * If tokenizer is a callable, then that callable is used to tokenize
                         the document. These tokens are counted and truncated depending
                         on `doc_length`

    Usage:

    To use this, you will need to install cohere first:

    `pip install cohere`

    Then, get yourself an API key and use Cohere's API as follows:

    ```python
    import cohere
    from bertopic.representation import Cohere
    from bertopic import BERTopic

    # Create your representation model
    co = cohere.Client(my_api_key)
    representation_model = Cohere(co)

    # Use the representation model in BERTopic on top of the default pipeline
    topic_model = BERTopic(representation_model=representation_model)
    ```

    You can also use a custom prompt:

    ```python
    prompt = "I have the following documents: [DOCUMENTS]. What topic do they contain?"
    representation_model = Cohere(co, prompt=prompt)
    ```
    """

    def __init__(
        self,
        client,
        model: str = "command-r",
        prompt: str | None = None,
        system_prompt: str | None = None,
        delay_in_seconds: float | None = None,
        nr_docs: int = 4,
        diversity: float | None = None,
        doc_length: int | None = None,
        tokenizer: Union[str, Callable] | None = None,
    ):
        self.client = client
        self.model = model

        # Fall back to the module-level defaults when nothing is supplied
        if prompt is None:
            prompt = DEFAULT_PROMPT
        if system_prompt is None:
            system_prompt = DEFAULT_SYSTEM_PROMPT
        self.prompt = prompt
        self.system_prompt = system_prompt
        self.default_prompt_ = DEFAULT_PROMPT
        self.default_system_prompt_ = DEFAULT_SYSTEM_PROMPT

        self.delay_in_seconds = delay_in_seconds
        self.nr_docs = nr_docs
        self.diversity = diversity
        self.doc_length = doc_length
        self.tokenizer = tokenizer
        validate_truncate_document_parameters(self.tokenizer, self.doc_length)

        # Every prompt sent to the API is appended here by `extract_topics`
        self.prompts_ = []

    def extract_topics(
        self,
        topic_model,
        documents: pd.DataFrame,
        c_tf_idf: csr_matrix,
        topics: Mapping[str, List[Tuple[str, float]]],
    ) -> Mapping[str, List[Tuple[str, float]]]:
        """Ask the Cohere model for one label per topic.

        Arguments:
            topic_model: A BERTopic model; used to select representative
                         documents, to truncate them, and for its `verbose` flag
            documents: All input documents
            c_tf_idf: The topic c-TF-IDF representation
            topics: The candidate topics as calculated with c-TF-IDF

        Returns:
            updated_topics: Updated topic representations; per topic the generated
                            label with weight 1 followed by nine empty `("", 0)` entries
        """
        # Select `self.nr_docs` representative documents per topic
        repr_docs_mappings, _, _, _ = topic_model._extract_representative_docs(
            c_tf_idf, documents, topics, 500, self.nr_docs, self.diversity
        )

        # Query Cohere's language model once per topic
        updated_topics = {}
        progress = tqdm(repr_docs_mappings.items(), disable=not topic_model.verbose)
        for topic, docs in progress:
            shortened = [truncate_document(topic_model, self.doc_length, self.tokenizer, doc) for doc in docs]
            prompt = self._create_prompt(shortened, topic, topics)
            self.prompts_.append(prompt)

            # Optional fixed pause between requests
            if self.delay_in_seconds:
                time.sleep(self.delay_in_seconds)

            request = self.client.chat(
                model=self.model,
                preamble=self.system_prompt,
                message=prompt,
                max_tokens=50,
                stop_sequences=["\n"],
            )
            label = request.text.strip()
            updated_topics[topic] = [(label, 1), *([("", 0)] * 9)]

        return updated_topics

    def _create_prompt(self, docs, topic, topics):
        """Fill the `[KEYWORDS]` and `[DOCUMENTS]` tags of the prompt for one topic."""
        keywords = next(zip(*topics[topic]))

        # The default prompt carries both tags, so it takes the same path as a
        # custom prompt that uses keywords, documents or both.
        prompt = self.prompt
        if "[KEYWORDS]" in prompt:
            prompt = prompt.replace("[KEYWORDS]", ", ".join(keywords))
        if "[DOCUMENTS]" in prompt:
            prompt = self._replace_documents(prompt, docs)
        return prompt

    @staticmethod
    def _replace_documents(prompt, docs):
        """Replace the `[DOCUMENTS]` tag with one `- <doc>` bullet line per document."""
        bullet_list = "".join(f"- {doc}\n" for doc in docs)
        return prompt.replace("[DOCUMENTS]", bullet_list)
class KeyBERTInspired(BaseRepresentation):
    def __init__(
        self,
        top_n_words: int = 10,
        nr_repr_docs: int = 5,
        nr_samples: int = 500,
        nr_candidate_words: int = 100,
        random_state: int = 42,
    ):
        """Use a KeyBERT-like model to fine-tune the topic representations.

        The algorithm follows KeyBERT but does some optimization in
        order to speed up inference.

        The steps are as follows. First, we extract the top n representative
        documents per topic, which is controlled by the `nr_repr_docs` parameter.
        To extract the representative documents, we sample a number of candidate
        documents per cluster which is controlled by the `nr_samples` parameter.
        Then, the top n representative documents are extracted by calculating
        the c-TF-IDF representation for the candidate documents and finding,
        through cosine similarity, which are closest to the topic c-TF-IDF
        representation. Next, the candidate words per topic are extracted
        based on their c-TF-IDF representation, which is controlled by the
        `nr_candidate_words` parameter.

        Then, we extract the embeddings for words and representative documents
        and create topic embeddings by averaging the representative documents.
        Finally, the most similar words to each topic are extracted by
        calculating the cosine similarity between word and topic embeddings.

        Arguments:
            top_n_words: The top n words to extract per topic.
            nr_repr_docs: The number of representative documents to extract per cluster.
            nr_samples: The number of candidate documents to extract per cluster.
            nr_candidate_words: The number of candidate words per cluster.
            random_state: The random state for randomly sampling candidate documents.
                          It is stored on the instance.

        Usage:

        ```python
        from bertopic.representation import KeyBERTInspired
        from bertopic import BERTopic

        # Create your representation model
        representation_model = KeyBERTInspired()

        # Use the representation model in BERTopic on top of the default pipeline
        topic_model = BERTopic(representation_model=representation_model)
        ```
        """
        self.top_n_words = top_n_words
        self.nr_repr_docs = nr_repr_docs
        self.nr_samples = nr_samples
        self.nr_candidate_words = nr_candidate_words
        self.random_state = random_state

    def extract_topics(
        self,
        topic_model,
        documents: pd.DataFrame,
        c_tf_idf: csr_matrix,
        topics: Mapping[str, List[Tuple[str, float]]],
        embeddings: np.ndarray = None,
    ) -> Mapping[str, List[Tuple[str, float]]]:
        """Extract topics.

        Arguments:
            topic_model: A BERTopic model
            documents: All input documents
            c_tf_idf: The topic c-TF-IDF representation
            topics: The candidate topics as calculated with c-TF-IDF
            embeddings: Pre-trained document embeddings. These can be used
                        instead of an embedding model

        Returns:
            updated_topics: Updated topic representations
        """
        # We extract the top n representative documents per class
        _, representative_docs, repr_doc_indices, _ = topic_model._extract_representative_docs(
            c_tf_idf, documents, topics, self.nr_samples, self.nr_repr_docs
        )

        # If document embeddings are precomputed, look up the embeddings of the
        # representative documents instead of embedding them again.
        # NOTE(review): `_extract_embeddings` slices `repr_embeddings` with these same
        # indices (`i[0] : i[-1] + 1`), which treats them as positions in the flat list
        # of representative documents. Here they index the full `embeddings` array.
        # Confirm that `_extract_representative_docs` returns document-level positions
        # in this slot; if it does not, this path selects the wrong rows.
        repr_embeddings = None
        if embeddings is not None:
            repr_embeddings = [embeddings[index] for index in np.concatenate(repr_doc_indices)]

        # We extract the candidate words per class
        topics = self._extract_candidate_words(topic_model, c_tf_idf, topics)

        # We calculate the similarity between word and document embeddings and create
        # topic embeddings from the representative document embeddings
        sim_matrix, words = self._extract_embeddings(
            topic_model, topics, representative_docs, repr_doc_indices, repr_embeddings
        )

        # Find the best matching words based on the similarity matrix for each topic
        updated_topics = self._extract_top_words(words, topics, sim_matrix)

        return updated_topics

    def _extract_candidate_words(
        self,
        topic_model,
        c_tf_idf: csr_matrix,
        topics: Mapping[str, List[Tuple[str, float]]],
    ) -> Mapping[int, Tuple[str, ...]]:
        """For each topic, extract candidate words based on the c-TF-IDF representation.

        Arguments:
            topic_model: A BERTopic model
            c_tf_idf: The topic c-TF-IDF representation
            topics: The top words per topic; only its keys are read here

        Returns:
            topics: Per topic label, a tuple with up to `self.nr_candidate_words`
                    candidate words ordered from highest to lowest c-TF-IDF score
        """
        labels = [int(label) for label in sorted(topics)]

        # Scikit-Learn Deprecation: get_feature_names is deprecated in 1.0
        # and will be removed in 1.2. Please use get_feature_names_out instead.
        if version.parse(sklearn_version) >= version.parse("1.0.0"):
            words = topic_model.vectorizer_model.get_feature_names_out()
        else:
            words = topic_model.vectorizer_model.get_feature_names()

        indices = topic_model._top_n_idx_sparse(c_tf_idf, self.nr_candidate_words)
        scores = topic_model._top_n_values_sparse(c_tf_idf, indices)

        # Order every row by ascending score; rows are reversed below so the
        # highest scoring word comes first
        sorted_indices = np.argsort(scores, 1)
        indices = np.take_along_axis(indices, sorted_indices, axis=1)
        scores = np.take_along_axis(scores, sorted_indices, axis=1)

        # Pair each candidate word with its score; entries without a word index or
        # without a positive score become an empty-string placeholder
        scored_words = {
            label: [
                (words[word_index], score) if word_index is not None and score > 0 else ("", 0.00001)
                for word_index, score in zip(indices[index][::-1], scores[index][::-1])
            ]
            for index, label in enumerate(labels)
        }

        # Keep only the words, dropping the scores
        topics = {label: next(zip(*values[: self.nr_candidate_words])) for label, values in scored_words.items()}

        return topics

    def _extract_embeddings(
        self,
        topic_model,
        topics: Mapping[str, List[Tuple[str, float]]],
        representative_docs: List[str],
        repr_doc_indices: List[List[int]],
        repr_embeddings: np.ndarray = None,
    ) -> Tuple[np.ndarray, List[str]]:
        """Extract the representative document embeddings and create topic embeddings.
        Then extract word embeddings and calculate the cosine similarity between topic
        embeddings and the word embeddings. Topic embeddings are the average of
        representative document embeddings.

        Arguments:
            topic_model: A BERTopic model
            topics: The candidate words per topic
            representative_docs: A flat list of representative documents
            repr_doc_indices: The indices of representative documents
                              that belong to each topic
            repr_embeddings: Embeddings of respective representative_docs

        Returns:
            sim: The similarity matrix between topic (rows) and word (columns) embeddings
            vocab: The unique candidate words over all topics, in the column order of `sim`
        """
        # Calculate representative document embeddings if there are no precomputed embeddings.
        if repr_embeddings is None:
            repr_embeddings = topic_model._extract_embeddings(representative_docs, method="document", verbose=False)

        # Each topic owns the slice from its first to its last index (inclusive)
        topic_embeddings = [np.mean(repr_embeddings[i[0] : i[-1] + 1], axis=0) for i in repr_doc_indices]

        # Calculate word embeddings and extract best matching with updated topic_embeddings
        vocab = list({word for words in topics.values() for word in words})
        word_embeddings = topic_model._extract_embeddings(vocab, method="document", verbose=False)
        sim = cosine_similarity(topic_embeddings, word_embeddings)

        return sim, vocab

    def _extract_top_words(
        self,
        vocab: List[str],
        topics: Mapping[str, List[Tuple[str, float]]],
        sim: np.ndarray,
    ) -> Mapping[str, List[Tuple[str, float]]]:
        """Extract the top n words per topic based on the similarity matrix between topics and words.

        Arguments:
            vocab: The vocabulary whose order matches the columns of `sim`
            topics: The candidate words per topic
            sim: The similarity matrix between topic (rows) and word (columns) embeddings;
                 row `i` belongs to the i-th topic label in sorted order

        Returns:
            updated_topics: The updated topic representations; per topic up to
                            `self.top_n_words` (word, similarity) pairs, most similar first
        """
        labels = [int(label) for label in sorted(topics)]

        # Build the word -> column lookup once instead of scanning `vocab` with
        # `list.index` for every candidate word of every topic. `setdefault` keeps
        # the first position of a word, which is what `list.index` returns.
        column_of = {}
        for column, word in enumerate(vocab):
            column_of.setdefault(word, column)

        updated_topics = {}
        for row, topic in enumerate(labels):
            indices = [column_of[word] for word in topics[topic]]
            values = sim[row, indices]

            # Positions of the highest similarities, most similar first
            best = np.argsort(values)[-self.top_n_words :][::-1]
            updated_topics[topic] = [(vocab[indices[position]], values[position]) for position in best]

        return updated_topics
Unlike other representation models, Langchain does not use the `"[DOCUMENTS]"` tag to insert documents into the prompt. The load_qa_chain function formats the representative documents within the prompt. nr_docs: The number of documents to pass to LangChain diversity: The diversity of documents to pass to LangChain. Accepts values between 0 and 1. A higher values results in passing more diverse documents whereas lower values passes more similar documents. doc_length: The maximum length of each document. If a document is longer, it will be truncated. If None, the entire document is passed. tokenizer: The tokenizer used to calculate to split the document into segments used to count the length of a document. * If tokenizer is 'char', then the document is split up into characters which are counted to adhere to `doc_length` * If tokenizer is 'whitespace', the document is split up into words separated by whitespaces. These words are counted and truncated depending on `doc_length` * If tokenizer is 'vectorizer', then the internal CountVectorizer is used to tokenize the document. These tokens are counted and truncated depending on `doc_length`. They are decoded with whitespaces. * If tokenizer is a callable, then that callable is used to tokenize the document. These tokens are counted and truncated depending on `doc_length` chain_config: The configuration for the langchain chain. Can be used to set options like max_concurrency to avoid rate limiting errors. Usage: To use this, you will need to install the langchain package first. 
Additionally, you will need an underlying LLM to support langchain, like openai: `pip install langchain` `pip install openai` Then, you can create your chain as follows: ```python from langchain.chains.question_answering import load_qa_chain from langchain.llms import OpenAI chain = load_qa_chain(OpenAI(temperature=0, openai_api_key=my_openai_api_key), chain_type="stuff") ``` Finally, you can pass the chain to BERTopic as follows: ```python from bertopic.representation import LangChain # Create your representation model representation_model = LangChain(chain) # Use the representation model in BERTopic on top of the default pipeline topic_model = BERTopic(representation_model=representation_model) ``` You can also use a custom prompt: ```python prompt = "What are these documents about? Please give a single label." representation_model = LangChain(chain, prompt=prompt) ``` You can also use a Runnable instead of a chain. The example below uses the LangChain Expression Language: ```python from bertopic.representation import LangChain from langchain.chains.question_answering import load_qa_chain from langchain.chat_models import ChatAnthropic from langchain.schema.document import Document from langchain.schema.runnable import RunnablePassthrough from langchain_experimental.data_anonymizer.presidio import PresidioReversibleAnonymizer prompt = ... llm = ... 
def extract_topics(
    self,
    topic_model,
    documents: pd.DataFrame,
    c_tf_idf: csr_matrix,
    topics: Mapping[str, List[Tuple[str, float]]],
) -> Mapping[str, List[Tuple[str, int]]]:
    """Extract topics.

    Arguments:
        topic_model: A BERTopic model
        documents: All input documents
        c_tf_idf: The topic c-TF-IDF representation
        topics: The candidate topics as calculated with c-TF-IDF

    Returns:
        updated_topics: Updated topic representations; per topic the generated
                        label with weight 1 followed by nine empty `("", 0)` entries
    """
    # Extract `self.nr_docs` representative documents per topic
    repr_docs_mappings, _, _, _ = topic_model._extract_representative_docs(
        c_tf_idf=c_tf_idf,
        documents=documents,
        topics=topics,
        nr_samples=500,
        nr_repr_docs=self.nr_docs,
        diversity=self.diversity,
    )

    # Wrap the (optionally truncated) documents for langchain's batch functionality.
    # The outer list follows the iteration order of `repr_docs_mappings`.
    chain_docs: List[List[Document]] = [
        [
            Document(page_content=truncate_document(topic_model, self.doc_length, self.tokenizer, doc))
            for doc in docs
        ]
        for docs in repr_docs_mappings.values()
    ]

    # `self.chain` must take `input_documents` and `question` as input keys
    # Use a custom prompt that leverages keywords, using the tag: [KEYWORDS]
    if "[KEYWORDS]" in self.prompt:
        # Build the prompts in the order of `repr_docs_mappings`, not of `topics`:
        # the documents above and the labels below follow that order, so iterating
        # `topics` could pair one topic's keywords with another topic's documents.
        prompts = [
            self.prompt.replace("[KEYWORDS]", ", ".join(next(zip(*topics[topic])))) for topic in repr_docs_mappings
        ]
    else:
        prompts = [self.prompt] * len(chain_docs)

    inputs = [{"input_documents": docs, "question": prompt} for docs, prompt in zip(chain_docs, prompts)]

    # `self.chain` must return a dict with an `output_text` key
    # same output key as the `StuffDocumentsChain` returned by `load_qa_chain`
    outputs = self.chain.batch(inputs=inputs, config=self.chain_config)
    labels = [output["output_text"].strip() for output in outputs]

    updated_topics = {
        topic: [(label, 1)] + [("", 0) for _ in range(9)] for topic, label in zip(repr_docs_mappings.keys(), labels)
    }

    return updated_topics
class LiteLLM(BaseRepresentation):
    """Using the LiteLLM API to generate topic labels.

    For an overview of models see:
    https://docs.litellm.ai/docs/providers

    Arguments:
        model: Model to use. Defaults to OpenAI's "gpt-3.5-turbo".
        generator_kwargs: Kwargs passed to `litellm.completion`. The mapping is
                          copied, so the caller's object is never modified. A truthy
                          `"model"` entry replaces `model`; a truthy `"prompt"` entry
                          is dropped because the prompt is sent through `messages`.
        prompt: The prompt to be used in the model. If no prompt is given,
                `self.default_prompt_` is used instead.
                NOTE: Use `"[KEYWORDS]"` and `"[DOCUMENTS]"` in the prompt
                to decide where the keywords and documents need to be
                inserted.
        delay_in_seconds: The delay in seconds between consecutive prompts
                          in order to prevent RateLimitErrors.
        exponential_backoff: Retry requests with a random exponential backoff.
                             A short sleep is used when a rate limit error is hit,
                             then the request is retried. Increase the sleep length
                             if errors are hit until 10 unsuccessful requests.
                             When `delay_in_seconds` is also set, that fixed delay
                             is still applied before every request.
        nr_docs: The number of documents to pass to LiteLLM if a prompt
                 with the `["DOCUMENTS"]` tag is used.
        diversity: The diversity of documents to pass to LiteLLM.
                   Accepts values between 0 and 1. A higher value
                   results in passing more diverse documents
                   whereas lower values pass more similar documents.

    Usage:

    To use this, you will need to install the litellm package first:

    `pip install litellm`

    Then, get yourself an API key of any provider (for instance OpenAI) and use it as follows:

    ```python
    import os
    from bertopic.representation import LiteLLM
    from bertopic import BERTopic

    # set ENV variables
    os.environ["OPENAI_API_KEY"] = "your-openai-key"

    # Create your representation model
    representation_model = LiteLLM(model="gpt-3.5-turbo")

    # Use the representation model in BERTopic on top of the default pipeline
    topic_model = BERTopic(representation_model=representation_model)
    ```

    You can also use a custom prompt:

    ```python
    prompt = "I have the following documents: [DOCUMENTS] \nThese documents are about the following topic: '"
    representation_model = LiteLLM(model="gpt", prompt=prompt)
    ```
    """  # noqa: D301

    def __init__(
        self,
        model: str = "gpt-3.5-turbo",
        prompt: str | None = None,
        generator_kwargs: Mapping[str, Any] | None = None,
        delay_in_seconds: float | None = None,
        exponential_backoff: bool = False,
        nr_docs: int = 4,
        diversity: float | None = None,
    ):
        self.model = model
        self.prompt = prompt if prompt else DEFAULT_PROMPT
        self.default_prompt_ = DEFAULT_PROMPT
        self.delay_in_seconds = delay_in_seconds
        self.exponential_backoff = exponential_backoff
        self.nr_docs = nr_docs
        self.diversity = diversity

        # Work on a private copy: the "prompt" entry may be removed below, and that
        # must not leak into the caller's dict or into a default shared by instances.
        self.generator_kwargs = dict(generator_kwargs) if generator_kwargs else {}
        if self.generator_kwargs.get("model"):
            self.model = self.generator_kwargs.get("model")
        if self.generator_kwargs.get("prompt"):
            del self.generator_kwargs["prompt"]

    def extract_topics(
        self, topic_model, documents: pd.DataFrame, c_tf_idf: csr_matrix, topics: Mapping[str, List[Tuple[str, float]]]
    ) -> Mapping[str, List[Tuple[str, float]]]:
        """Extract topics.

        Arguments:
            topic_model: A BERTopic model
            documents: All input documents
            c_tf_idf: The topic c-TF-IDF representation
            topics: The candidate topics as calculated with c-TF-IDF

        Returns:
            updated_topics: Updated topic representations; per topic a single
                            `(label, 1)` entry
        """
        # Extract the top n representative documents per topic
        repr_docs_mappings, _, _, _ = topic_model._extract_representative_docs(
            c_tf_idf, documents, topics, 500, self.nr_docs, self.diversity
        )

        # Generate using a (Large) Language Model
        updated_topics = {}
        for topic, docs in tqdm(repr_docs_mappings.items(), disable=not topic_model.verbose):
            prompt = self._create_prompt(docs, topic, topics)

            # Delay
            if self.delay_in_seconds:
                time.sleep(self.delay_in_seconds)

            messages = [
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt},
            ]
            # Entries in `generator_kwargs` take precedence over `model` and `messages`
            kwargs = {"model": self.model, "messages": messages, **self.generator_kwargs}
            if self.exponential_backoff:
                response = chat_completions_with_backoff(**kwargs)
            else:
                response = completion(**kwargs)
            # The default prompt asks for a "topic: <label>" answer; strip that prefix
            label = response["choices"][0]["message"]["content"].strip().replace("topic: ", "")

            updated_topics[topic] = [(label, 1)]

        return updated_topics

    def _create_prompt(self, docs, topic, topics):
        """Fill the `[KEYWORDS]` and `[DOCUMENTS]` tags of the prompt for one topic."""
        keywords = next(zip(*topics[topic]))

        # Use the Default Chat Prompt
        if self.prompt == DEFAULT_PROMPT:
            prompt = self.prompt.replace("[KEYWORDS]", " ".join(keywords))
            prompt = self._replace_documents(prompt, docs)

        # Use a custom prompt that leverages keywords, documents or both using
        # custom tags, namely [KEYWORDS] and [DOCUMENTS] respectively
        else:
            prompt = self.prompt
            if "[KEYWORDS]" in prompt:
                prompt = prompt.replace("[KEYWORDS]", " ".join(keywords))
            if "[DOCUMENTS]" in prompt:
                prompt = self._replace_documents(prompt, docs)

        return prompt

    @staticmethod
    def _replace_documents(prompt, docs):
        """Replace the `[DOCUMENTS]` tag with one bullet per document, each cut to 255 characters."""
        to_replace = "".join(f"- {doc[:255]}\n" for doc in docs)
        return prompt.replace("[DOCUMENTS]", to_replace)
retry_with_exponential_backoff( completion, )(**kwargs) ================================================ FILE: bertopic/representation/_llamacpp.py ================================================ import pandas as pd from tqdm import tqdm from scipy.sparse import csr_matrix from llama_cpp import Llama from typing import Mapping, List, Tuple, Any, Union, Callable from bertopic.representation._base import BaseRepresentation from bertopic.representation._utils import truncate_document, validate_truncate_document_parameters DEFAULT_PROMPT = """ This is a list of texts where each collection of texts describe a topic. After each collection of texts, the name of the topic they represent is mentioned as a short-highly-descriptive title --- Topic: Sample texts from this topic: - Traditional diets in most cultures were primarily plant-based with a little meat on top, but with the rise of industrial style meat production and factory farming, meat has become a staple food. - Meat, but especially beef, is the word food in terms of emissions. - Eating meat doesn't make you a bad person, not eating meat doesn't make you a good one. Keywords: meat beef eat eating emissions steak food health processed chicken Topic name: Environmental impacts of eating meat --- Topic: Sample texts from this topic: - I have ordered the product weeks ago but it still has not arrived! - The website mentions that it only takes a couple of days to deliver but I still have not received mine. - I got a message stating that I received the monitor but that is not true! - It took a month longer to deliver than was advised... Keywords: deliver weeks product shipping long delivery received arrived arrive week Topic name: Shipping and delivery issues --- Topic: Sample texts from this topic: [DOCUMENTS] Keywords: [KEYWORDS] Topic name:""" DEFAULT_SYSTEM_PROMPT = "You are an assistant that extracts high-level topics from texts." 
class LlamaCPP(BaseRepresentation):
    """A llama.cpp implementation to use as a representation model.

    Arguments:
        model: Either a string pointing towards a local LLM or a
                `llama_cpp.Llama` object.
        prompt: The prompt to be used in the model. If no prompt is given,
                `self.default_prompt_` is used instead.
                NOTE: Use `"[KEYWORDS]"` and `"[DOCUMENTS]"` in the prompt
                to decide where the keywords and documents need to be
                inserted.
        system_prompt: The system prompt to be used in the model. If no system prompt is given,
                       `self.default_system_prompt_` is used instead.
        pipeline_kwargs: Kwargs that you can pass to the `llama_cpp.Llama`
                         when it is called such as `max_tokens` to be generated.
        nr_docs: The number of representative documents to extract per topic
                 and insert at the `"[DOCUMENTS]"` tag.
        diversity: The diversity of documents to pass to the model.
                   Accepts values between 0 and 1. A higher
                   values results in passing more diverse documents
                   whereas lower values passes more similar documents.
        doc_length: The maximum length of each document. If a document is longer,
                    it will be truncated. If None, the entire document is passed.
        tokenizer: The tokenizer used to calculate to split the document into segments
                   used to count the length of a document.
                       * If tokenizer is 'char', then the document is split up
                         into characters which are counted to adhere to `doc_length`
                       * If tokenizer is 'whitespace', the document is split up
                         into words separated by whitespaces. These words are counted
                         and truncated depending on `doc_length`
                       * If tokenizer is 'vectorizer', then the internal CountVectorizer
                         is used to tokenize the document. These tokens are counted
                         and truncated depending on `doc_length`
                       * If tokenizer is a callable, then that callable is used to tokenize
                         the document. These tokens are counted and truncated depending
                         on `doc_length`

    Usage:

    To use a llama.cpp, first download the LLM:

    ```bash
    wget https://huggingface.co/TheBloke/zephyr-7B-alpha-GGUF/resolve/main/zephyr-7b-alpha.Q4_K_M.gguf
    ```

    Then, we can now use the model with BERTopic in just a couple of lines:

    ```python
    from bertopic import BERTopic
    from bertopic.representation import LlamaCPP

    # Use llama.cpp to load in a 4-bit quantized version of Zephyr 7B Alpha
    representation_model = LlamaCPP("zephyr-7b-alpha.Q4_K_M.gguf")

    # Create our BERTopic model
    topic_model = BERTopic(representation_model=representation_model, verbose=True)
    ```

    If you want to have more control over the LLMs parameters, you can run it like so:

    ```python
    from bertopic import BERTopic
    from bertopic.representation import LlamaCPP
    from llama_cpp import Llama

    # Use llama.cpp to load in a 4-bit quantized version of Zephyr 7B Alpha
    llm = Llama(model_path="zephyr-7b-alpha.Q4_K_M.gguf", n_gpu_layers=-1, n_ctx=4096, stop="Q:")
    representation_model = LlamaCPP(llm)

    # Create our BERTopic model
    topic_model = BERTopic(representation_model=representation_model, verbose=True)
    ```
    """

    def __init__(
        self,
        model: Union[str, Llama],
        prompt: str | None = None,
        system_prompt: str | None = None,
        pipeline_kwargs: Mapping[str, Any] = {},
        nr_docs: int = 4,
        diversity: float | None = None,
        doc_length: int | None = None,
        tokenizer: Union[str, Callable] | None = None,
    ):
        if isinstance(model, str):
            # A path was given: load it with the defaults used for labeling
            self.model = Llama(model_path=model, n_gpu_layers=-1, stop="\n", chat_format="ChatML")
        elif isinstance(model, Llama):
            self.model = model
        else:
            # Adjacent literals are concatenated verbatim, so each piece
            # carries its own trailing space.
            raise ValueError(
                "Make sure that the model that you "
                "pass is either a string referring to a "
                "local LLM or a `llama_cpp.Llama` object."
            )
        self.prompt = prompt if prompt is not None else DEFAULT_PROMPT
        self.system_prompt = system_prompt if system_prompt is not None else DEFAULT_SYSTEM_PROMPT
        self.default_prompt_ = DEFAULT_PROMPT
        self.default_system_prompt_ = DEFAULT_SYSTEM_PROMPT
        self.pipeline_kwargs = pipeline_kwargs
        self.nr_docs = nr_docs
        self.diversity = diversity
        self.doc_length = doc_length
        self.tokenizer = tokenizer
        validate_truncate_document_parameters(self.tokenizer, self.doc_length)

        # Every prompt sent to the model is recorded here for inspection
        self.prompts_ = []

    def extract_topics(
        self,
        topic_model,
        documents: pd.DataFrame,
        c_tf_idf: csr_matrix,
        topics: Mapping[str, List[Tuple[str, float]]],
    ) -> Mapping[str, List[Tuple[str, float]]]:
        """Extract topic representations and return a single label.

        Arguments:
            topic_model: A BERTopic model
            documents: All input documents
            c_tf_idf: The topic c-TF-IDF representation
            topics: The candidate topics as calculated with c-TF-IDF

        Returns:
            updated_topics: Updated topic representations. Each topic maps to
                            the generated label with weight 1, followed by nine
                            empty `("", 0)` placeholders.
        """
        # Extract the top `nr_docs` representative documents per topic
        repr_docs_mappings, _, _, _ = topic_model._extract_representative_docs(
            c_tf_idf, documents, topics, 500, self.nr_docs, self.diversity
        )

        updated_topics = {}
        for topic, docs in tqdm(repr_docs_mappings.items(), disable=not topic_model.verbose):
            # Prepare prompt
            truncated_docs = [truncate_document(topic_model, self.doc_length, self.tokenizer, doc) for doc in docs]
            prompt = self._create_prompt(truncated_docs, topic, topics)
            self.prompts_.append(prompt)

            # Generate the label through the chat-completion interface
            topic_description = self.model.create_chat_completion(
                messages=[
                    {"role": "system", "content": self.system_prompt},
                    {"role": "user", "content": prompt},
                ],
                **self.pipeline_kwargs,
            )
            label = topic_description["choices"][0]["message"]["content"].strip()
            updated_topics[topic] = [(label, 1)] + [("", 0) for _ in range(9)]

        return updated_topics

    def _create_prompt(self, docs, topic, topics):
        """Build the prompt for one topic by filling in the keyword and document tags.

        Arguments:
            docs: The (already truncated) representative documents of the topic
            topic: The key of the topic in `topics`
            topics: The candidate topics as calculated with c-TF-IDF

        Returns:
            prompt: The prompt with `[KEYWORDS]` and `[DOCUMENTS]` replaced
        """
        # `topics[topic]` is a list of (word, value) pairs; keep the words only
        keywords = ", ".join(next(zip(*topics[topic])))

        # The default prompt is known to contain both tags
        if self.prompt == DEFAULT_PROMPT:
            prompt = self.prompt.replace("[KEYWORDS]", keywords)
            return self._replace_documents(prompt, docs)

        # A custom prompt may use keywords, documents, both or neither
        prompt = self.prompt
        if "[KEYWORDS]" in prompt:
            prompt = prompt.replace("[KEYWORDS]", keywords)
        if "[DOCUMENTS]" in prompt:
            prompt = self._replace_documents(prompt, docs)
        return prompt

    @staticmethod
    def _replace_documents(prompt, docs):
        """Replace the `[DOCUMENTS]` tag with one `- ...` line per document."""
        to_replace = "".join(f"- {doc}\n" for doc in docs)
        return prompt.replace("[DOCUMENTS]", to_replace)
def mmr(
    doc_embedding: np.ndarray,
    word_embeddings: np.ndarray,
    words: List[str],
    diversity: float = 0.1,
    top_n: int = 10,
) -> List[str]:
    """Maximal Marginal Relevance.

    Arguments:
        doc_embedding: The document embeddings
        word_embeddings: The embeddings of the selected candidate keywords/phrases
        words: The selected candidate keywords/keyphrases
        diversity: The diversity of the selected embeddings.
                   Values between 0 and 1.
        top_n: The top n items to return. If fewer candidates than `top_n`
               are available, all candidates are returned.

    Returns:
        List[str]: The selected keywords/keyphrases, in order of selection
    """
    # Without candidates there is nothing to rank
    if len(words) == 0:
        return []

    # Extract similarity within words, and between words and the document
    word_doc_similarity = cosine_similarity(word_embeddings, doc_embedding)
    word_similarity = cosine_similarity(word_embeddings)

    # Initialize candidates and already choose best keyword/keyphrase
    keywords_idx = [np.argmax(word_doc_similarity)]
    candidates_idx = [i for i in range(len(words)) if i != keywords_idx[0]]

    # Never iterate past the number of remaining candidates; otherwise
    # `np.argmax` would be called on an empty array and raise.
    for _ in range(min(top_n - 1, len(words) - 1)):
        # Extract similarities within candidates and
        # between candidates and selected keywords/phrases
        candidate_similarities = word_doc_similarity[candidates_idx, :]
        target_similarities = np.max(word_similarity[candidates_idx][:, keywords_idx], axis=1)

        # Calculate MMR: relevance to the document minus redundancy
        # with respect to the already selected keywords
        mmr_scores = (1 - diversity) * candidate_similarities - diversity * target_similarities.reshape(-1, 1)
        mmr_idx = candidates_idx[np.argmax(mmr_scores)]

        # Update keywords & candidates
        keywords_idx.append(mmr_idx)
        candidates_idx.remove(mmr_idx)

    return [words[idx] for idx in keywords_idx]
class OpenAI(BaseRepresentation):
    r"""Using the OpenAI API to generate topic labels based on one
    of their chat completion models.

    For an overview see:
    https://platform.openai.com/docs/models

    Arguments:
        client: A `openai.OpenAI` client
        model: Model to use within OpenAI, defaults to `"gpt-4o-mini"`.
        generator_kwargs: Kwargs passed to `client.chat.completions.create`
                          for fine-tuning the output. A `model` entry overrides
                          the `model` argument, a `prompt` entry is discarded and
                          `stop` defaults to a newline. The passed mapping
                          itself is left untouched.
        prompt: The prompt to be used in the model. If no prompt is given,
                `self.default_prompt_` is used instead.
                NOTE: Use `"[KEYWORDS]"` and `"[DOCUMENTS]"` in the prompt
                to decide where the keywords and documents need to be
                inserted.
        system_prompt: The system prompt to be used in the model. If no system prompt is given,
                       `self.default_system_prompt_` is used instead.
        delay_in_seconds: The delay in seconds between consecutive prompts
                          in order to prevent RateLimitErrors.
        exponential_backoff: Retry requests with a random exponential backoff.
                             A short sleep is used when a rate limit error is hit,
                             then the requests is retried. Increase the sleep length
                             if errors are hit until 10 unsuccessful requests.
                             Note that `delay_in_seconds`, when set, is still
                             applied before every request.
        nr_docs: The number of representative documents to extract per topic
                 and insert at the `"[DOCUMENTS]"` tag.
        diversity: The diversity of documents to pass to OpenAI.
                   Accepts values between 0 and 1. A higher
                   values results in passing more diverse documents
                   whereas lower values passes more similar documents.
        doc_length: The maximum length of each document. If a document is longer,
                    it will be truncated. If None, the entire document is passed.
        tokenizer: The tokenizer used to calculate to split the document into segments
                   used to count the length of a document.
                       * If tokenizer is 'char', then the document is split up
                         into characters which are counted to adhere to `doc_length`
                       * If tokenizer is 'whitespace', the document is split up
                         into words separated by whitespaces. These words are counted
                         and truncated depending on `doc_length`
                       * If tokenizer is 'vectorizer', then the internal CountVectorizer
                         is used to tokenize the document. These tokens are counted
                         and truncated depending on `doc_length`
                       * If tokenizer is a callable, then that callable is used to tokenize
                         the document. These tokens are counted and truncated depending
                         on `doc_length`

    Usage:

    To use this, you will need to install the openai package first:

    `pip install openai`

    Then, get yourself an API key and use OpenAI's API as follows:

    ```python
    import openai
    from bertopic.representation import OpenAI
    from bertopic import BERTopic

    # Create your representation model
    client = openai.OpenAI(api_key=MY_API_KEY)
    representation_model = OpenAI(client, delay_in_seconds=5)

    # Use the representation model in BERTopic on top of the default pipeline
    topic_model = BERTopic(representation_model=representation_model)
    ```

    You can also use a custom prompt:

    ```python
    prompt = "I have the following documents: [DOCUMENTS] \nThese documents are about the following topic: '"
    representation_model = OpenAI(client, prompt=prompt, delay_in_seconds=5)
    ```

    To choose a model:

    ```python
    representation_model = OpenAI(client, model="gpt-4o-mini", delay_in_seconds=10)
    ```
    """

    def __init__(
        self,
        client,
        model: str = "gpt-4o-mini",
        prompt: str | None = None,
        system_prompt: str | None = None,
        generator_kwargs: Mapping[str, Any] = {},
        delay_in_seconds: float | None = None,
        exponential_backoff: bool = False,
        nr_docs: int = 4,
        diversity: float | None = None,
        doc_length: int | None = None,
        tokenizer: Union[str, Callable] | None = None,
        **kwargs,
    ):
        self.client = client
        self.model = model

        self.prompt = DEFAULT_CHAT_PROMPT if prompt is None else prompt
        self.system_prompt = DEFAULT_SYSTEM_PROMPT if system_prompt is None else system_prompt
        self.default_prompt_ = DEFAULT_CHAT_PROMPT
        self.default_system_prompt_ = DEFAULT_SYSTEM_PROMPT

        self.delay_in_seconds = delay_in_seconds
        self.exponential_backoff = exponential_backoff
        self.nr_docs = nr_docs
        self.diversity = diversity
        self.doc_length = doc_length
        self.tokenizer = tokenizer
        validate_truncate_document_parameters(self.tokenizer, self.doc_length)

        # Every prompt sent to the API is recorded here for inspection
        self.prompts_ = []

        # Work on a private copy: the entries below are removed/added, which
        # must neither leak into the caller's mapping nor into the shared
        # default argument.
        self.generator_kwargs = dict(generator_kwargs)

        # `model` and `prompt` are passed explicitly on each request, so they
        # may not be forwarded a second time through the generator kwargs.
        model_override = self.generator_kwargs.pop("model", None)
        if model_override:
            self.model = model_override
        self.generator_kwargs.pop("prompt", None)

        # Stop at the first newline so that only a single-line label is generated
        if not self.generator_kwargs.get("stop"):
            self.generator_kwargs["stop"] = "\n"

    def extract_topics(
        self,
        topic_model,
        documents: pd.DataFrame,
        c_tf_idf: csr_matrix,
        topics: Mapping[str, List[Tuple[str, float]]],
    ) -> Mapping[str, List[Tuple[str, float]]]:
        """Extract topics.

        Arguments:
            topic_model: A BERTopic model
            documents: All input documents
            c_tf_idf: The topic c-TF-IDF representation
            topics: The candidate topics as calculated with c-TF-IDF

        Returns:
            updated_topics: Updated topic representations. Each topic maps to a
                            single `(label, 1)` pair; the label is
                            `"No label returned"` if the API produced no content.
        """
        # Extract the top n representative documents per topic
        repr_docs_mappings, _, _, _ = topic_model._extract_representative_docs(
            c_tf_idf, documents, topics, 500, self.nr_docs, self.diversity
        )

        # Generate using OpenAI's Language Model
        updated_topics = {}
        for topic, docs in tqdm(repr_docs_mappings.items(), disable=not topic_model.verbose):
            truncated_docs = [truncate_document(topic_model, self.doc_length, self.tokenizer, doc) for doc in docs]
            prompt = self._create_prompt(truncated_docs, topic, topics)
            self.prompts_.append(prompt)

            # Delay
            if self.delay_in_seconds:
                time.sleep(self.delay_in_seconds)

            messages = [
                {"role": "system", "content": self.system_prompt},
                {"role": "user", "content": prompt},
            ]
            kwargs = {
                "model": self.model,
                "messages": messages,
                **self.generator_kwargs,
            }
            if self.exponential_backoff:
                response = chat_completions_with_backoff(self.client, **kwargs)
            else:
                response = self.client.chat.completions.create(**kwargs)

            updated_topics[topic] = [(self._extract_label(response), 1)]

        return updated_topics

    @staticmethod
    def _extract_label(response) -> str:
        """Return the generated label, or a placeholder if nothing was generated.

        Addresses #1570 for potential issues with OpenAI's content filter and
        #2176 for responses in which (part of) the payload is `None`: a missing
        response, an empty `choices` list, a missing message or a message
        whose `content` is `None` all yield the placeholder.
        """
        choices = getattr(response, "choices", None) if response else None
        message = getattr(choices[0], "message", None) if choices else None
        content = getattr(message, "content", None)
        if content is None:
            return "No label returned"
        return content.strip().replace("topic: ", "")

    def _create_prompt(self, docs, topic, topics):
        """Build the prompt for one topic by filling in the keyword and document tags.

        Arguments:
            docs: The (already truncated) representative documents of the topic
            topic: The key of the topic in `topics`
            topics: The candidate topics as calculated with c-TF-IDF

        Returns:
            prompt: The prompt with `[KEYWORDS]` and `[DOCUMENTS]` replaced
        """
        # `topics[topic]` is a list of (word, value) pairs; keep the words only
        keywords = ", ".join(next(zip(*topics[topic])))

        # The default chat prompt is known to contain both tags
        if self.prompt == DEFAULT_CHAT_PROMPT:
            prompt = self.prompt.replace("[KEYWORDS]", keywords)
            return self._replace_documents(prompt, docs)

        # A custom prompt may use keywords, documents, both or neither
        prompt = self.prompt
        if "[KEYWORDS]" in prompt:
            prompt = prompt.replace("[KEYWORDS]", keywords)
        if "[DOCUMENTS]" in prompt:
            prompt = self._replace_documents(prompt, docs)
        return prompt

    @staticmethod
    def _replace_documents(prompt, docs):
        """Replace the `[DOCUMENTS]` tag with one `- ...` line per document."""
        to_replace = "".join(f"- {doc}\n" for doc in docs)
        return prompt.replace("[DOCUMENTS]", to_replace)
def extract_topics(
    self,
    topic_model,
    documents: pd.DataFrame,
    c_tf_idf: csr_matrix,
    topics: Mapping[str, List[Tuple[str, float]]],
) -> Mapping[str, List[Tuple[str, float]]]:
    """Extract topics.

    Arguments:
        topic_model: A BERTopic model
        documents: All input documents; the `Topic` and `Document`
                   columns are read
        c_tf_idf: Not used
        topics: The candidate topics as calculated with c-TF-IDF

    Returns:
        updated_topics: Updated topic representations. Each topic maps to at
                        most `top_n_words` (keyword, c-TF-IDF value) pairs in
                        descending order of value, padded with `("", 0)` up to
                        `top_n_words` entries.
    """
    matcher = Matcher(self.model.vocab)
    matcher.add("Pattern", self.pos_patterns)

    candidate_topics = {}
    for topic, values in topics.items():
        keywords = next(zip(*values))

        # The documents of a topic do not depend on the keyword,
        # so they are selected once instead of once per keyword.
        topic_documents = documents.loc[documents.Topic == topic, "Document"]

        # Extract candidate documents: at most two documents per keyword
        candidate_documents = set()
        for keyword in keywords:
            selection = topic_documents.loc[topic_documents.str.contains(keyword, regex=False)]
            candidate_documents.update(selection[:2])

        # Extract keywords that match one of the POS patterns
        updated_keywords = set()
        for doc in self.model.pipe(list(candidate_documents)):
            for _, start, end in matcher(doc):
                updated_keywords.add(doc[start:end].text)
        candidate_topics[topic] = list(updated_keywords)

    # Scikit-Learn Deprecation: get_feature_names is deprecated in 1.0
    # and will be removed in 1.2. Please use get_feature_names_out instead.
    if version.parse(sklearn_version) >= version.parse("1.0.0"):
        words = list(topic_model.vectorizer_model.get_feature_names_out())
    else:
        words = list(topic_model.vectorizer_model.get_feature_names())

    # Match updated keywords with c-TF-IDF values
    words_lookup = {word: index for index, word in enumerate(words)}
    updated_topics = {topic: [] for topic in topics}

    for topic, candidate_keywords in candidate_topics.items():
        # Keep an integer dtype even when no candidate is part of the
        # vocabulary; an empty plain list would become a float64 array.
        word_indices = np.sort(
            np.array(
                [words_lookup[keyword] for keyword in candidate_keywords if keyword in words_lookup],
                dtype=int,
            )
        )

        # Row of this topic, restricted to the candidate columns; densify once
        row = topic_model.c_tf_idf_[:, word_indices][topic + topic_model._outliers]
        scores = np.asarray(row.todense()).reshape(-1)

        # Highest c-TF-IDF values first
        indices = np.argsort(scores)[-self.top_n_words :][::-1]
        topic_words = [(words[word_indices[index]], scores[index]) for index in indices]

        # Pad so that every topic has exactly `top_n_words` entries
        if len(topic_words) < self.top_n_words:
            topic_words += [("", 0) for _ in range(self.top_n_words - len(topic_words))]
        updated_topics[topic] = topic_words

    return updated_topics
For example, `pipeline('text-generation', model='gpt2')`. If a string is passed, "text-generation" will be selected by default. prompt: The prompt to be used in the model. If no prompt is given, `self.default_prompt_` is used instead. NOTE: Use `"[KEYWORDS]"` and `"[DOCUMENTS]"` in the prompt to decide where the keywords and documents need to be inserted. pipeline_kwargs: Kwargs that you can pass to the transformers.pipeline when it is called. random_state: A random state to be passed to `transformers.set_seed` nr_docs: The number of documents to pass to OpenAI if a prompt with the `["DOCUMENTS"]` tag is used. diversity: The diversity of documents to pass to OpenAI. Accepts values between 0 and 1. A higher values results in passing more diverse documents whereas lower values passes more similar documents. doc_length: The maximum length of each document. If a document is longer, it will be truncated. If None, the entire document is passed. tokenizer: The tokenizer used to calculate to split the document into segments used to count the length of a document. * If tokenizer is 'char', then the document is split up into characters which are counted to adhere to `doc_length` * If tokenizer is 'whitespace', the document is split up into words separated by whitespaces. These words are counted and truncated depending on `doc_length` * If tokenizer is 'vectorizer', then the internal CountVectorizer is used to tokenize the document. These tokens are counted and truncated depending on `doc_length` * If tokenizer is a callable, then that callable is used to tokenize the document. 
These tokens are counted and truncated depending on `doc_length` Usage: To use a gpt-like model: ```python from bertopic.representation import TextGeneration from bertopic import BERTopic # Create your representation model generator = pipeline('text-generation', model='gpt2') representation_model = TextGeneration(generator) # Use the representation model in BERTopic on top of the default pipeline topic_model = BERTo pic(representation_model=representation_model) ``` You can use a custom prompt and decide where the keywords should be inserted by using the `[KEYWORDS]` or documents with thte `[DOCUMENTS]` tag: ```python from bertopic.representation import TextGeneration prompt = "I have a topic described by the following keywords: [KEYWORDS]. Based on the previous keywords, what is this topic about?"" # Create your representation model generator = pipeline('text2text-generation', model='google/flan-t5-base') representation_model = TextGeneration(generator) ``` """ def __init__( self, model: Union[str, pipeline], prompt: str | None = None, pipeline_kwargs: Mapping[str, Any] = {}, random_state: int = 42, nr_docs: int = 4, diversity: float | None = None, doc_length: int | None = None, tokenizer: Union[str, Callable] | None = None, ): self.random_state = random_state set_seed(random_state) if isinstance(model, str): self.model = pipeline("text-generation", model=model) elif isinstance(model, Pipeline): self.model = model else: raise ValueError( "Make sure that the HF model that you" "pass is either a string referring to a" "HF model or a `transformers.pipeline` object." 
def extract_topics(
    self,
    topic_model,
    documents: pd.DataFrame,
    c_tf_idf: csr_matrix,
    topics: Mapping[str, List[Tuple[str, float]]],
) -> Mapping[str, List[Tuple[str, float]]]:
    """Generate topic descriptions with the text generation pipeline.

    Arguments:
        topic_model: A BERTopic model
        documents: All input documents; only consulted when a custom prompt
                   contains the `[DOCUMENTS]` tag
        c_tf_idf: The topic c-TF-IDF representation; only consulted when a
                  custom prompt contains the `[DOCUMENTS]` tag
        topics: The candidate topics as calculated with c-TF-IDF

    Returns:
        updated_topics: Updated topic representations. Each generated text
                        (with the prompt stripped from it) gets weight 1 and
                        the list is padded with `("", 0)` up to 10 entries.
    """
    # Representative documents are only needed for a custom prompt that asks for them
    needs_documents = self.prompt != DEFAULT_PROMPT and "[DOCUMENTS]" in self.prompt
    if needs_documents:
        repr_docs_mappings, _, _, _ = topic_model._extract_representative_docs(
            c_tf_idf, documents, topics, 500, self.nr_docs, self.diversity
        )
    else:
        repr_docs_mappings = dict.fromkeys(topics)

    updated_topics = {}
    progress = tqdm(repr_docs_mappings.items(), disable=not topic_model.verbose)
    for topic, docs in progress:
        # Prepare prompt
        if docs is None:
            truncated_docs = None
        else:
            truncated_docs = [
                truncate_document(topic_model, self.doc_length, self.tokenizer, doc) for doc in docs
            ]
        prompt = self._create_prompt(truncated_docs, topic, topics)
        self.prompts_.append(prompt)

        # Run the generator and strip the echoed prompt from every output
        generated = self.model(prompt, **self.pipeline_kwargs)
        labels = [(output["generated_text"].replace(prompt, ""), 1) for output in generated]

        # Pad to ten entries; a non-positive count adds nothing
        labels.extend([("", 0)] * (10 - len(labels)))
        updated_topics[topic] = labels

    return updated_topics
def truncate_document(topic_model, doc_length: Union[int, None], tokenizer: Union[str, callable], document: str) -> str:
    """Truncate a document to a certain length.

    If you want to add a custom tokenizer, then it will need to have a `decode` and
    `encode` method. An example would be the following custom tokenizer:

    ```python
    class Tokenizer:
        'A custom tokenizer that splits on commas'
        def encode(self, doc):
            return doc.split(",")

        def decode(self, doc_chunks):
            return ",".join(doc_chunks)
    ```

    You can use this tokenizer by passing it to the `tokenizer` parameter.

    Arguments:
        topic_model: A BERTopic model; only used when `tokenizer` is 'vectorizer'
        doc_length: The maximum length of each document. If a document is longer,
                    it will be truncated. If None, the entire document is passed.
        tokenizer: The tokenizer used to calculate to split the document into segments
                   used to count the length of a document.
                       * If tokenizer is 'char', then the document is split up
                         into characters which are counted to adhere to `doc_length`
                       * If tokenizer is 'whitespace', the document is split up
                         into words separated by whitespaces. These words are counted
                         and truncated depending on `doc_length`
                       * If tokenizer is 'vectorizer', then the internal CountVectorizer
                         is used to tokenize the document. These tokens are counted
                         and truncated depending on `doc_length`. They are decoded with
                         whitespaces.
                       * If tokenizer is an object with `encode` and `decode` methods,
                         then `encode` is used to tokenize the document. These tokens
                         are counted, truncated depending on `doc_length` and turned
                         back into a document with `decode`
        document: A single document

    Returns:
        truncated_document: A truncated document

    Raises:
        ValueError: If `doc_length` is given but `tokenizer` is not one of the
                    supported options.
    """
    # Without a length limit the document is passed through untouched
    if doc_length is None:
        return document

    if tokenizer == "char":
        return document[:doc_length]
    if tokenizer == "whitespace":
        return " ".join(document.split()[:doc_length])
    if tokenizer == "vectorizer":
        tokenize = topic_model.vectorizer_model.build_tokenizer()
        return " ".join(tokenize(document)[:doc_length])
    if hasattr(tokenizer, "encode") and hasattr(tokenizer, "decode"):
        encoded_document = tokenizer.encode(document)
        return tokenizer.decode(encoded_document[:doc_length])

    # Fail with a clear message instead of an UnboundLocalError
    raise ValueError(
        "Please select from one of the valid options for the `tokenizer` parameter: \n"
        "{'char', 'whitespace', 'vectorizer'} \n"
        "If `tokenizer` is of type callable ensure it has methods to encode and decode a document \n"
    )
Exception(f"Maximum number of retries ({max_retries}) exceeded.") # Increment the delay delay *= exponential_base * (1 + jitter * random.random()) # Sleep for the delay time.sleep(delay) # Raise exceptions for any errors not specified except Exception as e: raise e return wrapper ================================================ FILE: bertopic/representation/_visual.py ================================================ import numpy as np import pandas as pd from PIL import Image from tqdm import tqdm from scipy.sparse import csr_matrix from typing import Mapping, List, Tuple, Union from transformers.pipelines import Pipeline, pipeline from bertopic.representation._mmr import mmr from bertopic.representation._base import BaseRepresentation class VisualRepresentation(BaseRepresentation): """From a collection of representative documents, extract images to represent topics. These topics are represented by a collage of images. Arguments: nr_repr_images: Number of representative images to extract nr_samples: The number of candidate documents to extract per cluster. image_height: The height of the resulting collage image_square: Whether to resize each image in the collage to a square. This can be visually more appealing if all input images are all almost squares. image_to_text_model: The model to caption images. batch_size: The number of images to pass to the `image_to_text_model`. Usage: ```python from bertopic.representation import VisualRepresentation from bertopic import BERTopic # The visual representation is typically not a core representation # and is advised to pass to BERTopic as an additional aspect. 
def __init__(
    self,
    nr_repr_images: int = 9,
    nr_samples: int = 500,
    image_height: int = 600,
    image_squares: bool = False,
    image_to_text_model: Union[str, Pipeline, None] = None,
    batch_size: int = 32,
):
    """Store the collage settings and resolve the captioning model.

    Arguments:
        nr_repr_images: Number of representative images to extract
        nr_samples: The number of candidate documents to extract per cluster.
        image_height: The height of the resulting collage
        image_squares: Whether to resize each image in the collage to a square.
        image_to_text_model: A transformers pipeline, the name of a model to load
                             into an "image-to-text" pipeline, or None.
        batch_size: The number of images to pass to the `image_to_text_model`.

    Raises:
        ValueError: If `image_to_text_model` is not a string, a Pipeline or None.
    """
    self.nr_repr_images = nr_repr_images
    self.nr_samples = nr_samples
    self.image_height = image_height
    self.image_squares = image_squares

    # Image-to-text model
    if isinstance(image_to_text_model, Pipeline):
        self.image_to_text_model = image_to_text_model
    elif isinstance(image_to_text_model, str):
        self.image_to_text_model = pipeline("image-to-text", model=image_to_text_model)
    elif image_to_text_model is None:
        self.image_to_text_model = None
    else:
        raise ValueError(
            "Please select a correct transformers pipeline. For example: "
            "pipeline('image-to-text', model='nlpconnect/vit-gpt2-image-captioning')"
        )
    self.batch_size = batch_size


def extract_topics(
    self,
    topic_model,
    documents: pd.DataFrame,
    c_tf_idf: csr_matrix,
    topics: Mapping[str, List[Tuple[str, float]]],
) -> Mapping[str, List[Tuple[str, float]]]:
    """Extract topics.

    Arguments:
        topic_model: A BERTopic model
        documents: All input documents
        c_tf_idf: The topic c-TF-IDF representation
        topics: The candidate topics as calculated with c-TF-IDF

    Returns:
        representative_images: Representative images per topic
    """
    # Extract image ids of most representative documents
    images = documents["Image"].to_numpy().tolist()
    (_, _, _, repr_docs_ids) = topic_model._extract_representative_docs(
        c_tf_idf,
        documents,
        topics,
        nr_samples=self.nr_samples,
        nr_repr_docs=self.nr_repr_images,
    )
    unique_topics = sorted(list(topics.keys()))

    # Combine representative images into a single representation
    images_per_row = 3
    representative_images = {}
    for topic in tqdm(unique_topics):
        # Get and order representative images into rows of the collage
        sliced_examplars = repr_docs_ids[topic + topic_model._outliers]
        sliced_examplars = [
            sliced_examplars[i : i + images_per_row] for i in range(0, len(sliced_examplars), images_per_row)
        ]

        # Track the files opened here so that exactly those are closed again,
        # also when the input mixes paths with already-loaded images
        opened_images = []
        try:
            images_to_combine = []
            for sub_indices in sliced_examplars:
                row = []
                for index in sub_indices:
                    image = images[index]
                    if isinstance(image, str):
                        image = Image.open(image)
                        opened_images.append(image)
                    row.append(image)
                images_to_combine.append(row)

            # Concatenate representative images
            representative_images[topic] = get_concat_tile_resize(
                images_to_combine, self.image_height, self.image_squares
            )
        finally:
            # Make sure to properly close images
            for image in opened_images:
                image.close()

    return representative_images


def _convert_image_to_text(self, images: List, verbose: bool = False) -> List[str]:
    """Convert a list of images to captions.

    Arguments:
        images: A list of images to be converted to text.
        verbose: Controls the verbosity of the process

    Returns:
        List of captions
    """
    # Batch-wise image conversion
    if self.batch_size is not None:
        documents = []
        for batch in tqdm(self._chunks(images), disable=not verbose):
            outputs = self.image_to_text_model(batch)
            captions = [output[0]["generated_text"] for output in outputs]
            documents.extend(captions)

    # Convert all images to text in a single call
    else:
        outputs = self.image_to_text_model(images)
        documents = [output[0]["generated_text"] for output in outputs]

    return documents


def image_to_text(self, documents: pd.DataFrame, embeddings: np.ndarray) -> pd.DataFrame:
    """Convert images to text.

    Arguments:
        documents: A dataframe with a `Topic` and an `Image` column
        embeddings: The image embeddings, one row per row in `documents`

    Returns:
        documents: A dataframe with the columns `Document` (the caption), `ID`,
                   `Topic` and `Image` for the images selected per topic
    """
    # Create image topic embeddings
    topics = documents.Topic.to_numpy().tolist()
    images = documents.Image.to_numpy().tolist()
    df = pd.DataFrame(np.hstack([np.array(topics).reshape(-1, 1), embeddings]))
    image_topic_embeddings = df.groupby(0).mean().to_numpy()

    # Extract image centroids
    image_centroids = {}
    unique_topics = sorted(list(set(topics)))
    for topic, topic_embedding in zip(unique_topics, image_topic_embeddings):
        indices = np.array([index for index, t in enumerate(topics) if t == topic])
        top_n = min([self.nr_repr_images, len(indices)])
        indices = mmr(
            topic_embedding.reshape(1, -1),
            embeddings[indices],
            indices,
            top_n=top_n,
            diversity=0.1,
        )
        image_centroids[topic] = indices

    # Extract documents
    documents = pd.DataFrame(columns=["Document", "ID", "Topic", "Image"])
    current_id = 0
    for topic, image_ids in tqdm(image_centroids.items()):
        # Track the files opened here so that exactly those are closed again
        opened_images = []
        try:
            selected_images = []
            for index in image_ids:
                image = images[index]
                if isinstance(image, str):
                    image = Image.open(image)
                    opened_images.append(image)
                selected_images.append(image)

            text = self._convert_image_to_text(selected_images)

            for doc, image_id in zip(text, image_ids):
                documents.loc[len(documents), :] = [
                    doc,
                    current_id,
                    topic,
                    images[image_id],
                ]
                current_id += 1
        finally:
            # Properly close images
            for image in opened_images:
                image.close()

    return documents
def get_concat_h_multi_resize(im_list):
    """Concatenate images horizontally after scaling them to a common height.

    Code adapted from: https://note.nkmk.me/en/python-pillow-concat-images/.

    Arguments:
        im_list: A non-empty list of PIL images forming one row

    Returns:
        dst: A new RGB image with the resized images pasted side by side
    """
    # Every image is scaled (keeping its aspect ratio) to the tallest image
    target_height = max(im.height for im in im_list)

    # `Image.resize` returns a new image, so the result has to be kept;
    # the input images themselves are left untouched
    im_list_resize = [
        im.resize((int(im.width * target_height / im.height), target_height), resample=0) for im in im_list
    ]

    total_width = sum(im.width for im in im_list_resize)
    dst = Image.new("RGB", (total_width, target_height), (255, 255, 255))
    pos_x = 0
    for im in im_list_resize:
        dst.paste(im, (pos_x, 0))
        pos_x += im.width
    return dst


def get_concat_v_multi_resize(im_list):
    """Concatenate images vertically after scaling them to a common width.

    Code adapted from: https://note.nkmk.me/en/python-pillow-concat-images/.

    Arguments:
        im_list: A non-empty list of PIL images forming one column

    Returns:
        dst: A new RGB image with the resized images stacked top to bottom
    """
    # Every image is scaled (keeping its aspect ratio) to the widest image
    target_width = max(im.width for im in im_list)
    im_list_resize = [
        im.resize((target_width, int(im.height * target_width / im.width)), resample=0) for im in im_list
    ]

    total_height = sum(im.height for im in im_list_resize)
    dst = Image.new("RGB", (target_width, total_height), (255, 255, 255))
    pos_y = 0
    for im in im_list_resize:
        dst.paste(im, (0, pos_y))
        pos_y += im.height
    return dst
class ZeroShotClassification(BaseRepresentation):
    """Zero-shot Classification on topic keywords with candidate labels.

    Arguments:
        candidate_topics: A list of labels to assign to the topics if they
                          exceed `min_prob`
        model: A transformers pipeline that should be initialized as "zero-shot-classification".
               For example, `pipeline("zero-shot-classification", model="facebook/bart-large-mnli")`.
               A string is loaded into such a pipeline.
        pipeline_kwargs: Kwargs that you can pass to the transformers.pipeline
                         when it is called. NOTE: Use `{"multi_label": True}`
                         to extract multiple labels for each topic.
        min_prob: The minimum probability to assign a candidate label to a topic

    Usage:

    ```python
    from bertopic.representation import ZeroShotClassification
    from bertopic import BERTopic

    # Create your representation model
    candidate_topics = ["space and nasa", "bicycles", "sports"]
    representation_model = ZeroShotClassification(candidate_topics, model="facebook/bart-large-mnli")

    # Use the representation model in BERTopic on top of the default pipeline
    topic_model = BERTopic(representation_model=representation_model)
    ```
    """

    def __init__(
        self,
        candidate_topics: List[str],
        model: str | Pipeline = "facebook/bart-large-mnli",
        pipeline_kwargs: Mapping[str, Any] | None = None,
        min_prob: float = 0.8,
    ):
        self.candidate_topics = candidate_topics

        if isinstance(model, str):
            self.model = pipeline("zero-shot-classification", model=model)
        elif isinstance(model, Pipeline):
            self.model = model
        else:
            raise ValueError(
                "Make sure that the HF model that you "
                "pass is either a string referring to a "
                "HF model or a `transformers.pipeline` object."
            )
        # A `None` default avoids sharing one mutable dict between instances
        self.pipeline_kwargs = {} if pipeline_kwargs is None else pipeline_kwargs
        self.min_prob = min_prob

    def extract_topics(
        self,
        topic_model,
        documents: pd.DataFrame,
        c_tf_idf: csr_matrix,
        topics: Mapping[str, List[Tuple[str, float]]],
    ) -> Mapping[str, List[Tuple[str, float]]]:
        """Extract topics.

        Arguments:
            topic_model: Not used
            documents: Not used
            c_tf_idf: Not used
            topics: The candidate topics as calculated with c-TF-IDF

        Returns:
            updated_topics: Updated topic representations. Topics without a
                            label above `min_prob` keep their c-TF-IDF words.
        """
        # Classify topics
        topic_ids = list(topics.keys())
        topic_descriptions = [" ".join(next(zip(*topics[topic]))) for topic in topic_ids]
        classifications = self.model(topic_descriptions, self.candidate_topics, **self.pipeline_kwargs)

        # NOTE(review): some transformers versions appear to return a bare dict
        # for a single input sequence; normalize so that the zip below still
        # pairs topics with results -- confirm against the pinned version
        if isinstance(classifications, dict):
            classifications = [classifications]

        # Extract labels
        updated_topics = {}
        for topic, classification in zip(topic_ids, classifications):
            # Multi-label assignment
            if self.pipeline_kwargs.get("multi_label"):
                topic_description = [
                    (label, score)
                    for label, score in zip(classification["labels"], classification["scores"])
                    if score > self.min_prob
                ]

            # Single label assignment
            elif classification["scores"][0] > self.min_prob:
                topic_description = [(classification["labels"][0], classification["scores"][0])]

            # No label is confident enough: fall back to a *copy* of the
            # c-TF-IDF words so that padding never mutates the input `topics`
            else:
                topic_description = list(topics[topic])

            # Make sure that 10 items are returned
            if len(topic_description) == 0:
                topic_description = list(topics[topic])
            elif len(topic_description) < 10:
                topic_description += [("", 0) for _ in range(10 - len(topic_description))]
            updated_topics[topic] = topic_description

        return updated_topics
def __init__(
    self,
    bm25_weighting: bool = False,
    reduce_frequent_words: bool = False,
    seed_words: List[str] | None = None,
    seed_multiplier: float = 2,
):
    """Store the c-TF-IDF options and initialize the parent transformer."""
    self.seed_multiplier = seed_multiplier
    self.seed_words = seed_words
    self.reduce_frequent_words = reduce_frequent_words
    self.bm25_weighting = bm25_weighting
    super(ClassTfidfTransformer, self).__init__()


def fit(self, X: sp.csr_matrix, multiplier: np.ndarray = None):
    """Learn the idf vector (global term weights).

    Arguments:
        X: A matrix of term/token counts.
        multiplier: A multiplier for increasing/decreasing certain IDF scores

    Returns:
        self: The transformer; `_idf_diag` is only set when `use_idf` is enabled
    """
    X = check_array(X, accept_sparse=("csr", "csc"))
    if not sp.issparse(X):
        X = sp.csr_matrix(X)

    # Nothing to learn when idf-weighting is switched off
    if not self.use_idf:
        return self

    n_features = X.shape[1]

    # Frequency of each word across all classes
    word_frequency = np.squeeze(np.asarray(X.sum(axis=0)))

    # Average number of samples per class, used as regularization
    avg_nr_samples = int(X.sum(axis=1).mean())

    if self.bm25_weighting:
        # BM25-inspired weighting procedure
        idf = np.log(1 + ((avg_nr_samples - word_frequency + 0.5) / (word_frequency + 0.5)))
    else:
        # Divide the average number of samples by the word frequency;
        # +1 is added to force values to be positive
        idf = np.log((avg_nr_samples / word_frequency) + 1)

    # Multiplier to increase/decrease certain idf scores
    if multiplier is not None:
        idf = idf * multiplier

    self._idf_diag = sp.diags(
        idf,
        offsets=0,
        shape=(n_features, n_features),
        format="csr",
        dtype=np.float64,
    )
    return self


def transform(self, X: sp.csr_matrix):
    """Transform a count-based matrix to c-TF-IDF.

    Arguments:
        X (sparse matrix): A matrix of term/token counts.

    Returns:
        X (sparse matrix): A c-TF-IDF matrix, or the input itself when
                           `use_idf` is disabled
    """
    if not self.use_idf:
        return X

    # L1-normalize the rows in place to obtain the class-based term frequency
    X = normalize(X, axis=1, norm="l1", copy=False)

    # Dampen words that appear too frequently
    if self.reduce_frequent_words:
        X.data = np.sqrt(X.data)

    return X * self._idf_diag
These functions are separated such that the vectorizer can be used in iteration without updating the Bag-of-Words representation can might speed up the fitting process. However, the `.update_bow` function is used in BERTopic to track changes in the topic representations and allow for decay. This class inherits its parameters and attributes from: `sklearn.feature_extraction.text.CountVectorizer` Arguments: decay: A value between [0, 1] to weight the percentage of frequencies the previous bag-of-words should be decreased. For example, a value of `.1` will decrease the frequencies in the bag-of-words matrix with 10% at each iteration. delete_min_df: Delete words at each iteration from its vocabulary that are below a minimum frequency. This will keep the resulting bag-of-words matrix small such that it does not explode in size with increasing vocabulary. If `decay` is None then this equals `min_df`. **kwargs: Set of parameters inherited from: `sklearn.feature_extraction.text.CountVectorizer` In practice, this means that you can still use parameters from the original CountVectorizer, like `stop_words` and `ngram_range`. 
Attributes: X_ (scipy.sparse.csr_matrix) : The Bag-of-Words representation Examples: ```python from bertopic.vectorizers import OnlineCountVectorizer vectorizer = OnlineCountVectorizer(stop_words="english") for index, doc in enumerate(my_docs): vectorizer.partial_fit(doc) # Update and clean the bow every 100 iterations: if index % 100 == 0: X = vectorizer.update_bow() ``` To use the model in BERTopic: ```python from bertopic import BERTopic from bertopic.vectorizers import OnlineCountVectorizer vectorizer_model = OnlineCountVectorizer(stop_words="english") topic_model = BERTopic(vectorizer_model=vectorizer_model) ``` References: Adapted from: https://github.com/idoshlomo/online_vectorizers """ def __init__(self, decay: float | None = None, delete_min_df: float | None = None, **kwargs): self.decay = decay self.delete_min_df = delete_min_df super(OnlineCountVectorizer, self).__init__(**kwargs) def partial_fit(self, raw_documents: List[str]) -> None: """Perform a partial fit and update vocabulary with OOV tokens. Arguments: raw_documents: A list of documents """ if not hasattr(self, "vocabulary_"): return self.fit(raw_documents) analyzer = self.build_analyzer() analyzed_documents = [analyzer(doc) for doc in raw_documents] new_tokens = set(chain.from_iterable(analyzed_documents)) oov_tokens = new_tokens.difference(set(self.vocabulary_.keys())) if oov_tokens: max_index = max(self.vocabulary_.values()) oov_vocabulary = dict( zip( oov_tokens, list(range(max_index + 1, max_index + 1 + len(oov_tokens), 1)), ) ) self.vocabulary_.update(oov_vocabulary) return self def update_bow(self, raw_documents: List[str]) -> csr_matrix: """Create or update the bag-of-words matrix. Update the bag-of-words matrix by adding the newly transformed documents. This may add empty columns if new words are found and/or add empty rows if new topics are found. During this process, the previous bag-of-words matrix might be decayed if `self.decay` has been set during init. 
Similarly, words that do not exceed `self.delete_min_df` are removed from its vocabulary and bag-of-words matrix. Arguments: raw_documents: A list of documents Returns: X_: Bag-of-words matrix """ if hasattr(self, "X_"): X = self.transform(raw_documents) # Add empty columns if new words are found columns = csr_matrix((self.X_.shape[0], X.shape[1] - self.X_.shape[1]), dtype=int) self.X_ = sparse.hstack([self.X_, columns]) # Add empty rows if new topics are found rows = csr_matrix((X.shape[0] - self.X_.shape[0], self.X_.shape[1]), dtype=int) self.X_ = sparse.vstack([self.X_, rows]) # Decay of BoW matrix if self.decay is not None: self.X_ = self.X_ * (1 - self.decay) self.X_ += X else: self.X_ = self.transform(raw_documents) if self.delete_min_df is not None: self._clean_bow() return self.X_ def _clean_bow(self) -> None: """Remove words that do not exceed `self.delete_min_df`.""" # Only keep words with a minimum frequency indices = np.where(self.X_.sum(0) >= self.delete_min_df)[1] indices_dict = {index: index for index in indices} self.X_ = self.X_[:, indices] # Update vocabulary with new words new_vocab = {} vocabulary_dict = {v: k for k, v in self.vocabulary_.items()} for i, index in enumerate(indices): if indices_dict.get(index) is not None: new_vocab[vocabulary_dict[index]] = i self.vocabulary_ = new_vocab ================================================ FILE: docs/algorithm/algorithm.md ================================================ --- hide: - navigation --- # The Algorithm Below, you will find different types of overviews of each step in BERTopic's main algorithm. Each successive overview will be more in-depth than the previous overview. This approach aims to make the underlying algorithm as intuitive as possible for a wide range of users. ## **Visual Overview** BERTopic can be viewed as a sequence of steps to create its topic representations. There are five steps to this process: Although these steps are the default, there is some modularity to BERTopic. 
Each step in this process was carefully selected so that the steps are all somewhat independent of one another. For example, the tokenization step is not directly influenced by the embedding model that was used to convert the documents, which allows us to be creative in how we perform the tokenization step. This effect is especially strong in the clustering step. Models like HDBSCAN assume that clusters can have different shapes and forms. As a result, using a centroid-based technique to model the topic representations would not be beneficial since the centroid is not always representative of these types of clusters. A bag-of-words representation, however, makes very few assumptions concerning the shape and form of a cluster. As a result, BERTopic is quite modular and can maintain its quality of topic generation throughout a variety of sub-models. In other words, BERTopic essentially allows you to **build your own topic model**: There is extensive documentation on how to use each step in this pipeline: 1. [Embeddings](../getting_started/embeddings/embeddings.html) 2. [Dimensionality Reduction](../getting_started/dim_reduction/dim_reduction.html) 3. [Clustering](../getting_started/clustering/clustering.html) 4. [Tokenizer](../getting_started/vectorizers/vectorizers.html) 5. [Weighting Scheme](../getting_started/ctfidf/ctfidf.html) 6. [Representation Tuning](../getting_started/representation/representation.html) * [Large Language Models (LLM)](../getting_started/representation/llm.html) ## **Code Overview** After going through the visual overview, this code overview demonstrates the algorithm using BERTopic. An advantage of using BERTopic is that each major step in its algorithm can be explicitly defined, thereby making the process not only transparent but also more intuitive.
```python from umap import UMAP from hdbscan import HDBSCAN from sentence_transformers import SentenceTransformer from sklearn.feature_extraction.text import CountVectorizer from bertopic import BERTopic from bertopic.representation import KeyBERTInspired from bertopic.vectorizers import ClassTfidfTransformer # Step 1 - Extract embeddings embedding_model = SentenceTransformer("all-MiniLM-L6-v2") # Step 2 - Reduce dimensionality umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine') # Step 3 - Cluster reduced embeddings hdbscan_model = HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selection_method='eom', prediction_data=True) # Step 4 - Tokenize topics vectorizer_model = CountVectorizer(stop_words="english") # Step 5 - Create topic representation ctfidf_model = ClassTfidfTransformer() # Step 6 - (Optional) Fine-tune topic representations with # a `bertopic.representation` model representation_model = KeyBERTInspired() # All steps together topic_model = BERTopic( embedding_model=embedding_model, # Step 1 - Extract embeddings umap_model=umap_model, # Step 2 - Reduce dimensionality hdbscan_model=hdbscan_model, # Step 3 - Cluster reduced embeddings vectorizer_model=vectorizer_model, # Step 4 - Tokenize topics ctfidf_model=ctfidf_model, # Step 5 - Extract topic words representation_model=representation_model # Step 6 - (Optional) Fine-tune topic representations ) ``` ## **Detailed Overview** This overview describes each step in more detail such that you can get an intuitive feeling as to what models might fit best at each step in your use case. ### **1. Embed documents** We start by converting our documents to numerical representations. Although there are many methods for doing so the default in BERTopic is [sentence-transformers](https://github.com/UKPLab/sentence-transformers). These models are often optimized for semantic similarity which helps tremendously in our clustering task. 
Moreover, they are great for creating either document- or sentence-embeddings.
In BERTopic, you can choose any sentence-transformers model but two models are set as defaults: * `"all-MiniLM-L6-v2"` * `"paraphrase-multilingual-MiniLM-L12-v2"` The first is an English language model trained specifically for semantic similarity tasks which works quite well for most use cases. The second model is very similar to the first with one major difference being that the `multilingual` models work for 50+ languages. This model is quite a bit larger than the first and is only selected if you select any language other than English. !!! tip Embedding models Although BERTopic uses sentence-transformers models as a default, you can choose any embedding model that fits your use case. Follow the guide [here](https://maartengr.github.io/BERTopic/getting_started/embeddings/embeddings.html) for selecting and customizing your model. ### **2. Dimensionality reduction** After having created our numerical representations of the documents we have to reduce the dimensionality of these representations. Cluster models typically have difficulty handling high dimensional data due to the curse of dimensionality. There are great approaches that can reduce dimensionality, such as PCA, but as a default [UMAP](https://github.com/lmcinnes/umap) is selected in BERTopic. It is a technique that can keep some of a dataset's local and global structure when reducing its dimensionality. This structure is important to keep as it contains the information necessary to create clusters of semantically similar documents. !!! tip Dimensionality reduction models Although BERTopic uses UMAP as a default, you can choose any dimensionality reduction model that fits your use case. Follow the guide [here](https://maartengr.github.io/BERTopic/getting_started/dim_reduction/dim_reduction.html) for selecting and customizing your model. ### **3. Cluster Documents** After having reduced our embeddings, we can start clustering our data. For that, we leverage a density-based clustering technique, HDBSCAN. 
It can find clusters of different shapes and has the nice feature of identifying outliers where possible. As a result, we do not force documents into a cluster where they might not belong. This will improve the resulting topic representation as there is less noise to draw from. !!! tip Cluster models Although BERTopic uses HDBSCAN as a default, you can choose any cluster model that fits your use case. Follow the guide [here](https://maartengr.github.io/BERTopic/getting_started/clustering/clustering.html) for selecting and customizing your model. ### **4. Bag-of-words** Before we can start creating the topic representation we first need to select a technique that allows for modularity in BERTopic's algorithm. When we use HDBSCAN as a cluster model, we may assume that our clusters have different degrees of density and different shapes. This means that a centroid-based topic representation technique might not be the best-fitting model. In other words, we want a topic representation technique that makes little to no assumption on the expected structure of the clusters.
To do this, we first combine all documents in a cluster into a single document. That, very long, document then represents the cluster. Then, we can count how often each word appears in each cluster. This generates something called a bag-of-words representation in which the frequency of each word in each cluster can be found. This bag-of-words representation is therefore on a cluster level and not on a document level. This distinction is important as we are interested in words on a topic level (i.e., cluster level). By using a bag-of-words representation, no assumption is made concerning the structure of the clusters. Moreover, the bag-of-words representation is L1-normalized to account for clusters that have different sizes. !!! tip Bag-of-words and tokenization There are many ways you can tune or change the bag-of-words step. This step allows for processing the documents however you want without affecting the first step, embedding the documents. You can follow the guide [here](https://maartengr.github.io/BERTopic/getting_started/vectorizers/vectorizers.html) for more information about tokenization options in BERTopic. ### **5. Topic representation** From the generated bag-of-words representation, we want to know what makes one cluster different from another. Which words are typical for cluster 1 and not so much for all other clusters? To solve this, we need to modify TF-IDF such that it considers topics (i.e., clusters) instead of documents.
When you apply TF-IDF as usual on a set of documents, what you are doing is comparing the importance of words between documents. Now, what if we instead treat all documents in a single category (e.g., a cluster) as a single document and then apply TF-IDF? The result would be importance scores for words within a cluster. The more important a word is within a cluster, the more representative it is of that topic. In other words, if we extract the most important words per cluster, we get descriptions of **topics**! This model is called **class-based TF-IDF**:


Each cluster is converted to a single document instead of a set of documents. Then, we extract the frequency of word `x` in class `c`, where `c` refers to the cluster we created before. This results in our class-based `tf` representation. This representation is L1-normalized to account for the differences in topic sizes.

Then, we take the logarithm of one plus the average number of words per class `A` divided by the frequency of word `x` across all classes. We add one within the logarithm to force values to be positive. This results in our class-based `idf` representation. Like with the classic TF-IDF, we then multiply `tf` with `idf` to get the importance score per word in each class. In other words, the classical TF-IDF procedure is **not** used here but a modified version of the algorithm that allows for a much better representation. !!! tip c-TF-IDF parameters In the `ClassTfidfTransformer`, there are a few parameters that might be worth exploring, including an option to perform additional BM-25 weighting. You can find more information about that [here](https://maartengr.github.io/BERTopic/getting_started/ctfidf/ctfidf.html). ### **6. (Optional) Fine-tune Topic representation** After having generated the c-TF-IDF representations, we have a set of words that describe a collection of documents. c-TF-IDF is a method that can quickly generate accurate topic representations. However, with the fast developments in the NLP world, new and exciting methods are released weekly. In order to keep up with what is happening, there is the possibility to further fine-tune these c-TF-IDF topics using GPT, T5, KeyBERT, Spacy, and other techniques. Many are implemented in BERTopic for you to use and play around with. More specifically, we can consider the c-TF-IDF generated topics to be candidate topics. They each contain a set of keywords and representative documents that we can use to further fine-tune the topic representations. Having a set of representative documents for each topic is a huge advantage as it allows for fine-tuning on a reduced number of documents. This reduces computation for large models as they only need to operate on that small set of representative documents for each topic. 
As a result, large language models like GPT and T5 become feasible in production settings and typically take less wall time than the dimensionality reduction and clustering steps. The following models are implemented in `bertopic.representation`: * `MaximalMarginalRelevance` * `PartOfSpeech` * `KeyBERTInspired` * `ZeroShotClassification` * `TextGeneration` (HuggingFace) * `Cohere` * `OpenAI` * `LangChain` * `LiteLLM` * `LlamaCPP` !!! tip Models There are roughly two sets of models. **First** are the non-generative set of models that you can find [here](https://maartengr.github.io/BERTopic/getting_started/representation/representation.html). These include models that focus on enhancing the keywords in the topic representations. **Second** are the generative models that attempt to label or summarize the topics instead. You can find an overview of [implemented LLMs here](https://maartengr.github.io/BERTopic/getting_started/representation/llm). ================================================ FILE: docs/api/backends.md ================================================ # `Backends` ::: bertopic.backend ================================================ FILE: docs/api/bertopic.md ================================================ # `BERTopic` ::: bertopic._bertopic.BERTopic ================================================ FILE: docs/api/cluster copy.md ================================================ # `BaseCluster` ::: bertopic.cluster._base.BaseCluster ================================================ FILE: docs/api/cluster.md ================================================ # `BaseCluster` ::: bertopic.cluster._base.BaseCluster ================================================ FILE: docs/api/ctfidf.md ================================================ # `c-TF-IDF` ::: bertopic.vectorizers.ClassTfidfTransformer ================================================ FILE: docs/api/dimensionality.md ================================================ # `BaseDimensionalityReduction` ::: 
bertopic.dimensionality._base.BaseDimensionalityReduction ================================================ FILE: docs/api/plotting/barchart.md ================================================ # `Barchart` ::: bertopic.plotting._barchart.visualize_barchart ================================================ FILE: docs/api/plotting/distribution.md ================================================ # `Distribution` ::: bertopic.plotting._distribution.visualize_distribution ================================================ FILE: docs/api/plotting/document_datamap.md ================================================ # `Documents with DataMapPlot` ::: bertopic.plotting._datamap.visualize_document_datamap ================================================ FILE: docs/api/plotting/documents.md ================================================ # `Documents` ::: bertopic.plotting._documents.visualize_documents ================================================ FILE: docs/api/plotting/dtm.md ================================================ # `DTM` ::: bertopic.plotting._topics_over_time.visualize_topics_over_time ================================================ FILE: docs/api/plotting/heatmap.md ================================================ # `Heatmap` ::: bertopic.plotting._heatmap.visualize_heatmap ================================================ FILE: docs/api/plotting/hierarchical_documents.md ================================================ # `Hierarchical Documents` ::: bertopic.plotting._hierarchical_documents.visualize_hierarchical_documents ================================================ FILE: docs/api/plotting/hierarchy.md ================================================ # `Hierarchy` ::: bertopic.plotting._hierarchy.visualize_hierarchy ================================================ FILE: docs/api/plotting/term.md ================================================ # `Term Score Decline` ::: bertopic.plotting._term_rank.visualize_term_rank 
================================================ FILE: docs/api/plotting/topics.md ================================================ # `Topics` ::: bertopic.plotting._topics.visualize_topics ================================================ FILE: docs/api/plotting/topics_per_class.md ================================================ # `Topics per Class` ::: bertopic.plotting._topics_per_class.visualize_topics_per_class ================================================ FILE: docs/api/plotting.md ================================================ # `Plotting` ::: bertopic.plotting ================================================ FILE: docs/api/representations.md ================================================ # `Representations` ::: bertopic.representation ================================================ FILE: docs/api/vectorizers.md ================================================ # `Vectorizers` ::: bertopic.vectorizers._online_cv.OnlineCountVectorizer ================================================ FILE: docs/changelog.md ================================================ --- hide: - navigation --- # Changelog ## **Version 0.17.4** *Release date: 3 December, 2025*

Highlights:

* Add `.delete_topics` by [@shuanglovesdata](https://github.com/shuanglovesdata) in [#2322](https://github.com/MaartenGr/BERTopic/pull/2322) * Allow execution without plotly by [@luismavs](https://github.com/luismavs) in [#2401](https://github.com/MaartenGr/BERTopic/pull/2401) * Add tqdm to _litellm.py by [@NFrnk](https://github.com/NFrnk) in [#2408](https://github.com/MaartenGr/BERTopic/pull/2408) * Drop support for python 3.9 by [@afuetterer](https://github.com/afuetterer) in [#2419](https://github.com/MaartenGr/BERTopic/pull/2419) * Make UMAP's init default to random on visualize_topics for reproducible visualization by [@makramab](https://github.com/makramab) in [#2412](https://github.com/MaartenGr/BERTopic/pull/2412)

cuML:

Preparing for MEGA!-scale BERTopic with [Multi-GPU UMAP](https://github.com/rapidsai/cuml/issues/3330#issuecomment-3376951158) and the following necessary updates: * Update installation instructions for cuML with BERTopic by [@csadorf](https://github.com/csadorf) in [#2446](https://github.com/MaartenGr/BERTopic/pull/2446) * Speed up `._create_topic_vectors` by replacing DataFrame .loc with NumPy masking by [@jinsolp](https://github.com/jinsolp) in [#2406](https://github.com/MaartenGr/BERTopic/pull/2406) * Modify _reduce_dimensionality to use fit_transform by [@betatim](https://github.com/betatim) in [#2416](https://github.com/MaartenGr/BERTopic/pull/2416)

Fixes:

* Fix incorrect label in zero-shot svg in documentation by [@huisman](https://github.com/huisman) in [#2448](https://github.com/MaartenGr/BERTopic/pull/2448) * Enable ruff rule RUF by [@afuetterer](https://github.com/afuetterer) in [#2457](https://github.com/MaartenGr/BERTopic/pull/2457) * CI: bump github actions versions by [@afuetterer](https://github.com/afuetterer) in [#2427](https://github.com/MaartenGr/BERTopic/pull/2427) * CI: prefer action-pre-commit-uv for lint job by [@afuetterer](https://github.com/afuetterer) in [#2434](https://github.com/MaartenGr/BERTopic/pull/2434) * CI: switch to uv based project installation by [@afuetterer](https://github.com/afuetterer) in [#2445](https://github.com/MaartenGr/BERTopic/pull/2445) * Chore: update pre-commit hooks by [@afuetterer](https://github.com/afuetterer) in [#2414](https://github.com/MaartenGr/BERTopic/pull/2414) and [#2443](https://github.com/MaartenGr/BERTopic/pull/2443) * Chore: remove obsolete `version_info` check by [@afuetterer](https://github.com/afuetterer) in [#2444](https://github.com/MaartenGr/BERTopic/pull/2444) ## **Version 0.17.3** *Release date: 8 July, 2025* * Fixed issues with `uv add bertopic` by adding `llvmlite` as an explicit dependency. Although the fixes in `v0.17.2` and `v0.17.2.post1` actually worked locally, this wasn't the case after pushing the package to pypi.org unfortunately. ## **Version 0.17.1** *Release date: 8 July, 2025*

Highlights:

* Added FastEmbed backend by [@nickprock](https://github.com/nickprock) in [#2312](https://github.com/MaartenGr/BERTopic/pull/2312) * Added LangChain backend by [@regaltsui](https://github.com/regaltsui) in [#2303](https://github.com/MaartenGr/BERTopic/pull/2303) * Pass precomputed embeddings to KeyBERTInspired.extract_topics by [@saikumaru](https://github.com/saikumaru) in [#2368](https://github.com/MaartenGr/BERTopic/pull/2368)

Fixes:

* Merge models without pytorch (using safetensors) by [@MaartenGr](https://github.com/MaartenGr) in [#2329](https://github.com/MaartenGr/BERTopic/pull/2329) * Fix installation issue with uv by [@MaartenGr](https://github.com/MaartenGr) in [#2328](https://github.com/MaartenGr/BERTopic/pull/2328) * Fix incorrect comparison in update_topics by [@uply23333](https://github.com/uply23333) in [#2336](https://github.com/MaartenGr/BERTopic/pull/2336) * Add missing comma under Exploration subsection by [@angelonazzaro](https://github.com/angelonazzaro) in [#2374](https://github.com/MaartenGr/BERTopic/pull/2374) * Fix typo in Lightweight installation under tips_and_tricks by [@angelonazzaro](https://github.com/angelonazzaro) in [#2375](https://github.com/MaartenGr/BERTopic/pull/2375) * Fix IndexError in zero-shot topic modeling by [@MaartenGr](https://github.com/MaartenGr) in [#2267](https://github.com/MaartenGr/BERTopic/pull/2267) ## **Version 0.17.0** *Release date: 19 March, 2025*

Highlights:

* Light-weight installation without UMAP and HDBSCAN by [@MaartenGr](https://github.com/MaartenGr) in [#2289](https://github.com/MaartenGr/BERTopic/pull/2289) * Add Model2Vec as an embedding backend by [@MaartenGr](https://github.com/MaartenGr) in [#2245](https://github.com/MaartenGr/BERTopic/pull/2245) * Add LiteLLM as a representation model by [@MaartenGr](https://github.com/MaartenGr) in [#2213](https://github.com/MaartenGr/BERTopic/pull/2213) * Interactive DataMapPlot and deprecate non-chat OpenAI models by [@MaartenGr](https://github.com/MaartenGr) in [#2287](https://github.com/MaartenGr/BERTopic/pull/2287)

Fixes:

* Lightweight installation: use safetensors without torch by [@hedgeho](https://github.com/hedgeho) in [#2306](https://github.com/MaartenGr/BERTopic/pull/2306) * Fix missing links by [@MaartenGr](https://github.com/MaartenGr) in [#2305](https://github.com/MaartenGr/BERTopic/pull/2305) * Set up pre-commit hooks by [@afuetterer](https://github.com/afuetterer) in [#2283](https://github.com/MaartenGr/BERTopic/pull/2283) * Fix handling OpenAI returning None objects by [@jeaninejuliettes](https://github.com/jeaninejuliettes) in [#2280](https://github.com/MaartenGr/BERTopic/pull/2280) * Add support for python 3.13 by [@afuetterer](https://github.com/afuetterer) in [#2173](https://github.com/MaartenGr/BERTopic/pull/2173) * Added system prompts by [@Leo-LiHao](https://github.com/Leo-LiHao) in [#2145](https://github.com/MaartenGr/BERTopic/pull/2145) * More documentation for topic reduction by [@MaartenGr](https://github.com/MaartenGr) in [#2260](https://github.com/MaartenGr/BERTopic/pull/2260) * Drop support for python 3.8 by [@afuetterer](https://github.com/afuetterer) in [#2243](https://github.com/MaartenGr/BERTopic/pull/2243) * Fixed online topic modeling on GPU by [@SSivakumar12](https://github.com/SSivakumar12) in [#2181](https://github.com/MaartenGr/BERTopic/pull/2181) * Fixed hierarchical cluster visualization by [@PipaFlores](https://github.com/PipaFlores) in [#2191](https://github.com/MaartenGr/BERTopic/pull/2191) * Remove duplicated phrase by [@AndreaFrancis](https://github.com/AndreaFrancis) in [#2197](https://github.com/MaartenGr/BERTopic/pull/2197) ## **Version 0.16.4** *Release date: 9 October, 2024*

Fixes:

* Fix ValueError in Guided Topic Modeling by [@RTChou](https://github.com/RTChou) in [#2115](https://github.com/MaartenGr/BERTopic/pull/2115) * Fix saving BERTopic when c-TF-IDF is None by [@sete39](https://github.com/sete39) in [#2112](https://github.com/MaartenGr/BERTopic/pull/2112) * Fix `KeyError: 'topics_from'` in [#2101](https://github.com/MaartenGr/BERTopic/pull/2101) * Fix issues related to Zero-shot Topic Modeling by [@ianrandman](https://github.com/ianrandman) in [#2105](https://github.com/MaartenGr/BERTopic/pull/2105) * Fix regex matching being used in PartOfSpeech representation model by [@woranov](https://github.com/woranov) in [#2138](https://github.com/MaartenGr/BERTopic/pull/2138) * Update typo by [@saikumaru](https://github.com/saikumaru) in [#2162](https://github.com/MaartenGr/BERTopic/pull/2162) ## **Version 0.16.3** *Release date: 22 July, 2024*

Highlights:

* Simplify zero-shot topic modeling by [@ianrandman](https://github.com/ianrandman) in [#2060](https://github.com/MaartenGr/BERTopic/pull/2060) * Option to choose between c-TF-IDF and Topic Embeddings in many functions by [@azikoss](https://github.com/azikoss) in [#1894](https://github.com/MaartenGr/BERTopic/pull/1894) * Use the `use_ctfidf` parameter in the following function to choose between c-TF-IDF and topic embeddings: * `hierarchical_topics`, `reduce_topics`, `visualize_hierarchy`, `visualize_heatmap`, `visualize_topics` * Linting with Ruff by [@afuetterer](https://github.com/afuetterer) in [#2033](https://github.com/MaartenGr/BERTopic/pull/2033) * Switch from setup.py to pyproject.toml by [@afuetterer](https://github.com/afuetterer) in [#1978](https://github.com/MaartenGr/BERTopic/pull/1978) * In multi-aspect context, allow Main model to be chained by [@ddicato](https://github.com/ddicato) in [#2002](https://github.com/MaartenGr/BERTopic/pull/2002)

Fixes:

* Added templates for [issues](https://github.com/MaartenGr/BERTopic/tree/master/.github/ISSUE_TEMPLATE) and [pull requests](https://github.com/MaartenGr/BERTopic/blob/master/.github/PULL_REQUEST_TEMPLATE.md) * Update River documentation example by [@Proteusiq](https://github.com/Proteusiq) in [#2004](https://github.com/MaartenGr/BERTopic/pull/2004) * Fix PartOfSpeech reproducibility by [@Greenpp](https://github.com/Greenpp) in [#1996](https://github.com/MaartenGr/BERTopic/pull/1996) * Fix PartOfSpeech ignoring first word by [@Greenpp](https://github.com/Greenpp) in [#2024](https://github.com/MaartenGr/BERTopic/pull/2024) * Make sklearn embedding backend auto-select more cautious by [@freddyheppell](https://github.com/freddyheppell) in [#1984](https://github.com/MaartenGr/BERTopic/pull/1984) * Fix typos by [@afuetterer](https://github.com/afuetterer) in [#1974](https://github.com/MaartenGr/BERTopic/pull/1974) * Fix hierarchical_topics(...) when the distances between three clusters are the same by [@azikoss](https://github.com/azikoss) in [#1929](https://github.com/MaartenGr/BERTopic/pull/1929) * Fixes to chain strategy example in outlier_reduction.md by [@reuning](https://github.com/reuning) in [#2065](https://github.com/MaartenGr/BERTopic/pull/2065) * Remove obsolete flake8 config and update line length by [@afuetterer](https://github.com/afuetterer) in [#2066](https://github.com/MaartenGr/BERTopic/pull/2066) ## **Version 0.16.2** *Release date: 12 May, 2024*

Fixes:

* Fix issue with zeroshot topic modeling missing outlier [#1957](https://github.com/MaartenGr/BERTopic/issues/1957) * Bump github actions versions by [@afuetterer](https://github.com/afuetterer) in [#1941](https://github.com/MaartenGr/BERTopic/pull/1941) * Drop support for python 3.7 by [@afuetterer](https://github.com/afuetterer) in [#1949](https://github.com/MaartenGr/BERTopic/pull/1949) * Add testing python 3.10+ in Github actions by [@afuetterer](https://github.com/afuetterer) in [#1968](https://github.com/MaartenGr/BERTopic/pull/1968) * Speed up fitting CountVectorizer by [@dannywhuang](https://github.com/dannywhuang) in [#1938](https://github.com/MaartenGr/BERTopic/pull/1938) * Fix `transform` when using cuML HDBSCAN by [@beckernick](https://github.com/beckernick) in [#1960](https://github.com/MaartenGr/BERTopic/pull/1960) * Fix wrong link in algorithm documentation by [@naeyn](https://github.com/naeyn) in [#1970](https://github.com/MaartenGr/BERTopic/pull/1970) ## **Version 0.16.1** *Release date: 21 April, 2024*

Highlights:

* Add Quantized [LLM Tutorial](https://colab.research.google.com/drive/1DdSHvVPJA3rmNfBWjCo2P1E9686xfxFx?usp=sharing) * Add optional [datamapplot](https://github.com/TutteInstitute/datamapplot) visualization using `topic_model.visualize_document_datamap` by [@lmcinnes](https://github.com/lmcinnes) in [#1750](https://github.com/MaartenGr/BERTopic/pull/1750) * Migrated OpenAIBackend to openai>=1 by [@peguerosdc](https://github.com/peguerosdc) in [#1724](https://github.com/MaartenGr/BERTopic/pull/1724) * Add automatic height scaling and font resize by [@ir2718](https://github.com/ir2718) in [#1863](https://github.com/MaartenGr/BERTopic/pull/1863) * Use `[KEYWORDS]` tags with the LangChain representation model by [@mcantimmy](https://github.com/mcantimmy) in [#1871](https://github.com/MaartenGr/BERTopic/pull/1871)

Fixes:

* Fixed issue with `.merge_models` seemingly skipping topic [#1898](https://github.com/MaartenGr/BERTopic/issues/1898) * Fixed Cohere client.embed TypeError [#1904](https://github.com/MaartenGr/BERTopic/issues/1904) * Fixed `AttributeError: 'TextGeneration' object has no attribute 'random_state'` [#1870](https://github.com/MaartenGr/BERTopic/issues/1870) * Fixed topic embeddings not properly updated if all outliers were removed [#1838](https://github.com/MaartenGr/BERTopic/issues/1838) * Fixed issue with representation models not properly merging [#1762](https://github.com/MaartenGr/BERTopic/issues/1762) * Fixed Embeddings not ordered correctly when using `.merge_models` [#1804](https://github.com/MaartenGr/BERTopic/issues/1804) * Fixed Outlier topic not in the 0th position when using zero-shot topic modeling causing prediction issues (amongst others) [#1804](https://github.com/MaartenGr/BERTopic/issues/1804) * Fixed Incorrect label in ZeroShot doc SVG [#1732](https://github.com/MaartenGr/BERTopic/issues/1732) * Fixed MultiModalBackend throws error with clip-ViT-B-32-multilingual-v1 [#1670](https://github.com/MaartenGr/BERTopic/issues/1670) * Fixed AuthenticationError while using OpenAI() [#1678](https://github.com/MaartenGr/BERTopic/issues/1678) * Update FAQ on Apple Silicon by [@benz0li](https://github.com/benz0li) in [#1901](https://github.com/MaartenGr/BERTopic/pull/1901) * Add documentation DataMapPlot + FAQ for running on Apple Silicon by [@dkapitan](https://github.com/dkapitan) in [#1854](https://github.com/MaartenGr/BERTopic/pull/1854) * Remove commas from pip install reference in readme by [@luisoala](https://github.com/luisoala) in [#1850](https://github.com/MaartenGr/BERTopic/pull/1850) * Spelling corrections by [@joouha](https://github.com/joouha) in [#1801](https://github.com/MaartenGr/BERTopic/pull/1801) * Replacing the deprecated `text-ada-001` model with the latest `text-embedding-3-small` from OpenAI by [@atmb4u](https://github.com/atmb4u) in 
[#1800](https://github.com/MaartenGr/BERTopic/pull/1800) * Prevent invalid empty input error when retrieving embeddings with openai backend by [@liaoelton](https://github.com/liaoelton) in [#1827](https://github.com/MaartenGr/BERTopic/pull/1827) * Remove spurious warning about missing embedding model by [@sliedes](https://github.com/sliedes) in [#1774](https://github.com/MaartenGr/BERTopic/pull/1774) * Fix type hint in ClassTfidfTransformer constructor [@snape](https://github.com/snape) in [#1803](https://github.com/MaartenGr/BERTopic/pull/1803) * Fix typo and simplify wording in OnlineCountVectorizer docstring by [@chrisji](https://github.com/chrisji) in [#1802](https://github.com/MaartenGr/BERTopic/pull/1802) * Fixed warning when saving a topic model without an embedding model by [@zilch42](https://github.com/zilch42) in [#1740](https://github.com/MaartenGr/BERTopic/pull/1740) * Fix bug in `TextGeneration` by [@manveersadhal](https://github.com/manveersadhal) in [#1726](https://github.com/MaartenGr/BERTopic/pull/1726) * Fix an incorrect link to usecases.md by [@nicholsonjf](https://github.com/nicholsonjf) in [#1731](https://github.com/MaartenGr/BERTopic/pull/1731) * Prevent `model` argument being passed twice when using `generator_kwargs` in OpenAI by [@ninavandiermen](https://github.com/ninavandiermen) in [#1733](https://github.com/MaartenGr/BERTopic/pull/1733) * Several fixes to the docstrings by [@arpadikuma](https://github.com/arpadikuma) in [#1719](https://github.com/MaartenGr/BERTopic/pull/1719) * Remove unused `cluster_df` variable in `hierarchical_topics` by [@shadiakiki1986](https://github.com/shadiakiki1986) in [#1701](https://github.com/MaartenGr/BERTopic/pull/1701) * Removed redundant quotation mark by [@LawrenceFulton](https://github.com/LawrenceFulton) in [#1695](https://github.com/MaartenGr/BERTopic/pull/1695) * Fix typo in merge models docs by [@zilch42](https://github.com/zilch42) in [#1660](https://github.com/MaartenGr/BERTopic/pull/1660) ## 
**Version 0.16.0** *Release date: 26 November, 2023*

Highlights:

* Merge pre-trained BERTopic models with [**`.merge_models`**](https://maartengr.github.io/BERTopic/getting_started/merge/merge.html) * Combine models with different representations together! * Use this for *incremental/online topic modeling* to detect new incoming topics * First step towards *federated learning* with BERTopic * [**Zero-shot**](https://maartengr.github.io/BERTopic/getting_started/zeroshot/zeroshot.html) Topic Modeling * Use a predefined list of topics to assign documents * If needed, allows for further exploration of undefined topics * [**Seed (domain-specific) words**](https://maartengr.github.io/BERTopic/getting_started/seed_words/seed_words.html) with `ClassTfidfTransformer` * Make sure selected words are more likely to end up in the representation without influencing the clustering process * Added params to [**truncate documents**](https://maartengr.github.io/BERTopic/getting_started/representation/llm.html#truncating-documents) to length when using LLMs * Added [**LlamaCPP**](https://maartengr.github.io/BERTopic/getting_started/representation/llm.html#llamacpp) as a representation model * LangChain: Support for **LCEL Runnables** by [@joshuasundance-swca](https://github.com/joshuasundance-swca) in [#1586](https://github.com/MaartenGr/BERTopic/pull/1586) * Added `topics` parameter to `.topics_over_time` to select a subset of documents and topics * Documentation: * [Best practices Guide](https://maartengr.github.io/BERTopic/getting_started/best_practices/best_practices.html) * [Llama 2 Tutorial](https://maartengr.github.io/BERTopic/getting_started/representation/llm.html#llama-2) * [Zephyr Tutorial](https://maartengr.github.io/BERTopic/getting_started/representation/llm.html#zephyr-mistral-7b) * Improved [embeddings guidance](https://maartengr.github.io/BERTopic/getting_started/embeddings/embeddings.html#sentence-transformers) (MTEB) * Improved logging throughout the package * Added support for **Cohere's Embed v3**: ```python cohere_model = 
CohereBackend( client, embedding_model="embed-english-v3.0", embed_kwargs={"input_type": "clustering"} ) ```

Fixes:

* Fixed n-gram Keywords need delimiting in OpenAI() [#1546](https://github.com/MaartenGr/BERTopic/issues/1546) * Fixed OpenAI v1.0 issues [#1629](https://github.com/MaartenGr/BERTopic/issues/1629) * Improved documentation/logging to address [#1589](https://github.com/MaartenGr/BERTopic/issues/1589), [#1591](https://github.com/MaartenGr/BERTopic/issues/1591) * Fixed engine support for Azure OpenAI embeddings [#1577](https://github.com/MaartenGr/BERTopic/issues/1577) * Fixed OpenAI Representation: KeyError: 'content' [#1570](https://github.com/MaartenGr/BERTopic/issues/1570) * Fixed Loading topic model with multiple topic aspects changes their format [#1487](https://github.com/MaartenGr/BERTopic/issues/1487) * Fix expired link in algorithm.md by [@burugaria7](https://github.com/burugaria7) in [#1396](https://github.com/MaartenGr/BERTopic/pull/1396) * Fix guided topic modeling in cuML's UMAP by [@stevetracvc](https://github.com/stevetracvc) in [#1326](https://github.com/MaartenGr/BERTopic/pull/1326) * OpenAI: Allow retrying on Service Unavailable errors by [@agamble](https://github.com/agamble) in [#1407](https://github.com/MaartenGr/BERTopic/pull/1407) * Fixed parameter naming for HDBSCAN in best practices by [@rnckp](https://github.com/rnckp) in [#1408](https://github.com/MaartenGr/BERTopic/pull/1408) * Fixed typo in tips_and_tricks.md by [@aronnoordhoek](https://github.com/aronnoordhoek) in [#1446](https://github.com/MaartenGr/BERTopic/pull/1446) * Fix typos in documentation by [@bobchien](https://github.com/bobchien) in [#1481](https://github.com/MaartenGr/BERTopic/pull/1481) * Fix IndexError when all outliers are removed by reduce_outliers by [@Aratako](https://github.com/Aratako) in [#1466](https://github.com/MaartenGr/BERTopic/pull/1466) * Fix TypeError on reduce_outliers "probabilities" by [@ananaphasia](https://github.com/ananaphasia) in [#1501](https://github.com/MaartenGr/BERTopic/pull/1501) * Add new line to fix markdown bullet point formatting by 
[@saeedesmaili](https://github.com/saeedesmaili) in [#1519](https://github.com/MaartenGr/BERTopic/pull/1519) * Update typo in topicrepresentation.md by [@oliviercaron](https://github.com/oliviercaron) in [#1537](https://github.com/MaartenGr/BERTopic/pull/1537) * Fix typo in FAQ by [@sandijou](https://github.com/sandijou) in [#1542](https://github.com/MaartenGr/BERTopic/pull/1542) * Fixed typos in best practices documentation by [@poomkusa](https://github.com/poomkusa) in [#1557](https://github.com/MaartenGr/BERTopic/pull/1557) * Correct TopicMapper doc example by [@chrisji](https://github.com/chrisji) in [#1637](https://github.com/MaartenGr/BERTopic/pull/1637) * Fix typing in hierarchical_topics by [@dschwalm](https://github.com/dschwalm) in [#1364](https://github.com/MaartenGr/BERTopic/pull/1364) * Fixed typing issue with threshold parameter in reduce_outliers by [@dschwalm](https://github.com/dschwalm) in [#1380](https://github.com/MaartenGr/BERTopic/pull/1380) * Fix several typos by [@mertyyanik](https://github.com/mertyyanik) in [#1307](https://github.com/MaartenGr/BERTopic/pull/1307) (#1307) * Fix inconsistent naming by [@rolanderdei](https://github.com/rolanderdei) in [#1073](https://github.com/MaartenGr/BERTopic/pull/1073)

Merge Pre-trained BERTopic Models

The new `.merge_models` feature allows for any number of fitted BERTopic models to be merged. Doing so allows for a number of use cases: * **Incremental topic modeling** -- Continuously merge models together to detect whether new topics have appeared * **Federated Learning** - Train BERTopic models on different clients and combine them on a central server * **Minimal compute** - We can essentially batch the training process into multiple instances to reduce compute * **Different datasets** - When you have different datasets that you want to train separately on, for example with different languages, you can train each model separately and join them after training To demonstrate merging different topic models with BERTopic, we use the ArXiv paper abstracts to see which topics they generally contain. First, we train three separate models on different parts of the data: ```python from umap import UMAP from bertopic import BERTopic from datasets import load_dataset dataset = load_dataset("CShorten/ML-ArXiv-Papers")["train"] # Extract abstracts to train on and corresponding titles abstracts_1 = dataset["abstract"][:5_000] abstracts_2 = dataset["abstract"][5_000:10_000] abstracts_3 = dataset["abstract"][10_000:15_000] # Create topic models umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42) topic_model_1 = BERTopic(umap_model=umap_model, min_topic_size=20).fit(abstracts_1) topic_model_2 = BERTopic(umap_model=umap_model, min_topic_size=20).fit(abstracts_2) topic_model_3 = BERTopic(umap_model=umap_model, min_topic_size=20).fit(abstracts_3) ``` Then, we can combine all three models into one with `.merge_models`: ```python # Combine all models into one merged_model = BERTopic.merge_models([topic_model_1, topic_model_2, topic_model_3]) ```

Zero-shot Topic Modeling

Zeroshot Topic Modeling is a technique that allows you to find pre-defined topics in large amounts of documents. This method allows you to not only find those specific topics but also create new topics for documents that would not fit with your predefined topics. This allows for extensive flexibility as there are three scenarios to explore. * No zeroshot topics were detected. This means that none of the documents would fit with the predefined topics and a regular BERTopic would be run. * Only zeroshot topics were detected. Here, we would not need to find additional topics since all original documents were assigned to one of the predefined topics. * Both zeroshot topics and clustered topics were detected. This means that some documents would fit with the predefined topics where others would not. For the latter, new topics were found. ![zeroshot](https://github.com/MaartenGr/BERTopic/assets/25746895/9cce6ee3-445f-440a-b93b-f8008578c839) In order to use zero-shot BERTopic, we create a list of topics that we want to assign to our documents. However, there may be several other topics that we know should be in the documents. The dataset that we use is a small subset of ArXiv papers. We know the data and believe there to be at least the following topics: *clustering*, *topic modeling*, and *large language models*. However, we are not sure whether other topics exist and want to explore those. Using this feature is straightforward: ```python from datasets import load_dataset from bertopic import BERTopic from bertopic.representation import KeyBERTInspired # We select a subsample of 5000 abstracts from ArXiv dataset = load_dataset("CShorten/ML-ArXiv-Papers")["train"] docs = dataset["abstract"][:5_000] # We define a number of topics that we know are in the documents zeroshot_topic_list = ["Clustering", "Topic Modeling", "Large Language Models"] # We fit our model using the zero-shot topics # and we define a minimum similarity. 
For each document, # if the similarity does not exceed that value, it will be used # for clustering instead. topic_model = BERTopic( embedding_model="thenlper/gte-small", min_topic_size=15, zeroshot_topic_list=zeroshot_topic_list, zeroshot_min_similarity=.85, representation_model=KeyBERTInspired() ) topics, _ = topic_model.fit_transform(docs) ``` When we run `topic_model.get_topic_info()` you will see something like this: ![zeroshot_output](https://github.com/MaartenGr/BERTopic/assets/25746895/1801e0a9-cda7-4d74-929f-e975fa67404b)

Seed (Domain-specific) Words

When performing Topic Modeling, you are often faced with data that you are familiar with to a certain extent or that speaks a very specific language. In those cases, topic modeling techniques might have difficulties capturing and representing the semantic nature of domain specific abbreviations, slang, short form, acronyms, etc. For example, the *"TNM"* classification is a method for identifying the stage of most cancers. The word *"TNM"* is an abbreviation and might not be correctly captured in generic embedding models. To make sure that certain domain specific words are weighted higher and are more often used in topic representations, you can set any number of `seed_words` in the `bertopic.vectorizers.ClassTfidfTransformer`. To do so, let's take a look at an example. We have a dataset of article abstracts and want to perform some topic modeling. Since we might be familiar with the data, there are certain words that we know should be generally important. Let's assume that we have in-depth knowledge about reinforcement learning and know that words like "agent" and "robot" should be important in such a topic were it to be found. Using the `ClassTfidfTransformer`, we can define those `seed_words` and also choose by how much their values are multiplied. The full example is then as follows: ```python from umap import UMAP from datasets import load_dataset from bertopic import BERTopic from bertopic.vectorizers import ClassTfidfTransformer # Let's take a subset of ArXiv abstracts as the training data dataset = load_dataset("CShorten/ML-ArXiv-Papers")["train"] abstracts = dataset["abstract"][:5_000] # For illustration purposes, we make sure the output is fixed when running this code multiple times umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42) # We can choose any number of seed words for which we want their representation # to be strengthen.
We increase the importance of these words as we want them to be more # likely to end up in the topic representations. ctfidf_model = ClassTfidfTransformer( seed_words=["agent", "robot", "behavior", "policies", "environment"], seed_multiplier=2 ) # We run the topic model with the seeded words topic_model = BERTopic( umap_model=umap_model, min_topic_size=15, ctfidf_model=ctfidf_model, ).fit(abstracts) ```

Truncate Documents in LLMs

When using LLMs with BERTopic, we can truncate the input documents in `[DOCUMENTS]` in order to reduce the number of tokens that we have in our input prompt. To do so, all text generation modules have two parameters that we can tweak: * `doc_length` - The maximum length of each document. If a document is longer, it will be truncated. If None, the entire document is passed. * `tokenizer` - The tokenizer used to split the document into segments, which are used to count the length of a document. * Options include `'char'`, `'whitespace'`, `'vectorizer'`, and a callable This means that the definition of `doc_length` changes depending on what constitutes a token in the `tokenizer` parameter. If a token is a character, then `doc_length` refers to max length in characters. If a token is a word, then `doc_length` refers to the max length in words. Let's illustrate this with an example. In the code below, we will use [`tiktoken`](https://github.com/openai/tiktoken) to count the number of tokens in each document and limit them to 100 tokens. All documents that have more than 100 tokens will be truncated. We use `bertopic.representation.OpenAI` to represent our topics with nicely written labels. We specify that documents that we put in the prompt cannot exceed 100 tokens each. Since we will put 4 documents in the prompt, they will total roughly 400 tokens: ```python import openai import tiktoken from bertopic.representation import OpenAI from bertopic import BERTopic # Tokenizer tokenizer= tiktoken.encoding_for_model("gpt-3.5-turbo") # Create your representation model client = openai.OpenAI(api_key="sk-...") representation_model = OpenAI( client, model="gpt-3.5-turbo", delay_in_seconds=2, chat=True, nr_docs=4, doc_length=100, tokenizer=tokenizer ) # Use the representation model in BERTopic on top of the default pipeline topic_model = BERTopic(representation_model=representation_model) ``` ## **Version 0.15.0** *Release date: 29 May, 2023*

Highlights:

* [**Multimodal**](https://maartengr.github.io/BERTopic/getting_started/multimodal/multimodal.html) Topic Modeling * Train your topic modeling on text, images, or images and text! * Use the `bertopic.backend.MultiModalBackend` to embed images, text, both or even caption images! * [**Multi-Aspect**](https://maartengr.github.io/BERTopic/getting_started/multiaspect/multiaspect.html) Topic Modeling * Create multiple topic representations simultaneously * Improved [**Serialization**](https://maartengr.github.io/BERTopic/getting_started/serialization/serialization.html) options * Push your model to the HuggingFace Hub with `.push_to_hf_hub` * Safer, smaller and more flexible serialization options with `safetensors` * Thanks to a great collaboration with HuggingFace and the authors of [BERTransfer](https://github.com/opinionscience/BERTransfer)! * Added new embedding models * OpenAI: `bertopic.backend.OpenAIBackend` * Cohere: `bertopic.backend.CohereBackend` * Added example of [summarizing topics](https://maartengr.github.io/BERTopic/getting_started/representation/representation.html#summarization) with OpenAI's GPT-models * Added `nr_docs` and `diversity` parameters to OpenAI and Cohere representation models * Use `custom_labels="Aspect1"` to use the aspect labels for visualizations instead * Added cuML support for probability calculation in `.transform` * Updated **topic embeddings** * Centroids by default and c-TF-IDF weighted embeddings for `partial_fit` and `.update_topics` * Added `exponential_backoff` parameter to `OpenAI` model

Fixes:

* Fixed custom prompt not working in `TextGeneration` * Fixed [#1142](https://github.com/MaartenGr/BERTopic/pull/1142) * Add additional logic to handle cupy arrays by [@metasyn](https://github.com/metasyn) in [#1179](https://github.com/MaartenGr/BERTopic/pull/1179) * Fix hierarchy viz and handle any form of distance matrix by [@elashrry](https://github.com/elashrry) in [#1173](https://github.com/MaartenGr/BERTopic/pull/1173) * Updated languages list by [@sam9111](https://github.com/sam9111) in [#1099](https://github.com/MaartenGr/BERTopic/pull/1099) * Added level_scale argument to visualize_hierarchical_documents by [@zilch42](https://github.com/zilch42) in [#1106](https://github.com/MaartenGr/BERTopic/pull/1106) * Fix inconsistent naming by [@rolanderdei](https://github.com/rolanderdei) in [#1073](https://github.com/MaartenGr/BERTopic/pull/1073)

Multimodal Topic Modeling

With v0.15, we can now perform multimodal topic modeling in BERTopic! The most basic example of multimodal topic modeling in BERTopic is when you have images that accompany your documents. This means that it is expected that each document has an image and vice versa. Instagram pictures, for example, almost always have some descriptions to them.
![Image title](getting_started/multimodal/images_and_text.svg)
In this example, we are going to use images from `flickr` that each have a caption associated to it: ```python # NOTE: This requires the `datasets` package which you can # install with `pip install datasets` from datasets import load_dataset ds = load_dataset("maderix/flickr_bw_rgb") images = ds["train"]["image"] docs = ds["train"]["caption"] ``` The `docs` variable contains the captions for each image in `images`. We can now use these variables to run our multimodal example: ```python from bertopic import BERTopic from bertopic.representation import VisualRepresentation # Additional ways of representing a topic visual_model = VisualRepresentation() # Make sure to add the `visual_model` to a dictionary representation_model = { "Visual_Aspect": visual_model, } topic_model = BERTopic(representation_model=representation_model, verbose=True) ``` We can now access our image representations for each topic with `topic_model.topic_aspects_["Visual_Aspect"]`. If you want an overview of the topic images together with their textual representations in jupyter, you can run the following: ```python import base64 from io import BytesIO from IPython.display import HTML def image_base64(im): if isinstance(im, str): im = get_thumbnail(im) with BytesIO() as buffer: im.save(buffer, 'jpeg') return base64.b64encode(buffer.getvalue()).decode() def image_formatter(im): return f'<img src="data:image/jpeg;base64,{image_base64(im)}">' # Extract dataframe df = topic_model.get_topic_info().drop("Representative_Docs", 1).drop("Name", 1) # Visualize the images HTML(df.to_html(formatters={'Visual_Aspect': image_formatter}, escape=False)) ``` ![images_and_text](https://github.com/MaartenGr/BERTopic/assets/25746895/3a741e2b-5810-4865-9664-0c6bb24ca3f9)

Multi-aspect Topic Modeling

In this new release, we introduce `multi-aspect topic modeling`! During the `.fit` or `.fit_transform` stages, you can now get multiple representations of a single topic. In practice, it works by generating and storing all kinds of different topic representations (see image below).
![Image title](getting_started/multiaspect/multiaspect.svg)
The approach is rather straightforward. We might want to represent our topics using a `PartOfSpeech` representation model but we might also want to try out `KeyBERTInspired` and compare those representation models. We can do this as follows: ```python from bertopic.representation import KeyBERTInspired from bertopic.representation import PartOfSpeech from bertopic.representation import MaximalMarginalRelevance from sklearn.datasets import fetch_20newsgroups # Documents to train on docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))['data'] # The main representation of a topic main_representation = KeyBERTInspired() # Additional ways of representing a topic aspect_model1 = PartOfSpeech("en_core_web_sm") aspect_model2 = [KeyBERTInspired(top_n_words=30), MaximalMarginalRelevance(diversity=.5)] # Add all models together to be run in a single `fit` representation_model = { "Main": main_representation, "Aspect1": aspect_model1, "Aspect2": aspect_model2 } topic_model = BERTopic(representation_model=representation_model).fit(docs) ``` As shown above, to perform multi-aspect topic modeling, we make sure that `representation_model` is a dictionary where each representation model pipeline is defined. The main pipeline, that is used in most visualization options, is defined with the `"Main"` key. All other aspects can be defined however you want. In the example above, the two additional aspects that we are interested in are defined as `"Aspect1"` and `"Aspect2"`. After we have fitted our model, we can access all representations with `topic_model.get_topic_info()`:
As you can see, there are a number of different representations for our topics that we can inspect. All aspects are found in `topic_model.topic_aspects_`.

Serialization

Saving, loading, and sharing a BERTopic model can be done in several ways. With this new release, it is now advised to go with `.safetensors` as that allows for a small, safe, and fast method for saving your BERTopic model. However, other formats, such as `.pickle` and pytorch `.bin` are also possible. The methods are used as follows: ```python topic_model = BERTopic().fit(my_docs) # Method 1 - safetensors embedding_model = "sentence-transformers/all-MiniLM-L6-v2" topic_model.save("path/to/my/model_dir", serialization="safetensors", save_ctfidf=True, save_embedding_model=embedding_model) # Method 2 - pytorch embedding_model = "sentence-transformers/all-MiniLM-L6-v2" topic_model.save("path/to/my/model_dir", serialization="pytorch", save_ctfidf=True, save_embedding_model=embedding_model) # Method 3 - pickle topic_model.save("my_model", serialization="pickle") ``` Saving the topic modeling with `.safetensors` or `pytorch` has a number of advantages: * `.safetensors` is a relatively **safe format** * The resulting model can be **very small** (often < 20MB) since no sub-models need to be saved * Although version control is important, there is a bit more **flexibility** with respect to specific versions of packages * More easily used in **production** * **Share** models with the HuggingFace Hub



The above image, a model trained on 100,000 documents, demonstrates the differences in sizes comparing `safetensors`, `pytorch`, and `pickle`. The difference in sizes can mostly be explained due to the efficient saving procedure and that the clustering and dimensionality reductions are not saved in safetensors/pytorch since inference can be done based on the topic embeddings.

HuggingFace Hub

When you have created a BERTopic model, you can easily share it with others through the HuggingFace Hub. First, you need to log in to your HuggingFace account: ```python from huggingface_hub import login login() ``` When you have logged in to your HuggingFace account, you can save and upload the model as follows: ```python from bertopic import BERTopic # Train model topic_model = BERTopic().fit(my_docs) # Push to HuggingFace Hub topic_model.push_to_hf_hub( repo_id="MaartenGr/BERTopic_ArXiv", save_ctfidf=True ) # Load from HuggingFace loaded_model = BERTopic.load("MaartenGr/BERTopic_ArXiv") ``` ## **Version 0.14.1** *Release date: 2 March, 2023*

Highlights:

* Use [**ChatGPT**](https://maartengr.github.io/BERTopic/getting_started/representation/representation.html#chatgpt) to create topic representations!: * Added `delay_in_seconds` parameter to OpenAI and Cohere representation models for throttling the API * Setting this between 5 and 10 allows for trial users now to use more easily without hitting RateLimitErrors * Fixed missing `title` param to visualization methods * Fixed probabilities not correctly aligning ([#1024](https://github.com/MaartenGr/BERTopic/issues/1024)) * Fix typo in textgenerator [@dkopljar27](https://github.com/dkopljar27) in [#1002](https://github.com/MaartenGr/BERTopic/pull/1002)

ChatGPT

Within OpenAI's API, the ChatGPT models use a different API structure compared to the GPT-3 models. In order to use ChatGPT with BERTopic, we need to define the model and make sure to set `chat=True`: ```python import openai from bertopic import BERTopic from bertopic.representation import OpenAI # Create your representation model openai.api_key = MY_API_KEY representation_model = OpenAI(model="gpt-3.5-turbo", delay_in_seconds=10, chat=True) # Use the representation model in BERTopic on top of the default pipeline topic_model = BERTopic(representation_model=representation_model) ``` Prompting with ChatGPT is very satisfying and can be customized in BERTopic by using certain tags. There are currently two tags, namely `"[KEYWORDS]"` and `"[DOCUMENTS]"`. These tags indicate where in the prompt they are to be replaced with a topics keywords and top 4 most representative documents respectively. For example, if we have the following prompt: ```python prompt = """ I have topic that contains the following documents: \n[DOCUMENTS] The topic is described by the following keywords: [KEYWORDS] Based on the information above, extract a short topic label in the following format: topic: """ ``` then that will be rendered as follows and passed to OpenAI's API: ```python """ I have a topic that contains the following documents: - Our videos are also made possible by your support on patreon.co. - If you want to help us make more videos, you can do so on patreon.com or get one of our posters from our shop. - If you want to help us make more videos, you can do so there. - And if you want to support us in our endeavor to survive in the world of online video, and make more videos, you can do so on patreon.com. The topic is described by the following keywords: videos video you our support want this us channel patreon make on we if facebook to patreoncom can for and more watch Based on the information above, extract a short topic label in the following format: topic: """ ``` !!! 
note Whenever you create a custom prompt, it is important to add ``` Based on the information above, extract a short topic label in the following format: topic: ``` at the end of your prompt as BERTopic extracts everything that comes after `topic: `. Having said that, if `topic: ` is not in the output, then it will simply extract the entire response, so feel free to experiment with the prompts. ## **Version 0.14.0** *Release date: 14 February, 2023*

Highlights:

* Fine-tune [topic representations](https://maartengr.github.io/BERTopic/getting_started/representation/representation.html) with `bertopic.representation` * Diverse range of models, including KeyBERT, MMR, POS, Transformers, OpenAI, and more! * Create your own prompts for text generation models, like GPT3: * Use `"[KEYWORDS]"` and `"[DOCUMENTS]"` in the prompt to decide where the keywords and set of representative documents need to be inserted. * Chain models to perform fine-grained fine-tuning * Create and customize your representation model * Improved the topic reduction technique when using `nr_topics=int` * Added `title` parameters for all graphs ([#800](https://github.com/MaartenGr/BERTopic/issues/800))

Fixes:

* Improve documentation ([#837](https://github.com/MaartenGr/BERTopic/issues/837), [#769](https://github.com/MaartenGr/BERTopic/issues/769), [#954](https://github.com/MaartenGr/BERTopic/issues/954), [#912](https://github.com/MaartenGr/BERTopic/issues/912), [#911](https://github.com/MaartenGr/BERTopic/issues/911)) * Bump pyyaml ([#903](https://github.com/MaartenGr/BERTopic/issues/903)) * Fix large number of representative docs ([#965](https://github.com/MaartenGr/BERTopic/issues/965)) * Prevent stochastic behavior in `.visualize_topics` ([#952](https://github.com/MaartenGr/BERTopic/issues/952)) * Add custom labels parameter to `.visualize_topics` ([#976](https://github.com/MaartenGr/BERTopic/issues/976)) * Fix cuML HDBSCAN type checks by [@FelSiq](https://github.com/FelSiq) in [#981](https://github.com/MaartenGr/BERTopic/pull/981)

API Changes:

* The `diversity` parameter was removed in favor of `bertopic.representation.MaximalMarginalRelevance` * The `representation_model` parameter was added to `bertopic.BERTopic`

Representation Models

Fine-tune the c-TF-IDF representation with a variety of models. Whether that is through a KeyBERT-Inspired model or GPT-3, the choice is up to you!

KeyBERTInspired

The algorithm follows some principles of [KeyBERT](https://github.com/MaartenGr/KeyBERT) but does some optimization in order to speed up inference. Usage is straightforward: ![keybertinspired](https://user-images.githubusercontent.com/25746895/216336376-d2c4e5d6-6cf7-435c-904c-fc195aae7dcd.svg) ```python from bertopic.representation import KeyBERTInspired from bertopic import BERTopic # Create your representation model representation_model = KeyBERTInspired() # Use the representation model in BERTopic on top of the default pipeline topic_model = BERTopic(representation_model=representation_model) ``` ![keybert](https://user-images.githubusercontent.com/25746895/218417161-bfd5980e-43c7-498a-904a-b6018ba58d45.svg)

PartOfSpeech

Our candidate topics, as extracted with c-TF-IDF, do not take into account a keyword's part of speech as extracting noun-phrases from all documents can be computationally quite expensive. Instead, we can leverage c-TF-IDF to perform part of speech on a subset of keywords and documents that best represent a topic. ![partofspeech](https://user-images.githubusercontent.com/25746895/216336534-48ff400e-72e1-4c50-9030-414576bac01e.svg) ```python from bertopic.representation import PartOfSpeech from bertopic import BERTopic # Create your representation model representation_model = PartOfSpeech("en_core_web_sm") # Use the representation model in BERTopic on top of the default pipeline topic_model = BERTopic(representation_model=representation_model) ``` ![pos](https://user-images.githubusercontent.com/25746895/218417198-41c19b5c-251f-43c1-bfe2-0a480731565a.svg)

MaximalMarginalRelevance

When we calculate the weights of keywords, we typically do not consider whether we already have similar keywords in our topic. Words like "car" and "cars" essentially represent the same information and are often redundant. We can use `MaximalMarginalRelevance` to improve diversity of our candidate topics: ![mmr](https://user-images.githubusercontent.com/25746895/216336697-558f1409-8da3-4076-a21b-d87eec583ac7.svg) ```python from bertopic.representation import MaximalMarginalRelevance from bertopic import BERTopic # Create your representation model representation_model = MaximalMarginalRelevance(diversity=0.3) # Use the representation model in BERTopic on top of the default pipeline topic_model = BERTopic(representation_model=representation_model) ``` ![mmr (1)](https://user-images.githubusercontent.com/25746895/218417234-88b145e2-7293-43c0-888c-36abe469a48a.svg)

Zero-Shot Classification

To perform zero-shot classification, we feed the model with the keywords as generated through c-TF-IDF and a set of candidate labels. If, for a certain topic, we find a similar enough label, then it is assigned. If not, then we keep the original c-TF-IDF keywords. We use it in BERTopic as follows: ```python from bertopic.representation import ZeroShotClassification from bertopic import BERTopic # Create your representation model candidate_topics = ["space and nasa", "bicycles", "sports"] representation_model = ZeroShotClassification(candidate_topics, model="facebook/bart-large-mnli") # Use the representation model in BERTopic on top of the default pipeline topic_model = BERTopic(representation_model=representation_model) ``` ![zero](https://user-images.githubusercontent.com/25746895/218417276-dcef3519-acba-4792-8601-45dc7ed39488.svg)

Text Generation: 🤗 Transformers

Nearly every week, there are new and improved models released on the 🤗 [Model Hub](https://huggingface.co/models) that, with some creativity, allow for further fine-tuning of our c-TF-IDF based topics. These models range from text generation to zero-shot classification. In BERTopic, wrappers around these methods are created as a way to support whatever might be released in the future. Using a GPT-like model from the huggingface hub is rather straightforward: ```python from bertopic.representation import TextGeneration from bertopic import BERTopic # Create your representation model representation_model = TextGeneration('gpt2') # Use the representation model in BERTopic on top of the default pipeline topic_model = BERTopic(representation_model=representation_model) ``` ![hf](https://user-images.githubusercontent.com/25746895/218417310-2b0eabc7-296d-499d-888b-0ab48a65a2fb.svg)

Text Generation: Cohere

Instead of using a language model from 🤗 transformers, we can use external APIs instead that do the work for you. Here, we can use [Cohere](https://docs.cohere.ai/) to extract our topic labels from the candidate documents and keywords. To use this, you will need to install cohere first: ```bash pip install cohere ``` Then, get yourself an API key and use Cohere's API as follows: ```python import cohere from bertopic.representation import Cohere from bertopic import BERTopic # Create your representation model co = cohere.Client(my_api_key) representation_model = Cohere(co) # Use the representation model in BERTopic on top of the default pipeline topic_model = BERTopic(representation_model=representation_model) ``` ![cohere](https://user-images.githubusercontent.com/25746895/218417337-294cb52a-93c9-4fd5-b981-29b40e4f0c1e.svg)

Text Generation: OpenAI

Instead of using a language model from 🤗 transformers, we can use external APIs instead that do the work for you. Here, we can use [OpenAI](https://openai.com/api/) to extract our topic labels from the candidate documents and keywords. To use this, you will need to install openai first: ``` pip install openai ``` Then, get yourself an API key and use OpenAI's API as follows: ```python import openai from bertopic.representation import OpenAI from bertopic import BERTopic # Create your representation model openai.api_key = MY_API_KEY representation_model = OpenAI() # Use the representation model in BERTopic on top of the default pipeline topic_model = BERTopic(representation_model=representation_model) ``` ![openai](https://user-images.githubusercontent.com/25746895/218417357-cf8c0fab-4450-43d3-b4fd-219ed276d870.svg)

Text Generation: LangChain

[Langchain](https://github.com/hwchase17/langchain) is a package that helps users with chaining large language models. In BERTopic, we can leverage this package in order to more efficiently combine external knowledge. Here, this external knowledge is the most representative documents in each topic. To use langchain, you will need to install the langchain package first. Additionally, you will need an underlying LLM to support langchain, like openai: ```bash pip install langchain openai ``` Then, you can create your chain as follows: ```python from langchain.chains.question_answering import load_qa_chain from langchain.llms import OpenAI chain = load_qa_chain(OpenAI(temperature=0, openai_api_key=MY_API_KEY), chain_type="stuff") ``` Finally, you can pass the chain to BERTopic as follows: ```python from bertopic.representation import LangChain # Create your representation model representation_model = LangChain(chain) # Use the representation model in BERTopic on top of the default pipeline topic_model = BERTopic(representation_model=representation_model) ``` ## **Version 0.13.0** *Release date: 4 January, 2023*

Highlights:

* Calculate [topic distributions](https://maartengr.github.io/BERTopic/getting_started/distribution/distribution.html) with `.approximate_distribution` regardless of the cluster model used * Generates topic distributions on a document- and token-levels * Can be used for any document regardless of its size! * [Fully supervised BERTopic](https://maartengr.github.io/BERTopic/getting_started/supervised/supervised.html) * You can now use a classification model for the clustering step instead to create a fully supervised topic model * [Manual topic modeling](https://maartengr.github.io/BERTopic/getting_started/manual/manual.html) * Generate topic representations from labels directly * Allows for skipping the embedding and clustering steps in order to go directly to the topic representation step * [Reduce outliers](https://maartengr.github.io/BERTopic/getting_started/outlier_reduction/outlier_reduction.html) with 4 different strategies using `.reduce_outliers` * Install BERTopic without `SentenceTransformers` for a [lightweight package](https://maartengr.github.io/BERTopic/getting_started/tips_and_tricks/tips_and_tricks.html#lightweight-installation): * `pip install --no-deps bertopic` * `pip install --upgrade numpy hdbscan umap-learn pandas scikit-learn tqdm plotly pyyaml` * Get meta data of trained documents such as topics and probabilities using `.get_document_info(docs)` * Added more support for cuML's HDBSCAN * Calculate and predict probabilities during `fit_transform` and `transform` respectively * This should give a major speed-up when setting `calculate_probabilities=True` * More images to the documentation and a lot of changes/updates/clarifications * Get representative documents for non-HDBSCAN models by comparing document and topic c-TF-IDF representations * Sklearn Pipeline [Embedder](https://maartengr.github.io/BERTopic/getting_started/embeddings/embeddings.html#scikit-learn-embeddings) by [@koaning](https://github.com/koaning) in 
[#791](https://github.com/MaartenGr/BERTopic/pull/791)

Fixes:

* Improve `.partial_fit` documentation ([#837](https://github.com/MaartenGr/BERTopic/issues/837)) * Fixed scipy linkage usage ([#807](https://github.com/MaartenGr/BERTopic/issues/807)) * Fixed shifted heatmap ([#782](https://github.com/MaartenGr/BERTopic/issues/782)) * Fixed SpaCy backend ([#744](https://github.com/MaartenGr/BERTopic/issues/744)) * Fixed representative docs with small clusters (<3) ([#703](https://github.com/MaartenGr/BERTopic/issues/703)) * Typo fixed by [@timpal0l](https://github.com/timpal0l) in [#734](https://github.com/MaartenGr/BERTopic/pull/734) * Typo fixed by [@srulikbd](https://github.com/srulikbd) in [#842](https://github.com/MaartenGr/BERTopic/pull/842) * Correcting iframe urls by [@Mustapha-AJEGHRIR](https://github.com/Mustapha-AJEGHRIR) in [#798](https://github.com/MaartenGr/BERTopic/pull/798) * Refactor embedding methods by [@zachschillaci27](https://github.com/zachschillaci27) in [#855](https://github.com/MaartenGr/BERTopic/pull/855) * Added diversity parameter to update_topics() function by [@anubhabdaserrr](https://github.com/anubhabdaserrr) in [#887](https://github.com/MaartenGr/BERTopic/pull/887)

Documentation

Personally, I believe that documentation can be seen as a feature and is an often underestimated aspect of open-source. So I went a bit overboard😅... and created an animation about the three pillars of BERTopic using Manim. There are many other visualizations added, one of each variation of BERTopic, and many smaller changes.

Topic Distributions

The difficulty with a cluster-based topic modeling technique is that it does not directly consider that documents may contain multiple topics. With the new release, we can now model the distributions of topics! We even consider that a single word might be related to multiple topics. If a document is a mixture of topics, what is preventing a single word from being the same? To do so, we approximate the distribution of topics in a document by calculating and summing the similarities of tokensets (achieved by applying a sliding window) with the topics: ```python # After fitting your model run the following for either your trained documents or even unseen documents topic_distr, _ = topic_model.approximate_distribution(docs) ``` To calculate and visualize the topic distributions in a document on a token-level, we can run the following: ```python # We need to calculate the topic distributions on a token level topic_distr, topic_token_distr = topic_model.approximate_distribution(docs, calculate_tokens=True) # Create a visualization using a styled dataframe if Jinja2 is installed df = topic_model.visualize_approximate_distribution(docs[0], topic_token_distr[0]); df ```

Supervised Topic Modeling

BERTopic now supports fully-supervised classification! Instead of using a clustering algorithm, like HDBSCAN, we can replace it with a classifier, like Logistic Regression: ```python from bertopic import BERTopic from bertopic.dimensionality import BaseDimensionalityReduction from sklearn.datasets import fetch_20newsgroups from sklearn.linear_model import LogisticRegression # Get labeled data data= fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes')) docs = data['data'] y = data['target'] # Allows us to skip over the dimensionality reduction step empty_dimensionality_model = BaseDimensionalityReduction() # Create a classifier to be used instead of the cluster model clf= LogisticRegression() # Create a fully supervised BERTopic instance topic_model= BERTopic( umap_model=empty_dimensionality_model, hdbscan_model=clf ) topics, probs = topic_model.fit_transform(docs, y=y) ```

Manual Topic Modeling

When you already have a bunch of labels and simply want to extract topic representations from them, you might not need to actually learn how those can be predicted. We can bypass the `embeddings -> dimensionality reduction -> clustering` steps and go straight to the c-TF-IDF representation of our labels: ```python from bertopic import BERTopic from bertopic.backend import BaseEmbedder from bertopic.cluster import BaseCluster from bertopic.dimensionality import BaseDimensionalityReduction # Prepare our empty sub-models and reduce frequent words while we are at it. empty_embedding_model = BaseEmbedder() empty_dimensionality_model = BaseDimensionalityReduction() empty_cluster_model = BaseCluster() # Fit BERTopic without actually performing any clustering topic_model= BERTopic( embedding_model=empty_embedding_model, umap_model=empty_dimensionality_model, hdbscan_model=empty_cluster_model, ) topics, probs = topic_model.fit_transform(docs, y=y) ```

Outlier Reduction

Outlier reduction is a frequently-discussed topic in BERTopic as its default cluster model, HDBSCAN, has a tendency to generate many outliers. This often helps in the topic representation steps, as we do not consider documents that are less relevant, but you might want to still assign those outliers to actual topics. In the modular philosophy of BERTopic, keeping training times in mind, it is now possible to perform outlier reduction **after** having trained your topic model. This allows for ease of iteration and prevents having to train BERTopic many times to find the parameters you are searching for. There are 4 different strategies that you can use, so make sure to check out the [documentation](https://maartengr.github.io/BERTopic/getting_started/outlier_reduction/outlier_reduction.html)! Using it is rather straightforward: ```python new_topics = topic_model.reduce_outliers(docs, topics) ```

Lightweight BERTopic

The default embedding model in BERTopic is one of the amazing sentence-transformers models, namely `"all-MiniLM-L6-v2"`. Although this model performs well out of the box, it typically needs a GPU to transform the documents into embeddings in a reasonable time. Moreover, the installation requires `pytorch` which often results in a rather large environment, memory-wise. Fortunately, it is possible to install BERTopic without `sentence-transformers` and use it as a lightweight solution instead. The installation can be done as follows: ```bash pip install --no-deps bertopic pip install --upgrade numpy hdbscan umap-learn pandas scikit-learn tqdm plotly pyyaml ``` Then, we can use BERTopic without `sentence-transformers` as follows using a CPU-based embedding technique: ```python from sklearn.pipeline import make_pipeline from sklearn.decomposition import TruncatedSVD from sklearn.feature_extraction.text import TfidfVectorizer pipe = make_pipeline( TfidfVectorizer(), TruncatedSVD(100) ) topic_model = BERTopic(embedding_model=pipe) ``` As a result, the entire package and resulting model can be run quickly on the CPU and no GPU is necessary!

Document Information

Get information about the documents on which the topic was trained including the documents themselves, their respective topics, the name of each topic, the top n words of each topic, whether it is a representative document, and the probability of the clustering if the cluster model supports it. There are also options to include other metadata, such as the topic distributions or the x and y coordinates of the reduced embeddings that you can learn more about here. To get the document info, you will only need to pass the documents on which the topic model was trained: ```python >>> topic_model.get_document_info(docs) Document Topic Name Top_n_words Probability ... I am sure some bashers of Pens... 0 0_game_team_games_season game - team - games... 0.200010 ... My brother is in the market for... -1 -1_can_your_will_any can - your - will... 0.420668 ... Finally you said what you dream... -1 -1_can_your_will_any can - your - will... 0.807259 ... Think! It is the SCSI card doing... 49 49_windows_drive_dos_file windows - drive - docs... 0.071746 ... 1) I have an old Jasmine drive... 49 49_windows_drive_dos_file windows - drive - docs... 0.038983 ... 
``` ## **Version 0.12.0** *Release date: 5 September, 2022* **Highlights**: * Perform [online/incremental topic modeling](https://maartengr.github.io/BERTopic/getting_started/online/online.html) with `.partial_fit` * Expose [c-TF-IDF model](https://maartengr.github.io/BERTopic/getting_started/ctfidf/ctfidf.html) for customization with `bertopic.vectorizers.ClassTfidfTransformer` * The parameters `bm25_weighting` and `reduce_frequent_words` were added to potentially improve representations: * Expose attributes for easier access to internal data * Major changes to the [Algorithm](https://maartengr.github.io/BERTopic/algorithm/algorithm.html) page of the documentation, which now contains three overviews of the algorithm: * [Visualize Overview](https://maartengr.github.io/BERTopic/algorithm/algorithm.html#visual-overview) * [Code Overview](https://maartengr.github.io/BERTopic/algorithm/algorithm.html#code-overview) * [Detailed Overview](https://maartengr.github.io/BERTopic/algorithm/algorithm.html#detailed-overview) * Added an [example](https://maartengr.github.io/BERTopic/getting_started/tips_and_tricks/tips_and_tricks.html#keybert-bertopic) of combining BERTopic with KeyBERT * Added many tests with the intention of making development a bit more stable **Fixes**: * Fixed iteratively merging topics ([#632](https://github.com/MaartenGr/BERTopic/issues/632) and ([#648](https://github.com/MaartenGr/BERTopic/issues/648)) * Fixed 0th topic not showing up in visualizations ([#667](https://github.com/MaartenGr/BERTopic/issues/667)) * Fixed lowercasing not being optional ([#682](https://github.com/MaartenGr/BERTopic/issues/682)) * Fixed spelling ([#664](https://github.com/MaartenGr/BERTopic/issues/664) and ([#673](https://github.com/MaartenGr/BERTopic/issues/673)) * Fixed 0th topic not shown in `.get_topic_info` by [@oxymor0n](https://github.com/oxymor0n) in [#660](https://github.com/MaartenGr/BERTopic/pull/660) * Fixed spelling by 
[@domenicrosati](https://github.com/domenicrosati) in [#674](https://github.com/MaartenGr/BERTopic/pull/674) * Add custom labels and title options to barchart [@leloykun](https://github.com/leloykun) in [#694](https://github.com/MaartenGr/BERTopic/pull/694) **Online/incremental topic modeling**: Online topic modeling (sometimes called "incremental topic modeling") is the ability to learn incrementally from a mini-batch of instances. Essentially, it is a way to update your topic model with data on which it was not trained on before. In Scikit-Learn, this technique is often modeled through a `.partial_fit` function, which is also used in BERTopic. At a minimum, the cluster model needs to support a `.partial_fit` function in order to use this feature. The default HDBSCAN model will not work as it does not support online updating. ```python from sklearn.datasets import fetch_20newsgroups from sklearn.cluster import MiniBatchKMeans from sklearn.decomposition import IncrementalPCA from bertopic.vectorizers import OnlineCountVectorizer from bertopic import BERTopic # Prepare documents all_docs = fetch_20newsgroups(subset=subset, remove=('headers', 'footers', 'quotes'))["data"] doc_chunks = [all_docs[i:i+1000] for i in range(0, len(all_docs), 1000)] # Prepare sub-models that support online learning umap_model = IncrementalPCA(n_components=5) cluster_model = MiniBatchKMeans(n_clusters=50, random_state=0) vectorizer_model = OnlineCountVectorizer(stop_words="english", decay=.01) topic_model = BERTopic(umap_model=umap_model, hdbscan_model=cluster_model, vectorizer_model=vectorizer_model) # Incrementally fit the topic model by training on 1000 documents at a time for docs in doc_chunks: topic_model.partial_fit(docs) ``` Only the topics for the most recent batch of documents are tracked. 
If you want to be using online topic modeling, not for a streaming setting but merely for low-memory use cases, then it is advised to also update the `.topics_` attribute as variations such as hierarchical topic modeling will not work afterward: ```python # Incrementally fit the topic model by training on 1000 documents at a time and track the topics in each iteration topics = [] for docs in doc_chunks: topic_model.partial_fit(docs) topics.extend(topic_model.topics_) topic_model.topics_ = topics ``` **c-TF-IDF**: Explicitly define, use, and adjust the `ClassTfidfTransformer` with new parameters, `bm25_weighting` and `reduce_frequent_words`, to potentially improve the topic representation: ```python from bertopic import BERTopic from bertopic.vectorizers import ClassTfidfTransformer ctfidf_model = ClassTfidfTransformer(bm25_weighting=True) topic_model = BERTopic(ctfidf_model=ctfidf_model) ``` **Attributes**: After having fitted your BERTopic instance, you can use the following attributes to have quick access to certain information, such as the topic assignment for each document in `topic_model.topics_`. | Attribute | Type | Description | |--------------------|----|---------------------------------------------------------------------------------------------| | topics_ | List[int] | The topics that are generated for each document after training or updating the topic model. The most recent topics are tracked. | | probabilities_ | List[float] | The probability of the assigned topic per document. These are only calculated if a HDBSCAN model is used for the clustering step. When `calculate_probabilities=True`, then it is the probabilities of all topics per document. | | topic_sizes_ | Mapping[int, int] | The size of each topic. | | topic_mapper_ | TopicMapper | A class for tracking topics and their mappings anytime they are merged, reduced, added, or removed. 
| | topic_representations_ | Mapping[int, Tuple[int, float]] | The top *n* terms per topic and their respective c-TF-IDF values. | | c_tf_idf_ | csr_matrix | The topic-term matrix as calculated through c-TF-IDF. To access its respective words, run `.vectorizer_model.get_feature_names()` or `.vectorizer_model.get_feature_names_out()` | | topic_labels_ | Mapping[int, str] | The default labels for each topic. | | custom_labels_ | List[str] | Custom labels for each topic as generated through `.set_topic_labels`. | | topic_embeddings_ | np.ndarray | The embeddings for each topic. It is calculated by taking the weighted average of word embeddings in a topic based on their c-TF-IDF values. | | representative_docs_ | Mapping[int, str] | The representative documents for each topic if HDBSCAN is used. | ## **Version 0.11.0** *Release date: 11 July, 2022* **Highlights**: * Perform [hierarchical topic modeling](https://maartengr.github.io/BERTopic/getting_started/hierarchicaltopics/hierarchicaltopics.html) with `.hierarchical_topics` ```python hierarchical_topics = topic_model.hierarchical_topics(docs, topics) ``` * Visualize [hierarchical topic representations](https://maartengr.github.io/BERTopic/getting_started/hierarchicaltopics/hierarchicaltopics.html#visualizations) with `.visualize_hierarchy` ```python topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics) ``` * Extract a [text-based hierarchical topic representation](https://maartengr.github.io/BERTopic/getting_started/hierarchicaltopics/hierarchicaltopics.html#visualizations) with `.get_topic_tree` ```python tree = topic_model.get_topic_tree(hierarchical_topics) ``` * Visualize [2D documents](https://maartengr.github.io/BERTopic/getting_started/visualization/visualization.html#visualize-documents) with `.visualize_documents()` ```python # Use input embeddings topic_model.visualize_documents(docs, embeddings=embeddings) # or use 2D reduced embeddings through a method of your own (e.g., PCA, t-SNE, 
UMAP, etc.) reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings) topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings) ``` * Visualize [2D hierarchical documents](https://maartengr.github.io/BERTopic/getting_started/visualization/visualization.html#visualize-hierarchical-documents) with `.visualize_hierarchical_documents()` ```python # Run the visualization with the original embeddings topic_model.visualize_hierarchical_documents(docs, hierarchical_topics, embeddings=embeddings) # Or, if you have reduced the original embeddings already which speed things up quite a bit: reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings) topic_model.visualize_hierarchical_documents(docs, hierarchical_topics, reduced_embeddings=reduced_embeddings) ``` * Create [custom labels](https://maartengr.github.io/BERTopic/getting_started/topicrepresentation/topicrepresentation.html#custom-labels) to the topics throughout most visualizations ```python # Generate topic labels topic_labels = topic_model.generate_topic_labels(nr_words=3, topic_prefix=False, word_length=10, separator=", ") # Set them internally in BERTopic topic_model.set_topic_labels(topics_labels) ``` * Manually [merge topics](https://maartengr.github.io/BERTopic/getting_started/hierarchicaltopics/hierarchicaltopics.html#merge-topics) with `.merge_topics()` ```python # Merge topics 1, 2, and 3 topics_to_merge = [1, 2, 3] topic_model.merge_topics(docs, topics, topics_to_merge) # Merge topics 1 and 2, and separately merge topics 3 and 4 topics_to_merge = [[1, 2], [3, 4]] topic_model.merge_topics(docs, topics, topics_to_merge) ``` * Added example for finding similar topics between two models in the [tips & tricks](https://maartengr.github.io/BERTopic/getting_started/tips_and_tricks/tips_and_tricks.html) page * Add multi-modal example in the [tips & 
tricks](https://maartengr.github.io/BERTopic/getting_started/tips_and_tricks/tips_and_tricks.html) page * Added native [Hugging Face transformers](https://maartengr.github.io/BERTopic/getting_started/embeddings/embeddings.html#hugging-face-transformers) support **Fixes**: * Fix support for k-Means in `.visualize_heatmap` ([#532](https://github.com/MaartenGr/BERTopic/issues/532)) * Fix missing topic 0 in `.visualize_topics` ([#533](https://github.com/MaartenGr/BERTopic/issues/533)) * Fix inconsistencies in `.get_topic_info` ([#572](https://github.com/MaartenGr/BERTopic/issues/572)) and ([#581](https://github.com/MaartenGr/BERTopic/issues/581)) * Add `optimal_ordering` parameter to `.visualize_hierarchy` by [@rafaelvalero](https://github.com/rafaelvalero) in [#390](https://github.com/MaartenGr/BERTopic/pull/390) * Fix RuntimeError when used as sklearn estimator by [@simonfelding](https://github.com/simonfelding) in [#448](https://github.com/MaartenGr/BERTopic/pull/448) * Fix typo in visualization documentation by [@dwhdai](https://github.com/dwhdai) in [#475](https://github.com/MaartenGr/BERTopic/pull/475) * Fix typo in docstrings by [@xwwwwww](https://github.com/xwwwwww) in [#549](https://github.com/MaartenGr/BERTopic/pull/549) * Support higher Flair versions ## **Version 0.10.0** *Release date: 30 April, 2022* **Highlights**: * Use any dimensionality reduction technique instead of UMAP: ```python from bertopic import BERTopic from sklearn.decomposition import PCA dim_model = PCA(n_components=5) topic_model = BERTopic(umap_model=dim_model) ``` * Use any clustering technique instead of HDBSCAN: ```python from bertopic import BERTopic from sklearn.cluster import KMeans cluster_model = KMeans(n_clusters=50) topic_model = BERTopic(hdbscan_model=cluster_model) ``` **Documentation**: * Add a CountVectorizer page with tips and tricks on how to create topic representations that fit your use case * Added pages on how to use other dimensionality reduction and clustering 
algorithms * Additional instructions on how to reduce outliers in the FAQ: ```python import numpy as np probability_threshold = 0.01 new_topics = [np.argmax(prob) if max(prob) >= probability_threshold else -1 for prob in probs] ``` **Fixes**: * Fixed `None` being returned for probabilities when transforming unseen documents * Replaced all instances of `arg:` with `Arguments:` for consistency * Before saving a fitted BERTopic instance, we remove the stopwords in the fitted CountVectorizer model as it can get quite large due to the number of words that end in stopwords if `min_df` is set to a value larger than 1 * Set `"hdbscan>=0.8.28"` to prevent numpy issues * Although this was already fixed by the new release of HDBSCAN, it is technically still possible to install 0.8.27 with BERTopic which leads to these numpy issues * Update gensim dependency to `>=4.0.0` ([#371](https://github.com/MaartenGr/BERTopic/issues/371)) * Fix topic 0 not appearing in visualizations ([#472](https://github.com/MaartenGr/BERTopic/issues/472)) * Fix ([#506](https://github.com/MaartenGr/BERTopic/issues/506)) * Fix ([#429](https://github.com/MaartenGr/BERTopic/issues/429)) * Fix typo in DTM documentation by [@hp0404](https://github.com/hp0404) in [#386](https://github.com/MaartenGr/BERTopic/pull/386) ## **Version 0.9.4** *Release date: 14 December, 2021* A number of fixes, documentation updates, and small features: * Expose diversity parameter * Use `BERTopic(diversity=0.1)` to change how diverse the words in a topic representation are (ranges from 0 to 1) * Improve stability of topic reduction by only computing the cosine similarity within c-TF-IDF and not the topic embeddings * Added property to c-TF-IDF that all IDF values should be positive ([#351](https://github.com/MaartenGr/BERTopic/issues/351)) * Improve stability of `.visualize_barchart()` and `.visualize_hierarchy()` * Major [documentation](https://maartengr.github.io/BERTopic/) overhaul (mkdocs, tutorials, FAQ, images, etc. 
) ([#330](https://github.com/MaartenGr/BERTopic/issues/330)) * Drop python 3.6 ([#333](https://github.com/MaartenGr/BERTopic/issues/333)) * Relax plotly dependency ([#88](https://github.com/MaartenGr/BERTopic/issues/88)) * Additional logging for `.transform` ([#356](https://github.com/MaartenGr/BERTopic/issues/356)) ## **Version 0.9.3** *Release date: 17 October, 2021* * Fix [#282](https://github.com/MaartenGr/BERTopic/issues/282) * As it turns out the old implementation of topic mapping was still found in the `transform` function * Fix [#285](https://github.com/MaartenGr/BERTopic/issues/285) * Fix getting all representative docs * Fix [#288](https://github.com/MaartenGr/BERTopic/issues/288) * A recent issue with the package `pyyaml` that can be found in Google Colab ## **Version 0.9.2** *Release date: 12 October, 2021* A release focused on algorithmic optimization and fixing several issues: **Highlights**: * Update the non-multilingual paraphrase-* models to the all-* models due to improved [performance](https://www.sbert.net/docs/pretrained_models.html) * Reduce necessary RAM in c-TF-IDF top 30 word [extraction](https://stackoverflow.com/questions/49207275/finding-the-top-n-values-in-a-row-of-a-scipy-sparse-matrix) **Fixes**: * Fix topic mapping * When reducing the number of topics, these need to be mapped to the correct input/output which had some issues in the previous version * A new class was created as a way to track these mappings regardless of how many times they were executed * In other words, you can iteratively reduce the number of topics after training the model without the need to continuously train the model * Fix typo in embeddings page ([#200](https://github.com/MaartenGr/BERTopic/issues/200)) * Fix link in README ([#233](https://github.com/MaartenGr/BERTopic/issues/233)) * Fix documentation `.visualize_term_rank()` ([#253](https://github.com/MaartenGr/BERTopic/issues/253)) * Fix getting correct representative docs 
([#258](https://github.com/MaartenGr/BERTopic/issues/258)) * Update [memory FAQ](https://maartengr.github.io/BERTopic/faq.html#i-am-facing-memory-issues-help) with [HDBSCAN pr](https://github.com/MaartenGr/BERTopic/issues/151) ## **Version 0.9.1** *Release date: 1 September, 2021* A release focused on fixing several issues: **Fixes**: * Fix TypeError when auto-reducing topics ([#210](https://github.com/MaartenGr/BERTopic/issues/210)) * Fix mapping representative docs when reducing topics ([#208](https://github.com/MaartenGr/BERTopic/issues/208)) * Fix visualization issues with probabilities ([#205](https://github.com/MaartenGr/BERTopic/issues/205)) * Fix missing `normalize_frequency` param in plots ([#213](https://github.com/MaartenGr/BERTopic/issues/213)) ## **Version 0.9.0** *Release date: 9 August, 2021* **Highlights**: * Implemented a [**Guided BERTopic**](https://maartengr.github.io/BERTopic/getting_started/guided/guided.html) -> Use seeds to steer the Topic Modeling * Get the most representative documents per topic: `topic_model.get_representative_docs(topic=1)` * This allows users to see which documents are good representations of a topic and better understand the topics that were created * Added `normalize_frequency` parameter to `visualize_topics_per_class` and `visualize_topics_over_time` in order to better compare the relative topic frequencies between topics * Return flat probabilities as default, only calculate the probabilities of all topics per document if `calculate_probabilities` is True * Added several FAQs **Fixes**: * Fix loading pre-trained BERTopic model * Fix mapping of probabilities * Fix [#190](https://github.com/MaartenGr/BERTopic/issues/190) **Guided BERTopic**: Guided BERTopic works in two ways: First, we create embeddings for each seeded topic by joining them and passing them through the document embedder. These embeddings will be compared with the existing document embeddings through cosine similarity and assigned a label.
If the document is most similar to a seeded topic, then it will get that topic's label. If it is most similar to the average document embedding, it will get the -1 label. These labels are then passed through UMAP to create a semi-supervised approach that should nudge the topic creation to the seeded topics. Second, we take all words in `seed_topic_list` and assign them a multiplier larger than 1. Those multipliers will be used to increase the IDF values of the words across all topics thereby increasing the likelihood that a seeded topic word will appear in a topic. This does, however, also increase the chance of an irrelevant topic having unrelated words. In practice, this should not be an issue since the IDF value is likely to remain low regardless of the multiplier. The multiplier is now a fixed value but may change to something more elegant, like taking the distribution of IDF values and its position into account when defining the multiplier. ```python seed_topic_list = [["company", "billion", "quarter", "shrs", "earnings"], ["acquisition", "procurement", "merge"], ["exchange", "currency", "trading", "rate", "euro"], ["grain", "wheat", "corn"], ["coffee", "cocoa"], ["natural", "gas", "oil", "fuel", "products", "petrol"]] topic_model = BERTopic(seed_topic_list=seed_topic_list) topics, probs = topic_model.fit_transform(docs) ``` ## **Version 0.8.1** *Release date: 8 June, 2021* **Highlights**: * Improved models: * For English documents the default is now: `"paraphrase-MiniLM-L6-v2"` * For Non-English or multi-lingual documents the default is now: `"paraphrase-multilingual-MiniLM-L12-v2"` * Both models show not only great performance but are much faster! 
* Add interactive visualizations to the `plotting` API documentation For better performance, please use the following models: * English: `"paraphrase-mpnet-base-v2"` * Non-English or multi-lingual: `"paraphrase-multilingual-mpnet-base-v2"` **Fixes**: * Improved unit testing for more stability * Set transformers version for Flair ## **Version 0.8.0** *Release date: 31 May, 2021* **Highlights**: * Additional visualizations: * Topic Hierarchy: `topic_model.visualize_hierarchy()` * Topic Similarity Heatmap: `topic_model.visualize_heatmap()` * Topic Representation Barchart: `topic_model.visualize_barchart()` * Term Score Decline: `topic_model.visualize_term_rank()` * Created `bertopic.plotting` library to easily extend visualizations * Improved automatic topic reduction by using HDBSCAN to detect similar topics * Sort topic ids by their frequency. -1 is the outlier class and contains typically the most documents. After that 0 is the largest topic, 1 the second largest, etc. **Fixes**: * Fix typo [#113](https://github.com/MaartenGr/BERTopic/pull/113), [#117](https://github.com/MaartenGr/BERTopic/pull/117) * Fix [#121](https://github.com/MaartenGr/BERTopic/issues/121) by removing [these](https://github.com/MaartenGr/BERTopic/blob/5c6cf22776fafaaff728370781a5d33727d3dc8f/bertopic/_bertopic.py#L359-L360) two lines * Fix mapping of topics after reduction (it now excludes 0) ([#103](https://github.com/MaartenGr/BERTopic/issues/103)) ## **Version 0.7.0** *Release date: 26 April, 2021* The two main features are **(semi-)supervised topic modeling** and several **backends** to use instead of Flair and SentenceTransformers! 
**Highlights**: * (semi-)supervised topic modeling by leveraging supervised options in UMAP * `model.fit(docs, y=target_classes)` * Backends: * Added Spacy, Gensim, USE (TFHub) * Use a different backend for document embeddings and word embeddings * Create your own backends with `bertopic.backend.BaseEmbedder` * Click [here](https://maartengr.github.io/BERTopic/getting_started/embeddings/embeddings.html) for an overview of all new backends * Calculate and visualize topics per class * Calculate: `topics_per_class = topic_model.topics_per_class(docs, topics, classes)` * Visualize: `topic_model.visualize_topics_per_class(topics_per_class)` * Several tutorials were updated and added: | Name | Link | |---|---| | Topic Modeling with BERTopic | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1FieRA9fLdkQEGDIMYl0I3MCjSUKVF8C-?usp=sharing) | | (Custom) Embedding Models in BERTopic | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/18arPPe50szvcCp_Y6xS56H2tY0m-RLqv?usp=sharing) | | Advanced Customization in BERTopic | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1ClTYut039t-LDtlcd-oQAdXWgcsSGTw9?usp=sharing) | | (semi-)Supervised Topic Modeling with BERTopic | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1bxizKzv5vfxJEB29sntU__ZC7PBSIPaQ?usp=sharing) | | Dynamic Topic Modeling with Trump's Tweets | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1un8ooI-7ZNlRoK0maVkYhmNRl0XGK88f?usp=sharing) | **Fixes**: * Fixed issues with Torch req * Prevent saving term frequency matrix in CTFIDF class * Fixed DTM not working when reducing topics ([#96](https://github.com/MaartenGr/BERTopic/issues/96)) * Moved visualization dependencies to base 
BERTopic * `pip install bertopic[visualization]` becomes `pip install bertopic` * Allow precomputed embeddings in bertopic.find_topics() ([#79](https://github.com/MaartenGr/BERTopic/issues/79)): ```python model = BERTopic(embedding_model=my_embedding_model) model.fit(docs, my_precomputed_embeddings) model.find_topics(search_term) ``` ## **Version 0.6.0** *Release date: 1 March, 2021* **Highlights**: * DTM: Added a basic dynamic topic modeling technique based on the global c-TF-IDF representation * `model.topics_over_time(docs, timestamps, global_tuning=True)` * DTM: Option to evolve topics based on t-1 c-TF-IDF representation which results in evolving topics over time * Only uses topics at t-1 and skips evolution if there is a gap * `model.topics_over_time(docs, timestamps, evolution_tuning=True)` * DTM: Function to visualize topics over time * `model.visualize_topics_over_time(topics_over_time)` * DTM: Add binning of timestamps * `model.topics_over_time(docs, timestamps, nr_bins=10)` * Add function to get general information about topics (id, frequency, name, etc.) * `get_topic_info()` * Improved stability of c-TF-IDF by taking the average number of words across all topics instead of the number of documents **Fixes**: * `_map_probabilities()` does not take into account that there is no probability of the outlier class and the probabilities are mutated instead of copied (#63, #64) ## **Version 0.5.0** *Release date: 8 February, 2021* **Highlights**: * Add `Flair` to allow for more (custom) token/document embeddings, including 🤗 transformers * Option to use custom UMAP, HDBSCAN, and CountVectorizer * Added `low_memory` parameter to reduce memory during computation * Improved verbosity (shows progress bar) * Return the figure of `visualize_topics()` * Expose all parameters with a single function: `get_params()` **Fixes**: * To simplify the API, the parameters stop_words and n_neighbors were removed. These can still be used when a custom UMAP or CountVectorizer is used.
* Set `calculate_probabilities` to False as a default. Calculating probabilities with HDBSCAN significantly increases computation time and memory usage. Better to remove calculating probabilities or only allow it by manually turning this on. * Use the newest version of `sentence-transformers` as it speeds up encoding significantly ## **Version 0.4.2** *Release date: 10 January, 2021* **Fixes**: * Selecting `embedding_model` did not work when `language` was also used. This led to the user needing to set `language` to None before being able to use `embedding_model`. Fixed by using `embedding_model` when `language` is used (as a default parameter). ## **Version 0.4.1** *Release date: 07 January, 2021* **Fixes**: * Simple fix by lowering the languages variable to match the lowered input language. ## **Version 0.4.0** *Release date: 21 December, 2020* **Highlights**: * Visualize Topics similar to [LDAvis](https://github.com/cpsievert/LDAvis) * Added option to reduce topics after training * Added option to update topic representation after training * Added option to search topics using a search term * Significantly improved the stability of generating clusters * Finetune the topic words by selecting the most coherent words with the highest c-TF-IDF values * More extensive tutorials in the documentation **Notable Changes**: * Option to select language instead of sentence-transformers models to minimize the complexity of using BERTopic * Improved logging (remove duplicates) * Check if BERTopic is fitted * Added TF-IDF as an embedder instead of transformer models (see tutorial) * Numpy for Python 3.6 will be dropped and was therefore removed from the workflow.
* Preprocess text before passing it through c-TF-IDF * Merged `get_topics_freq()` with `get_topic_freq()` **Fixes**: * Fix error handling topic probabilities ## **Version 0.3.2** *Release date: 16 November, 2020* **Highlights**: * Fixed a bug with the topic reduction method that seems to reduce the number of topics but not to the nr_topics as defined in the class. Since this was, to a certain extent, breaking the topic reduction method a new release was necessary. ## **Version 0.3.1** *Release date: 4 November, 2020* **Highlights**: * Adding the option to use custom embeddings or embeddings that you generated beforehand with whatever package you'd like to use. This allows users to further customize BERTopic to their liking. ## **Version 0.3.0** *Release date: 29 October, 2020* **Highlights**: - transform() and fit_transform() now also return the topic probability distributions - Added visualize_distribution() which visualizes the topic probability distribution for a single document ## **Version 0.2.2** *Release date: 17 October, 2020* **Highlights**: - Fixed n_gram_range not being used - Added option for using stopwords ## **Version 0.2.1** *Release date: 11 October, 2020* **Highlights**: * Improved the calculation of the class-based TF-IDF procedure by limiting the calculation to sparse matrices. This prevents out-of-memory problems when faced with large datasets. ## **Version 0.2.0** *Release date: 11 October, 2020* **Highlights**: - Changed c-TF-IDF procedure such that it implements a version of scikit-learn's procedure. This should also speed up the calculation of the sparse matrix and prevent memory errors. - Added automated unit tests ## **Version 0.1.2** *Release date: 1 October, 2020* **Highlights**: * When transforming new documents, self.mapped_topics seemed to be missing. Added to the init.
## **Version 0.1.1** *Release date: 24 September, 2020* **Highlights**: * Fixed requirements --> Issue with pytorch * Update documentation ## **Version 0.1.0** *Release date: 24 September, 2020* **Highlights**: - First release of `BERTopic` - Added parameters for UMAP and HDBSCAN - Option to choose sentence-transformer model - Method for transforming unseen documents - Save and load trained models (UMAP and HDBSCAN) - Extract topics and their sizes **Notable Changes**: - Optimized c-TF-IDF - Improved documentation - Improved topic reduction ================================================ FILE: docs/faq.md ================================================ --- hide: - navigation --- # Frequently Asked Questions ## **Why are the results not consistent between runs?** Due to the stochastic nature of UMAP, the results from BERTopic might differ even if you run the same code multiple times. Using custom embeddings allows you to try out BERTopic several times until you find the topics that suit you best. You only need to generate the embeddings themselves once and run BERTopic several times with different parameters. If you want to reproduce the results, at the expense of [performance](https://umap-learn.readthedocs.io/en/latest/reproducibility.html), you can set a `random_state` in UMAP to prevent any stochastic behavior: ```python from bertopic import BERTopic from umap import UMAP umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42) topic_model = BERTopic(umap_model=umap_model) ``` ## **Which embedding model should I choose?** Unfortunately, there is not a definitive list of the best models for each language, this highly depends on your data, the model, and your specific use case. However, the default model in BERTopic (`"all-MiniLM-L6-v2"`) works great for **English** documents. In contrast, for **multi-lingual** documents or any other language, `"paraphrase-multilingual-MiniLM-L12-v2"` has shown great performance. 
If you want to use a model that provides a higher quality, but takes more computing time, then I would advise using `all-mpnet-base-v2` and `paraphrase-multilingual-mpnet-base-v2` instead. **MTEB Leaderboard** New embedding models are released frequently and their performance keeps getting better. To keep track of the best embedding models out there, you can visit the [MTEB leaderboard](https://huggingface.co/spaces/mteb/leaderboard). It is an excellent place for selecting the embedding that works best for you. For example, if you want the best of the best, then the top 5 models might be the place to look. Many of these models can be used with `SentenceTransformers` in BERTopic, like so: ```python from bertopic import BERTopic from sentence_transformers import SentenceTransformer embedding_model = SentenceTransformer("BAAI/bge-base-en-v1.5") topic_model = BERTopic(embedding_model=embedding_model) ``` **SentenceTransformers** [SentenceTransformers](https://www.sbert.net/docs/pretrained_models.html#sentence-embedding-models) work typically quite well and are the preferred models to use. They are great at generating document embeddings and have several multi-lingual versions available. **🤗 transformers** BERTopic allows you to use any 🤗 transformers model. These models are typically embeddings created on a word/sentence level but can easily be pooled using Flair (see Guides/Embeddings). If you have a specific language for which you want to generate embeddings, you can choose the model [here](https://huggingface.co/models). ## **How do I reduce topic outliers?** There are several ways we can reduce outliers. First, the number of data points classified as outliers is handled by the `min_samples` parameter in HDBSCAN. This value is automatically set to the same value as `min_cluster_size`. However, you can set it independently if you want to reduce the number of generated outliers. Lowering this value will result in less noise being generated.
```python from bertopic import BERTopic from hdbscan import HDBSCAN hdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean', cluster_selection_method='eom', prediction_data=True, min_samples=5) topic_model = BERTopic(hdbscan_model=hdbscan_model) topics, probs = topic_model.fit_transform(docs) ``` !!! note "Note" Although this will lower outliers found in the data, this might force outliers to be put into topics where they do not belong. So make sure to strike a balance between keeping noise and reducing outliers. Second, after training our BERTopic model, we can assign outliers to topics by making use of the `.reduce_outliers` function in BERTopic. An advantage of using this approach is that there are four built-in strategies one can choose for reducing outliers. Moreover, this technique allows the user to experiment with reducing outliers across a number of strategies and parameters without actually having to re-train the topic model each time. You can learn more about the `.reduce_outliers` function [here](https://maartengr.github.io/BERTopic/getting_started/outlier_reduction/outlier_reduction.html). The following is a minimal example of how to use this function: ```python from bertopic import BERTopic # Train your BERTopic model topic_model = BERTopic() topics, probs = topic_model.fit_transform(docs) # Reduce outliers new_topics = topic_model.reduce_outliers(docs, topics) ``` Third, we can replace HDBSCAN with any other clustering algorithm that we want. So we can choose a clustering algorithm, like k-Means, that does not produce any outliers at all. Using k-Means instead of HDBSCAN is straightforward: ```python from bertopic import BERTopic from sklearn.cluster import KMeans cluster_model = KMeans(n_clusters=50) topic_model = BERTopic(hdbscan_model=cluster_model) ``` ## **How do I remove stop words?** At times, stop words might end up in our topic representations.
This is something we typically want to avoid as they contribute little to the interpretation of the topics. However, removing stop words as a preprocessing step is not advised as the transformer-based embedding models that we use need the full context to create accurate embeddings. Instead, we can use the `CountVectorizer` to preprocess our documents **after** having generated embeddings and clustered our documents. I have found almost no disadvantages to using the `CountVectorizer` to remove stop words and it is something I would strongly advise to try out: ```python from bertopic import BERTopic from sklearn.feature_extraction.text import CountVectorizer vectorizer_model = CountVectorizer(stop_words="english") topic_model = BERTopic(vectorizer_model=vectorizer_model) ``` We can also use the `ClassTfidfTransformer` to reduce the impact of frequent words. The result is very similar to explicitly removing stop words but this process does this automatically: ```python from bertopic import BERTopic from bertopic.vectorizers import ClassTfidfTransformer ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True) topic_model = BERTopic(ctfidf_model=ctfidf_model) ``` ## **How can I speed up BERTopic?** You can speed up BERTopic by either generating your embeddings beforehand or by setting `calculate_probabilities` to False. Calculating the probabilities is quite expensive and can significantly increase the computation time. Thus, only use it if you do not mind waiting a bit before the model is done running or if you have less than a couple of hundred thousand documents. Also, make sure to use a GPU when extracting the sentence/document embeddings. Transformer models typically require a GPU and using only a CPU can slow down computation time quite a lot. However, if you do not have access to a GPU, looking into quantization might help. 
Lastly, it is also possible to speed up BERTopic with [cuML's](https://rapids.ai/start.html#rapids-release-selector) GPU acceleration of UMAP and HDBSCAN: ```python from bertopic import BERTopic from cuml.cluster import HDBSCAN from cuml.manifold import UMAP # Create instances of GPU-accelerated UMAP and HDBSCAN umap_model = UMAP(n_components=5, n_neighbors=15, min_dist=0.0) hdbscan_model = HDBSCAN(min_samples=10, gen_min_span_tree=True, prediction_data=True) # Pass the above models to be used in BERTopic topic_model = BERTopic(umap_model=umap_model, hdbscan_model=hdbscan_model) ``` ## **I am facing memory issues. Help!** There are several ways to perform computation with large datasets: * First, you can set `low_memory` to True when instantiating BERTopic. This may prevent blowing up the memory in UMAP. * Second, setting `calculate_probabilities` to False when instantiating BERTopic prevents a huge document-topic probability matrix from being created. Moreover, HDBSCAN is quite slow when it tries to calculate probabilities on large datasets. * Third, you can set the minimum frequency of words in the CountVectorizer class to reduce the size of the resulting sparse c-TF-IDF matrix. You can do this as follows: ```python from bertopic import BERTopic from sklearn.feature_extraction.text import CountVectorizer vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english", min_df=10) topic_model = BERTopic(vectorizer_model=vectorizer_model) ``` The [min_df](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html) parameter is used to indicate the minimum frequency of words. Setting this value larger than 1 can significantly reduce memory. * Fourth, you can use online topic modeling instead to use BERTopic on big data by training the model in chunks If the problem persists, then this could be an issue related to your available memory. 
The processing of millions of documents is quite computationally expensive and sufficient RAM is necessary. ## **I have only a few topics, how do I increase them?** There are several reasons why your topic model may result in only a few topics: * First, you might only have a few documents (~1000). This makes it very difficult to properly extract topics due to the small amount of data available. Increasing the number of documents might solve your issues. * Second, `min_topic_size` might be simply too large for your number of documents. If you decrease the minimum size of topics, then you are much more likely to increase the number of topics generated. You could also decrease the `n_neighbors` parameter used in `UMAP` if this does not work. * Third, although this does not happen very often, there simply aren't that many topics to be found in your documents. You can often see this when you have many `-1` topics, which is not a topic but a category of outliers. ## **I have too many topics, how do I decrease them?** If you have a large dataset, then it is possible to generate thousands of topics. Especially with large datasets, there is a good chance they contain many small topics. In practice, you might want a few hundred topics at most to interpret them nicely. There are a few ways of decreasing the number of generated topics: * First, we can set the `min_topic_size` in the BERTopic initialization much higher (e.g., 300) to make sure that those small clusters will not be generated. This is an HDBSCAN parameter that specifies the minimum number of documents needed in a cluster. More documents in a cluster mean fewer topics will be generated. * Second, you can create a custom UMAP model and set `n_neighbors` much higher than the default 15 (e.g., 200). This also prevents those micro clusters from being generated as it will need many neighboring documents to create a cluster. * Third, we can set `nr_topics` to a value that seems logical to the user.
Do note that topics are forced to merge which might result in a lower quality of topics. In practice, I would advise using `nr_topics="auto"` as that will merge topics that are very similar. Dissimilar topics will therefore remain separated. ## **How do I calculate the probabilities of all topics in a document?** Although it is possible to calculate all the probabilities, the process of doing so is quite computationally inefficient and might significantly increase the computation time. To prevent this, the probabilities are not calculated as a default. To calculate them, you will have to set `calculate_probabilities` to True: ```python from bertopic import BERTopic topic_model = BERTopic(calculate_probabilities=True) topics, probs = topic_model.fit_transform(docs) ``` !!! note The `calculate_probabilities` parameter is only used when using HDBSCAN or cuML's HDBSCAN model. In other words, this will not work when using a model other than HDBSCAN. Instead, we can approximate the topic distributions across all documents with [`.approximate_distribution`](https://maartengr.github.io/BERTopic/getting_started/distribution/distribution.html). ## **Numpy gives me an error when running BERTopic** With the release of Numpy 1.20.0, there have been significant issues with using that version (and previous ones) due to compilation issues and pypi. This is a known issue with the order of installation using pypi. You can find more details about this issue [here](https://github.com/lmcinnes/umap/issues/567) and [here](https://github.com/scikit-learn-contrib/hdbscan/issues/457). I would suggest doing one of the following: * Install the newest version of BERTopic (>= v0.5). * You can install hdbscan with `pip install hdbscan --no-cache-dir --no-binary :all: --no-build-isolation` which might resolve the issue * Install BERTopic in a fresh environment using these steps.
## **How can I run BERTopic without an internet connection?** The great thing about using sentence-transformers is that it searches automatically for an embedding model locally. If it cannot find one, it will download the pre-trained model from its servers. Make sure that you set the correct path for sentence-transformers to work. You can find a bit more about that [here](https://github.com/UKPLab/sentence-transformers/issues/888). You can download the corresponding model [here](https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/v0.2/) and unzip it. Then, simply use the following to create your embedding model: ```python from sentence_transformers import SentenceTransformer embedding_model = SentenceTransformer('path/to/unzipped/model') ``` Then, pass it to BERTopic: ```python from bertopic import BERTopic topic_model = BERTopic(embedding_model=embedding_model) ``` ## **Can I use the GPU to speed up the model?** Yes. The GPU is automatically used when you use a SentenceTransformer or Flair embedding model. Using a CPU would then definitely slow things down. However, you can use other embeddings like TF-IDF or Doc2Vec embeddings in BERTopic which do not depend on GPU acceleration. 
You can use [cuML](https://rapids.ai/start.html#rapids-release-selector) to speed up both UMAP and HDBSCAN through GPU acceleration: ```python from bertopic import BERTopic from cuml.cluster import HDBSCAN from cuml.manifold import UMAP # Create instances of GPU-accelerated UMAP and HDBSCAN umap_model = UMAP(n_components=5, n_neighbors=15, min_dist=0.0) hdbscan_model = HDBSCAN(min_samples=10, gen_min_span_tree=True, prediction_data=True) # Pass the above models to be used in BERTopic topic_model = BERTopic(umap_model=umap_model, hdbscan_model=hdbscan_model) topics, probs = topic_model.fit_transform(docs) ``` Depending on the embeddings you are using, you might want to normalize them first to force a cosine-related distance metric in UMAP: ```python from cuml.preprocessing import normalize embeddings = normalize(embeddings) ``` ## **How can I use BERTopic with Chinese documents?** Currently, CountVectorizer tokenizes text by splitting on whitespace which does not work for Chinese. To get it to work, you will have to create a custom `CountVectorizer` with `jieba`: ```python from sklearn.feature_extraction.text import CountVectorizer import jieba def tokenize_zh(text): words = jieba.lcut(text) return words vectorizer = CountVectorizer(tokenizer=tokenize_zh) ``` Next, we pass our custom vectorizer to BERTopic and create our topic model: ```python from bertopic import BERTopic topic_model = BERTopic(embedding_model=model, verbose=True, vectorizer_model=vectorizer) topics, _ = topic_model.fit_transform(docs, embeddings=embeddings) ``` ## **Why does it take so long to import BERTopic?** The main culprit here seems to be UMAP. After running tests with [Tuna](https://github.com/nschloe/tuna) we can see that most of the resources when importing BERTopic can be attributed to UMAP: Unfortunately, there currently is no fix for this issue. The most recent ticket regarding this issue can be found [here](https://github.com/lmcinnes/umap/issues/631).
## **Should I preprocess the data?** No. By using document embeddings there is typically no need to preprocess the data as all parts of a document are important in understanding the general topic of the document. Although this holds in 99% of cases, if you have data that contains a lot of noise, for example, HTML-tags, then it would be best to remove them. HTML-tags typically do not contribute to the meaning of a document and should therefore be removed. However, if you apply topic modeling to HTML-code to extract topics of code, then it becomes important. ## **I run into issues running on Apple Silicon. What should I do?** Apple Silicon chips (M1 & M2) are based on `arm64` (aka [`AArch64`](https://apple.stackexchange.com/questions/451238/is-m1-chip-aarch64-or-amd64), not to be confused with `amd64`/`x86_64`). There are known issues with upstream dependencies for this architecture, for example [numba](https://github.com/numba/numba/issues/5520). You may not always run into this issue, depending on the extras that you need. One possible solution is to use [VS Code Dev Containers](https://code.visualstudio.com/docs/devcontainers/containers), which allow you to set up a Linux-based environment. To run BERTopic effectively you need to be aware of two things: 1. Make sure to use a Docker image specifically built for arm64 2. Make sure to use a *volume* instead of a *bind-mount* ℹ️ the latter significantly reduces disk I/O Using the pre-configured [Data Science Dev Containers](https://github.com/b-data/data-science-devcontainers) makes sure these settings are optimized.
To start using them, do the following: * Install and run Docker * Clone repository [data-science-devcontainers](https://github.com/b-data/data-science-devcontainers) * Open VS Code, build the `Python base` or `Python scipy` container and start working ℹ️ Change `PYTHON_VERSION` to `3.11` in the respective `devcontainer.json` to work with the latest patch release of Python 3.11 * Note that data is persisted in the container * When using an unmodified `devcontainer.json`: Work in `/home/vscode` 👉 This is the *home directory* of user `vscode` * Python packages are installed to the home directory by default 👉 This is due to env variable `PIP_USER=1` * Note that the directory `/workspaces` is also persisted ### **Do these Data Science Dev Containers support GPU acceleration?** Yes, but only on Linux and Windows. The CUDA-enabled variants require the following in addition to Docker: * NVIDIA GPU * NVIDIA driver * Linux: [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) * Windows: [GPU support in Docker Desktop](https://docs.docker.com/desktop/gpu/) ℹ️ The host running the GPU accelerated Dev Containers only requires the NVIDIA driver, the CUDA toolkit does not have to be installed. See the [CUDA Version Matrix](https://github.com/b-data/jupyterlab-python-docker-stack/blob/main/CUDA_VERSION_MATRIX.md) regarding Ubuntu/CUDA/Python versions and recommended NVIDIA drivers. ================================================ FILE: docs/getting_started/best_practices/best_practices.md ================================================ [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1BoQ_vakEVtojsd2x_U6-_x52OOuqruj2?usp=sharing) - Overview of Best Practices Through the nature of BERTopic, its modularity, many variations of the topic modeling technique are possible.
However, during the development and through the usage of the package, a set of best practices have been developed that generally lead to great results. The following are a number of steps, parameters, and settings that you can use that will generally improve the quality of the resulting topics. In other words, after going through the quick start and getting a feeling for the API these steps should get you to the next level of performance. !!! Note Although these are called *best practices*, it does not necessarily mean that they work across all use cases perfectly. The underlying modular nature of BERTopic is meant to take different use cases into account. After going through these practices it is advised to fine-tune wherever necessary. To showcase how these "best practices" work, we will go through an example dataset and apply all practices to it. ## **Data** For this example, we will use a dataset containing abstracts and metadata from [ArXiv articles](https://huggingface.co/datasets/arxiv_dataset). ```python from datasets import load_dataset dataset = load_dataset("CShorten/ML-ArXiv-Papers")["train"] # Extract abstracts to train on and corresponding titles abstracts = dataset["abstract"] titles = dataset["title"] ``` !!! Tip "Sentence Splitter" Whenever you have large documents, you typically want to split them up into either paragraphs or sentences. A nice way to do so is by using NLTK's sentence splitter which is nothing more than: ```python from nltk.tokenize import sent_tokenize, word_tokenize sentences = [sent_tokenize(abstract) for abstract in abstracts] sentences = [sentence for doc in sentences for sentence in doc] ``` ## **Pre-calculate Embeddings** After having created our data, namely `abstracts`, we can dive into the very first best practice, **pre-calculating embeddings**. BERTopic works by converting documents into numerical values, called embeddings. This process can be very costly, especially if we want to iterate over parameters. 
Instead, we can calculate those embeddings once and feed them to BERTopic to skip calculating embeddings each time. ```python from sentence_transformers import SentenceTransformer # Pre-calculate embeddings embedding_model = SentenceTransformer("all-MiniLM-L6-v2") embeddings = embedding_model.encode(abstracts, show_progress_bar=True) ``` !!! Tip New embedding models are released frequently and their performance keeps getting better. To keep track of the best embedding models out there, you can visit the [MTEB leaderboard](https://huggingface.co/spaces/mteb/leaderboard). It is an excellent place for selecting the embedding that works best for you. For example, if you want the best of the best, then the top 5 models might be the place to look. ## **Preventing Stochastic Behavior** In BERTopic, we generally use a dimensionality reduction algorithm to reduce the size of the embeddings. This is done to prevent the [curse of dimensionality](https://en.wikipedia.org/wiki/Curse_of_dimensionality) to a certain degree. As a default, this is done with [UMAP](https://github.com/lmcinnes/umap) which is an incredible algorithm for reducing dimensional space. However, by default, it shows stochastic behavior which creates different results each time you run it. To prevent that, we will need to set a `random_state` of the model before passing it to BERTopic. As a result, we can now fully reproduce the results each time we run the model. ```python from umap import UMAP umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42) ``` ## **Controlling Number of Topics** There is a parameter to control the number of topics, namely `nr_topics`. This parameter, however, merges topics **after** they have been created. It is a parameter that supports creating a fixed number of topics. However, it is advised to control the number of topics through the cluster model which is by default HDBSCAN.
HDBSCAN has a parameter, namely `min_cluster_size` that indirectly controls the number of topics that will be created. A higher `min_cluster_size` will generate fewer topics and a lower `min_cluster_size` will generate more topics. Here, we will go with `min_cluster_size=150` to prevent too many micro-clusters from being created: ```python from hdbscan import HDBSCAN hdbscan_model = HDBSCAN(min_cluster_size=150, metric='euclidean', cluster_selection_method='eom', prediction_data=True) ``` ## **Improving Default Representation** The default representation of topics is calculated through [c-TF-IDF](https://maartengr.github.io/BERTopic/algorithm/algorithm.html#5-topic-representation). However, c-TF-IDF is powered by the [CountVectorizer](https://maartengr.github.io/BERTopic/getting_started/vectorizers/vectorizers.html) which converts text into tokens. Using the CountVectorizer, we can do a number of things: * Remove stopwords * Ignore infrequent words * Increase the n-gram range In other words, we can preprocess the topic representations **after** documents are assigned to topics. This will not influence the clustering process in any way. Here, we will ignore English stopwords and infrequent words. Moreover, by increasing the n-gram range we will consider topic representations that are made up of one or two words. ```python from sklearn.feature_extraction.text import CountVectorizer vectorizer_model = CountVectorizer(stop_words="english", min_df=2, ngram_range=(1, 2)) ``` ## **Additional Representations** Previously, we have tuned the default representation but there are quite a number of [other topic representations](https://maartengr.github.io/BERTopic/getting_started/representation/representation.html) in BERTopic that we can choose from. 
From [KeyBERTInspired](https://maartengr.github.io/BERTopic/getting_started/representation/representation.html#keybertinspired) and [PartOfSpeech](https://maartengr.github.io/BERTopic/getting_started/representation/representation.html#partofspeech), to [OpenAI's ChatGPT](https://maartengr.github.io/BERTopic/getting_started/representation/llm.html#chatgpt) and [open-source](https://maartengr.github.io/BERTopic/getting_started/representation/llm.html#langchain) alternatives, many representations are possible. In BERTopic, you can model many different topic representations simultaneously to test them out and get different perspectives of topic descriptions. This is called [multi-aspect](https://maartengr.github.io/BERTopic/getting_started/multiaspect/multiaspect.html) topic modeling. Here, we will demonstrate a number of interesting and useful representations in BERTopic: * KeyBERTInspired * A method that derives inspiration from how KeyBERT works * PartOfSpeech * Using SpaCy's POS tagging to extract words * MaximalMarginalRelevance * Diversify the topic words * OpenAI * Use ChatGPT to label our topics ```python import openai from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, OpenAI, PartOfSpeech # KeyBERT keybert_model = KeyBERTInspired() # Part-of-Speech pos_model = PartOfSpeech("en_core_web_sm") # MMR mmr_model = MaximalMarginalRelevance(diversity=0.3) # GPT-3.5 client = openai.OpenAI(api_key="sk-...") prompt = """ I have a topic that contains the following documents: [DOCUMENTS] The topic is described by the following keywords: [KEYWORDS] Based on the information above, extract a short but highly descriptive topic label of at most 5 words. 
Make sure it is in the following format: topic: """ openai_model = OpenAI(client, model="gpt-4o-mini", exponential_backoff=True, prompt=prompt) # All representation models representation_model = { "KeyBERT": keybert_model, # "OpenAI": openai_model, # Uncomment if you will use OpenAI "MMR": mmr_model, "POS": pos_model } ``` ## **Training** Now that we have a set of best practices, we can use them in our training loop. Here, several different representations, keywords and labels for our topics will be created. If you want to iterate over the topic model it is advised to use the pre-calculated embeddings as that significantly speeds up training. ```python from bertopic import BERTopic topic_model = BERTopic( # Pipeline models embedding_model=embedding_model, umap_model=umap_model, hdbscan_model=hdbscan_model, vectorizer_model=vectorizer_model, representation_model=representation_model, # Hyperparameters top_n_words=10, verbose=True ) # Train model topics, probs = topic_model.fit_transform(abstracts, embeddings) # Show topics topic_model.get_topic_info() ``` To get all representations for a single topic, we simply run the following: ```python >>> topic_model.get_topic(1, full=True) {'Main': [('adversarial', 0.028838938990764302), ('attacks', 0.021726302042463556), ('attack', 0.016803574415028524), ('robustness', 0.013046135743326167), ('adversarial examples', 0.01151254557995679), ('examples', 0.009920962487998853), ('perturbations', 0.009053305826870773), ('adversarial attacks', 0.008747627064844006), ('malware', 0.007675131707700338), ('defense', 0.007365955840313783)], 'KeyBERT': [('adversarial training', 0.76427937), ('adversarial attack', 0.74271905), ('vulnerable adversarial', 0.73302543), ('adversarial', 0.7311052), ('adversarial examples', 0.7179245), ('adversarial attacks', 0.7082), ('adversarially', 0.7005141), ('adversarial robustness', 0.69911957), ('adversarial perturbations', 0.6588783), ('adversary', 0.4467769)], 'OpenAI': [('Adversarial attacks and 
defense', 1)], 'MMR': [('adversarial', 0.028838938990764302), ('attacks', 0.021726302042463556), ('attack', 0.016803574415028524), ('robustness', 0.013046135743326167), ('adversarial examples', 0.01151254557995679), ('examples', 0.009920962487998853), ('perturbations', 0.009053305826870773), ('adversarial attacks', 0.008747627064844006), ('malware', 0.007675131707700338), ('defense', 0.007365955840313783)], 'POS': [('adversarial', 0.028838938990764302), ('attacks', 0.021726302042463556), ('attack', 0.016803574415028524), ('robustness', 0.013046135743326167), ('adversarial examples', 0.01151254557995679), ('examples', 0.009920962487998853), ('perturbations', 0.009053305826870773), ('adversarial attacks', 0.008747627064844006), ('malware', 0.007675131707700338), ('defense', 0.007365955840313783)]} ``` **NOTE**: The labels generated by OpenAI's **ChatGPT** are especially interesting to use throughout your model. Below, we will go into more detail how to set that as a custom label. !!! Tip "Parameters" If you would like to return the topic-document probability matrix, then it is advised to use `calculate_probabilities=True`. Do note that this can significantly slow down training. To speed it up, use [cuML's HDBSCAN](https://maartengr.github.io/BERTopic/getting_started/clustering/clustering.html#cuml-hdbscan) instead. You could also approximate the topic-document probability matrix with `.approximate_distribution` which will be discussed later. ## **(Custom) Labels** The default label of each topic are the top 3 words in each topic combined with an underscore between them. This, of course, might not be the best label that you can think of for a certain topic. Instead, we can use `.set_topic_labels` to manually label all or certain topics. We can also use `.set_topic_labels` to use one of the other topic representations that we had before, like `KeyBERTInspired` or even `OpenAI`. 
```python # Label the topics yourself topic_model.set_topic_labels({1: "Space Travel", 7: "Religion"}) # or use one of the other topic representations, like KeyBERTInspired keybert_topic_labels = {topic: " | ".join(list(zip(*values))[0][:3]) for topic, values in topic_model.topic_aspects_["KeyBERT"].items()} topic_model.set_topic_labels(keybert_topic_labels) # or ChatGPT's labels chatgpt_topic_labels = {topic: " | ".join(list(zip(*values))[0]) for topic, values in topic_model.topic_aspects_["OpenAI"].items()} chatgpt_topic_labels[-1] = "Outlier Topic" topic_model.set_topic_labels(chatgpt_topic_labels) ``` Now that we have set the updated topic labels, we can access them with the many functions used throughout BERTopic. Most notably, you can show the updated labels in visualizations with the `custom_labels=True` parameter. If we were to run `topic_model.get_topic_info()` it will now include the column `CustomName`. That is the custom label that we just created for each topic. ## **Topic-Document Distribution** If using `calculate_probabilities=True` is not possible, then you can [approximate the topic-document distributions](https://maartengr.github.io/BERTopic/getting_started/distribution/distribution.html) using `.approximate_distribution`. It is a fast and flexible method for creating different topic-document distributions. ```python # `topic_distr` contains the distribution of topics in each document topic_distr, _ = topic_model.approximate_distribution(abstracts, window=8, stride=4) ``` Next, let's take a look at a specific abstract and see how the topic distribution was extracted: ```python # Visualize the topic-document distribution for a single document topic_model.visualize_distribution(topic_distr[abstract_id], custom_labels=True) ``` It seems to have extracted a number of topics that are relevant and shows the distributions of these topics across the abstract.
We can go one step further and visualize them on a token-level: ```python # Calculate the topic distributions on a token-level topic_distr, topic_token_distr = topic_model.approximate_distribution(abstracts[abstract_id], calculate_tokens=True) # Visualize the token-level distributions df = topic_model.visualize_approximate_distribution(abstracts[abstract_id], topic_token_distr[0]) df ``` !!! Tip "use_embedding_model" As a default, we compare the c-TF-IDF calculations between the token sets and all topics. Due to its bag-of-word representation, this is quite fast. However, you might want to use the selected embedding_model instead to do this comparison. Do note that due to the many token sets, it is often computationally quite a bit slower: ```python topic_distr, _ = topic_model.approximate_distribution(docs, use_embedding_model=True) ``` ## **Outlier Reduction** By default, HDBSCAN generates outliers which is a helpful mechanic in creating accurate topic representations. However, you might want to assign every single document to a topic. We can use `.reduce_outliers` to map some or all outliers to a topic: ```python # Reduce outliers new_topics = topic_model.reduce_outliers(abstracts, topics) # Reduce outliers with pre-calculate embeddings instead new_topics = topic_model.reduce_outliers(abstracts, topics, strategy="embeddings", embeddings=embeddings) ``` !!! Note "Update Topics with Outlier Reduction" After having generated updated topic assignments, we can pass them to BERTopic in order to update the topic representations: ```python topic_model.update_topics(docs, topics=new_topics) ``` It is important to realize that updating the topics this way may lead to errors if topic reduction or topic merging techniques are used afterwards. The reason for this is that when you assign a -1 document to topic 1 and another -1 document to topic 2, it is unclear how you map the -1 documents. Is it matched to topic 1 or 2. 
## **Visualize Topics** With visualizations, we are closing into the realm of subjective "best practices". These are things that I generally do because I like the representations but your experience might differ. Having said that, there are two visualizations that are my go-to when visualizing the topics themselves: * `topic_model.visualize_topics()` * `topic_model.visualize_hierarchy()` ```python # Visualize topics with custom labels topic_model.visualize_topics(custom_labels=True) # Visualize hierarchy with custom labels topic_model.visualize_hierarchy(custom_labels=True) ``` ## **Visualize Documents** When visualizing documents, it helps to have embedded the documents beforehand to speed up computation. Fortunately, we have already done that as a "best practice". Visualizing documents in 2-dimensional space helps in understanding the underlying structure of the documents and topics. ```python # Reduce dimensionality of embeddings, this step is optional but much faster to perform iteratively: reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings) ``` The following plot is **interactive** which means that you can zoom in, double click on a label to only see that one and generally interact with the plot: ```python # Visualize the documents in 2-dimensional space and show the titles on hover instead of the abstracts # NOTE: You can hide the hover with `hide_document_hover=True` which is especially helpful if you have a large dataset # NOTE: You can also hide the annotations with `hide_annotations=True` which is helpful to see the larger structure topic_model.visualize_documents(titles, reduced_embeddings=reduced_embeddings, custom_labels=True) ``` !!! Note "2-dimensional space" Although visualizing the documents in 2-dimensional gives an idea of their underlying structure, there is a risk involved. 
Visualizing the documents in 2-dimensional space means that we have lost significant information since the original embeddings were more than 384 dimensions. Condensing all that information in 2 dimensions is simply not possible. In other words, it is merely an **approximation**, albeit quite an accurate one. ## **Serialization** When saving a BERTopic model, there are several ways of doing so. You can either save the entire model with `pickle`, `pytorch`, or `safetensors`. Personally, I would advise going with `safetensors` whenever possible. The reason for this is that the format allows for a very small topic model to be saved and shared. When saving a model with `safetensors`, it skips over saving the dimensionality reduction and clustering models. The `.transform` function will still work without these models but instead assign topics based on the similarity between document embeddings and the topic embeddings. As a result, the `.transform` step might give different results but it is generally worth it considering the smaller and significantly faster model. ```python embedding_model = "sentence-transformers/all-MiniLM-L6-v2" topic_model.save("my_model_dir", serialization="safetensors", save_ctfidf=True, save_embedding_model=embedding_model) ``` !!! Note "Embedding Model" Using `safetensors`, we are not saving the underlying embedding model but merely a pointer to the model. For example, in the above example we are saving the string `"sentence-transformers/all-MiniLM-L6-v2"` so that we can load in the embedding model alongside the topic model. This currently only works if you are using a sentence transformer model. 
If you are using a different model, you can load it in when loading the topic model like this: ```python from sentence_transformers import SentenceTransformer # Define embedding model embedding_model = SentenceTransformer("all-MiniLM-L6-v2") # Load model and add embedding model loaded_model = BERTopic.load("my_model_dir", embedding_model=embedding_model) ``` ## **Inference** To speed up the inference, we can leverage a "best practice" that we used before, namely serialization. When you save a model as `safetensors` and then load it in, we are removing the dimensionality reduction and clustering steps from the pipeline. Instead, the assignment of topics is done through cosine similarity of document embeddings and topic embeddings. This speeds up inferences significantly. To show its effect, let's start by disabling the logger: ```python from bertopic._utils import MyLogger logger = MyLogger() logger.configure("ERROR") loaded_model.verbose = False topic_model.verbose = False ``` Then, we run inference on both the loaded model and the non-loaded model: ```python >>> %timeit loaded_model.transform(abstracts[:100]) 343 ms ± 31.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) ``` ```python >>> %timeit topic_model.transform(abstracts[:100]) 1.37 s ± 166 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) ``` Based on the above, the `loaded_model` seems to be quite a bit faster for inference than the original `topic_model`. ================================================ FILE: docs/getting_started/clustering/clustering.md ================================================ After reducing the dimensionality of our input embeddings, we need to cluster them into groups of similar embeddings to extract our topics. This process of clustering is quite important because the more performant our clustering technique the more accurate our topic representations are. In BERTopic, we typically use HDBSCAN as it is quite capable of capturing structures with different densities. 
However, there is not one perfect clustering model and you might want to be using something entirely different for your use case. Moreover, what if a new state-of-the-art model is released tomorrow? We would like to be able to use that in BERTopic, right? Since BERTopic assumes some independence among steps, we can allow for this modularity:
![Image title](clustering.svg)
As a result, the `hdbscan_model` parameter in BERTopic now allows for a variety of clustering models. To do so, the class should have the following attributes: * `.fit(X)` * A function that can be used to fit the model * `.predict(X)` * A predict function that transforms the input to cluster labels * `.labels_` * The labels after fitting the model In other words, it should have the following structure: ```python class ClusterModel: def fit(self, X): self.labels_ = None return self def predict(self, X): return X ``` In this section, we will go through several examples of clustering algorithms and how they can be implemented. ## **HDBSCAN** As a default, BERTopic uses HDBSCAN to perform its clustering. To use a HDBSCAN model with custom parameters, we simply define it and pass it to BERTopic: ```python from bertopic import BERTopic from hdbscan import HDBSCAN hdbscan_model = HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selection_method='eom', prediction_data=True) topic_model = BERTopic(hdbscan_model=hdbscan_model) ``` Here, we can define any parameters in HDBSCAN to optimize for the best performance based on whatever validation metrics you are using. ## **k-Means** Although HDBSCAN works quite well in BERTopic and is typically advised, you might want to be using k-Means instead. It allows you to select how many clusters you would like and forces every single point to be in a cluster. Therefore, no outliers will be created. This also has disadvantages. When you force every single point in a cluster, it will mean that the cluster is highly likely to contain noise which can hurt the topic representations. As a small tip, using the `vectorizer_model=CountVectorizer(stop_words="english")` helps quite a bit to then improve the topic representation. 
Having said that, using k-Means is quite straightforward: ```python from bertopic import BERTopic from sklearn.cluster import KMeans cluster_model = KMeans(n_clusters=50) topic_model = BERTopic(hdbscan_model=cluster_model) ``` !!! note As you might have noticed, the `cluster_model` is passed to `hdbscan_model` which might be a bit confusing considering you are not passing an HDBSCAN model. For now, the name of the parameter is kept the same to adhere to the current state of the API. Changing the name could lead to deprecation issues, which I want to prevent as much as possible. ## **Agglomerative Clustering** Like k-Means, there are a bunch more clustering algorithms in `sklearn` that you can be using. Some of these models do not have a `.predict()` method but still can be used in BERTopic. However, using BERTopic's `.transform()` function will then give errors. Here, we will demonstrate Agglomerative Clustering: ```python from bertopic import BERTopic from sklearn.cluster import AgglomerativeClustering cluster_model = AgglomerativeClustering(n_clusters=50) topic_model = BERTopic(hdbscan_model=cluster_model) ``` ## **cuML HDBSCAN** Although the original HDBSCAN implementation is an amazing technique, it may have difficulty handling large amounts of data. Instead, we can use [cuML](https://rapids.ai/start.html#rapids-release-selector) to speed up HDBSCAN through GPU acceleration: ```python from bertopic import BERTopic from cuml.cluster import HDBSCAN hdbscan_model = HDBSCAN(min_samples=10, gen_min_span_tree=True, prediction_data=True) topic_model = BERTopic(hdbscan_model=hdbscan_model) ``` The great thing about using cuML's HDBSCAN implementation is that it supports many features of the original implementation. In other words, `calculate_probabilities=True` also works! !!! note As of the v0.13 release, it is not yet possible to calculate the topic-document probability matrix for unseen data (i.e., `.transform`) using cuML's HDBSCAN. 
However, it is still possible to calculate the topic-document probability matrix for the data on which the model was trained (i.e., `.fit` and `.fit_transform`). !!! note To install cuML with BERTopic, run these commands: **For CUDA 12:** ```bash !pip install cuml-cu12 !pip install bertopic ``` **For CUDA 13:** ```bash !pip install cuml-cu13 !pip install bertopic ``` !!! warning Install cuML first, then BERTopic. Installing both in a single command can fail due to pip resolver limitations with CUDA runtime dependencies. **Note:** cuML is already installed on Google Colab. For more detailed information on installing cuML, including additional dependencies and platform-specific instructions, see the [RAPIDS installation guide](https://docs.rapids.ai/install/). ================================================ FILE: docs/getting_started/ctfidf/ctfidf.md ================================================ # c-TF-IDF In BERTopic, in order to get an accurate representation of the topics from our bag-of-words matrix, TF-IDF was adjusted to work on a cluster/categorical/topic level instead of a document level. This adjusted TF-IDF representation is called **c-TF-IDF** and takes into account what makes the documents in one cluster different from documents in another cluster:
$$W_{x,c} = \|tf_{x,c}\| \times \log\left(1 + \frac{A}{f_x}\right)$$

Each cluster is converted to a single document instead of a set of documents. Then, we extract the frequency of word `x` in class `c`, where `c` refers to the cluster we created before. This results in our class-based `tf` representation. This representation is L1-normalized to account for the differences in topic sizes.

Then, we take the logarithm of one plus the average number of words per class `A` divided by the frequency of word `x` across all classes. We add one within the logarithm to force values to be positive. This results in our class-based `idf` representation. Like with the classic TF-IDF, we then multiply `tf` with `idf` to get the importance score per word in each class. In other words, the classical TF-IDF procedure is **not** used here but a modified version of the algorithm that allows for a much better representation. Since the topic representation is somewhat independent of the clustering step, we can change how the c-TF-IDF representation will look like. This can be in the form of parameter tuning, different weighting schemes, or using a diversity metric on top of it. This allows for some modularity concerning the weighting scheme:
![Image title](ctfidf.svg)
This class-based TF-IDF representation is enabled by default in BERTopic. However, we can explicitly pass it to BERTopic through the `ctfidf_model` allowing for parameter tuning and the customization of the topic extraction technique: ```python from bertopic import BERTopic from bertopic.vectorizers import ClassTfidfTransformer ctfidf_model = ClassTfidfTransformer() topic_model = BERTopic(ctfidf_model=ctfidf_model ) ``` ## **Parameters** There are two parameters worth exploring in the `ClassTfidfTransformer`, namely `bm25_weighting` and `reduce_frequent_words`. ### bm25_weighting The `bm25_weighting` is a boolean parameter that indicates whether a class-based BM-25 weighting measure is used instead of the default method as defined in the formula at the beginning of this page. Instead of using the following weighting scheme: $\log\left(1 + \frac{A}{f_x}\right)$ the class-based BM-25 weighting is used instead: $\log\left(1 + \frac{A - f_x + 0.5}{f_x + 0.5}\right)$ At smaller datasets, this variant can be more robust to stop words that appear in your data. It can be enabled as follows: ```python from bertopic import BERTopic from bertopic.vectorizers import ClassTfidfTransformer ctfidf_model = ClassTfidfTransformer(bm25_weighting=True) topic_model = BERTopic(ctfidf_model=ctfidf_model ) ``` ### reduce_frequent_words Some words appear quite often in every topic but are generally not considered stop words as found in the `CountVectorizer(stop_words="english")` list. To further reduce these frequent words, we can use `reduce_frequent_words` to take the square root of the term frequency after applying the weighting scheme. Instead of the default term frequency: $\|tf_{x,c}\|$ we take the square root of the term frequency after normalizing the frequency matrix: $\sqrt{\|tf_{x,c}\|}$ Although seemingly a small change, it can have quite a large effect on the number of stop words in the resulting topic representations. 
It can be enabled as follows: ```python from bertopic import BERTopic from bertopic.vectorizers import ClassTfidfTransformer ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True) topic_model = BERTopic(ctfidf_model=ctfidf_model ) ``` !!! tip Both parameters can be used simultaneously: `ClassTfidfTransformer(bm25_weighting=True, reduce_frequent_words=True)` ================================================ FILE: docs/getting_started/dim_reduction/dim_reduction.md ================================================ An important aspect of BERTopic is the dimensionality reduction of the input embeddings. As embeddings are often high in dimensionality, clustering becomes difficult due to the curse of dimensionality. A solution is to reduce the dimensionality of the embeddings to a workable dimensional space (e.g., 5) for clustering algorithms to work with. UMAP is used as a default in BERTopic since it can capture both the local and global high-dimensional space in lower dimensions. However, there are other solutions out there, such as PCA, that users might be interested in trying out. Since BERTopic assumes some independence between steps, we can use any other dimensionality reduction algorithm. The image below illustrates this modularity:
![Image title](dimensionality.svg)
As a result, the `umap_model` parameter in BERTopic now allows for a variety of dimensionality reduction models. To do so, the class should have the following attributes: * `.fit(X)` * A function that can be used to fit the model * `.transform(X)` * A transform function that transforms the input to a lower dimensional size In other words, it should have the following structure: ```python class DimensionalityReduction: def fit(self, X): return self def transform(self, X): return X ``` In this section, we will go through several examples of dimensionality reduction techniques and how they can be implemented. ## **UMAP** As a default, BERTopic uses UMAP to perform its dimensionality reduction. To use a UMAP model with custom parameters, we simply define it and pass it to BERTopic: ```python from bertopic import BERTopic from umap import UMAP umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine') topic_model = BERTopic(umap_model=umap_model) ``` Here, we can define any parameters in UMAP to optimize for the best performance based on whatever validation metrics you are using. ## **PCA** Although UMAP works quite well in BERTopic and is typically advised, you might want to be using PCA instead. It can be faster to train and perform inference. To use PCA, we can simply import it from `sklearn` and pass it to the `umap_model` parameter: ```python from bertopic import BERTopic from sklearn.decomposition import PCA dim_model = PCA(n_components=5) topic_model = BERTopic(umap_model=dim_model) ``` As a small note, PCA and k-Means have worked quite well in my experiments and might be interesting to use instead of UMAP and HDBSCAN. !!! note As you might have noticed, the `dim_model` is passed to `umap_model` which might be a bit confusing considering you are not passing a UMAP model. For now, the name of the parameter is kept the same to adhere to the current state of the API. 
Changing the name could lead to deprecation issues, which I want to prevent as much as possible. ## **Truncated SVD** Like PCA, there are a bunch more dimensionality reduction techniques in `sklearn` that you can be using. Here, we will demonstrate Truncated SVD but any model can be used as long as it has both a `.fit()` and `.transform()` method: ```python from bertopic import BERTopic from sklearn.decomposition import TruncatedSVD dim_model = TruncatedSVD(n_components=5) topic_model = BERTopic(umap_model=dim_model) ``` ## **cuML UMAP** Although the original UMAP implementation is an amazing technique, it may have difficulty handling large amounts of data. Instead, we can use [cuML](https://rapids.ai/start.html#rapids-release-selector) to speed up UMAP through GPU acceleration: ```python from bertopic import BERTopic from cuml.manifold import UMAP umap_model = UMAP(n_components=5, n_neighbors=15, min_dist=0.0) topic_model = BERTopic(umap_model=umap_model) ``` !!! note To install cuML with BERTopic, run these commands: **For CUDA 12:** ```bash !pip install cuml-cu12 !pip install bertopic ``` **For CUDA 13:** ```bash !pip install cuml-cu13 !pip install bertopic ``` !!! warning Install cuML first, then BERTopic. Installing both in a single command can fail due to pip resolver limitations with CUDA runtime dependencies. **Note:** cuML is already installed on Google Colab. For more detailed information on installing cuML, including additional dependencies and platform-specific instructions, see the [RAPIDS installation guide](https://docs.rapids.ai/install/). ## **Skip dimensionality reduction** Although BERTopic applies dimensionality reduction as a default in its pipeline, this is a step that you might want to skip. 
We generate an "empty" model that simply returns the data passed to it: ```python from bertopic import BERTopic from bertopic.dimensionality import BaseDimensionalityReduction # Fit BERTopic without actually performing any dimensionality reduction empty_dimensionality_model = BaseDimensionalityReduction() topic_model = BERTopic(umap_model=empty_dimensionality_model) ``` In other words, we go from this pipeline:
--8<-- "docs/getting_started/dim_reduction/default_pipeline.svg"

To the following pipeline:
--8<-- "docs/getting_started/dim_reduction/no_dimensionality.svg"

================================================ FILE: docs/getting_started/distribution/distribution.md ================================================ BERTopic approaches topic modeling as a cluster task and attempts to cluster semantically similar documents to extract common topics. A disadvantage of using such a method is that each document is assigned to a single cluster and therefore also a single topic. In practice, documents may contain a mixture of topics. This can be accounted for by splitting up the documents into sentences and feeding those to BERTopic. Another option is to use a cluster model that can perform soft clustering, like HDBSCAN. As BERTopic focuses on modularity, we may still want to model that mixture of topics even when we are using a hard-clustering model, like k-Means without the need to split up our documents. This is where `.approximate_distribution` comes in!
--8<-- "docs/getting_started/distribution/approximate_distribution.svg"

To perform this approximation, each document is split into tokens according to the provided tokenizer in the `CountVectorizer`. Then, a **sliding window** is applied on each document creating subsets of the document. For example, with a window size of 3 and stride of 1, the document: > Solving the right problem is difficult. can be split up into `solving the right`, `the right problem`, `right problem is`, and `problem is difficult`. These are called token sets. For each of these token sets, we calculate their c-TF-IDF representation and find out how similar they are to the previously generated topics. Then, the similarities to the topics for each token set are summed to create a topic distribution for the entire document. Although it is often said that documents can contain a mixture of topics, these are often modeled by assigning each word to a single topic. With this approach, we take into account that there may be multiple topics for a single word. We can make this multiple-topic word assignment a bit more accurate by then splitting these token sets up into individual tokens and assigning the topic distributions for each token set to each individual token. That way, we can visualize the extent to which a certain word contributes to a document's topic distribution. ## **Example** To calculate our topic distributions, we first need to fit a basic topic model: ```python from bertopic import BERTopic from sklearn.datasets import fetch_20newsgroups docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))['data'] topic_model = BERTopic().fit(docs) ``` After doing so, we can approximate the topic distributions for your documents: ```python topic_distr, _ = topic_model.approximate_distribution(docs) ``` The resulting `topic_distr` is a *n* x *m* matrix where *n* are the documents and *m* the topics. 
We can then visualize the distribution of topics in a document: ```python topic_model.visualize_distribution(topic_distr[1]) ``` Although a topic distribution is nice, we may want to see how each token contributes to a specific topic. To do so, we need to first calculate topic distributions on a token level and then visualize the results: ```python # Calculate the topic distributions on a token-level topic_distr, topic_token_distr = topic_model.approximate_distribution(docs, calculate_tokens=True) # Visualize the token-level distributions df = topic_model.visualize_approximate_distribution(docs[1], topic_token_distr[1]) df ```



!!! tip You can also approximate the topic distributions for unseen documents. It will not be as accurate as `.transform` but it is quite fast and can serve you well in a production setting. !!! note To get the stylized dataframe for `.visualize_approximate_distribution` you will need to have Jinja installed. If you do not have this installed, an unstylized dataframe will be returned instead. You can install Jinja via `pip install jinja2` ## **Parameters** There are a few parameters that are of interest which will be discussed below. ### **batch_size** Creating token sets for each document can result in quite a large list of token sets. The similarity of these token sets with the topics can result in a large matrix that might not fit into memory anymore. To circumvent this, we can process batches of documents instead to minimize the memory overload. The value for `batch_size` indicates the number of documents that will be processed at once: ```python topic_distr, _ = topic_model.approximate_distribution(docs, batch_size=500) ``` ### **window** The number of tokens that are combined into token sets is defined by the `window` parameter. Seeing as we are performing a sliding window, we can change the size of the window. A larger window takes more tokens into account but setting it too large can result in considering too much information. Personally, I like to have this window between 4 and 8: ```python topic_distr, _ = topic_model.approximate_distribution(docs, window=4) ``` ### **stride** The sliding window that is performed on a document shifts, as a default, 1 token to the right each time to create its token sets. As a result, especially with large windows, a single token gets judged several times. We can use the `stride` parameter to increase the number of tokens the window shifts to the right. By increasing this value, we are judging each token less frequently which often results in a much faster calculation. Combining this parameter with `window` is preferred. 
For example, if we have a very large dataset, we can set `stride=4` and `window=8` to judge token sets that contain 8 tokens but that are shifted with 4 steps each time. As a result, this increases the computational speed quite a bit: ```python topic_distr, _ = topic_model.approximate_distribution(docs, window=8, stride=4) ``` ### **use_embedding_model** As a default, we compare the c-TF-IDF calculations between the token sets and all topics. Due to its bag-of-word representation, this is quite fast. However, you might want to use the selected `embedding_model` instead to do this comparison. Do note that due to the many token sets, it is often computationally quite a bit slower: ```python topic_distr, _ = topic_model.approximate_distribution(docs, use_embedding_model=True) ``` ================================================ FILE: docs/getting_started/distribution/distribution_viz.html ================================================
================================================ FILE: docs/getting_started/embeddings/embeddings.md ================================================ # Embedding Models BERTopic starts with transforming our input documents into numerical representations. Although there are many ways this can be achieved, we typically use sentence-transformers (`"all-MiniLM-L6-v2"`) as it is quite capable of capturing the semantic similarity between documents. However, there is not one perfect embedding model and you might want to be using something entirely different for your use case. Since BERTopic assumes some independence among steps, we can allow for this modularity:
![Image title](embeddings.svg)
This modularity allows us not only to choose any embedding model to convert our documents into numerical representations, we can use essentially any data to perform our clustering. When new state-of-the-art pre-trained embedding models are released, BERTopic will be able to use them. As a result, BERTopic grows with any new models being released. Out of the box, BERTopic supports several embedding techniques. In this section, we will go through several of them and how they can be implemented. ## **Sentence Transformers** You can select any model from sentence-transformers [here](https://www.sbert.net/docs/pretrained_models.html) and pass it through BERTopic with `embedding_model`: ```python from bertopic import BERTopic topic_model = BERTopic(embedding_model="all-MiniLM-L6-v2") ``` Or select a SentenceTransformer model with your parameters: ```python from sentence_transformers import SentenceTransformer sentence_model = SentenceTransformer("all-MiniLM-L6-v2") topic_model = BERTopic(embedding_model=sentence_model) ``` !!! tip "Tip 1!" This embedding back-end was put here first for a reason, sentence-transformers works amazing out of the box! Playing around with different models can give you great results. Also, make sure to frequently visit [this](https://www.sbert.net/docs/pretrained_models.html) page as new models are often released. !!! tip "Tip 2!" New embedding models are released frequently and their performance keeps getting better. To keep track of the best embedding models out there, you can visit the [MTEB leaderboard](https://huggingface.co/spaces/mteb/leaderboard). It is an excellent place for selecting the embedding that works best for you. For example, if you want the best of the best, then the top 5 models might be the place to look. 
Many of these models can be used with `SentenceTransformers` in BERTopic, like so: ```python from sentence_transformers import SentenceTransformer embedding_model = SentenceTransformer("BAAI/bge-base-en-v1.5") topic_model = BERTopic(embedding_model=embedding_model) ``` ## **Model2Vec** To use a blazingly fast [Model2Vec](https://github.com/MinishLab/model2vec) model, you first need to install model2vec: ``` pip install model2vec ``` Then, you can load in any of their models and pass it to BERTopic like so: ```python from model2vec import StaticModel embedding_model = StaticModel.from_pretrained("minishlab/potion-base-8M") topic_model = BERTopic(embedding_model=embedding_model) ``` ### **Distillation** These models are extremely versatile and can be distilled from existing embedding model (like those compatible with `sentence-transformers`). This distillation process doesn't require a vocabulary (as it uses the tokenizer's vocabulary) but can benefit from having one. Fortunately, this allows you to use the vocabulary from your input documents to distill a model yourself. 
Doing so requires you to install some additional dependencies of model2vec like so: ``` pip install model2vec[distill] ``` To then distill common embedding models, you need to import the `Model2VecBackend` from BERTopic: ```python from bertopic.backend import Model2VecBackend # Choose a model to distill (a non-Model2Vec model) embedding_model = Model2VecBackend( "sentence-transformers/all-MiniLM-L6-v2", distill=True ) topic_model = BERTopic(embedding_model=embedding_model) ``` You can also choose a custom vectorizer for creating the vocabulary and define custom arguments for the distillation process: ```python from bertopic.backend import Model2VecBackend from sklearn.feature_extraction.text import CountVectorizer # Choose a model to distill (a non-Model2Vec model) embedding_model = Model2VecBackend( "sentence-transformers/all-MiniLM-L6-v2", distill=True, distill_kwargs={"pca_dims": 256, "apply_zipf": True, "use_subword": True}, distill_vectorizer=CountVectorizer(ngram_range=(1, 3)) ) topic_model = BERTopic(embedding_model=embedding_model) ``` !!! tip "Tip!" You can save the resulting model with `topic_model.embedding_model.embedding_model.save_pretrained("m2v_model")`. ## **🤗 Hugging Face Transformers** To use a Hugging Face transformers model, load in a pipeline and point to any model found on their model hub (https://huggingface.co/models): ```python from transformers.pipelines import pipeline embedding_model = pipeline("feature-extraction", model="distilbert-base-cased") topic_model = BERTopic(embedding_model=embedding_model) ``` !!! tip "Tip!" These transformers also work quite well using `sentence-transformers` which has great optimization tricks that make using it a bit faster. ## **Langchain** [Langchain](https://python.langchain.com/docs/introduction) allows you to use different embedding models supported by various cloud providers. On top of that, it supports various integrations to open source models. 
To get started: ```python from langchain_community.embeddings import HuggingFaceInstructEmbeddings from bertopic.backend import LangChainBackend hf_embedding = HuggingFaceInstructEmbeddings() langchain_embedder = LangChainBackend(hf_embedding) ``` To see what providers are being supported by Langchain, you can check the list [here](https://python.langchain.com/docs/integrations/providers/). For more information, you can have a look on [Langchain's Embedding Models](https://python.langchain.com/docs/integrations/text_embedding/). ## **Flair** [Flair](https://github.com/flairNLP/flair) allows you to choose almost any embedding model that is publicly available. Flair can be used as follows: ```python from flair.embeddings import TransformerDocumentEmbeddings roberta = TransformerDocumentEmbeddings('roberta-base') topic_model = BERTopic(embedding_model=roberta) ``` You can select any 🤗 transformers model [here](https://huggingface.co/models). Moreover, you can also use Flair to use word embeddings and pool them to create document embeddings. Under the hood, Flair simply averages all word embeddings in a document. Then, we can easily pass it to BERTopic to use those word embeddings as document embeddings: ```python from flair.embeddings import WordEmbeddings, DocumentPoolEmbeddings glove_embedding = WordEmbeddings('crawl') document_glove_embeddings = DocumentPoolEmbeddings([glove_embedding]) topic_model = BERTopic(embedding_model=document_glove_embeddings) ``` ## **Spacy** [Spacy](https://github.com/explosion/spaCy) is an amazing framework for processing text. There are many models available across many languages for modeling text. 
To use Spacy's non-transformer models in BERTopic: ```python import spacy nlp = spacy.load("en_core_web_md", exclude=['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']) topic_model = BERTopic(embedding_model=nlp) ``` Using spacy-transformer models: ```python import spacy spacy.prefer_gpu() nlp = spacy.load("en_core_web_trf", exclude=['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']) topic_model = BERTopic(embedding_model=nlp) ``` If you run into memory issues with spacy-transformer models, try: ```python import spacy from thinc.api import set_gpu_allocator, require_gpu nlp = spacy.load("en_core_web_trf", exclude=['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']) set_gpu_allocator("pytorch") require_gpu(0) topic_model = BERTopic(embedding_model=nlp) ``` ## **Universal Sentence Encoder (USE)** The Universal Sentence Encoder encodes text into high-dimensional vectors that are used here for embedding the documents. The model is trained and optimized for greater-than-word length text, such as sentences, phrases, or short paragraphs. Using USE in BERTopic is rather straightforward: ```python import tensorflow_hub embedding_model = tensorflow_hub.load("https://tfhub.dev/google/universal-sentence-encoder/4") topic_model = BERTopic(embedding_model=embedding_model) ``` ## **Gensim** BERTopic supports the `gensim.downloader` module, which allows it to download any word embedding model supported by Gensim. Typically, these are Glove, Word2Vec, or FastText embeddings: ```python import gensim.downloader as api ft = api.load('fasttext-wiki-news-subwords-300') topic_model = BERTopic(embedding_model=ft) ``` !!! tip "Tip!" Gensim is primarily used for Word Embedding models. This works typically best for short documents since the word embeddings are pooled. ## **Scikit-Learn Embeddings** Scikit-Learn is a framework for more than just machine learning. It offers many preprocessing tools, some of which can be used to create representations for text. 
Many of these tools are relatively lightweight and do not require a GPU. While the representations may be less expressive than many BERT models, the fact that it runs much faster can make it a relevant candidate to consider. If you have a scikit-learn compatible pipeline that you'd like to use to embed text then you can also pass this to BERTopic. ```python from sklearn.pipeline import make_pipeline from sklearn.decomposition import TruncatedSVD from sklearn.feature_extraction.text import TfidfVectorizer pipe = make_pipeline( TfidfVectorizer(), TruncatedSVD(100) ) topic_model = BERTopic(embedding_model=pipe) ``` !!! Warning One caveat to be aware of is that scikit-learn's base `Pipeline` class does not support the `.partial_fit()`-API. If you have a pipeline that theoretically should be able to support online learning then you might want to explore the [scikit-partial](https://github.com/koaning/scikit-partial) project. Moreover, since this backend does not generate representations on a word level, it does not support the `bertopic.representation` models. ## **OpenAI** To use OpenAI's external API, we need to define our key and explicitly call `bertopic.backend.OpenAIBackend` to be used in our topic model: ```python import openai from bertopic.backend import OpenAIBackend client = openai.OpenAI(api_key="sk-...") embedding_model = OpenAIBackend(client, "text-embedding-ada-002") topic_model = BERTopic(embedding_model=embedding_model) ``` ## **Cohere** To use Cohere's external API, we need to define our key and explicitly call `bertopic.backend.CohereBackend` to be used in our topic model: ```python import cohere from bertopic.backend import CohereBackend client = cohere.Client("MY_API_KEY") embedding_model = CohereBackend(client) topic_model = BERTopic(embedding_model=embedding_model) ``` ## **FastEmbed** [FastEmbed](https://qdrant.tech/documentation/fastembed/) is a lightweight python library for embedding generation and it supports popular embedding models. 
You can easily use it as in the example below: ```python from bertopic.backend import FastEmbedBackend embedding_model = FastEmbedBackend("BAAI/bge-small-en-v1.5") topic_model = BERTopic(embedding_model=embedding_model) ``` !!! tip "Tip!" Before you start, check the supported FastEmbed text embedding models [here](https://qdrant.github.io/fastembed/examples/Supported_Models/). ## **Multimodal** To create embeddings for both text and images in the same vector space, we can use the `MultiModalBackend`. This model uses a clip-vit based model that is capable of embedding text, images, or both: ```python from bertopic.backend import MultiModalBackend model = MultiModalBackend('clip-ViT-B-32', batch_size=32) # Embed documents only doc_embeddings = model.embed_documents(docs) # Embedding images only image_embeddings = model.embed_images(images) # Embed both images and documents, then average them doc_image_embeddings = model.embed(docs, images) ``` ## **Custom Backend** If your backend or model cannot be found in the ones currently available, you can use the `bertopic.backend.BaseEmbedder` class to create your backend. Below, you will find an example of creating a SentenceTransformer backend for BERTopic: ```python from bertopic.backend import BaseEmbedder from sentence_transformers import SentenceTransformer class CustomEmbedder(BaseEmbedder): def __init__(self, embedding_model): super().__init__() self.embedding_model = embedding_model def embed(self, documents, verbose=False): embeddings = self.embedding_model.encode(documents, show_progress_bar=verbose) return embeddings # Create custom backend embedding_model = SentenceTransformer("all-MiniLM-L6-v2") custom_embedder = CustomEmbedder(embedding_model=embedding_model) # Pass custom backend to bertopic topic_model = BERTopic(embedding_model=custom_embedder) ``` ## **Custom Embeddings** The base models in BERTopic are BERT-based models that work well with document similarity tasks. 
Your documents, however, might be too specific for a general pre-trained model to be used. Fortunately, you can use the embedding model in BERTopic to create document features. You only need to prepare the document embeddings yourself and pass them through `fit_transform` of BERTopic: ```python from sklearn.datasets import fetch_20newsgroups from sentence_transformers import SentenceTransformer # Prepare embeddings docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))['data'] sentence_model = SentenceTransformer("all-MiniLM-L6-v2") embeddings = sentence_model.encode(docs, show_progress_bar=False) # Train our topic model using our pre-trained sentence-transformers embeddings topic_model = BERTopic() topics, probs = topic_model.fit_transform(docs, embeddings) ``` As you can see above, we used a SentenceTransformer model to create the embedding. You could also have used `🤗 transformers`, `Doc2Vec`, or any other embedding method. ### **TF-IDF** As mentioned above, any embedding technique can be used. However, when running UMAP, the typical distance metric is `cosine` which does not work quite well for a TF-IDF matrix. Instead, BERTopic will recognize that a sparse matrix is passed and use `hellinger` instead which works quite well for the similarity between probability distributions. 
We simply create a TF-IDF matrix and use it as embeddings in our `fit_transform` method: ```python from sklearn.datasets import fetch_20newsgroups from sklearn.feature_extraction.text import TfidfVectorizer # Create TF-IDF sparse matrix docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))['data'] vectorizer = TfidfVectorizer(min_df=5) embeddings = vectorizer.fit_transform(docs) # Train our topic model using TF-IDF vectors topic_model = BERTopic(stop_words="english") topics, probs = topic_model.fit_transform(docs, embeddings) ``` Here, you will probably notice that creating the embeddings is quite fast whereas `fit_transform` is quite slow. This is to be expected as reducing the dimensionality of a large sparse matrix takes some time. The inverse is true when using transformer embeddings: creating the embeddings is slow whereas `fit_transform` is quite fast. ================================================ FILE: docs/getting_started/guided/guided.md ================================================ Guided Topic Modeling or Seeded Topic Modeling is a collection of techniques that guides the topic modeling approach by setting several seed topics to which the model will converge. These techniques allow the user to set a predefined number of topic representations that are sure to be in documents. For example, take an IT business that has a ticket system for the software their clients use. Those tickets may typically contain information about a specific bug regarding login issues that the IT business is aware of. To model that bug, we can create a seed topic representation containing the words `bug`, `login`, `password`, and `username`. By defining those words, a Guided Topic Modeling approach will try to converge at least one topic to those words.
--8<-- "docs/getting_started/guided/guided.svg"

Guided BERTopic has two main steps: First, we create embeddings for each seeded topic by joining them and passing them through the document embedder. These embeddings will be compared with the existing document embeddings through cosine similarity and assigned a label. If the document is most similar to a seeded topic, then it will get that topic's label. If it is most similar to the average document embedding, it will get the -1 label. These labels are then passed through UMAP to create a semi-supervised approach that should nudge the topic creation to the seeded topics. Second, we take all words in seed_topic_list and assign them a multiplier larger than 1. Those multipliers will be used to increase the IDF values of the words across all topics thereby increasing the likelihood that a seeded topic word will appear in a topic. This does, however, also increase the chance of an irrelevant topic having unrelated words. In practice, this should not be an issue since the IDF value is likely to remain low regardless of the multiplier. The multiplier is now a fixed value but may change to something more elegant, like taking the distribution of IDF values and its position into account when defining the multiplier. ### **Example** To demonstrate Guided BERTopic, we use the 20 Newsgroups dataset as our example. We have frequently used this dataset in BERTopic examples and we sometimes see a topic generated about health with words such as `drug` and `cancer` being important. However, due to the stochastic nature of UMAP, this topic is not always found. In order to guide BERTopic to that topic, we create a seed topic list that we pass through our model. However, there may be several other topics that we know should be in the documents. 
Let's also initialize those: ```python from bertopic import BERTopic from sklearn.datasets import fetch_20newsgroups docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))["data"] seed_topic_list = [["drug", "cancer", "drugs", "doctor"], ["windows", "drive", "dos", "file"], ["space", "launch", "orbit", "lunar"]] topic_model = BERTopic(seed_topic_list=seed_topic_list) topics, probs = topic_model.fit_transform(docs) ``` As you can see above, the `seed_topic_list` contains a list of topic representations. By defining the above topics BERTopic is more likely to model the defined seeded topics. However, BERTopic is merely nudged towards creating those topics. In practice, if the seeded topics do not exist or might be divided into smaller topics, then they will not be modeled. Thus, seed topics need to be accurate to accurately converge towards them. ================================================ FILE: docs/getting_started/hierarchicaltopics/hierarchical_topics.html ================================================
================================================ FILE: docs/getting_started/hierarchicaltopics/hierarchicaltopics.md ================================================ When tweaking your topic model, the number of topics that are generated has a large effect on the quality of the topic representations. Some topics could be merged and having an understanding of the effect will help you understand which topics should and which should not be merged. That is where hierarchical topic modeling comes in. It tries to model the possible hierarchical nature of the topics you have created to understand which topics are similar to each other. Moreover, you will have more insight into sub-topics that might exist in your data.
--8<-- "docs/getting_started/hierarchicaltopics/hierarchical.svg"

In BERTopic, we can approximate this potential hierarchy by making use of our topic-term matrix (c-TF-IDF matrix). This matrix contains information about the importance of every word in every topic and makes for a nice numerical representation of our topics. The smaller the distance between two c-TF-IDF representations, the more similar we assume they are. In practice, this process of merging topics is done through the hierarchical clustering capabilities of `scipy` (see [here](https://docs.scipy.org/doc/scipy/reference/cluster.hierarchy.html)). It allows for several linkage methods through which we can approximate our topic hierarchy. As a default, we are using the [ward](https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.ward.html#scipy.cluster.hierarchy.ward) but many others are available. Whenever we merge two topics, we can calculate the c-TF-IDF representation of these two merged by summing their bag-of-words representation. We assume that two sets of topics are merged and that all others are kept the same, regardless of their location in the hierarchy. This helps us isolate the potential effect of merging sets of topics. As a result, we can see the topic representation at each level in the tree. ## **Example** To demonstrate hierarchical topic modeling with BERTopic, we use the 20 Newsgroups dataset to see how the topics that we uncover are represented in the 20 categories of documents. 
First, we train a basic BERTopic model: ```python from bertopic import BERTopic from sklearn.datasets import fetch_20newsgroups docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))["data"] topic_model = BERTopic(verbose=True) topics, probs = topic_model.fit_transform(docs) ``` Next, we can use our fitted BERTopic model to extract possible hierarchies from our c-TF-IDF matrix: ```python hierarchical_topics = topic_model.hierarchical_topics(docs) ``` The resulting `hierarchical_topics` is a dataframe in which merged topics are described. For example, if you would merge two topics, what would the topic representation of the new topic be? ## **Linkage functions** When creating the potential hierarchical nature of topics, we use Scipy's ward `linkage` function as a default to generate the hierarchy. However, you might want to use a [different linkage function](https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html) for your use case, such as `single`, `complete`, `average`, `centroid`, or `median`. In BERTopic, you can define the linkage function yourself, including the distance function that you would like to use: ```python from scipy.cluster import hierarchy as sch from bertopic import BERTopic topic_model = BERTopic() topics, probs = topic_model.fit_transform(docs) # Hierarchical topics linkage_function = lambda x: sch.linkage(x, 'single', optimal_ordering=True) hierarchical_topics = topic_model.hierarchical_topics(docs, linkage_function=linkage_function) ``` ## **Visualizations** To visualize these results, we can start by running a familiar function, namely `topic_model.visualize_hierarchy`: ```python topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics) ``` If you **hover** over the black circles, you will see the topic representation at that level of the hierarchy. These representations help you understand the effect of merging certain topics. 
Some might be logical to merge whilst others might not. Moreover, we can now see which sub-topics can be found within certain larger themes. Although this gives a nice overview of the potential hierarchy, hovering over all black circles can be tiresome. Instead, we can use `topic_model.get_topic_tree` to create a text-based representation of this hierarchy. Although the general structure is more difficult to view, we can see better which topics could be logically merged: ```python >>> tree = topic_model.get_topic_tree(hierarchical_topics) >>> print(tree) . └─atheists_atheism_god_moral_atheist ├─atheists_atheism_god_atheist_argument │ ├─■──atheists_atheism_god_atheist_argument ── Topic: 21 │ └─■──br_god_exist_genetic_existence ── Topic: 124 └─■──moral_morality_objective_immoral_morals ── Topic: 29 ```
Click here to view the full tree. ```bash . ├─people_armenian_said_god_armenians │ ├─god_jesus_jehovah_lord_christ │ │ ├─god_jesus_jehovah_lord_christ │ │ │ ├─jehovah_lord_mormon_mcconkie_god │ │ │ │ ├─■──ra_satan_thou_god_lucifer ── Topic: 94 │ │ │ │ └─■──jehovah_lord_mormon_mcconkie_unto ── Topic: 78 │ │ │ └─jesus_mary_god_hell_sin │ │ │ ├─jesus_hell_god_eternal_heaven │ │ │ │ ├─hell_jesus_eternal_god_heaven │ │ │ │ │ ├─■──jesus_tomb_disciples_resurrection_john ── Topic: 69 │ │ │ │ │ └─■──hell_eternal_god_jesus_heaven ── Topic: 53 │ │ │ │ └─■──aaron_baptism_sin_law_god ── Topic: 89 │ │ │ └─■──mary_sin_maria_priest_conception ── Topic: 56 │ │ └─■──marriage_married_marry_ceremony_marriages ── Topic: 110 │ └─people_armenian_armenians_said_mr │ ├─people_armenian_armenians_said_israel │ │ ├─god_homosexual_homosexuality_atheists_sex │ │ │ ├─homosexual_homosexuality_sex_gay_homosexuals │ │ │ │ ├─■──kinsey_sex_gay_men_sexual ── Topic: 44 │ │ │ │ └─homosexuality_homosexual_sin_homosexuals_gay │ │ │ │ ├─■──gay_homosexual_homosexuals_sexual_cramer ── Topic: 50 │ │ │ │ └─■──homosexuality_homosexual_sin_paul_sex ── Topic: 27 │ │ │ └─god_atheists_atheism_moral_atheist │ │ │ ├─islam_quran_judas_islamic_book │ │ │ │ ├─■──jim_context_challenges_articles_quote ── Topic: 36 │ │ │ │ └─islam_quran_judas_islamic_book │ │ │ │ ├─■──islam_quran_islamic_rushdie_muslims ── Topic: 31 │ │ │ │ └─■──judas_scripture_bible_books_greek ── Topic: 33 │ │ │ └─atheists_atheism_god_moral_atheist │ │ │ ├─atheists_atheism_god_atheist_argument │ │ │ │ ├─■──atheists_atheism_god_atheist_argument ── Topic: 21 │ │ │ │ └─■──br_god_exist_genetic_existence ── Topic: 124 │ │ │ └─■──moral_morality_objective_immoral_morals ── Topic: 29 │ │ └─armenian_armenians_people_israel_said │ │ ├─armenian_armenians_israel_people_jews │ │ │ ├─tax_rights_government_income_taxes │ │ │ │ ├─■──rights_right_slavery_slaves_residence ── Topic: 106 │ │ │ │ └─tax_government_taxes_income_libertarians │ │ │ │ 
├─■──government_libertarians_libertarian_regulation_party ── Topic: 58 │ │ │ │ └─■──tax_taxes_income_billion_deficit ── Topic: 41 │ │ │ └─armenian_armenians_israel_people_jews │ │ │ ├─gun_guns_militia_firearms_amendment │ │ │ │ ├─■──blacks_penalty_death_cruel_punishment ── Topic: 55 │ │ │ │ └─■──gun_guns_militia_firearms_amendment ── Topic: 7 │ │ │ └─armenian_armenians_israel_jews_turkish │ │ │ ├─■──israel_israeli_jews_arab_jewish ── Topic: 4 │ │ │ └─■──armenian_armenians_turkish_armenia_azerbaijan ── Topic: 15 │ │ └─stephanopoulos_president_mr_myers_ms │ │ ├─■──serbs_muslims_stephanopoulos_mr_bosnia ── Topic: 35 │ │ └─■──myers_stephanopoulos_president_ms_mr ── Topic: 87 │ └─batf_fbi_koresh_compound_gas │ ├─■──reno_workers_janet_clinton_waco ── Topic: 77 │ └─batf_fbi_koresh_gas_compound │ ├─batf_koresh_fbi_warrant_compound │ │ ├─■──batf_warrant_raid_compound_fbi ── Topic: 42 │ │ └─■──koresh_batf_fbi_children_compound ── Topic: 61 │ └─■──fbi_gas_tear_bds_building ── Topic: 23 └─use_like_just_dont_new ├─game_team_year_games_like │ ├─game_team_games_25_year │ │ ├─game_team_games_25_season │ │ │ ├─window_printer_use_problem_mhz │ │ │ │ ├─mhz_wire_simms_wiring_battery │ │ │ │ │ ├─simms_mhz_battery_cpu_heat │ │ │ │ │ │ ├─simms_pds_simm_vram_lc │ │ │ │ │ │ │ ├─■──pds_nubus_lc_slot_card ── Topic: 119 │ │ │ │ │ │ │ └─■──simms_simm_vram_meg_dram ── Topic: 32 │ │ │ │ │ │ └─mhz_battery_cpu_heat_speed │ │ │ │ │ │ ├─mhz_cpu_speed_heat_fan │ │ │ │ │ │ │ ├─mhz_cpu_speed_heat_fan │ │ │ │ │ │ │ │ ├─■──fan_cpu_heat_sink_fans ── Topic: 92 │ │ │ │ │ │ │ │ └─■──mhz_speed_cpu_fpu_clock ── Topic: 22 │ │ │ │ │ │ │ └─■──monitor_turn_power_computer_electricity ── Topic: 91 │ │ │ │ │ │ └─battery_batteries_concrete_duo_discharge │ │ │ │ │ │ ├─■──duo_battery_apple_230_problem ── Topic: 121 │ │ │ │ │ │ └─■──battery_batteries_concrete_discharge_temperature ── Topic: 75 │ │ │ │ │ └─wire_wiring_ground_neutral_outlets │ │ │ │ │ ├─wire_wiring_ground_neutral_outlets │ │ │ │ │ │ 
├─wire_wiring_ground_neutral_outlets │ │ │ │ │ │ │ ├─■──leds_uv_blue_light_boards ── Topic: 66 │ │ │ │ │ │ │ └─■──wire_wiring_ground_neutral_outlets ── Topic: 120 │ │ │ │ │ │ └─scope_scopes_phone_dial_number │ │ │ │ │ │ ├─■──dial_number_phone_line_output ── Topic: 93 │ │ │ │ │ │ └─■──scope_scopes_motorola_generator_oscilloscope ── Topic: 113 │ │ │ │ │ └─celp_dsp_sampling_antenna_digital │ │ │ │ │ ├─■──antenna_antennas_receiver_cable_transmitter ── Topic: 70 │ │ │ │ │ └─■──celp_dsp_sampling_speech_voice ── Topic: 52 │ │ │ │ └─window_printer_xv_mouse_windows │ │ │ │ ├─window_xv_error_widget_problem │ │ │ │ │ ├─error_symbol_undefined_xterm_rx │ │ │ │ │ │ ├─■──symbol_error_undefined_doug_parse ── Topic: 63 │ │ │ │ │ │ └─■──rx_remote_server_xdm_xterm ── Topic: 45 │ │ │ │ │ └─window_xv_widget_application_expose │ │ │ │ │ ├─window_widget_expose_application_event │ │ │ │ │ │ ├─■──gc_mydisplay_draw_gxxor_drawing ── Topic: 103 │ │ │ │ │ │ └─■──window_widget_application_expose_event ── Topic: 25 │ │ │ │ │ └─xv_den_polygon_points_algorithm │ │ │ │ │ ├─■──den_polygon_points_algorithm_polygons ── Topic: 28 │ │ │ │ │ └─■──xv_24bit_image_bit_images ── Topic: 57 │ │ │ │ └─printer_fonts_print_mouse_postscript │ │ │ │ ├─printer_fonts_print_font_deskjet │ │ │ │ │ ├─■──scanner_logitech_grayscale_ocr_scanman ── Topic: 108 │ │ │ │ │ └─printer_fonts_print_font_deskjet │ │ │ │ │ ├─■──printer_print_deskjet_hp_ink ── Topic: 18 │ │ │ │ │ └─■──fonts_font_truetype_tt_atm ── Topic: 49 │ │ │ │ └─mouse_ghostscript_midi_driver_postscript │ │ │ │ ├─ghostscript_midi_postscript_files_file │ │ │ │ │ ├─■──ghostscript_postscript_pageview_ghostview_dsc ── Topic: 104 │ │ │ │ │ └─midi_sound_file_windows_driver │ │ │ │ │ ├─■──location_mar_file_host_rwrr ── Topic: 83 │ │ │ │ │ └─■──midi_sound_driver_blaster_soundblaster ── Topic: 98 │ │ │ │ └─■──mouse_driver_mice_ball_problem ── Topic: 68 │ │ │ └─game_team_games_25_season │ │ │ ├─1st_sale_condition_comics_hulk │ │ │ │ ├─sale_condition_offer_asking_cd │ │ │ │ 
│ ├─condition_stereo_amp_speakers_asking │ │ │ │ │ │ ├─■──miles_car_amfm_toyota_cassette ── Topic: 62 │ │ │ │ │ │ └─■──amp_speakers_condition_stereo_audio ── Topic: 24 │ │ │ │ │ └─games_sale_pom_cds_shipping │ │ │ │ │ ├─pom_cds_sale_shipping_cd │ │ │ │ │ │ ├─■──size_shipping_sale_condition_mattress ── Topic: 100 │ │ │ │ │ │ └─■──pom_cds_cd_sale_picture ── Topic: 37 │ │ │ │ │ └─■──games_game_snes_sega_genesis ── Topic: 40 │ │ │ │ └─1st_hulk_comics_art_appears │ │ │ │ ├─1st_hulk_comics_art_appears │ │ │ │ │ ├─lens_tape_camera_backup_lenses │ │ │ │ │ │ ├─■──tape_backup_tapes_drive_4mm ── Topic: 107 │ │ │ │ │ │ └─■──lens_camera_lenses_zoom_pouch ── Topic: 114 │ │ │ │ │ └─1st_hulk_comics_art_appears │ │ │ │ │ ├─■──1st_hulk_comics_art_appears ── Topic: 105 │ │ │ │ │ └─■──books_book_cover_trek_chemistry ── Topic: 125 │ │ │ │ └─tickets_hotel_ticket_voucher_package │ │ │ │ ├─■──hotel_voucher_package_vacation_room ── Topic: 74 │ │ │ │ └─■──tickets_ticket_june_airlines_july ── Topic: 84 │ │ │ └─game_team_games_season_hockey │ │ │ ├─game_hockey_team_25_550 │ │ │ │ ├─■──espn_pt_pts_game_la ── Topic: 17 │ │ │ │ └─■──team_25_game_hockey_550 ── Topic: 2 │ │ │ └─■──year_game_hit_baseball_players ── Topic: 0 │ │ └─bike_car_greek_insurance_msg │ │ ├─car_bike_insurance_cars_engine │ │ │ ├─car_insurance_cars_radar_engine │ │ │ │ ├─insurance_health_private_care_canada │ │ │ │ │ ├─■──insurance_health_private_care_canada ── Topic: 99 │ │ │ │ │ └─■──insurance_car_accident_rates_sue ── Topic: 82 │ │ │ │ └─car_cars_radar_engine_detector │ │ │ │ ├─car_radar_cars_detector_engine │ │ │ │ │ ├─■──radar_detector_detectors_ka_alarm ── Topic: 39 │ │ │ │ │ └─car_cars_mustang_ford_engine │ │ │ │ │ ├─■──clutch_shift_shifting_transmission_gear ── Topic: 88 │ │ │ │ │ └─■──car_cars_mustang_ford_v8 ── Topic: 14 │ │ │ │ └─oil_diesel_odometer_diesels_car │ │ │ │ ├─odometer_oil_sensor_car_drain │ │ │ │ │ ├─■──odometer_sensor_speedo_gauge_mileage ── Topic: 96 │ │ │ │ │ └─■──oil_drain_car_leaks_taillights ── 
Topic: 102 │ │ │ │ └─■──diesel_diesels_emissions_fuel_oil ── Topic: 79 │ │ │ └─bike_riding_ride_bikes_motorcycle │ │ │ ├─bike_ride_riding_bikes_lane │ │ │ │ ├─■──bike_ride_riding_lane_car ── Topic: 11 │ │ │ │ └─■──bike_bikes_miles_honda_motorcycle ── Topic: 19 │ │ │ └─■──countersteering_bike_motorcycle_rear_shaft ── Topic: 46 │ │ └─greek_msg_kuwait_greece_water │ │ ├─greek_msg_kuwait_greece_water │ │ │ ├─greek_msg_kuwait_greece_dog │ │ │ │ ├─greek_msg_kuwait_greece_dog │ │ │ │ │ ├─greek_kuwait_greece_turkish_greeks │ │ │ │ │ │ ├─■──greek_greece_turkish_greeks_cyprus ── Topic: 71 │ │ │ │ │ │ └─■──kuwait_iraq_iran_gulf_arabia ── Topic: 76 │ │ │ │ │ └─msg_dog_drugs_drug_food │ │ │ │ │ ├─dog_dogs_cooper_trial_weaver │ │ │ │ │ │ ├─■──clinton_bush_quayle_reagan_panicking ── Topic: 101 │ │ │ │ │ │ └─dog_dogs_cooper_trial_weaver │ │ │ │ │ │ ├─■──cooper_trial_weaver_spence_witnesses ── Topic: 90 │ │ │ │ │ │ └─■──dog_dogs_bike_trained_springer ── Topic: 67 │ │ │ │ │ └─msg_drugs_drug_food_chinese │ │ │ │ │ ├─■──msg_food_chinese_foods_taste ── Topic: 30 │ │ │ │ │ └─■──drugs_drug_marijuana_cocaine_alcohol ── Topic: 72 │ │ │ │ └─water_theory_universe_science_larsons │ │ │ │ ├─water_nuclear_cooling_steam_dept │ │ │ │ │ ├─■──rocketry_rockets_engines_nuclear_plutonium ── Topic: 115 │ │ │ │ │ └─water_cooling_steam_dept_plants │ │ │ │ │ ├─■──water_dept_phd_environmental_atmospheric ── Topic: 97 │ │ │ │ │ └─■──cooling_water_steam_towers_plants ── Topic: 109 │ │ │ │ └─theory_universe_larsons_larson_science │ │ │ │ ├─■──theory_universe_larsons_larson_science ── Topic: 54 │ │ │ │ └─■──oort_cloud_grbs_gamma_burst ── Topic: 80 │ │ │ └─helmet_kirlian_photography_lock_wax │ │ │ ├─helmet_kirlian_photography_leaf_mask │ │ │ │ ├─kirlian_photography_leaf_pictures_deleted │ │ │ │ │ ├─deleted_joke_stuff_maddi_nickname │ │ │ │ │ │ ├─■──joke_maddi_nickname_nicknames_frank ── Topic: 43 │ │ │ │ │ │ └─■──deleted_stuff_bookstore_joke_motto ── Topic: 81 │ │ │ │ │ 
└─■──kirlian_photography_leaf_pictures_aura ── Topic: 85 │ │ │ │ └─helmet_mask_liner_foam_cb │ │ │ │ ├─■──helmet_liner_foam_cb_helmets ── Topic: 112 │ │ │ │ └─■──mask_goalies_77_santore_tl ── Topic: 123 │ │ │ └─lock_wax_paint_plastic_ear │ │ │ ├─■──lock_cable_locks_bike_600 ── Topic: 117 │ │ │ └─wax_paint_ear_plastic_skin │ │ │ ├─■──wax_paint_plastic_scratches_solvent ── Topic: 65 │ │ │ └─■──ear_wax_skin_greasy_acne ── Topic: 116 │ │ └─m4_mp_14_mw_mo │ │ ├─m4_mp_14_mw_mo │ │ │ ├─■──m4_mp_14_mw_mo ── Topic: 111 │ │ │ └─■──test_ensign_nameless_deane_deanebinahccbrandeisedu ── Topic: 118 │ │ └─■──ites_cheek_hello_hi_ken ── Topic: 3 │ └─space_medical_health_disease_cancer │ ├─medical_health_disease_cancer_patients │ │ ├─■──cancer_centers_center_medical_research ── Topic: 122 │ │ └─health_medical_disease_patients_hiv │ │ ├─patients_medical_disease_candida_health │ │ │ ├─■──candida_yeast_infection_gonorrhea_infections ── Topic: 48 │ │ │ └─patients_disease_cancer_medical_doctor │ │ │ ├─■──hiv_medical_cancer_patients_doctor ── Topic: 34 │ │ │ └─■──pain_drug_patients_disease_diet ── Topic: 26 │ │ └─■──health_newsgroup_tobacco_vote_votes ── Topic: 9 │ └─space_launch_nasa_shuttle_orbit │ ├─space_moon_station_nasa_launch │ │ ├─■──sky_advertising_billboard_billboards_space ── Topic: 59 │ │ └─■──space_station_moon_redesign_nasa ── Topic: 16 │ └─space_mission_hst_launch_orbit │ ├─space_launch_nasa_orbit_propulsion │ │ ├─■──space_launch_nasa_propulsion_astronaut ── Topic: 47 │ │ └─■──orbit_km_jupiter_probe_earth ── Topic: 86 │ └─■──hst_mission_shuttle_orbit_arrays ── Topic: 60 └─drive_file_key_windows_use ├─key_file_jpeg_encryption_image │ ├─key_encryption_clipper_chip_keys │ │ ├─■──key_clipper_encryption_chip_keys ── Topic: 1 │ │ └─■──entry_file_ripem_entries_key ── Topic: 73 │ └─jpeg_image_file_gif_images │ ├─motif_graphics_ftp_available_3d │ │ ├─motif_graphics_openwindows_ftp_available │ │ │ ├─■──openwindows_motif_xview_windows_mouse ── Topic: 20 │ │ │ 
└─■──graphics_widget_ray_3d_available ── Topic: 95 │ │ └─■──3d_machines_version_comments_contact ── Topic: 38 │ └─jpeg_image_gif_images_format │ ├─■──gopher_ftp_files_stuffit_images ── Topic: 51 │ └─■──jpeg_image_gif_format_images ── Topic: 13 └─drive_db_card_scsi_windows ├─db_windows_dos_mov_os2 │ ├─■──copy_protection_program_software_disk ── Topic: 64 │ └─■──db_windows_dos_mov_os2 ── Topic: 8 └─drive_card_scsi_drives_ide ├─drive_scsi_drives_ide_disk │ ├─■──drive_scsi_drives_ide_disk ── Topic: 6 │ └─■──meg_sale_ram_drive_shipping ── Topic: 12 └─card_modem_monitor_video_drivers ├─■──card_monitor_video_drivers_vga ── Topic: 5 └─■──modem_port_serial_irq_com ── Topic: 10 ```
## **Merge topics** After seeing the potential hierarchy of your topic, you might want to merge specific topics. For example, if topic 1 is `1_space_launch_moon_nasa` and topic 2 is `2_spacecraft_solar_space_orbit` it might make sense to merge those two topics as they are quite similar in meaning. In BERTopic, you can use `.merge_topics` to manually select and merge those topics. Doing so will update their topic representation which in turn updates the entire model: ```python topics_to_merge = [1, 2] topic_model.merge_topics(docs, topics_to_merge) ``` If you have several groups of topics you want to merge, create a list of lists instead: ```python topics_to_merge = [[1, 2], [3, 4]] topic_model.merge_topics(docs, topics_to_merge) ``` ================================================ FILE: docs/getting_started/manual/manual.md ================================================ Although topic modeling is typically done by discovering topics in an unsupervised manner, there might be times when you already have a bunch of clusters or classes from which you want to model the topics. For example, the often used [20 NewsGroups dataset](https://scikit-learn.org/0.19/datasets/twenty_newsgroups.html) is already split up into 20 classes. Here, we might want to see how we can transform those 20 classes into 20 topics. Instead of using BERTopic to discover previously unknown topics, we are now going to manually pass them to BERTopic without actually learning them. We can view this as a manual topic modeling approach. There is no underlying algorithm for detecting these topics since you already have done that before. Whether that is simply because they are already available, like with the 20 NewsGroups dataset, or maybe because you have created clusters of documents before using packages like [human-learn](https://github.com/koaning/human-learn), [bulk](https://github.com/koaning/bulk), [thisnotthat](https://github.com/TutteInstitute/thisnotthat) or something entirely different. 
In other words, we can pass our labels to BERTopic and it will try to transform those labels into topics by running the c-TF-IDF representations on the set of documents within each label. This process allows us to model the topics themselves and similarly gives us the option to use everything BERTopic has to offer.
--8<-- "docs/getting_started/manual/pipeline.svg"

To do so, we need to skip over the dimensionality reduction and clustering steps since we already know the labels for our documents. We can use the documents and labels from the 20 NewsGroups dataset to create topics from those 20 labels: ```python from sklearn.datasets import fetch_20newsgroups # Get labeled data data = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes')) docs = data['data'] y = data['target'] ``` Then, we make sure to create empty instances of the dimensionality reduction and clustering steps. We pass those to BERTopic to simply skip over them and go to the topic representation process: ```python from bertopic import BERTopic from bertopic.backend import BaseEmbedder from bertopic.cluster import BaseCluster from bertopic.vectorizers import ClassTfidfTransformer from bertopic.dimensionality import BaseDimensionalityReduction # Prepare our empty sub-models and reduce frequent words while we are at it. empty_embedding_model = BaseEmbedder() empty_dimensionality_model = BaseDimensionalityReduction() empty_cluster_model = BaseCluster() ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True) # Fit BERTopic without actually performing any clustering topic_model= BERTopic( embedding_model=empty_embedding_model, umap_model=empty_dimensionality_model, hdbscan_model=empty_cluster_model, ctfidf_model=ctfidf_model ) topics, probs = topic_model.fit_transform(docs, y=y) ``` Let's take a look at a few topics that we get out of training this way by running `topic_model.get_topic_info()`:
--8<-- "docs/getting_started/manual/table.svg"

We can see several interesting topics appearing here. They seem to relate to the 20 classes we had as input. Now, let's map those topics to our original classes to view their relationship: ```python # Map input `y` to topics mappings = topic_model.topic_mapper_.get_mappings() mappings = {value: data["target_names"][key] for key, value in mappings.items()} # Assign original classes to our topics df = topic_model.get_topic_info() df["Class"] = df.Topic.map(mappings) df ```
--8<-- "docs/getting_started/manual/table_classes.svg"

We can see that the c-TF-IDF representations nicely extract the words that give a nice representation of our input classes. This is all done without actually embedding and clustering the data. As a result, the entire "training" process only takes a couple of seconds. Moreover, we can still perform BERTopic-specific features like dynamic topic modeling, topics per class, hierarchical topic modeling, modeling topic distributions, etc. !!! note The resulting `topics` may be a different mapping from the `y` labels. To map `y` to `topics`, we can run the following: ```python mappings = topic_model.topic_mapper_.get_mappings() y_mapped = [mappings[val] for val in y] ``` ================================================ FILE: docs/getting_started/merge/merge.md ================================================ # Merge Multiple Fitted Models After you have trained a new BERTopic model on your data, new data might still be coming in. Although you can use [online BERTopic](https://maartengr.github.io/BERTopic/getting_started/online/online.html), you might prefer to use the default HDBSCAN and UMAP models even though they do not support incremental learning out of the box. Instead, you can train a new BERTopic model on incoming data and merge it with your base model to detect whether new topics have appeared in the unseen documents. This is a great way of detecting whether your new model contains information that was not previously found in your base topic model. Similarly, you might want to train multiple BERTopic models using different sets of settings, even though they might all be using the same underlying embedding model. Merging these models would also allow for a single model that you can use throughout your use cases. Lastly, this method also allows for a degree of `federated learning` where each node trains a topic model, all of which are aggregated in a central server.
## **Example** To demonstrate merging different topic models with BERTopic, we use the ArXiv paper abstracts to see which topics they generally contain. First, we train three separate models on different parts of the data: ```python from umap import UMAP from bertopic import BERTopic from datasets import load_dataset dataset = load_dataset("CShorten/ML-ArXiv-Papers")["train"] # Extract abstracts to train on and corresponding titles abstracts_1 = dataset["abstract"][:5_000] abstracts_2 = dataset["abstract"][5_000:10_000] abstracts_3 = dataset["abstract"][10_000:15_000] # Create topic models umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42) topic_model_1 = BERTopic(umap_model=umap_model, min_topic_size=20).fit(abstracts_1) topic_model_2 = BERTopic(umap_model=umap_model, min_topic_size=20).fit(abstracts_2) topic_model_3 = BERTopic(umap_model=umap_model, min_topic_size=20).fit(abstracts_3) ``` Then, we can combine all three models into one with `.merge_models`: ```python # Combine all models into one merged_model = BERTopic.merge_models([topic_model_1, topic_model_2, topic_model_3]) ``` When we inspect the first model, we can see it has 52 topics: ```python >>> len(topic_model_1.get_topic_info()) 52 ``` Now, when we inspect the merged model, we can see it has 57 topics: ```python >>> len(merged_model.get_topic_info()) 57 ``` It seems that by merging these three models, there were 5 undiscovered topics that we could add to the very first model. !!! Note Note that the models are merged sequentially. This means that the comparison starts with `topic_model_1` and that each new topic from `topic_model_2` and `topic_model_3` will be added to `topic_model_1`. We can check the newly added topics in the `merged_model` by simply looking at the 5 latest topics that were added. The order of topics from `topic_model_1` remains the same. All new topics are simply added on top of them.
Let's inspect them: ```python >>> merged_model.get_topic_info().tail(5) ``` | | Topic | Count | Name | Representation | Representative_Docs | |---:|--------:|--------:|:---------------------------------------|:-----------------------------------------------------------------------------------------------------------------------|----------------------:| | 52 | 51 | 47 | 50_activity_mobile_wearable_sensors | ['activity', 'mobile', 'wearable', 'sensors', 'falls', 'human', 'phone', 'recognition', 'activities', 'accelerometer'] | nan | | 53 | 52 | 48 | 25_music_musical_audio_chord | ['music', 'musical', 'audio', 'chord', 'and', 'we', 'to', 'that', 'of', 'for'] | nan | | 54 | 53 | 32 | 36_fairness_discrimination_fair_groups | ['fairness', 'discrimination', 'fair', 'groups', 'protected', 'decision', 'we', 'of', 'classifier', 'to'] | nan | | 55 | 54 | 30 | 38_traffic_driver_prediction_flow | ['traffic', 'driver', 'prediction', 'flow', 'trajectory', 'the', 'and', 'congestion', 'of', 'transportation'] | nan | | 56 | 55 | 22 | 50_spiking_neurons_networks_learning | ['spiking', 'neurons', 'networks', 'learning', 'neural', 'snn', 'dynamics', 'plasticity', 'snns', 'of'] | nan | It seems that topics about activity, music, fairness, traffic, and spiking networks were added to the base topic model! Two things that you might have noticed. First, the representative documents were not added to the model. This is because of privacy reasons, you might want to combine models that were trained on different stations which would allow for a degree of `federated learning`. Second, the names of the new topics contain topic ids that refer to one of the old models. They were purposefully left this way so that the user can identify which topics were newly added which you could inspect in the original models. ## **min_similarity** The way the models are merged is through comparison of their topic embeddings. 
If topics between models are similar enough, then they will be regarded as the same topics and the topic of the first model in the list will be chosen. However, if topics between models are dissimilar enough, then the topic of the latter model will be added to the former. This (dis)similarity can be tweaked using the `min_similarity` parameter. Increasing this value will increase the chance of adding new topics. In contrast, decreasing this value will make it more strict and therefore decrease the chance of adding new topics. The value is set to `0.7` by default, so let's see what happens if we were to increase this value to `0.9`: ```python # Combine all models into one merged_model = BERTopic.merge_models([topic_model_1, topic_model_2, topic_model_3], min_similarity=0.9) ``` When we inspect the number of topics in our new model, we can see that they have increased quite a bit: ```python >>> len(merged_model.get_topic_info()) 102 ``` This demonstrates the influence of `min_similarity` on the number of new topics that are added to the base model. ================================================ FILE: docs/getting_started/multiaspect/multiaspect.md ================================================ During the development of BERTopic, many different types of representations can be created, from keywords and phrases to summaries and custom labels. There is a variety of techniques that one can choose from to represent a topic. As such, there are a number of interesting and creative ways one can summarize topics. A topic is more than just a single representation. Therefore, `multi-aspect topic modeling` is introduced! During the `.fit` or `.fit_transform` stages, you can now get multiple representations of a single topic. In practice, it works by generating and storing all kinds of different topic representations (see image below).
![Image title](multiaspect.svg)
The approach is rather straightforward. We might want to represent our topics using a `PartOfSpeech` representation model but we might also want to try out `KeyBERTInspired` and compare those representation models. We can do this as follows: ```python from bertopic.representation import KeyBERTInspired from bertopic.representation import PartOfSpeech from bertopic.representation import MaximalMarginalRelevance from sklearn.datasets import fetch_20newsgroups # Documents to train on docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))['data'] # The main representation of a topic main_representation = KeyBERTInspired() # Additional ways of representing a topic aspect_model1 = PartOfSpeech("en_core_web_sm") aspect_model2 = [KeyBERTInspired(top_n_words=30), MaximalMarginalRelevance(diversity=.5)] # Add all models together to be run in a single `fit` representation_model = { "Main": main_representation, "Aspect1": aspect_model1, "Aspect2": aspect_model2 } topic_model = BERTopic(representation_model=representation_model).fit(docs) ``` As shown above, to perform multi-aspect topic modeling, we make sure that `representation_model` is a dictionary where each representation model pipeline is defined. The main pipeline, that is used in most visualization options, is defined with the `"Main"` key. All other aspects can be defined however you want. In the example above, the two additional aspects that we are interested in are defined as `"Aspect1"` and `"Aspect2"`. After we have fitted our model, we can access all representations with `topic_model.get_topic_info()`:



As you can see, there are a number of different representations for our topics that we can inspect. All aspects are found in `topic_model.topic_aspects_`. ================================================ FILE: docs/getting_started/multimodal/multimodal.md ================================================ Documents or text are often accompanied by imagery or the other way around. For example, social media images with captions and products with descriptions. Topic modeling has traditionally focused on creating topics from textual representations. However, as more multimodal representations are created, the need for multimodal topics increases. BERTopic can perform **multimodal topic modeling** in a number of ways during `.fit` and `.fit_transform` stages. ## **Text + Images** The most basic example of multimodal topic modeling in BERTopic is when you have images that accompany your documents. This means that it is expected that each document has an image and vice versa. Instagram pictures, for example, almost always have some descriptions to them.
![Image title](images_and_text.svg)
In this example, we are going to use images from `flickr` that each have a caption associated to it: ```python # NOTE: This requires the `datasets` package which you can # install with `pip install datasets` from datasets import load_dataset ds = load_dataset("maderix/flickr_bw_rgb") images = ds["train"]["image"] docs = ds["train"]["caption"] ``` The `docs` variable contains the captions for each image in `images`. We can now use these variables to run our multimodal example: !!! Tip Do note that it is better to pass the paths of the images instead of the images themselves as there is no need to keep all images in memory. When passing the paths of the images, they are only opened temporarily when they are needed. ```python from bertopic import BERTopic from bertopic.representation import VisualRepresentation # Additional ways of representing a topic visual_model = VisualRepresentation() # Make sure to add the `visual_model` to a dictionary representation_model = { "Visual_Aspect": visual_model, } topic_model = BERTopic(representation_model=representation_model, verbose=True) ``` In this example, we are clustering the documents and are then looking for the best matching images to the resulting clusters. We can now access our image representations for each topic with `topic_model.topic_aspects_["Visual_Aspect"]`. If you want an overview of the topic images together with their textual representations in jupyter, you can run the following: ```python import base64 from io import BytesIO from IPython.display import HTML def image_base64(im): if isinstance(im, str): im = get_thumbnail(im) with BytesIO() as buffer: im.save(buffer, 'jpeg') return base64.b64encode(buffer.getvalue()).decode() def image_formatter(im): return f'' # Extract dataframe df = topic_model.get_topic_info().drop("Representative_Docs", 1).drop("Name", 1) # Visualize the images HTML(df.to_html(formatters={'Visual_Aspect': image_formatter}, escape=False)) ```



!!! Tip In the example above, we are clustering the documents but since you have images, you might want to cluster those or cluster an aggregation of both images and documents. For that, you can use the new `MultiModalBackend` to generate embeddings: ```python from bertopic.backend import MultiModalBackend model = MultiModalBackend('clip-ViT-B-32', batch_size=32) # Embed documents only doc_embeddings = model.embed_documents(docs) # Embedding images only image_embeddings = model.embed_images(images) # Embed both images and documents, then average them doc_image_embeddings = model.embed(docs, images) ``` ## **Images Only** Traditional topic modeling techniques can only be run on textual data, as is shown in the example above. However, there are plenty of cases where textual data is not available but images are. BERTopic allows topic modeling to be performed using only images as your input data.
![Image title](images_only.svg)
To run BERTopic on images only, we first need to embed our images and then define a model that convert images to text. To do so, we are going to need some images. We will take the same images as the above but instead save them locally and pass the paths to the images instead. As mentioned before, this will make sure that we do not hold too many images in memory whilst only a small subset is needed: ```python import os import glob import zipfile import numpy as np import pandas as pd from tqdm import tqdm from sentence_transformers import util # Flickr 8k images img_folder = 'photos/' caps_folder = 'captions/' if not os.path.exists(img_folder) or len(os.listdir(img_folder)) == 0: os.makedirs(img_folder, exist_ok=True) if not os.path.exists('Flickr8k_Dataset.zip'): #Download dataset if does not exist util.http_get('https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_Dataset.zip', 'Flickr8k_Dataset.zip') util.http_get('https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_text.zip', 'Flickr8k_text.zip') for folder, file in [(img_folder, 'Flickr8k_Dataset.zip'), (caps_folder, 'Flickr8k_text.zip')]: with zipfile.ZipFile(file, 'r') as zf: for member in tqdm(zf.infolist(), desc='Extracting'): zf.extract(member, folder) images = list(glob.glob('photos/Flicker8k_Dataset/*.jpg')) ``` Next, we can run our pipeline: ```python from bertopic.representation import KeyBERTInspired, VisualRepresentation from bertopic.backend import MultiModalBackend # Image embedding model embedding_model = MultiModalBackend('clip-ViT-B-32', batch_size=32) # Image to text representation model representation_model = { "Visual_Aspect": VisualRepresentation(image_to_text_model="nlpconnect/vit-gpt2-image-captioning") } ``` Using these models, we can run our pipeline: ```python from bertopic import BERTopic # Train our model with images only topic_model = BERTopic(embedding_model=embedding_model, representation_model=representation_model, min_topic_size=30) 
topics, probs = topic_model.fit_transform(documents=None, images=images) ``` We can now access our image representations for each topic with `topic_model.topic_aspects_["Visual_Aspect"]`. If you want an overview of the topic images together with their textual representations in jupyter, you can run the following: ```python import base64 from io import BytesIO from IPython.display import HTML def image_base64(im): if isinstance(im, str): im = get_thumbnail(im) with BytesIO() as buffer: im.save(buffer, 'jpeg') return base64.b64encode(buffer.getvalue()).decode() def image_formatter(im): return f'' # Extract dataframe df = topic_model.get_topic_info().drop("Representative_Docs", 1).drop("Name", 1) # Visualize the images HTML(df.to_html(formatters={'Visual_Aspect': image_formatter}, escape=False)) ```



================================================ FILE: docs/getting_started/online/online.md ================================================ Online topic modeling (sometimes called "incremental topic modeling") is the ability to learn incrementally from a mini-batch of instances. Essentially, it is a way to update your topic model with data on which it was not trained before. In Scikit-Learn, this technique is often modeled through a `.partial_fit` function, which is also used in BERTopic. !!! Tip Another method for online topic modeling can be found with the [**.merge_models**](https://maartengr.github.io/BERTopic/getting_started/merge/merge.html) functionality of BERTopic. It allows for merging multiple BERTopic models to create a single new one. This method can be used to discover new topics by training a new model and exploring whether that new model added new topics to the original model when merging. A major benefit, compared to `.partial_fit` is that you can keep using the original UMAP and HDBSCAN models which tends to result in improved performance and gives you significantly more flexibility. In BERTopic, there are three main goals for using this technique. * To reduce the memory necessary for training a topic model. * To continuously update the topic model as new data comes in. * To continuously find new topics as new data comes in. In BERTopic, online topic modeling can be a bit tricky as there are several steps involved in which online learning needs to be made available. To recap, BERTopic consists of the following 6 steps: 1. Extract embeddings 2. Reduce dimensionality 3. Cluster reduced embeddings 4. Tokenize topics 5. Extract topic words 6. (Optional) Fine-tune topic words For some steps, an online variant is more important than others. Typically, in step 1 we use pre-trained language models that are in less need of continuous updates.
This means that we can use an embedding model like Sentence-Transformers for extracting the embeddings and still use it in an online setting. Similarly, steps 5 and 6 do not necessarily need online variants since they are built upon step 4, tokenization. If that tokenization is by itself incremental, then so will steps 5 and 6.
--8<-- "docs/getting_started/online/online.svg"

This means that we will need online variants for steps 2 through 4. Steps 2 and 3, dimensionality reduction and clustering, can be modeled through the use of Scikit-Learn's `.partial_fit` function. In other words, it supports any algorithm that can be trained using `.partial_fit` since these algorithms can be trained incrementally. For example, incremental dimensionality reduction can be achieved using Scikit-Learn's `IncrementalPCA` and incremental clustering with `MiniBatchKMeans`. Lastly, we need to develop an online variant for step 4, tokenization. In this step, a Bag-of-words representation is created through the `CountVectorizer`. However, as new data comes in, its vocabulary will need to be updated. For that purpose, `bertopic.vectorizers.OnlineCountVectorizer` was created that not only updates out-of-vocabulary words but also implements decay and cleaning functions to prevent the sparse bag-of-words matrix from becoming too large. Most notably, the `decay` parameter is a value between 0 and 1 that sets the percentage by which the frequencies of the previous bag-of-words matrix should be reduced. For example, a value of `.1` will decrease the frequencies in the bag-of-words matrix by 10% at each iteration. This will make sure that recent data has more weight than previous iterations. Similarly, `delete_min_df` will remove certain words from its vocabulary if their frequency is lower than a set value. This ties together with the `decay` parameter as some words will decay over time if not used. For more information regarding the `OnlineCountVectorizer`, please see the [vectorizers documentation](https://maartengr.github.io/BERTopic/getting_started/vectorizers/vectorizers.html#onlinecountvectorizer). ## **Example** Online topic modeling in BERTopic is rather straightforward. We first need to have our documents split into chunks such that we can train and update our topic model incrementally.
```python from sklearn.datasets import fetch_20newsgroups # Prepare documents all_docs = fetch_20newsgroups(subset=subset, remove=('headers', 'footers', 'quotes'))["data"] doc_chunks = [all_docs[i:i+1000] for i in range(0, len(all_docs), 1000)] ``` Here, we created chunks of 1000 documents to be fed in BERTopic. Then, we will need to define several sub-models that support online learning. Specifically, we are going to be using `IncrementalPCA`, `MiniBatchKMeans`, and the `OnlineCountVectorizer`: ```python from sklearn.cluster import MiniBatchKMeans from sklearn.decomposition import IncrementalPCA from bertopic.vectorizers import OnlineCountVectorizer # Prepare sub-models that support online learning umap_model = IncrementalPCA(n_components=5) cluster_model = MiniBatchKMeans(n_clusters=50, random_state=0) vectorizer_model = OnlineCountVectorizer(stop_words="english", decay=.01) ``` After having defined our sub-models, we can start training our topic model incrementally by looping over our document chunks: ```python from bertopic import BERTopic topic_model = BERTopic(umap_model=umap_model, hdbscan_model=cluster_model, vectorizer_model=vectorizer_model) # Incrementally fit the topic model by training on 1000 documents at a time for docs in doc_chunks: topic_model.partial_fit(docs) ``` And that is it! During each iteration, you can access the predicted topics through the `.topics_` attribute. !!! note Do note that in BERTopic it is not possible to use `.partial_fit` after the `.fit` as they work quite differently concerning internally updating topics, frequencies, representations, etc. !!! tip Tip You can use any other dimensionality reduction and clustering algorithm as long as they have a `.partial_fit` function. Moreover, you can use dimensionality reduction algorithms that do not support `.partial_fit` functions but do have a `.fit` function to first train it on a large amount of data and then continuously add documents. 
The dimensionality reduction will not be updated but may be trained sufficiently to properly reduce the embeddings without the need to continuously add documents. !!! warning Only the most recent batch of documents is tracked. If you want to be using online topic modeling for low-memory use cases, then it is advised to also update the `.topics_` attribute. Otherwise, variations such as **hierarchical topic modeling** will not work. ```python # Incrementally fit the topic model by training on 1000 documents at a time and track the topics in each iteration topics = [] for docs in doc_chunks: topic_model.partial_fit(docs) topics.extend(topic_model.topics_) topic_model.topics_ = topics ``` ## **River** To continuously find new topics as they come in, we can use the package [river](https://github.com/online-ml/river). It contains several clustering models that can create new clusters as new data comes in. To make sure we can use their models, we first need to create a class that has a `.partial_fit` function and the option to extract labels through `.labels_`: ```python from river import stream from river import cluster class River: def __init__(self, model): self.model = model def partial_fit(self, umap_embeddings): for umap_embedding, _ in stream.iter_array(umap_embeddings): self.model.learn_one(umap_embedding) labels = [] for umap_embedding, _ in stream.iter_array(umap_embeddings): label = self.model.predict_one(umap_embedding) labels.append(label) self.labels_ = labels return self ``` Then, we can choose any `river.cluster` model that we are interested in and pass it to the `River` class before using it in BERTopic: ```python # Using DBSTREAM to detect new topics as they come in cluster_model = River(cluster.DBSTREAM()) vectorizer_model = OnlineCountVectorizer(stop_words="english") ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True, bm25_weighting=True) # Prepare model topic_model = BERTopic( hdbscan_model=cluster_model, 
vectorizer_model=vectorizer_model, ctfidf_model=ctfidf_model, ) # Incrementally fit the topic model by training on 1000 documents at a time for docs in doc_chunks: topic_model.partial_fit(docs) ``` ================================================ FILE: docs/getting_started/outlier_reduction/fig_base.html ================================================
================================================ FILE: docs/getting_started/outlier_reduction/fig_reduced.html ================================================
================================================ FILE: docs/getting_started/outlier_reduction/outlier_reduction.md ================================================ When using HDBSCAN, DBSCAN, or OPTICS, a number of outlier documents might be created that do not fall within any of the created topics. These are labeled as -1. Depending on your use case, you might want to decrease the number of documents that are labeled as outliers. Fortunately, there are a number of strategies one might use to reduce the number of outliers after you have trained your BERTopic model. The main way to reduce your outliers in BERTopic is by using the `.reduce_outliers` function. To make it work without too much tweaking, you will only need to pass the `docs` and their corresponding `topics`. You can pass outlier and non-outlier documents together since it will only try to reduce outlier documents and label them to a non-outlier topic. The following is a minimal example: ```python from bertopic import BERTopic # Train your BERTopic model topic_model = BERTopic() topics, probs = topic_model.fit_transform(docs) # Reduce outliers new_topics = topic_model.reduce_outliers(docs, topics) ``` !!! note You can use the `threshold` parameter to select the minimum distance or similarity when matching outlier documents with non-outlier topics. This allows the user to change the number of outlier documents that are assigned to non-outlier topics. ## **Strategies** The default method for reducing outliers is by calculating the c-TF-IDF representations of outlier documents and assigning them to the best matching c-TF-IDF representations of non-outlier topics.
However, there are a number of other strategies one can use, either separately or in conjunction that are worthwhile to explore: * Using the topic-document probabilities to assign topics * Using the topic-document distributions to assign topics * Using c-TF-IDF representations to assign topics * Using document and topic embeddings to assign topics ### **Probabilities** This strategy uses the soft-clustering as performed by HDBSCAN to find the best matching topic for each outlier document. To use this, make sure to calculate the `probabilities` beforehand by instantiating BERTopic with `calculate_probabilities=True`. ```python from bertopic import BERTopic # Train your BERTopic model and calculate the document-topic probabilities topic_model = BERTopic(calculate_probabilities=True) topics, probs = topic_model.fit_transform(docs) # Reduce outliers using the `probabilities` strategy new_topics = topic_model.reduce_outliers(docs, topics, probabilities=probs, strategy="probabilities") ``` ### **Topic Distributions** Use the topic distributions, as calculated with `.approximate_distribution` to find the most frequent topic in each outlier document. You can use the `distributions_params` variable to tweak the parameters of `.approximate_distribution`. ```python from bertopic import BERTopic # Train your BERTopic model topic_model = BERTopic() topics, probs = topic_model.fit_transform(docs) # Reduce outliers using the `distributions` strategy new_topics = topic_model.reduce_outliers(docs, topics, strategy="distributions") ``` ### **c-TF-IDF** Calculate the c-TF-IDF representation for each outlier document and find the best matching c-TF-IDF topic representation using cosine similarity. 
```python from bertopic import BERTopic # Train your BERTopic model topic_model = BERTopic() topics, probs = topic_model.fit_transform(docs) # Reduce outliers using the `c-tf-idf` strategy new_topics = topic_model.reduce_outliers(docs, topics, strategy="c-tf-idf") ``` ### **Embeddings** Using the embeddings of each outlier documents, find the best matching topic embedding using cosine similarity. ```python from bertopic import BERTopic # Train your BERTopic model topic_model = BERTopic() topics, probs = topic_model.fit_transform(docs) # Reduce outliers using the `embeddings` strategy new_topics = topic_model.reduce_outliers(docs, topics, strategy="embeddings") ``` !!! note If you have pre-calculated the documents embeddings you can speed up the outlier reduction process for the `"embeddings"` strategy as it will prevent re-calculating the document embeddings. ### **Chain Strategies** Since the `.reduce_outliers` function does not internally update the topics, we can easily try out different strategies but also chain them together. You might want to do a first pass with the `"c-tf-idf"` strategy as it is quite fast. Then, we can perform the `"distributions"` strategy on the outliers that are left since this method is typically much slower: ```python # Use the "c-TF-IDF" strategy with a threshold new_topics = topic_model.reduce_outliers(docs, topics , strategy="c-tf-idf", threshold=0.1) # Reduce all outliers that are left with the "distributions" strategy new_topics = topic_model.reduce_outliers(docs, new_topics, strategy="distributions") ``` ## **Update Topics** After generating our updated topics, we can feed them back into BERTopic in one of two ways. We can either update the topic representations themselves based on the documents that now belong to new topics or we can only update the topic frequency without updating the topic representations themselves. !!! 
warning In both cases, it is important to realize that updating the topics this way may lead to errors if topic reduction or topic merging techniques are used afterwards. The reason for this is that when you assign a -1 document to topic 1 and another -1 document to topic 2, it is unclear how you map the -1 documents. Is it matched to topic 1 or 2? ### **Update Topic Representation** When outlier documents are generated, they are not used when modeling the topic representations. These documents are completely ignored when finding good descriptions of topics. Thus, after having reduced the number of outliers in your topic model, you might want to update the topic representations with the documents that now belong to actual topics. To do so, we can make use of the `.update_topics` function: ```python topic_model.update_topics(docs, topics=new_topics) ``` As seen above, you will only need to pass the documents on which the model was trained including the new topics that were generated using one of the above four strategies. ### **Exploration** When you are reducing the number of outliers, it might be worthwhile to iteratively visualize the results in order to get an intuitive understanding of the effect of the above four strategies. Making use of `.visualize_documents`, we can quickly iterate over the different strategies and view their effects. Here, an example will be shown on how to approach such a pipeline.
First, we train our model: ```python from umap import UMAP from bertopic import BERTopic from sklearn.datasets import fetch_20newsgroups from sentence_transformers import SentenceTransformer from sklearn.feature_extraction.text import CountVectorizer # Prepare data, extract embeddings, and prepare sub-models docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))['data'] umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42) vectorizer_model = CountVectorizer(stop_words="english") sentence_model = SentenceTransformer("all-MiniLM-L6-v2") embeddings = sentence_model.encode(docs, show_progress_bar=True) # We reduce our embeddings to 2D as it will allows us to quickly iterate later on reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings) # Train our topic model topic_model = BERTopic(embedding_model=sentence_model, umap_model=umap_model, vectorizer_model=vectorizer_model, calculate_probabilities=True, nr_topics=40) topics, probs = topic_model.fit_transform(docs, embeddings) ``` After having trained our model, let us take a look at the 2D representation of the generated topics: ```python topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings, hide_document_hover=True, hide_annotations=True) ``` Next, we reduce the number of outliers using the `probabilities` strategy: ```python new_topics = reduce_outliers(topic_model, docs, topics, probabilities=probs, threshold=0.05, strategy="probabilities") topic_model.update_topics(docs, topics=new_topics) ``` And finally, we visualize the results: ```python topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings, hide_document_hover=True, hide_annotations=True) ``` ================================================ FILE: docs/getting_started/parameter tuning/parametertuning.md ================================================ # Hyperparameter Tuning Although BERTopic 
works quite well out of the box, there are a number of hyperparameters to tune according to your use case. This section will focus on important parameters directly accessible in BERTopic but also hyperparameter optimization in sub-models such as HDBSCAN and UMAP. ## **BERTopic** When instantiating BERTopic, there are several hyperparameters that you can directly adjust that could significantly improve the performance of your topic model. In this section, we will go through the most impactful parameters in BERTopic and directions on how to optimize them. ### **language** The `language` parameter is used to simplify the selection of models for those who are not familiar with sentence-transformers models. In essence, there are two options to choose from: * `language = "english"` or * `language = "multilingual"` The English model is "all-MiniLM-L6-v2" and can be found [here](https://www.sbert.net/docs/pretrained_models.html). It is the default model that is used in BERTopic and works great for English documents. The multilingual model is "paraphrase-multilingual-MiniLM-L12-v2" and supports 50+ languages, which can be found [here](https://www.sbert.net/docs/pretrained_models.html). The model is very similar to the base model but is trained on many languages and has a slightly different architecture. ### **top_n_words** `top_n_words` refers to the number of words per topic that you want to be extracted. In practice, I would advise you to keep this value below 30 and preferably between 10 and 20. The reasoning for this is that the more words you put in a topic the less coherent it can become. The top words are the most representative of the topic and should be focused on. ### **n_gram_range** The `n_gram_range` parameter refers to the CountVectorizer used when creating the topic representation. It relates to the number of words you want in your topic representation. 
For example, "New" and "York" are two separate words but are often used as "New York" which represents an n-gram of 2. Thus, the `n_gram_range` should be set to (1, 2) if you want "New York" in your topic representation. ### **min_topic_size** `min_topic_size` is an important parameter! It is used to specify what the minimum size of a topic can be. The lower this value the more topics are created. If you set this value too high, then it is possible that simply no topics will be created! Set this value too low and you will get many microclusters. It is advised to play around with this value depending on the size of your dataset. If it nears a million documents, then it is advised to set it much higher than the default of 10, for example, 100 or even 500. ### **nr_topics** `nr_topics` can be a tricky parameter. It specifies the number of topics that the topic model is reduced to after training. For example, if your topic model results in 100 topics but you have set `nr_topics` to 20 then the topic model will try to reduce the number of topics from 100 to 20. This reduction can take a while as each reduction in topics activates a c-TF-IDF calculation. If this is set to None, no reduction is applied. Use "auto" to automatically reduce topics using HDBSCAN. ### **low_memory** `low_memory` sets UMAP's `low_memory` to True to make sure that less memory is used in the computation. This slows down computation but allows UMAP to be run on low-memory machines. ### **calculate_probabilities** `calculate_probabilities` lets you calculate the probabilities of each topic in each document. This is computationally quite expensive and is turned off by default. ## **UMAP** UMAP is an amazing technique for dimensionality reduction. In BERTopic, it is used to reduce the dimensionality of document embeddings into something easier to use with HDBSCAN to create good clusters. However, it does have a significant number of parameters you could take into account. 
As exposing all parameters in BERTopic would be difficult to manage, we can instantiate our UMAP model and pass it to BERTopic: ```python from umap import UMAP umap_model = UMAP(n_neighbors=15, n_components=10, metric='cosine', low_memory=False) topic_model = BERTopic(umap_model=umap_model).fit(docs) ``` ### **n_neighbors** `n_neighbors` is the number of neighboring sample points used when making the manifold approximation. Increasing this value typically results in a more global view of the embedding structure whilst smaller values result in a more local view. Increasing this value often results in larger clusters being created. ### **n_components** `n_components` refers to the dimensionality of the embeddings after reducing them. This is set as a default to `5` to reduce dimensionality as much as possible whilst trying to maximize the information kept in the resulting embeddings. Although lowering or increasing this value influences the quality of embeddings, its effect is largest on the performance of HDBSCAN. Increase this value too much and HDBSCAN will have a hard time clustering the high-dimensional embeddings. Lower this value too much and too little information is available in the resulting embeddings to create proper clusters. If you want to increase this value, I would advise using a metric for HDBSCAN that works well with high-dimensional data. ### **metric** `metric` refers to the method used to compute the distances in high dimensional space. The default is `cosine` as we are dealing with high dimensional data. However, BERTopic is also able to use any input, even regular tabular data, to cluster the documents. Thus, you might want to change the metric to something that fits your use case. ### **low_memory** `low_memory` is used when datasets may consume a lot of memory. Using millions of documents can lead to memory issues and setting this value to `True` might alleviate some of the issues. 
## **HDBSCAN** After reducing the embeddings with UMAP, we use HDBSCAN to cluster our documents into clusters of similar documents. Similar to UMAP, HDBSCAN has many parameters that could be tweaked to improve the quality of the clusters. ```python from hdbscan import HDBSCAN hdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean', prediction_data=True) topic_model = BERTopic(hdbscan_model=hdbscan_model).fit(docs) ``` ### **min_cluster_size** `min_cluster_size` is arguably the most important parameter in HDBSCAN. It controls the minimum size of a cluster and thereby the number of clusters that will be generated. It is set to `10` as a default. Increasing this value results in fewer clusters but of larger size whereas decreasing this value results in more micro clusters being generated. Typically, I would advise increasing this value rather than decreasing it. ### **min_samples** `min_samples` is automatically set to `min_cluster_size` and controls the number of outliers generated. Setting this value significantly lower than `min_cluster_size` might help you reduce the amount of noise you will get. Do note that outliers are to be expected and forcing the output to have no outliers may not properly represent the data. ### **metric** `metric`, like with UMAP, is used to calculate the distances. Here, we went with `euclidean` as, after reducing the dimensionality, we have low dimensional data and not much optimization is necessary. However, if you increase `n_components` in UMAP, then it would be advised to look into metrics that work with high dimensional data. ### **prediction_data** Make sure you always set this value to `True` as it is needed to predict new points later on. You can set this to False if you do not wish to predict any unseen data points. 
================================================ FILE: docs/getting_started/quickstart/quickstart.md ================================================ ## **Installation** Installation, with sentence-transformers, can be done using [pypi](https://pypi.org/project/bertopic/): ```bash pip install bertopic ``` You may want to install more depending on the transformers and language backends that you will be using. The possible installations are: ```bash # Choose an embedding backend pip install bertopic[flair, gensim, spacy, use] # Topic modeling with images pip install bertopic[vision] ``` ## **Quick Start** We start by extracting topics from the well-known 20 newsgroups dataset which is comprised of English documents: ```python from bertopic import BERTopic from sklearn.datasets import fetch_20newsgroups docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))['data'] topic_model = BERTopic() topics, probs = topic_model.fit_transform(docs) ``` After generating topics, we can access the frequent topics that were generated: ```python >>> topic_model.get_topic_info() Topic Count Name -1 4630 -1_can_your_will_any 0 693 49_windows_drive_dos_file 1 466 32_jesus_bible_christian_faith 2 441 2_space_launch_orbit_lunar 3 381 22_key_encryption_keys_encrypted ``` -1 refers to all outliers and should typically be ignored. 
Next, let's take a look at the most frequent topic that was generated, topic 0: ```python >>> topic_model.get_topic(0) [('windows', 0.006152228076250982), ('drive', 0.004982897610645755), ('dos', 0.004845038866360651), ('file', 0.004140142872194834), ('disk', 0.004131678774810884), ('mac', 0.003624848635985097), ('memory', 0.0034840976976789903), ('software', 0.0034415334250699077), ('email', 0.0034239554442333257), ('pc', 0.003047105930670237)] ``` Using `.get_document_info`, we can also extract information on a document level, such as their corresponding topics, probabilities, whether they are representative documents for a topic, etc.: ```python >>> topic_model.get_document_info(docs) Document Topic Name Top_n_words Probability ... I am sure some bashers of Pens... 0 0_game_team_games_season game - team - games... 0.200010 ... My brother is in the market for... -1 -1_can_your_will_any can - your - will... 0.420668 ... Finally you said what you dream... -1 -1_can_your_will_any can - your - will... 0.807259 ... Think! It is the SCSI card doing... 49 49_windows_drive_dos_file windows - drive - docs... 0.071746 ... 1) I have an old Jasmine drive... 49 49_windows_drive_dos_file windows - drive - docs... 0.038983 ... ``` !!! Tip "Multilingual" Use `BERTopic(language="multilingual")` to select a model that supports 50+ languages. ## **Fine-tune Topic Representations** In BERTopic, there are a number of different [topic representations](https://maartengr.github.io/BERTopic/getting_started/representation/representation.html) that we can choose from. They are all quite different from one another and give interesting perspectives and variations of topic representations. 
A great start is `KeyBERTInspired`, which for many users increases the coherence and reduces stopwords from the resulting topic representations: ```python from bertopic.representation import KeyBERTInspired # Fine-tune your topic representations representation_model = KeyBERTInspired() topic_model = BERTopic(representation_model=representation_model) ``` However, you might want to use something more powerful to describe your clusters. You can even use ChatGPT or other models from OpenAI to generate labels, summaries, phrases, keywords, and more: ```python import openai from bertopic.representation import OpenAI # Fine-tune topic representations with GPT client = openai.OpenAI(api_key="sk-...") representation_model = OpenAI(client, model="gpt-4o-mini", chat=True) topic_model = BERTopic(representation_model=representation_model) ``` !!! tip "Multi-aspect Topic Modeling" Instead of iterating over all of these different topic representations, you can model them simultaneously with [multi-aspect topic representations](https://maartengr.github.io/BERTopic/getting_started/multiaspect/multiaspect.html) in BERTopic. ## **Visualizations** After having trained our BERTopic model, we can iteratively go through hundreds of topics to get a good understanding of the topics that were extracted. However, that takes quite some time and lacks a global representation. Instead, we can use one of the [many visualization options](https://maartengr.github.io/BERTopic/getting_started/visualization/visualization.html) in BERTopic. For example, we can visualize the topics that were generated in a way very similar to [LDAvis](https://github.com/cpsievert/LDAvis): ```python topic_model.visualize_topics() ``` ## **Save/Load BERTopic model** There are three methods for saving BERTopic: 1. A light model with `.safetensors` and config files 2. A light model with pytorch `.bin` and config files 3. 
A full model with `.pickle` Method 3 allows for saving the entire topic model but has several drawbacks: * Arbitrary code can be run from `.pickle` files * The resulting model is rather large (often > 500MB) since all sub-models need to be saved * Explicit and specific version control is needed as they typically only run if the environment is exactly the same > **It is advised to use methods 1 or 2 for saving.** These methods have a number of advantages: * `.safetensors` is a relatively **safe format** * The resulting model can be **very small** (often < 20MB) since no sub-models need to be saved * Although version control is important, there is a bit more **flexibility** with respect to specific versions of packages * More easily used in **production** * **Share** models with the HuggingFace Hub !!! Tip "Tip" For more detail about how to load in a custom vectorizer, representation model, and more, it is highly advised to check out the [serialization](https://maartengr.github.io/BERTopic/getting_started/serialization/serialization.html) page. It contains more examples, details, and some tips and tricks for loading and saving your environment. The methods are used as follows: ```python topic_model = BERTopic().fit(my_docs) # Method 1 - safetensors embedding_model = "sentence-transformers/all-MiniLM-L6-v2" topic_model.save("path/to/my/model_dir", serialization="safetensors", save_ctfidf=True, save_embedding_model=embedding_model) # Method 2 - pytorch embedding_model = "sentence-transformers/all-MiniLM-L6-v2" topic_model.save("path/to/my/model_dir", serialization="pytorch", save_ctfidf=True, save_embedding_model=embedding_model) # Method 3 - pickle topic_model.save("my_model", serialization="pickle") ``` To load a model: ```python # Load from directory loaded_model = BERTopic.load("path/to/my/model_dir") # Load from file loaded_model = BERTopic.load("my_model") # Load from HuggingFace loaded_model = BERTopic.load("MaartenGr/BERTopic_Wikipedia") ``` !!! 
Warning "Warning" When saving the model, make sure to also keep track of the versions of dependencies and Python used. Loading and saving the model should be done using the same dependencies and Python. Moreover, models saved in one version of BERTopic should not be loaded in other versions. ================================================ FILE: docs/getting_started/quickstart/viz.html ================================================
================================================ FILE: docs/getting_started/representation/llm.md ================================================ As we have seen in the [previous section](https://maartengr.github.io/BERTopic/getting_started/representation/representation.html), the topics that you get from BERTopic can be fine-tuned using a number of approaches. Here, we are going to focus on text generation Large Language Models such as ChatGPT, GPT-4, and open-source solutions. Using these techniques, we can further fine-tune topics to generate labels, summaries, poems of topics, and more. To do so, we first generate a set of keywords and documents that describe a topic best using BERTopic's c-TF-IDF calculation. Then, these candidate keywords and documents are passed to the text generation model, which is asked to generate output that fits the topic best. A huge benefit of this is that we can describe a topic with only a few documents and we therefore do not need to pass all documents to the text generation model. Not only does this speed up the generation of topic labels significantly, but you also do not need a massive amount of credits when using an external API, such as Cohere or OpenAI. ## **Prompt Engineering** In most of the examples below, we use certain tags to customize our prompts. There are currently two tags, namely `"[KEYWORDS]"` and `"[DOCUMENTS]"`. These tags indicate where in the prompt they are to be replaced with a topic's keywords and top 4 most representative documents, respectively. For example, if we have the following prompt: ```python prompt = """ I have topic that contains the following documents: \n[DOCUMENTS] The topic is described by the following keywords: [KEYWORDS] Based on the above information, can you give a short label of the topic? """ ``` then that will be rendered as follows: ```python """ I have a topic that contains the following documents: - Our videos are also made possible by your support on patreon.co. 
- If you want to help us make more videos, you can do so on patreon.com or get one of our posters from our shop. - If you want to help us make more videos, you can do so there. - And if you want to support us in our endeavor to survive in the world of online video, and make more videos, you can do so on patreon.com. The topic is described by the following keywords: videos video you our support want this us channel patreon make on we if facebook to patreoncom can for and more watch Based on the above information, can you give a short label of the topic? """ ``` !!! tip "Tip 1" You can access the default prompts of these models with `representation_model.default_prompt_`. The prompts that were generated after training can be accessed with `topic_model.representation_model.prompts_`. ### **Selecting Documents** By default, four of the most representative documents will be passed to `[DOCUMENTS]`. These documents are selected by calculating their similarity (through c-TF-IDF representations) with the main c-TF-IDF representation of the topics. The four best matching documents per topic are selected. To increase the number of documents passed to `[DOCUMENTS]`, we can use the `nr_docs` parameter which is accessible in all LLMs on this page. Using this value allows you to select the top *n* most representative documents instead. If you have a long enough context length, then you could even give the LLM dozens of documents. However, some of these documents might be very similar to one another and might be near duplicates. They will not provide much additional information about the content of the topic. Instead, we can use the `diversity` parameter in each LLM to only select documents that are sufficiently diverse. It takes values between 0 and 1 but a value of 0.1 already does wonders! ### **Truncating Documents** We can truncate the input documents in `[DOCUMENTS]` in order to reduce the number of tokens that we have in our input prompt. 
To do so, all text generation modules have two parameters that we can tweak: * `doc_length` * The maximum length of each document. If a document is longer, it will be truncated. If None, the entire document is passed. * `tokenizer` * The tokenizer used to split the document into segments, which are then counted to determine the length of a document. * If tokenizer is `'char'`, then the document is split up into characters which are counted to adhere to `doc_length` * If tokenizer is `'whitespace'`, the document is split up into words separated by whitespaces. These words are counted and truncated depending on `doc_length` * If tokenizer is `'vectorizer'`, then the internal CountVectorizer is used to tokenize the document. These tokens are counted and truncated depending on `doc_length` * If tokenizer is a callable, then that callable is used to tokenize the document. These tokens are counted and truncated depending on `doc_length` This means that the definition of `doc_length` changes depending on what constitutes a token in the `tokenizer` parameter. If a token is a character, then `doc_length` refers to the max length in characters. If a token is a word, then `doc_length` refers to the max length in words. Let's illustrate this with an example. In the code below, we will use [`tiktoken`](https://github.com/openai/tiktoken) to count the number of tokens in each document and limit them to 100 tokens. All documents that have more than 100 tokens will be truncated. We start by installing the relevant packages: ```bash pip install tiktoken openai ``` Then, we use `bertopic.representation.OpenAI` to represent our topics with nicely written labels. We specify that documents that we put in the prompt cannot exceed 100 tokens each. 
Since we will put 4 documents in the prompt, they will total roughly 400 tokens: ```python import openai import tiktoken from bertopic.representation import OpenAI from bertopic import BERTopic # Tokenizer tokenizer= tiktoken.encoding_for_model("gpt-3.5-turbo") # Create your representation model client = openai.OpenAI(api_key="sk-...") representation_model = OpenAI( client, model="gpt-3.5-turbo", delay_in_seconds=2, chat=True, nr_docs=4, doc_length=100, tokenizer=tokenizer ) # Use the representation model in BERTopic on top of the default pipeline topic_model = BERTopic(representation_model=representation_model) ``` ## **🤗 Transformers** Nearly every week, there are new and improved models released on the 🤗 [Model Hub](https://huggingface.co/models) that, with some creativity, allow for further fine-tuning of our c-TF-IDF based topics. These models range from text generation to zero-shot classification. In BERTopic, wrappers around these methods are created as a way to support whatever might be released in the future. Using a GPT-like model from the huggingface hub is rather straightforward: ```python from bertopic.representation import TextGeneration from bertopic import BERTopic # Create your representation model representation_model = TextGeneration('gpt2') # Use the representation model in BERTopic on top of the default pipeline topic_model = BERTopic(representation_model=representation_model) ``` GPT2, however, is not the most accurate model available on HuggingFace. You can get much better results with a `flan-T5` like model: ```python from transformers import pipeline from bertopic.representation import TextGeneration prompt = "I have a topic described by the following keywords: [KEYWORDS]. Based on the previous keywords, what is this topic about?" # Create your representation model generator = pipeline('text2text-generation', model='google/flan-t5-base') representation_model = TextGeneration(generator) ```
--8<-- "docs/getting_started/representation/hf.svg"

As can be seen from the example above, if you would like to use a `text2text-generation` model, you will need to pass a `transformers.pipeline` with the `"text2text-generation"` parameter. Moreover, you can use a custom prompt and decide where the keywords and documents should be inserted by using the `[KEYWORDS]` and `[DOCUMENTS]` tags. ### **Mistral (GGUF)** We can go a step further with open-source Large Language Models (LLMs) that have been shown to match the performance of closed-source LLMs like ChatGPT. In this example, we will show you how to use Zephyr, a fine-tuned version of Mistral 7B. Mistral 7B outperforms other open-source LLMs at a much smaller scale and is a worthwhile solution for use cases such as topic modeling. We want to keep inference as fast as possible and a relatively small model helps with that. Zephyr is a fine-tuned version of Mistral 7B that was trained on a mix of publicly available and synthetic datasets using Direct Preference Optimization (DPO). To use Zephyr in BERTopic, we will first need to install and update a couple of packages that can handle quantized versions of Zephyr: ```bash pip install ctransformers[cuda] pip install --upgrade git+https://github.com/huggingface/transformers ``` Instead of loading in the full model, we can load a quantized model, which is a compressed version of the original model: ```python from ctransformers import AutoModelForCausalLM from transformers import AutoTokenizer, pipeline # Set gpu_layers to the number of layers to offload to GPU. Set to 0 if no GPU acceleration is available on your system. 
model = AutoModelForCausalLM.from_pretrained( "TheBloke/zephyr-7B-alpha-GGUF", model_file="zephyr-7b-alpha.Q4_K_M.gguf", model_type="mistral", gpu_layers=50, hf=True ) tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-alpha") # Pipeline generator = pipeline( model=model, tokenizer=tokenizer, task='text-generation', max_new_tokens=50, repetition_penalty=1.1 ) ``` This Zephyr model requires a specific prompt template in order to work: ```python prompt = """<|system|>You are a helpful, respectful and honest assistant for labeling topics.. <|user|> I have a topic that contains the following documents: [DOCUMENTS] The topic is described by the following keywords: '[KEYWORDS]'. Based on the information about the topic above, please create a short label of this topic. Make sure you to only return the label and nothing more. <|assistant|>""" ``` After creating this prompt template, we can create our representation model to be used in BERTopic: ```python from bertopic.representation import TextGeneration # Text generation with Zephyr zephyr = TextGeneration(generator, prompt=prompt) representation_model = {"Zephyr": zephyr} # Topic Modeling topic_model = BERTopic(representation_model=representation_model, verbose=True) ``` ### **Llama (Manual Quantization)** Full Llama Tutorial: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1QCERSMUjqGetGGujdrvv_6_EeoIcd_9M?usp=sharing) Open-source LLMs are starting to become more and more popular. Here, we will go through a minimal example of using [Llama 2](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf) together with BERTopic. !!! Note Although this is an example of the older Llama 2 model, you can use the code below for any Llama variant. 
First, we need to load in our Llama model: ```python from torch import bfloat16 import transformers # set quantization configuration to load large model with less GPU memory # this requires the `bitsandbytes` library bnb_config = transformers.BitsAndBytesConfig( load_in_4bit=True, # 4-bit quantization bnb_4bit_quant_type='nf4', # Normalized float 4 bnb_4bit_use_double_quant=True, # Second quantization after the first bnb_4bit_compute_dtype=bfloat16 # Computation type ) # Llama Tokenizer tokenizer = transformers.AutoTokenizer.from_pretrained(model_id) # Llama Model model = transformers.AutoModelForCausalLM.from_pretrained( model_id, trust_remote_code=True, quantization_config=bnb_config, device_map='auto', ) model.eval() # Our text generator generator = transformers.pipeline( model=model, tokenizer=tokenizer, task='text-generation', temperature=0.1, max_new_tokens=500, repetition_penalty=1.1 ) ``` After doing so, we will need to define a prompt that works with both Llama as well as BERTopic: ```python # System prompt describes information given to all conversations system_prompt = """ [INST] <<SYS>> You are a helpful, respectful and honest assistant for labeling topics. <</SYS>> """ # Example prompt demonstrating the output we are looking for example_prompt = """ I have a topic that contains the following documents: - Traditional diets in most cultures were primarily plant-based with a little meat on top, but with the rise of industrial style meat production and factory farming, meat has become a staple food. - Meat, but especially beef, is the word food in terms of emissions. - Eating meat doesn't make you a bad person, not eating meat doesn't make you a good one. The topic is described by the following keywords: 'meat, beef, eat, eating, emissions, steak, food, health, processed, chicken'. Based on the information about the topic above, please create a short label of this topic. Make sure you to only return the label and nothing more. 
[/INST] Environmental impacts of eating meat """ # Our main prompt with documents ([DOCUMENTS]) and keywords ([KEYWORDS]) tags main_prompt = """ [INST] I have a topic that contains the following documents: [DOCUMENTS] The topic is described by the following keywords: '[KEYWORDS]'. Based on the information about the topic above, please create a short label of this topic. Make sure you to only return the label and nothing more. [/INST] """ prompt = system_prompt + example_prompt + main_prompt ``` Three pieces of the prompt were created: * `system_prompt` helps us guide the model during a conversation. For example, we can say that it is a helpful assistant that is specialized in labeling topics. * `example_prompt` gives an example of a correctly labeled topic to guide Llama * `main_prompt` contains the main question we are going to ask it, namely to label a topic. Note that it uses the `[DOCUMENTS]` and `[KEYWORDS]` to provide the most relevant documents and keywords as additional context After having generated our prompt template, we can start running our topic model: ```python from bertopic.representation import TextGeneration from bertopic import BERTopic # Text generation with Llama llama2 = TextGeneration(generator, prompt=prompt) representation_model = { "Llama2": llama2, } # Create our BERTopic model topic_model = BERTopic(representation_model=representation_model, verbose=True) ``` ## **llama.cpp** An amazing framework for using LLMs for inference is [`llama.cpp`](https://github.com/ggerganov/llama.cpp) which has [python bindings](https://github.com/abetlen/llama-cpp-python) that we can use in BERTopic. To start with, we first need to install `llama-cpp-python`: ```bash pip install llama-cpp-python ``` or using the following for hardware acceleration: ```bash CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python ``` !!! 
Note There are a number of [installation options](https://github.com/abetlen/llama-cpp-python#installation-with-hardware-acceleration) depending on your hardware and OS. Make sure that you select the correct one to optimize your performance. After installation, you need to download your LLM locally before we use it in BERTopic, like so: ```bash wget https://huggingface.co/TheBloke/zephyr-7B-alpha-GGUF/resolve/main/zephyr-7b-alpha.Q4_K_M.gguf ``` Finally, we can now use the model with BERTopic in just a couple of lines: ```python from bertopic import BERTopic from bertopic.representation import LlamaCPP # Use llama.cpp to load in a 4-bit quantized version of Zephyr 7B Alpha representation_model = LlamaCPP("zephyr-7b-alpha.Q4_K_M.gguf") # Create our BERTopic model topic_model = BERTopic(representation_model=representation_model, verbose=True) ``` If you want to have more control over the LLMs parameters, you can run it like so: ```python from bertopic import BERTopic from bertopic.representation import LlamaCPP from llama_cpp import Llama # Use llama.cpp to load in a 4-bit quantized version of Zephyr 7B Alpha llm = Llama(model_path="zephyr-7b-alpha.Q4_K_M.gguf", n_gpu_layers=-1, n_ctx=4096, stop="Q:") representation_model = LlamaCPP(llm) # Create our BERTopic model topic_model = BERTopic(representation_model=representation_model, verbose=True) ``` !!! Note The default template that is being used uses a "Q: ... A: ... " type of structure which is why the `stop` is set at `"Q:"`. The default template is: ```python """ Q: I have a topic that contains the following documents: [DOCUMENTS] The topic is described by the following keywords: '[KEYWORDS]'. Based on the above information, can you give a short label of the topic? A: """ ``` ## **OpenAI** Instead of using a language model from 🤗 transformers, we can use external APIs instead that do the work for you. 
Here, we can use [OpenAI](https://openai.com/api/) to extract our topic labels from the candidate documents and keywords. To use this, you will need to install openai first: ```bash pip install openai ``` Then, get yourself an API key and use OpenAI's API as follows: ```python import openai from bertopic.representation import OpenAI from bertopic import BERTopic # Create your representation model client = openai.OpenAI(api_key="sk-...") representation_model = OpenAI(client) # Use the representation model in BERTopic on top of the default pipeline topic_model = BERTopic(representation_model=representation_model) ```
--8<-- "docs/getting_started/representation/openai.svg"

You can also use a custom prompt: ```python prompt = "I have the following documents: [DOCUMENTS] \nThese documents are about the following topic: '" representation_model = OpenAI(client, prompt=prompt) ``` ### **GPT-4o** To choose a specific model from OpenAI's offering: ```python representation_model = OpenAI(client, model="gpt-4o-mini", delay_in_seconds=10) ``` Prompting with their models is very satisfying and is customizable as follows: ```python prompt = """ I have a topic that contains the following documents: [DOCUMENTS] The topic is described by the following keywords: [KEYWORDS] Based on the information above, extract a short topic label in the following format: topic: """ ``` !!! note Whenever you create a custom prompt, it is important to add ``` Based on the information above, extract a short topic label in the following format: topic: ``` at the end of your prompt as BERTopic extracts everything that comes after `topic: `. Having said that, if `topic: ` is not in the output, then it will simply extract the entire response, so feel free to experiment with the prompts. ### **Summarization** Due to the structure of the prompts in OpenAI's chat models, we can extract different types of topic representations from their GPT models. 
Rather than extracting a topic label, we can ask it to extract a short description of the topic instead: ```python summarization_prompt = """ I have a topic that is described by the following keywords: [KEYWORDS] In this topic, the following documents are a small but representative subset of all documents in the topic: [DOCUMENTS] Based on the information above, please give a description of this topic in the following format: topic: """ representation_model = OpenAI(client, model="gpt-4o-mini", prompt=summarization_prompt, nr_docs=5, delay_in_seconds=3) ``` The above is not constrained to just creating a short description or summary of the topic, we can extract labels, keywords, poems, example documents, extensive descriptions, and more using this method! If you want to have multiple representations of a single topic, it might be worthwhile to also check out [**multi-aspect**](https://maartengr.github.io/BERTopic/getting_started/multiaspect/multiaspect.html) topic modeling with BERTopic. ## **Ollama** To use [Ollama](https://github.com/ollama/ollama) within BERTopic, it is advised to use the `openai` package as it allows you to pass through a model using the url on which the model is running. You will first need to install `openai`: ```bash pip install openai ``` After installation, usage is straightforward and you can select any model that you have prepared in your `ollama` model list. You can see all models by running `ollama list`. 
Select one from the list and you can use it in BERTopic as follows: ```python import openai from bertopic.representation import OpenAI from bertopic import BERTopic client = openai.OpenAI( base_url = 'http://localhost:11434/v1', #wherever ollama is running api_key='ollama', # required, but unused ) # Create your representation model representation_model = OpenAI(client, model='phi3:14b-medium-128k-instruct-q4_K_M') # Create your BERTopic model topic_model = BERTopic(representation_model=representation_model, verbose=True) ``` ## **LiteLLM** An amazing framework to simplify connecting to external LLMs is [LiteLLM](https://docs.litellm.ai). This package allows you to connect to OpenAI, Cohere, Anthropic, etc. all within one package. This makes iteration and testing out different models a breeze! To start with, we first need to install `litellm`: ```bash pip install litellm ``` After installation, usage is straightforward and you can select any model found in their [docs](https://docs.litellm.ai/docs/providers). Let's show an example with OpenAI: ```python import os from bertopic import BERTopic from bertopic.representation import LiteLLM # set ENV variables os.environ["OPENAI_API_KEY"] = "MY_KEY" # Create your representation model representation_model = LiteLLM(model="gpt-4o-mini") # Create our BERTopic model topic_model = BERTopic(representation_model=representation_model, verbose=True) ``` ## **LangChain** [Langchain](https://github.com/hwchase17/langchain) is a package that helps users with chaining large language models. In BERTopic, we can leverage this package in order to more efficiently combine external knowledge. Here, this external knowledge consists of the most representative documents in each topic. To use langchain, you will need to install the langchain package first. 
Additionally, you will need an underlying LLM to support langchain, like openai: ```bash pip install langchain openai ``` Then, you can create your chain as follows: ```python from langchain.chains.question_answering import load_qa_chain from langchain.llms import OpenAI chain = load_qa_chain(OpenAI(temperature=0, openai_api_key=my_openai_api_key), chain_type="stuff") ``` Finally, you can pass the chain to BERTopic as follows: ```python from bertopic.representation import LangChain # Create your representation model representation_model = LangChain(chain) # Use the representation model in BERTopic on top of the default pipeline topic_model = BERTopic(representation_model=representation_model) ``` You can also use a custom prompt: ```python prompt = "What are these documents about? Please give a single label." representation_model = LangChain(chain, prompt=prompt) ``` !!! note Note The prompt does not make use of `[KEYWORDS]` and `[DOCUMENTS]` tags as the documents are already used within langchain's `load_qa_chain`. ## **Cohere** Instead of using a language model from 🤗 transformers, we can use external APIs instead that do the work for you. Here, we can use [Cohere](https://docs.cohere.ai/) to extract our topic labels from the candidate documents and keywords. To use this, you will need to install cohere first: ```bash pip install cohere ``` Then, get yourself an API key and use Cohere's API as follows: ```python import cohere from bertopic.representation import Cohere from bertopic import BERTopic # Create your representation model co = cohere.Client(my_api_key) representation_model = Cohere(co) # Use the representation model in BERTopic on top of the default pipeline topic_model = BERTopic(representation_model=representation_model) ```
--8<-- "docs/getting_started/representation/cohere.svg"

You can also use a custom prompt: ```python prompt = """ I have a topic that contains the following documents: [DOCUMENTS] The topic is described by the following keywords: [KEYWORDS]. Based on the above information, can you give a short label of the topic? """ representation_model = Cohere(co, prompt=prompt) ``` ================================================ FILE: docs/getting_started/representation/representation.md ================================================ One of the core components of BERTopic is its Bag-of-Words representation and weighting with c-TF-IDF. This method is fast and can quickly generate a number of keywords for a topic without depending on the clustering task. As a result, topics can easily and quickly be updated after training the model without the need to re-train it. Although these give good topic representations, we may want to further fine-tune the topic representations. As such, there are a number of representation models implemented in BERTopic that allow for further fine-tuning of the topic representations. These are optional and are **not used by default**. You are not restricted in how the representation can be fine-tuned, from GPT-like models to fast keyword extraction with KeyBERT-like models: For each model below, an example will be shown on how it may change or improve upon the default topic keywords that are generated. The dataset used in these examples can be found [here](https://www.kaggle.com/datasets/maartengr/kurzgesagt-transcriptions). If you want to have multiple representations of a single topic, it might be worthwhile to also check out [**multi-aspect**](https://maartengr.github.io/BERTopic/getting_started/multiaspect/multiaspect.html) topic modeling with BERTopic. ## **KeyBERTInspired** After having generated our topics with c-TF-IDF, we might want to do some fine-tuning based on the semantic relationship between keywords/keyphrases and the set of documents in each topic. 
Although we can use a centroid-based technique for this, it can be costly and does not take the structure of a cluster into account. Instead, we leverage c-TF-IDF to create a set of representative documents per topic and use those as our updated topic embedding. Then, we calculate the similarity between candidate keywords and the topic embedding using the same embedding model that embedded the documents.
--8<-- "docs/getting_started/representation/keybertinspired.svg"

Thus, the algorithm follows some principles of [KeyBERT](https://github.com/MaartenGr/KeyBERT) but does some optimization in order to speed up inference. Usage is straightforward: ```python from bertopic.representation import KeyBERTInspired from bertopic import BERTopic # Create your representation model representation_model = KeyBERTInspired() # Use the representation model in BERTopic on top of the default pipeline topic_model = BERTopic(representation_model=representation_model) ```
--8<-- "docs/getting_started/representation/keybert.svg"

## **PartOfSpeech** Our candidate topics, as extracted with c-TF-IDF, do not take into account a keyword's part of speech as extracting noun-phrases from all documents can be computationally quite expensive. Instead, we can leverage c-TF-IDF to perform part-of-speech tagging on a subset of keywords and documents that best represent a topic.
--8<-- "docs/getting_started/representation/partofspeech.svg"

More specifically, we find documents that contain the keywords from our candidate topics as calculated with c-TF-IDF. These documents serve as the representative set of documents from which the Spacy model can extract a set of candidate keywords for each topic. These candidate keywords are first put through Spacy's POS module to see whether they match with the `DEFAULT_PATTERNS`: ```python DEFAULT_PATTERNS = [ [{'POS': 'ADJ'}, {'POS': 'NOUN'}], [{'POS': 'NOUN'}], [{'POS': 'ADJ'}] ] ``` These patterns follow Spacy's [Rule-Based Matching](https://spacy.io/usage/rule-based-matching). Then, the resulting keywords are sorted by their respective c-TF-IDF values. ```python from bertopic.representation import PartOfSpeech from bertopic import BERTopic # Create your representation model representation_model = PartOfSpeech("en_core_web_sm") # Use the representation model in BERTopic on top of the default pipeline topic_model = BERTopic(representation_model=representation_model) ```
--8<-- "docs/getting_started/representation/pos.svg"

You can define custom POS patterns to be extracted: ```python pos_patterns = [ [{'POS': 'ADJ'}, {'POS': 'NOUN'}], [{'POS': 'NOUN'}], [{'POS': 'ADJ'}] ] representation_model = PartOfSpeech("en_core_web_sm", pos_patterns=pos_patterns) ``` ## **MaximalMarginalRelevance** When we calculate the weights of keywords, we typically do not consider whether we already have similar keywords in our topic. Words like "car" and "cars" essentially represent the same information and are often redundant.
--8<-- "docs/getting_started/representation/mmr.svg"

To decrease this redundancy and improve the diversity of keywords, we can use an algorithm called Maximal Marginal Relevance (MMR). MMR considers the similarity of keywords/keyphrases with the document, along with the similarity of already selected keywords and keyphrases. This results in a selection of keywords that are diverse among themselves while remaining relevant to the document. ```python from bertopic.representation import MaximalMarginalRelevance from bertopic import BERTopic # Create your representation model representation_model = MaximalMarginalRelevance(diversity=0.3) # Use the representation model in BERTopic on top of the default pipeline topic_model = BERTopic(representation_model=representation_model) ```
--8<-- "docs/getting_started/representation/mmr_output.svg"

## **Zero-Shot Classification** For some use cases, you might already have a set of candidate labels that you would like to automatically assign to some of the topics. Although we can use guided or supervised BERTopic for that, we can also use zero-shot classification to assign labels to our topics. For that, we can make use of 🤗 transformers on their models on the [model hub](https://huggingface.co/models?pipeline_tag=zero-shot-classification&sort=downloads). To perform this classification, we feed the model with the keywords as generated through c-TF-IDF and a set of candidate labels. If, for a certain topic, we find a similar enough label, then it is assigned. If not, then we keep the original c-TF-IDF keywords. We use it in BERTopic as follows: ```python from bertopic.representation import ZeroShotClassification from bertopic import BERTopic # Create your representation model candidate_topics = ["space and nasa", "bicycles", "sports"] representation_model = ZeroShotClassification(candidate_topics, model="facebook/bart-large-mnli") # Use the representation model in BERTopic on top of the default pipeline topic_model = BERTopic(representation_model=representation_model) ```
--8<-- "docs/getting_started/representation/zero.svg"

## **Chain Models** All of the above models can make use of the candidate topics, as generated by c-TF-IDF, to further fine-tune the topic representations. For example, `MaximalMarginalRelevance` takes the keywords in the candidate topics and re-ranks them. Similarly, the keywords in the candidate topic can be used as the input for GPT-prompts in `OpenAI`. Although the default candidate topics are generated by c-TF-IDF, what if we were to chain these models? For example, we can use `MaximalMarginalRelevance` to improve upon the keywords in each topic before passing them to `OpenAI`. This is supported in BERTopic by simply passing a list of representation models when instantiating the topic model: ```python from bertopic.representation import MaximalMarginalRelevance, OpenAI from bertopic import BERTopic import openai # Create your representation models client = openai.OpenAI(api_key="sk-...") openai_generator = OpenAI(client) mmr = MaximalMarginalRelevance(diversity=0.3) representation_models = [mmr, openai_generator] # Use the chained models topic_model = BERTopic(representation_model=representation_models) ``` ## **Custom Model** Although several representation models have been implemented in BERTopic, new technologies get released often and we should not have to wait until they get implemented in BERTopic. Therefore, you can create your own representation model and use that to fine-tune the topics. The following is the basic structure for creating your custom model. 
Note that it returns the same topics as those calculated with c-TF-IDF: ```python from typing import List, Mapping, Tuple from bertopic.representation._base import BaseRepresentation class CustomRepresentationModel(BaseRepresentation): def extract_topics(self, topic_model, documents, c_tf_idf, topics ) -> Mapping[str, List[Tuple[str, float]]]: """ Extract topics Arguments: topic_model: The BERTopic model documents: A dataframe of documents with their related topics c_tf_idf: The c-TF-IDF matrix topics: The candidate topics as calculated with c-TF-IDF Returns: updated_topics: Updated topic representations """ updated_topics = topics.copy() return updated_topics ``` Then, we can use that model as follows: ```python from bertopic import BERTopic # Create our custom representation model representation_model = CustomRepresentationModel() # Pass our custom representation model to BERTopic topic_model = BERTopic(representation_model=representation_model) ``` There are a few things to take into account when creating your custom model: * It needs to have the exact same parameter input: `topic_model`, `documents`, `c_tf_idf`, `topics`. * Make sure that `updated_topics` has the exact same structure as `topics`: ```python updated_topics = { "1": [("space", 0.9), ("nasa", 0.7)], "2": [("science", 0.66), ("article", 0.6)] } ``` !!! Tip You can change the `__init__` however you want, it does not influence the underlying structure. This also means that you can save data/embeddings/representations/sentiment in your custom representation model. ================================================ FILE: docs/getting_started/search/search.md ================================================ After having created a BERTopic model, you might end up with over a hundred topics. Searching through those can be quite cumbersome especially if you are searching for a specific topic. Fortunately, BERTopic allows you to search for topics using search terms. 
First, let's create and train a BERTopic model: ```python from bertopic import BERTopic from sklearn.datasets import fetch_20newsgroups # Create topics docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))['data'] topic_model = BERTopic() topics, probs = topic_model.fit_transform(docs) ``` After having trained our model, we can use `find_topics` to search for topics that are similar to an input search_term. Here, we are going to be searching for topics that closely relate to the search term "motor". Then, we extract the most similar topic and check the results: ```python >>> similar_topics, similarity = topic_model.find_topics("motor", top_n=5) >>> topic_model.get_topic(similar_topics[0]) [('bike', 0.02275997701645559), ('motorcycle', 0.011391202866080292), ('bikes', 0.00981187573649205), ('dod', 0.009614623748226669), ('honda', 0.008247663662558535), ('ride', 0.0064683227888861945), ('harley', 0.006355502638631013), ('riding', 0.005766601561614182), ('motorcycles', 0.005596372493714447), ('advice', 0.005534544418830091)] ``` It definitely seems that a topic was found that closely matches "motor". The topic seems to be motorcycle related and therefore matches our "motor" input. You can use the `similarity` variable to see how similar the extracted topics are to the search term. !!! note You can only use this method if an embedding model was supplied to BERTopic using `embedding_model`. ================================================ FILE: docs/getting_started/seed_words/seed_words.md ================================================ When performing Topic Modeling, you are often faced with data that you are familiar with to a certain extent or that speaks a very specific language. In those cases, topic modeling techniques might have difficulties capturing and representing the semantic nature of domain specific abbreviations, slang, short form, acronyms, etc. 
For example, the *"TNM"* classification is a method for identifying the stage of most cancers. The word *"TNM"* is an abbreviation and might not be correctly captured in generic embedding models. To make sure that certain domain specific words are weighted higher and are more often used in topic representations, you can set any number of `seed_words` in the `bertopic.vectorizers.ClassTfidfTransformer`. The `ClassTfidfTransformer` is the base representation of BERTopic and essentially represents each topic as a bag of words. As such, we can choose to increase the importance of certain words, such as *"TNM"*. To do so, let's take a look at an example. We have a dataset of article abstracts and want to perform some topic modeling. Since we might be familiar with the data, there are certain words that we know should be generally important. Let's assume that we have in-depth knowledge about reinforcement learning and know that words like "agent" and "robot" should be important in such a topic were it to be found. Using the `ClassTfidfTransformer`, we can define those `seed_words` and also choose by how much their values are multiplied. The full example is then as follows: ```python from umap import UMAP from datasets import load_dataset from bertopic import BERTopic from bertopic.vectorizers import ClassTfidfTransformer # Let's take a subset of ArXiv abstracts as the training data dataset = load_dataset("CShorten/ML-ArXiv-Papers")["train"] abstracts = dataset["abstract"][:5_000] # For illustration purposes, we make sure the output is fixed when running this code multiple times umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42) # We can choose any number of seed words for which we want their representation # to be strengthened. We increase the importance of these words as we want them to be more # likely to end up in the topic representations. 
ctfidf_model = ClassTfidfTransformer( seed_words=["agent", "robot", "behavior", "policies", "environment"], seed_multiplier=2 ) # We run the topic model with the seeded words topic_model = BERTopic( umap_model=umap_model, min_topic_size=15, ctfidf_model=ctfidf_model, ).fit(abstracts) ``` Then, when we run `topic_model.get_topic(0)`, we get the following output: ```python [('policy', 0.023413102511982354), ('reinforcement', 0.021796126795834238), ('agent', 0.021131601305431902), ('policies', 0.01888385271486409), ('environment', 0.017819874593917057), ('learning', 0.015321710504308708), ('robot', 0.013881115279230468), ('control', 0.013297705894983875), ('the', 0.013247933839985382), ('to', 0.013058208312484141)] ``` As we can see, the output includes some of the seed words that we assigned. However, if a word is not found to be important in a topic then we can still multiply its importance but it will remain relatively low. This is a great feature as it allows you to improve their importance with less risk of making words important in topics that really should not be. A benefit of this method is that this often influences all other representation methods, like KeyBERTInspired and OpenAI. The reason for this is that each representation model uses the words generated by the `ClassTfidfTransformer` as candidate words to be further optimized. In many cases, words like *"TNM"* might not end up in the candidate words. By increasing their importance, they are more likely to end up as candidate words in representation models. Another benefit of using this method is that it artificially increases the interpretability of topics. Sure, some words might be more important than others but they might not mean anything to a domain expert. For them, certain words, like *"TNM"* are highly descriptive and that is something difficult to capture using any method (embedding model, large language model, etc.). 
Moreover, these `seed_words` can be defined together with the domain expert as they can decide what type of words are generally important and might need a nudge from you the algorithmic developer. ================================================ FILE: docs/getting_started/semisupervised/semisupervised.md ================================================ In BERTopic, you have several options to nudge the creation of topics toward certain pre-specified topics. Here, we will be looking at semi-supervised topic modeling with BERTopic. Semi-supervised modeling allows us to steer the dimensionality reduction of the embeddings into a space that closely follows any labels you might already have.
--8<-- "docs/getting_started/semisupervised/semisupervised.svg"

In other words, we use a semi-supervised UMAP instance to reduce the dimensionality of embeddings before clustering the documents with HDBSCAN. First, let us prepare the data needed for our topic model: ```python from bertopic import BERTopic from sklearn.datasets import fetch_20newsgroups data = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes')) docs = data["data"] categories = data["target"] category_names = data["target_names"] ``` We are using the popular 20 Newsgroups dataset which contains roughly 18000 newsgroup posts, each of which is assigned to one of 20 categories. Using this dataset we can try to extract its corresponding topic model whilst taking its underlying categories into account. These categories are stored here in the variable `categories`. Each document can be put into one of the following categories: ```python >>> category_names ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc'] ``` To perform this semi-supervised approach, we can take in some pre-defined topics and simply pass those to the `y` parameter when fitting BERTopic. These labels can be pre-defined topics or simply documents that you feel belong together regardless of their content. BERTopic will nudge the creation of topics toward these categories using the pre-defined labels. To perform supervised topic modeling, we simply use all categories: ```python topic_model = BERTopic(verbose=True).fit(docs, y=categories) ``` The topic model will be much more attuned to the categories that were defined previously. However, this does not mean that only topics for these categories will be found. 
BERTopic is likely to find more specific topics in those you have already defined. This allows you to discover previously unknown topics! ## **Partial labels** At times, you might only have labels for a subset of documents. Fortunately, we can still use those labels to at least nudge the documents for which those labels exist. The documents for which we do not have labels are assigned a -1. For this example, imagine we only have the labels of categories that are related to computers and we want to create a topic model using semi-supervised modeling: ```python labels_to_add = ['comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x',] indices = [category_names.index(label) for label in labels_to_add] y = [label if label in indices else -1 for label in categories] ``` The `y` variable contains many -1 values since we do not know all the categories. Next, we use those newly constructed labels to again run BERTopic in a semi-supervised manner: ```python topic_model = BERTopic(verbose=True).fit(docs, y=y) ``` And that is it! By defining certain classes for our documents, we can steer the topic modeling towards modeling the pre-defined categories. ================================================ FILE: docs/getting_started/serialization/serialization.md ================================================ Saving, loading, and sharing a BERTopic model can be done in several ways. It is generally advised to go with `.safetensors` as that allows for a small, safe, and fast method for saving your BERTopic model. However, other formats, such as `.pickle` and pytorch `.bin` are also possible. ## **Saving** There are three methods for saving BERTopic: 1. A light model with `.safetensors` and config files 2. A light model with pytorch `.bin` and config files 3. A full model with `.pickle` !!! Tip "Tip" It is advised to use methods 1 or 2 for saving as they generate very small models. 
Especially method 1 (`safetensors`) allows for a relatively safe format compared to the other methods. The methods are used as follows: ```python topic_model = BERTopic().fit(my_docs) # Method 1 - safetensors embedding_model = "sentence-transformers/all-MiniLM-L6-v2" topic_model.save("path/to/my/model_dir", serialization="safetensors", save_ctfidf=True, save_embedding_model=embedding_model) # Method 2 - pytorch embedding_model = "sentence-transformers/all-MiniLM-L6-v2" topic_model.save("path/to/my/model_dir", serialization="pytorch", save_ctfidf=True, save_embedding_model=embedding_model) # Method 3 - pickle topic_model.save("my_model", serialization="pickle") ``` !!! Warning "Warning" When saving the model, make sure to also keep track of the versions of dependencies and Python used. Loading and saving the model should be done using the same dependencies and Python. Moreover, models saved in one version of BERTopic are not guaranteed to load in other versions. ### **Pickle Drawbacks** Saving the model with `pickle` allows for saving the entire topic model, including dimensionality reduction and clustering algorithms, but has several drawbacks: * Arbitrary code can be run from `.pickle` files * The resulting model is rather large (often > 500MB) since all sub-models need to be saved * Explicit and specific version control is needed as they typically only run if the environment is exactly the same ### **Safetensors and Pytorch Advantages** Saving the topic modeling with `.safetensors` or `pytorch` has a number of advantages: * `.safetensors` is a relatively **safe format** * The resulting model can be **very small** (often < 20MB) since no sub-models need to be saved * Although version control is important, there is a bit more **flexibility** with respect to specific versions of packages * More easily used in **production** * **Share** models with the HuggingFace Hub



The above image, a model trained on 100,000 documents, demonstrates the differences in sizes comparing `safetensors`, `pytorch`, and `pickle`. The difference in sizes can mostly be explained due to the efficient saving procedure and that the clustering and dimensionality reductions are not saved in safetensors/pytorch since inference can be done based on the topic embeddings. ## **HuggingFace Hub** When you have created a BERTopic model, you can easily share it with others through the HuggingFace Hub. First, you need to log in to your HuggingFace account which you can do in a number of ways: * Log in to your Hugging Face account with the command below ```bash huggingface-cli login # or using an environment variable huggingface-cli login --token $HUGGINGFACE_TOKEN ``` * Alternatively, you can programmatically login using login() in a notebook or a script ```python from huggingface_hub import login login() ``` * Or you can give a token with the `token` variable When you have logged in to your HuggingFace account, you can save and upload the model as follows: ```python from bertopic import BERTopic # Train model topic_model = BERTopic().fit(my_docs) # Push to HuggingFace Hub topic_model.push_to_hf_hub( repo_id="MaartenGr/BERTopic_ArXiv", save_ctfidf=True ) # Load from HuggingFace loaded_model = BERTopic.load("MaartenGr/BERTopic_ArXiv") ``` ### **Parameters** There are a number of parameters that may be worthwhile to know: * `private` * Whether to create a private repository * `serialization` * The type of serialization. Either `safetensors` or `pytorch`. Make sure to run `pip install safetensors` for safetensors. * `save_embedding_model` * A pointer towards a HuggingFace model to be loaded in with SentenceTransformers. 
E.g., `sentence-transformers/all-MiniLM-L6-v2` * `save_ctfidf` * Whether to save c-TF-IDF information ## **Loading** To load a model: ```python # Load from directory loaded_model = BERTopic.load("path/to/my/model_dir") # Load from file loaded_model = BERTopic.load("my_model") # Load from HuggingFace loaded_model = BERTopic.load("MaartenGr/BERTopic_Wikipedia") ``` The embedding model cannot always be saved using a non-pickle method if, for example, you are using OpenAI embeddings. Instead, you can load them in as follows: ```python # Define embedding model import openai from bertopic.backend import OpenAIBackend client = openai.OpenAI(api_key="sk-...") embedding_model = OpenAIBackend(client, "text-embedding-ada-002") # Load model and add embedding model loaded_model = BERTopic.load("path/to/my/model_dir", embedding_model=embedding_model) ``` ================================================ FILE: docs/getting_started/supervised/supervised.md ================================================ Although topic modeling is typically done by discovering topics in an unsupervised manner, there might be times when you already have a bunch of clusters or classes from which you want to model the topics. For example, the often used [20 NewsGroups dataset](https://scikit-learn.org/0.19/datasets/twenty_newsgroups.html) is already split up into 20 classes. Similarly, you might already have created some labels yourself through packages like [human-learn](https://github.com/koaning/human-learn), [bulk](https://github.com/koaning/bulk), [thisnotthat](https://github.com/TutteInstitute/thisnotthat) or something entirely different. Instead of using BERTopic to discover previously unknown topics, we are now going to manually pass them to BERTopic and try to learn the relationship between those topics and the input documents. > In other words, we are going to be performing classification instead! We can view this as a supervised topic modeling approach. 
Instead of using a clustering algorithm, we are going to use a classification algorithm. Generally, we have the following pipeline:
--8<-- "docs/getting_started/supervised/default_pipeline.svg"

Instead, we are now going to skip over the dimensionality reduction step and replace the clustering step with a classification model:
--8<-- "docs/getting_started/supervised/classification_pipeline.svg"

In other words, we can pass our labels to BERTopic and it will not only learn how to predict labels for new instances, but it also transforms those labels into topics by running the c-TF-IDF representations on the set of documents within each label. This process allows us to model the topics themselves and similarly gives us the option to use everything BERTopic has to offer. To do so, we need to skip over the dimensionality reduction step and replace the clustering step with a classification algorithm. We can use the documents and labels from the 20 NewsGroups dataset to create topics from those 20 labels: ```python from sklearn.datasets import fetch_20newsgroups # Get labeled data data = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes')) docs = data['data'] y = data['target'] ``` Then, we make sure to create empty instances of the dimensionality reduction and clustering steps. We pass those to BERTopic to simply skip over them and go to the topic representation process: ```python from bertopic import BERTopic from bertopic.vectorizers import ClassTfidfTransformer from bertopic.dimensionality import BaseDimensionalityReduction from sklearn.linear_model import LogisticRegression # Get labeled data data = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes')) docs = data['data'] y = data['target'] # Skip over dimensionality reduction, replace cluster model with classifier, # and reduce frequent words while we are at it. empty_dimensionality_model = BaseDimensionalityReduction() clf = LogisticRegression() ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True) # Create a fully supervised BERTopic instance topic_model= BERTopic( umap_model=empty_dimensionality_model, hdbscan_model=clf, ctfidf_model=ctfidf_model ) topics, probs = topic_model.fit_transform(docs, y=y) ``` Let's take a look at a few topics that we get out of training this way by running `topic_model.get_topic_info()`:
--8<-- "docs/getting_started/supervised/table.svg"

We can see several interesting topics appearing here. They seem to relate to the 20 classes we had as input. Now, let's map those topics to our original classes to view their relationship: ```python # Map input `y` to topics mappings = topic_model.topic_mapper_.get_mappings() mappings = {value: data["target_names"][key] for key, value in mappings.items()} # Assign original classes to our topics df = topic_model.get_topic_info() df["Class"] = df.Topic.map(mappings) df ```
--8<-- "docs/getting_started/supervised/table_classes.svg"

We can see that the c-TF-IDF representations extract the words that give a good representation of our input classes. This is all done directly from the labeling. A welcome side-effect is that we now have a classification algorithm that allows us to predict the topics of unseen data: ```python >>> topic, _ = topic_model.transform("this is a document about cars") >>> topic_model.get_topic(topic) [('car', 0.4407600315538472), ('cars', 0.32348015696446325), ('engine', 0.28032518444946686), ('ford', 0.2500224508115155), ('oil', 0.2325984913598611), ('dealer', 0.2310723968585826), ('my', 0.22045777551991935), ('it', 0.21327993649430219), ('tires', 0.20420842634292657), ('brake', 0.20246902481367085)] ``` Moreover, we can still perform BERTopic-specific features like dynamic topic modeling, topics per class, hierarchical topic modeling, modeling topic distributions, etc. !!! note The resulting `topics` may be a different mapping from the `y` labels. To map `y` to `topics`, we can run the following: ```python mappings = topic_model.topic_mapper_.get_mappings() y_mapped = [mappings[val] for val in y] ``` ================================================ FILE: docs/getting_started/tips_and_tricks/tips_and_tricks.md ================================================ # Tips & Tricks ## **Document length** As a default, we are using sentence-transformers to embed our documents. However, as the name implies, the embedding model works best for either sentences or paragraphs. This means that whenever you have a set of documents, where each document contains several paragraphs, the document is truncated and the topic model is only trained on a small part of the data. One way to solve this issue is by splitting up longer documents into either sentences or paragraphs before embedding them. Another solution is to approximate the [topic distributions](https://maartengr.github.io/BERTopic/getting_started/distribution/distribution.html) of topics after having trained your topic model.
## **Removing stop words** At times, stop words might end up in our topic representations. This is something we typically want to avoid as they contribute little to the interpretation of the topics. However, removing stop words as a preprocessing step is not advised as the transformer-based embedding models that we use need the full context in order to create accurate embeddings. Instead, we can use the `CountVectorizer` to preprocess our documents **after** having generated embeddings and clustered our documents. Personally, I have found almost no disadvantages to using the `CountVectorizer` to remove stopwords and it is something I would strongly advise to try out: ```python from bertopic import BERTopic from sklearn.feature_extraction.text import CountVectorizer vectorizer_model = CountVectorizer(stop_words="english") topic_model = BERTopic(vectorizer_model=vectorizer_model) ``` We can also use the `ClassTfidfTransformer` to reduce the impact of frequent words. The end result is very similar to explicitly removing stopwords but this process does this automatically: ```python from bertopic import BERTopic from bertopic.vectorizers import ClassTfidfTransformer ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True) topic_model = BERTopic(ctfidf_model=ctfidf_model) ``` Lastly, we can use a KeyBERT-Inspired model to reduce the appearance of stop words. This also often improves the topic representation: ```python from bertopic.representation import KeyBERTInspired from bertopic import BERTopic # Create your representation model representation_model = KeyBERTInspired() # Use the representation model in BERTopic on top of the default pipeline topic_model = BERTopic(representation_model=representation_model) ``` ## **Diversify topic representation** After having calculated our top *n* words per topic there might be many words that essentially mean the same thing. 
As a little bonus, we can use `bertopic.representation.MaximalMarginalRelevance` in BERTopic to diversify words in each topic such that we limit the number of duplicate words we find in each topic. This is done using an algorithm called Maximal Marginal Relevance which compares word embeddings with the topic embedding. We do this by specifying a value between 0 and 1, with 0 being not at all diverse and 1 being completely diverse: ```python from bertopic import BERTopic from bertopic.representation import MaximalMarginalRelevance representation_model = MaximalMarginalRelevance(diversity=0.2) topic_model = BERTopic(representation_model=representation_model) ``` Since MMR is using word embeddings to diversify the topic representations, it is necessary to pass the embedding model to BERTopic if you are using pre-computed embeddings: ```python from bertopic import BERTopic from bertopic.representation import MaximalMarginalRelevance from sentence_transformers import SentenceTransformer sentence_model = SentenceTransformer("all-MiniLM-L6-v2") embeddings = sentence_model.encode(docs, show_progress_bar=False) representation_model = MaximalMarginalRelevance(diversity=0.2) topic_model = BERTopic(embedding_model=sentence_model, representation_model=representation_model) ``` ## **Topic-term matrix** Although BERTopic focuses on clustering our documents, the end result does contain a topic-term matrix. This topic-term matrix is calculated using c-TF-IDF, a TF-IDF procedure optimized for class-based analyses. To extract the topic-term matrix (or c-TF-IDF matrix) with the corresponding words, we can simply do the following: ```python topic_term_matrix = topic_model.c_tf_idf_ words = topic_model.vectorizer_model.get_feature_names_out() ``` ## **Pre-compute embeddings** Typically, we want to iterate fast over different versions of our BERTopic model whilst we are trying to optimize it to a specific use case.
To speed up this process, we can pre-compute the embeddings, save them, and pass them to BERTopic so it does not need to calculate the embeddings each time: ```python from sklearn.datasets import fetch_20newsgroups from sentence_transformers import SentenceTransformer # Prepare embeddings docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))['data'] sentence_model = SentenceTransformer("all-MiniLM-L6-v2") embeddings = sentence_model.encode(docs, show_progress_bar=False) # Train our topic model using our pre-trained sentence-transformers embeddings topic_model = BERTopic() topics, probs = topic_model.fit_transform(docs, embeddings) ``` ## **Speed up UMAP** At times, UMAP may take a while to fit on the embeddings that you have. This often happens when you have the embeddings of millions of documents that you want to reduce in dimensionality. There is a trick that can speed up this process somewhat: Initializing UMAP with rescaled PCA embeddings. Without going into too much detail (look [here](https://github.com/lmcinnes/umap/issues/771#issuecomment-931886015) for more information), you can reduce the embeddings using PCA and use that as a starting point. This can speed up the dimensionality reduction a bit: ```python import numpy as np from umap import UMAP from bertopic import BERTopic from sklearn.decomposition import PCA def rescale(x, inplace=False): """ Rescale an embedding so optimization will not have convergence issues.
""" if not inplace: x = np.array(x, copy=True) x /= np.std(x[:, 0]) * 10000 return x # Initialize and rescale PCA embeddings pca_embeddings = rescale(PCA(n_components=5).fit_transform(embeddings)) # Start UMAP from PCA embeddings umap_model = UMAP( n_neighbors=15, n_components=5, min_dist=0.0, metric="cosine", init=pca_embeddings, ) # Pass the model to BERTopic: topic_model = BERTopic(umap_model=umap_model) ``` ## **GPU acceleration** You can use [cuML](https://rapids.ai/start.html#rapids-release-selector) to speed up both UMAP and HDBSCAN through GPU acceleration: ```python from bertopic import BERTopic from cuml.cluster import HDBSCAN from cuml.manifold import UMAP # Create instances of GPU-accelerated UMAP and HDBSCAN umap_model = UMAP(n_components=5, n_neighbors=15, min_dist=0.0) hdbscan_model = HDBSCAN(min_samples=10, gen_min_span_tree=True, prediction_data=True) # Pass the above models to be used in BERTopic topic_model = BERTopic(umap_model=umap_model, hdbscan_model=hdbscan_model) topics, probs = topic_model.fit_transform(docs) ``` Depending on the embeddings you are using, you might want to normalize them first in order to force a cosine-related distance metric in UMAP: ```python from cuml.preprocessing import normalize embeddings = normalize(embeddings) ``` !!! note As of the v0.13 release, it is not yet possible to calculate the topic-document probability matrix for unseen data (i.e., `.transform`) using cuML's HDBSCAN. However, it is still possible to calculate the topic-document probability matrix for the data on which the model was trained (i.e., `.fit` and `.fit_transform`). !!! note To install cuML with BERTopic, run these commands: **For CUDA 12:** ```bash !pip install cuml-cu12 !pip install bertopic ``` **For CUDA 13:** ```bash !pip install cuml-cu13 !pip install bertopic ``` !!! warning Install cuML first, then BERTopic. Installing both in a single command can fail due to pip resolver limitations with CUDA runtime dependencies. 
**Note:** cuML is already installed on Google Colab. For more detailed information on installing cuML, including additional dependencies and platform-specific instructions, see the [RAPIDS installation guide](https://docs.rapids.ai/install/). ## **Lightweight installation** The default embedding model in BERTopic is one of the amazing sentence-transformers models, namely `"all-MiniLM-L6-v2"`. Although this model performs well out of the box, it typically needs a GPU to transform the documents into embeddings in a reasonable time. Moreover, the installation requires `pytorch` which often results in a rather large environment, memory-wise. Fortunately, it is possible to install BERTopic without `sentence-transformers`, `UMAP`, `HDBSCAN` and/or `plotly`. This can be useful to reduce the size of your Docker images for inference or when you do not use `pytorch` but, for instance, [Model2Vec](https://github.com/MinishLab/model2vec) instead. The installation can be done as follows: ```bash pip install --no-deps bertopic pip install --upgrade numpy pandas scikit-learn tqdm pyyaml ``` This installs a bare-bones version of BERTopic. If you want to use UMAP and Model2Vec for instance, you'll need to first install them: `pip install model2vec umap-learn` Then, you can use BERTopic without needing to have a GPU: ```python from bertopic import BERTopic from model2vec import StaticModel # Model2Vec embedding_model = StaticModel.from_pretrained("minishlab/potion-base-8M") # BERTopic topic_model = BERTopic(embedding_model=embedding_model) ``` As a result, the entire package and resulting model can be run quickly on the CPU and no GPU is necessary! !!! Note If you have an alternative embedding model, you can use that instead of Model2Vec. Likewise, if you have a different method for dimensionality reduction that you want to use, you can use that instead of UMAP. ## **WordCloud** To minimize the number of dependencies in BERTopic, it is not possible to generate wordclouds out-of-the-box.
However, there is a minimal script that you can use to generate wordclouds in BERTopic. First, you will need to install the [wordcloud](https://github.com/amueller/word_cloud) package with `pip install wordcloud`. Then, run the following code to generate the wordcloud for a specific topic: ```python from wordcloud import WordCloud import matplotlib.pyplot as plt def create_wordcloud(model, topic): text = {word: value for word, value in model.get_topic(topic)} wc = WordCloud(background_color="white", max_words=1000) wc.generate_from_frequencies(text) plt.imshow(wc, interpolation="bilinear") plt.axis("off") plt.show() # Show wordcloud create_wordcloud(topic_model, topic=1) ``` ![](wordcloud.jpg) !!! tip Tip To increase the number of words shown in the wordcloud, you can increase the `top_n_words` parameter when instantiating BERTopic. You can also increase the number of words in a topic after training the model using `.update_topics()`. ## **Finding similar topics between models** Whenever you have trained separate BERTopic models on different datasets, it might be worthwhile to find the similarities among these models. Is there overlap between topics in model A and topics in model B? In other words, can we find topics in model A that are similar to those in model B? We can compare the topic representations of several models in two ways. First, by comparing the topic embeddings that are created when using the same embedding model across both fitted BERTopic instances. Second, we can compare the c-TF-IDF representations instead assuming we have fixed the vocabulary in both instances. This example will go into the former, using the same embedding model across two BERTopic instances.
To do this comparison, let's first create an example where I trained two models, one on an English dataset and one on a Dutch dataset: ```python from datasets import load_dataset from bertopic import BERTopic from sentence_transformers import SentenceTransformer from bertopic import BERTopic from umap import UMAP # The same embedding model needs to be used for both topic models # and since we are dealing with multiple languages, the model needs to be multi-lingual sentence_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2") # To make this example reproducible umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42) # English en_dataset = load_dataset("stsb_multi_mt", name="en", split="train").to_pandas().sentence1.tolist() en_model = BERTopic(embedding_model=sentence_model, umap_model=umap_model) en_model.fit(en_dataset) # Dutch nl_dataset = load_dataset("stsb_multi_mt", name="nl", split="train").to_pandas().sentence1.tolist() nl_model = BERTopic(embedding_model=sentence_model, umap_model=umap_model) nl_model.fit(nl_dataset) ``` In the code above, there is one important thing to note and that is the `sentence_model`. This model needs to be exactly the same in all BERTopic models, otherwise, it is not possible to compare topic models. Next, we can calculate the similarity between topics in the English topic model `en_model` and the Dutch model `nl_model`. To do so, we can simply calculate the cosine similarity between the `topic_embedding` of both models: ```python from sklearn.metrics.pairwise import cosine_similarity sim_matrix = cosine_similarity(en_model.topic_embeddings_, nl_model.topic_embeddings_) ``` Now that we know which topics are similar to each other, we can extract the most similar topics. 
Let's say that we have topic 10 in the `en_model` which represents a topic related to trains: ```python >>> topic = 10 >>> en_model.get_topic(topic) [('train', 0.2588080580844999), ('tracks', 0.1392140438801078), ('station', 0.12126454635946024), ('passenger', 0.058057876475695866), ('engine', 0.05123717127783682), ('railroad', 0.048142847325312044), ('waiting', 0.04098973702226946), ('track', 0.03978248702913929), ('subway', 0.03834661195748458), ('steam', 0.03834661195748458)] ``` To find the matching topic, we extract the most similar topic in the `sim_matrix`: ```python >>> most_similar_topic = np.argmax(sim_matrix[topic + 1])-1 >>> nl_model.get_topic(most_similar_topic) [('trein', 0.24186603209316418), ('spoor', 0.1338118418551581), ('sporen', 0.07683661859111401), ('station', 0.056990389779394225), ('stoommachine', 0.04905829711711234), ('zilveren', 0.04083879598477808), ('treinen', 0.03534099197032758), ('treinsporen', 0.03534099197032758), ('staat', 0.03481332997324445), ('zwarte', 0.03179591746822408)] ``` It seems to be working as, for example, `trein` is a translation of `train` and `sporen` a translation of `tracks`! You can do this for every single topic to find out which topic in the `en_model` might belong to a model in the `nl_model`. ## **Multimodal data** [Concept](https://github.com/MaartenGr/Concept) is a variation of BERTopic for multimodal data, such as images with captions. Although we can use that package for multimodal data, we can perform a small trick with BERTopic to have a similar feature. BERTopic is a relatively modular approach that attempts to isolate steps from one another. This means, for example, that you can use k-Means instead of HDBSCAN or PCA instead of UMAP as it does not make any assumptions with respect to the nature of the clustering. Similarly, you can pass pre-calculated embeddings to BERTopic that represent the documents that you have. 
However, it does not make any assumption with respect to the relationship between those embeddings and the documents. This means that we could pass any metadata to BERTopic to cluster on instead of document embeddings. In this example, we can separate our embeddings from our documents so that the embeddings are generated from images instead of their corresponding captions. Thus, we will cluster image embeddings but create the topic representation from the related captions. In this example, we first need to fetch our data, namely the Flickr 8k dataset that contains images with captions: ```python import os import glob import zipfile import numpy as np import pandas as pd from tqdm import tqdm from PIL import Image from sentence_transformers import SentenceTransformer, util # Flickr 8k images img_folder = 'photos/' caps_folder = 'captions/' if not os.path.exists(img_folder) or len(os.listdir(img_folder)) == 0: os.makedirs(img_folder, exist_ok=True) if not os.path.exists('Flickr8k_Dataset.zip'): #Download dataset if does not exist util.http_get('https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_Dataset.zip', 'Flickr8k_Dataset.zip') util.http_get('https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_text.zip', 'Flickr8k_text.zip') for folder, file in [(img_folder, 'Flickr8k_Dataset.zip'), (caps_folder, 'Flickr8k_text.zip')]: with zipfile.ZipFile(file, 'r') as zf: for member in tqdm(zf.infolist(), desc='Extracting'): zf.extract(member, folder) images = list(glob.glob('photos/Flicker8k_Dataset/*.jpg')) # Prepare dataframe captions = pd.read_csv("captions/Flickr8k.lemma.token.txt",sep='\t',names=["img_id","img_caption"]) captions.img_id = captions.apply(lambda row: "photos/Flicker8k_Dataset/" + row.img_id.split(".jpg")[0] + ".jpg", 1) captions = captions.groupby(["img_id"])["img_caption"].apply(','.join).reset_index() captions = pd.merge(captions, pd.Series(images, name="img_id"), on="img_id") # Extract images together with their
documents/captions images = captions.img_id.to_list() docs = captions.img_caption.to_list() ``` Now that we have our images and captions, we need to generate our image embeddings: ```python model = SentenceTransformer('clip-ViT-B-32') # Prepare images batch_size = 32 nr_iterations = int(np.ceil(len(images) / batch_size)) # Embed images per batch embeddings = [] for i in tqdm(range(nr_iterations)): start_index = i * batch_size end_index = (i * batch_size) + batch_size images_to_embed = [Image.open(filepath) for filepath in images[start_index:end_index]] img_emb = model.encode(images_to_embed, show_progress_bar=False) embeddings.extend(img_emb.tolist()) # Close images for image in images_to_embed: image.close() embeddings = np.array(embeddings) ``` Finally, we can fit BERTopic the way we are used to, with documents and embeddings: ```python from bertopic import BERTopic from sklearn.cluster import KMeans from sklearn.feature_extraction.text import CountVectorizer vectorizer_model = CountVectorizer(stop_words="english") topic_model = BERTopic(vectorizer_model=vectorizer_model) topics, probs = topic_model.fit_transform(docs, embeddings) captions["Topic"] = topics ``` After fitting our model, let's inspect a topic about skateboarders: ```python >>> topic_model.get_topic(2) [('skateboard', 0.09592033177340711), ('skateboarder', 0.07792520092546491), ('trick', 0.07481578896400298), ('ramp', 0.056952605147927216), ('skate', 0.03745127816149923), ('perform', 0.036546213623432654), ('bicycle', 0.03453483070441857), ('bike', 0.033233021253898994), ('jump', 0.026709362981948037), ('air', 0.025422798170830936)] ``` Based on the above output, we can take an image to see if the representation makes sense: ```python image = captions.loc[captions.Topic == 2, "img_id"].values.tolist()[0] Image.open(image) ``` ![](skateboarders.jpg) ## **KeyBERT** & **BERTopic** Although BERTopic focuses on topic extraction methods that does not assume specific structures for the generated clusters, 
it is possible to do this on a more local level. More specifically, we can use KeyBERT to generate a number of keywords for each document and then build a vocabulary on top of that as the input for BERTopic. This way, we can select words that we know have meaning to a topic, without focusing on the centroid of that cluster. This also allows more frequent words to pop up regardless of the structure and density of a cluster. To do this, we first need to run [KeyBERT](https://github.com/MaartenGr/KeyBERT) on our data and create our vocabulary: ```python from sklearn.datasets import fetch_20newsgroups from keybert import KeyBERT # Prepare documents docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))['data'] # Extract keywords kw_model = KeyBERT() keywords = kw_model.extract_keywords(docs) # Create our vocabulary vocabulary = [k[0] for keyword in keywords for k in keyword] vocabulary = list(set(vocabulary)) ``` Then, we pass our `vocabulary` to BERTopic and train the model: ```python from bertopic import BERTopic from sklearn.feature_extraction.text import CountVectorizer vectorizer_model= CountVectorizer(vocabulary=vocabulary) topic_model = BERTopic(vectorizer_model=vectorizer_model) topics, probs = topic_model.fit_transform(docs) ``` ================================================ FILE: docs/getting_started/topicreduction/topicreduction.md ================================================ BERTopic uses HDBSCAN for clustering the data, and HDBSCAN does not let you specify the number of clusters you want. To a certain extent, this is an advantage, as we can trust HDBSCAN to be better at finding the number of clusters than we are. Instead, we can try to reduce the number of topics that have been created. Below, you will find three methods of doing so. !!! Warning For all cases of topic reduction it is generally advised to create the number of topics you want first through the clustering algorithm.
That tends to be the most stable technique and often gives you the best results. This also applies to algorithms that do not allow you to select the number of topics beforehand, like HDBSCAN where you can make use of the `min_cluster_size` parameter to control the number of topics. Therefore, it is **highly** advised to not use `nr_topics` before you have attempted to control the number of topics through the clustering algorithm! ### **Manual Topic Reduction** Each resulting topic has its feature vector constructed from c-TF-IDF. Using those feature vectors, we can find the most similar topics and merge them. Using `sklearn.cluster.AgglomerativeClustering`, the resulting feature vectors are clustered to get to the set value of `nr_topics` by finding out which topics are most similar to one another through cosine similarity. To do so, you can make use of the `nr_topics` parameter: ```python from bertopic import BERTopic topic_model = BERTopic(nr_topics=20) ``` It is also possible to manually select certain topics that you believe should be merged. For example, if topic 1 is `1_space_launch_moon_nasa` and topic 2 is `2_spacecraft_solar_space_orbit` it might make sense to merge those two topics: ```python topics_to_merge = [1, 2] topic_model.merge_topics(docs, topics_to_merge) ``` If you have several groups of topics you want to merge, create a list of lists instead: ```python topics_to_merge = [[1, 2], [3, 4]] topic_model.merge_topics(docs, topics_to_merge) ``` ### **Automatic Topic Reduction** One issue with the approach above is that it will merge topics regardless of whether they are very similar. They are simply the most similar out of all options. This can be resolved by reducing the number of topics automatically. To do this, we can use HDBSCAN to cluster our topics using each c-TF-IDF representation. Then, we merge topics that are clustered together. Another benefit of HDBSCAN is that it generates outliers.
These outliers prevent topics from being merged if no other topics are similar. To use this option, we simply set `nr_topics` to `"auto"`: ```python from bertopic import BERTopic topic_model = BERTopic(nr_topics="auto") ``` ### **Topic Reduction after Training** Finally, we can also reduce the number of topics after having trained a BERTopic model. The advantage of doing so is that you can decide the number of topics after knowing how many are created. It is difficult to predict before training your model how many topics are in your documents and how many will be extracted. Instead, we can decide afterward how many topics seem realistic: ```python from bertopic import BERTopic from sklearn.datasets import fetch_20newsgroups # Create topics -> Typically over 50 topics docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))['data'] topic_model = BERTopic() topics, probs = topic_model.fit_transform(docs) # Further reduce topics topic_model.reduce_topics(docs, nr_topics=30) # Access updated topics topics = topic_model.topics_ ``` The reasoning for putting `docs` as a parameter is that the documents are not saved within BERTopic on purpose. If you were to have a million documents, it is very inefficient to save those in BERTopic instead of a dedicated database. ================================================ FILE: docs/getting_started/topicrepresentation/topicrepresentation.md ================================================ The topics that are extracted from BERTopic are represented by words. These words are extracted from the documents occupying their topics using a class-based TF-IDF. This allows us to extract words that are interesting to a topic but less so to another. ### **Update Topic Representation after Training** When you have trained a model and viewed the topics and the words that represent them, you might not be satisfied with the representation.
Perhaps you forgot to remove stop_words or you want to try out a different n_gram_range. We can use the function `update_topics` to update the topic representation with new parameters for `c-TF-IDF`: ```python from bertopic import BERTopic from sklearn.datasets import fetch_20newsgroups # Create topics docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))['data'] topic_model = BERTopic(n_gram_range=(2, 3)) topics, probs = topic_model.fit_transform(docs) ``` From the model created above, one of the most frequent topics is the following: ```python >>> topic_model.get_topic(31)[:10] [('clipper chip', 0.007240771542316232), ('key escrow', 0.004601603973377443), ('law enforcement', 0.004277247929596332), ('intercon com', 0.0035961920238955824), ('amanda walker', 0.003474856425297157), ('serial number', 0.0029876119137150358), ('com amanda', 0.002789303096817983), ('intercon com amanda', 0.0027386688593327084), ('amanda intercon', 0.002585262048515583), ('amanda intercon com', 0.002585262048515583)] ``` Although there does seems to be some relation between words, it is difficult, at least for me, to intuitively understand what the topic is about. Instead, let's simplify the topic representation by setting `n_gram_range` to (1, 3) to also allow for single words. ```python >>> topic_model.update_topics(docs, n_gram_range=(1, 3)) >>> topic_model.get_topic(31)[:10] [('encryption', 0.008021846079148017), ('clipper', 0.00789642647602742), ('chip', 0.00637127942464045), ('key', 0.006363124787175884), ('escrow', 0.005030980365244285), ('clipper chip', 0.0048271268437973395), ('keys', 0.0043245812747907545), ('crypto', 0.004311198708675516), ('intercon', 0.0038772934659295076), ('amanda', 0.003516026493904586)] ``` To me, the combination of the words above seem a bit more intuitive than the words we previously had! 
You can play around with `n_gram_range` or use your own custom `sklearn.feature_extraction.text.CountVectorizer` and pass that instead: ```python from sklearn.feature_extraction.text import CountVectorizer vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 5)) topic_model.update_topics(docs, vectorizer_model=vectorizer_model) ``` !!! Tip "Tip!" If you want to change the topics to something else, whether that is merging them or removing outliers, you can pass a custom list of topics to update them: `topic_model.update_topics(docs, topics=my_updated_topics)` ### **Custom labels** The topic labels are currently automatically generated by taking the top 3 words and combining them using the `_` separator. Although this is an informative label, in practice, this is definitely not the prettiest nor necessarily the most accurate label. For example, although the topic label `1_space_nasa_orbit` is informative, we would prefer to have a slightly more intuitive label, such as `space travel`. The difficulty with creating such topic labels is that much of the interpretation is left to the user. Would `space travel` be more accurate or perhaps `space explorations`? To truly understand which labels are most suited, going into some of the documents in topics is especially helpful. Although we can go through every single topic ourselves and try to label them, we can start by creating an overview of labels that have the length and number of words that we are looking for. To do so, we can generate our list of topic labels with `.generate_topic_labels` and define the number of words, the separator, word length, etc: ```python topic_labels = topic_model.generate_topic_labels(nr_words=3, topic_prefix=False, word_length=10, separator=", ") ``` !!!
Tip If you created [**multiple topic representations**](https://maartengr.github.io/BERTopic/getting_started/multiaspect/multiaspect.html) or aspects, you can choose one of these aspects with `aspect="Aspect1"` or whatever you named the aspect. In the above example, `1_space_nasa_orbit` would turn into `space, nasa, orbit` since we selected 3 words, no topic prefix, and the `, ` separator. We can then either change our `topic_labels` to whatever we want or directly pass them to `.set_topic_labels` so that they can be used across most visualization functions: ```python topic_model.set_topic_labels(topic_labels) ``` It is also possible to only change a few topic labels at a time by passing a dictionary where the key represents the *topic ID* and the value is the *topic label*: ```python topic_model.set_topic_labels({1: "Space Travel", 7: "Religion"}) ``` Then, to make use of those custom topic labels across visualizations, such as `.visualize_hierarchy()`, we can use the `custom_labels=True` parameter that is found in most visualizations. ```python fig = topic_model.visualize_barchart(custom_labels=True) ``` #### Optimize labels The great advantage of passing custom labels to BERTopic is that when more accurate zero-shot models are released, we can simply use those on top of BERTopic to further fine-tune the labeling. For example, let's say you have a set of potential topic labels that you want to use instead of the ones generated by BERTopic. 
You could use the [bart-large-mnli](https://huggingface.co/facebook/bart-large-mnli) model to find which user-defined labels best represent the BERTopic-generated labels: ```python from transformers import pipeline classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli") # A selected topic representation # 'god jesus atheists atheism belief atheist believe exist beliefs existence' sequence_to_classify = " ".join([word for word, _ in topic_model.get_topic(1)]) # Our set of potential topic labels candidate_labels = ['cooking', 'dancing', 'religion'] classifier(sequence_to_classify, candidate_labels) #{'labels': ['cooking', 'dancing', 'religion'], # 'scores': [0.086, 0.063, 0.850], # 'sequence': 'god jesus atheists atheism belief atheist believe exist beliefs existence'} ``` ================================================ FILE: docs/getting_started/topicsovertime/topicsovertime.md ================================================ Dynamic topic modeling (DTM) is a collection of techniques aimed at analyzing the evolution of topics over time. These methods allow you to understand how a topic is represented across different times. For example, in 1995 people may talk differently about environmental awareness than those in 2015. Although the topic itself remains the same, environmental awareness, the exact representation of that topic might differ. BERTopic allows for DTM by calculating the topic representation at each timestep without the need to run the entire model several times. To do this, we first need to fit BERTopic as if there were no temporal aspect in the data. Thus, a general topic model will be created. We use the global representation as to the main topics that can be found at, most likely, different timesteps. For each topic and timestep, we calculate the c-TF-IDF representation. This will result in a specific topic representation at each timestep without the need to create clusters from embeddings as they were already created.
--8<-- "docs/getting_started/topicsovertime/topicsovertime.svg"

Next, there are two main ways to further fine-tune these specific topic representations, namely **globally** and **evolutionary**. A topic representation at timestep *t* can be fine-tuned **globally** by averaging its c-TF-IDF representation with that of the global representation. This allows each topic representation to move slightly towards the global representation whilst still keeping some of its specific words. A topic representation at timestep *t* can be fine-tuned **evolutionary** by averaging its c-TF-IDF representation with that of the c-TF-IDF representation at timestep *t-1*. This is done for each topic representation allowing for the representations to evolve over time. Both fine-tuning methods are set to `True` as a default and allow for interesting representations to be created. ## **Example** To demonstrate DTM in BERTopic, we first need to prepare our data. A good example of where DTM is useful is topic modeling on Twitter data. We can analyze how certain people have talked about certain topics in the years they have been on Twitter. Due to the controversial nature of his tweets, we are going to be using all tweets by Donald Trump. First, we need to load the data and do some very basic cleaning. 
For example, I am not interested in his re-tweets for this use-case: ```python import re import pandas as pd # Prepare data trump = pd.read_csv('https://drive.google.com/uc?export=download&id=1xRKHaP-QwACMydlDnyFPEaFdtskJuBa6') trump.text = trump.apply(lambda row: re.sub(r"http\S+", "", row.text).lower(), 1) trump.text = trump.apply(lambda row: " ".join(filter(lambda x:x[0]!="@", row.text.split())), 1) trump.text = trump.apply(lambda row: " ".join(re.sub("[^a-zA-Z]+", " ", row.text).split()), 1) trump = trump.loc[(trump.isRetweet == "f") & (trump.text != ""), :] timestamps = trump.date.to_list() tweets = trump.text.to_list() ``` Then, we need to extract the global topic representations by simply creating and training a BERTopic model: ```python from bertopic import BERTopic topic_model = BERTopic(verbose=True) topics, probs = topic_model.fit_transform(tweets) ``` From these topics, we are going to generate the topic representations at each timestamp for each topic. We do this by simply calling `topics_over_time` and passing the tweets, the corresponding timestamps, and the related topics: ```python topics_over_time = topic_model.topics_over_time(tweets, timestamps, nr_bins=20) ``` And that is it! Aside from what you always need for BERTopic, you now only need to add `timestamps` to quickly calculate the topics over time. ## **Parameters** There are a few parameters that are of interest which will be discussed below. ### **Tuning** Both `global_tuning` and `evolution_tuning` are set to True as a default, but can easily be changed. 
Perhaps you do not want the representations to be influenced by the global representation and merely see how they evolved over time: ```python topics_over_time = topic_model.topics_over_time(tweets, timestamps, global_tuning=False, evolution_tuning=True, nr_bins=20) ``` ### **Bins** If you have more than 100 unique timestamps, then there will be topic representations created for each of those timestamps which can negatively affect the topic representations. It is advised to keep the number of unique timestamps below 50. To do this, you can simply set the number of bins that are created when calculating the topic representations. The timestamps will be taken and put into equal-sized bins: ```python topics_over_time = topic_model.topics_over_time(tweets, timestamps, nr_bins=20) ``` ### **Datetime format** If you are passing strings (dates) instead of integers, then BERTopic will try to automatically detect which datetime format your strings have. Unfortunately, this will not always work if they are in an unexpected format. We can use `datetime_format` to pass the format the timestamps have: ```python topics_over_time = topic_model.topics_over_time(tweets, timestamps, datetime_format="%b%M", nr_bins=20) ``` ## **Visualization** To me, DTM becomes truly interesting when you have a good way of visualizing how topics have changed over time. A nice way of doing so is by leveraging the interactive abilities of Plotly. Plotly allows us to show the frequency of topics over time whilst giving the option of hovering over the points to show the time-specific topic representations. Simply call `visualize_topics_over_time` with the newly created topics over time: ```python topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=20) ``` I used `top_n_topics` to only show the top 20 most frequent topics. If I were to visualize all topics, which is possible by leaving `top_n_topics` empty, there is a chance that hundreds of lines will fill the plot. 
You can also use `topics` to show specific topics: ```python topic_model.visualize_topics_over_time(topics_over_time, topics=[9, 10, 72, 83, 87, 91]) ``` ================================================ FILE: docs/getting_started/topicsovertime/trump.html ================================================
================================================ FILE: docs/getting_started/topicsperclass/topics_per_class.html ================================================
================================================ FILE: docs/getting_started/topicsperclass/topicsperclass.md ================================================ In some cases, you might be interested in how certain topics are represented over certain categories. Perhaps there are specific groups of users for which you want to see how they talk about certain topics. Instead of running the topic model per class, we can simply create a topic model and then extract, for each topic, its representation per class. This allows you to see how certain topics, calculated over all documents, are represented for certain subgroups.
--8<-- "docs/getting_started/topicsperclass/class_modeling.svg"

To do so, we use the 20 Newsgroups dataset to see how the topics that we uncover are represented in the 20 categories of documents. First, let's prepare the data: ```python from bertopic import BERTopic from sklearn.datasets import fetch_20newsgroups data = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes')) docs = data["data"] targets = data["target"] target_names = data["target_names"] classes = [data["target_names"][i] for i in data["target"]] ``` Next, we want to extract the topics across all documents without taking the categories into account: ```python topic_model = BERTopic(verbose=True) topics, probs = topic_model.fit_transform(docs) ``` Now that we have created our global topic model, let us calculate the topic representations across each category: ```python topics_per_class = topic_model.topics_per_class(docs, classes=classes) ``` The `classes` variable contains the class for each document. Then, we simply visualize these topics per class: ```python topic_model.visualize_topics_per_class(topics_per_class, top_n_topics=10) ``` You can hover over the bars to see the topic representation per class. As you can see in the visualization above, the topics `93_homosexual_homosexuality_sex` and `58_bike_bikes_motorcycle` are somewhat distributed over all classes. You can see that the topic representation between rec.motorcycles and rec.autos in `58_bike_bikes_motorcycle` clearly differs from one another. It seems that BERTopic has tried to combine those two categories into a single topic. However, since they do contain two separate topics, the topic representation in those two categories differs. We see something similar for `93_homosexual_homosexuality_sex`, where the topic is distributed among several categories and is represented slightly differently. Thus, you can see that although in certain categories the topic is similar, the way the topic is represented can differ. 
================================================ FILE: docs/getting_started/vectorizers/vectorizers.md ================================================ In topic modeling, the quality of the topic representations is key for interpreting the topics, communicating results, and understanding patterns. It is of utmost importance to make sure that the topic representations fit with your use case. In practice, there is not one correct way of creating topic representations. Some use cases might opt for higher n-grams, whereas others might focus more on single words without any stop words. The diversity in use cases also means that we need to have some flexibility in BERTopic to make sure it can be used across most use cases. The image below illustrates this modularity:
![Image title](vectorizers.svg)
In this section, we will go through several examples of vectorization algorithms and how they can be implemented. ## **CountVectorizer** One often underestimated component of BERTopic is the `CountVectorizer` and `c-TF-IDF` calculation. Together, they are responsible for creating the topic representations and luckily can be quite flexible in parameter tuning. Here, we will go through tips and tricks for tuning your `CountVectorizer` and see how they might affect the topic representations. Before starting, it should be noted that you can pass the `CountVectorizer` before and after training your topic model. Passing it before training allows you to minimize the size of the resulting `c-TF-IDF` matrix: ```python from bertopic import BERTopic from sklearn.feature_extraction.text import CountVectorizer # Train BERTopic with a custom CountVectorizer vectorizer_model = CountVectorizer(min_df=10) topic_model = BERTopic(vectorizer_model=vectorizer_model) topics, probs = topic_model.fit_transform(docs) ``` Passing it after training allows you to fine-tune the topic representations by using `.update_topics()`: ```python from bertopic import BERTopic from sklearn.feature_extraction.text import CountVectorizer # Train a BERTopic model topic_model = BERTopic() topics, probs = topic_model.fit_transform(docs) # Fine-tune topic representations after training BERTopic vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 3), min_df=10) topic_model.update_topics(docs, vectorizer_model=vectorizer_model) ``` The great thing about using `.update_topics()` is that it allows you to tweak the topic representations without re-training your model! Thus, here we will be focusing on fine-tuning our topic representations after training our model. !!! note The great thing about processing our topic representations with the `CountVectorizer` is that it does **not** influence the quality of clusters as that is being performed before generating the topic representations. 
### **Basic Usage** First, let's start with defining our documents and training our topic model: ```python from bertopic import BERTopic from sklearn.datasets import fetch_20newsgroups # Prepare documents docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))['data'] # Train a BERTopic model topic_model = BERTopic() topics, probs = topic_model.fit_transform(docs) ``` Now, let's see the top 10 most frequent topics that have been generated: ```python >>> topic_model.get_topic_info()[1:11] Topic Count Name 1 0 1822 0_game_team_games_he 2 1 580 1_key_clipper_chip_encryption 3 2 532 2_ites_hello_cheek_hi 4 3 493 3_israel_israeli_jews_arab 5 4 453 4_card_monitor_video_drivers 6 5 438 5_you_your_post_jim 7 6 314 6_car_cars_engine_ford 8 7 279 7_health_newsgroup_cancer_1993 9 8 218 8_fbi_koresh_fire_gas 10 9 174 9_amp_audio_condition_asking ``` The topic representations generated already seem quite interpretable! However, I am quite sure we can do much better without having to re-train our model. Next, we will go through common parameters in `CountVectorizer` and focus on the effects that they might have. As a baseline, we will be comparing them to the topic representation above. ### **Parameters** There are several basic parameters in the CountVectorizer that we can use to improve upon the quality of the resulting topic representations. #### ngram_range The `ngram_range` parameter allows us to decide how many tokens each entity is in a topic representation. For example, we have words like `game` and `team` with a length of 1 in a topic but it would also make sense to have words like `hockey league` with a length of 2. 
To allow for these words to be generated, we can set the `ngram_range` parameter: ```python from sklearn.feature_extraction.text import CountVectorizer vectorizer_model = CountVectorizer(ngram_range=(1, 3), stop_words="english") topic_model.update_topics(docs, vectorizer_model=vectorizer_model) ``` As you might have noticed, I also added `stop_words="english"`. This is necessary as longer words tend to have many stop words and removing them allows for nicer topic representations: ```python >>> topic_model.get_topic_info()[1:11] Topic Count Name 1 0 1822 0_game_team_games_players 2 1 580 1_key_clipper_chip_encryption 3 2 532 2_hello ites_forget hello_ites 15_huh hi 4 3 493 3_israel_israeli_jews_arab 5 4 453 4_card_monitor_video_drivers 6 5 438 5_post_jim_context_forged 7 6 314 6_car_cars_engine_ford 8 7 279 7_health_newsgroup_cancer_1993 9 8 218 8_fbi_koresh_gas_compound 10 9 174 9_amp_audio_condition_asking ``` Although they look very similar, if we zoom in on topic 8, we can see longer words in our representation: ```python >>> topic_model.get_topic(8) [('fbi', 0.019637149205975653), ('koresh', 0.019054514637064403), ('gas', 0.014156057632897179), ('compound', 0.012381224868591681), ('batf', 0.010349992314076047), ('children', 0.009336408916322387), ('tear gas', 0.008941747802855279), ('tear', 0.008446786597564537), ('davidians', 0.007911119583253022), ('started', 0.007398687505638955)] ``` `tear` and `gas` have now been combined into a single representation. This helps us understand what those individual words might have been representing. #### stop_words In some of the topics, we can see stop words appearing like `he` or `the`. Stop words are something we typically want to prevent in our topic representations as they do not give additional information to the topic. 
To prevent those stop words, we can use the `stop_words` parameter in the `CountVectorizer` to remove them from the representations: ```python from sklearn.feature_extraction.text import CountVectorizer vectorizer_model = CountVectorizer(stop_words="english") topic_model.update_topics(docs, vectorizer_model=vectorizer_model) ``` After running the above, we get the following output: ```python >>> topic_model.get_topic_info()[1:11] Topic Count Name 1 0 1822 0_game_team_games_players 2 1 580 1_key_clipper_chip_encryption 3 2 532 2_ites_cheek_hello_hi 4 3 493 3_israel_israeli_jews_arab 5 4 453 4_monitor_card_video_vga 6 5 438 5_post_jim_context_forged 7 6 314 6_car_cars_engine_ford 8 7 279 7_health_newsgroup_cancer_tobacco 9 8 218 8_fbi_koresh_gas_compound 10 9 174 9_amp_audio_condition_stereo ``` As you can see, the topic representations already look much better! Stop words are removed and the representations are more interpretable. We can also pass in a list of stop words if you have multiple languages to take into account. #### min_df One important parameter to keep in mind is the `min_df`. This is typically an integer representing how frequent a word must be before being added to our representation. You can imagine that if we have a million documents and a certain word only appears a single time across all of them, then it would be highly unlikely to be representative of a topic. Typically, the `c-TF-IDF` calculation removes that word from the topic representation but when you have millions of documents, that will also lead to a very large topic-term matrix. To prevent a huge vocabulary, we can set the `min_df` to only accept words that have a minimum frequency. 
When you have millions of documents or error issues, I would advise increasing the value of `min_df` as long as the topic representations make sense: ```python from sklearn.feature_extraction.text import CountVectorizer vectorizer_model = CountVectorizer(min_df=10) topic_model.update_topics(docs, vectorizer_model=vectorizer_model) ``` With the following topic representation: ```python >>> topic_model.get_topic_info()[1:11] Topic Count Name 1 0 1822 0_game_team_games_he 2 1 580 1_key_clipper_chip_encryption 3 2 532 2_hello_hi_yep_huh 4 3 493 3_israel_jews_jewish_peace 5 4 453 4_card_monitor_video_drivers 6 5 438 5_you_your_post_jim 7 6 314 6_car_cars_engine_ford 8 7 279 7_health_newsgroup_cancer_1993 9 8 218 8_fbi_koresh_fire_gas 10 9 174 9_audio_condition_stereo_asking ``` As you can see, the output is nearly the same which is what we would like to achieve. All words that appear less than 10 times are now removed from our topic-term matrix (i.e., `c-TF-IDF` matrix) which drastically lowers the matrix in size. #### max_features A parameter similar to `min_df` is `max_features` which allows you to select the top n most frequent words to be used in the topic representation. Setting this, for example, to `10_000` creates a topic-term matrix with `10_000` terms. 
This helps you control the size of the topic-term matrix directly without having to fiddle around with the `min_df` parameter: ```python from sklearn.feature_extraction.text import CountVectorizer vectorizer_model = CountVectorizer(max_features=10_000) topic_model.update_topics(docs, vectorizer_model=vectorizer_model) ``` With the following representation: ```python >>> topic_model.get_topic_info()[1:11] Topic Count Name 1 0 1822 0_game_team_games_he 2 1 580 1_key_clipper_chip_encryption 3 2 532 2_hello_hi_yep_huh 4 3 493 3_israel_israeli_jews_arab 5 4 453 4_card_monitor_video_drivers 6 5 438 5_you_your_post_jim 7 6 314 6_car_cars_engine_ford 8 7 279 7_health_newsgroup_cancer_1993 9 8 218 8_fbi_koresh_fire_gas 10 9 174 9_amp_audio_condition_asking ``` As with `min_df`, we would like the topic representations to be very similar. #### tokenizer The default tokenizer in the CountVectorizer works well for western languages but fails to tokenize some non-western languages, like Chinese. Fortunately, we can use the `tokenizer` variable in the CountVectorizer to use [`jieba`](https://github.com/fxsjy/jieba), which is a package for Chinese text segmentation. Using it is straightforward: ```python from sklearn.feature_extraction.text import CountVectorizer import jieba def tokenize_zh(text): words = jieba.lcut(text) return words vectorizer_model = CountVectorizer(tokenizer=tokenize_zh) ``` Then, we can simply pass the vectorizer to update our topic representations: ```python topic_model.update_topics(docs, vectorizer_model=vectorizer_model) ``` ## **OnlineCountVectorizer** When using the online/incremental variant of BERTopic, we need a `CountVectorizer` that can incrementally update its representation. For that purpose, `OnlineCountVectorizer` was created that not only updates out-of-vocabulary words but also implements decay and cleaning functions to prevent the sparse bag-of-words matrix from becoming too large. 
It is a class that can be found in `bertopic.vectorizers` which extends `sklearn.feature_extraction.text.CountVectorizer`. In other words, you can use the exact same parameters in `OnlineCountVectorizer` as found in Scikit-Learn's `CountVectorizer`. We can use it as follows: ```python from bertopic import BERTopic from bertopic.vectorizers import OnlineCountVectorizer # Train BERTopic with a custom OnlineCountVectorizer vectorizer_model = OnlineCountVectorizer() topic_model = BERTopic(vectorizer_model=vectorizer_model) ``` ### **Parameters** Other than parameters found in `CountVectorizer`, such as `stop_words` and `ngram_range`, we can use two parameters in `OnlineCountVectorizer` to adjust the way old data is processed and kept. #### decay At each iteration, we sum the bag-of-words representation of the new documents with the bag-of-words representation of all documents processed thus far. In other words, the bag-of-words matrix keeps increasing with each iteration. However, especially in a streaming setting, older documents might become less and less relevant as time goes on. Therefore, a `decay` parameter was implemented that decays the bag-of-words' frequencies at each iteration before adding the document frequencies of new documents. The `decay` parameter is a value between 0 and 1 and indicates the percentage of frequencies the previous bag-of-words matrix should be reduced to. For example, a value of `.1` will decrease the frequencies in the bag-of-words matrix by 10% at each iteration before adding the new bag-of-words matrix. This will make sure that recent data has more weight than previous iterations. #### delete_min_df In BERTopic, we might want to remove words from the topic representation that appear infrequently. The `min_df` in the `CountVectorizer` works quite well for that. However, when we have a streaming setting, the `min_df` does not work as well since a word's frequency might start below `min_df` but will end up higher than that over time. 
Setting that value high might not always be advised. As a result, the vocabulary of the resulting bag-of-words matrix can become quite large. Similarly, if we implement the `decay` parameter, then some values will decrease over time until they are below `min_df`. For these reasons, the `delete_min_df` parameter was implemented. The parameter takes positive integers and indicates, at each iteration, which words will be removed. If the value is set to 5, it will check after each iteration if the total frequency of a word is exceeded by that value. If so, the word will be removed in its entirety from the bag-of-words matrix. This helps to keep the bag-of-words matrix of a manageable size. !!! note Although the `delete_min_df` parameter removes words from the bag-of-words matrix, it is not permanent. If new documents come in where those previously deleted words are used frequently, they get added back to the matrix. ================================================ FILE: docs/getting_started/visualization/bar_chart.html ================================================
================================================ FILE: docs/getting_started/visualization/datamapplot.html ================================================ Interactive Data Map
Loading...
Point Data: 0%
Label Data: 0%
Meta Data: 0%
================================================ FILE: docs/getting_started/visualization/documents.html ================================================
================================================ FILE: docs/getting_started/visualization/heatmap.html ================================================
================================================ FILE: docs/getting_started/visualization/hierarchical_documents.html ================================================
================================================ FILE: docs/getting_started/visualization/hierarchical_topics.html ================================================
================================================ FILE: docs/getting_started/visualization/hierarchy.html ================================================
================================================ FILE: docs/getting_started/visualization/probabilities.html ================================================
================================================ FILE: docs/getting_started/visualization/term_rank.html ================================================
================================================ FILE: docs/getting_started/visualization/term_rank_log.html ================================================
================================================ FILE: docs/getting_started/visualization/topics_per_class.html ================================================
================================================ FILE: docs/getting_started/visualization/trump.html ================================================
================================================ FILE: docs/getting_started/visualization/visualization.md ================================================ Visualizing BERTopic and its derivatives is important in understanding the model, how it works, and more importantly, where it works. Since topic modeling can be quite a subjective field it is difficult for users to validate their models. Looking at the topics and seeing if they make sense is an important factor in alleviating this issue. ## **Visualize Topics** After having trained our `BERTopic` model, we can iteratively go through hundreds of topics to get a good understanding of the topics that were extracted. However, that takes quite some time and lacks a global representation. Instead, we can visualize the topics that were generated in a way very similar to [LDAvis](https://github.com/cpsievert/LDAvis). We embed our c-TF-IDF representation of the topics in 2D using Umap and then visualize the two dimensions using plotly such that we can create an interactive view. First, we need to train our model: ```python from bertopic import BERTopic from sklearn.datasets import fetch_20newsgroups docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))['data'] topic_model = BERTopic() topics, probs = topic_model.fit_transform(docs) ``` Then, we can call `.visualize_topics` to create a 2D representation of your topics. The resulting graph is a plotly interactive graph which can be converted to HTML: ```python topic_model.visualize_topics() ``` You can use the slider to select the topic which then lights up red. If you hover over a topic, then general information is given about the topic, including the size of the topic and its corresponding words. ## **Visualize Documents** Using the previous method, we can visualize the topics and get insight into their relationships. 
However, you might want a more fine-grained approach where we can visualize the documents inside the topics to see if they were assigned correctly or whether they make sense. To do so, we can use the `topic_model.visualize_documents()` function. This function recalculates the document embeddings and reduces them to 2-dimensional space for easier visualization purposes. This process can be quite expensive, so it is advised to adhere to the following pipeline: ```python from sklearn.datasets import fetch_20newsgroups from sentence_transformers import SentenceTransformer from bertopic import BERTopic from umap import UMAP # Prepare embeddings docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))['data'] sentence_model = SentenceTransformer("all-MiniLM-L6-v2") embeddings = sentence_model.encode(docs, show_progress_bar=False) # Train BERTopic topic_model = BERTopic().fit(docs, embeddings) # Run the visualization with the original embeddings topic_model.visualize_documents(docs, embeddings=embeddings) # Reduce dimensionality of embeddings, this step is optional but much faster to perform iteratively: reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings) topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings) ``` !!! note The visualization above was generated with the additional parameter `hide_document_hover=True` which disables the option to hover over the individual points and see the content of the documents. This was done for demonstration purposes as saving all those documents in the visualization can be quite expensive and result in large files. However, it might be interesting to set `hide_document_hover=False` in order to hover over the points and see the content of the documents. ### **Custom Hover** When you visualize the documents, you might not always want to see the complete document over hover. 
Many documents have shorter information that might be more interesting to visualize, such as its title. To create the hover based on a document's title instead of its content, you can simply pass a variable (`titles`) containing the title for each document: ```python topic_model.visualize_documents(titles, reduced_embeddings=reduced_embeddings) ``` ## **Visualize Topic Hierarchy** The topics that were created can be hierarchically reduced. In order to understand the potential hierarchical structure of the topics, we can use `scipy.cluster.hierarchy` to create clusters and visualize how they relate to one another. This might help to select an appropriate `nr_topics` when reducing the number of topics that you have created. To visualize this hierarchy, run the following: ```python topic_model.visualize_hierarchy() ``` !!! note Do note that this is not the actual procedure of `.reduce_topics()` when `nr_topics` is set to auto since HDBSCAN is used to automatically extract topics. The visualization above closely resembles the actual procedure of `.reduce_topics()` when any number of `nr_topics` is selected. ### **Hierarchical labels** Although visualizing this hierarchy gives us information about the structure, it would be helpful to see what happens to the topic representations when merging topics. 
To do so, we first need to calculate the representations of the hierarchical topics: First, we train a basic BERTopic model: ```python from bertopic import BERTopic from sklearn.datasets import fetch_20newsgroups docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))["data"] topic_model = BERTopic(verbose=True) topics, probs = topic_model.fit_transform(docs) hierarchical_topics = topic_model.hierarchical_topics(docs) ``` To visualize these results, we simply need to pass the resulting `hierarchical_topics` to our `.visualize_hierarchy` function: ```python topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics) ``` If you **hover** over the black circles, you will see the topic representation at that level of the hierarchy. These representations help you understand the effect of merging certain topics. Some might be logical to merge whilst others might not. Moreover, we can now see which sub-topics can be found within certain larger themes. ### **Text-based topic tree** Although this gives a nice overview of the potential hierarchy, hovering over all black circles can be tiresome. Instead, we can use `topic_model.get_topic_tree` to create a text-based representation of this hierarchy. Although the general structure is more difficult to view, we can see better which topics could be logically merged: ```python >>> tree = topic_model.get_topic_tree(hierarchical_topics) >>> print(tree) . └─atheists_atheism_god_moral_atheist ├─atheists_atheism_god_atheist_argument │ ├─■──atheists_atheism_god_atheist_argument ── Topic: 21 │ └─■──br_god_exist_genetic_existence ── Topic: 124 └─■──moral_morality_objective_immoral_morals ── Topic: 29 ```
Click here to view the full tree. ```bash . ├─people_armenian_said_god_armenians │ ├─god_jesus_jehovah_lord_christ │ │ ├─god_jesus_jehovah_lord_christ │ │ │ ├─jehovah_lord_mormon_mcconkie_god │ │ │ │ ├─■──ra_satan_thou_god_lucifer ── Topic: 94 │ │ │ │ └─■──jehovah_lord_mormon_mcconkie_unto ── Topic: 78 │ │ │ └─jesus_mary_god_hell_sin │ │ │ ├─jesus_hell_god_eternal_heaven │ │ │ │ ├─hell_jesus_eternal_god_heaven │ │ │ │ │ ├─■──jesus_tomb_disciples_resurrection_john ── Topic: 69 │ │ │ │ │ └─■──hell_eternal_god_jesus_heaven ── Topic: 53 │ │ │ │ └─■──aaron_baptism_sin_law_god ── Topic: 89 │ │ │ └─■──mary_sin_maria_priest_conception ── Topic: 56 │ │ └─■──marriage_married_marry_ceremony_marriages ── Topic: 110 │ └─people_armenian_armenians_said_mr │ ├─people_armenian_armenians_said_israel │ │ ├─god_homosexual_homosexuality_atheists_sex │ │ │ ├─homosexual_homosexuality_sex_gay_homosexuals │ │ │ │ ├─■──kinsey_sex_gay_men_sexual ── Topic: 44 │ │ │ │ └─homosexuality_homosexual_sin_homosexuals_gay │ │ │ │ ├─■──gay_homosexual_homosexuals_sexual_cramer ── Topic: 50 │ │ │ │ └─■──homosexuality_homosexual_sin_paul_sex ── Topic: 27 │ │ │ └─god_atheists_atheism_moral_atheist │ │ │ ├─islam_quran_judas_islamic_book │ │ │ │ ├─■──jim_context_challenges_articles_quote ── Topic: 36 │ │ │ │ └─islam_quran_judas_islamic_book │ │ │ │ ├─■──islam_quran_islamic_rushdie_muslims ── Topic: 31 │ │ │ │ └─■──judas_scripture_bible_books_greek ── Topic: 33 │ │ │ └─atheists_atheism_god_moral_atheist │ │ │ ├─atheists_atheism_god_atheist_argument │ │ │ │ ├─■──atheists_atheism_god_atheist_argument ── Topic: 21 │ │ │ │ └─■──br_god_exist_genetic_existence ── Topic: 124 │ │ │ └─■──moral_morality_objective_immoral_morals ── Topic: 29 │ │ └─armenian_armenians_people_israel_said │ │ ├─armenian_armenians_israel_people_jews │ │ │ ├─tax_rights_government_income_taxes │ │ │ │ ├─■──rights_right_slavery_slaves_residence ── Topic: 106 │ │ │ │ └─tax_government_taxes_income_libertarians │ │ │ │ 
├─■──government_libertarians_libertarian_regulation_party ── Topic: 58 │ │ │ │ └─■──tax_taxes_income_billion_deficit ── Topic: 41 │ │ │ └─armenian_armenians_israel_people_jews │ │ │ ├─gun_guns_militia_firearms_amendment │ │ │ │ ├─■──blacks_penalty_death_cruel_punishment ── Topic: 55 │ │ │ │ └─■──gun_guns_militia_firearms_amendment ── Topic: 7 │ │ │ └─armenian_armenians_israel_jews_turkish │ │ │ ├─■──israel_israeli_jews_arab_jewish ── Topic: 4 │ │ │ └─■──armenian_armenians_turkish_armenia_azerbaijan ── Topic: 15 │ │ └─stephanopoulos_president_mr_myers_ms │ │ ├─■──serbs_muslims_stephanopoulos_mr_bosnia ── Topic: 35 │ │ └─■──myers_stephanopoulos_president_ms_mr ── Topic: 87 │ └─batf_fbi_koresh_compound_gas │ ├─■──reno_workers_janet_clinton_waco ── Topic: 77 │ └─batf_fbi_koresh_gas_compound │ ├─batf_koresh_fbi_warrant_compound │ │ ├─■──batf_warrant_raid_compound_fbi ── Topic: 42 │ │ └─■──koresh_batf_fbi_children_compound ── Topic: 61 │ └─■──fbi_gas_tear_bds_building ── Topic: 23 └─use_like_just_dont_new ├─game_team_year_games_like │ ├─game_team_games_25_year │ │ ├─game_team_games_25_season │ │ │ ├─window_printer_use_problem_mhz │ │ │ │ ├─mhz_wire_simms_wiring_battery │ │ │ │ │ ├─simms_mhz_battery_cpu_heat │ │ │ │ │ │ ├─simms_pds_simm_vram_lc │ │ │ │ │ │ │ ├─■──pds_nubus_lc_slot_card ── Topic: 119 │ │ │ │ │ │ │ └─■──simms_simm_vram_meg_dram ── Topic: 32 │ │ │ │ │ │ └─mhz_battery_cpu_heat_speed │ │ │ │ │ │ ├─mhz_cpu_speed_heat_fan │ │ │ │ │ │ │ ├─mhz_cpu_speed_heat_fan │ │ │ │ │ │ │ │ ├─■──fan_cpu_heat_sink_fans ── Topic: 92 │ │ │ │ │ │ │ │ └─■──mhz_speed_cpu_fpu_clock ── Topic: 22 │ │ │ │ │ │ │ └─■──monitor_turn_power_computer_electricity ── Topic: 91 │ │ │ │ │ │ └─battery_batteries_concrete_duo_discharge │ │ │ │ │ │ ├─■──duo_battery_apple_230_problem ── Topic: 121 │ │ │ │ │ │ └─■──battery_batteries_concrete_discharge_temperature ── Topic: 75 │ │ │ │ │ └─wire_wiring_ground_neutral_outlets │ │ │ │ │ ├─wire_wiring_ground_neutral_outlets │ │ │ │ │ │ 
├─wire_wiring_ground_neutral_outlets │ │ │ │ │ │ │ ├─■──leds_uv_blue_light_boards ── Topic: 66 │ │ │ │ │ │ │ └─■──wire_wiring_ground_neutral_outlets ── Topic: 120 │ │ │ │ │ │ └─scope_scopes_phone_dial_number │ │ │ │ │ │ ├─■──dial_number_phone_line_output ── Topic: 93 │ │ │ │ │ │ └─■──scope_scopes_motorola_generator_oscilloscope ── Topic: 113 │ │ │ │ │ └─celp_dsp_sampling_antenna_digital │ │ │ │ │ ├─■──antenna_antennas_receiver_cable_transmitter ── Topic: 70 │ │ │ │ │ └─■──celp_dsp_sampling_speech_voice ── Topic: 52 │ │ │ │ └─window_printer_xv_mouse_windows │ │ │ │ ├─window_xv_error_widget_problem │ │ │ │ │ ├─error_symbol_undefined_xterm_rx │ │ │ │ │ │ ├─■──symbol_error_undefined_doug_parse ── Topic: 63 │ │ │ │ │ │ └─■──rx_remote_server_xdm_xterm ── Topic: 45 │ │ │ │ │ └─window_xv_widget_application_expose │ │ │ │ │ ├─window_widget_expose_application_event │ │ │ │ │ │ ├─■──gc_mydisplay_draw_gxxor_drawing ── Topic: 103 │ │ │ │ │ │ └─■──window_widget_application_expose_event ── Topic: 25 │ │ │ │ │ └─xv_den_polygon_points_algorithm │ │ │ │ │ ├─■──den_polygon_points_algorithm_polygons ── Topic: 28 │ │ │ │ │ └─■──xv_24bit_image_bit_images ── Topic: 57 │ │ │ │ └─printer_fonts_print_mouse_postscript │ │ │ │ ├─printer_fonts_print_font_deskjet │ │ │ │ │ ├─■──scanner_logitech_grayscale_ocr_scanman ── Topic: 108 │ │ │ │ │ └─printer_fonts_print_font_deskjet │ │ │ │ │ ├─■──printer_print_deskjet_hp_ink ── Topic: 18 │ │ │ │ │ └─■──fonts_font_truetype_tt_atm ── Topic: 49 │ │ │ │ └─mouse_ghostscript_midi_driver_postscript │ │ │ │ ├─ghostscript_midi_postscript_files_file │ │ │ │ │ ├─■──ghostscript_postscript_pageview_ghostview_dsc ── Topic: 104 │ │ │ │ │ └─midi_sound_file_windows_driver │ │ │ │ │ ├─■──location_mar_file_host_rwrr ── Topic: 83 │ │ │ │ │ └─■──midi_sound_driver_blaster_soundblaster ── Topic: 98 │ │ │ │ └─■──mouse_driver_mice_ball_problem ── Topic: 68 │ │ │ └─game_team_games_25_season │ │ │ ├─1st_sale_condition_comics_hulk │ │ │ │ ├─sale_condition_offer_asking_cd │ │ │ │ 
│ ├─condition_stereo_amp_speakers_asking │ │ │ │ │ │ ├─■──miles_car_amfm_toyota_cassette ── Topic: 62 │ │ │ │ │ │ └─■──amp_speakers_condition_stereo_audio ── Topic: 24 │ │ │ │ │ └─games_sale_pom_cds_shipping │ │ │ │ │ ├─pom_cds_sale_shipping_cd │ │ │ │ │ │ ├─■──size_shipping_sale_condition_mattress ── Topic: 100 │ │ │ │ │ │ └─■──pom_cds_cd_sale_picture ── Topic: 37 │ │ │ │ │ └─■──games_game_snes_sega_genesis ── Topic: 40 │ │ │ │ └─1st_hulk_comics_art_appears │ │ │ │ ├─1st_hulk_comics_art_appears │ │ │ │ │ ├─lens_tape_camera_backup_lenses │ │ │ │ │ │ ├─■──tape_backup_tapes_drive_4mm ── Topic: 107 │ │ │ │ │ │ └─■──lens_camera_lenses_zoom_pouch ── Topic: 114 │ │ │ │ │ └─1st_hulk_comics_art_appears │ │ │ │ │ ├─■──1st_hulk_comics_art_appears ── Topic: 105 │ │ │ │ │ └─■──books_book_cover_trek_chemistry ── Topic: 125 │ │ │ │ └─tickets_hotel_ticket_voucher_package │ │ │ │ ├─■──hotel_voucher_package_vacation_room ── Topic: 74 │ │ │ │ └─■──tickets_ticket_june_airlines_july ── Topic: 84 │ │ │ └─game_team_games_season_hockey │ │ │ ├─game_hockey_team_25_550 │ │ │ │ ├─■──espn_pt_pts_game_la ── Topic: 17 │ │ │ │ └─■──team_25_game_hockey_550 ── Topic: 2 │ │ │ └─■──year_game_hit_baseball_players ── Topic: 0 │ │ └─bike_car_greek_insurance_msg │ │ ├─car_bike_insurance_cars_engine │ │ │ ├─car_insurance_cars_radar_engine │ │ │ │ ├─insurance_health_private_care_canada │ │ │ │ │ ├─■──insurance_health_private_care_canada ── Topic: 99 │ │ │ │ │ └─■──insurance_car_accident_rates_sue ── Topic: 82 │ │ │ │ └─car_cars_radar_engine_detector │ │ │ │ ├─car_radar_cars_detector_engine │ │ │ │ │ ├─■──radar_detector_detectors_ka_alarm ── Topic: 39 │ │ │ │ │ └─car_cars_mustang_ford_engine │ │ │ │ │ ├─■──clutch_shift_shifting_transmission_gear ── Topic: 88 │ │ │ │ │ └─■──car_cars_mustang_ford_v8 ── Topic: 14 │ │ │ │ └─oil_diesel_odometer_diesels_car │ │ │ │ ├─odometer_oil_sensor_car_drain │ │ │ │ │ ├─■──odometer_sensor_speedo_gauge_mileage ── Topic: 96 │ │ │ │ │ └─■──oil_drain_car_leaks_taillights ── 
Topic: 102 │ │ │ │ └─■──diesel_diesels_emissions_fuel_oil ── Topic: 79 │ │ │ └─bike_riding_ride_bikes_motorcycle │ │ │ ├─bike_ride_riding_bikes_lane │ │ │ │ ├─■──bike_ride_riding_lane_car ── Topic: 11 │ │ │ │ └─■──bike_bikes_miles_honda_motorcycle ── Topic: 19 │ │ │ └─■──countersteering_bike_motorcycle_rear_shaft ── Topic: 46 │ │ └─greek_msg_kuwait_greece_water │ │ ├─greek_msg_kuwait_greece_water │ │ │ ├─greek_msg_kuwait_greece_dog │ │ │ │ ├─greek_msg_kuwait_greece_dog │ │ │ │ │ ├─greek_kuwait_greece_turkish_greeks │ │ │ │ │ │ ├─■──greek_greece_turkish_greeks_cyprus ── Topic: 71 │ │ │ │ │ │ └─■──kuwait_iraq_iran_gulf_arabia ── Topic: 76 │ │ │ │ │ └─msg_dog_drugs_drug_food │ │ │ │ │ ├─dog_dogs_cooper_trial_weaver │ │ │ │ │ │ ├─■──clinton_bush_quayle_reagan_panicking ── Topic: 101 │ │ │ │ │ │ └─dog_dogs_cooper_trial_weaver │ │ │ │ │ │ ├─■──cooper_trial_weaver_spence_witnesses ── Topic: 90 │ │ │ │ │ │ └─■──dog_dogs_bike_trained_springer ── Topic: 67 │ │ │ │ │ └─msg_drugs_drug_food_chinese │ │ │ │ │ ├─■──msg_food_chinese_foods_taste ── Topic: 30 │ │ │ │ │ └─■──drugs_drug_marijuana_cocaine_alcohol ── Topic: 72 │ │ │ │ └─water_theory_universe_science_larsons │ │ │ │ ├─water_nuclear_cooling_steam_dept │ │ │ │ │ ├─■──rocketry_rockets_engines_nuclear_plutonium ── Topic: 115 │ │ │ │ │ └─water_cooling_steam_dept_plants │ │ │ │ │ ├─■──water_dept_phd_environmental_atmospheric ── Topic: 97 │ │ │ │ │ └─■──cooling_water_steam_towers_plants ── Topic: 109 │ │ │ │ └─theory_universe_larsons_larson_science │ │ │ │ ├─■──theory_universe_larsons_larson_science ── Topic: 54 │ │ │ │ └─■──oort_cloud_grbs_gamma_burst ── Topic: 80 │ │ │ └─helmet_kirlian_photography_lock_wax │ │ │ ├─helmet_kirlian_photography_leaf_mask │ │ │ │ ├─kirlian_photography_leaf_pictures_deleted │ │ │ │ │ ├─deleted_joke_stuff_maddi_nickname │ │ │ │ │ │ ├─■──joke_maddi_nickname_nicknames_frank ── Topic: 43 │ │ │ │ │ │ └─■──deleted_stuff_bookstore_joke_motto ── Topic: 81 │ │ │ │ │ 
└─■──kirlian_photography_leaf_pictures_aura ── Topic: 85 │ │ │ │ └─helmet_mask_liner_foam_cb │ │ │ │ ├─■──helmet_liner_foam_cb_helmets ── Topic: 112 │ │ │ │ └─■──mask_goalies_77_santore_tl ── Topic: 123 │ │ │ └─lock_wax_paint_plastic_ear │ │ │ ├─■──lock_cable_locks_bike_600 ── Topic: 117 │ │ │ └─wax_paint_ear_plastic_skin │ │ │ ├─■──wax_paint_plastic_scratches_solvent ── Topic: 65 │ │ │ └─■──ear_wax_skin_greasy_acne ── Topic: 116 │ │ └─m4_mp_14_mw_mo │ │ ├─m4_mp_14_mw_mo │ │ │ ├─■──m4_mp_14_mw_mo ── Topic: 111 │ │ │ └─■──test_ensign_nameless_deane_deanebinahccbrandeisedu ── Topic: 118 │ │ └─■──ites_cheek_hello_hi_ken ── Topic: 3 │ └─space_medical_health_disease_cancer │ ├─medical_health_disease_cancer_patients │ │ ├─■──cancer_centers_center_medical_research ── Topic: 122 │ │ └─health_medical_disease_patients_hiv │ │ ├─patients_medical_disease_candida_health │ │ │ ├─■──candida_yeast_infection_gonorrhea_infections ── Topic: 48 │ │ │ └─patients_disease_cancer_medical_doctor │ │ │ ├─■──hiv_medical_cancer_patients_doctor ── Topic: 34 │ │ │ └─■──pain_drug_patients_disease_diet ── Topic: 26 │ │ └─■──health_newsgroup_tobacco_vote_votes ── Topic: 9 │ └─space_launch_nasa_shuttle_orbit │ ├─space_moon_station_nasa_launch │ │ ├─■──sky_advertising_billboard_billboards_space ── Topic: 59 │ │ └─■──space_station_moon_redesign_nasa ── Topic: 16 │ └─space_mission_hst_launch_orbit │ ├─space_launch_nasa_orbit_propulsion │ │ ├─■──space_launch_nasa_propulsion_astronaut ── Topic: 47 │ │ └─■──orbit_km_jupiter_probe_earth ── Topic: 86 │ └─■──hst_mission_shuttle_orbit_arrays ── Topic: 60 └─drive_file_key_windows_use ├─key_file_jpeg_encryption_image │ ├─key_encryption_clipper_chip_keys │ │ ├─■──key_clipper_encryption_chip_keys ── Topic: 1 │ │ └─■──entry_file_ripem_entries_key ── Topic: 73 │ └─jpeg_image_file_gif_images │ ├─motif_graphics_ftp_available_3d │ │ ├─motif_graphics_openwindows_ftp_available │ │ │ ├─■──openwindows_motif_xview_windows_mouse ── Topic: 20 │ │ │ 
└─■──graphics_widget_ray_3d_available ── Topic: 95 │ │ └─■──3d_machines_version_comments_contact ── Topic: 38 │ └─jpeg_image_gif_images_format │ ├─■──gopher_ftp_files_stuffit_images ── Topic: 51 │ └─■──jpeg_image_gif_format_images ── Topic: 13 └─drive_db_card_scsi_windows ├─db_windows_dos_mov_os2 │ ├─■──copy_protection_program_software_disk ── Topic: 64 │ └─■──db_windows_dos_mov_os2 ── Topic: 8 └─drive_card_scsi_drives_ide ├─drive_scsi_drives_ide_disk │ ├─■──drive_scsi_drives_ide_disk ── Topic: 6 │ └─■──meg_sale_ram_drive_shipping ── Topic: 12 └─card_modem_monitor_video_drivers ├─■──card_monitor_video_drivers_vga ── Topic: 5 └─■──modem_port_serial_irq_com ── Topic: 10 ```
## **Visualize Hierarchical Documents** We can extend the previous method by calculating the topic representation at different levels of the hierarchy and plotting them on a 2D plane. To do so, we first need to calculate the hierarchical topics: ```python from sklearn.datasets import fetch_20newsgroups from sentence_transformers import SentenceTransformer from bertopic import BERTopic from umap import UMAP # Prepare embeddings docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))['data'] sentence_model = SentenceTransformer("all-MiniLM-L6-v2") embeddings = sentence_model.encode(docs, show_progress_bar=False) # Train BERTopic and extract hierarchical topics topic_model = BERTopic().fit(docs, embeddings) hierarchical_topics = topic_model.hierarchical_topics(docs) ``` Then, we can visualize the hierarchical documents by either supplying it with our embeddings or by reducing their dimensionality ourselves: ```python # Run the visualization with the original embeddings topic_model.visualize_hierarchical_documents(docs, hierarchical_topics, embeddings=embeddings) # Reduce dimensionality of embeddings, this step is optional but much faster to perform iteratively: reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings) topic_model.visualize_hierarchical_documents(docs, hierarchical_topics, reduced_embeddings=reduced_embeddings) ``` !!! note The visualization above was generated with the additional parameter `hide_document_hover=True` which disables the option to hover over the individual points and see the content of the documents. This makes the resulting visualization smaller so that it fits into your RAM. However, it might be interesting to set `hide_document_hover=False` to hover over the points and see the content of the documents. ## **Visualize Terms** We can visualize the selected terms for a few topics by creating bar charts out of the c-TF-IDF scores for each topic representation. 
Insights can be gained from the relative c-TF-IDF scores between and within topics. Moreover, you can easily compare topic representations to each other. To visualize these bar charts, run the following: ```python topic_model.visualize_barchart() ``` ## **Visualize Topic Similarity** Having generated topic embeddings, through both c-TF-IDF and embeddings, we can create a similarity matrix by simply applying cosine similarities through those topic embeddings. The result will be a matrix indicating how similar certain topics are to each other. To visualize the heatmap, run the following: ```python topic_model.visualize_heatmap() ``` !!! note You can set `n_clusters` in `visualize_heatmap` to order the topics by their similarity. This will result in blocks being formed in the heatmap indicating which clusters of topics are similar to each other. This step is very much recommended as it will make reading the heatmap easier. ## **Visualize Term Score Decline** Topics are represented by a number of words starting with the best representative word. Each word is represented by a c-TF-IDF score. The higher the score, the more representative a word is of the topic. Since the topic words are sorted by their c-TF-IDF score, the scores slowly decline with each word that is added. At some point adding words to the topic representation only marginally increases the total c-TF-IDF score and would not be beneficial for its representation. To visualize this effect, we can plot the c-TF-IDF scores for each topic by the term rank of each word. In other words, the position of the words (term rank), where the words with the highest c-TF-IDF score will have a rank of 1, will be put on the x-axis. Whereas the y-axis will be populated by the c-TF-IDF scores. The result is a visualization that shows you the decline of c-TF-IDF score when adding words to the topic representation. It allows you, using the elbow method, to select the best number of words in a topic. 
To visualize the c-TF-IDF score decline, run the following: ```python topic_model.visualize_term_rank() ``` To enable the log scale on the y-axis for a better view of individual topics, run the following: ```python topic_model.visualize_term_rank(log_scale=True) ``` This visualization was heavily inspired by the "Term Probability Decline" visualization found in an analysis by the amazing [tmtoolkit](https://tmtoolkit.readthedocs.io/). Reference to that specific analysis can be found [here](https://wzbsocialsciencecenter.github.io/tm_corona/tm_analysis.html). ## **Visualize Topics over Time** After creating topics over time with Dynamic Topic Modeling, we can visualize these topics by leveraging the interactive abilities of Plotly. Plotly allows us to show the frequency of topics over time whilst giving the option of hovering over the points to show the time-specific topic representations. Simply call `.visualize_topics_over_time` with the newly created topics over time: ```python import re import pandas as pd from bertopic import BERTopic # Prepare data trump = pd.read_csv('https://drive.google.com/uc?export=download&id=1xRKHaP-QwACMydlDnyFPEaFdtskJuBa6') trump.text = trump.apply(lambda row: re.sub(r"http\S+", "", row.text).lower(), 1) trump.text = trump.apply(lambda row: " ".join(filter(lambda x:x[0]!="@", row.text.split())), 1) trump.text = trump.apply(lambda row: " ".join(re.sub("[^a-zA-Z]+", " ", row.text).split()), 1) trump = trump.loc[(trump.isRetweet == "f") & (trump.text != ""), :] timestamps = trump.date.to_list() tweets = trump.text.to_list() # Create topics over time model = BERTopic(verbose=True) topics, probs = model.fit_transform(tweets) topics_over_time = model.topics_over_time(tweets, timestamps) ``` Then, we visualize some interesting topics: ```python model.visualize_topics_over_time(topics_over_time, topics=[9, 10, 72, 83, 87, 91]) ``` ## **Visualize Topics per Class** You might want to extract and visualize the topic representation per class. 
For example, if you have specific groups of users that might approach topics differently, then extracting them would help in understanding how these users talk about certain topics. In other words, this is simply creating a topic representation for certain classes that you might have in your data. First, we need to train our model: ```python from bertopic import BERTopic from sklearn.datasets import fetch_20newsgroups # Prepare data and classes data = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes')) docs = data["data"] classes = [data["target_names"][i] for i in data["target"]] # Create topic model and calculate topics per class topic_model = BERTopic() topics, probs = topic_model.fit_transform(docs) topics_per_class = topic_model.topics_per_class(docs, classes=classes) ``` Then, we visualize the topic representation of major topics per class: ```python topic_model.visualize_topics_per_class(topics_per_class) ``` ## **Visualize Probabilities or Distribution** We can generate the topic-document probability matrix by simply setting `calculate_probabilities=True` if an HDBSCAN model is used: ```python from bertopic import BERTopic topic_model = BERTopic(calculate_probabilities=True) topics, probs = topic_model.fit_transform(docs) ``` The resulting `probs` variable contains the soft-clustering as done through HDBSCAN. 
If a non-HDBSCAN model is used, we can estimate the topic distributions after training our model: ```python from bertopic import BERTopic topic_model = BERTopic() topics, _ = topic_model.fit_transform(docs) topic_distr, _ = topic_model.approximate_distribution(docs, min_similarity=0) ``` Then, we either pass the `probs` or `topic_distr` variable to `.visualize_distribution` to visualize either the probability distributions or the topic distributions: ```python # To visualize the probabilities of topic assignment topic_model.visualize_distribution(probs[0]) # To visualize the topic distributions in a document topic_model.visualize_distribution(topic_distr[0]) ``` Although a topic distribution is nice, we may want to see how each token contributes to a specific topic. To do so, we need to first calculate topic distributions on a token level and then visualize the results: ```python # Calculate the topic distributions on a token-level topic_distr, topic_token_distr = topic_model.approximate_distribution(docs, calculate_tokens=True) # Visualize the token-level distributions df = topic_model.visualize_approximate_distribution(docs[1], topic_token_distr[1]) df ```



!!! note To get the stylized dataframe for `.visualize_approximate_distribution` you will need to have Jinja installed. If you do not have this installed, an unstylized dataframe will be returned instead. You can install Jinja via `pip install jinja2`. !!! note The distribution of the probabilities does not give an indication of the distribution of the frequencies of topics across a document. It merely shows how confident BERTopic is that certain topics can be found in a document. ================================================ FILE: docs/getting_started/visualization/visualize_documents.md ================================================ ## **Visualize documents with Plotly** Using `.visualize_topics`, we can visualize the topics and get insight into their relationships. However, you might want a more fine-grained approach where we can visualize the documents inside the topics to see if they were assigned correctly or whether they make sense. To do so, we can use the `topic_model.visualize_documents()` function. This function recalculates the document embeddings and reduces them to 2-dimensional space for easier visualization purposes. 
This process can be quite expensive, so it is advised to adhere to the following pipeline: ```python from sklearn.datasets import fetch_20newsgroups from sentence_transformers import SentenceTransformer from bertopic import BERTopic from umap import UMAP # Prepare embeddings docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))['data'] sentence_model = SentenceTransformer("all-MiniLM-L6-v2") embeddings = sentence_model.encode(docs, show_progress_bar=False) # Train BERTopic topic_model = BERTopic().fit(docs, embeddings) # Run the visualization with the original embeddings topic_model.visualize_documents(docs, embeddings=embeddings) # Reduce dimensionality of embeddings, this step is optional but much faster to perform iteratively: reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings) topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings) ``` !!! note The visualization above was generated with the additional parameter `hide_document_hover=True` which disables the option to hover over the individual points and see the content of the documents. This was done for demonstration purposes as saving all those documents in the visualization can be quite expensive and result in large files. However, it might be interesting to set `hide_document_hover=False` in order to hover over the points and see the content of the documents. ### **Custom Hover** When you visualize the documents, you might not always want to see the complete document on hover. Many documents have shorter information that might be more interesting to visualize, such as its title. 
To create the hover based on a document's title instead of its content, you can simply pass a variable (`titles`) containing the title for each document: ```python topic_model.visualize_documents(titles, reduced_embeddings=reduced_embeddings) ``` ## **Visualize documents with DataMapPlot** `.visualize_document_datamap` provides an alternative way to visualize the documents inside the topics as a static [DataMapPlot](https://datamapplot.readthedocs.io/en/latest/intro_splash.html). Using the same pipeline as above, you can generate a DataMapPlot by running: ```python # with the original embeddings topic_model.visualize_document_datamap(docs, embeddings=embeddings) # with the reduced embeddings topic_model.visualize_document_datamap(docs, reduced_embeddings=reduced_embeddings) ```



Or if you want to save the resulting figure: ```python fig = topic_model.visualize_document_datamap(docs, reduced_embeddings=reduced_embeddings) fig.savefig("path/to/file.png", bbox_inches="tight") ``` ### Interactive DataMapPlot DataMapPlot has the amazing ability to also generate interactive plots. These plots generate HTML files that allow you to zoom in on the generated topics and explore the data. Usage is straightforward; simply set `interactive=True`: ```python fig = topic_model.visualize_document_datamap(docs, reduced_embeddings=reduced_embeddings, interactive=True) ``` ## **Visualize Probabilities or Distribution** We can generate the topic-document probability matrix by simply setting `calculate_probabilities=True` if an HDBSCAN model is used: ```python from bertopic import BERTopic topic_model = BERTopic(calculate_probabilities=True) topics, probs = topic_model.fit_transform(docs) ``` The resulting `probs` variable contains the soft-clustering as done through HDBSCAN. If a non-HDBSCAN model is used, we can estimate the topic distributions after training our model: ```python from bertopic import BERTopic topic_model = BERTopic() topics, _ = topic_model.fit_transform(docs) topic_distr, _ = topic_model.approximate_distribution(docs, min_similarity=0) ``` Then, we either pass the `probs` or `topic_distr` variable to `.visualize_distribution` to visualize either the probability distributions or the topic distributions: ```python # To visualize the probabilities of topic assignment topic_model.visualize_distribution(probs[0]) # To visualize the topic distributions in a document topic_model.visualize_distribution(topic_distr[0]) ``` Although a topic distribution is nice, we may want to see how each token contributes to a specific topic. 
To do so, we need to first calculate topic distributions on a token level and then visualize the results: ```python # Calculate the topic distributions on a token-level topic_distr, topic_token_distr = topic_model.approximate_distribution(docs, calculate_tokens=True) # Visualize the token-level distributions df = topic_model.visualize_approximate_distribution(docs[1], topic_token_distr[1]) df ```



!!! note To get the stylized dataframe for `.visualize_approximate_distribution` you will need to have Jinja installed. If you do not have this installed, an unstylized dataframe will be returned instead. You can install Jinja via `pip install jinja2`. !!! note The distribution of the probabilities does not give an indication of the distribution of the frequencies of topics across a document. It merely shows how confident BERTopic is that certain topics can be found in a document. ================================================ FILE: docs/getting_started/visualization/visualize_hierarchy.md ================================================ The topics that you create can be hierarchically reduced. In order to understand the potential hierarchical structure of the topics, we can use `scipy.cluster.hierarchy` to create clusters and visualize how they relate to one another. This might help to select an appropriate `nr_topics` when reducing the number of topics that you have created. To visualize this hierarchy, run the following: ```python topic_model.visualize_hierarchy() ``` !!! note Do note that this is not the actual procedure of `.reduce_topics()` when `nr_topics` is set to auto since HDBSCAN is used to automatically extract topics. The visualization above closely resembles the actual procedure of `.reduce_topics()` when any number of `nr_topics` is selected. ### **Hierarchical labels** Although visualizing this hierarchy gives us information about the structure, it would be helpful to see what happens to the topic representations when merging topics. 
To do so, we first need to calculate the representations of the hierarchical topics: First, we train a basic BERTopic model: ```python from bertopic import BERTopic from sklearn.datasets import fetch_20newsgroups docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))["data"] topic_model = BERTopic(verbose=True) topics, probs = topic_model.fit_transform(docs) hierarchical_topics = topic_model.hierarchical_topics(docs) ``` To visualize these results, we simply need to pass the resulting `hierarchical_topics` to our `.visualize_hierarchy` function: ```python topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics) ``` If you **hover** over the black circles, you will see the topic representation at that level of the hierarchy. These representations help you understand the effect of merging certain topics. Some might be logical to merge whilst others might not. Moreover, we can now see which sub-topics can be found within certain larger themes. ### **Text-based topic tree** Although this gives a nice overview of the potential hierarchy, hovering over all black circles can be tiresome. Instead, we can use `topic_model.get_topic_tree` to create a text-based representation of this hierarchy. Although the general structure is more difficult to view, we can see better which topics could be logically merged: ```python >>> tree = topic_model.get_topic_tree(hierarchical_topics) >>> print(tree) . └─atheists_atheism_god_moral_atheist ├─atheists_atheism_god_atheist_argument │ ├─■──atheists_atheism_god_atheist_argument ── Topic: 21 │ └─■──br_god_exist_genetic_existence ── Topic: 124 └─■──moral_morality_objective_immoral_morals ── Topic: 29 ```
Click here to view the full tree. ```bash . ├─people_armenian_said_god_armenians │ ├─god_jesus_jehovah_lord_christ │ │ ├─god_jesus_jehovah_lord_christ │ │ │ ├─jehovah_lord_mormon_mcconkie_god │ │ │ │ ├─■──ra_satan_thou_god_lucifer ── Topic: 94 │ │ │ │ └─■──jehovah_lord_mormon_mcconkie_unto ── Topic: 78 │ │ │ └─jesus_mary_god_hell_sin │ │ │ ├─jesus_hell_god_eternal_heaven │ │ │ │ ├─hell_jesus_eternal_god_heaven │ │ │ │ │ ├─■──jesus_tomb_disciples_resurrection_john ── Topic: 69 │ │ │ │ │ └─■──hell_eternal_god_jesus_heaven ── Topic: 53 │ │ │ │ └─■──aaron_baptism_sin_law_god ── Topic: 89 │ │ │ └─■──mary_sin_maria_priest_conception ── Topic: 56 │ │ └─■──marriage_married_marry_ceremony_marriages ── Topic: 110 │ └─people_armenian_armenians_said_mr │ ├─people_armenian_armenians_said_israel │ │ ├─god_homosexual_homosexuality_atheists_sex │ │ │ ├─homosexual_homosexuality_sex_gay_homosexuals │ │ │ │ ├─■──kinsey_sex_gay_men_sexual ── Topic: 44 │ │ │ │ └─homosexuality_homosexual_sin_homosexuals_gay │ │ │ │ ├─■──gay_homosexual_homosexuals_sexual_cramer ── Topic: 50 │ │ │ │ └─■──homosexuality_homosexual_sin_paul_sex ── Topic: 27 │ │ │ └─god_atheists_atheism_moral_atheist │ │ │ ├─islam_quran_judas_islamic_book │ │ │ │ ├─■──jim_context_challenges_articles_quote ── Topic: 36 │ │ │ │ └─islam_quran_judas_islamic_book │ │ │ │ ├─■──islam_quran_islamic_rushdie_muslims ── Topic: 31 │ │ │ │ └─■──judas_scripture_bible_books_greek ── Topic: 33 │ │ │ └─atheists_atheism_god_moral_atheist │ │ │ ├─atheists_atheism_god_atheist_argument │ │ │ │ ├─■──atheists_atheism_god_atheist_argument ── Topic: 21 │ │ │ │ └─■──br_god_exist_genetic_existence ── Topic: 124 │ │ │ └─■──moral_morality_objective_immoral_morals ── Topic: 29 │ │ └─armenian_armenians_people_israel_said │ │ ├─armenian_armenians_israel_people_jews │ │ │ ├─tax_rights_government_income_taxes │ │ │ │ ├─■──rights_right_slavery_slaves_residence ── Topic: 106 │ │ │ │ └─tax_government_taxes_income_libertarians │ │ │ │ 
├─■──government_libertarians_libertarian_regulation_party ── Topic: 58 │ │ │ │ └─■──tax_taxes_income_billion_deficit ── Topic: 41 │ │ │ └─armenian_armenians_israel_people_jews │ │ │ ├─gun_guns_militia_firearms_amendment │ │ │ │ ├─■──blacks_penalty_death_cruel_punishment ── Topic: 55 │ │ │ │ └─■──gun_guns_militia_firearms_amendment ── Topic: 7 │ │ │ └─armenian_armenians_israel_jews_turkish │ │ │ ├─■──israel_israeli_jews_arab_jewish ── Topic: 4 │ │ │ └─■──armenian_armenians_turkish_armenia_azerbaijan ── Topic: 15 │ │ └─stephanopoulos_president_mr_myers_ms │ │ ├─■──serbs_muslims_stephanopoulos_mr_bosnia ── Topic: 35 │ │ └─■──myers_stephanopoulos_president_ms_mr ── Topic: 87 │ └─batf_fbi_koresh_compound_gas │ ├─■──reno_workers_janet_clinton_waco ── Topic: 77 │ └─batf_fbi_koresh_gas_compound │ ├─batf_koresh_fbi_warrant_compound │ │ ├─■──batf_warrant_raid_compound_fbi ── Topic: 42 │ │ └─■──koresh_batf_fbi_children_compound ── Topic: 61 │ └─■──fbi_gas_tear_bds_building ── Topic: 23 └─use_like_just_dont_new ├─game_team_year_games_like │ ├─game_team_games_25_year │ │ ├─game_team_games_25_season │ │ │ ├─window_printer_use_problem_mhz │ │ │ │ ├─mhz_wire_simms_wiring_battery │ │ │ │ │ ├─simms_mhz_battery_cpu_heat │ │ │ │ │ │ ├─simms_pds_simm_vram_lc │ │ │ │ │ │ │ ├─■──pds_nubus_lc_slot_card ── Topic: 119 │ │ │ │ │ │ │ └─■──simms_simm_vram_meg_dram ── Topic: 32 │ │ │ │ │ │ └─mhz_battery_cpu_heat_speed │ │ │ │ │ │ ├─mhz_cpu_speed_heat_fan │ │ │ │ │ │ │ ├─mhz_cpu_speed_heat_fan │ │ │ │ │ │ │ │ ├─■──fan_cpu_heat_sink_fans ── Topic: 92 │ │ │ │ │ │ │ │ └─■──mhz_speed_cpu_fpu_clock ── Topic: 22 │ │ │ │ │ │ │ └─■──monitor_turn_power_computer_electricity ── Topic: 91 │ │ │ │ │ │ └─battery_batteries_concrete_duo_discharge │ │ │ │ │ │ ├─■──duo_battery_apple_230_problem ── Topic: 121 │ │ │ │ │ │ └─■──battery_batteries_concrete_discharge_temperature ── Topic: 75 │ │ │ │ │ └─wire_wiring_ground_neutral_outlets │ │ │ │ │ ├─wire_wiring_ground_neutral_outlets │ │ │ │ │ │ 
├─wire_wiring_ground_neutral_outlets │ │ │ │ │ │ │ ├─■──leds_uv_blue_light_boards ── Topic: 66 │ │ │ │ │ │ │ └─■──wire_wiring_ground_neutral_outlets ── Topic: 120 │ │ │ │ │ │ └─scope_scopes_phone_dial_number │ │ │ │ │ │ ├─■──dial_number_phone_line_output ── Topic: 93 │ │ │ │ │ │ └─■──scope_scopes_motorola_generator_oscilloscope ── Topic: 113 │ │ │ │ │ └─celp_dsp_sampling_antenna_digital │ │ │ │ │ ├─■──antenna_antennas_receiver_cable_transmitter ── Topic: 70 │ │ │ │ │ └─■──celp_dsp_sampling_speech_voice ── Topic: 52 │ │ │ │ └─window_printer_xv_mouse_windows │ │ │ │ ├─window_xv_error_widget_problem │ │ │ │ │ ├─error_symbol_undefined_xterm_rx │ │ │ │ │ │ ├─■──symbol_error_undefined_doug_parse ── Topic: 63 │ │ │ │ │ │ └─■──rx_remote_server_xdm_xterm ── Topic: 45 │ │ │ │ │ └─window_xv_widget_application_expose │ │ │ │ │ ├─window_widget_expose_application_event │ │ │ │ │ │ ├─■──gc_mydisplay_draw_gxxor_drawing ── Topic: 103 │ │ │ │ │ │ └─■──window_widget_application_expose_event ── Topic: 25 │ │ │ │ │ └─xv_den_polygon_points_algorithm │ │ │ │ │ ├─■──den_polygon_points_algorithm_polygons ── Topic: 28 │ │ │ │ │ └─■──xv_24bit_image_bit_images ── Topic: 57 │ │ │ │ └─printer_fonts_print_mouse_postscript │ │ │ │ ├─printer_fonts_print_font_deskjet │ │ │ │ │ ├─■──scanner_logitech_grayscale_ocr_scanman ── Topic: 108 │ │ │ │ │ └─printer_fonts_print_font_deskjet │ │ │ │ │ ├─■──printer_print_deskjet_hp_ink ── Topic: 18 │ │ │ │ │ └─■──fonts_font_truetype_tt_atm ── Topic: 49 │ │ │ │ └─mouse_ghostscript_midi_driver_postscript │ │ │ │ ├─ghostscript_midi_postscript_files_file │ │ │ │ │ ├─■──ghostscript_postscript_pageview_ghostview_dsc ── Topic: 104 │ │ │ │ │ └─midi_sound_file_windows_driver │ │ │ │ │ ├─■──location_mar_file_host_rwrr ── Topic: 83 │ │ │ │ │ └─■──midi_sound_driver_blaster_soundblaster ── Topic: 98 │ │ │ │ └─■──mouse_driver_mice_ball_problem ── Topic: 68 │ │ │ └─game_team_games_25_season │ │ │ ├─1st_sale_condition_comics_hulk │ │ │ │ ├─sale_condition_offer_asking_cd │ │ │ │ 
│ ├─condition_stereo_amp_speakers_asking │ │ │ │ │ │ ├─■──miles_car_amfm_toyota_cassette ── Topic: 62 │ │ │ │ │ │ └─■──amp_speakers_condition_stereo_audio ── Topic: 24 │ │ │ │ │ └─games_sale_pom_cds_shipping │ │ │ │ │ ├─pom_cds_sale_shipping_cd │ │ │ │ │ │ ├─■──size_shipping_sale_condition_mattress ── Topic: 100 │ │ │ │ │ │ └─■──pom_cds_cd_sale_picture ── Topic: 37 │ │ │ │ │ └─■──games_game_snes_sega_genesis ── Topic: 40 │ │ │ │ └─1st_hulk_comics_art_appears │ │ │ │ ├─1st_hulk_comics_art_appears │ │ │ │ │ ├─lens_tape_camera_backup_lenses │ │ │ │ │ │ ├─■──tape_backup_tapes_drive_4mm ── Topic: 107 │ │ │ │ │ │ └─■──lens_camera_lenses_zoom_pouch ── Topic: 114 │ │ │ │ │ └─1st_hulk_comics_art_appears │ │ │ │ │ ├─■──1st_hulk_comics_art_appears ── Topic: 105 │ │ │ │ │ └─■──books_book_cover_trek_chemistry ── Topic: 125 │ │ │ │ └─tickets_hotel_ticket_voucher_package │ │ │ │ ├─■──hotel_voucher_package_vacation_room ── Topic: 74 │ │ │ │ └─■──tickets_ticket_june_airlines_july ── Topic: 84 │ │ │ └─game_team_games_season_hockey │ │ │ ├─game_hockey_team_25_550 │ │ │ │ ├─■──espn_pt_pts_game_la ── Topic: 17 │ │ │ │ └─■──team_25_game_hockey_550 ── Topic: 2 │ │ │ └─■──year_game_hit_baseball_players ── Topic: 0 │ │ └─bike_car_greek_insurance_msg │ │ ├─car_bike_insurance_cars_engine │ │ │ ├─car_insurance_cars_radar_engine │ │ │ │ ├─insurance_health_private_care_canada │ │ │ │ │ ├─■──insurance_health_private_care_canada ── Topic: 99 │ │ │ │ │ └─■──insurance_car_accident_rates_sue ── Topic: 82 │ │ │ │ └─car_cars_radar_engine_detector │ │ │ │ ├─car_radar_cars_detector_engine │ │ │ │ │ ├─■──radar_detector_detectors_ka_alarm ── Topic: 39 │ │ │ │ │ └─car_cars_mustang_ford_engine │ │ │ │ │ ├─■──clutch_shift_shifting_transmission_gear ── Topic: 88 │ │ │ │ │ └─■──car_cars_mustang_ford_v8 ── Topic: 14 │ │ │ │ └─oil_diesel_odometer_diesels_car │ │ │ │ ├─odometer_oil_sensor_car_drain │ │ │ │ │ ├─■──odometer_sensor_speedo_gauge_mileage ── Topic: 96 │ │ │ │ │ └─■──oil_drain_car_leaks_taillights ── 
Topic: 102 │ │ │ │ └─■──diesel_diesels_emissions_fuel_oil ── Topic: 79 │ │ │ └─bike_riding_ride_bikes_motorcycle │ │ │ ├─bike_ride_riding_bikes_lane │ │ │ │ ├─■──bike_ride_riding_lane_car ── Topic: 11 │ │ │ │ └─■──bike_bikes_miles_honda_motorcycle ── Topic: 19 │ │ │ └─■──countersteering_bike_motorcycle_rear_shaft ── Topic: 46 │ │ └─greek_msg_kuwait_greece_water │ │ ├─greek_msg_kuwait_greece_water │ │ │ ├─greek_msg_kuwait_greece_dog │ │ │ │ ├─greek_msg_kuwait_greece_dog │ │ │ │ │ ├─greek_kuwait_greece_turkish_greeks │ │ │ │ │ │ ├─■──greek_greece_turkish_greeks_cyprus ── Topic: 71 │ │ │ │ │ │ └─■──kuwait_iraq_iran_gulf_arabia ── Topic: 76 │ │ │ │ │ └─msg_dog_drugs_drug_food │ │ │ │ │ ├─dog_dogs_cooper_trial_weaver │ │ │ │ │ │ ├─■──clinton_bush_quayle_reagan_panicking ── Topic: 101 │ │ │ │ │ │ └─dog_dogs_cooper_trial_weaver │ │ │ │ │ │ ├─■──cooper_trial_weaver_spence_witnesses ── Topic: 90 │ │ │ │ │ │ └─■──dog_dogs_bike_trained_springer ── Topic: 67 │ │ │ │ │ └─msg_drugs_drug_food_chinese │ │ │ │ │ ├─■──msg_food_chinese_foods_taste ── Topic: 30 │ │ │ │ │ └─■──drugs_drug_marijuana_cocaine_alcohol ── Topic: 72 │ │ │ │ └─water_theory_universe_science_larsons │ │ │ │ ├─water_nuclear_cooling_steam_dept │ │ │ │ │ ├─■──rocketry_rockets_engines_nuclear_plutonium ── Topic: 115 │ │ │ │ │ └─water_cooling_steam_dept_plants │ │ │ │ │ ├─■──water_dept_phd_environmental_atmospheric ── Topic: 97 │ │ │ │ │ └─■──cooling_water_steam_towers_plants ── Topic: 109 │ │ │ │ └─theory_universe_larsons_larson_science │ │ │ │ ├─■──theory_universe_larsons_larson_science ── Topic: 54 │ │ │ │ └─■──oort_cloud_grbs_gamma_burst ── Topic: 80 │ │ │ └─helmet_kirlian_photography_lock_wax │ │ │ ├─helmet_kirlian_photography_leaf_mask │ │ │ │ ├─kirlian_photography_leaf_pictures_deleted │ │ │ │ │ ├─deleted_joke_stuff_maddi_nickname │ │ │ │ │ │ ├─■──joke_maddi_nickname_nicknames_frank ── Topic: 43 │ │ │ │ │ │ └─■──deleted_stuff_bookstore_joke_motto ── Topic: 81 │ │ │ │ │ 
└─■──kirlian_photography_leaf_pictures_aura ── Topic: 85 │ │ │ │ └─helmet_mask_liner_foam_cb │ │ │ │ ├─■──helmet_liner_foam_cb_helmets ── Topic: 112 │ │ │ │ └─■──mask_goalies_77_santore_tl ── Topic: 123 │ │ │ └─lock_wax_paint_plastic_ear │ │ │ ├─■──lock_cable_locks_bike_600 ── Topic: 117 │ │ │ └─wax_paint_ear_plastic_skin │ │ │ ├─■──wax_paint_plastic_scratches_solvent ── Topic: 65 │ │ │ └─■──ear_wax_skin_greasy_acne ── Topic: 116 │ │ └─m4_mp_14_mw_mo │ │ ├─m4_mp_14_mw_mo │ │ │ ├─■──m4_mp_14_mw_mo ── Topic: 111 │ │ │ └─■──test_ensign_nameless_deane_deanebinahccbrandeisedu ── Topic: 118 │ │ └─■──ites_cheek_hello_hi_ken ── Topic: 3 │ └─space_medical_health_disease_cancer │ ├─medical_health_disease_cancer_patients │ │ ├─■──cancer_centers_center_medical_research ── Topic: 122 │ │ └─health_medical_disease_patients_hiv │ │ ├─patients_medical_disease_candida_health │ │ │ ├─■──candida_yeast_infection_gonorrhea_infections ── Topic: 48 │ │ │ └─patients_disease_cancer_medical_doctor │ │ │ ├─■──hiv_medical_cancer_patients_doctor ── Topic: 34 │ │ │ └─■──pain_drug_patients_disease_diet ── Topic: 26 │ │ └─■──health_newsgroup_tobacco_vote_votes ── Topic: 9 │ └─space_launch_nasa_shuttle_orbit │ ├─space_moon_station_nasa_launch │ │ ├─■──sky_advertising_billboard_billboards_space ── Topic: 59 │ │ └─■──space_station_moon_redesign_nasa ── Topic: 16 │ └─space_mission_hst_launch_orbit │ ├─space_launch_nasa_orbit_propulsion │ │ ├─■──space_launch_nasa_propulsion_astronaut ── Topic: 47 │ │ └─■──orbit_km_jupiter_probe_earth ── Topic: 86 │ └─■──hst_mission_shuttle_orbit_arrays ── Topic: 60 └─drive_file_key_windows_use ├─key_file_jpeg_encryption_image │ ├─key_encryption_clipper_chip_keys │ │ ├─■──key_clipper_encryption_chip_keys ── Topic: 1 │ │ └─■──entry_file_ripem_entries_key ── Topic: 73 │ └─jpeg_image_file_gif_images │ ├─motif_graphics_ftp_available_3d │ │ ├─motif_graphics_openwindows_ftp_available │ │ │ ├─■──openwindows_motif_xview_windows_mouse ── Topic: 20 │ │ │ 
└─■──graphics_widget_ray_3d_available ── Topic: 95 │ │ └─■──3d_machines_version_comments_contact ── Topic: 38 │ └─jpeg_image_gif_images_format │ ├─■──gopher_ftp_files_stuffit_images ── Topic: 51 │ └─■──jpeg_image_gif_format_images ── Topic: 13 └─drive_db_card_scsi_windows ├─db_windows_dos_mov_os2 │ ├─■──copy_protection_program_software_disk ── Topic: 64 │ └─■──db_windows_dos_mov_os2 ── Topic: 8 └─drive_card_scsi_drives_ide ├─drive_scsi_drives_ide_disk │ ├─■──drive_scsi_drives_ide_disk ── Topic: 6 │ └─■──meg_sale_ram_drive_shipping ── Topic: 12 └─card_modem_monitor_video_drivers ├─■──card_monitor_video_drivers_vga ── Topic: 5 └─■──modem_port_serial_irq_com ── Topic: 10 ```
## **Visualize Hierarchical Documents** We can extend the previous method by calculating the topic representation at different levels of the hierarchy and plotting them on a 2D plane. To do so, we first need to calculate the hierarchical topics: ```python from sklearn.datasets import fetch_20newsgroups from sentence_transformers import SentenceTransformer from bertopic import BERTopic from umap import UMAP # Prepare embeddings docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))['data'] sentence_model = SentenceTransformer("all-MiniLM-L6-v2") embeddings = sentence_model.encode(docs, show_progress_bar=False) # Train BERTopic and extract hierarchical topics topic_model = BERTopic().fit(docs, embeddings) hierarchical_topics = topic_model.hierarchical_topics(docs) ``` Then, we can visualize the hierarchical documents by either supplying it with our embeddings or by reducing their dimensionality ourselves: ```python # Run the visualization with the original embeddings topic_model.visualize_hierarchical_documents(docs, hierarchical_topics, embeddings=embeddings) # Reduce dimensionality of embeddings, this step is optional but much faster to perform iteratively: reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings) topic_model.visualize_hierarchical_documents(docs, hierarchical_topics, reduced_embeddings=reduced_embeddings) ``` !!! note The visualization above was generated with the additional parameter `hide_document_hover=True` which disables the option to hover over the individual points and see the content of the documents. This makes the resulting visualization smaller and fit into your RAM. However, it might be interesting to set `hide_document_hover=False` to hover over the points and see the content of the documents. 
================================================ FILE: docs/getting_started/visualization/visualize_terms.md ================================================ We can visualize the selected terms for a few topics by creating bar charts out of the c-TF-IDF scores for each topic representation. Insights can be gained from the relative c-TF-IDF scores between and within topics. Moreover, you can easily compare topic representations to each other. To visualize these bar charts, run the following: ```python topic_model.visualize_barchart() ``` ## **Visualize Term Score Decline** Topics are represented by a number of words starting with the best representative word. Each word is represented by a c-TF-IDF score. The higher the score, the more representative the word is of the topic. Since the topic words are sorted by their c-TF-IDF score, the scores slowly decline with each word that is added. At some point adding words to the topic representation only marginally increases the total c-TF-IDF score and would not be beneficial for its representation. To visualize this effect, we can plot the c-TF-IDF scores for each topic by the term rank of each word. In other words, the position of the words (term rank), where the words with the highest c-TF-IDF score will have a rank of 1, will be put on the x-axis. Whereas the y-axis will be populated by the c-TF-IDF scores. The result is a visualization that shows you the decline of c-TF-IDF score when adding words to the topic representation. It allows you, using the elbow method, to select the best number of words in a topic. 
To visualize the c-TF-IDF score decline, run the following: ```python topic_model.visualize_term_rank() ``` To enable the log scale on the y-axis for a better view of individual topics, run the following: ```python topic_model.visualize_term_rank(log_scale=True) ``` This visualization was heavily inspired by the "Term Probability Decline" visualization found in an analysis by the amazing [tmtoolkit](https://tmtoolkit.readthedocs.io/). Reference to that specific analysis can be found [here](https://wzbsocialsciencecenter.github.io/tm_corona/tm_analysis.html). ================================================ FILE: docs/getting_started/visualization/visualize_topics.md ================================================ Visualizing BERTopic and its derivatives is important in understanding the model, how it works, and more importantly, where it works. Since topic modeling can be quite a subjective field it is difficult for users to validate their models. Looking at the topics and seeing if they make sense is an important factor in alleviating this issue. ## **Visualize Topics** After having trained our `BERTopic` model, we can iteratively go through hundreds of topics to get a good understanding of the topics that were extracted. However, that takes quite some time and lacks a global representation. Instead, we can visualize the topics that were generated in a way very similar to [LDAvis](https://github.com/cpsievert/LDAvis). We embed our c-TF-IDF representation of the topics in 2D using Umap and then visualize the two dimensions using plotly such that we can create an interactive view. First, we need to train our model: ```python from bertopic import BERTopic from sklearn.datasets import fetch_20newsgroups docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))['data'] topic_model = BERTopic() topics, probs = topic_model.fit_transform(docs) ``` Then, we can call `.visualize_topics` to create a 2D representation of your topics. 
The resulting graph is a plotly interactive graph which can be converted to HTML: ```python topic_model.visualize_topics() ``` You can use the slider to select the topic which then lights up red. If you hover over a topic, then general information is given about the topic, including the size of the topic and its corresponding words. ## **Visualize Topic Similarity** Having generated topic embeddings, through both c-TF-IDF and embeddings, we can create a similarity matrix by simply applying cosine similarities through those topic embeddings. The result will be a matrix indicating how similar certain topics are to each other. To visualize the heatmap, run the following: ```python topic_model.visualize_heatmap() ``` !!! note You can set `n_clusters` in `visualize_heatmap` to order the topics by their similarity. This will result in blocks being formed in the heatmap indicating which clusters of topics are similar to each other. This step is very much recommended as it will make reading the heatmap easier. ## **Visualize Topics over Time** After creating topics over time with Dynamic Topic Modeling, we can visualize these topics by leveraging the interactive abilities of Plotly. Plotly allows us to show the frequency of topics over time whilst giving the option of hovering over the points to show the time-specific topic representations. 
Simply call `.visualize_topics_over_time` with the newly created topics over time: ```python import re import pandas as pd from bertopic import BERTopic # Prepare data trump = pd.read_csv('https://drive.google.com/uc?export=download&id=1xRKHaP-QwACMydlDnyFPEaFdtskJuBa6') trump.text = trump.apply(lambda row: re.sub(r"http\S+", "", row.text).lower(), 1) trump.text = trump.apply(lambda row: " ".join(filter(lambda x:x[0]!="@", row.text.split())), 1) trump.text = trump.apply(lambda row: " ".join(re.sub("[^a-zA-Z]+", " ", row.text).split()), 1) trump = trump.loc[(trump.isRetweet == "f") & (trump.text != ""), :] timestamps = trump.date.to_list() tweets = trump.text.to_list() # Create topics over time model = BERTopic(verbose=True) topics, probs = model.fit_transform(tweets) topics_over_time = model.topics_over_time(tweets, timestamps) ``` Then, we visualize some interesting topics: ```python model.visualize_topics_over_time(topics_over_time, topics=[9, 10, 72, 83, 87, 91]) ``` ## **Visualize Topics per Class** You might want to extract and visualize the topic representation per class. For example, if you have specific groups of users that might approach topics differently, then extracting them would help understanding how these users talk about certain topics. In other words, this is simply creating a topic representation for certain classes that you might have in your data. 
First, we need to train our model: ```python from bertopic import BERTopic from sklearn.datasets import fetch_20newsgroups # Prepare data and classes data = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes')) docs = data["data"] classes = [data["target_names"][i] for i in data["target"]] # Create topic model and calculate topics per class topic_model = BERTopic() topics, probs = topic_model.fit_transform(docs) topics_per_class = topic_model.topics_per_class(docs, classes=classes) ``` Then, we visualize the topic representation of major topics per class: ```python topic_model.visualize_topics_per_class(topics_per_class) ``` ================================================ FILE: docs/getting_started/visualization/viz.html ================================================
================================================ FILE: docs/getting_started/zeroshot/zeroshot.md ================================================ Zero-shot Topic Modeling is a technique that allows you to find predefined topics in large amounts of documents. When faced with many documents, you often have an idea of which topics will definitely be in there, whether that is a result of simply knowing your data or of a domain expert being involved in defining those topics. This method allows you to not only find those specific topics but also create new topics for documents that would not fit with your predefined topics. This allows for extensive flexibility as there are three scenarios to explore: * First, both zero-shot topics and clustered topics were detected. This means that some documents would fit with the predefined topics where others would not. For the latter, new topics were found. * Second, only zero-shot topics were detected. Here, we would not need to find additional topics since all original documents were assigned to one of the predefined topics. * Third, no zero-shot topics were detected. This means that none of the documents would fit with the predefined topics and a regular BERTopic would be run.
--8<-- "docs/getting_started/zeroshot/zeroshot.svg"
This method works as follows. First, we create a number of labels for our predefined topics and embed them using any embedding model. Then, we compare the embeddings of the documents with the predefined labels using cosine similarity. If the similarity passes a user-defined threshold, the zero-shot topic is assigned to a document. If it does not, then that document, along with others, will follow the regular BERTopic pipeline and attempt to find clusters that do not fit with the zero-shot topics. ### **Example** In order to use zero-shot BERTopic, we create a list of topics that we want to assign to our documents. However, there may be several other topics that we know should be in the documents. The dataset that we use is a small subset of ArXiv papers. We know the data and believe there to be at least the following topics: *clustering*, *topic modeling*, and *large language models*. However, we are not sure whether other topics exist and want to explore those. Zero-shot BERTopic needs two parameters: * `zeroshot_topic_list` - The names of the topics to assign documents to. Making sure this is as descriptive as possible helps improve the assignment since they are based on cosine similarities between embeddings. * `zeroshot_min_similarity` - The minimum cosine similarity needed to match a document to a zero-shot topic. It is a value between 0 and 1. Using this feature is straightforward: ```python from datasets import load_dataset from bertopic import BERTopic from bertopic.representation import KeyBERTInspired # We select a subsample of 5000 abstracts from ArXiv dataset = load_dataset("CShorten/ML-ArXiv-Papers")["train"] docs = dataset["abstract"][:5_000] # We define a number of topics that we know are in the documents zeroshot_topic_list = ["Clustering", "Topic Modeling", "Large Language Models"] # We fit our model using the zero-shot topics # and we define a minimum similarity. 
For each document, # if the similarity does not exceed that value, it will be used # for clustering instead. topic_model = BERTopic( embedding_model="thenlper/gte-small", min_topic_size=15, zeroshot_topic_list=zeroshot_topic_list, zeroshot_min_similarity=.85, representation_model=KeyBERTInspired() ) topics, _ = topic_model.fit_transform(docs) ``` When we run `topic_model.get_topic_info()` you will see something like this:
The `zeroshot_min_similarity` parameter controls how many of the documents are assigned to the predefined zero-shot topics. Lower this value and you will have more documents assigned to zero-shot topics and fewer documents will be clustered. Increase this value and you will have fewer documents assigned to zero-shot topics and more documents will be clustered. !!! Note Setting the `zeroshot_min_similarity` parameter requires a bit of experimentation. Some embedding models have different similarity distributions, so trying out the values manually and exploring the results is highly advised. !!! Tip Because zero-shot topic modeling is essentially merging two different topic models, the `probs` will be empty initially. If you want to have the probabilities of topics across documents, you can run `topic_model.transform` on your documents to extract the updated `probs`. ================================================ FILE: docs/img/probabilities.html ================================================
================================================ FILE: docs/index.md ================================================ --- hide: - navigation --- # BERTopic BERTopic is a topic modeling technique that leverages 🤗 transformers and c-TF-IDF to create dense clusters allowing for easily interpretable topics whilst keeping important words in the topic descriptions. BERTopic supports all kinds of topic modeling techniques:
Guided Supervised Semi-supervised
Manual Multi-topic distributions Hierarchical
Class-based Dynamic Online/Incremental
Multimodal Multi-aspect Text Generation/LLM
Zero-shot (new!) Merge Models (new!) Seed Words (new!)
Corresponding medium posts can be found [here](https://medium.com/data-science/topic-modeling-with-bert-779f7db187e6?sk=0b5a470c006d1842ad4c8a3057063a99 ), [here](https://medium.com/data-science/using-whisper-and-bertopic-to-model-kurzgesagts-videos-7d8a63139bdf?sk=b1e0fd46f70cb15e8422b4794a81161d ) and [here](https://medium.com/data-science/interactive-topic-modeling-with-bertopic-1ea55e7d73d8?sk=03c2168e9e74b6bda2a1f3ed953427e4 ). For a more detailed overview, you can read the [paper](https://arxiv.org/abs/2203.05794) or see a [brief overview](https://maartengr.github.io/BERTopic/algorithm/algorithm.html). ## **Installation** Installation, with sentence-transformers, can be done using [pypi](https://pypi.org/project/bertopic/): ```bash pip install bertopic ``` You may want to install more depending on the transformers and language backends that you will be using. The possible installations are: ```bash # Choose an embedding backend pip install bertopic[flair, gensim, spacy, use] # Topic modeling with images pip install bertopic[vision] ``` ## **Quick Start** We start by extracting topics from the well-known 20 newsgroups dataset containing English documents: ```python from bertopic import BERTopic from sklearn.datasets import fetch_20newsgroups docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))['data'] topic_model = BERTopic() topics, probs = topic_model.fit_transform(docs) ``` After generating topics and their probabilities, we can access the frequent topics that were generated: ```python >>> topic_model.get_topic_info() Topic Count Name -1 4630 -1_can_your_will_any 0 693 49_windows_drive_dos_file 1 466 32_jesus_bible_christian_faith 2 441 2_space_launch_orbit_lunar 3 381 22_key_encryption_keys_encrypted ``` -1 refers to all outliers and should typically be ignored. 
Next, let's take a look at the most frequent topic that was generated, topic 0: ```python >>> topic_model.get_topic(0) [('windows', 0.006152228076250982), ('drive', 0.004982897610645755), ('dos', 0.004845038866360651), ('file', 0.004140142872194834), ('disk', 0.004131678774810884), ('mac', 0.003624848635985097), ('memory', 0.0034840976976789903), ('software', 0.0034415334250699077), ('email', 0.0034239554442333257), ('pc', 0.003047105930670237)] ``` Using `.get_document_info`, we can also extract information on a document level, such as their corresponding topics, probabilities, whether they are representative documents for a topic, etc.: ```python >>> topic_model.get_document_info(docs) Document Topic Name Top_n_words Probability ... I am sure some bashers of Pens... 0 0_game_team_games_season game - team - games... 0.200010 ... My brother is in the market for... -1 -1_can_your_will_any can - your - will... 0.420668 ... Finally you said what you dream... -1 -1_can_your_will_any can - your - will... 0.807259 ... Think! It is the SCSI card doing... 49 49_windows_drive_dos_file windows - drive - docs... 0.071746 ... 1) I have an old Jasmine drive... 49 49_windows_drive_dos_file windows - drive - docs... 0.038983 ... ``` !!! tip "Multilingual" Use `BERTopic(language="multilingual")` to select a model that supports 50+ languages. ## **Fine-tune Topic Representations** In BERTopic, there are a number of different [topic representations](https://maartengr.github.io/BERTopic/getting_started/representation/representation.html) that we can choose from. They are all quite different from one another and give interesting perspectives and variations of topic representations. 
A great start is `KeyBERTInspired`, which for many users increases the coherence and reduces stopwords from the resulting topic representations: ```python from bertopic.representation import KeyBERTInspired # Fine-tune your topic representations representation_model = KeyBERTInspired() topic_model = BERTopic(representation_model=representation_model) ``` However, you might want to use something more powerful to describe your clusters. You can even use ChatGPT or other models from OpenAI to generate labels, summaries, phrases, keywords, and more: ```python import openai from bertopic.representation import OpenAI # Fine-tune topic representations with GPT client = openai.OpenAI(api_key="sk-...") representation_model = OpenAI(client, model="gpt-4o-mini", chat=True) topic_model = BERTopic(representation_model=representation_model) ``` !!! tip "Multi-aspect Topic Modeling" Instead of iterating over all of these different topic representations, you can model them simultaneously with [multi-aspect topic representations](https://maartengr.github.io/BERTopic/getting_started/multiaspect/multiaspect.html) in BERTopic. ## **Modularity** By default, the main steps for topic modeling with BERTopic are sentence-transformers, UMAP, HDBSCAN, and c-TF-IDF run in sequence. However, it assumes some independence between these steps which makes BERTopic quite modular. In other words, BERTopic not only allows you to build your own topic model but to explore several topic modeling techniques on top of your customized topic model: You can swap out any of these models or even remove them entirely. The following steps are completely modular: 1. [Embedding](https://maartengr.github.io/BERTopic/getting_started/embeddings/embeddings.html) documents 2. [Reducing dimensionality](https://maartengr.github.io/BERTopic/getting_started/dim_reduction/dim_reduction.html) of embeddings 3. 
[Clustering](https://maartengr.github.io/BERTopic/getting_started/clustering/clustering.html) reduced embeddings into topics 4. [Tokenization](https://maartengr.github.io/BERTopic/getting_started/vectorizers/vectorizers.html) of topics 5. [Weight](https://maartengr.github.io/BERTopic/getting_started/ctfidf/ctfidf.html) tokens 6. [Represent topics](https://maartengr.github.io/BERTopic/getting_started/representation/representation.html) with one or [multiple](https://maartengr.github.io/BERTopic/getting_started/multiaspect/multiaspect.html) representations You can find more about the underlying algorithm and assumptions [here](https://maartengr.github.io/BERTopic/algorithm/algorithm.html). ## **Overview** BERTopic has many functions that quickly can become overwhelming. To alleviate this issue, you will find an overview of all methods and a short description of its purpose. ### Common Below, you will find an overview of common functions in BERTopic. | Method | Code | |-----------------------|---| | Fit the model | `.fit(docs)` | | Fit the model and predict documents | `.fit_transform(docs)` | | Predict new documents | `.transform([new_doc])` | | Access single topic | `.get_topic(topic=12)` | | Access all topics | `.get_topics()` | | Get topic freq | `.get_topic_freq()` | | Get all topic information| `.get_topic_info()` | | Get all document information| `.get_document_info(docs)` | | Get representative docs per topic | `.get_representative_docs()` | | Update topic representation | `.update_topics(docs, n_gram_range=(1, 3))` | | Generate topic labels | `.generate_topic_labels()` | | Set topic labels | `.set_topic_labels(my_custom_labels)` | | Merge topics | `.merge_topics(docs, topics_to_merge)` | | Reduce nr of topics | `.reduce_topics(docs, nr_topics=30)` | | Reduce outliers | `.reduce_outliers(docs, topics)` | | Find topics | `.find_topics("vehicle")` | | Save model | `.save("my_model", serialization="safetensors")` | | Load model | `BERTopic.load("my_model")` | | Get 
parameters | `.get_params()` | ### Attributes After having trained your BERTopic model, several attributes are saved within your model. These attributes, in part, refer to how model information is stored on an estimator during fitting. The attributes that you see below all end in `_` and are public attributes that can be used to access model information. | Attribute | Description | |------------------------|---------------------------------------------------------------------------------------------| | `.topics_` | The topics that are generated for each document after training or updating the topic model. | | `.probabilities_` | The probabilities that are generated for each document if HDBSCAN is used. | | `.topic_sizes_` | The size of each topic | | `.topic_mapper_` | A class for tracking topics and their mappings anytime they are merged/reduced. | | `.topic_representations_` | The top *n* terms per topic and their respective c-TF-IDF values. | | `.c_tf_idf_` | The topic-term matrix as calculated through c-TF-IDF. | | `.topic_aspects_` | The different aspects, or representations, of each topic. | | `.topic_labels_` | The default labels for each topic. | | `.custom_labels_` | Custom labels for each topic as generated through `.set_topic_labels`. | | `.topic_embeddings_` | The embeddings for each topic if `embedding_model` was used. | | `.representative_docs_` | The representative documents for each topic if HDBSCAN is used. | ### Variations There are many different use cases in which topic modeling can be used. As such, several variations of BERTopic have been developed such that one package can be used across many use cases.
| Method | Code | |-----------------------|---| | [Topic Distribution Approximation](https://maartengr.github.io/BERTopic/getting_started/distribution/distribution.html) | `.approximate_distribution(docs)` | | [Online Topic Modeling](https://maartengr.github.io/BERTopic/getting_started/online/online.html) | `.partial_fit(doc)` | | [Semi-supervised Topic Modeling](https://maartengr.github.io/BERTopic/getting_started/semisupervised/semisupervised.html) | `.fit(docs, y=y)` | | [Supervised Topic Modeling](https://maartengr.github.io/BERTopic/getting_started/supervised/supervised.html) | `.fit(docs, y=y)` | | [Manual Topic Modeling](https://maartengr.github.io/BERTopic/getting_started/manual/manual.html) | `.fit(docs, y=y)` | | [Multimodal Topic Modeling](https://maartengr.github.io/BERTopic/getting_started/multimodal/multimodal.html) | ``.fit(docs, images=images)`` | | [Topic Modeling per Class](https://maartengr.github.io/BERTopic/getting_started/topicsperclass/topicsperclass.html) | `.topics_per_class(docs, classes)` | | [Dynamic Topic Modeling](https://maartengr.github.io/BERTopic/getting_started/topicsovertime/topicsovertime.html) | `.topics_over_time(docs, timestamps)` | | [Hierarchical Topic Modeling](https://maartengr.github.io/BERTopic/getting_started/hierarchicaltopics/hierarchicaltopics.html) | `.hierarchical_topics(docs)` | | [Guided Topic Modeling](https://maartengr.github.io/BERTopic/getting_started/guided/guided.html) | `BERTopic(seed_topic_list=seed_topic_list)` | | [Zero-shot Topic Modeling](https://maartengr.github.io/BERTopic/getting_started/zeroshot/zeroshot.html) | `BERTopic(zeroshot_topic_list=zeroshot_topic_list)` | | [Merge Multiple Models](https://maartengr.github.io/BERTopic/getting_started/merge/merge.html) | `BERTopic.merge_models([topic_model_1, topic_model_2])` | ### Visualizations Evaluating topic models can be rather difficult due to the somewhat subjective nature of evaluation. 
Visualizing different aspects of the topic model helps in understanding the model and makes it easier to tweak the model to your liking. | Method | Code | |-----------------------|---| | Visualize Topics | `.visualize_topics()` | | Visualize Documents | `.visualize_documents()` | | Visualize Document with DataMapPlot | `.visualize_document_datamap()` | | Visualize Document Hierarchy | `.visualize_hierarchical_documents()` | | Visualize Topic Hierarchy | `.visualize_hierarchy()` | | Visualize Topic Tree | `.get_topic_tree(hierarchical_topics)` | | Visualize Topic Terms | `.visualize_barchart()` | | Visualize Topic Similarity | `.visualize_heatmap()` | | Visualize Term Score Decline | `.visualize_term_rank()` | | Visualize Topic Probability Distribution | `.visualize_distribution(probs[0])` | | Visualize Topics over Time | `.visualize_topics_over_time(topics_over_time)` | | Visualize Topics per Class | `.visualize_topics_per_class(topics_per_class)` | ## **Citation** To cite the [BERTopic paper](https://arxiv.org/abs/2203.05794), please use the following BibTeX reference: ```bibtex @article{grootendorst2022bertopic, title={BERTopic: Neural topic modeling with a class-based TF-IDF procedure}, author={Grootendorst, Maarten}, journal={arXiv preprint arXiv:2203.05794}, year={2022} } ``` ================================================ FILE: docs/stylesheets/extra.css ================================================ :root { --md-primary-fg-color: #234E70; } :root>* { --md-typeset-a-color: #016198; --md-text-link-color: #000000; } body[data-md-color-primary="black"] .svg_image svg { filter: invert(100%) hue-rotate(180deg); } body[data-md-color-primary="black"] .svg_image svg rect { fill: transparent; } .svg_image { text-align: center; width: 100%; } .center { display: block; margin: 0 auto; } ================================================ FILE: docs/usecases.md ================================================ --- hide: - navigation --- Over the last few years, BERTopic
has been used on a wide variety of use cases and domains, from cancer research and voice perception, to employee surveys and social media. This diversity allows for interesting use cases but it might quickly become overwhelming. This page is meant to demonstrate how, when, and why BERTopic is used in practice.

## **Examples** Below are a number of use cases that have been applied in practice. These use cases are collected from and written by data professionals. !!! note If you would like to add your use case, feel free to open up a PR! You only need to update [this file](https://github.com/MaartenGr/BERTopic/blob/master/docs/usecases.md) and add your example. You can just copy-paste one of the existing examples and adjust it to contain a description of your use case. #### App User Feedback !!! quote ""


"Analyzing user reviews from the App Store and Play Store helps us reveal valuable customer information, fix technical or usability issues, and help constantly improve customer experience. We utilize BERTopic for topic modeling and supervised classification of predefined categories."
•••
Tibor Fabian, Ph.D.
Lead/Master Data Scientist
Telefónica Germany

#### Employee Surveys !!! quote ""


"We are using BERTopic to support analysis of employee surveys. Here, we use BERTopic to compute the topics of discussion found in employee responses to open-ended survey questions. To further understand how employees feel about certain topics, we combined BERTopic with sentiment analysis to identify the sentiments associated with different topics and vice versa."
•••
Steve Quirolgico, Ph.D.
Principal Engineer

U.S. Department of Homeland Security

#### Voice Perception !!! quote ""


"A research project on voice perception to categorize what people describe when they make first impressions based on hearing people say, "Hi"." [preprint](https://psyarxiv.com/9ysqx/) | [code](https://osf.io/5t4b7/)
•••
David Feinberg
Associate Professor

McMaster University

#### Social Media !!! quote ""


"We use BERTopic to detect trending topics in social media. Our product (AIM Insights) is a social media monitoring tool, so detecting trending topics in social media helps our clients to capitalize on them for their campaigns.

We use BERTopic to group social media posts into clusters, sort them by engagement to detect the ones that are trending, and then use OpenAI's GPT-3 to generate a label for each of the top clusters based on the most relevant documents in it. This is all done on Arabic posts using an in-house sentence embeddings model."

•••
Ahmed Rashwan
AI lead

AIM Technologies

#### IT Service Management !!! quote ""




"In IT Service Management systems (e.g., Service Now) we receive Incidents, Problems, Change requests etc. We use BERTopic to categorize them into a group of topics/clusters to understand the distribution of the work requests over the period of time to plan and act accordingly for the future."
•••
Rajesh Thanaseelan
Data Science Consultant

DXC Technology

#### Colon Cancer !!! quote ""




"We use BERTopic to evaluate P53 in Ovarian cancer for Computational backgrounds researchers, who find it easier to relate Artificial Intelligence with advancing the transformer model and unstructured medical data. The paper explores the heterogeneity of keyBERT, BERTopic, PyCaret, and LDAs as key phrase generators and topic model extractors, with P53 in ovarian cancer as a use case."
•••
Mary Adewunmi
PhD Student in Colon Cancer and AI

UTAS

#### Telephone Help Line !!! quote ""


"We analyzed 100K+ phone call memos from a telephone help line. The Help Line is open to all people, regardless of religion, culture, and origin. It follows the principles of IFOTES (International Federation Of Telephone Emergency Services). The regional offices each offer independent counseling services via telephone or online.

The phone call memos are written by hundreds of independent volunteers and come in various shapes, lengths, forms, and wordings - additionally to have them in multiple languages. While using BERTopic we ran a few tests to figure out if the topic modeling works. Selecting only one language with ~60K data points and a mixed language model we achieved good results. It helped identify topics within the calls and therefore show the organization what reasons there are for people calling them. We identified in a workshop a few interesting topics, which they were not aware of, for example, religious topics.

The identification of existing and new, arising topics is crucial for the service quality of the organization. It furthermore helps detect trends over time, which can then be reported directly to Public Health institutions, which can then come up with campaigns to inform the public and help reduce certain psychological concerns. It acts as a representative psychological health barometer of the population."

•••
Kevin Kuhn
Chief Executive Officer

gopf

#### Regional Newspaper !!! quote ""



"Recently, we wanted to evaluate our overall section structure, especially our local news section. As you can imagine, local news is quite a big part of what we do in a regional newspaper. We used BERTopic on a year's worth of local news data to explore the topics in local news and define a new section structure. The results from this analysis helped to define the new section structure, which was implemented this month. "
•••
Thomas Husken
Data Scientist
Bergens Tidende

#### Intelligent Virtual Assistants !!! quote ""

"We have been using BERTopic as an early step in our exploratory analysis for intelligent virtual assistants. It helps us get a quick read on what some of the intents may be. The results help in the design discussions with customers."
•••
Stephen Drew
VP, AI and Automation Solutions

Five9

#### Electronic Health Records !!! quote ""




"Given physician-created documents from hospitals, find themes in the text as well as differentiate between "relevant" and "irrelevant" text, and disambiguate homonyms. "
•••
Alexis Raykhel
Senior NLP Engineer
Iodine Software

#### Teaching !!! quote ""

"BERTopic was used to determine a taxonomy of climate change risks discussed in financial news, and to compute firms' related exposure. It was used in the context of a course offering on Climate Risks modelling with NLP."
•••
Thomas Lorans
Senior Associate, Quantitative Analyst

#### Zero Hunger Lab !!! quote ""



"I am a PhD student at Tilburg University, at a lab called Zero Hunger Lab, where we try to use data science methods to improve food insecurity. One key issue is classifying and predicting food insecurity in food-insecure nations. The Integrated Food Security Phase Classification (IPC) system serves this purpose. The IPC categorizes food insecurity into five phases, ranging from minimal food insecurity to famine, and serves as a guide for directing humanitarian resources to the most affected regions.

The IPC system strives to be based on evidence, however, obtaining accurate information about food insecurity in remote regions can prove challenging. Despite the availability of weather data, data in the socio-economic domain, such as food prices and conflict, can be scarce or unreliable due to limited infrastructure and bureaucratic obstacles. These complications often result in infrequent releases of IPC classifications and projections, making it difficult to effectively respond to food insecurity in these areas.

One large source of daily-updated information is local news. Thus, one can build a model that classifies/predicts IPC by relying on news features obtained by NLP methods in addition to stuff like weather data. Previous research shows this is possible (see https://arxiv.org/pdf/2111.15602.pdf). The authors find words related to food insecurity using semantic frame parsing. After which, they count the occurrence of these words to create features. The features are put into a linear classifier. We wanted to apply more advanced methods and use local news sources (which we suppose contain more localized information). We used BERTopic on over a million articles scraped from Somali news websites. Because articles are both in English and Somali, we use a multilingual sentence encoder (LaBSE, which outperforms newer models in Somali). The results are quite nice. For example, topics most strongly correlated with known conflict casualty data are topics about terrorist attacks, car bombings, etc. And topics most negatively correlated with known conflict casualty data are about peace talks. We can also get an indication of food price development and forced migration. Most importantly, we can track the development of topics relating to food insecurity over time. While topic modelling cannot replace evidence-based food insecurity assessment, it can give a quick insight into a local situation when 'hard data' is lacking.

I applaud you on your success with BERTopic. The package is incredibly clean and easy to use, and the method works well with little parameter tuning. To me, the fact that you were able to deliver such a useful tool on your own is incredible, especially in the field of NLP, which is dominated by large organizations such as Google and Meta. "

•••
Cascha van Wanrooij
PhD Student
Tilburg University

## **Papers** BERTopic has also been adopted more and more in the academic field. Here are a few from all different kinds of research domains with interesting applications: * *Adewunmi, M., Sharma, S. K., Sharma, N., Sushma, N. S., & Mounmo, B.* (2022). [Cancer Health Disparities drivers with BERTopic modelling and PyCaret Evaluation](https://companyofscientists.com/index.php/chd/article/download/217/232). Cancer Health Disparities, 6. * *Ebeling, R., Sáenz, C. A. C., Nobre, J. C., & Becker, K.* (2022, May). [Analysis of the influence of political polarization in the vaccination stance: the Brazilian COVID-19 scenario](https://ojs.aaai.org/index.php/ICWSM/article/download/19281/19053). In Proceedings of the International AAAI Conference on Web and Social Media (Vol. 16, pp. 159-170). * *Hoseini, M., Melo, P., Benevenuto, F., Feldmann, A., & Zannettou, S.* (2021). [On the globalization of the QAnon conspiracy theory through Telegram.](https://arxiv.org/pdf/2105.13020) arXiv preprint arXiv:2105.13020. * *Falkenberg, M., Galeazzi, A., Torricelli, M., Di Marco, N., Larosa, F., Sas, M., ... & Baronchelli, A.* (2022). [Growing polarization around climate change on social media.](https://www.nature.com/articles/s41558-022-01527-x) Nature Climate Change, 1-8. * *Sánchez‐Franco, M. J., & Rey‐Moreno, M.* (2022). [Do travelers' reviews depend on the destination? An analysis in coastal and urban peer‐to‐peer lodgings.](https://onlinelibrary.wiley.com/doi/pdf/10.1002/mar.21608) Psychology & Marketing, 39(2), 441-459. * Zhunis, A., Lima, G., Song, H., Han, J., & Cha, M. (2022, April). [Emotion bubbles: Emotional composition of online discourse before and after the COVID-19 outbreak.](https://dl.acm.org/doi/pdf/10.1145/3485447.3512132) In Proceedings of the ACM Web Conference 2022 (pp. 2603-2613). * *Alhaj, F., Al-Haj, A., Sharieh, A., & Jabri, R.* (2022). 
[Improving Arabic cognitive distortion classification in Twitter using BERTopic.](https://oars.uos.ac.uk/2327/1/Paper_99-Improving_Arabic_Cognitive_Distortion_Classification_in_Twitter.pdf) International Journal of Advanced Computer Science and Applications, 13(1), 854-860. Click [here](https://scholar.google.nl/scholar?oi=bibs&hl=nl&cites=17397195143750028634,8180869421575866889,8997387995186833349,18341994629392979746,1103958012065447153,735156116265912963,513933600215578884) for a full overview of papers citing BERTopic. ================================================ FILE: mkdocs.yml ================================================ site_name: BERTopic repo_url: https://github.com/MaartenGr/BERTopic site_url: https://maartengr.github.io/BERTopic/ site_description: Leveraging BERT and a class-based TF-IDF to create easily interpretable topics. site_author: Maarten P. Grootendorst use_directory_urls: false extra_css: - stylesheets/extra.css nav: - Home: index.md - The Algorithm: algorithm/algorithm.md - Getting Started: - Quick Start: getting_started/quickstart/quickstart.md - Serialization: getting_started/serialization/serialization.md - Search Topics: getting_started/search/search.md - Best Practices: getting_started/best_practices/best_practices.md - In-depth: - Visualizations: - Topics: getting_started/visualization/visualize_topics.md - Documents: getting_started/visualization/visualize_documents.md - Terms: getting_started/visualization/visualize_terms.md - Hierarchy: getting_started/visualization/visualize_hierarchy.md - Update Topics: - Topic Reduction: getting_started/topicreduction/topicreduction.md - Update Topic Representations: getting_started/topicrepresentation/topicrepresentation.md - Outlier reduction: getting_started/outlier_reduction/outlier_reduction.md - Parameter tuning: getting_started/parameter tuning/parametertuning.md - Tips & Tricks: getting_started/tips_and_tricks/tips_and_tricks.md - Sub-models: - 1. 
Embeddings: getting_started/embeddings/embeddings.md - 2. Dimensionality Reduction: getting_started/dim_reduction/dim_reduction.md - 3. Clustering: getting_started/clustering/clustering.md - 4. Vectorizers: getting_started/vectorizers/vectorizers.md - 5. c-TF-IDF: getting_started/ctfidf/ctfidf.md - 6. Fine-tune Topics: - 6A. Representation Models: getting_started/representation/representation.md - 6B. LLM & Generative AI: getting_started/representation/llm.md - 6C. Multiple Representations: getting_started/multiaspect/multiaspect.md - Variations: - Dynamic Topic Modeling: getting_started/topicsovertime/topicsovertime.md - Hierarchical Topic Modeling: getting_started/hierarchicaltopics/hierarchicaltopics.md - Multimodal Topic Modeling: getting_started/multimodal/multimodal.md - Online Topic Modeling: getting_started/online/online.md - Merge Multiple Models: getting_started/merge/merge.md - (semi)-supervised: - Semi-supervised Topic Modeling: getting_started/semisupervised/semisupervised.md - Supervised Topic Modeling: getting_started/supervised/supervised.md - Manual Topic Modeling: getting_started/manual/manual.md - Guided Topic Modeling: getting_started/guided/guided.md - Zero-shot Topic Modeling: getting_started/zeroshot/zeroshot.md - Topic Distributions: getting_started/distribution/distribution.md - Topics per Class: getting_started/topicsperclass/topicsperclass.md - Seed Words: getting_started/seed_words/seed_words.md - FAQ: faq.md - Use Cases: usecases.md - API: - BERTopic: api/bertopic.md - Sub-models: - 1. Backends: api/backends.md - 2. Dimensionality Reduction: api/dimensionality.md - 3. Clustering: api/cluster.md - 4. Vectorizers: api/vectorizers.md - 5. c-TF-IDF: api/ctfidf.md - 6. 
Fine-Tune Topic Representation: api/representations.md - Plotting: - Barchart: api/plotting/barchart.md - Documents: api/plotting/documents.md - Documents with DataMapPlot: api/plotting/document_datamap.md - DTM: api/plotting/dtm.md - Hierarchical documents: api/plotting/hierarchical_documents.md - Hierarchical topics: api/plotting/hierarchy.md - Distribution: api/plotting/distribution.md - Heatmap: api/plotting/heatmap.md - Term Scores: api/plotting/term.md - Topics: api/plotting/topics.md - Topics per Class: api/plotting/topics_per_class.md - Changelog: changelog.md plugins: - mkdocstrings: watch: - bertopic - search copyright: Copyright © 2024 Maintained by Maarten Grootendorst. theme: custom_dir: images/ name: material icon: logo: material/library font: text: Ubuntu code: Ubuntu Mono favicon: icon.png logo: img/icon.png feature: tabs: true features: - navigation.tabs - navigation.sections - navigation.instant - navigation.top - navigation.tracking - toc.follow - content.code.copy palette: - media: "(prefers-color-scheme: light)" primary: custom scheme: black toggle: icon: material/weather-sunny name: Switch to dark mode - media: "(prefers-color-scheme: dark)" scheme: slate primary: black toggle: icon: material/weather-night name: Switch to light mode markdown_extensions: - admonition - md_in_html - pymdownx.details - pymdownx.highlight - pymdownx.superfences - pymdownx.snippets - toc: permalink: true ================================================ FILE: pyproject.toml ================================================ [build-system] requires = ["setuptools"] build-backend = "setuptools.build_meta" [project] name = "bertopic" version = "0.17.4" description = "BERTopic performs topic Modeling with state-of-the-art transformer models." readme = "README.md" license = {file = "LICENSE"} requires-python = ">=3.10" authors = [ { name = "Maarten P. 
Grootendorst", email = "maartengrootendorst@gmail.com" }, ] keywords = [ "nlp", "bert", "topic", "modeling", "embeddings", ] classifiers = [ "Intended Audience :: Developers", "Intended Audience :: Science/Research", "License :: OSI Approved :: MIT License", "Operating System :: MacOS", "Operating System :: Microsoft :: Windows", "Operating System :: POSIX", "Operating System :: Unix", "Programming Language :: Python", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.13", "Topic :: Scientific/Engineering", "Topic :: Scientific/Engineering :: Artificial Intelligence", ] dependencies = [ "hdbscan>=0.8.29", "umap-learn>=0.5.0", "numpy>=1.20.0", "pandas>=1.1.5", "plotly>=4.7.0", "scikit-learn>=1.0", "sentence-transformers>=0.4.1", "tqdm>=4.41.1", "llvmlite>0.36.0" # To prevent conflicts with `uv` dependency manager ] [project.optional-dependencies] datamap = [ "datamapplot>=0.1", "matplotlib>=3.8", ] dev = [ "bertopic[docs,test]", ] docs = [ "mkdocs==1.5.3", "mkdocs-material==9.5.18", "mkdocstrings-python==1.10.0", "mkdocstrings==0.24.3", ] fastembed = [ "fastembed>=0.6.0", ] flair = [ "flair>=0.7", "torch>=1.4.0", "transformers>=3.5.1", ] gensim = [ "gensim>=4.0.0", ] spacy = [ "spacy>=3.0.1", ] test = [ "pytest>=5.4.3", "pytest-cov>=2.6.1", "ruff~=0.14.0", ] use = [ "tensorflow", "tensorflow_hub", "tensorflow_text", ] vision = [ "accelerate>=0.19.0", # To prevent "cannot import name 'PartialState' from 'accelerate'" "Pillow>=9.2.0", ] [project.urls] Documentation = "https://maartengr.github.io/BERTopic/" Homepage = "https://github.com/MaartenGr/BERTopic" Issues = "https://github.com/MaartenGr/BERTopic/issues" Repository = "https://github.com/MaartenGr/BERTopic.git" [tool.setuptools.packages.find] include = ["bertopic*"] exclude = ["tests"] [tool.ruff] line-length = 120 [tool.ruff.lint] select = [ "E4", "E7", "E9", "F", # pyflakes "D", # pydocstyle "PD", # 
pandas-vet "RUF", # ruff ] ignore = [ "D100", # Missing docstring in public module "D104", # Missing docstring in public package "D205", # 1 blank line required between summary line and description "E731", # Do not assign a lambda expression, use a def ] [tool.ruff.lint.per-file-ignores] "**/tests/*" = ["D"] # Ignore all docstring errors in tests [tool.ruff.lint.pydocstyle] convention = "google" ================================================ FILE: tests/__init__.py ================================================ ================================================ FILE: tests/conftest.py ================================================ import copy import pytest from umap import UMAP from hdbscan import HDBSCAN from bertopic import BERTopic from sklearn.datasets import fetch_20newsgroups from sentence_transformers import SentenceTransformer from sklearn.cluster import KMeans, MiniBatchKMeans from sklearn.decomposition import PCA from bertopic.vectorizers import OnlineCountVectorizer from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance from bertopic.dimensionality import BaseDimensionalityReduction from sklearn.linear_model import LogisticRegression @pytest.fixture(scope="session") def embedding_model(): model = SentenceTransformer("all-MiniLM-L6-v2") return model @pytest.fixture(scope="session") def document_embeddings(documents, embedding_model): embeddings = embedding_model.encode(documents) return embeddings @pytest.fixture(scope="session") def reduced_embeddings(document_embeddings): reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric="cosine").fit_transform( document_embeddings ) return reduced_embeddings @pytest.fixture(scope="session") def documents(): newsgroup_docs = fetch_20newsgroups(subset="all", remove=("headers", "footers", "quotes"))["data"][:1000] return newsgroup_docs @pytest.fixture(scope="session") def targets(): data = fetch_20newsgroups(subset="all", remove=("headers", "footers", "quotes")) y = 
data["target"][:1000] return y @pytest.fixture(scope="session") def base_topic_model(documents, document_embeddings, embedding_model): model = BERTopic(embedding_model=embedding_model, calculate_probabilities=True) model.umap_model.random_state = 42 model.hdbscan_model.min_cluster_size = 3 model.fit(documents, document_embeddings) return model @pytest.fixture(scope="session") def zeroshot_topic_model(documents, document_embeddings, embedding_model): zeroshot_topic_list = ["religion", "cars", "electronics"] model = BERTopic( embedding_model=embedding_model, calculate_probabilities=True, zeroshot_topic_list=zeroshot_topic_list, zeroshot_min_similarity=0.3, ) model.umap_model.random_state = 42 model.hdbscan_model.min_cluster_size = 2 model.fit(documents, document_embeddings) return model @pytest.fixture(scope="session") def custom_topic_model(documents, document_embeddings, embedding_model): umap_model = UMAP(n_neighbors=15, n_components=6, min_dist=0.0, metric="cosine", random_state=42) hdbscan_model = HDBSCAN( min_cluster_size=3, metric="euclidean", cluster_selection_method="eom", prediction_data=True, ) model = BERTopic( umap_model=umap_model, hdbscan_model=hdbscan_model, embedding_model=embedding_model, calculate_probabilities=True, ).fit(documents, document_embeddings) return model @pytest.fixture(scope="session") def representation_topic_model(documents, document_embeddings, embedding_model): umap_model = UMAP(n_neighbors=15, n_components=6, min_dist=0.0, metric="cosine", random_state=42) hdbscan_model = HDBSCAN( min_cluster_size=3, metric="euclidean", cluster_selection_method="eom", prediction_data=True, ) representation_model = { "Main": KeyBERTInspired(), "MMR": [KeyBERTInspired(top_n_words=30), MaximalMarginalRelevance()], } model = BERTopic( umap_model=umap_model, hdbscan_model=hdbscan_model, embedding_model=embedding_model, representation_model=representation_model, calculate_probabilities=True, ).fit(documents, document_embeddings) return model 
@pytest.fixture(scope="session") def reduced_topic_model(custom_topic_model, documents): model = copy.deepcopy(custom_topic_model) model.reduce_topics(documents, nr_topics="auto") return model @pytest.fixture(scope="session") def merged_topic_model(custom_topic_model, documents): model = copy.deepcopy(custom_topic_model) # Merge once topics_to_merge = [[1, 2], [3, 4]] model.merge_topics(documents, topics_to_merge) # Merge second time topics_to_merge = [[5, 6, 7]] model.merge_topics(documents, topics_to_merge) return model @pytest.fixture(scope="session") def kmeans_pca_topic_model(documents, document_embeddings): hdbscan_model = KMeans(n_clusters=15, random_state=42) dim_model = PCA(n_components=5) model = BERTopic( hdbscan_model=hdbscan_model, umap_model=dim_model, embedding_model=embedding_model, ).fit(documents, document_embeddings) return model @pytest.fixture(scope="session") def supervised_topic_model(documents, document_embeddings, embedding_model, targets): empty_dimensionality_model = BaseDimensionalityReduction() clf = LogisticRegression() model = BERTopic( embedding_model=embedding_model, umap_model=empty_dimensionality_model, hdbscan_model=clf, ).fit(documents, embeddings=document_embeddings, y=targets) return model @pytest.fixture(scope="session") def online_topic_model(documents, document_embeddings, embedding_model): umap_model = PCA(n_components=5) cluster_model = MiniBatchKMeans(n_clusters=50, random_state=0) vectorizer_model = OnlineCountVectorizer(stop_words="english", decay=0.01) model = BERTopic( umap_model=umap_model, hdbscan_model=cluster_model, vectorizer_model=vectorizer_model, embedding_model=embedding_model, ) topics = [] for index in range(0, len(documents), 50): model.partial_fit(documents[index : index + 50], document_embeddings[index : index + 50]) topics.extend(model.topics_) model.topics_ = topics return model @pytest.fixture(scope="session") def cuml_base_topic_model(documents, document_embeddings, embedding_model): from 
cuml.cluster import HDBSCAN as cuml_hdbscan from cuml.manifold import UMAP as cuml_umap model = BERTopic( embedding_model=embedding_model, calculate_probabilities=True, umap_model=cuml_umap(n_components=5, n_neighbors=5, random_state=42), hdbscan_model=cuml_hdbscan(min_cluster_size=3, prediction_data=True), ) model.fit(documents, document_embeddings) return model ================================================ FILE: tests/test_bertopic.py ================================================ import copy import pytest from bertopic import BERTopic import importlib.util def cuml_available(): try: return importlib.util.find_spec("cuml") is not None except ImportError: return False @pytest.mark.parametrize( "model", [ ("base_topic_model"), ("kmeans_pca_topic_model"), ("custom_topic_model"), ("merged_topic_model"), ("reduced_topic_model"), ("online_topic_model"), ("supervised_topic_model"), ("representation_topic_model"), ("zeroshot_topic_model"), pytest.param( "cuml_base_topic_model", marks=pytest.mark.skipif(not cuml_available(), reason="cuML not available"), ), ], ) def test_full_model(model, documents, request): """Tests the entire pipeline in one go. This serves as a sanity check to see if the default settings result in a good separation of topics. 
NOTE: This does not cover all cases but merely combines it all together """ topic_model = copy.deepcopy(request.getfixturevalue(model)) if model == "base_topic_model": topic_model.save( "model_dir", serialization="pytorch", save_ctfidf=True, save_embedding_model="sentence-transformers/all-MiniLM-L6-v2", ) topic_model = BERTopic.load("model_dir") if model == "cuml_base_topic_model": assert "cuml" in str(type(topic_model.umap_model)).lower() assert "cuml" in str(type(topic_model.hdbscan_model)).lower() topics = topic_model.topics_ for topic in set(topics): words = topic_model.get_topic(topic)[:10] assert len(words) == 10 for topic in topic_model.get_topic_freq().Topic: words = topic_model.get_topic(topic)[:10] assert len(words) == 10 assert len(topic_model.get_topic_freq()) > 2 assert len(topic_model.get_topics()) == len(topic_model.get_topic_freq()) # Test extraction of document info document_info = topic_model.get_document_info(documents) assert len(document_info) == len(documents) # Test transform doc = "This is a new document to predict." 
topics_test, _probs_test = topic_model.transform([doc, doc]) assert len(topics_test) == 2 # Test zero-shot topic modeling if topic_model._is_zeroshot(): if topic_model._outliers: assert set(topic_model.topic_labels_.keys()) == set(range(-1, len(topic_model.topic_labels_) - 1)) else: assert set(topic_model.topic_labels_.keys()) == set(range(len(topic_model.topic_labels_))) # Test topics over time timestamps = [i % 10 for i in range(len(documents))] topics_over_time = topic_model.topics_over_time(documents, timestamps) assert topics_over_time.Frequency.sum() == len(documents) assert len(topics_over_time.Topic.unique()) == len(set(topics)) # Test hierarchical topics hier_topics = topic_model.hierarchical_topics(documents) assert len(hier_topics) > 0 assert hier_topics.Parent_ID.astype(int).min() > max(topics) # Test creation of topic tree tree = topic_model.get_topic_tree(hier_topics, tight_layout=False) assert isinstance(tree, str) assert len(tree) > 10 # Test find topic similar_topics, similarity = topic_model.find_topics("query", top_n=2) assert len(similar_topics) == 2 assert len(similarity) == 2 assert max(similarity) <= 1 # Test topic reduction nr_topics = len(set(topics)) nr_topics = 2 if nr_topics < 2 else nr_topics - 1 topic_model.reduce_topics(documents, nr_topics=nr_topics) assert len(topic_model.get_topic_freq()) == nr_topics assert len(topic_model.topics_) == len(topics) # Test update topics topic = topic_model.get_topic(1)[:10] vectorizer_model = topic_model.vectorizer_model topic_model.update_topics(documents, n_gram_range=(2, 2)) updated_topic = topic_model.get_topic(1)[:10] topic_model.update_topics(documents, vectorizer_model=vectorizer_model) original_topic = topic_model.get_topic(1)[:10] assert topic != updated_topic if topic_model.representation_model is not None: assert topic != original_topic # Test updating topic labels topic_labels = topic_model.generate_topic_labels(nr_words=3, topic_prefix=False, word_length=10, separator=", ") assert 
len(topic_labels) == len(set(topic_model.topics_)) # Test setting topic labels topic_model.set_topic_labels(topic_labels) assert topic_model.custom_labels_ == topic_labels # Test merging topics freq = topic_model.get_topic_freq(0) topics_to_merge = [0, 1] topic_model.merge_topics(documents, topics_to_merge) assert freq < topic_model.get_topic_freq(0) # Test reduction of outliers if -1 in topics: new_topics = topic_model.reduce_outliers(documents, topics, threshold=0.0) nr_outliers_topic_model = sum([1 for topic in topic_model.topics_ if topic == -1]) nr_outliers_new_topics = sum([1 for topic in new_topics if topic == -1]) if topic_model._outliers == 1: assert nr_outliers_topic_model > nr_outliers_new_topics # Combine models topic_model1 = BERTopic.load("model_dir") merged_model = BERTopic.merge_models([topic_model, topic_model1]) assert len(merged_model.get_topic_info()) > len(topic_model.get_topic_info()) ================================================ FILE: tests/test_other.py ================================================ from bertopic import BERTopic from bertopic.dimensionality import BaseDimensionalityReduction try: from plotly.graph_objects import Figure except ImportError: Figure = None def test_load_save_model(): model = BERTopic(language="Dutch", embedding_model=None) model.save("test", serialization="pickle") loaded_model = BERTopic.load("test") assert type(model) is type(loaded_model) assert model.language == loaded_model.language assert model.embedding_model == loaded_model.embedding_model assert model.top_n_words == loaded_model.top_n_words def test_get_params(): model = BERTopic() params = model.get_params() assert not params["embedding_model"] assert not params["low_memory"] assert not params["nr_topics"] assert params["n_gram_range"] == (1, 1) assert params["min_topic_size"] == 10 assert params["language"] == "english" def test_no_plotly(): model = BERTopic( language="Dutch", embedding_model=None, min_topic_size=2, top_n_words=1, 
umap_model=BaseDimensionalityReduction(), ) model.fit(["hello", "hi", "goodbye", "goodbye", "whats up"] * 10) try: out = model.visualize_topics() assert isinstance(out, Figure) if Figure else False except ImportError as e: assert "Plotly is required to use" in str(e) ================================================ FILE: tests/test_plotting/__init__.py ================================================ ================================================ FILE: tests/test_plotting/test_approximate.py ================================================ import copy import pytest @pytest.mark.parametrize("batch_size", [50, None]) @pytest.mark.parametrize("padding", [True, False]) @pytest.mark.parametrize( "model", [ ("kmeans_pca_topic_model"), ("base_topic_model"), ("custom_topic_model"), ("merged_topic_model"), ("reduced_topic_model"), ], ) def test_approximate_distribution(batch_size, padding, model, documents, request): topic_model = copy.deepcopy(request.getfixturevalue(model)) # Calculate only on a document-level based on tokensets topic_distr, _ = topic_model.approximate_distribution(documents, padding=padding, batch_size=batch_size) assert topic_distr.shape[1] == len(topic_model.topic_labels_) - topic_model._outliers # Use the distribution visualization for i in range(3): topic_model.visualize_distribution(topic_distr[i]) # Calculate distribution on a token-level topic_distr, topic_token_distr = topic_model.approximate_distribution(documents[:100], calculate_tokens=True) assert topic_distr.shape[1] == len(topic_model.topic_labels_) - topic_model._outliers assert len(topic_token_distr) == len(documents[:100]) for token_distr in topic_token_distr: assert token_distr.shape[1] == len(topic_model.topic_labels_) - topic_model._outliers ================================================ FILE: tests/test_plotting/test_bar.py ================================================ import copy import pytest @pytest.mark.parametrize( "model", [ ("kmeans_pca_topic_model"), 
("base_topic_model"), ("custom_topic_model"), ("merged_topic_model"), ("reduced_topic_model"), ("online_topic_model"), ], ) def test_barchart(model, request): topic_model = copy.deepcopy(request.getfixturevalue(model)) fig = topic_model.visualize_barchart() assert len(fig.to_dict()["layout"]["annotations"]) == 8 for annotation in fig.to_dict()["layout"]["annotations"]: assert int(annotation["text"].split(" ")[-1]) != -1 fig = topic_model.visualize_barchart(top_n_topics=5) assert len(fig.to_dict()["layout"]["annotations"]) == 5 for annotation in fig.to_dict()["layout"]["annotations"]: assert int(annotation["text"].split(" ")[-1]) != -1 @pytest.mark.parametrize( "model", [ ("kmeans_pca_topic_model"), ("base_topic_model"), ("custom_topic_model"), ("merged_topic_model"), ("reduced_topic_model"), ("online_topic_model"), ], ) def test_barchart_outlier(model, request): topic_model = copy.deepcopy(request.getfixturevalue(model)) topic_model.topic_sizes_[-1] = 4 fig = topic_model.visualize_barchart() assert len(fig.to_dict()["layout"]["annotations"]) == 8 for annotation in fig.to_dict()["layout"]["annotations"]: assert int(annotation["text"].split(" ")[-1]) != -1 fig = topic_model.visualize_barchart(top_n_topics=5) assert len(fig.to_dict()["layout"]["annotations"]) == 5 for annotation in fig.to_dict()["layout"]["annotations"]: assert int(annotation["text"].split(" ")[-1]) != -1 ================================================ FILE: tests/test_plotting/test_documents.py ================================================ import copy import pytest @pytest.mark.parametrize( "model", [ ("kmeans_pca_topic_model"), ("base_topic_model"), ("custom_topic_model"), ("merged_topic_model"), ("reduced_topic_model"), ], ) def test_documents(model, reduced_embeddings, documents, request): topic_model = copy.deepcopy(request.getfixturevalue(model)) topics = set(topic_model.topics_) if -1 in topics: topics.remove(-1) fig = topic_model.visualize_documents(documents, 
embeddings=reduced_embeddings, hide_document_hover=True) fig_topics = [int(data["name"].split("_")[0]) for data in fig.to_dict()["data"][1:]] assert set(fig_topics) == topics ================================================ FILE: tests/test_plotting/test_dynamic.py ================================================ import copy import pytest @pytest.mark.parametrize( "model", [ ("kmeans_pca_topic_model"), ("base_topic_model"), ("custom_topic_model"), ("merged_topic_model"), ("reduced_topic_model"), ("online_topic_model"), ], ) def test_dynamic(model, documents, request): topic_model = copy.deepcopy(request.getfixturevalue(model)) timestamps = [i % 10 for i in range(len(documents))] topics_over_time = topic_model.topics_over_time(documents, timestamps) fig = topic_model.visualize_topics_over_time(topics_over_time) assert len(fig.to_dict()["data"]) == len(set(topic_model.topics_)) - topic_model._outliers ================================================ FILE: tests/test_plotting/test_heatmap.py ================================================ import copy import pytest @pytest.mark.parametrize( "model", [ ("kmeans_pca_topic_model"), ("base_topic_model"), ("custom_topic_model"), ("merged_topic_model"), ("reduced_topic_model"), ], ) def test_heatmap(model, request): topic_model = copy.deepcopy(request.getfixturevalue(model)) topics = set(topic_model.topics_) if -1 in topics: topics.remove(-1) fig = topic_model.visualize_heatmap() fig_topics = [int(topic.split("_")[0]) for topic in fig.to_dict()["data"][0]["x"]] assert set(fig_topics) == topics ================================================ FILE: tests/test_plotting/test_term_rank.py ================================================ import copy import pytest @pytest.mark.parametrize("model", [("kmeans_pca_topic_model"), ("base_topic_model"), ("custom_topic_model")]) def test_term_rank(model, request): topic_model = copy.deepcopy(request.getfixturevalue(model)) topic_model.visualize_term_rank() 
================================================ FILE: tests/test_plotting/test_topics.py ================================================ import copy import pytest @pytest.mark.parametrize( "model", [ ("kmeans_pca_topic_model"), ("base_topic_model"), ("custom_topic_model"), ("merged_topic_model"), ("reduced_topic_model"), ("online_topic_model"), ], ) def test_topics(model, request): topic_model = copy.deepcopy(request.getfixturevalue(model)) fig = topic_model.visualize_topics() for slider in fig.to_dict()["layout"]["sliders"]: for step in slider["steps"]: assert int(step["label"].split(" ")[-1]) != -1 fig = topic_model.visualize_topics(top_n_topics=5) for slider in fig.to_dict()["layout"]["sliders"]: for step in slider["steps"]: assert int(step["label"].split(" ")[-1]) != -1 @pytest.mark.parametrize( "model", [ ("kmeans_pca_topic_model"), ("base_topic_model"), ("custom_topic_model"), ("merged_topic_model"), ("reduced_topic_model"), ("online_topic_model"), ], ) def test_topics_outlier(model, request): topic_model = copy.deepcopy(request.getfixturevalue(model)) topic_model.topic_sizes_[-1] = 4 fig = topic_model.visualize_topics() for slider in fig.to_dict()["layout"]["sliders"]: for step in slider["steps"]: assert int(step["label"].split(" ")[-1]) != -1 fig = topic_model.visualize_topics(top_n_topics=5) for slider in fig.to_dict()["layout"]["sliders"]: for step in slider["steps"]: assert int(step["label"].split(" ")[-1]) != -1 ================================================ FILE: tests/test_reduction/__init__.py ================================================ ================================================ FILE: tests/test_reduction/test_delete.py ================================================ import copy import pytest @pytest.mark.parametrize( "model", [ ("kmeans_pca_topic_model"), ("base_topic_model"), ("custom_topic_model"), ("merged_topic_model"), ("reduced_topic_model"), ("online_topic_model"), ], ) def test_delete(model, request): topic_model = 
copy.deepcopy(request.getfixturevalue(model)) nr_topics = len(set(topic_model.topics_)) length_documents = len(topic_model.topics_) # First deletion topics_to_delete = [1, 2] topic_model.delete_topics(topics_to_delete) mappings = topic_model.topic_mapper_.get_mappings(list(topic_model.hdbscan_model.labels_)) mapped_labels = [mappings[label] for label in topic_model.hdbscan_model.labels_] if model == "online_topic_model" or model == "kmeans_pca_topic_model": assert nr_topics == len(set(topic_model.topics_)) + 1 assert topic_model.get_topic_info().Count.sum() == length_documents else: assert nr_topics == len(set(topic_model.topics_)) + 2 assert topic_model.get_topic_info().Count.sum() == length_documents if model == "online_topic_model": assert mapped_labels == topic_model.topics_[950:] else: assert mapped_labels == topic_model.topics_ # Find two existing topics for second deletion remaining_topics = sorted(list(set(topic_model.topics_))) remaining_topics = [t for t in remaining_topics if t != -1] # Exclude outlier topic topics_to_delete = remaining_topics[:2] # Take first two remaining topics # Second deletion topic_model.delete_topics(topics_to_delete) mappings = topic_model.topic_mapper_.get_mappings(list(topic_model.hdbscan_model.labels_)) mapped_labels = [mappings[label] for label in topic_model.hdbscan_model.labels_] if model == "online_topic_model" or model == "kmeans_pca_topic_model": assert nr_topics == len(set(topic_model.topics_)) + 3 assert topic_model.get_topic_info().Count.sum() == length_documents else: assert nr_topics == len(set(topic_model.topics_)) + 4 assert topic_model.get_topic_info().Count.sum() == length_documents if model == "online_topic_model": assert mapped_labels == topic_model.topics_[950:] else: assert mapped_labels == topic_model.topics_ ================================================ FILE: tests/test_reduction/test_merge.py ================================================ import copy import pytest @pytest.mark.parametrize( "model", [ 
("kmeans_pca_topic_model"), ("base_topic_model"), ("custom_topic_model"), ("merged_topic_model"), ("reduced_topic_model"), ("online_topic_model"), ], ) def test_merge(model, documents, request): topic_model = copy.deepcopy(request.getfixturevalue(model)) nr_topics = len(set(topic_model.topics_)) topics_to_merge = [1, 2] topic_model.merge_topics(documents, topics_to_merge) mappings = topic_model.topic_mapper_.get_mappings(list(topic_model.hdbscan_model.labels_)) mapped_labels = [mappings[label] for label in topic_model.hdbscan_model.labels_] assert nr_topics == len(set(topic_model.topics_)) + 1 assert topic_model.get_topic_info().Count.sum() == len(documents) if model == "online_topic_model": assert mapped_labels == topic_model.topics_[950:] else: assert mapped_labels == topic_model.topics_ topics_to_merge = [1, 2] topic_model.merge_topics(documents, topics_to_merge) mappings = topic_model.topic_mapper_.get_mappings(list(topic_model.hdbscan_model.labels_)) mapped_labels = [mappings[label] for label in topic_model.hdbscan_model.labels_] assert nr_topics == len(set(topic_model.topics_)) + 2 assert topic_model.get_topic_info().Count.sum() == len(documents) if model == "online_topic_model": assert mapped_labels == topic_model.topics_[950:] else: assert mapped_labels == topic_model.topics_ ================================================ FILE: tests/test_representation/__init__.py ================================================ ================================================ FILE: tests/test_representation/test_get.py ================================================ import copy import pytest import numpy as np import pandas as pd @pytest.mark.parametrize( "model", [ ("kmeans_pca_topic_model"), ("base_topic_model"), ("custom_topic_model"), ("merged_topic_model"), ("reduced_topic_model"), ("online_topic_model"), ], ) def test_get_topic(model, request): topic_model = copy.deepcopy(request.getfixturevalue(model)) topics = [topic_model.get_topic(topic) for topic in 
set(topic_model.topics_)] unknown_topic = topic_model.get_topic(500) for topic in topics: assert topic is not False assert len(topics) == len(topic_model.get_topic_info()) assert not unknown_topic @pytest.mark.parametrize( "model", [ ("kmeans_pca_topic_model"), ("base_topic_model"), ("custom_topic_model"), ("merged_topic_model"), ("reduced_topic_model"), ("online_topic_model"), ], ) def test_get_topics(model, request): topic_model = copy.deepcopy(request.getfixturevalue(model)) topics = topic_model.get_topics() assert topics == topic_model.topic_representations_ assert len(topics.keys()) == len(set(topic_model.topics_)) @pytest.mark.parametrize( "model", [ ("kmeans_pca_topic_model"), ("base_topic_model"), ("custom_topic_model"), ("merged_topic_model"), ("reduced_topic_model"), ("online_topic_model"), ], ) def test_get_topic_freq(model, request): topic_model = copy.deepcopy(request.getfixturevalue(model)) for topic in set(topic_model.topics_): assert not isinstance(topic_model.get_topic_freq(topic), pd.DataFrame) topic_freq = topic_model.get_topic_freq() unique_topics = set(topic_model.topics_) topics_in_mapper = set(np.array(topic_model.topic_mapper_.mappings_)[:, -1]) assert isinstance(topic_freq, pd.DataFrame) assert len(topic_freq) == len(set(topic_model.topics_)) assert len(topics_in_mapper.difference(unique_topics)) == 0 assert len(unique_topics.difference(topics_in_mapper)) == 0 @pytest.mark.parametrize( "model", [ ("base_topic_model"), ("custom_topic_model"), ("merged_topic_model"), ("reduced_topic_model"), ], ) def test_get_representative_docs(model, request): topic_model = copy.deepcopy(request.getfixturevalue(model)) all_docs = topic_model.get_representative_docs() unique_topics = set(topic_model.topics_) topics_in_mapper = set(np.array(topic_model.topic_mapper_.mappings_)[:, -1]) assert len(all_docs) == len(topic_model.topic_sizes_.keys()) assert len(all_docs) == len(topics_in_mapper) assert len(all_docs) == topic_model.c_tf_idf_.shape[0] assert 
len(all_docs) == len(topic_model.topic_labels_) assert all([True if len(docs) == 3 else False for docs in all_docs.values()]) topics = set(list(all_docs.keys())) assert len(topics.difference(unique_topics)) == 0 assert len(topics.difference(topics_in_mapper)) == 0 @pytest.mark.parametrize( "model", [ ("kmeans_pca_topic_model"), ("base_topic_model"), ("custom_topic_model"), ("merged_topic_model"), ("reduced_topic_model"), ("online_topic_model"), ], ) def test_get_topic_info(model, request): topic_model = copy.deepcopy(request.getfixturevalue(model)) info = topic_model.get_topic_info() if topic_model._outliers: assert info.iloc[0].Topic == -1 else: assert info.iloc[0].Topic == 0 for topic in set(topic_model.topics_): assert len(topic_model.get_topic_info(topic)) == 1 assert len(topic_model.get_topic_info(200)) == 0 ================================================ FILE: tests/test_representation/test_labels.py ================================================ import copy import pytest @pytest.mark.parametrize( "model", [ ("kmeans_pca_topic_model"), ("base_topic_model"), ("custom_topic_model"), ("merged_topic_model"), ("reduced_topic_model"), ("online_topic_model"), ], ) def test_generate_topic_labels(model, request): topic_model = copy.deepcopy(request.getfixturevalue(model)) labels = topic_model.generate_topic_labels(topic_prefix=False) assert sum([label[0].isdigit() for label in labels[1:]]) / len(labels) < 0.2 labels = [int(label.split("_")[0]) for label in topic_model.generate_topic_labels()] assert labels == sorted(list(set(topic_model.topics_))) labels = topic_model.generate_topic_labels(nr_words=1, topic_prefix=False) assert all([True if len(label) < 15 else False for label in labels]) @pytest.mark.parametrize( "model", [ ("kmeans_pca_topic_model"), ("base_topic_model"), ("custom_topic_model"), ("merged_topic_model"), ("reduced_topic_model"), ("online_topic_model"), ], ) def test_set_labels(model, request): topic_model = 
copy.deepcopy(request.getfixturevalue(model)) labels = topic_model.generate_topic_labels() topic_model.set_topic_labels(labels) assert topic_model.custom_labels_ == labels if model != "online_topic_model": labels = {1: "My label", 2: "Another label"} topic_model.set_topic_labels(labels) assert topic_model.custom_labels_[1 + topic_model._outliers] == "My label" assert topic_model.custom_labels_[2 + topic_model._outliers] == "Another label" labels = {1: "Change label", 3: "New label"} topic_model.set_topic_labels(labels) assert topic_model.custom_labels_[1 + topic_model._outliers] == "Change label" assert topic_model.custom_labels_[3 + topic_model._outliers] == "New label" else: labels = { sorted(set(topic_model.topics_))[0]: "My label", sorted(set(topic_model.topics_))[1]: "Another label", } topic_model.set_topic_labels(labels) assert topic_model.custom_labels_[0] == "My label" assert topic_model.custom_labels_[1] == "Another label" labels = { sorted(set(topic_model.topics_))[0]: "Change label", sorted(set(topic_model.topics_))[2]: "New label", } topic_model.set_topic_labels(labels) assert topic_model.custom_labels_[0 + topic_model._outliers] == "Change label" assert topic_model.custom_labels_[2 + topic_model._outliers] == "New label" ================================================ FILE: tests/test_representation/test_representations.py ================================================ import copy import pytest import numpy as np import pandas as pd from sklearn.feature_extraction.text import CountVectorizer @pytest.mark.parametrize( "model", [ ("kmeans_pca_topic_model"), ("base_topic_model"), ("custom_topic_model"), ("merged_topic_model"), ("reduced_topic_model"), ], ) def test_update_topics(model, documents, request): topic_model = copy.deepcopy(request.getfixturevalue(model)) old_ctfidf = topic_model.c_tf_idf_ old_topics = topic_model.topics_ topic_model.update_topics(documents, n_gram_range=(1, 3)) assert old_ctfidf.shape[1] < topic_model.c_tf_idf_.shape[1] 
assert old_topics == topic_model.topics_ updated_topics = [topic if topic != 1 else 0 for topic in old_topics] topic_model.update_topics(documents, topics=updated_topics, n_gram_range=(1, 3)) assert len(set(old_topics)) - 1 == len(set(topic_model.topics_)) old_topics = topic_model.topics_ updated_topics = [topic if topic != 2 else 0 for topic in old_topics] topic_model.update_topics(documents, topics=updated_topics, n_gram_range=(1, 3)) assert len(set(old_topics)) - 1 == len(set(topic_model.topics_)) @pytest.mark.parametrize( "model", [ ("kmeans_pca_topic_model"), ("base_topic_model"), ("custom_topic_model"), ("merged_topic_model"), ("reduced_topic_model"), ("online_topic_model"), ], ) def test_extract_topics(model, documents, request): topic_model = copy.deepcopy(request.getfixturevalue(model)) nr_topics = 5 documents = pd.DataFrame( { "Document": documents, "ID": range(len(documents)), "Topic": np.random.randint(-1, nr_topics - 1, len(documents)), } ) topic_model._update_topic_size(documents) topic_model._extract_topics(documents) freq = topic_model.get_topic_freq() assert topic_model.c_tf_idf_.shape[0] == 5 assert topic_model.c_tf_idf_.shape[1] > 100 assert isinstance(freq, pd.DataFrame) assert nr_topics == len(freq.Topic.unique()) assert freq.Count.sum() == len(documents) assert len(freq.Topic.unique()) == len(freq) @pytest.mark.parametrize( "model", [ ("kmeans_pca_topic_model"), ("base_topic_model"), ("custom_topic_model"), ("merged_topic_model"), ("reduced_topic_model"), ("online_topic_model"), ], ) def test_extract_topics_custom_cv(model, documents, request): topic_model = copy.deepcopy(request.getfixturevalue(model)) nr_topics = 5 documents = pd.DataFrame( { "Document": documents, "ID": range(len(documents)), "Topic": np.random.randint(-1, nr_topics - 1, len(documents)), } ) cv = CountVectorizer(ngram_range=(1, 2)) topic_model.vectorizer_model = cv topic_model._update_topic_size(documents) topic_model._extract_topics(documents) freq = 
topic_model.get_topic_freq() assert topic_model.c_tf_idf_.shape[0] == 5 assert topic_model.c_tf_idf_.shape[1] > 100 assert isinstance(freq, pd.DataFrame) assert nr_topics == len(freq.Topic.unique()) assert freq.Count.sum() == len(documents) assert len(freq.Topic.unique()) == len(freq) @pytest.mark.parametrize( "model", [ ("kmeans_pca_topic_model"), ("base_topic_model"), ("custom_topic_model"), ("merged_topic_model"), ("reduced_topic_model"), ("online_topic_model"), ], ) @pytest.mark.parametrize("reduced_topics", [2, 4, 10]) def test_topic_reduction(model, reduced_topics, documents, request): topic_model = copy.deepcopy(request.getfixturevalue(model)) old_topics = copy.deepcopy(topic_model.topics_) old_freq = topic_model.get_topic_freq() topic_model.reduce_topics(documents, nr_topics=reduced_topics) new_freq = topic_model.get_topic_freq() if model != "online_topic_model": assert old_freq.Count.sum() == new_freq.Count.sum() assert len(old_freq.Topic.unique()) == len(old_freq) assert len(new_freq.Topic.unique()) == len(new_freq) assert len(topic_model.topics_) == len(old_topics) assert topic_model.topics_ != old_topics @pytest.mark.parametrize( "model", [ ("kmeans_pca_topic_model"), ("base_topic_model"), ("custom_topic_model"), ("merged_topic_model"), ("reduced_topic_model"), ("online_topic_model"), ], ) def test_topic_reduction_edge_cases(model, documents, request): topic_model = copy.deepcopy(request.getfixturevalue(model)) topic_model.nr_topics = 100 nr_topics = 5 topics = np.random.randint(-1, nr_topics - 1, len(documents)) old_documents = pd.DataFrame({"Document": documents, "ID": range(len(documents)), "Topic": topics}) topic_model._update_topic_size(old_documents) old_documents = topic_model._sort_mappings_by_frequency(old_documents) topic_model._extract_topics(old_documents) old_freq = topic_model.get_topic_freq() new_documents = topic_model._reduce_topics(old_documents) new_freq = topic_model.get_topic_freq() assert not 
set(old_documents.Topic).difference(set(new_documents.Topic)) pd.testing.assert_frame_equal(old_documents, new_documents) pd.testing.assert_frame_equal(old_freq, new_freq) @pytest.mark.parametrize( "model", [ ("kmeans_pca_topic_model"), ("base_topic_model"), ("custom_topic_model"), ("merged_topic_model"), ("reduced_topic_model"), ("online_topic_model"), ], ) def test_find_topics(model, request): topic_model = copy.deepcopy(request.getfixturevalue(model)) similar_topics, similarity = topic_model.find_topics("car") assert np.mean(similarity) > 0.1 assert len(similar_topics) > 0 ================================================ FILE: tests/test_sub_models/__init__.py ================================================ ================================================ FILE: tests/test_sub_models/test_cluster.py ================================================ import pytest import pandas as pd from sklearn.datasets import make_blobs from sklearn.cluster import KMeans from hdbscan import HDBSCAN from bertopic import BERTopic @pytest.mark.parametrize("cluster_model", ["hdbscan", "kmeans"]) @pytest.mark.parametrize( "samples,features,centers", [ (200, 500, 1), (500, 200, 1), (200, 500, 2), (500, 200, 2), (200, 500, 4), (500, 200, 4), ], ) def test_hdbscan_cluster_embeddings(cluster_model, samples, features, centers): embeddings, _ = make_blobs(n_samples=samples, centers=centers, n_features=features, random_state=42) documents = [str(i + 1) for i in range(embeddings.shape[0])] old_df = pd.DataFrame({"Document": documents, "ID": range(len(documents)), "Topic": None}) if cluster_model == "kmeans": cluster_model = KMeans(n_clusters=centers) else: cluster_model = HDBSCAN( min_cluster_size=10, metric="euclidean", cluster_selection_method="eom", prediction_data=True, ) model = BERTopic(hdbscan_model=cluster_model) new_df, _ = model._cluster_embeddings(embeddings, old_df) assert len(new_df.Topic.unique()) == centers assert "Topic" in new_df.columns 
pd.testing.assert_frame_equal(old_df.drop("Topic", axis=1), new_df.drop("Topic", axis=1)) @pytest.mark.parametrize("cluster_model", ["hdbscan", "kmeans"]) @pytest.mark.parametrize( "samples,features,centers", [ (200, 500, 1), (500, 200, 1), (200, 500, 2), (500, 200, 2), (200, 500, 4), (500, 200, 4), ], ) def test_custom_hdbscan_cluster_embeddings(cluster_model, samples, features, centers): embeddings, _ = make_blobs(n_samples=samples, centers=centers, n_features=features, random_state=42) documents = [str(i + 1) for i in range(embeddings.shape[0])] old_df = pd.DataFrame({"Document": documents, "ID": range(len(documents)), "Topic": None}) if cluster_model == "kmeans": cluster_model = KMeans(n_clusters=centers) else: cluster_model = HDBSCAN( min_cluster_size=10, metric="euclidean", cluster_selection_method="eom", prediction_data=True, ) model = BERTopic(hdbscan_model=cluster_model) new_df, _ = model._cluster_embeddings(embeddings, old_df) assert len(new_df.Topic.unique()) == centers assert "Topic" in new_df.columns pd.testing.assert_frame_equal(old_df.drop("Topic", axis=1), new_df.drop("Topic", axis=1)) ================================================ FILE: tests/test_sub_models/test_dim_reduction.py ================================================ import copy import pytest import numpy as np from umap import UMAP from sklearn.decomposition import PCA from bertopic import BERTopic @pytest.mark.parametrize("dim_model", [UMAP, PCA]) @pytest.mark.parametrize( "embeddings,shape,n_components", [ (np.random.rand(100, 128), 100, 5), (np.random.rand(10, 256), 10, 5), (np.random.rand(50, 15), 50, 10), ], ) def test_reduce_dimensionality(dim_model, embeddings, shape, n_components): model = BERTopic(umap_model=dim_model(n_components=n_components)) umap_embeddings = model._reduce_dimensionality(embeddings) assert umap_embeddings.shape == (shape, n_components) @pytest.mark.parametrize( "model", [ ("kmeans_pca_topic_model"), ("base_topic_model"), ("custom_topic_model"), 
("merged_topic_model"), ("reduced_topic_model"), ("online_topic_model"), ], ) def test_custom_reduce_dimensionality(model, request): embeddings = np.random.rand(500, 128) topic_model = copy.deepcopy(request.getfixturevalue(model)) umap_embeddings = topic_model._reduce_dimensionality(embeddings) assert umap_embeddings.shape[1] < embeddings.shape[1] ================================================ FILE: tests/test_sub_models/test_embeddings.py ================================================ import copy import pytest import numpy as np from bertopic import BERTopic from sklearn.metrics.pairwise import cosine_similarity @pytest.mark.parametrize( "model", [ ("kmeans_pca_topic_model"), ("base_topic_model"), ("custom_topic_model"), ("merged_topic_model"), ("reduced_topic_model"), ("online_topic_model"), ], ) def test_extract_embeddings(model, request): topic_model = copy.deepcopy(request.getfixturevalue(model)) single_embedding = topic_model._extract_embeddings("a document") multiple_embeddings = topic_model._extract_embeddings(["something different", "another document"]) sim_matrix = cosine_similarity(single_embedding, multiple_embeddings)[0] assert single_embedding.shape[0] == 1 assert single_embedding.shape[1] == 384 assert np.min(single_embedding) > -5 assert np.max(single_embedding) < 5 assert multiple_embeddings.shape[0] == 2 assert multiple_embeddings.shape[1] == 384 assert np.min(multiple_embeddings) > -5 assert np.max(multiple_embeddings) < 5 assert sim_matrix[0] < 0.5 assert sim_matrix[1] > 0.5 @pytest.mark.parametrize( "model", [ ("kmeans_pca_topic_model"), ("base_topic_model"), ("custom_topic_model"), ("merged_topic_model"), ("reduced_topic_model"), ("online_topic_model"), ], ) def test_extract_embeddings_compare(model, embedding_model, request): docs = ["some document"] topic_model = copy.deepcopy(request.getfixturevalue(model)) bertopic_embeddings = topic_model._extract_embeddings(docs) assert isinstance(bertopic_embeddings, np.ndarray) assert 
bertopic_embeddings.shape == (1, 384) sentence_embeddings = embedding_model.encode(docs, show_progress_bar=False) assert np.array_equal(bertopic_embeddings, sentence_embeddings) def test_extract_incorrect_embeddings(): with pytest.raises(ValueError): model = BERTopic(language="Unknown language") model.fit(["some document"]) ================================================ FILE: tests/test_utils.py ================================================ import pytest import logging import numpy as np from typing import List from bertopic._utils import ( check_documents_type, check_embeddings_shape, MyLogger, select_topic_representation, get_unique_distances, ) from scipy.sparse import csr_matrix def test_logger(): logger = MyLogger() logger.configure("DEBUG") assert isinstance(logger.logger, logging.Logger) assert logger.logger.level == 10 logger = MyLogger() logger.configure("WARNING") assert isinstance(logger.logger, logging.Logger) assert logger.logger.level == 30 @pytest.mark.parametrize( "docs", ["A document not in an iterable", [None], 5], ) def test_check_documents_type(docs): with pytest.raises(TypeError): check_documents_type(docs) def test_check_embeddings_shape(): docs = ["doc_one", "doc_two"] embeddings = np.array([[1, 2, 3], [2, 3, 4]]) check_embeddings_shape(embeddings, docs) def test_make_unique_distances(): def check_dists(dists: List[float], noise_max: float): unique_dists = get_unique_distances(np.array(dists, dtype=float), noise_max=noise_max) assert len(unique_dists) == len(dists), "The number of elements must be the same" assert len(dists) == len(np.unique(unique_dists)), "The distances must be unique" check_dists([0, 0, 0.5, 0.75, 1, 1], noise_max=1e-7) # testing whether the distances are sorted in ascending order when if the noise is extremely high check_dists([0, 0, 0, 0.5, 0.75, 1, 1], noise_max=20) # test whether the distances are sorted in ascending order when the distances are all the same check_dists([0, 0, 0, 0, 0, 0, 0], noise_max=1e-7) def 
test_select_topic_representation(): ctfidf_embeddings = np.array([[1, 1, 1]]) ctfidf_embeddings_sparse = csr_matrix( (ctfidf_embeddings.reshape(-1).tolist(), ([0, 0, 0], [0, 1, 2])), shape=ctfidf_embeddings.shape, ) topic_embeddings = np.array([[2, 2, 2]]) # Use topic embeddings repr_, ctfidf_used = select_topic_representation(ctfidf_embeddings, topic_embeddings, use_ctfidf=False) np.testing.assert_array_equal(topic_embeddings, repr_) assert not ctfidf_used # Fallback to c-TF-IDF repr_, ctfidf_used = select_topic_representation(ctfidf_embeddings, None, use_ctfidf=False) np.testing.assert_array_equal(ctfidf_embeddings, repr_) assert ctfidf_used # Use c-TF-IDF repr_, ctfidf_used = select_topic_representation(ctfidf_embeddings, topic_embeddings, use_ctfidf=True) np.testing.assert_array_equal(ctfidf_embeddings, repr_) assert ctfidf_used # Fallback to topic embeddings repr_, ctfidf_used = select_topic_representation(None, topic_embeddings, use_ctfidf=True) np.testing.assert_array_equal(topic_embeddings, repr_) assert not ctfidf_used # `scipy.sparse.csr_matrix` can be used as c-TF-IDF embeddings np.testing.assert_array_equal( ctfidf_embeddings, select_topic_representation(ctfidf_embeddings_sparse, None, use_ctfidf=True, output_ndarray=True)[0], ) # check that `csr_matrix` is not casted to `np.ndarray` when `ctfidf_as_ndarray` is False repr_ = select_topic_representation(ctfidf_embeddings_sparse, None, output_ndarray=False)[0] assert isinstance(repr_, csr_matrix) ================================================ FILE: tests/test_variations/__init__.py ================================================ ================================================ FILE: tests/test_variations/test_class.py ================================================ import copy import pytest from sklearn.datasets import fetch_20newsgroups data = fetch_20newsgroups(subset="all", remove=("headers", "footers", "quotes")) classes = [data["target_names"][i] for i in data["target"]][:1000] 
@pytest.mark.parametrize( "model", [ ("kmeans_pca_topic_model"), ("custom_topic_model"), ("merged_topic_model"), ("reduced_topic_model"), ("online_topic_model"), ], ) def test_class(model, documents, request): topic_model = copy.deepcopy(request.getfixturevalue(model)) topics_per_class_global = topic_model.topics_per_class(documents, classes=classes, global_tuning=True) topics_per_class_local = topic_model.topics_per_class(documents, classes=classes, global_tuning=False) assert topics_per_class_global.Frequency.sum() == len(documents) assert topics_per_class_local.Frequency.sum() == len(documents) assert set(topics_per_class_global.Topic.unique()) == set(topic_model.topics_) assert set(topics_per_class_local.Topic.unique()) == set(topic_model.topics_) assert len(topics_per_class_global.Class.unique()) == len(set(classes)) assert len(topics_per_class_local.Class.unique()) == len(set(classes)) ================================================ FILE: tests/test_variations/test_dynamic.py ================================================ import copy import pytest @pytest.mark.parametrize( "model", [ ("kmeans_pca_topic_model"), ("custom_topic_model"), ("merged_topic_model"), ("reduced_topic_model"), ("online_topic_model"), ], ) def test_dynamic(model, documents, request): topic_model = copy.deepcopy(request.getfixturevalue(model)) timestamps = [i % 10 for i in range(len(documents))] topics_over_time = topic_model.topics_over_time(documents, timestamps) assert topics_over_time.Frequency.sum() == len(documents) assert set(topics_over_time.Topic.unique()) == set(topic_model.topics_) assert len(topics_over_time.Timestamp.unique()) == len(set(timestamps)) ================================================ FILE: tests/test_variations/test_hierarchy.py ================================================ import copy import pytest from scipy.cluster import hierarchy as sch @pytest.mark.parametrize( "model", [ ("kmeans_pca_topic_model"), ("custom_topic_model"), ("merged_topic_model"), 
("reduced_topic_model"), ("online_topic_model"), ], ) def test_hierarchy(model, documents, request): topic_model = copy.deepcopy(request.getfixturevalue(model)) hierarchical_topics = topic_model.hierarchical_topics(documents) merged_topics = set([v for vals in hierarchical_topics.Topics.values for v in vals]) assert len(hierarchical_topics) > 0 assert merged_topics == set(topic_model.topics_).difference({-1}) @pytest.mark.parametrize( "model", [ ("kmeans_pca_topic_model"), ("custom_topic_model"), ("merged_topic_model"), ("reduced_topic_model"), ("online_topic_model"), ], ) def test_linkage(model, documents, request): topic_model = copy.deepcopy(request.getfixturevalue(model)) linkage_function = lambda x: sch.linkage(x, "single", optimal_ordering=True) hierarchical_topics = topic_model.hierarchical_topics(documents, linkage_function=linkage_function) merged_topics = set([v for vals in hierarchical_topics.Topics.values for v in vals]) tree = topic_model.get_topic_tree(hierarchical_topics) assert len(hierarchical_topics) > 0 assert len(tree) > 50 assert len(tree.split("\n")) <= 2 * len(set(topic_model.topics_)) assert merged_topics == set(topic_model.topics_).difference({-1}) @pytest.mark.parametrize( "model", [ ("kmeans_pca_topic_model"), ("custom_topic_model"), ("merged_topic_model"), ("reduced_topic_model"), ("online_topic_model"), ], ) def test_tree(model, documents, request): topic_model = copy.deepcopy(request.getfixturevalue(model)) linkage_function = lambda x: sch.linkage(x, "single", optimal_ordering=True) hierarchical_topics = topic_model.hierarchical_topics(documents, linkage_function=linkage_function) merged_topics = set([v for vals in hierarchical_topics.Topics.values for v in vals]) tree = topic_model.get_topic_tree(hierarchical_topics) assert len(hierarchical_topics) > 0 assert len(tree) > 50 assert len(tree.split("\n")) <= 2 * len(set(topic_model.topics_)) assert merged_topics == set(topic_model.topics_).difference({-1}) 
================================================ FILE: tests/test_vectorizers/__init__.py ================================================ ================================================ FILE: tests/test_vectorizers/test_ctfidf.py ================================================ import copy import pytest import numpy as np import pandas as pd from packaging import version from scipy.sparse import csr_matrix from sklearn import __version__ as sklearn_version from sklearn.feature_extraction.text import CountVectorizer from bertopic.vectorizers import ClassTfidfTransformer @pytest.mark.parametrize( "model", [ ("kmeans_pca_topic_model"), ("base_topic_model"), ("custom_topic_model"), ("merged_topic_model"), ("reduced_topic_model"), ("online_topic_model"), ], ) def test_ctfidf(model, documents, request): topic_model = copy.deepcopy(request.getfixturevalue(model)) topics = topic_model.topics_ documents = pd.DataFrame({"Document": documents, "ID": range(len(documents)), "Topic": topics}) documents_per_topic = documents.groupby(["Topic"], as_index=False).agg({"Document": " ".join}) documents = topic_model._preprocess_text(documents_per_topic.Document.values) count = topic_model.vectorizer_model.fit(documents) # Scikit-Learn Deprecation: get_feature_names is deprecated in 1.0 # and will be removed in 1.2. Please use get_feature_names_out instead. 
if version.parse(sklearn_version) >= version.parse("1.0.0"): words = count.get_feature_names_out() else: words = count.get_feature_names() X = count.transform(documents) transformer = ClassTfidfTransformer().fit(X) c_tf_idf = transformer.transform(X) assert len(words) > 1000 assert all([isinstance(x, str) for x in words]) assert isinstance(X, csr_matrix) assert isinstance(c_tf_idf, csr_matrix) assert X.shape[0] == len(set(topics)) assert X.shape[1] == len(words) assert c_tf_idf.shape[0] == len(set(topics)) assert c_tf_idf.shape[1] == len(words) assert np.min(X) == 0 @pytest.mark.parametrize( "model", [ ("kmeans_pca_topic_model"), ("base_topic_model"), ("custom_topic_model"), ("merged_topic_model"), ("reduced_topic_model"), ("online_topic_model"), ], ) def test_ctfidf_custom_cv(model, documents, request): cv = CountVectorizer(ngram_range=(1, 3), stop_words="english") topic_model = copy.deepcopy(request.getfixturevalue(model)) topic_model.vectorizer_model = cv topics = topic_model.topics_ documents = pd.DataFrame({"Document": documents, "ID": range(len(documents)), "Topic": topics}) documents_per_topic = documents.groupby(["Topic"], as_index=False).agg({"Document": " ".join}) documents = topic_model._preprocess_text(documents_per_topic.Document.values) count = topic_model.vectorizer_model.fit(documents) # Scikit-Learn Deprecation: get_feature_names is deprecated in 1.0 # and will be removed in 1.2. Please use get_feature_names_out instead. 
if version.parse(sklearn_version) >= version.parse("1.0.0"): words = count.get_feature_names_out() else: words = count.get_feature_names() X = count.transform(documents) transformer = ClassTfidfTransformer().fit(X) c_tf_idf = transformer.transform(X) assert len(words) > 1000 assert all([isinstance(x, str) for x in words]) assert isinstance(X, csr_matrix) assert isinstance(c_tf_idf, csr_matrix) assert X.shape[0] == len(set(topics)) assert X.shape[1] == len(words) assert c_tf_idf.shape[0] == len(set(topics)) assert c_tf_idf.shape[1] == len(words) assert np.min(X) == 0 ================================================ FILE: tests/test_vectorizers/test_online_cv.py ================================================ import copy import pytest from bertopic.vectorizers import OnlineCountVectorizer @pytest.mark.parametrize( "model", [ ("kmeans_pca_topic_model"), ("custom_topic_model"), ("merged_topic_model"), ("reduced_topic_model"), ("online_topic_model"), ], ) def test_online_cv(model, documents, request): topic_model = copy.deepcopy(request.getfixturevalue(model)) vectorizer_model = OnlineCountVectorizer(stop_words="english", ngram_range=(2, 2)) topics = [topic_model.get_topic(topic) for topic in set(topic_model.topics_)] topic_model.update_topics(documents, vectorizer_model=vectorizer_model) new_topics = [topic_model.get_topic(topic) for topic in set(topic_model.topics_)] for old_topic, new_topic in zip(topics, new_topics): if old_topic[0][0] != "": assert old_topic != new_topic @pytest.mark.parametrize("model", [("online_topic_model")]) def test_clean_bow(model, request): topic_model = copy.deepcopy(request.getfixturevalue(model)) original_shape = topic_model.vectorizer_model.X_.shape topic_model.vectorizer_model.delete_min_df = 2 topic_model.vectorizer_model._clean_bow() assert original_shape[0] == topic_model.vectorizer_model.X_.shape[0] assert original_shape[1] > topic_model.vectorizer_model.X_.shape[1]