Showing preview only (2,273K chars total). Download the full file or copy to clipboard to get everything.
Repository: facebookresearch/fastText
Branch: main
Commit: 1142dc4c4ecb
Files: 462
Total size: 2.1 MB
Directory structure:
gitextract_5y6fukma/
├── .gitignore
├── CMakeLists.txt
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── MANIFEST.in
├── Makefile
├── PACKAGE
├── README.md
├── alignment/
│ ├── README.md
│ ├── align.py
│ ├── eval.py
│ ├── example.sh
│ ├── unsup_align.py
│ ├── unsup_multialign.py
│ └── utils.py
├── classification-example.sh
├── classification-results.sh
├── crawl/
│ ├── README.md
│ ├── dedup.cc
│ ├── download_crawl.sh
│ ├── filter_dedup.sh
│ ├── filter_utf8.cc
│ └── process_wet_file.sh
├── docs/
│ ├── aligned-vectors.md
│ ├── api.md
│ ├── autotune.md
│ ├── cheatsheet.md
│ ├── crawl-vectors.md
│ ├── dataset.md
│ ├── english-vectors.md
│ ├── faqs.md
│ ├── language-identification.md
│ ├── options.md
│ ├── pretrained-vectors.md
│ ├── python-module.md
│ ├── references.md
│ ├── supervised-models.md
│ ├── supervised-tutorial.md
│ ├── support.md
│ ├── unsupervised-tutorials.md
│ └── webassembly-module.md
├── download_model.py
├── eval.py
├── fasttext.pc.in
├── get-wikimedia.sh
├── pyproject.toml
├── python/
│ ├── README.md
│ ├── README.rst
│ ├── benchmarks/
│ │ ├── README.rst
│ │ └── get_word_vector.py
│ └── doc/
│ └── examples/
│ ├── FastTextEmbeddingBag.py
│ ├── bin_to_vec.py
│ ├── compute_accuracy.py
│ ├── get_vocab.py
│ ├── train_supervised.py
│ └── train_unsupervised.py
├── quantization-example.sh
├── reduce_model.py
├── runtests.py
├── scripts/
│ ├── kbcompletion/
│ │ ├── README.md
│ │ ├── data.sh
│ │ ├── eval.cpp
│ │ ├── fb15k.sh
│ │ ├── fb15k237.sh
│ │ ├── svo.sh
│ │ └── wn18.sh
│ └── quantization/
│ └── quantization-results.sh
├── setup.cfg
├── setup.py
├── src/
│ ├── aligned.h
│ ├── args.cc
│ ├── args.h
│ ├── autotune.cc
│ ├── autotune.h
│ ├── densematrix.cc
│ ├── densematrix.h
│ ├── dictionary.cc
│ ├── dictionary.h
│ ├── fasttext.cc
│ ├── fasttext.h
│ ├── loss.cc
│ ├── loss.h
│ ├── main.cc
│ ├── matrix.cc
│ ├── matrix.h
│ ├── meter.cc
│ ├── meter.h
│ ├── model.cc
│ ├── model.h
│ ├── productquantizer.cc
│ ├── productquantizer.h
│ ├── quantmatrix.cc
│ ├── quantmatrix.h
│ ├── real.h
│ ├── utils.cc
│ ├── utils.h
│ ├── vector.cc
│ └── vector.h
├── tests/
│ └── fetch_test_data.sh
├── webassembly/
│ ├── README.md
│ ├── doc/
│ │ └── examples/
│ │ ├── misc.html
│ │ ├── predict.html
│ │ ├── train_supervised.html
│ │ └── train_unsupervised.html
│ ├── fasttext.js
│ └── fasttext_wasm.cc
├── website/
│ ├── README.md
│ ├── blog/
│ │ ├── 2016-08-18-blog-post.md
│ │ ├── 2017-05-02-blog-post.md
│ │ ├── 2017-10-02-blog-post.md
│ │ └── 2019-06-25-blog-post.md
│ ├── core/
│ │ └── Footer.js
│ ├── package.json
│ ├── pages/
│ │ └── en/
│ │ └── index.js
│ ├── sidebars.json
│ ├── siteConfig.js
│ └── static/
│ ├── docs/
│ │ └── en/
│ │ └── html/
│ │ ├── .classfasttext_1_1QMatrix-members.html.i4eKqy
│ │ ├── annotated.html
│ │ ├── annotated_dup.js
│ │ ├── args_8cc.html
│ │ ├── args_8h.html
│ │ ├── args_8h.js
│ │ ├── args_8h_source.html
│ │ ├── classes.html
│ │ ├── classfasttext_1_1Args-members.html
│ │ ├── classfasttext_1_1Args.html
│ │ ├── classfasttext_1_1Args.js
│ │ ├── classfasttext_1_1Dictionary-members.html
│ │ ├── classfasttext_1_1Dictionary.html
│ │ ├── classfasttext_1_1Dictionary.js
│ │ ├── classfasttext_1_1FastText-members.html
│ │ ├── classfasttext_1_1FastText.html
│ │ ├── classfasttext_1_1FastText.js
│ │ ├── classfasttext_1_1Matrix-members.html
│ │ ├── classfasttext_1_1Matrix.html
│ │ ├── classfasttext_1_1Matrix.js
│ │ ├── classfasttext_1_1Model-members.html
│ │ ├── classfasttext_1_1Model.html
│ │ ├── classfasttext_1_1Model.js
│ │ ├── classfasttext_1_1ProductQuantizer-members.html
│ │ ├── classfasttext_1_1ProductQuantizer.html
│ │ ├── classfasttext_1_1ProductQuantizer.js
│ │ ├── classfasttext_1_1QMatrix-members.html
│ │ ├── classfasttext_1_1QMatrix.html
│ │ ├── classfasttext_1_1QMatrix.js
│ │ ├── classfasttext_1_1Vector-members.html
│ │ ├── classfasttext_1_1Vector.html
│ │ ├── classfasttext_1_1Vector.js
│ │ ├── dictionary_8cc.html
│ │ ├── dictionary_8h.html
│ │ ├── dictionary_8h.js
│ │ ├── dictionary_8h_source.html
│ │ ├── dir_68267d1309a1af8e8297ef4c3efbcdba.html
│ │ ├── dir_68267d1309a1af8e8297ef4c3efbcdba.js
│ │ ├── doxygen.css
│ │ ├── dynsections.js
│ │ ├── fasttext_8cc.html
│ │ ├── fasttext_8h.html
│ │ ├── fasttext_8h.js
│ │ ├── fasttext_8h_source.html
│ │ ├── files.html
│ │ ├── files.js
│ │ ├── functions.html
│ │ ├── functions_0x7e.html
│ │ ├── functions_b.html
│ │ ├── functions_c.html
│ │ ├── functions_d.html
│ │ ├── functions_dup.js
│ │ ├── functions_e.html
│ │ ├── functions_f.html
│ │ ├── functions_func.html
│ │ ├── functions_g.html
│ │ ├── functions_h.html
│ │ ├── functions_i.html
│ │ ├── functions_k.html
│ │ ├── functions_l.html
│ │ ├── functions_m.html
│ │ ├── functions_n.html
│ │ ├── functions_o.html
│ │ ├── functions_p.html
│ │ ├── functions_q.html
│ │ ├── functions_r.html
│ │ ├── functions_s.html
│ │ ├── functions_t.html
│ │ ├── functions_u.html
│ │ ├── functions_v.html
│ │ ├── functions_vars.html
│ │ ├── functions_w.html
│ │ ├── functions_z.html
│ │ ├── globals.html
│ │ ├── globals_defs.html
│ │ ├── globals_func.html
│ │ ├── index.html
│ │ ├── jquery.js
│ │ ├── main_8cc.html
│ │ ├── main_8cc.js
│ │ ├── matrix_8cc.html
│ │ ├── matrix_8h.html
│ │ ├── matrix_8h_source.html
│ │ ├── menu.js
│ │ ├── menudata.js
│ │ ├── model_8cc.html
│ │ ├── model_8h.html
│ │ ├── model_8h.js
│ │ ├── model_8h_source.html
│ │ ├── namespacefasttext.html
│ │ ├── namespacefasttext.js
│ │ ├── namespacefasttext_1_1utils.html
│ │ ├── namespacemembers.html
│ │ ├── namespacemembers_enum.html
│ │ ├── namespacemembers_func.html
│ │ ├── namespacemembers_type.html
│ │ ├── namespaces.html
│ │ ├── namespaces.js
│ │ ├── navtree.css
│ │ ├── navtree.js
│ │ ├── navtreedata.js
│ │ ├── navtreeindex0.js
│ │ ├── navtreeindex1.js
│ │ ├── productquantizer_8cc.html
│ │ ├── productquantizer_8cc.js
│ │ ├── productquantizer_8h.html
│ │ ├── productquantizer_8h_source.html
│ │ ├── qmatrix_8cc.html
│ │ ├── qmatrix_8h.html
│ │ ├── qmatrix_8h_source.html
│ │ ├── real_8h.html
│ │ ├── real_8h.js
│ │ ├── real_8h_source.html
│ │ ├── resize.js
│ │ ├── search/
│ │ │ ├── .files_7.html.StRRNc
│ │ │ ├── .variables_a.html.1MGQ27
│ │ │ ├── all_0.html
│ │ │ ├── all_0.js
│ │ │ ├── all_1.html
│ │ │ ├── all_1.js
│ │ │ ├── all_10.html
│ │ │ ├── all_10.js
│ │ │ ├── all_11.html
│ │ │ ├── all_11.js
│ │ │ ├── all_12.html
│ │ │ ├── all_12.js
│ │ │ ├── all_13.html
│ │ │ ├── all_13.js
│ │ │ ├── all_14.html
│ │ │ ├── all_14.js
│ │ │ ├── all_15.html
│ │ │ ├── all_15.js
│ │ │ ├── all_16.html
│ │ │ ├── all_16.js
│ │ │ ├── all_17.html
│ │ │ ├── all_17.js
│ │ │ ├── all_2.html
│ │ │ ├── all_2.js
│ │ │ ├── all_3.html
│ │ │ ├── all_3.js
│ │ │ ├── all_4.html
│ │ │ ├── all_4.js
│ │ │ ├── all_5.html
│ │ │ ├── all_5.js
│ │ │ ├── all_6.html
│ │ │ ├── all_6.js
│ │ │ ├── all_7.html
│ │ │ ├── all_7.js
│ │ │ ├── all_8.html
│ │ │ ├── all_8.js
│ │ │ ├── all_9.html
│ │ │ ├── all_9.js
│ │ │ ├── all_a.html
│ │ │ ├── all_a.js
│ │ │ ├── all_b.html
│ │ │ ├── all_b.js
│ │ │ ├── all_c.html
│ │ │ ├── all_c.js
│ │ │ ├── all_d.html
│ │ │ ├── all_d.js
│ │ │ ├── all_e.html
│ │ │ ├── all_e.js
│ │ │ ├── all_f.html
│ │ │ ├── all_f.js
│ │ │ ├── classes_0.html
│ │ │ ├── classes_0.js
│ │ │ ├── classes_1.html
│ │ │ ├── classes_1.js
│ │ │ ├── classes_2.html
│ │ │ ├── classes_2.js
│ │ │ ├── classes_3.html
│ │ │ ├── classes_3.js
│ │ │ ├── classes_4.html
│ │ │ ├── classes_4.js
│ │ │ ├── classes_5.html
│ │ │ ├── classes_5.js
│ │ │ ├── classes_6.html
│ │ │ ├── classes_6.js
│ │ │ ├── classes_7.html
│ │ │ ├── classes_7.js
│ │ │ ├── classes_8.html
│ │ │ ├── classes_8.js
│ │ │ ├── defines_0.html
│ │ │ ├── defines_0.js
│ │ │ ├── defines_1.html
│ │ │ ├── defines_1.js
│ │ │ ├── defines_2.html
│ │ │ ├── defines_2.js
│ │ │ ├── defines_3.html
│ │ │ ├── defines_3.js
│ │ │ ├── enums_0.html
│ │ │ ├── enums_0.js
│ │ │ ├── enums_1.html
│ │ │ ├── enums_1.js
│ │ │ ├── enums_2.html
│ │ │ ├── enums_2.js
│ │ │ ├── enumvalues_0.html
│ │ │ ├── enumvalues_0.js
│ │ │ ├── enumvalues_1.html
│ │ │ ├── enumvalues_1.js
│ │ │ ├── enumvalues_2.html
│ │ │ ├── enumvalues_2.js
│ │ │ ├── enumvalues_3.html
│ │ │ ├── enumvalues_3.js
│ │ │ ├── enumvalues_4.html
│ │ │ ├── enumvalues_4.js
│ │ │ ├── enumvalues_5.html
│ │ │ ├── enumvalues_5.js
│ │ │ ├── files_0.html
│ │ │ ├── files_0.js
│ │ │ ├── files_1.html
│ │ │ ├── files_1.js
│ │ │ ├── files_2.html
│ │ │ ├── files_2.js
│ │ │ ├── files_3.html
│ │ │ ├── files_3.js
│ │ │ ├── files_4.html
│ │ │ ├── files_4.js
│ │ │ ├── files_5.html
│ │ │ ├── files_5.js
│ │ │ ├── files_6.html
│ │ │ ├── files_6.js
│ │ │ ├── files_7.html
│ │ │ ├── files_7.js
│ │ │ ├── files_8.html
│ │ │ ├── files_8.js
│ │ │ ├── functions_0.html
│ │ │ ├── functions_0.js
│ │ │ ├── functions_1.html
│ │ │ ├── functions_1.js
│ │ │ ├── functions_10.html
│ │ │ ├── functions_10.js
│ │ │ ├── functions_11.html
│ │ │ ├── functions_11.js
│ │ │ ├── functions_12.html
│ │ │ ├── functions_12.js
│ │ │ ├── functions_13.html
│ │ │ ├── functions_13.js
│ │ │ ├── functions_14.html
│ │ │ ├── functions_14.js
│ │ │ ├── functions_15.html
│ │ │ ├── functions_15.js
│ │ │ ├── functions_16.html
│ │ │ ├── functions_16.js
│ │ │ ├── functions_17.html
│ │ │ ├── functions_17.js
│ │ │ ├── functions_2.html
│ │ │ ├── functions_2.js
│ │ │ ├── functions_3.html
│ │ │ ├── functions_3.js
│ │ │ ├── functions_4.html
│ │ │ ├── functions_4.js
│ │ │ ├── functions_5.html
│ │ │ ├── functions_5.js
│ │ │ ├── functions_6.html
│ │ │ ├── functions_6.js
│ │ │ ├── functions_7.html
│ │ │ ├── functions_7.js
│ │ │ ├── functions_8.html
│ │ │ ├── functions_8.js
│ │ │ ├── functions_9.html
│ │ │ ├── functions_9.js
│ │ │ ├── functions_a.html
│ │ │ ├── functions_a.js
│ │ │ ├── functions_b.html
│ │ │ ├── functions_b.js
│ │ │ ├── functions_c.html
│ │ │ ├── functions_c.js
│ │ │ ├── functions_d.html
│ │ │ ├── functions_d.js
│ │ │ ├── functions_e.html
│ │ │ ├── functions_e.js
│ │ │ ├── functions_f.html
│ │ │ ├── functions_f.js
│ │ │ ├── namespaces_0.html
│ │ │ ├── namespaces_0.js
│ │ │ ├── nomatches.html
│ │ │ ├── search.css
│ │ │ ├── search.js
│ │ │ ├── searchdata.js
│ │ │ ├── typedefs_0.html
│ │ │ ├── typedefs_0.js
│ │ │ ├── typedefs_1.html
│ │ │ ├── typedefs_1.js
│ │ │ ├── variables_0.html
│ │ │ ├── variables_0.js
│ │ │ ├── variables_1.html
│ │ │ ├── variables_1.js
│ │ │ ├── variables_10.html
│ │ │ ├── variables_10.js
│ │ │ ├── variables_11.html
│ │ │ ├── variables_11.js
│ │ │ ├── variables_12.html
│ │ │ ├── variables_12.js
│ │ │ ├── variables_13.html
│ │ │ ├── variables_13.js
│ │ │ ├── variables_2.html
│ │ │ ├── variables_2.js
│ │ │ ├── variables_3.html
│ │ │ ├── variables_3.js
│ │ │ ├── variables_4.html
│ │ │ ├── variables_4.js
│ │ │ ├── variables_5.html
│ │ │ ├── variables_5.js
│ │ │ ├── variables_6.html
│ │ │ ├── variables_6.js
│ │ │ ├── variables_7.html
│ │ │ ├── variables_7.js
│ │ │ ├── variables_8.html
│ │ │ ├── variables_8.js
│ │ │ ├── variables_9.html
│ │ │ ├── variables_9.js
│ │ │ ├── variables_a.html
│ │ │ ├── variables_a.js
│ │ │ ├── variables_b.html
│ │ │ ├── variables_b.js
│ │ │ ├── variables_c.html
│ │ │ ├── variables_c.js
│ │ │ ├── variables_d.html
│ │ │ ├── variables_d.js
│ │ │ ├── variables_e.html
│ │ │ ├── variables_e.js
│ │ │ ├── variables_f.html
│ │ │ └── variables_f.js
│ │ ├── structfasttext_1_1Node-members.html
│ │ ├── structfasttext_1_1Node.html
│ │ ├── structfasttext_1_1Node.js
│ │ ├── structfasttext_1_1entry-members.html
│ │ ├── structfasttext_1_1entry.html
│ │ ├── structfasttext_1_1entry.js
│ │ ├── tabs.css
│ │ ├── utils_8cc.html
│ │ ├── utils_8cc.js
│ │ ├── utils_8h.html
│ │ ├── utils_8h.js
│ │ ├── utils_8h_source.html
│ │ ├── vector_8cc.html
│ │ ├── vector_8cc.js
│ │ ├── vector_8h.html
│ │ ├── vector_8h.js
│ │ └── vector_8h_source.html
│ ├── fasttext.css
│ └── tabber.js
├── wikifil.pl
└── word-vector-example.sh
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitignore
================================================
# Hidden swap files (presumably left behind by editors such as Vim).
.*.swp
# Object files written by the Makefile's compile rules.
*.o
# Trained model outputs (the README describes `model.bin` and `model.vec`).
*.bin
*.vec
# Emscripten outputs written by the Makefile's *.bc rules.
*.bc
# macOS Finder metadata.
.DS_Store
# `fasttext` is the binary linked by the Makefile; `data` and `result` are
# presumably working directories of the example scripts — confirm.
data
fasttext
result
# Node.js artifacts (website tooling).
website/node_modules/
package-lock.json
node_modules/
================================================
FILE: CMakeLists.txt
================================================
#
# Copyright (c) 2016-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
# NOTE(review): CMAKE_CXX_STANDARD (used below) is presumably not understood by
# CMake releases as old as 2.8.9 — confirm whether the minimum should be raised.
cmake_minimum_required(VERSION 2.8.9)
project(fasttext)

# Request C++17; the same standard is also passed explicitly in CMAKE_CXX_FLAGS.
set(CMAKE_CXX_STANDARD 17)

# The version number.
set (fasttext_VERSION_MAJOR 0)
set (fasttext_VERSION_MINOR 1)

# NOTE(review): the repository tree shows no top-level `fasttext/` directory;
# confirm whether this include path is still needed.
include_directories(fasttext)

# Project-wide compiler flags.
# Flags supplied by the user or a packager (CXXFLAGS environment variable or
# -DCMAKE_CXX_FLAGS=...) are kept and placed after the project defaults, so
# they can extend or override them instead of being silently discarded.
set(CMAKE_CXX_FLAGS "-pthread -std=c++17 -funroll-loops -O3 -march=native ${CMAKE_CXX_FLAGS}")
# Public headers of the library. Further down they are attached to the
# fasttext-bin target as PUBLIC_HEADER, and that target's install rule copies
# them to include/fasttext.
# NOTE(review): src/aligned.h appears in the repository tree but not in this
# list — confirm whether one of the installed headers depends on it.
set(HEADER_FILES
src/args.h
src/autotune.h
src/densematrix.h
src/dictionary.h
src/fasttext.h
src/loss.h
src/matrix.h
src/meter.h
src/model.h
src/productquantizer.h
src/quantmatrix.h
src/real.h
src/utils.h
src/vector.h)
# Translation units compiled into each of the three library targets
# (fasttext-shared, fasttext-static, fasttext-static_pic).
# NOTE(review): src/main.cc is part of this list and is also the only source of
# the fasttext-bin executable — confirm the libraries are meant to contain it.
set(SOURCE_FILES
src/args.cc
src/autotune.cc
src/densematrix.cc
src/dictionary.cc
src/fasttext.cc
src/loss.cc
src/main.cc
src/matrix.cc
src/meter.cc
src/model.cc
src/productquantizer.cc
src/quantmatrix.cc
src/utils.cc
src/vector.cc)
# pkg-config support (skipped when building with MSVC): generate fasttext.pc
# from the fasttext.pc.in template and install it under
# ${CMAKE_INSTALL_LIBDIR}/pkgconfig (variable provided by GNUInstallDirs).
if (NOT MSVC)
include(GNUInstallDirs)
configure_file("fasttext.pc.in" "fasttext.pc" @ONLY)
install(FILES "${CMAKE_BINARY_DIR}/fasttext.pc" DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
endif()
# Three flavours of the library are built from the same sources:
# shared, static, and static with position-independent code.
add_library(fasttext-shared SHARED ${SOURCE_FILES} ${HEADER_FILES})
add_library(fasttext-static STATIC ${SOURCE_FILES} ${HEADER_FILES})
add_library(fasttext-static_pic STATIC ${SOURCE_FILES} ${HEADER_FILES})
# The shared and plain static libraries are both output as "fasttext"; the
# shared one carries the major version as its SOVERSION. The PIC variant is
# output as "fasttext_pic".
set_target_properties(fasttext-shared PROPERTIES OUTPUT_NAME fasttext
SOVERSION "${fasttext_VERSION_MAJOR}")
set_target_properties(fasttext-static PROPERTIES OUTPUT_NAME fasttext)
set_target_properties(fasttext-static_pic PROPERTIES OUTPUT_NAME fasttext_pic
POSITION_INDEPENDENT_CODE True)
# Command-line executable: built from src/main.cc, linked against pthread and
# the static library, and output as "fasttext". The public headers are attached
# to this target so that its install rule below also installs them.
add_executable(fasttext-bin src/main.cc)
target_link_libraries(fasttext-bin pthread fasttext-static)
set_target_properties(fasttext-bin PROPERTIES PUBLIC_HEADER "${HEADER_FILES}" OUTPUT_NAME fasttext)
# Install rules: libraries go to lib, the executable to bin, and the public
# headers to include/fasttext.
install (TARGETS fasttext-shared
LIBRARY DESTINATION lib)
install (TARGETS fasttext-static
ARCHIVE DESTINATION lib)
install (TARGETS fasttext-static_pic
ARCHIVE DESTINATION lib)
install (TARGETS fasttext-bin
RUNTIME DESTINATION bin
PUBLIC_HEADER DESTINATION include/fasttext)
================================================
FILE: CODE_OF_CONDUCT.md
================================================
# Code of Conduct
## Our Pledge
In the interest of fostering an open and welcoming environment, we as
contributors and maintainers pledge to make participation in our project and
our community a harassment-free experience for everyone, regardless of age, body
size, disability, ethnicity, sex characteristics, gender identity and expression,
level of experience, education, socio-economic status, nationality, personal
appearance, race, religion, or sexual identity and orientation.
## Our Standards
Examples of behavior that contributes to creating a positive environment
include:
* Using welcoming and inclusive language
* Being respectful of differing viewpoints and experiences
* Gracefully accepting constructive criticism
* Focusing on what is best for the community
* Showing empathy towards other community members
Examples of unacceptable behavior by participants include:
* The use of sexualized language or imagery and unwelcome sexual attention or
advances
* Trolling, insulting/derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or electronic
address, without explicit permission
* Other conduct which could reasonably be considered inappropriate in a
professional setting
## Our Responsibilities
Project maintainers are responsible for clarifying the standards of acceptable
behavior and are expected to take appropriate and fair corrective action in
response to any instances of unacceptable behavior.
Project maintainers have the right and responsibility to remove, edit, or
reject comments, commits, code, wiki edits, issues, and other contributions
that are not aligned to this Code of Conduct, or to ban temporarily or
permanently any contributor for other behaviors that they deem inappropriate,
threatening, offensive, or harmful.
## Scope
This Code of Conduct applies within all project spaces, and it also applies when
an individual is representing the project or its community in public spaces.
Examples of representing a project or community include using an official
project e-mail address, posting via an official social media account, or acting
as an appointed representative at an online or offline event. Representation of
a project may be further defined and clarified by project maintainers.
## Enforcement
Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported by contacting the project team at <opensource-conduct@fb.com>. All
complaints will be reviewed and investigated and will result in a response that
is deemed necessary and appropriate to the circumstances. The project team is
obligated to maintain confidentiality with regard to the reporter of an incident.
Further details of specific enforcement policies may be posted separately.
Project maintainers who do not follow or enforce the Code of Conduct in good
faith may face temporary or permanent repercussions as determined by other
members of the project's leadership.
## Attribution
This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
[homepage]: https://www.contributor-covenant.org
For answers to common questions about this code of conduct, see
https://www.contributor-covenant.org/faq
================================================
FILE: CONTRIBUTING.md
================================================
# Contributing to fastText
We want to make contributing to this project as easy and transparent as possible.
## Issues
We use GitHub issues to track public bugs. Please ensure your description is clear and has sufficient instructions to be able to reproduce the issue.
### Reproducing issues
Please make sure that the issue you mention is not a result of one of the existing third-party libraries. For example, please do not post an issue if you encountered an error within a third-party Python library. We can only help you with errors which can be directly reproduced either with our C++ code or the corresponding Python bindings. If you do find an error, please post detailed steps to reproduce it. If we can't reproduce your error, we can't help you fix it.
## Pull Requests
Please post an Issue before submitting a pull request. This might save you some time, as it is possible we can't support your contribution, although we try our best to accommodate your (planned) work and highly appreciate your time. Generally, it is best to have a pull request emerge from an issue rather than the other way around.
To create a pull request:
1. Fork the repo and create your branch from `master`.
2. If you've added code that should be tested, add tests.
3. If you've changed APIs, update the documentation.
4. Ensure the test suite passes.
5. Make sure your code lints.
6. If you haven't already, complete the Contributor License Agreement ("CLA").
## Tests
First, you will need to make sure you have the required data. For that, please have a look at the fetch_test_data.sh script under tests. Next run the tests using the runtests.py script passing a path to the directory containing the datasets.
## Contributor License Agreement ("CLA")
In order to accept your pull request, we need you to submit a CLA. You only need
to do this once to work on any of Facebook's open source projects.
Complete your CLA here: <https://code.facebook.com/cla>
## License
By contributing to fastText, you agree that your contributions will be licensed under its MIT license.
================================================
FILE: LICENSE
================================================
MIT License
Copyright (c) 2016-present, Facebook, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
================================================
FILE: MANIFEST.in
================================================
# Extra files to ship in the source distribution.
include LICENSE
# The former `include PATENTS` directive was removed: the repository contains
# no PATENTS file, so the directive matched nothing.
recursive-include python *.md *.rst
# C++ headers from src/ (presumably required to build the extension from an
# sdist — confirm against setup.py).
recursive-include src *.h
================================================
FILE: Makefile
================================================
#
# Copyright (c) 2016-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
# Compiler and baseline flags shared by every native build mode.
CXX = c++
CXXFLAGS = -pthread -std=c++17 -march=native
# Object files linked (together with src/main.cc) into the `fasttext` binary.
OBJS = args.o autotune.o matrix.o dictionary.o loss.o productquantizer.o densematrix.o quantmatrix.o vector.o model.o utils.o meter.o fasttext.o
# NOTE(review): INCLUDES is defined here, but no rule in this Makefile
# references it — confirm whether the compile commands should use it.
INCLUDES = -I.

# These goals do not name files produced by their rules. Declaring them phony
# keeps them working even if a file or directory called e.g. `clean` or
# `debug` exists in the tree.
.PHONY: opt coverage debug wasm wasmdebug clean

# Default goal (first regular target in the file): optimized build.
opt: CXXFLAGS += -O3 -funroll-loops -DNDEBUG
opt: fasttext

# Unoptimized build with coverage instrumentation.
coverage: CXXFLAGS += -O0 -fno-inline -fprofile-arcs --coverage
coverage: fasttext

# Unoptimized build with debug symbols.
debug: CXXFLAGS += -g -O0 -fno-inline
debug: fasttext

# WebAssembly build; `wasmdebug` is the same build with EMCC_DEBUG=1 exported.
wasm: webassembly/fasttext_wasm.js

wasmdebug: export EMCC_DEBUG=1
wasmdebug: webassembly/fasttext_wasm.js
# Native object rules: each one compiles a single source file from src/ with
# $(CXX) $(CXXFLAGS) -c. No -o is given, so the object file is written to the
# current directory under the source file's base name.
# NOTE(review): most rules list only a few headers as prerequisites, while
# fasttext.o depends on every header in src/ — confirm the lists are complete.
args.o: src/args.cc src/args.h
$(CXX) $(CXXFLAGS) -c src/args.cc
autotune.o: src/autotune.cc src/autotune.h
$(CXX) $(CXXFLAGS) -c src/autotune.cc
matrix.o: src/matrix.cc src/matrix.h
$(CXX) $(CXXFLAGS) -c src/matrix.cc
dictionary.o: src/dictionary.cc src/dictionary.h src/args.h
$(CXX) $(CXXFLAGS) -c src/dictionary.cc
loss.o: src/loss.cc src/loss.h src/matrix.h src/real.h
$(CXX) $(CXXFLAGS) -c src/loss.cc
productquantizer.o: src/productquantizer.cc src/productquantizer.h src/utils.h
$(CXX) $(CXXFLAGS) -c src/productquantizer.cc
densematrix.o: src/densematrix.cc src/densematrix.h src/utils.h src/matrix.h
$(CXX) $(CXXFLAGS) -c src/densematrix.cc
quantmatrix.o: src/quantmatrix.cc src/quantmatrix.h src/utils.h src/matrix.h
$(CXX) $(CXXFLAGS) -c src/quantmatrix.cc
vector.o: src/vector.cc src/vector.h src/utils.h
$(CXX) $(CXXFLAGS) -c src/vector.cc
model.o: src/model.cc src/model.h src/args.h
$(CXX) $(CXXFLAGS) -c src/model.cc
utils.o: src/utils.cc src/utils.h
$(CXX) $(CXXFLAGS) -c src/utils.cc
meter.o: src/meter.cc src/meter.h
$(CXX) $(CXXFLAGS) -c src/meter.cc
fasttext.o: src/fasttext.cc src/*.h
$(CXX) $(CXXFLAGS) -c src/fasttext.cc
# Final link: compiles src/main.cc and links it with all object files into the
# `fasttext` executable in the current directory.
fasttext: $(OBJS) src/fasttext.cc src/main.cc
$(CXX) $(CXXFLAGS) $(OBJS) src/main.cc -o fasttext
# Removes object files, *.gcno/*.gcda files (presumably produced by the
# coverage build), the `fasttext` binary, emscripten *.bc files and the
# generated WebAssembly outputs.
clean:
rm -rf *.o *.gcno *.gcda fasttext *.bc webassembly/fasttext_wasm.js webassembly/fasttext_wasm.wasm
# WebAssembly build via emscripten.
# EMCXX is the emscripten C++ driver; EMCXXFLAGS holds the flags passed to every
# emscripten invocation below, including the src/ include path.
# NOTE(review): these flags select --std=c++11, whereas the native build uses
# -std=c++17 (CXXFLAGS) — confirm the sources still compile under C++11.
EMCXX = em++
EMCXXFLAGS = --bind --std=c++11 -s WASM=1 -s ALLOW_MEMORY_GROWTH=1 -s "EXTRA_EXPORTED_RUNTIME_METHODS=['addOnPostRun', 'FS']" -s "DISABLE_EXCEPTION_CATCHING=0" -s "EXCEPTION_DEBUG=1" -s "FORCE_FILESYSTEM=1" -s "MODULARIZE=1" -s "EXPORT_ES6=1" -s 'EXPORT_NAME="FastTextModule"' -Isrc/
# Emscripten outputs combined into webassembly/fasttext_wasm.js.
EMOBJS = args.bc autotune.bc matrix.bc dictionary.bc loss.bc productquantizer.bc densematrix.bc quantmatrix.bc vector.bc model.bc utils.bc meter.bc fasttext.bc main.bc
# main.bc is built from the WebAssembly binding source rather than src/main.cc.
main.bc: webassembly/fasttext_wasm.cc
$(EMCXX) $(EMCXXFLAGS) webassembly/fasttext_wasm.cc -o main.bc
# One rule per library source file, mirroring the native *.o rules.
args.bc: src/args.cc src/args.h
$(EMCXX) $(EMCXXFLAGS) src/args.cc -o args.bc
autotune.bc: src/autotune.cc src/autotune.h
$(EMCXX) $(EMCXXFLAGS) src/autotune.cc -o autotune.bc
matrix.bc: src/matrix.cc src/matrix.h
$(EMCXX) $(EMCXXFLAGS) src/matrix.cc -o matrix.bc
dictionary.bc: src/dictionary.cc src/dictionary.h src/args.h
$(EMCXX) $(EMCXXFLAGS) src/dictionary.cc -o dictionary.bc
loss.bc: src/loss.cc src/loss.h src/matrix.h src/real.h
$(EMCXX) $(EMCXXFLAGS) src/loss.cc -o loss.bc
productquantizer.bc: src/productquantizer.cc src/productquantizer.h src/utils.h
$(EMCXX) $(EMCXXFLAGS) src/productquantizer.cc -o productquantizer.bc
densematrix.bc: src/densematrix.cc src/densematrix.h src/utils.h src/matrix.h
$(EMCXX) $(EMCXXFLAGS) src/densematrix.cc -o densematrix.bc
quantmatrix.bc: src/quantmatrix.cc src/quantmatrix.h src/utils.h src/matrix.h
$(EMCXX) $(EMCXXFLAGS) src/quantmatrix.cc -o quantmatrix.bc
vector.bc: src/vector.cc src/vector.h src/utils.h
$(EMCXX) $(EMCXXFLAGS) src/vector.cc -o vector.bc
model.bc: src/model.cc src/model.h src/args.h
$(EMCXX) $(EMCXXFLAGS) src/model.cc -o model.bc
utils.bc: src/utils.cc src/utils.h
$(EMCXX) $(EMCXXFLAGS) src/utils.cc -o utils.bc
meter.bc: src/meter.cc src/meter.h
$(EMCXX) $(EMCXXFLAGS) src/meter.cc -o meter.bc
fasttext.bc: src/fasttext.cc src/*.h
$(EMCXX) $(EMCXXFLAGS) src/fasttext.cc -o fasttext.bc
# Final step: combine every *.bc file into webassembly/fasttext_wasm.js.
# The Makefile itself is a prerequisite, so editing it triggers this step again.
webassembly/fasttext_wasm.js: $(EMOBJS) webassembly/fasttext_wasm.cc Makefile
$(EMCXX) $(EMCXXFLAGS) $(EMOBJS) -o webassembly/fasttext_wasm.js
================================================
FILE: PACKAGE
================================================
# Loads the package_local_utils helper from the fbcode build macros and calls
# set_clang_version(15, True) on it for this package.
# NOTE(review): the meaning of the second argument (True) is not visible from
# this file — check the helper's definition.
load("@fbcode_macros//build_defs:package_local_utils.bzl", "package_local_utils")
package_local_utils.set_clang_version(15, True)
================================================
FILE: README.md
================================================
# fastText
[fastText](https://fasttext.cc/) is a library for efficient learning of word representations and sentence classification.
[![CircleCI](https://circleci.com/gh/facebookresearch/fastText/tree/master.svg?style=svg)](https://circleci.com/gh/facebookresearch/fastText/tree/master)
## Table of contents
* [Resources](#resources)
* [Models](#models)
* [Supplementary data](#supplementary-data)
* [FAQ](#faq)
* [Cheatsheet](#cheatsheet)
* [Requirements](#requirements)
* [Building fastText](#building-fasttext)
* [Getting the source code](#getting-the-source-code)
* [Building fastText using make (preferred)](#building-fasttext-using-make-preferred)
* [Building fastText using cmake](#building-fasttext-using-cmake)
* [Building fastText for Python](#building-fasttext-for-python)
* [Example use cases](#example-use-cases)
* [Word representation learning](#word-representation-learning)
* [Obtaining word vectors for out-of-vocabulary words](#obtaining-word-vectors-for-out-of-vocabulary-words)
* [Text classification](#text-classification)
* [Full documentation](#full-documentation)
* [References](#references)
* [Enriching Word Vectors with Subword Information](#enriching-word-vectors-with-subword-information)
* [Bag of Tricks for Efficient Text Classification](#bag-of-tricks-for-efficient-text-classification)
* [FastText.zip: Compressing text classification models](#fasttextzip-compressing-text-classification-models)
* [Join the fastText community](#join-the-fasttext-community)
* [License](#license)
## Resources
### Models
- Recent state-of-the-art [English word vectors](https://fasttext.cc/docs/en/english-vectors.html).
- Word vectors for [157 languages trained on Wikipedia and Crawl](https://github.com/facebookresearch/fastText/blob/master/docs/crawl-vectors.md).
- Models for [language identification](https://fasttext.cc/docs/en/language-identification.html#content) and [various supervised tasks](https://fasttext.cc/docs/en/supervised-models.html#content).
### Supplementary data
- The preprocessed [YFCC100M data](https://fasttext.cc/docs/en/dataset.html#content) used in [2].
### FAQ
You can find [answers to frequently asked questions](https://fasttext.cc/docs/en/faqs.html#content) on our [website](https://fasttext.cc/).
### Cheatsheet
We also provide a [cheatsheet](https://fasttext.cc/docs/en/cheatsheet.html#content) full of useful one-liners.
## Requirements
We are continuously building and testing our library, CLI and Python bindings under various docker images using [circleci](https://circleci.com/).
Generally, **fastText** builds on modern Mac OS and Linux distributions.
Since it is compiled as C++17 (both the Makefile and CMakeLists.txt pass `-std=c++17`), it requires a compiler with good C++17 support.
These include:
* (g++-7 or newer) or (clang-5 or newer)
Compilation is carried out using a Makefile, so you will need to have a working **make**.
If you want to use **cmake** you need at least version 2.8.9.
One of the oldest distributions we successfully built and tested the CLI under is [Debian jessie](https://www.debian.org/releases/jessie/).
For the word-similarity evaluation script you will need:
* Python 2.6 or newer
* NumPy & SciPy
For the python bindings (see the subdirectory python) you will need:
* Python version 2.7 or >=3.4
* NumPy & SciPy
* [pybind11](https://github.com/pybind/pybind11)
One of the oldest distributions we successfully built and tested the Python bindings under is [Debian jessie](https://www.debian.org/releases/jessie/).
If these requirements make it impossible for you to use fastText, please open an issue and we will try to accommodate you.
## Building fastText
We discuss building the latest stable version of fastText.
### Getting the source code
You can find our [latest stable release](https://github.com/facebookresearch/fastText/releases/latest) in the usual place.
There is also the master branch that contains all of our most recent work, but comes along with all the usual caveats of an unstable branch. You might want to use this if you are a developer or power-user.
### Building fastText using make (preferred)
```
$ wget https://github.com/facebookresearch/fastText/archive/v0.9.2.zip
$ unzip v0.9.2.zip
$ cd fastText-0.9.2
$ make
```
This will produce object files for all the classes as well as the main binary `fasttext`.
If you do not plan on using the default system-wide compiler, update the two macros defined at the beginning of the Makefile (CXX and INCLUDES).
### Building fastText using cmake
For now this is not part of a release, so you will need to clone the master branch.
```
$ git clone https://github.com/facebookresearch/fastText.git
$ cd fastText
$ mkdir build && cd build && cmake ..
$ make && make install
```
This will create the fasttext binary and also all relevant libraries (shared, static, PIC).
### Building fastText for Python
For now this is not part of a release, so you will need to clone the master branch.
```
$ git clone https://github.com/facebookresearch/fastText.git
$ cd fastText
$ pip install .
```
For further information and introduction see python/README.md
## Example use cases
This library has two main use cases: word representation learning and text classification.
These were described in the two papers [1](#enriching-word-vectors-with-subword-information) and [2](#bag-of-tricks-for-efficient-text-classification).
### Word representation learning
In order to learn word vectors, as described in [1](#enriching-word-vectors-with-subword-information), do:
```
$ ./fasttext skipgram -input data.txt -output model
```
where `data.txt` is a training file containing `UTF-8` encoded text.
By default the word vectors will take into account character n-grams from 3 to 6 characters.
At the end of optimization the program will save two files: `model.bin` and `model.vec`.
`model.vec` is a text file containing the word vectors, one per line.
`model.bin` is a binary file containing the parameters of the model along with the dictionary and all hyper parameters.
The binary file can be used later to compute word vectors or to restart the optimization.
### Obtaining word vectors for out-of-vocabulary words
The previously trained model can be used to compute word vectors for out-of-vocabulary words.
Provided you have a text file `queries.txt` containing words for which you want to compute vectors, use the following command:
```
$ ./fasttext print-word-vectors model.bin < queries.txt
```
This will output word vectors to the standard output, one vector per line.
This can also be used with pipes:
```
$ cat queries.txt | ./fasttext print-word-vectors model.bin
```
See the provided scripts for an example. For instance, running:
```
$ ./word-vector-example.sh
```
will compile the code, download data, compute word vectors and evaluate them on the rare words similarity dataset RW [Luong et al. 2013].
### Text classification
This library can also be used to train supervised text classifiers, for instance for sentiment analysis.
In order to train a text classifier using the method described in [2](#bag-of-tricks-for-efficient-text-classification), use:
```
$ ./fasttext supervised -input train.txt -output model
```
where `train.txt` is a text file containing a training sentence per line along with the labels.
By default, we assume that labels are words that are prefixed by the string `__label__`.
This will output two files: `model.bin` and `model.vec`.
Once the model has been trained, you can evaluate it by computing the precision and recall at k (P@k and R@k) on a test set using:
```
$ ./fasttext test model.bin test.txt k
```
The argument `k` is optional, and is equal to `1` by default.
In order to obtain the k most likely labels for a piece of text, use:
```
$ ./fasttext predict model.bin test.txt k
```
or use `predict-prob` to also get the probability for each label
```
$ ./fasttext predict-prob model.bin test.txt k
```
where `test.txt` contains a piece of text to classify per line.
Doing so will print to the standard output the k most likely labels for each line.
The argument `k` is optional, and equal to `1` by default.
See `classification-example.sh` for an example use case.
In order to reproduce results from the paper [2](#bag-of-tricks-for-efficient-text-classification), run `classification-results.sh`; this will download all the datasets and reproduce the results from Table 1.
If you want to compute vector representations of sentences or paragraphs, please use:
```
$ ./fasttext print-sentence-vectors model.bin < text.txt
```
This assumes that the `text.txt` file contains the paragraphs that you want to get vectors for.
The program will output one vector representation per line in the file.
You can also quantize a supervised model to reduce its memory usage with the following command:
```
$ ./fasttext quantize -output model
```
This will create a `.ftz` file with a smaller memory footprint. All the standard functionality, like `test` or `predict`, works the same way on the quantized models:
```
$ ./fasttext test model.ftz test.txt
```
The quantization procedure follows the steps described in [3](#fasttextzip-compressing-text-classification-models). You can
run the script `quantization-example.sh` for an example.
## Full documentation
Invoke a command without arguments to list available arguments and their default values:
```
$ ./fasttext supervised
Empty input or output path.
The following arguments are mandatory:
-input training file path
-output output file path
The following arguments are optional:
-verbose verbosity level [2]
The following arguments for the dictionary are optional:
-minCount minimal number of word occurrences [1]
-minCountLabel minimal number of label occurrences [0]
-wordNgrams max length of word ngram [1]
-bucket number of buckets [2000000]
-minn min length of char ngram [0]
-maxn max length of char ngram [0]
-t sampling threshold [0.0001]
-label labels prefix [__label__]
The following arguments for training are optional:
-lr learning rate [0.1]
-lrUpdateRate change the rate of updates for the learning rate [100]
-dim size of word vectors [100]
-ws size of the context window [5]
-epoch number of epochs [5]
-neg number of negatives sampled [5]
-loss loss function {ns, hs, softmax} [softmax]
-thread number of threads [12]
-pretrainedVectors pretrained word vectors for supervised learning []
-saveOutput whether output params should be saved [0]
The following arguments for quantization are optional:
-cutoff number of words and ngrams to retain [0]
-retrain finetune embeddings if a cutoff is applied [0]
-qnorm quantizing the norm separately [0]
-qout quantizing the classifier [0]
-dsub size of each sub-vector [2]
```
Defaults may vary by mode. (Word-representation modes `skipgram` and `cbow` use a default `-minCount` of 5.)
## References
Please cite [1](#enriching-word-vectors-with-subword-information) if using this code for learning word representations or [2](#bag-of-tricks-for-efficient-text-classification) if using for text classification.
### Enriching Word Vectors with Subword Information
[1] P. Bojanowski\*, E. Grave\*, A. Joulin, T. Mikolov, [*Enriching Word Vectors with Subword Information*](https://arxiv.org/abs/1607.04606)
```
@article{bojanowski2017enriching,
title={Enriching Word Vectors with Subword Information},
author={Bojanowski, Piotr and Grave, Edouard and Joulin, Armand and Mikolov, Tomas},
journal={Transactions of the Association for Computational Linguistics},
volume={5},
year={2017},
issn={2307-387X},
pages={135--146}
}
```
### Bag of Tricks for Efficient Text Classification
[2] A. Joulin, E. Grave, P. Bojanowski, T. Mikolov, [*Bag of Tricks for Efficient Text Classification*](https://arxiv.org/abs/1607.01759)
```
@InProceedings{joulin2017bag,
title={Bag of Tricks for Efficient Text Classification},
author={Joulin, Armand and Grave, Edouard and Bojanowski, Piotr and Mikolov, Tomas},
booktitle={Proceedings of the 15th Conference of the European Chapter of the Association for Computational Linguistics: Volume 2, Short Papers},
month={April},
year={2017},
publisher={Association for Computational Linguistics},
pages={427--431},
}
```
### FastText.zip: Compressing text classification models
[3] A. Joulin, E. Grave, P. Bojanowski, M. Douze, H. Jégou, T. Mikolov, [*FastText.zip: Compressing text classification models*](https://arxiv.org/abs/1612.03651)
```
@article{joulin2016fasttext,
title={FastText.zip: Compressing text classification models},
author={Joulin, Armand and Grave, Edouard and Bojanowski, Piotr and Douze, Matthijs and J{\'e}gou, Herv{\'e} and Mikolov, Tomas},
journal={arXiv preprint arXiv:1612.03651},
year={2016}
}
```
(\* These authors contributed equally.)
## Join the fastText community
* Facebook page: https://www.facebook.com/groups/1174547215919768
* Google group: https://groups.google.com/forum/#!forum/fasttext-library
* Contact: [egrave@fb.com](mailto:egrave@fb.com), [bojanowski@fb.com](mailto:bojanowski@fb.com), [ajoulin@fb.com](mailto:ajoulin@fb.com), [tmikolov@fb.com](mailto:tmikolov@fb.com)
See the CONTRIBUTING file for information about how to help out.
## License
fastText is MIT-licensed.
================================================
FILE: alignment/README.md
================================================
## Alignment of Word Embeddings
This directory provides code for learning alignments between word embeddings in different languages.
The code is in Python 3 and requires [NumPy](http://www.numpy.org/).
The script `example.sh` shows how to use this code to learn and evaluate a bilingual alignment of word embeddings.
The word embeddings used in [1] can be found on the [fastText project page](https://fasttext.cc) and the supervised bilingual lexicons on the [MUSE project page](https://github.com/facebookresearch/MUSE).
### Supervised alignment
The script `align.py` aligns word embeddings from two languages using a bilingual lexicon as supervision.
The details of this approach can be found in [1].
### Unsupervised alignment
The script `unsup_align.py` aligns word embeddings from two languages without requiring any supervision.
Additionally, the script `unsup_multialign.py` aligns multiple languages to a common space with no supervision.
The details of these approaches can be found in [2] and [3] respectively.
In addition to NumPy, the unsupervised methods require the [Python Optimal Transport](https://pot.readthedocs.io/en/stable/) toolbox.
### Download
Wikipedia fastText embeddings aligned with our method can be found [here](https://fasttext.cc/docs/en/aligned-vectors.html).
### References
If you use the supervised alignment method, please cite:
[1] A. Joulin, P. Bojanowski, T. Mikolov, H. Jegou, E. Grave, [*Loss in Translation: Learning Bilingual Word Mapping with a Retrieval Criterion*](https://arxiv.org/abs/1804.07745)
```
@InProceedings{joulin2018loss,
title={Loss in Translation: Learning Bilingual Word Mapping with a Retrieval Criterion},
author={Joulin, Armand and Bojanowski, Piotr and Mikolov, Tomas and J\'egou, Herv\'e and Grave, Edouard},
year={2018},
booktitle={Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing},
}
```
If you use the unsupervised bilingual alignment method, please cite:
[2] E. Grave, A. Joulin, Q. Berthet, [*Unsupervised Alignment of Embeddings with Wasserstein Procrustes*](https://arxiv.org/abs/1805.11222)
```
@article{grave2018unsupervised,
title={Unsupervised Alignment of Embeddings with Wasserstein Procrustes},
author={Grave, Edouard and Joulin, Armand and Berthet, Quentin},
journal={arXiv preprint arXiv:1805.11222},
year={2018}
}
```
If you use the unsupervised alignment script `unsup_multialign.py`, please cite:
[3] J. Alaux, E. Grave, M. Cuturi, A. Joulin, [*Unsupervised Hyperalignment for Multilingual Word Embeddings*](https://arxiv.org/abs/1811.01124)
```
@article{alaux2018unsupervised,
title={Unsupervised hyperalignment for multilingual word embeddings},
author={Alaux, Jean and Grave, Edouard and Cuturi, Marco and Joulin, Armand},
journal={arXiv preprint arXiv:1811.01124},
year={2018}
}
```
================================================
FILE: alignment/align.py
================================================
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Copyright (c) 2018-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
import numpy as np
import argparse
from utils import *
import sys
# Command-line interface of the supervised alignment script.
# Options are declared as (flag, keyword-arguments) rows and registered in
# order, so the generated --help lists them exactly as written here.
parser = argparse.ArgumentParser(description='RCSLS for supervised word alignment')
for _flag, _options in (
    # embeddings and lexicons
    ("--src_emb", dict(type=str, default='', help="Load source embeddings")),
    ("--tgt_emb", dict(type=str, default='', help="Load target embeddings")),
    ('--center', dict(action='store_true', help='whether to center embeddings or not')),
    ("--dico_train", dict(type=str, default='', help="train dictionary")),
    ("--dico_test", dict(type=str, default='', help="validation dictionary")),
    ("--output", dict(type=str, default='', help="where to save aligned embeddings")),
    # size limits
    ("--knn", dict(type=int, default=10, help="number of nearest neighbors in RCSL/CSLS")),
    ("--maxneg", dict(type=int, default=200000, help="Maximum number of negatives for the Extended RCSLS")),
    ("--maxsup", dict(type=int, default=-1, help="Maximum number of training examples")),
    ("--maxload", dict(type=int, default=200000, help="Maximum number of loaded vectors")),
    # optimisation
    ("--model", dict(type=str, default="none", help="Set of constraints: spectral or none")),
    ("--reg", dict(type=float, default=0.0, help='regularization parameters')),
    ("--lr", dict(type=float, default=1.0, help='learning rate')),
    ("--niter", dict(type=int, default=10, help='number of iterations')),
    ('--sgd', dict(action='store_true', help='use sgd')),
    ("--batchsize", dict(type=int, default=10000, help="batch size for sgd")),
):
    parser.add_argument(_flag, **_options)
params = parser.parse_args()
###### SPECIFIC FUNCTIONS ######
# functions specific to RCSLS
# the rest of the functions are in utils.py
def getknn(sc, x, y, k=10):
    """Sum the k largest scores of every row of ``sc`` and build the matching
    matrix of summed ``y`` rows projected onto ``x``.

    Args:
        sc: 2-D score matrix; the top-k search runs along axis 1.
        x: matrix with one row per row of ``sc``.
        y: matrix with one row per column of ``sc``.
        k: number of largest entries kept per row.

    Returns:
        ``(f / k, df / k)`` where ``f`` is the sum of the selected scores and
        ``df`` is ``(sum of selected y rows, per score row).T . x``.
    """
    # Column indices of the k largest entries per row (order within the k is arbitrary).
    top_cols = np.argpartition(sc, -k, axis=1)[:, -k:]
    n_rows, n_top = top_cols.shape

    row_ids = np.arange(sc.shape[0])[:, None]
    total_score = np.sum(sc[row_ids, top_cols])

    # Gather the selected rows of y as an (n_rows, n_top, dim) tensor.
    neighbours = y[top_cols.flatten(), :].reshape(n_rows, n_top, y.shape[1])
    grad = np.dot(neighbours.sum(1).T, x)
    return total_score / k, grad / k
def rcsls(X_src, Y_tgt, Z_src, Z_tgt, R, knn=10):
    """Evaluate the RCSLS criterion and its gradient with respect to ``R``.

    Args:
        X_src, Y_tgt: row-aligned source / target training vectors.
        Z_src, Z_tgt: source / target vectors used for the k-NN terms.
        R: current mapping, applied to source vectors as ``X . R.T``.
        knn: number of neighbours handed to ``getknn``.

    Returns:
        ``(loss, gradient)``, both negated and divided by the number of rows
        of ``X_src``.
    """
    n_pairs = X_src.shape[0]
    mapped_src = np.dot(X_src, R.T)

    # Pairwise term: twice the sum of dot products between mapped sources and targets.
    pair_f = 2 * np.sum(mapped_src * Y_tgt)
    pair_df = 2 * np.dot(Y_tgt.T, X_src)

    # k-NN term over the target candidates for each mapped source ...
    fwd_f, fwd_df = getknn(np.dot(mapped_src, Z_tgt.T), X_src, Z_tgt, knn)
    # ... and over the mapped source candidates for each target.
    bwd_scores = np.dot(np.dot(Z_src, R.T), Y_tgt.T).T
    bwd_f, bwd_df = getknn(bwd_scores, Y_tgt, Z_src, knn)

    f = pair_f - fwd_f - bwd_f
    df = pair_df - fwd_df - bwd_df.T
    return -f / n_pairs, -df / n_pairs
def proj_spectral(R):
    """Return ``R`` rebuilt from its SVD with singular values limited to [0, 1]."""
    left, sing, right = np.linalg.svd(R)
    # Cap from above first, then floor at zero, directly on the singular values.
    too_large = sing > 1
    sing[too_large] = 1
    negative = sing < 0
    sing[negative] = 0
    rescaled_right = np.dot(np.diag(sing), right)
    return np.dot(left, rescaled_right)
###### MAIN ######
# load word embeddings
words_tgt, x_tgt = load_vectors(params.tgt_emb, maxload=params.maxload, center=params.center)
words_src, x_src = load_vectors(params.src_emb, maxload=params.maxload, center=params.center)

# load validation bilingual lexicon
src2tgt, lexicon_size = load_lexicon(params.dico_test, words_src, words_tgt)

# word --> vector indices
idx_src = idx(words_src)
idx_tgt = idx(words_tgt)

# load train bilingual lexicon
pairs = load_pairs(params.dico_train, idx_src, idx_tgt)
if params.maxsup > 0 and params.maxsup < len(pairs):
    pairs = pairs[:params.maxsup]

# selecting training vector pairs
X_src, Y_tgt = select_vectors_from_pairs(x_src, x_tgt, pairs)

# adding negatives for RCSLS
Z_src = x_src[:params.maxneg, :]
Z_tgt = x_tgt[:params.maxneg, :]

# initialization:
R = procrustes(X_src, Y_tgt)
nnacc = compute_nn_accuracy(np.dot(x_src, R.T), x_tgt, src2tgt, lexicon_size=lexicon_size)
print("[init -- Procrustes] NN: %.4f"%(nnacc))
sys.stdout.flush()

# optimization
fold, Rold = 0, []
niter, lr = params.niter, params.lr

for it in range(0, niter + 1):
    # stop once the step size has been halved down to (almost) nothing
    if lr < 1e-4:
        break

    if params.sgd:
        # Sampling without replacement cannot draw more rows than exist, so
        # the batch is capped at the number of training pairs.
        batch = min(params.batchsize, X_src.shape[0])
        indices = np.random.choice(X_src.shape[0], size=batch, replace=False)
        f, df = rcsls(X_src[indices, :], Y_tgt[indices, :], Z_src, Z_tgt, R, params.knn)
    else:
        f, df = rcsls(X_src, Y_tgt, Z_src, Z_tgt, R, params.knn)

    # weight decay followed by the gradient step (both in place on R)
    if params.reg > 0:
        R *= (1 - lr * params.reg)
    R -= lr * df
    if params.model == "spectral":
        R = proj_spectral(R)

    print("[it=%d] f = %.4f" % (it, f))
    sys.stdout.flush()

    # Full-batch mode only: halve the step when the loss went up.
    # NOTE(review): Rold is bound to the same array that `R -= ...` updates in
    # place, so unless the spectral projection rebinds R this restore leaves
    # R unchanged -- confirm whether an actual rollback was intended before
    # introducing a copy, since that would change how the optimisation behaves.
    if f > fold and it > 0 and not params.sgd:
        lr /= 2
        f, R = fold, Rold

    fold, Rold = f, R

    if (it > 0 and it % 10 == 0) or it == niter:
        nnacc = compute_nn_accuracy(np.dot(x_src, R.T), x_tgt, src2tgt, lexicon_size=lexicon_size)
        print("[it=%d] NN = %.4f - Coverage = %.4f" % (it, nnacc, len(src2tgt) / lexicon_size))

nnacc = compute_nn_accuracy(np.dot(x_src, R.T), x_tgt, src2tgt, lexicon_size=lexicon_size)
print("[final] NN = %.4f - Coverage = %.4f" % (nnacc, len(src2tgt) / lexicon_size))

if params.output != "":
    print("Saving all aligned vectors at %s" % params.output)
    # reload the complete source vocabulary (maxload=-1 disables the cap)
    words_full, x_full = load_vectors(params.src_emb, maxload=-1, center=params.center, verbose=False)
    x = np.dot(x_full, R.T)
    x /= np.linalg.norm(x, axis=1)[:, np.newaxis] + 1e-8
    save_vectors(params.output, x, words_full)
    save_matrix(params.output + "-mat", R)
================================================
FILE: alignment/eval.py
================================================
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Copyright (c) 2018-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
import io
import numpy as np
import argparse
from utils import *
# Command-line interface of the evaluation script.
parser = argparse.ArgumentParser(description='Evaluation of word alignment')
parser.add_argument("--src_emb", type=str, default='', help="Load source embeddings")
parser.add_argument("--tgt_emb", type=str, default='', help="Load target embeddings")
parser.add_argument('--center', action='store_true', help='whether to center embeddings or not')
# The two help strings below previously misspelled "alignment".
parser.add_argument("--src_mat", type=str, default='', help="Load source alignment matrix. If none given, the alignment matrix is the identity.")
parser.add_argument("--tgt_mat", type=str, default='', help="Load target alignment matrix. If none given, the alignment matrix is the identity.")
parser.add_argument("--dico_test", type=str, default='', help="test dictionary")
parser.add_argument("--maxload", type=int, default=200000)
parser.add_argument("--nomatch", action='store_true', help="no exact match in lexicon")
params = parser.parse_args()
###### SPECIFIC FUNCTIONS ######
# function specific to evaluation
# the rest of the functions are in utils.py
def load_transform(fname, d1=300, d2=300):
    """Read a dense matrix from a whitespace-separated text file.

    Two layouts are accepted:

    * a file whose first line is an ``"<rows> <cols>"`` header followed by
      that many rows (the layout written by ``save_matrix``); the header then
      determines the shape and ``d1`` / ``d2`` are ignored;
    * a header-less file, read into a ``d1 x d2`` matrix using the first
      ``d2`` values of every line (the previous behaviour).

    A first line counts as a header only if it consists of two non-negative
    integers that agree with the number of remaining lines and with the
    number of values on the next line.

    Args:
        fname: path of the text file.
        d1: number of rows expected in a header-less file.
        d2: number of columns expected in a header-less file.

    Returns:
        The matrix as a float ``numpy`` array.
    """
    # The context manager closes the file even if parsing fails; blank lines
    # are dropped.
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        rows = [line.split() for line in fin if line.strip()]

    if len(rows) >= 2 and len(rows[0]) == 2 and all(tok.isdigit() for tok in rows[0]):
        n_rows, n_cols = int(rows[0][0]), int(rows[0][1])
        if n_rows == len(rows) - 1 and n_cols == len(rows[1]):
            d1, d2 = n_rows, n_cols
            rows = rows[1:]

    R = np.zeros([d1, d2])
    for i, tokens in enumerate(rows):
        R[i, :] = np.array(tokens[0:d2], dtype=float)
    return R
###### MAIN ######
# Report what is being evaluated.
print("Evaluation of alignment on %s" % params.dico_test)
if params.nomatch:
    print("running without exact string matches")

# Target embeddings are loaded first, then the source ones.
words_tgt, x_tgt = load_vectors(params.tgt_emb, maxload=params.maxload, center=params.center)
words_src, x_src = load_vectors(params.src_emb, maxload=params.maxload, center=params.center)

# Optional linear maps, applied on the right of the embedding matrices.
if params.tgt_mat != "":
    R_tgt = load_transform(params.tgt_mat)
    x_tgt = np.dot(x_tgt, R_tgt)
if params.src_mat != "":
    R_src = load_transform(params.src_mat)
    x_src = np.dot(x_src, R_src)

src2tgt, lexicon_size = load_lexicon(params.dico_test, words_src, words_tgt)

# Nearest-neighbour accuracy, then CSLS accuracy, on the same lexicon.
nnacc = compute_nn_accuracy(x_src, x_tgt, src2tgt, lexicon_size=lexicon_size)
cslsproc = compute_csls_accuracy(x_src, x_tgt, src2tgt, lexicon_size=lexicon_size)
summary = (nnacc, cslsproc, len(src2tgt) / lexicon_size)
print("NN = %.4f - CSLS = %.4f - Coverage = %.4f" % summary)
================================================
FILE: alignment/example.sh
================================================
#!/usr/bin/env sh
# Copyright (c) 2018-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#
# Usage: ./example.sh [source-language] [target-language]
# Downloads the dictionaries and embeddings for the language pair (default
# en -> es), learns the alignment with align.py and evaluates it with eval.py.
set -e
s=${1:-en}
t=${2:-es}
echo "Example based on the ${s}->${t} alignment"

# mkdir -p succeeds when the directory already exists, so no test is needed.
mkdir -p data
mkdir -p res

# Train and test dictionaries.
dico_train=data/${s}-${t}.0-5000.txt
if [ ! -f "${dico_train}" ]; then
  DICO=$(basename -- "${dico_train}")
  wget -c "https://dl.fbaipublicfiles.com/arrival/dictionaries/${DICO}" -P data/
fi
dico_test=data/${s}-${t}.5000-6500.txt
if [ ! -f "${dico_test}" ]; then
  DICO=$(basename -- "${dico_test}")
  wget -c "https://dl.fbaipublicfiles.com/arrival/dictionaries/${DICO}" -P data/
fi

# Source and target embeddings.
src_emb=data/wiki.${s}.vec
if [ ! -f "${src_emb}" ]; then
  EMB=$(basename -- "${src_emb}")
  wget -c "https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/${EMB}" -P data/
fi
tgt_emb=data/wiki.${t}.vec
if [ ! -f "${tgt_emb}" ]; then
  EMB=$(basename -- "${tgt_emb}")
  wget -c "https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/${EMB}" -P data/
fi

# Learn the alignment, then evaluate the aligned source vectors.
output=res/wiki.${s}-${t}.vec
python3 align.py --src_emb "${src_emb}" --tgt_emb "${tgt_emb}" \
  --dico_train "${dico_train}" --dico_test "${dico_test}" --output "${output}" \
  --lr 25 --niter 10
python3 eval.py --src_emb "${output}" --tgt_emb "${tgt_emb}" \
  --dico_test "${dico_test}"
================================================
FILE: alignment/unsup_align.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2018-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import codecs, sys, time, math, argparse, ot
import numpy as np
from utils import *
# Command-line interface of the unsupervised (Wasserstein Procrustes) script.
parser = argparse.ArgumentParser(description='Wasserstein Procrustes for Embedding Alignment')
# The three paths below are opened unconditionally by the script, so they are
# declared required: a missing one now produces a usage error instead of a
# failure when the script tries to open `None`.
parser.add_argument('--model_src', type=str, required=True, help='Path to source word embeddings')
parser.add_argument('--model_tgt', type=str, required=True, help='Path to target word embeddings')
parser.add_argument('--lexicon', type=str, required=True, help='Path to the evaluation lexicon')
parser.add_argument('--output_src', default='', type=str, help='Path to save the aligned source embeddings')
parser.add_argument('--output_tgt', default='', type=str, help='Path to save the aligned target embeddings')
parser.add_argument('--seed', default=1111, type=int, help='Random number generator seed')
parser.add_argument('--nepoch', default=5, type=int, help='Number of epochs')
parser.add_argument('--niter', default=5000, type=int, help='Initial number of iterations')
parser.add_argument('--bsz', default=500, type=int, help='Initial batch size')
parser.add_argument('--lr', default=500., type=float, help='Learning rate')
parser.add_argument('--nmax', default=20000, type=int, help='Vocabulary size for learning the alignment')
parser.add_argument('--reg', default=0.05, type=float, help='Regularization parameter for sinkhorn')
args = parser.parse_args()
def objective(X, Y, R, n=5000):
    """Monitoring value printed during training, computed on the first rows
    of ``X`` and ``Y``.

    Args:
        X, Y: source and target embedding matrices (one vector per row).
        R: current mapping, applied as ``X . R``.
        n: maximum number of leading rows used; it is reduced when either
           matrix has fewer rows, so the Sinkhorn marginals always match the
           cost matrix.

    Returns:
        ``1000 * ||X_n R - P Y_n|| / n`` where ``P`` is the Sinkhorn plan for
        the cost ``-(X_n R) Y_n^T`` with regularisation 0.025.
    """
    n = min(n, X.shape[0], Y.shape[0])
    Xn, Yn = X[:n], Y[:n]
    XR = np.dot(Xn, R)  # computed once, used for both the cost and the residual
    C = -np.dot(XR, Yn.T)
    P = ot.sinkhorn(np.ones(n), np.ones(n), C, 0.025, stopThr=1e-3)
    return 1000 * np.linalg.norm(XR - np.dot(P, Yn)) / n
def sqrt_eig(x):
    """Rebuild ``x`` from its reduced SVD with every singular value replaced
    by its square root."""
    left, sing, right_t = np.linalg.svd(x, full_matrices=False)
    root_spectrum = np.diag(np.sqrt(sing))
    return np.dot(left, np.dot(root_spectrum, right_t))
def align(X, Y, R, lr=10., bsz=200, nepoch=5, niter=1000,
          nmax=10000, reg=0.05, verbose=True):
    """Stochastic optimisation of the mapping ``R`` with Sinkhorn plans
    computed on mini-batches.

    Args:
        X, Y: source and target embedding matrices (one vector per row).
        R: initial mapping; it is no longer modified in place.
        lr: step size (divided by the batch size at every update).
        bsz: initial batch size; doubled after every epoch.
        nepoch: number of epochs.
        niter: iterations in the first epoch; integer-divided by 4 after
            every epoch.
        nmax: mini-batches are drawn among the first ``nmax`` rows. Capped at
            the number of rows available in ``X`` and ``Y``.
        reg: Sinkhorn regularisation.
        verbose: print the value of ``objective`` after every epoch.

    Returns:
        The final mapping.
    """
    # Never sample row indices that do not exist.
    nmax = min(nmax, X.shape[0], Y.shape[0])
    for epoch in range(1, nepoch + 1):
        # The doubled batch size must not exceed the sampling pool, otherwise
        # the marginals would be longer than the sampled batch.
        batch = min(bsz, nmax)
        for _it in range(1, niter + 1):
            # sample mini-batch
            xt = X[np.random.permutation(nmax)[:batch], :]
            yt = Y[np.random.permutation(nmax)[:batch], :]
            # compute OT on minibatch
            C = -np.dot(np.dot(xt, R), yt.T)
            P = ot.sinkhorn(np.ones(batch), np.ones(batch), C, reg, stopThr=1e-3)
            # compute gradient
            G = -np.dot(xt.T, np.dot(P, yt))
            # out-of-place update: the caller's array is left untouched
            R = R - lr / batch * G
            # project on orthogonal matrices
            U, s, VT = np.linalg.svd(R)
            R = np.dot(U, VT)
        bsz *= 2
        niter //= 4
        if verbose:
            print("epoch: %d obj: %.3f" % (epoch, objective(X, Y, R)))
    return R
def convex_init(X, Y, niter=100, reg=0.05, apply_sqrt=False):
    """Compute an initial mapping from an averaged sequence of Sinkhorn plans.

    Args:
        X, Y: matrices with the same number of rows.
        niter: number of averaging iterations.
        reg: Sinkhorn regularisation.
        apply_sqrt: when true, both inputs are first passed through ``sqrt_eig``.

    Returns:
        ``procrustes(P . X, Y).T`` for the final averaged plan ``P``. The
        residual ``||P K_X - K_Y P||`` is printed before returning.
    """
    n, _ = X.shape
    if apply_sqrt:
        X = sqrt_eig(X)
        Y = sqrt_eig(Y)

    # Gram matrices, with K_Y rescaled to the norm of K_X.
    K_X = np.dot(X, X.T)
    K_Y = np.dot(Y, Y.T)
    K_Y *= np.linalg.norm(K_X) / np.linalg.norm(K_Y)
    K2_X = np.dot(K_X, K_X)
    K2_Y = np.dot(K_Y, K_Y)

    P = np.ones([n, n]) / float(n)
    step = 1
    while step <= niter:
        G = np.dot(P, K2_X) + np.dot(K2_Y, P) - 2 * np.dot(K_Y, np.dot(P, K_X))
        q = ot.sinkhorn(np.ones(n), np.ones(n), G, reg, stopThr=1e-3)
        # averaging weight 2 / (2 + step)
        alpha = 2.0 / float(2.0 + step)
        P = alpha * q + (1.0 - alpha) * P
        step += 1

    obj = np.linalg.norm(np.dot(P, K_X) - np.dot(K_Y, P))
    print(obj)
    return procrustes(np.dot(P, X), Y).T
print("\n*** Wasserstein Procrustes ***\n")

# Seed NumPy's generator before anything is sampled.
np.random.seed(args.seed)

maxload = 200000
w_src, x_src = load_vectors(args.model_src, maxload, norm=True, center=True)
w_tgt, x_tgt = load_vectors(args.model_tgt, maxload, norm=True, center=True)
src2trg, _ = load_lexicon(args.lexicon, w_src, w_tgt)

# Stage 1: initial mapping from the first 2500 vectors of each language.
print("\nComputing initial mapping with convex relaxation...")
t0 = time.time()
R0 = convex_init(x_src[:2500], x_tgt[:2500], reg=args.reg, apply_sqrt=True)
elapsed = math.floor(time.time() - t0)
print("Done [%03d sec]" % elapsed)

# Stage 2: stochastic refinement, started from a copy of the initial mapping.
print("\nComputing mapping with Wasserstein Procrustes...")
t0 = time.time()
R = align(x_src, x_tgt, R0.copy(), bsz=args.bsz, lr=args.lr, niter=args.niter,
          nepoch=args.nepoch, reg=args.reg, nmax=args.nmax)
elapsed = math.floor(time.time() - t0)
print("Done [%03d sec]" % elapsed)

acc = compute_nn_accuracy(x_src, np.dot(x_tgt, R.T), src2trg)
print("\nPrecision@1: %.3f\n" % acc)

# Optional exports: rows are L2-normalised first; the target side is also mapped by R.T.
if args.output_src != '':
    src_norms = np.linalg.norm(x_src, 2, 1).reshape([-1, 1])
    x_src = x_src / src_norms
    save_vectors(args.output_src, x_src, w_src)
if args.output_tgt != '':
    tgt_norms = np.linalg.norm(x_tgt, 2, 1).reshape([-1, 1])
    x_tgt = x_tgt / tgt_norms
    save_vectors(args.output_tgt, np.dot(x_tgt, R.T), w_tgt)
================================================
FILE: alignment/unsup_multialign.py
================================================
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Copyright (c) 2019-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
import io, os, ot, argparse, random
import numpy as np
from utils import *
# Command-line interface of the multilingual alignment script.
# Options are registered in the order listed so --help is unchanged.
parser = argparse.ArgumentParser(description=' ')
for _flag, _options in (
    ('--embdir', dict(default='data/', type=str)),
    ('--outdir', dict(default='output/', type=str)),
    ('--lglist', dict(default='en-fr-es-it-pt-de-pl-ru-da-nl-cs', type=str,
                      help='list of languages. The first element is the pivot. Example: en-fr-es to align English, French and Spanish with English as the pivot.')),
    ('--maxload', dict(default=20000, type=int, help='Max number of loaded vectors')),
    ('--uniform', dict(action='store_true', help='switch to uniform probability of picking language pairs')),
    # optimization parameters for the square loss
    ('--epoch', dict(default=2, type=int, help='nb of epochs for square loss')),
    ('--niter', dict(default=500, type=int, help='max number of iteration per epoch for square loss')),
    ('--lr', dict(default=0.1, type=float, help='learning rate for square loss')),
    ('--bsz', dict(default=500, type=int, help='batch size for square loss')),
    # optimization parameters for the RCSLS loss
    ('--altepoch', dict(default=100, type=int, help='nb of epochs for RCSLS loss')),
    ('--altlr', dict(default=25, type=float, help='learning rate for RCSLS loss')),
    ("--altbsz", dict(type=int, default=1000, help="batch size for RCSLS")),
):
    parser.add_argument(_flag, **_options)
args = parser.parse_args()
###### SPECIFIC FUNCTIONS ######
def getknn(sc, x, y, k=10):
    """Top-k helper for the RCSLS loss.

    For every row of the score matrix ``sc`` the ``k`` largest entries are
    selected. Returns the sum of those entries divided by ``k``, and
    ``(per-row sum of the selected rows of y).T . x`` divided by ``k``.
    """
    n_queries = sc.shape[0]
    # indices of the k best columns of each row, in arbitrary order
    best = np.argpartition(sc, -k, axis=1)[:, -k:]
    picked = y[best.flatten(), :]
    picked = picked.reshape(best.shape[0], best.shape[1], y.shape[1])
    score_sum = np.sum(sc[np.arange(n_queries)[:, None], best])
    grad = np.dot(picked.sum(1).T, x)
    return score_sum / k, grad / k
def rcsls(Xi, Xj, Zi, Zj, R, knn=10):
    """RCSLS loss between row-aligned ``Xi`` (mapped by ``R.T``) and ``Xj``.

    ``Zi`` / ``Zj`` are the candidate sets for the two k-NN terms.

    Returns:
        ``(loss, gradient)``, negated and divided by the number of rows of
        ``Xi``; the gradient is returned transposed.
    """
    batch = Xi.shape[0]
    mapped = np.dot(Xi, R.T)

    pair_f = 2 * np.sum(mapped * Xj)
    pair_df = 2 * np.dot(Xj.T, Xi)

    fwd_f, fwd_df = getknn(np.dot(mapped, Zj.T), Xi, Zj, knn)
    bwd_scores = np.dot(np.dot(Zi, R.T), Xj.T).T
    bwd_f, bwd_df = getknn(bwd_scores, Xj, Zi, knn)

    f = pair_f - fwd_f - bwd_f
    df = pair_df - fwd_df - bwd_df.T
    return -f / batch, -df.T / batch
def GWmatrix(emb0):
    """Build the square matrix ``(|e_i| + |e_j|) / 2 - <e_i, e_j>`` over the
    rows of ``emb0``.

    NOTE(review): the row norms are not squared, so this equals half the
    squared Euclidean distance only when the rows have unit norm -- confirm
    that callers always pass normalised embeddings.
    """
    n_points = np.shape(emb0)[0]
    half_norms = .5 * np.linalg.norm(emb0, axis=1).reshape(1, n_points)
    # column + row broadcast gives entry (i, j) = half_norms[i] + half_norms[j]
    cost = half_norms.transpose() + half_norms
    cost -= np.dot(emb0, emb0.T)
    return cost
def gromov_wasserstein(x_src, x_tgt, C2):
    """Initial mapping for one language from a Gromov-Wasserstein coupling.

    Args:
        x_src: embeddings of the language being initialised.
        x_tgt: embeddings it is coupled with.
        C2: matrix passed as the second structure matrix to
            ``ot.gromov_wasserstein`` (built by the caller).

    Returns:
        ``procrustes(M . x_tgt, x_src)`` where ``M`` is the coupling returned
        by ``ot.gromov_wasserstein``.
    """
    n_points = x_src.shape[0]
    weights_src = np.ones(n_points)
    weights_tgt = np.ones(n_points)
    C1 = GWmatrix(x_src)
    M = ot.gromov_wasserstein(C1,C2,weights_src,weights_tgt,'square_loss',epsilon=0.55,max_iter=100,tol=1e-4)
    coupled_tgt = np.dot(M,x_tgt)
    return procrustes(coupled_tgt, x_src)
# Jointly refine the per-language mappings in TRANS, first with a Sinkhorn-based
# loss (the "L2" phase) and then with the RCSLS loss.
#   EMB    -- indexable by language position, giving that language's embedding matrix
#   TRANS  -- indexable by language position, giving its mapping; entries with
#             index > 0 are overwritten, index 0 (the pivot) is only read
#   lglist -- list of language codes; only its length is used here
#   args   -- parsed options (maxload, uniform, lr, bsz, epoch, niter, altlr,
#             altepoch, altbsz are read)
# Returns TRANS.
# NOTE(review): leading indentation is not visible in this listing, so the loop
# nesting described in the comments below should be confirmed against the
# upstream file.
def align(EMB, TRANS, lglist, args):
nmax, l = args.maxload, len(lglist)
# create a list of language pairs to sample from
# (default == higher probability to pick a language pair containing the pivot)
# if --uniform: uniform probability of picking a language pair
samples = []
for i in range(l):
for j in range(l):
if j == i :
continue
# unless --uniform is set, pairs involving index 0 are appended extra times
if j > 0 and args.uniform == False:
samples.append((0,j))
if i > 0 and args.uniform == False:
samples.append((i,0))
samples.append((i,j))
# optimization of the l2 loss
print('start optimizing L2 loss')
lr0, bsz, nepoch, niter = args.lr, args.bsz, args.epoch, args.niter
for epoch in range(nepoch):
print("start epoch %d / %d"%(epoch+1, nepoch))
ones = np.ones(bsz)
# the learning rate restarts from lr0 at every epoch
f, fold, nb, lr = 0.0, 0.0, 0.0, lr0
for it in range(niter):
# halve the step when the accumulated loss rose by more than 1e-3
if it > 1 and f > fold + 1e-3:
lr /= 2
# stop iterating once the step is below 0.05
if lr < .05:
break
fold = f
f, nb = 0.0, 0.0
for k in range(100 * (l-1)):
(i,j) = random.choice(samples)
# independent random batches of bsz rows among the first nmax of each language
embi = EMB[i][np.random.permutation(nmax)[:bsz], :]
embj = EMB[j][np.random.permutation(nmax)[:bsz], :]
# Sinkhorn plan for the cost -(embi TRANS[i]) (embj TRANS[j])^T
perm = ot.sinkhorn(ones, ones, np.linalg.multi_dot([embi, -TRANS[i], TRANS[j].T,embj.T]), reg = 0.025, stopThr = 1e-3)
grad = np.linalg.multi_dot([embi.T, perm, embj])
f -= np.trace(np.linalg.multi_dot([TRANS[i].T, grad, TRANS[j]])) / embi.shape[0]
nb += 1
# update and re-project each mapping except the one at index 0
if i > 0:
TRANS[i] = proj_ortho(TRANS[i] + lr * np.dot(grad, TRANS[j]))
if j > 0:
TRANS[j] = proj_ortho(TRANS[j] + lr * np.dot(grad.transpose(), TRANS[i]))
print("iter %d / %d - epoch %d - loss: %.5f lr: %.4f" % (it, niter, epoch+1, f / nb , lr))
print("end of epoch %d - loss: %.5f - lr: %.4f" % (epoch+1, f / max(nb,1), lr))
# next epoch: half the iterations (at least 2) and double the batch (at most 1000)
niter, bsz = max(int(niter/2),2), min(1000, bsz * 2)
#end for epoch in range(nepoch):
# optimization of the RCSLS loss
print('start optimizing RCSLS loss')
f, fold, nb, lr = 0.0, 0.0, 0.0, args.altlr
for epoch in range(args.altepoch):
# halve the step when the loss did not decrease by at least 1e-4 * |fold|
if epoch > 1 and f-fold > -1e-4 * abs(fold):
lr/= 2
# stop once the step is below 0.1
if lr < 1e-1:
break
fold = f
f, nb = 0.0, 0.0
for k in range(round(nmax / args.altbsz) * 10 * (l-1)):
(i,j) = random.choice(samples)
# altbsz distinct rows of language i against the first nmax rows of language j
sgdidx = np.random.choice(nmax, size=args.altbsz, replace=False)
embi = EMB[i][sgdidx, :]
embj = EMB[j][:nmax, :]
# crude alignment approximation:
# each sampled row of language i is paired with its highest-scoring row of language j
T = np.dot(TRANS[i], TRANS[j].T)
scores = np.linalg.multi_dot([embi, T, embj.T])
perm = np.zeros_like(scores)
perm[np.arange(len(scores)), scores.argmax(1)] = 1
embj = np.dot(perm, embj)
# normalization over a subset of embeddings for speed up
fi, grad = rcsls(embi, embj, embi, embj, T.T)
f += fi
nb += 1
if i > 0:
TRANS[i] = proj_ortho(TRANS[i] - lr * np.dot(grad, TRANS[j]))
if j > 0:
TRANS[j] = proj_ortho(TRANS[j] - lr * np.dot(grad.transpose(), TRANS[i]))
print("epoch %d - loss: %.5f - lr: %.4f" % (epoch+1, f / max(nb,1), lr))
#end for epoch in range(args.altepoch):
return TRANS
def convex_init(X, Y, niter=100, reg=0.05, apply_sqrt=False):
    """Initial mapping obtained from an averaged sequence of Sinkhorn plans.

    ``apply_sqrt`` is accepted but not used by this implementation.

    Returns:
        ``procrustes(P . X, Y).T`` for the final averaged plan ``P``.
    """
    n, _ = X.shape
    uniform = np.ones(n)

    # Gram matrices; K_Y is rescaled to the norm of K_X.
    K_X = np.dot(X, X.T)
    K_Y = np.dot(Y, Y.T)
    K_Y *= np.linalg.norm(K_X) / np.linalg.norm(K_Y)
    K2_X = np.dot(K_X, K_X)
    K2_Y = np.dot(K_Y, K_Y)

    P = np.ones([n, n]) / float(n)
    for it in range(1, niter + 1):
        cross = np.dot(K_Y, np.dot(P, K_X))
        G = np.dot(P, K2_X) + np.dot(K2_Y, P) - 2 * cross
        q = ot.sinkhorn(uniform, np.ones(n), G, reg, stopThr=1e-3)
        # averaging weight 2 / (2 + it)
        alpha = 2.0 / float(2.0 + it)
        P = alpha * q + (1.0 - alpha) * P
    return procrustes(np.dot(P, X), Y).T
###### MAIN ######
lglist = args.lglist.split('-')
l = len(lglist)

# embs: one embedding matrix per language, keyed by its position in lglist
EMB = {}
for i in range(l):
    fn = args.embdir + '/wiki.' + lglist[i] + '.vec'
    _, vecs = load_vectors(fn, maxload=args.maxload)
    EMB[i] = vecs

#init
print("Computing initial bilingual mapping with Gromov-Wasserstein...")
TRANS={}
maxinit = 2000
emb0 = EMB[0][:maxinit,:]
C0 = GWmatrix(emb0)
# The pivot keeps the identity mapping; its size follows the loaded
# embeddings instead of assuming 300 dimensions.
TRANS[0] = np.eye(EMB[0].shape[1])
for i in range(1, l):
    print("init "+lglist[i])
    embi = EMB[i][:maxinit,:]
    TRANS[i] = gromov_wasserstein(embi, emb0, C0)

# align
align(EMB, TRANS, lglist, args)

print('saving matrices in ' + args.outdir)
# Create the output directory if needed so the matrices can be written.
if args.outdir:
    os.makedirs(args.outdir, exist_ok=True)
languages=''.join(lglist)
for i in range(l):
    save_matrix(args.outdir + '/W-' + languages + '-' + lglist[i], TRANS[i])
================================================
FILE: alignment/utils.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2018-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
import io
import numpy as np
import collections
def load_vectors(fname, maxload=200000, norm=True, center=False, verbose=True):
    """Load word vectors from a text file in ``.vec`` format.

    The first line holds ``"<count> <dim>"``; every following line holds a
    word followed by its space-separated components.

    Args:
        fname: path of the file.
        maxload: maximum number of vectors read; values <= 0 disable the cap.
        norm: divide every row by its L2 norm (plus 1e-8).
        center: subtract the column means, then L2-normalise the rows again.
        verbose: print progress messages.

    Returns:
        ``(words, x)``: the list of words and the matrix of their vectors,
        one row per word.
    """
    if verbose:
        print("Loading vectors from %s" % fname)
    # The context manager closes the file even if a line fails to parse.
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        n, d = map(int, fin.readline().split())
        if maxload > 0:
            n = min(n, maxload)
        x = np.zeros([n, d])
        words = []
        for i, line in enumerate(fin):
            if i >= n:
                break
            tokens = line.rstrip().split(' ')
            words.append(tokens[0])
            x[i, :] = np.array(tokens[1:], dtype=float)
    # If the header announced more vectors than the file contains, drop the
    # unused zero rows so that x and words stay aligned.
    if len(words) < n:
        x = x[:len(words)]
    if norm:
        x /= np.linalg.norm(x, axis=1)[:, np.newaxis] + 1e-8
    if center:
        x -= x.mean(axis=0)[np.newaxis, :]
        x /= np.linalg.norm(x, axis=1)[:, np.newaxis] + 1e-8
    if verbose:
        print("%d word vectors loaded" % (len(words)))
    return words, x
def idx(words):
    """Map every word to the position of its first occurrence in ``words``."""
    first_position = {}
    for position, word in enumerate(words):
        # setdefault keeps the earliest index when a word is repeated
        first_position.setdefault(word, position)
    return first_position
def save_vectors(fname, x, words):
    """Write word vectors to ``fname`` in ``.vec`` text format.

    The first line is ``"<rows> <dim>"``; each following line is the word and
    its components formatted with four decimals, separated by spaces.

    Args:
        fname: output path (overwritten).
        x: 2-D array with one vector per row.
        words: sequence providing the word of every row of ``x``.
    """
    n, d = x.shape
    # `with` guarantees the file is closed even when a row fails to format
    # or `words` is shorter than `x`.
    with io.open(fname, 'w', encoding='utf-8') as fout:
        fout.write(u"%d %d\n" % (n, d))
        for i in range(n):
            fout.write(words[i] + " " + " ".join("%.4f" % a for a in x[i, :]) + "\n")
def save_matrix(fname, x):
    """Write a 2-D array to ``fname`` as text.

    The first line is ``"<rows> <cols>"``; each following line holds one row
    with values formatted with four decimals, separated by spaces.

    Args:
        fname: output path (overwritten).
        x: 2-D array.
    """
    n, d = x.shape
    # `with` guarantees the file is closed even when writing fails midway.
    with io.open(fname, 'w', encoding='utf-8') as fout:
        fout.write(u"%d %d\n" % (n, d))
        for i in range(n):
            fout.write(" ".join("%.4f" % a for a in x[i, :]) + "\n")
def procrustes(X_src, Y_tgt):
    """Return the product of the singular-vector factors of ``Y_tgt.T @ X_src``.

    The singular values are discarded; only the two factors returned by
    ``np.linalg.svd`` are multiplied together.
    """
    cross_covariance = np.dot(Y_tgt.T, X_src)
    left, _singular_values, right = np.linalg.svd(cross_covariance)
    rotation = np.dot(left, right)
    return rotation
def select_vectors_from_pairs(x_src, y_tgt, pairs):
    """Gather the source and target rows referenced by ``pairs``.

    Args:
        x_src: 2-D array of source vectors.
        y_tgt: 2-D array of target vectors.
        pairs: sequence of ``(source_row, target_row)`` index pairs.

    Returns:
        ``(x, y)``: two float arrays with ``len(pairs)`` rows each, where row
        ``k`` of ``x`` is ``x_src[i]`` and row ``k`` of ``y`` is ``y_tgt[j]``
        for the k-th pair ``(i, j)``.
    """
    shape = (len(pairs), x_src.shape[1])
    x, y = np.zeros(shape), np.zeros(shape)
    for row, (src_row, tgt_row) in enumerate(pairs):
        x[row] = x_src[src_row]
        y[row] = y_tgt[tgt_row]
    return x, y
def load_lexicon(filename, words_src, words_tgt, verbose=True):
    """Load a bilingual lexicon restricted to the given vocabularies.

    Each non-blank line of ``filename`` must hold exactly two
    whitespace-separated tokens: a source word and a target word. A line with
    any other number of tokens raises ``ValueError``.

    Args:
        filename: path of the UTF-8 lexicon file.
        words_src: list of source words (position = vector index).
        words_tgt: list of target words (position = vector index).
        verbose: if true, print the fraction of source words that are covered.

    Returns:
        ``(lexicon, n_source_words)`` where ``lexicon`` maps a source index to
        the set of its target indices (only for pairs whose two words are both
        in the vocabularies) and ``n_source_words`` is the number of distinct
        source words seen in the file, as a float.
    """
    lexicon = collections.defaultdict(set)
    idx_src, idx_tgt = idx(words_src), idx(words_tgt)
    vocab = set()
    # The context manager closes the file even when a line is malformed.
    with io.open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            word_src, word_tgt = line.split()
            if word_src in idx_src and word_tgt in idx_tgt:
                lexicon[idx_src[word_src]].add(idx_tgt[word_tgt])
            # Every source word counts towards the denominator of the
            # coverage, whether or not it could be mapped to an index.
            vocab.add(word_src)
    if verbose:
        # Guard against an empty lexicon file instead of dividing by zero.
        coverage = len(lexicon) / float(len(vocab)) if vocab else 0.0
        print("Coverage of source vocab: %.4f" % (coverage))
    return lexicon, float(len(vocab))
def load_pairs(filename, idx_src, idx_tgt, verbose=True):
    """Load training pairs of (source index, target index) from a text file.

    Each non-blank line of ``filename`` must hold a source word and a target
    word separated by a single space. A pair is kept only when both words are
    present in the index dictionaries.

    Args:
        filename: path of the UTF-8 pair file.
        idx_src: dict mapping source words to indices.
        idx_tgt: dict mapping target words to indices.
        verbose: if true, print how many pairs were found and the coverage.

    Returns:
        List of ``(source_index, target_index)`` tuples in file order.
    """
    pairs = []
    tot = 0
    # The context manager closes the file even when a line is malformed.
    with io.open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            stripped = line.rstrip()
            if not stripped:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            a, b = stripped.split(' ')
            tot += 1
            if a in idx_src and b in idx_tgt:
                pairs.append((idx_src[a], idx_tgt[b]))
    if verbose:
        # Guard against an empty file instead of dividing by zero.
        coverage = (1.0 * len(pairs)) / tot if tot else 0.0
        print("Found pairs for training: %d - Total pairs in file: %d - Coverage of pairs: %.4f" % (len(pairs), tot, coverage))
    return pairs
def compute_nn_accuracy(x_src, x_tgt, lexicon, bsz=100, lexicon_size=-1):
    """Nearest-neighbour (cosine) retrieval accuracy on a lexicon.

    For every source index in ``lexicon`` the target row with the highest
    cosine similarity is retrieved; the prediction is correct when that row
    index belongs to ``lexicon[source_index]``.

    The input arrays are left untouched: similarities are normalised on the
    fly instead of rescaling ``x_src`` and ``x_tgt`` in place.

    Args:
        x_src: 2-D array of source vectors.
        x_tgt: 2-D array of target vectors.
        lexicon: mapping from source index to a set of valid target indices.
        bsz: number of source words scored per batch.
        lexicon_size: denominator of the accuracy; a negative value means
            ``len(lexicon)``.

    Returns:
        Number of correct predictions divided by ``lexicon_size``.
    """
    if lexicon_size < 0:
        lexicon_size = len(lexicon)
    idx_src = list(lexicon.keys())
    acc = 0.0
    # Per-row scale of the targets; dividing the scores by it is equivalent to
    # normalising x_tgt, without copying or modifying the caller's matrix.
    tgt_norms = np.linalg.norm(x_tgt, axis=1)[:, np.newaxis] + 1e-8
    for i in range(0, len(idx_src), bsz):
        e = min(i + bsz, len(idx_src))
        # Indexing with a list of indices yields a fresh array, so normalising
        # the batch does not touch x_src.
        batch = x_src[idx_src[i:e]]
        batch = batch / (np.linalg.norm(batch, axis=1)[:, np.newaxis] + 1e-8)
        scores = np.dot(x_tgt, batch.T) / tgt_norms
        pred = scores.argmax(axis=0)
        for j in range(i, e):
            if pred[j - i] in lexicon[idx_src[j]]:
                acc += 1.0
    return acc / lexicon_size
def compute_csls_accuracy(x_src, x_tgt, lexicon, lexicon_size=-1, k=10, bsz=1024):
    """Retrieval accuracy on a lexicon using the CSLS-style score below.

    For a source word ``s`` and a target word ``t`` the score is
    ``2 * cos(s, t) - r(t)``, where ``r(t)`` is the mean cosine similarity
    between ``t`` and its ``k`` most similar source vectors. The target with
    the highest score is the prediction; it is correct when its index belongs
    to ``lexicon[source_index]``.

    The input arrays are left untouched: similarities are normalised on the
    fly instead of rescaling ``x_src`` and ``x_tgt`` in place.

    Args:
        x_src: 2-D array of source vectors.
        x_tgt: 2-D array of target vectors.
        lexicon: mapping from source index to a set of valid target indices.
        lexicon_size: denominator of the accuracy; a negative value means
            ``len(lexicon)``.
        k: number of neighbours averaged in ``r(t)``; clamped to the number of
            source vectors.
        bsz: number of target rows processed per batch.

    Returns:
        Number of correct predictions divided by ``lexicon_size``.
    """
    if lexicon_size < 0:
        lexicon_size = len(lexicon)
    idx_src = list(lexicon.keys())
    # Row scales used to turn dot products into cosines without modifying or
    # copying the caller's matrices.
    src_norms = np.linalg.norm(x_src, axis=1) + 1e-8
    tgt_norms = np.linalg.norm(x_tgt, axis=1) + 1e-8
    # np.partition rejects k larger than the axis length, so clamp it.
    n_neighbors = min(k, x_src.shape[0])

    queries = x_src[idx_src] / src_norms[idx_src][:, np.newaxis]
    similarities = 2 * np.dot(queries, x_tgt.T) / tgt_norms[np.newaxis, :]

    penalty = np.zeros(x_tgt.shape[0])
    for start in range(0, x_tgt.shape[0], bsz):
        stop = min(start + bsz, x_tgt.shape[0])
        cosines = np.dot(x_tgt[start:stop, :], x_src.T)
        cosines = cosines / tgt_norms[start:stop, np.newaxis] / src_norms[np.newaxis, :]
        # The last n_neighbors columns after partitioning are the largest ones.
        top = np.partition(cosines, -n_neighbors, axis=1)[:, -n_neighbors:]
        penalty[start:stop] = np.mean(top, axis=1)
    similarities -= penalty[np.newaxis, :]

    nn = np.argmax(similarities, axis=1).tolist()
    correct = 0.0
    for row, src_index in enumerate(idx_src):
        if nn[row] in lexicon[src_index]:
            correct += 1.0
    return correct / lexicon_size
================================================
FILE: classification-example.sh
================================================
#!/usr/bin/env bash
#
# Copyright (c) 2016-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
# Print the lines of the given files (or of stdin when no file is given) in
# random order, using Perl's List::Util shuffle.
myshuf() {
perl -MList::Util=shuffle -e 'print shuffle(<>);' "$@";
}
# Filter stdin -> stdout that prepares text for fastText:
#   * lowercases everything and prefixes every line with "__label__"
#     (presumably each CSV line starts with its class id -- verify against
#     the dataset);
#   * surrounds ' . , ( ) ! ? with spaces, deletes double quotes, and
#     replaces "<br />", ";" and ":" with a space;
#   * squeezes runs of spaces and shuffles the resulting lines with myshuf.
normalize_text() {
tr '[:upper:]' '[:lower:]' | sed -e 's/^/__label__/g' | \
sed -e "s/'/ ' /g" -e 's/"//g' -e 's/\./ \. /g' -e 's/<br \/>/ /g' \
-e 's/,/ , /g' -e 's/(/ ( /g' -e 's/)/ ) /g' -e 's/\!/ \! /g' \
-e 's/\?/ \? /g' -e 's/\;/ /g' -e 's/\:/ /g' | tr -s " " | myshuf
}
# Output directory for models/predictions and download directory for data.
RESULTDIR=result
DATADIR=data
mkdir -p "${RESULTDIR}"
mkdir -p "${DATADIR}"
# Download and preprocess the dataset only when the training file is missing.
if [ ! -f "${DATADIR}/dbpedia.train" ]
then
wget -c "https://drive.google.com/uc?export=download&id=0Bz8a_Dbh9QhbQ2Vic1kxMmZZQ1k" -O "${DATADIR}/dbpedia_csv.tar.gz"
tar -xzvf "${DATADIR}/dbpedia_csv.tar.gz" -C "${DATADIR}"
cat "${DATADIR}/dbpedia_csv/train.csv" | normalize_text > "${DATADIR}/dbpedia.train"
cat "${DATADIR}/dbpedia_csv/test.csv" | normalize_text > "${DATADIR}/dbpedia.test"
fi
# Build the fasttext binary in the current directory.
make
# Train a supervised classifier, evaluate it on the test set, then write the
# predicted label of every test line to a file.
./fasttext supervised -input "${DATADIR}/dbpedia.train" -output "${RESULTDIR}/dbpedia" -dim 10 -lr 0.1 -wordNgrams 2 -minCount 1 -bucket 10000000 -epoch 5 -thread 4
./fasttext test "${RESULTDIR}/dbpedia.bin" "${DATADIR}/dbpedia.test"
./fasttext predict "${RESULTDIR}/dbpedia.bin" "${DATADIR}/dbpedia.test" > "${RESULTDIR}/dbpedia.test.predict"
================================================
FILE: classification-results.sh
================================================
#!/usr/bin/env bash
#
# Copyright (c) 2016-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
# This script produces the results from Table 1 in the following paper:
# Bag of Tricks for Efficient Text Classification, arXiv 1607.01759, 2016
# Print the lines of the given files (or of stdin when no file is given) in
# random order, using Perl's List::Util shuffle.
myshuf() {
perl -MList::Util=shuffle -e 'print shuffle(<>);' "$@";
}
# Filter stdin -> stdout that prepares text for fastText:
#   * lowercases everything and prefixes every line with "__label__"
#     (presumably each CSV line starts with its class id -- verify against
#     the datasets);
#   * surrounds ' . , ( ) ! ? with spaces, deletes double quotes, and
#     replaces "<br />", ";" and ":" with a space;
#   * squeezes runs of spaces and shuffles the resulting lines with myshuf.
normalize_text() {
tr '[:upper:]' '[:lower:]' | sed -e 's/^/__label__/g' | \
sed -e "s/'/ ' /g" -e 's/"//g' -e 's/\./ \. /g' -e 's/<br \/>/ /g' \
-e 's/,/ , /g' -e 's/(/ ( /g' -e 's/)/ ) /g' -e 's/\!/ \! /g' \
-e 's/\?/ \? /g' -e 's/\;/ /g' -e 's/\:/ /g' | tr -s " " | myshuf
}
# Dataset names; DATASET, ID and LR are parallel arrays indexed together.
DATASET=(
  ag_news
  sogou_news
  dbpedia
  yelp_review_polarity
  yelp_review_full
  yahoo_answers
  amazon_review_full
  amazon_review_polarity
)

# Google Drive file id of each dataset archive.
ID=(
  0Bz8a_Dbh9QhbUDNpeUdjb0wxRms # ag_news
  0Bz8a_Dbh9QhbUkVqNEszd0pHaFE # sogou_news
  0Bz8a_Dbh9QhbQ2Vic1kxMmZZQ1k # dbpedia
  0Bz8a_Dbh9QhbNUpYQ2N3SGlFaDg # yelp_review_polarity
  0Bz8a_Dbh9QhbZlU4dXhHTFhZQU0 # yelp_review_full
  0Bz8a_Dbh9Qhbd2JNdDBsQUdocVU # yahoo_answers
  0Bz8a_Dbh9QhbZVhsUnRWRDhETzA # amazon_review_full
  0Bz8a_Dbh9QhbaW12WVVZS2drcnM # amazon_review_polarity
)

# These learning rates were chosen by validation on a subset of the training set.
LR=( 0.25 0.5 0.5 0.1 0.1 0.1 0.05 0.05 )

RESULTDIR=result
DATADIR=data

mkdir -p "${RESULTDIR}"
mkdir -p "${DATADIR}"

# Small datasets first
for i in {0..0}
do
  echo "Downloading dataset ${DATASET[i]}"
  if [ ! -f "${DATADIR}/${DATASET[i]}.train" ]
  then
    wget -c "https://drive.google.com/uc?export=download&id=${ID[i]}" -O "${DATADIR}/${DATASET[i]}_csv.tar.gz"
    tar -xzvf "${DATADIR}/${DATASET[i]}_csv.tar.gz" -C "${DATADIR}"
    cat "${DATADIR}/${DATASET[i]}_csv/train.csv" | normalize_text > "${DATADIR}/${DATASET[i]}.train"
    cat "${DATADIR}/${DATASET[i]}_csv/test.csv" | normalize_text > "${DATADIR}/${DATASET[i]}.test"
  fi
done

# Large datasets require a bit more work due to the extra request page
for i in {1..7}
do
  echo "Downloading dataset ${DATASET[i]}"
  if [ ! -f "${DATADIR}/${DATASET[i]}.train" ]
  then
    # First request: store the cookies and the confirmation page.
    curl -c /tmp/cookies "https://drive.google.com/uc?export=download&id=${ID[i]}" > /tmp/intermezzo.html
    # Second request: follow the download link found in the confirmation page.
    # The link is HTML-escaped, so "&amp;" must be decoded back to "&" for the
    # query string to be valid ("\&" is a literal ampersand in a sed replacement).
    curl -L -b /tmp/cookies "https://drive.google.com$(cat /tmp/intermezzo.html | grep -Po 'uc-download-link" [^>]* href="\K[^"]*' | sed 's/&amp;/\&/g')" > "${DATADIR}/${DATASET[i]}_csv.tar.gz"
    tar -xzvf "${DATADIR}/${DATASET[i]}_csv.tar.gz" -C "${DATADIR}"
    cat "${DATADIR}/${DATASET[i]}_csv/train.csv" | normalize_text > "${DATADIR}/${DATASET[i]}.train"
    cat "${DATADIR}/${DATASET[i]}_csv/test.csv" | normalize_text > "${DATADIR}/${DATASET[i]}.test"
  fi
done

# Build the fasttext binary in the current directory.
make

# Train on every dataset with its own learning rate (training output is
# discarded) and print the evaluation on the matching test set.
for i in {0..7}
do
  echo "Working on dataset ${DATASET[i]}"
  ./fasttext supervised -input "${DATADIR}/${DATASET[i]}.train" \
    -output "${RESULTDIR}/${DATASET[i]}" -dim 10 -lr "${LR[i]}" -wordNgrams 2 \
    -minCount 1 -bucket 10000000 -epoch 5 -thread 4 > /dev/null
  ./fasttext test "${RESULTDIR}/${DATASET[i]}.bin" \
    "${DATADIR}/${DATASET[i]}.test"
done
================================================
FILE: crawl/README.md
================================================
## Preprocessing Common Crawl
This code downloads, preprocesses and splits per language the data from [Common Crawl](http://commoncrawl.org/).
This script uses the scripts and language identifier of [1].
This code inherits its requirements from [fastText](https://github.com/facebookresearch/fastText).
Set the variable WET_PATHS_URL to the crawl you want to process.
Please also set the variables NUM_LANGID and NUM_DEDUP in `download_crawl.sh` according to the capacity of your machine.
Langid processes are mostly limited by CPU usage, while dedup processes are likely to be limited by RAM usage (each uses 2GB of RAM).
### Reference
If you use this code, please cite:
[1] E. Grave*, P. Bojanowski*, P. Gupta, A. Joulin, T. Mikolov, [*Learning Word Vectors for 157 Languages*](https://arxiv.org/abs/1802.06893)
```
@inproceedings{grave2018learning,
title={Learning Word Vectors for 157 Languages},
author={Grave, Edouard and Bojanowski, Piotr and Gupta, Prakhar and Joulin, Armand and Mikolov, Tomas},
booktitle={Proceedings of the International Conference on Language Resources and Evaluation (LREC 2018)},
year={2018}
}
```
================================================
FILE: crawl/dedup.cc
================================================
// Copyright (c) 2018-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the MIT license found in the
// LICENSE file in the root directory of this source tree.
#include <cstdint>
#include <iostream>
#include <fstream>
#include <string>
#include <vector>
// 64-bit FNV-1a hash of the `sz` bytes starting at `data`.
// `h` is the initial hash state; it defaults to the standard FNV offset
// basis, and passing a different value yields a differently-seeded hash.
uint64_t fnv1a_64(uint8_t *data, size_t sz, uint64_t h=14695981039346656037ull)
{
  const uint64_t kFnvPrime = 1099511628211ull;
  uint8_t* const stop = data + sz;
  while (data != stop) {
    // FNV-1a: xor the byte in first, then multiply by the prime.
    h = (h ^ static_cast<uint64_t>(*data)) * kFnvPrime;
    ++data;
  }
  return h;
}
// Reads lines from stdin and writes to stdout every line that has probably
// not been seen before. Membership is tracked with a Bloom filter, so a new
// line can occasionally be dropped as a false duplicate, but a repeated line
// is never printed twice.
int main(int argc, char** argv)
{
  // Seeds for the independent hash functions; only the first `num_hashes`
  // entries are used.
  uint64_t init_values[] = {
    14695981039346656037ull,
    9425296925403859339ull,
    13716263814064014149ull,
    3525492407291847033ull,
    8607404175481815707ull,
    9818874561736458749ull,
    10026508429719773353ull,
    3560712257386009938ull
  };
  // 2^34 bits in the filter (2 GiB, since std::vector<bool> is bit-packed).
  size_t n = 1ull<<34, num_hashes = 2;
  std::vector<bool> seen(n);
  std::ios_base::sync_with_stdio(false);
  for (std::string line; std::getline(std::cin, line);) {
    // `b` stays true only if every hash position was already set.
    bool b = true;
    for (size_t i = 0; i < num_hashes; i++) {
      uint64_t h = fnv1a_64((uint8_t*) line.data(), line.length(), init_values[i]) % n;
      b = b && seen[h];
      seen[h] = true;
    }
    if (!b) {
      // '\n' instead of std::endl: the output bytes are identical, but the
      // stream is no longer flushed after every single line.
      std::cout << line << '\n';
    }
  }
  return 0;
}
================================================
FILE: crawl/download_crawl.sh
================================================
#!/usr/bin/env sh
# Copyright (c) 2018-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
# Downloads a Common Crawl snapshot, splits it per language with the fastText
# language identifier, then filters and deduplicates every language shard.
# Expects process_wet_file.sh, filter_dedup.sh, dedup.cc and filter_utf8.cc in
# the current directory.

set -e

# Set this variable to the crawl you want to process.
WET_PATHS_URL="https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2018-34/wet.paths.gz"

# Set NUM_LANGID and NUM_DEDUP according to the capacity of your machine.
# Please note that each dedup process uses 2GB of RAM, while langid is
# mostly limited by cpu usage.
NUM_LANGID=12
NUM_DEDUP=8
URL="https://commoncrawl.s3.amazonaws.com/"

if [ ! -d fastText ]; then
  git clone https://github.com/facebookresearch/fastText.git
fi

if [ ! -f fastText/fasttext ]; then
  cd fastText
  make
  cd ..
fi

if [ ! -f lid.176.bin ]; then
  wget https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin
fi

# mkdir -p is a no-op when the directory already exists.
mkdir -p tmp
mkdir -p shard

if [ ! -f wet.paths ]; then
  # -O overwrites a stale archive left by an interrupted run; without it wget
  # would save the new download as wet.paths.gz.1 and gunzip would pick up
  # the old file.
  wget -O wet.paths.gz "${WET_PATHS_URL}"
  gunzip wet.paths.gz
fi

## Language identification
# -I already runs one command per input line; combining it with -n is
# rejected as mutually exclusive by xargs, so -n 1 is not passed.
cat wet.paths | xargs -P "${NUM_LANGID}" -I '{}' sh process_wet_file.sh "${URL}{}"

## Deduplication
g++ -std=c++11 -O3 -o dedup dedup.cc
g++ -std=c++11 -O3 -o filter_utf8 filter_utf8.cc
find shard -name '*.txt' | xargs -P "${NUM_DEDUP}" -I '{}' sh filter_dedup.sh "{}"

## Example of data filtering + tokenization
# Skip the clone when the checkout already exists, otherwise git fails and
# `set -e` aborts every re-run of this script.
if [ ! -d mosesdecoder ]; then
  git clone https://github.com/moses-smt/mosesdecoder.git
fi
perl mosesdecoder/scripts/tokenizer/tokenizer.perl -l es < shard/es.dedup > shard/es.tok
================================================
FILE: crawl/filter_dedup.sh
================================================
#!/usr/bin/env sh
# Copyright (c) 2018-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
# Usage: filter_dedup.sh <path/to/LANG.txt>
# Drops invalid UTF-8 lines and duplicated lines from shard/LANG.txt and
# writes the result to shard/LANG.dedup. Expects the ./filter_utf8 and
# ./dedup binaries in the current directory.

set -e

# POSIX form of basename (the --suffix option is a GNU extension).
LG=$(basename "${1}" ".txt")

./filter_utf8 < "shard/${LG}.txt" \
  | ./dedup > "shard/${LG}.dedup"
================================================
FILE: crawl/filter_utf8.cc
================================================
// Copyright (c) 2018-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the MIT license found in the
// LICENSE file in the root directory of this source tree.
#include <cstdint>
#include <iostream>
#include <string>
// Returns true when each of the first n bytes at `str` is a UTF-8
// continuation byte, i.e. has the bit pattern 10xxxxxx. Returns true when
// n <= 0 because no byte is inspected.
bool continuation(uint8_t* str, int n)
{
  int checked = 0;
  while (checked < n) {
    // The two most significant bits of a continuation byte are "10".
    if ((str[checked] >> 6) != 0x2) return false;
    ++checked;
  }
  return true;
}
// Detects 4-byte sequences that encode a code point above U+10FFFF, the
// largest valid one. U+10FFFF is encoded in UTF-8 as:
//  * 11110.100 10.001111 10.111111 10.111111
// so a sequence is out of range when its first byte is larger than 0xf4, or
// when it is equal to 0xf4 and the second byte is larger than 0x8f.
// The second byte is only read when the first one is exactly 0xf4.
bool invalid(uint8_t* str)
{
  const uint8_t first = str[0];
  if (first > 0xf4) return true;
  if (first < 0xf4) return false;
  return str[1] >= 0x90;
}
// Detects surrogate halves, i.e. the range U+D800 through U+DFFF, whose
// bounds are encoded in UTF-8 as:
//  * 1110.1101 10.100000 10.000000
//  * 1110.1101 10.111111 10.111111
// A sequence is therefore a surrogate when its first byte is equal to 0xed
// and the sixth bit (0x20) of its second byte is set.
// The second byte is only read when the first one is 0xed.
bool surrogate(uint8_t* str)
{
  if (str[0] != 0xed) return false;
  return (str[1] & 0x20) != 0;
}
// A 2-byte sequence is overlong when the leading 4 payload bits (noted as y)
// are all zero: 110.yyyyx 10xxxxxx
bool overlong_2(uint8_t* str)
{
  const uint8_t leading_bits = str[0] & 0x1e;
  return leading_bits == 0;
}
// A 3-byte sequence is overlong when the leading 5 payload bits (noted as y)
// are all zero: 1110.yyyy 10.yxxxxx 10.xxxxxx
// The second byte is only read when the payload bits of the first are zero.
bool overlong_3(uint8_t* str)
{
  if ((str[0] & 0x0f) != 0) return false;
  return (str[1] & 0x20) == 0;
}
// A 4-byte sequence is overlong when the leading 5 payload bits (noted as y)
// are all zero: 11110.yyy 10.yyxxxx 10.xxxxxx 10.xxxxxx
// The second byte is only read when the payload bits of the first are zero.
bool overlong_4(uint8_t* str)
{
  if ((str[0] & 0x07) != 0) return false;
  return (str[1] & 0x30) == 0;
}
// Returns true when the `length` bytes starting at `str` are well-formed
// UTF-8. A sequence is rejected when its lead byte is not a legal lead byte,
// when it is truncated, when a trailing byte is not a continuation byte, or
// when it is overlong, a surrogate half, or beyond U+10FFFF.
// An empty input is valid.
bool valid_utf8(uint8_t* str, size_t length)
{
  uint8_t* const end = str + length;
  uint8_t* cursor = str;
  while (cursor < end) {
    const uint8_t lead = cursor[0];

    // Length announced by the lead byte; 0 marks a byte that cannot start a
    // sequence (a stray continuation byte or 11111xxx).
    int width = 0;
    if (lead < 0x80) {
      width = 1;  // 0.xxxxxxx
    } else if ((lead & 0xe0) == 0xc0) {
      width = 2;  // 110.xxxxx 10.xxxxxx
    } else if ((lead & 0xf0) == 0xe0) {
      width = 3;  // 1110.xxxx 10.xxxxxx 10.xxxxxx
    } else if ((lead & 0xf8) == 0xf0) {
      width = 4;  // 11110.xxx 10.xxxxxx 10.xxxxxx 10.xxxxxx
    }
    if (width == 0) return false;

    if (width > 1) {
      // The whole sequence must fit in the buffer before its bytes are read.
      if (cursor + (width - 1) >= end) return false;
      if (!continuation(cursor + 1, width - 1)) return false;
    }

    switch (width) {
      case 2:
        if (overlong_2(cursor)) return false;
        break;
      case 3:
        if (overlong_3(cursor)) return false;
        if (surrogate(cursor)) return false;
        break;
      case 4:
        if (overlong_4(cursor)) return false;
        if (invalid(cursor)) return false;
        break;
      default:
        break;
    }
    cursor += width;
  }
  return true;
}
// Copies stdin to stdout line by line, dropping every line that is not
// well-formed UTF-8 according to valid_utf8.
int main(int argc, char** argv)
{
  std::ios_base::sync_with_stdio(false);
  for (std::string line; std::getline(std::cin, line);) {
    if (valid_utf8((uint8_t*) line.data(), line.length())) {
      // '\n' instead of std::endl: the output bytes are identical, but the
      // stream is no longer flushed after every single line.
      std::cout << line << '\n';
    }
  }
  return 0;
}
================================================
FILE: crawl/process_wet_file.sh
================================================
#!/usr/bin/env sh
# Copyright (c) 2018-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
# Usage: process_wet_file.sh <URL of a .warc.wet.gz file>
# Downloads one WET file into tmp/, runs language identification on every
# line and appends the lines that pass the filter to shard/<lang>.txt.
# Expects fastText/fasttext and lid.176.bin in the current directory.

set -e

URL=$1

# POSIX form of basename (the --suffix option is a GNU extension).
FILENAME=$(basename "${URL}" ".warc.wet.gz")

# Remove the temporary files on exit, including when a step fails and
# `set -e` aborts the script; otherwise failed runs leave large files in tmp/.
trap 'rm -f "tmp/${FILENAME}.warc.wet.gz" "tmp/${FILENAME}.warc.wet" "tmp/${FILENAME}.lid"' EXIT

echo "Processing ${FILENAME}."

wget -q -P tmp "${URL}"

#echo "Extracting ${FILENAME}.warc.wet.gz"
gunzip "tmp/${FILENAME}.warc.wet.gz"

#echo "Language identification for ${FILENAME}.warc.wet"
fastText/fasttext predict-prob lid.176.bin "tmp/${FILENAME}.warc.wet" > "tmp/${FILENAME}.lid"

#echo "Splitting ${FILENAME}.warc.wet per language"
# Each pasted line is "<label> <probability> <text>". Keep lines longer than
# 100 characters whose probability exceeds 0.8 (0.4 for __label__hr), strip
# the "__label__" prefix to get the language code, blank the two prediction
# fields and append the line to the shard of that language.
paste "tmp/${FILENAME}.lid" "tmp/${FILENAME}.warc.wet" | \
  awk '($2 > 0.8 || ($1=="__label__hr" && $2 > 0.4)) && length() > 100 {lang = substr($1, 10); $1=""; $2=""; print $0 >> "shard/"lang".txt"}'

#echo "Removing tmp files"
# The EXIT trap above removes tmp/${FILENAME}.lid and tmp/${FILENAME}.warc.wet.
================================================
FILE: docs/aligned-vectors.md
================================================
---
id: aligned-vectors
title: Aligned word vectors
---
We are publishing aligned word vectors for 44 languages based on the pre-trained vectors computed on [*Wikipedia*](https://www.wikipedia.org) using fastText.
The alignments are performed with the RCSLS method described in [*Joulin et al (2018)*](https://arxiv.org/abs/1804.07745).
### Vectors
The aligned vectors can be downloaded from:
|||||
|-|-|-|-|
| Afrikaans: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.af.align.vec) | Arabic: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.ar.align.vec) | Bulgarian: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.bg.align.vec) | Bengali: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.bn.align.vec) |
| Bosnian: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.bs.align.vec) | Catalan: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.ca.align.vec) | Czech: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.cs.align.vec) | Danish: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.da.align.vec) |
| German: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.de.align.vec) | Greek: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.el.align.vec) | English: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.en.align.vec) | Spanish: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.es.align.vec) |
| Estonian: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.et.align.vec) | Persian: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.fa.align.vec) | Finnish: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.fi.align.vec) | French: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.fr.align.vec) |
| Hebrew: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.he.align.vec) | Hindi: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.hi.align.vec) | Croatian: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.hr.align.vec) | Hungarian: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.hu.align.vec) |
| Indonesian: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.id.align.vec) | Italian: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.it.align.vec) | Korean: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.ko.align.vec) | Lithuanian: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.lt.align.vec) |
| Latvian: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.lv.align.vec) | Macedonian: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.mk.align.vec) | Malay: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.ms.align.vec) | Dutch: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.nl.align.vec) |
| Norwegian: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.no.align.vec) | Polish: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.pl.align.vec) | Portuguese: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.pt.align.vec) | Romanian: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.ro.align.vec) |
| Russian: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.ru.align.vec) | Slovak: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.sk.align.vec) | Slovenian: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.sl.align.vec) | Albanian: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.sq.align.vec) |
| Swedish: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.sv.align.vec) | Tamil: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.ta.align.vec) | Thai: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.th.align.vec) | Tagalog: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.tl.align.vec) |
| Turkish: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.tr.align.vec) | Ukrainian: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.uk.align.vec) | Vietnamese: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.vi.align.vec) | Chinese: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.zh.align.vec) |
### Format
The word vectors come in the default text format of fastText.
The first line gives the number of vectors and their dimension.
The other lines contain a word followed by its vector. Each value is space separated.
### License
The word vectors are distributed under the [*Creative Commons Attribution-Share-Alike License 3.0*](https://creativecommons.org/licenses/by-sa/3.0/).
### References
If you use these word vectors, please cite the following papers:
[1] A. Joulin, P. Bojanowski, T. Mikolov, H. Jegou, E. Grave, [*Loss in Translation: Learning Bilingual Word Mapping with a Retrieval Criterion*](https://arxiv.org/abs/1804.07745)
```markup
@InProceedings{joulin2018loss,
title={Loss in Translation: Learning Bilingual Word Mapping with a Retrieval Criterion},
author={Joulin, Armand and Bojanowski, Piotr and Mikolov, Tomas and J\'egou, Herv\'e and Grave, Edouard},
year={2018},
booktitle={Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing},
}
```
[2] P. Bojanowski\*, E. Grave\*, A. Joulin, T. Mikolov, [*Enriching Word Vectors with Subword Information*](https://arxiv.org/abs/1607.04606)
```markup
@article{bojanowski2017enriching,
title={Enriching Word Vectors with Subword Information},
author={Bojanowski, Piotr and Grave, Edouard and Joulin, Armand and Mikolov, Tomas},
journal={Transactions of the Association for Computational Linguistics},
volume={5},
year={2017},
issn={2307-387X},
pages={135--146}
}
```
================================================
FILE: docs/api.md
================================================
---
id: api
title: API
---
We automatically generate our [API documentation](/docs/en/html/index.html) with doxygen.
================================================
FILE: docs/autotune.md
================================================
---
id: autotune
title: Automatic hyperparameter optimization
---
As we saw in [the tutorial](/docs/en/supervised-tutorial.html#more-epochs-and-larger-learning-rate), finding the best hyperparameters is crucial for building efficient models. However, searching for the best hyperparameters manually is difficult. Parameters depend on one another, and the effect of each parameter varies from one dataset to another.
FastText's autotune feature allows you to find automatically the best hyperparameters for your dataset.
# How to use it
In order to activate hyperparameter optimization, we must provide a validation file with the `-autotune-validation` argument.
For example, using the same data as our [tutorial example](/docs/en/supervised-tutorial.html#our-first-classifier), the autotune can be used in the following way:
<!--DOCUSAURUS_CODE_TABS-->
<!--Command line-->
```sh
>> ./fasttext supervised -input cooking.train -output model_cooking -autotune-validation cooking.valid
```
<!--Python-->
```py
>>> import fasttext
>>> model = fasttext.train_supervised(input='cooking.train', autotuneValidationFile='cooking.valid')
```
<!--END_DOCUSAURUS_CODE_TABS-->
Then, fastText will search for the hyperparameters that give the best f1-score on the `cooking.valid` file:
```sh
Progress: 100.0% Trials: 27 Best score: 0.406763 ETA: 0h 0m 0s
```
Now we can test the obtained model with:
<!--DOCUSAURUS_CODE_TABS-->
<!--Command line-->
```sh
>> ./fasttext test model_cooking.bin cooking.valid
N 3000
P@1 0.666
R@1 0.288
```
<!--Python-->
```py
>>> model.test("cooking.valid")
(3000L, 0.666, 0.288)
```
<!--END_DOCUSAURUS_CODE_TABS-->
By default, the search will take 5 minutes. You can set the timeout in seconds with the `-autotune-duration` argument. For example, if you want to set the limit to 10 minutes:
<!--DOCUSAURUS_CODE_TABS-->
<!--Command line-->
```sh
>> ./fasttext supervised -input cooking.train -output model_cooking -autotune-validation cooking.valid -autotune-duration 600
```
<!--Python-->
```py
>>> import fasttext
>>> model = fasttext.train_supervised(input='cooking.train', autotuneValidationFile='cooking.valid', autotuneDuration=600)
```
<!--END_DOCUSAURUS_CODE_TABS-->
While autotuning, fastText displays the best f1-score found so far. If we decide to stop the tuning before the time limit, we can send one `SIGINT` signal (via `CTRL-C` for example). FastText will then finish the current training, and retrain with the best parameters found so far.
# Constrain model size
As you may know, fastText can compress the model with [quantization](/docs/en/cheatsheet.html#quantization). However, this compression task comes with its own [hyperparameters](/docs/en/options.html) (`-cutoff`, `-retrain`, `-qnorm`, `-qout`, `-dsub`) that have a consequence on the accuracy and the size of the final model.
Fortunately, autotune can also find the hyperparameters for this compression task while targeting the desired model size. To this end, we can set the `-autotune-modelsize` argument:
<!--DOCUSAURUS_CODE_TABS-->
<!--Command line-->
```sh
>> ./fasttext supervised -input cooking.train -output model_cooking -autotune-validation cooking.valid -autotune-modelsize 2M
```
This will produce a `.ftz` file with the best accuracy having the desired size:
```sh
>> ls -la model_cooking.ftz
-rw-r--r--. 1 celebio users 1990862 Aug 25 05:39 model_cooking.ftz
>> ./fasttext test model_cooking.ftz cooking.valid
N 3000
P@1 0.57
R@1 0.246
```
<!--Python-->
```py
>>> import fasttext
>>> model = fasttext.train_supervised(input='cooking.train', autotuneValidationFile='cooking.valid', autotuneModelSize="2M")
```
If you save the model, you will obtain a model file with the desired size:
```py
>>> model.save_model("model_cooking.ftz")
>>> import os
>>> os.stat("model_cooking.ftz").st_size
1990862
>>> model.test("cooking.valid")
(3000L, 0.57, 0.246)
```
<!--END_DOCUSAURUS_CODE_TABS-->
# How to set the optimization metric?
<!--DOCUSAURUS_CODE_TABS-->
<!--Command line-->
<br />
By default, autotune will test the validation file you provide, exactly the same way as `./fasttext test model_cooking.bin cooking.valid` and try to optimize to get the highest [f1-score](https://en.wikipedia.org/wiki/F1_score).
But, if we want to optimize the score of a specific label, say `__label__baking`, we can set the `-autotune-metric` argument:
```sh
>> ./fasttext supervised -input cooking.train -output model_cooking -autotune-validation cooking.valid -autotune-metric f1:__label__baking
```
This is equivalent to manually optimizing the f1-score we get when we test with `./fasttext test-label model_cooking.bin cooking.valid | grep __label__baking` on the command line.
Sometimes, you may be interested in predicting more than one label. For example, if you were optimizing the hyperparameters manually to get the best score to predict two labels, you would test with `./fasttext test model_cooking.bin cooking.valid 2`. You can also tell autotune to optimize the parameters by testing two labels with the `-autotune-predictions` argument.
<!--Python-->
<br />
By default, autotune will test the validation file you provide, exactly the same way as `model.test("cooking.valid")` and try to optimize to get the highest [f1-score](https://en.wikipedia.org/wiki/F1_score).
But, if we want to optimize the score of a specific label, say `__label__baking`, we can set the `autotuneMetric` argument:
```py
>>> import fasttext
>>> model = fasttext.train_supervised(input='cooking.train', autotuneValidationFile='cooking.valid', autotuneMetric="f1:__label__baking")
```
This is equivalent to manually optimizing the f1-score we get when we test with `model.test_label('cooking.valid')['__label__baking']`.
Sometimes, you may be interested in predicting more than one label. For example, if you were optimizing the hyperparameters manually to get the best score to predict two labels, you would test with `model.test("cooking.valid", k=2)`. You can also tell autotune to optimize the parameters by testing two labels with the `autotunePredictions` argument.
<!--END_DOCUSAURUS_CODE_TABS-->
You can also force autotune to optimize for the best precision for a given recall, or the best recall for a given precision, for all labels, or for a specific label:
For example, in order to get the best precision at recall = `30%`:
```sh
>> ./fasttext supervised [...] -autotune-metric precisionAtRecall:30
```
And to get the best precision at recall = `30%` for the label `__label__baking`:
```sh
>> ./fasttext supervised [...] -autotune-metric precisionAtRecall:30:__label__baking
```
Similarly, you can use `recallAtPrecision`:
```sh
>> ./fasttext supervised [...] -autotune-metric recallAtPrecision:30
>> ./fasttext supervised [...] -autotune-metric recallAtPrecision:30:__label__baking
```
================================================
FILE: docs/cheatsheet.md
================================================
---
id: cheatsheet
title: Cheatsheet
---
## Word representation learning
In order to learn word vectors do:
```bash
$ ./fasttext skipgram -input data.txt -output model
```
## Obtaining word vectors
Print word vectors for a text file `queries.txt` containing words.
```bash
$ ./fasttext print-word-vectors model.bin < queries.txt
```
## Text classification
In order to train a text classifier do:
```bash
$ ./fasttext supervised -input train.txt -output model
```
Once the model has been trained, you can evaluate it by computing the precision and recall at k (P@k and R@k) on a test set using:
```bash
$ ./fasttext test model.bin test.txt 1
```
In order to obtain the k most likely labels for a piece of text, use:
```bash
$ ./fasttext predict model.bin test.txt k
```
In order to obtain the k most likely labels and their associated probabilities for a piece of text, use:
```bash
$ ./fasttext predict-prob model.bin test.txt k
```
If you want to compute vector representations of sentences or paragraphs, please use:
```bash
$ ./fasttext print-sentence-vectors model.bin < text.txt
```
## Quantization
In order to create a `.ftz` file with a smaller memory footprint do:
```bash
$ ./fasttext quantize -output model
```
All other commands, such as `test`, also work with this model:
```bash
$ ./fasttext test model.ftz test.txt
```
## Autotune
Activate hyperparameter optimization with `-autotune-validation` argument:
```bash
$ ./fasttext supervised -input train.txt -output model -autotune-validation valid.txt
```
Set timeout (in seconds):
```bash
$ ./fasttext supervised -input train.txt -output model -autotune-validation valid.txt -autotune-duration 600
```
Constrain the final model size:
```bash
$ ./fasttext supervised -input train.txt -output model -autotune-validation valid.txt -autotune-modelsize 2M
```
================================================
FILE: docs/crawl-vectors.md
================================================
---
id: crawl-vectors
title: Word vectors for 157 languages
---
We distribute pre-trained word vectors for 157 languages, trained on [*Common Crawl*](http://commoncrawl.org/) and [*Wikipedia*](https://www.wikipedia.org) using fastText.
These models were trained using CBOW with position-weights, in dimension 300, with character n-grams of length 5, a window of size 5 and 10 negatives.
We also distribute three new word analogy datasets, for French, Hindi and Polish.
### Download directly with command line or from python
In order to download with command line or from python code, you must have installed the python package as [described here](/docs/en/support.html#building-fasttext-python-module).
<!--DOCUSAURUS_CODE_TABS-->
<!--Command line-->
```bash
$ ./download_model.py en # English
Downloading https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
(19.78%) [=========> ]
```
Once the download is finished, use the model as usual:
```bash
$ ./fasttext nn cc.en.300.bin 10
Query word?
```
<!--Python-->
```py
>>> import fasttext.util
>>> fasttext.util.download_model('en', if_exists='ignore') # English
>>> ft = fasttext.load_model('cc.en.300.bin')
```
<!--END_DOCUSAURUS_CODE_TABS-->
#### 🤗 HuggingFace Integration
Word vectors for 157 languages are available on the Hugging Face Hub under the [`fasttext`](https://huggingface.co/models?library=fasttext) tag, and more documentation is available [here](https://huggingface.co/facebook/fasttext-en-vectors/blob/main/README.md).
```python
>>> import fasttext
>>> from huggingface_hub import hf_hub_download
>>> model_path = hf_hub_download(repo_id="facebook/fasttext-en-vectors", filename="model.bin")
>>> model = fasttext.load_model(model_path)
```
### Adapt the dimension
The pre-trained word vectors we distribute have dimension 300. If you need a smaller size, you can use our dimension reducer.
In order to use that feature, you must have installed the python package as [described here](/docs/en/support.html#building-fasttext-python-module).
For example, in order to get vectors of dimension 100:
<!--DOCUSAURUS_CODE_TABS-->
<!--Command line-->
```bash
$ ./reduce_model.py cc.en.300.bin 100
Loading model
Reducing matrix dimensions
Saving model
cc.en.100.bin saved
```
Then you can use the `cc.en.100.bin` model file as usual.
<!--Python-->
```py
>>> import fasttext
>>> import fasttext.util
>>> ft = fasttext.load_model('cc.en.300.bin')
>>> ft.get_dimension()
300
>>> fasttext.util.reduce_model(ft, 100)
>>> ft.get_dimension()
100
```
Then you can use `ft` model object as usual:
```py
>>> ft.get_word_vector('hello').shape
(100,)
>>> ft.get_nearest_neighbors('hello')
[(0.775576114654541, u'heyyyy'), (0.7686290144920349, u'hellow'), (0.7663413286209106, u'hello-'), (0.7579624056816101, u'heyyyyy'), (0.7495524287223816, u'hullo'), (0.7473770380020142, u'.hello'), (0.7407292127609253, u'Hiiiii'), (0.7402616739273071, u'hellooo'), (0.7399682402610779, u'hello.'), (0.7396857738494873, u'Heyyyyy')]
```
or save it for later use:
```py
>>> ft.save_model('cc.en.100.bin')
```
<!--END_DOCUSAURUS_CODE_TABS-->
### Format
The word vectors are available in both binary and text formats.
Using the binary models, vectors for out-of-vocabulary words can be obtained with
```
$ ./fasttext print-word-vectors wiki.it.300.bin < oov_words.txt
```
where the file oov_words.txt contains out-of-vocabulary words.
In the text format, each line contains a word followed by its vector.
Each value is space separated, and words are sorted by frequency in descending order.
These text models can easily be loaded in Python using the following code:
```python
import io
def load_vectors(fname):
    """Load fastText text-format word vectors into a dictionary.

    The first line of the file is a header holding two integers (vocabulary
    size and vector dimension). Every following line holds a word followed by
    its space-separated vector components.

    Args:
        fname: Path to the text (``.vec``) file.

    Returns:
        A dict mapping each word to its vector as a list of floats.

    Raises:
        ValueError: If the header line is not made of exactly two integers,
            or if a vector component cannot be parsed as a float.
    """
    data = {}
    # The context manager guarantees the file is closed, even on a parse error.
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        # Consume (and validate) the header so it is not parsed as a word.
        n, d = map(int, fin.readline().split())
        for line in fin:
            tokens = line.rstrip().split(' ')
            # On Python 3, map() is a lazy one-shot iterator; materialize it so
            # the stored vector can be indexed, measured and read repeatedly.
            data[tokens[0]] = list(map(float, tokens[1:]))
    return data
```
### Tokenization
We used the [*Stanford word segmenter*](https://nlp.stanford.edu/software/segmenter.html) for Chinese, [*Mecab*](http://taku910.github.io/mecab/) for Japanese and [*UETsegmenter*](https://github.com/phongnt570/UETsegmenter) for Vietnamese.
For languages using the Latin, Cyrillic, Hebrew or Greek scripts, we used the tokenizer from the [*Europarl*](http://www.statmt.org/europarl/) preprocessing tools.
For the remaining languages, we used the ICU tokenizer.
More information about the training of these models can be found in the article [*Learning Word Vectors for 157 Languages*](https://arxiv.org/abs/1802.06893).
### License
The word vectors are distributed under the [*Creative Commons Attribution-Share-Alike License 3.0*](https://creativecommons.org/licenses/by-sa/3.0/).
### References
If you use these word vectors, please cite the following paper:
E. Grave\*, P. Bojanowski\*, P. Gupta, A. Joulin, T. Mikolov, [*Learning Word Vectors for 157 Languages*](https://arxiv.org/abs/1802.06893)
```markup
@inproceedings{grave2018learning,
title={Learning Word Vectors for 157 Languages},
author={Grave, Edouard and Bojanowski, Piotr and Gupta, Prakhar and Joulin, Armand and Mikolov, Tomas},
booktitle={Proceedings of the International Conference on Language Resources and Evaluation (LREC 2018)},
year={2018}
}
```
### Evaluation datasets
The analogy evaluation datasets described in the paper are available here: [French](https://dl.fbaipublicfiles.com/fasttext/word-analogies/questions-words-fr.txt), [Hindi](https://dl.fbaipublicfiles.com/fasttext/word-analogies/questions-words-hi.txt), [Polish](https://dl.fbaipublicfiles.com/fasttext/word-analogies/questions-words-pl.txt).
### Models
The models can be downloaded from:
||||
|-|-|-|
| Afrikaans: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.af.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.af.300.vec.gz) | Albanian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sq.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sq.300.vec.gz) | Alemannic: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.als.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.als.300.vec.gz) |
| Amharic: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.am.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.am.300.vec.gz) | Arabic: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ar.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ar.300.vec.gz) | Aragonese: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.an.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.an.300.vec.gz) |
| Armenian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hy.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hy.300.vec.gz) | Assamese: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.as.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.as.300.vec.gz) | Asturian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ast.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ast.300.vec.gz) |
| Azerbaijani: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.az.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.az.300.vec.gz) | Bashkir: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ba.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ba.300.vec.gz) | Basque: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.eu.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.eu.300.vec.gz) |
| Bavarian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bar.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bar.300.vec.gz) | Belarusian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.be.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.be.300.vec.gz) | Bengali: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bn.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bn.300.vec.gz) |
| Bihari: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bh.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bh.300.vec.gz) | Bishnupriya Manipuri: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bpy.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bpy.300.vec.gz) | Bosnian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bs.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bs.300.vec.gz) |
| Breton: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.br.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.br.300.vec.gz) | Bulgarian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bg.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bg.300.vec.gz) | Burmese: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.my.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.my.300.vec.gz) |
| Catalan: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ca.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ca.300.vec.gz) | Cebuano: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ceb.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ceb.300.vec.gz) | Central Bicolano: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bcl.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bcl.300.vec.gz) |
| Chechen: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ce.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ce.300.vec.gz) | Chinese: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.zh.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.zh.300.vec.gz) | Chuvash: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.cv.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.cv.300.vec.gz) |
| Corsican: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.co.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.co.300.vec.gz) | Croatian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hr.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hr.300.vec.gz) | Czech: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.cs.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.cs.300.vec.gz) |
| Danish: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.da.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.da.300.vec.gz) | Divehi: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.dv.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.dv.300.vec.gz) | Dutch: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nl.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nl.300.vec.gz) |
| Eastern Punjabi: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pa.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pa.300.vec.gz) | Egyptian Arabic: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.arz.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.arz.300.vec.gz) | Emilian-Romagnol: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.eml.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.eml.300.vec.gz) |
| English: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz) | Erzya: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.myv.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.myv.300.vec.gz) | Esperanto: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.eo.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.eo.300.vec.gz) |
| Estonian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.et.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.et.300.vec.gz) | Fiji Hindi: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hif.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hif.300.vec.gz) | Finnish: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fi.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fi.300.vec.gz) |
| French: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fr.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fr.300.vec.gz) | Galician: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gl.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gl.300.vec.gz) | Georgian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ka.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ka.300.vec.gz) |
| German: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.de.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.de.300.vec.gz) | Goan Konkani: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gom.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gom.300.vec.gz) | Greek: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.el.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.el.300.vec.gz) |
| Gujarati: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gu.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gu.300.vec.gz) | Haitian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ht.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ht.300.vec.gz) | Hebrew: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.he.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.he.300.vec.gz) |
| Hill Mari: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mrj.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mrj.300.vec.gz) | Hindi: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hi.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hi.300.vec.gz) | Hungarian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hu.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hu.300.vec.gz) |
| Icelandic: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.is.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.is.300.vec.gz) | Ido: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.io.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.io.300.vec.gz) | Ilokano: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ilo.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ilo.300.vec.gz) |
| Indonesian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.id.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.id.300.vec.gz) | Interlingua: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ia.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ia.300.vec.gz) | Irish: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ga.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ga.300.vec.gz) |
| Italian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.it.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.it.300.vec.gz) | Japanese: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ja.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ja.300.vec.gz) | Javanese: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.jv.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.jv.300.vec.gz) |
| Kannada: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.kn.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.kn.300.vec.gz) | Kapampangan: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pam.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pam.300.vec.gz) | Kazakh: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.kk.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.kk.300.vec.gz) |
| Khmer: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.km.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.km.300.vec.gz) | Kirghiz: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ky.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ky.300.vec.gz) | Korean: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ko.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ko.300.vec.gz) |
| Kurdish (Kurmanji): [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ku.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ku.300.vec.gz) | Kurdish (Sorani): [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ckb.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ckb.300.vec.gz) | Latin: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.la.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.la.300.vec.gz) |
| Latvian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.lv.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.lv.300.vec.gz) | Limburgish: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.li.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.li.300.vec.gz) | Lithuanian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.lt.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.lt.300.vec.gz) |
| Lombard: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.lmo.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.lmo.300.vec.gz) | Low Saxon: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nds.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nds.300.vec.gz) | Luxembourgish: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.lb.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.lb.300.vec.gz) |
| Macedonian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mk.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mk.300.vec.gz) | Maithili: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mai.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mai.300.vec.gz) | Malagasy: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mg.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mg.300.vec.gz) |
| Malay: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ms.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ms.300.vec.gz) | Malayalam: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ml.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ml.300.vec.gz) | Maltese: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mt.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mt.300.vec.gz) |
| Manx: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gv.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gv.300.vec.gz) | Marathi: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mr.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mr.300.vec.gz) | Mazandarani: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mzn.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mzn.300.vec.gz) |
| Meadow Mari: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mhr.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mhr.300.vec.gz) | Minangkabau: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.min.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.min.300.vec.gz) | Mingrelian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.xmf.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.xmf.300.vec.gz) |
| Mirandese: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mwl.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mwl.300.vec.gz) | Mongolian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mn.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mn.300.vec.gz) | Nahuatl: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nah.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nah.300.vec.gz) |
| Neapolitan: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nap.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nap.300.vec.gz) | Nepali: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ne.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ne.300.vec.gz) | Newar: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.new.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.new.300.vec.gz) |
| North Frisian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.frr.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.frr.300.vec.gz) | Northern Sotho: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nso.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nso.300.vec.gz) | Norwegian (Bokmål): [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.no.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.no.300.vec.gz) |
| Norwegian (Nynorsk): [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nn.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nn.300.vec.gz) | Occitan: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.oc.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.oc.300.vec.gz) | Oriya: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.or.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.or.300.vec.gz) |
| Ossetian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.os.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.os.300.vec.gz) | Palatinate German: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pfl.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pfl.300.vec.gz) | Pashto: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ps.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ps.300.vec.gz) |
| Persian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fa.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fa.300.vec.gz) | Piedmontese: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pms.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pms.300.vec.gz) | Polish: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pl.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pl.300.vec.gz) |
| Portuguese: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pt.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pt.300.vec.gz) | Quechua: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.qu.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.qu.300.vec.gz) | Romanian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ro.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ro.300.vec.gz) |
| Romansh: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.rm.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.rm.300.vec.gz) | Russian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ru.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ru.300.vec.gz) | Sakha: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sah.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sah.300.vec.gz) |
| Sanskrit: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sa.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sa.300.vec.gz) | Sardinian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sc.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sc.300.vec.gz) | Scots: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sco.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sco.300.vec.gz) |
| Scottish Gaelic: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gd.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gd.300.vec.gz) | Serbian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sr.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sr.300.vec.gz) | Serbo-Croatian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sh.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sh.300.vec.gz) |
| Sicilian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.scn.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.scn.300.vec.gz) | Sindhi: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sd.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sd.300.vec.gz) | Sinhalese: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.si.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.si.300.vec.gz) |
| Slovak: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sk.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sk.300.vec.gz) | Slovenian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sl.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sl.300.vec.gz) | Somali: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.so.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.so.300.vec.gz) |
| Southern Azerbaijani: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.azb.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.azb.300.vec.gz) | Spanish: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.es.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.es.300.vec.gz) | Sundanese: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.su.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.su.300.vec.gz) |
| Swahili: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sw.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sw.300.vec.gz) | Swedish: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sv.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sv.300.vec.gz) | Tagalog: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tl.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tl.300.vec.gz) |
| Tajik: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tg.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tg.300.vec.gz) | Tamil: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ta.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ta.300.vec.gz) | Tatar: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tt.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tt.300.vec.gz) |
| Telugu: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.te.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.te.300.vec.gz) | Thai: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.th.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.th.300.vec.gz) | Tibetan: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bo.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bo.300.vec.gz) |
| Turkish: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tr.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tr.300.vec.gz) | Turkmen: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tk.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tk.300.vec.gz) | Ukrainian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.uk.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.uk.300.vec.gz) |
| Upper Sorbian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hsb.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hsb.300.vec.gz) | Urdu: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ur.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ur.300.vec.gz) | Uyghur: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ug.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ug.300.vec.gz) |
| Uzbek: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.uz.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.uz.300.vec.gz) | Venetian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.vec.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.vec.300.vec.gz) | Vietnamese: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.vi.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.vi.300.vec.gz) |
| Volapük: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.vo.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.vo.300.vec.gz) | Walloon: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.wa.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.wa.300.vec.gz) | Waray: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.war.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.war.300.vec.gz) |
| Welsh: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.cy.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.cy.300.vec.gz) | West Flemish: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.vls.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.vls.300.vec.gz) | West Frisian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fy.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fy.300.vec.gz) |
| Western Punjabi: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pnb.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pnb.300.vec.gz) | Yiddish: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.yi.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.yi.300.vec.gz) | Yoruba: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.yo.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.yo.300.vec.gz) |
| Zazaki: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.diq.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.diq.300.vec.gz) | Zeelandic: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.zea.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.zea.300.vec.gz) |
================================================
FILE: docs/dataset.md
================================================
---
id: dataset
title: Datasets
---
[Download YFCC100M Dataset](https://fb-public.box.com/s/htfdbrvycvroebv9ecaezaztocbcnsdn)
================================================
FILE: docs/english-vectors.md
================================================
---
id: english-vectors
title: English word vectors
---
This page gathers several pre-trained word vectors trained using fastText.
### Download pre-trained word vectors
Pre-trained word vectors learned on different sources can be downloaded below:
1. [wiki-news-300d-1M.vec.zip](https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip): 1 million word vectors trained on Wikipedia 2017, UMBC webbase corpus and statmt.org news dataset (16B tokens).
2. [wiki-news-300d-1M-subword.vec.zip](https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M-subword.vec.zip): 1 million word vectors trained with subword information on Wikipedia 2017, UMBC webbase corpus and statmt.org news dataset (16B tokens).
3. [crawl-300d-2M.vec.zip](https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip): 2 million word vectors trained on Common Crawl (600B tokens).
4. [crawl-300d-2M-subword.zip](https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M-subword.zip): 2 million word vectors trained with subword information on Common Crawl (600B tokens).
### Format
The first line of the file contains the number of words in the vocabulary and the size of the vectors.
Each subsequent line contains a word followed by its vector, like in the default fastText text format.
Each value is space separated. Words are ordered by descending frequency.
These text models can easily be loaded in Python using the following code:
```python
import io
def load_vectors(fname):
    """Read a fastText ``.vec`` text file and return its word vectors.

    The file starts with a header line made of two integers (number of words
    and vector size); each remaining line is a word followed by its
    space-separated vector values.

    Args:
        fname: Path to the text-format vectors file.

    Returns:
        A dict mapping every word to a list of floats (its vector).

    Raises:
        ValueError: If the header is not exactly two integers, or if a vector
            value is not a valid float.
    """
    data = {}
    # Use a context manager so the file handle is released deterministically.
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        # Skip past the header; unpacking also checks that it has two integers.
        n, d = map(int, fin.readline().split())
        for line in fin:
            tokens = line.rstrip().split(' ')
            # Build a real list: on Python 3 a bare map() object is a lazy
            # iterator that is exhausted after a single pass.
            data[tokens[0]] = [float(value) for value in tokens[1:]]
    return data
```
### License
These word vectors are distributed under the [*Creative Commons Attribution-Share-Alike License 3.0*](https://creativecommons.org/licenses/by-sa/3.0/).
### References
If you use these word vectors, please cite the following paper:
T. Mikolov, E. Grave, P. Bojanowski, C. Puhrsch, A. Joulin. [*Advances in Pre-Training Distributed Word Representations*](https://arxiv.org/abs/1712.09405)
```markup
@inproceedings{mikolov2018advances,
title={Advances in Pre-Training Distributed Word Representations},
author={Mikolov, Tomas and Grave, Edouard and Bojanowski, Piotr and Puhrsch, Christian and Joulin, Armand},
booktitle={Proceedings of the International Conference on Language Resources and Evaluation (LREC 2018)},
year={2018}
}
```
================================================
FILE: docs/faqs.md
================================================
---
id: faqs
title: FAQ
---
## What is fastText? Are there tutorials?
FastText is a library for text classification and representation. It transforms text into continuous vectors that can later be used on any language related task. A few tutorials are available.
## How can I reduce the size of my fastText models?
fastText uses a hashtable for either word or character ngrams. The size of the hashtable directly impacts the size of a model. To reduce the size of the model, it is possible to reduce the size of this table with the option '-bucket'. For example a good value is 20000. Another option that greatly impacts the size of a model is the size of the vectors (-dim). This dimension can be reduced to save space but this can significantly impact performance. If that still produces a model that is too big, one can further reduce the size of a trained model with the quantization option.
```bash
./fasttext quantize -output model
```
## What would be the best way to represent word phrases rather than words?
Currently the best approach to represent word phrases or sentences is to take a bag of words of word vectors. Additionally, for phrases like “New York”, preprocessing the data so that it becomes a single token “New_York” can greatly help.
## Why does fastText produce vectors even for unknown words?
One of the key features of fastText word representation is its ability to produce vectors for any words, even made-up ones.
Indeed, a fastText word vector is built from the vectors of the character substrings contained in the word.
This makes it possible to build vectors even for misspelled words or concatenations of words.
## Why is the hierarchical softmax slightly worse in performance than the full softmax?
The hierarchical softmax is an approximation of the full softmax loss that allows training on a large number of classes efficiently. This is often at the cost of a few percent of accuracy.
Note also that this loss is designed for classes that are unbalanced, that is some classes are more frequent than others. If your dataset has a balanced number of examples per class, it is worth trying the negative sampling loss (-loss ns -neg 100).
However, negative sampling will still be very slow at test time, since the full softmax will be computed.
## Can we run fastText program on a GPU?
As of now, fastText only works on CPU.
Please note that one of the goals of fastText is to be an efficient CPU tool, allowing models to be trained without requiring a GPU.
## Can I use fastText with python? Or other languages?
[Python is officially supported](/docs/en/support.html#building-fasttext-python-module).
There are a few unofficial wrappers for JavaScript, Lua and other languages available on GitHub.
## Can I use fastText with continuous data?
FastText works on discrete tokens and thus cannot be directly used on continuous tokens. However, one can discretize continuous tokens to use fastText on them, for example by rounding values to a specific digit ("12.3" becomes "12").
## There are misspellings in the dictionary. Should we improve text normalization?
If the words are infrequent, there is no need to worry.
## I'm encountering a NaN, why could this be?
You'll likely see this behavior because your learning rate is too high. Try reducing it until you don't see this error anymore.
## My compiler / architecture can't build fastText. What should I do?
Try a newer version of your compiler. We try to maintain compatibility with older versions of gcc and many platforms, however sometimes maintaining backwards compatibility becomes very hard. In general, compilers and tool chains that ship with LTS versions of major linux distributions should be fair game. In any case, create an issue with your compiler version and architecture and we'll try to implement compatibility.
## How do I run fastText in a fully reproducible way? Each time I run it I get different results.
If you run fastText multiple times you'll obtain slightly different results each time due to the optimization algorithm (asynchronous stochastic gradient descent, or Hogwild). If you need to get the same results (e.g. to compare different sets of input parameters) you have to set the 'thread' parameter to 1. In this way you'll get exactly the same results at each run (with the same input parameters).
## Why do I get a probability of 1.00001?
This is a known rounding issue. You can consider it as 1.0.
## How can I change the dimension of word vectors of a model file?
If you already trained a model, or downloaded a pre-trained word vectors model, you can adapt the dimension of the word vectors with the `reduce_model.py` script or by calling `fasttext.util.reduce_model` from python, as [described here](/docs/en/crawl-vectors.html#adapt-the-dimension).
================================================
FILE: docs/language-identification.md
================================================
---
id: language-identification
title: Language identification
---
### Description
We distribute two models for language identification, which can recognize 176 languages (see the list of ISO codes below). These models were trained on data from [Wikipedia](https://www.wikipedia.org/), [Tatoeba](https://tatoeba.org/eng/) and [SETimes](http://nlp.ffzg.hr/resources/corpora/setimes/), used under [CC-BY-SA](http://creativecommons.org/licenses/by-sa/3.0/).
We distribute two versions of the models:
* [lid.176.bin](https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin), which is faster and slightly more accurate, but has a file size of 126MB;
* [lid.176.ftz](https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz), which is the compressed version of the model, with a file size of 917kB.
These models were trained on UTF-8 data, and therefore expect UTF-8 as input.
#### Updated model (NLLB project)
A newer LID (**L**anguage **ID**entification) model was [released as part of the NLLB project](https://github.com/facebookresearch/fairseq/tree/nllb#lid-model) under [CC-BY-NC 4.0](LICENSE.model.md) license.
* [lid218e.bin](https://tinyurl.com/nllblid218e) uses different language codes from the original models—the ISO 639-3 code (e.g. "eng", "fra", "rus") plus an additional code describing the script (e.g., "eng_Latn", "ukr_Cyrl")—and has a file size of 1.2GB.
You can read more about the data the model was trained on [here](https://github.com/facebookresearch/fairseq/blob/nllb/README.md#datasets).
#### 🤗 HuggingFace Integration
This model is [available](https://huggingface.co/facebook/fasttext-language-identification) on the Hugging Face Hub.
```python
>>> import fasttext
>>> from huggingface_hub import hf_hub_download
>>> model_path = hf_hub_download(repo_id="facebook/fasttext-language-identification", filename="model.bin")
>>> model = fasttext.load_model(model_path)
>>> model.predict("Hello, world!")
(('__label__eng_Latn',), array([0.81148803]))
>>> model.predict("Hello, world!", k=5)
(('__label__eng_Latn', '__label__vie_Latn', '__label__nld_Latn', '__label__pol_Latn', '__label__deu_Latn'),
array([0.61224753, 0.21323682, 0.09696738, 0.01359863, 0.01319415]))
```
### License
The models are distributed under the [*Creative Commons Attribution-Share-Alike License 3.0*](https://creativecommons.org/licenses/by-sa/3.0/).
### List of supported languages
```
af als am an ar arz as ast av az azb ba bar bcl be bg bh bn bo bpy br bs bxr ca cbk ce ceb ckb co cs cv cy da de diq dsb dty dv el eml en eo es et eu fa fi fr frr fy ga gd gl gn gom gu gv he hi hif hr hsb ht hu hy ia id ie ilo io is it ja jbo jv ka kk km kn ko krc ku kv kw ky la lb lez li lmo lo lrc lt lv mai mg mhr min mk ml mn mr mrj ms mt mwl my myv mzn nah nap nds ne new nl nn no oc or os pa pam pfl pl pms pnb ps pt qu rm ro ru rue sa sah sc scn sco sd sh si sk sl so sq sr su sv sw ta te tg th tk tl tr tt tyv ug uk ur uz vec vep vi vls vo wa war wuu xal xmf yi yo yue zh
```
### References
If you use these models, please cite the following papers:
[1] A. Joulin, E. Grave, P. Bojanowski, T. Mikolov, [*Bag of Tricks for Efficient Text Classification*](https://arxiv.org/abs/1607.01759)
```
@article{joulin2016bag,
title={Bag of Tricks for Efficient Text Classification},
author={Joulin, Armand and Grave, Edouard and Bojanowski, Piotr and Mikolov, Tomas},
journal={arXiv preprint arXiv:1607.01759},
year={2016}
}
```
[2] A. Joulin, E. Grave, P. Bojanowski, M. Douze, H. Jégou, T. Mikolov, [*FastText.zip: Compressing text classification models* ](https://arxiv.org/abs/1612.03651)
```
@article{joulin2016fasttext,
title={FastText.zip: Compressing text classification models},
author={Joulin, Armand and Grave, Edouard and Bojanowski, Piotr and Douze, Matthijs and J{\'e}gou, Herv{\'e} and Mikolov, Tomas},
journal={arXiv preprint arXiv:1612.03651},
year={2016}
}
```
================================================
FILE: docs/options.md
================================================
---
id: options
title: List of options
---
Invoke a command without arguments to list available arguments and their default values:
```bash
$ ./fasttext supervised
Empty input or output path.
The following arguments are mandatory:
-input training file path
-output output file path
The following arguments are optional:
-verbose verbosity level [2]
The following arguments for the dictionary are optional:
-minCount minimal number of word occurrences [1]
-minCountLabel minimal number of label occurrences [0]
-wordNgrams max length of word ngram [1]
-bucket number of buckets [2000000]
-minn min length of char ngram [0]
-maxn max length of char ngram [0]
-t sampling threshold [0.0001]
-label labels prefix [__label__]
The following arguments for training are optional:
-lr learning rate [0.1]
-lrUpdateRate change the rate of updates for the learning rate [100]
-dim size of word vectors [100]
-ws size of the context window [5]
-epoch number of epochs [5]
-neg number of negatives sampled [5]
-loss loss function {ns, hs, softmax} [softmax]
-thread number of threads [12]
-pretrainedVectors pretrained word vectors for supervised learning []
-saveOutput whether output params should be saved [0]
The following arguments for quantization are optional:
-cutoff number of words and ngrams to retain [0]
-retrain finetune embeddings if a cutoff is applied [0]
-qnorm quantizing the norm separately [0]
-qout quantizing the classifier [0]
-dsub size of each sub-vector [2]
```
Defaults may vary by mode. (Word-representation modes `skipgram` and `cbow` use a default `-minCount` of 5.)
Hyperparameter optimization (autotune) is activated when you provide a validation file with `-autotune-validation` argument.
```text
The following arguments are for autotune:
-autotune-validation validation file to be used for evaluation
-autotune-metric metric objective {f1, f1:labelname} [f1]
-autotune-predictions number of predictions used for evaluation [1]
-autotune-duration maximum duration in seconds [300]
-autotune-modelsize constraint model file size [] (empty = do not quantize)
```
================================================
FILE: docs/pretrained-vectors.md
================================================
---
id: pretrained-vectors
title: Wiki word vectors
---
We are publishing pre-trained word vectors for 294 languages, trained on [*Wikipedia*](https://www.wikipedia.org) using fastText.
These vectors in dimension 300 were obtained using the skip-gram model described in [*Bojanowski et al. (2016)*](https://arxiv.org/abs/1607.04606) with default parameters.
Please note that a newer version of multi-lingual word vectors is available at: [Word vectors for 157 languages](https://fasttext.cc/docs/en/crawl-vectors.html).
### Models
The models can be downloaded from:
||||
|-|-|-|
| Abkhazian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ab.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ab.vec) | Acehnese: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ace.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ace.vec) | Adyghe: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ady.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ady.vec) |
| Afar: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.aa.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.aa.vec) | Afrikaans: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.af.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.af.vec) | Akan: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ak.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ak.vec) |
| Albanian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.sq.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.sq.vec) | Alemannic: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.als.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.als.vec) | Amharic: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.am.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.am.vec) |
| Anglo_Saxon: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ang.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ang.vec) | Arabic: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ar.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ar.vec) | Aragonese: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.an.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.an.vec) |
| Aramaic: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.arc.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.arc.vec) | Armenian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.hy.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.hy.vec) | Aromanian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.roa_rup.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.roa_rup.vec) |
| Assamese: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.as.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.as.vec) | Asturian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ast.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ast.vec) | Avar: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.av.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.av.vec) |
| Aymara: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ay.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ay.vec) | Azerbaijani: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.az.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.az.vec) | Bambara: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.bm.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.bm.vec) |
| Banjar: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.bjn.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.bjn.vec) | Banyumasan: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.map_bms.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.map_bms.vec) | Bashkir: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ba.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ba.vec) |
| Basque: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.eu.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.eu.vec) | Bavarian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.bar.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.bar.vec) | Belarusian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.be.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.be.vec) |
| Bengali: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.bn.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.bn.vec) | Bihari: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.bh.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.bh.vec) | Bishnupriya Manipuri: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.bpy.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.bpy.vec) |
| Bislama: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.bi.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.bi.vec) | Bosnian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.bs.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.bs.vec) | Breton: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.br.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.br.vec) |
| Buginese: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.bug.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.bug.vec) | Bulgarian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.bg.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.bg.vec) | Burmese: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.my.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.my.vec) |
| Buryat: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.bxr.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.bxr.vec) | Cantonese: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.zh_yue.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.zh_yue.vec) | Catalan: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ca.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ca.vec) |
| Cebuano: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ceb.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ceb.vec) | Central Bicolano: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.bcl.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.bcl.vec) | Chamorro: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ch.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ch.vec) |
| Chavacano: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.cbk_zam.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.cbk_zam.vec) | Chechen: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ce.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ce.vec) | Cherokee: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.chr.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.chr.vec) |
| Cheyenne: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.chy.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.chy.vec) | Chichewa: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ny.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ny.vec) | Chinese: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.zh.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.zh.vec) |
| Choctaw: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.cho.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.cho.vec) | Chuvash: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.cv.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.cv.vec) | Classical Chinese: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.zh_classical.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.zh_classical.vec) |
| Cornish: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.kw.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.kw.vec) | Corsican: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.co.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.co.vec) | Cree: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.cr.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.cr.vec) |
| Crimean Tatar: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.crh.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.crh.vec) | Croatian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.hr.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.hr.vec) | Czech: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.cs.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.cs.vec) |
| Danish: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.da.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.da.vec) | Divehi: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.dv.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.dv.vec) | Dutch: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.nl.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.nl.vec) |
| Dutch Low Saxon: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.nds_nl.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.nds_nl.vec) | Dzongkha: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.dz.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.dz.vec) | Eastern Punjabi: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.pa.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.pa.vec) |
| Egyptian Arabic: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.arz.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.arz.vec) | Emilian_Romagnol: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.eml.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.eml.vec) | English: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.en.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.en.vec) |
| Erzya: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.myv.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.myv.vec) | Esperanto: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.eo.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.eo.vec) | Estonian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.et.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.et.vec) |
| Ewe: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ee.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ee.vec) | Extremaduran: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ext.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ext.vec) | Faroese: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.fo.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.fo.vec) |
| Fiji Hindi: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.hif.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.hif.vec) | Fijian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.fj.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.fj.vec) | Finnish: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.fi.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.fi.vec) |
| Franco_Provençal: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.frp.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.frp.vec) | French: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.fr.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.fr.vec) | Friulian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.fur.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.fur.vec) |
| Fula: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ff.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ff.vec) | Gagauz: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.gag.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.gag.vec) | Galician: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.gl.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.gl.vec) |
| Gan: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.gan.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.gan.vec) | Georgian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ka.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ka.vec) | German: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.de.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.de.vec) |
| Gilaki: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.glk.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.glk.vec) | Goan Konkani: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.gom.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.gom.vec) | Gothic: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.got.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.got.vec) |
| Greek: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.el.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.el.vec) | Greenlandic: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.kl.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.kl.vec) | Guarani: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.gn.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.gn.vec) |
| Gujarati: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.gu.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.gu.vec) | Haitian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ht.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ht.vec) | Hakka: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.hak.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.hak.vec) |
| Hausa: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ha.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ha.vec) | Hawaiian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.haw.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.haw.vec) | Hebrew: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.he.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.he.vec) |
| Herero: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.hz.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.hz.vec) | Hill Mari: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.mrj.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.mrj.vec) | Hindi: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.hi.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.hi.vec) |
| Hiri Motu: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ho.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ho.vec) | Hungarian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.hu.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.hu.vec) | Icelandic: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.is.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.is.vec) |
| Ido: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.io.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.io.vec) | Igbo: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ig.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ig.vec) | Ilokano: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ilo.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ilo.vec) |
| Indonesian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.id.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.id.vec) | Interlingua: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ia.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ia.vec) | Interlingue: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ie.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ie.vec) |
| Inuktitut: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.iu.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.iu.vec) | Inupiak: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ik.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ik.vec) | Irish: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ga.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ga.vec) |
| Italian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.it.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.it.vec) | Jamaican Patois: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.jam.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.jam.vec) | Japanese: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ja.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ja.vec) |
| Javanese: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.jv.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.jv.vec) | Kabardian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.kbd.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.kbd.vec) | Kabyle: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.kab.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.kab.vec) |
| Kalmyk: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.xal.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.xal.vec) | Kannada: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.kn.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.kn.vec) | Kanuri: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.kr.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.kr.vec) |
| Kapampangan: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.pam.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.pam.vec) | Karachay_Balkar: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.krc.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.krc.vec) | Karakalpak: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.kaa.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.kaa.vec) |
| Kashmiri: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ks.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ks.vec) | Kashubian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.csb.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.csb.vec) | Kazakh: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.kk.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.kk.vec) |
| Khmer: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.km.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.km.vec) | Kikuyu: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ki.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ki.vec) | Kinyarwanda: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.rw.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.rw.vec) |
| Kirghiz: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ky.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ky.vec) | Kirundi: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.rn.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.rn.vec) | Komi: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.kv.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.kv.vec) |
| Komi_Permyak: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.koi.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.koi.vec) | Kongo: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.kg.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.kg.vec) | Korean: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ko.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ko.vec) |
| Kuanyama: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.kj.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.kj.vec) | Kurdish (Kurmanji): [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ku.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ku.vec) | Kurdish (Sorani): [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ckb.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ckb.vec) |
| Ladino: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.lad.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.lad.vec) | Lak: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.lbe.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.lbe.vec) | Lao: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.lo.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.lo.vec) |
| Latgalian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ltg.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ltg.vec) | Latin: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.la.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.la.vec) | Latvian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.lv.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.lv.vec) |
| Lezgian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.lez.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.lez.vec) | Ligurian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.lij.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.lij.vec) | Limburgish: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.li.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.li.vec) |
| Lingala: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ln.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ln.vec) | Lithuanian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.lt.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.lt.vec) | Livvi_Karelian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.olo.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.olo.vec) |
| Lojban: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.jbo.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.jbo.vec) | Lombard: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.lmo.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.lmo.vec) | Low Saxon: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.nds.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.nds.vec) |
| Lower Sorbian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.dsb.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.dsb.vec) | Luganda: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.lg.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.lg.vec) | Luxembourgish: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.lb.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.lb.vec) |
| Macedonian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.mk.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.mk.vec) | Maithili: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.mai.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.mai.vec) | Malagasy: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.mg.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.mg.vec) |
| Malay: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ms.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ms.vec) | Malayalam: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ml.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ml.vec) | Maltese: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.mt.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.mt.vec) |
| Manx: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.gv.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.gv.vec) | Maori: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.mi.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.mi.vec) | Marathi: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.mr.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.mr.vec) |
| Marshallese: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.mh.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.mh.vec) | Mazandarani: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.mzn.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.mzn.vec) | Meadow Mari: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.mhr.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.mhr.vec) |
| Min Dong: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.cdo.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.cdo.vec) | Min Nan: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.zh_min_nan.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.zh_min_nan.vec) | Minangkabau: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.min.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.min.vec) |
| Mingrelian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.xmf.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.xmf.vec) | Mirandese: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.mwl.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.mwl.vec) | Moksha: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.mdf.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.mdf.vec) |
| Moldovan: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.mo.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.mo.vec) | Mongolian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.mn.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.mn.vec) | Muscogee: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.mus.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.mus.vec) |
| Nahuatl: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.nah.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.nah.vec) | Nauruan: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.na.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.na.vec) | Navajo: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.nv.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.nv.vec) |
| Ndonga: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ng.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ng.vec) | Neapolitan: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.nap.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.nap.vec) | Nepali: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ne.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ne.vec) |
| Newar: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.new.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.new.vec) | Norfolk: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.pih.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.pih.vec) | Norman: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.nrm.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.nrm.vec) |
| North Frisian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.frr.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.frr.vec) | Northern Luri: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.lrc.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.lrc.vec) | Northern Sami: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.se.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.se.vec) |
| Northern Sotho: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.nso.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.nso.vec) | Norwegian (Bokmål): [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.no.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.no.vec) | Norwegian (Nynorsk): [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.nn.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.nn.vec) |
| Novial: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.nov.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.nov.vec) | Nuosu: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ii.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ii.vec) | Occitan: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.oc.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.oc.vec) |
| Old Church Slavonic: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.cu.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.cu.vec) | Oriya: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.or.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.or.vec) | Oromo: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.om.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.om.vec) |
| Ossetian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.os.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.os.vec) | Palatinate German: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.pfl.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.pfl.vec) | Pali: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.pi.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.pi.vec) |
| Pangasinan: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.pag.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.pag.vec) | Papiamentu: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.pap.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.pap.vec) | Pashto: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ps.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ps.vec) |
| Pennsylvania German: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.pdc.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.pdc.vec) | Persian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.fa.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.fa.vec) | Picard: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.pcd.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.pcd.vec) |
| Piedmontese: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.pms.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.pms.vec) | Polish: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.pl.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.pl.vec) | Pontic: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.pnt.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.pnt.vec) |
| Portuguese: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.pt.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.pt.vec) | Quechua: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.qu.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.qu.vec) | Ripuarian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ksh.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ksh.vec) |
| Romani: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.rmy.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.rmy.vec) | Romanian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ro.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ro.vec) | Romansh: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.rm.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.rm.vec) |
| Russian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ru.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ru.vec) | Rusyn: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.rue.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.rue.vec) | Sakha: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.sah.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.sah.vec) |
| Samoan: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.sm.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.sm.vec) | Samogitian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.bat_smg.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.bat_smg.vec) | Sango: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.sg.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.sg.vec) |
| Sanskrit: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.sa.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.sa.vec) | Sardinian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.sc.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.sc.vec) | Saterland Frisian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.stq.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.stq.vec) |
| Scots: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.sco.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.sco.vec) | Scottish Gaelic: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.gd.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.gd.vec) | Serbian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.sr.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.sr.vec) |
| Serbo_Croatian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.sh.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.sh.vec) | Sesotho: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.st.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.st.vec) | Shona: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.sn.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.sn.vec) |
| Sicilian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.scn.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.scn.vec) | Silesian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.szl.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.szl.vec) | Simple English: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.simple.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.simple.vec) |
| Sindhi: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.sd.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.sd.vec) | Sinhalese: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.si.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.si.vec) | Slovak: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.sk.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.sk.vec) |
| Slovenian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.sl.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.sl.vec) | Somali: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.so.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.so.vec) | Southern Azerbaijani: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.azb.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.azb.vec) |
| Spanish: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.es.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.es.vec) | Sranan: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.srn.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.srn.vec) | Sundanese: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.su.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.su.vec) |
| Swahili: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.sw.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.sw.vec) | Swati: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ss.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ss.vec) | Swedish: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.sv.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.sv.vec) |
| Tagalog: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.tl.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.tl.vec) | Tahitian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ty.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ty.vec) | Tajik: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.tg.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.tg.vec) |
| Tamil: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ta.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ta.vec) | Tarantino: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.roa_tara.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.roa_tara.vec) | Tatar: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.tt.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.tt.vec) |
| Telugu: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.te.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.te.vec) | Tetum: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.tet.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.tet.vec) | Thai: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.th.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.th.vec) |
| Tibetan: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.bo.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.bo.vec) | Tigrinya: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ti.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ti.vec) | Tok Pisin: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.tpi.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.tpi.vec) |
| Tongan: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.to.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.to.vec) | Tsonga: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ts.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ts.vec) | Tswana: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.tn.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.tn.vec) |
| Tulu: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.tcy.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.tcy.vec) | Tumbuka: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.tum.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.tum.vec) | Turkish: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.tr.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.tr.vec) |
| Turkmen: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.tk.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.tk.vec) | Tuvan: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.tyv.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.tyv.vec) | Twi: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.tw.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.tw.vec) |
| Udmurt: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.udm.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.udm.vec) | Ukrainian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.uk.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.uk.vec) | Upper Sorbian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.hsb.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.hsb.vec) |
| Urdu: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ur.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ur.vec) | Uyghur: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ug.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ug.vec) | Uzbek: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.uz.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.uz.vec) |
| Venda: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ve.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ve.vec) | Venetian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.vec.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.vec.vec) | Vepsian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.vep.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.vep.vec) |
| Vietnamese: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.vi.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.vi.vec) | Volapük: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.vo.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.vo.vec) | Võro: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.fiu_vro.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.fiu_vro.vec) |
| Walloon: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.wa.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.wa.vec) | Waray: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.war.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.war.vec) | Welsh: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.cy.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.cy.vec) |
| West Flemish: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.vls.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.vls.vec) | West Frisian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.fy.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.fy.vec) | Western Punjabi: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.pnb.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.pnb.vec) |
| Wolof: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.wo.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.wo.vec) | Wu: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.wuu.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.wuu.vec) | Xhosa: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.xh.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.xh.vec) |
| Yiddish: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.yi.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.yi.vec) | Yoruba: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.yo.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.yo.vec) | Zazaki: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.diq.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.diq.vec) |
| Zeelandic: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.zea.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.zea.vec) | Zhuang: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.za.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.za.vec) | Zulu: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.zu.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.zu.vec) |
### Format
The word vectors are available in both of fastText's default formats: binary and text.
In the text format, each line contains a word followed by its vector, with the values separated by spaces.
Words are sorted by frequency in descending order.
### License
The word vectors are distributed under the [*Creative Commons Attribution-Share-Alike License 3.0*](https://creativecommons.org/licenses/by-sa/3.0/).
### References
If you use these word vectors, please cite the following paper:
P. Bojanowski\*, E. Grave\*, A. Joulin, T. Mikolov, [*Enriching Word Vectors with Subword Information*](https://arxiv.org/abs/1607.04606)
```markup
@article{bojanowski2017enriching,
title={Enriching Word Vectors with Subword Information},
author={Bojanowski, Piotr and Grave, Edouard and Joulin, Armand and Mikolov, Tomas},
journal={Transactions of the Association for Computational Linguistics},
volume={5},
year={2017},
issn={2307-387X},
pages={135--146}
}
```
================================================
FILE: docs/python-module.md
================================================
---
id: python-module
title: Python module
---
This document explains how to use fastText from Python.
## Table of contents
* [Requirements](#requirements)
* [Installation](#installation)
* [Usage overview](#usage-overview)
* [Word representation model](#word-representation-model)
* [Text classification model](#text-classification-model)
* [IMPORTANT: Preprocessing data / encoding conventions](#important-preprocessing-data-encoding-conventions)
* [More examples](#more-examples)
* [API](#api)
* [`train_unsupervised` parameters](#train_unsupervised-parameters)
* [`train_supervised` parameters](#train_supervised-parameters)
* [`model` object](#model-object)
# Requirements
[fastText](https://fasttext.cc/) builds on modern Mac OS and Linux distributions.
Since it uses C++11 features, it requires a compiler with good C++11 support. You will need [Python](https://www.python.org/) (version 2.7 or ≥ 3.4), [NumPy](http://www.numpy.org/), [SciPy](https://www.scipy.org/), and [pybind11](https://github.com/pybind/pybind11).
# Installation
To install the latest release, run:
```bash
$ pip install fasttext
```
Alternatively, to get the latest development version of fastText, install it from our GitHub repository:
```bash
$ git clone https://github.com/facebookresearch/fastText.git
$ cd fastText
$ sudo pip install .
$ # or :
$ sudo python setup.py install
```
# Usage overview
## Word representation model
In order to learn word vectors, as [described here](/docs/en/references.html#enriching-word-vectors-with-subword-information), we can use the `fasttext.train_unsupervised` function like this:
```py
import fasttext
# Skipgram model :
model = fasttext.train_unsupervised('data.txt', model='skipgram')
# or, cbow model :
model = fasttext.train_unsupervised('data.txt', model='cbow')
```
where `data.txt` is a training file containing utf-8 encoded text.
The returned `model` object represents your learned model, and you can use it to retrieve information.
```py
print(model.words) # list of words in dictionary
print(model['king']) # get the vector of the word 'king'
```
### Saving and loading a model object
You can save your trained model object by calling the function `save_model`.
```py
model.save_model("model_filename.bin")
```
and retrieve it later thanks to the function `load_model` :
```py
model = fasttext.load_model("model_filename.bin")
```
For more information about word representation usage of fasttext, you can refer to our [word representations tutorial](/docs/en/unsupervised-tutorial.html).
## Text classification model
In order to train a text classifier using the method [described here](/docs/en/references.html#bag-of-tricks-for-efficient-text-classification), we can use `fasttext.train_supervised` function like this:
```py
import fasttext
model = fasttext.train_supervised('data.train.txt')
```
where `data.train.txt` is a text file containing a training sentence per line along with the labels. By default, we assume that labels are words that are prefixed by the string `__label__`.
Once the model is trained, we can retrieve the list of words and labels:
```py
print(model.words)
print(model.labels)
```
To evaluate our model by computing the precision at 1 (P@1) and the recall on a test set, we use the `test` function:
```py
def print_results(N, p, r):
print("N\t" + str(N))
print("P@{}\t{:.3f}".format(1, p))
print("R@{}\t{:.3f}".format(1, r))
print_results(*model.test('test.txt'))
```
We can also predict labels for a specific text :
```py
model.predict("Which baking dish is best to bake a banana bread ?")
```
By default, `predict` returns only one label : the one with the highest probability. You can also predict more than one label by specifying the parameter `k`:
```py
model.predict("Which baking dish is best to bake a banana bread ?", k=3)
```
If you want to predict more than one sentence you can pass an array of strings :
```py
model.predict(["Which baking dish is best to bake a banana bread ?", "Why not put knives in the dishwasher?"], k=3)
```
Of course, you can also save and load a model to/from a file as [in the word representation usage](#saving-and-loading-a-model-object).
For more information about text classification usage of fasttext, you can refer to our [text classification tutorial](/docs/en/supervised-tutorial.html).
### Compress model files with quantization
When you want to save a supervised model file, fastText can compress it in order to have a much smaller model file by sacrificing only a little bit of performance.
```py
# with the previously trained `model` object, call :
model.quantize(input='data.train.txt', retrain=True)
# then display results and save the new model :
print_results(*model.test(valid_data))
model.save_model("model_filename.ftz")
```
`model_filename.ftz` will have a much smaller size than `model_filename.bin`.
For further reading on quantization, you can refer to [this paragraph from our blog post](/blog/2017/10/02/blog-post.html#model-compression).
## IMPORTANT: Preprocessing data / encoding conventions
In general it is important to properly preprocess your data. In particular our example scripts in the [root folder](https://github.com/facebookresearch/fastText) do this.
fastText assumes UTF-8 encoded text. All text must be [unicode for Python2](https://docs.python.org/2/library/functions.html#unicode) and [str for Python3](https://docs.python.org/3.5/library/stdtypes.html#textseq). The passed text will be [encoded as UTF-8 by pybind11](https://pybind11.readthedocs.io/en/master/advanced/cast/strings.html?highlight=utf-8#strings-bytes-and-unicode-conversions) before being passed to the fastText C++ library. This means it is important to use UTF-8 encoded text when building a model. On Unix-like systems you can convert text using [iconv](https://en.wikipedia.org/wiki/Iconv).
fastText will tokenize (split text into pieces) based on the following ASCII characters (bytes). In particular, it is not aware of UTF-8 whitespace. We advise the user to convert UTF-8 whitespace / word boundaries into one of the following symbols as appropriate.
* space
* tab
* vertical tab
* carriage return
* formfeed
* the null character
The newline character is used to delimit lines of text. In particular, the EOS token is appended to a line of text if a newline character is encountered. The only exception is if the number of tokens exceeds the MAX\_LINE\_SIZE constant as defined in the [Dictionary header](https://github.com/facebookresearch/fastText/blob/master/src/dictionary.h). This means if you have text that is not separated by newlines, such as the [fil9 dataset](http://mattmahoney.net/dc/textdata), it will be broken into chunks of MAX\_LINE\_SIZE tokens and the EOS token is not appended.
The length of a token is the number of UTF-8 characters by considering the [leading two bits of a byte](https://en.wikipedia.org/wiki/UTF-8#Description) to identify [subsequent bytes of a multi-byte sequence](https://github.com/facebookresearch/fastText/blob/master/src/dictionary.cc). Knowing this is especially important when choosing the minimum and maximum length of subwords. Further, the EOS token (as specified in the [Dictionary header](https://github.com/facebookresearch/fastText/blob/master/src/dictionary.h)) is considered a character and will not be broken into subwords.
## More examples
In order to have a better knowledge of fastText models, please consider the main [README](https://github.com/facebookresearch/fastText/blob/master/README.md) and in particular [the tutorials on our website](https://fasttext.cc/docs/en/supervised-tutorial.html).
You can find further python examples in [the doc folder](https://github.com/facebookresearch/fastText/tree/master/python/doc/examples).
As with any package you can get help on any Python function using the help function.
For example
```
>>> import fasttext
>>> help(fasttext.FastText)
Help on module fasttext.FastText in fasttext:
NAME
fasttext.FastText
DESCRIPTION
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
FUNCTIONS
load_model(path)
Load a model given a filepath and return a model object.
tokenize(text)
Given a string of text, tokenize it and return a list of tokens
[...]
```
# API
## `train_unsupervised` parameters
```python
input # training file path (required)
model # unsupervised fasttext model {cbow, skipgram} [skipgram]
lr # learning rate [0.05]
dim # size of word vectors [100]
ws # size of the context window [5]
epoch # number of epochs [5]
minCount # minimal number of word occurrences [5]
minn # min length of char ngram [3]
maxn # max length of char ngram [6]
neg # number of negatives sampled [5]
wordNgrams # max length of word ngram [1]
loss # loss function {ns, hs, softmax, ova} [ns]
bucket # number of buckets [2000000]
thread # number of threads [number of cpus]
lrUpdateRate # change the rate of updates for the learning rate [100]
t # sampling threshold [0.0001]
verbose # verbose [2]
```
## `train_supervised` parameters
```python
input # training file path (required)
lr # learning rate [0.1]
dim # size of word vectors [100]
ws # size of the context window [5]
epoch # number of epochs [5]
minCount # minimal number of word occurrences [1]
minCountLabel # minimal number of label occurrences [1]
minn # min length of char ngram [0]
maxn # max length of char ngram [0]
neg # number of negatives sampled [5]
wordNgrams # max length of word ngram [1]
loss # loss function {ns, hs, softmax, ova} [softmax]
bucket # number of buckets [2000000]
thread # number of threads [number of cpus]
lrUpdateRate # change the rate of updates for the learning rate [100]
t # sampling threshold [0.0001]
label # label prefix ['__label__']
verbose # verbose [2]
pretrainedVectors # pretrained word vectors (.vec file) for supervised learning []
```
## `model` object
`train_supervised`, `train_unsupervised` and `load_model` functions return an instance of `_FastText` class, that we generally name `model` object.
This object exposes those training arguments as properties : `lr`, `dim`, `ws`, `epoch`, `minCount`, `minCountLabel`, `minn`, `maxn`, `neg`, `wordNgrams`, `loss`, `bucket`, `thread`, `lrUpdateRate`, `t`, `label`, `verbose`, `pretrainedVectors`. So `model.wordNgrams` will give you the max length of word ngram used for training this model.
In addition, the object exposes several functions :
```python
get_dimension # Get the dimension (size) of a lookup vector (hidden layer).
# This is equivalent to `dim` property.
get_input_vector # Given an index, get the corresponding vector of the Input Matrix.
get_input_matrix # Get a copy of the full input matrix of a Model.
get_labels # Get the entire list of labels of the dictionary
# This is equivalent to `labels` property.
get_line # Split a line of text into words and labels.
get_output_matrix # Get a copy of the full output matrix of a Model.
get_sentence_vector # Given a string, get a single vector representation. This function
# assumes to be given a single line of text. We split words on
# whitespace (space, newline, tab, vertical tab) and the control
# characters carriage return, formfeed and the null character.
get_subword_id # Given a subword, return the index (within input matrix) it hashes to.
get_subwords # Given a word, get the subwords and their indices.
get_word_id # Given a word, get the word id within the dictionary.
get_word_vector # Get the vector representation of word.
get_words # Get the entire list of words of the dictionary
# This is equivalent to `words` property.
is_quantized # whether the model has been quantized
predict # Given a string, get a list of labels and a list of corresponding probabilities.
quantize # Quantize the model reducing the size of the model and its memory footprint.
save_model # Save the model to the given path
test # Evaluate supervised model using file given by path
test_label # Return the precision and recall score for each label.
```
The properties `words`, `labels` return the words and labels from the dictionary :
```py
model.words # equivalent to model.get_words()
model.labels # equivalent to model.get_labels()
```
gitextract_5y6fukma/ ├── .gitignore ├── CMakeLists.txt ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── MANIFEST.in ├── Makefile ├── PACKAGE ├── README.md ├── alignment/ │ ├── README.md │ ├── align.py │ ├── eval.py │ ├── example.sh │ ├── unsup_align.py │ ├── unsup_multialign.py │ └── utils.py ├── classification-example.sh ├── classification-results.sh ├── crawl/ │ ├── README.md │ ├── dedup.cc │ ├── download_crawl.sh │ ├── filter_dedup.sh │ ├── filter_utf8.cc │ └── process_wet_file.sh ├── docs/ │ ├── aligned-vectors.md │ ├── api.md │ ├── autotune.md │ ├── cheatsheet.md │ ├── crawl-vectors.md │ ├── dataset.md │ ├── english-vectors.md │ ├── faqs.md │ ├── language-identification.md │ ├── options.md │ ├── pretrained-vectors.md │ ├── python-module.md │ ├── references.md │ ├── supervised-models.md │ ├── supervised-tutorial.md │ ├── support.md │ ├── unsupervised-tutorials.md │ └── webassembly-module.md ├── download_model.py ├── eval.py ├── fasttext.pc.in ├── get-wikimedia.sh ├── pyproject.toml ├── python/ │ ├── README.md │ ├── README.rst │ ├── benchmarks/ │ │ ├── README.rst │ │ └── get_word_vector.py │ └── doc/ │ └── examples/ │ ├── FastTextEmbeddingBag.py │ ├── bin_to_vec.py │ ├── compute_accuracy.py │ ├── get_vocab.py │ ├── train_supervised.py │ └── train_unsupervised.py ├── quantization-example.sh ├── reduce_model.py ├── runtests.py ├── scripts/ │ ├── kbcompletion/ │ │ ├── README.md │ │ ├── data.sh │ │ ├── eval.cpp │ │ ├── fb15k.sh │ │ ├── fb15k237.sh │ │ ├── svo.sh │ │ └── wn18.sh │ └── quantization/ │ └── quantization-results.sh ├── setup.cfg ├── setup.py ├── src/ │ ├── aligned.h │ ├── args.cc │ ├── args.h │ ├── autotune.cc │ ├── autotune.h │ ├── densematrix.cc │ ├── densematrix.h │ ├── dictionary.cc │ ├── dictionary.h │ ├── fasttext.cc │ ├── fasttext.h │ ├── loss.cc │ ├── loss.h │ ├── main.cc │ ├── matrix.cc │ ├── matrix.h │ ├── meter.cc │ ├── meter.h │ ├── model.cc │ ├── model.h │ ├── productquantizer.cc │ ├── productquantizer.h │ ├── quantmatrix.cc │ ├── 
quantmatrix.h │ ├── real.h │ ├── utils.cc │ ├── utils.h │ ├── vector.cc │ └── vector.h ├── tests/ │ └── fetch_test_data.sh ├── webassembly/ │ ├── README.md │ ├── doc/ │ │ └── examples/ │ │ ├── misc.html │ │ ├── predict.html │ │ ├── train_supervised.html │ │ └── train_unsupervised.html │ ├── fasttext.js │ └── fasttext_wasm.cc ├── website/ │ ├── README.md │ ├── blog/ │ │ ├── 2016-08-18-blog-post.md │ │ ├── 2017-05-02-blog-post.md │ │ ├── 2017-10-02-blog-post.md │ │ └── 2019-06-25-blog-post.md │ ├── core/ │ │ └── Footer.js │ ├── package.json │ ├── pages/ │ │ └── en/ │ │ └── index.js │ ├── sidebars.json │ ├── siteConfig.js │ └── static/ │ ├── docs/ │ │ └── en/ │ │ └── html/ │ │ ├── .classfasttext_1_1QMatrix-members.html.i4eKqy │ │ ├── annotated.html │ │ ├── annotated_dup.js │ │ ├── args_8cc.html │ │ ├── args_8h.html │ │ ├── args_8h.js │ │ ├── args_8h_source.html │ │ ├── classes.html │ │ ├── classfasttext_1_1Args-members.html │ │ ├── classfasttext_1_1Args.html │ │ ├── classfasttext_1_1Args.js │ │ ├── classfasttext_1_1Dictionary-members.html │ │ ├── classfasttext_1_1Dictionary.html │ │ ├── classfasttext_1_1Dictionary.js │ │ ├── classfasttext_1_1FastText-members.html │ │ ├── classfasttext_1_1FastText.html │ │ ├── classfasttext_1_1FastText.js │ │ ├── classfasttext_1_1Matrix-members.html │ │ ├── classfasttext_1_1Matrix.html │ │ ├── classfasttext_1_1Matrix.js │ │ ├── classfasttext_1_1Model-members.html │ │ ├── classfasttext_1_1Model.html │ │ ├── classfasttext_1_1Model.js │ │ ├── classfasttext_1_1ProductQuantizer-members.html │ │ ├── classfasttext_1_1ProductQuantizer.html │ │ ├── classfasttext_1_1ProductQuantizer.js │ │ ├── classfasttext_1_1QMatrix-members.html │ │ ├── classfasttext_1_1QMatrix.html │ │ ├── classfasttext_1_1QMatrix.js │ │ ├── classfasttext_1_1Vector-members.html │ │ ├── classfasttext_1_1Vector.html │ │ ├── classfasttext_1_1Vector.js │ │ ├── dictionary_8cc.html │ │ ├── dictionary_8h.html │ │ ├── dictionary_8h.js │ │ ├── dictionary_8h_source.html │ │ ├── 
dir_68267d1309a1af8e8297ef4c3efbcdba.html │ │ ├── dir_68267d1309a1af8e8297ef4c3efbcdba.js │ │ ├── doxygen.css │ │ ├── dynsections.js │ │ ├── fasttext_8cc.html │ │ ├── fasttext_8h.html │ │ ├── fasttext_8h.js │ │ ├── fasttext_8h_source.html │ │ ├── files.html │ │ ├── files.js │ │ ├── functions.html │ │ ├── functions_0x7e.html │ │ ├── functions_b.html │ │ ├── functions_c.html │ │ ├── functions_d.html │ │ ├── functions_dup.js │ │ ├── functions_e.html │ │ ├── functions_f.html │ │ ├── functions_func.html │ │ ├── functions_g.html │ │ ├── functions_h.html │ │ ├── functions_i.html │ │ ├── functions_k.html │ │ ├── functions_l.html │ │ ├── functions_m.html │ │ ├── functions_n.html │ │ ├── functions_o.html │ │ ├── functions_p.html │ │ ├── functions_q.html │ │ ├── functions_r.html │ │ ├── functions_s.html │ │ ├── functions_t.html │ │ ├── functions_u.html │ │ ├── functions_v.html │ │ ├── functions_vars.html │ │ ├── functions_w.html │ │ ├── functions_z.html │ │ ├── globals.html │ │ ├── globals_defs.html │ │ ├── globals_func.html │ │ ├── index.html │ │ ├── jquery.js │ │ ├── main_8cc.html │ │ ├── main_8cc.js │ │ ├── matrix_8cc.html │ │ ├── matrix_8h.html │ │ ├── matrix_8h_source.html │ │ ├── menu.js │ │ ├── menudata.js │ │ ├── model_8cc.html │ │ ├── model_8h.html │ │ ├── model_8h.js │ │ ├── model_8h_source.html │ │ ├── namespacefasttext.html │ │ ├── namespacefasttext.js │ │ ├── namespacefasttext_1_1utils.html │ │ ├── namespacemembers.html │ │ ├── namespacemembers_enum.html │ │ ├── namespacemembers_func.html │ │ ├── namespacemembers_type.html │ │ ├── namespaces.html │ │ ├── namespaces.js │ │ ├── navtree.css │ │ ├── navtree.js │ │ ├── navtreedata.js │ │ ├── navtreeindex0.js │ │ ├── navtreeindex1.js │ │ ├── productquantizer_8cc.html │ │ ├── productquantizer_8cc.js │ │ ├── productquantizer_8h.html │ │ ├── productquantizer_8h_source.html │ │ ├── qmatrix_8cc.html │ │ ├── qmatrix_8h.html │ │ ├── qmatrix_8h_source.html │ │ ├── real_8h.html │ │ ├── real_8h.js │ │ ├── real_8h_source.html │ │ 
├── resize.js │ │ ├── search/ │ │ │ ├── .files_7.html.StRRNc │ │ │ ├── .variables_a.html.1MGQ27 │ │ │ ├── all_0.html │ │ │ ├── all_0.js │ │ │ ├── all_1.html │ │ │ ├── all_1.js │ │ │ ├── all_10.html │ │ │ ├── all_10.js │ │ │ ├── all_11.html │ │ │ ├── all_11.js │ │ │ ├── all_12.html │ │ │ ├── all_12.js │ │ │ ├── all_13.html │ │ │ ├── all_13.js │ │ │ ├── all_14.html │ │ │ ├── all_14.js │ │ │ ├── all_15.html │ │ │ ├── all_15.js │ │ │ ├── all_16.html │ │ │ ├── all_16.js │ │ │ ├── all_17.html │ │ │ ├── all_17.js │ │ │ ├── all_2.html │ │ │ ├── all_2.js │ │ │ ├── all_3.html │ │ │ ├── all_3.js │ │ │ ├── all_4.html │ │ │ ├── all_4.js │ │ │ ├── all_5.html │ │ │ ├── all_5.js │ │ │ ├── all_6.html │ │ │ ├── all_6.js │ │ │ ├── all_7.html │ │ │ ├── all_7.js │ │ │ ├── all_8.html │ │ │ ├── all_8.js │ │ │ ├── all_9.html │ │ │ ├── all_9.js │ │ │ ├── all_a.html │ │ │ ├── all_a.js │ │ │ ├── all_b.html │ │ │ ├── all_b.js │ │ │ ├── all_c.html │ │ │ ├── all_c.js │ │ │ ├── all_d.html │ │ │ ├── all_d.js │ │ │ ├── all_e.html │ │ │ ├── all_e.js │ │ │ ├── all_f.html │ │ │ ├── all_f.js │ │ │ ├── classes_0.html │ │ │ ├── classes_0.js │ │ │ ├── classes_1.html │ │ │ ├── classes_1.js │ │ │ ├── classes_2.html │ │ │ ├── classes_2.js │ │ │ ├── classes_3.html │ │ │ ├── classes_3.js │ │ │ ├── classes_4.html │ │ │ ├── classes_4.js │ │ │ ├── classes_5.html │ │ │ ├── classes_5.js │ │ │ ├── classes_6.html │ │ │ ├── classes_6.js │ │ │ ├── classes_7.html │ │ │ ├── classes_7.js │ │ │ ├── classes_8.html │ │ │ ├── classes_8.js │ │ │ ├── defines_0.html │ │ │ ├── defines_0.js │ │ │ ├── defines_1.html │ │ │ ├── defines_1.js │ │ │ ├── defines_2.html │ │ │ ├── defines_2.js │ │ │ ├── defines_3.html │ │ │ ├── defines_3.js │ │ │ ├── enums_0.html │ │ │ ├── enums_0.js │ │ │ ├── enums_1.html │ │ │ ├── enums_1.js │ │ │ ├── enums_2.html │ │ │ ├── enums_2.js │ │ │ ├── enumvalues_0.html │ │ │ ├── enumvalues_0.js │ │ │ ├── enumvalues_1.html │ │ │ ├── enumvalues_1.js │ │ │ ├── enumvalues_2.html │ │ │ ├── enumvalues_2.js │ │ │ ├── 
enumvalues_3.html │ │ │ ├── enumvalues_3.js │ │ │ ├── enumvalues_4.html │ │ │ ├── enumvalues_4.js │ │ │ ├── enumvalues_5.html │ │ │ ├── enumvalues_5.js │ │ │ ├── files_0.html │ │ │ ├── files_0.js │ │ │ ├── files_1.html │ │ │ ├── files_1.js │ │ │ ├── files_2.html │ │ │ ├── files_2.js │ │ │ ├── files_3.html │ │ │ ├── files_3.js │ │ │ ├── files_4.html │ │ │ ├── files_4.js │ │ │ ├── files_5.html │ │ │ ├── files_5.js │ │ │ ├── files_6.html │ │ │ ├── files_6.js │ │ │ ├── files_7.html │ │ │ ├── files_7.js │ │ │ ├── files_8.html │ │ │ ├── files_8.js │ │ │ ├── functions_0.html │ │ │ ├── functions_0.js │ │ │ ├── functions_1.html │ │ │ ├── functions_1.js │ │ │ ├── functions_10.html │ │ │ ├── functions_10.js │ │ │ ├── functions_11.html │ │ │ ├── functions_11.js │ │ │ ├── functions_12.html │ │ │ ├── functions_12.js │ │ │ ├── functions_13.html │ │ │ ├── functions_13.js │ │ │ ├── functions_14.html │ │ │ ├── functions_14.js │ │ │ ├── functions_15.html │ │ │ ├── functions_15.js │ │ │ ├── functions_16.html │ │ │ ├── functions_16.js │ │ │ ├── functions_17.html │ │ │ ├── functions_17.js │ │ │ ├── functions_2.html │ │ │ ├── functions_2.js │ │ │ ├── functions_3.html │ │ │ ├── functions_3.js │ │ │ ├── functions_4.html │ │ │ ├── functions_4.js │ │ │ ├── functions_5.html │ │ │ ├── functions_5.js │ │ │ ├── functions_6.html │ │ │ ├── functions_6.js │ │ │ ├── functions_7.html │ │ │ ├── functions_7.js │ │ │ ├── functions_8.html │ │ │ ├── functions_8.js │ │ │ ├── functions_9.html │ │ │ ├── functions_9.js │ │ │ ├── functions_a.html │ │ │ ├── functions_a.js │ │ │ ├── functions_b.html │ │ │ ├── functions_b.js │ │ │ ├── functions_c.html │ │ │ ├── functions_c.js │ │ │ ├── functions_d.html │ │ │ ├── functions_d.js │ │ │ ├── functions_e.html │ │ │ ├── functions_e.js │ │ │ ├── functions_f.html │ │ │ ├── functions_f.js │ │ │ ├── namespaces_0.html │ │ │ ├── namespaces_0.js │ │ │ ├── nomatches.html │ │ │ ├── search.css │ │ │ ├── search.js │ │ │ ├── searchdata.js │ │ │ ├── typedefs_0.html │ │ │ ├── 
typedefs_0.js │ │ │ ├── typedefs_1.html │ │ │ ├── typedefs_1.js │ │ │ ├── variables_0.html │ │ │ ├── variables_0.js │ │ │ ├── variables_1.html │ │ │ ├── variables_1.js │ │ │ ├── variables_10.html │ │ │ ├── variables_10.js │ │ │ ├── variables_11.html │ │ │ ├── variables_11.js │ │ │ ├── variables_12.html │ │ │ ├── variables_12.js │ │ │ ├── variables_13.html │ │ │ ├── variables_13.js │ │ │ ├── variables_2.html │ │ │ ├── variables_2.js │ │ │ ├── variables_3.html │ │ │ ├── variables_3.js │ │ │ ├── variables_4.html │ │ │ ├── variables_4.js │ │ │ ├── variables_5.html │ │ │ ├── variables_5.js │ │ │ ├── variables_6.html │ │ │ ├── variables_6.js │ │ │ ├── variables_7.html │ │ │ ├── variables_7.js │ │ │ ├── variables_8.html │ │ │ ├── variables_8.js │ │ │ ├── variables_9.html │ │ │ ├── variables_9.js │ │ │ ├── variables_a.html │ │ │ ├── variables_a.js │ │ │ ├── variables_b.html │ │ │ ├── variables_b.js │ │ │ ├── variables_c.html │ │ │ ├── variables_c.js │ │ │ ├── variables_d.html │ │ │ ├── variables_d.js │ │ │ ├── variables_e.html │ │ │ ├── variables_e.js │ │ │ ├── variables_f.html │ │ │ └── variables_f.js │ │ ├── structfasttext_1_1Node-members.html │ │ ├── structfasttext_1_1Node.html │ │ ├── structfasttext_1_1Node.js │ │ ├── structfasttext_1_1entry-members.html │ │ ├── structfasttext_1_1entry.html │ │ ├── structfasttext_1_1entry.js │ │ ├── tabs.css │ │ ├── utils_8cc.html │ │ ├── utils_8cc.js │ │ ├── utils_8h.html │ │ ├── utils_8h.js │ │ ├── utils_8h_source.html │ │ ├── vector_8cc.html │ │ ├── vector_8cc.js │ │ ├── vector_8h.html │ │ ├── vector_8h.js │ │ └── vector_8h_source.html │ ├── fasttext.css │ └── tabber.js ├── wikifil.pl └── word-vector-example.sh
SYMBOL INDEX (329 symbols across 58 files)
FILE: alignment/align.py
function getknn (line 45) | def getknn(sc, x, y, k=10):
function rcsls (line 54) | def rcsls(X_src, Y_tgt, Z_src, Z_tgt, R, knn=10):
function proj_spectral (line 65) | def proj_spectral(R):
FILE: alignment/eval.py
function load_transform (line 31) | def load_transform(fname, d1=300, d2=300):
FILE: alignment/unsup_align.py
function objective (line 28) | def objective(X, Y, R, n=5000):
function sqrt_eig (line 35) | def sqrt_eig(x):
function align (line 40) | def align(X, Y, R, lr=10., bsz=200, nepoch=5, niter=1000,
function convex_init (line 63) | def convex_init(X, Y, niter=100, reg=0.05, apply_sqrt=False):
FILE: alignment/unsup_multialign.py
function getknn (line 39) | def getknn(sc, x, y, k=10):
function rcsls (line 48) | def rcsls(Xi, Xj, Zi, Zj, R, knn=10):
function GWmatrix (line 59) | def GWmatrix(emb0):
function gromov_wasserstein (line 67) | def gromov_wasserstein(x_src, x_tgt, C2):
function align (line 74) | def align(EMB, TRANS, lglist, args):
function convex_init (line 154) | def convex_init(X, Y, niter=100, reg=0.05, apply_sqrt=False):
FILE: alignment/utils.py
function load_vectors (line 13) | def load_vectors(fname, maxload=200000, norm=True, center=False, verbose...
function idx (line 39) | def idx(words):
function save_vectors (line 47) | def save_vectors(fname, x, words):
function save_matrix (line 56) | def save_matrix(fname, x):
function procrustes (line 65) | def procrustes(X_src, Y_tgt):
function select_vectors_from_pairs (line 70) | def select_vectors_from_pairs(x_src, y_tgt, pairs):
function load_lexicon (line 82) | def load_lexicon(filename, words_src, words_tgt, verbose=True):
function load_pairs (line 98) | def load_pairs(filename, idx_src, idx_tgt, verbose=True):
function compute_nn_accuracy (line 113) | def compute_nn_accuracy(x_src, x_tgt, lexicon, bsz=100, lexicon_size=-1):
function compute_csls_accuracy (line 130) | def compute_csls_accuracy(x_src, x_tgt, lexicon, lexicon_size=-1, k=10, ...
FILE: crawl/dedup.cc
function fnv1a_64 (line 13) | uint64_t fnv1a_64(uint8_t *data, size_t sz, uint64_t h=14695981039346656...
function main (line 22) | int main(int argc, char** argv)
FILE: crawl/filter_utf8.cc
function continuation (line 12) | bool continuation(uint8_t* str, int n)
function invalid (line 25) | bool invalid(uint8_t* str)
function surrogate (line 36) | bool surrogate(uint8_t* str)
function overlong_2 (line 43) | bool overlong_2(uint8_t* str)
function overlong_3 (line 50) | bool overlong_3(uint8_t* str)
function overlong_4 (line 57) | bool overlong_4(uint8_t* str)
function valid_utf8 (line 62) | bool valid_utf8(uint8_t* str, size_t length)
function main (line 96) | int main(int argc, char** argv)
FILE: download_model.py
function command_download (line 23) | def command_download(lang_id, if_exists):
function main (line 31) | def main():
FILE: eval.py
function compat_splitting (line 22) | def compat_splitting(line):
function similarity (line 26) | def similarity(v1, v2):
FILE: python/benchmarks/get_word_vector.py
function get_word_vector (line 19) | def get_word_vector(data, model):
FILE: python/doc/examples/FastTextEmbeddingBag.py
class FastTextEmbeddingBag (line 27) | class FastTextEmbeddingBag(EmbeddingBag):
method __init__ (line 28) | def __init__(self, model_path):
method forward (line 35) | def forward(self, words):
function random_word (line 48) | def random_word(N):
FILE: python/doc/examples/compute_accuracy.py
function process_question (line 21) | def process_question(question, cossims, model, words, vectors):
function print_compute_accuracy_score (line 59) | def print_compute_accuracy_score(
FILE: python/doc/examples/train_supervised.py
function print_results (line 18) | def print_results(N, p, r):
FILE: python/doc/examples/train_unsupervised.py
function compute_similarity (line 22) | def compute_similarity(data_path):
FILE: reduce_model.py
function eprint (line 26) | def eprint(*args, **kwargs):
function guess_target_name (line 30) | def guess_target_name(model_file, initial_dim, target_dim):
function command_reduce (line 46) | def command_reduce(model_file, target_dim, if_exists):
function main (line 79) | def main():
FILE: runtests.py
function run_tests (line 26) | def run_tests(tests):
FILE: scripts/kbcompletion/eval.cpp
function readWord (line 17) | bool readWord(std::istream& in, std::string& word)
function main (line 43) | int main(int argc, char** argv) {
FILE: setup.py
class get_pybind_include (line 30) | class get_pybind_include:
method __init__ (line 37) | def __init__(self, user=False):
method __str__ (line 46) | def __str__(self):
function has_flag (line 93) | def has_flag(compiler, flags):
function cpp_flag (line 108) | def cpp_flag(compiler):
class BuildExt (line 117) | class BuildExt(build_ext):
method build_extensions (line 125) | def build_extensions(self):
function _get_readme (line 162) | def _get_readme():
FILE: src/aligned.h
function namespace (line 16) | namespace intgemm {
FILE: src/args.cc
type fasttext (line 19) | namespace fasttext {
function metric_name (line 399) | metric_name Args::getAutotuneMetric() const {
FILE: src/args.h
type class (line 19) | enum class
type class (line 20) | enum class
function metric_name (line 21) | enum class metric_name : int {
FILE: src/autotune.cc
function signalHandler (line 36) | void signalHandler(int signal) {
class ElapsedTimeMarker (line 42) | class ElapsedTimeMarker {
method ElapsedTimeMarker (line 46) | ElapsedTimeMarker() {
method getElapsed (line 49) | double getElapsed() {
type fasttext (line 57) | namespace fasttext {
function T (line 63) | T getArgGauss(
function T (line 92) | T updateArgGauss(
function Args (line 126) | Args AutotuneStrategy::ask(double elapsed) {
FILE: src/autotune.h
function namespace (line 20) | namespace fasttext {
FILE: src/densematrix.cc
type fasttext (line 22) | namespace fasttext {
function real (line 94) | real DenseMatrix::l2NormRow(int64_t i) const {
function real (line 112) | real DenseMatrix::dotRow(const Vector& vec, int64_t i) const {
function Register (line 156) | inline Register Add(Register first, Register second) { return _mm512_a...
function Register (line 157) | inline Register Set1(float to) { return _mm512_set1_ps(to); }
function Register (line 158) | inline Register Multiply(Register first, Register second) { return _mm...
function Register (line 161) | inline Register Add(Register first, Register second) { return _mm256_a...
function Register (line 162) | inline Register Set1(float to) { return _mm256_set1_ps(to); }
function Register (line 163) | inline Register Multiply(Register first, Register second) { return _mm...
function Register (line 166) | inline Register Add(Register first, Register second) { return _mm_add_...
function Register (line 167) | inline Register Set1(float to) { return _mm_set1_ps(to); }
function Register (line 168) | inline Register Multiply(Register first, Register second) { return _mm...
function averageRowsFast (line 174) | void averageRowsFast(Vector& x, const std::vector<int32_t>& rows, cons...
FILE: src/densematrix.h
function namespace (line 22) | namespace fasttext {
FILE: src/dictionary.cc
type fasttext (line 20) | namespace fasttext {
function entry_type (line 139) | entry_type Dictionary::getType(int32_t id) const {
function entry_type (line 145) | entry_type Dictionary::getType(const std::string_view w) const {
function readWordNoNewline (line 415) | bool readWordNoNewline(std::string_view& in, std::string_view& word) {
FILE: src/dictionary.h
function namespace (line 23) | namespace fasttext {
FILE: src/fasttext.cc
type fasttext (line 23) | namespace fasttext {
function Args (line 61) | const Args FastText::getArgs() const {
function comparePairs (line 826) | bool comparePairs(
FILE: src/fasttext.h
function namespace (line 32) | namespace fasttext {
FILE: src/loss.cc
type fasttext (line 14) | namespace fasttext {
function comparePairs (line 20) | bool comparePairs(
function real (line 26) | real std_log(real x) {
function real (line 44) | real Loss::log(real x) const {
function real (line 52) | real Loss::sigmoid(real x) const {
function real (line 98) | real BinaryLogisticLoss::binaryLogistic(
function real (line 129) | real OneVsAllLoss::forward(
function real (line 164) | real NegativeSamplingLoss::forward(
function real (line 247) | real HierarchicalSoftmaxLoss::forward(
function real (line 322) | real SoftmaxLoss::forward(
FILE: src/loss.h
function virtual (line 69) | virtual ~BinaryLogisticLoss() noexcept override = default;
function class (line 109) | class HierarchicalSoftmaxLoss : public BinaryLogisticLoss {
function class (line 150) | class SoftmaxLoss : public Loss {
FILE: src/main.cc
function printUsage (line 19) | void printUsage() {
function printQuantizeUsage (line 46) | void printQuantizeUsage() {
function printTestUsage (line 50) | void printTestUsage() {
function printPredictUsage (line 60) | void printPredictUsage() {
function printTestLabelUsage (line 70) | void printTestLabelUsage() {
function printPrintWordVectorsUsage (line 80) | void printPrintWordVectorsUsage() {
function printPrintSentenceVectorsUsage (line 86) | void printPrintSentenceVectorsUsage() {
function printPrintNgramsUsage (line 92) | void printPrintNgramsUsage() {
function quantize (line 99) | void quantize(const std::vector<std::string>& args) {
function printNNUsage (line 115) | void printNNUsage() {
function printAnalogiesUsage (line 122) | void printAnalogiesUsage() {
function printDumpUsage (line 129) | void printDumpUsage() {
function test (line 135) | void test(const std::vector<std::string>& args) {
function printPredictions (line 189) | void printPredictions(
function predict (line 212) | void predict(const std::vector<std::string>& args) {
function printWordVectors (line 252) | void printWordVectors(const std::vector<std::string> args) {
function printSentenceVectors (line 268) | void printSentenceVectors(const std::vector<std::string> args) {
function printNgrams (line 284) | void printNgrams(const std::vector<std::string> args) {
function nn (line 303) | void nn(const std::vector<std::string> args) {
function analogies (line 326) | void analogies(const std::vector<std::string> args) {
function train (line 358) | void train(const std::vector<std::string> args) {
function dump (line 389) | void dump(const std::vector<std::string>& args) {
function main (line 422) | int main(int argc, char** argv) {
FILE: src/matrix.cc
type fasttext (line 11) | namespace fasttext {
FILE: src/matrix.h
function namespace (line 19) | namespace fasttext {
FILE: src/meter.cc
type fasttext (line 17) | namespace fasttext {
FILE: src/meter.h
function namespace (line 18) | namespace fasttext {
function precision (line 76) | double precision() const;
FILE: src/model.cc
type fasttext (line 16) | namespace fasttext {
function real (line 26) | real Model::State::getLoss() const {
function real (line 89) | real Model::std_log(real x) const {
FILE: src/model.h
function namespace (line 21) | namespace fasttext {
FILE: src/productquantizer.cc
type fasttext (line 17) | namespace fasttext {
function real (line 19) | real distL2(const real* x, const real* y, int32_t d) {
function real (line 42) | const real* ProductQuantizer::get_centroids(int32_t m, uint8_t i) const {
function real (line 49) | real* ProductQuantizer::get_centroids(int32_t m, uint8_t i) {
function real (line 56) | real ProductQuantizer::assign_centroid(
function real (line 177) | real ProductQuantizer::mulcode(
FILE: src/productquantizer.h
function namespace (line 20) | namespace fasttext {
FILE: src/quantmatrix.cc
type fasttext (line 15) | namespace fasttext {
function real (line 52) | real QuantMatrix::dotRow(const Vector& vec, int64_t i) const {
FILE: src/quantmatrix.h
function namespace (line 26) | namespace fasttext {
FILE: src/real.h
function namespace (line 11) | namespace fasttext {
FILE: src/utils.cc
type fasttext (line 14) | namespace fasttext {
type utils (line 16) | namespace utils {
function size (line 18) | int64_t size(std::ifstream& ifs) {
function seek (line 23) | void seek(std::ifstream& ifs, int64_t pos) {
function getDuration (line 28) | double getDuration(
function compareFirstLess (line 47) | bool compareFirstLess(const std::pair<double, double>& l, const doub...
FILE: src/utils.h
function namespace (line 27) | namespace fasttext {
function class (line 59) | class ClockPrint {
FILE: src/vector.cc
type fasttext (line 18) | namespace fasttext {
function real (line 26) | real Vector::norm() const {
FILE: src/vector.h
function namespace (line 18) | namespace fasttext {
FILE: webassembly/fasttext.js
class FastText (line 43) | class FastText {
method constructor (line 44) | constructor() {
method loadModel (line 60) | loadModel(url) {
method _train (line 80) | _train(url, modelName, kwargs = {}, callback = null) {
method trainSupervised (line 144) | trainSupervised(url, kwargs = {}, callback) {
method trainUnsupervised (line 183) | trainUnsupervised(url, modelName, kwargs = {}, callback) {
class FastTextModel (line 197) | class FastTextModel {
method constructor (line 206) | constructor(fastTextNative) {
method isQuant (line 216) | isQuant() {
method getDimension (line 226) | getDimension() {
method getWordVector (line 238) | getWordVector(word) {
method getSentenceVector (line 253) | getSentenceVector(text) {
method getNearestNeighbors (line 276) | getNearestNeighbors(word, k = 10) {
method getAnalogies (line 295) | getAnalogies(wordA, wordB, wordC, k) {
method getWordId (line 308) | getWordId(word) {
method getSubwordId (line 320) | getSubwordId(subword) {
method getSubwords (line 335) | getSubwords(word) {
method getInputVector (line 349) | getInputVector(ind) {
method predict (line 370) | predict(text, k = 1, threshold = 0.0) {
method getInputMatrix (line 387) | getInputMatrix() {
method getOutputMatrix (line 407) | getOutputMatrix() {
method getWords (line 425) | getWords() {
method getLabels (line 439) | getLabels() {
method getLine (line 455) | getLine(text) {
method saveModel (line 467) | saveModel() {
method test (line 498) | test(url, k, threshold) {
FILE: webassembly/fasttext_wasm.cc
type Float32ArrayBridge (line 20) | struct Float32ArrayBridge {
function fillFloat32ArrayFromVector (line 25) | void fillFloat32ArrayFromVector(
function predict (line 35) | std::vector<std::pair<float, std::string>>
function getWordVector (line 45) | void getWordVector(
function getSentenceVector (line 56) | void getSentenceVector(
function getSubwords (line 68) | std::pair<std::vector<std::string>, std::vector<int32_t>> getSubwords(
function getInputVector (line 81) | void getInputVector(
function train (line 92) | void train(FastText* fasttext, Args* args, emscripten::val jsCallback) {
function DenseMatrix (line 102) | const DenseMatrix* getInputMatrix(FastText* fasttext) {
function DenseMatrix (line 108) | const DenseMatrix* getOutputMatrix(FastText* fasttext) {
function getTokens (line 114) | std::pair<std::vector<std::string>, std::vector<int32_t>> getTokens(
function getWords (line 130) | std::pair<std::vector<std::string>, std::vector<int32_t>> getWords(
function getLabels (line 136) | std::pair<std::vector<std::string>, std::vector<int32_t>> getLabels(
function getLine (line 142) | std::pair<std::vector<std::string>, std::vector<std::string>> getLine(
function Meter (line 168) | Meter test(
function EMSCRIPTEN_BINDINGS (line 185) | EMSCRIPTEN_BINDINGS(fasttext) {
FILE: website/core/Footer.js
class Footer (line 25) | class Footer extends React.Component {
method render (line 26) | render() {
FILE: website/pages/en/index.js
class Button (line 18) | class Button extends React.Component {
method render (line 19) | render() {
class HomeSplash (line 35) | class HomeSplash extends React.Component {
method render (line 36) | render() {
function VideoContainer (line 74) | function VideoContainer() {
function SocialBanner (line 97) | function SocialBanner() {
class Index (line 111) | class Index extends React.Component {
method render (line 112) | render() {
FILE: website/static/docs/en/html/dynsections.js
function toggleVisibility (line 1) | function toggleVisibility(linkObj)
function updateStripes (line 22) | function updateStripes()
function toggleLevel (line 28) | function toggleLevel(level)
function toggleFolder (line 49) | function toggleFolder(id)
function toggleInherit (line 84) | function toggleInherit(id)
FILE: website/static/docs/en/html/jquery.js
function b0 (line 16) | function b0(b3,b4){return new b0.fn.init(b3,b4)}
function bw (line 16) | function bw(){if(bF.isReady){return}try{av.documentElement.doScroll("lef...
function X (line 16) | function X(e){var bv=a2[e]={},bw,bx;e=e.split(/\s+/);for(bw=0,bx=e.lengt...
function bD (line 16) | function bD(bF){return function(bG){bx[bF]=arguments.length>1?aJ.call(ar...
function bz (line 16) | function bz(bF){return function(bG){bB[bF]=arguments.length>1?aJ.call(ar...
function a5 (line 16) | function a5(bx,bw,by){if(by===L&&bx.nodeType===1){var bv="data-"+bw.repl...
function S (line 16) | function S(bv){for(var e in bv){if(e==="data"&&b.isEmptyObject(bv[e])){c...
function bi (line 16) | function bi(by,bx,bA){var bw=bx+"defer",bv=bx+"queue",e=bx+"mark",bz=b._...
function bE (line 16) | function bE(){if(!(--bB)){e.resolveWith(bv,[bv])}}
function bk (line 16) | function bk(){return false}
function i (line 16) | function i(){return true}
function bv (line 23) | function bv(bR,bW,bV,bZ,bX,bY){for(var bT=0,bS=bZ.length;bT<bS;bT++){var...
function bN (line 23) | function bN(bR,bW,bV,bZ,bX,bY){for(var bT=0,bS=bZ.length;bT<bS;bT++){var...
function C (line 23) | function C(e){return !e||!e.parentNode||e.parentNode.nodeType===11}
function aG (line 23) | function aG(bx,bw,e){bw=bw||0;if(b.isFunction(bw)){return b.grep(bx,func...
function a (line 23) | function a(e){var bw=aR.split("|"),bv=e.createDocumentFragment();if(bv.c...
function ba (line 23) | function ba(e,bv){return b.nodeName(e,"table")?(e.getElementsByTagName("...
function t (line 23) | function t(bB,bv){if(bv.nodeType!==1||!b.hasData(bB)){return}var by,bx,e...
function ai (line 23) | function ai(bv,e){var bw;if(e.nodeType!==1){return}if(e.clearAttributes)...
function bg (line 23) | function bg(e){if(typeof e.getElementsByTagName!=="undefined"){return e....
function az (line 23) | function az(e){if(e.type==="checkbox"||e.type==="radio"){e.defaultChecke...
function E (line 23) | function E(e){var bv=(e.nodeName||"").toLowerCase();if(bv==="input"){az(...
function al (line 23) | function al(e){var bv=av.createElement("div");ac.appendChild(bv);bv.inne...
function bo (line 23) | function bo(e,bv){if(bv.src){b.ajax({url:bv.src,async:false,dataType:"sc...
function p (line 23) | function p(by,bw,bv){var bA=bw==="width"?by.offsetWidth:by.offsetHeight,...
function f (line 23) | function f(e){return function(by,bA){if(typeof by!=="string"){bA=by;by="...
function aW (line 23) | function aW(bv,bE,bz,bD,bB,bx){bB=bB||bE.dataTypes[0];bx=bx||{};bx[bB]=t...
function am (line 23) | function am(bw,bx){var bv,e,by=b.ajaxSettings.flatOptions||{};for(bv in ...
function bF (line 23) | function bF(bZ,bU,b0,bW){if(bA===2){return}bA=2;if(bE){clearTimeout(bE)}...
function v (line 23) | function v(bw,by,bv,bx){if(b.isArray(by)){b.each(by,function(bA,bz){if(b...
function bj (line 23) | function bj(bD,bC,bz){var bv=bD.contents,bB=bD.dataTypes,bw=bD.responseF...
function G (line 23) | function G(bH,bz){if(bH.dataFilter){bz=bH.dataFilter(bz,bH.dataType)}var...
function aL (line 23) | function aL(){try{return new bb.XMLHttpRequest()}catch(bv){}}
function aj (line 23) | function aj(){try{return new bb.ActiveXObject("Microsoft.XMLHTTP")}catch...
function bv (line 23) | function bv(){if(e.queue===false){b._mark(this)}var bE=b.extend({},e),bK...
function bB (line 23) | function bB(bE,bF,bD){var bC=bF[bD];b.removeData(bE,bD,true);bC.stop(e)}
function bh (line 23) | function bh(){setTimeout(at,0);return(a4=b.now())}
function at (line 23) | function at(){a4=L}
function a0 (line 23) | function a0(bv,e){var bw={};b.each(aH.concat.apply([],aH.slice(0,e)),fun...
function bv (line 23) | function bv(bA){return e.step(bA)}
function x (line 23) | function x(bx){if(!Q[bx]){var e=av.body,bv=b("<"+bx+">").appendTo(e),bw=...
function aK (line 23) | function aK(e){return b.isWindow(e)?e:e.nodeType===9?e.defaultView||e.pa...
function j (line 32) | function j(m,l,i,n){a.each(f,function(){l-=parseFloat(a.curCSS(m,"paddin...
function c (line 32) | function c(g,e){var j=g.nodeName.toLowerCase();if("area"===j){var i=g.pa...
function b (line 32) | function b(e){return !a(e).parents().andSelf().filter(function(){return ...
function a (line 61) | function a(j){j=j||location.href;return"#"+j.replace(/^[^#]*#?(.*)$/,"$1")}
function n (line 61) | function n(){var r=a(),q=o(m);if(r!==m){l(m=r,q);$(e).trigger(c)}else{if...
function h (line 61) | function h(n){j.animate(g,e,d.easing,n&&function(){n.call(this,f,d)})}
function b (line 61) | function b(d){return typeof d=="object"?d:{top:d,left:d}}
function b (line 68) | function b(){var F=this;F.top="auto";F.left="auto";F.right="auto";F.bott...
function t (line 68) | function t(K,N,F){var J=null;function L(P,Q){M();if(!K.data(e)){if(!P){c...
function j (line 68) | function j(){function G(M,L,J,O,P){var K=L.split("-")[0],N=new b(),I;if(...
function x (line 68) | function x(Q){var P=new j(),O=k("#"+Q.popupId);if(O.length===0){O=k("<di...
function q (line 68) | function q(F){return window.SVGElement&&F[0] instanceof SVGElement}
function h (line 68) | function h(){if(!c.mouseTrackingActive){c.mouseTrackingActive=true;k(fun...
function i (line 68) | function i(F){c.currentX=F.pageX;c.currentY=F.pageY}
function v (line 68) | function v(F){var H=F.offset(),J=F[0].getBoundingClientRect(),I=J.right-...
function B (line 68) | function B(I){var G=I.data(y),F=I.data(o),K=I.data(l),H,J;if(G){if(k.isF...
function m (line 68) | function m(M,L,K){var G=c.scrollTop,J=c.scrollLeft,I=G+c.windowHeight,F=...
function a (line 68) | function a(G){var F=0;while(G){G&=G-1;F++}return F}
function e (line 78) | function e(h,i){if(h.originalEvent.touches.length>1){return}h.preventDef...
function k (line 87) | function k(m){var n=".smartmenus_mouse";if(!h&&!m){var o=true,l=null;a(d...
function j (line 87) | function j(l){return !/^(4|mouse)$/.test(l.pointerType)}
function i (line 87) | function i(l,n){if(!n){n=""}var m={};a.each(l,function(o,p){m[p[0].split...
FILE: website/static/docs/en/html/menu.js
function initMenu (line 1) | function initMenu(relPath,searchEnabled,serverSide,searchPage,search) {
FILE: website/static/docs/en/html/navtree.js
function getData (line 5) | function getData(varName)
function stripPath (line 12) | function stripPath(uri)
function stripPath2 (line 17) | function stripPath2(uri)
function hashValue (line 25) | function hashValue()
function hashUrl (line 30) | function hashUrl()
function pathName (line 35) | function pathName()
function localStorageSupported (line 40) | function localStorageSupported()
function storeLink (line 51) | function storeLink(link)
function deleteLink (line 58) | function deleteLink()
function cachedLink (line 65) | function cachedLink()
function getScript (line 74) | function getScript(scriptName,func,show)
function createIndent (line 93) | function createIndent(o,domNode,node,level)
function gotoAnchor (line 128) | function gotoAnchor(anchor,aname,updateLocation)
function newNode (line 156) | function newNode(o, po, text, link, childrenData, lastNode)
function showRoot (line 237) | function showRoot()
function expandNode (line 252) | function expandNode(o, node, imm, showRoot)
function glowEffect (line 276) | function glowEffect(n,duration)
function highlightAnchor (line 283) | function highlightAnchor()
function selectAndHighlight (line 302) | function selectAndHighlight(hash,n)
function showNode (line 325) | function showNode(o, node, index, hash)
function removeToInsertLater (line 366) | function removeToInsertLater(element) {
function getNode (line 379) | function getNode(o, po)
function gotoNode (line 392) | function gotoNode(o,subIndex,root,hash,relpath)
function navTo (line 407) | function navTo(o,root,hash,relpath)
function showSyncOff (line 437) | function showSyncOff(n,relpath)
function showSyncOn (line 442) | function showSyncOn(n,relpath)
function toggleSyncButton (line 447) | function toggleSyncButton(relpath)
function initNavTree (line 461) | function initNavTree(toroot,relpath)
FILE: website/static/docs/en/html/resize.js
function initResizable (line 1) | function initResizable()
FILE: website/static/docs/en/html/search/search.js
function convertToId (line 1) | function convertToId(search)
function getXPos (line 24) | function getXPos(item)
function getYPos (line 38) | function getYPos(item)
function SearchBox (line 59) | function SearchBox(name, resultsPath, inFrame, label)
function SearchResults (line 404) | function SearchResults(name)
function setKeyActions (line 709) | function setKeyActions(elem,action)
function setClassAttr (line 716) | function setClassAttr(elem,attr)
function createResults (line 722) | function createResults()
function init_search (line 777) | function init_search()
FILE: website/static/tabber.js
function addLoadEvent (line 1) | function addLoadEvent(func) {
function tabber (line 16) | function tabber(){
Condensed preview — 462 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (2,331K chars).
[
{
"path": ".gitignore",
"chars": 113,
"preview": ".*.swp\n*.o\n*.bin\n*.vec\n*.bc\n.DS_Store\ndata\nfasttext\nresult\nwebsite/node_modules/\npackage-lock.json\nnode_modules/\n"
},
{
"path": "CMakeLists.txt",
"chars": 2242,
"preview": "#\n# Copyright (c) 2016-present, Facebook, Inc.\n# All rights reserved.\n#\n# This source code is licensed under the MIT lic"
},
{
"path": "CODE_OF_CONDUCT.md",
"chars": 3356,
"preview": "# Code of Conduct\n\n## Our Pledge\n\nIn the interest of fostering an open and welcoming environment, we as\ncontributors and"
},
{
"path": "CONTRIBUTING.md",
"chars": 2061,
"preview": "# Contributing to fastText\nWe want to make contributing to this project as easy and transparent as possible.\n\n## Issues\n"
},
{
"path": "LICENSE",
"chars": 1080,
"preview": "MIT License\n\nCopyright (c) 2016-present, Facebook, Inc.\n\nPermission is hereby granted, free of charge, to any person obt"
},
{
"path": "MANIFEST.in",
"chars": 95,
"preview": "include LICENSE\ninclude PATENTS\n\nrecursive-include python *.md *.rst\nrecursive-include src *.h\n"
},
{
"path": "Makefile",
"chars": 4217,
"preview": "#\n# Copyright (c) 2016-present, Facebook, Inc.\n# All rights reserved.\n#\n# This source code is licensed under the MIT lic"
},
{
"path": "PACKAGE",
"chars": 131,
"preview": "load(\"@fbcode_macros//build_defs:package_local_utils.bzl\", \"package_local_utils\")\n\npackage_local_utils.set_clang_version"
},
{
"path": "README.md",
"chars": 13611,
"preview": "# fastText\n[fastText](https://fasttext.cc/) is a library for efficient learning of word representations and sentence cla"
},
{
"path": "alignment/README.md",
"chars": 2871,
"preview": "## Alignment of Word Embeddings\n\nThis directory provides code for learning alignments between word embeddings in differe"
},
{
"path": "alignment/align.py",
"chars": 5335,
"preview": "#!/usr/bin/env python3\n# -*- coding: utf-8 -*-\n#\n# Copyright (c) 2018-present, Facebook, Inc.\n# All rights reserved.\n#\n#"
},
{
"path": "alignment/eval.py",
"chars": 2478,
"preview": "#!/usr/bin/env python3\n# -*- coding: utf-8 -*-\n#\n# Copyright (c) 2018-present, Facebook, Inc.\n# All rights reserved.\n#\n#"
},
{
"path": "alignment/example.sh",
"chars": 1408,
"preview": "#!/bin/usr/env sh\n# Copyright (c) 2018-present, Facebook, Inc.\n# All rights reserved.\n#\n# This source code is licensed u"
},
{
"path": "alignment/unsup_align.py",
"chars": 4616,
"preview": "#!/usr/bin/env python3\n# Copyright (c) 2018-present, Facebook, Inc.\n# All rights reserved.\n#\n# This source code is licen"
},
{
"path": "alignment/unsup_multialign.py",
"chars": 7730,
"preview": "#!/usr/bin/env python3\n# -*- coding: utf-8 -*-\n#\n# Copyright (c) 2019-present, Facebook, Inc.\n# All rights reserved.\n#\n#"
},
{
"path": "alignment/utils.py",
"chars": 4803,
"preview": "#!/usr/bin/env python3\n# Copyright (c) 2018-present, Facebook, Inc.\n# All rights reserved.\n#\n# This source code is licen"
},
{
"path": "classification-example.sh",
"chars": 1425,
"preview": "#!/usr/bin/env bash\n#\n# Copyright (c) 2016-present, Facebook, Inc.\n# All rights reserved.\n#\n# This source code is licens"
},
{
"path": "classification-results.sh",
"chars": 3154,
"preview": "#!/usr/bin/env bash\n#\n# Copyright (c) 2016-present, Facebook, Inc.\n# All rights reserved.\n#\n# This source code is licens"
},
{
"path": "crawl/README.md",
"chars": 1144,
"preview": "## Preprocessing Common Crawl\n\nThis code downloads, preprocesses and splits per language the data from [Common Crawl](ht"
},
{
"path": "crawl/dedup.cc",
"chars": 1237,
"preview": "// Copyright (c) 2018-present, Facebook, Inc.\n// All rights reserved.\n//\n// This source code is licensed under the MIT l"
},
{
"path": "crawl/download_crawl.sh",
"chars": 1563,
"preview": "#!/bin/usr/env sh\n# Copyright (c) 2018-present, Facebook, Inc.\n# All rights reserved.\n#\n# This source code is licensed u"
},
{
"path": "crawl/filter_dedup.sh",
"chars": 332,
"preview": "#!/bin/usr/env sh\n# Copyright (c) 2018-present, Facebook, Inc.\n# All rights reserved.\n#\n# This source code is licensed u"
},
{
"path": "crawl/filter_utf8.cc",
"chars": 3034,
"preview": "// Copyright (c) 2018-present, Facebook, Inc.\n// All rights reserved.\n//\n// This source code is licensed under the MIT l"
},
{
"path": "crawl/process_wet_file.sh",
"chars": 912,
"preview": "#!/bin/usr/env sh\n# Copyright (c) 2018-present, Facebook, Inc.\n# All rights reserved.\n#\n# This source code is licensed u"
},
{
"path": "docs/aligned-vectors.md",
"chars": 6122,
"preview": "---\nid: aligned-vectors\ntitle: Aligned word vectors\n---\n\nWe are publishing aligned word vectors for 44 languages based o"
},
{
"path": "docs/api.md",
"chars": 117,
"preview": "---\nid: api\ntitle:API\n---\n\nWe automatically generate our [API documentation](/docs/en/html/index.html) with doxygen.\n"
},
{
"path": "docs/autotune.md",
"chars": 6815,
"preview": "---\nid: autotune\ntitle: Automatic hyperparameter optimization\n---\n\nAs we saw in [the tutorial](/docs/en/supervised-tutor"
},
{
"path": "docs/cheatsheet.md",
"chars": 1840,
"preview": "---\nid: cheatsheet\ntitle: Cheatsheet\n---\n\n## Word representation learning\n\nIn order to learn word vectors do:\n\n```bash\n$"
},
{
"path": "docs/crawl-vectors.md",
"chars": 33194,
"preview": "---\nid: crawl-vectors\ntitle: Word vectors for 157 languages\n---\n\nWe distribute pre-trained word vectors for 157 language"
},
{
"path": "docs/dataset.md",
"chars": 127,
"preview": "---\nid: dataset\ntitle: Datasets\n---\n\n[Download YFCC100M Dataset](https://fb-public.box.com/s/htfdbrvycvroebv9ecaezaztocb"
},
{
"path": "docs/english-vectors.md",
"chars": 2548,
"preview": "---\nid: english-vectors\ntitle: English word vectors\n---\n\nThis page gathers several pre-trained word vectors trained usin"
},
{
"path": "docs/faqs.md",
"chars": 4750,
"preview": "---\nid: faqs\ntitle:FAQ\n---\n\n## What is fastText? Are there tutorials?\n\nFastText is a library for text classification and"
},
{
"path": "docs/language-identification.md",
"chars": 3930,
"preview": "---\nid: language-identification\ntitle: Language identification\n---\n\n### Description\n\nWe distribute two models for langua"
},
{
"path": "docs/options.md",
"chars": 2533,
"preview": "---\nid: options\ntitle: List of options\n---\n\nInvoke a command without arguments to list available arguments and their def"
},
{
"path": "docs/pretrained-vectors.md",
"chars": 51192,
"preview": "---\nid: pretrained-vectors\ntitle: Wiki word vectors\n---\n\nWe are publishing pre-trained word vectors for 294 languages, t"
},
{
"path": "docs/python-module.md",
"chars": 13691,
"preview": "---\nid: python-module\ntitle: Python module\n---\n\nIn this document we present how to use fastText in python.\n\n## Table of "
},
{
"path": "docs/references.md",
"chars": 1542,
"preview": "---\nid: references\ntitle: References\n---\n\nPlease cite [1](#enriching-word-vectors-with-subword-information) if using thi"
},
{
"path": "docs/supervised-models.md",
"chars": 3625,
"preview": "---\nid: supervised-models\ntitle: Supervised models\n---\n\nThis page gathers several pre-trained supervised models on sever"
},
{
"path": "docs/supervised-tutorial.md",
"chars": 22910,
"preview": "---\nid: supervised-tutorial\ntitle: Text classification\n---\n\nText classification is a core problem to many applications, "
},
{
"path": "docs/support.md",
"chars": 1521,
"preview": "---\nid: support\ntitle: Get started\n---\n\n## What is fastText?\n\nfastText is a library for efficient learning of word repre"
},
{
"path": "docs/unsupervised-tutorials.md",
"chars": 20147,
"preview": "---\nid: unsupervised-tutorial\ntitle: Word representations\n---\nA popular idea in modern machine learning is to represent "
},
{
"path": "docs/webassembly-module.md",
"chars": 14233,
"preview": "---\nid: webassembly-module\ntitle: WebAssembly module\n---\n\nIn this document we present how to use fastText in javascript "
},
{
"path": "download_model.py",
"chars": 1287,
"preview": "#!/usr/bin/env python\n# -*- coding: utf-8 -*-\n#\n# Copyright (c) 2017-present, Facebook, Inc.\n# All rights reserved.\n#\n# "
},
{
"path": "eval.py",
"chars": 2102,
"preview": "#!/usr/bin/env python\n# -*- coding: utf-8 -*-\n#\n# Copyright (c) 2016-present, Facebook, Inc.\n# All rights reserved.\n#\n# "
},
{
"path": "fasttext.pc.in",
"chars": 338,
"preview": "prefix=@CMAKE_INSTALL_PREFIX@\nexec_prefix=@CMAKE_INSTALL_FULL_LIBEXECDIR@\nlibdir=@CMAKE_INSTALL_FULL_LIBDIR@\nincludedir="
},
{
"path": "get-wikimedia.sh",
"chars": 3301,
"preview": "#!/usr/bin/env bash\n#\n# Copyright (c) 2016-present, Facebook, Inc.\n# All rights reserved.\n#\n# This source code is licens"
},
{
"path": "pyproject.toml",
"chars": 62,
"preview": "[build-system]\nrequires = [\"setuptools\", \"wheel\", \"pybind11\"]\n"
},
{
"path": "python/README.md",
"chars": 14358,
"preview": "# fastText [](https://circleci.c"
},
{
"path": "python/README.rst",
"chars": 15419,
"preview": "fastText |CircleCI|\n===================\n\n`fastText <https://fasttext.cc/>`__ is a library for efficient learning\nof word"
},
{
"path": "python/benchmarks/README.rst",
"chars": 187,
"preview": "These programs allow us to compare the performance of a few key operations when consindering changes. \n\nIt is important "
},
{
"path": "python/benchmarks/get_word_vector.py",
"chars": 1491,
"preview": "# Copyright (c) 2017-present, Facebook, Inc.\n# All rights reserved.\n#\n# This source code is licensed under the MIT licen"
},
{
"path": "python/doc/examples/FastTextEmbeddingBag.py",
"chars": 2816,
"preview": "#!/usr/bin/env python\n\n# Copyright (c) 2017-present, Facebook, Inc.\n# All rights reserved.\n#\n# This source code is licen"
},
{
"path": "python/doc/examples/bin_to_vec.py",
"chars": 1120,
"preview": "#!/usr/bin/env python\n\n# Copyright (c) 2017-present, Facebook, Inc.\n# All rights reserved.\n#\n# This source code is licen"
},
{
"path": "python/doc/examples/compute_accuracy.py",
"chars": 5182,
"preview": "#!/usr/bin/env python\n\n# Copyright (c) 2017-present, Facebook, Inc.\n# All rights reserved.\n#\n# This source code is licen"
},
{
"path": "python/doc/examples/get_vocab.py",
"chars": 1285,
"preview": "#!/usr/bin/env python\n\n# Copyright (c) 2017-present, Facebook, Inc.\n# All rights reserved.\n#\n# This source code is licen"
},
{
"path": "python/doc/examples/train_supervised.py",
"chars": 1338,
"preview": "#!/usr/bin/env python\n\n# Copyright (c) 2017-present, Facebook, Inc.\n# All rights reserved.\n#\n# This source code is licen"
},
{
"path": "python/doc/examples/train_unsupervised.py",
"chars": 1615,
"preview": "#!/usr/bin/env python\n\n# Copyright (c) 2017-present, Facebook, Inc.\n# All rights reserved.\n#\n# This source code is licen"
},
{
"path": "quantization-example.sh",
"chars": 1573,
"preview": "myshuf() {\n perl -MList::Util=shuffle -e 'print shuffle(<>);' \"$@\";\n}\n\nnormalize_text() {\n tr '[:upper:]' '[:lower:]' "
},
{
"path": "reduce_model.py",
"chars": 2853,
"preview": "#!/usr/bin/env python\n# -*- coding: utf-8 -*-\n#\n# Copyright (c) 2017-present, Facebook, Inc.\n# All rights reserved.\n#\n# "
},
{
"path": "runtests.py",
"chars": 1820,
"preview": "#!/usr/bin/env python\n# -*- coding: utf-8 -*-\n#\n# Copyright (c) 2016-present, Facebook, Inc.\n# All rights reserved.\n#\n# "
},
{
"path": "scripts/kbcompletion/README.md",
"chars": 608,
"preview": "# Fast Linear Model for Knowledge Graph Embeddings\n\n## Knowledge base completion\n\nThese scripts require the [fastText li"
},
{
"path": "scripts/kbcompletion/data.sh",
"chars": 2456,
"preview": "#!/usr/bin/env bash\n#\n# Copyright (c) 2017-present, Facebook, Inc.\n# All rights reserved.\n#\n# This source code is licens"
},
{
"path": "scripts/kbcompletion/eval.cpp",
"chars": 2743,
"preview": "/**\n * Copyright (c) 2017-present, Facebook, Inc.\n * All rights reserved.\n *\n * This source code is licensed under the M"
},
{
"path": "scripts/kbcompletion/fb15k.sh",
"chars": 1685,
"preview": "#!/usr/bin/env bash\n#\n# copyright (c) 2017-present, facebook, inc.\n# all rights reserved.\n#\n# this source code is licens"
},
{
"path": "scripts/kbcompletion/fb15k237.sh",
"chars": 1206,
"preview": "#!/usr/bin/env bash\n#\n# copyright (c) 2017-present, facebook, inc.\n# all rights reserved.\n#\n# this source code is licens"
},
{
"path": "scripts/kbcompletion/svo.sh",
"chars": 1032,
"preview": "#!/usr/bin/env bash\n#\n# copyright (c) 2017-present, facebook, inc.\n# all rights reserved.\n#\n# this source code is licens"
},
{
"path": "scripts/kbcompletion/wn18.sh",
"chars": 1579,
"preview": "#!/usr/bin/env bash\n#\n# copyright (c) 2017-present, facebook, inc.\n# all rights reserved.\n#\n# this source code is licens"
},
{
"path": "scripts/quantization/quantization-results.sh",
"chars": 1181,
"preview": "#!/usr/bin/env bash\n#\n# Copyright (c) 2016-present, Facebook, Inc.\n# All rights reserved.\n#\n# This source code is licens"
},
{
"path": "setup.cfg",
"chars": 40,
"preview": "[metadata]\ndescription-file = README.md\n"
},
{
"path": "setup.py",
"chars": 6363,
"preview": "#!/usr/bin/env python\n\n# Copyright (c) 2017-present, Facebook, Inc.\n# All rights reserved.\n#\n# This source code is licen"
},
{
"path": "src/aligned.h",
"chars": 2486,
"preview": "#pragma once\n#include <cstdlib>\n#include <new>\n#ifdef _MSC_VER\n// Ensure _HAS_EXCEPTIONS is defined\n#include <vcruntime."
},
{
"path": "src/args.cc",
"chars": 16260,
"preview": "/**\n * Copyright (c) 2016-present, Facebook, Inc.\n * All rights reserved.\n *\n * This source code is licensed under the M"
},
{
"path": "src/args.h",
"chars": 2176,
"preview": "/**\n * Copyright (c) 2016-present, Facebook, Inc.\n * All rights reserved.\n *\n * This source code is licensed under the M"
},
{
"path": "src/autotune.cc",
"chars": 13879,
"preview": "/**\n * Copyright (c) 2016-present, Facebook, Inc.\n * All rights reserved.\n *\n * This source code is licensed under the M"
},
{
"path": "src/autotune.h",
"chars": 2266,
"preview": "/**\n * Copyright (c) 2016-present, Facebook, Inc.\n * All rights reserved.\n *\n * This source code is licensed under the M"
},
{
"path": "src/densematrix.cc",
"chars": 7713,
"preview": "/**\n * Copyright (c) 2016-present, Facebook, Inc.\n * All rights reserved.\n *\n * This source code is licensed under the M"
},
{
"path": "src/densematrix.h",
"chars": 2275,
"preview": "/**\n * Copyright (c) 2016-present, Facebook, Inc.\n * All rights reserved.\n *\n * This source code is licensed under the M"
},
{
"path": "src/dictionary.cc",
"chars": 15114,
"preview": "/**\n * Copyright (c) 2016-present, Facebook, Inc.\n * All rights reserved.\n *\n * This source code is licensed under the M"
},
{
"path": "src/dictionary.h",
"chars": 3244,
"preview": "/**\n * Copyright (c) 2016-present, Facebook, Inc.\n * All rights reserved.\n *\n * This source code is licensed under the M"
},
{
"path": "src/fasttext.cc",
"chars": 24481,
"preview": "/**\n * Copyright (c) 2016-present, Facebook, Inc.\n * All rights reserved.\n *\n * This source code is licensed under the M"
},
{
"path": "src/fasttext.h",
"chars": 4763,
"preview": "/**\n * Copyright (c) 2016-present, Facebook, Inc.\n * All rights reserved.\n *\n * This source code is licensed under the M"
},
{
"path": "src/loss.cc",
"chars": 9124,
"preview": "/**\n * Copyright (c) 2016-present, Facebook, Inc.\n * All rights reserved.\n *\n * This source code is licensed under the M"
},
{
"path": "src/loss.h",
"chars": 3879,
"preview": "/**\n * Copyright (c) 2016-present, Facebook, Inc.\n * All rights reserved.\n *\n * This source code is licensed under the M"
},
{
"path": "src/main.cc",
"chars": 12854,
"preview": "/**\n * Copyright (c) 2016-present, Facebook, Inc.\n * All rights reserved.\n *\n * This source code is licensed under the M"
},
{
"path": "src/matrix.cc",
"chars": 494,
"preview": "/**\n * Copyright (c) 2016-present, Facebook, Inc.\n * All rights reserved.\n *\n * This source code is licensed under the M"
},
{
"path": "src/matrix.h",
"chars": 1063,
"preview": "/**\n * Copyright (c) 2016-present, Facebook, Inc.\n * All rights reserved.\n *\n * This source code is licensed under the M"
},
{
"path": "src/meter.cc",
"chars": 6129,
"preview": "/**\n * Copyright (c) 2016-present, Facebook, Inc.\n * All rights reserved.\n *\n * This source code is licensed under the M"
},
{
"path": "src/meter.h",
"chars": 2510,
"preview": "/**\n * Copyright (c) 2016-present, Facebook, Inc.\n * All rights reserved.\n *\n * This source code is licensed under the M"
},
{
"path": "src/model.cc",
"chars": 2168,
"preview": "/**\n * Copyright (c) 2016-present, Facebook, Inc.\n * All rights reserved.\n *\n * This source code is licensed under the M"
},
{
"path": "src/model.h",
"chars": 1720,
"preview": "/**\n * Copyright (c) 2016-present, Facebook, Inc.\n * All rights reserved.\n *\n * This source code is licensed under the M"
},
{
"path": "src/productquantizer.cc",
"chars": 6234,
"preview": "/**\n * Copyright (c) 2016-present, Facebook, Inc.\n * All rights reserved.\n *\n * This source code is licensed under the M"
},
{
"path": "src/productquantizer.h",
"chars": 1607,
"preview": "/**\n * Copyright (c) 2016-present, Facebook, Inc.\n * All rights reserved.\n *\n * This source code is licensed under the M"
},
{
"path": "src/quantmatrix.cc",
"chars": 3595,
"preview": "/**\n * Copyright (c) 2016-present, Facebook, Inc.\n * All rights reserved.\n *\n * This source code is licensed under the M"
},
{
"path": "src/quantmatrix.h",
"chars": 1562,
"preview": "/**\n * Copyright (c) 2016-present, Facebook, Inc.\n * All rights reserved.\n *\n * This source code is licensed under the M"
},
{
"path": "src/real.h",
"chars": 266,
"preview": "/**\n * Copyright (c) 2016-present, Facebook, Inc.\n * All rights reserved.\n *\n * This source code is licensed under the M"
},
{
"path": "src/utils.cc",
"chars": 1275,
"preview": "/**\n * Copyright (c) 2016-present, Facebook, Inc.\n * All rights reserved.\n *\n * This source code is licensed under the M"
},
{
"path": "src/utils.h",
"chars": 1722,
"preview": "/**\n * Copyright (c) 2016-present, Facebook, Inc.\n * All rights reserved.\n *\n * This source code is licensed under the M"
},
{
"path": "src/vector.cc",
"chars": 1961,
"preview": "/**\n * Copyright (c) 2016-present, Facebook, Inc.\n * All rights reserved.\n *\n * This source code is licensed under the M"
},
{
"path": "src/vector.h",
"chars": 1296,
"preview": "/**\n * Copyright (c) 2016-present, Facebook, Inc.\n * All rights reserved.\n *\n * This source code is licensed under the M"
},
{
"path": "tests/fetch_test_data.sh",
"chars": 6658,
"preview": "#!/usr/bin/env bash\n#\n# Copyright (c) 2016-present, Facebook, Inc.\n# All rights reserved.\n#\n# This source code is licens"
},
{
"path": "webassembly/README.md",
"chars": 1168,
"preview": " fastText [](https://circleci.co"
},
{
"path": "webassembly/doc/examples/misc.html",
"chars": 1910,
"preview": "<!DOCTYPE html>\n<html>\n<head>\n <meta charset=\"UTF-8\">\n <meta name=\"viewport\" content=\"width=device-width, initial-"
},
{
"path": "webassembly/doc/examples/predict.html",
"chars": 1250,
"preview": "<!DOCTYPE html>\n<html>\n<head>\n <meta charset=\"UTF-8\">\n <meta name=\"viewport\" content=\"width=device-width, initial-"
},
{
"path": "webassembly/doc/examples/train_supervised.html",
"chars": 2210,
"preview": "<!DOCTYPE html>\n<html>\n<head>\n <meta charset=\"UTF-8\">\n <meta name=\"viewport\" content=\"width=device-width, initial-"
},
{
"path": "webassembly/doc/examples/train_unsupervised.html",
"chars": 1309,
"preview": "<!DOCTYPE html>\n<html>\n<head>\n <meta charset=\"UTF-8\">\n <meta name=\"viewport\" content=\"width=device-width, initial-"
},
{
"path": "webassembly/fasttext.js",
"chars": 14042,
"preview": "/**\n * Copyright (c) 2016-present, Facebook, Inc.\n * All rights reserved.\n *\n * This source code is licensed under the M"
},
{
"path": "webassembly/fasttext_wasm.cc",
"chars": 11086,
"preview": "/**\n * Copyright (c) 2016-present, Facebook, Inc.\n * All rights reserved.\n *\n * This source code is licensed under the M"
},
{
"path": "website/README.md",
"chars": 116,
"preview": "Prerequisites\n- nodejs\n\nTo build locally, navigate into subfolder website and execute\n- npm install\n- npm run start\n"
},
{
"path": "website/blog/2016-08-18-blog-post.md",
"chars": 6199,
"preview": "---\ntitle: Releasing fastText\nauthor: Edouard Grave\nauthorURL: https://research.fb.com/people/grave-edouard/\nauthorFBID:"
},
{
"path": "website/blog/2017-05-02-blog-post.md",
"chars": 8643,
"preview": "---\ntitle: fastText on mobile\nauthor: Armand Joulin\nauthorURL: https://research.fb.com/people/joulin-armand/\nauthorFBID:"
},
{
"path": "website/blog/2017-10-02-blog-post.md",
"chars": 6934,
"preview": "---\ntitle: Language identification\nauthor: Edouard Grave\nauthorURL: https://research.fb.com/people/grave-edouard/\nauthor"
},
{
"path": "website/blog/2019-06-25-blog-post.md",
"chars": 7095,
"preview": "---\ntitle: New release of python module\nauthor: Onur Çelebi\nauthorURL: https://research.fb.com/people/celebi-onur/\nautho"
},
{
"path": "website/core/Footer.js",
"chars": 3409,
"preview": "/**\n * Copyright (c) 2017-present, Facebook, Inc.\n * All rights reserved.\n *\n * This source code is licensed under the M"
},
{
"path": "website/package.json",
"chars": 274,
"preview": "{\n \"scripts\": {\n \"start\": \"docusaurus-start\",\n \"build\": \"docusaurus-build\",\n \"publish-gh-pages\": \"docusaurus-p"
},
{
"path": "website/pages/en/index.js",
"chars": 10564,
"preview": "/**\n * Copyright (c) 2017-present, Facebook, Inc.\n * All rights reserved.\n *\n * This source code is licensed under the M"
},
{
"path": "website/sidebars.json",
"chars": 461,
"preview": "{\n \"docs\": {\n \"Introduction\": [\"support\", \"cheatsheet\", \"options\"],\n \"Tutorials\": [\"supervised-tutorial\", \"unsupe"
},
{
"path": "website/siteConfig.js",
"chars": 3138,
"preview": "/**\n * Copyright (c) 2017-present, Facebook, Inc.\n * All rights reserved.\n *\n * This source code is licensed under the M"
},
{
"path": "website/static/docs/en/html/.classfasttext_1_1QMatrix-members.html.i4eKqy",
"chars": 0,
"preview": ""
},
{
"path": "website/static/docs/en/html/annotated.html",
"chars": 6919,
"preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
},
{
"path": "website/static/docs/en/html/annotated_dup.js",
"chars": 90,
"preview": "var annotated_dup =\n[\n [ \"fasttext\", \"namespacefasttext.html\", \"namespacefasttext\" ]\n];"
},
{
"path": "website/static/docs/en/html/args_8cc.html",
"chars": 4595,
"preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
},
{
"path": "website/static/docs/en/html/args_8h.html",
"chars": 6900,
"preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
},
{
"path": "website/static/docs/en/html/args_8h.js",
"chars": 885,
"preview": "var args_8h =\n[\n [ \"Args\", \"classfasttext_1_1Args.html\", \"classfasttext_1_1Args\" ],\n [ \"loss_name\", \"args_8h.html#"
},
{
"path": "website/static/docs/en/html/args_8h_source.html",
"chars": 28715,
"preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
},
{
"path": "website/static/docs/en/html/classes.html",
"chars": 8054,
"preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
},
{
"path": "website/static/docs/en/html/classfasttext_1_1Args-members.html",
"chars": 12994,
"preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
},
{
"path": "website/static/docs/en/html/classfasttext_1_1Args.html",
"chars": 35335,
"preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
},
{
"path": "website/static/docs/en/html/classfasttext_1_1Args.js",
"chars": 3334,
"preview": "var classfasttext_1_1Args =\n[\n [ \"Args\", \"classfasttext_1_1Args.html#ab196dccd500190c3831af2cbfdd3eb03\", null ],\n "
},
{
"path": "website/static/docs/en/html/classfasttext_1_1Dictionary-members.html",
"chars": 17712,
"preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
},
{
"path": "website/static/docs/en/html/classfasttext_1_1Dictionary.html",
"chars": 57455,
"preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
},
{
"path": "website/static/docs/en/html/classfasttext_1_1Dictionary.js",
"chars": 3886,
"preview": "var classfasttext_1_1Dictionary =\n[\n [ \"Dictionary\", \"classfasttext_1_1Dictionary.html#ae0f87ea47dcc779231cd0d2cd6607"
},
{
"path": "website/static/docs/en/html/classfasttext_1_1FastText-members.html",
"chars": 15966,
"preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
},
{
"path": "website/static/docs/en/html/classfasttext_1_1FastText.html",
"chars": 53276,
"preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
},
{
"path": "website/static/docs/en/html/classfasttext_1_1FastText.js",
"chars": 4031,
"preview": "var classfasttext_1_1FastText =\n[\n [ \"FastText\", \"classfasttext_1_1FastText.html#a3f1c81aafc45ad71824b332f5cb577d5\", "
},
{
"path": "website/static/docs/en/html/classfasttext_1_1Matrix-members.html",
"chars": 9273,
"preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
},
{
"path": "website/static/docs/en/html/classfasttext_1_1Matrix.html",
"chars": 28475,
"preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
},
{
"path": "website/static/docs/en/html/classfasttext_1_1Matrix.js",
"chars": 1829,
"preview": "var classfasttext_1_1Matrix =\n[\n [ \"Matrix\", \"classfasttext_1_1Matrix.html#ae3eed8f78b046582d6504eaae17b9890\", null ]"
},
{
"path": "website/static/docs/en/html/classfasttext_1_1Model-members.html",
"chars": 17645,
"preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
},
{
"path": "website/static/docs/en/html/classfasttext_1_1Model.html",
"chars": 65808,
"preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
},
{
"path": "website/static/docs/en/html/classfasttext_1_1Model.js",
"chars": 4138,
"preview": "var classfasttext_1_1Model =\n[\n [ \"Model\", \"classfasttext_1_1Model.html#a63f17ed51e4a9adf73322bf62d2cf338\", null ],\n "
},
{
"path": "website/static/docs/en/html/classfasttext_1_1ProductQuantizer-members.html",
"chars": 12796,
"preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
},
{
"path": "website/static/docs/en/html/classfasttext_1_1ProductQuantizer.html",
"chars": 43372,
"preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
},
{
"path": "website/static/docs/en/html/classfasttext_1_1ProductQuantizer.js",
"chars": 2913,
"preview": "var classfasttext_1_1ProductQuantizer =\n[\n [ \"ProductQuantizer\", \"classfasttext_1_1ProductQuantizer.html#a08b62937f90"
},
{
"path": "website/static/docs/en/html/classfasttext_1_1QMatrix-members.html",
"chars": 9131,
"preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
},
{
"path": "website/static/docs/en/html/classfasttext_1_1QMatrix.html",
"chars": 25131,
"preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
},
{
"path": "website/static/docs/en/html/classfasttext_1_1QMatrix.js",
"chars": 1767,
"preview": "var classfasttext_1_1QMatrix =\n[\n [ \"QMatrix\", \"classfasttext_1_1QMatrix.html#a976442aaed5b1afee2f2cd4473c0d62b\", nul"
},
{
"path": "website/static/docs/en/html/classfasttext_1_1Vector-members.html",
"chars": 8645,
"preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
},
{
"path": "website/static/docs/en/html/classfasttext_1_1Vector.html",
"chars": 25888,
"preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
},
{
"path": "website/static/docs/en/html/classfasttext_1_1Vector.js",
"chars": 1648,
"preview": "var classfasttext_1_1Vector =\n[\n [ \"Vector\", \"classfasttext_1_1Vector.html#ab7f9177915b3d3837213abb15de9b939\", null ]"
},
{
"path": "website/static/docs/en/html/dictionary_8cc.html",
"chars": 4769,
"preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
},
{
"path": "website/static/docs/en/html/dictionary_8h.html",
"chars": 7231,
"preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
},
{
"path": "website/static/docs/en/html/dictionary_8h.js",
"chars": 582,
"preview": "var dictionary_8h =\n[\n [ \"entry\", \"structfasttext_1_1entry.html\", \"structfasttext_1_1entry\" ],\n [ \"Dictionary\", \"c"
},
{
"path": "website/static/docs/en/html/dictionary_8h_source.html",
"chars": 29851,
"preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
},
{
"path": "website/static/docs/en/html/dir_68267d1309a1af8e8297ef4c3efbcdba.html",
"chars": 9946,
"preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
},
{
"path": "website/static/docs/en/html/dir_68267d1309a1af8e8297ef4c3efbcdba.js",
"chars": 1354,
"preview": "var dir_68267d1309a1af8e8297ef4c3efbcdba =\n[\n [ \"args.cc\", \"args_8cc.html\", null ],\n [ \"args.h\", \"args_8h.html\", \""
},
{
"path": "website/static/docs/en/html/doxygen.css",
"chars": 27960,
"preview": "/* The standard CSS for doxygen 1.8.13 */\n\nbody, table, div, p, dl {\n\tfont: 400 14px/22px Roboto,sans-serif;\n}\n\np.refere"
},
{
"path": "website/static/docs/en/html/dynsections.js",
"chars": 3140,
"preview": "function toggleVisibility(linkObj)\n{\n var base = $(linkObj).attr('id');\n var summary = $('#'+base+'-summary');\n var cont"
},
{
"path": "website/static/docs/en/html/fasttext_8cc.html",
"chars": 4881,
"preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
},
{
"path": "website/static/docs/en/html/fasttext_8h.html",
"chars": 8074,
"preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
},
{
"path": "website/static/docs/en/html/fasttext_8h.js",
"chars": 295,
"preview": "var fasttext_8h =\n[\n [ \"FastText\", \"classfasttext_1_1FastText.html\", \"classfasttext_1_1FastText\" ],\n [ \"FASTTEXT_F"
},
{
"path": "website/static/docs/en/html/fasttext_8h_source.html",
"chars": 38150,
"preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
},
{
"path": "website/static/docs/en/html/files.html",
"chars": 9252,
"preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
},
{
"path": "website/static/docs/en/html/files.js",
"chars": 115,
"preview": "var files =\n[\n [ \"src\", \"dir_68267d1309a1af8e8297ef4c3efbcdba.html\", \"dir_68267d1309a1af8e8297ef4c3efbcdba\" ]\n];"
},
{
"path": "website/static/docs/en/html/functions.html",
"chars": 5538,
"preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
},
{
"path": "website/static/docs/en/html/functions_0x7e.html",
"chars": 4137,
"preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
},
{
"path": "website/static/docs/en/html/functions_b.html",
"chars": 4252,
"preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
},
{
"path": "website/static/docs/en/html/functions_c.html",
"chars": 5602,
"preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
},
{
"path": "website/static/docs/en/html/functions_d.html",
"chars": 5258,
"preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
},
{
"path": "website/static/docs/en/html/functions_dup.js",
"chars": 960,
"preview": "var functions_dup =\n[\n [ \"a\", \"functions.html\", null ],\n [ \"b\", \"functions_b.html\", null ],\n [ \"c\", \"functions_"
},
{
"path": "website/static/docs/en/html/functions_e.html",
"chars": 4287,
"preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
},
{
"path": "website/static/docs/en/html/functions_f.html",
"chars": 4144,
"preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
},
{
"path": "website/static/docs/en/html/functions_func.html",
"chars": 22018,
"preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
},
{
"path": "website/static/docs/en/html/functions_g.html",
"chars": 5622,
"preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
},
{
"path": "website/static/docs/en/html/functions_h.html",
"chars": 4135,
"preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
},
{
"path": "website/static/docs/en/html/functions_i.html",
"chars": 4542,
"preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
},
{
"path": "website/static/docs/en/html/functions_k.html",
"chars": 3912,
"preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
},
{
"path": "website/static/docs/en/html/functions_l.html",
"chars": 5585,
"preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
},
{
"path": "website/static/docs/en/html/functions_m.html",
"chars": 6075,
"preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
},
{
"path": "website/static/docs/en/html/functions_n.html",
"chars": 6489,
"preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
},
{
"path": "website/static/docs/en/html/functions_o.html",
"chars": 4356,
"preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
},
{
"path": "website/static/docs/en/html/functions_p.html",
"chars": 6401,
"preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
},
{
"path": "website/static/docs/en/html/functions_q.html",
"chars": 5222,
"preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
},
{
"path": "website/static/docs/en/html/functions_r.html",
"chars": 4386,
"preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
},
{
"path": "website/static/docs/en/html/functions_s.html",
"chars": 6551,
"preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
},
{
"path": "website/static/docs/en/html/functions_t.html",
"chars": 5369,
"preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
},
{
"path": "website/static/docs/en/html/functions_u.html",
"chars": 3874,
"preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
},
{
"path": "website/static/docs/en/html/functions_v.html",
"chars": 3870,
"preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
},
{
"path": "website/static/docs/en/html/functions_vars.html",
"chars": 18204,
"preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
},
{
"path": "website/static/docs/en/html/functions_w.html",
"chars": 4624,
"preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
},
{
"path": "website/static/docs/en/html/functions_z.html",
"chars": 3854,
"preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
},
{
"path": "website/static/docs/en/html/globals.html",
"chars": 6228,
"preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
},
{
"path": "website/static/docs/en/html/globals_defs.html",
"chars": 4059,
"preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
},
{
"path": "website/static/docs/en/html/globals_func.html",
"chars": 5564,
"preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
},
{
"path": "website/static/docs/en/html/index.html",
"chars": 3585,
"preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
},
{
"path": "website/static/docs/en/html/jquery.js",
"chars": 172871,
"preview": "/*!\n * jQuery JavaScript Library v1.7.1\n * http://jquery.com/\n *\n * Copyright 2011, John Resig\n * Dual licensed under th"
},
{
"path": "website/static/docs/en/html/main_8cc.html",
"chars": 24092,
"preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
},
{
"path": "website/static/docs/en/html/main_8cc.js",
"chars": 1582,
"preview": "var main_8cc =\n[\n [ \"analogies\", \"main_8cc.html#a7ffcd938d3c75d2f9249d6c122b780a4\", null ],\n [ \"main\", \"main_8cc.h"
},
{
"path": "website/static/docs/en/html/matrix_8cc.html",
"chars": 4752,
"preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
},
{
"path": "website/static/docs/en/html/matrix_8h.html",
"chars": 5159,
"preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
},
{
"path": "website/static/docs/en/html/matrix_8h_source.html",
"chars": 20187,
"preview": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\""
}
]
// ... and 262 more files (download for full content)
About this extraction
This page contains the full source code of the facebookresearch/fastText GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 462 files (2.1 MB), approximately 569.7k tokens, and a symbol index with 329 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.