Copy disabled (too large)
Download .txt
Showing preview only (15,886K chars total). Download the full file to get everything.
Repository: huggingface/sentence-transformers
Branch: main
Commit: aebd46c05d3d
Files: 537
Total size: 16.9 MB
Directory structure:
gitextract_by8kvk5i/
├── .github/
│ └── workflows/
│ ├── quality.yml
│ └── tests.yml
├── .gitignore
├── .pre-commit-config.yaml
├── LICENSE
├── MANIFEST.in
├── Makefile
├── NOTICE.txt
├── README.md
├── docs/
│ ├── .htaccess
│ ├── Makefile
│ ├── _static/
│ │ ├── css/
│ │ │ └── custom.css
│ │ ├── html/
│ │ │ └── models_en_sentence_embeddings.html
│ │ └── js/
│ │ └── custom.js
│ ├── _templates/
│ │ └── layout.html
│ ├── conf.py
│ ├── cross_encoder/
│ │ ├── loss_overview.md
│ │ ├── pretrained_models.md
│ │ ├── training/
│ │ │ └── examples.rst
│ │ ├── training_overview.md
│ │ └── usage/
│ │ ├── efficiency.rst
│ │ └── usage.rst
│ ├── img/
│ │ └── logo.xcf
│ ├── installation.md
│ ├── migration_guide.md
│ ├── package_reference/
│ │ ├── cross_encoder/
│ │ │ ├── cross_encoder.md
│ │ │ ├── evaluation.md
│ │ │ ├── index.rst
│ │ │ ├── losses.md
│ │ │ ├── trainer.md
│ │ │ └── training_args.md
│ │ ├── sentence_transformer/
│ │ │ ├── SentenceTransformer.md
│ │ │ ├── datasets.md
│ │ │ ├── evaluation.md
│ │ │ ├── index.rst
│ │ │ ├── losses.md
│ │ │ ├── models.md
│ │ │ ├── quantization.md
│ │ │ ├── sampler.md
│ │ │ ├── trainer.md
│ │ │ └── training_args.md
│ │ ├── sparse_encoder/
│ │ │ ├── SparseEncoder.md
│ │ │ ├── callbacks.md
│ │ │ ├── evaluation.md
│ │ │ ├── index.rst
│ │ │ ├── losses.md
│ │ │ ├── models.md
│ │ │ ├── search_engines.md
│ │ │ ├── trainer.md
│ │ │ └── training_args.md
│ │ └── util.md
│ ├── pretrained-models/
│ │ ├── ce-msmarco.md
│ │ ├── dpr.md
│ │ ├── msmarco-v1.md
│ │ ├── msmarco-v2.md
│ │ ├── msmarco-v3.md
│ │ ├── msmarco-v5.md
│ │ ├── nli-models.md
│ │ ├── nq-v1.md
│ │ ├── sts-models.md
│ │ └── wikipedia-sections-models.md
│ ├── publications.md
│ ├── quickstart.rst
│ ├── requirements.txt
│ ├── sentence_transformer/
│ │ ├── dataset_overview.md
│ │ ├── loss_overview.md
│ │ ├── pretrained_models.md
│ │ ├── training/
│ │ │ ├── distributed.rst
│ │ │ └── examples.rst
│ │ ├── training_overview.md
│ │ └── usage/
│ │ ├── backend_export_sidebar.rst
│ │ ├── custom_models.rst
│ │ ├── efficiency.rst
│ │ ├── mteb_evaluation.md
│ │ ├── semantic_textual_similarity.rst
│ │ └── usage.rst
│ └── sparse_encoder/
│ ├── loss_overview.md
│ ├── pretrained_models.md
│ ├── training/
│ │ └── examples.rst
│ ├── training_overview.md
│ └── usage/
│ ├── efficiency.rst
│ └── usage.rst
├── examples/
│ ├── cross_encoder/
│ │ ├── applications/
│ │ │ ├── README.md
│ │ │ ├── cross-encoder_reranking.py
│ │ │ └── cross-encoder_usage.py
│ │ └── training/
│ │ ├── README.md
│ │ ├── distillation/
│ │ │ ├── README.md
│ │ │ ├── train_cross_encoder_kd_margin_mse.py
│ │ │ └── train_cross_encoder_kd_mse.py
│ │ ├── ms_marco/
│ │ │ ├── README.md
│ │ │ ├── eval_cross-encoder-trec-dl.py
│ │ │ ├── training_ms_marco_bce.py
│ │ │ ├── training_ms_marco_bce_preprocessed.py
│ │ │ ├── training_ms_marco_cmnrl.py
│ │ │ ├── training_ms_marco_lambda.py
│ │ │ ├── training_ms_marco_lambda_hard_neg.py
│ │ │ ├── training_ms_marco_lambda_preprocessed.py
│ │ │ ├── training_ms_marco_listmle.py
│ │ │ ├── training_ms_marco_listnet.py
│ │ │ ├── training_ms_marco_plistmle.py
│ │ │ └── training_ms_marco_ranknet.py
│ │ ├── nli/
│ │ │ ├── README.md
│ │ │ └── training_nli.py
│ │ ├── quora_duplicate_questions/
│ │ │ ├── README.md
│ │ │ └── training_quora_duplicate_questions.py
│ │ ├── rerankers/
│ │ │ ├── README.md
│ │ │ ├── training_gooaq_bce.py
│ │ │ ├── training_gooaq_cmnrl.py
│ │ │ ├── training_gooaq_lambda.py
│ │ │ └── training_nq_bce.py
│ │ └── sts/
│ │ ├── README.md
│ │ └── training_stsbenchmark.py
│ ├── sentence_transformer/
│ │ ├── README.md
│ │ ├── applications/
│ │ │ ├── README.md
│ │ │ ├── clustering/
│ │ │ │ ├── README.md
│ │ │ │ ├── agglomerative.py
│ │ │ │ ├── fast_clustering.py
│ │ │ │ └── kmeans.py
│ │ │ ├── computing-embeddings/
│ │ │ │ ├── README.rst
│ │ │ │ ├── computing_embeddings.py
│ │ │ │ ├── computing_embeddings_multi_gpu.py
│ │ │ │ └── computing_embeddings_streaming.py
│ │ │ ├── embedding-quantization/
│ │ │ │ ├── README.md
│ │ │ │ ├── semantic_search_faiss.py
│ │ │ │ ├── semantic_search_faiss_benchmark.py
│ │ │ │ ├── semantic_search_recommended.py
│ │ │ │ ├── semantic_search_usearch.py
│ │ │ │ └── semantic_search_usearch_benchmark.py
│ │ │ ├── image-search/
│ │ │ │ ├── Image_Classification.ipynb
│ │ │ │ ├── Image_Clustering.ipynb
│ │ │ │ ├── Image_Duplicates.ipynb
│ │ │ │ ├── Image_Search-multilingual.ipynb
│ │ │ │ ├── Image_Search.ipynb
│ │ │ │ ├── README.md
│ │ │ │ └── example.py
│ │ │ ├── parallel-sentence-mining/
│ │ │ │ ├── README.md
│ │ │ │ ├── bitext_mining.py
│ │ │ │ ├── bitext_mining_utils.py
│ │ │ │ └── bucc2018.py
│ │ │ ├── paraphrase-mining/
│ │ │ │ └── README.md
│ │ │ ├── retrieve_rerank/
│ │ │ │ ├── README.md
│ │ │ │ ├── in_document_search_crossencoder.py
│ │ │ │ └── retrieve_rerank_simple_wikipedia.ipynb
│ │ │ ├── semantic-search/
│ │ │ │ ├── README.md
│ │ │ │ ├── semantic_search.py
│ │ │ │ ├── semantic_search_nq_opensearch.py
│ │ │ │ ├── semantic_search_publications.py
│ │ │ │ ├── semantic_search_quora_annoy.py
│ │ │ │ ├── semantic_search_quora_elasticsearch.py
│ │ │ │ ├── semantic_search_quora_faiss.py
│ │ │ │ ├── semantic_search_quora_hnswlib.py
│ │ │ │ ├── semantic_search_quora_pytorch.py
│ │ │ │ └── semantic_search_wikipedia_qa.py
│ │ │ └── text-summarization/
│ │ │ ├── LexRank.py
│ │ │ ├── README.md
│ │ │ └── text-summarization.py
│ │ ├── domain_adaptation/
│ │ │ └── README.md
│ │ ├── evaluation/
│ │ │ ├── evaluation_inference_speed.py
│ │ │ ├── evaluation_no_dup_batch_sampler_speed.py
│ │ │ ├── evaluation_stsbenchmark.py
│ │ │ └── evaluation_translation_matching.py
│ │ ├── training/
│ │ │ ├── README.md
│ │ │ ├── adaptive_layer/
│ │ │ │ ├── README.md
│ │ │ │ ├── adaptive_layer_nli.py
│ │ │ │ └── adaptive_layer_sts.py
│ │ │ ├── avg_word_embeddings/
│ │ │ │ ├── training_stsbenchmark_avg_word_embeddings.py
│ │ │ │ ├── training_stsbenchmark_bilstm.py
│ │ │ │ ├── training_stsbenchmark_bow.py
│ │ │ │ ├── training_stsbenchmark_cnn.py
│ │ │ │ └── training_stsbenchmark_tf-idf_word_embeddings.py
│ │ │ ├── clip/
│ │ │ │ ├── train_clip.ipynb
│ │ │ │ └── training_clip_flickr8k_mlflow.py
│ │ │ ├── data_augmentation/
│ │ │ │ ├── README.md
│ │ │ │ ├── train_sts_indomain_bm25.py
│ │ │ │ ├── train_sts_indomain_nlpaug.py
│ │ │ │ ├── train_sts_indomain_semantic.py
│ │ │ │ ├── train_sts_qqp_crossdomain.py
│ │ │ │ └── train_sts_seed_optimization.py
│ │ │ ├── distillation/
│ │ │ │ ├── README.md
│ │ │ │ ├── dimensionality_reduction.py
│ │ │ │ ├── model_distillation.py
│ │ │ │ ├── model_distillation_layer_reduction.py
│ │ │ │ └── model_quantization.py
│ │ │ ├── hpo/
│ │ │ │ ├── README.rst
│ │ │ │ └── hpo_nli.py
│ │ │ ├── matryoshka/
│ │ │ │ ├── 2d_matryoshka_nli.py
│ │ │ │ ├── 2d_matryoshka_sts.py
│ │ │ │ ├── README.md
│ │ │ │ ├── matryoshka_eval_stsb.py
│ │ │ │ ├── matryoshka_nli.py
│ │ │ │ ├── matryoshka_nli_reduced_dim.py
│ │ │ │ └── matryoshka_sts.py
│ │ │ ├── ms_marco/
│ │ │ │ ├── README.md
│ │ │ │ ├── eval_msmarco.py
│ │ │ │ ├── multilingual/
│ │ │ │ │ ├── README.md
│ │ │ │ │ └── translate_queries.py
│ │ │ │ ├── train-kldiv.py
│ │ │ │ ├── train-margin-mse.py
│ │ │ │ ├── train_bi-encoder_margin-mse.py
│ │ │ │ └── train_bi-encoder_mnrl.py
│ │ │ ├── multilingual/
│ │ │ │ ├── README.md
│ │ │ │ ├── get_parallel_data_opus.py
│ │ │ │ ├── get_parallel_data_talks.py
│ │ │ │ ├── get_parallel_data_tatoeba.py
│ │ │ │ ├── get_parallel_data_wikimatrix.py
│ │ │ │ └── make_multilingual.py
│ │ │ ├── nli/
│ │ │ │ ├── README.md
│ │ │ │ ├── training_nli.py
│ │ │ │ ├── training_nli_angle.py
│ │ │ │ ├── training_nli_v2.py
│ │ │ │ └── training_nli_v3.py
│ │ │ ├── other/
│ │ │ │ ├── training_batch_hard_trec.py
│ │ │ │ ├── training_gooaq_infonce_gor.py
│ │ │ │ ├── training_multi-task.py
│ │ │ │ └── training_wikipedia_sections.py
│ │ │ ├── paraphrases/
│ │ │ │ ├── README.md
│ │ │ │ └── training.py
│ │ │ ├── peft/
│ │ │ │ ├── README.md
│ │ │ │ └── training_gooaq_lora.py
│ │ │ ├── prompts/
│ │ │ │ ├── README.md
│ │ │ │ └── training_nq_prompts.py
│ │ │ ├── quora_duplicate_questions/
│ │ │ │ ├── README.md
│ │ │ │ ├── application_duplicate_questions_mining.py
│ │ │ │ ├── create_splits.py
│ │ │ │ ├── training_MultipleNegativesRankingLoss.py
│ │ │ │ ├── training_OnlineContrastiveLoss.py
│ │ │ │ └── training_multi-task-learning.py
│ │ │ ├── sts/
│ │ │ │ ├── README.md
│ │ │ │ ├── training_stsbenchmark.py
│ │ │ │ └── training_stsbenchmark_continue_training.py
│ │ │ └── unsloth/
│ │ │ ├── README.md
│ │ │ ├── training_gooaq_unsloth.py
│ │ │ └── training_medical_unsloth.py
│ │ └── unsupervised_learning/
│ │ ├── CT/
│ │ │ ├── README.md
│ │ │ ├── train_askubuntu_ct.py
│ │ │ ├── train_ct_from_file.py
│ │ │ └── train_stsb_ct.py
│ │ ├── CT_In-Batch_Negatives/
│ │ │ ├── README.md
│ │ │ ├── train_askubuntu_ct-improved.py
│ │ │ ├── train_ct-improved_from_file.py
│ │ │ └── train_stsb_ct-improved.py
│ │ ├── MLM/
│ │ │ ├── README.md
│ │ │ └── train_mlm.py
│ │ ├── README.md
│ │ ├── SimCSE/
│ │ │ ├── README.md
│ │ │ ├── train_askubuntu_simcse.py
│ │ │ ├── train_simcse_from_file.py
│ │ │ └── train_stsb_simcse.py
│ │ ├── TSDAE/
│ │ │ ├── README.md
│ │ │ ├── eval_askubuntu.py
│ │ │ ├── train_askubuntu_tsdae.py
│ │ │ ├── train_stsb_tsdae.py
│ │ │ └── train_tsdae_from_file.py
│ │ └── query_generation/
│ │ ├── 1_programming_query_generation.py
│ │ ├── 2_programming_train_bi-encoder.py
│ │ ├── 3_programming_semantic_search.py
│ │ ├── README.md
│ │ └── example_query_generation.py
│ └── sparse_encoder/
│ ├── applications/
│ │ ├── README.md
│ │ ├── computing_embeddings/
│ │ │ ├── README.rst
│ │ │ └── compute_embeddings.py
│ │ ├── retrieve_rerank/
│ │ │ ├── README.md
│ │ │ ├── hybrid_search.py
│ │ │ └── retrieve_rerank_simple_wikipedia.ipynb
│ │ ├── semantic_search/
│ │ │ ├── README.md
│ │ │ ├── semantic_search_elasticsearch.py
│ │ │ ├── semantic_search_manual.py
│ │ │ ├── semantic_search_opensearch.py
│ │ │ ├── semantic_search_qdrant.py
│ │ │ ├── semantic_search_seismic.py
│ │ │ └── semantic_search_splade_index.py
│ │ └── semantic_textual_similarity/
│ │ ├── README.md
│ │ └── semantic_textual_similarity.py
│ ├── evaluation/
│ │ ├── README.md
│ │ ├── sparse_classification_evaluator.py
│ │ ├── sparse_mse_evaluator.py
│ │ ├── sparse_nanobeir_advanced_evaluator.py
│ │ ├── sparse_nanobeir_evaluator.py
│ │ ├── sparse_reranking_evaluator.py
│ │ ├── sparse_retrieval_evaluator.py
│ │ ├── sparse_similarity_evaluator.py
│ │ ├── sparse_translation_evaluator.py
│ │ └── sparse_triplet_evaluator.py
│ └── training/
│ ├── README.md
│ ├── distillation/
│ │ ├── README.md
│ │ └── train_splade_msmarco_margin_mse.py
│ ├── ms_marco/
│ │ ├── README.md
│ │ └── train_splade_msmarco_mnrl.py
│ ├── nli/
│ │ ├── README.md
│ │ └── train_splade_nli.py
│ ├── peft/
│ │ └── train_splade_gooaq_peft.py
│ ├── quora_duplicate_questions/
│ │ ├── README.md
│ │ └── training_splade_quora.py
│ ├── retrievers/
│ │ ├── README.md
│ │ ├── train_csr_nq.py
│ │ ├── train_splade_gooaq.py
│ │ ├── train_splade_nq.py
│ │ └── train_splade_nq_cached.py
│ └── sts/
│ ├── README.md
│ └── train_splade_stsbenchmark.py
├── index.rst
├── pyproject.toml
├── sentence_transformers/
│ ├── LoggingHandler.py
│ ├── SentenceTransformer.py
│ ├── __init__.py
│ ├── backend/
│ │ ├── __init__.py
│ │ ├── load.py
│ │ ├── optimize.py
│ │ ├── quantize.py
│ │ └── utils.py
│ ├── cross_encoder/
│ │ ├── CrossEncoder.py
│ │ ├── __init__.py
│ │ ├── data_collator.py
│ │ ├── evaluation/
│ │ │ ├── __init__.py
│ │ │ ├── classification.py
│ │ │ ├── correlation.py
│ │ │ ├── deprecated.py
│ │ │ ├── nano_beir.py
│ │ │ └── reranking.py
│ │ ├── fit_mixin.py
│ │ ├── losses/
│ │ │ ├── BinaryCrossEntropyLoss.py
│ │ │ ├── CachedMultipleNegativesRankingLoss.py
│ │ │ ├── CrossEntropyLoss.py
│ │ │ ├── LambdaLoss.py
│ │ │ ├── ListMLELoss.py
│ │ │ ├── ListNetLoss.py
│ │ │ ├── MSELoss.py
│ │ │ ├── MarginMSELoss.py
│ │ │ ├── MultipleNegativesRankingLoss.py
│ │ │ ├── PListMLELoss.py
│ │ │ ├── RankNetLoss.py
│ │ │ └── __init__.py
│ │ ├── model_card.py
│ │ ├── model_card_template.md
│ │ ├── trainer.py
│ │ ├── training_args.py
│ │ └── util.py
│ ├── data_collator.py
│ ├── datasets/
│ │ ├── DenoisingAutoEncoderDataset.py
│ │ ├── NoDuplicatesDataLoader.py
│ │ ├── ParallelSentencesDataset.py
│ │ ├── SentenceLabelDataset.py
│ │ ├── SentencesDataset.py
│ │ └── __init__.py
│ ├── evaluation/
│ │ ├── BinaryClassificationEvaluator.py
│ │ ├── EmbeddingSimilarityEvaluator.py
│ │ ├── InformationRetrievalEvaluator.py
│ │ ├── LabelAccuracyEvaluator.py
│ │ ├── MSEEvaluator.py
│ │ ├── MSEEvaluatorFromDataFrame.py
│ │ ├── NanoBEIREvaluator.py
│ │ ├── ParaphraseMiningEvaluator.py
│ │ ├── RerankingEvaluator.py
│ │ ├── SentenceEvaluator.py
│ │ ├── SequentialEvaluator.py
│ │ ├── SimilarityFunction.py
│ │ ├── TranslationEvaluator.py
│ │ ├── TripletEvaluator.py
│ │ └── __init__.py
│ ├── fit_mixin.py
│ ├── losses/
│ │ ├── AdaptiveLayerLoss.py
│ │ ├── AnglELoss.py
│ │ ├── BatchAllTripletLoss.py
│ │ ├── BatchHardSoftMarginTripletLoss.py
│ │ ├── BatchHardTripletLoss.py
│ │ ├── BatchSemiHardTripletLoss.py
│ │ ├── CachedGISTEmbedLoss.py
│ │ ├── CachedMultipleNegativesRankingLoss.py
│ │ ├── CachedMultipleNegativesSymmetricRankingLoss.py
│ │ ├── CoSENTLoss.py
│ │ ├── ContrastiveLoss.py
│ │ ├── ContrastiveTensionLoss.py
│ │ ├── CosineSimilarityLoss.py
│ │ ├── DenoisingAutoEncoderLoss.py
│ │ ├── DistillKLDivLoss.py
│ │ ├── GISTEmbedLoss.py
│ │ ├── GlobalOrthogonalRegularizationLoss.py
│ │ ├── MSELoss.py
│ │ ├── MarginMSELoss.py
│ │ ├── Matryoshka2dLoss.py
│ │ ├── MatryoshkaLoss.py
│ │ ├── MegaBatchMarginLoss.py
│ │ ├── MultipleNegativesRankingLoss.py
│ │ ├── MultipleNegativesSymmetricRankingLoss.py
│ │ ├── OnlineContrastiveLoss.py
│ │ ├── SoftmaxLoss.py
│ │ ├── TripletLoss.py
│ │ └── __init__.py
│ ├── model_card.py
│ ├── model_card_template.md
│ ├── model_card_templates.py
│ ├── models/
│ │ ├── BoW.py
│ │ ├── CLIPModel.py
│ │ ├── CNN.py
│ │ ├── Dense.py
│ │ ├── Dropout.py
│ │ ├── InputModule.py
│ │ ├── LSTM.py
│ │ ├── LayerNorm.py
│ │ ├── Module.py
│ │ ├── Normalize.py
│ │ ├── Pooling.py
│ │ ├── Router.py
│ │ ├── StaticEmbedding.py
│ │ ├── Transformer.py
│ │ ├── WeightedLayerPooling.py
│ │ ├── WordEmbeddings.py
│ │ ├── WordWeights.py
│ │ ├── __init__.py
│ │ └── tokenizer/
│ │ ├── PhraseTokenizer.py
│ │ ├── WhitespaceTokenizer.py
│ │ ├── WordTokenizer.py
│ │ └── __init__.py
│ ├── peft_mixin.py
│ ├── py.typed
│ ├── quantization.py
│ ├── readers/
│ │ ├── InputExample.py
│ │ ├── LabelSentenceReader.py
│ │ ├── NLIDataReader.py
│ │ ├── PairedFilesReader.py
│ │ ├── STSDataReader.py
│ │ ├── TripletReader.py
│ │ └── __init__.py
│ ├── sampler.py
│ ├── similarity_functions.py
│ ├── sparse_encoder/
│ │ ├── SparseEncoder.py
│ │ ├── __init__.py
│ │ ├── callbacks/
│ │ │ ├── __init__.py
│ │ │ └── splade_callbacks.py
│ │ ├── data_collator.py
│ │ ├── evaluation/
│ │ │ ├── ReciprocalRankFusionEvaluator.py
│ │ │ ├── SparseBinaryClassificationEvaluator.py
│ │ │ ├── SparseEmbeddingSimilarityEvaluator.py
│ │ │ ├── SparseInformationRetrievalEvaluator.py
│ │ │ ├── SparseMSEEvaluator.py
│ │ │ ├── SparseNanoBEIREvaluator.py
│ │ │ ├── SparseRerankingEvaluator.py
│ │ │ ├── SparseTranslationEvaluator.py
│ │ │ ├── SparseTripletEvaluator.py
│ │ │ └── __init__.py
│ │ ├── losses/
│ │ │ ├── CSRLoss.py
│ │ │ ├── CachedSpladeLoss.py
│ │ │ ├── FlopsLoss.py
│ │ │ ├── SparseAnglELoss.py
│ │ │ ├── SparseCoSENTLoss.py
│ │ │ ├── SparseCosineSimilarityLoss.py
│ │ │ ├── SparseDistillKLDivLoss.py
│ │ │ ├── SparseMSELoss.py
│ │ │ ├── SparseMarginMSELoss.py
│ │ │ ├── SparseMultipleNegativesRankingLoss.py
│ │ │ ├── SparseTripletLoss.py
│ │ │ ├── SpladeLoss.py
│ │ │ └── __init__.py
│ │ ├── model_card.py
│ │ ├── model_card_template.md
│ │ ├── models/
│ │ │ ├── MLMTransformer.py
│ │ │ ├── SparseAutoEncoder.py
│ │ │ ├── SparseStaticEmbedding.py
│ │ │ ├── SpladePooling.py
│ │ │ └── __init__.py
│ │ ├── search_engines.py
│ │ ├── trainer.py
│ │ └── training_args.py
│ ├── trainer.py
│ ├── training_args.py
│ └── util/
│ ├── __init__.py
│ ├── decorators.py
│ ├── distributed.py
│ ├── environment.py
│ ├── file_io.py
│ ├── hard_negatives.py
│ ├── misc.py
│ ├── retrieval.py
│ ├── similarity.py
│ └── tensor.py
└── tests/
├── __init__.py
├── conftest.py
├── cross_encoder/
│ ├── __init__.py
│ ├── conftest.py
│ ├── test_backends.py
│ ├── test_cross_encoder.py
│ ├── test_deprecated_imports.py
│ ├── test_model_card.py
│ ├── test_multi_process.py
│ ├── test_pretrained.py
│ ├── test_train_stsb.py
│ └── test_trainer.py
├── evaluation/
│ ├── test_binary_classification_evaluator.py
│ ├── test_information_retrieval_evaluator.py
│ ├── test_label_accuracy_evaluator.py
│ ├── test_nanobeir_evaluator.py
│ ├── test_paraphrase_mining_evaluator.py
│ └── test_triplet_evaluator.py
├── losses/
│ └── test_MatryoshkaLoss.py
├── models/
│ ├── __init__.py
│ ├── test_dense.py
│ ├── test_pooling.py
│ ├── test_router.py
│ ├── test_static_embedding.py
│ └── test_transformer.py
├── samplers/
│ ├── test_group_by_label_batch_sampler.py
│ ├── test_no_duplicates_batch_sampler.py
│ └── test_round_robin_batch_sampler.py
├── sparse_encoder/
│ ├── __init__.py
│ ├── conftest.py
│ ├── models/
│ │ ├── __init__.py
│ │ ├── test_csr.py
│ │ └── test_sparse_static_embedding.py
│ ├── test_backends.py
│ ├── test_model_card.py
│ ├── test_multi_process.py
│ ├── test_opensearch_models.py
│ ├── test_pretrained.py
│ ├── test_sparse_encoder.py
│ ├── test_train_stsb.py
│ ├── test_trainer.py
│ └── utils.py
├── test_backends.py
├── test_cmnrl.py
├── test_compute_embeddings.py
├── test_custom_models.py
├── test_image_embeddings.py
├── test_model_card.py
├── test_model_card_data.py
├── test_multi_process.py
├── test_pretrained.py
├── test_pretrained_stsb.py
├── test_sentence_transformer.py
├── test_train_stsb.py
├── test_trainer.py
├── test_training_args.py
├── util/
│ ├── test_hard_negatives.py
│ ├── test_import.py
│ ├── test_retrieval.py
│ ├── test_similarity.py
│ └── test_tensor.py
└── utils.py
================================================
FILE CONTENTS
================================================
================================================
FILE: .github/workflows/quality.yml
================================================
name: Quality
on:
push:
branches:
- main
- "*-release"
- "*-pre"
pull_request:
branches:
- main
- "*-release"
- "*-pre"
workflow_dispatch:
jobs:
check_code_quality:
name: Check code quality
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v6
- name: Setup Python environment
uses: actions/setup-python@v6
with:
python-version: "3.10"
- name: Install uv
uses: astral-sh/setup-uv@v6
- name: Install pre-commit
run: uv pip install pre-commit --system
- name: Code quality
run: |
make check
================================================
FILE: .github/workflows/tests.yml
================================================
name: Unit tests
on:
push:
branches:
- main
- "*-release"
pull_request:
branches:
- main
- "*-release"
workflow_dispatch:
env:
TRANSFORMERS_IS_CI: 1
HF_HUB_DISABLE_PROGRESS_BARS: 1 # The Transformers v5 weight loading progress bars heavily expand the logs
jobs:
test_sampling:
name: Run unit tests
strategy:
matrix:
python-version: ["3.10", "3.11", "3.12", "3.13"]
os: [ubuntu-latest, windows-latest]
transformers-version: ["<5.0.0", ">=5.0.0"]
fail-fast: false
runs-on: ${{ matrix.os }}
steps:
- name: Remove unnecessary files
run: |
df -h /
# Remove software and language runtimes we're not using
sudo rm -rf \
"$AGENT_TOOLSDIRECTORY" \
/opt/google/chrome \
/opt/microsoft/msedge \
/opt/microsoft/powershell \
/opt/pipx \
/usr/lib/mono \
/usr/local/julia* \
/usr/local/lib/android \
/usr/local/lib/node_modules \
/usr/local/share/chromium \
/usr/local/share/powershell \
/usr/share/dotnet \
/usr/share/swift
df -h /
if: runner.os == 'Linux'
- name: Checkout code
uses: actions/checkout@v6
- name: Setup Python environment
uses: actions/setup-python@v6
with:
python-version: ${{ matrix.python-version }}
- name: Install uv
uses: astral-sh/setup-uv@v6
- name: Install dependencies (transformers < 5.0.0)
if: ${{ matrix.transformers-version == '<5.0.0' }}
run: uv pip install '.[train, onnx, openvino, dev]' 'transformers<5.0.0' --system
- name: Install dependencies (transformers >= 5.0.0)
if: ${{ matrix.transformers-version == '>=5.0.0' }}
run: uv pip install '.[train, dev]' 'transformers>=5.0.0' --system
- name: Install model2vec
run: uv pip install model2vec --system
if: ${{ contains(fromJSON('["3.10", "3.11", "3.12", "3.13"]'), matrix.python-version) }}
- name: Run unit tests
run: |
python -m pytest --durations 20 -sv tests/
================================================
FILE: .gitignore
================================================
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# Docs
/docs/_build/
/docs/make.bat
# Editors
.idea
.vscode
# Coverage
htmlcov
.coverage*
coverage.xml
# Examples
/examples/**/output/*
/examples/datasets/
/examples/embeddings/
/examples/sentence_transformer/training/quora_duplicate_questions/quora-IR-dataset/
examples/datasets/*/
# Specific files and folders
/pretrained-models/
/cheatsheet.txt
/testsuite.txt
/TODO.txt
# Virtual environments
.env
.venv
env/
venv/
# Database
/qdrant_storage
/elastic-start-local
# Others
*.pyc
*.gz
*.tsv
tmp_*.py
nr_*/
wandb
checkpoints
tmp
.DS_Store
/runs
/tmp_trainer/
================================================
FILE: .pre-commit-config.yaml
================================================
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.14.5
hooks:
- id: ruff
args: [--exit-non-zero-on-fix]
- id: ruff-format
================================================
FILE: LICENSE
================================================
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "{}"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2019 Nils Reimers
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
================================================
FILE: MANIFEST.in
================================================
include sentence_transformers/model_card_template.md
include sentence_transformers/cross_encoder/model_card_template.md
include sentence_transformers/sparse_encoder/model_card_template.md
================================================
FILE: Makefile
================================================
# Developer convenience targets. Each target is declared .PHONY because it names
# an action rather than a file to be built. The text after "## " on a target
# line is the description printed by the `help` target.
.PHONY: check
check: ## Run code quality tools.
@echo "Linting code via pre-commit"
@pre-commit run -a
.PHONY: test
test: ## Run unit tests
@pytest
.PHONY: test-cov
test-cov: ## Run unit tests and generate a coverage report
@pytest --cov-report term --cov-report=html --cov=sentence_transformers
# Self-documenting help: grep selects every line shaped like
# "<target>: ... ## <description>" from the files in MAKEFILE_LIST; awk then
# splits each line on ":.*?## " and prints the target name left-aligned in a
# 20-character column wrapped in ANSI colour code 36, followed by the
# description. A doubled dollar sign is make's escape for a literal one.
.PHONY: help
help: ## Show help for the commands.
@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-20s\033[0m %s\n", $$1, $$2}'
# Running plain `make` with no target prints the help text.
.DEFAULT_GOAL := help
================================================
FILE: NOTICE.txt
================================================
-------------------------------------------------------------------------------
Sentence Transformers
Copyright 2019-2025
Ubiquitous Knowledge Processing (UKP) Lab
Technische Universität Darmstadt
Copyright 2025-present
Hugging Face, Inc.
-------------------------------------------------------------------------------
================================================
FILE: README.md
================================================
<!--- BADGES: START --->
[](https://huggingface.co/models?library=sentence-transformers)
[][#github-license]
[][#pypi-package]
[][#pypi-package]
[][#docs-package]
<!-- [][#pypi-package] -->
<!--- BADGES: END --->
# Sentence Transformers: Embeddings, Retrieval, and Reranking
This framework provides an easy method to compute embeddings for accessing, using, and training state-of-the-art embedding and reranker models. It can be used to compute embeddings using Sentence Transformer models ([quickstart](https://sbert.net/docs/quickstart.html#sentence-transformer)), to calculate similarity scores using Cross-Encoder (a.k.a. reranker) models ([quickstart](https://sbert.net/docs/quickstart.html#cross-encoder)) or to generate sparse embeddings using Sparse Encoder models ([quickstart](https://sbert.net/docs/quickstart.html#sparse-encoder)). This unlocks a wide range of applications, including [semantic search](https://sbert.net/examples/applications/semantic-search/README.html), [semantic textual similarity](https://sbert.net/docs/sentence_transformer/usage/semantic_textual_similarity.html), and [paraphrase mining](https://sbert.net/examples/applications/paraphrase-mining/README.html).
A wide selection of over [15,000 pre-trained Sentence Transformers models](https://huggingface.co/models?library=sentence-transformers) are available for immediate use on 🤗 Hugging Face, including many of the state-of-the-art models from the [Massive Text Embeddings Benchmark (MTEB) leaderboard](https://huggingface.co/spaces/mteb/leaderboard). Additionally, it is easy to train or finetune your own [embedding models](https://sbert.net/docs/sentence_transformer/training_overview.html), [reranker models](https://sbert.net/docs/cross_encoder/training_overview.html) or [sparse encoder models](https://sbert.net/docs/sparse_encoder/training_overview.html) using Sentence Transformers, enabling you to create custom models for your specific use cases.
For the **full documentation**, see **[www.SBERT.net](https://www.sbert.net)**.
## Installation
We recommend **Python 3.10+**, **[PyTorch 1.11.0+](https://pytorch.org/get-started/locally/)**, and **[transformers v4.34.0+](https://github.com/huggingface/transformers)**.
**Install with pip**
```
pip install -U sentence-transformers
```
**Install with conda**
```
conda install -c conda-forge sentence-transformers
```
**Install from sources**
Alternatively, you can also clone the latest version from the [repository](https://github.com/huggingface/sentence-transformers) and install it directly from the source code:
```
pip install -e .
```
**PyTorch with CUDA**
If you want to use a GPU / CUDA, you must install PyTorch with the matching CUDA Version. Follow
[PyTorch - Get Started](https://pytorch.org/get-started/locally/) for further details on how to install PyTorch.
## Getting Started
See [Quickstart](https://www.sbert.net/docs/quickstart.html) in our documentation.
### Embedding Models
First download a pretrained embedding a.k.a. Sentence Transformer model.
```python
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("all-MiniLM-L6-v2")
```
Then provide some texts to the model.
```python
sentences = [
"The weather is lovely today.",
"It's so sunny outside!",
"He drove to the stadium.",
]
embeddings = model.encode(sentences)
print(embeddings.shape)
# => (3, 384)
```
And that's already it. We now have numpy arrays with the embeddings, one for each text. We can use these to compute similarities.
```python
similarities = model.similarity(embeddings, embeddings)
print(similarities)
# tensor([[1.0000, 0.6660, 0.1046],
# [0.6660, 1.0000, 0.1411],
# [0.1046, 0.1411, 1.0000]])
```
### Reranker Models
First download a pretrained reranker a.k.a. Cross Encoder model.
```python
from sentence_transformers import CrossEncoder
# 1. Load a pretrained CrossEncoder model
model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L6-v2")
```
Then provide some texts to the model.
```python
# The texts for which to predict similarity scores
query = "How many people live in Berlin?"
passages = [
"Berlin had a population of 3,520,031 registered inhabitants in an area of 891.82 square kilometers.",
"Berlin has a yearly total of about 135 million day visitors, making it one of the most-visited cities in the European Union.",
"In 2013 around 600,000 Berliners were registered in one of the more than 2,300 sport and fitness clubs.",
]
# 2a. predict scores for pairs of texts
scores = model.predict([(query, passage) for passage in passages])
print(scores)
# => [8.607139 5.506266 6.352977]
```
And we're good to go. You can also use [`model.rank`](https://sbert.net/docs/package_reference/cross_encoder/cross_encoder.html#sentence_transformers.cross_encoder.CrossEncoder.rank) to avoid having to perform the reranking manually:
```python
# 2b. Rank a list of passages for a query
ranks = model.rank(query, passages, return_documents=True)
print("Query:", query)
for rank in ranks:
print(f"- #{rank['corpus_id']} ({rank['score']:.2f}): {rank['text']}")
"""
Query: How many people live in Berlin?
- #0 (8.61): Berlin had a population of 3,520,031 registered inhabitants in an area of 891.82 square kilometers.
- #2 (6.35): In 2013 around 600,000 Berliners were registered in one of the more than 2,300 sport and fitness clubs.
- #1 (5.51): Berlin has a yearly total of about 135 million day visitors, making it one of the most-visited cities in the European Union.
"""
```
### Sparse Encoder Models
First download a pretrained sparse embedding a.k.a. Sparse Encoder model.
```python
from sentence_transformers import SparseEncoder
# 1. Load a pretrained SparseEncoder model
model = SparseEncoder("naver/splade-cocondenser-ensembledistil")
# The sentences to encode
sentences = [
"The weather is lovely today.",
"It's so sunny outside!",
"He drove to the stadium.",
]
# 2. Calculate sparse embeddings by calling model.encode()
embeddings = model.encode(sentences)
print(embeddings.shape)
# [3, 30522] - sparse representation with vocabulary size dimensions
# 3. Calculate the embedding similarities
similarities = model.similarity(embeddings, embeddings)
print(similarities)
# tensor([[ 35.629, 9.154, 0.098],
# [ 9.154, 27.478, 0.019],
# [ 0.098, 0.019, 29.553]])
# 4. Check sparsity stats
stats = SparseEncoder.sparsity(embeddings)
print(f"Sparsity: {stats['sparsity_ratio']:.2%}")
# Sparsity: 99.84%
```
## Pre-Trained Models
We provide a large list of pretrained models for more than 100 languages. Some models are general purpose models, while others produce embeddings for specific use cases.
- [Pretrained Sentence Transformer (Embedding) Models](https://sbert.net/docs/sentence_transformer/pretrained_models.html)
- [Pretrained Cross Encoder (Reranker) Models](https://sbert.net/docs/cross_encoder/pretrained_models.html)
- [Pretrained Sparse Encoder (Sparse Embeddings) Models](https://sbert.net/docs/sparse_encoder/pretrained_models.html)
## Training
This framework allows you to fine-tune your own sentence embedding methods, so that you get task-specific sentence embeddings. You have various options to choose from in order to get perfect sentence embeddings for your specific task.
- Embedding Models
- [Sentence Transformer > Training Overview](https://www.sbert.net/docs/sentence_transformer/training_overview.html)
- [Sentence Transformer > Training Examples](https://www.sbert.net/docs/sentence_transformer/training/examples.html) or [training examples on GitHub](https://github.com/huggingface/sentence-transformers/tree/main/examples/sentence_transformer/training).
- Reranker Models
- [Cross Encoder > Training Overview](https://www.sbert.net/docs/cross_encoder/training_overview.html)
- [Cross Encoder > Training Examples](https://www.sbert.net/docs/cross_encoder/training/examples.html) or [training examples on GitHub](https://github.com/huggingface/sentence-transformers/tree/main/examples/cross_encoder/training).
- Sparse Embedding Models
- [Sparse Encoder > Training Overview](https://www.sbert.net/docs/sparse_encoder/training_overview.html)
- [Sparse Encoder > Training Examples](https://www.sbert.net/docs/sparse_encoder/training/examples.html) or [training examples on GitHub](https://github.com/huggingface/sentence-transformers/tree/main/examples/sparse_encoder/training).
Some highlights across the different types of training are:
- Support of various transformer networks including BERT, RoBERTa, XLM-R, DistilBERT, Electra, BART, ...
- Multi-Lingual and multi-task learning
- Evaluation during training to find the optimal model
- [20+ loss functions](https://www.sbert.net/docs/package_reference/sentence_transformer/losses.html) for embedding models, [10+ loss functions](https://www.sbert.net/docs/package_reference/cross_encoder/losses.html) for reranker models and [10+ loss functions](https://www.sbert.net/docs/package_reference/sparse_encoder/losses.html) for sparse embedding models, allowing you to tune models specifically for semantic search, paraphrase mining, semantic similarity comparison, clustering, triplet loss, contrastive loss, etc.
## Application Examples
You can use this framework for:
- **Computing Sentence Embeddings**
- [Dense Embeddings](https://www.sbert.net/examples/sentence_transformer/applications/computing-embeddings/README.html)
- [Sparse Embeddings](https://www.sbert.net/examples/sparse_encoder/applications/computing_embeddings/README.html)
- **Semantic Textual Similarity**
- [Dense STS](https://www.sbert.net/docs/sentence_transformer/usage/semantic_textual_similarity.html)
- [Sparse STS](https://www.sbert.net/examples/sparse_encoder/applications/semantic_textual_similarity/README.html)
- **Semantic Search**
- [Dense Search](https://www.sbert.net/examples/sentence_transformer/applications/semantic-search/README.html)
- [Sparse Search](https://www.sbert.net/examples/sparse_encoder/applications/semantic_search/README.html)
- **Retrieve & Re-Rank**
- [Dense only Retrieval](https://www.sbert.net/examples/sentence_transformer/applications/retrieve_rerank/README.html)
- [Sparse/Dense/Hybrid Retrieval](https://www.sbert.net/examples/sentence_transformer/applications/retrieve_rerank/README.html)
- [Clustering](https://www.sbert.net/examples/sentence_transformer/applications/clustering/README.html)
- [Paraphrase Mining](https://www.sbert.net/examples/sentence_transformer/applications/paraphrase-mining/README.html)
- [Translated Sentence Mining](https://www.sbert.net/examples/sentence_transformer/applications/parallel-sentence-mining/README.html)
- [Multilingual Image Search, Clustering & Duplicate Detection](https://www.sbert.net/examples/sentence_transformer/applications/image-search/README.html)
and many more use-cases.
For all examples, see [examples/sentence_transformer/applications](https://github.com/huggingface/sentence-transformers/tree/main/examples/sentence_transformer/applications).
## Development setup
After cloning the repo (or a fork) to your machine, in a virtual environment, run:
```
python -m pip install -e ".[dev]"
pre-commit install
```
To test your changes, run:
```
pytest
```
## Citing & Authors
If you find this repository helpful, feel free to cite our publication [Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks](https://huggingface.co/papers/1908.10084):
```bibtex
@inproceedings{reimers-2019-sentence-bert,
title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
author = "Reimers, Nils and Gurevych, Iryna",
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
month = "11",
year = "2019",
publisher = "Association for Computational Linguistics",
url = "https://arxiv.org/abs/1908.10084",
}
```
If you use one of the multilingual models, feel free to cite our publication [Making Monolingual Sentence Embeddings Multilingual using Knowledge Distillation](https://huggingface.co/papers/2004.09813):
```bibtex
@inproceedings{reimers-2020-multilingual-sentence-bert,
title = "Making Monolingual Sentence Embeddings Multilingual using Knowledge Distillation",
author = "Reimers, Nils and Gurevych, Iryna",
booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing",
month = "11",
year = "2020",
publisher = "Association for Computational Linguistics",
url = "https://arxiv.org/abs/2004.09813",
}
```
Please have a look at [Publications](https://www.sbert.net/docs/publications.html) for our different publications that are integrated into SentenceTransformers.
### Maintainers
Maintainer: [Tom Aarsen](https://github.com/tomaarsen), 🤗 Hugging Face
Don't hesitate to open an issue if something is broken (and it shouldn't be) or if you have further questions.
---
This project was originally developed by the [Ubiquitous Knowledge Processing (UKP) Lab](https://www.ukp.tu-darmstadt.de/) at TU Darmstadt. We're grateful for their foundational work and continued contributions to the field.
> This repository contains experimental software and is published for the sole purpose of giving additional background details on the respective publication.
[#docs-package]: https://www.sbert.net/
[#github-license]: https://github.com/huggingface/sentence-transformers/blob/main/LICENSE
[#pypi-package]: https://pypi.org/project/sentence-transformers/
================================================
FILE: docs/.htaccess
================================================
# Force HTTPS: when a request did not arrive over HTTPS, answer with a permanent
# (301) redirect to the same host and request URI under https://. The L flag
# makes this the last rewrite rule applied to that request.
RewriteEngine On
RewriteCond %{HTTPS} !=on
RewriteRule ^ https://%{HTTP_HOST}%{REQUEST_URI} [L,R=301]
# Moved main pages for v3.0
Redirect 301 /docs/pretrained_models.html /docs/sentence_transformer/pretrained_models.html
Redirect 301 /docs/pretrained_cross-encoders.html /docs/cross_encoder/pretrained_models.html
Redirect 301 /docs/usage/semantic_textual_similarity.html /docs/sentence_transformer/usage/semantic_textual_similarity.html
Redirect 301 /docs/training/loss_overview.html /docs/sentence_transformer/loss_overview.html
Redirect 301 /docs/training/multilingual.html /examples/sentence_transformer/training/multilingual/README.html
Redirect 301 /docs/training/overview.html /docs/sentence_transformer/training_overview.html
Redirect 301 /examples/applications/information-retrieval/README.html /examples/sentence_transformer/applications/retrieve_rerank/README.html
Redirect 301 /examples/datasets/README.html /docs/sentence_transformer/dataset_overview.html
Redirect 301 /examples/training/datasets/README.html /docs/sentence_transformer/dataset_overview.html
# Moved API Reference pages for v3.0
Redirect 301 /docs/package_reference/cross_encoder.html /docs/package_reference/cross_encoder/cross_encoder.html
Redirect 301 /docs/package_reference/datasets.html /docs/package_reference/sentence_transformer/datasets.html
Redirect 301 /docs/package_reference/evaluation.html /docs/package_reference/sentence_transformer/evaluation.html
Redirect 301 /docs/package_reference/losses.html /docs/package_reference/sentence_transformer/losses.html
Redirect 301 /docs/package_reference/models.html /docs/package_reference/sentence_transformer/models.html
Redirect 301 /docs/package_reference/quantization.html /docs/package_reference/sentence_transformer/quantization.html
Redirect 301 /docs/package_reference/SentenceTransformer.html /docs/package_reference/sentence_transformer/SentenceTransformer.html
# Removed pages for v3.0
Redirect 301 /release_notes.html /index.html
Redirect 301 /docs/contact.html /index.html
Redirect 301 /docs/index.html /index.html
Redirect 301 /examples/applications/image-search/tmp-clip-model/README.html /index.html
# Removed pages for v3.0 (that shouldn't go to Home)
Redirect 301 /docs/hugging_face.html /docs/sentence_transformer/pretrained_models.html
Redirect 301 /docs/pretrained_models_performance.html /docs/sentence_transformer/pretrained_models.html
Redirect 301 /docs/package_reference/readers.html /docs/package_reference/sentence_transformer/index.html
Redirect 301 /docs/pretrained-models/msmarco.html /docs/pretrained-models/msmarco-v1.html
Redirect 301 /docs/examples/training/sts/README.html /examples/sentence_transformer/training/sts/README.html
# Moved example pages for v4.0
Redirect 301 /examples/training/ms_marco/cross_encoder_README.html /examples/cross_encoder/training/ms_marco/README.html
Redirect 301 /examples/applications/cross-encoder/README.html /examples/cross_encoder/applications/README.html
Redirect 301 /examples/applications/clustering/README.html /examples/sentence_transformer/applications/clustering/README.html
Redirect 301 /examples/applications/embedding-quantization/README.html /examples/sentence_transformer/applications/embedding-quantization/README.html
Redirect 301 /examples/applications/image-search/README.html /examples/sentence_transformer/applications/image-search/README.html
Redirect 301 /examples/applications/parallel-sentence-mining/README.html /examples/sentence_transformer/applications/parallel-sentence-mining/README.html
Redirect 301 /examples/applications/paraphrase-mining/README.html /examples/sentence_transformer/applications/paraphrase-mining/README.html
Redirect 301 /examples/applications/retrieve_rerank/README.html /examples/sentence_transformer/applications/retrieve_rerank/README.html
Redirect 301 /examples/applications/semantic-search/README.html /examples/sentence_transformer/applications/semantic-search/README.html
Redirect 301 /examples/applications/text-summarization/README.html /examples/sentence_transformer/applications/text-summarization/README.html
Redirect 301 /examples/domain_adaptation/README.html /examples/sentence_transformer/domain_adaptation/README.html
Redirect 301 /examples/README.html /examples/sentence_transformer/README.html
Redirect 301 /examples/training/adaptive_layer/README.html /examples/sentence_transformer/training/adaptive_layer/README.html
Redirect 301 /examples/training/data_augmentation/README.html /examples/sentence_transformer/training/data_augmentation/README.html
Redirect 301 /examples/training/distillation/README.html /examples/sentence_transformer/training/distillation/README.html
Redirect 301 /examples/training/matryoshka/README.html /examples/sentence_transformer/training/matryoshka/README.html
Redirect 301 /examples/training/ms_marco/multilingual/README.html /examples/sentence_transformer/training/ms_marco/multilingual/README.html
Redirect 301 /examples/training/ms_marco/README.html /examples/sentence_transformer/training/ms_marco/README.html
Redirect 301 /examples/training/multilingual/README.html /examples/sentence_transformer/training/multilingual/README.html
Redirect 301 /examples/training/nli/README.html /examples/sentence_transformer/training/nli/README.html
Redirect 301 /examples/training/paraphrases/README.html /examples/sentence_transformer/training/paraphrases/README.html
Redirect 301 /examples/training/peft/README.html /examples/sentence_transformer/training/peft/README.html
Redirect 301 /examples/training/prompts/README.html /examples/sentence_transformer/training/prompts/README.html
Redirect 301 /examples/training/quora_duplicate_questions/README.html /examples/sentence_transformer/training/quora_duplicate_questions/README.html
Redirect 301 /examples/training/README.html /examples/sentence_transformer/training/README.html
Redirect 301 /examples/training/sts/README.html /examples/sentence_transformer/training/sts/README.html
Redirect 301 /examples/training/hpo/README.html /examples/sentence_transformer/training/hpo/README.html
Redirect 301 /examples/unsupervised_learning/CT/README.html /examples/sentence_transformer/unsupervised_learning/CT/README.html
Redirect 301 /examples/unsupervised_learning/CT_In-Batch_Negatives/README.html /examples/sentence_transformer/unsupervised_learning/CT_In-Batch_Negatives/README.html
Redirect 301 /examples/unsupervised_learning/MLM/README.html /examples/sentence_transformer/unsupervised_learning/MLM/README.html
Redirect 301 /examples/unsupervised_learning/query_generation/README.html /examples/sentence_transformer/unsupervised_learning/query_generation/README.html
Redirect 301 /examples/unsupervised_learning/README.html /examples/sentence_transformer/unsupervised_learning/README.html
Redirect 301 /examples/unsupervised_learning/SimCSE/README.html /examples/sentence_transformer/unsupervised_learning/SimCSE/README.html
Redirect 301 /examples/unsupervised_learning/TSDAE/README.html /examples/sentence_transformer/unsupervised_learning/TSDAE/README.html
# Redirect to index.html when request file does not exist
# RewriteCond %{REQUEST_FILENAME} !-f
# RewriteCond %{REQUEST_FILENAME} !-d
# RewriteRule ^ /index.html [L,R=302]
ErrorDocument 404 /index.html
================================================
FILE: docs/Makefile
================================================
# Documentation build targets, meant to be run from the docs/ directory.
# Both targets name actions rather than files, so they are declared phony:
# without this, make would report "up to date" and skip the build whenever a
# file or directory called `docs` or `docs-quick` exists next to this Makefile.
.PHONY: docs docs-quick

# Full rebuild: -a rewrites every output file and -E ignores the saved build
# environment, so stale pages cannot survive. Configuration is read from this
# directory (-c .), sources from the parent directory (..), output goes to _build.
docs:
	sphinx-build -c . -a -E .. _build

# Incremental build: same configuration, sources and output directory, but
# without -a/-E, so the saved environment is reused and only changed files
# are rewritten.
docs-quick:
	sphinx-build -c . .. _build
================================================
FILE: docs/_static/css/custom.css
================================================
/* Site-wide overrides for the documentation theme.
   NOTE(review): the wy-* selectors look like sphinx_rtd_theme classes; confirm against the theme in use. */
/* Cap the width of the main content area at 1280px. */
.wy-nav-content {
max-width: 1280px;
}
/* Enlarge the home link. */
a.icon-home {
font-size: 1.4em;
}
/* Stretch the <dt> directly inside dl.class, and any <dl> nested directly in a <dd>, to full width. */
dl.class > dt {
width: 100%;
}
dd > dl {
width: 100%;
}
/* Drop the top margin of lists directly under first-level toctree entries; !important so other rules cannot override it. */
.toctree-l1 > ul {
margin-top: 0px !important;
}
/* Suppress any background on hover for links in the sidebar search area. */
.wy-side-nav-search .wy-dropdown>a:hover, .wy-side-nav-search>a:hover {
background: none;
}
/* Same font size as the home link above. */
.project-name {
font-size: 1.4em;
}
/* Remove the padding above the sidebar search area. */
.wy-side-nav-search {
padding-top: 0px;
}
/* Component diagram: a wrapping flex row of fixed-width boxes, each with a coloured header. */
.components {
display: flex;
flex-flow: row wrap;
gap: 1rem; /* Use gap for consistent spacing */
}
/* A single box: fixed 11.3rem wide (padding and border included via border-box), grey fill, thin rounded border. */
.components > .box {
flex: 0 0 auto; /* Don't grow or shrink, use natural size */
margin: 0; /* Remove margin since we're using gap */
padding: 1rem;
border-style: solid;
border-width: 1px;
border-radius: 0.5rem;
border-color: rgb(55 65 81);
background-color: #e3e3e3;
color: #404040;
width: 11.3rem;
box-sizing: border-box;
}
/* Header gradients are assigned by position: the first six boxes each get their own colour pair.
   A box that is the seventh child or later matches none of these rules and gets no gradient. */
.components > .box:nth-child(1) > .header {
background-image: linear-gradient(to bottom right, #60a5fa, #3b82f6);
}
.components > .box:nth-child(2) > .header {
background-image: linear-gradient(to bottom right, #fb923c, #f97316);
}
.components > .box:nth-child(3) > .header {
background-image: linear-gradient(to bottom right, #f472b6, #ec4899);
}
.components > .box:nth-child(4) > .header {
background-image: linear-gradient(to bottom right, #a78bfa, #8b5cf6);
}
.components > .box:nth-child(5) > .header {
background-image: linear-gradient(to bottom right, #34d399, #10b981);
}
.components > .box:nth-child(6) > .header {
background-image: linear-gradient(to bottom right, #fbbf24, #f59e0b);
}
/* Elements marked .optional get diagonal stripes (two greys alternating every 25px at 135deg) instead of a flat fill. */
.components > .optional {
background: repeating-linear-gradient(
135deg,
#f1f1f1,
#f1f1f1 25px,
#e3e3e3 25px,
#e3e3e3 50px
);
}
/* Shared header styling: bordered, rounded, centred bold white text; the gradient comes from the nth-child rules above. */
.components > .box > .header {
border-style: solid;
border-width: 1px;
border-radius: 0.5rem;
border-color: rgb(55 65 81);
padding: 0.5rem 0.2rem;
text-align: center;
margin-bottom: 0.5rem;
font-weight: bold;
color: white;
}
/* Keep paragraphs inside .sidebar at 100% font size; !important so other rules cannot shrink them. */
.sidebar p {
font-size: 100% !important;
}
/* Training-arguments panel: a light bordered container holding a header bar and a grid of links. */
.training-arguments {
background-color: #f3f6f6;
border: 1px solid #e1e4e5;
}
/* Bold header bar; its background matches the panel border colour. */
.training-arguments > .header {
font-weight: 700;
padding: 6px 12px;
background: #e1e4e5;
}
/* Responsive grid: as many equal-width columns as fit, each at least 15em wide. */
.training-arguments > .table {
display: grid;
grid-template-columns: repeat(auto-fill, minmax(15em, 1fr));
}
/* Each link is one bordered, padded grid cell. */
.training-arguments > .table > a {
padding: 0.5rem;
border: 1px solid #e1e4e5;
}
================================================
FILE: docs/_static/html/models_en_sentence_embeddings.html
================================================
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>SBERT.net Models</title>
<!-- Vue.js -->
<script src="https://cdnjs.cloudflare.com/ajax/libs/vue/2.6.12/vue.min.js" integrity="sha512-BKbSR+cfyxLdMAsE0naLReFSLg8/pjbgfxHh/k/kUC82Hy7r6HtR5hLhobaln2gcTvzkyyehrdREdjpsQwy2Jw==" crossorigin="anonymous" referrerpolicy="no-referrer"></script>
<!-- Bootstrap -->
<link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.4.1/css/bootstrap.min.css"
integrity="sha384-Vkoo8x4CGsO3+Hhxv8T/Q5PaXtkKtu6ug5TOeNV6gBiFeWPGFN9MuhOf23Q9Ifjh" crossorigin="anonymous">
<script src="https://code.jquery.com/jquery-3.4.1.slim.min.js"
integrity="sha384-J6qa4849blE2+poT4WnyKhv5vZF5SrPo0iEjwBvKU7imGFAV0wwj1yYfoRSJoZ+n"
crossorigin="anonymous"></script>
<script src="https://cdn.jsdelivr.net/npm/popper.js@1.16.0/dist/umd/popper.min.js"
integrity="sha384-Q6E9RHvbIyZFJoft+2mJbHaEWldlvI9IOYy5n3zV9zzTtmI3UksdQRVvoxMfooAo"
crossorigin="anonymous"></script>
<script src="https://stackpath.bootstrapcdn.com/bootstrap/4.4.1/js/bootstrap.min.js"
integrity="sha384-wfSDF2E50Y2D1uUdj0O3uMBJnjuUD4Ih7YwaYd1iqfktj0Uod8GCExl3Og8ifwB6"
crossorigin="anonymous"></script>
<!-- Axios -->
<!-- <script src="https://cdnjs.cloudflare.com/ajax/libs/axios/0.21.1/axios.min.js" integrity="sha512-bZS47S7sPOxkjU/4Bt0zrhEtWx0y0CRkhEp8IckzK+ltifIIE9EMIMTuT/mEzoIMewUINruDBIR/jJnbguonqQ==" crossorigin="anonymous" referrerpolicy="no-referrer"></script> -->
<!-- Font-awesome -->
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.15.3/css/all.min.css"
integrity="sha512-iBBXm8fW90+nuLcSKlbmrPcLa0OT92xO1BIsZ+ywDWZCvqsWgccV3gFoRBv0z+8dLJgyAHIhR35VZc2oM/gI1w=="
crossorigin="anonymous" referrerpolicy="no-referrer"/>
<!-- Lodash -->
<script src="https://cdnjs.cloudflare.com/ajax/libs/lodash.js/4.17.21/lodash.min.js"
integrity="sha512-WFN04846sdKMIP5LKNphMaWzU7YpMyCU245etK3g/2ARYbPK9Ub18eG+ljU96qKRCWh+quCY7yefSmlkQw1ANQ=="
crossorigin="anonymous" referrerpolicy="no-referrer"></script>
<style>
.fa-active {
color: #337ab7;
}
.header-cell {
cursor: pointer;
}
.models-table thead th {
position: sticky;
top: 0;
z-index: 1;
background-color: #ffffff;
}
.info-icon {
color: #cccccc;
}
.info-icon-disabled {
color: #cccccc;
}
.link-active, .toggle-active, .info-icon-active, .info-icon:hover {
color: #0056b3;
}
.info-icon-model {
padding-left: 10px;
}
.bs-popover-auto[x-placement^=bottom], .bs-popover-bottom {
margin-top: .5rem;
}
.popover {
max-width: 400px;
}
.toggle-link {
text-decoration: none;
cursor: pointer;
padding: 10px;
}
.toggle-link:hover, .toggle-link:focus {
text-decoration: none;
}
.toggle-inactive, .toggle-inactive:hover, .toggle-inactive:focus {
color: #333;
}
</style>
</head>
<body>
<div id="app">
<div class="text-right p-2">
<span class="toggle-link" v-bind:class="{'toggle-active': show_all_models, 'toggle-inactive': !show_all_models}" @click="show_all_models = !show_all_models">
<span v-if="show_all_models">All models</span><span v-else>All models</span>
<i class="fas" v-bind:class="{'toggle-active': show_all_models, 'fa-toggle-on': show_all_models, 'fa-toggle-off': !show_all_models}"></i>
</span>
</div>
<table class="table table-sm">
<thead>
<tr>
<th class="header-cell" @click="sortAsc = (sortBy=='name') ? sortAsc = !sortAsc : false; sortBy='name'">
<i class="fas fa-active" v-if="sortBy == 'name'" v-bind:class="{ 'fa-sort-amount-up': !sortAsc, 'fa-sort-amount-down-alt': sortAsc }"></i>
Model Name
</th>
<th class="header-cell text-center" @click="sortAsc = (sortBy=='sentence_performance') ? sortAsc = !sortAsc : false; sortBy='sentence_performance'">
<i class="fas fa-active" v-if="sortBy == 'sentence_performance'" v-bind:class="{ 'fa-sort-amount-up': !sortAsc, 'fa-sort-amount-down-alt': sortAsc }"></i>
Performance Sentence Embeddings (14 Datasets)
<span class="info-icon" data-trigger="hover" data-toggle="popover" title="Performance Sentence Embeddings" data-content="Average performance on encoding sentences over 14 diverse tasks from different domains.<br>Higher = Better" data-html="true" data-placement="bottom"><i class="fas fa-info-circle"></i></span>
</th>
<th class="header-cell text-center" @click="sortAsc = (sortBy=='semantic_search') ? sortAsc = !sortAsc : false; sortBy='semantic_search'">
<i class="fas fa-active" v-if="sortBy == 'semantic_search'" v-bind:class="{ 'fa-sort-amount-up': !sortAsc, 'fa-sort-amount-down-alt': sortAsc }"></i>
Performance Semantic Search (6 Datasets)
<span class="info-icon" data-trigger="hover" data-toggle="popover" title="Performance Semantic Search" data-content="Performance on 6 diverse tasks for semantic search: Encoding of queries / questions and paragraphs up to 512 word pieces.<br>Higher = Better." data-html="true" data-placement="bottom"><i class="fas fa-info-circle"></i></span>
</th>
<th class="header-cell text-center" @click="sortAsc = (sortBy=='avg_performance') ? sortAsc = !sortAsc : false; sortBy='avg_performance'">
<i class="fas fa-active" v-if="sortBy == 'avg_performance'" v-bind:class="{ 'fa-sort-amount-up': !sortAsc, 'fa-sort-amount-down-alt': sortAsc }"></i>
Avg. Performance
<span class="info-icon" data-trigger="hover" data-toggle="popover" title="Average Performance" data-content="Average of sentence performance and semantic search performance.<br>Higher = Better." data-html="true" data-placement="bottom"><i class="fas fa-info-circle"></i></span>
</th>
<th class="header-cell text-center" @click="sortAsc = (sortBy=='speed') ? sortAsc = !sortAsc : false; sortBy='speed'">
<i class="fas fa-active" v-if="sortBy == 'speed'" v-bind:class="{ 'fa-sort-amount-up': !sortAsc, 'fa-sort-amount-down-alt': sortAsc }"></i>
Speed
<span class="info-icon" data-trigger="hover" data-toggle="popover" title="Encoding Speed" data-content="Encoding speed (sentences / sec) on a V100 GPU.<br>Higher = Better" data-html="true" data-placement="bottom"><i class="fas fa-info-circle"></i></span>
</th>
<th class="header-cell text-center" @click="sortAsc = (sortBy=='size') ? sortAsc = !sortAsc : false; sortBy='size'">
<i class="fas fa-active" v-if="sortBy == 'size'" v-bind:class="{ 'fa-sort-amount-up': !sortAsc, 'fa-sort-amount-down-alt': sortAsc }"></i>
Model Size
<span class="info-icon" data-trigger="hover" data-toggle="popover" title="Size" data-content="Size (in MB) of the model." data-html="true" data-placement="bottom"><i class="fas fa-info-circle"></i></span>
</th>
</tr>
</thead>
<tbody>
<template v-for="item in sortedModels">
<tr v-on:click="item.show_details = !item.show_details" style="cursor: pointer">
<td style="white-space: nowrap;">
{{ item.name }}
<span class="info-icon-model" v-bind:class="{'info-icon-active': item.show_details, 'info-icon-disabled': !item.show_details}" ><i class="fas fa-info-circle"></i></span>
</td>
<td class="text-center">{{ item.sentence_performance > 0 ? item.sentence_performance.toFixed(2) : "" }}</td>
<td class="text-center">{{ item.semantic_search > 0 ? item.semantic_search.toFixed(2) : "" }}</td>
<td class="text-center">{{ (item.sentence_performance > 0 && item.semantic_search > 0) ? item.avg_performance.toFixed(2) : "" }}</td>
<td class="text-center">{{ item.speed }}</td>
<td class="text-center">{{ item.size }} MB</td>
</tr>
<tr v-if="item.show_details">
<td colspan="6" style="padding-left: 20px">
<table class="table table-sm" style="width: 100%; font-size: 0.9em;">
<thead>
<tr>
<td colspan="2">
<b>{{ item.name }}</b>
<button title="Copy model name" type="button" class="btn btn-link p-0" v-on:click="copyClipboard(item.name)" data-toggle="tooltip" data-placement="bottom" data-trigger="hover" :id="item.name+'-copy-btn'" style="border: 0;">
<svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg>
</button>
<br>
</td>
</tr>
</thead>
<tbody>
<tr v-if="item.hasOwnProperty('description')">
<th>Description:</th>
<td>{{item.description}}</td>
</tr>
<tr>
<th>Base Model:</th>
<td><span v-html="item.base_model"></span></td>
</tr>
<tr>
<th>Max Sequence Length:</th>
<td>{{item.max_seq_length || ''}}</td>
</tr>
<tr>
<th>Dimensions:</th>
<td>{{item.dim }}</td>
</tr>
<tr>
<th style="width: 220px;">Normalized Embeddings:</th>
<td>{{item.normalized_embeddings}}</td>
</tr>
<tr>
<th>Suitable Score Functions:</th>
<td>
<span v-html="getScoreFunction(item.score_functions)"></span>
<!--<span v-if="item.normalized_embeddings">dot-product (<code>util.dot_score</code>), cosine-similarity (<code>util.cos_sim</code>), and euclidean distance
</span>
<span v-else>Unclear</span> -->
</td>
</tr>
<tr>
<th>Size:</th>
<td>{{item.size}} MB</td>
</tr>
<tr>
<th>Pooling:</th>
<td>{{item.pooling}}</td>
</tr>
<tr>
<th>Training Data:</th>
<td>{{item.training_data}}</td>
</tr>
<tr>
<th>Model Card:</th>
<td><a :href="'https://huggingface.co/sentence-transformers/'+item.name" target="_blank">https://huggingface.co/sentence-transformers/{{item.name}}</a></td>
</tr>
</tbody>
</table>
</td>
</tr>
</template>
</tbody>
</table>
</div>
<script>
var app = new Vue({
el: '#app',
data: {
show_all_models: false,
models: [
{
"name": "average_word_embeddings_glove.6B.300d",
"base_model": "Word Embeddings: GloVe",
"pooling": "Mean Pooling",
"training_data": "-",
"sentence_performance": 49.79,
"semantic_search": 22.71,
"speed": 34000,
"size": 420,
"dim": 300,
"show_details": false,
"normalized_embeddings": false,
"score_functions": ["cos"]
},
{
"name": "average_word_embeddings_komninos",
"base_model": "Word Embeddings: Komninos et al.",
"pooling": "Mean Pooling",
"training_data": "-",
"sentence_performance": 51.13,
"semantic_search": 21.64,
"speed": 22000,
"size": 240,
"dim": 300,
"show_details": false,
"normalized_embeddings": false,
"score_functions": ["cos"]
},
{
"name": "paraphrase-MiniLM-L3-v2",
"base_model": '<a href="https://huggingface.co/nreimers/MiniLM-L6-H384-uncased" target="_blank">nreimers/MiniLM-L3-H384-uncased</a>',
"pooling": "Mean Pooling",
"training_data": "AllNLI, sentence-compression, SimpleWiki, altlex, msmarco-triplets, quora_duplicates, coco_captions,flickr30k_captions, yahoo_answers_title_question, S2ORC_citation_pairs, stackexchange_duplicate_questions, wiki-atomic-edits",
"sentence_performance": 62.29,
"semantic_search": 39.19,
"speed": 19000,
"size": 61,
"dim": 384,
"max_seq_length": 128,
"show_details": false,
"normalized_embeddings": false,
"score_functions": ["cos"],
"recommended_model": true
},
{
"name": "paraphrase-MiniLM-L6-v2",
"base_model": '<a href="https://huggingface.co/nreimers/MiniLM-L6-H384-uncased" target="_blank">nreimers/MiniLM-L6-H384-uncased</a>',
"pooling": "Mean Pooling",
"training_data": "AllNLI, sentence-compression, SimpleWiki, altlex, msmarco-triplets, quora_duplicates, coco_captions,flickr30k_captions, yahoo_answers_title_question, S2ORC_citation_pairs, stackexchange_duplicate_questions, wiki-atomic-edits",
"sentence_performance": 64.82,
"semantic_search": 40.31,
"speed": 14200,
"size": 80,
"dim": 384,
"max_seq_length": 128,
"show_details": false,
"normalized_embeddings": false,
"score_functions": ["cos"]
},
{
"name": "paraphrase-MiniLM-L12-v2",
"base_model": '<a href="https://huggingface.co/microsoft/MiniLM-L12-H384-uncased" target="_blank">microsoft/MiniLM-L12-H384-uncased</a>',
"pooling": "Mean Pooling",
"training_data": "AllNLI, sentence-compression, SimpleWiki, altlex, msmarco-triplets, quora_duplicates, coco_captions,flickr30k_captions, yahoo_answers_title_question, S2ORC_citation_pairs, stackexchange_duplicate_questions, wiki-atomic-edits",
"sentence_performance": 66.01,
"semantic_search": 43.01,
"speed": 7500,
"size": 120,
"dim": 384,
"max_seq_length": 256,
"show_details": false,
"normalized_embeddings": false,
"score_functions": ["cos"]
},
{
"name": "paraphrase-distilroberta-base-v2",
"base_model": '<a href="https://huggingface.co/distilroberta-base" target="_blank">distilroberta-base</a>',
"pooling": "Mean Pooling",
"training_data": "AllNLI, sentence-compression, SimpleWiki, altlex, msmarco-triplets, quora_duplicates, coco_captions,flickr30k_captions, yahoo_answers_title_question, S2ORC_citation_pairs, stackexchange_duplicate_questions, wiki-atomic-edits",
"sentence_performance": 66.27,
"semantic_search": 43.10,
"speed": 4000,
"size": 290,
"dim": 768,
"max_seq_length": 256,
"show_details": false,
"normalized_embeddings": false,
"score_functions": ["cos"]
},
{
"name": "paraphrase-TinyBERT-L6-v2",
"base_model": '<a href="https://huggingface.co/nreimers/TinyBERT_L-6_H-768_v2" target="_blank">nreimers/TinyBERT_L-6_H-768_v2</a>',
"pooling": "Mean Pooling",
"training_data": "AllNLI, sentence-compression, SimpleWiki, altlex, msmarco-triplets, quora_duplicates, coco_captions,flickr30k_captions, yahoo_answers_title_question, S2ORC_citation_pairs, stackexchange_duplicate_questions, wiki-atomic-edits",
"sentence_performance": 66.19,
"semantic_search": 41.07,
"speed": 4500,
"size": 240,
"dim": 768,
"max_seq_length": 128,
"show_details": false,
"normalized_embeddings": false,
"score_functions": ["cos"]
},
{
"name": "paraphrase-mpnet-base-v2",
"base_model": '<a href="https://huggingface.co/microsoft/mpnet-base" target="_blank">microsoft/mpnet-base</a>',
"pooling": "Mean Pooling",
"training_data": "AllNLI, sentence-compression, SimpleWiki, altlex, msmarco-triplets, quora_duplicates, coco_captions,flickr30k_captions, yahoo_answers_title_question, S2ORC_citation_pairs, stackexchange_duplicate_questions, wiki-atomic-edits",
"sentence_performance": 67.97,
"semantic_search": 47.43,
"speed": 2800,
"size": 420,
"dim": 768,
"max_seq_length": 512,
"show_details": false,
"normalized_embeddings": false,
"score_functions": ["cos"]
},
{
"name": "paraphrase-albert-small-v2",
"base_model": '<a href="https://huggingface.co/nreimers/albert-small-v2" target="_blank">nreimers/albert-small-v2</a>',
"pooling": "Mean Pooling",
"training_data": "AllNLI, sentence-compression, SimpleWiki, altlex, msmarco-triplets, quora_duplicates, coco_captions,flickr30k_captions, yahoo_answers_title_question, S2ORC_citation_pairs, stackexchange_duplicate_questions, wiki-atomic-edits",
"sentence_performance": 64.46,
"semantic_search": 40.04,
"speed": 5000,
"size": 43,
"dim": 768,
"max_seq_length": 256,
"show_details": false,
"normalized_embeddings": false,
"score_functions": ["cos"],
"recommended_model": true
},
{
"name": "paraphrase-multilingual-mpnet-base-v2",
"base_model": "Teacher: paraphrase-mpnet-base-v2; Student: xlm-roberta-base",
"pooling": "Mean Pooling",
"training_data": "Multi-lingual model of paraphrase-mpnet-base-v2, extended to 50+ languages.",
"sentence_performance": 65.83,
"semantic_search": 41.68,
"speed": 2500,
"size": 970,
"dim": 768,
"max_seq_length": 128,
"show_details": false,
"normalized_embeddings": false,
"score_functions": ["cos"],
"recommended_model": true
},
{
"name": "paraphrase-multilingual-MiniLM-L12-v2",
"base_model": "Teacher: paraphrase-MiniLM-L12-v2; Student: microsoft/Multilingual-MiniLM-L12-H384",
"pooling": "Mean Pooling",
"training_data": "Multi-lingual model of paraphrase-multilingual-MiniLM-L12-v2, extended to 50+ languages.",
"sentence_performance": 64.25,
"semantic_search": 39.19,
"speed": 7500,
"size": 420,
"dim": 384,
"max_seq_length": 128,
"show_details": false,
"normalized_embeddings": false,
"score_functions": ["cos"],
"recommended_model": true
},
{
"name": "distiluse-base-multilingual-cased-v1",
"base_model": "Teacher: mUSE; Student: distilbert-base-multilingual",
"pooling": "Mean Pooling",
"training_data": "Multi-Lingual model of Universal Sentence Encoder for 15 languages: Arabic, Chinese, Dutch, English, French, German, Italian, Korean, Polish, Portuguese, Russian, Spanish, Turkish.",
"sentence_performance": 61.30,
"semantic_search": 29.87,
"speed": 4000,
"size": 480,
"dim": 512,
"max_seq_length": 128,
"show_details": false,
"normalized_embeddings": false,
"score_functions": ["cos"],
"recommended_model": true
},
{
"name": "distiluse-base-multilingual-cased-v2",
"base_model": "Teacher: mUSE; Student: distilbert-base-multilingual",
"pooling": "Mean Pooling",
"training_data": "Multi-Lingual model of Universal Sentence Encoder for 50 languages.",
"sentence_performance": 60.18,
"semantic_search": 27.35,
"speed": 4000,
"size": 480,
"dim": 512,
"max_seq_length": 128,
"show_details": false,
"normalized_embeddings": false,
"score_functions": ["cos"],
"recommended_model": true
},
{
"name": "all-distilroberta-v1",
"description": "All-round model tuned for many use-cases. Trained on a large and diverse dataset of over 1 billion training pairs.",
"base_model": '<a href="https://huggingface.co/distilroberta-base" target="_blank">distilroberta-base</a>',
"pooling": "Mean Pooling",
"training_data": "1B+ training pairs. For details, see model card.",
"sentence_performance": 68.73,
"semantic_search": 50.94,
"speed": 4000,
"size": 290,
"dim": 768,
"max_seq_length": 512,
"show_details": false,
"normalized_embeddings": true,
"score_functions": ["dot", "cos", "eucl"],
"recommended_model": true
},
{
"name": "all-MiniLM-L6-v1",
"description": "All-round model tuned for many use-cases. Trained on a large and diverse dataset of over 1 billion training pairs.",
"base_model": '<a href="https://huggingface.co/nreimers/MiniLM-L6-H384-uncased" target="_blank">nreimers/MiniLM-L6-H384-uncased</a>',
"pooling": "Mean Pooling",
"training_data": "1B+ training pairs. For details, see model card.",
"sentence_performance": 68.03,
"semantic_search": 48.07,
"speed": 14200,
"size": 80,
"dim": 384,
"max_seq_length": 128,
"show_details": false,
"normalized_embeddings": true,
"score_functions": ["dot", "cos", "eucl"]
},
{
"name": "all-MiniLM-L6-v2",
"description": "All-round model tuned for many use-cases. Trained on a large and diverse dataset of over 1 billion training pairs.",
"base_model": '<a href="https://huggingface.co/nreimers/MiniLM-L6-H384-uncased" target="_blank">nreimers/MiniLM-L6-H384-uncased</a>',
"pooling": "Mean Pooling",
"training_data": "1B+ training pairs. For details, see model card.",
"sentence_performance": 68.06,
"semantic_search": 49.54,
"speed": 14200,
"size": 80,
"dim": 384,
"max_seq_length": 256,
"show_details": false,
"normalized_embeddings": true,
"score_functions": ["dot", "cos", "eucl"],
"recommended_model": true
},
{
"name": "all-MiniLM-L12-v1",
"description": "All-round model tuned for many use-cases. Trained on a large and diverse dataset of over 1 billion training pairs.",
"base_model": '<a href="https://huggingface.co/microsoft/MiniLM-L12-H384-uncased" target="_blank">microsoft/MiniLM-L12-H384-uncased</a>',
"pooling": "Mean Pooling",
"training_data": "1B+ training pairs. For details, see model card.",
"sentence_performance": 68.83,
"semantic_search": 50.78,
"speed": 7500,
"size": 120,
"dim": 384,
"max_seq_length": 256,
"show_details": false,
"normalized_embeddings": true,
"score_functions": ["dot", "cos", "eucl"]
},
{
"name": "all-MiniLM-L12-v2",
"description": "All-round model tuned for many use-cases. Trained on a large and diverse dataset of over 1 billion training pairs.",
"base_model": '<a href="https://huggingface.co/microsoft/MiniLM-L12-H384-uncased" target="_blank">microsoft/MiniLM-L12-H384-uncased</a>',
"pooling": "Mean Pooling",
"training_data": "1B+ training pairs. For details, see model card.",
"sentence_performance": 68.7,
"semantic_search": 50.82,
"speed": 7500,
"size": 120,
"dim": 384,
"max_seq_length": 256,
"show_details": false,
"normalized_embeddings": true,
"score_functions": ["dot", "cos", "eucl"],
"recommended_model": true
},
{
"name": "all-mpnet-base-v1",
"description": "All-round model tuned for many use-cases. Trained on a large and diverse dataset of over 1 billion training pairs.",
"base_model": '<a href="https://huggingface.co/microsoft/mpnet-base" target="_blank">microsoft/mpnet-base</a>',
"pooling": "Mean Pooling",
"training_data": "1B+ training pairs. For details, see model card.",
"sentence_performance": 69.98,
"semantic_search": 54.69,
"speed": 2800,
"size": 420,
"dim": 768,
"max_seq_length": 512,
"show_details": false,
"normalized_embeddings": true,
"score_functions": ["dot", "cos", "eucl"]
},
{
"name": "all-mpnet-base-v2",
"description": "All-round model tuned for many use-cases. Trained on a large and diverse dataset of over 1 billion training pairs.",
"base_model": '<a href="https://huggingface.co/microsoft/mpnet-base" target="_blank">microsoft/mpnet-base</a>',
"pooling": "Mean Pooling",
"training_data": "1B+ training pairs. For details, see model card.",
"sentence_performance": 69.57,
"semantic_search": 57.02,
"speed": 2800,
"size": 420,
"dim": 768,
"max_seq_length": 384,
"show_details": false,
"normalized_embeddings": true,
"score_functions": ["dot", "cos", "eucl"],
"recommended_model": true
},
{
"name": "all-roberta-large-v1",
"description": "All-round model tuned for many use-cases. Trained on a large and diverse dataset of over 1 billion training pairs.",
"base_model": '<a href="https://huggingface.co/microsoft/roberta-large" target="_blank">roberta-large</a>',
"pooling": "Mean Pooling",
"training_data": "1B+ training pairs. For details, see model card.",
"sentence_performance": 70.23,
"semantic_search": 53.05,
"speed": 800,
"size": 1360,
"dim": 1024,
"max_seq_length": 256,
"show_details": false,
"normalized_embeddings": true,
"score_functions": ["dot", "cos", "eucl"]
},
{
"name": "multi-qa-MiniLM-L6-dot-v1",
"description": "This model was tuned for semantic search: Given a query/question, it can find relevant passages. It was trained on a large and diverse set of (question, answer) pairs.",
"base_model": '<a href="https://huggingface.co/nreimers/MiniLM-L6-H384-uncased" target="_blank">nreimers/MiniLM-L6-H384-uncased</a>',
"pooling": "CLS Pooling",
"training_data": "215M (question, answer) pairs from diverse sources.",
"sentence_performance": 63.90,
"semantic_search": 49.19,
"speed": 14200,
"size": 80,
"dim": 384,
"max_seq_length": 512,
"show_details": false,
"normalized_embeddings": false,
"score_functions": ["dot"]
},
{
"name": "multi-qa-MiniLM-L6-cos-v1",
"description": "This model was tuned for semantic search: Given a query/question, it can find relevant passages. It was trained on a large and diverse set of (question, answer) pairs.",
"base_model": '<a href="https://huggingface.co/nreimers/MiniLM-L6-H384-uncased" target="_blank">nreimers/MiniLM-L6-H384-uncased</a>',
"pooling": "Mean Pooling",
"training_data": "215M (question, answer) pairs from diverse sources.",
"sentence_performance": 64.33,
"semantic_search": 51.83,
"speed": 14200,
"size": 80,
"dim": 384,
"max_seq_length": 512,
"show_details": false,
"normalized_embeddings": true,
"score_functions": ["dot", "cos", "eucl"],
"recommended_model": true
},
{
"name": "multi-qa-distilbert-dot-v1",
"description": "This model was tuned for semantic search: Given a query/question, it can find relevant passages. It was trained on a large and diverse set of (question, answer) pairs.",
"base_model": '<a href="https://huggingface.co/distilbert-base" target="_blank">distilbert-base</a>',
"pooling": "CLS Pooling",
"training_data": "215M (question, answer) pairs from diverse sources.",
"sentence_performance": 66.67,
"semantic_search": 52.51,
"speed": 4000,
"size": 250,
"dim": 768,
"max_seq_length": 512,
"show_details": false,
"normalized_embeddings": false,
"score_functions": ["dot"]
},
{
"name": "multi-qa-distilbert-cos-v1",
"description": "This model was tuned for semantic search: Given a query/question, it can find relevant passages. It was trained on a large and diverse set of (question, answer) pairs.",
"base_model": '<a href="https://huggingface.co/distilbert-base" target="_blank">distilbert-base</a>',
"pooling": "Mean Pooling",
"training_data": "215M (question, answer) pairs from diverse sources.",
"sentence_performance": 65.98,
"semantic_search": 52.83,
"speed": 4000,
"size": 250,
"dim": 768,
"max_seq_length": 512,
"show_details": false,
"normalized_embeddings": true,
"score_functions": ["dot", "cos", "eucl"],
"recommended_model": true
},
{
"name": "multi-qa-mpnet-base-dot-v1",
"description": "This model was tuned for semantic search: Given a query/question, it can find relevant passages. It was trained on a large and diverse set of (question, answer) pairs.",
"base_model": '<a href="https://huggingface.co/microsoft/mpnet-base" target="_blank"microsoft/mpnet-base</a>',
"pooling": "CLS Pooling",
"training_data": "215M (question, answer) pairs from diverse sources.",
"sentence_performance": 66.76,
"semantic_search": 57.60,
"speed": 2800,
"size": 420,
"dim": 768,
"max_seq_length": 512,
"show_details": false,
"normalized_embeddings": false,
"score_functions": ["dot"],
"recommended_model": true
},
{
"name": "multi-qa-mpnet-base-cos-v1",
"description": "This model was tuned for semantic search: Given a query/question, it can find relevant passages. It was trained on a large and diverse set of (question, answer) pairs.",
"base_model": '<a href="https://huggingface.co/microsoft/mpnet-base" target="_blank">microsoft/mpnet-base</a>',
"pooling": "Mean Pooling",
"training_data": "215M (question, answer) pairs from diverse sources.",
"sentence_performance": 66.29,
"semantic_search": 57.46,
"speed": 2800,
"size": 420,
"dim": 768,
"max_seq_length": 512,
"show_details": false,
"normalized_embeddings": true,
"score_functions": ["dot", "cos", "eucl"],
},
{
"name": "msmarco-distilbert-dot-v5",
"description": "This model was tuned for semantic search: Given a query/question, it can find relevant passages. It was trained on the MS MARCO passages dataset.",
"base_model": '<a href="https://huggingface.co/distilbert-base" target="_blank">distilbert-base</a>',
"pooling": "Mean Pooling",
"training_data": "500k (query, answer) pairs from MS MARCO Passages dataset.",
"sentence_performance": 61.84,
"semantic_search": 49.47,
"speed": 4000,
"size": 250,
"dim": 768,
"max_seq_length": 512,
"show_details": false,
"normalized_embeddings": false,
"score_functions": ["dot"],
},
{
"name": "msmarco-bert-base-dot-v5",
"description": "This model was tuned for semantic search: Given a query/question, it can find relevant passages. It was trained on the MS MARCO passages dataset.",
"base_model": '<a href="https://huggingface.co/Luyu/co-condenser-marco" target="_blank">Luyu/co-condenser-marco</a>',
"pooling": "Mean Pooling",
"training_data": "500k (query, answer) pairs from MS MARCO Passages dataset.",
"sentence_performance": 62.68,
"semantic_search": 52.11,
"speed": 2800,
"size": 420,
"dim": 768,
"max_seq_length": 512,
"show_details": false,
"normalized_embeddings": false,
"score_functions": ["dot"],
},
{
"name": "msmarco-distilbert-base-tas-b",
"description": "This model was tuned for semantic search: Given a query/question, it can find relevant passages. It was trained on the MS MARCO passages dataset.",
"base_model": '<a href="https://huggingface.co/distilbert-base" target="_blank">distilbert-base</a>',
"pooling": "Mean Pooling",
"training_data": "500k (query, answer) pairs from MS MARCO Passages dataset.",
"sentence_performance": 62.57,
"semantic_search": 49.25,
"speed": 4000,
"size": 250,
"dim": 768,
"max_seq_length": 512,
"show_details": false,
"normalized_embeddings": false,
"score_functions": ["dot"],
},
{
"name": "sentence-t5-base",
"description": "This model was tuned for sentence similarity tasks.",
"base_model": '<a href="https://huggingface.co/t5-base" target="_blank">t5-base</a>',
"pooling": "Mean Pooling",
"training_data": "2B question-answer pairs from diverse online communities.",
"sentence_performance": 67.84,
"semantic_search": 44.63,
"speed": 2500,
"size": 210,
"dim": 768,
"max_seq_length": 256,
"show_details": false,
"normalized_embeddings": true,
"score_functions": ["dot", "cos", "eucl"],
},
{
"name": "sentence-t5-large",
"description": "This model was tuned for sentence similarity tasks.",
"base_model": '<a href="https://huggingface.co/t5-large" target="_blank">t5-large</a>',
"pooling": "Mean Pooling",
"training_data": "2B question-answer pairs from diverse online communities.",
"sentence_performance": 68.74,
"semantic_search": 49.05,
"speed": 800,
"size": 640,
"dim": 768,
"max_seq_length": 256,
"show_details": false,
"normalized_embeddings": true,
"score_functions": ["dot", "cos", "eucl"],
},
{
"name": "sentence-t5-xl",
"description": "This model was tuned for sentence similarity tasks.",
"base_model": '<a href="https://huggingface.co/t5-3b" target="_blank">t5-3b</a>',
"pooling": "Mean Pooling",
"training_data": "2B question-answer pairs from diverse online communities.",
"sentence_performance": 69.23,
"semantic_search": 51.19,
"speed": 230,
"size": 2370,
"dim": 768,
"max_seq_length": 256,
"show_details": false,
"normalized_embeddings": true,
"score_functions": ["dot", "cos", "eucl"],
},
{
"name": "sentence-t5-xxl",
"description": "This model was tuned for sentence similarity tasks.",
"base_model": '<a href="https://huggingface.co/t5-11b" target="_blank">t5-11b</a>',
"pooling": "Mean Pooling",
"training_data": "2B question-answer pairs from diverse online communities.",
"sentence_performance": 70.88,
"semantic_search": 54.40,
"speed": 50,
"size": 9230,
"dim": 768,
"max_seq_length": 256,
"show_details": false,
"normalized_embeddings": true,
"score_functions": ["dot", "cos", "eucl"],
},
{
"name": "gtr-t5-base",
"description": "This model was tuned for semantic search: Given a query/question, it can find relevant passages.",
"base_model": '<a href="https://huggingface.co/t5-base" target="_blank">t5-base</a>',
"pooling": "Mean Pooling",
"training_data": "2B question-answer pairs from diverse online communities and then on MS-MARCO.",
"sentence_performance": 67.65,
"semantic_search": 51.15,
"speed": 2500,
"size": 210,
"dim": 768,
"max_seq_length": 512,
"show_details": false,
"normalized_embeddings": true,
"score_functions": ["dot", "cos", "eucl"],
},
{
"name": "gtr-t5-large",
"description": "This model was tuned for semantic search: Given a query/question, it can find relevant passages.",
"base_model": '<a href="https://huggingface.co/t5-large" target="_blank">t5-large</a>',
"pooling": "Mean Pooling",
"training_data": "2B question-answer pairs from diverse online communities and then on MS-MARCO.",
"sentence_performance": 69.90,
"semantic_search": 54.85,
"speed": 800,
"size": 640,
"dim": 768,
"max_seq_length": 512,
"show_details": false,
"normalized_embeddings": true,
"score_functions": ["dot", "cos", "eucl"],
},
{
"name": "gtr-t5-xl",
"description": "This model was tuned for semantic search: Given a query/question, it can find relevant passages.",
"base_model": '<a href="https://huggingface.co/t5-3b" target="_blank">t5-3b</a>',
"pooling": "Mean Pooling",
"training_data": "2B question-answer pairs from diverse online communities and then on MS-MARCO.",
"sentence_performance": 69.88,
"semantic_search": 55.88,
"speed": 230,
"size": 2370,
"dim": 768,
"max_seq_length": 512,
"show_details": false,
"normalized_embeddings": true,
"score_functions": ["dot", "cos", "eucl"],
},
{
"name": "gtr-t5-xxl",
"description": "This model was tuned for semantic search: Given a query/question, it can find relevant passages.",
"base_model": '<a href="https://huggingface.co/t5-11b" target="_blank">t5-11b</a>',
"pooling": "Mean Pooling",
"training_data": "2B question-answer pairs from diverse online communities and then on MS-MARCO.",
"sentence_performance": 70.73,
"semantic_search": 55.76,
"speed": 50,
"size": 9230,
"dim": 768,
"max_seq_length": 512,
"show_details": false,
"normalized_embeddings": true,
"score_functions": ["dot", "cos", "eucl"],
},
],
sortBy: 'avg_performance',
sortAsc: false
},
methods: {
copyClipboard: function(msg) {
console.log(msg);
const clipboardData = window.clipboardData || navigator.clipboard;
clipboardData.writeText(msg);
const copy_btn = $("#"+msg+"-copy-btn")
copy_btn.tooltip('show');
copy_btn.tooltip('hide').attr('data-original-title', "Copied").tooltip('show');
//copy_btn.prop('title', 'your new title'); //.tooltip('show');
setTimeout(function(){ copy_btn.tooltip('hide').attr('data-original-title', "Copy model name"); }, 1000);
},
getScoreFunction: function(score_fct) {
let output_html = [];
for(let fct of score_fct) {
switch(fct) {
case "dot":
output_html.push("dot-product (<code>util.dot_score</code>)");
break;
case "cos":
output_html.push("cosine-similarity (<code>util.cos_sim</code>)");
break;
case "eucl":
output_html.push("euclidean distance")
break;
default:
output_html.push(fct)
}
}
return output_html.join(", ");
}
},
created: function() {
let uri = window.location.search.substring(1);
let params = new URLSearchParams(uri);
if(params.get("model_name") !== null) {
this.show_all_models = true;
}
},
computed: {
sortedModels: function() {
//Add avg. of sentence and semantic search performance
let models_ext = this.models.map(function(elem, index) { elem.avg_performance = (elem.sentence_performance + elem.semantic_search)/2.0; return elem;} );
if(!this.show_all_models) {
models_ext = models_ext.filter(item => item.recommended_model);
}
return _.orderBy(models_ext, (item) => item[this.sortBy] || (this.sortAsc ? 9999 : -9999), this.sortAsc ? 'asc' : 'desc')
}
}
})
</script>
<script>
// Once the DOM is ready, enable the popover plugin on every element marked
// with data-toggle="popover" (presumably Bootstrap's jQuery plugin).
$(() => {
    $('[data-toggle="popover"]').popover();
});
// Same for elements marked with data-toggle="tooltip". This is kept as a
// separate ready handler so it is registered independently of the popover one.
$(() => {
    $('[data-toggle="tooltip"]').tooltip();
});
</script>
</body>
</html>
================================================
FILE: docs/_static/js/custom.js
================================================
/**
 * Insert the Hugging Face logo link and the GitHub "Star" button markup
 * immediately after the parent of the first element with class "logo".
 *
 * If the page has no ".logo" element (or that element has no parent),
 * nothing is inserted and the function returns without throwing.
 */
function addGithubButton() {
  const div = `
<div class="github-repo">
<div style="display: flex; justify-content: center;">
<div id="hf-button">
<a href="https://huggingface.co/models?library=sentence-transformers" target="_blank" title="See all Sentence Transformer models"><img src="https://sbert.net/_static/hf-logo.svg" style="margin: 0px 10px 0px -10px; padding: 0px; height: 28px; width: 28px;"></a>
</div>
<a class="github-button"
href="https://github.com/huggingface/sentence-transformers" data-size="large" data-show-count="true" aria-label="Star huggingface/sentence-transformers on GitHub" title="sentence-transformers on GitHub">
Star
</a>
</div>
</div>
`;
  // Guard against layouts without a ".logo" element: indexing [0] on an empty
  // collection yields undefined, and dereferencing it would raise a TypeError.
  const logo = document.getElementsByClassName("logo")[0];
  if (!logo || !logo.parentElement) {
    return;
  }
  logo.parentElement.insertAdjacentHTML("afterend", div);
}
/*!
* github-buttons v2.2.10
* (c) 2019 なつき
* @license BSD-2-Clause
*/
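/**
 * Modified to run programmatically: the library code below is wrapped in
 * parseGithubButtons() and only executes when that function is called.
 */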
function parseGithubButtons (){"use strict";var e=window.document,t=e.location,o=window.encodeURIComponent,r=window.decodeURIComponent,n=window.Math,a=window.HTMLElement,i=window.XMLHttpRequest,l="https://unpkg.com/github-buttons@2.2.10/dist/buttons.html",c=i&&i.prototype&&"withCredentials"in i.prototype,d=c&&a&&a.prototype.attachShadow&&!a.prototype.attachShadow.prototype,s=function(e,t,o){e.addEventListener?e.addEventListener(t,o):e.attachEvent("on"+t,o)},u=function(e,t,o){e.removeEventListener?e.removeEventListener(t,o):e.detachEvent("on"+t,o)},h=function(e,t,o){var r=function(n){return u(e,t,r),o(n)};s(e,t,r)},f=function(e,t,o){var r=function(n){if(t.test(e.readyState))return u(e,"readystatechange",r),o(n)};s(e,"readystatechange",r)},p=function(e){return function(t,o,r){var n=e.createElement(t);if(o)for(var a in o){var i=o[a];null!=i&&(null!=n[a]?n[a]=i:n.setAttribute(a,i))}if(r)for(var l=0,c=r.length;l<c;l++){var d=r[l];n.appendChild("string"==typeof d?e.createTextNode(d):d)}return n}},g=p(e),b=function(e){var t;return function(){t||(t=1,e.apply(this,arguments))}},m="body{margin:0}a{color:#24292e;text-decoration:none;outline:0}.octicon{display:inline-block;vertical-align:text-top;fill:currentColor}.widget{ display:inline-block;overflow:hidden;font-family:-apple-system, BlinkMacSystemFont, \"Segoe UI\", Helvetica, Arial, sans-serif;font-size:0;white-space:nowrap;-webkit-user-select:none;-moz-user-select:none;-ms-user-select:none;user-select:none}.btn,.social-count{display:inline-block;height:14px;padding:2px 5px;font-size:11px;font-weight:600;line-height:14px;vertical-align:bottom;cursor:pointer;border:1px solid #c5c9cc;border-radius:0.25em}.btn{background-color:#eff3f6;background-image:-webkit-linear-gradient(top, #fafbfc, #eff3f6 90%);background-image:-moz-linear-gradient(top, #fafbfc, #eff3f6 90%);background-image:linear-gradient(180deg, #fafbfc, #eff3f6 90%);background-position:-1px -1px;background-repeat:repeat-x;background-size:110% 
110%;border-color:rgba(27,31,35,0.2);-ms-filter:\"progid:DXImageTransform.Microsoft.Gradient(startColorstr='#FFFAFBFC', endColorstr='#FFEEF2F5')\";*filter:progid:DXImageTransform.Microsoft.Gradient(startColorstr='#FFFAFBFC', endColorstr='#FFEEF2F5')}.btn:active{background-color:#e9ecef;background-image:none;border-color:#a5a9ac;border-color:rgba(27,31,35,0.35);box-shadow:inset 0 0.15em 0.3em rgba(27,31,35,0.15)}.btn:focus,.btn:hover{background-color:#e6ebf1;background-image:-webkit-linear-gradient(top, #f0f3f6, #e6ebf1 90%);background-image:-moz-linear-gradient(top, #f0f3f6, #e6ebf1 90%);background-image:linear-gradient(180deg, #f0f3f6, #e6ebf1 90%);border-color:#a5a9ac;border-color:rgba(27,31,35,0.35);-ms-filter:\"progid:DXImageTransform.Microsoft.Gradient(startColorstr='#FFF0F3F6', endColorstr='#FFE5EAF0')\";*filter:progid:DXImageTransform.Microsoft.Gradient(startColorstr='#FFF0F3F6', endColorstr='#FFE5EAF0')}.social-count{position:relative;margin-left:5px;background-color:#fff}.social-count:focus,.social-count:hover{color:#0366d6}.social-count b,.social-count i{position:absolute;top:50%;left:0;display:block;width:0;height:0;margin:-4px 0 0 -4px;border:solid transparent;border-width:4px 4px 4px 0;_line-height:0;_border-top-color:red !important;_border-bottom-color:red !important;_border-left-color:red !important;_filter:chroma(color=red)}.social-count b{border-right-color:#c5c9cc}.social-count i{margin-left:-3px;border-right-color:#fff}.lg .btn,.lg .social-count{height:16px;padding:5px 10px;font-size:12px;line-height:16px}.lg .social-count{margin-left:6px}.lg .social-count b,.lg .social-count i{margin:-5px 0 0 -5px;border-width:5px 5px 5px 0}.lg .social-count i{margin-left:-4px}\n",v={"mark-github":{width:16,height:16,path:'<path fill-rule="evenodd" d="M8 0C3.58 0 0 3.58 0 8c0 3.54 2.29 6.53 5.47 7.59.4.07.55-.17.55-.38 0-.19-.01-.82-.01-1.49-2.01.37-2.53-.49-2.69-.94-.09-.23-.48-.94-.82-1.13-.28-.15-.68-.52-.01-.53.63-.01 1.08.58 1.23.82.72 1.21 1.87.87 
2.33.66.07-.52.28-.87.51-1.07-1.78-.2-3.64-.89-3.64-3.95 0-.87.31-1.59.82-2.15-.08-.2-.36-1.02.08-2.12 0 0 .67-.21 2.2.82.64-.18 1.32-.27 2-.27.68 0 1.36.09 2 .27 1.53-1.04 2.2-.82 2.2-.82.44 1.1.16 1.92.08 2.12.51.56.82 1.27.82 2.15 0 3.07-1.87 3.75-3.65 3.95.29.25.54.73.54 1.48 0 1.07-.01 1.93-.01 2.2 0 .21.15.46.55.38A8.013 8.013 0 0 0 16 8c0-4.42-3.58-8-8-8z"/>'},eye:{width:16,height:16,path:'<path fill-rule="evenodd" d="M8.06 2C3 2 0 8 0 8s3 6 8.06 6C13 14 16 8 16 8s-3-6-7.94-6zM8 12c-2.2 0-4-1.78-4-4 0-2.2 1.8-4 4-4 2.22 0 4 1.8 4 4 0 2.22-1.78 4-4 4zm2-4c0 1.11-.89 2-2 2-1.11 0-2-.89-2-2 0-1.11.89-2 2-2 1.11 0 2 .89 2 2z"/>'},star:{width:14,height:16,path:'<path fill-rule="evenodd" d="M14 6l-4.9-.64L7 1 4.9 5.36 0 6l3.6 3.26L2.67 14 7 11.67 11.33 14l-.93-4.74L14 6z"/>'},"repo-forked":{width:10,height:16,path:'<path fill-rule="evenodd" d="M8 1a1.993 1.993 0 0 0-1 3.72V6L5 8 3 6V4.72A1.993 1.993 0 0 0 2 1a1.993 1.993 0 0 0-1 3.72V6.5l3 3v1.78A1.993 1.993 0 0 0 5 15a1.993 1.993 0 0 0 1-3.72V9.5l3-3V4.72A1.993 1.993 0 0 0 8 1zM2 4.2C1.34 4.2.8 3.65.8 3c0-.65.55-1.2 1.2-1.2.65 0 1.2.55 1.2 1.2 0 .65-.55 1.2-1.2 1.2zm3 10c-.66 0-1.2-.55-1.2-1.2 0-.65.55-1.2 1.2-1.2.65 0 1.2.55 1.2 1.2 0 .65-.55 1.2-1.2 1.2zm3-10c-.66 0-1.2-.55-1.2-1.2 0-.65.55-1.2 1.2-1.2.65 0 1.2.55 1.2 1.2 0 .65-.55 1.2-1.2 1.2z"/>'},"issue-opened":{width:14,height:16,path:'<path fill-rule="evenodd" d="M7 2.3c3.14 0 5.7 2.56 5.7 5.7s-2.56 5.7-5.7 5.7A5.71 5.71 0 0 1 1.3 8c0-3.14 2.56-5.7 5.7-5.7zM7 1C3.14 1 0 4.14 0 8s3.14 7 7 7 7-3.14 7-7-3.14-7-7-7zm1 3H6v5h2V4zm0 6H6v2h2v-2z"/>'},"cloud-download":{width:16,height:16,path:'<path fill-rule="evenodd" d="M9 12h2l-3 3-3-3h2V7h2v5zm3-8c0-.44-.91-3-4.5-3C5.08 1 3 2.92 3 5 1.02 5 0 6.52 0 8c0 1.53 1 3 3 3h3V9.7H3C1.38 9.7 1.3 8.28 1.3 8c0-.17.05-1.7 1.7-1.7h1.3V5c0-1.39 1.56-2.7 3.2-2.7 2.55 0 3.13 1.55 3.2 1.8v1.2H12c.81 0 2.7.22 2.7 2.2 0 2.09-2.25 2.2-2.7 2.2h-2V11h2c2.08 0 4-1.16 4-3.5C16 5.06 14.08 4 12 4z"/>'}},w={},x=function(e,t,o){var 
r=p(e.ownerDocument),n=e.appendChild(r("style",{type:"text/css"}));n.styleSheet?n.styleSheet.cssText=m:n.appendChild(e.ownerDocument.createTextNode(m));var a,l,d=r("a",{className:"btn",href:t.href,target:"_blank",innerHTML:(a=t["data-icon"],l=/^large$/i.test(t["data-size"])?16:14,a=(""+a).toLowerCase().replace(/^octicon-/,""),{}.hasOwnProperty.call(v,a)||(a="mark-github"),'<svg version="1.1" width="'+l*v[a].width/v[a].height+'" height="'+l+'" viewBox="0 0 '+v[a].width+" "+v[a].height+'" class="octicon octicon-'+a+'" aria-hidden="true">'+v[a].path+"</svg>"),"aria-label":t["aria-label"]||void 0},[" ",r("span",{},[t["data-text"]||""])]);/\.github\.com$/.test("."+d.hostname)?/^https?:\/\/((gist\.)?github\.com\/[^\/?#]+\/[^\/?#]+\/archive\/|github\.com\/[^\/?#]+\/[^\/?#]+\/releases\/download\/|codeload\.github\.com\/)/.test(d.href)&&(d.target="_top"):(d.href="#",d.target="_self");var u,h,g,x,y=e.appendChild(r("div",{className:"widget"+(/^large$/i.test(t["data-size"])?" lg":"")},[d]));/^(true|1)$/i.test(t["data-show-count"])&&"github.com"===d.hostname&&(u=d.pathname.replace(/^(?!\/)/,"/").match(/^\/([^\/?#]+)(?:\/([^\/?#]+)(?:\/(?:(subscription)|(fork)|(issues)|([^\/?#]+)))?)?(?:[\/?#]|$)/))&&!u[6]?(u[2]?(h="/repos/"+u[1]+"/"+u[2],u[3]?(x="subscribers_count",g="watchers"):u[4]?(x="forks_count",g="network"):u[5]?(x="open_issues_count",g="issues"):(x="stargazers_count",g="stargazers")):(h="/users/"+u[1],g=x="followers"),function(e,t){var o=w[e]||(w[e]=[]);if(!(o.push(t)>1)){var r=b(function(){for(delete w[e];t=o.shift();)t.apply(null,arguments)});if(c){var n=new i;s(n,"abort",r),s(n,"error",r),s(n,"load",function(){var e;try{e=JSON.parse(n.responseText)}catch(e){return void r(e)}r(200!==n.status,e)}),n.open("GET",e),n.send()}else{var a=this||window;a._=function(e){a._=null,r(200!==e.meta.status,e.data)};var 
l=p(a.document)("script",{async:!0,src:e+(/\?/.test(e)?"&":"?")+"callback=_"}),d=function(){a._&&a._({meta:{}})};s(l,"load",d),s(l,"error",d),l.readyState&&f(l,/de|m/,d),a.document.getElementsByTagName("head")[0].appendChild(l)}}}.call(this,"https://api.github.com"+h,function(e,t){if(!e){var n=t[x];y.appendChild(r("a",{className:"social-count",href:t.html_url+"/"+g,target:"_blank","aria-label":n+" "+x.replace(/_count$/,"").replace("_"," ").slice(0,n<2?-1:void 0)+" on GitHub"},[r("b"),r("i"),r("span",{},[(""+n).replace(/\B(?=(\d{3})+(?!\d))/g,",")])]))}o&&o(y)})):o&&o(y)},y=window.devicePixelRatio||1,C=function(e){return(y>1?n.ceil(n.round(e*y)/y*2)/2:n.ceil(e))||0},F=function(e,t){e.style.width=t[0]+"px",e.style.height=t[1]+"px"},k=function(t,r){if(null!=t&&null!=r)if(t.getAttribute&&(t=function(e){for(var t={href:e.href,title:e.title,"aria-label":e.getAttribute("aria-label")},o=["icon","text","size","show-count"],r=0,n=o.length;r<n;r++){var a="data-"+o[r];t[a]=e.getAttribute(a)}return null==t["data-text"]&&(t["data-text"]=e.textContent||e.innerText),t}(t)),d){var a=g("span",{title:t.title||void 0});x(a.attachShadow({mode:"closed"}),t,function(){r(a)})}else{var i=g("iframe",{src:"javascript:0",title:t.title||void 0,allowtransparency:!0,scrolling:"no",frameBorder:0});F(i,[0,0]),i.style.border="none";var c=function(){var a,d=i.contentWindow;try{a=d.document.body}catch(t){return void e.body.appendChild(i.parentNode.removeChild(i))}u(i,"load",c),x.call(d,a,t,function(e){var a=function(e){var t=e.offsetWidth,o=e.offsetHeight;if(e.getBoundingClientRect){var r=e.getBoundingClientRect();t=n.max(t,C(r.width)),o=n.max(o,C(r.height))}return[t,o]}(e);i.parentNode.removeChild(i),h(i,"load",function(){F(i,a)}),i.src=l+"#"+(i.name=function(e){var t=[];for(var r in e){var n=e[r];null!=n&&t.push(o(r)+"="+o(n))}return t.join("&")}(t)),r(i)})};s(i,"load",c),e.body.appendChild(i)}};t.protocol+"//"+t.host+t.pathname===l?x(e.body,function(e){for(var 
t={},o=e.split("&"),n=0,a=o.length;n<a;n++){var i=o[n];if(""!==i){var l=i.split("=");t[r(l[0])]=null!=l[1]?r(l.slice(1).join("=")):void 0}}return t}(window.name||t.hash.replace(/^#/,""))):function(t){if(/m/.test(e.readyState)||!/g/.test(e.readyState)&&!e.documentElement.doScroll)setTimeout(t);else if(e.addEventListener){var o=b(t);h(e,"DOMContentLoaded",o),h(window,"load",o)}else f(e,/m/,t)}(function(){for(var t=e.querySelectorAll?e.querySelectorAll("a.github-button"):function(){for(var t=[],o=e.getElementsByTagName("a"),r=0,n=o.length;r<n;r++)~(" "+o[r].className+" ").replace(/[ \t\n\f\r]+/g," ").indexOf(" github-button ")&&t.push(o[r]);return t}(),o=0,r=t.length;o<r;o++)!function(e){k(e,function(t){e.parentNode.replaceChild(t,e)})}(t[o])})};
// Page-load entry point: runs the two GitHub-button setup steps in order.
// NOTE(review): addGithubButton and parseGithubButtons are not defined in
// this part of the file; presumably they are declared earlier - confirm.
function onLoad() {
addGithubButton();
parseGithubButtons();
}
// Run the setup only once the window "load" event has fired.
window.addEventListener("load", onLoad);
================================================
FILE: docs/_templates/layout.html
================================================
{% extends "!layout.html" %}
{% block extrahead %}
<!-- Privacy-friendly analytics by Plausible -->
<script async src="https://plausible.io/js/pa-B9Apen_9cO_gfwxvmnY5y.js"></script>
<script>
window.plausible=window.plausible||function(){(plausible.q=plausible.q||[]).push(arguments)},plausible.init=plausible.init||function(i){plausible.o=i||{}};
plausible.init()
</script>
{% endblock %}
================================================
FILE: docs/conf.py
================================================
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html
# -- Path setup --------------------------------------------------------------
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
# import os
# import sys
# sys.path.insert(0, os.path.abspath('.'))
import datetime
import importlib
import inspect
import os
import posixpath
from sphinx.application import Sphinx
from sphinx.writers.html5 import HTML5Translator
# -- Project information -----------------------------------------------------
project = "Sentence Transformers"
# Evaluated at build time, so the notice carries the year of the build.
copyright = str(datetime.datetime.now().year)
author = "Nils Reimers, Tom Aarsen"
# -- General configuration ---------------------------------------------------
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
# "sphinx.ext.linkcode" relies on the ``linkcode_resolve`` function defined
# further down in this file.
extensions = [
"sphinx.ext.napoleon",
"sphinx.ext.autodoc",
"myst_parser",
"sphinx_markdown_tables",
"sphinx_copybutton",
"sphinx.ext.intersphinx",
"sphinx.ext.linkcode",
"sphinx_inline_tabs",
"sphinxcontrib.mermaid",
"sphinx_toolbox.collapse",
]
# Add any paths that contain templates here, relative to this directory.
templates_path = ["_templates"]
# List of patterns, relative to source directory, that match files and
# directories to include when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
# NOTE(review): "sentence_transformers/**/.py" only matches files literally
# named ".py"; "sentence_transformers/**/*.py" may have been intended - confirm.
include_patterns = [
"docs/**",
"sentence_transformers/**/.py",
"examples/**",
"index.rst",
]
# External projects whose objects can be cross-referenced. Each entry maps a
# project name to (base URL, inventory location); None for the inventory is
# documented by Sphinx as "use objects.inv at the base URL".
intersphinx_mapping = {
"datasets": ("https://huggingface.co/docs/datasets/main/en/", None),
"transformers": ("https://huggingface.co/docs/transformers/main/en/", None),
"huggingface_hub": ("https://huggingface.co/docs/huggingface_hub/main/en/", None),
"optimum": ("https://huggingface.co/docs/optimum/main/en/", None),
"peft": ("https://huggingface.co/docs/peft/main/en/", None),
"torch": ("https://pytorch.org/docs/stable/", None),
}
# -- Options for HTML output -------------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = "sphinx_rtd_theme"
html_theme_options = {
"logo_only": True,
"canonical_url": "https://www.sbert.net",
"collapse_navigation": False,
"navigation_depth": 3,
}
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ["_static", "img/hf-logo.svg"]
# Add any paths that contain "extra" files, such as .htaccess or
# robots.txt.
html_extra_path = [".htaccess"]
# Extra assets loaded on every page; paths are relative to html_static_path.
html_css_files = [
"css/custom.css",
]
html_js_files = [
"js/custom.js",
]
# Do not render the "show source" links to the raw reST/Markdown sources.
html_show_sourcelink = False
# Values passed to the HTML templates; presumably consumed by the theme to
# build the "Edit on GitHub" link - confirm against the theme's templates.
# NOTE(review): the trailing slash in "main/" looks deliberate (the value is
# likely concatenated directly with the page path) - confirm before changing.
html_context = {
"display_github": True,
"github_user": "huggingface",
"github_repo": "sentence-transformers",
"github_version": "main/",
}
html_logo = "img/logo.png"
html_favicon = "img/favicon.ico"
# autodoc: use both the class docstring and the __init__ docstring for classes.
autoclass_content = "both"
# Required to get rid of some myst.xref_missing warnings
myst_heading_anchors = 3
# https://github.com/readthedocs/sphinx-autoapi/issues/202#issuecomment-907582382
def linkcode_resolve(domain, info):
    """Map a documented object to the URL of its source on GitHub.

    This is the hook called by ``sphinx.ext.linkcode`` for every documented
    object.

    Args:
        domain: Sphinx domain of the object (e.g. ``"py"`` or ``"js"``).
        info: Mapping with at least the keys ``"module"`` and ``"fullname"``
            (the dotted name of the object inside the module).

    Returns:
        The GitHub URL including a ``#Lstart-Lend`` line anchor, or ``None``
        when no link should be generated (non-Python object, attribute that
        cannot be resolved, object without retrievable source, or source
        located outside the ``sentence_transformers`` package).

    Raises:
        AssertionError: If ``domain`` is neither ``"js"`` nor ``"py"``.
    """
    # Non-linkable objects from the starter kit in the tutorial.
    if domain == "js" or info["module"] == "connect4":
        return None
    assert domain == "py", "expected only Python objects"
    # Objects documented without a module cannot be imported, so skip them
    # instead of letting import_module raise on an empty name.
    if not info["module"]:
        return None

    mod = importlib.import_module(info["module"])

    # Walk the dotted name one attribute at a time so that arbitrarily nested
    # names ("Outer.Inner.method") resolve instead of failing to unpack.
    top_level_name, *attribute_path = info["fullname"].split(".")
    obj = getattr(mod, top_level_name)
    try:
        # object is a method (or nested member) of a class
        for attrname in attribute_path:
            obj = getattr(obj, attrname)
    except AttributeError:
        # object is an attribute of a class
        return None

    # Look through decorators so the link points at the real implementation.
    obj = inspect.unwrap(obj)
    try:
        file = inspect.getsourcefile(obj)
        lines = inspect.getsourcelines(obj)
    except (TypeError, OSError):
        # TypeError: e.g. object is a typing.Union
        # OSError: the source code cannot be retrieved
        return None
    if file is None:
        return None

    # Paths are made relative to the parent of the current working directory.
    file = os.path.relpath(file, os.path.abspath(".."))
    if not file.startswith("sentence_transformers"):
        # e.g. object is a typing.NewType
        return None
    # URLs always use forward slashes, whatever the local path separator is.
    file = file.replace(os.sep, "/")

    start, end = lines[1], lines[1] + len(lines[0]) - 1
    return f"https://github.com/huggingface/sentence-transformers/blob/main/{file}#L{start}-L{end}"
def visit_download_reference(self, node):
    """Open the ``<a>`` tag for a download reference node.

    Nodes carrying ``refuri`` link to that URI unchanged. Nodes carrying both
    ``reftarget`` and ``refdoc`` link to the target inside the GitHub tree of
    the repository, resolved relative to the directory of the referencing
    document. In every other case (including builders without download
    support) no tag is opened.

    Exactly one string is pushed onto ``self.context`` per call: ``"</a>"``
    when a tag was opened, otherwise an empty string.
    """
    github_tree = "https://github.com/huggingface/sentence-transformers/tree/main"

    if not self.builder.download_support:
        self.context.append("")
        return

    if "refuri" in node:
        href = node["refuri"]
    elif "reftarget" in node and "refdoc" in node:
        referencing_dir = os.path.dirname(node["refdoc"])
        href = posixpath.join(github_tree, referencing_dir, node["reftarget"])
    else:
        # Nothing to link to.
        self.context.append("")
        return

    attributes = {
        "class": "reference download external",
        "download": "",
        "href": href,
    }
    self.body.append(self.starttag(node, "a", "", **attributes))
    self.context.append("</a>")
# Monkeypatch the HTML5 translator so download references are rendered by the
# custom ``visit_download_reference`` defined above (links into the GitHub tree).
HTML5Translator.visit_download_reference = visit_download_reference
def setup(app: Sphinx):
    """Sphinx entry point for this configuration module; intentionally a no-op."""
    return None
================================================
FILE: docs/cross_encoder/loss_overview.md
================================================
# Loss Overview
## Loss Table
Loss functions play a critical role in the performance of your fine-tuned Cross Encoder model. Sadly, there is no "one size fits all" loss function. Ideally, this table should help narrow down your choice of loss function(s) by matching them to your data formats.
```{eval-rst}
.. note::
You can often convert one training data format into another, allowing more loss functions to be viable for your scenario. For example, ``(sentence_A, sentence_B) pairs`` with ``class`` labels can be converted into ``(anchor, positive, negative) triplets`` by sampling sentences with the same or different classes.
Additionally, :func:`~sentence_transformers.util.mine_hard_negatives` can easily be used to turn ``(anchor, positive)`` to:
- ``(anchor, positive, negative) triplets`` with ``output_format="triplet"``,
- ``(anchor, positive, negative_1, …, negative_n) tuples`` with ``output_format="n-tuple"``.
- ``(anchor, passage, label) labeled pairs`` with a label of 0 for negative and 1 for positive with ``output_format="labeled-pair"``,
- ``(anchor, [doc1, doc2, ..., docN], [label1, label2, ..., labelN]) triplets`` with labels of 0 for negative and 1 for positive with ``output_format="labeled-list"``
As well as formats with similarity scores instead of binarized labels, by setting ``output_scores=True``.
```
| Inputs | Labels | Number of Model Output Labels | Appropriate Loss Functions |
|---------------------------------------------------|------------------------------------------|-------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| `(sentence_A, sentence_B) pairs` | `class` | `num_classes` | <a href="../package_reference/cross_encoder/losses.html#crossentropyloss">`CrossEntropyLoss`</a> |
| `(anchor, positive) pairs` | `none` | `1` | <a href="../package_reference/cross_encoder/losses.html#multiplenegativesrankingloss">`MultipleNegativesRankingLoss`</a><br><a href="../package_reference/cross_encoder/losses.html#cachedmultiplenegativesrankingloss">`CachedMultipleNegativesRankingLoss`</a> |
| `(anchor, positive/negative) pairs` | `1 if positive, 0 if negative` | `1` | <a href="../package_reference/cross_encoder/losses.html#binarycrossentropyloss">`BinaryCrossEntropyLoss`</a> |
| `(sentence_A, sentence_B) pairs` | `float similarity score between 0 and 1` | `1` | <a href="../package_reference/cross_encoder/losses.html#binarycrossentropyloss">`BinaryCrossEntropyLoss`</a> |
| `(anchor, positive, negative) triplets` | `none` | `1` | <a href="../package_reference/cross_encoder/losses.html#multiplenegativesrankingloss">`MultipleNegativesRankingLoss`</a><br><a href="../package_reference/cross_encoder/losses.html#cachedmultiplenegativesrankingloss">`CachedMultipleNegativesRankingLoss`</a> |
| `(anchor, positive, negative_1, ..., negative_n)` | `none` | `1` | <a href="../package_reference/cross_encoder/losses.html#multiplenegativesrankingloss">`MultipleNegativesRankingLoss`</a><br><a href="../package_reference/cross_encoder/losses.html#cachedmultiplenegativesrankingloss">`CachedMultipleNegativesRankingLoss`</a> |
| `(query, [doc1, doc2, ..., docN])` | `[score1, score2, ..., scoreN]` | `1` | <ol style="margin-bottom: 0;line-height: inherit;"><li><a href="../package_reference/cross_encoder/losses.html#lambdaloss">`LambdaLoss`</a></li><li><a href="../package_reference/cross_encoder/losses.html#plistmleloss">`PListMLELoss`</a></li><li><a href="../package_reference/cross_encoder/losses.html#listnetloss">`ListNetLoss`</a></li><li><a href="../package_reference/cross_encoder/losses.html#ranknetloss">`RankNetLoss`</a></li><li><a href="../package_reference/cross_encoder/losses.html#listmleloss">`ListMLELoss`</a></li></ol> |
## Distillation
These loss functions are specifically designed to be used when distilling the knowledge from one model into another.
For example, when finetuning a small model to behave more like a larger & stronger one, or when finetuning a model to become multi-lingual.
| Texts | Labels | Appropriate Loss Functions |
|---------------------------------------------------|---------------------------------------------------------------------------|--------------------------------------------------------------------------------------------|
| `(sentence_A, sentence_B) pairs` | `similarity score` | <a href="../package_reference/cross_encoder/losses.html#mseloss">`MSELoss`</a> |
| `(query, passage_one, passage_two) triplets` | `gold_sim(query, passage_one) - gold_sim(query, passage_two)` | <a href="../package_reference/cross_encoder/losses.html#marginmseloss">`MarginMSELoss`</a> |
| `(query, positive, negative_1, ..., negative_n)` | `[gold_sim(query, positive) - gold_sim(query, negative_i) for i in 1..n]` | <a href="../package_reference/cross_encoder/losses.html#marginmseloss">`MarginMSELoss`</a> |
| `(query, positive, negative)` | `[gold_sim(query, positive), gold_sim(query, negative)]` | <a href="../package_reference/cross_encoder/losses.html#marginmseloss">`MarginMSELoss`</a> |
| `(query, positive, negative_1, ..., negative_n) ` | `[gold_sim(query, positive), gold_sim(query, negative_i)...] ` | <a href="../package_reference/cross_encoder/losses.html#marginmseloss">`MarginMSELoss`</a> |
## Commonly used Loss Functions
In practice, not all loss functions get used equally often. The most common scenarios are:
- `(sentence_A, sentence_B) pairs` with `float similarity score` or `1 if positive, 0 if negative`: <a href="../package_reference/cross_encoder/losses.html#binarycrossentropyloss"><code>BinaryCrossEntropyLoss</code></a> is a traditional option that remains very challenging to outperform.
- `(anchor, positive) pairs` without any labels: combined with <a href="../package_reference/util.html#sentence_transformers.util.mine_hard_negatives"><code>mine_hard_negatives</code></a>
- with <code>output_format="labeled-list"</code>, then <a href="../package_reference/cross_encoder/losses.html#lambdaloss"><code>LambdaLoss</code></a> is frequently used for learning-to-rank tasks.
- with <code>output_format="labeled-pair"</code>, then <a href="../package_reference/cross_encoder/losses.html#binarycrossentropyloss"><code>BinaryCrossEntropyLoss</code></a> remains a strong option.
## Custom Loss Functions
```{eval-rst}
Advanced users can create and train with their own loss functions. Custom loss functions only have a few requirements:
- They must be a subclass of :class:`torch.nn.Module`.
- They must have ``model`` as the first argument in the constructor.
- They must implement a ``forward`` method that accepts ``inputs`` and ``labels``. The former is a nested list of texts in the batch, with each element in the outer list representing a column in the training dataset. You have to combine these texts into pairs that can be 1) tokenized and 2) fed to the model. The latter is an optional (list of) tensor(s) of labels from a ``label``, ``labels``, ``score``, or ``scores`` column in the dataset. The method must return a single loss value or a dictionary of loss components (component names to loss values) that will be summed to produce the final loss value. When returning a dictionary, the individual components will be logged separately in addition to the summed loss, allowing you to monitor the individual components of the loss.
To get full support with the automatic model card generation, you may also wish to implement:
- a ``get_config_dict`` method that returns a dictionary of loss parameters.
- a ``citation`` property so your work gets cited in all models that train with the loss.
Consider inspecting existing loss functions to get a feel for how loss functions are commonly implemented.
```
================================================
FILE: docs/cross_encoder/pretrained_models.md
================================================
# Pretrained Models
```{eval-rst}
We have released various pre-trained Cross Encoder models via our Cross Encoder Hugging Face organization. Additionally, numerous community Cross Encoder models have been publicly released on the Hugging Face Hub.
* **Original models**: `Cross Encoder Hugging Face organization <https://huggingface.co/models?library=sentence-transformers&author=cross-encoder>`_.
* **Community models**: `All Cross Encoder models on Hugging Face <https://huggingface.co/models?library=sentence-transformers&pipeline_tag=text-ranking>`_.
Each of these models can be easily downloaded and used like so:
```
```python
from sentence_transformers import CrossEncoder
import torch
# Load https://huggingface.co/cross-encoder/ms-marco-MiniLM-L6-v2
model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L6-v2", activation_fn=torch.nn.Sigmoid())
scores = model.predict([
("How many people live in Berlin?", "Berlin had a population of 3,520,031 registered inhabitants in an area of 891.82 square kilometers."),
("How many people live in Berlin?", "Berlin is well known for its museums."),
])
# => array([0.9998173 , 0.01312432], dtype=float32)
```
Cross-Encoders require text pairs as inputs and output a score 0...1 (if the Sigmoid activation function is used). They do not work for individual sentences and they don't compute embeddings for individual texts.
## MS MARCO
[MS MARCO Passage Retrieval](https://github.com/microsoft/MSMARCO-Passage-Ranking) is a large dataset with real user queries from Bing search engine with annotated relevant text passages. Models trained on this dataset are very effective as rerankers for search systems.
```{eval-rst}
.. note::
You can initialize these models with ``activation_fn=torch.nn.Sigmoid()`` to force the model to return scores between 0 and 1. Otherwise, the raw value can reasonably range between -10 and 10.
```
| Model Name | NDCG@10 (TREC DL 19) | MRR@10 (MS Marco Dev) | Docs / Sec |
| ------------- | :-------------: | :-----: | ---: |
| [cross-encoder/ms-marco-TinyBERT-L2-v2](https://huggingface.co/cross-encoder/ms-marco-TinyBERT-L2-v2) | 69.84 | 32.56 | 9000
| [cross-encoder/ms-marco-MiniLM-L2-v2](https://huggingface.co/cross-encoder/ms-marco-MiniLM-L2-v2) | 71.01 | 34.85 | 4100
| [cross-encoder/ms-marco-MiniLM-L4-v2](https://huggingface.co/cross-encoder/ms-marco-MiniLM-L4-v2) | 73.04 | 37.70 | 2500
| **[cross-encoder/ms-marco-MiniLM-L6-v2](https://huggingface.co/cross-encoder/ms-marco-MiniLM-L6-v2)** | 74.30 | 39.01 | 1800
| [cross-encoder/ms-marco-MiniLM-L12-v2](https://huggingface.co/cross-encoder/ms-marco-MiniLM-L12-v2) | 74.31 | 39.02 | 960
| [cross-encoder/ms-marco-electra-base](https://huggingface.co/cross-encoder/ms-marco-electra-base) | 71.99 | 36.41 | 340 |
For details on the usage, see [Retrieve & Re-Rank](../../examples/sentence_transformer/applications/retrieve_rerank/README.md).
## SQuAD (QNLI)
QNLI is based on the [SQuAD dataset](https://rajpurkar.github.io/SQuAD-explorer/) ([HF](https://huggingface.co/datasets/rajpurkar/squad)) and was introduced by the [GLUE Benchmark](https://huggingface.co/papers/1804.07461) ([HF](https://huggingface.co/datasets/nyu-mll/glue)). Given a passage from Wikipedia, annotators created questions that are answerable by that passage. These models output higher scores if a passage answers a question.
| Model Name | Accuracy on QNLI dev set |
| ------------- | :----------------------------: |
| [cross-encoder/qnli-distilroberta-base](https://huggingface.co/cross-encoder/qnli-distilroberta-base) | 90.96 |
| [cross-encoder/qnli-electra-base](https://huggingface.co/cross-encoder/qnli-electra-base) | 93.21 |
## STSbenchmark
The following models can be used like this:
```python
from sentence_transformers import CrossEncoder
model = CrossEncoder("cross-encoder/stsb-roberta-base")
scores = model.predict([("It's a wonderful day outside.", "It's so sunny today!"), ("It's a wonderful day outside.", "He drove to work earlier.")])
# => array([0.60443085, 0.00240758], dtype=float32)
```
They return a score 0...1 indicating the semantic similarity of the given sentence pair.
| Model Name | STSbenchmark Test Performance |
| ------------- | :----------------------------: |
| [cross-encoder/stsb-TinyBERT-L4](https://huggingface.co/cross-encoder/stsb-TinyBERT-L4) | 85.50 |
| [cross-encoder/stsb-distilroberta-base](https://huggingface.co/cross-encoder/stsb-distilroberta-base) | 87.92 |
| [cross-encoder/stsb-roberta-base](https://huggingface.co/cross-encoder/stsb-roberta-base) | 90.17 |
| [cross-encoder/stsb-roberta-large](https://huggingface.co/cross-encoder/stsb-roberta-large) | 91.47 |
## Quora Duplicate Questions
These models have been trained on the [Quora duplicate questions dataset](https://huggingface.co/datasets/sentence-transformers/quora-duplicates). They can be used like the STSb models and give a score 0...1 indicating the probability that two questions are duplicate questions.
| Model Name | Average Precision dev set |
| ------------- | :----------------------------: |
| [cross-encoder/quora-distilroberta-base](https://huggingface.co/cross-encoder/quora-distilroberta-base) | 87.48 |
| [cross-encoder/quora-roberta-base](https://huggingface.co/cross-encoder/quora-roberta-base) | 87.80 |
| [cross-encoder/quora-roberta-large](https://huggingface.co/cross-encoder/quora-roberta-large) | 87.91 |
```{eval-rst}
.. note::
These models don't work for question similarity. The questions "How to learn Java?" and "How to learn Python?" will get a low score, as these questions are not duplicates. For question similarity, a :class:`~sentence_transformers.SentenceTransformer` trained on the Quora dataset will yield much more meaningful results.
```
## NLI
Given two sentences, do they contradict each other, does one entail the other, or are they neutral? The following models were trained on the [SNLI](https://huggingface.co/datasets/stanfordnlp/snli) and [MultiNLI](https://huggingface.co/datasets/nyu-mll/multi_nli) datasets.
| Model Name | Accuracy on MNLI mismatched set |
| ------------- | :----------------------------: |
| [cross-encoder/nli-deberta-v3-base](https://huggingface.co/cross-encoder/nli-deberta-v3-base) | 90.04 |
| [cross-encoder/nli-deberta-base](https://huggingface.co/cross-encoder/nli-deberta-base) | 88.08 |
| [cross-encoder/nli-deberta-v3-xsmall](https://huggingface.co/cross-encoder/nli-deberta-v3-xsmall) | 87.77 |
| [cross-encoder/nli-deberta-v3-small](https://huggingface.co/cross-encoder/nli-deberta-v3-small) | 87.55 |
| [cross-encoder/nli-roberta-base](https://huggingface.co/cross-encoder/nli-roberta-base) | 87.47 |
| [cross-encoder/nli-MiniLM2-L6-H768](https://huggingface.co/cross-encoder/nli-MiniLM2-L6-H768) | 86.89 |
| [cross-encoder/nli-distilroberta-base](https://huggingface.co/cross-encoder/nli-distilroberta-base) | 83.98 |
```python
from sentence_transformers import CrossEncoder
model = CrossEncoder("cross-encoder/nli-deberta-v3-base")
scores = model.predict([
("A man is eating pizza", "A man eats something"),
("A black race car starts up in front of a crowd of people.", "A man is driving down a lonely road."),
])
# Convert scores to labels
label_mapping = ["contradiction", "entailment", "neutral"]
labels = [label_mapping[score_max] for score_max in scores.argmax(axis=1)]
# => ['entailment', 'contradiction']
```
## Community Models
Some notable models from the Community include:
- [BAAI/bge-reranker-base](https://huggingface.co/BAAI/bge-reranker-base)
- [BAAI/bge-reranker-large](https://huggingface.co/BAAI/bge-reranker-large)
- [BAAI/bge-reranker-v2-m3](https://huggingface.co/BAAI/bge-reranker-v2-m3)
- [BAAI/bge-reranker-v2-gemma](https://huggingface.co/BAAI/bge-reranker-v2-gemma)
- [BAAI/bge-reranker-v2-minicpm-layerwise](https://huggingface.co/BAAI/bge-reranker-v2-minicpm-layerwise)
- [jinaai/jina-reranker-v1-tiny-en](https://huggingface.co/jinaai/jina-reranker-v1-tiny-en)
- [jinaai/jina-reranker-v1-turbo-en](https://huggingface.co/jinaai/jina-reranker-v1-turbo-en)
- [mixedbread-ai/mxbai-rerank-xsmall-v1](https://huggingface.co/mixedbread-ai/mxbai-rerank-xsmall-v1)
- [mixedbread-ai/mxbai-rerank-base-v1](https://huggingface.co/mixedbread-ai/mxbai-rerank-base-v1)
- [mixedbread-ai/mxbai-rerank-large-v1](https://huggingface.co/mixedbread-ai/mxbai-rerank-large-v1)
- [maidalun1020/bce-reranker-base_v1](https://huggingface.co/maidalun1020/bce-reranker-base_v1)
- [Alibaba-NLP/gte-reranker-modernbert-base](https://huggingface.co/Alibaba-NLP/gte-reranker-modernbert-base)
- [Alibaba-NLP/gte-multilingual-reranker-base](https://huggingface.co/Alibaba-NLP/gte-multilingual-reranker-base)
================================================
FILE: docs/cross_encoder/training/examples.rst
================================================
Training Examples
=================
.. toctree::
:maxdepth: 1
:caption: Supervised Learning
../../../examples/cross_encoder/training/sts/README
../../../examples/cross_encoder/training/nli/README
../../../examples/cross_encoder/training/quora_duplicate_questions/README
../../../examples/cross_encoder/training/ms_marco/README
../../../examples/cross_encoder/training/rerankers/README
../../../examples/cross_encoder/training/distillation/README
.. toctree::
:maxdepth: 1
:caption: Advanced Usage
../../sentence_transformer/training/distributed
================================================
FILE: docs/cross_encoder/training_overview.md
================================================
# Training Overview
## Why Finetune?
Cross Encoder models are very often used as 2nd stage rerankers in a [Retrieve and Rerank](../../examples/sentence_transformer/applications/retrieve_rerank/README.md) search stack. In such a situation, the Cross Encoder reranks the top X candidates from the retriever (which can be a [Sentence Transformer model](../sentence_transformer/usage/usage.rst)). To avoid the reranker model reducing the performance on your use case, finetuning it can be crucial. Rerankers always have just 1 output label.
Beyond that, Cross Encoder models can also be used as pair classifiers. For example, a model trained on Natural Language Inference data can be used to classify pairs of texts as "contradiction", "entailment", and "neutral". Pair Classifiers generally have more than 1 output label.
See [**Training Examples**](training/examples) for numerous training scripts for common real-world applications that you can adopt.
## Training Components
Training Cross Encoder models involves between 4 and 6 components, just like [training Sentence Transformer models](../sentence_transformer/training_overview.md):
<div class="components">
<a href="#model" class="box">
<div class="header">Model</div>
Learn how to initialize the <b>model</b> for training.
</a>
<a href="#dataset" class="box">
<div class="header">Dataset</div>
Learn how to prepare the <b>data</b> for training.
</a>
<a href="#loss-function" class="box">
<div class="header">Loss Function</div>
Learn how to prepare and choose a <b>loss</b> function.
</a>
<a href="#training-arguments" class="box optional">
<div class="header">Training Arguments</div>
Learn which <b>training arguments</b> are useful.
</a>
<a href="#evaluator" class="box optional">
<div class="header">Evaluator</div>
Learn how to <b>evaluate</b> during and after training.
</a>
<a href="#trainer" class="box">
<div class="header">Trainer</div>
Learn how to start the <b>training</b> process.
</a>
</div>
<p></p>
## Model
```{eval-rst}
Cross Encoder models are initialized by loading a pretrained `transformers <https://huggingface.co/docs/transformers>`_ model using a sequence classification head. If the model itself does not have such a head, then it will be added automatically. Consequently, initializing a Cross Encoder model is rather simple:
.. sidebar:: Documentation
- :class:`sentence_transformers.cross_encoder.CrossEncoder`
::
from sentence_transformers import CrossEncoder
# This model already has a sequence classification head
model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L6-v2")
# And this model does not, so it will be added automatically
model = CrossEncoder("google-bert/bert-base-uncased")
.. tip::
You can find pretrained reranker models in the `Cross Encoder > Pretrained Models <pretrained_models.html>`_ documentation.
For other models, the strongest pretrained models are often "encoder models", i.e. models that are trained to produce a meaningful token embedding for inputs. You can find strong candidates here:
- `fill-mask models <https://huggingface.co/models?pipeline_tag=fill-mask>`_ - trained for token embeddings
- `sentence similarity models <https://huggingface.co/models?pipeline_tag=sentence-similarity>`_ - trained for text embeddings
- `feature-extraction models <https://huggingface.co/models?pipeline_tag=feature-extraction>`_ - trained for text embeddings
Consider looking for base models that are designed for your language and/or domain of interest. For example, `klue/bert-base <https://huggingface.co/klue/bert-base>`_ will work much better than `google-bert/bert-base-uncased <https://huggingface.co/google-bert/bert-base-uncased>`_ for Korean.
```
## Dataset
```{eval-rst}
The :class:`CrossEncoderTrainer` trains and evaluates using :class:`datasets.Dataset` (one dataset) or :class:`datasets.DatasetDict` instances (multiple datasets, see also `Multi-dataset training <#multi-dataset-training>`_).
.. tab:: Data on 🤗 Hugging Face Hub
If you want to load data from the `Hugging Face Datasets <https://huggingface.co/datasets>`_, then you should use :func:`datasets.load_dataset`:
.. raw:: html
<div class="sidebar">
<p class="sidebar-title">Documentation</p>
<ul class="simple">
<li><a class="reference external" href="https://huggingface.co/docs/datasets/main/en/loading#hugging-face-hub">Datasets, Loading from the Hugging Face Hub</a></li>
<li><a class="reference external" href="https://huggingface.co/docs/datasets/main/en/package_reference/loading_methods#datasets.load_dataset" title="(in datasets vmain)"><code class="xref py py-func docutils literal notranslate"><span class="pre">datasets.load_dataset()</span></code></a></li>
<li><a class="reference external" href="https://huggingface.co/datasets/sentence-transformers/all-nli">sentence-transformers/all-nli</a></li>
</ul>
</div>
::
from datasets import load_dataset
train_dataset = load_dataset("sentence-transformers/all-nli", "pair-class", split="train")
eval_dataset = load_dataset("sentence-transformers/all-nli", "pair-class", split="dev")
print(train_dataset)
"""
Dataset({
features: ['premise', 'hypothesis', 'label'],
num_rows: 942069
})
"""
Some datasets (including `sentence-transformers/all-nli <https://huggingface.co/datasets/sentence-transformers/all-nli>`_) require you to provide a "subset" alongside the dataset name. ``sentence-transformers/all-nli`` has 4 subsets, each with different data formats: `pair <https://huggingface.co/datasets/sentence-transformers/all-nli/viewer/pair>`_, `pair-class <https://huggingface.co/datasets/sentence-transformers/all-nli/viewer/pair-class>`_, `pair-score <https://huggingface.co/datasets/sentence-transformers/all-nli/viewer/pair-score>`_, `triplet <https://huggingface.co/datasets/sentence-transformers/all-nli/viewer/triplet>`_.
.. note::
Many Hugging Face datasets that work out of the box with Sentence Transformers have been tagged with ``sentence-transformers``, allowing you to easily find them by browsing to `https://huggingface.co/datasets?other=sentence-transformers <https://huggingface.co/datasets?other=sentence-transformers>`_. We strongly recommend that you browse these datasets to find training datasets that might be useful for your tasks.
.. tab:: Local Data (CSV, JSON, Parquet, Arrow, SQL)
If you have local data in common file-formats, then you can load this data easily using :func:`datasets.load_dataset`:
.. raw:: html
<div class="sidebar">
<p class="sidebar-title">Documentation</p>
<ul class="simple">
<li><a class="reference external" href="https://huggingface.co/docs/datasets/main/en/loading#local-and-remote-files">Datasets, Loading local files</a></li>
<li><a class="reference external" href="https://huggingface.co/docs/datasets/main/en/package_reference/loading_methods#datasets.load_dataset" title="(in datasets vmain)"><code class="xref py py-func docutils literal notranslate"><span class="pre">datasets.load_dataset()</span></code></a></li>
</ul>
</div>
::
from datasets import load_dataset
dataset = load_dataset("csv", data_files="my_file.csv")
or::
from datasets import load_dataset
dataset = load_dataset("json", data_files="my_file.json")
.. tab:: Local Data that requires pre-processing
If you have local data that requires some extra pre-processing, my recommendation is to initialize your dataset using :meth:`datasets.Dataset.from_dict` and a dictionary of lists, like so:
.. raw:: html
<div class="sidebar">
<p class="sidebar-title">Documentation</p>
<ul class="simple">
<li><a class="reference external" href="https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.from_dict" title="(in datasets vmain)"><code class="xref py py-meth docutils literal notranslate"><span class="pre">datasets.Dataset.from_dict()</span></code></a></li>
</ul>
</div>
::
from datasets import Dataset
anchors = []
positives = []
# Open a file, do preprocessing, filtering, cleaning, etc.
# and append to the lists
dataset = Dataset.from_dict({
"anchor": anchors,
"positive": positives,
})
Each key from the dictionary will become a column in the resulting dataset.
```
### Dataset Format
```{eval-rst}
It is important that your dataset format matches your loss function (or that you choose a loss function that matches your dataset format and model). Verifying whether a dataset format and model work with a loss function involves three steps:
1. All columns not named "label", "labels", "score", or "scores" are considered *Inputs* according to the `Loss Overview <loss_overview.html>`_ table. The number of remaining columns must match the number of valid inputs for your chosen loss. The names of these columns are **irrelevant**, only the **order matters**.
2. If your loss function requires a *Label* according to the `Loss Overview <loss_overview.html>`_ table, then your dataset must have a **column named "label", "labels", "score", or "scores"**. This column is automatically taken as the label.
3. The number of model output labels must match what is required for the loss according to the `Loss Overview <loss_overview.html>`_ table.
For example, given a dataset with columns ``["text1", "text2", "label"]`` where the "label" column has float similarity score ranging from 0 to 1 and a model outputting 1 label, we can use it with :class:`~sentence_transformers.cross_encoder.losses.BinaryCrossEntropyLoss` because:
1. the dataset has a "label" column as is required for this loss function.
2. the dataset has 2 non-label columns, exactly the amount required by this loss function.
3. the model has 1 output label, exactly as required by this loss function.
Be sure to re-order your dataset columns with :meth:`Dataset.select_columns <datasets.Dataset.select_columns>` if your columns are not ordered correctly. For example, if your dataset has ``["good_answer", "bad_answer", "question"]`` as columns, then this dataset can technically be used with a loss that requires (anchor, positive, negative) triplets, but the ``good_answer`` column will be taken as the anchor, ``bad_answer`` as the positive, and ``question`` as the negative.
Additionally, if your dataset has extraneous columns (e.g. sample_id, metadata, source, type), you should remove these with :meth:`Dataset.remove_columns <datasets.Dataset.remove_columns>` as they will be used as inputs otherwise. You can also use :meth:`Dataset.select_columns <datasets.Dataset.select_columns>` to keep only the desired columns.
```
### Hard Negatives Mining
The success of training CrossEncoder models often depends on the quality of the *negatives*, i.e. the passages for which the query-negative score should be low. Negatives can be divided into two types:
- **Soft negatives**: passages that are completely unrelated.
- **Hard negatives**: passages that seem like they might be relevant for the query, but are not.
A concise example is:
- **Query**: Where was Apple founded?
- **Soft Negative**: The Cache River Bridge is a Parker pony truss that spans the Cache River between Walnut Ridge and Paragould, Arkansas.
- **Hard Negative**: The Fuji apple is an apple cultivar developed in the late 1930s, and brought to market in 1962.
```{eval-rst}
The strongest CrossEncoder models are generally trained to recognize hard negatives, and so it's valuable to be able to "mine" hard negatives. Sentence Transformers supports a strong :func:`~sentence_transformers.util.mine_hard_negatives` function that can assist, given a dataset of query-answer pairs:
.. sidebar:: Documentation
* `sentence-transformers/gooaq <https://huggingface.co/datasets/sentence-transformers/gooaq>`_
* `sentence-transformers/static-retrieval-mrl-en-v1 <https://huggingface.co/sentence-transformers/static-retrieval-mrl-en-v1>`_
* :class:`~sentence_transformers.SentenceTransformer`
* :func:`~sentence_transformers.util.mine_hard_negatives`
::
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import mine_hard_negatives
# Load the GooAQ dataset: https://huggingface.co/datasets/sentence-transformers/gooaq
train_dataset = load_dataset("sentence-transformers/gooaq", split=f"train").select(range(100_000))
print(train_dataset)
# Mine hard negatives using a very efficient embedding model
embedding_model = SentenceTransformer("sentence-transformers/static-retrieval-mrl-en-v1", device="cpu")
hard_train_dataset = mine_hard_negatives(
train_dataset,
embedding_model,
num_negatives=5, # How many negatives per question-answer pair
range_min=10, # Skip the x most similar samples
range_max=100, # Consider only the x most similar samples
max_score=0.8, # Only consider samples with a similarity score of at most x
absolute_margin=0.1, # Anchor-negative similarity is at least x lower than anchor-positive similarity
relative_margin=0.1, # Anchor-negative similarity is at most 1-x times the anchor-positive similarity, e.g. 90%
sampling_strategy="top", # Sample the top negatives from the range
batch_size=4096, # Use a batch size of 4096 for the embedding model
output_format="labeled-pair", # The output format is (query, passage, label), as required by BinaryCrossEntropyLoss
use_faiss=True, # Using FAISS is recommended to keep memory usage low (pip install faiss-gpu or pip install faiss-cpu)
)
print(hard_train_dataset)
print(hard_train_dataset[1])
```
<details><summary>Click to see the outputs of this script.</summary>
```
Dataset({
features: ['question', 'answer'],
num_rows: 100000
})
Batches: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 22/22 [00:01<00:00, 12.74it/s]
Batches: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:00<00:00, 37.50it/s]
Querying FAISS index: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:18<00:00, 2.66s/it]
Metric Positive Negative Difference
Count 100,000 436,925
Mean 0.5882 0.4040 0.2157
Median 0.5989 0.4024 0.1836
Std 0.1425 0.0905 0.1013
Min -0.0514 0.1405 0.1014
25% 0.4993 0.3377 0.1352
50% 0.5989 0.4024 0.1836
75% 0.6888 0.4681 0.2699
Max 0.9748 0.7486 0.7545
Skipped 2,420,871 potential negatives (23.97%) due to the absolute_margin of 0.1.
Skipped 43 potential negatives (0.00%) due to the max_score of 0.8.
Could not find enough negatives for 63075 samples (12.62%). Consider adjusting the range_max, range_min, absolute_margin, relative_margin and max_score parameters if you'd like to find more valid negatives.
Dataset({
features: ['question', 'answer', 'label'],
num_rows: 536925
})
{
'question': 'how to transfer bookmarks from one laptop to another?',
'answer': 'Using an External Drive Just about any external drive, including a USB thumb drive, or an SD card can be used to transfer your files from one laptop to another. Connect the drive to your old laptop; drag your files to the drive, then disconnect it and transfer the drive contents onto your new laptop.',
'label': 0
}
```
</details>
<br>
## Loss Function
Loss functions quantify how well a model performs for a given batch of data, allowing an optimizer to update the model weights to produce more favourable (i.e., lower) loss values. This is the core of the training process.
Sadly, there is no single loss function that works best for all use-cases. Instead, which loss function to use greatly depends on your available data and on your target task. See [Dataset Format](#dataset-format) to learn what datasets are valid for which loss functions. Additionally, the [Loss Overview](loss_overview) will be your best friend to learn about the options.
```{eval-rst}
Most loss functions can be initialized with just the :class:`~sentence_transformers.cross_encoder.CrossEncoder` that you're training, alongside some optional parameters, e.g.:
.. sidebar:: Documentation
- :class:`sentence_transformers.cross_encoder.losses.MultipleNegativesRankingLoss`
- `Losses API Reference <../package_reference/cross_encoder/losses.html>`_
- `Loss Overview <loss_overview.html>`_
::
from datasets import load_dataset
from sentence_transformers import CrossEncoder
from sentence_transformers.cross_encoder.losses import MultipleNegativesRankingLoss
# Load a model to train/finetune
model = CrossEncoder("xlm-roberta-base", num_labels=1) # num_labels=1 is for rerankers
# Initialize the MultipleNegativesRankingLoss
# This loss requires pairs of related texts or triplets
loss = MultipleNegativesRankingLoss(model)
# Load an example training dataset that works with our loss function:
train_dataset = load_dataset("sentence-transformers/gooaq", split="train")
```
## Training Arguments
```{eval-rst}
The :class:`~sentence_transformers.cross_encoder.training_args.CrossEncoderTrainingArguments` class can be used to specify parameters for influencing training performance as well as defining the tracking/debugging parameters. Although it is optional, it is highly recommended to experiment with the various useful arguments.
```
<div class="training-arguments">
<div class="header">Key Training Arguments for improving training performance</div>
<div class="table">
<a href="https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments.learning_rate"><code>learning_rate</code></a>
<a href="https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments.lr_scheduler_type"><code>lr_scheduler_type</code></a>
<a href="https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments.warmup_ratio"><code>warmup_ratio</code></a>
<a href="https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments.num_train_epochs"><code>num_train_epochs</code></a>
<a href="https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments.max_steps"><code>max_steps</code></a>
<a href="https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments.per_device_train_batch_size"><code>per_device_train_batch_size</code></a>
<a href="https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments.per_device_eval_batch_size"><code>per_device_eval_batch_size</code></a>
<a href="https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments.auto_find_batch_size"><code>auto_find_batch_size</code></a>
<a href="https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments.fp16"><code>fp16</code></a>
<a href="https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments.bf16"><code>bf16</code></a>
<a href="https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments.load_best_model_at_end"><code>load_best_model_at_end</code></a>
<a href="https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments.metric_for_best_model"><code>metric_for_best_model</code></a>
<a href="https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments.gradient_accumulation_steps"><code>gradient_accumulation_steps</code></a>
<a href="https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments.gradient_checkpointing"><code>gradient_checkpointing</code></a>
<a href="https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments.eval_accumulation_steps"><code>eval_accumulation_steps</code></a>
<a href="https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments.optim"><code>optim</code></a>
<a href="https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments.dataloader_num_workers"><code>dataloader_num_workers</code></a>
<a href="https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments.dataloader_prefetch_factor"><code>dataloader_prefetch_factor</code></a>
<a href="../package_reference/cross_encoder/training_args.html#sentence_transformers.cross_encoder.training_args.CrossEncoderTrainingArguments"><code>batch_sampler</code></a>
<a href="../package_reference/cross_encoder/training_args.html#sentence_transformers.cross_encoder.training_args.CrossEncoderTrainingArguments"><code>multi_dataset_batch_sampler</code></a>
<a href="../package_reference/cross_encoder/training_args.html#sentence_transformers.cross_encoder.training_args.CrossEncoderTrainingArguments"><code>learning_rate_mapping</code></a>
</div>
</div>
<br>
<div class="training-arguments">
<div class="header">Key Training Arguments for observing training performance</div>
<div class="table">
<a href="https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments.eval_strategy"><code>eval_strategy</code></a>
<a href="https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments.eval_steps"><code>eval_steps</code></a>
<a href="https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments.save_strategy"><code>save_strategy</code></a>
<a href="https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments.save_steps"><code>save_steps</code></a>
<a href="https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments.save_total_limit"><code>save_total_limit</code></a>
<a href="https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments.report_to"><code>report_to</code></a>
<a href="https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments.run_name"><code>run_name</code></a>
<a href="https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments.log_level"><code>log_level</code></a>
<a href="https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments.logging_steps"><code>logging_steps</code></a>
<a href="https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments.push_to_hub"><code>push_to_hub</code></a>
<a href="https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments.hub_model_id"><code>hub_model_id</code></a>
<a href="https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments.hub_strategy"><code>hub_strategy</code></a>
<a href="https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments.hub_private_repo"><code>hub_private_repo</code></a>
</div>
</div>
<br>
```{eval-rst}
Here is an example of how :class:`~sentence_transformers.cross_encoder.training_args.CrossEncoderTrainingArguments` can be initialized:
```
```python
from sentence_transformers.cross_encoder import CrossEncoderTrainingArguments
from sentence_transformers.training_args import BatchSamplers
args = CrossEncoderTrainingArguments(
# Required parameter:
output_dir="models/reranker-MiniLM-msmarco-v1",
# Optional training parameters:
num_train_epochs=1,
per_device_train_batch_size=16,
per_device_eval_batch_size=16,
learning_rate=2e-5,
warmup_ratio=0.1,
fp16=True, # Set to False if you get an error that your GPU can't run on FP16
bf16=False, # Set to True if you have a GPU that supports BF16
batch_sampler=BatchSamplers.NO_DUPLICATES, # losses that use "in-batch negatives" benefit from no duplicates
# Optional tracking/debugging parameters:
eval_strategy="steps",
eval_steps=100,
save_strategy="steps",
save_steps=100,
save_total_limit=2,
logging_steps=100,
run_name="reranker-MiniLM-msmarco-v1", # Will be used in W&B if `wandb` is installed
)
```
## Evaluator
```{eval-rst}
You can provide the :class:`~sentence_transformers.cross_encoder.trainer.CrossEncoderTrainer` with an ``eval_dataset`` to get the evaluation loss during training, but it may be useful to get more concrete metrics during training, too. For this, you can use evaluators to assess the model's performance with useful metrics before, during, or after training. You can use both an ``eval_dataset`` and an evaluator, one or the other, or neither. They evaluate based on the ``eval_strategy`` and ``eval_steps`` `Training Arguments <#training-arguments>`_.
Here are the implemented Evaluators that come with Sentence Transformers for Cross Encoder models:
============================================================================================= ========================================================================================================================================================================
Evaluator Required Data
============================================================================================= ========================================================================================================================================================================
:class:`~sentence_transformers.cross_encoder.evaluation.CrossEncoderClassificationEvaluator` Pairs with class labels (binary or multiclass).
:class:`~sentence_transformers.cross_encoder.evaluation.CrossEncoderCorrelationEvaluator` Pairs with similarity scores.
:class:`~sentence_transformers.cross_encoder.evaluation.CrossEncoderNanoBEIREvaluator` No data required.
:class:`~sentence_transformers.cross_encoder.evaluation.CrossEncoderRerankingEvaluator` List of ``{'query': '...', 'positive': [...], 'negative': [...]}`` dictionaries. Negatives can be mined with :func:`~sentence_transformers.util.mine_hard_negatives`.
============================================================================================= ========================================================================================================================================================================
Additionally, :class:`~sentence_transformers.evaluation.SequentialEvaluator` should be used to combine multiple evaluators into one Evaluator that can be passed to the :class:`~sentence_transformers.cross_encoder.trainer.CrossEncoderTrainer`.
Sometimes you don't have the required evaluation data to prepare one of these evaluators on your own, but you still want to track how well the model performs on some common benchmarks. In that case, you can use these evaluators with data from Hugging Face.
.. tab:: CrossEncoderNanoBEIREvaluator
.. raw:: html
<div class="sidebar">
<p class="sidebar-title">Documentation</p>
<ul class="simple">
<li><a class="reference external" href="https://huggingface.co/cross-encoder/ms-marco-MiniLM-L6-v2">cross-encoder/ms-marco-MiniLM-L6-v2</a></li>
<li><a class="reference internal" href="../package_reference/cross_encoder/evaluation.html#sentence_transformers.cross_encoder.evaluation.CrossEncoderNanoBEIREvaluator" title="sentence_transformers.cross_encoder.evaluation.CrossEncoderNanoBEIREvaluator"><code class="xref py py-class docutils literal notranslate"><span class="pre">sentence_transformers.cross_encoder.evaluation.CrossEncoderNanoBEIREvaluator</span></code></a></li>
</ul>
</div>
::
from sentence_transformers import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CrossEncoderNanoBEIREvaluator
# Load a model
model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L6-v2")
# Initialize the evaluator. Unlike most other evaluators, this one loads the relevant datasets
# directly from Hugging Face, so there are no mandatory arguments
dev_evaluator = CrossEncoderNanoBEIREvaluator()
# You can run evaluation like so:
# results = dev_evaluator(model)
.. tab:: CrossEncoderRerankingEvaluator with GooAQ mined negatives
Preparing data for :class:`~sentence_transformers.cross_encoder.evaluation.CrossEncoderRerankingEvaluator` can be difficult as you need negatives in addition to your query-positive data.
The :func:`~sentence_transformers.util.mine_hard_negatives` function has a convenient ``include_positives`` parameter, which can be set to ``True`` to also mine for the positive texts. When supplied as ``documents`` (which have to be 1. ranked and 2. contain positives) to :class:`~sentence_transformers.cross_encoder.evaluation.CrossEncoderRerankingEvaluator`, the evaluator will not just evaluate the reranking performance of the CrossEncoder, but also the original rankings by the embedding model used for mining.
For example::
CrossEncoderRerankingEvaluator: Evaluating the model on the gooaq-dev dataset:
Queries: 1000 Positives: Min 1.0, Mean 1.0, Max 1.0 Negatives: Min 49.0, Mean 49.1, Max 50.0
Base -> Reranked
MAP: 53.28 -> 67.28
MRR@10: 52.40 -> 66.65
NDCG@10: 59.12 -> 71.35
Note that by default, if you are using :class:`~sentence_transformers.cross_encoder.evaluation.CrossEncoderRerankingEvaluator` with ``documents``, the evaluator will rerank with *all* positives, even if they are not in the documents. This is useful for getting a stronger signal out of your evaluator, but does give a slightly unrealistic performance. After all, the maximum performance is now 100, whereas normally it's bounded by whether the first-stage retriever actually retrieved the positives.
You can enable the realistic behaviour by setting ``always_rerank_positives=False`` when initializing :class:`~sentence_transformers.cross_encoder.evaluation.CrossEncoderRerankingEvaluator`. Repeating the same script with this realistic two-stage performance results in::
CrossEncoderRerankingEvaluator: Evaluating the model on the gooaq-dev dataset:
Queries: 1000 Positives: Min 1.0, Mean 1.0, Max 1.0 Negatives: Min 49.0, Mean 49.1, Max 50.0
Base -> Reranked
MAP: 53.28 -> 66.12
MRR@10: 52.40 -> 65.61
NDCG@10: 59.12 -> 70.10
.. raw:: html
<div class="sidebar">
<p class="sidebar-title">Documentation</p>
<ul class="simple">
<li><a class="reference external" href="https://huggingface.co/cross-encoder/ms-marco-MiniLM-L6-v2">cross-encoder/ms-marco-MiniLM-L6-v2</a></li>
<li><a class="reference external" href="https://huggingface.co/datasets/sentence-transformers/gooaq">sentence-transformers/gooaq</a></li>
<li><a class="reference internal" href="../package_reference/util.html#sentence_transformers.util.mine_hard_negatives" title="sentence_transformers.util.mine_hard_negatives"><code class="xref py py-class docutils literal notranslate"><span class="pre">sentence_transformers.util.mine_hard_negatives</span></code></a></li>
<li><a class="reference internal" href="../package_reference/cross_encoder/evaluation.html#sentence_transformers.cross_encoder.evaluation.CrossEncoderRerankingEvaluator" title="sentence_transformers.cross_encoder.evaluation.CrossEncoderRerankingEvaluator"><code class="xref py py-class docutils literal notranslate"><span class="pre">sentence_transformers.cross_encoder.evaluation.CrossEncoderRerankingEvaluator</span></code></a></li>
</ul>
</div>
::
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CrossEncoderRerankingEvaluator
from sentence_transformers.util import mine_hard_negatives
# Load a model
model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L6-v2")
# Load the GooAQ dataset: https://huggingface.co/datasets/sentence-transformers/gooaq
full_dataset = load_dataset("sentence-transformers/gooaq", split=f"train").select(range(100_000))
dataset_dict = full_dataset.train_test_split(test_size=1_000, seed=12)
train_dataset = dataset_dict["train"]
eval_dataset = dataset_dict["test"]
print(eval_dataset)
"""
Dataset({
features: ['question', 'answer'],
num_rows: 1000
})
"""
# Mine hard negatives using a very efficient embedding model
embedding_model = SentenceTransformer("sentence-transformers/static-retrieval-mrl-en-v1", device="cpu")
hard_eval_dataset = mine_hard_negatives(
eval_dataset,
embedding_model,
corpus=full_dataset["answer"], # Use the full dataset as the corpus
num_negatives=50, # How many negatives per question-answer pair
batch_size=4096, # Use a batch size of 4096 for the embedding model
output_format="n-tuple", # The output format is (query, positive, negative1, negative2, ...) for the evaluator
include_positives=True, # Key: Include the positive answer in the list of negatives
use_faiss=True, # Using FAISS is recommended to keep memory usage low (pip install faiss-gpu or pip install faiss-cpu)
)
print(hard_eval_dataset)
"""
Dataset({
features: ['question', 'answer', 'negative_1', 'negative_2', 'negative_3', 'negative_4', 'negative_5', 'negative_6', 'negative_7', 'negative_8', 'negative_9', 'negative_10', 'negative_11', 'negative_12', 'negative_13', 'negative_14', 'negative_15', 'negative_16', 'negative_17', 'negative_18', 'negative_19', 'negative_20', 'negative_21', 'negative_22', 'negative_23', 'negative_24', 'negative_25', 'negative_26', 'negative_27', 'negative_28', 'negative_29', 'negative_30', 'negative_31', 'negative_32', 'negative_33', 'negative_34', 'negative_35', 'negative_36', 'negative_37', 'negative_38', 'negative_39', 'negative_40', 'negative_41', 'negative_42', 'negative_43', 'negative_44', 'negative_45', 'negative_46', 'negative_47', 'negative_48', 'negative_49', 'negative_50'],
num_rows: 1000
})
"""
reranking_evaluator = CrossEncoderRerankingEvaluator(
samples=[
{
"query": sample["question"],
"positive": [sample["answer"]],
"documents": [sample[column_name] for column_name in hard_eval_dataset.column_names[2:]],
}
for sample in hard_eval_dataset
],
batch_size=32,
name="gooaq-dev",
)
# You can run evaluation like so
results = reranking_evaluator(model)
"""
CrossEncoderRerankingEvaluator: Evaluating the model on the gooaq-dev dataset:
Queries: 1000 Positives: Min 1.0, Mean 1.0, Max 1.0 Negatives: Min 49.0, Mean 49.1, Max 50.0
Base -> Reranked
MAP: 53.28 -> 67.28
MRR@10: 52.40 -> 66.65
NDCG@10: 59.12 -> 71.35
"""
# {'gooaq-dev_map': 0.6728370126462222, 'gooaq-dev_mrr@10': 0.6665190476190477, 'gooaq-dev_ndcg@10': 0.7135068904582963, 'gooaq-dev_base_map': 0.5327714512001362, 'gooaq-dev_base_mrr@10': 0.5239674603174603, 'gooaq-dev_base_ndcg@10': 0.5912299141913905}
.. tab:: CrossEncoderCorrelationEvaluator with STSb
.. raw:: html
<div class="sidebar">
<p class="sidebar-title">Documentation</p>
<ul class="simple">
<li><a class="reference external" href="https://huggingface.co/cross-encoder/stsb-TinyBERT-L4">cross-encoder/stsb-TinyBERT-L4</a></li>
<li><a class="reference external" href="https://huggingface.co/datasets/sentence-transformers/stsb">sentence-transformers/stsb</a></li>
<li><a class="reference internal" href="../package_reference/cross_encoder/evaluation.html#sentence_transformers.cross_encoder.evaluation.CrossEncoderCorrelationEvaluator" title="sentence_transformers.cross_encoder.evaluation.CrossEncoderCorrelationEvaluator"><code class="xref py py-class docutils literal notranslate"><span class="pre">sentence_transformers.cross_encoder.evaluation.CrossEncoderCorrelationEvaluator</span></code></a></li>
</ul>
</div>
::
from datasets import load_dataset
from sentence_transformers import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CrossEncoderCorrelationEvaluator
# Load a model
model = CrossEncoder("cross-encoder/stsb-TinyBERT-L4")
# Load the STSB dataset (https://huggingface.co/datasets/sentence-transformers/stsb)
eval_dataset = load_dataset("sentence-transformers/stsb", split="validation")
pairs = list(zip(eval_dataset["sentence1"], eval_dataset["sentence2"]))
# Initialize the evaluator
dev_evaluator = CrossEncoderCorrelationEvaluator(
sentence_pairs=pairs,
scores=eval_dataset["score"],
name="sts_dev",
)
# You can run evaluation like so:
# results = dev_evaluator(model)
.. tab:: CrossEncoderClassificationEvaluator with AllNLI
.. raw:: html
<div class="sidebar">
<p class="sidebar-title">Documentation</p>
<ul class="simple">
<li><a class="reference external" href="https://huggingface.co/cross-encoder/nli-deberta-v3-base">cross-encoder/nli-deberta-v3-base</a></li>
<li><a class="reference external" href="https://huggingface.co/datasets/sentence-transformers/all-nli">sentence-transformers/all-nli</a></li>
<li><a class="reference internal" href="../package_reference/cross_encoder/evaluation.html#sentence_transformers.cross_encoder.evaluation.CrossEncoderClassificationEvaluator" title="sentence_transformers.cross_encoder.evaluation.CrossEncoderClassificationEvaluator"><code class="xref py py-class docutils literal notranslate"><span class="pre">sentence_transformers.cross_encoder.evaluation.CrossEncoderClassificationEvaluator</span></code></a></li>
</ul>
</div>
::
from datasets import load_dataset
from sentence_transformers import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CrossEncoderClassificationEvaluator
# Load a model
model = CrossEncoder("cross-encoder/nli-deberta-v3-base")
# Load labeled pairs from the AllNLI dataset (https://huggingface.co/datasets/sentence-transformers/all-nli)
max_samples = 1000
eval_dataset = load_dataset("sentence-transformers/all-nli", "pair-class", split=f"dev[:{max_samples}]")
# Create a list of pairs, and map the labels to the labels that the model knows
pairs = list(zip(eval_dataset["premise"], eval_dataset["hypothesis"]))
label_mapping = {0: 1, 1: 2, 2: 0}
labels = [label_mapping[label] for label in eval_dataset["label"]]
# Initialize the evaluator
cls_evaluator = CrossEncoderClassificationEvaluator(
sentence_pairs=pairs,
labels=labels,
name="all-nli-dev",
)
# You can run evaluation like so:
# results = cls_evaluator(model)
.. warning::
When using `Distributed Training <training/distributed.html>`_, the evaluator only runs on the first device, unlike the training and evaluation datasets, which are shared across all devices.
```
## Trainer
```{eval-rst}
The :class:`~sentence_transformers.cross_encoder.trainer.CrossEncoderTrainer` is where all previous components come together. We only have to specify the trainer with the model, training arguments (optional), training dataset, evaluation dataset (optional), loss function, evaluator (optional) and we can start training. Let's have a look at a script where all of these components come together:
.. tab:: Simple Example
.. raw:: html
<div class="sidebar">
<p class="sidebar-title">Documentation</p>
<ol class="arabic simple">
<li><p><a class="reference internal" href="../package_reference/cross_encoder/cross_encoder.html#sentence_transformers.cross_encoder.CrossEncoder" title="sentence_transformers.cross_encoder.CrossEncoder"><code class="xref py py-class docutils literal notranslate"><span class="pre">CrossEncoder</span></code></a></p></li>
<li><p><a class="reference internal" href="../package_reference/cross_encoder/cross_encoder.html#sentence_transformers.cross_encoder.model_card.CrossEncoderModelCardData" title="sentence_transformers.cross_encoder.model_card.CrossEncoderModelCardData"><code class="xref py py-class docutils literal notranslate"><span class="pre">CrossEncoderModelCardData</span></code></a></p></li>
<li><p><a class="reference external" href="https://huggingface.co/docs/datasets/main/en/package_reference/loading_methods#datasets.load_dataset" title="(in datasets vmain)"><code class="xref py py-func docutils literal notranslate"><span class="pre">load_dataset()</span></code></a></p></li>
<li><p><a class="reference external" href="https://huggingface.co/datasets/sentence-transformers/gooaq">sentence-transformers/gooaq</a></p></li>
<li><p><a class="reference internal" href="../package_reference/cross_encoder/losses.html#sentence_transformers.cross_encoder.losses.CachedMultipleNegativesRankingLoss" title="sentence_transformers.cross_encoder.losses.CachedMultipleNegativesRankingLoss"><code class="xref py py-class docutils literal notranslate"><span class="pre">CachedMultipleNegativesRankingLoss</span></code></a></p></li>
<li><p><a class="reference internal" href="../package_reference/cross_encoder/evaluation.html#sentence_transformers.cross_encoder.evaluation.CrossEncoderNanoBEIREvaluator" title="sentence_transformers.cross_encoder.evaluation.CrossEncoderNanoBEIREvaluator"><code class="xref py py-class docutils literal notranslate"><span class="pre">CrossEncoderNanoBEIREvaluator</span></code></a></p></li>
<li><p><a class="reference internal" href="../package_reference/cross_encoder/training_args.html#sentence_transformers.cross_encoder.training_args.CrossEncoderTrainingArguments" title="sentence_transformers.cross_encoder.training_args.CrossEncoderTrainingArguments"><code class="xref py py-class docutils literal notranslate"><span class="pre">CrossEncoderTrainingArguments</span></code></a></p></li>
<li><p><a class="reference internal" href="../package_reference/cross_encoder/trainer.html#sentence_transformers.cross_encoder.trainer.CrossEncoderTrainer" title="sentence_transformers.cross_encoder.trainer.CrossEncoderTrainer"><code class="xref py py-class docutils literal notranslate"><span class="pre">CrossEncoderTrainer</span></code></a></p></li>
<li><p><a class="reference internal" href="../package_reference/cross_encoder/trainer.html#sentence_transformers.cross_encoder.trainer.CrossEncoderTrainer.train" title="sentence_transformers.cross_encoder.trainer.CrossEncoderTrainer.train"><code class="xref py py-meth docutils literal notranslate"><span class="pre">CrossEncoderTrainer.train()</span></code></a></p></li>
<li><p><a class="reference internal" href="../package_reference/cross_encoder/cross_encoder.html#sentence_transformers.cross_encoder.CrossEncoder.save_pretrained" title="sentence_transformers.cross_encoder.CrossEncoder.save_pretrained"><code class="xref py py-meth docutils literal notranslate"><span class="pre">CrossEncoder.save_pretrained()</span></code></a></p></li>
<li><p><a class="reference internal" href="../package_reference/cross_encoder/cross_encoder.html#sentence_transformers.cross_encoder.CrossEncoder.push_to_hub" title="sentence_transformers.cross_encoder.CrossEncoder.push_to_hub"><code class="xref py py-meth docutils literal notranslate"><span class="pre">CrossEncoder.push_to_hub()</span></code></a></p></li>
</ol>
</div>
::
import logging
import traceback
from datasets import load_dataset
from sentence_transformers.cross_encoder import (
CrossEncoder,
CrossEncoderModelCardData,
CrossEncoderTrainer,
CrossEncoderTrainingArguments,
)
from sentence_transformers.cross_encoder.evaluation import CrossEncoderNanoBEIREvaluator
from sentence_transformers.cross_encoder.losses import CachedMultipleNegativesRankingLoss
# Minimal example: fine-tune a CrossEncoder reranker on GooAQ question-answer pairs with
# CachedMultipleNegativesRankingLoss, evaluate it with the NanoBEIR evaluator (msmarco, nfcorpus, nq),
# then save the model locally and try to upload it to the Hugging Face Hub.
# Set the log level to INFO to get more information
logging.basicConfig(format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO)
model_name = "microsoft/MiniLM-L12-H384-uncased"
train_batch_size = 64
num_epochs = 1
num_rand_negatives = 5 # How many random negatives should be used for each question-answer pair
# 1a. Load a model to finetune with 1b. (Optional) model card data
model = CrossEncoder(
model_name,
model_card_data=CrossEncoderModelCardData(
language="en",
license="apache-2.0",
model_name="MiniLM-L12-H384 trained on GooAQ",
),
)
print("Model max length:", model.max_length)
print("Model num labels:", model.num_labels)
# 2. Load the GooAQ dataset: https://huggingface.co/datasets/sentence-transformers/gooaq
logging.info("Read the gooaq training dataset")
# Only the first 100,000 training samples are used; 1,000 of those are held out as the evaluation split
full_dataset = load_dataset("sentence-transformers/gooaq", split="train").select(range(100_000))
dataset_dict = full_dataset.train_test_split(test_size=1_000, seed=12)
train_dataset = dataset_dict["train"]
eval_dataset = dataset_dict["test"]
logging.info(train_dataset)
logging.info(eval_dataset)
# 3. Define our training loss.
loss = CachedMultipleNegativesRankingLoss(
model=model,
num_negatives=num_rand_negatives,
mini_batch_size=32, # Informs the memory usage
)
# 4. Use CrossEncoderNanoBEIREvaluator, a light-weight evaluator for English reranking
evaluator = CrossEncoderNanoBEIREvaluator(
dataset_names=["msmarco", "nfcorpus", "nq"],
batch_size=train_batch_size,
)
# Run the evaluator once on the not-yet-finetuned model, before training starts
evaluator(model)
# 5. Define the training arguments
# The run name is built from the part of the model name after the last "/" (if any)
short_model_name = model_name if "/" not in model_name else model_name.split("/")[-1]
run_name = f"reranker-{short_model_name}-gooaq-cmnrl"
args = CrossEncoderTrainingArguments(
# Required parameter:
output_dir=f"models/{run_name}",
# Optional training parameters:
num_train_epochs=num_epochs,
per_device_train_batch_size=train_batch_size,
per_device_eval_batch_size=train_batch_size,
learning_rate=2e-5,
warmup_ratio=0.1,
fp16=False, # Set to False if you get an error that your GPU can't run on FP16
bf16=True, # Set to True if you have a GPU that supports BF16
# Optional tracking/debugging parameters:
eval_strategy="steps",
eval_steps=100,
save_strategy="steps",
save_steps=100,
save_total_limit=2,
logging_steps=50,
logging_first_step=True,
run_name=run_name, # Will be used in W&B if `wandb` is installed
seed=12,
)
# 6. Create the trainer & start training
trainer = CrossEncoderTrainer(
model=model,
args=args,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
loss=loss,
evaluator=evaluator,
)
trainer.train()
# 7. Evaluate the final model, useful to include these in the model card
evaluator(model)
# 8. Save the final model
final_output_dir = f"models/{run_name}/final"
model.save_pretrained(final_output_dir)
# 9. (Optional) save the model to the Hugging Face Hub!
# It is recommended to run `huggingface-cli login` to log into your Hugging Face account first
# A failed upload is only logged (with the traceback and manual upload instructions); it does not abort the script
try:
model.push_to_hub(run_name)
except Exception:
logging.error(
f"Error uploading model to the Hugging Face Hub:\n{traceback.format_exc()}To upload it manually, you can run "
f"`huggingface-cli login`, followed by loading the model using `model = CrossEncoder({final_output_dir!r})` "
f"and saving it using `model.push_to_hub('{run_name}')`."
)
.. tab:: Extensive Example
.. raw:: html
<div class="sidebar">
<p class="sidebar-title">Documentation</p>
<ol class="arabic simple">
<li><p><a class="reference internal" href="../package_reference/cross_encoder/cross_encoder.html#sentence_transformers.cross_encoder.CrossEncoder" title="sentence_transformers.cross_encoder.CrossEncoder"><code class="xref py py-class docutils literal notranslate"><span class="pre">CrossEncoder</span></code></a></p></li>
<li><p><a class="reference internal" href="../package_reference/cross_encoder/cross_encoder.html#sentence_transformers.cross_encoder.model_card.CrossEncoderModelCardData" title="sentence_transformers.cross_encoder.model_card.CrossEncoderModelCardData"><code class="xref py py-class docutils literal notranslate"><span class="pre">CrossEncoderModelCardData</span></code></a></p></li>
<li><p><a class="reference external" href="https://huggingface.co/docs/datasets/main/en/package_reference/loading_methods#datasets.load_dataset" title="(in datasets vmain)"><code class="xref py py-func docutils literal notranslate"><span class="pre">load_dataset()</span></code></a></p></li>
<li><p><a class="reference external" href="https://huggingface.co/datasets/sentence-transformers/gooaq">sentence-transformers/gooaq</a></p></li>
<li><p><a class="reference internal" href="../package_reference/sentence_transformer/SentenceTransformer.html#sentence_transformers.SentenceTransformer" title="sentence_transformers.SentenceTransformer"><code class="xref py py-class docutils literal notranslate"><span class="pre">SentenceTransformer</span></code></a></p></li>
<li><p><a class="reference internal" href="../package_reference/util.html#sentence_transformers.util.mine_hard_negatives" title="sentence_transformers.util.mine_hard_negatives"><code class="xref py py-func docutils literal notranslate"><span class="pre">mine_hard_negatives()</span></code></a></p></li>
<li><p><a class="reference internal" href="../package_reference/cross_encoder/losses.html#sentence_transformers.cross_encoder.losses.BinaryCrossEntropyLoss" title="sentence_transformers.cross_encoder.losses.BinaryCrossEntropyLoss"><code class="xref py py-class docutils literal notranslate"><span class="pre">BinaryCrossEntropyLoss</span></code></a></p></li>
<li><p><a class="reference internal" href="../package_reference/cross_encoder/evaluation.html#sentence_transformers.cross_encoder.evaluation.CrossEncoderNanoBEIREvaluator" title="sentence_transformers.cross_encoder.evaluation.CrossEncoderNanoBEIREvaluator"><code class="xref py py-class docutils literal notranslate"><span class="pre">CrossEncoderNanoBEIREvaluator</span></code></a></p></li>
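<li><p><a class="reference internal" href="../package_reference/cross_encoder/evaluation.html#sentence_transformers.cross_encoder.evaluation.CrossEncoderRerankingEvaluator" title="sentence_transformers.cross_encoder.evaluation.CrossEncoderRerankingEvaluator"><code class="xref py py-class docutils literal notranslate"><span class="pre">CrossEncoderRerankingEvaluator</span></code></a></p></li>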
<li><p><a class="reference internal" href="../package_reference/sentence_transformer/evaluation.html#sentence_transformers.evaluation.SequentialEvaluator" title="sentence_transformers.evaluation.SequentialEvaluator"><code class="xref py py-class docutils literal notranslate"><span class="pre">SequentialEvaluator</span></code></a></p></li>
<li><p><a class="reference internal" href="../package_reference/cross_encoder/training_args.html#sentence_transformers.cross_encoder.training_args.CrossEncoderTrainingArguments" title="sentence_transformers.cross_encoder.training_args.CrossEncoderTrainingArguments"><code class="xref py py-class docutils literal notranslate"><span class="pre">CrossEncoderTrainingArguments</span></code></a></p></li>
<li><p><a class="reference internal" href="../package_reference/cross_encoder/trainer.html#sentence_transformers.cross_encoder.trainer.CrossEncoderTrainer" title="sentence_transformers.cross_encoder.trainer.CrossEncoderTrainer"><code class="xref py py-class docutils literal notranslate"><span class="pre">CrossEncoderTrainer</span></code></a></p></li>
<li><p><a class="reference internal" href="../package_reference/cross_encoder/trainer.html#sentence_transformers.cross_encoder.trainer.CrossEncoderTrainer.train" title="sentence_transformers.cross_encoder.trainer.CrossEncoderTrainer.train"><code class="xref py py-meth docutils literal notranslate"><span class="pre">CrossEncoderTrainer.train()</span></code></a></p></li>
<li><p><a class="reference internal" href="../package_reference/cross_encoder/cross_encoder.html#sentence_transformers.cross_encoder.CrossEncoder.save_pretrained" title="sentence_transformers.cross_encoder.CrossEncoder.save_pretrained"><code class="xref py py-meth docutils literal notranslate"><span class="pre">CrossEncoder.save_pretrained()</span></code></a></p></li>
<li><p><a class="reference internal" href="../package_reference/cross_encoder/cross_encoder.html#sentence_transformers.cross_encoder.CrossEncoder.push_to_hub" title="sentence_transformers.cross_encoder.CrossEncoder.push_to_hub"><code class="xref py py-meth docutils literal notranslate"><span class="pre">CrossEncoder.push_to_hub()</span></code></a></p></li>
</ol>
</div>
::
import logging
import traceback
import torch
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from sentence_transformers.cross_encoder import (
CrossEncoder,
CrossEncoderModelCardData,
CrossEncoderTrainer,
CrossEncoderTrainingArguments,
)
from sentence_transformers.cross_encoder.evaluation import (
CrossEncoderNanoBEIREvaluator,
CrossEncoderRerankingEvaluator,
)
from sentence_transformers.cross_encoder.losses import BinaryCrossEntropyLoss
from sentence_transformers.evaluation import SequentialEvaluator
from sentence_transformers.util import mine_hard_negatives
# Set the log level to INFO to get more information
logging.basicConfig(format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO)
def main():
# Extensive example: fine-tune a CrossEncoder reranker on GooAQ with mined hard negatives and
# BinaryCrossEntropyLoss, evaluate it with a reranking evaluator plus the NanoBEIR evaluator,
# then save the model locally and try to upload it to the Hugging Face Hub.
model_name = "answerdotai/ModernBERT-base"
train_batch_size = 64
num_epochs = 1
num_hard_negatives = 5 # How many hard negatives should be mined for each question-answer pair
# 1a. Load a model to finetune with 1b. (Optional) model card data
model = CrossEncoder(
model_name,
model_card_data=CrossEncoderModelCardData(
language="en",
license="apache-2.0",
model_name="ModernBERT-base trained on GooAQ",
),
)
print("Model max length:", model.max_length)
print("Model num labels:", model.num_labels)
# 2a. Load the GooAQ dataset: https://huggingface.co/datasets/sentence-transformers/gooaq
logging.info("Read the gooaq training dataset")
# Only the first 100,000 training samples are used; 1,000 of those are held out as the evaluation split
full_dataset = load_dataset("sentence-transformers/gooaq", split="train").select(range(100_000))
dataset_dict = full_dataset.train_test_split(test_size=1_000, seed=12)
train_dataset = dataset_dict["train"]
eval_dataset = dataset_dict["test"]
logging.info(train_dataset)
logging.info(eval_dataset)
# 2b. Modify our training dataset to include hard negatives using a very efficient embedding model
embedding_model = SentenceTransformer("sentence-transformers/static-retrieval-mrl-en-v1", device="cpu")
hard_train_dataset = mine_hard_negatives(
train_dataset,
embedding_model,
num_negatives=num_hard_negatives, # How many negatives per question-answer pair
margin=0, # Similarity between query and negative samples should be x lower than query-positive similarity
range_min=0, # Skip the x most similar samples
range_max=100, # Consider only the x most similar samples
sampling_strategy="top", # Sample the top negatives from the range
batch_size=4096, # Use a batch size of 4096 for the embedding model
output_format="labeled-pair", # The output format is (query, passage, label), as required by BinaryCrossEntropyLoss
use_faiss=True,
)
logging.info(hard_train_dataset)
# 2c. (Optionally) Save the hard training dataset to disk
# hard_train_dataset.save_to_disk("gooaq-hard-train")
# Load again with:
# hard_train_dataset = load_from_disk("gooaq-hard-train")
# 3. Define our training loss.
# pos_weight is recommended to be set to the ratio of negatives to positives. Each question-answer pair
# is mined for `num_hard_negatives` negatives above, so that value is used here.
loss = BinaryCrossEntropyLoss(model=model, pos_weight=torch.tensor(num_hard_negatives))
# 4a. Define evaluators. We use the CrossEncoderNanoBEIREvaluator, which is a light-weight evaluator for English reranking
nano_beir_evaluator = CrossEncoderNanoBEIREvaluator(
dataset_names=["msmarco", "nfcorpus", "nq"],
batch_size=train_batch_size,
)
# 4b. Define a reranking evaluator by mining hard negatives given query-answer pairs
# We include the positive answer in the list of negatives, so the evaluator can use the performance of the
# embedding model as a baseline.
hard_eval_dataset = mine_hard_negatives(
eval_dataset,
embedding_model,
corpus=full_dataset["answer"], # Use the full dataset as the corpus
num_negatives=30, # How many documents to rerank
batch_size=4096,
include_positives=True,
output_format="n-tuple",
use_faiss=True,
)
logging.info(hard_eval_dataset)
# Every column from the third one onwards is used as a document to rerank.
# NOTE(review): presumably the first two columns are "question" and "answer" - confirm against mine_hard_negatives.
reranking_evaluator = CrossEncoderRerankingEvaluator(
samples=[
{
"query": sample["question"],
"positive": [sample["answer"]],
"documents": [sample[column_name] for column_name in hard_eval_dataset.column_names[2:]],
}
for sample in hard_eval_dataset
],
batch_size=train_batch_size,
name="gooaq-dev",
# Realistic setting: only rerank the positives that the retriever found
# Set to True to rerank *all* positives
always_rerank_positives=False,
)
# 4c. Combine the evaluators & run the base model on them
evaluator = SequentialEvaluator([reranking_evaluator, nano_beir_evaluator])
evaluator(model)
# 5. Define the training arguments
# The run name is built from the part of the model name after the last "/" (if any)
short_model_name = model_name if "/" not in model_name else model_name.split("/")[-1]
run_name = f"reranker-{short_model_name}-gooaq-bce"
args = CrossEncoderTrainingArguments(
# Required parameter:
output_dir=f"models/{run_name}",
# Optional training parameters:
num_train_epochs=num_epochs,
per_device_train_batch_size=train_batch_size,
per_device_eval_batch_size=train_batch_size,
learning_rate=2e-5,
warmup_ratio=0.1,
fp16=False, # Set to False if you get an error that your GPU can't run on FP16
bf16=True, # Set to True if you have a GPU that supports BF16
dataloader_num_workers=4,
load_best_model_at_end=True,
# NOTE(review): the metric name presumably refers to the reranking evaluator named "gooaq-dev" above - confirm
metric_for_best_model="eval_gooaq-dev_ndcg@10",
# Optional tracking/debugging parameters:
eval_strategy="steps",
eval_steps=1000,
save_strategy="steps",
save_steps=1000,
save_total_limit=2,
logging_steps=200,
logging_first_step=True,
run_name=run_name, # Will be used in W&B if `wandb` is installed
seed=12,
)
# 6. Create the trainer & start training
# Note: no eval_dataset is passed here; only the combined evaluator is provided for evaluation
trainer = CrossEncoderTrainer(
model=model,
args=args,
train_dataset=hard_train_dataset,
loss=loss,
evaluator=evaluator,
)
trainer.train()
# 7. Evaluate the final model, useful to include these in the model card
evaluator(model)
# 8. Save the final model
final_output_dir = f"models/{run_name}/final"
model.save_pretrained(final_output_dir)
# 9. (Optional) save the model to the Hugging Face Hub!
# It is recommended to run `huggingface-cli login` to log into your Hugging Face account first
# A failed upload is only logged (with the traceback and manual upload instructions); it does not abort the script
try:
model.push_to_hub(run_name)
except Exception:
logging.error(
f"Error uploading model to the Hugging Face Hub:\n{traceback.format_exc()}To upload it manually, you can run "
f"`huggingface-cli login`, followed by loading the model using `model = CrossEncoder({final_output_dir!r})` "
f"and saving it using `model.push_to_hub('{run_name}')`."
)
# Only run the training pipeline when this file is executed as a script
if __name__ == "__main__":
main()
```
### Callbacks
```{eval-rst}
This CrossEncoder trainer integrates support for various :class:`transformers.TrainerCallback` subclasses, such as:
- :class:`~transformers.integrations.WandbCallback` to automatically log training metrics to W&B if ``wandb`` is installed
- :class:`~transformers.integrations.TensorBoardCallback` to log training metrics to TensorBoard if ``tensorboard`` is accessible.
- :class:`~transformers.integrations.CodeCarbonCallback` to track the carbon emissions of your model during training if ``codecarbon`` is installed.
- Note: These carbon emissions will be included in your automatically generated model card.
See the Transformers `Callbacks <https://huggingface.co/docs/transformers/main/en/main_classes/callback>`_
documentation for more information on the integrated callbacks and how to write your own callbacks.
```
## Multi-Dataset Training
```{eval-rst}
The top performing models are trained using many datasets at once. Normally, this is rather tricky, as each dataset has a different format. However, :class:`~sentence_transformers.cross_encoder.trainer.CrossEncoderTrainer` can train with multiple datasets without having to convert each dataset to the same format. It can even apply different loss functions to each of the datasets. The steps to train with multiple datasets are:
- Use a dictionary of :class:`~datasets.Dataset` instances (or a :class:`~datasets.DatasetDict`) as the ``train_dataset`` (and optionally also ``eval_dataset``).
- (Optional) Use a dictionary of loss functions mapping dataset names to losses. Only required if you wish to use different loss functions for different datasets.
Each training/evaluation batch will only contain samples from one of the datasets. The order in which batches are sampled from the multiple datasets is defined by the :class:`~sentence_transformers.training_args.MultiDatasetBatchSamplers` enum, which can be passed to the :class:`~sentence_transformers.cross_encoder.training_args.CrossEncoderTrainingArguments` via ``multi_dataset_batch_sampler``. Valid options are:
- ``MultiDatasetBatchSamplers.ROUND_ROBIN``: Round-robin sampling from each dataset until one is exhausted. With this strategy, it’s likely that not all samples from each dataset are used, but each dataset is sampled from equally.
- ``MultiDatasetBatchSamplers.PROPORTIONAL`` (default): Sample from each dataset in proportion to its size. With this strategy, all samples from each dataset are used and larger datasets are sampled from more frequently.
```
## Training Tips
```{eval-rst}
Cross Encoder models have their own unique quirks, so here are some tips to help you out:
#. :class:`~sentence_transformers.cross_encoder.CrossEncoder` models overfit rather quickly, so it's recommended to use an evaluator like :class:`~sentence_transformers.cross_encoder.evaluation.CrossEncoderNanoBEIREvaluator` or :class:`~sentence_transformers.cross_encoder.evaluation.CrossEncoderRerankingEvaluator` together with the ``load_best_model_at_end`` and ``metric_for_best_model`` training arguments to load the model with the best evaluation performance after training.
#. :class:`~sentence_transformers.cross_encoder.CrossEncoder` models are particularly receptive to strong hard negatives (:func:`~sentence_transformers.util.mine_hard_negatives`). They teach the model to be very strict, useful e.g. when distinguishing between passages that answer a question or passages that relate to a question.
a. Note that if you only use hard negatives, `your model may unexpectedly perform worse for easier tasks <https://huggingface.co/papers/2411.11767>`_. This can mean that reranking the top 200 results from a first-stage retrieval system (e.g. with a :class:`~sentence_transformers.SentenceTransformer` model) can actually give worse top-10 results than reranking the top 100. Training using random negatives alongside hard negatives can mitigate this.
#. Don't underestimate :class:`~sentence_transformers.cross_encoder.losses.BinaryCrossEntropyLoss`, it remains a very strong option despite being simpler than learning-to-rank (:class:`~sentence_transformers.cross_encoder.losses.LambdaLoss`, :class:`~sentence_transformers.cross_encoder.losses.ListNetLoss`) or in-batch negatives (:class:`~sentence_transformers.cross_encoder.losses.CachedMultipleNegativesRankingLoss`, :class:`~sentence_transformers.cross_encoder.losses.MultipleNegativesRankingLoss`) losses, and its data is easy to prepare, especially using :func:`~sentence_transformers.util.mine_hard_negatives`.
```
## Deprecated Training
```{eval-rst}
Prior to the Sentence Transformers v4.0 release, models would be trained with the :meth:`CrossEncoder.fit() <sentence_transformers.cr
gitextract_by8kvk5i/
├── .github/
│ └── workflows/
│ ├── quality.yml
│ └── tests.yml
├── .gitignore
├── .pre-commit-config.yaml
├── LICENSE
├── MANIFEST.in
├── Makefile
├── NOTICE.txt
├── README.md
├── docs/
│ ├── .htaccess
│ ├── Makefile
│ ├── _static/
│ │ ├── css/
│ │ │ └── custom.css
│ │ ├── html/
│ │ │ └── models_en_sentence_embeddings.html
│ │ └── js/
│ │ └── custom.js
│ ├── _templates/
│ │ └── layout.html
│ ├── conf.py
│ ├── cross_encoder/
│ │ ├── loss_overview.md
│ │ ├── pretrained_models.md
│ │ ├── training/
│ │ │ └── examples.rst
│ │ ├── training_overview.md
│ │ └── usage/
│ │ ├── efficiency.rst
│ │ └── usage.rst
│ ├── img/
│ │ └── logo.xcf
│ ├── installation.md
│ ├── migration_guide.md
│ ├── package_reference/
│ │ ├── cross_encoder/
│ │ │ ├── cross_encoder.md
│ │ │ ├── evaluation.md
│ │ │ ├── index.rst
│ │ │ ├── losses.md
│ │ │ ├── trainer.md
│ │ │ └── training_args.md
│ │ ├── sentence_transformer/
│ │ │ ├── SentenceTransformer.md
│ │ │ ├── datasets.md
│ │ │ ├── evaluation.md
│ │ │ ├── index.rst
│ │ │ ├── losses.md
│ │ │ ├── models.md
│ │ │ ├── quantization.md
│ │ │ ├── sampler.md
│ │ │ ├── trainer.md
│ │ │ └── training_args.md
│ │ ├── sparse_encoder/
│ │ │ ├── SparseEncoder.md
│ │ │ ├── callbacks.md
│ │ │ ├── evaluation.md
│ │ │ ├── index.rst
│ │ │ ├── losses.md
│ │ │ ├── models.md
│ │ │ ├── search_engines.md
│ │ │ ├── trainer.md
│ │ │ └── training_args.md
│ │ └── util.md
│ ├── pretrained-models/
│ │ ├── ce-msmarco.md
│ │ ├── dpr.md
│ │ ├── msmarco-v1.md
│ │ ├── msmarco-v2.md
│ │ ├── msmarco-v3.md
│ │ ├── msmarco-v5.md
│ │ ├── nli-models.md
│ │ ├── nq-v1.md
│ │ ├── sts-models.md
│ │ └── wikipedia-sections-models.md
│ ├── publications.md
│ ├── quickstart.rst
│ ├── requirements.txt
│ ├── sentence_transformer/
│ │ ├── dataset_overview.md
│ │ ├── loss_overview.md
│ │ ├── pretrained_models.md
│ │ ├── training/
│ │ │ ├── distributed.rst
│ │ │ └── examples.rst
│ │ ├── training_overview.md
│ │ └── usage/
│ │ ├── backend_export_sidebar.rst
│ │ ├── custom_models.rst
│ │ ├── efficiency.rst
│ │ ├── mteb_evaluation.md
│ │ ├── semantic_textual_similarity.rst
│ │ └── usage.rst
│ └── sparse_encoder/
│ ├── loss_overview.md
│ ├── pretrained_models.md
│ ├── training/
│ │ └── examples.rst
│ ├── training_overview.md
│ └── usage/
│ ├── efficiency.rst
│ └── usage.rst
├── examples/
│ ├── cross_encoder/
│ │ ├── applications/
│ │ │ ├── README.md
│ │ │ ├── cross-encoder_reranking.py
│ │ │ └── cross-encoder_usage.py
│ │ └── training/
│ │ ├── README.md
│ │ ├── distillation/
│ │ │ ├── README.md
│ │ │ ├── train_cross_encoder_kd_margin_mse.py
│ │ │ └── train_cross_encoder_kd_mse.py
│ │ ├── ms_marco/
│ │ │ ├── README.md
│ │ │ ├── eval_cross-encoder-trec-dl.py
│ │ │ ├── training_ms_marco_bce.py
│ │ │ ├── training_ms_marco_bce_preprocessed.py
│ │ │ ├── training_ms_marco_cmnrl.py
│ │ │ ├── training_ms_marco_lambda.py
│ │ │ ├── training_ms_marco_lambda_hard_neg.py
│ │ │ ├── training_ms_marco_lambda_preprocessed.py
│ │ │ ├── training_ms_marco_listmle.py
│ │ │ ├── training_ms_marco_listnet.py
│ │ │ ├── training_ms_marco_plistmle.py
│ │ │ └── training_ms_marco_ranknet.py
│ │ ├── nli/
│ │ │ ├── README.md
│ │ │ └── training_nli.py
│ │ ├── quora_duplicate_questions/
│ │ │ ├── README.md
│ │ │ └── training_quora_duplicate_questions.py
│ │ ├── rerankers/
│ │ │ ├── README.md
│ │ │ ├── training_gooaq_bce.py
│ │ │ ├── training_gooaq_cmnrl.py
│ │ │ ├── training_gooaq_lambda.py
│ │ │ └── training_nq_bce.py
│ │ └── sts/
│ │ ├── README.md
│ │ └── training_stsbenchmark.py
│ ├── sentence_transformer/
│ │ ├── README.md
│ │ ├── applications/
│ │ │ ├── README.md
│ │ │ ├── clustering/
│ │ │ │ ├── README.md
│ │ │ │ ├── agglomerative.py
│ │ │ │ ├── fast_clustering.py
│ │ │ │ └── kmeans.py
│ │ │ ├── computing-embeddings/
│ │ │ │ ├── README.rst
│ │ │ │ ├── computing_embeddings.py
│ │ │ │ ├── computing_embeddings_multi_gpu.py
│ │ │ │ └── computing_embeddings_streaming.py
│ │ │ ├── embedding-quantization/
│ │ │ │ ├── README.md
│ │ │ │ ├── semantic_search_faiss.py
│ │ │ │ ├── semantic_search_faiss_benchmark.py
│ │ │ │ ├── semantic_search_recommended.py
│ │ │ │ ├── semantic_search_usearch.py
│ │ │ │ └── semantic_search_usearch_benchmark.py
│ │ │ ├── image-search/
│ │ │ │ ├── Image_Classification.ipynb
│ │ │ │ ├── Image_Clustering.ipynb
│ │ │ │ ├── Image_Duplicates.ipynb
│ │ │ │ ├── Image_Search-multilingual.ipynb
│ │ │ │ ├── Image_Search.ipynb
│ │ │ │ ├── README.md
│ │ │ │ └── example.py
│ │ │ ├── parallel-sentence-mining/
│ │ │ │ ├── README.md
│ │ │ │ ├── bitext_mining.py
│ │ │ │ ├── bitext_mining_utils.py
│ │ │ │ └── bucc2018.py
│ │ │ ├── paraphrase-mining/
│ │ │ │ └── README.md
│ │ │ ├── retrieve_rerank/
│ │ │ │ ├── README.md
│ │ │ │ ├── in_document_search_crossencoder.py
│ │ │ │ └── retrieve_rerank_simple_wikipedia.ipynb
│ │ │ ├── semantic-search/
│ │ │ │ ├── README.md
│ │ │ │ ├── semantic_search.py
│ │ │ │ ├── semantic_search_nq_opensearch.py
│ │ │ │ ├── semantic_search_publications.py
│ │ │ │ ├── semantic_search_quora_annoy.py
│ │ │ │ ├── semantic_search_quora_elasticsearch.py
│ │ │ │ ├── semantic_search_quora_faiss.py
│ │ │ │ ├── semantic_search_quora_hnswlib.py
│ │ │ │ ├── semantic_search_quora_pytorch.py
│ │ │ │ └── semantic_search_wikipedia_qa.py
│ │ │ └── text-summarization/
│ │ │ ├── LexRank.py
│ │ │ ├── README.md
│ │ │ └── text-summarization.py
│ │ ├── domain_adaptation/
│ │ │ └── README.md
│ │ ├── evaluation/
│ │ │ ├── evaluation_inference_speed.py
│ │ │ ├── evaluation_no_dup_batch_sampler_speed.py
│ │ │ ├── evaluation_stsbenchmark.py
│ │ │ └── evaluation_translation_matching.py
│ │ ├── training/
│ │ │ ├── README.md
│ │ │ ├── adaptive_layer/
│ │ │ │ ├── README.md
│ │ │ │ ├── adaptive_layer_nli.py
│ │ │ │ └── adaptive_layer_sts.py
│ │ │ ├── avg_word_embeddings/
│ │ │ │ ├── training_stsbenchmark_avg_word_embeddings.py
│ │ │ │ ├── training_stsbenchmark_bilstm.py
│ │ │ │ ├── training_stsbenchmark_bow.py
│ │ │ │ ├── training_stsbenchmark_cnn.py
│ │ │ │ └── training_stsbenchmark_tf-idf_word_embeddings.py
│ │ │ ├── clip/
│ │ │ │ ├── train_clip.ipynb
│ │ │ │ └── training_clip_flickr8k_mlflow.py
│ │ │ ├── data_augmentation/
│ │ │ │ ├── README.md
│ │ │ │ ├── train_sts_indomain_bm25.py
│ │ │ │ ├── train_sts_indomain_nlpaug.py
│ │ │ │ ├── train_sts_indomain_semantic.py
│ │ │ │ ├── train_sts_qqp_crossdomain.py
│ │ │ │ └── train_sts_seed_optimization.py
│ │ │ ├── distillation/
│ │ │ │ ├── README.md
│ │ │ │ ├── dimensionality_reduction.py
│ │ │ │ ├── model_distillation.py
│ │ │ │ ├── model_distillation_layer_reduction.py
│ │ │ │ └── model_quantization.py
│ │ │ ├── hpo/
│ │ │ │ ├── README.rst
│ │ │ │ └── hpo_nli.py
│ │ │ ├── matryoshka/
│ │ │ │ ├── 2d_matryoshka_nli.py
│ │ │ │ ├── 2d_matryoshka_sts.py
│ │ │ │ ├── README.md
│ │ │ │ ├── matryoshka_eval_stsb.py
│ │ │ │ ├── matryoshka_nli.py
│ │ │ │ ├── matryoshka_nli_reduced_dim.py
│ │ │ │ └── matryoshka_sts.py
│ │ │ ├── ms_marco/
│ │ │ │ ├── README.md
│ │ │ │ ├── eval_msmarco.py
│ │ │ │ ├── multilingual/
│ │ │ │ │ ├── README.md
│ │ │ │ │ └── translate_queries.py
│ │ │ │ ├── train-kldiv.py
│ │ │ │ ├── train-margin-mse.py
│ │ │ │ ├── train_bi-encoder_margin-mse.py
│ │ │ │ └── train_bi-encoder_mnrl.py
│ │ │ ├── multilingual/
│ │ │ │ ├── README.md
│ │ │ │ ├── get_parallel_data_opus.py
│ │ │ │ ├── get_parallel_data_talks.py
│ │ │ │ ├── get_parallel_data_tatoeba.py
│ │ │ │ ├── get_parallel_data_wikimatrix.py
│ │ │ │ └── make_multilingual.py
│ │ │ ├── nli/
│ │ │ │ ├── README.md
│ │ │ │ ├── training_nli.py
│ │ │ │ ├── training_nli_angle.py
│ │ │ │ ├── training_nli_v2.py
│ │ │ │ └── training_nli_v3.py
│ │ │ ├── other/
│ │ │ │ ├── training_batch_hard_trec.py
│ │ │ │ ├── training_gooaq_infonce_gor.py
│ │ │ │ ├── training_multi-task.py
│ │ │ │ └── training_wikipedia_sections.py
│ │ │ ├── paraphrases/
│ │ │ │ ├── README.md
│ │ │ │ └── training.py
│ │ │ ├── peft/
│ │ │ │ ├── README.md
│ │ │ │ └── training_gooaq_lora.py
│ │ │ ├── prompts/
│ │ │ │ ├── README.md
│ │ │ │ └── training_nq_prompts.py
│ │ │ ├── quora_duplicate_questions/
│ │ │ │ ├── README.md
│ │ │ │ ├── application_duplicate_questions_mining.py
│ │ │ │ ├── create_splits.py
│ │ │ │ ├── training_MultipleNegativesRankingLoss.py
│ │ │ │ ├── training_OnlineContrastiveLoss.py
│ │ │ │ └── training_multi-task-learning.py
│ │ │ ├── sts/
│ │ │ │ ├── README.md
│ │ │ │ ├── training_stsbenchmark.py
│ │ │ │ └── training_stsbenchmark_continue_training.py
│ │ │ └── unsloth/
│ │ │ ├── README.md
│ │ │ ├── training_gooaq_unsloth.py
│ │ │ └── training_medical_unsloth.py
│ │ └── unsupervised_learning/
│ │ ├── CT/
│ │ │ ├── README.md
│ │ │ ├── train_askubuntu_ct.py
│ │ │ ├── train_ct_from_file.py
│ │ │ └── train_stsb_ct.py
│ │ ├── CT_In-Batch_Negatives/
│ │ │ ├── README.md
│ │ │ ├── train_askubuntu_ct-improved.py
│ │ │ ├── train_ct-improved_from_file.py
│ │ │ └── train_stsb_ct-improved.py
│ │ ├── MLM/
│ │ │ ├── README.md
│ │ │ └── train_mlm.py
│ │ ├── README.md
│ │ ├── SimCSE/
│ │ │ ├── README.md
│ │ │ ├── train_askubuntu_simcse.py
│ │ │ ├── train_simcse_from_file.py
│ │ │ └── train_stsb_simcse.py
│ │ ├── TSDAE/
│ │ │ ├── README.md
│ │ │ ├── eval_askubuntu.py
│ │ │ ├── train_askubuntu_tsdae.py
│ │ │ ├── train_stsb_tsdae.py
│ │ │ └── train_tsdae_from_file.py
│ │ └── query_generation/
│ │ ├── 1_programming_query_generation.py
│ │ ├── 2_programming_train_bi-encoder.py
│ │ ├── 3_programming_semantic_search.py
│ │ ├── README.md
│ │ └── example_query_generation.py
│ └── sparse_encoder/
│ ├── applications/
│ │ ├── README.md
│ │ ├── computing_embeddings/
│ │ │ ├── README.rst
│ │ │ └── compute_embeddings.py
│ │ ├── retrieve_rerank/
│ │ │ ├── README.md
│ │ │ ├── hybrid_search.py
│ │ │ └── retrieve_rerank_simple_wikipedia.ipynb
│ │ ├── semantic_search/
│ │ │ ├── README.md
│ │ │ ├── semantic_search_elasticsearch.py
│ │ │ ├── semantic_search_manual.py
│ │ │ ├── semantic_search_opensearch.py
│ │ │ ├── semantic_search_qdrant.py
│ │ │ ├── semantic_search_seismic.py
│ │ │ └── semantic_search_splade_index.py
│ │ └── semantic_textual_similarity/
│ │ ├── README.md
│ │ └── semantic_textual_similarity.py
│ ├── evaluation/
│ │ ├── README.md
│ │ ├── sparse_classification_evaluator.py
│ │ ├── sparse_mse_evaluator.py
│ │ ├── sparse_nanobeir_advanced_evaluator.py
│ │ ├── sparse_nanobeir_evaluator.py
│ │ ├── sparse_reranking_evaluator.py
│ │ ├── sparse_retrieval_evaluator.py
│ │ ├── sparse_similarity_evaluator.py
│ │ ├── sparse_translation_evaluator.py
│ │ └── sparse_triplet_evaluator.py
│ └── training/
│ ├── README.md
│ ├── distillation/
│ │ ├── README.md
│ │ └── train_splade_msmarco_margin_mse.py
│ ├── ms_marco/
│ │ ├── README.md
│ │ └── train_splade_msmarco_mnrl.py
│ ├── nli/
│ │ ├── README.md
│ │ └── train_splade_nli.py
│ ├── peft/
│ │ └── train_splade_gooaq_peft.py
│ ├── quora_duplicate_questions/
│ │ ├── README.md
│ │ └── training_splade_quora.py
│ ├── retrievers/
│ │ ├── README.md
│ │ ├── train_csr_nq.py
│ │ ├── train_splade_gooaq.py
│ │ ├── train_splade_nq.py
│ │ └── train_splade_nq_cached.py
│ └── sts/
│ ├── README.md
│ └── train_splade_stsbenchmark.py
├── index.rst
├── pyproject.toml
├── sentence_transformers/
│ ├── LoggingHandler.py
│ ├── SentenceTransformer.py
│ ├── __init__.py
│ ├── backend/
│ │ ├── __init__.py
│ │ ├── load.py
│ │ ├── optimize.py
│ │ ├── quantize.py
│ │ └── utils.py
│ ├── cross_encoder/
│ │ ├── CrossEncoder.py
│ │ ├── __init__.py
│ │ ├── data_collator.py
│ │ ├── evaluation/
│ │ │ ├── __init__.py
│ │ │ ├── classification.py
│ │ │ ├── correlation.py
│ │ │ ├── deprecated.py
│ │ │ ├── nano_beir.py
│ │ │ └── reranking.py
│ │ ├── fit_mixin.py
│ │ ├── losses/
│ │ │ ├── BinaryCrossEntropyLoss.py
│ │ │ ├── CachedMultipleNegativesRankingLoss.py
│ │ │ ├── CrossEntropyLoss.py
│ │ │ ├── LambdaLoss.py
│ │ │ ├── ListMLELoss.py
│ │ │ ├── ListNetLoss.py
│ │ │ ├── MSELoss.py
│ │ │ ├── MarginMSELoss.py
│ │ │ ├── MultipleNegativesRankingLoss.py
│ │ │ ├── PListMLELoss.py
│ │ │ ├── RankNetLoss.py
│ │ │ └── __init__.py
│ │ ├── model_card.py
│ │ ├── model_card_template.md
│ │ ├── trainer.py
│ │ ├── training_args.py
│ │ └── util.py
│ ├── data_collator.py
│ ├── datasets/
│ │ ├── DenoisingAutoEncoderDataset.py
│ │ ├── NoDuplicatesDataLoader.py
│ │ ├── ParallelSentencesDataset.py
│ │ ├── SentenceLabelDataset.py
│ │ ├── SentencesDataset.py
│ │ └── __init__.py
│ ├── evaluation/
│ │ ├── BinaryClassificationEvaluator.py
│ │ ├── EmbeddingSimilarityEvaluator.py
│ │ ├── InformationRetrievalEvaluator.py
│ │ ├── LabelAccuracyEvaluator.py
│ │ ├── MSEEvaluator.py
│ │ ├── MSEEvaluatorFromDataFrame.py
│ │ ├── NanoBEIREvaluator.py
│ │ ├── ParaphraseMiningEvaluator.py
│ │ ├── RerankingEvaluator.py
│ │ ├── SentenceEvaluator.py
│ │ ├── SequentialEvaluator.py
│ │ ├── SimilarityFunction.py
│ │ ├── TranslationEvaluator.py
│ │ ├── TripletEvaluator.py
│ │ └── __init__.py
│ ├── fit_mixin.py
│ ├── losses/
│ │ ├── AdaptiveLayerLoss.py
│ │ ├── AnglELoss.py
│ │ ├── BatchAllTripletLoss.py
│ │ ├── BatchHardSoftMarginTripletLoss.py
│ │ ├── BatchHardTripletLoss.py
│ │ ├── BatchSemiHardTripletLoss.py
│ │ ├── CachedGISTEmbedLoss.py
│ │ ├── CachedMultipleNegativesRankingLoss.py
│ │ ├── CachedMultipleNegativesSymmetricRankingLoss.py
│ │ ├── CoSENTLoss.py
│ │ ├── ContrastiveLoss.py
│ │ ├── ContrastiveTensionLoss.py
│ │ ├── CosineSimilarityLoss.py
│ │ ├── DenoisingAutoEncoderLoss.py
│ │ ├── DistillKLDivLoss.py
│ │ ├── GISTEmbedLoss.py
│ │ ├── GlobalOrthogonalRegularizationLoss.py
│ │ ├── MSELoss.py
│ │ ├── MarginMSELoss.py
│ │ ├── Matryoshka2dLoss.py
│ │ ├── MatryoshkaLoss.py
│ │ ├── MegaBatchMarginLoss.py
│ │ ├── MultipleNegativesRankingLoss.py
│ │ ├── MultipleNegativesSymmetricRankingLoss.py
│ │ ├── OnlineContrastiveLoss.py
│ │ ├── SoftmaxLoss.py
│ │ ├── TripletLoss.py
│ │ └── __init__.py
│ ├── model_card.py
│ ├── model_card_template.md
│ ├── model_card_templates.py
│ ├── models/
│ │ ├── BoW.py
│ │ ├── CLIPModel.py
│ │ ├── CNN.py
│ │ ├── Dense.py
│ │ ├── Dropout.py
│ │ ├── InputModule.py
│ │ ├── LSTM.py
│ │ ├── LayerNorm.py
│ │ ├── Module.py
│ │ ├── Normalize.py
│ │ ├── Pooling.py
│ │ ├── Router.py
│ │ ├── StaticEmbedding.py
│ │ ├── Transformer.py
│ │ ├── WeightedLayerPooling.py
│ │ ├── WordEmbeddings.py
│ │ ├── WordWeights.py
│ │ ├── __init__.py
│ │ └── tokenizer/
│ │ ├── PhraseTokenizer.py
│ │ ├── WhitespaceTokenizer.py
│ │ ├── WordTokenizer.py
│ │ └── __init__.py
│ ├── peft_mixin.py
│ ├── py.typed
│ ├── quantization.py
│ ├── readers/
│ │ ├── InputExample.py
│ │ ├── LabelSentenceReader.py
│ │ ├── NLIDataReader.py
│ │ ├── PairedFilesReader.py
│ │ ├── STSDataReader.py
│ │ ├── TripletReader.py
│ │ └── __init__.py
│ ├── sampler.py
│ ├── similarity_functions.py
│ ├── sparse_encoder/
│ │ ├── SparseEncoder.py
│ │ ├── __init__.py
│ │ ├── callbacks/
│ │ │ ├── __init__.py
│ │ │ └── splade_callbacks.py
│ │ ├── data_collator.py
│ │ ├── evaluation/
│ │ │ ├── ReciprocalRankFusionEvaluator.py
│ │ │ ├── SparseBinaryClassificationEvaluator.py
│ │ │ ├── SparseEmbeddingSimilarityEvaluator.py
│ │ │ ├── SparseInformationRetrievalEvaluator.py
│ │ │ ├── SparseMSEEvaluator.py
│ │ │ ├── SparseNanoBEIREvaluator.py
│ │ │ ├── SparseRerankingEvaluator.py
│ │ │ ├── SparseTranslationEvaluator.py
│ │ │ ├── SparseTripletEvaluator.py
│ │ │ └── __init__.py
│ │ ├── losses/
│ │ │ ├── CSRLoss.py
│ │ │ ├── CachedSpladeLoss.py
│ │ │ ├── FlopsLoss.py
│ │ │ ├── SparseAnglELoss.py
│ │ │ ├── SparseCoSENTLoss.py
│ │ │ ├── SparseCosineSimilarityLoss.py
│ │ │ ├── SparseDistillKLDivLoss.py
│ │ │ ├── SparseMSELoss.py
│ │ │ ├── SparseMarginMSELoss.py
│ │ │ ├── SparseMultipleNegativesRankingLoss.py
│ │ │ ├── SparseTripletLoss.py
│ │ │ ├── SpladeLoss.py
│ │ │ └── __init__.py
│ │ ├── model_card.py
│ │ ├── model_card_template.md
│ │ ├── models/
│ │ │ ├── MLMTransformer.py
│ │ │ ├── SparseAutoEncoder.py
│ │ │ ├── SparseStaticEmbedding.py
│ │ │ ├── SpladePooling.py
│ │ │ └── __init__.py
│ │ ├── search_engines.py
│ │ ├── trainer.py
│ │ └── training_args.py
│ ├── trainer.py
│ ├── training_args.py
│ └── util/
│ ├── __init__.py
│ ├── decorators.py
│ ├── distributed.py
│ ├── environment.py
│ ├── file_io.py
│ ├── hard_negatives.py
│ ├── misc.py
│ ├── retrieval.py
│ ├── similarity.py
│ └── tensor.py
└── tests/
├── __init__.py
├── conftest.py
├── cross_encoder/
│ ├── __init__.py
│ ├── conftest.py
│ ├── test_backends.py
│ ├── test_cross_encoder.py
│ ├── test_deprecated_imports.py
│ ├── test_model_card.py
│ ├── test_multi_process.py
│ ├── test_pretrained.py
│ ├── test_train_stsb.py
│ └── test_trainer.py
├── evaluation/
│ ├── test_binary_classification_evaluator.py
│ ├── test_information_retrieval_evaluator.py
│ ├── test_label_accuracy_evaluator.py
│ ├── test_nanobeir_evaluator.py
│ ├── test_paraphrase_mining_evaluator.py
│ └── test_triplet_evaluator.py
├── losses/
│ └── test_MatryoshkaLoss.py
├── models/
│ ├── __init__.py
│ ├── test_dense.py
│ ├── test_pooling.py
│ ├── test_router.py
│ ├── test_static_embedding.py
│ └── test_transformer.py
├── samplers/
│ ├── test_group_by_label_batch_sampler.py
│ ├── test_no_duplicates_batch_sampler.py
│ └── test_round_robin_batch_sampler.py
├── sparse_encoder/
│ ├── __init__.py
│ ├── conftest.py
│ ├── models/
│ │ ├── __init__.py
│ │ ├── test_csr.py
│ │ └── test_sparse_static_embedding.py
│ ├── test_backends.py
│ ├── test_model_card.py
│ ├── test_multi_process.py
│ ├── test_opensearch_models.py
│ ├── test_pretrained.py
│ ├── test_sparse_encoder.py
│ ├── test_train_stsb.py
│ ├── test_trainer.py
│ └── utils.py
├── test_backends.py
├── test_cmnrl.py
├── test_compute_embeddings.py
├── test_custom_models.py
├── test_image_embeddings.py
├── test_model_card.py
├── test_model_card_data.py
├── test_multi_process.py
├── test_pretrained.py
├── test_pretrained_stsb.py
├── test_sentence_transformer.py
├── test_train_stsb.py
├── test_trainer.py
├── test_training_args.py
├── util/
│ ├── test_hard_negatives.py
│ ├── test_import.py
│ ├── test_retrieval.py
│ ├── test_similarity.py
│ └── test_tensor.py
└── utils.py
SYMBOL INDEX (253 symbols across 68 files)
FILE: docs/_static/js/custom.js
function addGithubButton (line 1) | function addGithubButton() {
function parseGithubButtons (line 28) | function parseGithubButtons (){"use strict";var e=window.document,t=e.lo...
function onLoad (line 32) | function onLoad() {
FILE: docs/conf.py
function linkcode_resolve (line 123) | def linkcode_resolve(domain, info):
function visit_download_reference (line 159) | def visit_download_reference(self, node):
function setup (line 182) | def setup(app: Sphinx):
FILE: examples/cross_encoder/training/distillation/train_cross_encoder_kd_margin_mse.py
function main (line 13) | def main():
FILE: examples/cross_encoder/training/distillation/train_cross_encoder_kd_mse.py
function main (line 13) | def main():
FILE: examples/cross_encoder/training/ms_marco/training_ms_marco_bce.py
function main (line 15) | def main():
FILE: examples/cross_encoder/training/ms_marco/training_ms_marco_bce_preprocessed.py
function main (line 14) | def main():
FILE: examples/cross_encoder/training/ms_marco/training_ms_marco_cmnrl.py
function main (line 19) | def main():
FILE: examples/cross_encoder/training/ms_marco/training_ms_marco_lambda.py
function main (line 15) | def main():
FILE: examples/cross_encoder/training/ms_marco/training_ms_marco_lambda_hard_neg.py
function main (line 16) | def main():
FILE: examples/cross_encoder/training/ms_marco/training_ms_marco_lambda_preprocessed.py
function main (line 15) | def main():
FILE: examples/cross_encoder/training/ms_marco/training_ms_marco_listmle.py
function main (line 14) | def main():
FILE: examples/cross_encoder/training/ms_marco/training_ms_marco_listnet.py
function main (line 14) | def main():
FILE: examples/cross_encoder/training/ms_marco/training_ms_marco_plistmle.py
function main (line 14) | def main():
FILE: examples/cross_encoder/training/ms_marco/training_ms_marco_ranknet.py
function main (line 17) | def main():
FILE: examples/cross_encoder/training/rerankers/training_gooaq_bce.py
function main (line 23) | def main():
FILE: examples/cross_encoder/training/rerankers/training_gooaq_lambda.py
function main (line 22) | def main():
FILE: examples/cross_encoder/training/rerankers/training_nq_bce.py
function main (line 23) | def main():
FILE: examples/sentence_transformer/applications/embedding-quantization/semantic_search_recommended.py
function search (line 68) | def search(query, top_k: int = 10, rescore_multiplier: int = 4):
FILE: examples/sentence_transformer/applications/parallel-sentence-mining/bitext_mining_utils.py
function score (line 18) | def score(x, y, fwd_mean, bwd_mean, margin):
function score_candidates (line 22) | def score_candidates(x, y, candidate_inds, fwd_mean, bwd_mean, margin):
function kNN (line 31) | def kNN(x, y, k, use_ann_search=False, ann_num_clusters=32768, ann_num_c...
function file_open (line 52) | def file_open(filepath):
FILE: examples/sentence_transformer/applications/semantic-search/semantic_search_nq_opensearch.py
function create_and_ingest_index (line 51) | def create_and_ingest_index(os_client, index_name, corpus, embeddings):
FILE: examples/sentence_transformer/applications/semantic-search/semantic_search_publications.py
function search_papers (line 39) | def search_papers(title, abstract):
FILE: examples/sentence_transformer/applications/text-summarization/LexRank.py
function degree_centrality_scores (line 15) | def degree_centrality_scores(
function _power_method (line 43) | def _power_method(transition_matrix, increase_power=True, max_iter=10000):
function connected_nodes (line 66) | def connected_nodes(matrix):
function create_markov_matrix (line 78) | def create_markov_matrix(weights_matrix):
function create_markov_matrix_discrete (line 92) | def create_markov_matrix_discrete(weights_matrix, threshold):
function stationary_distribution (line 100) | def stationary_distribution(
FILE: examples/sentence_transformer/evaluation/evaluation_no_dup_batch_sampler_speed.py
function run_sampler (line 58) | def run_sampler(
function parse_args (line 168) | def parse_args() -> argparse.Namespace:
function _iter_texts (line 207) | def _iter_texts(value: object) -> list[str]:
function _format_bytes (line 214) | def _format_bytes(value: int) -> str:
class _RssReport (line 225) | class _RssReport:
method __init__ (line 226) | def __init__(self, start_rss: int, end_rss: int, peak_rss: int) -> None:
class _RssSampler (line 234) | class _RssSampler:
method __init__ (line 237) | def __init__(self, interval: float = 0.1) -> None:
method _total_rss (line 245) | def _total_rss(self) -> int:
method _run (line 261) | def _run(self) -> None:
method start (line 268) | def start(self) -> None:
method stop (line 274) | def stop(self) -> None:
method report (line 282) | def report(self) -> _RssReport:
class _UssReport (line 286) | class _UssReport:
method __init__ (line 287) | def __init__(self, start_uss: int, end_uss: int, peak_uss: int) -> None:
class _UssSampler (line 295) | class _UssSampler:
method __init__ (line 298) | def __init__(self, interval: float = 0.1) -> None:
method _total_uss (line 306) | def _total_uss(self) -> int:
method _run (line 322) | def _run(self) -> None:
method start (line 329) | def start(self) -> None:
method stop (line 335) | def stop(self) -> None:
method report (line 343) | def report(self) -> _UssReport:
function _dup_stats (line 347) | def _dup_stats(dataset: Dataset, show_progress: bool, desc: str) -> tupl...
function compute_uniqueness (line 388) | async def compute_uniqueness(
function _load_hf_dataset (line 412) | def _load_hf_dataset(name: str, subset: str | None, split: str) -> Dataset:
function main (line 419) | def main() -> None:
FILE: examples/sentence_transformer/training/clip/training_clip_flickr8k_mlflow.py
function to_binary_flickr8k (line 25) | def to_binary_flickr8k(batch: dict) -> dict:
FILE: examples/sentence_transformer/training/data_augmentation/train_sts_seed_optimization.py
class SeedTestingEarlyStoppingCallback (line 106) | class SeedTestingEarlyStoppingCallback(TrainerCallback):
method __init__ (line 107) | def __init__(self, num_steps_until_stop: int):
method on_step_end (line 110) | def on_step_end(
FILE: examples/sentence_transformer/training/distillation/model_distillation.py
function combine_sentences (line 66) | def combine_sentences(batch):
function deduplicate (line 76) | def deduplicate(dataset):
function map_embeddings (line 145) | def map_embeddings(batch):
FILE: examples/sentence_transformer/training/distillation/model_distillation_layer_reduction.py
function combine_sentences (line 85) | def combine_sentences(batch):
function deduplicate (line 95) | def deduplicate(dataset):
function map_embeddings (line 129) | def map_embeddings(batch):
FILE: examples/sentence_transformer/training/distillation/model_quantization.py
function evaluate (line 111) | def evaluate(model: SentenceTransformer, name: str) -> None:
FILE: examples/sentence_transformer/training/hpo/hpo_nli.py
function hpo_search_space (line 28) | def hpo_search_space(trial):
function hpo_model_init (line 38) | def hpo_model_init(trial):
function hpo_loss_init (line 43) | def hpo_loss_init(model):
function hpo_compute_objective (line 48) | def hpo_compute_objective(metrics):
FILE: examples/sentence_transformer/training/matryoshka/matryoshka_eval_stsb.py
function _grouped_barplot_ratios (line 23) | def _grouped_barplot_ratios(group_name_to_x_to_y: dict[str, dict[int, fl...
function plot_across_dimensions (line 72) | def plot_across_dimensions(
FILE: examples/sentence_transformer/training/ms_marco/train-kldiv.py
function main (line 23) | def main():
FILE: examples/sentence_transformer/training/ms_marco/train-margin-mse.py
function main (line 23) | def main():
FILE: examples/sentence_transformer/training/ms_marco/train_bi-encoder_margin-mse.py
class MSMARCODataset (line 197) | class MSMARCODataset(Dataset):
method __init__ (line 198) | def __init__(self, queries, corpus, ce_scores):
method __getitem__ (line 209) | def __getitem__(self, item):
method __len__ (line 233) | def __len__(self):
FILE: examples/sentence_transformer/training/ms_marco/train_bi-encoder_mnrl.py
class MSMARCODataset (line 217) | class MSMARCODataset(Dataset):
method __init__ (line 218) | def __init__(self, queries, corpus):
method __getitem__ (line 228) | def __getitem__(self, item):
method __len__ (line 242) | def __len__(self):
FILE: examples/sentence_transformer/training/multilingual/make_multilingual.py
function prepare_dataset (line 128) | def prepare_dataset(batch):
FILE: examples/sentence_transformer/training/other/training_batch_hard_trec.py
function trec_dataset (line 37) | def trec_dataset(
function triplets_from_labeled_dataset (line 58) | def triplets_from_labeled_dataset(dataset):
FILE: examples/sentence_transformer/training/other/training_gooaq_infonce_gor.py
class InfoNCEGORLoss (line 40) | class InfoNCEGORLoss(torch.nn.Module):
method __init__ (line 49) | def __init__(self, model: SentenceTransformer, similarity_fct=cos_sim,...
method forward (line 55) | def forward(
FILE: examples/sentence_transformer/training/quora_duplicate_questions/create_splits.py
function get_duplicate_set (line 389) | def get_duplicate_set(ids_set):
function write_qids (line 454) | def write_qids(name, ids_list):
function write_mining_files (line 466) | def write_mining_files(name, ids, dups):
FILE: examples/sentence_transformer/unsupervised_learning/CT/train_askubuntu_ct.py
function to_ct_pairs (line 51) | def to_ct_pairs(sample, pos_neg_ratio=8):
FILE: examples/sentence_transformer/unsupervised_learning/CT/train_ct_from_file.py
function to_ct_pairs (line 75) | def to_ct_pairs(sample, pos_neg_ratio=8):
FILE: examples/sentence_transformer/unsupervised_learning/CT/train_stsb_ct.py
function to_ct_pairs (line 36) | def to_ct_pairs(sample, pos_neg_ratio=8):
FILE: examples/sentence_transformer/unsupervised_learning/MLM/train_mlm.py
class TokenizedSentencesDataset (line 80) | class TokenizedSentencesDataset:
method __init__ (line 81) | def __init__(self, sentences, tokenizer, max_length, cache_tokenizatio...
method __getitem__ (line 87) | def __getitem__(self, item):
method __len__ (line 107) | def __len__(self):
FILE: examples/sentence_transformer/unsupervised_learning/SimCSE/train_stsb_simcse.py
function simcse_map (line 42) | def simcse_map(example):
FILE: examples/sentence_transformer/unsupervised_learning/TSDAE/train_askubuntu_tsdae.py
function noise_transform (line 51) | def noise_transform(batch, del_ratio=0.6):
FILE: examples/sentence_transformer/unsupervised_learning/TSDAE/train_stsb_tsdae.py
function noise_transform (line 39) | def noise_transform(batch, del_ratio=0.6):
FILE: examples/sentence_transformer/unsupervised_learning/TSDAE/train_tsdae_from_file.py
function noise_transform (line 57) | def noise_transform(batch, del_ratio=0.6):
FILE: examples/sparse_encoder/applications/computing_embeddings/compute_embeddings.py
function get_memory_size (line 77) | def get_memory_size(tensor):
FILE: examples/sparse_encoder/training/distillation/train_splade_msmarco_margin_mse.py
function main (line 26) | def main():
FILE: examples/sparse_encoder/training/ms_marco/train_splade_msmarco_mnrl.py
function main (line 28) | def main():
FILE: examples/sparse_encoder/training/nli/train_splade_nli.py
function main (line 31) | def main():
FILE: examples/sparse_encoder/training/peft/train_splade_gooaq_peft.py
function main (line 32) | def main():
FILE: examples/sparse_encoder/training/quora_duplicate_questions/training_splade_quora.py
function main (line 31) | def main():
FILE: examples/sparse_encoder/training/retrievers/train_csr_nq.py
function main (line 32) | def main():
FILE: examples/sparse_encoder/training/retrievers/train_splade_gooaq.py
function main (line 30) | def main():
FILE: examples/sparse_encoder/training/retrievers/train_splade_nq.py
function main (line 30) | def main():
FILE: examples/sparse_encoder/training/retrievers/train_splade_nq_cached.py
function main (line 30) | def main():
FILE: examples/sparse_encoder/training/sts/train_splade_stsbenchmark.py
function main (line 30) | def main():
FILE: sentence_transformers/LoggingHandler.py
class LoggingHandler (line 8) | class LoggingHandler(logging.Handler):
method __init__ (line 9) | def __init__(self, level=logging.NOTSET) -> None:
method emit (line 12) | def emit(self, record) -> None:
function install_logger (line 23) | def install_logger(given_logger, level=logging.WARNING, fmt="%(levelname...
FILE: sentence_transformers/SentenceTransformer.py
class SentenceTransformer (line 61) | class SentenceTransformer(nn.Sequential, FitMixin, PeftAdapterMixin):
method __init__ (line 167) | def __init__(
method get_backend (line 408) | def get_backend(self) -> Literal["torch", "onnx", "openvino"]:
method get_model_kwargs (line 416) | def get_model_kwargs(self) -> list[str]:
method encode_query (line 446) | def encode_query(
method encode_document (line 581) | def encode_document(
method encode (line 721) | def encode(
method encode (line 743) | def encode(
method encode (line 765) | def encode(
method encode (line 786) | def encode(
method encode (line 807) | def encode(
method encode (line 828) | def encode(
method encode (line 849) | def encode(
method encode (line 869) | def encode(
method forward (line 1179) | def forward(self, input: dict[str, Tensor], **kwargs) -> dict[str, Ten...
method similarity_fn_name (line 1197) | def similarity_fn_name(self) -> Literal["cosine", "dot", "euclidean", ...
method similarity_fn_name (line 1214) | def similarity_fn_name(
method similarity (line 1226) | def similarity(self, embeddings1: Tensor, embeddings2: Tensor) -> Tens...
method similarity (line 1229) | def similarity(self, embeddings1: npt.NDArray[np.float32], embeddings2...
method similarity (line 1232) | def similarity(self) -> Callable[[Tensor | npt.NDArray[np.float32], Te...
method similarity_pairwise (line 1276) | def similarity_pairwise(self, embeddings1: Tensor, embeddings2: Tensor...
method similarity_pairwise (line 1279) | def similarity_pairwise(
method similarity_pairwise (line 1284) | def similarity_pairwise(
method start_multi_process_pool (line 1322) | def start_multi_process_pool(
method stop_multi_process_pool (line 1372) | def stop_multi_process_pool(pool: dict[Literal["input", "output", "pro...
method encode_multi_process (line 1396) | def encode_multi_process(
method _encode_multi_process (line 1487) | def _encode_multi_process(
method _encode_multi_process_worker (line 1553) | def _encode_multi_process_worker(
method set_pooling_include_prompt (line 1576) | def set_pooling_include_prompt(self, include_prompt: bool) -> None:
method _get_prompt_length (line 1594) | def _get_prompt_length(self, prompt: str, **kwargs) -> int:
method get_max_seq_length (line 1615) | def get_max_seq_length(self) -> int | None:
method tokenize (line 1627) | def tokenize(self, texts: list[str] | list[dict] | list[tuple[str, str...
method get_sentence_features (line 1643) | def get_sentence_features(self, *features) -> dict[Literal["sentence_e...
method get_sentence_embedding_dimension (line 1646) | def get_sentence_embedding_dimension(self) -> int | None:
method truncate_sentence_embeddings (line 1666) | def truncate_sentence_embeddings(self, truncate_dim: int | None) -> It...
method _first_module (line 1695) | def _first_module(self) -> torch.nn.Module:
method _last_module (line 1699) | def _last_module(self) -> torch.nn.Module:
method save (line 1703) | def save(
method save_pretrained (line 1796) | def save_pretrained(
method _update_default_model_id (line 1824) | def _update_default_model_id(self, model_card):
method _create_model_card (line 1832) | def _create_model_card(
method save_to_hub (line 1872) | def save_to_hub(
method push_to_hub (line 1936) | def push_to_hub(
method _text_length (line 2054) | def _text_length(self, text: list[int] | list[list[int]]) -> int:
method evaluate (line 2070) | def evaluate(self, evaluator: SentenceEvaluator, output_path: str | No...
method _load_auto_model (line 2085) | def _load_auto_model(
method _load_module_class_from_ref (line 2143) | def _load_module_class_from_ref(
method _load_sbert_model (line 2171) | def _load_sbert_model(
method load (line 2354) | def load(input_path) -> SentenceTransformer:
method device (line 2358) | def device(self) -> device:
method tokenizer (line 2386) | def tokenizer(self) -> Any:
method tokenizer (line 2393) | def tokenizer(self, value) -> None:
method max_seq_length (line 2400) | def max_seq_length(self) -> int:
method max_seq_length (line 2419) | def max_seq_length(self, value) -> None:
method transformers_model (line 2426) | def transformers_model(self) -> PreTrainedModel | None:
method _target_device (line 2453) | def _target_device(self) -> torch.device:
method _target_device (line 2460) | def _target_device(self, device: int | str | torch.device | None = Non...
method dtype (line 2464) | def dtype(self) -> torch.dtype | None:
method _no_split_modules (line 2471) | def _no_split_modules(self) -> list[str]:
method _keys_to_ignore_on_save (line 2478) | def _keys_to_ignore_on_save(self) -> list[str]:
method gradient_checkpointing_enable (line 2484) | def gradient_checkpointing_enable(self, gradient_checkpointing_kwargs=...
method _get_model_type (line 2490) | def _get_model_type(
FILE: sentence_transformers/backend/load.py
function load_onnx_model (line 14) | def load_onnx_model(model_name_or_path: str, config: PretrainedConfig, t...
function load_openvino_model (line 90) | def load_openvino_model(model_name_or_path: str, config: PretrainedConfi...
FILE: sentence_transformers/backend/optimize.py
function export_optimized_onnx_model (line 19) | def export_optimized_onnx_model(
FILE: sentence_transformers/backend/quantize.py
function export_dynamic_quantized_onnx_model (line 24) | def export_dynamic_quantized_onnx_model(
function export_static_quantized_openvino_model (line 124) | def export_static_quantized_openvino_model(
FILE: sentence_transformers/backend/utils.py
function _save_pretrained_wrapper (line 21) | def _save_pretrained_wrapper(_save_pretrained_fn: Callable, subfolder: s...
function backend_should_export (line 40) | def backend_should_export(
function backend_warn_to_save (line 138) | def backend_warn_to_save(model_name_or_path: str, is_local: bool, backen...
function save_or_push_to_hub_model (line 155) | def save_or_push_to_hub_model(
FILE: sentence_transformers/cross_encoder/CrossEncoder.py
function _save_pretrained_wrapper (line 46) | def _save_pretrained_wrapper(_save_pretrained_fn: Callable, subfolder: s...
class CrossEncoder (line 54) | class CrossEncoder(nn.Module, PushToHubMixin, FitMixin):
method __init__ (line 125) | def __init__(
method _load_model (line 235) | def _load_model(
method get_backend (line 265) | def get_backend(self) -> Literal["torch", "onnx", "openvino"]:
method start_multi_process_pool (line 273) | def start_multi_process_pool(
method stop_multi_process_pool (line 324) | def stop_multi_process_pool(pool: dict[Literal["input", "output", "pro...
method _multi_process (line 344) | def _multi_process(
method _multi_process_worker (line 423) | def _multi_process_worker(
method set_activation_fn (line 460) | def set_activation_fn(self, activation_fn: Callable | None, set_defaul...
method get_default_activation_fn (line 469) | def get_default_activation_fn(self) -> Callable:
method set_config_value (line 495) | def set_config_value(self, key: str, value) -> None:
method config (line 511) | def config(self) -> PretrainedConfig:
method num_labels (line 515) | def num_labels(self) -> int:
method max_length (line 519) | def max_length(self) -> int:
method max_length (line 523) | def max_length(self, value: int) -> None:
method default_activation_function (line 531) | def default_activation_function(self) -> Callable:
method forward (line 534) | def forward(self, *args, **kwargs):
method predict (line 538) | def predict(
method predict (line 553) | def predict(
method predict (line 568) | def predict(
method predict (line 583) | def predict(
method predict (line 599) | def predict(
method rank (line 744) | def rank(
method save (line 856) | def save(self, path: str, *, safe_serialization: bool = True, **kwargs...
method save_pretrained (line 869) | def save_pretrained(self, path: str, *, safe_serialization: bool = Tru...
method _create_model_card (line 884) | def _create_model_card(self, path: str) -> None:
method push_to_hub (line 919) | def push_to_hub(
method transformers_model (line 1000) | def transformers_model(self) -> PreTrainedModel | None:
method _target_device (line 1024) | def _target_device(self) -> torch.device:
method _target_device (line 1031) | def _target_device(self, device: int | str | torch.device | None = Non...
method device (line 1035) | def device(self) -> torch.device:
method gradient_checkpointing_enable (line 1038) | def gradient_checkpointing_enable(self, gradient_checkpointing_kwargs=...
FILE: sentence_transformers/cross_encoder/data_collator.py
class CrossEncoderDataCollator (line 13) | class CrossEncoderDataCollator(SentenceTransformerDataCollator):
method __call__ (line 29) | def __call__(self, features: list[dict[str, Any]]) -> dict[str, torch....
FILE: sentence_transformers/cross_encoder/evaluation/deprecated.py
class CEBinaryAccuracyEvaluator (line 16) | class CEBinaryAccuracyEvaluator(CrossEncoderClassificationEvaluator):
method from_input_examples (line 22) | def from_input_examples(cls, examples: list[InputExample], **kwargs):
class CEBinaryClassificationEvaluator (line 37) | class CEBinaryClassificationEvaluator(CrossEncoderClassificationEvaluator):
method from_input_examples (line 43) | def from_input_examples(cls, examples: list[InputExample], **kwargs):
class CEF1Evaluator (line 58) | class CEF1Evaluator(CrossEncoderClassificationEvaluator):
method from_input_examples (line 64) | def from_input_examples(cls, examples: list[InputExample], **kwargs):
class CESoftmaxAccuracyEvaluator (line 79) | class CESoftmaxAccuracyEvaluator(CrossEncoderClassificationEvaluator):
method from_input_examples (line 85) | def from_input_examples(cls, examples: list[InputExample], **kwargs):
class CECorrelationEvaluator (line 99) | class CECorrelationEvaluator(CrossEncoderCorrelationEvaluator):
class CERerankingEvaluator (line 107) | class CERerankingEvaluator(CrossEncoderRerankingEvaluator):
FILE: sentence_transformers/cross_encoder/losses/CachedMultipleNegativesRankingLoss.py
class RandContext (line 16) | class RandContext:
method __init__ (line 24) | def __init__(self, *tensors) -> None:
method __enter__ (line 28) | def __enter__(self) -> None:
method __exit__ (line 34) | def __exit__(self, exc_type, exc_val, exc_tb) -> None:
function _backward_hook (line 39) | def _backward_hook(
class CachedMultipleNegativesRankingLoss (line 62) | class CachedMultipleNegativesRankingLoss(MultipleNegativesRankingLoss):
method __init__ (line 63) | def __init__(
method predict_minibatch (line 186) | def predict_minibatch(
method predict_minibatch_iter (line 202) | def predict_minibatch_iter(
method calculate_loss_and_cache_gradients (line 230) | def calculate_loss_and_cache_gradients(self, logits: list[Tensor], bat...
method forward (line 240) | def forward(self, inputs: list[list[str]], labels: Tensor) -> Tensor:
method get_config_dict (line 280) | def get_config_dict(self):
FILE: sentence_transformers/cross_encoder/losses/CrossEntropyLoss.py
class CrossEntropyLoss (line 8) | class CrossEntropyLoss(nn.Module):
method __init__ (line 9) | def __init__(self, model: CrossEncoder, activation_fn: nn.Module = nn....
method forward (line 67) | def forward(self, inputs: list[list[str]], labels: Tensor) -> Tensor:
Copy disabled (too large)
Download .json
Condensed preview — 537 files, each showing path, character count, and a content snippet. Download the .json file for the full structured content (15,897K chars).
[
{
"path": ".github/workflows/quality.yml",
"chars": 677,
"preview": "name: Quality\n\non:\n push:\n branches:\n - main\n - \"*-release\"\n - \"*-pre\"\n pull_request:\n branches:\n"
},
{
"path": ".github/workflows/tests.yml",
"chars": 2211,
"preview": "name: Unit tests\n\non:\n push:\n branches:\n - main\n - \"*-release\"\n pull_request:\n branches:\n - main\n"
},
{
"path": ".gitignore",
"chars": 755,
"preview": "# Distribution / packaging\n.Python\nbuild/\ndevelop-eggs/\ndist/\ndownloads/\neggs/\n.eggs/\nlib/\nlib64/\nparts/\nsdist/\nvar/\nwhe"
},
{
"path": ".pre-commit-config.yaml",
"chars": 170,
"preview": "repos:\n - repo: https://github.com/astral-sh/ruff-pre-commit\n rev: v0.14.5\n hooks:\n - id: ruff\n args:"
},
{
"path": "LICENSE",
"chars": 11338,
"preview": " Apache License\n Version 2.0, January 2004\n "
},
{
"path": "MANIFEST.in",
"chars": 187,
"preview": "include sentence_transformers/model_card_template.md\ninclude sentence_transformers/cross_encoder/model_card_template.md\n"
},
{
"path": "Makefile",
"chars": 509,
"preview": "\n.PHONY: check\ncheck: ## Run code quality tools.\n\t@echo \"Linting code via pre-commit\"\n\t@pre-commit run -a\n\n.PHONY: test\n"
},
{
"path": "NOTICE.txt",
"chars": 329,
"preview": "-------------------------------------------------------------------------------\r\nSentence Transformers\r\n\r\nCopyright 2019"
},
{
"path": "README.md",
"chars": 14202,
"preview": "<!--- BADGES: START --->\n\n[](https://huggingface.co"
},
{
"path": "docs/.htaccess",
"chars": 7217,
"preview": "RewriteEngine On\nRewriteCond %{HTTPS} !=on\nRewriteRule ^ https://%{HTTP_HOST}%{REQUEST_URI} [L,R=301]\n\n# Moved main page"
},
{
"path": "docs/Makefile",
"chars": 83,
"preview": "\ndocs:\n\tsphinx-build -c . -a -E .. _build\n\ndocs-quick:\n\tsphinx-build -c . .. _build"
},
{
"path": "docs/_static/css/custom.css",
"chars": 2487,
"preview": ".wy-nav-content {\n max-width: 1280px;\n}\n\na.icon-home {\n font-size: 1.4em;\n}\n\ndl.class > dt {\n width: 100%;\n}\n\nd"
},
{
"path": "docs/_static/html/models_en_sentence_embeddings.html",
"chars": 49736,
"preview": "<!DOCTYPE html>\n<html lang=\"en\">\n<head>\n <meta charset=\"utf-8\">\n <meta name=\"viewport\" content=\"width=device-width"
},
{
"path": "docs/_static/js/custom.js",
"chars": 11687,
"preview": "function addGithubButton() {\n const div = `\n <div class=\"github-repo\">\n <div style=\"display: flex; "
},
{
"path": "docs/_templates/layout.html",
"chars": 397,
"preview": "{% extends \"!layout.html\" %}\n{% block extrahead %}\n<!-- Privacy-friendly analytics by Plausible -->\n<script async src=\"h"
},
{
"path": "docs/conf.py",
"chars": 5748,
"preview": "# Configuration file for the Sphinx documentation builder.\n#\n# This file only contains a selection of the most common op"
},
{
"path": "docs/cross_encoder/loss_overview.md",
"chars": 11629,
"preview": "# Loss Overview\n\n## Loss Table\n\nLoss functions play a critical role in the performance of your fine-tuned Cross Encoder "
},
{
"path": "docs/cross_encoder/pretrained_models.md",
"chars": 8662,
"preview": "# Pretrained Models\n\n```{eval-rst}\nWe have released various pre-trained Cross Encoder models via our Cross Encoder Huggi"
},
{
"path": "docs/cross_encoder/training/examples.rst",
"chars": 583,
"preview": "\nTraining Examples\n=================\n\n.. toctree::\n :maxdepth: 1\n :caption: Supervised Learning\n\n ../../../example"
},
{
"path": "docs/cross_encoder/training_overview.md",
"chars": 71530,
"preview": "# Training Overview\n\n## Why Finetune?\n\nCross Encoder models are very often used as 2nd stage rerankers in a [Retrieve an"
},
{
"path": "docs/cross_encoder/usage/efficiency.rst",
"chars": 28420,
"preview": "\nSpeeding up Inference\n=====================\n\nSentence Transformers supports 3 backends for performing inference with Cr"
},
{
"path": "docs/cross_encoder/usage/usage.rst",
"chars": 4415,
"preview": "\nUsage\n=====\n\nCharacteristics of Cross Encoder (a.k.a reranker) models:\n\n1. Calculates a **similarity score** given **pa"
},
{
"path": "docs/installation.md",
"chars": 4942,
"preview": "# Installation\n\nWe recommend **Python 3.10+**, **[PyTorch 1.11.0+](https://pytorch.org/get-started/locally/)**, and **[t"
},
{
"path": "docs/migration_guide.md",
"chars": 72629,
"preview": "# Migration Guide\n\n## Migrating from v4.x to v5.x\n\n```{eval-rst}\nThe v5 Sentence Transformers release introduced :class:"
},
{
"path": "docs/package_reference/cross_encoder/cross_encoder.md",
"chars": 998,
"preview": "# CrossEncoder\n\n## CrossEncoder\n\nFor an introduction to Cross-Encoders, see [Cross-Encoders](../../cross_encoder/usage/u"
},
{
"path": "docs/package_reference/cross_encoder/evaluation.md",
"chars": 714,
"preview": "# Evaluation\n\nCrossEncoder have their own evaluation classes in `sentence_transformers.cross_encoder.evaluation`.\n\n## Cr"
},
{
"path": "docs/package_reference/cross_encoder/index.rst",
"chars": 113,
"preview": "\nCross Encoder\n=============\n\n.. toctree::\n\n cross_encoder\n trainer\n training_args\n losses\n evaluation\n"
},
{
"path": "docs/package_reference/cross_encoder/losses.md",
"chars": 2403,
"preview": "# Losses\n\n`sentence_transformers.cross_encoder.losses` defines different loss functions that can be used to fine-tune cr"
},
{
"path": "docs/package_reference/cross_encoder/trainer.md",
"chars": 518,
"preview": "# Trainer\n\n## CrossEncoderTrainer\n\n```{eval-rst}\n.. autoclass:: sentence_transformers.cross_encoder.trainer.CrossEncoder"
},
{
"path": "docs/package_reference/cross_encoder/training_args.md",
"chars": 207,
"preview": "# Training Arguments\n\n## CrossEncoderTrainingArguments\n\n```{eval-rst}\n.. autoclass:: sentence_transformers.cross_encoder"
},
{
"path": "docs/package_reference/sentence_transformer/SentenceTransformer.md",
"chars": 1038,
"preview": "# SentenceTransformer\n\n## SentenceTransformer\n\n```{eval-rst}\n.. autoclass:: sentence_transformers.SentenceTransformer\n "
},
{
"path": "docs/package_reference/sentence_transformer/datasets.md",
"chars": 1890,
"preview": "# Datasets\n\n```{eval-rst}\n\n.. note::\n The ``sentence_transformers.datasets`` classes have been deprecated, and only e"
},
{
"path": "docs/package_reference/sentence_transformer/evaluation.md",
"chars": 1395,
"preview": "# Evaluation\n\n`sentence_transformers.evaluation` defines different classes, that can be used to evaluate the model durin"
},
{
"path": "docs/package_reference/sentence_transformer/index.rst",
"chars": 181,
"preview": "\nSentence Transformer\n====================\n\n.. toctree::\n\n SentenceTransformer\n trainer\n training_args\n losses\n "
},
{
"path": "docs/package_reference/sentence_transformer/losses.md",
"chars": 4465,
"preview": "# Losses\n\n`sentence_transformers.losses` defines different loss functions that can be used to fine-tune embedding models"
},
{
"path": "docs/package_reference/sentence_transformer/models.md",
"chars": 1465,
"preview": "# Modules\n\n`sentence_transformers.models` defines different building blocks, a.k.a. Modules, that can be used to create "
},
{
"path": "docs/package_reference/sentence_transformer/quantization.md",
"chars": 723,
"preview": "# quantization\n\n`sentence_transformers.quantization` defines different helpful functions to perform embedding quantizati"
},
{
"path": "docs/package_reference/sentence_transformer/sampler.md",
"chars": 885,
"preview": "# Samplers\n\n## BatchSamplers\n\n```{eval-rst}\n.. autoclass:: sentence_transformers.training_args.BatchSamplers\n :member"
},
{
"path": "docs/package_reference/sentence_transformer/trainer.md",
"chars": 518,
"preview": "# Trainer\n\n## SentenceTransformerTrainer\n\n```{eval-rst}\n.. autoclass:: sentence_transformers.trainer.SentenceTransformer"
},
{
"path": "docs/package_reference/sentence_transformer/training_args.md",
"chars": 207,
"preview": "# Training Arguments\n\n## SentenceTransformerTrainingArguments\n\n```{eval-rst}\n.. autoclass:: sentence_transformers.traini"
},
{
"path": "docs/package_reference/sparse_encoder/SparseEncoder.md",
"chars": 1102,
"preview": "# SparseEncoder\n\n## SparseEncoder\n```{eval-rst}\n.. autoclass:: sentence_transformers.sparse_encoder.SparseEncoder\n :me"
},
{
"path": "docs/package_reference/sparse_encoder/callbacks.md",
"chars": 196,
"preview": "# Callbacks\n\n## SpladeRegularizerWeightSchedulerCallback\n\n```{eval-rst}\n.. autoclass:: sentence_transformers.sparse_enco"
},
{
"path": "docs/package_reference/sparse_encoder/evaluation.md",
"chars": 1441,
"preview": "# Evaluation\n\n`sentence_transformers.sparse_encoder.evaluation` defines different classes, that can be used to evaluate "
},
{
"path": "docs/package_reference/sparse_encoder/index.rst",
"chars": 188,
"preview": "Sparse Encoder\n=============\n\n.. toctree::\n\n SparseEncoder\n trainer\n training_args\n losses\n ../sentence_transf"
},
{
"path": "docs/package_reference/sparse_encoder/losses.md",
"chars": 2791,
"preview": "# Losses\n\n`sentence_transformers.sparse_encoder.losses` defines different loss functions that can be used to fine-tune s"
},
{
"path": "docs/package_reference/sparse_encoder/models.md",
"chars": 918,
"preview": "# Modules\n\n`sentence_transformers.sparse_encoder.models` defines different building blocks, that can be used to create S"
},
{
"path": "docs/package_reference/sparse_encoder/search_engines.md",
"chars": 401,
"preview": "# Search Engines\n`sentence_transformers.sparse_encoder.search_engines` defines different helpful functions to integrate "
},
{
"path": "docs/package_reference/sparse_encoder/trainer.md",
"chars": 509,
"preview": "# Trainer\n\n## SparseEncoderTrainer\n```{eval-rst}\n.. autoclass:: sentence_transformers.sparse_encoder.SparseEncoderTraine"
},
{
"path": "docs/package_reference/sparse_encoder/training_args.md",
"chars": 207,
"preview": "# Training Arguments\n\n## SparseEncoderTrainingArguments\n```{eval-rst}\n.. autoclass:: sentence_transformers.sparse_encode"
},
{
"path": "docs/package_reference/util.md",
"chars": 791,
"preview": "# util\n\n`sentence_transformers.util` defines different helpful functions to work with text embeddings.\n\n## Helper Functi"
},
{
"path": "docs/pretrained-models/ce-msmarco.md",
"chars": 2849,
"preview": "# MS MARCO Cross-Encoders\n\n[MS MARCO](https://microsoft.github.io/msmarco/) is a large scale information retrieval corpu"
},
{
"path": "docs/pretrained-models/dpr.md",
"chars": 1876,
"preview": "# DPR-Models\nIn [Dense Passage Retrieval for Open-Domain Question Answering](https://huggingface.co/papers/2004.04906) "
},
{
"path": "docs/pretrained-models/msmarco-v1.md",
"chars": 1273,
"preview": "# MSMARCO Models\n[MS MARCO](https://microsoft.github.io/msmarco/) is a large scale information retrieval corpus that was"
},
{
"path": "docs/pretrained-models/msmarco-v2.md",
"chars": 1842,
"preview": "# MSMARCO Models (Version 2)\n[MS MARCO](https://microsoft.github.io/msmarco/) is a large scale information retrieval cor"
},
{
"path": "docs/pretrained-models/msmarco-v3.md",
"chars": 4268,
"preview": "# MSMARCO Models \n[MS MARCO](https://microsoft.github.io/msmarco/) is a large scale information retrieval corpus that wa"
},
{
"path": "docs/pretrained-models/msmarco-v5.md",
"chars": 5394,
"preview": "# MSMARCO Models \n[MS MARCO](https://microsoft.github.io/msmarco/) is a large scale information retrieval corpus that wa"
},
{
"path": "docs/pretrained-models/nli-models.md",
"chars": 2217,
"preview": "# NLI Models\nConneau et al., 2017, show in the InferSent-Paper ([Supervised Learning of Universal Sentence Representatio"
},
{
"path": "docs/pretrained-models/nq-v1.md",
"chars": 1188,
"preview": "# Natural Questions Models\n[Google's Natural Questions dataset](https://ai.google.com/research/NaturalQuestions) consist"
},
{
"path": "docs/pretrained-models/sts-models.md",
"chars": 1707,
"preview": "# STS Models\nThe models were first trained on [NLI data](nli-models.md), then we fine-tuned them on the STS benchmark da"
},
{
"path": "docs/pretrained-models/wikipedia-sections-models.md",
"chars": 2118,
"preview": "# Wikipedia Sections Models\nThe `wikipedia-sections-models` implement the idea from Ein Dor et al., 2018, [Learning Them"
},
{
"path": "docs/publications.md",
"chars": 8761,
"preview": "# Publications\n\nIf you find this repository helpful, feel free to cite our publication [Sentence-BERT: Sentence Embeddin"
},
{
"path": "docs/quickstart.rst",
"chars": 11237,
"preview": "Quickstart\n==========\n\nSentence Transformer\n--------------------\n\nCharacteristics of Sentence Transformer (a.k.a bi-enco"
},
{
"path": "docs/requirements.txt",
"chars": 189,
"preview": "sphinx==8.1.3\nJinja2==3.1.6\nmyst-parser==4.0.0\nsphinx_markdown_tables==0.0.17\nsphinx-copybutton==0.5.2\nsphinx_inline_tab"
},
{
"path": "docs/sentence_transformer/dataset_overview.md",
"chars": 22484,
"preview": "# Dataset Overview\n\n```{eval-rst}\n.. hint::\n\n **Quickstart:** Find `curated datasets <https://huggingface.co/collectio"
},
{
"path": "docs/sentence_transformer/loss_overview.md",
"chars": 16988,
"preview": "# Loss Overview\n\n## Loss Table\n\nLoss functions play a critical role in the performance of your fine-tuned model. Sadly, "
},
{
"path": "docs/sentence_transformer/pretrained_models.md",
"chars": 15752,
"preview": "# Pretrained Models\n\n```{eval-rst}\nWe provide various pre-trained Sentence Transformers models via our Sentence Transfor"
},
{
"path": "docs/sentence_transformer/training/distributed.rst",
"chars": 4556,
"preview": "\nDistributed Training\n====================\n\nSentence Transformers implements two forms of distributed training: Data Par"
},
{
"path": "docs/sentence_transformer/training/examples.rst",
"chars": 1340,
"preview": "\nTraining Examples\n=================\n\n.. toctree::\n :maxdepth: 1\n :caption: Supervised Learning\n\n ../../../example"
},
{
"path": "docs/sentence_transformer/training_overview.md",
"chars": 52074,
"preview": "# Training Overview\n\n## Why Finetune?\n\nFinetuning Sentence Transformer models often heavily improves the performance of "
},
{
"path": "docs/sentence_transformer/usage/backend_export_sidebar.rst",
"chars": 311,
"preview": ".. sidebar:: Export, Optimize, and Quantize Hugging Face models\n\n This Hugging Face Space provides a user interface fo"
},
{
"path": "docs/sentence_transformer/usage/custom_models.rst",
"chars": 17567,
"preview": "Creating Custom Models\n=======================\n\nStructure of Sentence Transformer Models\n-------------------------------"
},
{
"path": "docs/sentence_transformer/usage/efficiency.rst",
"chars": 27102,
"preview": "\nSpeeding up Inference\n=====================\n\nSentence Transformers supports 3 backends for computing embeddings, each w"
},
{
"path": "docs/sentence_transformer/usage/mteb_evaluation.md",
"chars": 6592,
"preview": "# Evaluation with MTEB\n\nThe [Massive Text Embedding Benchmark (MTEB)](https://github.com/embeddings-benchmark/mteb) is a"
},
{
"path": "docs/sentence_transformer/usage/semantic_textual_similarity.rst",
"chars": 5845,
"preview": "Semantic Textual Similarity\n===========================\n\nFor Semantic Textual Similarity (STS), we want to produce embed"
},
{
"path": "docs/sentence_transformer/usage/usage.rst",
"chars": 2525,
"preview": "\nUsage\n=====\n\nCharacteristics of Sentence Transformer (a.k.a bi-encoder) models:\n\n1. Calculates a **fixed-size vector re"
},
{
"path": "docs/sparse_encoder/loss_overview.md",
"chars": 13586,
"preview": "# Loss Overview\n\n```{eval-rst}\n.. warning:: \n To train a :class:`~sentence_transformers.sparse_encoder.SparseEncoder`"
},
{
"path": "docs/sparse_encoder/pretrained_models.md",
"chars": 8788,
"preview": "# Pretrained Models\n\n```{eval-rst}\nSeveral Sparse Encoder models have been publicly released on the Hugging Face Hub:\n\n*"
},
{
"path": "docs/sparse_encoder/training/examples.rst",
"chars": 680,
"preview": "Training Examples\n================\n\nThis page provides examples showing how to train Sparse Encoder models for various t"
},
{
"path": "docs/sparse_encoder/training_overview.md",
"chars": 65615,
"preview": "# Training Overview\n\n## Why Finetune?\nFinetuning Sparse Encoder models often heavily improves the performance of the mod"
},
{
"path": "docs/sparse_encoder/usage/efficiency.rst",
"chars": 25826,
"preview": "\nSpeeding up Inference\n=====================\n\nSentence Transformers supports 3 backends for computing sparse embeddings "
},
{
"path": "docs/sparse_encoder/usage/usage.rst",
"chars": 2520,
"preview": "Usage\n=====\n\nCharacteristics of Sparse Encoder models:\n\n1. Calculates **sparse vector representations** where most dimen"
},
{
"path": "examples/cross_encoder/applications/README.md",
"chars": 3668,
"preview": "# Cross-Encoders\n\nSentenceTransformers also supports to load Cross-Encoders for sentence pair scoring and sentence pair "
},
{
"path": "examples/cross_encoder/applications/cross-encoder_reranking.py",
"chars": 8588,
"preview": "\"\"\"\nThis script contains an example how to perform re-ranking with a Cross-Encoder for semantic search.\n\nFirst, we use a"
},
{
"path": "examples/cross_encoder/applications/cross-encoder_usage.py",
"chars": 1473,
"preview": "\"\"\"\nThis example computes the score between a query and all possible\nsentences in a corpus using a Cross-Encoder for sem"
},
{
"path": "examples/cross_encoder/training/README.md",
"chars": 1266,
"preview": "# Training\n\nThis folder contains various examples to fine-tune `CrossEncoder` models for specific tasks.\n\nFor the beginn"
},
{
"path": "examples/cross_encoder/training/distillation/README.md",
"chars": 6226,
"preview": "# Model Distillation\n\nModel distillation refers to training an (often smaller) student model to mimic the behaviour of a"
},
{
"path": "examples/cross_encoder/training/distillation/train_cross_encoder_kd_margin_mse.py",
"chars": 5493,
"preview": "import logging\nimport traceback\n\nfrom datasets import load_dataset, load_from_disk\n\nfrom sentence_transformers.cross_enc"
},
{
"path": "examples/cross_encoder/training/distillation/train_cross_encoder_kd_mse.py",
"chars": 5353,
"preview": "import logging\nimport traceback\n\nfrom datasets import load_dataset, load_from_disk\n\nfrom sentence_transformers.cross_enc"
},
{
"path": "examples/cross_encoder/training/ms_marco/README.md",
"chars": 9509,
"preview": "# MS MARCO\n\n[MS MARCO Passage Ranking](https://github.com/microsoft/MSMARCO-Passage-Ranking) is a large dataset to train"
},
{
"path": "examples/cross_encoder/training/ms_marco/eval_cross-encoder-trec-dl.py",
"chars": 4011,
"preview": "\"\"\"\nThis file evaluates CrossEncoder on the TREC 2019 Deep Learning (DL) Track: https://huggingface.co/papers/2003.07820"
},
{
"path": "examples/cross_encoder/training/ms_marco/training_ms_marco_bce.py",
"chars": 4693,
"preview": "import logging\nimport traceback\n\nimport torch\nfrom datasets import load_dataset\n\nfrom sentence_transformers.cross_encode"
},
{
"path": "examples/cross_encoder/training/ms_marco/training_ms_marco_bce_preprocessed.py",
"chars": 5584,
"preview": "import logging\nimport traceback\n\nimport torch\nfrom datasets import load_dataset, load_from_disk\n\nfrom sentence_transform"
},
{
"path": "examples/cross_encoder/training/ms_marco/training_ms_marco_cmnrl.py",
"chars": 5434,
"preview": "import logging\nimport traceback\nfrom collections import defaultdict\n\nimport torch\nfrom datasets import load_dataset\nfrom"
},
{
"path": "examples/cross_encoder/training/ms_marco/training_ms_marco_lambda.py",
"chars": 6419,
"preview": "import logging\nimport traceback\nfrom datetime import datetime\n\nimport torch\nfrom datasets import load_dataset\n\nfrom sent"
},
{
"path": "examples/cross_encoder/training/ms_marco/training_ms_marco_lambda_hard_neg.py",
"chars": 8516,
"preview": "import logging\nimport traceback\nfrom datetime import datetime\n\nimport torch\nfrom datasets import Dataset, concatenate_da"
},
{
"path": "examples/cross_encoder/training/ms_marco/training_ms_marco_lambda_preprocessed.py",
"chars": 5463,
"preview": "import logging\nimport traceback\nfrom datetime import datetime\n\nimport torch\nfrom datasets import load_dataset\n\nfrom sent"
},
{
"path": "examples/cross_encoder/training/ms_marco/training_ms_marco_listmle.py",
"chars": 6261,
"preview": "import logging\nimport traceback\n\nimport torch\nfrom datasets import load_dataset\n\nfrom sentence_transformers.cross_encode"
},
{
"path": "examples/cross_encoder/training/ms_marco/training_ms_marco_listnet.py",
"chars": 6220,
"preview": "import logging\nimport traceback\n\nimport torch\nfrom datasets import load_dataset\n\nfrom sentence_transformers.cross_encode"
},
{
"path": "examples/cross_encoder/training/ms_marco/training_ms_marco_plistmle.py",
"chars": 6830,
"preview": "import logging\nimport traceback\n\nimport torch\nfrom datasets import load_dataset\n\nfrom sentence_transformers.cross_encode"
},
{
"path": "examples/cross_encoder/training/ms_marco/training_ms_marco_ranknet.py",
"chars": 6393,
"preview": "from __future__ import annotations\n\nimport logging\nimport traceback\nfrom datetime import datetime\n\nimport torch\nfrom dat"
},
{
"path": "examples/cross_encoder/training/nli/README.md",
"chars": 3116,
"preview": "# Natural Language Inference\n\nGiven two sentence (premise and hypothesis), Natural Language Inference (NLI) is the task "
},
{
"path": "examples/cross_encoder/training/nli/training_nli.py",
"chars": 4459,
"preview": "\"\"\"\nThis examples trains a CrossEncoder for the NLI task. A CrossEncoder takes a sentence pair\nas input and outputs a la"
},
{
"path": "examples/cross_encoder/training/quora_duplicate_questions/README.md",
"chars": 3793,
"preview": "# Quora Duplicate Questions\n\n```{eval-rst}\nThis folder contains scripts that demonstrate how to train SentenceTransforme"
},
{
"path": "examples/cross_encoder/training/quora_duplicate_questions/training_quora_duplicate_questions.py",
"chars": 4462,
"preview": "\"\"\"\nThis examples trains a CrossEncoder for the Quora Duplicate Questions Detection task. A CrossEncoder takes a sentenc"
},
{
"path": "examples/cross_encoder/training/rerankers/README.md",
"chars": 10163,
"preview": "# Rerankers\n\n```{eval-rst}\nReranker models are often :class:`~sentence_transformers.cross_encoder.CrossEncoder` models w"
},
{
"path": "examples/cross_encoder/training/rerankers/training_gooaq_bce.py",
"chars": 7222,
"preview": "import logging\nimport traceback\n\nimport torch\nfrom datasets import load_dataset\n\nfrom sentence_transformers import Sente"
},
{
"path": "examples/cross_encoder/training/rerankers/training_gooaq_cmnrl.py",
"chars": 3965,
"preview": "import logging\nimport traceback\n\nfrom datasets import load_dataset\n\nfrom sentence_transformers.cross_encoder import Cros"
},
{
"path": "examples/cross_encoder/training/rerankers/training_gooaq_lambda.py",
"chars": 6988,
"preview": "import logging\nimport traceback\n\nfrom datasets import load_dataset\n\nfrom sentence_transformers import SentenceTransforme"
},
{
"path": "examples/cross_encoder/training/rerankers/training_nq_bce.py",
"chars": 7249,
"preview": "import logging\nimport traceback\n\nimport torch\nfrom datasets import load_dataset\n\nfrom sentence_transformers import Sente"
},
{
"path": "examples/cross_encoder/training/sts/README.md",
"chars": 3830,
"preview": "# Semantic Textual Similarity\n\n```{eval-rst}\nSemantic Textual Similarity (STS) assigns a score on the similarity of two "
},
{
"path": "examples/cross_encoder/training/sts/training_stsbenchmark.py",
"chars": 4300,
"preview": "\"\"\"\nThis examples trains a CrossEncoder for the STSbenchmark task. A CrossEncoder takes a sentence pair\nas input and out"
},
{
"path": "examples/sentence_transformer/README.md",
"chars": 859,
"preview": "# Examples\n\nThis folder contains various examples how to use SentenceTransformers.\n\n## Applications\n\nThe [applications]("
},
{
"path": "examples/sentence_transformer/applications/README.md",
"chars": 2226,
"preview": "# Applications\n\nSentenceTransformers can be used for various use-cases. In these folders, you find several example scrip"
},
{
"path": "examples/sentence_transformer/applications/clustering/README.md",
"chars": 3634,
"preview": "# Clustering\n\nSentence-Transformers can be used in different ways to perform clustering of small or large set of sentenc"
},
{
"path": "examples/sentence_transformer/applications/clustering/agglomerative.py",
"chars": 1733,
"preview": "\"\"\"\nThis is a simple application for sentence embeddings: clustering\n\nSentences are mapped to sentence embeddings and th"
},
{
"path": "examples/sentence_transformer/applications/clustering/fast_clustering.py",
"chars": 2926,
"preview": "\"\"\"\nThis is a more complex example on performing clustering on large scale dataset.\n\nThis examples find in a large set o"
},
{
"path": "examples/sentence_transformer/applications/clustering/kmeans.py",
"chars": 1326,
"preview": "\"\"\"\nThis is a simple application for sentence embeddings: clustering\n\nSentences are mapped to sentence embeddings and th"
},
{
"path": "examples/sentence_transformer/applications/computing-embeddings/README.rst",
"chars": 9839,
"preview": "Computing Embeddings\n====================\n\nOnce you have `installed <../../../../docs/installation.html>`_ Sentence Tran"
},
{
"path": "examples/sentence_transformer/applications/computing-embeddings/computing_embeddings.py",
"chars": 1123,
"preview": "\"\"\"\nThis basic example loads a pre-trained model from the web and uses it to\ngenerate sentence embeddings for a given li"
},
{
"path": "examples/sentence_transformer/applications/computing-embeddings/computing_embeddings_multi_gpu.py",
"chars": 1085,
"preview": "\"\"\"\nThis example starts multiple processes (1 per GPU), which encode\nsentences in parallel. This gives a near linear spe"
},
{
"path": "examples/sentence_transformer/applications/computing-embeddings/computing_embeddings_streaming.py",
"chars": 2015,
"preview": "\"\"\"\nThis example starts multiple processes (1 per GPU), which encode\nsentences in parallel. This gives a near linear spe"
},
{
"path": "examples/sentence_transformer/applications/embedding-quantization/README.md",
"chars": 12435,
"preview": "# Embedding Quantization\n\nEmbeddings may be challenging to scale up, which leads to expensive solutions and high latenci"
},
{
"path": "examples/sentence_transformer/applications/embedding-quantization/semantic_search_faiss.py",
"chars": 4686,
"preview": "import time\n\nfrom datasets import load_dataset\n\nfrom sentence_transformers import SentenceTransformer\nfrom sentence_tran"
},
{
"path": "examples/sentence_transformer/applications/embedding-quantization/semantic_search_faiss_benchmark.py",
"chars": 2119,
"preview": "from datasets import load_dataset\n\nfrom sentence_transformers import SentenceTransformer\nfrom sentence_transformers.quan"
},
{
"path": "examples/sentence_transformer/applications/embedding-quantization/semantic_search_recommended.py",
"chars": 4887,
"preview": "\"\"\"\nThis script showcases a recommended approach to perform semantic search using quantized embeddings with FAISS and us"
},
{
"path": "examples/sentence_transformer/applications/embedding-quantization/semantic_search_usearch.py",
"chars": 4697,
"preview": "import time\n\nfrom datasets import load_dataset\n\nfrom sentence_transformers import SentenceTransformer\nfrom sentence_tran"
},
{
"path": "examples/sentence_transformer/applications/embedding-quantization/semantic_search_usearch_benchmark.py",
"chars": 2123,
"preview": "from datasets import load_dataset\n\nfrom sentence_transformers import SentenceTransformer\nfrom sentence_transformers.quan"
},
{
"path": "examples/sentence_transformer/applications/image-search/Image_Classification.ipynb",
"chars": 903720,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"id\": \"fantastic-grave\",\n \"metadata\": {},\n \"source\": [\n \"# Zero-"
},
{
"path": "examples/sentence_transformer/applications/image-search/Image_Clustering.ipynb",
"chars": 3002749,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"id\": \"entitled-exploration\",\n \"metadata\": {},\n \"source\": [\n \"# "
},
{
"path": "examples/sentence_transformer/applications/image-search/Image_Duplicates.ipynb",
"chars": 3922210,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"id\": \"together-subsection\",\n \"metadata\": {},\n \"source\": [\n \"# I"
},
{
"path": "examples/sentence_transformer/applications/image-search/Image_Search-multilingual.ipynb",
"chars": 2602528,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"id\": \"introductory-framework\",\n \"metadata\": {},\n \"source\": [\n \""
},
{
"path": "examples/sentence_transformer/applications/image-search/Image_Search.ipynb",
"chars": 2733932,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"id\": \"resident-liberal\",\n \"metadata\": {},\n \"source\": [\n \"# Join"
},
{
"path": "examples/sentence_transformer/applications/image-search/README.md",
"chars": 2861,
"preview": "# Image Search\n\nSentenceTransformers provides models that allow to embed images and text into the same vector space. Thi"
},
{
"path": "examples/sentence_transformer/applications/image-search/example.py",
"chars": 1105,
"preview": "from PIL import Image\n\nfrom sentence_transformers import SentenceTransformer, models, util\n\n###########\n\nimage = Image.o"
},
{
"path": "examples/sentence_transformer/applications/parallel-sentence-mining/README.md",
"chars": 2642,
"preview": "# Translated Sentence Mining\n\nBitext mining describes the process of finding parallel (translated) sentence pairs in mon"
},
{
"path": "examples/sentence_transformer/applications/parallel-sentence-mining/bitext_mining.py",
"chars": 6076,
"preview": "\"\"\"\nThis scripts show how to mine parallel (translated) sentences from two list of monolingual sentences.\n\nAs input, you"
},
{
"path": "examples/sentence_transformer/applications/parallel-sentence-mining/bitext_mining_utils.py",
"chars": 1850,
"preview": "\"\"\"\nThis file contains some utilities functions used to find parallel sentences\nin two monolingual corpora.\n\nCode in thi"
},
{
"path": "examples/sentence_transformer/applications/parallel-sentence-mining/bucc2018.py",
"chars": 7869,
"preview": "\"\"\"\nThis script tests the approach on the BUCC 2018 shared task on finding parallel sentences:\nhttps://comparable.limsi."
},
{
"path": "examples/sentence_transformer/applications/paraphrase-mining/README.md",
"chars": 2777,
"preview": "# Paraphrase Mining\n\nParaphrase mining is the task of finding paraphrases (texts with identical / similar meaning) in a "
},
{
"path": "examples/sentence_transformer/applications/retrieve_rerank/README.md",
"chars": 5628,
"preview": "# Retrieve & Re-Rank\n\nIn [Semantic Search](../semantic-search/README.md) we have shown how to use SentenceTransformer to"
},
{
"path": "examples/sentence_transformer/applications/retrieve_rerank/in_document_search_crossencoder.py",
"chars": 9249,
"preview": "\"\"\"\nThis example show how in-document search can be used with a CrossEncoder.\n\nThe document is split into passage. Here,"
},
{
"path": "examples/sentence_transformer/applications/retrieve_rerank/retrieve_rerank_simple_wikipedia.ipynb",
"chars": 131845,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {\n \"id\": \"ZyP3dXRfcXLa\"\n },\n \"source\": [\n \"# Retr"
},
{
"path": "examples/sentence_transformer/applications/semantic-search/README.md",
"chars": 14874,
"preview": "# Semantic Search\n\nSemantic search seeks to improve search accuracy by understanding the semantic meaning of the search "
},
{
"path": "examples/sentence_transformer/applications/semantic-search/semantic_search.py",
"chars": 2928,
"preview": "\"\"\"\nThis is a simple application for sentence embeddings: semantic search\n\nWe have a corpus with various sentences. Then"
},
{
"path": "examples/sentence_transformer/applications/semantic-search/semantic_search_nq_opensearch.py",
"chars": 5310,
"preview": "\"\"\"\nThis script contains an example how to perform semantic search with OpenSearch.\n\nAs dataset, we use the Natural Ques"
},
{
"path": "examples/sentence_transformer/applications/semantic-search/semantic_search_publications.py",
"chars": 8127,
"preview": "\"\"\"\nThis example demonstrates how we can perform semantic search for scientific publications.\n\nAs model, we use SPECTER "
},
{
"path": "examples/sentence_transformer/applications/semantic-search/semantic_search_quora_annoy.py",
"chars": 6164,
"preview": "\"\"\"\nThis example uses Approximate Nearest Neighbor Search (ANN) with Annoy (https://github.com/spotify/annoy).\n\nSearchin"
},
{
"path": "examples/sentence_transformer/applications/semantic-search/semantic_search_quora_elasticsearch.py",
"chars": 4713,
"preview": "\"\"\"\nThis script contains an example how to perform semantic search with Elasticsearch.\n\nAs dataset, we use the Quora Dup"
},
{
"path": "examples/sentence_transformer/applications/semantic-search/semantic_search_quora_faiss.py",
"chars": 6583,
"preview": "\"\"\"\nThis example uses Approximate Nearest Neighbor Search (ANN) with FAISS (https://github.com/facebookresearch/faiss).\n"
},
{
"path": "examples/sentence_transformer/applications/semantic-search/semantic_search_quora_hnswlib.py",
"chars": 5859,
"preview": "\"\"\"\nThis example uses Approximate Nearest Neighbor Search (ANN) with Hnswlib (https://github.com/nmslib/hnswlib/).\n\nSea"
},
{
"path": "examples/sentence_transformer/applications/semantic-search/semantic_search_quora_pytorch.py",
"chars": 3416,
"preview": "\"\"\"\nThis script contains an example how to perform semantic search with PyTorch. It performs exact nearest neighborh sea"
},
{
"path": "examples/sentence_transformer/applications/semantic-search/semantic_search_wikipedia_qa.py",
"chars": 3375,
"preview": "\"\"\"\nThis examples demonstrates the setup for Question-Answer-Retrieval.\n\nYou can input a query or a question. The script"
},
{
"path": "examples/sentence_transformer/applications/text-summarization/LexRank.py",
"chars": 3126,
"preview": "\"\"\"\nLexRank implementation\nSource: https://github.com/crabcamp/lexrank/tree/dev\n\"\"\"\n\nimport logging\n\nimport numpy as np\n"
},
{
"path": "examples/sentence_transformer/applications/text-summarization/README.md",
"chars": 538,
"preview": "# Text Summarization\n\nSentenceTransformers can be used for (extractive) text summarization: The document is broken down "
},
{
"path": "examples/sentence_transformer/applications/text-summarization/text-summarization.py",
"chars": 7141,
"preview": "\"\"\"\nThis example uses LexRank (https://jair.org/index.php/jair/article/view/10396/24901)\nto create an extractive summari"
},
{
"path": "examples/sentence_transformer/domain_adaptation/README.md",
"chars": 9009,
"preview": "# Domain Adaptation\n\nThe goal of **Domain Adaptation** is to adapt text embedding models to your specific text domain wi"
},
{
"path": "examples/sentence_transformer/evaluation/evaluation_inference_speed.py",
"chars": 1085,
"preview": "\"\"\"\nThis examples measures the inference speed of a certain model\n\nUsage:\npython evaluation_inference_speed.py\nOR\npython"
},
{
"path": "examples/sentence_transformer/evaluation/evaluation_no_dup_batch_sampler_speed.py",
"chars": 17700,
"preview": "from __future__ import annotations\n\n\"\"\"Benchmark NoDuplicates batch samplers on Hugging Face datasets.\n\nQuick run:\n p"
},
{
"path": "examples/sentence_transformer/evaluation/evaluation_stsbenchmark.py",
"chars": 1735,
"preview": "\"\"\"\nThis examples loads a pre-trained model and evaluates it on the STSbenchmark dataset\n\nUsage:\npython evaluation_stsbe"
},
{
"path": "examples/sentence_transformer/evaluation/evaluation_translation_matching.py",
"chars": 2426,
"preview": "\"\"\"\nGiven a dataset with parallel sentences, one \"english\" column and one \"non_english\" column, this script evaluates a "
},
{
"path": "examples/sentence_transformer/training/README.md",
"chars": 3026,
"preview": "# Training\n\nThis folder contains various examples to fine-tune `SentenceTransformers` for specific tasks.\n\nFor the begin"
},
{
"path": "examples/sentence_transformer/training/adaptive_layer/README.md",
"chars": 10668,
"preview": "# Adaptive Layers\n\nEmbedding models are often encoder models with numerous layers, such as 12 (e.g. [all-mpnet-base-v2]("
},
{
"path": "examples/sentence_transformer/training/adaptive_layer/adaptive_layer_nli.py",
"chars": 5016,
"preview": "\"\"\"\nThe system trains BERT (or any other transformer model like RoBERTa, DistilBERT etc.) on the SNLI + MultiNLI (AllNLI"
},
{
"path": "examples/sentence_transformer/training/adaptive_layer/adaptive_layer_sts.py",
"chars": 4651,
"preview": "\"\"\"\nThis examples trains BERT (or any other transformer model like RoBERTa, DistilBERT etc.) for the STSbenchmark from s"
},
{
"path": "examples/sentence_transformer/training/avg_word_embeddings/training_stsbenchmark_avg_word_embeddings.py",
"chars": 4887,
"preview": "\"\"\"\nThis example uses average word embeddings (for example from GloVe). It adds two fully-connected feed-forward layers "
},
{
"path": "examples/sentence_transformer/training/avg_word_embeddings/training_stsbenchmark_bilstm.py",
"chars": 4185,
"preview": "\"\"\"\nThis example runs a BiLSTM after the word embedding lookup. The output of the BiLSTM is than pooled,\nfor example wit"
},
{
"path": "examples/sentence_transformer/training/avg_word_embeddings/training_stsbenchmark_bow.py",
"chars": 5651,
"preview": "\"\"\"\nThis example uses a simple bag-of-words (BoW) approach. A sentence is mapped\nto a sparse vector with e.g. 25,000 dim"
},
{
"path": "examples/sentence_transformer/training/avg_word_embeddings/training_stsbenchmark_cnn.py",
"chars": 4511,
"preview": "\"\"\"\nThis example runs a CNN after the word embedding lookup. The output of the CNN is than pooled,\nfor example with mean"
},
{
"path": "examples/sentence_transformer/training/avg_word_embeddings/training_stsbenchmark_tf-idf_word_embeddings.py",
"chars": 6117,
"preview": "\"\"\"\nThis example weights word embeddings (like GloVe) with IDF weights. The IDF weights can for example be computed on W"
},
{
"path": "examples/sentence_transformer/training/clip/train_clip.ipynb",
"chars": 301332,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Simple image training using CLIP "
},
{
"path": "examples/sentence_transformer/training/clip/training_clip_flickr8k_mlflow.py",
"chars": 3849,
"preview": "import logging\nfrom datetime import datetime\n\nfrom datasets import DatasetDict, load_dataset\n\nfrom sentence_transformers"
},
{
"path": "examples/sentence_transformer/training/data_augmentation/README.md",
"chars": 7177,
"preview": "# Augmented SBERT\n\n## Motivation\n\nBi-encoders (a.k.a. sentence embeddings models) require substantial training data and "
},
{
"path": "examples/sentence_transformer/training/data_augmentation/train_sts_indomain_bm25.py",
"chars": 10499,
"preview": "\"\"\"\nThe script shows how to train Augmented SBERT (In-Domain) strategy for STSb dataset with BM25 sampling.\nWe utilise e"
},
{
"path": "examples/sentence_transformer/training/data_augmentation/train_sts_indomain_nlpaug.py",
"chars": 7204,
"preview": "\"\"\"\nThe script shows how to train Augmented SBERT (In-Domain) strategy for STSb dataset with nlp textual augmentation.\nW"
},
{
"path": "examples/sentence_transformer/training/data_augmentation/train_sts_indomain_semantic.py",
"chars": 9032,
"preview": "\"\"\"\nThe script shows how to train Augmented SBERT (In-Domain) strategy for STSb dataset with Semantic Search Sampling.\n\n"
},
{
"path": "examples/sentence_transformer/training/data_augmentation/train_sts_qqp_crossdomain.py",
"chars": 8908,
"preview": "\"\"\"\nThe script shows how to train Augmented SBERT (Domain-Transfer/Cross-Domain) strategy for STSb-QQP dataset.\nFor our "
},
{
"path": "examples/sentence_transformer/training/data_augmentation/train_sts_seed_optimization.py",
"chars": 6521,
"preview": "\"\"\"\nThis script is identical to examples/sentence_transformer/training/sts/training_stsbenchmark.py with seed optimizati"
},
{
"path": "examples/sentence_transformer/training/distillation/README.md",
"chars": 4224,
"preview": "# Model Distillation\n\nThis page contains an example to make SentenceTransformer models **faster, cheaper and lighter**. "
},
{
"path": "examples/sentence_transformer/training/distillation/dimensionality_reduction.py",
"chars": 3597,
"preview": "\"\"\"\nThe pre-trained models produce embeddings of size 512 - 1024. However, when storing a large\nnumber of embeddings, th"
},
{
"path": "examples/sentence_transformer/training/distillation/model_distillation.py",
"chars": 9882,
"preview": "\"\"\"\nThis file contains an example how to make a SentenceTransformer model faster and lighter.\n\nThis is achieved by using"
},
{
"path": "examples/sentence_transformer/training/distillation/model_distillation_layer_reduction.py",
"chars": 9475,
"preview": "\"\"\"\nThis file contains an example how to make a SentenceTransformer model faster and lighter.\n\nThis is achieved by using"
},
{
"path": "examples/sentence_transformer/training/distillation/model_quantization.py",
"chars": 5383,
"preview": "\"\"\"\nA quantized model executes some or all of the operations with integers rather than floating point values. This allow"
},
{
"path": "examples/sentence_transformer/training/hpo/README.rst",
"chars": 15721,
"preview": "\nHyperparameter Optimization\n===========================\n\nThe :class:`~sentence_transformers.trainer.SentenceTransformer"
},
{
"path": "examples/sentence_transformer/training/hpo/hpo_nli.py",
"chars": 3657,
"preview": "from datasets import load_dataset\n\nfrom sentence_transformers import (\n SentenceTransformer,\n SentenceTransformerT"
},
{
"path": "examples/sentence_transformer/training/matryoshka/2d_matryoshka_nli.py",
"chars": 5103,
"preview": "\"\"\"\nThe system trains BERT (or any other transformer model like RoBERTa, DistilBERT etc.) on the SNLI + MultiNLI (AllNLI"
},
{
"path": "examples/sentence_transformer/training/matryoshka/2d_matryoshka_sts.py",
"chars": 4682,
"preview": "\"\"\"\nThis examples trains BERT (or any other transformer model like RoBERTa, DistilBERT etc.) for the STSbenchmark from s"
},
{
"path": "examples/sentence_transformer/training/matryoshka/README.md",
"chars": 8784,
"preview": "# Matryoshka Embeddings\n\nDense embedding models typically produce embeddings with a fixed size, such as 768 or 1024. All"
},
{
"path": "examples/sentence_transformer/training/matryoshka/matryoshka_eval_stsb.py",
"chars": 6027,
"preview": "\"\"\"\nThis script evaluates embedding models truncated at different dimensions on the STS\nbenchmark.\n\"\"\"\n\nimport argparse\n"
},
{
"path": "examples/sentence_transformer/training/matryoshka/matryoshka_nli.py",
"chars": 5632,
"preview": "\"\"\"\nThe system trains BERT (or any other transformer model like RoBERTa, DistilBERT etc.) on the SNLI + MultiNLI (AllNLI"
},
{
"path": "examples/sentence_transformer/training/matryoshka/matryoshka_nli_reduced_dim.py",
"chars": 6282,
"preview": "\"\"\"\nThe system trains BERT (or any other transformer model like RoBERTa, DistilBERT etc.) on the SNLI + MultiNLI (AllNLI"
},
{
"path": "examples/sentence_transformer/training/matryoshka/matryoshka_sts.py",
"chars": 5252,
"preview": "\"\"\"\nThis examples trains BERT (or any other transformer model like RoBERTa, DistilBERT etc.) for the STSbenchmark from s"
},
{
"path": "examples/sentence_transformer/training/ms_marco/README.md",
"chars": 8476,
"preview": "# MS MARCO\n\n[MS MARCO Passage Ranking](https://github.com/microsoft/MSMARCO-Passage-Ranking) is a large dataset to train"
},
{
"path": "examples/sentence_transformer/training/ms_marco/eval_msmarco.py",
"chars": 3363,
"preview": "\"\"\"\nThis script runs the evaluation of an SBERT msmarco model on the\nMS MARCO dev dataset and reports different performa"
},
{
"path": "examples/sentence_transformer/training/ms_marco/multilingual/README.md",
"chars": 1151,
"preview": "# MS MARCO - Multilingual Training\n\nThis folder demonstrates how to train a multi-lingual SBERT model for [semantic sear"
},
{
"path": "examples/sentence_transformer/training/ms_marco/multilingual/translate_queries.py",
"chars": 3274,
"preview": "\"\"\"\nThis script translates the queries in the MS MARCO dataset to the defined target languages.\n\nFor machine translation"
},
{
"path": "examples/sentence_transformer/training/ms_marco/train-kldiv.py",
"chars": 3719,
"preview": "import logging\nimport random\n\nimport numpy\nimport torch\nfrom datasets import load_dataset\n\nfrom sentence_transformers im"
},
{
"path": "examples/sentence_transformer/training/ms_marco/train-margin-mse.py",
"chars": 3718,
"preview": "import logging\nimport random\n\nimport numpy\nimport torch\nfrom datasets import load_dataset\n\nfrom sentence_transformers im"
},
{
"path": "examples/sentence_transformer/training/ms_marco/train_bi-encoder_margin-mse.py",
"chars": 9768,
"preview": "import argparse\nimport gzip\nimport json\nimport logging\nimport os\nimport pickle\nimport random\nimport sys\nimport tarfile\nf"
},
{
"path": "examples/sentence_transformer/training/ms_marco/train_bi-encoder_mnrl.py",
"chars": 10337,
"preview": "\"\"\"\nThis examples show how to train a Bi-Encoder for the MS Marco dataset (https://github.com/microsoft/MSMARCO-Passage-"
},
{
"path": "examples/sentence_transformer/training/multilingual/README.md",
"chars": 10513,
"preview": "# Multilingual Models\n\nThe issue with multilingual BERT (mBERT) as well as with XLM-RoBERTa is that those produce rather"
}
]
// ... and 337 more files (download for full content)
About this extraction
This page contains the full source code of the huggingface/sentence-transformers GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 537 files (16.9 MB), approximately 4.0M tokens, and a symbol index with 253 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.