Copy disabled (too large)
Download .txt
Showing preview only (27,223K chars total). Download the full file to get everything.
Repository: FlagOpen/FlagEmbedding
Branch: master
Commit: dbc600560b2d
Files: 1068
Total size: 25.8 MB
Directory structure:
gitextract_fx4cm0n9/
├── .github/
│ └── workflows/
│ └── documentation.yml
├── .gitignore
├── FlagEmbedding/
│ ├── __init__.py
│ ├── abc/
│ │ ├── __init__.py
│ │ ├── evaluation/
│ │ │ ├── __init__.py
│ │ │ ├── arguments.py
│ │ │ ├── data_loader.py
│ │ │ ├── evaluator.py
│ │ │ ├── runner.py
│ │ │ ├── searcher.py
│ │ │ └── utils.py
│ │ ├── finetune/
│ │ │ ├── __init__.py
│ │ │ ├── embedder/
│ │ │ │ ├── AbsArguments.py
│ │ │ │ ├── AbsDataset.py
│ │ │ │ ├── AbsModeling.py
│ │ │ │ ├── AbsRunner.py
│ │ │ │ ├── AbsTrainer.py
│ │ │ │ └── __init__.py
│ │ │ └── reranker/
│ │ │ ├── AbsArguments.py
│ │ │ ├── AbsDataset.py
│ │ │ ├── AbsModeling.py
│ │ │ ├── AbsRunner.py
│ │ │ ├── AbsTrainer.py
│ │ │ └── __init__.py
│ │ └── inference/
│ │ ├── AbsEmbedder.py
│ │ ├── AbsReranker.py
│ │ └── __init__.py
│ ├── evaluation/
│ │ ├── __init__.py
│ │ ├── air_bench/
│ │ │ ├── __init__.py
│ │ │ ├── __main__.py
│ │ │ ├── arguments.py
│ │ │ ├── examples/
│ │ │ │ ├── long-doc/
│ │ │ │ │ ├── arxiv-gemini.jsonl
│ │ │ │ │ ├── arxiv-gpt3.jsonl
│ │ │ │ │ ├── arxiv-llama2.jsonl
│ │ │ │ │ ├── arxiv-llm-survey.jsonl
│ │ │ │ │ ├── book-a-brief-history-of-time_stephen-hawking.jsonl
│ │ │ │ │ ├── book-origin-of-species_darwin.jsonl
│ │ │ │ │ ├── healthcare-pubmed_100k-200k_1.jsonl
│ │ │ │ │ ├── healthcare-pubmed_100k-200k_2.jsonl
│ │ │ │ │ ├── healthcare-pubmed_100k-200k_3.jsonl
│ │ │ │ │ ├── healthcare-pubmed_30k-40k_10-merged.jsonl
│ │ │ │ │ ├── healthcare-pubmed_40k-50k_5-merged.jsonl
│ │ │ │ │ ├── law-lex_files_300k-400k.jsonl
│ │ │ │ │ ├── law-lex_files_400k-500k.jsonl
│ │ │ │ │ ├── law-lex_files_500k-600k.jsonl
│ │ │ │ │ └── law-lex_files_600k-700k.jsonl
│ │ │ │ └── qa/
│ │ │ │ ├── arxiv.jsonl
│ │ │ │ ├── finance.jsonl
│ │ │ │ ├── healthcare.jsonl
│ │ │ │ ├── law.jsonl
│ │ │ │ ├── msmarco.jsonl
│ │ │ │ ├── news.jsonl
│ │ │ │ ├── web.jsonl
│ │ │ │ └── wiki.jsonl
│ │ │ └── runner.py
│ │ ├── beir/
│ │ │ ├── __init__.py
│ │ │ ├── __main__.py
│ │ │ ├── arguments.py
│ │ │ ├── data_loader.py
│ │ │ ├── evaluator.py
│ │ │ ├── prompts.py
│ │ │ └── runner.py
│ │ ├── bright/
│ │ │ ├── __init__.py
│ │ │ ├── __main__.py
│ │ │ ├── arguments.py
│ │ │ ├── data_loader.py
│ │ │ ├── prompts.py
│ │ │ ├── runner.py
│ │ │ └── searcher.py
│ │ ├── custom/
│ │ │ ├── __init__.py
│ │ │ ├── __main__.py
│ │ │ ├── data_loader.py
│ │ │ └── runner.py
│ │ ├── miracl/
│ │ │ ├── __init__.py
│ │ │ ├── __main__.py
│ │ │ ├── data_loader.py
│ │ │ └── runner.py
│ │ ├── mkqa/
│ │ │ ├── __init__.py
│ │ │ ├── __main__.py
│ │ │ ├── data_loader.py
│ │ │ ├── evaluator.py
│ │ │ ├── runner.py
│ │ │ └── utils/
│ │ │ ├── compute_metrics.py
│ │ │ └── normalize_text.py
│ │ ├── mldr/
│ │ │ ├── __init__.py
│ │ │ ├── __main__.py
│ │ │ ├── data_loader.py
│ │ │ └── runner.py
│ │ ├── msmarco/
│ │ │ ├── __init__.py
│ │ │ ├── __main__.py
│ │ │ ├── data_loader.py
│ │ │ └── runner.py
│ │ └── mteb/
│ │ ├── __init__.py
│ │ ├── __main__.py
│ │ ├── arguments.py
│ │ ├── examples/
│ │ │ ├── AmazonCounterfactualClassification.csv
│ │ │ ├── AmazonPolarityClassification.csv
│ │ │ ├── AmazonReviewsClassification.csv
│ │ │ ├── ArguAna.csv
│ │ │ ├── ArxivClusteringP2P.csv
│ │ │ ├── ArxivClusteringS2S.csv
│ │ │ ├── AskUbuntuDupQuestions.csv
│ │ │ ├── BIOSSES.csv
│ │ │ ├── Banking77Classification.csv
│ │ │ ├── BiorxivClusteringP2P.csv
│ │ │ ├── BiorxivClusteringS2S.csv
│ │ │ ├── CQADupstack.csv
│ │ │ ├── CQADupstackRetrieval.csv
│ │ │ ├── ClimateFEVER.csv
│ │ │ ├── DBPedia.csv
│ │ │ ├── EmotionClassification.csv
│ │ │ ├── FEVER.csv
│ │ │ ├── FiQA2018.csv
│ │ │ ├── HotpotQA.csv
│ │ │ ├── ImdbClassification.csv
│ │ │ ├── MSMARCO.csv
│ │ │ ├── MTOPDomainClassification.csv
│ │ │ ├── MTOPIntentClassification.csv
│ │ │ ├── MassiveIntentClassification.csv
│ │ │ ├── MassiveScenarioClassification.csv
│ │ │ ├── MedrxivClusteringP2P.csv
│ │ │ ├── MedrxivClusteringS2S.csv
│ │ │ ├── MindSmallReranking.csv
│ │ │ ├── NFCorpus.csv
│ │ │ ├── NQ.csv
│ │ │ ├── QuoraRetrieval.csv
│ │ │ ├── RedditClustering.csv
│ │ │ ├── RedditClusteringP2P.csv
│ │ │ ├── SCIDOCS.csv
│ │ │ ├── SICK-R.csv
│ │ │ ├── STS12.csv
│ │ │ ├── STS13.csv
│ │ │ ├── STS14.csv
│ │ │ ├── STS15.csv
│ │ │ ├── STS16.csv
│ │ │ ├── STS17.csv
│ │ │ ├── STS22.csv
│ │ │ ├── STSBenchmark.csv
│ │ │ ├── SciDocsRR.csv
│ │ │ ├── SciFact.csv
│ │ │ ├── SprintDuplicateQuestions.csv
│ │ │ ├── StackExchangeClustering.csv
│ │ │ ├── StackExchangeClusteringP2P.csv
│ │ │ ├── StackOverflowDupQuestions.csv
│ │ │ ├── SummEval.csv
│ │ │ ├── TRECCOVID.csv
│ │ │ ├── Touche2020.csv
│ │ │ ├── ToxicConversationsClassification.csv
│ │ │ ├── TweetSentimentExtractionClassification.csv
│ │ │ ├── TwentyNewsgroupsClustering.csv
│ │ │ ├── TwitterSemEval2015.csv
│ │ │ └── TwitterURLCorpus.csv
│ │ ├── prompts.py
│ │ ├── runner.py
│ │ └── searcher.py
│ ├── finetune/
│ │ ├── __init__.py
│ │ ├── embedder/
│ │ │ ├── __init__.py
│ │ │ ├── decoder_only/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── base/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── __main__.py
│ │ │ │ │ ├── arguments.py
│ │ │ │ │ ├── load_model.py
│ │ │ │ │ ├── modeling.py
│ │ │ │ │ ├── runner.py
│ │ │ │ │ └── trainer.py
│ │ │ │ └── icl/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── __main__.py
│ │ │ │ ├── arguments.py
│ │ │ │ ├── dataset.py
│ │ │ │ ├── load_model.py
│ │ │ │ ├── modeling.py
│ │ │ │ ├── runner.py
│ │ │ │ └── trainer.py
│ │ │ └── encoder_only/
│ │ │ ├── __init__.py
│ │ │ ├── base/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── __main__.py
│ │ │ │ ├── modeling.py
│ │ │ │ ├── runner.py
│ │ │ │ └── trainer.py
│ │ │ └── m3/
│ │ │ ├── __init__.py
│ │ │ ├── __main__.py
│ │ │ ├── arguments.py
│ │ │ ├── modeling.py
│ │ │ ├── runner.py
│ │ │ └── trainer.py
│ │ └── reranker/
│ │ ├── __init__.py
│ │ ├── decoder_only/
│ │ │ ├── __init__.py
│ │ │ ├── base/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── __main__.py
│ │ │ │ ├── arguments.py
│ │ │ │ ├── load_model.py
│ │ │ │ ├── modeling.py
│ │ │ │ ├── runner.py
│ │ │ │ └── trainer.py
│ │ │ └── layerwise/
│ │ │ ├── __init__.py
│ │ │ ├── __main__.py
│ │ │ ├── arguments.py
│ │ │ ├── configuration_minicpm_reranker.py
│ │ │ ├── load_model.py
│ │ │ ├── modeling.py
│ │ │ ├── modeling_minicpm_reranker.py
│ │ │ ├── runner.py
│ │ │ └── trainer.py
│ │ └── encoder_only/
│ │ ├── __init__.py
│ │ └── base/
│ │ ├── __init__.py
│ │ ├── __main__.py
│ │ ├── modeling.py
│ │ ├── runner.py
│ │ └── trainer.py
│ ├── inference/
│ │ ├── __init__.py
│ │ ├── auto_embedder.py
│ │ ├── auto_reranker.py
│ │ ├── embedder/
│ │ │ ├── __init__.py
│ │ │ ├── decoder_only/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── base.py
│ │ │ │ └── icl.py
│ │ │ ├── encoder_only/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── base.py
│ │ │ │ └── m3.py
│ │ │ └── model_mapping.py
│ │ └── reranker/
│ │ ├── __init__.py
│ │ ├── decoder_only/
│ │ │ ├── __init__.py
│ │ │ ├── base.py
│ │ │ ├── layerwise.py
│ │ │ ├── lightweight.py
│ │ │ └── models/
│ │ │ ├── __init__.py
│ │ │ ├── configuration_minicpm_reranker.py
│ │ │ ├── gemma_config.py
│ │ │ ├── gemma_model.py
│ │ │ └── modeling_minicpm_reranker.py
│ │ ├── encoder_only/
│ │ │ ├── __init__.py
│ │ │ └── base.py
│ │ └── model_mapping.py
│ └── utils/
│ ├── __init__.py
│ └── transformers_compat.py
├── LICENSE
├── Manifest.in
├── README.md
├── README_zh.md
├── Tutorials/
│ ├── 1_Embedding/
│ │ ├── 1.1_Intro&Inference.ipynb
│ │ ├── 1.2.1_BGE_Series.ipynb
│ │ ├── 1.2.2_Auto_Embedder.ipynb
│ │ ├── 1.2.3_BGE_v1&1.5.ipynb
│ │ ├── 1.2.4_BGE-M3.ipynb
│ │ ├── 1.2.5_BGE_EN_ICL.ipynb
│ │ ├── 1.2.6_BGE_VL.ipynb
│ │ └── 1.2.7_BGE_Code_v1.ipynb
│ ├── 2_Metrics/
│ │ ├── 2.1_Similarity_Metrics.ipynb
│ │ └── 2.2_Eval_Metrics.ipynb
│ ├── 3_Indexing/
│ │ ├── 3.1.1_Intro_to_Faiss.ipynb
│ │ ├── 3.1.2_Faiss_GPU.ipynb
│ │ ├── 3.1.3_Faiss_Indexes.ipynb
│ │ ├── 3.1.4_Faiss_Quantizers.ipynb
│ │ └── 3.1.5_Faiss_Index_Choosing.ipynb
│ ├── 4_Evaluation/
│ │ ├── 4.1.1_Evaluation_MSMARCO.ipynb
│ │ ├── 4.2.1_MTEB_Intro.ipynb
│ │ ├── 4.2.2_MTEB_Leaderboard.ipynb
│ │ ├── 4.2.3_C-MTEB.ipynb
│ │ ├── 4.3.1_Sentence_Transformers_Eval.ipynb
│ │ ├── 4.4.1_BEIR.ipynb
│ │ ├── 4.5.1_MIRACL.ipynb
│ │ ├── 4.5.2_MLDR.ipynb
│ │ └── utils/
│ │ ├── compute_metrics.py
│ │ └── normalize_text.py
│ ├── 5_Reranking/
│ │ ├── 5.1_Intro.ipynb
│ │ ├── 5.2_BGE_Reranker.ipynb
│ │ └── 5.3_Reranker_Eval.ipynb
│ ├── 6_RAG/
│ │ ├── 6.1_RAG_From_Scratch.ipynb
│ │ ├── 6.2_RAG_LangChain.ipynb
│ │ └── 6.3_RAG_LlamaIndex.ipynb
│ ├── 7_Fine-tuning/
│ │ ├── 7.1.1_Data_preparation.ipynb
│ │ ├── 7.1.2_Fine-tune.ipynb
│ │ ├── 7.1.3_Eval_FT_Model.ipynb
│ │ ├── 7.2.1_Hard_Negative_Mining.ipynb
│ │ └── config/
│ │ ├── ds_stage0.json
│ │ └── ds_stage1.json
│ ├── README.md
│ └── quick_start.ipynb
├── dataset/
│ └── README.md
├── docs/
│ ├── Makefile
│ ├── README.md
│ ├── make.bat
│ ├── requirements.txt
│ └── source/
│ ├── API/
│ │ ├── abc/
│ │ │ ├── evaluation/
│ │ │ │ ├── arguments.rst
│ │ │ │ ├── data_loader.rst
│ │ │ │ ├── evaluator.rst
│ │ │ │ ├── runner.rst
│ │ │ │ └── searcher.rst
│ │ │ ├── evaluation.rst
│ │ │ ├── finetune/
│ │ │ │ ├── embedder/
│ │ │ │ │ ├── AbsArguments.rst
│ │ │ │ │ ├── AbsDataset.rst
│ │ │ │ │ ├── AbsModeling.rst
│ │ │ │ │ ├── AbsRunner.rst
│ │ │ │ │ └── AbsTrainer.rst
│ │ │ │ ├── embedder.rst
│ │ │ │ ├── reranker/
│ │ │ │ │ ├── AbsArguments.rst
│ │ │ │ │ ├── AbsDataset.rst
│ │ │ │ │ ├── AbsModeling.rst
│ │ │ │ │ ├── AbsRunner.rst
│ │ │ │ │ └── AbsTrainer.rst
│ │ │ │ └── reranker.rst
│ │ │ ├── finetune.rst
│ │ │ ├── inference/
│ │ │ │ ├── AbsEmbedder.rst
│ │ │ │ └── AbsReranker.rst
│ │ │ └── inference.rst
│ │ ├── abc.rst
│ │ ├── evaluation/
│ │ │ ├── airbench/
│ │ │ │ ├── arguments.rst
│ │ │ │ └── runner.rst
│ │ │ ├── airbench.rst
│ │ │ ├── beir/
│ │ │ │ ├── arguments.rst
│ │ │ │ ├── data_loader.rst
│ │ │ │ ├── evaluator.rst
│ │ │ │ └── runner.rst
│ │ │ ├── beir.rst
│ │ │ ├── miracl/
│ │ │ │ ├── data_loader.rst
│ │ │ │ └── runner.rst
│ │ │ ├── miracl.rst
│ │ │ ├── mkqa/
│ │ │ │ ├── data_loader.rst
│ │ │ │ ├── evaluator.rst
│ │ │ │ └── runner.rst
│ │ │ ├── mkqa.rst
│ │ │ ├── mldr/
│ │ │ │ ├── data_loader.rst
│ │ │ │ └── runner.rst
│ │ │ ├── mldr.rst
│ │ │ ├── msmarco/
│ │ │ │ ├── data_loader.rst
│ │ │ │ └── runner.rst
│ │ │ ├── msmarco.rst
│ │ │ ├── mteb/
│ │ │ │ ├── arguments.rst
│ │ │ │ ├── runner.rst
│ │ │ │ └── searcher.rst
│ │ │ └── mteb.rst
│ │ ├── evaluation.rst
│ │ ├── finetune/
│ │ │ ├── embedder/
│ │ │ │ ├── decoder_only/
│ │ │ │ │ ├── base/
│ │ │ │ │ │ ├── arguments.rst
│ │ │ │ │ │ ├── modeling.rst
│ │ │ │ │ │ ├── runner.rst
│ │ │ │ │ │ └── trainer.rst
│ │ │ │ │ ├── base.rst
│ │ │ │ │ ├── icl/
│ │ │ │ │ │ ├── arguments.rst
│ │ │ │ │ │ ├── dataset.rst
│ │ │ │ │ │ ├── modeling.rst
│ │ │ │ │ │ ├── runner.rst
│ │ │ │ │ │ └── trainer.rst
│ │ │ │ │ └── icl.rst
│ │ │ │ ├── decoder_only.rst
│ │ │ │ ├── encoder_only/
│ │ │ │ │ ├── base/
│ │ │ │ │ │ ├── modeling.rst
│ │ │ │ │ │ ├── runner.rst
│ │ │ │ │ │ └── trainer.rst
│ │ │ │ │ ├── base.rst
│ │ │ │ │ ├── m3/
│ │ │ │ │ │ ├── arguments.rst
│ │ │ │ │ │ ├── modeling.rst
│ │ │ │ │ │ ├── runner.rst
│ │ │ │ │ │ └── trainer.rst
│ │ │ │ │ └── m3.rst
│ │ │ │ └── encoder_only.rst
│ │ │ ├── embedder.rst
│ │ │ ├── reranker/
│ │ │ │ ├── decoder_only/
│ │ │ │ │ ├── base/
│ │ │ │ │ │ ├── arguments.rst
│ │ │ │ │ │ ├── modeling.rst
│ │ │ │ │ │ ├── runner.rst
│ │ │ │ │ │ └── trainer.rst
│ │ │ │ │ ├── base.rst
│ │ │ │ │ ├── layerwise/
│ │ │ │ │ │ ├── arguments.rst
│ │ │ │ │ │ ├── modeling.rst
│ │ │ │ │ │ ├── runner.rst
│ │ │ │ │ │ └── trainer.rst
│ │ │ │ │ └── layerwise.rst
│ │ │ │ ├── decoder_only.rst
│ │ │ │ ├── encoder_only/
│ │ │ │ │ ├── base/
│ │ │ │ │ │ ├── modeling.rst
│ │ │ │ │ │ ├── runner.rst
│ │ │ │ │ │ └── trainer.rst
│ │ │ │ │ └── base.rst
│ │ │ │ └── encoder_only.rst
│ │ │ └── reranker.rst
│ │ ├── finetune.rst
│ │ ├── index.rst
│ │ ├── inference/
│ │ │ ├── FlagAutoModel.rst
│ │ │ ├── FlagAutoReranker.rst
│ │ │ ├── embedder/
│ │ │ │ ├── decoder_only/
│ │ │ │ │ ├── BaseLLMEmbedder.rst
│ │ │ │ │ └── ICLLLMEmbedder.rst
│ │ │ │ ├── embedder.rst
│ │ │ │ └── encoder_only/
│ │ │ │ ├── BaseEmbedder.rst
│ │ │ │ └── M3Embedder.rst
│ │ │ └── reranker/
│ │ │ ├── decoder_only/
│ │ │ │ ├── BaseLLMReranker.rst
│ │ │ │ ├── LayerWiseLLMReranker.rst
│ │ │ │ └── LightweightLLMReranker.rst
│ │ │ ├── encoder_only/
│ │ │ │ └── BaseReranker.rst
│ │ │ └── reranker.rst
│ │ └── inference.rst
│ ├── C-MTEB.rst
│ ├── FAQ/
│ │ └── index.rst
│ ├── Introduction/
│ │ ├── IR.rst
│ │ ├── embedder.rst
│ │ ├── index.rst
│ │ ├── installation.rst
│ │ ├── overview.rst
│ │ ├── quick_start.rst
│ │ ├── reranker.rst
│ │ ├── retrieval_demo.ipynb
│ │ └── similarity.rst
│ ├── _static/
│ │ └── css/
│ │ └── custom.css
│ ├── bge/
│ │ ├── bge_code.rst
│ │ ├── bge_icl.rst
│ │ ├── bge_m3.rst
│ │ ├── bge_reranker.rst
│ │ ├── bge_reranker_v2.rst
│ │ ├── bge_v1_v1.5.rst
│ │ ├── bge_vl.rst
│ │ └── index.rst
│ ├── community/
│ │ └── index.rst
│ ├── conf.py
│ ├── index.rst
│ └── tutorial/
│ ├── 1_Embedding/
│ │ ├── 1.1.1.ipynb
│ │ ├── 1.2.1.ipynb
│ │ ├── 1.2.2.ipynb
│ │ ├── 1.2.3.ipynb
│ │ ├── 1.2.4.ipynb
│ │ └── 1.2.5.ipynb
│ ├── 1_Embedding.rst
│ ├── 2_Metrics/
│ │ ├── 2.1.ipynb
│ │ └── 2.2.ipynb
│ ├── 2_Metrics.rst
│ ├── 3_Indexing/
│ │ ├── 3.1.1.ipynb
│ │ ├── 3.1.2.ipynb
│ │ ├── 3.1.3.ipynb
│ │ ├── 3.1.4.ipynb
│ │ └── 3.1.5.ipynb
│ ├── 3_Indexing.rst
│ ├── 4_Evaluation/
│ │ ├── 4.1.1.ipynb
│ │ ├── 4.2.1.ipynb
│ │ ├── 4.2.2.ipynb
│ │ ├── 4.2.3.ipynb
│ │ ├── 4.3.1.ipynb
│ │ ├── 4.4.1.ipynb
│ │ ├── 4.5.1.ipynb
│ │ └── 4.5.2.ipynb
│ ├── 4_Evaluation.rst
│ ├── 5_Reranking/
│ │ ├── 5.1.ipynb
│ │ ├── 5.2.ipynb
│ │ └── 5.3.ipynb
│ ├── 5_Reranking.rst
│ ├── 6_RAG/
│ │ ├── 6.1.ipynb
│ │ ├── 6.2.ipynb
│ │ └── 6.3.ipynb
│ ├── 6_RAG.rst
│ ├── 7_Finetuning/
│ │ ├── 7.1.1.ipynb
│ │ ├── 7.1.2.ipynb
│ │ ├── 7.1.3.ipynb
│ │ └── 7.2.1.ipynb
│ ├── 7_Finetuning.rst
│ └── index.rst
├── examples/
│ ├── README.md
│ ├── evaluation/
│ │ ├── README.md
│ │ ├── air_bench/
│ │ │ └── eval_air_bench.sh
│ │ ├── beir/
│ │ │ └── eval_beir.sh
│ │ ├── bright/
│ │ │ └── eval_bright_short.sh
│ │ ├── miracl/
│ │ │ └── eval_miracl.sh
│ │ ├── mkqa/
│ │ │ └── eval_mkqa.sh
│ │ ├── mldr/
│ │ │ └── eval_mldr.sh
│ │ ├── msmarco/
│ │ │ └── eval_msmarco.sh
│ │ └── mteb/
│ │ └── eval_mteb.sh
│ ├── finetune/
│ │ ├── ds_stage0.json
│ │ ├── ds_stage1.json
│ │ ├── embedder/
│ │ │ ├── README.md
│ │ │ ├── decoder_only/
│ │ │ │ ├── base.sh
│ │ │ │ ├── base_same_dataset.sh
│ │ │ │ └── icl_same_dataset.sh
│ │ │ ├── encoder_only/
│ │ │ │ ├── base.sh
│ │ │ │ ├── base_same_dataset.sh
│ │ │ │ ├── m3.sh
│ │ │ │ └── m3_same_dataset.sh
│ │ │ └── example_data/
│ │ │ ├── classification-no_in_batch_neg/
│ │ │ │ ├── AmazonClassification.jsonl
│ │ │ │ └── Banking77Classification.jsonl
│ │ │ ├── clustering-no_in_batch_neg/
│ │ │ │ ├── arXiv_title.jsonl
│ │ │ │ └── bioRXiv_title.jsonl
│ │ │ ├── retrieval/
│ │ │ │ ├── msmarco.jsonl
│ │ │ │ ├── nli.jsonl
│ │ │ │ └── nq.jsonl
│ │ │ └── sts/
│ │ │ └── sts.jsonl
│ │ └── reranker/
│ │ ├── README.md
│ │ ├── decoder_only/
│ │ │ ├── base.sh
│ │ │ └── layerwise.sh
│ │ ├── encoder_only/
│ │ │ └── base.sh
│ │ └── example_data/
│ │ ├── normal/
│ │ │ └── examples.jsonl
│ │ └── prompt_based/
│ │ └── examples.jsonl
│ └── inference/
│ ├── embedder/
│ │ ├── README.md
│ │ ├── decoder_only/
│ │ │ ├── auto_base_multi_devices.py
│ │ │ ├── auto_base_single_device.py
│ │ │ ├── auto_icl_multi_devices.py
│ │ │ ├── auto_icl_single_device.py
│ │ │ ├── base_multi_devices.py
│ │ │ ├── base_single_device.py
│ │ │ ├── icl_multi_devices.py
│ │ │ └── icl_single_device.py
│ │ └── encoder_only/
│ │ ├── auto_base_multi_devices.py
│ │ ├── auto_base_single_device.py
│ │ ├── auto_m3_multi_devices.py
│ │ ├── auto_m3_single_device.py
│ │ ├── base_multi_devices.py
│ │ ├── base_single_device.py
│ │ ├── m3_multi_devices.py
│ │ ├── m3_multi_devices_compute_score.py
│ │ ├── m3_single_device.py
│ │ └── m3_single_device_compute_score.py
│ └── reranker/
│ ├── README.md
│ ├── decoder_only/
│ │ ├── auto_base_multi_devices.py
│ │ ├── auto_base_single_device.py
│ │ ├── auto_layerwise_multi_devices.py
│ │ ├── auto_layerwise_single_device.py
│ │ ├── auto_lightweight_multi_devices.py
│ │ ├── auto_lightweight_single_device.py
│ │ ├── base_multi_devices.py
│ │ ├── base_single_device.py
│ │ ├── layerwise_multi_devices.py
│ │ ├── layerwise_single_device.py
│ │ ├── lightweight_multi_devices.py
│ │ └── lightweight_single_device.py
│ └── encoder_only/
│ ├── auto_base_multi_devices.py
│ ├── auto_base_single_device.py
│ ├── base_multi_devices.py
│ └── base_single_device.py
├── research/
│ ├── BGE_Coder/
│ │ ├── README.md
│ │ ├── data_generation/
│ │ │ ├── constant.py
│ │ │ ├── corpus_generator.py
│ │ │ ├── format_generated_examples.py
│ │ │ ├── llm.py
│ │ │ ├── run_generation.py
│ │ │ ├── search.py
│ │ │ ├── triplet_generator.py
│ │ │ └── utils.py
│ │ └── evaluation/
│ │ ├── coderag_eval/
│ │ │ ├── eval.sh
│ │ │ ├── prepare_data.sh
│ │ │ └── test/
│ │ │ ├── arguments.py
│ │ │ ├── create/
│ │ │ │ ├── code_search_net.py
│ │ │ │ ├── ds1000.py
│ │ │ │ ├── general_programming.py
│ │ │ │ ├── humaneval.py
│ │ │ │ ├── live_code_bench.py
│ │ │ │ ├── mbpp.py
│ │ │ │ ├── odex.py
│ │ │ │ ├── repoeval.py
│ │ │ │ ├── repoeval_repo.py
│ │ │ │ ├── swebench.py
│ │ │ │ ├── swebench_repo.py
│ │ │ │ └── utils.py
│ │ │ ├── main.py
│ │ │ └── prompts.py
│ │ └── coir_eval/
│ │ ├── arguments.py
│ │ ├── eval.sh
│ │ ├── main.py
│ │ └── prompts.py
│ ├── BGE_M3/
│ │ ├── README.md
│ │ ├── __init__.py
│ │ ├── arguments.py
│ │ ├── data.py
│ │ ├── modeling.py
│ │ ├── run.py
│ │ ├── split_data_by_length.py
│ │ └── trainer.py
│ ├── BGE_Reasoner/
│ │ └── README.md
│ ├── BGE_VL/
│ │ ├── LICENSE
│ │ ├── README.md
│ │ ├── eval/
│ │ │ ├── data/
│ │ │ │ ├── circo_corpus.jsonl
│ │ │ │ ├── circo_query.jsonl
│ │ │ │ ├── fashioniq_dress_corpus.jsonl
│ │ │ │ ├── fashioniq_dress_query_val.jsonl
│ │ │ │ ├── fashioniq_shirt_corpus.jsonl
│ │ │ │ ├── fashioniq_shirt_query_val.jsonl
│ │ │ │ ├── fashioniq_toptee_corpus.jsonl
│ │ │ │ └── fashioniq_toptee_query_val.jsonl
│ │ │ ├── eval_Circo.py
│ │ │ ├── eval_fashioniq.py
│ │ │ ├── flag_dataset.py
│ │ │ ├── flag_mmret.py
│ │ │ └── results/
│ │ │ ├── mmret_base_circo.json
│ │ │ └── mmret_large_circo.json
│ │ ├── modeling_MMRet_CLIP.py
│ │ └── retrieval_demo.ipynb
│ ├── BGE_VL_Screenshot/
│ │ └── README.md
│ ├── C_MTEB/
│ │ ├── C_MTEB/
│ │ │ ├── __init__.py
│ │ │ └── tasks/
│ │ │ ├── Classification.py
│ │ │ ├── Clustering.py
│ │ │ ├── MultiLongDocRetrieval.py
│ │ │ ├── PairClassification.py
│ │ │ ├── Reranking.py
│ │ │ ├── Retrieval.py
│ │ │ ├── STS.py
│ │ │ └── __init__.py
│ │ ├── MKQA/
│ │ │ ├── README.md
│ │ │ ├── dense_retrieval/
│ │ │ │ ├── step0-generate_embedding.py
│ │ │ │ ├── step1-search_results.py
│ │ │ │ └── step2-eval_dense_mkqa.py
│ │ │ ├── hybrid_retrieval/
│ │ │ │ ├── step0-hybrid_search_results.py
│ │ │ │ └── step1-eval_hybrid_mkqa.py
│ │ │ ├── multi_vector_rerank/
│ │ │ │ ├── hybrid_all_results.py
│ │ │ │ ├── step0-rerank_results.py
│ │ │ │ └── step1-eval_rerank_mkqa.py
│ │ │ ├── sparse_retrieval/
│ │ │ │ ├── bm25_baseline.py
│ │ │ │ ├── bm25_baseline_same_tokenizer.py
│ │ │ │ ├── step0-encode_query-and-corpus.py
│ │ │ │ ├── step1-search_results.py
│ │ │ │ └── step2-eval_sparse_mkqa.py
│ │ │ └── utils/
│ │ │ ├── __init__.py
│ │ │ ├── evaluation.py
│ │ │ └── normalize_text.py
│ │ ├── MLDR/
│ │ │ ├── README.md
│ │ │ ├── dense_retrieval/
│ │ │ │ ├── step0-generate_embedding.py
│ │ │ │ ├── step1-search_results.py
│ │ │ │ └── step2-eval_dense_mldr.py
│ │ │ ├── hybrid_retrieval/
│ │ │ │ ├── step0-hybrid_search_results.py
│ │ │ │ └── step1-eval_hybrid_mldr.py
│ │ │ ├── mteb_dense_eval/
│ │ │ │ ├── eval_MLDR.py
│ │ │ │ └── flag_dres_model.py
│ │ │ ├── multi_vector_rerank/
│ │ │ │ ├── hybrid_all_results.py
│ │ │ │ ├── step0-rerank_results.py
│ │ │ │ └── step1-eval_rerank_mldr.py
│ │ │ └── sparse_retrieval/
│ │ │ ├── bm25_baseline.py
│ │ │ ├── bm25_baseline_same_tokenizer.py
│ │ │ ├── step0-encode_query-and-corpus.py
│ │ │ ├── step1-search_results.py
│ │ │ └── step2-eval_sparse_mldr.py
│ │ ├── README.md
│ │ ├── eval_C-MTEB.py
│ │ ├── eval_MTEB.py
│ │ ├── eval_cross_encoder.py
│ │ ├── flag_dres_model.py
│ │ ├── setup.py
│ │ └── summarize_results.py
│ ├── LLARA/
│ │ ├── README.md
│ │ ├── data/
│ │ │ ├── finetune/
│ │ │ │ └── toy_finetune_data.jsonl
│ │ │ └── pretrain/
│ │ │ └── toy_pretrain_data.jsonl
│ │ ├── finetune/
│ │ │ ├── __init__.py
│ │ │ ├── arguments.py
│ │ │ ├── data.py
│ │ │ ├── load_model.py
│ │ │ ├── modeling.py
│ │ │ ├── run.py
│ │ │ └── trainer.py
│ │ ├── pretrain/
│ │ │ ├── __init__.py
│ │ │ ├── arguments.py
│ │ │ ├── data.py
│ │ │ ├── load_model.py
│ │ │ ├── modeling.py
│ │ │ ├── run.py
│ │ │ └── trainer.py
│ │ └── stage1.json
│ ├── LM_Cocktail/
│ │ ├── LM_Cocktail/
│ │ │ ├── __init__.py
│ │ │ ├── cocktail.py
│ │ │ └── utils.py
│ │ ├── README.md
│ │ ├── embedder_examples.json
│ │ ├── llm_examples.json
│ │ └── setup.py
│ ├── Long_LLM/
│ │ ├── activation_beacon/
│ │ │ ├── README.md
│ │ │ ├── data/
│ │ │ │ ├── config/
│ │ │ │ │ ├── code.json
│ │ │ │ │ ├── even.json
│ │ │ │ │ ├── fsdp-offload.yaml
│ │ │ │ │ ├── fsdp.yaml
│ │ │ │ │ ├── slimpajama.json
│ │ │ │ │ ├── zero3-infer-offload.yaml
│ │ │ │ │ └── zero3-infer.yaml
│ │ │ │ ├── deepspeed/
│ │ │ │ │ ├── stage2-offload.json
│ │ │ │ │ ├── stage2.json
│ │ │ │ │ ├── stage3-offload-optim.json
│ │ │ │ │ ├── stage3-offload.json
│ │ │ │ │ └── stage3.json
│ │ │ │ └── toy/
│ │ │ │ └── infbench.json
│ │ │ ├── examples/
│ │ │ │ ├── evaluation.md
│ │ │ │ └── training.md
│ │ │ ├── main/
│ │ │ │ ├── eval_generation.py
│ │ │ │ ├── eval_infbench.py
│ │ │ │ ├── eval_lm.py
│ │ │ │ ├── eval_longbench.py
│ │ │ │ ├── eval_mmlu.py
│ │ │ │ ├── eval_msc.py
│ │ │ │ ├── eval_multiturn.py
│ │ │ │ ├── eval_needle.py
│ │ │ │ ├── eval_passkey.py
│ │ │ │ ├── eval_topic.py
│ │ │ │ ├── infbench_utils.py
│ │ │ │ ├── longbench_utils.py
│ │ │ │ ├── pretrain_data.py
│ │ │ │ ├── train.py
│ │ │ │ └── vllm_symlink.py
│ │ │ └── src/
│ │ │ ├── __init__.py
│ │ │ ├── args.py
│ │ │ ├── chat.py
│ │ │ ├── data.py
│ │ │ ├── llama/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── configuration_llama.py
│ │ │ │ └── modeling_llama.py
│ │ │ ├── metrics.py
│ │ │ ├── mistral/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── configuration_mistral.py
│ │ │ │ └── modeling_mistral.py
│ │ │ ├── modeling_beacon.py
│ │ │ ├── modeling_utils.py
│ │ │ ├── qwen2/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── configuration_qwen2.py
│ │ │ │ └── modeling_qwen2.py
│ │ │ ├── trainer.py
│ │ │ ├── utils.py
│ │ │ └── vllm_utils.py
│ │ └── longllm_qlora/
│ │ ├── README.md
│ │ ├── data/
│ │ │ └── narrativeqa.json
│ │ ├── data_pipeline/
│ │ │ ├── README.md
│ │ │ ├── _openai.py
│ │ │ ├── data/
│ │ │ │ └── README.md
│ │ │ ├── prepare_bio_book.ipynb
│ │ │ ├── prepare_multi_details_book.ipynb
│ │ │ ├── prepare_multi_details_paper_long.ipynb
│ │ │ ├── prepare_one_detail_book.ipynb
│ │ │ ├── prepare_one_detail_paper_long.ipynb
│ │ │ └── raw_data/
│ │ │ └── README.md
│ │ ├── main/
│ │ │ ├── eval_generation.py
│ │ │ ├── eval_infbench.py
│ │ │ ├── eval_lm.py
│ │ │ ├── eval_longbench.py
│ │ │ ├── eval_mmlu.py
│ │ │ ├── eval_needle.py
│ │ │ ├── eval_passkey.py
│ │ │ ├── eval_topic.py
│ │ │ ├── infbench_utils.py
│ │ │ ├── longbench_utils.py
│ │ │ └── train.py
│ │ └── src/
│ │ ├── __init__.py
│ │ ├── args.py
│ │ ├── chat.py
│ │ ├── data.py
│ │ ├── metrics.py
│ │ ├── modeling_utils.py
│ │ ├── trainer.py
│ │ └── utils.py
│ ├── MLVU/
│ │ ├── README.md
│ │ ├── data/
│ │ │ ├── 1_plotQA.json
│ │ │ ├── 2_needle.json
│ │ │ ├── 3_ego.json
│ │ │ ├── 4_count.json
│ │ │ ├── 5_order.json
│ │ │ ├── 6_anomaly_reco.json
│ │ │ ├── 7_topic_reasoning.json
│ │ │ ├── 8_sub_scene.json
│ │ │ └── 9_summary.json
│ │ └── evaluation/
│ │ ├── README.md
│ │ ├── generation_evaluation/
│ │ │ ├── calculate.py
│ │ │ ├── calculate_sum.py
│ │ │ ├── evaluate_ssc.py
│ │ │ ├── evaluate_summary.py
│ │ │ └── open_bench.py
│ │ ├── models/
│ │ │ ├── videochat2/
│ │ │ │ ├── choice_bench.py
│ │ │ │ └── open_bench.py
│ │ │ └── videollava/
│ │ │ ├── choice_bench.py
│ │ │ └── open_bench.py
│ │ └── multiple_choice_evaluation/
│ │ └── choice_bench.py
│ ├── Matroyshka_reranker/
│ │ ├── README.md
│ │ ├── finetune/
│ │ │ ├── compensation/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── arguments.py
│ │ │ │ ├── data.py
│ │ │ │ ├── load_model.py
│ │ │ │ ├── mistral_config.py
│ │ │ │ ├── mistral_model.py
│ │ │ │ ├── modeling.py
│ │ │ │ ├── run.py
│ │ │ │ ├── stage1.json
│ │ │ │ └── trainer.py
│ │ │ └── self_distillation/
│ │ │ ├── __init__.py
│ │ │ ├── arguments.py
│ │ │ ├── data.py
│ │ │ ├── load_model.py
│ │ │ ├── mistral_config.py
│ │ │ ├── mistral_model.py
│ │ │ ├── modeling.py
│ │ │ ├── run.py
│ │ │ ├── stage1.json
│ │ │ └── trainer.py
│ │ ├── inference/
│ │ │ ├── __init__.py
│ │ │ ├── mistral_config.py
│ │ │ ├── mistral_model.py
│ │ │ └── rank_model.py
│ │ └── requirements.txt
│ ├── README.md
│ ├── Reinforced_IR/
│ │ ├── README.md
│ │ ├── data_generation/
│ │ │ ├── agent/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── gpt.py
│ │ │ │ ├── vllm.py
│ │ │ │ └── vllm_instruct.py
│ │ │ ├── generate_generator_data.py
│ │ │ ├── generate_retriever_data.py
│ │ │ ├── generate_retriever_distill_data.py
│ │ │ ├── generate_universal_query.py
│ │ │ ├── prompts/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── generate_prompts.py
│ │ │ │ ├── get_prompts.py
│ │ │ │ ├── hyde_prompts.py
│ │ │ │ ├── teacher_prompts.py
│ │ │ │ └── train_prompts.py
│ │ │ └── utils.py
│ │ ├── finetune/
│ │ │ ├── generator/
│ │ │ │ ├── save_tokenizer.py
│ │ │ │ └── update_file.py
│ │ │ ├── retriever/
│ │ │ │ ├── arguments.py
│ │ │ │ ├── dataset.py
│ │ │ │ ├── modeling.py
│ │ │ │ ├── run.py
│ │ │ │ ├── runner.py
│ │ │ │ └── trainer.py
│ │ │ └── stage1.json
│ │ ├── inference/
│ │ │ ├── agent/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── gpt.py
│ │ │ │ ├── vllm.py
│ │ │ │ └── vllm_instruct.py
│ │ │ ├── ir_model.py
│ │ │ ├── multi.py
│ │ │ └── test.py
│ │ └── requirements.txt
│ ├── baai_general_embedding/
│ │ ├── README.md
│ │ ├── __init__.py
│ │ ├── finetune/
│ │ │ ├── __init__.py
│ │ │ ├── arguments.py
│ │ │ ├── data.py
│ │ │ ├── eval_msmarco.py
│ │ │ ├── hn_mine.py
│ │ │ ├── modeling.py
│ │ │ ├── run.py
│ │ │ └── trainer.py
│ │ └── retromae_pretrain/
│ │ ├── __init__.py
│ │ ├── arguments.py
│ │ ├── data.py
│ │ ├── enhancedDecoder.py
│ │ ├── modeling.py
│ │ ├── run.py
│ │ ├── trainer.py
│ │ └── utils.py
│ ├── llm_dense_retriever/
│ │ ├── README.md
│ │ ├── examples/
│ │ │ └── bge-en-icl/
│ │ │ ├── AIR-Bench/
│ │ │ │ ├── long-doc/
│ │ │ │ │ ├── arxiv-gemini.jsonl
│ │ │ │ │ ├── arxiv-gpt3.jsonl
│ │ │ │ │ ├── arxiv-llama2.jsonl
│ │ │ │ │ ├── arxiv-llm-survey.jsonl
│ │ │ │ │ ├── book-a-brief-history-of-time_stephen-hawking.jsonl
│ │ │ │ │ ├── book-origin-of-species_darwin.jsonl
│ │ │ │ │ ├── healthcare-pubmed_100k-200k_1.jsonl
│ │ │ │ │ ├── healthcare-pubmed_100k-200k_2.jsonl
│ │ │ │ │ ├── healthcare-pubmed_100k-200k_3.jsonl
│ │ │ │ │ ├── healthcare-pubmed_30k-40k_10-merged.jsonl
│ │ │ │ │ ├── healthcare-pubmed_40k-50k_5-merged.jsonl
│ │ │ │ │ ├── law-lex_files_300k-400k.jsonl
│ │ │ │ │ ├── law-lex_files_400k-500k.jsonl
│ │ │ │ │ ├── law-lex_files_500k-600k.jsonl
│ │ │ │ │ └── law-lex_files_600k-700k.jsonl
│ │ │ │ └── qa/
│ │ │ │ ├── arxiv.jsonl
│ │ │ │ ├── finance.jsonl
│ │ │ │ ├── healthcare.jsonl
│ │ │ │ ├── law.jsonl
│ │ │ │ ├── msmarco.jsonl
│ │ │ │ ├── news.jsonl
│ │ │ │ ├── web.jsonl
│ │ │ │ └── wiki.jsonl
│ │ │ └── MTEB/
│ │ │ ├── AmazonCounterfactualClassification.json
│ │ │ ├── AmazonPolarityClassification.json
│ │ │ ├── AmazonReviewsClassification.json
│ │ │ ├── ArguAna.json
│ │ │ ├── ArxivClusteringP2P.json
│ │ │ ├── ArxivClusteringS2S.json
│ │ │ ├── AskUbuntuDupQuestions.json
│ │ │ ├── BIOSSES.json
│ │ │ ├── Banking77Classification.json
│ │ │ ├── BiorxivClusteringP2P.json
│ │ │ ├── BiorxivClusteringS2S.json
│ │ │ ├── CQADupstackRetrieval.json
│ │ │ ├── ClimateFEVER.json
│ │ │ ├── DBPedia.json
│ │ │ ├── EmotionClassification.json
│ │ │ ├── FEVER.json
│ │ │ ├── FiQA2018.json
│ │ │ ├── HotpotQA.json
│ │ │ ├── ImdbClassification.json
│ │ │ ├── MSMARCO.json
│ │ │ ├── MTOPDomainClassification.json
│ │ │ ├── MTOPIntentClassification.json
│ │ │ ├── MassiveIntentClassification.json
│ │ │ ├── MassiveScenarioClassification.json
│ │ │ ├── MedrxivClusteringP2P.json
│ │ │ ├── MedrxivClusteringS2S.json
│ │ │ ├── MindSmallReranking.json
│ │ │ ├── NFCorpus.json
│ │ │ ├── NQ.json
│ │ │ ├── QuoraRetrieval.json
│ │ │ ├── RedditClustering.json
│ │ │ ├── RedditClusteringP2P.json
│ │ │ ├── SCIDOCS.json
│ │ │ ├── SICK-R.json
│ │ │ ├── STS12.json
│ │ │ ├── STS13.json
│ │ │ ├── STS14.json
│ │ │ ├── STS15.json
│ │ │ ├── STS16.json
│ │ │ ├── STS17.json
│ │ │ ├── STS22.json
│ │ │ ├── STSBenchmark.json
│ │ │ ├── SciDocsRR.json
│ │ │ ├── SciFact.json
│ │ │ ├── SprintDuplicateQuestions.json
│ │ │ ├── StackExchangeClustering.json
│ │ │ ├── StackExchangeClusteringP2P.json
│ │ │ ├── StackOverflowDupQuestions.json
│ │ │ ├── SummEval.json
│ │ │ ├── TRECCOVID.json
│ │ │ ├── Touche2020.json
│ │ │ ├── ToxicConversationsClassification.json
│ │ │ ├── TweetSentimentExtractionClassification.json
│ │ │ ├── TwentyNewsgroupsClustering.json
│ │ │ ├── TwitterSemEval2015.json
│ │ │ └── TwitterURLCorpus.json
│ │ └── finetune/
│ │ ├── arguments.py
│ │ ├── data.py
│ │ ├── load_model.py
│ │ ├── modeling.py
│ │ ├── run.py
│ │ └── trainer.py
│ ├── llm_embedder/
│ │ ├── README.md
│ │ ├── data/
│ │ │ ├── deepspeed/
│ │ │ │ ├── stage0.json
│ │ │ │ ├── stage2-offload.json
│ │ │ │ ├── stage2.json
│ │ │ │ ├── stage3-offload-all.json
│ │ │ │ ├── stage3-offload-optim.json
│ │ │ │ └── stage3.json
│ │ │ └── toy/
│ │ │ ├── chat.json
│ │ │ ├── convsearch.json
│ │ │ ├── icl.json
│ │ │ ├── lrlm.json
│ │ │ ├── qa.json
│ │ │ └── tool.json
│ │ ├── docs/
│ │ │ ├── evaluation.md
│ │ │ └── fine-tune.md
│ │ ├── environment.yaml
│ │ ├── evaluation/
│ │ │ ├── __init__.py
│ │ │ ├── eval_icl.py
│ │ │ ├── eval_lrlm.py
│ │ │ ├── eval_mmlu.py
│ │ │ ├── eval_msc.py
│ │ │ ├── eval_popqa.py
│ │ │ ├── eval_qa.py
│ │ │ ├── eval_qrecc.py
│ │ │ ├── eval_retrieval.py
│ │ │ ├── eval_tool.py
│ │ │ └── icl_utils.py
│ │ ├── run_dense.py
│ │ ├── run_lm_score.py
│ │ ├── run_ranker.py
│ │ ├── scripts/
│ │ │ ├── llm-embedder.sh
│ │ │ └── ours2st.py
│ │ └── src/
│ │ ├── __init__.py
│ │ ├── lm/
│ │ │ ├── __init__.py
│ │ │ ├── args.py
│ │ │ ├── modeling_lm.py
│ │ │ └── modeling_srlm.py
│ │ ├── retrieval/
│ │ │ ├── __init__.py
│ │ │ ├── args.py
│ │ │ ├── data.py
│ │ │ ├── evalnq.py
│ │ │ ├── metrics.py
│ │ │ ├── modeling_bm25.py
│ │ │ ├── modeling_dense.py
│ │ │ ├── modeling_ranker.py
│ │ │ ├── modeling_unified.py
│ │ │ └── trainer.py
│ │ └── utils/
│ │ ├── __init__.py
│ │ ├── llama_patch.py
│ │ └── util.py
│ ├── llm_reranker/
│ │ ├── README.md
│ │ ├── __init__.py
│ │ ├── evaluate.py
│ │ ├── finetune_for_instruction/
│ │ │ ├── __init__.py
│ │ │ ├── arguments.py
│ │ │ ├── data.py
│ │ │ ├── load_model.py
│ │ │ ├── modeling.py
│ │ │ ├── run.py
│ │ │ └── trainer.py
│ │ ├── finetune_for_layerwise/
│ │ │ ├── __init__.py
│ │ │ ├── arguments.py
│ │ │ ├── configuration_minicpm_reranker.py
│ │ │ ├── data.py
│ │ │ ├── load_model.py
│ │ │ ├── modeling.py
│ │ │ ├── modeling_minicpm_reranker.py
│ │ │ ├── run.py
│ │ │ └── trainer.py
│ │ ├── merge/
│ │ │ ├── __init__.py
│ │ │ ├── configuration_minicpm_reranker.py
│ │ │ ├── merge_base_model.py
│ │ │ ├── merge_layerwise_model_from_finetuned_model.py
│ │ │ ├── merge_layerwise_model_from_raw_model.py
│ │ │ └── modeling_minicpm_reranker.py
│ │ ├── stage1.json
│ │ └── toy_finetune_data.jsonl
│ ├── old-examples/
│ │ ├── finetune/
│ │ │ ├── README.md
│ │ │ ├── ds_config.json
│ │ │ ├── toy_evaluation_data/
│ │ │ │ ├── toy_corpus.json
│ │ │ │ └── toy_query.json
│ │ │ └── toy_finetune_data.jsonl
│ │ ├── pretrain/
│ │ │ ├── README.md
│ │ │ ├── retromae_pretrain/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── arguments.py
│ │ │ │ ├── data.py
│ │ │ │ ├── enhancedDecoder.py
│ │ │ │ ├── modeling.py
│ │ │ │ ├── run.py
│ │ │ │ ├── trainer.py
│ │ │ │ └── utils.py
│ │ │ └── toy_pretrain_data.jsonl
│ │ ├── reranker/
│ │ │ ├── README.md
│ │ │ ├── ds_config.json
│ │ │ └── toy_finetune_data.jsonl
│ │ ├── search_demo/
│ │ │ ├── __init__.py
│ │ │ ├── arguments.py
│ │ │ ├── pre_process.py
│ │ │ ├── readme.md
│ │ │ ├── requirements.txt
│ │ │ ├── run.py
│ │ │ └── tool.py
│ │ └── unified_finetune/
│ │ ├── README.md
│ │ ├── toy_train_data/
│ │ │ ├── toy_train_data1.jsonl
│ │ │ └── toy_train_data2.jsonl
│ │ └── unified_finetune_bge-m3_exmaple.sh
│ ├── reranker/
│ │ ├── README.md
│ │ ├── __init__.py
│ │ ├── arguments.py
│ │ ├── data.py
│ │ ├── modeling.py
│ │ ├── run.py
│ │ └── trainer.py
│ └── visual_bge/
│ ├── README.md
│ ├── __init__.py
│ ├── setup.py
│ └── visual_bge/
│ ├── eva_clip/
│ │ ├── __init__.py
│ │ ├── constants.py
│ │ ├── eva_vit_model.py
│ │ ├── factory.py
│ │ ├── hf_configs.py
│ │ ├── hf_model.py
│ │ ├── loss.py
│ │ ├── model.py
│ │ ├── model_configs/
│ │ │ ├── EVA01-CLIP-B-16.json
│ │ │ ├── EVA01-CLIP-g-14-plus.json
│ │ │ ├── EVA01-CLIP-g-14.json
│ │ │ ├── EVA02-CLIP-B-16.json
│ │ │ ├── EVA02-CLIP-L-14-336.json
│ │ │ ├── EVA02-CLIP-L-14.json
│ │ │ ├── EVA02-CLIP-bigE-14-plus.json
│ │ │ └── EVA02-CLIP-bigE-14.json
│ │ ├── modified_resnet.py
│ │ ├── openai.py
│ │ ├── pretrained.py
│ │ ├── rope.py
│ │ ├── timm_model.py
│ │ ├── tokenizer.py
│ │ ├── transform.py
│ │ ├── transformer.py
│ │ └── utils.py
│ └── modeling.py
├── scripts/
│ ├── README.md
│ ├── add_reranker_score.py
│ ├── hn_mine.py
│ └── split_data_by_length.py
├── setup.py
└── tests/
├── README.md
├── conftest.py
├── test_imports_v5.py
├── test_infer_embedder_basic.py
└── test_infer_reranker_basic.py
================================================
FILE CONTENTS
================================================
================================================
FILE: .github/workflows/documentation.yml
================================================
# Build the Sphinx documentation and publish it to GitHub Pages.
name: documentation

on: [push, pull_request, workflow_dispatch]

# The deploy step below pushes to the gh-pages branch using GITHUB_TOKEN,
# so the token is granted write access to repository contents.
permissions:
  contents: write

jobs:
  docs:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
      # Installs the package itself plus the Sphinx toolchain.
      - name: Install doc dependencies
        run: |
          pip install . sphinx myst_parser myst-nb sphinx-design pydata-sphinx-theme sphinxcontrib-googleanalytics
      - name: Install content dependencies
        run: |
          pip install faiss-cpu mteb air-benchmark beir
      - name: Sphinx build
        run: |
          sphinx-build docs/source docs/build
      # Custom domain file written into the build output before publishing.
      - name: Add CNAME
        run: |
          echo bge-model.com > docs/build/CNAME
      # Only push events on master publish; pull requests and manual runs stop after the build.
      - name: Deploy to GitHub Pages
        uses: peaceiris/actions-gh-pages@v3
        if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
        with:
          publish_branch: gh-pages
          github_token: ${{ secrets.GITHUB_TOKEN }}
          publish_dir: docs/build/
          force_orphan: true
================================================
FILE: .gitignore
================================================
*.memmap
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
.idea/
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
# Patterns are resolved relative to this .gitignore's directory; a leading
# "../" can never match anything, so the build output is addressed directly.
docs/_build/
docs/build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
Untitled.ipynb
try.py
update_model_card.py
model_card.md
pic.py
pic2.py
# Pyre type checker
.pyre/
# MacOS associated
.DS_Store
# results
en_results
zh_results
================================================
FILE: FlagEmbedding/__init__.py
================================================
from .abc.inference import *
from .inference import *
================================================
FILE: FlagEmbedding/abc/__init__.py
================================================
================================================
FILE: FlagEmbedding/abc/evaluation/__init__.py
================================================
from .arguments import AbsEvalArgs, AbsEvalModelArgs
from .evaluator import AbsEvaluator
from .data_loader import AbsEvalDataLoader
from .searcher import EvalRetriever, EvalDenseRetriever, EvalReranker
from .runner import AbsEvalRunner
# Names exported by ``from <this package> import *``; mirrors the imports above.
__all__ = [
"AbsEvalArgs",
"AbsEvalModelArgs",
"AbsEvaluator",
"AbsEvalDataLoader",
"EvalRetriever",
"EvalDenseRetriever",
"EvalReranker",
"AbsEvalRunner",
]
================================================
FILE: FlagEmbedding/abc/evaluation/arguments.py
================================================
"""
Adapted from https://github.com/AIR-Bench/AIR-Bench/blob/0.1.0/air_benchmark/evaluation_utils/evaluation_arguments.py
"""
import os
from dataclasses import dataclass, field
from typing import List, Optional
@dataclass
class AbsEvalArgs:
    """
    Base class for evaluation arguments.

    Every field carries a ``help`` entry (and, where several values may be
    given, ``nargs``) in its ``metadata`` so that an argument parser can build
    command-line options from the dataclass.

    NOTE(review): annotations were tightened to match the declared defaults
    (``Optional[...]`` for ``None`` defaults, ``List[...]`` for list defaults).
    If a parser derives option types from the annotations, confirm that it
    treats ``List[X]`` the same as ``X`` combined with ``nargs="+"``.
    """
    eval_name: Optional[str] = field(
        default=None,
        metadata={"help": "The name of the evaluation task, such as msmarco, beir, miracl, etc."}
    )
    dataset_dir: Optional[str] = field(
        default=None,
        metadata={
            "help": "1) If you want to perform evaluation on your own dataset, you can provide the path to the dataset directory (must exist locally). "
                    "The dataset directory should contain the following files: corpus.jsonl, <split>_queries.jsonl, <split>_qrels.jsonl, or contain multiple directories, each of which contains the following files: corpus.jsonl, <split>_queries.jsonl, <split>_qrels.jsonl. "
                    "2) If you want to perform evaluation on the datasets we provide evaluation APIs for, you can provide the path to saving the downloaded dataset. If you provide None, the dataset will be only downloaded to the cache directory."
        }
    )
    force_redownload: bool = field(
        default=False,
        metadata={"help": "Whether to force redownload the dataset. This is useful when you load dataset from remote and want to update the dataset."}
    )
    dataset_names: Optional[List[str]] = field(
        default=None,
        metadata={
            "help": "The names of the datasets to evaluate. Default: None. If None, all available datasets will be evaluated. The name can be a specific dataset name (BEIR), a specific language (MIRACL), etc.",
            "nargs": "+"
        }
    )
    # NOTE(review): ``nargs="+"`` suggests a list may be supplied, but the
    # default is a plain string, so the annotation is deliberately left as
    # ``str``. Confirm against the consuming parser before changing it.
    splits: str = field(
        default="test",
        metadata={"help": "Splits to evaluate. Default: test", "nargs": "+"}
    )
    corpus_embd_save_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Path to save corpus embeddings. If None, embeddings are not saved."}
    )
    output_dir: str = field(
        default="./search_results",
        metadata={"help": "Path to save results."}
    )
    search_top_k: int = field(
        default=1000,
        metadata={"help": "Top k for retrieving."}
    )
    rerank_top_k: int = field(
        default=100,
        metadata={"help": "Top k for reranking."}
    )
    cache_path: Optional[str] = field(
        default=None,
        metadata={"help": "Cache directory for loading datasets."}
    )
    # Read from the environment at instantiation time; ``None`` when HF_TOKEN is unset.
    token: Optional[str] = field(
        default_factory=lambda: os.getenv('HF_TOKEN', None),
        metadata={"help": "The token to use when accessing the model."}
    )
    overwrite: bool = field(
        default=False,
        metadata={"help": "whether to overwrite evaluation results"}
    )
    ignore_identical_ids: bool = field(
        default=False,
        metadata={"help": "whether to ignore identical ids in search results"}
    )
    # ================ for evaluation ===============
    k_values: List[int] = field(
        default_factory=lambda: [1, 3, 5, 10, 100, 1000],
        metadata={"help": "k values for evaluation. Default: [1, 3, 5, 10, 100, 1000]", "nargs": "+"}
    )
    eval_output_method: str = field(
        default="markdown",
        metadata={
            "help": "The output method for evaluation results. Available methods: ['json', 'markdown']. Default: markdown.",
            "choices": ["json", "markdown"]
        }
    )
    eval_output_path: str = field(
        default="./eval_results.md",
        metadata={"help": "The path to save evaluation results."}
    )
    eval_metrics: List[str] = field(
        default_factory=lambda: ["ndcg_at_10", "recall_at_10"],
        metadata={"help": "The metrics to evaluate. Default: ['ndcg_at_10', 'recall_at_10']", "nargs": "+"}
    )
@dataclass
class AbsEvalModelArgs:
    """
    Base class for model arguments during evaluation.

    ``embedder_name_or_path`` is the only field without a default. Fields
    whose names contain ``reranker``/``rerank``, plus the trailing
    ``normalize`` ... ``compress_layers`` group, configure the optional reranker.

    NOTE(review): annotations were tightened to match the declared defaults
    (``Optional[...]`` for ``None`` defaults, ``List[...]`` for ``nargs="+"``
    fields). If a parser derives option types from the annotations, confirm
    that it treats ``Optional[List[X]]`` the same as ``Optional[X]`` combined
    with ``nargs="+"``.
    """
    embedder_name_or_path: str = field(
        metadata={"help": "The embedder name or path.", "required": True}
    )
    embedder_model_class: Optional[str] = field(
        default=None,
        metadata={
            "help": "The embedder model class. Available classes: ['encoder-only-base', 'encoder-only-m3', 'decoder-only-base', 'decoder-only-icl']. Default: None. For the custom model, you need to specify the model class.",
            "choices": ["encoder-only-base", "encoder-only-m3", "decoder-only-base", "decoder-only-icl"]
        }
    )
    normalize_embeddings: bool = field(
        default=True,
        metadata={"help": "whether to normalize the embeddings"}
    )
    pooling_method: str = field(
        default="cls",
        metadata={"help": "The pooling method for the embedder."}
    )
    use_fp16: bool = field(
        default=True,
        metadata={"help": "whether to use fp16 for inference"}
    )
    devices: Optional[List[str]] = field(
        default=None,
        metadata={"help": "Devices to use for inference.", "nargs": "+"}
    )
    query_instruction_for_retrieval: Optional[str] = field(
        default=None,
        metadata={"help": "Instruction for query"}
    )
    query_instruction_format_for_retrieval: str = field(
        default="{}{}",
        metadata={"help": "Format for query instruction"}
    )
    examples_for_task: Optional[str] = field(
        default=None,
        metadata={"help": "Examples for task"}
    )
    examples_instruction_format: str = field(
        default="{}{}",
        metadata={"help": "Format for examples instruction"}
    )
    trust_remote_code: bool = field(
        default=False,
        metadata={"help": "Trust remote code"}
    )
    reranker_name_or_path: Optional[str] = field(
        default=None,
        metadata={"help": "The reranker name or path."}
    )
    reranker_model_class: Optional[str] = field(
        default=None,
        metadata={
            "help": "The reranker model class. Available classes: ['encoder-only-base', 'decoder-only-base', 'decoder-only-layerwise', 'decoder-only-lightweight']. Default: None. For the custom model, you need to specify the model class.",
            "choices": ["encoder-only-base", "decoder-only-base", "decoder-only-layerwise", "decoder-only-lightweight"]
        }
    )
    reranker_peft_path: Optional[str] = field(
        default=None,
        metadata={"help": "The reranker peft path."}
    )
    use_bf16: bool = field(
        default=False,
        metadata={"help": "whether to use bf16 for inference"}
    )
    query_instruction_for_rerank: Optional[str] = field(
        default=None,
        metadata={"help": "Instruction for query"}
    )
    query_instruction_format_for_rerank: str = field(
        default="{}{}",
        metadata={"help": "Format for query instruction"}
    )
    passage_instruction_for_rerank: Optional[str] = field(
        default=None,
        metadata={"help": "Instruction for passage"}
    )
    passage_instruction_format_for_rerank: str = field(
        default="{}{}",
        metadata={"help": "Format for passage instruction"}
    )
    cache_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Cache directory for models."}
    )
    # ================ for inference ===============
    embedder_batch_size: int = field(
        default=3000,
        metadata={"help": "Batch size for inference."}
    )
    reranker_batch_size: int = field(
        default=3000,
        metadata={"help": "Batch size for inference."}
    )
    embedder_query_max_length: int = field(
        default=512,
        metadata={"help": "Max length for query."}
    )
    embedder_passage_max_length: int = field(
        default=512,
        metadata={"help": "Max length for passage."}
    )
    reranker_query_max_length: Optional[int] = field(
        default=None,
        metadata={"help": "Max length for reranking."}
    )
    reranker_max_length: int = field(
        default=512,
        metadata={"help": "Max length for reranking."}
    )
    normalize: bool = field(
        default=False,
        metadata={"help": "whether to normalize the reranking scores"}
    )
    prompt: Optional[str] = field(
        default=None,
        metadata={"help": "The prompt for the reranker."}
    )
    cutoff_layers: Optional[List[int]] = field(
        default=None,
        metadata={"help": "The output layers of layerwise/lightweight reranker."}
    )
    compress_ratio: int = field(
        default=1,
        metadata={"help": "The compress ratio of lightweight reranker."}
    )
    compress_layers: Optional[List[int]] = field(
        default=None,
        metadata={"help": "The compress layers of lightweight reranker.", "nargs": "+"}
    )

    def __post_init__(self):
        """Replace the two-character sequence backslash + ``n`` with a real newline
        in each instruction format template."""
        format_fields = (
            "query_instruction_format_for_retrieval",
            "examples_instruction_format",
            "query_instruction_format_for_rerank",
            "passage_instruction_format_for_rerank",
        )
        for name in format_fields:
            template = getattr(self, name)
            # replace "\\n" with "\n"
            if "\\n" in template:
                setattr(self, name, template.replace("\\n", "\n"))
================================================
FILE: FlagEmbedding/abc/evaluation/data_loader.py
================================================
"""
Adapted from https://github.com/AIR-Bench/AIR-Bench/blob/0.1.0/air_benchmark/evaluation_utils/data_loader.py
"""
import os
import logging
import datasets
import subprocess
from abc import ABC, abstractmethod
from typing import List, Optional, Union
logger = logging.getLogger(__name__)
class AbsEvalDataLoader(ABC):
    """
    Base class of data loader for evaluation.

    When ``dataset_dir`` is set, data is read from local ``corpus.jsonl``,
    ``<split>_queries.jsonl`` and ``<split>_qrels.jsonl`` files, falling back to
    the ``_load_remote_*`` hooks for files that are missing (or when
    ``force_redownload`` is set). Without ``dataset_dir`` the remote hooks are
    used directly. Subclasses must implement :meth:`available_splits` and
    override the ``_load_remote_*`` hooks to support remote datasets.

    Args:
        eval_name (str): The experiment name of current evaluation.
        dataset_dir (str, optional): path to the datasets. Defaults to ``None``.
        cache_dir (str, optional): Path to HuggingFace cache directory. Defaults to ``None``.
        token (str, optional): HF_TOKEN to access the private datasets/models in HF. Defaults to ``None``.
        force_redownload: If True, will force redownload the dataset to cover the local dataset. Defaults to ``False``.
    """
    def __init__(
        self,
        eval_name: str,
        dataset_dir: Optional[str] = None,
        cache_dir: Optional[str] = None,
        token: Optional[str] = None,
        force_redownload: bool = False
    ):
        self.eval_name = eval_name
        self.dataset_dir = dataset_dir
        if cache_dir is None:
            # ``~`` is only meaningful to a shell; without expanduser the
            # fallback would create a literal "~" directory under the CWD.
            cache_dir = os.path.expanduser(os.getenv('HF_HUB_CACHE', '~/.cache/huggingface/hub'))
        # Each evaluation gets its own sub-directory inside the cache.
        self.cache_dir = os.path.join(cache_dir, eval_name)
        self.token = token
        self.force_redownload = force_redownload
        self.hf_download_mode = None if not force_redownload else "force_redownload"

    def available_dataset_names(self) -> List[str]:
        """
        Returns: List[str]: Available dataset names.
        """
        return []

    @abstractmethod
    def available_splits(self, dataset_name: Optional[str] = None) -> List[str]:
        """
        Returns: List[str]: Available splits in the dataset.
        """
        pass

    def check_dataset_names(self, dataset_names: Union[str, List[str]]) -> List[str]:
        """Check the validity of dataset names.

        Args:
            dataset_names (Union[str, List[str]]): a dataset name (str) or a list of dataset names (List[str])

        Raises:
            ValueError: If any name is not in :meth:`available_dataset_names`.

        Returns:
            List[str]: List of valid dataset names.
        """
        available_dataset_names = self.available_dataset_names()
        if isinstance(dataset_names, str):
            dataset_names = [dataset_names]

        for dataset_name in dataset_names:
            if dataset_name not in available_dataset_names:
                raise ValueError(f"Dataset name '{dataset_name}' not found in the dataset. Available dataset names: {available_dataset_names}")
        return dataset_names

    def check_splits(self, splits: Union[str, List[str]], dataset_name: Optional[str] = None) -> List[str]:
        """Check whether the splits are available in the dataset.

        Unknown splits are dropped with a warning rather than raising.

        Args:
            splits (Union[str, List[str]]): Splits to check.
            dataset_name (Optional[str], optional): Name of dataset to check. Defaults to ``None``.

        Returns:
            List[str]: The available splits.
        """
        available_splits = self.available_splits(dataset_name=dataset_name)
        if isinstance(splits, str):
            splits = [splits]

        checked_splits = []
        for split in splits:
            if split not in available_splits:
                logger.warning(f"Split '{split}' not found in the dataset. Removing it from the list.")
            else:
                checked_splits.append(split)
        return checked_splits

    def load_corpus(self, dataset_name: Optional[str] = None) -> datasets.DatasetDict:
        """Load the corpus from the dataset.

        Args:
            dataset_name (Optional[str], optional): Name of the dataset. Defaults to ``None``.

        Returns:
            datasets.DatasetDict: A dict of corpus with id as key, title and text as value.
        """
        if self.dataset_dir is not None:
            if dataset_name is None:
                save_dir = self.dataset_dir
            else:
                save_dir = os.path.join(self.dataset_dir, dataset_name)
            return self._load_local_corpus(save_dir, dataset_name=dataset_name)
        else:
            return self._load_remote_corpus(dataset_name=dataset_name)

    def load_qrels(self, dataset_name: Optional[str] = None, split: str = 'test') -> datasets.DatasetDict:
        """Load the qrels from the dataset.

        Args:
            dataset_name (Optional[str], optional): Name of the dataset. Defaults to ``None``.
            split (str, optional): The split to load relevance from. Defaults to ``'test'``.

        Raises:
            ValueError: If the dataset name or the split is not available.

        Returns:
            datasets.DatasetDict: A dict of relevance of query and document.
        """
        if self.dataset_dir is not None:
            if dataset_name is None:
                save_dir = self.dataset_dir
            else:
                checked_dataset_names = self.check_dataset_names(dataset_name)
                if len(checked_dataset_names) == 0:
                    raise ValueError(f"Dataset name {dataset_name} not found in the dataset.")
                dataset_name = checked_dataset_names[0]

                save_dir = os.path.join(self.dataset_dir, dataset_name)

            return self._load_local_qrels(save_dir, dataset_name=dataset_name, split=split)
        else:
            return self._load_remote_qrels(dataset_name=dataset_name, split=split)

    def load_queries(self, dataset_name: Optional[str] = None, split: str = 'test') -> datasets.DatasetDict:
        """Load the queries from the dataset.

        Args:
            dataset_name (Optional[str], optional): Name of the dataset. Defaults to ``None``.
            split (str, optional): The split to load queries from. Defaults to ``'test'``.

        Raises:
            ValueError: If the dataset name or the split is not available.

        Returns:
            datasets.DatasetDict: A dict of queries with id as key, query text as value.
        """
        if self.dataset_dir is not None:
            if dataset_name is None:
                save_dir = self.dataset_dir
            else:
                checked_dataset_names = self.check_dataset_names(dataset_name)
                if len(checked_dataset_names) == 0:
                    raise ValueError(f"Dataset name {dataset_name} not found in the dataset.")
                dataset_name = checked_dataset_names[0]

                save_dir = os.path.join(self.dataset_dir, dataset_name)

            return self._load_local_queries(save_dir, dataset_name=dataset_name, split=split)
        else:
            return self._load_remote_queries(dataset_name=dataset_name, split=split)

    def _load_remote_corpus(
        self,
        dataset_name: Optional[str] = None,
        save_dir: Optional[str] = None
    ) -> datasets.DatasetDict:
        """Hook to load corpus from remote dataset, to be overridden in child class.

        Args:
            dataset_name (Optional[str], optional): Name of the dataset. Defaults to ``None``.
            save_dir (Optional[str], optional): Path to save the new downloaded corpus. Defaults to ``None``.

        Raises:
            NotImplementedError: Loading remote corpus is not implemented.

        Returns:
            datasets.DatasetDict: A dict of corpus with id as key, title and text as value.
        """
        raise NotImplementedError("Loading remote corpus is not implemented.")

    def _load_remote_qrels(
        self,
        dataset_name: Optional[str] = None,
        split: str = 'test',
        save_dir: Optional[str] = None
    ) -> datasets.DatasetDict:
        """Hook to load relevance from remote dataset, to be overridden in child class.

        Args:
            dataset_name (Optional[str], optional): Name of the dataset. Defaults to ``None``.
            split (str, optional): Split to load from the remote dataset. Defaults to ``'test'``.
            save_dir (Optional[str], optional): Path to save the new downloaded relevance. Defaults to ``None``.

        Raises:
            NotImplementedError: Loading remote qrels is not implemented.

        Returns:
            datasets.DatasetDict: A dict of relevance of query and document.
        """
        raise NotImplementedError("Loading remote qrels is not implemented.")

    def _load_remote_queries(
        self,
        dataset_name: Optional[str] = None,
        split: str = 'test',
        save_dir: Optional[str] = None
    ) -> datasets.DatasetDict:
        """Hook to load queries from remote dataset, to be overridden in child class.

        Args:
            dataset_name (Optional[str], optional): Name of the dataset. Defaults to ``None``.
            split (str, optional): Split to load from the remote dataset. Defaults to ``'test'``.
            save_dir (Optional[str], optional): Path to save the new downloaded queries. Defaults to ``None``.

        Raises:
            NotImplementedError: Loading remote queries is not implemented.

        Returns:
            datasets.DatasetDict: A dict of queries with id as key, query text as value.
        """
        raise NotImplementedError("Loading remote queries is not implemented.")

    def _load_local_corpus(self, save_dir: str, dataset_name: Optional[str] = None) -> datasets.DatasetDict:
        """Load corpus from local dataset.

        Falls back to :meth:`_load_remote_corpus` when ``corpus.jsonl`` is
        missing or ``force_redownload`` is set.

        Args:
            save_dir (str): Directory expected to contain ``corpus.jsonl``.
            dataset_name (Optional[str], optional): Name of the dataset. Defaults to ``None``.

        Returns:
            datasets.DatasetDict: A dict of corpus with id as key, title and text as value.
        """
        corpus_path = os.path.join(save_dir, 'corpus.jsonl')
        if self.force_redownload or not os.path.exists(corpus_path):
            logger.warning(f"Corpus not found in {corpus_path}. Trying to download the corpus from the remote and save it to {save_dir}.")
            return self._load_remote_corpus(dataset_name=dataset_name, save_dir=save_dir)
        else:
            corpus_data = datasets.load_dataset('json', data_files=corpus_path, cache_dir=self.cache_dir)['train']

            corpus = {}
            for e in corpus_data:
                # ``title`` is optional in the local file; ``id`` and ``text`` are required.
                corpus[e['id']] = {'title': e.get('title', ""), 'text': e['text']}

            return datasets.DatasetDict(corpus)

    def _load_local_qrels(self, save_dir: str, dataset_name: Optional[str] = None, split: str = 'test') -> datasets.DatasetDict:
        """Load relevance from local dataset.

        Falls back to :meth:`_load_remote_qrels` when ``<split>_qrels.jsonl`` is
        missing or ``force_redownload`` is set.

        Args:
            save_dir (str): Directory expected to contain ``<split>_qrels.jsonl``.
            dataset_name (Optional[str], optional): Name of the dataset. Defaults to ``None``.
            split (str, optional): Split to load from the local dataset. Defaults to ``'test'``.

        Raises:
            ValueError: If the split is not available.

        Returns:
            datasets.DatasetDict: A dict of relevance of query and document.
        """
        checked_split = self.check_splits(split, dataset_name=dataset_name)
        if len(checked_split) == 0:
            raise ValueError(f"Split {split} not found in the dataset.")
        split = checked_split[0]

        qrels_path = os.path.join(save_dir, f"{split}_qrels.jsonl")
        if self.force_redownload or not os.path.exists(qrels_path):
            logger.warning(f"Qrels not found in {qrels_path}. Trying to download the qrels from the remote and save it to {save_dir}.")
            return self._load_remote_qrels(dataset_name=dataset_name, split=split, save_dir=save_dir)
        else:
            qrels_data = datasets.load_dataset('json', data_files=qrels_path, cache_dir=self.cache_dir)['train']

            # Group rows as {qid: {docid: relevance}}.
            qrels = {}
            for data in qrels_data:
                qid = data['qid']
                if qid not in qrels:
                    qrels[qid] = {}
                qrels[qid][data['docid']] = data['relevance']

            return datasets.DatasetDict(qrels)

    def _load_local_queries(self, save_dir: str, dataset_name: Optional[str] = None, split: str = 'test') -> datasets.DatasetDict:
        """Load queries from local dataset.

        Falls back to :meth:`_load_remote_queries` when ``<split>_queries.jsonl``
        is missing or ``force_redownload`` is set.

        Args:
            save_dir (str): Directory expected to contain ``<split>_queries.jsonl``.
            dataset_name (Optional[str], optional): Name of the dataset. Defaults to ``None``.
            split (str, optional): Split to load from the local dataset. Defaults to ``'test'``.

        Raises:
            ValueError: If the split is not available.

        Returns:
            datasets.DatasetDict: A dict of queries with id as key, query text as value.
        """
        checked_split = self.check_splits(split, dataset_name=dataset_name)
        if len(checked_split) == 0:
            raise ValueError(f"Split {split} not found in the dataset.")
        split = checked_split[0]

        queries_path = os.path.join(save_dir, f"{split}_queries.jsonl")
        if self.force_redownload or not os.path.exists(queries_path):
            logger.warning(f"Queries not found in {queries_path}. Trying to download the queries from the remote and save it to {save_dir}.")
            return self._load_remote_queries(dataset_name=dataset_name, split=split, save_dir=save_dir)
        else:
            queries_data = datasets.load_dataset('json', data_files=queries_path, cache_dir=self.cache_dir)['train']

            queries = {e['id']: e['text'] for e in queries_data}
            return datasets.DatasetDict(queries)

    @staticmethod
    def _strip_extension(path: str, extension: str) -> str:
        """Return ``path`` without its trailing ``extension``.

        Only the suffix is removed, so directory names that happen to contain
        the extension are left intact. If ``path`` does not end with
        ``extension`` every occurrence is removed instead, which is what the
        previous ``str.replace`` based code did.

        Args:
            path (str): File path to strip.
            extension (str): Extension including the leading dot, e.g. ``".gz"``.

        Returns:
            str: The path with the extension removed.
        """
        if path.endswith(extension):
            return path[:-len(extension)]
        return path.replace(extension, "")

    def _download_file(self, download_url: str, save_dir: str):
        """Download file from provided URL with ``wget``.

        Args:
            download_url (str): Source URL of the file.
            save_dir (str): Path to the directory to save the file.

        Raises:
            FileNotFoundError: If the target file is missing or empty afterwards.

        Returns:
            str: The path of the downloaded file.
        """
        save_path = os.path.join(save_dir, download_url.split('/')[-1])

        if self.force_redownload or (not os.path.exists(save_path) or os.path.getsize(save_path) == 0):
            cmd = ["wget", "-O", save_path, download_url]
        else:
            # A non-empty file already exists: ``-nc`` (no-clobber) keeps it.
            cmd = ["wget", "-nc", "-O", save_path, download_url]

        try:
            subprocess.run(cmd, check=True)
        except subprocess.CalledProcessError as e:
            # Output is not captured, so ``e.output`` is ``None``; log the
            # error itself (command and exit status). Success is decided by
            # the file check below, not by the exit code.
            logger.warning(e)

        if not os.path.exists(save_path) or os.path.getsize(save_path) == 0:
            raise FileNotFoundError(f"Failed to download file from {download_url} to {save_path}")
        else:
            logger.info(f"Downloaded file from {download_url} to {save_path}")
            return save_path

    def _get_fpath_size(self, fpath: str) -> int:
        """Get the total size of the files in provided path.

        Args:
            fpath (str): path of files to compute the size.

        Returns:
            int: The total size in bytes.
        """
        if not os.path.isdir(fpath):
            return os.path.getsize(fpath)
        else:
            total_size = 0
            for dirpath, _, filenames in os.walk(fpath):
                for f in filenames:
                    fp = os.path.join(dirpath, f)
                    total_size += os.path.getsize(fp)
            return total_size

    def _download_gz_file(self, download_url: str, save_dir: str):
        """Download and unzip the gzip file from provided URL.

        Args:
            download_url (str): Source URL of the gzip file.
            save_dir (str): Path to the directory to save the gzip file.

        Raises:
            FileNotFoundError: If the decompressed file is missing or empty.

        Returns:
            str: The path to the file after unzip.
        """
        gz_file_path = self._download_file(download_url, save_dir)
        cmd = ["gzip", "-d", gz_file_path]

        try:
            subprocess.run(cmd, check=True)
        except subprocess.CalledProcessError as e:
            # Output is not captured, so log the error rather than ``e.output``.
            logger.warning(e)

        file_path = self._strip_extension(gz_file_path, ".gz")
        if not os.path.exists(file_path) or self._get_fpath_size(file_path) == 0:
            raise FileNotFoundError(f"Failed to unzip file {gz_file_path}")

        return file_path

    def _download_zip_file(self, download_url: str, save_dir: str):
        """Download and unzip the zip file from provided URL.

        The archive is extracted into a directory named after the archive
        without its ``.zip`` extension.

        Args:
            download_url (str): Source URL of the zip file.
            save_dir (str): Path to the directory to save the zip file.

        Raises:
            FileNotFoundError: If the extraction directory is missing or empty.

        Returns:
            str: The path to the file after unzip.
        """
        zip_file_path = self._download_file(download_url, save_dir)
        file_path = self._strip_extension(zip_file_path, ".zip")

        if self.force_redownload or not os.path.exists(file_path):
            # ``-o``: overwrite existing files without prompting.
            cmd = ["unzip", "-o", zip_file_path, "-d", file_path]
        else:
            # ``-n``: never overwrite files that were already extracted.
            cmd = ["unzip", "-n", zip_file_path, "-d", file_path]

        try:
            subprocess.run(cmd, check=True)
        except subprocess.CalledProcessError as e:
            # Output is not captured, so log the error rather than ``e.output``.
            logger.warning(e)

        if not os.path.exists(file_path) or self._get_fpath_size(file_path) == 0:
            raise FileNotFoundError(f"Failed to unzip file {zip_file_path}")

        return file_path
================================================
FILE: FlagEmbedding/abc/evaluation/evaluator.py
================================================
"""
Adapted from https://github.com/AIR-Bench/AIR-Bench/blob/0.1.0/air_benchmark/evaluation_utils/evaluator.py
"""
import json
import logging
import os
import json
import pandas as pd
from typing import Dict, Optional, List, Union
from .data_loader import AbsEvalDataLoader
from .searcher import EvalRetriever, EvalReranker
from .utils import evaluate_metrics, evaluate_mrr, evaluate_recall_cap
logger = logging.getLogger(__name__)
class AbsEvaluator:
"""
Base class of Evaluator.
Args:
eval_name (str): The experiment name of current evaluation.
data_loader (AbsEvalDataLoader): The data_loader to deal with data.
overwrite (bool): If true, will overwrite the existing results.
"""
def __init__(
    self,
    eval_name: str,
    data_loader: AbsEvalDataLoader,
    overwrite: bool = False,
):
    """Keep the evaluation settings on the instance.

    Args:
        eval_name (str): The experiment name of current evaluation.
        data_loader (AbsEvalDataLoader): The data_loader to deal with data.
        overwrite (bool): If true, will overwrite the existing results.
    """
    self.overwrite = overwrite
    self.data_loader = data_loader
    self.eval_name = eval_name
def check_data_info(
    self,
    data_info: Dict[str, str],
    model_name: str,
    reranker_name: str,
    split: str,
    dataset_name: Optional[str] = None,
):
    """Verify that stored data info matches the current evaluation setup.

    Checks run in order (eval name, model/reranker, split, dataset name) and
    the first mismatch raises.

    Args:
        data_info (Dict[str, str]): The loaded data info to check.
        model_name (str): Name of model used.
        reranker_name (str): Name of reranker used.
        split (str): Split used in searching.
        dataset_name (Optional[str], optional): Name of dataset used; the
            dataset check is skipped when ``None``. Defaults to None.

    Raises:
        ValueError: eval_name mismatch
        ValueError: model_name or reranker_name mismatch
        ValueError: split mismatch
        ValueError: dataset_name mismatch
    """
    recorded_eval = data_info["eval_name"]
    if recorded_eval != self.eval_name:
        raise ValueError(
            f'eval_name mismatch: {recorded_eval} vs {self.eval_name}'
        )

    recorded_model = data_info["model_name"]
    recorded_reranker = data_info["reranker_name"]
    names_match = recorded_model == model_name and recorded_reranker == reranker_name
    if not names_match:
        raise ValueError(
            f'model_name or reranker_name mismatch: {recorded_model} vs {model_name} or {recorded_reranker} vs {reranker_name}'
        )

    recorded_split = data_info["split"]
    if recorded_split != split:
        raise ValueError(
            f'split mismatch: {recorded_split} vs {split}'
        )

    # The dataset name is only validated (and only looked up) when one is given.
    if dataset_name is None:
        return
    recorded_dataset = data_info["dataset_name"]
    if recorded_dataset != dataset_name:
        raise ValueError(
            f'dataset_name mismatch: {recorded_dataset} vs {dataset_name}'
        )
def get_corpus_embd_save_dir(
    self,
    retriever_name: str,
    corpus_embd_save_dir: Optional[str] = None,
    dataset_name: Optional[str] = None
):
    """
    Resolve the directory in which corpus embeddings are saved.

    When ``corpus_embd_save_dir`` is given it serves as the base directory and
    the result is ``<base>/<retriever_name>[/<dataset_name>]``. For datasets
    such as MKQA, where the corpus is the same for all languages, a subclass
    can override this method to share one directory.

    Args:
        retriever_name (str): Name of the retriever.
        corpus_embd_save_dir (str, optional): Base directory for saving the corpus embeddings.
        dataset_name (str, optional): Name of the dataset; appended as the last path component when given.

    Returns:
        Optional[str]: The resolved directory, or ``None`` when no base directory is given.
    """
    if corpus_embd_save_dir is None:
        return corpus_embd_save_dir

    path_parts = [corpus_embd_save_dir, retriever_name]
    if dataset_name is not None:
        path_parts.append(dataset_name)
    return os.path.join(*path_parts)
def __call__(
self,
splits: Union[str, List[str]],
search_results_save_dir: str,
retriever: EvalRetriever,
reranker: Optional[EvalReranker] = None,
corpus_embd_save_dir: Optional[str] = None,
ignore_identical_ids: bool = False,
# NOTE(review): mutable list default; it is only passed through to evaluate_results here, never mutated.
k_values: List[int] = [1, 3, 5, 10, 100, 1000],
dataset_name: Optional[str] = None,
**kwargs,
):
"""This is called during the evaluation process.
Args:
splits (Union[str, List[str]]): Splits of datasets.
search_results_save_dir (str): Directory to save the search results.
retriever (EvalRetriever): object of :class:EvalRetriever.
reranker (Optional[EvalReranker], optional): Object of :class:EvalReranker. Defaults to :data:`None`.
corpus_embd_save_dir (Optional[str], optional): Directory to save the embedded corpus. Defaults to :data:`None`.
ignore_identical_ids (bool, optional): If True, will ignore identical ids in search results. Defaults to :data:`False`.
k_values (List[int], optional): Cutoffs. Defaults to :data:`[1, 3, 5, 10, 100, 1000]`.
dataset_name (Optional[str], optional): Name of the datasets. Defaults to :data:`None`.
"""
# Check Splits
# Keep only the splits returned by the data loader's check; return early (None) when none remain.
checked_splits = self.data_loader.check_splits(splits, dataset_name=dataset_name)
if len(checked_splits) == 0:
logger.warning(f"{splits} not found in the dataset. Skipping evaluation.")
return
splits = checked_splits
# Result files are named "<dataset_name>-<split>.json", or "<split>.json" without a dataset name.
# "{split}" is left as a placeholder and filled in later with str.format().
if dataset_name is not None:
save_name = f"{dataset_name}-" + "{split}.json"
else:
save_name = "{split}.json"
corpus_embd_save_dir = self.get_corpus_embd_save_dir(
retriever_name=str(retriever),
corpus_embd_save_dir=corpus_embd_save_dir,
dataset_name=dataset_name
)
# Retrieval Stage
# Retrieval-only results are stored under <search_results_save_dir>/<retriever>/NoReranker.
no_reranker_search_results_save_dir = os.path.join(
search_results_save_dir, str(retriever), "NoReranker"
)
os.makedirs(no_reranker_search_results_save_dir, exist_ok=True)
# flag becomes True as soon as one split has no saved result file, or when overwrite is set.
flag = False
for split in splits:
split_no_reranker_search_results_save_path = os.path.join(
no_reranker_search_results_save_dir, save_name.format(split=split)
)
if not os.path.exists(split_no_reranker_search_results_save_path) or self.overwrite:
flag = True
break
no_reranker_search_results_dict = {}
if flag:
# At least one split needs searching: run the retriever once over the queries of ALL
# splits merged into one dict, then slice the results back out per split below.
corpus = self.data_loader.load_corpus(dataset_name=dataset_name)
queries_dict = {
split: self.data_loader.load_queries(dataset_name=dataset_name, split=split)
for split in splits
}
all_queries = {}
# NOTE(review): dict.update means a query id shared by two splits keeps the later split's text.
for _, split_queries in queries_dict.items():
all_queries.update(split_queries)
all_no_reranker_search_results = retriever(
corpus=corpus,
queries=all_queries,
corpus_embd_save_dir=corpus_embd_save_dir,
ignore_identical_ids=ignore_identical_ids,
**kwargs,
)
for split in splits:
split_queries = queries_dict[split]
no_reranker_search_results_dict[split] = {
qid: all_no_reranker_search_results[qid] for qid in split_queries
}
split_no_reranker_search_results_save_path = os.path.join(
no_reranker_search_results_save_dir, save_name.format(split=split)
)
self.save_search_results(
eval_name=self.eval_name,
model_name=str(retriever),
reranker_name="NoReranker",
search_results=no_reranker_search_results_dict[split],
output_path=split_no_reranker_search_results_save_path,
split=split,
dataset_name=dataset_name,
)
else:
# Every split already has a saved result file: reload each one and verify that its
# metadata matches this run (check_data_info raises ValueError on a mismatch).
for split in splits:
split_no_reranker_search_results_save_path = os.path.join(
no_reranker_search_results_save_dir, save_name.format(split=split)
)
data_info, search_results = self.load_search_results(split_no_reranker_search_results_save_path)
self.check_data_info(
data_info=data_info,
model_name=str(retriever),
reranker_name="NoReranker",
split=split,
dataset_name=dataset_name,
)
no_reranker_search_results_dict[split] = search_results
# The retriever is not called again below, so its multi-process pool is stopped here.
retriever.stop_multi_process_pool()
eval_results_save_path = os.path.join(no_reranker_search_results_save_dir, 'EVAL', 'eval_results.json')
# Recompute metrics when they are missing, when overwriting, or when new search results were just written.
if not os.path.exists(eval_results_save_path) or self.overwrite or flag:
retriever_eval_results = self.evaluate_results(no_reranker_search_results_save_dir, k_values=k_values)
self.output_eval_results_to_json(retriever_eval_results, eval_results_save_path)
# Reranking Stage
if reranker is not None:
# Reranked results are stored under <search_results_save_dir>/<retriever>/<reranker>.
reranker_search_results_save_dir = os.path.join(
search_results_save_dir, str(retriever), str(reranker)
)
os.makedirs(reranker_search_results_save_dir, exist_ok=True)
# NOTE(review): corpus and queries are loaded here even when every split already has reranked results.
corpus = self.data_loader.load_corpus(dataset_name=dataset_name)
queries_dict = {
split: self.data_loader.load_queries(dataset_name=dataset_name, split=split)
for split in splits
}
# flag is reused: True when at least one split was (re)ranked during this call.
flag = False
for split in splits:
rerank_search_results_save_path = os.path.join(
reranker_search_results_save_dir, save_name.format(split=split)
)
# Unlike the retrieval stage, reranking is decided per split: existing files are skipped.
if os.path.exists(rerank_search_results_save_path) and not self.overwrite:
continue
flag = True
rerank_search_results = reranker(
corpus=corpus,
queries=queries_dict[split],
search_results=no_reranker_search_results_dict[split],
ignore_identical_ids=ignore_identical_ids,
**kwargs,
)
self.save_search_results(
eval_name=self.eval_name,
model_name=str(retriever),
reranker_name=str(reranker),
search_results=rerank_search_results,
output_path=rerank_search_results_save_path,
split=split,
dataset_name=dataset_name,
)
reranker.stop_multi_process_pool()
eval_results_save_path = os.path.join(reranker_search_results_save_dir, 'EVAL', 'eval_results.json')
# Same refresh rule as for the retrieval-only metrics above.
if not os.path.exists(eval_results_save_path) or self.overwrite or flag:
reranker_eval_results = self.evaluate_results(reranker_search_results_save_dir, k_values=k_values)
self.output_eval_results_to_json(reranker_eval_results, eval_results_save_path)
@staticmethod
def save_search_results(
eval_name: str,
model_name: str,
reranker_name: str,
search_results: Dict[str, Dict[str, float]],
output_path: str,
split: str,
dataset_name: Optional[str] = None,
):
"""Save the metadata and search results into a file.
Args:
eval_name (str): The experiment name of current evaluation.
model_name (str): Name of model used.
reranker_name (str): Name of reranker used.
search_results (Dict[str, Dict[str, float]]): Dictionary of search results.
output_path (str): Output path to write the results.
split (str): Split used in searching.
dataset_name (Optional[str], optional): Name of dataset used. Defaults to :data:`None`.
"""
data = {
"eval_name": eval_name,
"model_name": model_name,
"reranker_name": reranker_name,
"split": split,
"dataset_name": dataset_name,
"search_results": search_results,
}
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, "w", encoding="utf-8") as f:
json.dump(data, f, indent=4)
@staticmethod
def load_search_results(input_path: str):
"""Load search results from path.
Args:
input_path (str): Path to load from.
Returns:
dict, dict: data info that contains metadata and search results.
"""
with open(input_path, "r", encoding="utf-8") as f:
data_info = json.load(f)
search_results = data_info.pop("search_results")
return data_info, search_results
@staticmethod
def compute_metrics(
    qrels: Dict[str, Dict[str, int]],
    search_results: Dict[str, Dict[str, float]],
    k_values: List[int],
):
    """Compute all retrieval metrics and flatten them into one dictionary.

    Args:
        qrels (Dict[str, Dict[str, int]]): Ground truth relevance of queries and documents.
        search_results (Dict[str, Dict[str, float]]): Dictionary of search results
        k_values (List[int]): Cutoffs.

    Returns:
        dict: Metric values keyed as ``"<metric>_at_<k>"`` where ``<metric>`` is one of
            ``ndcg``, ``map``, ``recall``, ``precision``, ``mrr`` and ``recall_cap``.
    """
    shared_args = dict(qrels=qrels, results=search_results, k_values=k_values)
    ndcg, _map, recall, precision = evaluate_metrics(**shared_args)
    mrr = evaluate_mrr(**shared_args)
    recall_cap = evaluate_recall_cap(**shared_args)

    # The helpers key their outputs as "<NAME>@<k>"; only the cutoff after '@' is
    # kept and re-prefixed with the output metric name.
    named_metrics = (
        ("ndcg", ndcg),
        ("map", _map),
        ("recall", recall),
        ("precision", precision),
        ("mrr", mrr),
        ("recall_cap", recall_cap),
    )
    scores = {}
    for prefix, metric_values in named_metrics:
        for raw_name, value in metric_values.items():
            cutoff = raw_name.split('@')[1]
            scores[f"{prefix}_at_{cutoff}"] = value
    return scores
def evaluate_results(
    self,
    search_results_save_dir: str,
    k_values: List[int] = [1, 3, 5, 10, 100, 1000]
):
    """Compute metrics for every search-results file found in a directory.

    Each ``*.json`` file directly inside ``search_results_save_dir`` is loaded,
    the qrels of its recorded dataset/split are fetched from the data loader,
    and the metrics are computed.

    Args:
        search_results_save_dir (str): Path to the search results.
        k_values (List[int], optional): Cutoffs. Defaults to :data:`[1, 3, 5, 10, 100, 1000]`.

    Raises:
        ValueError: A results file was produced under a different ``eval_name``.

    Returns:
        dict: Evaluation results keyed by ``"<dataset_name>-<split>"``, or by
            ``"<split>"`` when the file records no dataset name.
    """
    eval_results_dict = {}
    # os.listdir() returns entries in arbitrary order; sort so the key order of
    # the returned dictionary (and of the JSON written from it) is reproducible.
    for file in sorted(os.listdir(search_results_save_dir)):
        if not file.endswith('.json'):
            continue

        file_path = os.path.join(search_results_save_dir, file)
        data_info, search_results = self.load_search_results(file_path)

        # Raise explicitly instead of using ``assert``: assertions are removed
        # when Python runs with -O, which would let a stale file through.
        _eval_name = data_info['eval_name']
        if _eval_name != self.eval_name:
            raise ValueError(
                f'Mismatch eval_name: {_eval_name} vs {self.eval_name} in {file_path}'
            )

        split = data_info['split']
        dataset_name = data_info.get('dataset_name', None)
        qrels = self.data_loader.load_qrels(dataset_name=dataset_name, split=split)

        eval_results = self.compute_metrics(
            qrels=qrels,
            search_results=search_results,
            k_values=k_values
        )

        key = f"{dataset_name}-{split}" if dataset_name is not None else split
        eval_results_dict[key] = eval_results

    return eval_results_dict
@staticmethod
def output_eval_results_to_json(eval_results_dict: dict, output_path: str):
"""Write the evaluation results into a json file.
Args:
eval_results_dict (dict): Dictionary of the evaluation results.
output_path (str): Output path to write the json file.
"""
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'w', encoding='utf-8') as f:
json.dump(eval_results_dict, f, indent=4)
logger.info(f"Results saved to {output_path}")
@staticmethod
def get_results_df(metric: str, eval_results_dict: dict):
"""Get the results from dictionary to a DataFrame.
Args:
metric (str): Selected metric.
eval_results_dict (dict): Dictionary of the evaluation results.
Returns:
DataFrame: DataFrame of the results.
"""
results_dict = {}
for model_name, model_results in eval_results_dict.items():
results_dict[model_name] = {}
for reranker_name, reranker_results in model_results.items():
results_dict[model_name][reranker_name] = {}
for split, split_results in reranker_results.items():
if metric in split_results:
results_dict[model_name][reranker_name][split] = split_results[metric]
else:
results_dict[model_name][reranker_name][split] = None
model_reranker_pairs = set()
all_splits = set()
for model_name, model_results in results_dict.items():
for reranker_name, reranker_results in model_results.items():
model_reranker_pairs.add((model_name, reranker_name))
all_splits.update(reranker_results.keys())
index = [(model, reranker) for model, reranker in model_reranker_pairs]
multi_index = pd.MultiIndex.from_tuples(index, names=['Model', 'Reranker'])
all_splits = sorted(list(all_splits))
overall_columns = ['average'] + all_splits
overall_df = pd.DataFrame(index=multi_index, columns=overall_columns)
for model, reranker in model_reranker_pairs:
for split in all_splits:
if model in results_dict and reranker in results_dict[model] and split in results_dict[model][reranker]:
overall_df.loc[(model, reranker), split] = results_dict[model][reranker][split]
else:
overall_df.loc[(model, reranker), split] = None
if overall_df.loc[(model, reranker), all_splits].isnull().any():
overall_df.loc[(model, reranker), 'average'] = None
else:
overall_df.loc[(model, reranker), 'average'] = overall_df.loc[(model, reranker), all_splits].mean()
return overall_df
@staticmethod
def output_eval_results_to_markdown(eval_results_dict: dict, output_path: str, metrics: Union[List[str], str]):
    """Write the evaluation results to a markdown file.

    One section (``## <metric>``) with one table is written per metric. Values
    are shown as percentages with three decimals, the row holding the maximum
    of a column is printed in bold, and ``None`` values are shown as ``-``.

    Args:
        eval_results_dict (dict): Dictionary that contains evaluation results.
        output_path (str): Path to write the output to.
        metrics (Union[List[str], str]): The metrics that will be written in the markdown file.
    """
    # os.path.dirname() is "" for a bare file name and os.makedirs("") raises
    # FileNotFoundError, so only create the directory when there is one.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    if isinstance(metrics, str):
        metrics = [metrics]

    with open(output_path, 'w', encoding='utf-8') as f:
        for metric in metrics:
            f.write(f"## {metric}\n\n")

            results_df = AbsEvaluator.get_results_df(metric, eval_results_dict)
            # Row label of the best value per column, used to decide what is bold.
            max_index = dict(results_df.idxmax(axis=0))

            splits = results_df.columns
            f.write(f"| Model | Reranker | {' | '.join(splits)} |\n")
            f.write(f"| :---- | :---- | {' | '.join([':---:' for _ in splits])} |\n")

            for i, row in results_df.iterrows():
                cells = []
                for s, v in row.items():
                    if v is None:
                        cells.append("- | ")
                    elif i != max_index[s]:
                        cells.append(f'{v*100:.3f} | ')
                    else:
                        cells.append(f'**{v*100:.3f}** | ')
                f.write(f"| {i[0]} | {i[1]} | " + "".join(cells) + "\n")

            f.write("\n")
    logger.info(f"Results saved to {output_path}")
================================================
FILE: FlagEmbedding/abc/evaluation/runner.py
================================================
import os
import json
import logging
from typing import List, Union, Tuple
from FlagEmbedding import FlagAutoModel, FlagAutoReranker, AbsEmbedder, AbsReranker
from .arguments import AbsEvalArgs, AbsEvalModelArgs
from .evaluator import AbsEvaluator
from .searcher import EvalDenseRetriever, EvalReranker
from .data_loader import AbsEvalDataLoader
logger = logging.getLogger(__name__)
class AbsEvalRunner:
    """
    Abstract class of evaluation runner.

    On construction it loads the retriever (and the optional reranker), the data
    loader and the evaluator, in that order.

    Args:
        eval_args (AbsEvalArgs): :class:AbsEvalArgs object with the evaluation arguments.
        model_args (AbsEvalModelArgs): :class:AbsEvalModelArgs object with the model arguments.
    """
    def __init__(
        self,
        eval_args: AbsEvalArgs,
        model_args: AbsEvalModelArgs,
    ):
        self.eval_args = eval_args
        self.model_args = model_args

        self.retriever, self.reranker = self.load_retriever_and_reranker()
        self.data_loader = self.load_data_loader()
        # load_evaluator() reads self.data_loader, so it must come last.
        self.evaluator = self.load_evaluator()

    @staticmethod
    def get_models(model_args: AbsEvalModelArgs) -> Tuple[AbsEmbedder, Union[AbsReranker, None]]:
        """Get the embedding and reranker model

        Args:
            model_args (AbsEvalModelArgs): :class:AbsEvalModelArgs object with the model arguments.

        Returns:
            Tuple[AbsEmbedder, Union[AbsReranker, None]]: A :class:AbsEmbedder object of embedding model, and
                :class:AbsReranker object of reranker model if path provided.
        """
        embedder = FlagAutoModel.from_finetuned(
            model_name_or_path=model_args.embedder_name_or_path,
            model_class=model_args.embedder_model_class,
            normalize_embeddings=model_args.normalize_embeddings,
            pooling_method=model_args.pooling_method,
            use_fp16=model_args.use_fp16,
            query_instruction_for_retrieval=model_args.query_instruction_for_retrieval,
            query_instruction_format=model_args.query_instruction_format_for_retrieval,
            devices=model_args.devices,
            examples_for_task=model_args.examples_for_task,
            examples_instruction_format=model_args.examples_instruction_format,
            trust_remote_code=model_args.trust_remote_code,
            cache_dir=model_args.cache_dir,
            batch_size=model_args.embedder_batch_size,
            query_max_length=model_args.embedder_query_max_length,
            passage_max_length=model_args.embedder_passage_max_length,
        )
        # The retriever's display name is derived from config._name_or_path, so
        # pin it to the path/name the user actually passed in.
        embedder.model.config._name_or_path = model_args.embedder_name_or_path

        reranker = None
        if model_args.reranker_name_or_path is not None:
            reranker = FlagAutoReranker.from_finetuned(
                model_name_or_path=model_args.reranker_name_or_path,
                model_class=model_args.reranker_model_class,
                peft_path=model_args.reranker_peft_path,
                use_fp16=model_args.use_fp16,
                use_bf16=model_args.use_bf16,
                query_instruction_for_rerank=model_args.query_instruction_for_rerank,
                query_instruction_format=model_args.query_instruction_format_for_rerank,
                passage_instruction_for_rerank=model_args.passage_instruction_for_rerank,
                passage_instruction_format=model_args.passage_instruction_format_for_rerank,
                cache_dir=model_args.cache_dir,
                trust_remote_code=model_args.trust_remote_code,
                devices=model_args.devices,
                normalize=model_args.normalize,
                prompt=model_args.prompt,
                cutoff_layers=model_args.cutoff_layers,
                compress_layers=model_args.compress_layers,
                compress_ratio=model_args.compress_ratio,
                batch_size=model_args.reranker_batch_size,
                query_max_length=model_args.reranker_query_max_length,
                max_length=model_args.reranker_max_length,
            )
            # Same naming fix-up as for the embedder above.
            reranker.model.config._name_or_path = model_args.reranker_name_or_path
        return embedder, reranker

    def load_retriever_and_reranker(self) -> Tuple[EvalDenseRetriever, Union[EvalReranker, None]]:
        """Load retriever and reranker for evaluation

        Returns:
            Tuple[EvalDenseRetriever, Union[EvalReranker, None]]: A :class:EvalDenseRetriever object for retrieval, and a
                :class:EvalReranker object if reranker provided.
        """
        embedder, reranker = self.get_models(self.model_args)
        retriever = EvalDenseRetriever(
            embedder,
            search_top_k=self.eval_args.search_top_k,
            overwrite=self.eval_args.overwrite
        )
        if reranker is not None:
            reranker = EvalReranker(reranker, rerank_top_k=self.eval_args.rerank_top_k)
        return retriever, reranker

    def load_data_loader(self) -> AbsEvalDataLoader:
        """Load the data loader

        Returns:
            AbsEvalDataLoader: Data loader object for that specific task.
        """
        data_loader = AbsEvalDataLoader(
            eval_name=self.eval_args.eval_name,
            dataset_dir=self.eval_args.dataset_dir,
            cache_dir=self.eval_args.cache_path,
            token=self.eval_args.token,
            force_redownload=self.eval_args.force_redownload,
        )
        return data_loader

    def load_evaluator(self) -> AbsEvaluator:
        """Load the evaluator for evaluation

        Returns:
            AbsEvaluator: the evaluator to run the evaluation.
        """
        evaluator = AbsEvaluator(
            eval_name=self.eval_args.eval_name,
            data_loader=self.data_loader,
            overwrite=self.eval_args.overwrite,
        )
        return evaluator

    @staticmethod
    def evaluate_metrics(
        search_results_save_dir: str,
        output_method: str = "markdown",
        output_path: str = "./eval_dev_results.md",
        metrics: Union[str, List[str]] = ["ndcg_at_10", "recall_at_10"]
    ):
        """Collect the saved evaluation results and write them out.

        The directory is expected to be laid out as
        ``<search_results_save_dir>/<model>/<reranker>/EVAL/eval_results.json``.
        A (model, reranker) directory without that file is skipped with a warning.

        Args:
            search_results_save_dir (str): Path to save the search results.
            output_method (str, optional): Output results to `json` or `markdown`. Defaults to :data:`"markdown"`.
            output_path (str, optional): Path to write the output. Defaults to :data:`"./eval_dev_results.md"`.
            metrics (Union[str, List[str]], optional): metrics to use. Defaults to :data:`["ndcg_at_10", "recall_at_10"]`.

        Raises:
            ValueError: Invalid output method
        """
        eval_results_dict = {}

        for model_name in sorted(os.listdir(search_results_save_dir)):
            model_search_results_save_dir = os.path.join(search_results_save_dir, model_name)
            if not os.path.isdir(model_search_results_save_dir):
                continue
            for reranker_name in sorted(os.listdir(model_search_results_save_dir)):
                reranker_search_results_save_dir = os.path.join(model_search_results_save_dir, reranker_name)
                if not os.path.isdir(reranker_search_results_save_dir):
                    continue
                eval_results_path = os.path.join(reranker_search_results_save_dir, 'EVAL', "eval_results.json")
                if not os.path.exists(eval_results_path):
                    logger.warning(f"Eval results not found: {eval_results_path}")
                    continue
                # Use a context manager so the file handle is closed right away
                # instead of being left to the garbage collector.
                with open(eval_results_path, encoding='utf-8') as f:
                    eval_results = json.load(f)
                eval_results_dict.setdefault(model_name, {})[reranker_name] = eval_results

        if output_method == "json":
            AbsEvaluator.output_eval_results_to_json(eval_results_dict, output_path)
        elif output_method == "markdown":
            AbsEvaluator.output_eval_results_to_markdown(eval_results_dict, output_path, metrics)
        else:
            raise ValueError(f"Invalid output method: {output_method}. Available methods: ['json', 'markdown']")

    def run(self):
        """
        Run the whole evaluation.

        Evaluates every selected dataset (or the default dataset when the data
        loader reports no dataset names) and then writes the metric summary.
        """
        if self.eval_args.dataset_names is None:
            dataset_names = self.data_loader.available_dataset_names()
        else:
            dataset_names = self.data_loader.check_dataset_names(self.eval_args.dataset_names)

        if len(dataset_names) == 0:
            # No named datasets: evaluate once without a dataset_name.
            logger.info(f"Running {self.eval_args.eval_name} evaluation on the default dataset.")
            self.evaluator(
                splits=self.eval_args.splits,
                search_results_save_dir=self.eval_args.output_dir,
                retriever=self.retriever,
                reranker=self.reranker,
                corpus_embd_save_dir=self.eval_args.corpus_embd_save_dir,
                ignore_identical_ids=self.eval_args.ignore_identical_ids,
                k_values=self.eval_args.k_values
            )
            logger.info(f"{self.eval_args.eval_name} evaluation completed.")
        else:
            logger.info(f"Running {self.eval_args.eval_name} evaluation on the following dataset names: {dataset_names}")
            for dataset_name in dataset_names:
                logger.info(f"Running {self.eval_args.eval_name} evaluation on: {dataset_name}")
                self.evaluator(
                    splits=self.eval_args.splits,
                    search_results_save_dir=self.eval_args.output_dir,
                    retriever=self.retriever,
                    reranker=self.reranker,
                    corpus_embd_save_dir=self.eval_args.corpus_embd_save_dir,
                    ignore_identical_ids=self.eval_args.ignore_identical_ids,
                    k_values=self.eval_args.k_values,
                    dataset_name=dataset_name,
                )
            logger.info(f"{self.eval_args.eval_name} evaluation on {dataset_names} completed.")

        logger.info("Start computing metrics.")
        self.evaluate_metrics(
            search_results_save_dir=self.eval_args.output_dir,
            output_method=self.eval_args.eval_output_method,
            output_path=self.eval_args.eval_output_path,
            metrics=self.eval_args.eval_metrics
        )
================================================
FILE: FlagEmbedding/abc/evaluation/searcher.py
================================================
"""
Adapted from https://github.com/AIR-Bench/AIR-Bench/blob/0.1.0/air_benchmark/evaluation_utils/searcher.py
"""
import os
import logging
import gc
import torch
import numpy as np
from typing import Any, Dict, Optional
from abc import ABC, abstractmethod
from FlagEmbedding.abc.inference import AbsEmbedder, AbsReranker
from FlagEmbedding.abc.evaluation.utils import index, search
logger = logging.getLogger(__name__)
class EvalRetriever(ABC):
    """
    Base class of the retrievers used during evaluation.

    Subclasses implement :meth:`__call__` to run the actual search.
    """
    def __init__(self, embedder: AbsEmbedder, search_top_k: int = 1000, overwrite: bool = False):
        self.embedder = embedder
        self.search_top_k = search_top_k
        self.overwrite = overwrite

    def __str__(self) -> str:
        """
        Returns: str: Name of the retriever, i.e. the last path component of the embedder's model name or path.
        """
        model_name_or_path = self.embedder.model.config._name_or_path
        return os.path.basename(model_name_or_path)

    def stop_multi_process_pool(self):
        """Ask the embedder to stop its own process pool."""
        self.embedder.stop_self_pool()

    @abstractmethod
    def __call__(
        self,
        corpus: Dict[str, Dict[str, Any]],
        queries: Dict[str, str],
        corpus_embd_save_dir: Optional[str] = None,
        ignore_identical_ids: bool = False,
        **kwargs,
    ) -> Dict[str, Dict[str, float]]:
        """
        Abstract method to be overridden. This is called during the retrieval process.

        Parameters:
            corpus: Dict[str, Dict[str, Any]]: Corpus of documents.
                Structure: {<docid>: {"text": <text>}}.
                Example: {"doc-0": {"text": "This is a document."}}
            queries: Dict[str, str]: Queries to search for.
                Structure: {<qid>: <query>}.
                Example: {"q-0": "This is a query."}
            corpus_embd_save_dir (Optional[str]): Defaults to :data:`None`.
            ignore_identical_ids (bool): Defaults to :data:`False`.
            **kwargs: Any: Additional arguments.

        Returns: Dict[str, Dict[str, float]]: Top-k search results for each query. k is specified by search_top_k.
            Structure: {qid: {docid: score}}. The higher is the score, the more relevant is the document.
            Example: {"q-0": {"doc-0": 0.9}}
        """
class EvalDenseRetriever(EvalRetriever):
    """
    Child class of :class:EvalRetriever for dense retrieval.
    """
    def __call__(
        self,
        corpus: Dict[str, Dict[str, Any]],
        queries: Dict[str, str],
        corpus_embd_save_dir: Optional[str] = None,
        ignore_identical_ids: bool = False,
        **kwargs,
    ) -> Dict[str, Dict[str, float]]:
        """
        Embed the corpus and the queries, then search the corpus for every query.

        Parameters:
            corpus: Dict[str, Dict[str, Any]]: Corpus of documents.
                Structure: {<docid>: {"text": <text>}}. When a document has a "title" it is prepended to the text.
                Example: {"doc-0": {"text": "This is a document."}}
            queries: Dict[str, str]: Queries to search for.
                Structure: {<qid>: <query>}.
                Example: {"q-0": "This is a query."}
            corpus_embd_save_dir (Optional[str]): Directory holding the cached corpus embeddings ("doc.npy").
                Defaults to :data:`None`.
            ignore_identical_ids (bool): Drop hits whose doc id equals the query id. Defaults to :data:`False`.
            **kwargs: Any: Additional arguments forwarded to the embedder ("language" is removed first).

        Returns: Dict[str, Dict[str, float]]: Top-k search results for each query. k is specified by search_top_k.
            Structure: {qid: {docid: score}}. The higher is the score, the more relevant is the document.
            Example: {"q-0": {"doc-0": 0.9}}
        """
        if ignore_identical_ids:
            logger.warning("ignore_identical_ids is set to True. This means that the search results will not contain identical ids. Note: Dataset such as MIRACL should NOT set this to True.")

        # dense embedding models do not require language as input: AIRBench evaluation
        kwargs.pop("language", None)

        corpus_ids = list(corpus)
        corpus_texts = [
            doc["text"] if "title" not in doc
            else f"{doc['title']} {doc['text']}".strip()
            for doc in corpus.values()
        ]
        queries_ids = list(queries)
        queries_texts = list(queries.values())

        cache_path = None
        if corpus_embd_save_dir is not None:
            cache_path = os.path.join(corpus_embd_save_dir, "doc.npy")

        # Reuse cached corpus embeddings unless they are missing or overwrite is requested.
        if cache_path is not None and os.path.exists(cache_path) and not self.overwrite:
            corpus_emb = np.load(cache_path)
        else:
            corpus_emb = self.embedder.encode_corpus(corpus_texts, **kwargs)
        queries_emb = self.embedder.encode_queries(queries_texts, **kwargs)

        # check if the embeddings are in dictionary format: M3Embedder
        if isinstance(corpus_emb, dict):
            corpus_emb = corpus_emb["dense_vecs"]
        if isinstance(queries_emb, dict):
            queries_emb = queries_emb["dense_vecs"]

        # Persist the corpus embeddings when a cache directory is configured and
        # the cache is absent or must be overwritten.
        if cache_path is not None and (self.overwrite or not os.path.exists(cache_path)):
            os.makedirs(corpus_embd_save_dir, exist_ok=True)
            np.save(cache_path, corpus_emb)

        gc.collect()
        torch.cuda.empty_cache()

        faiss_index = index(corpus_embeddings=corpus_emb)
        all_scores, all_indices = search(query_embeddings=queries_emb, faiss_index=faiss_index, k=self.search_top_k)

        results = {}
        for qid, scores, indices in zip(queries_ids, all_scores, all_indices):
            hits = {}
            results[qid] = hits
            for score, indice in zip(scores, indices):
                # An index of -1 is skipped, as in the original filter (no valid corpus position).
                if indice == -1:
                    continue
                docid = corpus_ids[indice]
                if ignore_identical_ids and docid == qid:
                    continue
                hits[docid] = float(score)

        return results
class EvalReranker:
    """
    Class for reranker during evaluation.
    """
    def __init__(self, reranker: AbsReranker, rerank_top_k: int = 100):
        self.reranker = reranker
        self.rerank_top_k = rerank_top_k

    def __str__(self) -> str:
        """
        Returns: str: Name of the reranker, i.e. the last path component of its model name or path.
        """
        return os.path.basename(self.reranker.model.config._name_or_path)

    def stop_multi_process_pool(self):
        """Ask the reranker to stop its own process pool."""
        self.reranker.stop_self_pool()

    @staticmethod
    def _get_doc_text(doc: Dict[str, Any]) -> str:
        """Return the text scored for a document: "<title> <text>" when a title exists, else the text."""
        if "title" not in doc:
            return doc["text"]
        return f"{doc['title']} {doc['text']}".strip()

    def __call__(
        self,
        corpus: Dict[str, Dict[str, Any]],
        queries: Dict[str, str],
        search_results: Dict[str, Dict[str, float]],
        ignore_identical_ids: bool = False,
        **kwargs,
    ) -> Dict[str, Dict[str, float]]:
        """
        This is called during the reranking process.

        The ``rerank_top_k`` best-scored documents of every query are re-scored by
        the reranker. ``search_results`` itself is left unmodified.

        Parameters:
            corpus: Dict[str, Dict[str, Any]]: Corpus of documents.
                Structure: {<docid>: {"text": <text>}}.
                Example: {"doc-0": {"text": "This is a document."}}
            queries: Dict[str, str]: Queries to search for.
                Structure: {<qid>: <query>}.
                Example: {"q-0": "This is a query."}
            search_results: Dict[str, Dict[str, float]]: Search results for each query.
                Structure: {qid: {docid: score}}. The higher is the score, the more relevant is the document.
                Example: {"q-0": {"doc-0": 0.9}}
            ignore_identical_ids (bool): Skip documents whose id equals the query id. Defaults to :data:`False`.
            **kwargs: Any: Additional arguments.

        Raises:
            ValueError: The reranker returned a different number of scores than pairs.

        Returns: Dict[str, Dict[str, float]]: Reranked search results for each query. k is specified by rerank_top_k.
            Structure: {qid: {docid: score}}. The higher is the score, the more relevant is the document.
            Example: {"q-0": {"doc-0": 0.9}}
        """
        # Select the top-k candidates per query into a new structure, so the
        # caller's search_results are not truncated as a side effect.
        top_candidates = {
            qid: sorted(doc_scores.items(), key=lambda x: x[1], reverse=True)[:self.rerank_top_k]
            for qid, doc_scores in search_results.items()
        }

        # generate sentence pairs, remembering which (qid, docid) each pair belongs to
        pair_ids = []
        pairs = []
        for qid, candidates in top_candidates.items():
            for docid, _ in candidates:
                if ignore_identical_ids and qid == docid:
                    continue
                pair_ids.append((qid, docid))
                pairs.append((queries[qid], self._get_doc_text(corpus[docid])))

        reranked_results = {qid: {} for qid in search_results}
        # Nothing to score: do not call the model with an empty batch.
        if not pairs:
            return reranked_results

        # compute scores
        scores = self.reranker.compute_score(pairs)
        if len(scores) != len(pairs):
            raise ValueError(
                f"Reranker returned {len(scores)} scores for {len(pairs)} pairs"
            )

        # rerank
        for (qid, docid), score in zip(pair_ids, scores):
            reranked_results[qid][docid] = float(score)

        return reranked_results
================================================
FILE: FlagEmbedding/abc/evaluation/utils.py
================================================
import faiss
import torch
import logging
import numpy as np
import pytrec_eval
from tqdm import tqdm
from collections import defaultdict
from typing import Dict, List, Tuple, Optional
logger = logging.getLogger(__name__)
# Modified from https://github.com/beir-cellar/beir/blob/f062f038c4bfd19a8ca942a9910b1e0d218759d4/beir/retrieval/custom_metrics.py#L4
def evaluate_mrr(
    qrels: Dict[str, Dict[str, int]],
    results: Dict[str, Dict[str, float]],
    k_values: List[int],
) -> Dict[str, float]:
    """Compute mean reciprocal rank (MRR).

    The reciprocal ranks are summed over the queries in ``results`` and divided
    by the number of queries in ``qrels``. A query of ``results`` without any
    entry in ``qrels`` therefore contributes 0.

    Args:
        qrels (Dict[str, Dict[str, int]]): Ground truth relevance.
        results (Dict[str, Dict[str, float]]): Search results to evaluate.
        k_values (List[int]): Cutoffs.

    Raises:
        ZeroDivisionError: ``qrels`` is empty.

    Returns:
        Dict[str, float]: MRR results at provided k values, keyed as ``"MRR@<k>"``.
    """
    # Drop duplicate cutoffs (keeping order) so no cutoff is accumulated twice.
    cutoffs = list(dict.fromkeys(k_values))
    k_max = max(k_values)

    reciprocal_ranks = {f"MRR@{k}": [] for k in cutoffs}
    for query_id, doc_scores in results.items():
        top_hits = sorted(
            doc_scores.items(), key=lambda item: item[1], reverse=True
        )[:k_max]
        # .get() tolerates queries that were searched but have no judgements.
        relevant_docs = {
            doc_id for doc_id, rel in qrels.get(query_id, {}).items() if rel > 0
        }
        for k in cutoffs:
            rr = 0
            for rank, (doc_id, _) in enumerate(top_hits[:k], 1):
                if doc_id in relevant_docs:
                    rr = 1.0 / rank
                    break
            reciprocal_ranks[f"MRR@{k}"].append(rr)

    return {
        name: round(sum(values) / len(qrels), 5)
        for name, values in reciprocal_ranks.items()
    }
# Modified from https://github.com/beir-cellar/beir/blob/f062f038c4bfd19a8ca942a9910b1e0d218759d4/beir/retrieval/custom_metrics.py#L33
def evaluate_recall_cap(
    qrels: Dict[str, Dict[str, int]],
    results: Dict[str, Dict[str, float]],
    k_values: List[int]
) -> Dict[str, float]:
    """Compute capped recall.

    For each query, the number of relevant documents found in the top k is
    divided by ``min(number of relevant documents, k)``. The per-query values
    are summed and divided by the number of queries in ``qrels``. Queries without
    any positively judged document (or without judgements at all) contribute 0.

    Args:
        qrels (Dict[str, Dict[str, int]]): Ground truth relevance.
        results (Dict[str, Dict[str, float]]): Search results to evaluate.
        k_values (List[int]): Cutoffs.

    Raises:
        ZeroDivisionError: ``qrels`` is empty.

    Returns:
        Dict[str, float]: Capped recall results at provided k values, keyed as ``"R_cap@<k>"``.
    """
    # Drop duplicate cutoffs (keeping order) so no cutoff is accumulated twice.
    cutoffs = list(dict.fromkeys(k_values))
    k_max = max(k_values)

    totals = {f"R_cap@{k}": 0.0 for k in cutoffs}
    for query_id, doc_scores in results.items():
        judgements = qrels.get(query_id, {})
        num_relevant = sum(1 for rel in judgements.values() if rel > 0)
        # Without relevant documents the denominator would be zero; such a
        # query cannot be recalled at all, so it adds nothing to the total.
        if num_relevant == 0:
            continue

        top_hits = sorted(doc_scores.items(), key=lambda item: item[1], reverse=True)[:k_max]
        for k in cutoffs:
            num_retrieved = sum(
                1 for doc_id, _ in top_hits[:k] if judgements.get(doc_id, 0) > 0
            )
            totals[f"R_cap@{k}"] += num_retrieved / min(num_relevant, k)

    capped_recall = {}
    for name, total in totals.items():
        capped_recall[name] = round(total / len(qrels), 5)
        # Log through the module logger rather than the root logger.
        logger.info("%s: %.4f", name, capped_recall[name])

    return capped_recall
# Modified from https://github.com/embeddings-benchmark/mteb/blob/18f730696451a5aaa026494cecf288fd5cde9fd0/mteb/evaluation/evaluators/RetrievalEvaluator.py#L501
def evaluate_metrics(
    qrels: Dict[str, Dict[str, int]],
    results: Dict[str, Dict[str, float]],
    k_values: List[int],
) -> Tuple[
    Dict[str, float],
    Dict[str, float],
    Dict[str, float],
    Dict[str, float],
]:
    """Evaluate the main metrics (NDCG, MAP, Recall and Precision) with ``pytrec_eval``.

    Per-query scores are collected for every cutoff and then averaged over the
    number of queries that ``pytrec_eval`` returned scores for.

    Args:
        qrels (Dict[str, Dict[str, int]]): Ground truth relevance.
        results (Dict[str, Dict[str, float]]): Search results to evaluate.
        k_values (List[int]): Cutoffs.

    Returns:
        Tuple[Dict[str, float], Dict[str, float], Dict[str, float], Dict[str, float]]:
            ``NDCG@k``, ``MAP@k``, ``Recall@k`` and ``P@k`` tables (in that order),
            each value rounded to 5 decimals.
    """
    ndcg, _map, recall, precision = (defaultdict(list) for _ in range(4))

    # pytrec_eval measure names take the comma-separated cutoffs after a dot.
    cutoffs = ",".join(str(k) for k in k_values)
    measures = {
        "map_cut." + cutoffs,
        "ndcg_cut." + cutoffs,
        "recall." + cutoffs,
        "P." + cutoffs,
    }
    scores = pytrec_eval.RelevanceEvaluator(qrels, measures).evaluate(results)

    # Collect the per-query value of every metric at every cutoff.
    for per_query in scores.values():
        for k in k_values:
            ndcg[f"NDCG@{k}"].append(per_query[f"ndcg_cut_{k}"])
            _map[f"MAP@{k}"].append(per_query[f"map_cut_{k}"])
            recall[f"Recall@{k}"].append(per_query[f"recall_{k}"])
            precision[f"P@{k}"].append(per_query[f"P_{k}"])

    # Replace each list of per-query values by its rounded mean.
    num_queries = len(scores)
    tables = ((ndcg, "NDCG"), (_map, "MAP"), (recall, "Recall"), (precision, "P"))
    for k in k_values:
        for table, prefix in tables:
            key = f"{prefix}@{k}"
            table[key] = round(sum(table[key]) / num_queries, 5)

    return ndcg, _map, recall, precision
def index(
    index_factory: str = "Flat",
    corpus_embeddings: Optional[np.ndarray] = None,
    load_path: Optional[str] = None,
    device: Optional[str] = None
):
    """Create and add embeddings into a Faiss index.

    Args:
        index_factory (str, optional): Type of Faiss index to create. Defaults to "Flat".
        corpus_embeddings (Optional[np.ndarray], optional): The embedding vectors of the corpus. Defaults to None.
        load_path (Optional[str], optional): Path to load embeddings from when ``corpus_embeddings``
            is not given. Defaults to None.
        device (Optional[str], optional): Device to hold Faiss index. The transfer to all GPUs is only
            attempted when this is None and CUDA is available. Defaults to None.

    Raises:
        ValueError: If neither ``corpus_embeddings`` nor ``load_path`` is provided.

    Returns:
        faiss.Index: The Faiss index that contains all the corpus embeddings.
    """
    if corpus_embeddings is None:
        if load_path is None:
            raise ValueError("Either `corpus_embeddings` or `load_path` must be provided.")
        corpus_embeddings = np.load(load_path)

    logger.info(f"Shape of embeddings: {corpus_embeddings.shape}")
    # create faiss index
    logger.info(f'Indexing {corpus_embeddings.shape[0]} documents...')
    faiss_index = faiss.index_factory(corpus_embeddings.shape[-1], index_factory, faiss.METRIC_INNER_PRODUCT)

    if device is None and torch.cuda.is_available():
        try:
            co = faiss.GpuMultipleClonerOptions()
            co.shard = True
            co.useFloat16 = True
            faiss_index = faiss.index_cpu_to_all_gpus(faiss_index, co)
        except Exception:
            # Best effort: keep the CPU index when the GPU transfer fails, but do not
            # swallow KeyboardInterrupt/SystemExit and keep the cause in the log.
            logger.warning(
                'faiss do not support GPU, please uninstall faiss-cpu, faiss-gpu and install faiss-gpu again.',
                exc_info=True
            )

    logger.info('Adding embeddings ...')
    corpus_embeddings = corpus_embeddings.astype(np.float32)
    faiss_index.train(corpus_embeddings)
    faiss_index.add(corpus_embeddings)
    logger.info('Embeddings add over...')
    return faiss_index
def search(
    faiss_index: "faiss.Index",
    k: int = 100,
    query_embeddings: Optional[np.ndarray] = None,
    load_path: Optional[str] = None,
    batch_size: int = 32
):
    """Search the Faiss index with the query embeddings, batch by batch.

    Args:
        faiss_index (faiss.Index): The Faiss index that contains all the corpus embeddings.
        k (int, optional): Top k numbers of closest neighbours. Defaults to :data:`100`.
        query_embeddings (Optional[np.ndarray], optional): The embedding vectors of queries. Defaults to :data:`None`.
        load_path (Optional[str], optional): Path to load embeddings from when ``query_embeddings``
            is not given. Defaults to :data:`None`.
        batch_size (int, optional): Number of queries sent to the index per call. Defaults to :data:`32`.

    Raises:
        ValueError: If neither ``query_embeddings`` nor ``load_path`` is provided, or if
            ``batch_size`` is not positive.

    Returns:
        Tuple[np.ndarray, np.ndarray]: The scores of search results and their corresponding indices.
    """
    if query_embeddings is None:
        if load_path is None:
            raise ValueError("Either `query_embeddings` or `load_path` must be provided.")
        query_embeddings = np.load(load_path)
    if batch_size <= 0:
        raise ValueError(f"`batch_size` must be a positive integer, got {batch_size}.")

    query_size = len(query_embeddings)
    if query_size == 0:
        # Nothing to search: np.concatenate would fail on an empty list of batches.
        return np.empty((0, k), dtype=np.float32), np.empty((0, k), dtype=np.int64)

    all_scores = []
    all_indices = []
    for start in tqdm(range(0, query_size, batch_size), desc="Searching"):
        query_embedding = query_embeddings[start: start + batch_size]
        score, indice = faiss_index.search(query_embedding.astype(np.float32), k=k)
        all_scores.append(score)
        all_indices.append(indice)

    all_scores = np.concatenate(all_scores, axis=0)
    all_indices = np.concatenate(all_indices, axis=0)
    return all_scores, all_indices
================================================
FILE: FlagEmbedding/abc/finetune/__init__.py
================================================
================================================
FILE: FlagEmbedding/abc/finetune/embedder/AbsArguments.py
================================================
import os
from typing import Optional
from dataclasses import dataclass, field
from transformers import TrainingArguments
@dataclass
class AbsEmbedderModelArguments:
    """
    Abstract class for model arguments.

    Only ``model_name_or_path`` is required; every field that defaults to ``None`` is
    annotated as ``Optional`` so the declared type matches the value it can hold.
    """

    model_name_or_path: str = field(
        metadata={"help": "The model checkpoint for initialization."}
    )
    config_name: Optional[str] = field(
        default=None,
        metadata={"help": "Pretrained config name or path if not the same as model_name."}
    )
    tokenizer_name: Optional[str] = field(
        default=None,
        metadata={"help": "Pretrained tokenizer name or path if not the same as model_name."}
    )
    cache_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Where do you want to store the pre-trained models downloaded from s3."}
    )
    trust_remote_code: bool = field(
        default=False,
        metadata={"help": "Trust remote code"}
    )
    use_fast_tokenizer: bool = field(
        default=True,
        metadata={"help": "Whether to use fast tokenizer or not."}
    )
    # Read from the HF_TOKEN environment variable at instantiation time; None when unset.
    token: Optional[str] = field(
        default_factory=lambda: os.getenv('HF_TOKEN', None),
        metadata={"help": "The token to use when accessing the model."}
    )
@dataclass
class AbsEmbedderDataArguments:
    """
    Abstract class for data arguments.

    After initialization, literal ``\\n`` sequences in the instruction formats are turned
    into real newlines, ``train_data`` is normalized to a list of paths and every path is
    checked for existence.

    Raises:
        ValueError: If ``train_data`` is not provided.
        FileNotFoundError: If one of the ``train_data`` paths does not exist.
    """
    # Annotated as ``str`` (one path on the command line); with ``nargs="+"`` the parsed
    # value is a list of paths, and a single string is wrapped into a list in __post_init__.
    train_data: str = field(
        default=None, metadata={
            "help": "One or more paths to training data. `query: str`, `pos: List[str]`, `neg: List[str]` are required in the training data.",
            "nargs": "+"
        }
    )
    cache_path: Optional[str] = field(
        default=None, metadata={"help": "Where do you want to store the cached data"}
    )
    train_group_size: int = field(default=8)

    query_max_len: int = field(
        default=32,
        metadata={
            "help": "The maximum total input sequence length after tokenization for query. Sequences longer than this will be truncated."
        },
    )

    passage_max_len: int = field(
        default=128,
        metadata={
            "help": "The maximum total input sequence length after tokenization for passage. Sequences longer than this will be truncated."
        },
    )

    pad_to_multiple_of: Optional[int] = field(
        default=None,
        metadata={
            "help": "If set will pad the sequence to be a multiple of the provided value."
        },
    )

    max_example_num_per_dataset: int = field(
        default=100000000, metadata={"help": "the max number of examples for each dataset"}
    )

    query_instruction_for_retrieval: Optional[str] = field(
        default=None, metadata={"help": "instruction for query"}
    )
    query_instruction_format: str = field(
        default="{}{}", metadata={"help": "format for query instruction"}
    )

    knowledge_distillation: bool = field(
        default=False,
        metadata={"help": "Use knowledge distillation when `pos_scores: List[float]` and `neg_scores: List[float]` are in features of training data"}
    )

    passage_instruction_for_retrieval: Optional[str] = field(
        default=None, metadata={"help": "instruction for passage"}
    )
    passage_instruction_format: Optional[str] = field(
        default="{}{}", metadata={"help": "format for passage instruction"}
    )

    shuffle_ratio: float = field(
        default=0.0, metadata={"help": "The ratio of shuffling the text"}
    )

    # Parameters for SameDatasetDataArguments
    same_dataset_within_batch: bool = field(
        default=False, metadata={"help": "All samples in the same batch comes from the same dataset."}
    )
    small_threshold: int = field(
        default=0,
        metadata={"help": "The threshold of small dataset. All small dataset in the same directory will be merged into one dataset."}
    )
    drop_threshold: int = field(
        default=0,
        metadata={"help": "The threshold for dropping merged small dataset. If the number of examples in the merged small dataset is less than this threshold, it will be dropped."}
    )

    def __post_init__(self):
        # replace "\\n" with "\n" (a format may legitimately be None, see its type)
        if self.query_instruction_format is not None:
            self.query_instruction_format = self.query_instruction_format.replace("\\n", "\n")
        if self.passage_instruction_format is not None:
            self.passage_instruction_format = self.passage_instruction_format.replace("\\n", "\n")

        if self.train_data is None:
            raise ValueError("`train_data` is required: provide one or more paths to training data.")
        # A bare string would otherwise be iterated character by character below.
        if isinstance(self.train_data, str):
            self.train_data = [self.train_data]

        # check the existence of train data
        for train_dir in self.train_data:
            if not os.path.exists(train_dir):
                raise FileNotFoundError(f"cannot find file: {train_dir}, please set a true path")
@dataclass
class AbsEmbedderTrainingArguments(TrainingArguments):
    """
    Training arguments for embedders: ``TrainingArguments`` plus the embedder-specific options below.
    """
    negatives_cross_device: bool = field(
        default=False,
        metadata={"help": "share negatives across devices"}
    )
    temperature: Optional[float] = field(
        default=0.02,
        metadata={"help": "temperature used for similarity score"}
    )
    fix_position_embedding: bool = field(
        default=False,
        metadata={"help": "Freeze the parameters of position embeddings"}
    )
    sentence_pooling_method: str = field(
        default='cls',
        metadata={
            "help": "the pooling method. Available options: cls, mean, last_token. Default: cls",
            "choices": ['cls', 'mean', 'last_token']
        }
    )
    normalize_embeddings: bool = field(
        default=True,
        metadata={"help": "whether to normalize the embeddings"}
    )
    sub_batch_size: Optional[int] = field(
        default=None,
        metadata={"help": "sub batch size for training"}
    )
    kd_loss_type: str = field(
        default='kl_div',
        metadata={
            "help": "the loss type for knowledge distillation. Available options: kl_div, m3_kd_loss. Default: kl_div.",
            "choices": ['kl_div', 'm3_kd_loss']
        }
    )
================================================
FILE: FlagEmbedding/abc/finetune/embedder/AbsDataset.py
================================================
import os
import math
import random
import logging
import datasets
import numpy as np
import torch.distributed as dist
from dataclasses import dataclass
from torch.utils.data import Dataset
from transformers import (
PreTrainedTokenizer,
DataCollatorWithPadding,
TrainerCallback,
TrainerState,
TrainerControl
)
from .AbsArguments import AbsEmbedderDataArguments, AbsEmbedderTrainingArguments
logger = logging.getLogger(__name__)
class AbsEmbedderTrainDataset(Dataset):
    """Abstract class for training dataset.

    Every ``.json``/``.jsonl`` file listed in ``args.train_data`` (given directly, or found
    directly inside a listed directory) is loaded; the non-empty ones are concatenated
    into a single dataset.

    Args:
        args (AbsEmbedderDataArguments): Data arguments.
        tokenizer (PreTrainedTokenizer): Tokenizer to use.

    Raises:
        ValueError: If no non-empty training file could be loaded.
    """
    def __init__(
        self,
        args: AbsEmbedderDataArguments,
        tokenizer: PreTrainedTokenizer
    ):
        self.args = args
        self.tokenizer = tokenizer
        self.shuffle_ratio = args.shuffle_ratio

        train_datasets = []
        for data_dir in args.train_data:
            if os.path.isdir(data_dir):
                file_paths = [os.path.join(data_dir, file) for file in os.listdir(data_dir)]
            else:
                file_paths = [data_dir]
            for file_path in file_paths:
                if not file_path.endswith(('.json', '.jsonl')):
                    continue
                temp_dataset = self._load_dataset(file_path)
                if len(temp_dataset) == 0:
                    continue
                train_datasets.append(temp_dataset)

        if not train_datasets:
            raise ValueError(
                f"No non-empty `.json`/`.jsonl` training file found in: {args.train_data}"
            )
        self.dataset = datasets.concatenate_datasets(train_datasets)

    def _load_dataset(self, file_path: str):
        """Load dataset from path.

        The dataset is randomly sub-sampled to ``args.max_example_num_per_dataset`` examples
        when it is larger. Teacher score columns are dropped unless knowledge distillation
        is enabled.

        Args:
            file_path (str): Path to load the datasets from.

        Raises:
            ValueError: `pos_scores` and `neg_scores` not found in the features of training data

        Returns:
            datasets.Dataset: Loaded HF dataset.
        """
        safe_rank = dist.get_rank() if dist.is_initialized() else 0
        if safe_rank == 0:
            logger.info(f'loading data from {file_path} ...')

        temp_dataset = datasets.load_dataset('json', data_files=file_path, split='train', cache_dir=self.args.cache_path)
        if len(temp_dataset) > self.args.max_example_num_per_dataset:
            temp_dataset = temp_dataset.select(random.sample(list(range(len(temp_dataset))), self.args.max_example_num_per_dataset))

        if not self.args.knowledge_distillation:
            if 'pos_scores' in temp_dataset.column_names:
                temp_dataset = temp_dataset.remove_columns(['pos_scores'])
            if 'neg_scores' in temp_dataset.column_names:
                temp_dataset = temp_dataset.remove_columns(['neg_scores'])
        else:
            if 'pos_scores' not in temp_dataset.column_names or 'neg_scores' not in temp_dataset.column_names:
                raise ValueError(f"`pos_scores` and `neg_scores` not found in the features of training data in {file_path}, which is necessary when using knowledge distillation.")
        return temp_dataset

    def _shuffle_text(self, text):
        """shuffle the input text.

        With probability ``shuffle_ratio``, a text longer than 100 characters is cut into
        (about) three chunks which are shuffled and re-joined with spaces.

        Args:
            text (str): Input text.

        Returns:
            str: Shuffled text.
        """
        if self.shuffle_ratio > 0 and len(text) > 100 and random.random() < self.shuffle_ratio:
            chunk_size = len(text) // 3 + 1
            split_text = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
            random.shuffle(split_text)
            return " ".join(split_text)
        else:
            return text

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, item):
        """Build one training group: a query, one positive plus sampled negatives, and teacher scores.

        Args:
            item (int): Index of the example.

        Raises:
            ValueError: If negatives are required but the example has none, or if a teacher
                score is not a number.

        Returns:
            Tuple[str, List[str], Optional[List[float]]]: The query, ``train_group_size``
                passages (positive first) and the matching teacher scores (``None`` when
                knowledge distillation is disabled).
        """
        data = self.dataset[item]
        train_group_size = self.args.train_group_size
        num_negatives = train_group_size - 1

        query = data['query']
        if self.args.query_instruction_for_retrieval is not None:
            query = self.args.query_instruction_format.format(
                data['prompt'] if 'prompt' in data else self.args.query_instruction_for_retrieval,
                query
            )

        passages = []
        teacher_scores = []

        assert isinstance(data['pos'], list) and isinstance(data['neg'], list)

        pos_idx = random.choice(range(len(data['pos'])))
        passages.append(self._shuffle_text(data['pos'][pos_idx]))

        neg_all_idx = list(range(len(data['neg'])))
        if len(data['neg']) < num_negatives:
            if not neg_all_idx:
                # Previously surfaced as an opaque ZeroDivisionError.
                raise ValueError(
                    f"Example {item} has no negatives but `train_group_size`={train_group_size} "
                    f"requires {num_negatives}."
                )
            # Too few negatives: repeat the pool until it is large enough to sample from.
            num = math.ceil(num_negatives / len(data['neg']))
            neg_idxs = random.sample(neg_all_idx * num, num_negatives)
        else:
            neg_idxs = random.sample(neg_all_idx, num_negatives)
        for neg_idx in neg_idxs:
            passages.append(data['neg'][neg_idx])

        if self.args.knowledge_distillation:
            assert isinstance(data['pos_scores'], list) and isinstance(data['neg_scores'], list)
            teacher_scores.append(data['pos_scores'][pos_idx])
            for neg_idx in neg_idxs:
                teacher_scores.append(data['neg_scores'][neg_idx])
            if not all(isinstance(score, (int, float)) for score in teacher_scores):
                raise ValueError("pos_score or neg_score must be digit")
        else:
            teacher_scores = None

        if self.args.passage_instruction_for_retrieval is not None:
            passages = [
                self.args.passage_instruction_format.format(
                    self.args.passage_instruction_for_retrieval, p
                )
                for p in passages
            ]

        return query, passages, teacher_scores
@dataclass
class AbsEmbedderCollator(DataCollatorWithPadding):
    """
    The abstract embedder collator.

    Tokenizes the queries and passages of a batch and pads them, either as one batch each
    or, when ``sub_batch_size`` is positive, as a list of separately padded sub-batches.
    """
    query_max_len: int = 32
    passage_max_len: int = 128
    sub_batch_size: int = -1

    def _pad_inputs(self, inputs, max_length):
        """Pad tokenized ``inputs`` as a whole, or per sub-batch when ``sub_batch_size`` is positive.

        Args:
            inputs: Un-padded tokenizer output (mapping from feature name to a list of sequences).
            max_length (int): ``max_length`` forwarded to ``tokenizer.pad``.

        Returns:
            The padded batch, or a list of padded sub-batches.
        """
        def pad(batch):
            return self.tokenizer.pad(
                batch,
                padding=self.padding,
                max_length=max_length,
                pad_to_multiple_of=self.pad_to_multiple_of,
                return_tensors=self.return_tensors
            )

        if self.sub_batch_size is None or self.sub_batch_size <= 0:
            return pad(inputs)

        batch_size = self.sub_batch_size
        total = len(inputs['attention_mask'])
        return [
            pad({k: v[start:start + batch_size] for k, v in inputs.items()})
            for start in range(0, total, batch_size)
        ]

    def __call__(self, features):
        """Collate ``(query, passages, teacher_scores)`` items into model inputs.

        Args:
            features (list): Items as returned by the training dataset.

        Returns:
            dict: ``queries`` and ``passages`` (padded batches, or lists of padded sub-batches),
                the flattened ``teacher_scores`` (or ``None``) and ``no_in_batch_neg_flag``
                (always ``False`` here).
        """
        queries = [f[0] for f in features]
        passages = [f[1] for f in features]
        teacher_scores = [f[2] for f in features]

        # Flatten per-item lists in linear time (``sum(lists, [])`` is quadratic).
        if teacher_scores[0] is None:
            teacher_scores = None
        elif isinstance(teacher_scores[0], list):
            teacher_scores = [score for group in teacher_scores for score in group]

        if isinstance(queries[0], list):
            queries = [query for group in queries for query in group]
        if isinstance(passages[0], list):
            passages = [passage for group in passages for passage in group]

        queries_inputs = self.tokenizer(
            queries,
            truncation=True,
            max_length=self.query_max_len,
            return_tensors=None
        )
        passages_inputs = self.tokenizer(
            passages,
            truncation=True,
            max_length=self.passage_max_len,
            return_tensors=None
        )

        q_collated = self._pad_inputs(queries_inputs, self.query_max_len)
        d_collated = self._pad_inputs(passages_inputs, self.passage_max_len)

        return {
            "queries": q_collated,
            "passages": d_collated,
            "teacher_scores": teacher_scores,
            "no_in_batch_neg_flag": False
        }
class AbsEmbedderSameDatasetTrainDataset(AbsEmbedderTrainDataset):
    """Abstract class for training dataset that samples batches from same dataset.

    Each item returned by :meth:`__getitem__` is already a complete batch for the current
    process, built from examples of a single source dataset.

    Args:
        args (AbsEmbedderDataArguments): Data arguments.
        default_batch_size (int): The default batch size for training.
        seed (int): Random seed.
        tokenizer (PreTrainedTokenizer): Tokenizer to use.
        process_index (int, optional): Current process index. Defaults to 0.
        num_processes (int, optional): Total number of processes. Defaults to 1.
    """
    def __init__(
        self,
        args: AbsEmbedderDataArguments,
        default_batch_size: int,
        seed: int,
        tokenizer: PreTrainedTokenizer,
        process_index: int=0,
        num_processes: int=1
    ):
        self.args = args
        self.shuffle_ratio = args.shuffle_ratio
        # NOTE: attribute name (typo included) kept as-is for backward compatibility.
        self.defaut_batch_size = default_batch_size
        self.deterministic_generator = np.random.default_rng(seed)
        self.tokenizer = tokenizer
        self.process_index = process_index
        self.num_processes = num_processes

        self.step = 0

        train_datasets = []
        each_data_idxs = []
        batch_size_idxs = []
        no_in_batch_neg_flags = []
        cur_all_num = 0

        small_threshold = args.small_threshold
        drop_threshold = args.drop_threshold

        def add_dataset(dataset, batch_size, no_in_batch_neg_flag):
            # Register a dataset together with the range of global indices it occupies
            # in the concatenated dataset.
            nonlocal cur_all_num
            train_datasets.append(dataset)
            each_data_idxs.append(np.arange(len(dataset)) + cur_all_num)
            cur_all_num += len(dataset)
            batch_size_idxs.append(batch_size)
            no_in_batch_neg_flags.append(no_in_batch_neg_flag)

        for data_dir in args.train_data:
            if not os.path.isdir(data_dir):
                # Check the extension first: paths without any "." are skipped instead of
                # failing while the flag is derived from the file name.
                if not data_dir.endswith(('.json', '.jsonl')):
                    continue
                # Add `no_in_batch_neg` **suffix** to `data_dir` to indicate that this dataset does not use in-batch negatives
                no_in_batch_neg_flag = os.path.splitext(data_dir)[0].endswith('no_in_batch_neg')
                temp_dataset = self._load_dataset(data_dir)
                if len(temp_dataset) == 0 or len(temp_dataset) < small_threshold:
                    continue
                add_dataset(
                    temp_dataset,
                    self._get_file_batch_size(temp_dataset, default_batch_size),
                    no_in_batch_neg_flag
                )
            else:
                small_datasets = []
                small_batch_size = math.inf

                # Add `no_in_batch_neg` **suffix** to `data_dir` to indicate that this dataset does not use in-batch negatives
                # (normpath drops a trailing path separator that would hide the suffix)
                no_in_batch_neg_flag = os.path.normpath(data_dir).endswith('no_in_batch_neg')
                for file in os.listdir(data_dir):
                    if not file.endswith(('.json', '.jsonl')):
                        continue
                    temp_dataset = self._load_dataset(os.path.join(data_dir, file))

                    if len(temp_dataset) == 0:
                        continue
                    elif len(temp_dataset) < small_threshold:
                        # Small datasets of one directory are merged below; the merged
                        # dataset uses the smallest batch size among them.
                        small_datasets.append(temp_dataset)
                        small_batch_size = min(small_batch_size, self._get_file_batch_size(temp_dataset, default_batch_size))
                    else:
                        add_dataset(
                            temp_dataset,
                            self._get_file_batch_size(temp_dataset, default_batch_size),
                            no_in_batch_neg_flag
                        )

                if len(small_datasets) > 0:
                    small_dataset = datasets.concatenate_datasets(small_datasets)
                    if len(small_dataset) >= drop_threshold:
                        add_dataset(small_dataset, small_batch_size, no_in_batch_neg_flag)

        self.dataset = datasets.concatenate_datasets(train_datasets)
        self.each_data_idxs = each_data_idxs
        self.datasets_inxs = np.arange(len(each_data_idxs))
        self.batch_size_idxs = batch_size_idxs
        self.no_in_batch_neg_flags = no_in_batch_neg_flags

        self.refresh_epoch()

    def _load_dataset(self, file_path: str):
        """Load dataset from given path.

        The dataset is randomly sub-sampled to ``args.max_example_num_per_dataset`` examples
        when it is larger. Teacher score columns are dropped unless knowledge distillation
        is enabled.

        Args:
            file_path (str): The path to load or download from HF hub.

        Returns:
            datasets.Dataset: The loaded dataset.
        """
        safe_rank = dist.get_rank() if dist.is_initialized() else 0
        if safe_rank == 0:
            logger.info(f'loading data from {file_path} ...')

        temp_dataset = datasets.load_dataset('json', data_files=file_path, split='train', cache_dir=self.args.cache_path)
        if len(temp_dataset) > self.args.max_example_num_per_dataset:
            temp_dataset = temp_dataset.select(random.sample(list(range(len(temp_dataset))), self.args.max_example_num_per_dataset))

        if not self.args.knowledge_distillation:
            if 'pos_scores' in temp_dataset.column_names:
                temp_dataset = temp_dataset.remove_columns(['pos_scores'])
            if 'neg_scores' in temp_dataset.column_names:
                temp_dataset = temp_dataset.remove_columns(['neg_scores'])
        return temp_dataset

    @staticmethod
    def _get_file_batch_size(temp_dataset: datasets.Dataset, default_batch_size: int):
        """Get the appropriate batch size for the dataset.

        Args:
            temp_dataset (datasets.Dataset): Loaded :data:`datasets.Dataset` object.
            default_batch_size (int): The default batch size to use if not specified in the dataset.

        Returns:
            int: The final batch size to use.
        """
        # An explicit `batch_size` column (value of the first row) takes precedence.
        if 'batch_size' in temp_dataset.column_names:
            return temp_dataset['batch_size'][0]
        if 'type' in temp_dataset.column_names:
            data_type = temp_dataset['type'][0]
            if 'symmetric' in data_type:
                return default_batch_size // 2  # make the symmetric data have smaller batch size
        return default_batch_size

    def refresh_epoch(self):
        """
        Refresh data for epoch.

        Shuffles the dataset order and the examples of every dataset, cuts each dataset
        into global batches of ``batch_size * num_processes`` examples (an incomplete last
        batch is dropped), shuffles the batches and resets the step counter.
        """
        logger.info(f'-- Rank {self.process_index}: refresh data --')
        self.deterministic_generator.shuffle(self.datasets_inxs)

        batch_datas = []
        for dataset_inx in self.datasets_inxs:
            self.deterministic_generator.shuffle(self.each_data_idxs[dataset_inx])
            cur_batch_size = self.batch_size_idxs[dataset_inx] * self.num_processes
            no_in_batch_neg_flag = self.no_in_batch_neg_flags[dataset_inx]
            data_idxs = self.each_data_idxs[dataset_inx]
            for start_index in range(0, len(data_idxs), cur_batch_size):
                # judge the last batch's length
                if len(data_idxs) - start_index < cur_batch_size:
                    break
                batch_datas.append((
                    data_idxs[start_index:start_index + cur_batch_size],
                    no_in_batch_neg_flag
                ))
        self.deterministic_generator.shuffle(batch_datas)
        self.batch_datas = batch_datas
        self.step = 0

    def __len__(self):
        return len(self.batch_datas) * self.num_processes

    def __getitem__(self, _):
        """Return the next batch for this process.

        The index argument is ignored: batches are served in order through the internal
        step counter, which :meth:`refresh_epoch` resets.

        Returns:
            Tuple: Queries, passages, teacher scores and the ``no_in_batch_neg`` flag of the batch.
        """
        batch_indices, no_in_batch_neg_flag = self.batch_datas[self.step]    # extend here
        # Every process takes its own contiguous slice of the global batch.
        cur_batch_size = len(batch_indices) // self.num_processes
        batch_indices = batch_indices[self.process_index * cur_batch_size: (self.process_index + 1) * cur_batch_size]
        batch_data = self.dataset[batch_indices]
        self.step += 1
        queries, passages, teacher_scores = self._create_batch_data(batch_raw_data=batch_data)
        return queries, passages, teacher_scores, no_in_batch_neg_flag

    def _get_train_group_size(self, batch_raw_data):
        """Get the training group size and data type.

        Args:
            batch_raw_data (datasets.Dataset): One batch of raw data.

        Returns:
            int: The training group size.
            str: The type of data for the task.
        """
        if 'type' in batch_raw_data:
            data_type = batch_raw_data['type'][0]
            if data_type in ['only_1neg']:
                return 2, data_type
            elif data_type in ['symmetric_class']:
                return min(len(batch_raw_data['neg'][0]) + 1, self.args.train_group_size), data_type
            else:
                return self.args.train_group_size, data_type
        elif 'train_group_size' in batch_raw_data:
            train_group_size = batch_raw_data['train_group_size'][0]
            if isinstance(train_group_size, int) and train_group_size > 0:
                return train_group_size, None
            else:
                return self.args.train_group_size, None

        return self.args.train_group_size, None

    def _apply_query_instruction(self, instruction, text):
        """Format ``text`` with the query instruction format; leave it untouched when there is no instruction.

        Formatting with a ``None`` instruction would insert the literal string "None".
        """
        if instruction is None:
            return text
        return self.args.query_instruction_format.format(instruction, text)

    def _create_batch_data(self, batch_raw_data):
        """Create a complete batch of data with queries, documents and teacher scores.

        Args:
            batch_raw_data (datasets.Dataset): One batch of raw data.

        Raises:
            ValueError: If negatives are required but an example has none.

        Returns:
            List[str]: Queries with instruction format.
            List[str]: Documents with instruction format.
            List[float]: Teacher scores for model distillation.
        """
        queries, passages, teacher_scores = [], [], []

        train_group_size, data_type = self._get_train_group_size(batch_raw_data)
        num_negatives = train_group_size - 1
        has_prompt = 'prompt' in batch_raw_data

        for i in range(len(batch_raw_data['query'])):
            if data_type is not None:
                assert batch_raw_data['type'][i] == data_type, "Data type is not consistent in the same batch"

            # A per-example prompt takes precedence over the global query instruction.
            instruction = batch_raw_data['prompt'][i] if has_prompt else self.args.query_instruction_for_retrieval
            queries.append(self._apply_query_instruction(instruction, batch_raw_data['query'][i]))

            tmp_passages = []
            pos_idx = random.choice(range(len(batch_raw_data['pos'][i])))
            pos = self._shuffle_text(batch_raw_data['pos'][i][pos_idx])
            tmp_passages.append(pos)

            negatives = batch_raw_data['neg'][i]
            neg_all_idx = list(range(len(negatives)))
            if len(negatives) < num_negatives:
                if not neg_all_idx:
                    # Previously surfaced as an opaque ZeroDivisionError.
                    raise ValueError(
                        f"An example has no negatives but the train group size {train_group_size} "
                        f"requires {num_negatives}."
                    )
                # Too few negatives: repeat the pool until it is large enough to sample from.
                num = math.ceil(num_negatives / len(negatives))
                neg_idxs = random.sample(neg_all_idx * num, num_negatives)
            else:
                neg_idxs = random.sample(neg_all_idx, num_negatives)
            for neg_idx in neg_idxs:
                tmp_passages.append(negatives[neg_idx])

            if self.args.knowledge_distillation:
                if 'pos_scores' in batch_raw_data and batch_raw_data['pos_scores'][i] is not None:
                    teacher_scores.append(batch_raw_data['pos_scores'][i][pos_idx])
                for neg_idx in neg_idxs:
                    if 'neg_scores' in batch_raw_data and batch_raw_data['neg_scores'][i] is not None:
                        teacher_scores.append(batch_raw_data['neg_scores'][i][neg_idx])
            else:
                teacher_scores = None

            if data_type is not None and data_type in ['symmetric_sts', 'symmetric_clustering']:
                # Symmetric tasks: passages are formatted like queries.
                tmp_passages = [self._apply_query_instruction(instruction, p) for p in tmp_passages]
            else:
                if self.args.passage_instruction_for_retrieval is not None:
                    tmp_passages = [
                        self.args.passage_instruction_format.format(
                            self.args.passage_instruction_for_retrieval, p
                        ) for p in tmp_passages
                    ]

            passages.extend(tmp_passages)

            if teacher_scores is not None:
                if len(teacher_scores) > 0 and len(passages) > 0:
                    assert len(teacher_scores) == len(passages)

        return queries, passages, teacher_scores
@dataclass
class AbsEmbedderSameDatasetCollator(DataCollatorWithPadding):
    """
    EmbedCollator for SameDataset.

    Only the first element of ``features`` is used: it must already hold a complete batch
    ``(queries, passages, teacher_scores, no_in_batch_neg_flag)``.

    Note that after using this collator, the training_args should be set as:

    ``training_args.per_device_train_batch_size = 1``

    ``training_args.dataloader_num_workers = 0    # avoid multi-processing``
    """
    query_max_len: int = 32
    passage_max_len: int = 128
    sub_batch_size: int = -1

    def __call__(self, features):
        batch = features[0]
        queries = batch[0]
        passages = batch[1]
        teacher_scores = batch[2]
        no_in_batch_neg_flag = batch[3]

        def pad(inputs, max_length):
            # Thin wrapper so both padding modes share one tokenizer.pad call.
            return self.tokenizer.pad(
                inputs,
                padding=self.padding,
                max_length=max_length,
                pad_to_multiple_of=self.pad_to_multiple_of,
                return_tensors=self.return_tensors,
            )

        def pad_in_sub_batches(inputs, max_length):
            # Pad consecutive slices of `sub_batch_size` sequences independently.
            step = self.sub_batch_size
            total = len(inputs['attention_mask'])
            padded = []
            for start in range(0, total, step):
                stop = min(total, start + step)
                chunk = {name: values[start:stop] for name, values in inputs.items()}
                padded.append(pad(chunk, max_length))
            return padded

        queries_inputs = self.tokenizer(
            queries,
            truncation=True,
            max_length=self.query_max_len,
            return_tensors=None
        )
        passages_inputs = self.tokenizer(
            passages,
            truncation=True,
            max_length=self.passage_max_len,
            return_tensors=None
        )

        use_sub_batches = self.sub_batch_size is not None and self.sub_batch_size > 0
        collate = pad_in_sub_batches if use_sub_batches else pad
        q_collated = collate(queries_inputs, self.query_max_len)
        d_collated = collate(passages_inputs, self.passage_max_len)

        # An empty score list means "no teacher scores" for the model.
        if isinstance(teacher_scores, list) and len(teacher_scores) == 0:
            teacher_scores = None

        return {
            "queries": q_collated,
            "passages": d_collated,
            "teacher_scores": teacher_scores,
            "no_in_batch_neg_flag": no_in_batch_neg_flag
        }
class EmbedderTrainerCallbackForDataRefresh(TrainerCallback):
    """
    Trainer callback that rebuilds the batches of a same-dataset training set after each epoch.
    """
    def __init__(self, train_dataset: AbsEmbedderSameDatasetTrainDataset):
        # Dataset whose ``refresh_epoch`` is invoked at every epoch end.
        self.train_dataset = train_dataset

    def on_epoch_end(
        self,
        args: AbsEmbedderTrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs
    ):
        """
        Event called at the end of an epoch: refresh the training data.
        """
        dataset = self.train_dataset
        dataset.refresh_epoch()
================================================
FILE: FlagEmbedding/abc/finetune/embedder/AbsModeling.py
================================================
import torch
from torch import nn, Tensor
import torch.nn.functional as F
import torch.distributed as dist
from transformers import PreTrainedTokenizer
from transformers.file_utils import ModelOutput
import logging
from dataclasses import dataclass
from abc import ABC, abstractmethod
from typing import Dict, Optional, List, Union
logger = logging.getLogger(__name__)
@dataclass
class EmbedderOutput(ModelOutput):
    """
    Container for what the embedder model returns; every field is optional.
    """
    # Query representations.
    q_reps: Optional[Tensor] = None
    # Passage representations.
    p_reps: Optional[Tensor] = None
    # Loss value, when one was computed.
    loss: Optional[Tensor] = None
    # Scores, when they were computed.
    scores: Optional[Tensor] = None
class AbsEmbedderModel(ABC, nn.Module):
"""Abstract class of embedding model for training.
Args:
base_model: The base model to train on.
tokenizer (PreTrainedTokenizer, optional): The tokenizer to use. Defaults to ``None``.
negatives_cross_device (bool, optional): If True, will compute cross devices negative loss. Defaults to ``False``.
temperature (float, optional): Temperature to control the scale of scores. Defaults to ``1.0``.
sub_batch_size (int, optional): Sub-batch size during encoding. If negative, will not split to sub-batch.
Defaults to ``-1``.
kd_loss_type (str, optional): Type of knowledge distillation loss. Defaults to ``"kl_div"``.
"""
def __init__(
    self,
    base_model,
    tokenizer: PreTrainedTokenizer = None,
    negatives_cross_device: bool = False,
    temperature: float = 1.0,
    sub_batch_size: int = -1,
    kd_loss_type: str = 'kl_div',
):
    """Store the base model and training options and record the distributed rank / world size.

    Raises:
        ValueError: If ``negatives_cross_device`` is requested while distributed training
            has not been initialized.
    """
    nn.Module.__init__(self)

    if negatives_cross_device and not dist.is_initialized():
        raise ValueError('Distributed training has not been initialized for representation all gather.')

    self.model = base_model
    self.tokenizer = tokenizer
    self.temperature = temperature
    self.negatives_cross_device = negatives_cross_device

    # Fall back to a single-process setup when torch.distributed is not initialized.
    if dist.is_initialized():
        self.process_rank = dist.get_rank()
    else:
        self.process_rank = 0
    if dist.is_initialized():
        self.world_size = dist.get_world_size()
    else:
        self.world_size = 1

    self.sub_batch_size = sub_batch_size
    self.kd_loss_type = kd_loss_type
@abstractmethod
def encode(self, features):
    """Encode the given features into embeddings (to be implemented by subclasses).

    Args:
        features (Union[list, dict]): Features feed to the model.
    """
@abstractmethod
def compute_loss(self, scores, target):
    """Compute the loss from scores and targets (to be implemented by subclasses).

    Args:
        scores (torch.Tensor): Computed score.
        target (torch.Tensor): The target value.
    """
@abstractmethod
def compute_score(self, q_reps, p_reps):
    """Score queries against passages (to be implemented by subclasses).

    Args:
        q_reps (torch.Tensor): Queries representations.
        p_reps (torch.Tensor): Passages representations.
    """
@abstractmethod
def save(self, output_dir: str):
    """Save the model (to be implemented by subclasses).

    Args:
        output_dir (str): Directory for saving the model.
    """
def get_local_score(self, q_reps, p_reps, all_scores):
    """Pick, for every query, the scores of the passages in its own group.

    Passages are laid out group by group, so the ``j``-th passage of query ``i`` sits in
    column ``i * group_size + j`` of ``all_scores``.

    Args:
        q_reps (torch.Tensor): Queries representations.
        p_reps (torch.Tensor): Passages representations.
        all_scores (torch.Tensor): All the query-passage scores computed.

    Returns:
        torch.Tensor: Local scores to compute loss, one row per query.
    """
    num_queries = q_reps.size(0)
    group_size = p_reps.size(0) // num_queries

    rows = torch.arange(num_queries, device=q_reps.device)
    offsets = torch.arange(group_size, device=q_reps.device)
    # (num_queries, group_size) grid of column indices: first column of each group + offset.
    cols = (rows * group_size).unsqueeze(1) + offsets.unsqueeze(0)

    return all_scores[rows.unsqueeze(1), cols].view(num_queries, -1)
def compute_local_score(self, q_reps, p_reps, compute_score_func=None, **kwargs):
    """Score every query against all passages, then keep only its own group's scores.

    Args:
        q_reps (torch.Tensor): Queries representations.
        p_reps (torch.Tensor): Passages representations.
        compute_score_func (function, optional): Function to compute score. Defaults to ``None``,
            which will use :meth:`self.compute_score`.
        **kwargs: Forwarded to ``compute_score_func`` only; they are not passed to
            :meth:`self.compute_score`.

    Returns:
        torch.Tensor: Local scores to compute loss.
    """
    if compute_score_func is not None:
        full_scores = compute_score_func(q_reps, p_reps, **kwargs)
    else:
        full_scores = self.compute_score(q_reps, p_reps)
    return self.get_local_score(q_reps, p_reps, full_scores)
def _compute_no_in_batch_neg_loss(self, q_reps, p_reps, teacher_targets=None, compute_score_func=None, **kwargs):
"""
Compute loss when using no in-batch negatives and no cross-device negatives
"""
group_size = p_reps.size(0) // q_reps.size(0)
local_scores = self.compute_local_score(q_reps, p_reps, compute_score_func, **kwargs) # (batch_size, group_size)
if teacher_targets is not None:
# compute kd loss
loss = self.distill_loss(self.kd_loss_type, teacher_targets, local_scores, group_size=group_size)
# add normal loss if needed
if self.kd_loss_type == "kl_div":
local_targets = torch.zeros(local_scores.size(0), device=local_scores.device, dtype=torch.long) # (batch_size)
loss += self.compute_loss(local_scores, local_targets)
else:
local_targets = torch.zeros(local_scores.size(0), device=local_scores.device, dtype=torch.long) # (batch_size)
loss = self.compute_loss(local_scores, local_targets)
return local_scores, loss
def _compute_in_batch_neg_loss(self, q_reps, p_reps, teacher_targets=None, compute_score_func=None, **kwargs):
"""
Compute loss when only using in-batch negatives
"""
group_size = p_reps.size(0) // q_reps.size(0)
if compute_score_func is None:
scores = self.compute_score(q_reps, p_reps) # (batch_size, batch_size * group_size)
else:
scores = compute_score_func(q_reps, p_reps, **kwargs) # (batch_size, batch_size * group_size)
if teacher_targets is not None:
# compute kd loss
if self.kd_loss_type == "kl_div":
student_scores = self.get_local_score(q_reps, p_reps, scores) # (batch_size, group_size)
loss = self.distill_loss(self.kd_loss_type, teacher_targets, student_scores, group_size)
idxs = torch.arange(q_reps.size(0), device=q_reps.device, dtype=torch.long)
targets = idxs * (p_reps.size(0) // q_reps.size(0)) # (batch_size)
loss += self.compute_loss(scores, targets)
elif self.kd_loss_type == "m3_kd_loss":
loss = self.distill_loss(self.kd_loss_type, teacher_targets, scores, group_size)
else:
raise ValueError(f"Invalid kd_loss_type: {self.kd_loss_type}")
else:
idxs = torch.arange(q_reps.size(0), device=q_reps.device, dtype=torch.long)
targets = idxs * group_size # (batch_size)
loss = self.compute_loss(scores, targets)
return scores, loss
def _compute_cross_device_neg_loss(self, q_reps, p_reps, teacher_targets=None, compute_score_func=None, **kwargs):
"""
Compute loss when using both in-batch negatives and cross-device negatives
"""
group_size = p_reps.size(0) // q_reps.size(0)
cross_q_reps = self._dist_gather_tensor(q_reps) # (world_size * batch_size, dim)
cross_p_reps = self._dist_gather_tensor(p_reps) # (world_size * batch_size * group_size, dim)
if compute_score_func is None:
cross_scores = self.compute_score(cross_q_reps, cross_p_reps) # (world_size * batch_size, world_size * batch_size * group_size)
else:
cross_scores = compute_score_func(cross_q_reps, cross_p_reps, **kwargs) # (world_size * batch_size, world_size * batch_size * group_size)
if teacher_targets is not None:
# compute kd loss
if self.kd_loss_type == "kl_div":
student_scores = self.get_local_score(cross_q_reps, cross_p_reps, cross_scores) # (world_size * batch_size, group_size)
student_scores = student_scores[
q_reps.size(0)*self.process_rank : q_reps.size(0)*(self.process_rank+1)
] # (batch_size, group_size)
loss = self.distill_loss(self.kd_loss_type, teacher_targets, student_scores, group_size)
cross_idxs = torch.arange(cross_q_reps.size(0), device=cross_q_reps.device, dtype=torch.long)
cross_targets = cross_idxs * group_size # (world_size * batch_size)
loss += self.compute_loss(cross_scores, cross_targets)
elif self.kd_loss_type == "m3_kd_loss":
cross_teacher_targets = self._dist_gather_tensor(teacher_targets) # (world_size * batch_size, group_size)
loss = self.distill_loss(self.kd_loss_type, cross_teacher_targets, cross_scores, group_size)
else:
raise ValueError(f"Invalid kd_loss_type: {self.kd_loss_type}")
else:
cross_idxs = torch.arange(cross_q_reps.size(0), device=cross_q_reps.device, dtype=torch.long)
cross_targets = cross_idxs * group_size # (world_size * batch_size)
loss = self.compute_loss(cross_scores, cross_targets)
return cross_scores, loss
def forward(
    self,
    queries: Union[Dict[str, Tensor], List[Dict[str, Tensor]]] = None,
    passages: Union[Dict[str, Tensor], List[Dict[str, Tensor]]] = None,
    teacher_scores: Union[None, List[float]] = None,
    no_in_batch_neg_flag: bool = False,
):
    """Encode queries and passages and, in training mode, compute the loss.

    Args:
        queries (Union[Dict[str, Tensor], List[Dict[str, Tensor]]], optional): Input queries.
            Defaults to ``None``.
        passages (Union[Dict[str, Tensor], List[Dict[str, Tensor]]], optional): Input passages.
            Defaults to ``None``.
        teacher_scores (Union[None, List[float]], optional): Teacher scores for distillation.
            Defaults to ``None``.
        no_in_batch_neg_flag (bool, optional): If True, use no in-batch negatives and no
            cross-device negatives. Defaults to ``False``.

    Returns:
        EmbedderOutput: Output of the forward call; ``loss`` is ``None`` outside training mode.
    """
    q_reps = self.encode(queries)  # (batch_size, dim)
    p_reps = self.encode(passages)  # (batch_size * group_size, dim)

    if not self.training:
        return EmbedderOutput(loss=None)

    teacher_targets = None
    if teacher_scores is not None:
        # Turn the flat list of teacher scores into one probability row per query.
        teacher_scores = torch.tensor(teacher_scores, device=q_reps.device)
        teacher_scores = teacher_scores.view(q_reps.size(0), -1).detach()  # (batch_size, group_size)
        teacher_targets = F.softmax(teacher_scores, dim=-1)  # (batch_size, group_size)

    # Choose the loss variant matching the negative-sampling strategy.
    if no_in_batch_neg_flag:
        loss_fn = self._compute_no_in_batch_neg_loss
    elif self.negatives_cross_device:
        loss_fn = self._compute_cross_device_neg_loss
    else:
        loss_fn = self._compute_in_batch_neg_loss

    _, loss = loss_fn(q_reps, p_reps, teacher_targets=teacher_targets)
    return EmbedderOutput(loss=loss)
@staticmethod
def distill_loss(kd_loss_type, teacher_targets, student_scores, group_size=None):
"""Compute the distillation loss.
Args:
kd_loss_type (str): Type of knowledge distillation loss, supports "kl_div" and "m3_kd_loss".
teacher_targets (torch.Tensor): Targets from the teacher model.
student_scores (torch.Tensor): Score of student model.
group_size (int, optional): Number of groups for . Defaults to ``None``.
Raises:
ValueError: Invalid kd_loss_type
Returns:
torch.Tensor: A scalar of computed distillation loss.
"""
if kd_loss_type == 'kl_div':
# teacher_targets: (batch_size, group_size) / (world_size * batch_size, group_size)
# student_scores: (batch_size, group_size) / (world_size * batch_size, group_size)
return - torch.mean(
torch.sum(torch.log_softmax(student_scores, dim=-1) * teacher_targets, dim=-1)
)
elif kd_loss_type == 'm3_kd_loss':
# teacher_targets: (batch_size, group_size) / (world_size * batch_size, group_size)
# student_scores: (batch_size, batch_size * group_size) / (world_size * batch_size, world_size * batch_size * group_size)
labels = torch.arange(student_scores.size(0), device=student_scores.device, dtype=torch.long)
labels = labels * group_size
loss = 0
mask = torch.zeros_like(student_scores)
for i in range(group_size):
temp_target = labels + i
temp_scores = student_scores + mask
temp_loss = F.cross_entropy(temp_scores, temp_target, reduction="none") # B
loss += torch.mean(teacher_targets[:, i] * temp_loss)
mask = torch.scatter(mask, dim=-1, index=temp_target.unsqueeze(-1),
value=torch.finfo(student_scores.dtype).min)
return loss
else:
raise ValueError(f"Invalid kd_loss_type: {kd_loss_type}")
def _dist_gather_tensor(self, t: Optional[torch.Tensor]):
"""Gather a tensor from all processes in a distributed setting.
Args:
t (Optional[torch.Tensor]): The input tensor to be gathered. If `None`, no gathering is performed.
Returns:
Union[torch.Tensor, None]: A concatenated tensor from all processes if ``t`` is not ``None``,
otherwise returns ``None``.
"""
if t is None:
return None
t = t.contiguous()
all_tensors = [torch.empty_like(t) for _ in range(self.world_size)]
dist.all_gather(all_tensors, t)
all_tensors[self.process_rank] = t
all_tensors = torch.cat(all_tensors, dim=0)
return all_tensors
================================================
FILE: FlagEmbedding/abc/finetune/embedder/AbsRunner.py
================================================
import os
import logging
from pathlib import Path
from typing import Tuple
from abc import ABC, abstractmethod
from transformers import set_seed, PreTrainedTokenizer
from .AbsArguments import (
AbsEmbedderModelArguments,
AbsEmbedderDataArguments,
AbsEmbedderTrainingArguments
)
from .AbsTrainer import AbsEmbedderTrainer
from .AbsModeling import AbsEmbedderModel
from .AbsDataset import (
AbsEmbedderTrainDataset, AbsEmbedderCollator,
AbsEmbedderSameDatasetTrainDataset, AbsEmbedderSameDatasetCollator
)
logger = logging.getLogger(__name__)
class AbsEmbedderRunner(ABC):
    """Abstract driver for fine-tuning an embedding model.

    On construction the runner checks the output directory, configures logging, seeds the
    random number generators and then builds, in this order, the tokenizer/model pair, the
    training dataset, the data collator and the trainer.

    Args:
        model_args (AbsEmbedderModelArguments): Model arguments.
        data_args (AbsEmbedderDataArguments): Data arguments.
        training_args (AbsEmbedderTrainingArguments): Training arguments.

    Raises:
        ValueError: If training is requested, the output directory already contains files and
            ``overwrite_output_dir`` is not set.
    """

    def __init__(
        self,
        model_args: AbsEmbedderModelArguments,
        data_args: AbsEmbedderDataArguments,
        training_args: AbsEmbedderTrainingArguments
    ):
        self.model_args = model_args
        self.data_args = data_args
        self.training_args = training_args

        # Refuse to write into a non-empty output directory unless explicitly allowed.
        output_dir = training_args.output_dir
        if (
            os.path.exists(output_dir)
            and os.listdir(output_dir)
            and training_args.do_train
            and not training_args.overwrite_output_dir
        ):
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            )

        # Setup logging: only local rank -1 or 0 logs at INFO level.
        if training_args.local_rank in [-1, 0]:
            log_level = logging.INFO
        else:
            log_level = logging.WARNING
        logging.basicConfig(
            format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
            datefmt="%m/%d/%Y %H:%M:%S",
            level=log_level,
        )
        is_distributed = bool(training_args.local_rank != -1)
        logger.warning(
            "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
            training_args.local_rank,
            training_args.device,
            training_args.n_gpu,
            is_distributed,
            training_args.fp16,
        )
        logger.info("Training/evaluation parameters %s", training_args)
        logger.info("Model parameters %s", model_args)
        logger.info("Data parameters %s", data_args)

        # Set seed
        set_seed(training_args.seed)

        # Each component may rely on the ones built before it, so the order matters.
        self.tokenizer, self.model = self.load_tokenizer_and_model()
        self.train_dataset = self.load_train_dataset()
        self.data_collator = self.load_data_collator()
        self.trainer = self.load_trainer()

    @abstractmethod
    def load_tokenizer_and_model(self) -> Tuple[PreTrainedTokenizer, AbsEmbedderModel]:
        """Load the tokenizer and the model (must be implemented by subclasses).

        Returns:
            Tuple[PreTrainedTokenizer, AbsEmbedderModel]: Loaded tokenizer and model instances.
        """

    @abstractmethod
    def load_trainer(self) -> AbsEmbedderTrainer:
        """Load the trainer (must be implemented by subclasses).

        Returns:
            AbsEmbedderTrainer: The loaded trainer instance.
        """

    def load_train_dataset(self) -> AbsEmbedderTrainDataset:
        """Build the training dataset selected by the data arguments.

        When ``same_dataset_within_batch`` is set, this also overrides
        ``per_device_train_batch_size`` (to 1) and ``dataloader_num_workers`` (to 0) on the
        training arguments.

        Returns:
            AbsEmbedderTrainDataset: The loaded dataset instance.
        """
        if not self.data_args.same_dataset_within_batch:
            return AbsEmbedderTrainDataset(
                args=self.data_args,
                tokenizer=self.tokenizer
            )

        train_args = self.training_args
        same_dataset = AbsEmbedderSameDatasetTrainDataset(
            args=self.data_args,
            default_batch_size=train_args.per_device_train_batch_size,
            seed=train_args.seed,
            tokenizer=self.tokenizer,
            process_index=train_args.process_index,
            num_processes=train_args.world_size
        )
        # The original batch size was handed to the dataset above; the loader itself now
        # takes one item per step (presumably each item is already a full batch - confirm).
        train_args.per_device_train_batch_size = 1
        train_args.dataloader_num_workers = 0  # avoid multi-processing
        return same_dataset

    def load_data_collator(self) -> AbsEmbedderCollator:
        """Build the data collator matching the chosen dataset type.

        Returns:
            AbsEmbedderCollator: Loaded data collator.
        """
        collator_cls = (
            AbsEmbedderSameDatasetCollator
            if self.data_args.same_dataset_within_batch
            else AbsEmbedderCollator
        )
        return collator_cls(
            tokenizer=self.tokenizer,
            query_max_len=self.data_args.query_max_len,
            passage_max_len=self.data_args.passage_max_len,
            sub_batch_size=self.training_args.sub_batch_size,
            pad_to_multiple_of=self.data_args.pad_to_multiple_of,
            padding=True,
            return_tensors="pt"
        )

    def run(self):
        """Create the output directory, train and save the model."""
        Path(self.training_args.output_dir).mkdir(parents=True, exist_ok=True)

        # Training
        self.trainer.train(resume_from_checkpoint=self.training_args.resume_from_checkpoint)
        self.trainer.save_model()
================================================
FILE: FlagEmbedding/abc/finetune/embedder/AbsTrainer.py
================================================
import logging
from typing import Optional
from abc import ABC, abstractmethod
from transformers.trainer import Trainer
logger = logging.getLogger(__name__)
class AbsEmbedderTrainer(ABC, Trainer):
    """Abstract base trainer for embedders, built on the Hugging Face ``Trainer``."""

    @abstractmethod
    def _save(self, output_dir: Optional[str] = None, state_dict=None):
        """Save hook that concrete trainers must implement.

        Args:
            output_dir (Optional[str], optional): Target directory. Defaults to ``None``.
            state_dict (optional): State dict to save. Defaults to ``None``.
        """

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run the model on a batch and return the loss stored in its output.

        Args:
            model (AbsEmbedderModel): The model being trained.
            inputs (dict): A dictionary of input tensors, passed to the model as keyword arguments.
            return_outputs (bool, optional): If ``True``, returns both the loss and the model's
                outputs. Otherwise, returns only the loss.
            **kwargs: Accepted but not used.

        Returns:
            Union[torch.Tensor, tuple(torch.Tensor, EmbedderOutput)]: The computed loss. If
            ``return_outputs`` is ``True``, also returns the model's outputs in a tuple
            ``(loss, outputs)``.
        """
        outputs = model(**inputs)
        loss = outputs.loss
        if return_outputs:
            return loss, outputs
        return loss
================================================
FILE: FlagEmbedding/abc/finetune/embedder/__init__.py
================================================
from .AbsArguments import (
AbsEmbedderDataArguments,
AbsEmbedderModelArguments,
AbsEmbedderTrainingArguments,
)
from .AbsDataset import (
AbsEmbedderCollator, AbsEmbedderSameDatasetCollator,
AbsEmbedderSameDatasetTrainDataset,
AbsEmbedderTrainDataset,
EmbedderTrainerCallbackForDataRefresh,
)
from .AbsModeling import AbsEmbedderModel, EmbedderOutput
from .AbsTrainer import AbsEmbedderTrainer
from .AbsRunner import AbsEmbedderRunner
# Public API of this package: exactly the names imported above from the
# argument, dataset, modeling, trainer and runner modules.
__all__ = [
"AbsEmbedderModelArguments",
"AbsEmbedderDataArguments",
"AbsEmbedderTrainingArguments",
"AbsEmbedderModel",
"AbsEmbedderTrainer",
"AbsEmbedderRunner",
"AbsEmbedderTrainDataset",
"AbsEmbedderCollator",
"AbsEmbedderSameDatasetTrainDataset",
"AbsEmbedderSameDatasetCollator",
"EmbedderOutput",
"EmbedderTrainerCallbackForDataRefresh",
]
================================================
FILE: FlagEmbedding/abc/finetune/reranker/AbsArguments.py
================================================
import os
from typing import Optional
from dataclasses import dataclass, field
from transformers import TrainingArguments
@dataclass
class AbsRerankerModelArguments:
    """
    Abstract class for reranker model arguments.

    Only ``model_name_or_path`` is required; every other field has a default. Fields whose
    default is ``None`` are annotated as ``Optional`` to reflect that.
    """

    model_name_or_path: str = field(
        metadata={"help": "The model checkpoint for initialization."}
    )
    config_name: Optional[str] = field(
        default=None,
        metadata={"help": "Pretrained config name or path if not the same as model_name."}
    )
    tokenizer_name: Optional[str] = field(
        default=None,
        metadata={"help": "Pretrained tokenizer name or path if not the same as model_name."}
    )
    cache_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Where do you want to store the pre-trained models downloaded from s3."}
    )
    trust_remote_code: bool = field(
        default=False,
        metadata={"help": "Trust remote code"}
    )
    model_type: str = field(
        default='encoder',
        metadata={"help": "Type of finetune, ['encoder', 'decoder']"}
    )
    use_fast_tokenizer: bool = field(
        default=True,
        metadata={"help": "Whether to use fast tokenizer or not."}
    )
    # Read from the HF_TOKEN environment variable at instantiation time; None if it is unset.
    token: Optional[str] = field(
        default_factory=lambda: os.getenv('HF_TOKEN', None),
        metadata={"help": "The token to use when accessing the model."}
    )
@dataclass
class AbsRerankerDataArguments:
    """
    Abstract class for reranker data arguments.

    After initialization, literal ``\\n`` sequences in the instruction formats are turned
    into real newlines and every training data path is checked for existence.

    Raises:
        FileNotFoundError: If one of the paths in ``train_data`` does not exist.
    """

    # Annotated as ``str`` (the type of one path); with ``nargs="+"`` the parsed value is a
    # list of paths, and ``__post_init__`` wraps a single string into a list.
    train_data: str = field(
        default=None, metadata={
            "help": "One or more paths to training data. `query: str`, `pos: List[str]`, `neg: List[str]` are required in the training data.",
            "nargs": "+"
        }
    )
    cache_path: Optional[str] = field(
        default=None, metadata={"help": "Where do you want to store the cached data"}
    )
    train_group_size: int = field(default=8)

    query_max_len: int = field(
        default=32,
        metadata={
            "help": "The maximum total input sequence length after tokenization for query. Sequences longer than this will be truncated."
        },
    )
    passage_max_len: int = field(
        default=128,
        metadata={
            "help": "The maximum total input sequence length after tokenization for passage. Sequences longer than this will be truncated."
        },
    )
    max_len: int = field(
        default=512,
        metadata={
            "help": "The maximum total input sequence length after tokenization. Sequences longer than this will be truncated."
        },
    )
    pad_to_multiple_of: Optional[int] = field(
        default=None,
        metadata={
            "help": "If set will pad the sequence to be a multiple of the provided value."
        },
    )

    max_example_num_per_dataset: int = field(
        default=100000000, metadata={"help": "the max number of examples for each dataset"}
    )

    query_instruction_for_rerank: Optional[str] = field(
        default=None, metadata={"help": "instruction for query"}
    )
    query_instruction_format: str = field(
        default="{}{}", metadata={"help": "format for query instruction"}
    )

    knowledge_distillation: bool = field(
        default=False,
        metadata={"help": "Use knowledge distillation when `pos_scores: List[float]` and `neg_scores: List[float]` are in features of training data"}
    )

    passage_instruction_for_rerank: Optional[str] = field(
        default=None, metadata={"help": "instruction for passage"}
    )
    passage_instruction_format: Optional[str] = field(
        default="{}{}", metadata={"help": "format for passage instruction"}
    )

    shuffle_ratio: float = field(
        default=0.0, metadata={"help": "The ratio of shuffling the text"}
    )

    sep_token: str = field(
        default='\n', metadata={"help": "The sep token for LLM reranker to discriminate between query and passage"}
    )

    def __post_init__(self):
        """Normalize the instruction formats and validate the training data paths."""
        # replace "\\n" with "\n" (a newline typed on the command line arrives escaped)
        if self.query_instruction_format is not None and "\\n" in self.query_instruction_format:
            self.query_instruction_format = self.query_instruction_format.replace("\\n", "\n")
        if self.passage_instruction_format is not None and "\\n" in self.passage_instruction_format:
            self.passage_instruction_format = self.passage_instruction_format.replace("\\n", "\n")

        # Nothing to validate when no training data was given (the field defaults to None).
        if self.train_data is None:
            return

        # A single path passed programmatically would otherwise be iterated char by char.
        if isinstance(self.train_data, str):
            self.train_data = [self.train_data]

        # check the existence of train data
        for train_dir in self.train_data:
            if not os.path.exists(train_dir):
                raise FileNotFoundError(f"cannot find file: {train_dir}, please set a true path")
@dataclass
class AbsRerankerTrainingArguments(TrainingArguments):
    """Training arguments for rerankers: ``TrainingArguments`` plus ``sub_batch_size``."""

    sub_batch_size: Optional[int] = field(
        default=None,
        metadata={"help": "sub batch size for training, not implemented yet"},
    )
================================================
FILE: FlagEmbedding/abc/finetune/reranker/AbsDataset.py
================================================
import os
import math
import random
import logging
import datasets
import numpy as np
import torch.distributed as dist
from dataclasses import dataclass
from torch.utils.data import Dataset
from transformers import (
PreTrainedTokenizer,
DataCollatorWithPadding,
BatchEncoding,
DataCollatorForSeq2Seq
)
from typing import List
from .AbsArguments import AbsRerankerDataArguments
logger = logging.getLogger(__name__)
class AbsRerankerTrainDataset(Dataset):
    """Abstract class for reranker training dataset.

    Every ``.json`` / ``.jsonl`` file named in ``args.train_data`` (directly, or found in a
    listed directory) is loaded; empty datasets are skipped and the rest are concatenated.

    Args:
        args (AbsRerankerDataArguments): Data arguments.
        tokenizer (PreTrainedTokenizer): Tokenizer to use.
    """
    def __init__(
        self,
        args: AbsRerankerDataArguments,
        tokenizer: PreTrainedTokenizer
    ):
        self.args = args
        self.tokenizer = tokenizer

        train_datasets = []
        for data_path in args.train_data:
            # A directory contributes all of its entries, a plain path only itself.
            if os.path.isdir(data_path):
                candidates = [os.path.join(data_path, name) for name in os.listdir(data_path)]
            else:
                candidates = [data_path]

            for file_path in candidates:
                if not file_path.endswith(('.json', '.jsonl')):
                    continue
                temp_dataset = self._load_dataset(file_path)
                if len(temp_dataset) > 0:
                    train_datasets.append(temp_dataset)

        self.dataset = datasets.concatenate_datasets(train_datasets)
        self.max_length = self.args.query_max_len + self.args.passage_max_len

    def _load_dataset(self, file_path: str):
        """Load dataset from path.

        The dataset is randomly down-sampled to ``max_example_num_per_dataset`` rows. Teacher
        score columns are dropped unless knowledge distillation is enabled, in which case
        they are required.

        Args:
            file_path (str): Path to load the datasets from.

        Raises:
            ValueError: `pos_scores` and `neg_scores` not found in the features of training data

        Returns:
            datasets.Dataset: Loaded HF dataset.
        """
        # Only the first process (or a non-distributed run) logs the loading message.
        safe_rank = dist.get_rank() if dist.is_initialized() else 0
        if safe_rank == 0:
            logger.info(f'loading data from {file_path} ...')

        temp_dataset = datasets.load_dataset('json', data_files=file_path, split='train', cache_dir=self.args.cache_path)
        if len(temp_dataset) > self.args.max_example_num_per_dataset:
            temp_dataset = temp_dataset.select(random.sample(list(range(len(temp_dataset))), self.args.max_example_num_per_dataset))

        if not self.args.knowledge_distillation:
            if 'pos_scores' in temp_dataset.column_names:
                temp_dataset = temp_dataset.remove_columns(['pos_scores'])
            if 'neg_scores' in temp_dataset.column_names:
                temp_dataset = temp_dataset.remove_columns(['neg_scores'])
        else:
            if 'pos_scores' not in temp_dataset.column_names or 'neg_scores' not in temp_dataset.column_names:
                raise ValueError(f"`pos_scores` and `neg_scores` not found in the features of training data in {file_path}, which is necessary when using knowledge distillation.")
        return temp_dataset

    def _shuffle_text(self, text):
        """shuffle the input text.

        With probability ``shuffle_ratio``, a text longer than 100 characters is cut into
        chunks of ``len(text) // 3 + 1`` characters that are shuffled and joined with spaces.

        Args:
            text (str): Input text.

        Returns:
            str: Shuffled text.
        """
        if self.args.shuffle_ratio > 0 and len(text) > 100 and random.random() < self.args.shuffle_ratio:
            chunk_size = len(text) // 3 + 1
            split_text = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
            random.shuffle(split_text)
            return " ".join(split_text)
        return text

    def __len__(self):
        return len(self.dataset)

    def create_one_example(self, qry_encoding: str, doc_encoding: str):
        """Creates a single input example by encoding and preparing a query and document pair for the model.

        Args:
            qry_encoding (str): Query to be encoded.
            doc_encoding (str): Document to be encoded.

        Returns:
            dict: A dictionary containing tokenized and prepared inputs, ready for model consumption.
        """
        # Each side is first truncated on its own with some slack, then the pair is cut to
        # the combined budget by truncating only the document.
        qry_inputs = self.tokenizer.encode(qry_encoding, truncation=True, max_length=self.args.query_max_len + self.args.passage_max_len // 4, add_special_tokens=False)
        doc_inputs = self.tokenizer.encode(doc_encoding, truncation=True, max_length=self.args.passage_max_len + self.args.query_max_len // 2, add_special_tokens=False)
        item = self.tokenizer.prepare_for_model(
            qry_inputs,
            doc_inputs,
            truncation='only_second',
            max_length=self.args.query_max_len + self.args.passage_max_len,
            padding=False,
        )
        return item

    def _sample_passages(self, data):
        """Pick one positive and ``train_group_size - 1`` negatives from a training row.

        Args:
            data (dict): A dataset row with ``pos`` and ``neg`` lists (plus ``pos_scores`` and
                ``neg_scores`` when knowledge distillation is enabled).

        Raises:
            ValueError: If negatives are needed but ``neg`` is empty, or if a teacher score
                is not a number.

        Returns:
            Tuple[List[str], Optional[List[float]]]: The passages (positive first) and the
            matching teacher scores, or ``None`` without knowledge distillation.
        """
        assert isinstance(data['pos'], list) and isinstance(data['neg'], list)
        num_negatives = self.args.train_group_size - 1

        # Only the positive goes through the optional text shuffling.
        pos_idx = random.choice(list(range(len(data['pos']))))
        passages = [self._shuffle_text(data['pos'][pos_idx])]

        neg_all_idx = list(range(len(data['neg'])))
        if num_negatives > 0 and not neg_all_idx:
            raise ValueError(
                f"Training example has an empty `neg` list, but train_group_size="
                f"{self.args.train_group_size} requires {num_negatives} negative(s)."
            )
        if len(neg_all_idx) < num_negatives:
            # Too few negatives: repeat the pool so sampling without replacement can succeed.
            repeats = math.ceil(num_negatives / len(neg_all_idx))
            neg_idxs = random.sample(neg_all_idx * repeats, num_negatives)
        else:
            neg_idxs = random.sample(neg_all_idx, num_negatives)
        passages.extend(data['neg'][neg_idx] for neg_idx in neg_idxs)

        if not self.args.knowledge_distillation:
            return passages, None

        assert isinstance(data['pos_scores'], list) and isinstance(data['neg_scores'], list)
        teacher_scores = [data['pos_scores'][pos_idx]]
        teacher_scores.extend(data['neg_scores'][neg_idx] for neg_idx in neg_idxs)
        if not all(isinstance(score, (int, float)) for score in teacher_scores):
            raise ValueError("pos_score or neg_score must be digit")
        return passages, teacher_scores

    def __getitem__(self, item):
        """Build the query-passage pairs of one training group.

        Returns:
            Tuple[list, Optional[List[float]]]: One prepared example per passage (positive
            first) and the teacher scores, or ``None`` without knowledge distillation.
        """
        data = self.dataset[item]

        query = data['query']
        if self.args.query_instruction_for_rerank is not None:
            # A per-row `query_prompt` takes precedence over the global instruction.
            query = self.args.query_instruction_format.format(
                data['query_prompt'] if 'query_prompt' in data else self.args.query_instruction_for_rerank,
                query
            )

        passages, teacher_scores = self._sample_passages(data)

        if self.args.passage_instruction_for_rerank is not None:
            passages = [
                self.args.passage_instruction_format.format(
                    data['passage_prompt'] if 'passage_prompt' in data else self.args.passage_instruction_for_rerank, p
                )
                for p in passages
            ]

        batch_data = [self.create_one_example(query, passage) for passage in passages]
        return batch_data, teacher_scores
@dataclass
class AbsRerankerCollator(DataCollatorWithPadding):
    """
    The abstract reranker collator.

    Each incoming feature is a ``(examples, teacher_scores)`` pair as produced by the
    dataset's ``__getitem__``; the groups are flattened and the examples padded together.
    """
    query_max_len: int = 32
    passage_max_len: int = 128

    def __call__(self, features) -> dict:
        """Flatten and pad a batch of training groups.

        Args:
            features (list): ``(examples, teacher_scores)`` pairs, one per training group.

        Returns:
            dict: ``{"pair": <padded batch>, "teacher_scores": <flat list or None>}``.
        """
        teacher_scores = [f[1] for f in features]
        if teacher_scores[0] is None:
            teacher_scores = None
        elif isinstance(teacher_scores[0], list):
            # Flatten the per-group score lists in one linear pass.
            teacher_scores = [score for group in teacher_scores for score in group]

        features = [f[0] for f in features]
        if isinstance(features[0], list):
            features = [example for group in features for example in group]

        collated = self.tokenizer.pad(
            features,
            padding=self.padding,
            max_length=self.query_max_len + self.passage_max_len,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors=self.return_tensors,
        )
        return {
            "pair": collated,
            "teacher_scores": teacher_scores,
        }
class AbsLLMRerankerTrainDataset(AbsRerankerTrainDataset):
    """Abstract class for LLM reranker training dataset.

    Each example is laid out as ``[bos] query sep passage sep prompt``, where the BOS token
    is only added when the tokenizer has one that differs from its padding token.

    Args:
        args (AbsRerankerDataArguments): Data arguments.
        tokenizer (PreTrainedTokenizer): Tokenizer to use.
    """
    def __init__(
        self,
        args: AbsRerankerDataArguments,
        tokenizer: PreTrainedTokenizer
    ):
        super().__init__(args, tokenizer)
        # Token ids of the separator placed between query, passage and prompt.
        sep = self.args.sep_token
        self.sep_inputs = self.tokenizer(
            sep,
            return_tensors=None,
            add_special_tokens=False
        )['input_ids']

    def __getitem__(self, item) -> tuple:
        """Build the tokenized inputs of one training group.

        Raises:
            ValueError: If negatives are needed but ``neg`` is empty, or if a teacher score
                is not a number.

        Returns:
            tuple: ``(passages_inputs, teacher_scores)``: one tokenized input per passage
            (positive first) and the teacher scores, or ``None`` without knowledge distillation.
        """
        data = self.dataset[item]
        train_group_size = self.args.train_group_size

        query = data['query']
        if self.args.query_instruction_for_rerank is not None:
            # A per-row `query_prompt` takes precedence over the global instruction.
            query = self.args.query_instruction_format.format(
                data['query_prompt'] if 'query_prompt' in data else self.args.query_instruction_for_rerank,
                query
            )

        assert isinstance(data['pos'], list) and isinstance(data['neg'], list)

        # Only the positive goes through the optional text shuffling.
        pos_idx = random.choice(list(range(len(data['pos']))))
        passages = [self._shuffle_text(data['pos'][pos_idx])]

        num_negatives = train_group_size - 1
        neg_all_idx = list(range(len(data['neg'])))
        if num_negatives > 0 and not neg_all_idx:
            raise ValueError(
                f"Training example has an empty `neg` list, but train_group_size="
                f"{train_group_size} requires {num_negatives} negative(s)."
            )
        if len(neg_all_idx) < num_negatives:
            # Too few negatives: repeat the pool so sampling without replacement can succeed.
            repeats = math.ceil(num_negatives / len(neg_all_idx))
            neg_idxs = random.sample(neg_all_idx * repeats, num_negatives)
        else:
            neg_idxs = random.sample(neg_all_idx, num_negatives)
        passages.extend(data['neg'][neg_idx] for neg_idx in neg_idxs)

        if self.args.knowledge_distillation:
            assert isinstance(data['pos_scores'], list) and isinstance(data['neg_scores'], list)
            teacher_scores = [data['pos_scores'][pos_idx]]
            teacher_scores.extend(data['neg_scores'][neg_idx] for neg_idx in neg_idxs)
            if not all(isinstance(score, (int, float)) for score in teacher_scores):
                raise ValueError("pos_score or neg_score must be digit")
        else:
            teacher_scores = None

        if self.args.passage_instruction_for_rerank is not None:
            passages = [
                self.args.passage_instruction_format.format(
                    data['passage_prompt'] if 'passage_prompt' in data else self.args.passage_instruction_for_rerank, p
                )
                for p in passages
            ]

        # The row may carry its own prompt; otherwise the default yes/no question is used.
        prompt = data.get('prompt', "Given a query A and a passage B, determine whether the passage contains an answer to the query by providing a prediction of either 'Yes' or 'No'.")

        query_inputs = self.tokenizer(
            query,
            return_tensors=None,
            max_length=self.args.query_max_len + self.args.passage_max_len // 4,
            truncation=True,
            add_special_tokens=False
        )
        prompt_inputs = self.tokenizer(
            prompt,
            return_tensors=None,
            add_special_tokens=False
        )['input_ids']

        # Budget left for "query + sep + passage" once the trailing "sep + prompt" is reserved.
        max_length = self.max_length - len(prompt_inputs) - len(self.sep_inputs)

        # Prepend BOS only when the tokenizer has one that is not also its padding token.
        bos_token_id = self.tokenizer.bos_token_id
        if bos_token_id is not None and bos_token_id != self.tokenizer.pad_token_id:
            query_ids = [bos_token_id] + query_inputs['input_ids']
        else:
            query_ids = query_inputs['input_ids']

        passages_inputs = []
        for passage in passages:
            passage_inputs = self.tokenizer(
                passage,
                return_tensors=None,
                max_length=self.args.passage_max_len + self.args.query_max_len // 2,
                truncation=True,
                add_special_tokens=False
            )
            pair = self.tokenizer.prepare_for_model(
                query_ids,
                self.sep_inputs + passage_inputs['input_ids'],
                truncation='only_second',
                max_length=max_length,
                padding=False,
                return_attention_mask=False,
                return_token_type_ids=False,
                add_special_tokens=False
            )
            passage_inputs['input_ids'] = pair['input_ids'] + self.sep_inputs + prompt_inputs
            passage_inputs['attention_mask'] = [1] * len(passage_inputs['input_ids'])
            passage_inputs.pop('token_type_ids', None)
            if 'position_ids' in passage_inputs.keys():
                passage_inputs['position_ids'] = list(range(len(passage_inputs['input_ids'])))
            passages_inputs.append(passage_inputs)

        return passages_inputs, teacher_scores
@dataclass
class AbsLLMRerankerCollator(DataCollatorForSeq2Seq):
    """
    Collator for LLM reranker training batches.

    Each incoming feature is a ``(examples, teacher_scores)`` pair. The groups are flattened,
    optional ``labels`` are padded to a common length, and the examples are padded together
    with the tokenizer.
    """
    query_max_len: int = 32
    passage_max_len: int = 128

    def __call__(self, features, return_tensors='pt'):
        """Flatten and pad a batch of training groups.

        Args:
            features (list): ``(examples, teacher_scores)`` pairs, one per training group.
            return_tensors (str, optional): Tensor type to return. Defaults to ``'pt'``;
                ``None`` falls back to ``self.return_tensors``.

        Returns:
            dict: ``{"pair": <padded batch>, "teacher_scores": <flat list or None>}``.
        """
        if return_tensors is None:
            return_tensors = self.return_tensors

        score_groups = [f[1] for f in features]
        if score_groups[0] is None:
            teacher_scores = None
        elif isinstance(score_groups[0], list):
            teacher_scores = [score for group in score_groups for score in group]
        else:
            teacher_scores = score_groups

        features = [f[0] for f in features]
        if isinstance(features[0], list):
            features = [example for group in features for example in group]

        # We have to pad the labels before calling `tokenizer.pad` as this method won't pad
        # them and needs them of the same length to return tensors.
        if "labels" in features[0].keys():
            target_length = max(len(feature["labels"]) for feature in features)
            multiple = self.pad_to_multiple_of
            if multiple is not None:
                # Round up to the next multiple.
                target_length = (target_length + multiple - 1) // multiple * multiple

            pad_right = self.tokenizer.padding_side == "right"
            for feature in features:
                current = feature["labels"]
                filler = [self.label_pad_token_id] * (target_length - len(current))
                if isinstance(current, list):
                    feature["labels"] = current + filler if pad_right else filler + current
                else:
                    # Non-list labels are concatenated with numpy and cast to int64.
                    pieces = [current, filler] if pad_right else [filler, current]
                    feature["labels"] = np.concatenate(pieces).astype(np.int64)

        collated = self.tokenizer.pad(
            features,
            padding=self.padding,
            max_length=self.query_max_len + self.passage_max_len,
            return_tensors=return_tensors,
            pad_to_multiple_of=self.pad_to_multiple_of,
        )
        return {
            "pair": collated,
            "teacher_scores": teacher_scores,
        }
================================================
FILE: FlagEmbedding/abc/finetune/reranker/AbsModeling.py
================================================
import torch
from torch import nn, Tensor
from transformers import PreTrainedTokenizer
from transformers.file_utils import ModelOutput
import logging
from dataclasses import dataclass
from abc import ABC, abstractmethod
from typing import Dict, Optional, List, Union
logger = logging.getLogger(__name__)
@dataclass
class RerankerOutput(ModelOutput):
    """Container for the values returned by the reranker model's ``forward``."""
    # Training loss; stays ``None`` when no loss was computed.
    loss: Optional[Tensor] = None
    # Ranking scores produced by the model for the input pairs.
    scores: Optional[Tensor] = None
class AbsRerankerModel(ABC, nn.Module):
    """Abstract class of reranker model for training.

    Args:
        base_model: The base model to train on.
        tokenizer (PreTrainedTokenizer, optional): The tokenizer to use. Defaults to ``None``.
        train_batch_size (int, optional): Batch size used for training. Defaults to ``4``.
    """
    def __init__(
        self,
        base_model: None,
        tokenizer: PreTrainedTokenizer = None,
        train_batch_size: int = 4,
    ):
        nn.Module.__init__(self)
        self.model = base_model
        self.tokenizer = tokenizer
        self.cross_entropy = nn.CrossEntropyLoss(reduction='mean')

        # Borrow the tokenizer's pad token id when the model config does not define one.
        if self.model.config.pad_token_id is None:
            self.model.config.pad_token_id = self.tokenizer.pad_token_id
        self.config = self.model.config

        self.train_batch_size = train_batch_size

        # Id of the last token produced when tokenizing 'Yes' without special tokens.
        self.yes_loc = self.tokenizer('Yes', add_special_tokens=False)['input_ids'][-1]

    def gradient_checkpointing_enable(self, **kwargs):
        """
        Activates gradient checkpointing for the current model.
        """
        self.model.gradient_checkpointing_enable(**kwargs)

    def enable_input_require_grads(self, **kwargs):
        """
        Enables the gradients for the input embeddings.
        """
        self.model.enable_input_require_grads(**kwargs)

    @abstractmethod
    def encode(self, features):
        """Abstract method of encode.

        Args:
            features (dict): Features to pass to the model.
        """
        pass

    def forward(self, pair: Union[Dict[str, Tensor], List[Dict[str, Tensor]]] = None, teacher_scores: Optional[Tensor] = None):
        """The computation performed at every call.

        Args:
            pair (Union[Dict[str, Tensor], List[Dict[str, Tensor]]], optional): The query-document pair. Defaults to ``None``.
            teacher_scores (Optional[Tensor], optional): Teacher scores of knowledge distillation. Defaults to None.

        Returns:
            RerankerOutput: Output of reranker model. ``loss`` is ``None`` outside training mode.
        """
        ranker_logits = self.encode(pair)

        if not self.training:
            # Teacher targets are only needed for the training loss, so they are not built here.
            return RerankerOutput(
                loss=None,
                scores=ranker_logits,
            )

        # NOTE(review): this reshape assumes every training batch holds exactly
        # ``train_batch_size`` groups; confirm the dataloader never yields a smaller last batch.
        grouped_logits = ranker_logits.view(self.train_batch_size, -1)
        # The target class is index 0 of every group.
        target = torch.zeros(self.train_batch_size, device=grouped_logits.device, dtype=torch.long)
        loss = self.compute_loss(grouped_logits, target)

        if teacher_scores is not None:
            # ``as_tensor`` accepts both Python lists and tensors on any device, unlike the
            # legacy ``torch.Tensor(...)`` constructor.
            teacher_targets = torch.as_tensor(
                teacher_scores, dtype=torch.float32, device=grouped_logits.device
            )
            teacher_targets = teacher_targets.view(self.train_batch_size, -1)
            teacher_targets = torch.softmax(teacher_targets.detach(), dim=-1)
            # Add the cross entropy between the teacher distribution and the student distribution.
            distill_loss = - torch.mean(
                torch.sum(torch.log_softmax(grouped_logits, dim=-1) * teacher_targets, dim=-1)
            )
            loss = loss + distill_loss

        return RerankerOutput(
            loss=loss,
            scores=ranker_logits,
        )

    def compute_loss(self, scores, target):
        """Compute the loss.

        Args:
            scores (torch.Tensor): Computed scores.
            target (torch.Tensor): The target value.

        Returns:
            torch.Tensor: The computed loss.
        """
        return self.cross_entropy(scores, target)

    def save(self, output_dir: str):
        """Save the model.

        Args:
            output_dir (str): Directory for saving the model.
        """
        state_dict = self.model.state_dict()
        # Hand ``save_pretrained`` cloned CPU copies of every entry, in a mapping of the same type.
        state_dict = type(state_dict)(
            {k: v.clone().cpu() for k, v in state_dict.items()}
        )
        self.model.save_pretrained(output_dir, state_dict=state_dict)

    def save_pretrained(self, *args, **kwargs):
        """
        Save the tokenizer and model.
        """
        self.tokenizer.save_pretrained(*args, **kwargs)
        return self.model.save_pretrained(*args, **kwargs)
================================================
FILE: FlagEmbedding/abc/finetune/reranker/AbsRunner.py
================================================
import os
import logging
from pathlib import Path
from typing import Tuple
from abc import ABC, abstractmethod
from transformers import set_seed, PreTrainedTokenizer
from .AbsArguments import (
AbsRerankerModelArguments,
AbsRerankerDataArguments,
AbsRerankerTrainingArguments
)
from .AbsTrainer import AbsRerankerTrainer
from .AbsModeling import AbsRerankerModel
from .AbsDataset import (
AbsRerankerTrainDataset, AbsRerankerCollator,
AbsLLMRerankerTrainDataset, AbsLLMRerankerCollator
)
logger = logging.getLogger(__name__)
class AbsRerankerRunner(ABC):
    """Abstract driver that wires together everything needed to fine-tune a reranker.

    Args:
        model_args (AbsRerankerModelArguments): Model arguments
        data_args (AbsRerankerDataArguments): Data arguments.
        training_args (AbsRerankerTrainingArguments): Training arguments.
    """
    def __init__(
        self,
        model_args: AbsRerankerModelArguments,
        data_args: AbsRerankerDataArguments,
        training_args: AbsRerankerTrainingArguments
    ):
        self.model_args = model_args
        self.data_args = data_args
        self.training_args = training_args

        # Refuse to train into a populated output directory unless overwriting was requested.
        output_dir_in_use = (
            os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir)
        )
        if output_dir_in_use and training_args.do_train and not training_args.overwrite_output_dir:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            )

        # Setup logging: ranks -1 and 0 log at INFO, every other rank at WARN.
        if training_args.local_rank in [-1, 0]:
            log_level = logging.INFO
        else:
            log_level = logging.WARN
        logging.basicConfig(
            format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
            datefmt="%m/%d/%Y %H:%M:%S",
            level=log_level,
        )
        is_distributed = bool(training_args.local_rank != -1)
        logger.warning(
            "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
            training_args.local_rank,
            training_args.device,
            training_args.n_gpu,
            is_distributed,
            training_args.fp16,
        )
        logger.info("Training/evaluation parameters %s", training_args)
        logger.info("Model parameters %s", model_args)
        logger.info("Data parameters %s", data_args)

        # Set seed
        set_seed(training_args.seed)

        # Build the components in dependency order.
        self.tokenizer, self.model = self.load_tokenizer_and_model()
        self.train_dataset = self.load_train_dataset()
        self.data_collator = self.load_data_collator()
        self.trainer = self.load_trainer()

    @abstractmethod
    def load_tokenizer_and_model(self) -> Tuple[PreTrainedTokenizer, AbsRerankerModel]:
        """Abstract method to load the tokenizer and model.

        Returns:
            Tuple[PreTrainedTokenizer, AbsRerankerModel]: Loaded tokenizer and model instances.
        """
        pass

    @abstractmethod
    def load_trainer(self) -> AbsRerankerTrainer:
        """Abstract method to load the trainer.

        Returns:
            AbsRerankerTrainer: The loaded trainer instance.
        """
        pass

    def load_train_dataset(self) -> AbsRerankerTrainDataset:
        """Build the training dataset that matches ``model_args.model_type``.

        Returns:
            AbsRerankerTrainDataset: The loaded dataset instance.
        """
        uses_encoder = self.model_args.model_type == 'encoder'
        dataset_cls = AbsRerankerTrainDataset if uses_encoder else AbsLLMRerankerTrainDataset
        return dataset_cls(
            args=self.data_args,
            tokenizer=self.tokenizer
        )

    def load_data_collator(self) -> AbsRerankerCollator:
        """Build the data collator that matches ``model_args.model_type``.

        Returns:
            AbsRerankerCollator: Loaded data collator.
        """
        uses_encoder = self.model_args.model_type == 'encoder'
        collator_cls = AbsRerankerCollator if uses_encoder else AbsLLMRerankerCollator
        return collator_cls(
            tokenizer=self.tokenizer,
            query_max_len=self.data_args.query_max_len,
            passage_max_len=self.data_args.passage_max_len,
            pad_to_multiple_of=self.data_args.pad_to_multiple_of,
            padding=True,
            return_tensors="pt"
        )

    def run(self):
        """
        Create the output directory, train, and save the resulting model.
        """
        output_path = Path(self.training_args.output_dir)
        output_path.mkdir(parents=True, exist_ok=True)

        # Training
        checkpoint = self.training_args.resume_from_checkpoint
        self.trainer.train(resume_from_checkpoint=checkpoint)
        self.trainer.save_model()
================================================
FILE: FlagEmbedding/abc/finetune/reranker/AbsTrainer.py
================================================
import logging
from typing import Optional
from abc import ABC, abstractmethod
from transformers.trainer import Trainer
logger = logging.getLogger(__name__)
class AbsRerankerTrainer(ABC, Trainer):
    """
    Abstract trainer for reranker models; subclasses supply the checkpoint-saving logic.
    """
    @abstractmethod
    def _save(self, output_dir: Optional[str] = None, state_dict=None):
        pass

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """
        Run the model on ``inputs`` and extract the loss from its output.

        Args:
            model (AbsRerankerModel): The model being trained.
            inputs (dict): A dictionary of input tensors to be passed to the model.
            return_outputs (bool, optional): If ``True``, returns both the loss and the model's outputs. Otherwise,
                returns only the loss. Defaults to ``False``.

        Returns:
            Union[torch.Tensor, tuple(torch.Tensor, RerankerOutput)]: The computed loss. If ``return_outputs`` is ``True``,
                also returns the model's outputs in a tuple ``(loss, outputs)``.
        """
        model_outputs = model(**inputs)
        batch_loss = model_outputs.loss
        if return_outputs:
            return (batch_loss, model_outputs)
        return batch_loss
================================================
FILE: FlagEmbedding/abc/finetune/reranker/__init__.py
================================================
from .AbsArguments import AbsRerankerDataArguments, AbsRerankerModelArguments, AbsRerankerTrainingArguments
from .AbsDataset import (
AbsRerankerTrainDataset, AbsRerankerCollator,
AbsLLMRerankerTrainDataset, AbsLLMRerankerCollator
)
from .AbsModeling import AbsRerankerModel, RerankerOutput
from .AbsTrainer import AbsRerankerTrainer
from .AbsRunner import AbsRerankerRunner
# Names exported by this package: the argument classes, datasets, collators, model, trainer
# and runner imported above.
__all__ = [
"AbsRerankerDataArguments",
"AbsRerankerModelArguments",
"AbsRerankerTrainingArguments",
"AbsRerankerTrainDataset",
"AbsRerankerCollator",
"AbsLLMRerankerTrainDataset",
"AbsLLMRerankerCollator",
"AbsRerankerModel",
"RerankerOutput",
"AbsRerankerTrainer",
"AbsRerankerRunner",
]
================================================
FILE: FlagEmbedding/abc/inference/AbsEmbedder.py
================================================
import logging
from tqdm import tqdm, trange
from abc import ABC, abstractmethod
from typing import Any, Union, List, Dict, Literal, Optional
import queue
import multiprocessing as mp
from multiprocessing import Queue
import math
import gc
import torch
import numpy as np
from transformers import is_torch_npu_available
try:
import torch_musa
except Exception:
pass
logger = logging.getLogger(__name__)
class AbsEmbedder(ABC):
    """
    Base class for embedder.
    Extend this class and implement :meth:`encode_single_device` for custom embedders;
    :meth:`encode_queries`, :meth:`encode_corpus` and :meth:`encode` are built on top of it.

    Args:
        model_name_or_path (str): If it's a path to a local model, it loads the model from the path. Otherwise tries to download and
            load a model from HuggingFace Hub with the name.
        normalize_embeddings (bool, optional): If True, normalize the embedding vector. Defaults to :data:`True`.
        use_fp16 (bool, optional): If true, use half-precision floating-point to speed up computation with a slight performance
            degradation. Defaults to :data:`True`.
        query_instruction_for_retrieval: (Optional[str], optional): Query instruction for retrieval tasks, which will be used
            with :attr:`query_instruction_format`. Defaults to :data:`None`.
        query_instruction_format: (str, optional): The template for :attr:`query_instruction_for_retrieval`. Defaults to :data:`"{}{}"`.
        devices (Optional[Union[str, int, List[str], List[int]]], optional): Devices to use for model inference. Defaults to :data:`None`.
        batch_size (int, optional): Batch size for inference. Defaults to :data:`256`.
        query_max_length (int, optional): Maximum length for query. Defaults to :data:`512`.
        passage_max_length (int, optional): Maximum length for passage. Defaults to :data:`512`.
        convert_to_numpy (bool, optional): If True, the output embedding will be a Numpy array. Otherwise, it will be a Torch Tensor.
            Defaults to :data:`True`.
        kwargs (Dict[Any], optional): Additional parameters for HuggingFace Transformers config or children classes.
    """

    def __init__(
        self,
        model_name_or_path: str,
        normalize_embeddings: bool = True,
        use_fp16: bool = True,
        query_instruction_for_retrieval: Optional[str] = None,
        query_instruction_format: str = "{}{}", # specify the format of query_instruction_for_retrieval
        devices: Optional[Union[str, int, List[str], List[int]]] = None,
        # inference
        batch_size: int = 256,
        query_max_length: int = 512,
        passage_max_length: int = 512,
        convert_to_numpy: bool = True,
        **kwargs: Any,
    ):
        self.model_name_or_path = model_name_or_path
        self.normalize_embeddings = normalize_embeddings
        self.use_fp16 = use_fp16
        self.query_instruction_for_retrieval = query_instruction_for_retrieval
        self.query_instruction_format = query_instruction_format
        self.target_devices = self.get_target_devices(devices)

        self.batch_size = batch_size
        self.query_max_length = query_max_length
        self.passage_max_length = passage_max_length
        self.convert_to_numpy = convert_to_numpy

        # Extra keyword arguments become attributes and are also kept as a dict.
        for k in kwargs:
            setattr(self, k, kwargs[k])

        self.kwargs = kwargs

        # tokenizer and model are initialized in the child class
        self.tokenizer = None
        self.model = None
        self.pool = None

    def stop_self_pool(self):
        """Stop the worker pool (if any), move the model to CPU and trigger garbage collection.

        This is also called from :meth:`__del__`, so it tolerates a partially initialised
        instance and treats the device cleanup as best effort.
        """
        # ``pool`` is missing when ``__init__`` raised before reaching its assignment.
        pool = getattr(self, "pool", None)
        if pool is not None:
            self.stop_multi_process_pool(pool)
            self.pool = None
        try:
            self.model.to('cpu')
            torch.cuda.empty_cache()
        except Exception:
            # Best effort: the model may be unset or the call may fail; nothing more to do.
            pass
        # Guarded because this method is also reached through ``__del__``.
        if gc is not None and callable(gc.collect):
            gc.collect()

    @staticmethod
    def get_target_devices(devices: Union[str, int, List[str], List[int]]) -> List[str]:
        """Normalise a device specification into a list of device strings.

        Args:
            devices (Union[str, int, List[str], List[int]]): specified devices, can be `str`, `int`, list of `str`, or list of `int`.
                ``None`` selects devices automatically (CUDA, then NPU, then MUSA, then MPS, then CPU).

        Raises:
            ValueError: Devices should be a string or an integer or a list of strings or a list of integers.
                An empty list is rejected as well.

        Returns:
            List[str]: A list of target devices in format.
        """
        invalid_devices_msg = "devices should be a string or an integer or a list of strings or a list of integers."

        if devices is None:
            if torch.cuda.is_available():
                return [f"cuda:{i}" for i in range(torch.cuda.device_count())]
            elif is_torch_npu_available():
                return [f"npu:{i}" for i in range(torch.npu.device_count())]
            elif hasattr(torch, "musa") and torch.musa.is_available():
                return [f"musa:{i}" for i in range(torch.musa.device_count())]
            elif torch.backends.mps.is_available():
                try:
                    return [f"mps:{i}" for i in range(torch.mps.device_count())]
                except Exception:
                    # Fall back to the plain device name when the device count is unavailable.
                    return ["mps"]
            else:
                return ["cpu"]
        elif isinstance(devices, str):
            return [devices]
        elif isinstance(devices, int):
            # Integer ids map to MUSA devices when that backend is available, otherwise to CUDA.
            if hasattr(torch, "musa") and torch.musa.is_available():
                return [f"musa:{devices}"]
            else:
                return [f"cuda:{devices}"]
        elif isinstance(devices, list):
            if not devices:
                # An empty list has no first element to inspect; report it as invalid input.
                raise ValueError(invalid_devices_msg)
            if isinstance(devices[0], str):
                return devices
            elif isinstance(devices[0], int):
                if hasattr(torch, "musa") and torch.musa.is_available():
                    return [f"musa:{device}" for device in devices]
                else:
                    return [f"cuda:{device}" for device in devices]
            else:
                raise ValueError(invalid_devices_msg)
        else:
            raise ValueError(invalid_devices_msg)

    @staticmethod
    def get_detailed_instruct(instruction_format: str, instruction: str, sentence: str):
        """Combine the instruction and sentence along with the instruction format.

        Args:
            instruction_format (str): Format for instruction.
            instruction (str): The text of instruction.
            sentence (str): The sentence to concatenate with.

        Returns:
            str: The complete sentence with instruction
        """
        # Turn a literal backslash-n sequence in the template into a real newline.
        if "\\n" in instruction_format:
            instruction_format = instruction_format.replace("\\n", "\n")

        return instruction_format.format(instruction, sentence)

    def encode_queries(
        self,
        queries: Union[List[str], str],
        batch_size: Optional[int] = None,
        max_length: Optional[int] = None,
        convert_to_numpy: Optional[bool] = None,
        **kwargs: Any
    ):
        """encode the queries using the instruction if provided.

        Args:
            queries (Union[List[str], str]): Input queries to encode.
            batch_size (Optional[int], optional): Number of sentences for each iter. Defaults to :data:`None`.
            max_length (Optional[int], optional): Maximum length of tokens. Defaults to :data:`None`.
            convert_to_numpy (Optional[bool], optional): If True, the output embedding will be a Numpy array. Otherwise, it will
                be a Torch Tensor. Defaults to :data:`None`.

        Returns:
            Union[torch.Tensor, np.ndarray]: Return the embedding vectors in a numpy array or tensor.
        """
        if batch_size is None: batch_size = self.batch_size
        if max_length is None: max_length = self.query_max_length
        if convert_to_numpy is None: convert_to_numpy = self.convert_to_numpy

        return self.encode(
            queries,
            batch_size=batch_size,
            max_length=max_length,
            convert_to_numpy=convert_to_numpy,
            instruction=self.query_instruction_for_retrieval,
            instruction_format=self.query_instruction_format,
            **kwargs
        )

    def encode_corpus(
        self,
        corpus: Union[List[str], str],
        batch_size: Optional[int] = None,
        max_length: Optional[int] = None,
        convert_to_numpy: Optional[bool] = None,
        **kwargs: Any
    ):
        """encode the corpus using the instruction if provided.

        Args:
            corpus (Union[List[str], str]): Input corpus to encode.
            batch_size (Optional[int], optional): Number of sentences for each iter. Defaults to :data:`None`.
            max_length (Optional[int], optional): Maximum length of tokens. Defaults to :data:`None`.
            convert_to_numpy (Optional[bool], optional): If True, the output embedding will be a Numpy array. Otherwise, it will
                be a Torch Tensor. Defaults to :data:`None`.

        Returns:
            Union[torch.Tensor, np.ndarray]: Return the embedding vectors in a numpy array or tensor.
        """
        # Passage instruction settings are read from the constructor's extra keyword arguments.
        passage_instruction_for_retrieval = self.kwargs.get("passage_instruction_for_retrieval", None)
        passage_instruction_format = self.kwargs.get("passage_instruction_format", "{}{}")

        if batch_size is None: batch_size = self.batch_size
        if max_length is None: max_length = self.passage_max_length
        if convert_to_numpy is None: convert_to_numpy = self.convert_to_numpy

        return self.encode(
            corpus,
            batch_size=batch_size,
            max_length=max_length,
            convert_to_numpy=convert_to_numpy,
            instruction=passage_instruction_for_retrieval,
            instruction_format=passage_instruction_format,
            **kwargs
        )

    def encode(
        self,
        sentences: Union[List[str], str],
        batch_size: Optional[int] = None,
        max_length: Optional[int] = None,
        convert_to_numpy: Optional[bool] = None,
        instruction: Optional[str] = None,
        instruction_format: Optional[str] = None,
        **kwargs: Any
    ):
        """encode the input sentences with the embedding model.

        A single string, or any input when only one target device is configured, is encoded
        in-process; otherwise the work is spread over a multi-process pool.

        Args:
            sentences (Union[List[str], str]): Input sentences to encode.
            batch_size (Optional[int], optional): Number of sentences for each iter. Defaults to :data:`None`.
            max_length (Optional[int], optional): Maximum length of tokens. Defaults to :data:`None`.
            convert_to_numpy (Optional[bool], optional): If True, the output embedding will be a Numpy array. Otherwise, it will
                be a Torch Tensor. Defaults to :data:`None`.
            instruction (Optional[str], optional): The text of instruction. Defaults to :data:`None`.
            instruction_format (Optional[str], optional): Format for instruction. When an instruction is given
                and this is :data:`None`, ``"{}{}"`` is used. Defaults to :data:`None`.

        Returns:
            Union[torch.Tensor, np.ndarray]: return the embedding vectors in a numpy array or tensor.
        """
        if batch_size is None: batch_size = self.batch_size
        if max_length is None: max_length = self.passage_max_length
        if convert_to_numpy is None: convert_to_numpy = self.convert_to_numpy

        if instruction is not None:
            if instruction_format is None:
                # Without this fallback the default ``None`` would fail inside get_detailed_instruct.
                instruction_format = "{}{}"
            if isinstance(sentences, str):
                sentences = self.get_detailed_instruct(instruction_format, instruction, sentences)
            else:
                sentences = [
                    self.get_detailed_instruct(instruction_format, instruction, sentence)
                    for sentence in sentences
                ]

        if isinstance(sentences, str) or len(self.target_devices) == 1:
            return self.encode_single_device(
                sentences,
                batch_size=batch_size,
                max_length=max_length,
                convert_to_numpy=convert_to_numpy,
                device=self.target_devices[0],
                **kwargs
            )

        # The pool is created on first use and reused by later calls.
        if self.pool is None:
            self.pool = self.start_multi_process_pool(AbsEmbedder._encode_multi_process_worker)
        embeddings = self.encode_multi_process(
            sentences,
            self.pool,
            batch_size=batch_size,
            max_length=max_length,
            convert_to_numpy=convert_to_numpy,
            **kwargs
        )
        return embeddings

    def __del__(self):
        self.stop_self_pool()

    @abstractmethod
    def encode_single_device(
        self,
        sentences: Union[List[str], str],
        batch_size: int = 256,
        max_length: int = 512,
        convert_to_numpy: bool = True,
        device: Optional[str] = None,
        **kwargs: Any,
    ):
        """
        This method should encode sentences and return embeddings on a single device.
        """
        pass

    # adapted from https://github.com/UKPLab/sentence-transformers/blob/1802076d4eae42ff0a5629e1b04e75785d4e193b/sentence_transformers/SentenceTransformer.py#L807
    def start_multi_process_pool(
        self,
        process_target_func: Any,
    ) -> Dict[Literal["input", "output", "processes"], Any]:
        """
        Starts a multi-process pool to process the encoding with several independent processes
        via :meth:`SentenceTransformer.encode_multi_process <sentence_transformers.SentenceTransformer.encode_multi_process>`.

        This method is recommended if you want to encode on multiple GPUs or CPUs. It is advised
        to start only one process per GPU. This method works together with encode_multi_process
        and stop_multi_process_pool.

        Args:
            process_target_func (Any): Callable run by every worker process; it is invoked with
                ``(device_id, self, input_queue, output_queue)``.

        Raises:
            ValueError: If the model has not been initialized.

        Returns:
            Dict[str, Any]: A dictionary with the target processes, an input queue, and an output queue.
        """
        if self.model is None:
            raise ValueError("Model is not initialized.")

        logger.info("Start multi-process pool on devices: {}".format(", ".join(map(str, self.target_devices))))

        self.model.to("cpu")
        self.model.share_memory()
        ctx = mp.get_context("spawn")
        input_queue = ctx.Queue()
        output_queue = ctx.Queue()
        processes = []

        # One daemon worker per target device.
        for device_id in tqdm(self.target_devices, desc='initial target device'):
            p = ctx.Process(
                target=process_target_func,
                args=(device_id, self, input_queue, output_queue),
                daemon=True,
            )
            p.start()
            processes.append(p)

        return {"input": input_queue, "output": output_queue, "processes": processes}

    # adapted from https://github.com/UKPLab/sentence-transformers/blob/1802076d4eae42ff0a5629e1b04e75785d4e193b/sentence_transformers/SentenceTransformer.py#L976
    @staticmethod
    def _encode_multi_process_worker(
        target_device: str, model: 'AbsEmbedder', input_queue: Queue, results_queue: Queue
    ) -> None:
        """
        Internal working process to encode sentences in multi-process setup
        """
        while True:
            try:
                chunk_id, sentences, kwargs = (
                    input_queue.get()
                )
                embeddings = model.encode_single_device(
                    sentences,
                    device=target_device,
                    **kwargs
                )

                # The chunk id lets the parent restore the original order.
                results_queue.put([chunk_id, embeddings])
            except queue.Empty:
                break

    # copied from https://github.com/UKPLab/sentence-transformers/blob/1802076d4eae42ff0a5629e1b04e75785d4e193b/sentence_transformers/SentenceTransformer.py#L857
    @staticmethod
    def stop_multi_process_pool(pool: Dict[Literal["input", "output", "processes"], Any]) -> None:
        """
        Stops all processes started with start_multi_process_pool.

        Args:
            pool (Dict[str, object]): A dictionary containing the input queue, output queue, and process list.

        Returns:
            None
        """
        for p in pool["processes"]:
            p.terminate()

        for p in pool["processes"]:
            p.join()
            p.close()

        pool["input"].close()
        pool["output"].close()

    # adapted from https://github.com/UKPLab/sentence-transformers/blob/1802076d4eae42ff0a5629e1b04e75785d4e193b/sentence_transformers/SentenceTransformer.py#L877
    def encode_multi_process(
        self,
        sentences: List[str],
        pool: Dict[Literal["input", "output", "processes"], Any],
        **kwargs
    ):
        """Split ``sentences`` into chunks, encode them in the worker pool and merge the results.

        Args:
            sentences (List[str]): Sentences to encode.
            pool (Dict[str, Any]): Pool returned by :meth:`start_multi_process_pool`.
            **kwargs: Forwarded to ``encode_single_device`` in every worker.

        Returns:
            Union[torch.Tensor, np.ndarray]: The concatenated embeddings, in input order.
        """
        # One chunk per worker process, rounded up.
        chunk_size = math.ceil(len(sentences) / len(pool["processes"]))

        input_queue = pool["input"]
        last_chunk_id = 0
        chunk = []

        for sentence in sentences:
            chunk.append(sentence)
            if len(chunk) >= chunk_size:
                input_queue.put(
                    [last_chunk_id, chunk, kwargs]
                )
                last_chunk_id += 1
                chunk = []

        # Enqueue the trailing partial chunk.
        if len(chunk) > 0:
            input_queue.put([last_chunk_id, chunk, kwargs])
            last_chunk_id += 1

        output_queue = pool["output"]
        # Results arrive in completion order; sort them back by chunk id.
        results_list = sorted(
            [output_queue.get() for _ in trange(last_chunk_id, desc="Chunks")],
            key=lambda x: x[0],
        )
        embeddings = self._concatenate_results_from_multi_process([result[1] for result in results_list])
        return embeddings

    def _concatenate_results_from_multi_process(self, results_list: List[Union[torch.Tensor, np.ndarray, Any]]):
        """concatenate and return the results from all the processes

        Args:
            results_list (List[Union[torch.Tensor, np.ndarray, Any]]): A list of results from all the processes.

        Raises:
            NotImplementedError: Unsupported type for results_list

        Returns:
            Union[torch.Tensor, np.ndarray]: return the embedding vectors in a numpy array or tensor.
        """
        if isinstance(results_list[0], torch.Tensor):
            # move all tensors to the same device
            results_list = [res.to(self.target_devices[0]) for res in results_list]
            return torch.cat(results_list, dim=0)
        elif isinstance(results_list[0], np.ndarray):
            return np.concatenate(results_list, axis=0)
        else:
            raise NotImplementedError("Unsupported type for results_list")
================================================
FILE: FlagEmbedding/abc/inference/AbsReranker.py
================================================
import logging
from abc import ABC, abstractmethod
from typing import Any, Union, List, Tuple, Dict, Literal, Optional
import multiprocessing as mp
from multiprocessing import Queue
import math
import gc
import torch
import numpy as np
from tqdm import tqdm, trange
from transformers import is_torch_npu_available
try:
import torch_musa
except Exception:
pass
logger = logging.getLogger(__name__)
class AbsReranker(ABC):
"""
Base class for Reranker.
Extend this class and implement :meth:`compute_score_single_gpu` for custom rerankers.
Args:
model_name_or_path (str): If it's a path to a local model, it loads the model from the path. Otherwise tries to download and
load a model from HuggingFace Hub with the name.
use_fp16 (bool, optional): If true, use half-precision floating-point to speed up computation with a slight performance
degradation. Defaults to :data:`False`.
query_instruction_for_rerank: (Optional[str], optional): Query instruction for reranking, which will be used
with :attr:`query_instruction_format`. Defaults to :data:`None`.
query_instruction_format: (str, optional): The template for :attr:`query_instruction_for_rerank`. Defaults to :data:`"{}{}"`.
passage_instruction_for_rerank (Optional[str], optional): Passage instruction for reranking. Defaults to :data:`None`.
passage_instruction_format (str, optional): Passage instruction format when using :attr:`passage_instruction_for_rerank`.
Defaults to :data:`"{}{}"`.
devices (Optional[Union[str, int, List[str], List[int]]], optional): Devices to use for model inference. Defaults to :data:`None`.
batch_size (int, optional): Batch size for inference. Defaults to :data:`128`.
query_max_length (int, optional): Maximum length for query. Defaults to :data:`None`.
max_length (int, optional): Maximum length. Defaults to :data:`512`.
normalize (bool, optional): If true, normalize the result. Defaults to :data:`False`.
kwargs (Dict[Any], optional): Additional parameters for HuggingFace Transformers config or children classes.
"""
def __init__(
self,
model_name_or_path: str,
use_fp16: bool = False,
query_instruction_for_rerank: Optional[str] = None,
query_instruction_format: str = "{}{}", # specify the format of query_instruction_for_rerank
passage_instruction_for_rerank: Optional[str] = None,
passage_instruction_format: str = "{}{}", # specify the format of passage_instruction_for_rerank
devices: Optional[Union[str, int, List[str], List[int]]] = None,
# inference
batch_size: int = 128,
query_max_length: Optional[int] = None,
max_length: int = 512,
normalize: bool = False,
**kwargs: Any,
):
self.model_name_or_path = model_name_or_path
self.use_fp16 = use_fp16
self.query_instruction_for_rerank = query_instruction_for_rerank
self.query_instruction_format = query_instruction_format
self.passage_instruction_for_rerank = passage_instruction_for_rerank
self.passage_instruction_format = passage_instruction_format
self.target_devices = self.get_target_devices(devices)
self.batch_size = batch_size
self.query_max_length = query_max_length
self.max_length = max_length
self.normalize = normalize
for k in kwargs:
setattr(self, k, kwargs[k])
self.kwargs = kwargs
# tokenizer and model are initialized in the child class
self.model = None
self.tokenizer = None
self.pool = None
def stop_self_pool(self):
if self.pool is not None:
self.stop_multi_process_pool(self.pool)
self.pool = None
try:
self.model.to('cpu')
torch.cuda.empty_cache()
except:
pass
if gc is not None and callable(gc.collect):
gc.collect()
@staticmethod
def get_target_devices(devices: Union[str, int, List[str], List[int]]) -> List[str]:
"""
Args:
devices (Union[str, int, List[str], List[int]]): Specified devices, can be `str`, `int`, list of `str`, or list of `int`.
Raises:
ValueError: Devices should be a string or an integer or a list of strings or a list of integers.
Returns:
List[str]: A list of target devices in format
"""
if devices is None:
if torch.cuda.is_available():
return [f"cuda:{i}" for i in range(torch.cuda.device_count())]
elif is_torch_npu_available():
return [f"npu:{i}" for i in range(torch.npu.device_count())]
elif hasattr(torch, "musa") and torch.musa.is_available():
return [f"musa:{i}" for i in range(torch.musa.device_count())]
elif torch.backends.mps.is_available():
return ["mps"]
else:
return ["cpu"]
elif isinstance(devices, str):
return [devices]
elif isinstance(devices, int):
if hasattr(torch, "musa") and torch.musa.is_available():
return [f"musa:{devices}"]
else:
return [f"cuda:{devices}"]
elif isinstance(devices, list):
if isinstance(devices[0], str):
return devices
elif isinstance(devices[0], int):
if hasattr(torch, "musa") and torch.musa.is_available():
return [f"musa:{device}" for device in devices]
else:
return [f"cuda:{device}" for device in devices]
else:
raise ValueError("devices should be a string or an integer or a list of strings or a list of integers.")
else:
raise ValueError("devices should be a string or an integer or a list of strings or a list of integers.")
def get_detailed_instruct(self, instruction_format: str, instruction: str, sentence: str):
    """Fill the instruction template with an instruction and a sentence.

    Args:
        instruction_format (str): Format for instruction.
        instruction (str): The text of instruction.
        sentence (str): The sentence to concatenate with.

    Returns:
        str: The complete sentence with instruction
    """
    # A literal backslash-n sequence in the template stands for a real newline.
    has_escaped_newline = "\\n" in instruction_format
    template = instruction_format.replace("\\n", "\n") if has_escaped_newline else instruction_format
    return template.format(instruction, sentence)
def get_detailed_inputs(self, sentence_pairs: Union[str, List[str]]):
"""get detailed instruct for all the inputs
Args:
sentence_pairs (Union[str, List[str]]): Input sentence pairs
Returns:
list[list[str]]: The complete sentence pairs with instruction
"""
if isinstance(sentence_pairs, str):
sentence_pairs = [sentence_pairs]
if self.query_instruction_for_rerank is not None:
if self.passage_instruction_for_rerank is None:
return [
[
self.get_detailed_instruct(self.query_instruction_format, self.query_instruction_for_rerank, sentence_pair[0]),
sentence_pair[1]
] for sentence_pair in sentence_pairs
]
else:
return [
[
self.get_detailed_instruct(self.query_instruction_format, self.query_instruction_for_rerank, sentence_pair[0]),
self.get_detailed_instruct(self.passage_instruction_format, self.passage_instruction_for_rerank, sentence_pair[1])
] for sentence_pair in sentence_pairs
]
else:
if self.passage_instruction_for_rerank is None:
return [
[
sentence_pair[0],
sentence_pair[1]
] for sentence_pair in sentence_pairs
]
else:
return [
[
sentence_pair[0],
self.get_detailed_instruct(self.passage_instruction_format, self.passage_instruction_for_rerank, sentence_pair[1])
] for sentence_pair in sentence_pairs
]
def compute_score(
    self,
    sentence_pairs: Union[List[Tuple[str, str]], Tuple[str, str]],
    **kwargs
):
    """Score sentence pairs on one device or across the worker pool.

    A single pair (detected by its first element being a string) is wrapped
    into a one-element batch. The pairs are then passed through
    ``get_detailed_inputs`` and scored either by ``compute_score_single_gpu``
    (exactly one target device) or by ``encode_multi_process`` (otherwise).
    The worker pool is created on first use and cached on ``self.pool``.

    Args:
        sentence_pairs (Union[List[Tuple[str, str]], Tuple[str, str]]): One
            pair or a list of pairs to score.
        **kwargs: Forwarded unchanged to the scoring method that is used.

    Returns:
        Whatever the selected scoring method returns for the given pairs.
    """
    leading_item = sentence_pairs[0]
    if isinstance(leading_item, str):
        sentence_pairs = [sentence_pairs]
    detailed_pairs = self.get_detailed_inputs(sentence_pairs)

    use_single_device = isinstance(detailed_pairs, str) or len(self.target_devices) == 1
    if use_single_device:
        return self.compute_score_single_gpu(
            detailed_pairs,
            device=self.target_devices[0],
            **kwargs
        )

    # Lazily start the workers; later calls reuse the same pool.
    if self.pool is None:
        self.pool = self.start_multi_process_pool()

    return self.encode_multi_process(detailed_pairs, self.pool, **kwargs)
def __del__(self):
    """Finalizer that delegates cleanup to ``stop_self_pool``.

    NOTE(review): ``stop_self_pool`` is defined outside this view; presumably
    it shuts down the pool cached on ``self.pool`` -- confirm.
    """
    self.stop_self_pool()
@abstractmethod
def compute_score_single_gpu(
    self,
    sentence_pairs: Union[List[Tuple[str, str]], Tuple[str, str]],
    batch_size: int = 256,
    query_max_length: Optional[int] = None,
    max_length: int = 512,
    normalize: bool = False,
    device: Optional[str] = None,
    **kwargs: Any,
):
    """Score sentence pairs on a single device; subclasses must implement this.

    ``compute_score`` calls this with the instruction-augmented pairs and
    ``device=self.target_devices[0]`` when exactly one target device is
    configured.

    Args:
        sentence_pairs (Union[List[Tuple[str, str]], Tuple[str, str]]):
            Sentence pairs to score.
        batch_size (int, optional): Defaults to 256. Presumably the number of
            pairs processed per batch -- defined by the implementation.
        query_max_length (Optional[int], optional): Defaults to None.
            Presumably a length limit for the query side -- confirm in
            subclasses.
        max_length (int, optional): Defaults to 512. Presumably a length
            limit for the tokenized input -- confirm in subclasses.
        normalize (bool, optional): Defaults to False. Presumably toggles
            score normalization -- confirm in subclasses.
        device (Optional[str], optional): Device to run on. Defaults to None.
        **kwargs (Any): Extra implementation-specific options.

    Returns:
        The scores of the given sentence pairs.
    """
    pass
# copied from https://github.com/UKPLab/sentence-transformers/blob/1802076d4eae42ff0a5629e1b04e75785d4e193b/sentence_transformers/SentenceTransformer.py#L857
def start_multi_process_pool(self) -> Dict[Literal["input", "output", "processes"], Any]:
    """Spawn one worker process per target device and connect them with queues.

    The model is moved to CPU and placed in shared memory, then a daemon
    process running ``AbsReranker._encode_multi_process_worker`` is started
    for every entry in ``self.target_devices`` using the ``spawn`` start
    method. All workers share one input queue and one output queue.

    Returns:
        Dict[str, Any]: A dictionary with the input queue (``"input"``), the
        output queue (``"output"``) and the list of started processes
        (``"processes"``).
    """
    device_names = ", ".join(map(str, self.target_devices))
    logger.info("Start multi-process pool on devices: {}".format(device_names))

    # The model goes to CPU shared memory before being handed to the workers.
    self.model.to("cpu")
    self.model.share_memory()

    spawn_ctx = mp.get_context("spawn")
    task_queue = spawn_ctx.Queue()
    result_queue = spawn_ctx.Queue()

    workers = []
    for target_device in tqdm(self.target_devices, desc='initial target device'):
        worker = spawn_ctx.Process(
            target=AbsReranker._encode_multi_process_worker,
            args=(target_device, self, task_queue, result_queue),
            daemon=True,
        )
        worker.start()
        workers.append(worker)

    return {"input": task_queue, "output": result_queue, "processes": workers}
# copied from https://github.com/UKPLab/sentence-transformers/blob/1802076d4eae42ff0a5629e1b04e75785d4e193b/sentence_transformers/SentenceTransformer.py#L857
def encode_multi_process(
    self,
    sentence_pairs: List,
    pool: Dict[Literal["input", "output", "processes"], Any],
    **kwargs
) -> np.ndarray:
    """Score sentence pairs by distributing chunks over the worker pool.

    The pairs are split into at most ``len(pool["processes"])`` contiguous
    chunks. Each chunk is put on the input queue as
    ``[chunk_id, chunk, kwargs]``. One result per chunk is then read from the
    output queue, the results are ordered by chunk id, and their score arrays
    are concatenated so the output order matches the input order.

    Args:
        sentence_pairs (List): Sentence pairs to score.
        pool (Dict[Literal["input", "output", "processes"], Any]): Pool as
            returned by ``start_multi_process_pool``.
        **kwargs: Forwarded to the workers together with each chunk.

    Returns:
        np.ndarray: Concatenated scores for all pairs; an empty array when
        ``sentence_pairs`` is empty.
    """
    total_pairs = len(sentence_pairs)
    if total_pairs == 0:
        # No chunk would be dispatched, and np.concatenate raises ValueError
        # when given an empty list of arrays.
        return np.array([])

    chunk_size = math.ceil(total_pairs / len(pool["processes"]))
    input_queue = pool["input"]

    num_chunks = 0
    for start in range(0, total_pairs, chunk_size):
        chunk = list(sentence_pairs[start:start + chunk_size])
        input_queue.put([num_chunks, chunk, kwargs])
        num_chunks += 1

    output_queue = pool["output"]
    # Workers may finish in any order; sort by chunk id to restore input order.
    results_list = sorted(
        [output_queue.get() for _ in trange(num_chunks, desc="Chunks")],
        key=lambda result: result[0],
    )
    return np.concatenate([result[1] for result in results_list])
# copied from https://github.com/UKPLab/sentence-transformers/blob/1802076d4eae42ff0a5629e1b04e75785d4e193b/sentence_transformers/SentenceTransformer.py#L857
@staticmethod
def _encode_multi_process_worker(
target_device: str, model: 'AbsReranker', input_queue: Queue, results_queue: Queue
) -> None:
"""
Internal working process to encode sentences in multi-process setup
"""
while True:
try:
chunk_id, sentences, kwargs = (
input_queue.get()
)
embeddings
gitextract_fx4cm0n9/
├── .github/
│ └── workflows/
│ └── documentation.yml
├── .gitignore
├── FlagEmbedding/
│ ├── __init__.py
│ ├── abc/
│ │ ├── __init__.py
│ │ ├── evaluation/
│ │ │ ├── __init__.py
│ │ │ ├── arguments.py
│ │ │ ├── data_loader.py
│ │ │ ├── evaluator.py
│ │ │ ├── runner.py
│ │ │ ├── searcher.py
│ │ │ └── utils.py
│ │ ├── finetune/
│ │ │ ├── __init__.py
│ │ │ ├── embedder/
│ │ │ │ ├── AbsArguments.py
│ │ │ │ ├── AbsDataset.py
│ │ │ │ ├── AbsModeling.py
│ │ │ │ ├── AbsRunner.py
│ │ │ │ ├── AbsTrainer.py
│ │ │ │ └── __init__.py
│ │ │ └── reranker/
│ │ │ ├── AbsArguments.py
│ │ │ ├── AbsDataset.py
│ │ │ ├── AbsModeling.py
│ │ │ ├── AbsRunner.py
│ │ │ ├── AbsTrainer.py
│ │ │ └── __init__.py
│ │ └── inference/
│ │ ├── AbsEmbedder.py
│ │ ├── AbsReranker.py
│ │ └── __init__.py
│ ├── evaluation/
│ │ ├── __init__.py
│ │ ├── air_bench/
│ │ │ ├── __init__.py
│ │ │ ├── __main__.py
│ │ │ ├── arguments.py
│ │ │ ├── examples/
│ │ │ │ ├── long-doc/
│ │ │ │ │ ├── arxiv-gemini.jsonl
│ │ │ │ │ ├── arxiv-gpt3.jsonl
│ │ │ │ │ ├── arxiv-llama2.jsonl
│ │ │ │ │ ├── arxiv-llm-survey.jsonl
│ │ │ │ │ ├── book-a-brief-history-of-time_stephen-hawking.jsonl
│ │ │ │ │ ├── book-origin-of-species_darwin.jsonl
│ │ │ │ │ ├── healthcare-pubmed_100k-200k_1.jsonl
│ │ │ │ │ ├── healthcare-pubmed_100k-200k_2.jsonl
│ │ │ │ │ ├── healthcare-pubmed_100k-200k_3.jsonl
│ │ │ │ │ ├── healthcare-pubmed_30k-40k_10-merged.jsonl
│ │ │ │ │ ├── healthcare-pubmed_40k-50k_5-merged.jsonl
│ │ │ │ │ ├── law-lex_files_300k-400k.jsonl
│ │ │ │ │ ├── law-lex_files_400k-500k.jsonl
│ │ │ │ │ ├── law-lex_files_500k-600k.jsonl
│ │ │ │ │ └── law-lex_files_600k-700k.jsonl
│ │ │ │ └── qa/
│ │ │ │ ├── arxiv.jsonl
│ │ │ │ ├── finance.jsonl
│ │ │ │ ├── healthcare.jsonl
│ │ │ │ ├── law.jsonl
│ │ │ │ ├── msmarco.jsonl
│ │ │ │ ├── news.jsonl
│ │ │ │ ├── web.jsonl
│ │ │ │ └── wiki.jsonl
│ │ │ └── runner.py
│ │ ├── beir/
│ │ │ ├── __init__.py
│ │ │ ├── __main__.py
│ │ │ ├── arguments.py
│ │ │ ├── data_loader.py
│ │ │ ├── evaluator.py
│ │ │ ├── prompts.py
│ │ │ └── runner.py
│ │ ├── bright/
│ │ │ ├── __init__.py
│ │ │ ├── __main__.py
│ │ │ ├── arguments.py
│ │ │ ├── data_loader.py
│ │ │ ├── prompts.py
│ │ │ ├── runner.py
│ │ │ └── searcher.py
│ │ ├── custom/
│ │ │ ├── __init__.py
│ │ │ ├── __main__.py
│ │ │ ├── data_loader.py
│ │ │ └── runner.py
│ │ ├── miracl/
│ │ │ ├── __init__.py
│ │ │ ├── __main__.py
│ │ │ ├── data_loader.py
│ │ │ └── runner.py
│ │ ├── mkqa/
│ │ │ ├── __init__.py
│ │ │ ├── __main__.py
│ │ │ ├── data_loader.py
│ │ │ ├── evaluator.py
│ │ │ ├── runner.py
│ │ │ └── utils/
│ │ │ ├── compute_metrics.py
│ │ │ └── normalize_text.py
│ │ ├── mldr/
│ │ │ ├── __init__.py
│ │ │ ├── __main__.py
│ │ │ ├── data_loader.py
│ │ │ └── runner.py
│ │ ├── msmarco/
│ │ │ ├── __init__.py
│ │ │ ├── __main__.py
│ │ │ ├── data_loader.py
│ │ │ └── runner.py
│ │ └── mteb/
│ │ ├── __init__.py
│ │ ├── __main__.py
│ │ ├── arguments.py
│ │ ├── examples/
│ │ │ ├── AmazonCounterfactualClassification.csv
│ │ │ ├── AmazonPolarityClassification.csv
│ │ │ ├── AmazonReviewsClassification.csv
│ │ │ ├── ArguAna.csv
│ │ │ ├── ArxivClusteringP2P.csv
│ │ │ ├── ArxivClusteringS2S.csv
│ │ │ ├── AskUbuntuDupQuestions.csv
│ │ │ ├── BIOSSES.csv
│ │ │ ├── Banking77Classification.csv
│ │ │ ├── BiorxivClusteringP2P.csv
│ │ │ ├── BiorxivClusteringS2S.csv
│ │ │ ├── CQADupstack.csv
│ │ │ ├── CQADupstackRetrieval.csv
│ │ │ ├── ClimateFEVER.csv
│ │ │ ├── DBPedia.csv
│ │ │ ├── EmotionClassification.csv
│ │ │ ├── FEVER.csv
│ │ │ ├── FiQA2018.csv
│ │ │ ├── HotpotQA.csv
│ │ │ ├── ImdbClassification.csv
│ │ │ ├── MSMARCO.csv
│ │ │ ├── MTOPDomainClassification.csv
│ │ │ ├── MTOPIntentClassification.csv
│ │ │ ├── MassiveIntentClassification.csv
│ │ │ ├── MassiveScenarioClassification.csv
│ │ │ ├── MedrxivClusteringP2P.csv
│ │ │ ├── MedrxivClusteringS2S.csv
│ │ │ ├── MindSmallReranking.csv
│ │ │ ├── NFCorpus.csv
│ │ │ ├── NQ.csv
│ │ │ ├── QuoraRetrieval.csv
│ │ │ ├── RedditClustering.csv
│ │ │ ├── RedditClusteringP2P.csv
│ │ │ ├── SCIDOCS.csv
│ │ │ ├── SICK-R.csv
│ │ │ ├── STS12.csv
│ │ │ ├── STS13.csv
│ │ │ ├── STS14.csv
│ │ │ ├── STS15.csv
│ │ │ ├── STS16.csv
│ │ │ ├── STS17.csv
│ │ │ ├── STS22.csv
│ │ │ ├── STSBenchmark.csv
│ │ │ ├── SciDocsRR.csv
│ │ │ ├── SciFact.csv
│ │ │ ├── SprintDuplicateQuestions.csv
│ │ │ ├── StackExchangeClustering.csv
│ │ │ ├── StackExchangeClusteringP2P.csv
│ │ │ ├── StackOverflowDupQuestions.csv
│ │ │ ├── SummEval.csv
│ │ │ ├── TRECCOVID.csv
│ │ │ ├── Touche2020.csv
│ │ │ ├── ToxicConversationsClassification.csv
│ │ │ ├── TweetSentimentExtractionClassification.csv
│ │ │ ├── TwentyNewsgroupsClustering.csv
│ │ │ ├── TwitterSemEval2015.csv
│ │ │ └── TwitterURLCorpus.csv
│ │ ├── prompts.py
│ │ ├── runner.py
│ │ └── searcher.py
│ ├── finetune/
│ │ ├── __init__.py
│ │ ├── embedder/
│ │ │ ├── __init__.py
│ │ │ ├── decoder_only/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── base/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── __main__.py
│ │ │ │ │ ├── arguments.py
│ │ │ │ │ ├── load_model.py
│ │ │ │ │ ├── modeling.py
│ │ │ │ │ ├── runner.py
│ │ │ │ │ └── trainer.py
│ │ │ │ └── icl/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── __main__.py
│ │ │ │ ├── arguments.py
│ │ │ │ ├── dataset.py
│ │ │ │ ├── load_model.py
│ │ │ │ ├── modeling.py
│ │ │ │ ├── runner.py
│ │ │ │ └── trainer.py
│ │ │ └── encoder_only/
│ │ │ ├── __init__.py
│ │ │ ├── base/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── __main__.py
│ │ │ │ ├── modeling.py
│ │ │ │ ├── runner.py
│ │ │ │ └── trainer.py
│ │ │ └── m3/
│ │ │ ├── __init__.py
│ │ │ ├── __main__.py
│ │ │ ├── arguments.py
│ │ │ ├── modeling.py
│ │ │ ├── runner.py
│ │ │ └── trainer.py
│ │ └── reranker/
│ │ ├── __init__.py
│ │ ├── decoder_only/
│ │ │ ├── __init__.py
│ │ │ ├── base/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── __main__.py
│ │ │ │ ├── arguments.py
│ │ │ │ ├── load_model.py
│ │ │ │ ├── modeling.py
│ │ │ │ ├── runner.py
│ │ │ │ └── trainer.py
│ │ │ └── layerwise/
│ │ │ ├── __init__.py
│ │ │ ├── __main__.py
│ │ │ ├── arguments.py
│ │ │ ├── configuration_minicpm_reranker.py
│ │ │ ├── load_model.py
│ │ │ ├── modeling.py
│ │ │ ├── modeling_minicpm_reranker.py
│ │ │ ├── runner.py
│ │ │ └── trainer.py
│ │ └── encoder_only/
│ │ ├── __init__.py
│ │ └── base/
│ │ ├── __init__.py
│ │ ├── __main__.py
│ │ ├── modeling.py
│ │ ├── runner.py
│ │ └── trainer.py
│ ├── inference/
│ │ ├── __init__.py
│ │ ├── auto_embedder.py
│ │ ├── auto_reranker.py
│ │ ├── embedder/
│ │ │ ├── __init__.py
│ │ │ ├── decoder_only/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── base.py
│ │ │ │ └── icl.py
│ │ │ ├── encoder_only/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── base.py
│ │ │ │ └── m3.py
│ │ │ └── model_mapping.py
│ │ └── reranker/
│ │ ├── __init__.py
│ │ ├── decoder_only/
│ │ │ ├── __init__.py
│ │ │ ├── base.py
│ │ │ ├── layerwise.py
│ │ │ ├── lightweight.py
│ │ │ └── models/
│ │ │ ├── __init__.py
│ │ │ ├── configuration_minicpm_reranker.py
│ │ │ ├── gemma_config.py
│ │ │ ├── gemma_model.py
│ │ │ └── modeling_minicpm_reranker.py
│ │ ├── encoder_only/
│ │ │ ├── __init__.py
│ │ │ └── base.py
│ │ └── model_mapping.py
│ └── utils/
│ ├── __init__.py
│ └── transformers_compat.py
├── LICENSE
├── Manifest.in
├── README.md
├── README_zh.md
├── Tutorials/
│ ├── 1_Embedding/
│ │ ├── 1.1_Intro&Inference.ipynb
│ │ ├── 1.2.1_BGE_Series.ipynb
│ │ ├── 1.2.2_Auto_Embedder.ipynb
│ │ ├── 1.2.3_BGE_v1&1.5.ipynb
│ │ ├── 1.2.4_BGE-M3.ipynb
│ │ ├── 1.2.5_BGE_EN_ICL.ipynb
│ │ ├── 1.2.6_BGE_VL.ipynb
│ │ └── 1.2.7_BGE_Code_v1.ipynb
│ ├── 2_Metrics/
│ │ ├── 2.1_Similarity_Metrics.ipynb
│ │ └── 2.2_Eval_Metrics.ipynb
│ ├── 3_Indexing/
│ │ ├── 3.1.1_Intro_to_Faiss.ipynb
│ │ ├── 3.1.2_Faiss_GPU.ipynb
│ │ ├── 3.1.3_Faiss_Indexes.ipynb
│ │ ├── 3.1.4_Faiss_Quantizers.ipynb
│ │ └── 3.1.5_Faiss_Index_Choosing.ipynb
│ ├── 4_Evaluation/
│ │ ├── 4.1.1_Evaluation_MSMARCO.ipynb
│ │ ├── 4.2.1_MTEB_Intro.ipynb
│ │ ├── 4.2.2_MTEB_Leaderboard.ipynb
│ │ ├── 4.2.3_C-MTEB.ipynb
│ │ ├── 4.3.1_Sentence_Transformers_Eval.ipynb
│ │ ├── 4.4.1_BEIR.ipynb
│ │ ├── 4.5.1_MIRACL.ipynb
│ │ ├── 4.5.2_MLDR.ipynb
│ │ └── utils/
│ │ ├── compute_metrics.py
│ │ └── normalize_text.py
│ ├── 5_Reranking/
│ │ ├── 5.1_Intro.ipynb
│ │ ├── 5.2_BGE_Reranker.ipynb
│ │ └── 5.3_Reranker_Eval.ipynb
│ ├── 6_RAG/
│ │ ├── 6.1_RAG_From_Scratch.ipynb
│ │ ├── 6.2_RAG_LangChain.ipynb
│ │ └── 6.3_RAG_LlamaIndex.ipynb
│ ├── 7_Fine-tuning/
│ │ ├── 7.1.1_Data_preparation.ipynb
│ │ ├── 7.1.2_Fine-tune.ipynb
│ │ ├── 7.1.3_Eval_FT_Model.ipynb
│ │ ├── 7.2.1_Hard_Negative_Mining.ipynb
│ │ └── config/
│ │ ├── ds_stage0.json
│ │ └── ds_stage1.json
│ ├── README.md
│ └── quick_start.ipynb
├── dataset/
│ └── README.md
├── docs/
│ ├── Makefile
│ ├── README.md
│ ├── make.bat
│ ├── requirements.txt
│ └── source/
│ ├── API/
│ │ ├── abc/
│ │ │ ├── evaluation/
│ │ │ │ ├── arguments.rst
│ │ │ │ ├── data_loader.rst
│ │ │ │ ├── evaluator.rst
│ │ │ │ ├── runner.rst
│ │ │ │ └── searcher.rst
│ │ │ ├── evaluation.rst
│ │ │ ├── finetune/
│ │ │ │ ├── embedder/
│ │ │ │ │ ├── AbsArguments.rst
│ │ │ │ │ ├── AbsDataset.rst
│ │ │ │ │ ├── AbsModeling.rst
│ │ │ │ │ ├── AbsRunner.rst
│ │ │ │ │ └── AbsTrainer.rst
│ │ │ │ ├── embedder.rst
│ │ │ │ ├── reranker/
│ │ │ │ │ ├── AbsArguments.rst
│ │ │ │ │ ├── AbsDataset.rst
│ │ │ │ │ ├── AbsModeling.rst
│ │ │ │ │ ├── AbsRunner.rst
│ │ │ │ │ └── AbsTrainer.rst
│ │ │ │ └── reranker.rst
│ │ │ ├── finetune.rst
│ │ │ ├── inference/
│ │ │ │ ├── AbsEmbedder.rst
│ │ │ │ └── AbsReranker.rst
│ │ │ └── inference.rst
│ │ ├── abc.rst
│ │ ├── evaluation/
│ │ │ ├── airbench/
│ │ │ │ ├── arguments.rst
│ │ │ │ └── runner.rst
│ │ │ ├── airbench.rst
│ │ │ ├── beir/
│ │ │ │ ├── arguments.rst
│ │ │ │ ├── data_loader.rst
│ │ │ │ ├── evaluator.rst
│ │ │ │ └── runner.rst
│ │ │ ├── beir.rst
│ │ │ ├── miracl/
│ │ │ │ ├── data_loader.rst
│ │ │ │ └── runner.rst
│ │ │ ├── miracl.rst
│ │ │ ├── mkqa/
│ │ │ │ ├── data_loader.rst
│ │ │ │ ├── evaluator.rst
│ │ │ │ └── runner.rst
│ │ │ ├── mkqa.rst
│ │ │ ├── mldr/
│ │ │ │ ├── data_loader.rst
│ │ │ │ └── runner.rst
│ │ │ ├── mldr.rst
│ │ │ ├── msmarco/
│ │ │ │ ├── data_loader.rst
│ │ │ │ └── runner.rst
│ │ │ ├── msmarco.rst
│ │ │ ├── mteb/
│ │ │ │ ├── arguments.rst
│ │ │ │ ├── runner.rst
│ │ │ │ └── searcher.rst
│ │ │ └── mteb.rst
│ │ ├── evaluation.rst
│ │ ├── finetune/
│ │ │ ├── embedder/
│ │ │ │ ├── decoder_only/
│ │ │ │ │ ├── base/
│ │ │ │ │ │ ├── arguments.rst
│ │ │ │ │ │ ├── modeling.rst
│ │ │ │ │ │ ├── runner.rst
│ │ │ │ │ │ └── trainer.rst
│ │ │ │ │ ├── base.rst
│ │ │ │ │ ├── icl/
│ │ │ │ │ │ ├── arguments.rst
│ │ │ │ │ │ ├── dataset.rst
│ │ │ │ │ │ ├── modeling.rst
│ │ │ │ │ │ ├── runner.rst
│ │ │ │ │ │ └── trainer.rst
│ │ │ │ │ └── icl.rst
│ │ │ │ ├── decoder_only.rst
│ │ │ │ ├── encoder_only/
│ │ │ │ │ ├── base/
│ │ │ │ │ │ ├── modeling.rst
│ │ │ │ │ │ ├── runner.rst
│ │ │ │ │ │ └── trainer.rst
│ │ │ │ │ ├── base.rst
│ │ │ │ │ ├── m3/
│ │ │ │ │ │ ├── arguments.rst
│ │ │ │ │ │ ├── modeling.rst
│ │ │ │ │ │ ├── runner.rst
│ │ │ │ │ │ └── trainer.rst
│ │ │ │ │ └── m3.rst
│ │ │ │ └── encoder_only.rst
│ │ │ ├── embedder.rst
│ │ │ ├── reranker/
│ │ │ │ ├── decoder_only/
│ │ │ │ │ ├── base/
│ │ │ │ │ │ ├── arguments.rst
│ │ │ │ │ │ ├── modeling.rst
│ │ │ │ │ │ ├── runner.rst
│ │ │ │ │ │ └── trainer.rst
│ │ │ │ │ ├── base.rst
│ │ │ │ │ ├── layerwise/
│ │ │ │ │ │ ├── arguments.rst
│ │ │ │ │ │ ├── modeling.rst
│ │ │ │ │ │ ├── runner.rst
│ │ │ │ │ │ └── trainer.rst
│ │ │ │ │ └── layerwise.rst
│ │ │ │ ├── decoder_only.rst
│ │ │ │ ├── encoder_only/
│ │ │ │ │ ├── base/
│ │ │ │ │ │ ├── modeling.rst
│ │ │ │ │ │ ├── runner.rst
│ │ │ │ │ │ └── trainer.rst
│ │ │ │ │ └── base.rst
│ │ │ │ └── encoder_only.rst
│ │ │ └── reranker.rst
│ │ ├── finetune.rst
│ │ ├── index.rst
│ │ ├── inference/
│ │ │ ├── FlagAutoModel.rst
│ │ │ ├── FlagAutoReranker.rst
│ │ │ ├── embedder/
│ │ │ │ ├── decoder_only/
│ │ │ │ │ ├── BaseLLMEmbedder.rst
│ │ │ │ │ └── ICLLLMEmbedder.rst
│ │ │ │ ├── embedder.rst
│ │ │ │ └── encoder_only/
│ │ │ │ ├── BaseEmbedder.rst
│ │ │ │ └── M3Embedder.rst
│ │ │ └── reranker/
│ │ │ ├── decoder_only/
│ │ │ │ ├── BaseLLMReranker.rst
│ │ │ │ ├── LayerWiseLLMReranker.rst
│ │ │ │ └── LightweightLLMReranker.rst
│ │ │ ├── encoder_only/
│ │ │ │ └── BaseReranker.rst
│ │ │ └── reranker.rst
│ │ └── inference.rst
│ ├── C-MTEB.rst
│ ├── FAQ/
│ │ └── index.rst
│ ├── Introduction/
│ │ ├── IR.rst
│ │ ├── embedder.rst
│ │ ├── index.rst
│ │ ├── installation.rst
│ │ ├── overview.rst
│ │ ├── quick_start.rst
│ │ ├── reranker.rst
│ │ ├── retrieval_demo.ipynb
│ │ └── similarity.rst
│ ├── _static/
│ │ └── css/
│ │ └── custom.css
│ ├── bge/
│ │ ├── bge_code.rst
│ │ ├── bge_icl.rst
│ │ ├── bge_m3.rst
│ │ ├── bge_reranker.rst
│ │ ├── bge_reranker_v2.rst
│ │ ├── bge_v1_v1.5.rst
│ │ ├── bge_vl.rst
│ │ └── index.rst
│ ├── community/
│ │ └── index.rst
│ ├── conf.py
│ ├── index.rst
│ └── tutorial/
│ ├── 1_Embedding/
│ │ ├── 1.1.1.ipynb
│ │ ├── 1.2.1.ipynb
│ │ ├── 1.2.2.ipynb
│ │ ├── 1.2.3.ipynb
│ │ ├── 1.2.4.ipynb
│ │ └── 1.2.5.ipynb
│ ├── 1_Embedding.rst
│ ├── 2_Metrics/
│ │ ├── 2.1.ipynb
│ │ └── 2.2.ipynb
│ ├── 2_Metrics.rst
│ ├── 3_Indexing/
│ │ ├── 3.1.1.ipynb
│ │ ├── 3.1.2.ipynb
│ │ ├── 3.1.3.ipynb
│ │ ├── 3.1.4.ipynb
│ │ └── 3.1.5.ipynb
│ ├── 3_Indexing.rst
│ ├── 4_Evaluation/
│ │ ├── 4.1.1.ipynb
│ │ ├── 4.2.1.ipynb
│ │ ├── 4.2.2.ipynb
│ │ ├── 4.2.3.ipynb
│ │ ├── 4.3.1.ipynb
│ │ ├── 4.4.1.ipynb
│ │ ├── 4.5.1.ipynb
│ │ └── 4.5.2.ipynb
│ ├── 4_Evaluation.rst
│ ├── 5_Reranking/
│ │ ├── 5.1.ipynb
│ │ ├── 5.2.ipynb
│ │ └── 5.3.ipynb
│ ├── 5_Reranking.rst
│ ├── 6_RAG/
│ │ ├── 6.1.ipynb
│ │ ├── 6.2.ipynb
│ │ └── 6.3.ipynb
│ ├── 6_RAG.rst
│ ├── 7_Finetuning/
│ │ ├── 7.1.1.ipynb
│ │ ├── 7.1.2.ipynb
│ │ ├── 7.1.3.ipynb
│ │ └── 7.2.1.ipynb
│ ├── 7_Finetuning.rst
│ └── index.rst
├── examples/
│ ├── README.md
│ ├── evaluation/
│ │ ├── README.md
│ │ ├── air_bench/
│ │ │ └── eval_air_bench.sh
│ │ ├── beir/
│ │ │ └── eval_beir.sh
│ │ ├── bright/
│ │ │ └── eval_bright_short.sh
│ │ ├── miracl/
│ │ │ └── eval_miracl.sh
│ │ ├── mkqa/
│ │ │ └── eval_mkqa.sh
│ │ ├── mldr/
│ │ │ └── eval_mldr.sh
│ │ ├── msmarco/
│ │ │ └── eval_msmarco.sh
│ │ └── mteb/
│ │ └── eval_mteb.sh
│ ├── finetune/
│ │ ├── ds_stage0.json
│ │ ├── ds_stage1.json
│ │ ├── embedder/
│ │ │ ├── README.md
│ │ │ ├── decoder_only/
│ │ │ │ ├── base.sh
│ │ │ │ ├── base_same_dataset.sh
│ │ │ │ └── icl_same_dataset.sh
│ │ │ ├── encoder_only/
│ │ │ │ ├── base.sh
│ │ │ │ ├── base_same_dataset.sh
│ │ │ │ ├── m3.sh
│ │ │ │ └── m3_same_dataset.sh
│ │ │ └── example_data/
│ │ │ ├── classification-no_in_batch_neg/
│ │ │ │ ├── AmazonClassification.jsonl
│ │ │ │ └── Banking77Classification.jsonl
│ │ │ ├── clustering-no_in_batch_neg/
│ │ │ │ ├── arXiv_title.jsonl
│ │ │ │ └── bioRXiv_title.jsonl
│ │ │ ├── retrieval/
│ │ │ │ ├── msmarco.jsonl
│ │ │ │ ├── nli.jsonl
│ │ │ │ └── nq.jsonl
│ │ │ └── sts/
│ │ │ └── sts.jsonl
│ │ └── reranker/
│ │ ├── README.md
│ │ ├── decoder_only/
│ │ │ ├── base.sh
│ │ │ └── layerwise.sh
│ │ ├── encoder_only/
│ │ │ └── base.sh
│ │ └── example_data/
│ │ ├── normal/
│ │ │ └── examples.jsonl
│ │ └── prompt_based/
│ │ └── examples.jsonl
│ └── inference/
│ ├── embedder/
│ │ ├── README.md
│ │ ├── decoder_only/
│ │ │ ├── auto_base_multi_devices.py
│ │ │ ├── auto_base_single_device.py
│ │ │ ├── auto_icl_multi_devices.py
│ │ │ ├── auto_icl_single_device.py
│ │ │ ├── base_multi_devices.py
│ │ │ ├── base_single_device.py
│ │ │ ├── icl_multi_devices.py
│ │ │ └── icl_single_device.py
│ │ └── encoder_only/
│ │ ├── auto_base_multi_devices.py
│ │ ├── auto_base_single_device.py
│ │ ├── auto_m3_multi_devices.py
│ │ ├── auto_m3_single_device.py
│ │ ├── base_multi_devices.py
│ │ ├── base_single_device.py
│ │ ├── m3_multi_devices.py
│ │ ├── m3_multi_devices_compute_score.py
│ │ ├── m3_single_device.py
│ │ └── m3_single_device_compute_score.py
│ └── reranker/
│ ├── README.md
│ ├── decoder_only/
│ │ ├── auto_base_multi_devices.py
│ │ ├── auto_base_single_device.py
│ │ ├── auto_layerwise_multi_devices.py
│ │ ├── auto_layerwise_single_device.py
│ │ ├── auto_lightweight_multi_devices.py
│ │ ├── auto_lightweight_single_device.py
│ │ ├── base_multi_devices.py
│ │ ├── base_single_device.py
│ │ ├── layerwise_multi_devices.py
│ │ ├── layerwise_single_device.py
│ │ ├── lightweight_multi_devices.py
│ │ └── lightweight_single_device.py
│ └── encoder_only/
│ ├── auto_base_multi_devices.py
│ ├── auto_base_single_device.py
│ ├── base_multi_devices.py
│ └── base_single_device.py
├── research/
│ ├── BGE_Coder/
│ │ ├── README.md
│ │ ├── data_generation/
│ │ │ ├── constant.py
│ │ │ ├── corpus_generator.py
│ │ │ ├── format_generated_examples.py
│ │ │ ├── llm.py
│ │ │ ├── run_generation.py
│ │ │ ├── search.py
│ │ │ ├── triplet_generator.py
│ │ │ └── utils.py
│ │ └── evaluation/
│ │ ├── coderag_eval/
│ │ │ ├── eval.sh
│ │ │ ├── prepare_data.sh
│ │ │ └── test/
│ │ │ ├── arguments.py
│ │ │ ├── create/
│ │ │ │ ├── code_search_net.py
│ │ │ │ ├── ds1000.py
│ │ │ │ ├── general_programming.py
│ │ │ │ ├── humaneval.py
│ │ │ │ ├── live_code_bench.py
│ │ │ │ ├── mbpp.py
│ │ │ │ ├── odex.py
│ │ │ │ ├── repoeval.py
│ │ │ │ ├── repoeval_repo.py
│ │ │ │ ├── swebench.py
│ │ │ │ ├── swebench_repo.py
│ │ │ │ └── utils.py
│ │ │ ├── main.py
│ │ │ └── prompts.py
│ │ └── coir_eval/
│ │ ├── arguments.py
│ │ ├── eval.sh
│ │ ├── main.py
│ │ └── prompts.py
│ ├── BGE_M3/
│ │ ├── README.md
│ │ ├── __init__.py
│ │ ├── arguments.py
│ │ ├── data.py
│ │ ├── modeling.py
│ │ ├── run.py
│ │ ├── split_data_by_length.py
│ │ └── trainer.py
│ ├── BGE_Reasoner/
│ │ └── README.md
│ ├── BGE_VL/
│ │ ├── LICENSE
│ │ ├── README.md
│ │ ├── eval/
│ │ │ ├── data/
│ │ │ │ ├── circo_corpus.jsonl
│ │ │ │ ├── circo_query.jsonl
│ │ │ │ ├── fashioniq_dress_corpus.jsonl
│ │ │ │ ├── fashioniq_dress_query_val.jsonl
│ │ │ │ ├── fashioniq_shirt_corpus.jsonl
│ │ │ │ ├── fashioniq_shirt_query_val.jsonl
│ │ │ │ ├── fashioniq_toptee_corpus.jsonl
│ │ │ │ └── fashioniq_toptee_query_val.jsonl
│ │ │ ├── eval_Circo.py
│ │ │ ├── eval_fashioniq.py
│ │ │ ├── flag_dataset.py
│ │ │ ├── flag_mmret.py
│ │ │ └── results/
│ │ │ ├── mmret_base_circo.json
│ │ │ └── mmret_large_circo.json
│ │ ├── modeling_MMRet_CLIP.py
│ │ └── retrieval_demo.ipynb
│ ├── BGE_VL_Screenshot/
│ │ └── README.md
│ ├── C_MTEB/
│ │ ├── C_MTEB/
│ │ │ ├── __init__.py
│ │ │ └── tasks/
│ │ │ ├── Classification.py
│ │ │ ├── Clustering.py
│ │ │ ├── MultiLongDocRetrieval.py
│ │ │ ├── PairClassification.py
│ │ │ ├── Reranking.py
│ │ │ ├── Retrieval.py
│ │ │ ├── STS.py
│ │ │ └── __init__.py
│ │ ├── MKQA/
│ │ │ ├── README.md
│ │ │ ├── dense_retrieval/
│ │ │ │ ├── step0-generate_embedding.py
│ │ │ │ ├── step1-search_results.py
│ │ │ │ └── step2-eval_dense_mkqa.py
│ │ │ ├── hybrid_retrieval/
│ │ │ │ ├── step0-hybrid_search_results.py
│ │ │ │ └── step1-eval_hybrid_mkqa.py
│ │ │ ├── multi_vector_rerank/
│ │ │ │ ├── hybrid_all_results.py
│ │ │ │ ├── step0-rerank_results.py
│ │ │ │ └── step1-eval_rerank_mkqa.py
│ │ │ ├── sparse_retrieval/
│ │ │ │ ├── bm25_baseline.py
│ │ │ │ ├── bm25_baseline_same_tokenizer.py
│ │ │ │ ├── step0-encode_query-and-corpus.py
│ │ │ │ ├── step1-search_results.py
│ │ │ │ └── step2-eval_sparse_mkqa.py
│ │ │ └── utils/
│ │ │ ├── __init__.py
│ │ │ ├── evaluation.py
│ │ │ └── normalize_text.py
│ │ ├── MLDR/
│ │ │ ├── README.md
│ │ │ ├── dense_retrieval/
│ │ │ │ ├── step0-generate_embedding.py
│ │ │ │ ├── step1-search_results.py
│ │ │ │ └── step2-eval_dense_mldr.py
│ │ │ ├── hybrid_retrieval/
│ │ │ │ ├── step0-hybrid_search_results.py
│ │ │ │ └── step1-eval_hybrid_mldr.py
│ │ │ ├── mteb_dense_eval/
│ │ │ │ ├── eval_MLDR.py
│ │ │ │ └── flag_dres_model.py
│ │ │ ├── multi_vector_rerank/
│ │ │ │ ├── hybrid_all_results.py
│ │ │ │ ├── step0-rerank_results.py
│ │ │ │ └── step1-eval_rerank_mldr.py
│ │ │ └── sparse_retrieval/
│ │ │ ├── bm25_baseline.py
│ │ │ ├── bm25_baseline_same_tokenizer.py
│ │ │ ├── step0-encode_query-and-corpus.py
│ │ │ ├── step1-search_results.py
│ │ │ └── step2-eval_sparse_mldr.py
│ │ ├── README.md
│ │ ├── eval_C-MTEB.py
│ │ ├── eval_MTEB.py
│ │ ├── eval_cross_encoder.py
│ │ ├── flag_dres_model.py
│ │ ├── setup.py
│ │ └── summarize_results.py
│ ├── LLARA/
│ │ ├── README.md
│ │ ├── data/
│ │ │ ├── finetune/
│ │ │ │ └── toy_finetune_data.jsonl
│ │ │ └── pretrain/
│ │ │ └── toy_pretrain_data.jsonl
│ │ ├── finetune/
│ │ │ ├── __init__.py
│ │ │ ├── arguments.py
│ │ │ ├── data.py
│ │ │ ├── load_model.py
│ │ │ ├── modeling.py
│ │ │ ├── run.py
│ │ │ └── trainer.py
│ │ ├── pretrain/
│ │ │ ├── __init__.py
│ │ │ ├── arguments.py
│ │ │ ├── data.py
│ │ │ ├── load_model.py
│ │ │ ├── modeling.py
│ │ │ ├── run.py
│ │ │ └── trainer.py
│ │ └── stage1.json
│ ├── LM_Cocktail/
│ │ ├── LM_Cocktail/
│ │ │ ├── __init__.py
│ │ │ ├── cocktail.py
│ │ │ └── utils.py
│ │ ├── README.md
│ │ ├── embedder_examples.json
│ │ ├── llm_examples.json
│ │ └── setup.py
│ ├── Long_LLM/
│ │ ├── activation_beacon/
│ │ │ ├── README.md
│ │ │ ├── data/
│ │ │ │ ├── config/
│ │ │ │ │ ├── code.json
│ │ │ │ │ ├── even.json
│ │ │ │ │ ├── fsdp-offload.yaml
│ │ │ │ │ ├── fsdp.yaml
│ │ │ │ │ ├── slimpajama.json
│ │ │ │ │ ├── zero3-infer-offload.yaml
│ │ │ │ │ └── zero3-infer.yaml
│ │ │ │ ├── deepspeed/
│ │ │ │ │ ├── stage2-offload.json
│ │ │ │ │ ├── stage2.json
│ │ │ │ │ ├── stage3-offload-optim.json
│ │ │ │ │ ├── stage3-offload.json
│ │ │ │ │ └── stage3.json
│ │ │ │ └── toy/
│ │ │ │ └── infbench.json
│ │ │ ├── examples/
│ │ │ │ ├── evaluation.md
│ │ │ │ └── training.md
│ │ │ ├── main/
│ │ │ │ ├── eval_generation.py
│ │ │ │ ├── eval_infbench.py
│ │ │ │ ├── eval_lm.py
│ │ │ │ ├── eval_longbench.py
│ │ │ │ ├── eval_mmlu.py
│ │ │ │ ├── eval_msc.py
│ │ │ │ ├── eval_multiturn.py
│ │ │ │ ├── eval_needle.py
│ │ │ │ ├── eval_passkey.py
│ │ │ │ ├── eval_topic.py
│ │ │ │ ├── infbench_utils.py
│ │ │ │ ├── longbench_utils.py
│ │ │ │ ├── pretrain_data.py
│ │ │ │ ├── train.py
│ │ │ │ └── vllm_symlink.py
│ │ │ └── src/
│ │ │ ├── __init__.py
│ │ │ ├── args.py
│ │ │ ├── chat.py
│ │ │ ├── data.py
│ │ │ ├── llama/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── configuration_llama.py
│ │ │ │ └── modeling_llama.py
│ │ │ ├── metrics.py
│ │ │ ├── mistral/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── configuration_mistral.py
│ │ │ │ └── modeling_mistral.py
│ │ │ ├── modeling_beacon.py
│ │ │ ├── modeling_utils.py
│ │ │ ├── qwen2/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── configuration_qwen2.py
│ │ │ │ └── modeling_qwen2.py
│ │ │ ├── trainer.py
│ │ │ ├── utils.py
│ │ │ └── vllm_utils.py
│ │ └── longllm_qlora/
│ │ ├── README.md
│ │ ├── data/
│ │ │ └── narrativeqa.json
│ │ ├── data_pipeline/
│ │ │ ├── README.md
│ │ │ ├── _openai.py
│ │ │ ├── data/
│ │ │ │ └── README.md
│ │ │ ├── prepare_bio_book.ipynb
│ │ │ ├── prepare_multi_details_book.ipynb
│ │ │ ├── prepare_multi_details_paper_long.ipynb
│ │ │ ├── prepare_one_detail_book.ipynb
│ │ │ ├── prepare_one_detail_paper_long.ipynb
│ │ │ └── raw_data/
│ │ │ └── README.md
│ │ ├── main/
│ │ │ ├── eval_generation.py
│ │ │ ├── eval_infbench.py
│ │ │ ├── eval_lm.py
│ │ │ ├── eval_longbench.py
│ │ │ ├── eval_mmlu.py
│ │ │ ├── eval_needle.py
│ │ │ ├── eval_passkey.py
│ │ │ ├── eval_topic.py
│ │ │ ├── infbench_utils.py
│ │ │ ├── longbench_utils.py
│ │ │ └── train.py
│ │ └── src/
│ │ ├── __init__.py
│ │ ├── args.py
│ │ ├── chat.py
│ │ ├── data.py
│ │ ├── metrics.py
│ │ ├── modeling_utils.py
│ │ ├── trainer.py
│ │ └── utils.py
│ ├── MLVU/
│ │ ├── README.md
│ │ ├── data/
│ │ │ ├── 1_plotQA.json
│ │ │ ├── 2_needle.json
│ │ │ ├── 3_ego.json
│ │ │ ├── 4_count.json
│ │ │ ├── 5_order.json
│ │ │ ├── 6_anomaly_reco.json
│ │ │ ├── 7_topic_reasoning.json
│ │ │ ├── 8_sub_scene.json
│ │ │ └── 9_summary.json
│ │ └── evaluation/
│ │ ├── README.md
│ │ ├── generation_evaluation/
│ │ │ ├── calculate.py
│ │ │ ├── calculate_sum.py
│ │ │ ├── evaluate_ssc.py
│ │ │ ├── evaluate_summary.py
│ │ │ └── open_bench.py
│ │ ├── models/
│ │ │ ├── videochat2/
│ │ │ │ ├── choice_bench.py
│ │ │ │ └── open_bench.py
│ │ │ └── videollava/
│ │ │ ├── choice_bench.py
│ │ │ └── open_bench.py
│ │ └── multiple_choice_evaluation/
│ │ └── choice_bench.py
│ ├── Matroyshka_reranker/
│ │ ├── README.md
│ │ ├── finetune/
│ │ │ ├── compensation/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── arguments.py
│ │ │ │ ├── data.py
│ │ │ │ ├── load_model.py
│ │ │ │ ├── mistral_config.py
│ │ │ │ ├── mistral_model.py
│ │ │ │ ├── modeling.py
│ │ │ │ ├── run.py
│ │ │ │ ├── stage1.json
│ │ │ │ └── trainer.py
│ │ │ └── self_distillation/
│ │ │ ├── __init__.py
│ │ │ ├── arguments.py
│ │ │ ├── data.py
│ │ │ ├── load_model.py
│ │ │ ├── mistral_config.py
│ │ │ ├── mistral_model.py
│ │ │ ├── modeling.py
│ │ │ ├── run.py
│ │ │ ├── stage1.json
│ │ │ └── trainer.py
│ │ ├── inference/
│ │ │ ├── __init__.py
│ │ │ ├── mistral_config.py
│ │ │ ├── mistral_model.py
│ │ │ └── rank_model.py
│ │ └── requirements.txt
│ ├── README.md
│ ├── Reinforced_IR/
│ │ ├── README.md
│ │ ├── data_generation/
│ │ │ ├── agent/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── gpt.py
│ │ │ │ ├── vllm.py
│ │ │ │ └── vllm_instruct.py
│ │ │ ├── generate_generator_data.py
│ │ │ ├── generate_retriever_data.py
│ │ │ ├── generate_retriever_distill_data.py
│ │ │ ├── generate_universal_query.py
│ │ │ ├── prompts/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── generate_prompts.py
│ │ │ │ ├── get_prompts.py
│ │ │ │ ├── hyde_prompts.py
│ │ │ │ ├── teacher_prompts.py
│ │ │ │ └── train_prompts.py
│ │ │ └── utils.py
│ │ ├── finetune/
│ │ │ ├── generator/
│ │ │ │ ├── save_tokenizer.py
│ │ │ │ └── update_file.py
│ │ │ ├── retriever/
│ │ │ │ ├── arguments.py
│ │ │ │ ├── dataset.py
│ │ │ │ ├── modeling.py
│ │ │ │ ├── run.py
│ │ │ │ ├── runner.py
│ │ │ │ └── trainer.py
│ │ │ └── stage1.json
│ │ ├── inference/
│ │ │ ├── agent/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── gpt.py
│ │ │ │ ├── vllm.py
│ │ │ │ └── vllm_instruct.py
│ │ │ ├── ir_model.py
│ │ │ ├── multi.py
│ │ │ └── test.py
│ │ └── requirements.txt
│ ├── baai_general_embedding/
│ │ ├── README.md
│ │ ├── __init__.py
│ │ ├── finetune/
│ │ │ ├── __init__.py
│ │ │ ├── arguments.py
│ │ │ ├── data.py
│ │ │ ├── eval_msmarco.py
│ │ │ ├── hn_mine.py
│ │ │ ├── modeling.py
│ │ │ ├── run.py
│ │ │ └── trainer.py
│ │ └── retromae_pretrain/
│ │ ├── __init__.py
│ │ ├── arguments.py
│ │ ├── data.py
│ │ ├── enhancedDecoder.py
│ │ ├── modeling.py
│ │ ├── run.py
│ │ ├── trainer.py
│ │ └── utils.py
│ ├── llm_dense_retriever/
│ │ ├── README.md
│ │ ├── examples/
│ │ │ └── bge-en-icl/
│ │ │ ├── AIR-Bench/
│ │ │ │ ├── long-doc/
│ │ │ │ │ ├── arxiv-gemini.jsonl
│ │ │ │ │ ├── arxiv-gpt3.jsonl
│ │ │ │ │ ├── arxiv-llama2.jsonl
│ │ │ │ │ ├── arxiv-llm-survey.jsonl
│ │ │ │ │ ├── book-a-brief-history-of-time_stephen-hawking.jsonl
│ │ │ │ │ ├── book-origin-of-species_darwin.jsonl
│ │ │ │ │ ├── healthcare-pubmed_100k-200k_1.jsonl
│ │ │ │ │ ├── healthcare-pubmed_100k-200k_2.jsonl
│ │ │ │ │ ├── healthcare-pubmed_100k-200k_3.jsonl
│ │ │ │ │ ├── healthcare-pubmed_30k-40k_10-merged.jsonl
│ │ │ │ │ ├── healthcare-pubmed_40k-50k_5-merged.jsonl
│ │ │ │ │ ├── law-lex_files_300k-400k.jsonl
│ │ │ │ │ ├── law-lex_files_400k-500k.jsonl
│ │ │ │ │ ├── law-lex_files_500k-600k.jsonl
│ │ │ │ │ └── law-lex_files_600k-700k.jsonl
│ │ │ │ └── qa/
│ │ │ │ ├── arxiv.jsonl
│ │ │ │ ├── finance.jsonl
│ │ │ │ ├── healthcare.jsonl
│ │ │ │ ├── law.jsonl
│ │ │ │ ├── msmarco.jsonl
│ │ │ │ ├── news.jsonl
│ │ │ │ ├── web.jsonl
│ │ │ │ └── wiki.jsonl
│ │ │ └── MTEB/
│ │ │ ├── AmazonCounterfactualClassification.json
│ │ │ ├── AmazonPolarityClassification.json
│ │ │ ├── AmazonReviewsClassification.json
│ │ │ ├── ArguAna.json
│ │ │ ├── ArxivClusteringP2P.json
│ │ │ ├── ArxivClusteringS2S.json
│ │ │ ├── AskUbuntuDupQuestions.json
│ │ │ ├── BIOSSES.json
│ │ │ ├── Banking77Classification.json
│ │ │ ├── BiorxivClusteringP2P.json
│ │ │ ├── BiorxivClusteringS2S.json
│ │ │ ├── CQADupstackRetrieval.json
│ │ │ ├── ClimateFEVER.json
│ │ │ ├── DBPedia.json
│ │ │ ├── EmotionClassification.json
│ │ │ ├── FEVER.json
│ │ │ ├── FiQA2018.json
│ │ │ ├── HotpotQA.json
│ │ │ ├── ImdbClassification.json
│ │ │ ├── MSMARCO.json
│ │ │ ├── MTOPDomainClassification.json
│ │ │ ├── MTOPIntentClassification.json
│ │ │ ├── MassiveIntentClassification.json
│ │ │ ├── MassiveScenarioClassification.json
│ │ │ ├── MedrxivClusteringP2P.json
│ │ │ ├── MedrxivClusteringS2S.json
│ │ │ ├── MindSmallReranking.json
│ │ │ ├── NFCorpus.json
│ │ │ ├── NQ.json
│ │ │ ├── QuoraRetrieval.json
│ │ │ ├── RedditClustering.json
│ │ │ ├── RedditClusteringP2P.json
│ │ │ ├── SCIDOCS.json
│ │ │ ├── SICK-R.json
│ │ │ ├── STS12.json
│ │ │ ├── STS13.json
│ │ │ ├── STS14.json
│ │ │ ├── STS15.json
│ │ │ ├── STS16.json
│ │ │ ├── STS17.json
│ │ │ ├── STS22.json
│ │ │ ├── STSBenchmark.json
│ │ │ ├── SciDocsRR.json
│ │ │ ├── SciFact.json
│ │ │ ├── SprintDuplicateQuestions.json
│ │ │ ├── StackExchangeClustering.json
│ │ │ ├── StackExchangeClusteringP2P.json
│ │ │ ├── StackOverflowDupQuestions.json
│ │ │ ├── SummEval.json
│ │ │ ├── TRECCOVID.json
│ │ │ ├── Touche2020.json
│ │ │ ├── ToxicConversationsClassification.json
│ │ │ ├── TweetSentimentExtractionClassification.json
│ │ │ ├── TwentyNewsgroupsClustering.json
│ │ │ ├── TwitterSemEval2015.json
│ │ │ └── TwitterURLCorpus.json
│ │ └── finetune/
│ │ ├── arguments.py
│ │ ├── data.py
│ │ ├── load_model.py
│ │ ├── modeling.py
│ │ ├── run.py
│ │ └── trainer.py
│ ├── llm_embedder/
│ │ ├── README.md
│ │ ├── data/
│ │ │ ├── deepspeed/
│ │ │ │ ├── stage0.json
│ │ │ │ ├── stage2-offload.json
│ │ │ │ ├── stage2.json
│ │ │ │ ├── stage3-offload-all.json
│ │ │ │ ├── stage3-offload-optim.json
│ │ │ │ └── stage3.json
│ │ │ └── toy/
│ │ │ ├── chat.json
│ │ │ ├── convsearch.json
│ │ │ ├── icl.json
│ │ │ ├── lrlm.json
│ │ │ ├── qa.json
│ │ │ └── tool.json
│ │ ├── docs/
│ │ │ ├── evaluation.md
│ │ │ └── fine-tune.md
│ │ ├── environment.yaml
│ │ ├── evaluation/
│ │ │ ├── __init__.py
│ │ │ ├── eval_icl.py
│ │ │ ├── eval_lrlm.py
│ │ │ ├── eval_mmlu.py
│ │ │ ├── eval_msc.py
│ │ │ ├── eval_popqa.py
│ │ │ ├── eval_qa.py
│ │ │ ├── eval_qrecc.py
│ │ │ ├── eval_retrieval.py
│ │ │ ├── eval_tool.py
│ │ │ └── icl_utils.py
│ │ ├── run_dense.py
│ │ ├── run_lm_score.py
│ │ ├── run_ranker.py
│ │ ├── scripts/
│ │ │ ├── llm-embedder.sh
│ │ │ └── ours2st.py
│ │ └── src/
│ │ ├── __init__.py
│ │ ├── lm/
│ │ │ ├── __init__.py
│ │ │ ├── args.py
│ │ │ ├── modeling_lm.py
│ │ │ └── modeling_srlm.py
│ │ ├── retrieval/
│ │ │ ├── __init__.py
│ │ │ ├── args.py
│ │ │ ├── data.py
│ │ │ ├── evalnq.py
│ │ │ ├── metrics.py
│ │ │ ├── modeling_bm25.py
│ │ │ ├── modeling_dense.py
│ │ │ ├── modeling_ranker.py
│ │ │ ├── modeling_unified.py
│ │ │ └── trainer.py
│ │ └── utils/
│ │ ├── __init__.py
│ │ ├── llama_patch.py
│ │ └── util.py
│ ├── llm_reranker/
│ │ ├── README.md
│ │ ├── __init__.py
│ │ ├── evaluate.py
│ │ ├── finetune_for_instruction/
│ │ │ ├── __init__.py
│ │ │ ├── arguments.py
│ │ │ ├── data.py
│ │ │ ├── load_model.py
│ │ │ ├── modeling.py
│ │ │ ├── run.py
│ │ │ └── trainer.py
│ │ ├── finetune_for_layerwise/
│ │ │ ├── __init__.py
│ │ │ ├── arguments.py
│ │ │ ├── configuration_minicpm_reranker.py
│ │ │ ├── data.py
│ │ │ ├── load_model.py
│ │ │ ├── modeling.py
│ │ │ ├── modeling_minicpm_reranker.py
│ │ │ ├── run.py
│ │ │ └── trainer.py
│ │ ├── merge/
│ │ │ ├── __init__.py
│ │ │ ├── configuration_minicpm_reranker.py
│ │ │ ├── merge_base_model.py
│ │ │ ├── merge_layerwise_model_from_finetuned_model.py
│ │ │ ├── merge_layerwise_model_from_raw_model.py
│ │ │ └── modeling_minicpm_reranker.py
│ │ ├── stage1.json
│ │ └── toy_finetune_data.jsonl
│ ├── old-examples/
│ │ ├── finetune/
│ │ │ ├── README.md
│ │ │ ├── ds_config.json
│ │ │ ├── toy_evaluation_data/
│ │ │ │ ├── toy_corpus.json
│ │ │ │ └── toy_query.json
│ │ │ └── toy_finetune_data.jsonl
│ │ ├── pretrain/
│ │ │ ├── README.md
│ │ │ ├── retromae_pretrain/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── arguments.py
│ │ │ │ ├── data.py
│ │ │ │ ├── enhancedDecoder.py
│ │ │ │ ├── modeling.py
│ │ │ │ ├── run.py
│ │ │ │ ├── trainer.py
│ │ │ │ └── utils.py
│ │ │ └── toy_pretrain_data.jsonl
│ │ ├── reranker/
│ │ │ ├── README.md
│ │ │ ├── ds_config.json
│ │ │ └── toy_finetune_data.jsonl
│ │ ├── search_demo/
│ │ │ ├── __init__.py
│ │ │ ├── arguments.py
│ │ │ ├── pre_process.py
│ │ │ ├── readme.md
│ │ │ ├── requirements.txt
│ │ │ ├── run.py
│ │ │ └── tool.py
│ │ └── unified_finetune/
│ │ ├── README.md
│ │ ├── toy_train_data/
│ │ │ ├── toy_train_data1.jsonl
│ │ │ └── toy_train_data2.jsonl
│ │ └── unified_finetune_bge-m3_exmaple.sh
│ ├── reranker/
│ │ ├── README.md
│ │ ├── __init__.py
│ │ ├── arguments.py
│ │ ├── data.py
│ │ ├── modeling.py
│ │ ├── run.py
│ │ └── trainer.py
│ └── visual_bge/
│ ├── README.md
│ ├── __init__.py
│ ├── setup.py
│ └── visual_bge/
│ ├── eva_clip/
│ │ ├── __init__.py
│ │ ├── constants.py
│ │ ├── eva_vit_model.py
│ │ ├── factory.py
│ │ ├── hf_configs.py
│ │ ├── hf_model.py
│ │ ├── loss.py
│ │ ├── model.py
│ │ ├── model_configs/
│ │ │ ├── EVA01-CLIP-B-16.json
│ │ │ ├── EVA01-CLIP-g-14-plus.json
│ │ │ ├── EVA01-CLIP-g-14.json
│ │ │ ├── EVA02-CLIP-B-16.json
│ │ │ ├── EVA02-CLIP-L-14-336.json
│ │ │ ├── EVA02-CLIP-L-14.json
│ │ │ ├── EVA02-CLIP-bigE-14-plus.json
│ │ │ └── EVA02-CLIP-bigE-14.json
│ │ ├── modified_resnet.py
│ │ ├── openai.py
│ │ ├── pretrained.py
│ │ ├── rope.py
│ │ ├── timm_model.py
│ │ ├── tokenizer.py
│ │ ├── transform.py
│ │ ├── transformer.py
│ │ └── utils.py
│ └── modeling.py
├── scripts/
│ ├── README.md
│ ├── add_reranker_score.py
│ ├── hn_mine.py
│ └── split_data_by_length.py
├── setup.py
└── tests/
├── README.md
├── conftest.py
├── test_imports_v5.py
├── test_infer_embedder_basic.py
└── test_infer_reranker_basic.py
Showing preview only (268K chars total). Download the full file or copy to clipboard to get everything.
SYMBOL INDEX (3104 symbols across 447 files)
FILE: FlagEmbedding/abc/evaluation/arguments.py
class AbsEvalArgs (line 10) | class AbsEvalArgs:
class AbsEvalModelArgs (line 82) | class AbsEvalModelArgs:
method __post_init__ (line 181) | def __post_init__(self):
FILE: FlagEmbedding/abc/evaluation/data_loader.py
class AbsEvalDataLoader (line 14) | class AbsEvalDataLoader(ABC):
method __init__ (line 25) | def __init__(
method available_dataset_names (line 42) | def available_dataset_names(self) -> List[str]:
method available_splits (line 49) | def available_splits(self, dataset_name: Optional[str] = None) -> List...
method check_dataset_names (line 55) | def check_dataset_names(self, dataset_names: Union[str, List[str]]) ->...
method check_splits (line 76) | def check_splits(self, splits: Union[str, List[str]], dataset_name: Op...
method load_corpus (line 97) | def load_corpus(self, dataset_name: Optional[str] = None) -> datasets....
method load_qrels (line 115) | def load_qrels(self, dataset_name: Optional[str] = None, split: str = ...
method load_queries (line 143) | def load_queries(self, dataset_name: Optional[str] = None, split: str ...
method _load_remote_corpus (line 171) | def _load_remote_corpus(
method _load_remote_qrels (line 190) | def _load_remote_qrels(
method _load_remote_queries (line 211) | def _load_remote_queries(
method _load_local_corpus (line 232) | def _load_local_corpus(self, save_dir: str, dataset_name: Optional[str...
method _load_local_qrels (line 255) | def _load_local_qrels(self, save_dir: str, dataset_name: Optional[str]...
method _load_local_queries (line 290) | def _load_local_queries(self, save_dir: str, dataset_name: Optional[st...
method _download_file (line 319) | def _download_file(self, download_url: str, save_dir: str):
method _get_fpath_size (line 350) | def _get_fpath_size(self, fpath: str) -> int:
method _download_gz_file (line 369) | def _download_gz_file(self, download_url: str, save_dir: str):
method _download_zip_file (line 395) | def _download_zip_file(self, download_url: str, save_dir: str):
FILE: FlagEmbedding/abc/evaluation/evaluator.py
class AbsEvaluator (line 18) | class AbsEvaluator:
method __init__ (line 27) | def __init__(
method check_data_info (line 37) | def check_data_info(
method get_corpus_embd_save_dir (line 80) | def get_corpus_embd_save_dir(
method __call__ (line 102) | def __call__(
method save_search_results (line 267) | def save_search_results(
method load_search_results (line 302) | def load_search_results(input_path: str):
method compute_metrics (line 318) | def compute_metrics(
method evaluate_results (line 358) | def evaluate_results(
method output_eval_results_to_json (line 403) | def output_eval_results_to_json(eval_results_dict: dict, output_path: ...
method get_results_df (line 417) | def get_results_df(metric: str, eval_results_dict: dict):
method output_eval_results_to_markdown (line 467) | def output_eval_results_to_markdown(eval_results_dict: dict, output_pa...
FILE: FlagEmbedding/abc/evaluation/runner.py
class AbsEvalRunner (line 16) | class AbsEvalRunner:
method __init__ (line 24) | def __init__(
method get_models (line 37) | def get_models(model_args: AbsEvalModelArgs) -> Tuple[AbsEmbedder, Uni...
method load_retriever_and_reranker (line 92) | def load_retriever_and_reranker(self) -> Tuple[EvalDenseRetriever, Uni...
method load_data_loader (line 109) | def load_data_loader(self) -> AbsEvalDataLoader:
method load_evaluator (line 124) | def load_evaluator(self) -> AbsEvaluator:
method evaluate_metrics (line 138) | def evaluate_metrics(
method run (line 183) | def run(self):
FILE: FlagEmbedding/abc/evaluation/searcher.py
class EvalRetriever (line 18) | class EvalRetriever(ABC):
method __init__ (line 22) | def __init__(self, embedder: AbsEmbedder, search_top_k: int = 1000, ov...
method __str__ (line 27) | def __str__(self) -> str:
method stop_multi_process_pool (line 33) | def stop_multi_process_pool(self):
method __call__ (line 43) | def __call__(
class EvalDenseRetriever (line 71) | class EvalDenseRetriever(EvalRetriever):
method __call__ (line 75) | def __call__(
class EvalReranker (line 160) | class EvalReranker:
method __init__ (line 164) | def __init__(self, reranker: AbsReranker, rerank_top_k: int = 100):
method __str__ (line 168) | def __str__(self) -> str:
method stop_multi_process_pool (line 174) | def stop_multi_process_pool(self):
method __call__ (line 183) | def __call__(
FILE: FlagEmbedding/abc/evaluation/utils.py
function evaluate_mrr (line 14) | def evaluate_mrr(
function evaluate_recall_cap (line 56) | def evaluate_recall_cap(
function evaluate_metrics (line 95) | def evaluate_metrics(
function index (line 150) | def index(
function search (line 192) | def search(
FILE: FlagEmbedding/abc/finetune/embedder/AbsArguments.py
class AbsEmbedderModelArguments (line 9) | class AbsEmbedderModelArguments:
class AbsEmbedderDataArguments (line 44) | class AbsEmbedderDataArguments:
method __post_init__ (line 120) | def __post_init__(self):
class AbsEmbedderTrainingArguments (line 134) | class AbsEmbedderTrainingArguments(TrainingArguments):
FILE: FlagEmbedding/abc/finetune/embedder/AbsDataset.py
class AbsEmbedderTrainDataset (line 23) | class AbsEmbedderTrainDataset(Dataset):
method __init__ (line 30) | def __init__(
method _load_dataset (line 54) | def _load_dataset(self, file_path: str):
method _shuffle_text (line 83) | def _shuffle_text(self, text):
method __len__ (line 102) | def __len__(self):
method __getitem__ (line 105) | def __getitem__(self, item):
class AbsEmbedderCollator (line 154) | class AbsEmbedderCollator(DataCollatorWithPadding):
method __call__ (line 162) | def __call__(self, features):
class AbsEmbedderSameDatasetTrainDataset (line 245) | class AbsEmbedderSameDatasetTrainDataset(AbsEmbedderTrainDataset):
method __init__ (line 256) | def __init__(
method _load_dataset (line 337) | def _load_dataset(self, file_path: str):
method _get_file_batch_size (line 361) | def _get_file_batch_size(temp_dataset: datasets.Dataset, default_batch...
method refresh_epoch (line 379) | def refresh_epoch(self):
method __len__ (line 403) | def __len__(self):
method __getitem__ (line 406) | def __getitem__(self, _):
method _get_train_group_size (line 415) | def _get_train_group_size(self, batch_raw_data):
method _create_batch_data (line 441) | def _create_batch_data(self, batch_raw_data):
class AbsEmbedderSameDatasetCollator (line 514) | class AbsEmbedderSameDatasetCollator(DataCollatorWithPadding):
method __call__ (line 527) | def __call__(self, features):
class EmbedderTrainerCallbackForDataRefresh (line 607) | class EmbedderTrainerCallbackForDataRefresh(TrainerCallback):
method __init__ (line 611) | def __init__(self, train_dataset: AbsEmbedderSameDatasetTrainDataset):
method on_epoch_end (line 614) | def on_epoch_end(
FILE: FlagEmbedding/abc/finetune/embedder/AbsModeling.py
class EmbedderOutput (line 17) | class EmbedderOutput(ModelOutput):
class AbsEmbedderModel (line 27) | class AbsEmbedderModel(ABC, nn.Module):
method __init__ (line 39) | def __init__(
method encode (line 64) | def encode(self, features):
method compute_loss (line 73) | def compute_loss(self, scores, target):
method compute_score (line 83) | def compute_score(self, q_reps, p_reps):
method save (line 93) | def save(self, output_dir: str):
method get_local_score (line 101) | def get_local_score(self, q_reps, p_reps, all_scores):
method compute_local_score (line 121) | def compute_local_score(self, q_reps, p_reps, compute_score_func=None,...
method _compute_no_in_batch_neg_loss (line 140) | def _compute_no_in_batch_neg_loss(self, q_reps, p_reps, teacher_target...
method _compute_in_batch_neg_loss (line 162) | def _compute_in_batch_neg_loss(self, q_reps, p_reps, teacher_targets=N...
method _compute_cross_device_neg_loss (line 194) | def _compute_cross_device_neg_loss(self, q_reps, p_reps, teacher_targe...
method forward (line 234) | def forward(
method distill_loss (line 280) | def distill_loss(kd_loss_type, teacher_targets, student_scores, group_...
method _dist_gather_tensor (line 320) | def _dist_gather_tensor(self, t: Optional[torch.Tensor]):
FILE: FlagEmbedding/abc/finetune/embedder/AbsRunner.py
class AbsEmbedderRunner (line 24) | class AbsEmbedderRunner(ABC):
method __init__ (line 32) | def __init__(
method load_tokenizer_and_model (line 79) | def load_tokenizer_and_model(self) -> Tuple[PreTrainedTokenizer, AbsEm...
method load_trainer (line 88) | def load_trainer(self) -> AbsEmbedderTrainer:
method load_train_dataset (line 96) | def load_train_dataset(self) -> AbsEmbedderTrainDataset:
method load_data_collator (line 120) | def load_data_collator(self) -> AbsEmbedderCollator:
method run (line 142) | def run(self):
FILE: FlagEmbedding/abc/finetune/embedder/AbsTrainer.py
class AbsEmbedderTrainer (line 9) | class AbsEmbedderTrainer(ABC, Trainer):
method _save (line 14) | def _save(self, output_dir: Optional[str] = None, state_dict=None):
method compute_loss (line 17) | def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
FILE: FlagEmbedding/abc/finetune/reranker/AbsArguments.py
class AbsRerankerModelArguments (line 9) | class AbsRerankerModelArguments:
class AbsRerankerDataArguments (line 52) | class AbsRerankerDataArguments:
method __post_init__ (line 126) | def __post_init__(self):
class AbsRerankerTrainingArguments (line 140) | class AbsRerankerTrainingArguments(TrainingArguments):
FILE: FlagEmbedding/abc/finetune/reranker/AbsDataset.py
class AbsRerankerTrainDataset (line 23) | class AbsRerankerTrainDataset(Dataset):
method __init__ (line 30) | def __init__(
method _load_dataset (line 55) | def _load_dataset(self, file_path: str):
method _shuffle_text (line 84) | def _shuffle_text(self, text):
method __len__ (line 103) | def __len__(self):
method create_one_example (line 106) | def create_one_example(self, qry_encoding: str, doc_encoding: str):
method __getitem__ (line 127) | def __getitem__(self, item):
class AbsRerankerCollator (line 180) | class AbsRerankerCollator(DataCollatorWithPadding):
method __call__ (line 187) | def __call__(self, features) -> List[BatchEncoding]:
class AbsLLMRerankerTrainDataset (line 211) | class AbsLLMRerankerTrainDataset(AbsRerankerTrainDataset):
method __init__ (line 218) | def __init__(
method __getitem__ (line 231) | def __getitem__(self, item) -> List[BatchEncoding]:
class AbsLLMRerankerCollator (line 341) | class AbsLLMRerankerCollator(DataCollatorForSeq2Seq):
method __call__ (line 350) | def __call__(self, features, return_tensors='pt'):
FILE: FlagEmbedding/abc/finetune/reranker/AbsModeling.py
class RerankerOutput (line 15) | class RerankerOutput(ModelOutput):
class AbsRerankerModel (line 20) | class AbsRerankerModel(ABC, nn.Module):
method __init__ (line 28) | def __init__(
method gradient_checkpointing_enable (line 47) | def gradient_checkpointing_enable(self, **kwargs):
method enable_input_require_grads (line 53) | def enable_input_require_grads(self, **kwargs):
method encode (line 60) | def encode(self, features):
method forward (line 68) | def forward(self, pair: Union[Dict[str, Tensor], List[Dict[str, Tensor...
method compute_loss (line 101) | def compute_loss(self, scores, target):
method save (line 113) | def save(self, output_dir: str):
method save_pretrained (line 127) | def save_pretrained(self, *args, **kwargs):
FILE: FlagEmbedding/abc/finetune/reranker/AbsRunner.py
class AbsRerankerRunner (line 24) | class AbsRerankerRunner(ABC):
method __init__ (line 32) | def __init__(
method load_tokenizer_and_model (line 79) | def load_tokenizer_and_model(self) -> Tuple[PreTrainedTokenizer, AbsRe...
method load_trainer (line 88) | def load_trainer(self) -> AbsRerankerTrainer:
method load_train_dataset (line 96) | def load_train_dataset(self) -> AbsRerankerTrainDataset:
method load_data_collator (line 114) | def load_data_collator(self) -> AbsRerankerCollator:
method run (line 135) | def run(self):
FILE: FlagEmbedding/abc/finetune/reranker/AbsTrainer.py
class AbsRerankerTrainer (line 9) | class AbsRerankerTrainer(ABC, Trainer):
method _save (line 14) | def _save(self, output_dir: Optional[str] = None, state_dict=None):
method compute_loss (line 17) | def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
FILE: FlagEmbedding/abc/inference/AbsEmbedder.py
class AbsEmbedder (line 24) | class AbsEmbedder(ABC):
method __init__ (line 47) | def __init__(
method stop_self_pool (line 84) | def stop_self_pool(self):
method get_target_devices (line 97) | def get_target_devices(devices: Union[str, int, List[str], List[int]])...
method get_detailed_instruct (line 144) | def get_detailed_instruct(instruction_format: str, instruction: str, s...
method encode_queries (line 159) | def encode_queries(
method encode_corpus (line 193) | def encode_corpus(
method encode (line 230) | def encode(
method __del__ (line 287) | def __del__(self):
method encode_single_device (line 291) | def encode_single_device(
method start_multi_process_pool (line 306) | def start_multi_process_pool(
method _encode_multi_process_worker (line 346) | def _encode_multi_process_worker(
method stop_multi_process_pool (line 369) | def stop_multi_process_pool(pool: Dict[Literal["input", "output", "pro...
method encode_multi_process (line 391) | def encode_multi_process(
method _concatenate_results_from_multi_process (line 424) | def _concatenate_results_from_multi_process(self, results_list: List[U...
FILE: FlagEmbedding/abc/inference/AbsReranker.py
class AbsReranker (line 23) | class AbsReranker(ABC):
method __init__ (line 47) | def __init__(
method stop_self_pool (line 86) | def stop_self_pool(self):
method get_target_devices (line 99) | def get_target_devices(devices: Union[str, int, List[str], List[int]])...
method get_detailed_instruct (line 142) | def get_detailed_instruct(self, instruction_format: str, instruction: ...
method get_detailed_inputs (line 157) | def get_detailed_inputs(self, sentence_pairs: Union[str, List[str]]):
method compute_score (line 200) | def compute_score(
method __del__ (line 231) | def __del__(self):
method compute_score_single_gpu (line 235) | def compute_score_single_gpu(
method start_multi_process_pool (line 251) | def start_multi_process_pool(self) -> Dict[Literal["input", "output", ...
method encode_multi_process (line 284) | def encode_multi_process(
method _encode_multi_process_worker (line 319) | def _encode_multi_process_worker(
method stop_multi_process_pool (line 342) | def stop_multi_process_pool(pool: Dict[Literal["input", "output", "pro...
FILE: FlagEmbedding/evaluation/air_bench/__main__.py
function main (line 9) | def main():
FILE: FlagEmbedding/evaluation/air_bench/arguments.py
class AIRBenchEvalModelArgs (line 7) | class AIRBenchEvalModelArgs:
method __post_init__ (line 106) | def __post_init__(self):
FILE: FlagEmbedding/evaluation/air_bench/runner.py
class AIRBenchEvalRunner (line 12) | class AIRBenchEvalRunner:
method __init__ (line 20) | def __init__(
method load_retriever_and_reranker (line 31) | def load_retriever_and_reranker(self) -> Tuple[EvalDenseRetriever, Uni...
method run (line 48) | def run(self):
FILE: FlagEmbedding/evaluation/beir/__main__.py
function main (line 9) | def main():
FILE: FlagEmbedding/evaluation/beir/arguments.py
class BEIREvalArgs (line 7) | class BEIREvalArgs(AbsEvalArgs):
FILE: FlagEmbedding/evaluation/beir/data_loader.py
class BEIREvalDataLoader (line 15) | class BEIREvalDataLoader(AbsEvalDataLoader):
method available_dataset_names (line 19) | def available_dataset_names(self) -> List[str]:
method available_sub_dataset_names (line 28) | def available_sub_dataset_names(self, dataset_name: Optional[str] = No...
method available_splits (line 42) | def available_splits(self, dataset_name: Optional[str] = None) -> List...
method _load_remote_corpus (line 56) | def _load_remote_corpus(
method _load_remote_qrels (line 127) | def _load_remote_qrels(
method _load_remote_queries (line 212) | def _load_remote_queries(
method load_corpus (line 291) | def load_corpus(self, dataset_name: Optional[str] = None, sub_dataset_...
method load_qrels (line 310) | def load_qrels(self, dataset_name: Optional[str] = None, sub_dataset_n...
method load_queries (line 339) | def load_queries(self, dataset_name: Optional[str] = None, sub_dataset...
method _load_local_corpus (line 368) | def _load_local_corpus(self, save_dir: str, dataset_name: Optional[str...
method _load_local_qrels (line 397) | def _load_local_qrels(self, save_dir: str, dataset_name: Optional[str]...
method _load_local_queries (line 438) | def _load_local_queries(self, save_dir: str, dataset_name: Optional[st...
FILE: FlagEmbedding/evaluation/beir/evaluator.py
class BEIREvaluator (line 12) | class BEIREvaluator(AbsEvaluator):
method check_data_info (line 16) | def check_data_info(
method __call__ (line 66) | def __call__(
method evaluate_results (line 351) | def evaluate_results(
method save_search_results (line 418) | def save_search_results(
FILE: FlagEmbedding/evaluation/beir/runner.py
class BEIREvalRunner (line 11) | class BEIREvalRunner(AbsEvalRunner):
method run (line 15) | def run(self):
method load_data_loader (line 63) | def load_data_loader(self) -> BEIREvalDataLoader:
method load_evaluator (line 78) | def load_evaluator(self) -> BEIREvaluator:
FILE: FlagEmbedding/evaluation/bright/__main__.py
function main (line 9) | def main():
FILE: FlagEmbedding/evaluation/bright/arguments.py
class BrightEvalArgs (line 7) | class BrightEvalArgs(AbsEvalArgs):
FILE: FlagEmbedding/evaluation/bright/data_loader.py
class BrightShortEvalDataLoader (line 14) | class BrightShortEvalDataLoader(AbsEvalDataLoader):
method available_dataset_names (line 18) | def available_dataset_names(self) -> List[str]:
method available_splits (line 34) | def available_splits(self, dataset_name: str) -> List[str]:
method _load_remote_corpus (line 51) | def _load_remote_corpus(
method _load_remote_qrels (line 89) | def _load_remote_qrels(
method _load_remote_queries (line 165) | def _load_remote_queries(
class BrightLongEvalDataLoader (line 209) | class BrightLongEvalDataLoader(AbsEvalDataLoader):
method available_dataset_names (line 213) | def available_dataset_names(self) -> List[str]:
method available_splits (line 227) | def available_splits(self, dataset_name: str) -> List[str]:
method _load_remote_corpus (line 244) | def _load_remote_corpus(
method _load_remote_qrels (line 282) | def _load_remote_qrels(
method _load_remote_queries (line 358) | def _load_remote_queries(
FILE: FlagEmbedding/evaluation/bright/runner.py
class BrightEvalRunner (line 14) | class BrightEvalRunner(AbsEvalRunner):
method __init__ (line 18) | def __init__(self, eval_args: BrightEvalArgs, model_args: BrightEvalMo...
method load_data_loader (line 23) | def load_data_loader(self) -> Union[BrightShortEvalDataLoader, BrightL...
method load_retriever_and_reranker (line 45) | def load_retriever_and_reranker(self) -> Tuple[BrightEvalDenseRetrieve...
method run (line 62) | def run(self):
FILE: FlagEmbedding/evaluation/bright/searcher.py
class BrightEvalDenseRetriever (line 15) | class BrightEvalDenseRetriever(EvalRetriever):
method __call__ (line 19) | def __call__(
FILE: FlagEmbedding/evaluation/custom/__main__.py
function main (line 9) | def main():
FILE: FlagEmbedding/evaluation/custom/data_loader.py
class CustomEvalDataLoader (line 10) | class CustomEvalDataLoader(AbsEvalDataLoader):
method available_dataset_names (line 11) | def available_dataset_names(self) -> List[str]:
method available_splits (line 14) | def available_splits(self, dataset_name: Optional[str] = None) -> List...
FILE: FlagEmbedding/evaluation/custom/runner.py
class CustomEvalRunner (line 6) | class CustomEvalRunner(AbsEvalRunner):
method load_data_loader (line 7) | def load_data_loader(self) -> CustomEvalDataLoader:
FILE: FlagEmbedding/evaluation/miracl/__main__.py
function main (line 9) | def main():
FILE: FlagEmbedding/evaluation/miracl/data_loader.py
class MIRACLEvalDataLoader (line 13) | class MIRACLEvalDataLoader(AbsEvalDataLoader):
method available_dataset_names (line 17) | def available_dataset_names(self) -> List[str]:
method available_splits (line 26) | def available_splits(self, dataset_name: str) -> List[str]:
method _load_remote_corpus (line 41) | def _load_remote_corpus(
method _load_remote_qrels (line 84) | def _load_remote_qrels(
method _load_remote_queries (line 135) | def _load_remote_queries(
FILE: FlagEmbedding/evaluation/miracl/runner.py
class MIRACLEvalRunner (line 6) | class MIRACLEvalRunner(AbsEvalRunner):
method load_data_loader (line 10) | def load_data_loader(self) -> MIRACLEvalDataLoader:
FILE: FlagEmbedding/evaluation/mkqa/__main__.py
function main (line 9) | def main():
FILE: FlagEmbedding/evaluation/mkqa/data_loader.py
class MKQAEvalDataLoader (line 15) | class MKQAEvalDataLoader(AbsEvalDataLoader):
method available_dataset_names (line 19) | def available_dataset_names(self) -> List[str]:
method available_splits (line 28) | def available_splits(self, dataset_name: Optional[str] = None) -> List...
method load_corpus (line 40) | def load_corpus(self, dataset_name: Optional[str] = None) -> datasets....
method _load_local_qrels (line 56) | def _load_local_qrels(self, save_dir: str, dataset_name: Optional[str]...
method _load_remote_corpus (line 89) | def _load_remote_corpus(
method _load_remote_qrels (line 132) | def _load_remote_qrels(
method _load_remote_queries (line 183) | def _load_remote_queries(
FILE: FlagEmbedding/evaluation/mkqa/evaluator.py
class MKQAEvaluator (line 10) | class MKQAEvaluator(AbsEvaluator):
method get_corpus_embd_save_dir (line 14) | def get_corpus_embd_save_dir(
method evaluate_results (line 35) | def evaluate_results(
method compute_metrics (line 87) | def compute_metrics(
FILE: FlagEmbedding/evaluation/mkqa/runner.py
class MKQAEvalRunner (line 7) | class MKQAEvalRunner(AbsEvalRunner):
method load_data_loader (line 11) | def load_data_loader(self) -> MKQAEvalDataLoader:
method load_evaluator (line 26) | def load_evaluator(self) -> MKQAEvaluator:
FILE: FlagEmbedding/evaluation/mkqa/utils/compute_metrics.py
class SimpleTokenizer (line 10) | class SimpleTokenizer:
method __init__ (line 14) | def __init__(self):
method tokenize (line 24) | def tokenize(self, text, uncased=False):
function _normalize (line 33) | def _normalize(text):
function has_answer (line 37) | def has_answer(answers, text, tokenizer) -> bool:
function check_answer (line 51) | def check_answer(example, tokenizer) -> List[bool]:
function evaluate_qa_recall (line 65) | def evaluate_qa_recall(ctxs, answers, k_values: Union[int, List[int]]=100):
FILE: FlagEmbedding/evaluation/mkqa/utils/normalize_text.py
function normalize_text (line 133) | def normalize_text(text: str):
FILE: FlagEmbedding/evaluation/mldr/__main__.py
function main (line 9) | def main():
FILE: FlagEmbedding/evaluation/mldr/data_loader.py
class MLDREvalDataLoader (line 13) | class MLDREvalDataLoader(AbsEvalDataLoader):
method available_dataset_names (line 17) | def available_dataset_names(self) -> List[str]:
method available_splits (line 26) | def available_splits(self, dataset_name: Optional[str] = None) -> List...
method _load_remote_corpus (line 38) | def _load_remote_corpus(
method _load_remote_qrels (line 77) | def _load_remote_qrels(
method _load_remote_queries (line 142) | def _load_remote_queries(
FILE: FlagEmbedding/evaluation/mldr/runner.py
class MLDREvalRunner (line 6) | class MLDREvalRunner(AbsEvalRunner):
method load_data_loader (line 10) | def load_data_loader(self) -> MLDREvalDataLoader:
FILE: FlagEmbedding/evaluation/msmarco/__main__.py
function main (line 9) | def main():
FILE: FlagEmbedding/evaluation/msmarco/data_loader.py
class MSMARCOEvalDataLoader (line 13) | class MSMARCOEvalDataLoader(AbsEvalDataLoader):
method available_dataset_names (line 17) | def available_dataset_names(self) -> List[str]:
method available_splits (line 26) | def available_splits(self, dataset_name: Optional[str] = None) -> List...
method _load_remote_corpus (line 38) | def _load_remote_corpus(
method _load_remote_qrels (line 104) | def _load_remote_qrels(
method _load_remote_queries (line 198) | def _load_remote_queries(
FILE: FlagEmbedding/evaluation/msmarco/runner.py
class MSMARCOEvalRunner (line 6) | class MSMARCOEvalRunner(AbsEvalRunner):
method load_data_loader (line 10) | def load_data_loader(self) -> MSMARCOEvalDataLoader:
FILE: FlagEmbedding/evaluation/mteb/__main__.py
function main (line 9) | def main():
FILE: FlagEmbedding/evaluation/mteb/arguments.py
class MTEBEvalArgs (line 8) | class MTEBEvalArgs(AbsEvalArgs):
FILE: FlagEmbedding/evaluation/mteb/prompts.py
function get_task_def_by_task_name_and_type (line 4) | def get_task_def_by_task_name_and_type(task_name: str, task_type: str) -...
FILE: FlagEmbedding/evaluation/mteb/runner.py
function ensure_dir (line 17) | def ensure_dir(file_path):
class MTEBEvalRunner (line 22) | class MTEBEvalRunner(AbsEvalRunner):
method __init__ (line 26) | def __init__(
method load_retriever_and_reranker (line 36) | def load_retriever_and_reranker(self) -> Tuple[MTEBEvalDenseRetriever,...
method read_results (line 52) | def read_results(self, output_folder, tasks):
method output_json (line 97) | def output_json(self, tasks_results, save_file):
method run (line 136) | def run(self):
FILE: FlagEmbedding/evaluation/mteb/searcher.py
class MTEBEvalDenseRetriever (line 7) | class MTEBEvalDenseRetriever(EvalDenseRetriever):
method __init__ (line 11) | def __init__(self, embedder, **kwargs):
method set_examples (line 14) | def set_examples(self, examples_for_task: Optional[List[dict]] = None):
method set_instruction (line 22) | def set_instruction(self, instruction: Optional[str] = None):
method get_instruction (line 30) | def get_instruction(self):
method set_normalize_embeddings (line 38) | def set_normalize_embeddings(self, normalize_embeddings: bool = True):
method stop_pool (line 46) | def stop_pool(self):
method encode_queries (line 53) | def encode_queries(self, queries: List[str], **kwargs):
method encode_corpus (line 67) | def encode_corpus(self, corpus: List[Dict[str, str]], **kwargs):
method encode (line 85) | def encode(self, corpus: List[Dict[str, str]], **kwargs):
class MTEBEvalReranker (line 103) | class MTEBEvalReranker(EvalReranker):
method __init__ (line 107) | def __init__(self, reranker, **kwargs):
FILE: FlagEmbedding/finetune/embedder/decoder_only/base/__main__.py
function main (line 11) | def main():
FILE: FlagEmbedding/finetune/embedder/decoder_only/base/arguments.py
function default_target_modules (line 7) | def default_target_modules() -> List[int]:
class DecoderOnlyEmbedderModelArguments (line 12) | class DecoderOnlyEmbedderModelArguments(AbsEmbedderModelArguments):
FILE: FlagEmbedding/finetune/embedder/decoder_only/base/load_model.py
function find_largest_checkpoint (line 13) | def find_largest_checkpoint(checkpoint_dir):
function get_model (line 38) | def get_model(model_args: DecoderOnlyEmbedderModelArguments, output_dir:...
function save_merged_model (line 123) | def save_merged_model(model_args: DecoderOnlyEmbedderModelArguments, out...
FILE: FlagEmbedding/finetune/embedder/decoder_only/base/modeling.py
class BiDecoderOnlyEmbedderModel (line 11) | class BiDecoderOnlyEmbedderModel(AbsEmbedderModel):
method __init__ (line 27) | def __init__(
method encode (line 50) | def encode(self, features):
method _sentence_embedding (line 94) | def _sentence_embedding(self, last_hidden_state, attention_mask):
method compute_score (line 129) | def compute_score(self, q_reps, p_reps):
method _compute_similarity (line 143) | def _compute_similarity(self, q_reps, p_reps):
method compute_loss (line 157) | def compute_loss(self, scores, target):
method gradient_checkpointing_enable (line 169) | def gradient_checkpointing_enable(self, **kwargs):
method enable_input_require_grads (line 175) | def enable_input_require_grads(self, **kwargs):
method save (line 181) | def save(self, output_dir: str):
FILE: FlagEmbedding/finetune/embedder/decoder_only/base/runner.py
class DecoderOnlyEmbedderRunner (line 17) | class DecoderOnlyEmbedderRunner(AbsEmbedderRunner):
method __init__ (line 25) | def __init__(
method load_tokenizer_and_model (line 36) | def load_tokenizer_and_model(self) -> Tuple[PreTrainedTokenizer, AbsEm...
method load_trainer (line 102) | def load_trainer(self) -> DecoderOnlyEmbedderTrainer:
method run (line 119) | def run(self):
FILE: FlagEmbedding/finetune/embedder/decoder_only/base/trainer.py
class DecoderOnlyEmbedderTrainer (line 11) | class DecoderOnlyEmbedderTrainer(AbsEmbedderTrainer):
method _save (line 15) | def _save(self, output_dir: Optional[str] = None, state_dict=None):
FILE: FlagEmbedding/finetune/embedder/decoder_only/icl/__main__.py
function main (line 11) | def main():
FILE: FlagEmbedding/finetune/embedder/decoder_only/icl/arguments.py
function default_target_modules (line 10) | def default_target_modules() -> List[int]:
class DecoderOnlyEmbedderICLModelArguments (line 15) | class DecoderOnlyEmbedderICLModelArguments(AbsEmbedderModelArguments):
class DecoderOnlyEmbedderICLDataArguments (line 83) | class DecoderOnlyEmbedderICLDataArguments(AbsEmbedderDataArguments):
FILE: FlagEmbedding/finetune/embedder/decoder_only/icl/dataset.py
class DecoderOnlyEmbedderICLSameDatasetTrainDataset (line 17) | class DecoderOnlyEmbedderICLSameDatasetTrainDataset(AbsEmbedderSameDatas...
method __init__ (line 28) | def __init__(
method _create_batch_data (line 51) | def _create_batch_data(self, batch_raw_data):
class AbsEmbedderSameDatasetCollator (line 201) | class AbsEmbedderSameDatasetCollator(DataCollatorWithPadding):
method __call__ (line 214) | def __call__(self, features):
FILE: FlagEmbedding/finetune/embedder/decoder_only/icl/load_model.py
function find_largest_checkpoint (line 13) | def find_largest_checkpoint(checkpoint_dir):
function get_model (line 38) | def get_model(model_args: DecoderOnlyEmbedderICLModelArguments, output_d...
function save_merged_model (line 123) | def save_merged_model(model_args: DecoderOnlyEmbedderICLModelArguments, ...
FILE: FlagEmbedding/finetune/embedder/decoder_only/icl/modeling.py
class BiDecoderOnlyEmbedderICLModel (line 11) | class BiDecoderOnlyEmbedderICLModel(AbsEmbedderModel):
method __init__ (line 27) | def __init__(
method encode (line 50) | def encode(self, features):
method _sentence_embedding (line 94) | def _sentence_embedding(self, last_hidden_state, attention_mask):
method compute_score (line 129) | def compute_score(self, q_reps, p_reps):
method _compute_similarity (line 143) | def _compute_similarity(self, q_reps, p_reps):
method compute_loss (line 157) | def compute_loss(self, scores, target):
method gradient_checkpointing_enable (line 169) | def gradient_checkpointing_enable(self, **kwargs):
method enable_input_require_grads (line 175) | def enable_input_require_grads(self, **kwargs):
method save (line 181) | def save(self, output_dir: str):
FILE: FlagEmbedding/finetune/embedder/decoder_only/icl/runner.py
class DecoderOnlyEmbedderICLRunner (line 18) | class DecoderOnlyEmbedderICLRunner(AbsEmbedderRunner):
method __init__ (line 26) | def __init__(
method load_tokenizer_and_model (line 37) | def load_tokenizer_and_model(self) -> Tuple[PreTrainedTokenizer, AbsEm...
method load_trainer (line 103) | def load_trainer(self) -> DecoderOnlyEmbedderICLTrainer:
method load_train_dataset (line 120) | def load_train_dataset(self) -> DecoderOnlyEmbedderICLSameDatasetTrain...
method run (line 144) | def run(self):
FILE: FlagEmbedding/finetune/embedder/decoder_only/icl/trainer.py
class DecoderOnlyEmbedderICLTrainer (line 11) | class DecoderOnlyEmbedderICLTrainer(AbsEmbedderTrainer):
method _save (line 15) | def _save(self, output_dir: Optional[str] = None, state_dict=None):
FILE: FlagEmbedding/finetune/embedder/encoder_only/base/__main__.py
function main (line 11) | def main():
FILE: FlagEmbedding/finetune/embedder/encoder_only/base/modeling.py
class BiEncoderOnlyEmbedderModel (line 11) | class BiEncoderOnlyEmbedderModel(AbsEmbedderModel):
method __init__ (line 27) | def __init__(
method encode (line 50) | def encode(self, features):
method _sentence_embedding (line 93) | def _sentence_embedding(self, last_hidden_state, attention_mask):
method compute_score (line 128) | def compute_score(self, q_reps, p_reps):
method _compute_similarity (line 142) | def _compute_similarity(self, q_reps, p_reps):
method compute_loss (line 156) | def compute_loss(self, scores, target):
method gradient_checkpointing_enable (line 168) | def gradient_checkpointing_enable(self, **kwargs):
method enable_input_require_grads (line 174) | def enable_input_require_grads(self, **kwargs):
method save (line 180) | def save(self, output_dir: str):
FILE: FlagEmbedding/finetune/embedder/encoder_only/base/runner.py
class EncoderOnlyEmbedderRunner (line 15) | class EncoderOnlyEmbedderRunner(AbsEmbedderRunner):
method load_tokenizer_and_model (line 19) | def load_tokenizer_and_model(self) -> Tuple[PreTrainedTokenizer, AbsEm...
method load_trainer (line 70) | def load_trainer(self) -> EncoderOnlyEmbedderTrainer:
FILE: FlagEmbedding/finetune/embedder/encoder_only/base/trainer.py
class EncoderOnlyEmbedderTrainer (line 11) | class EncoderOnlyEmbedderTrainer(AbsEmbedderTrainer):
method _save (line 15) | def _save(self, output_dir: Optional[str] = None, state_dict=None):
FILE: FlagEmbedding/finetune/embedder/encoder_only/m3/__main__.py
function main (line 11) | def main():
FILE: FlagEmbedding/finetune/embedder/encoder_only/m3/arguments.py
class EncoderOnlyEmbedderM3ModelArguments (line 10) | class EncoderOnlyEmbedderM3ModelArguments(AbsEmbedderModelArguments):
class EncoderOnlyEmbedderM3TrainingArguments (line 18) | class EncoderOnlyEmbedderM3TrainingArguments(AbsEmbedderTrainingArguments):
FILE: FlagEmbedding/finetune/embedder/encoder_only/m3/modeling.py
class EncoderOnlyEmbedderM3Model (line 15) | class EncoderOnlyEmbedderM3Model(AbsEmbedderModel):
method __init__ (line 32) | def __init__(
method _dense_embedding (line 75) | def _dense_embedding(self, last_hidden_state, attention_mask):
method _sparse_embedding (line 110) | def _sparse_embedding(self, hidden_state, input_ids, return_embedding:...
method _colbert_embedding (line 153) | def _colbert_embedding(self, last_hidden_state, mask):
method compute_score (line 167) | def compute_score(
method compute_dense_score (line 189) | def compute_dense_score(self, q_reps, p_reps):
method compute_sparse_score (line 203) | def compute_sparse_score(self, q_reps, p_reps):
method compute_colbert_score (line 217) | def compute_colbert_score(self, q_reps, p_reps, q_mask: torch.Tensor=N...
method ensemble_score (line 233) | def ensemble_score(self, q_reps, p_reps, dense_scores=None, sparse_sco...
method _encode (line 253) | def _encode(self, features):
method encode (line 276) | def encode(self, features):
method _compute_similarity (line 328) | def _compute_similarity(self, q_reps, p_reps):
method _get_queries_attention_mask (line 342) | def _get_queries_attention_mask(self, queries: Union[Dict[str, Tensor]...
method forward (line 368) | def forward(
method compute_loss (line 472) | def compute_loss(self, scores, target):
method gradient_checkpointing_enable (line 484) | def gradient_checkpointing_enable(self, **kwargs):
method enable_input_require_grads (line 490) | def enable_input_require_grads(self, **kwargs):
method save (line 496) | def save(self, output_dir: str):
class EncoderOnlyEmbedderM3ModelForInference (line 518) | class EncoderOnlyEmbedderM3ModelForInference(EncoderOnlyEmbedderM3Model):
method forward (line 522) | def forward(self,
FILE: FlagEmbedding/finetune/embedder/encoder_only/m3/runner.py
class EncoderOnlyEmbedderM3Runner (line 22) | class EncoderOnlyEmbedderM3Runner(AbsEmbedderRunner):
method __init__ (line 31) | def __init__(
method get_model (line 43) | def get_model(
method load_tokenizer_and_model (line 100) | def load_tokenizer_and_model(self) -> Tuple[PreTrainedTokenizer, AbsEm...
method load_trainer (line 156) | def load_trainer(self) -> EncoderOnlyEmbedderM3Trainer:
FILE: FlagEmbedding/finetune/embedder/encoder_only/m3/trainer.py
class EncoderOnlyEmbedderM3Trainer (line 11) | class EncoderOnlyEmbedderM3Trainer(AbsEmbedderTrainer):
method _save (line 15) | def _save(self, output_dir: Optional[str] = None, state_dict=None):
FILE: FlagEmbedding/finetune/reranker/decoder_only/base/__main__.py
function main (line 14) | def main():
FILE: FlagEmbedding/finetune/reranker/decoder_only/base/arguments.py
function default_target_modules (line 7) | def default_target_modules() -> List[int]:
class RerankerModelArguments (line 12) | class RerankerModelArguments(AbsRerankerModelArguments):
FILE: FlagEmbedding/finetune/reranker/decoder_only/base/load_model.py
function find_largest_checkpoint (line 12) | def find_largest_checkpoint(checkpoint_dir):
function get_model (line 37) | def get_model(model_args: RerankerModelArguments):
function save_merged_model (line 106) | def save_merged_model(model_args: RerankerModelArguments, output_dir: str):
FILE: FlagEmbedding/finetune/reranker/decoder_only/base/modeling.py
class CrossDecoderModel (line 10) | class CrossDecoderModel(AbsRerankerModel):
method __init__ (line 19) | def __init__(
method encode (line 31) | def encode(self, features):
FILE: FlagEmbedding/finetune/reranker/decoder_only/base/runner.py
class DecoderOnlyRerankerRunner (line 19) | class DecoderOnlyRerankerRunner(AbsRerankerRunner):
method __init__ (line 28) | def __init__(
method load_tokenizer_and_model (line 36) | def load_tokenizer_and_model(self) -> Tuple[PreTrainedTokenizer, AbsRe...
method load_trainer (line 81) | def load_trainer(self) -> DecoderOnlyRerankerTrainer:
method run (line 96) | def run(self):
FILE: FlagEmbedding/finetune/reranker/decoder_only/base/trainer.py
class DecoderOnlyRerankerTrainer (line 13) | class DecoderOnlyRerankerTrainer(AbsRerankerTrainer):
method _save (line 17) | def _save(self, output_dir: Optional[str] = None, state_dict=None):
FILE: FlagEmbedding/finetune/reranker/decoder_only/layerwise/__main__.py
function main (line 14) | def main():
FILE: FlagEmbedding/finetune/reranker/decoder_only/layerwise/arguments.py
function default_target_modules (line 7) | def default_target_modules() -> List[int]:
class RerankerModelArguments (line 12) | class RerankerModelArguments(AbsRerankerModelArguments):
FILE: FlagEmbedding/finetune/reranker/decoder_only/layerwise/configuration_minicpm_reranker.py
class LayerWiseMiniCPMConfig (line 30) | class LayerWiseMiniCPMConfig(PretrainedConfig):
method __init__ (line 116) | def __init__(
method _rope_scaling_validation (line 189) | def _rope_scaling_validation(self):
FILE: FlagEmbedding/finetune/reranker/decoder_only/layerwise/load_model.py
function find_largest_checkpoint (line 16) | def find_largest_checkpoint(checkpoint_dir):
function get_model (line 41) | def get_model(model_args: RerankerModelArguments, only_for_one_logit):
function save_merged_model (line 170) | def save_merged_model(model_args: RerankerModelArguments, output_dir: str):
FILE: FlagEmbedding/finetune/reranker/decoder_only/layerwise/modeling.py
class CrossDecoderModel (line 12) | class CrossDecoderModel(AbsRerankerModel):
method __init__ (line 22) | def __init__(
method encode (line 37) | def encode(self, features):
method forward (line 50) | def forward(self, pair: Union[Dict[str, Tensor], List[Dict[str, Tensor...
FILE: FlagEmbedding/finetune/reranker/decoder_only/layerwise/modeling_minicpm_reranker.py
function _get_unpad_data (line 77) | def _get_unpad_data(attention_mask):
function _expand_mask (line 89) | def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Option...
function _make_causal_mask (line 96) | def _make_causal_mask(
function rms_layernorm (line 108) | def rms_layernorm(hidden: torch.Tensor, weight: torch.Tensor, eps: float):
class MiniCPMRMSNorm (line 115) | class MiniCPMRMSNorm(nn.Module):
method __init__ (line 116) | def __init__(self, hidden_size, eps=1e-6):
method forward (line 124) | def forward(self, hidden_states):
class MiniCPMRotaryEmbedding (line 131) | class MiniCPMRotaryEmbedding(nn.Module):
method __init__ (line 132) | def __init__(self, dim, max_position_embeddings=2048, base=10000, devi...
method _set_cos_sin_cache (line 147) | def _set_cos_sin_cache(self, seq_len, device, dtype):
method forward (line 157) | def forward(self, x, seq_len=None):
class MiniCPMLinearScalingRotaryEmbedding (line 168) | class MiniCPMLinearScalingRotaryEmbedding(MiniCPMRotaryEmbedding):
method __init__ (line 171) | def __init__(self, dim, max_position_embeddings=2048, base=10000, devi...
method _set_cos_sin_cache (line 175) | def _set_cos_sin_cache(self, seq_len, device, dtype):
class MiniCPMDynamicNTKScalingRotaryEmbedding (line 187) | class MiniCPMDynamicNTKScalingRotaryEmbedding(MiniCPMRotaryEmbedding):
method __init__ (line 190) | def __init__(self, dim, max_position_embeddings=2048, base=10000, devi...
method _set_cos_sin_cache (line 194) | def _set_cos_sin_cache(self, seq_len, device, dtype):
function rotate_half (line 214) | def rotate_half(x):
function apply_rotary_pos_emb (line 221) | def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
class MiniCPMMLP (line 256) | class MiniCPMMLP(nn.Module):
method __init__ (line 257) | def __init__(self, config):
method forward (line 267) | def forward(self, x):
function repeat_kv (line 290) | def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
class MiniCPMAttention (line 302) | class MiniCPMAttention(nn.Module):
method __init__ (line 305) | def __init__(self, config: LayerWiseMiniCPMConfig, layer_idx: Optional...
method _init_rope (line 338) | def _init_rope(self):
method _shape (line 365) | def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
method forward (line 368) | def forward(
class MiniCPMFlashAttention2 (line 473) | class MiniCPMFlashAttention2(MiniCPMAttention):
method __init__ (line 480) | def __init__(self, *args, **kwargs):
method forward (line 488) | def forward(
method _flash_attention_forward (line 576) | def _flash_attention_forward(
method _upad_input (line 633) | def _upad_input(self, query_layer, key_layer, value_layer, attention_m...
class MiniCPMSdpaAttention (line 672) | class MiniCPMSdpaAttention(MiniCPMAttention):
method forward (line 680) | def forward(
class MiniCPMDecoderLayer (line 766) | class MiniCPMDecoderLayer(nn.Module):
method __init__ (line 767) | def __init__(self, config: LayerWiseMiniCPMConfig, layer_idx: int):
method forward (line 779) | def forward(
class MiniCPMPreTrainedModel (line 862) | class MiniCPMPreTrainedModel(PreTrainedModel):
method _init_weights (line 872) | def _init_weights(self, module):
class LayerWiseMiniCPMModel (line 958) | class LayerWiseMiniCPMModel(MiniCPMPreTrainedModel):
method __init__ (line 966) | def __init__(self, config: LayerWiseMiniCPMConfig):
method get_input_embeddings (line 984) | def get_input_embeddings(self):
method set_input_embeddings (line 987) | def set_input_embeddings(self, value):
method forward (line 991) | def forward(
class LayerWiseHead (line 1135) | class LayerWiseHead(nn.Module):
method __init__ (line 1138) | def __init__(self, input_size, output_size):
method forward (line 1142) | def forward(self, **kwargs):
class LayerWiseMiniCPMForCausalLM (line 1145) | class LayerWiseMiniCPMForCausalLM(MiniCPMPreTrainedModel):
method __init__ (line 1148) | def __init__(self, config):
method get_input_embeddings (line 1191) | def get_input_embeddings(self):
method set_input_embeddings (line 1194) | def set_input_embeddings(self, value):
method get_output_embeddings (line 1197) | def get_output_embeddings(self):
method set_output_embeddings (line 1200) | def set_output_embeddings(self, new_embeddings):
method set_decoder (line 1203) | def set_decoder(self, decoder):
method get_decoder (line 1206) | def get_decoder(self):
method forward (line 1211) | def forward(
method prepare_inputs_for_generation (line 1408) | def prepare_inputs_for_generation(
method _reorder_cache (line 1465) | def _reorder_cache(past_key_values, beam_idx):
method chat (line 1474) | def chat(self, tokenizer, query: str, history: List[Dict] = None, role...
FILE: FlagEmbedding/finetune/reranker/decoder_only/layerwise/runner.py
class DecoderOnlyRerankerRunner (line 18) | class DecoderOnlyRerankerRunner(AbsRerankerRunner):
method __init__ (line 27) | def __init__(
method load_tokenizer_and_model (line 35) | def load_tokenizer_and_model(self) -> Tuple[PreTrainedTokenizer, AbsRe...
method load_trainer (line 82) | def load_trainer(self) -> DecoderOnlyRerankerTrainer:
method run (line 97) | def run(self):
FILE: FlagEmbedding/finetune/reranker/decoder_only/layerwise/trainer.py
class DecoderOnlyRerankerTrainer (line 13) | class DecoderOnlyRerankerTrainer(AbsRerankerTrainer):
method _save (line 17) | def _save(self, output_dir: Optional[str] = None, state_dict=None):
FILE: FlagEmbedding/finetune/reranker/encoder_only/base/__main__.py
function main (line 11) | def main():
FILE: FlagEmbedding/finetune/reranker/encoder_only/base/modeling.py
class CrossEncoderModel (line 9) | class CrossEncoderModel(AbsRerankerModel):
method __init__ (line 17) | def __init__(
method encode (line 29) | def encode(self, features):
FILE: FlagEmbedding/finetune/reranker/encoder_only/base/runner.py
class EncoderOnlyRerankerRunner (line 15) | class EncoderOnlyRerankerRunner(AbsRerankerRunner):
method load_tokenizer_and_model (line 19) | def load_tokenizer_and_model(self) -> Tuple[PreTrainedTokenizer, AbsRe...
method load_trainer (line 63) | def load_trainer(self) -> EncoderOnlyRerankerTrainer:
FILE: FlagEmbedding/finetune/reranker/encoder_only/base/trainer.py
class EncoderOnlyRerankerTrainer (line 11) | class EncoderOnlyRerankerTrainer(AbsRerankerTrainer):
method _save (line 15) | def _save(self, output_dir: Optional[str] = None, state_dict=None):
FILE: FlagEmbedding/inference/auto_embedder.py
class FlagAutoModel (line 13) | class FlagAutoModel:
method __init__ (line 17) | def __init__(self):
method from_finetuned (line 23) | def from_finetuned(
FILE: FlagEmbedding/inference/auto_reranker.py
class FlagAutoReranker (line 14) | class FlagAutoReranker:
method __init__ (line 18) | def __init__(self):
method from_finetuned (line 24) | def from_finetuned(
FILE: FlagEmbedding/inference/embedder/decoder_only/base.py
function last_token_pool (line 12) | def last_token_pool(last_hidden_states: torch.Tensor,
class BaseLLMEmbedder (line 32) | class BaseLLMEmbedder(AbsEmbedder):
method __init__ (line 58) | def __init__(
method encode_queries (line 104) | def encode_queries(
method encode_corpus (line 132) | def encode_corpus(
method encode (line 160) | def encode(
method encode_single_device (line 189) | def encode_single_device(
FILE: FlagEmbedding/inference/embedder/decoder_only/icl.py
function last_token_pool (line 16) | def last_token_pool(last_hidden_states: torch.Tensor,
class ICLLLMEmbedder (line 36) | class ICLLLMEmbedder(AbsEmbedder):
method __init__ (line 66) | def __init__(
method __del__ (line 124) | def __del__(self):
method set_examples (line 128) | def set_examples(self, examples_for_task: Optional[List[dict]] = None):
method get_detailed_example (line 163) | def get_detailed_example(instruction_format: str, instruction: str, qu...
method stop_self_query_pool (line 179) | def stop_self_query_pool(self):
method encode_queries (line 190) | def encode_queries(
method encode_corpus (line 237) | def encode_corpus(
method encode (line 266) | def encode(
method _encode_queries_multi_process_worker (line 296) | def _encode_queries_multi_process_worker(
method encode_queries_single_device (line 318) | def encode_queries_single_device(
method encode_single_device (line 454) | def encode_single_device(
FILE: FlagEmbedding/inference/embedder/encoder_only/base.py
class BaseEmbedder (line 11) | class BaseEmbedder(AbsEmbedder):
method __init__ (line 40) | def __init__(
method encode_queries (line 85) | def encode_queries(
method encode_corpus (line 113) | def encode_corpus(
method encode (line 141) | def encode(
method encode_single_device (line 170) | def encode_single_device(
method pooling (line 278) | def pooling(
FILE: FlagEmbedding/inference/embedder/encoder_only/m3.py
class M3Embedder (line 20) | class M3Embedder(AbsEmbedder):
method __init__ (line 50) | def __init__(
method convert_id_to_token (line 106) | def convert_id_to_token(self, lexical_weights: List[Dict]):
method compute_lexical_matching_score (line 129) | def compute_lexical_matching_score(
method colbert_score (line 163) | def colbert_score(self, q_reps, p_reps):
method encode_queries (line 179) | def encode_queries(
method encode_corpus (line 221) | def encode_corpus(
method encode (line 263) | def encode(
method encode_single_device (line 306) | def encode_single_device(
method compute_score (line 482) | def compute_score(
method compute_score_multi_process (line 535) | def compute_score_multi_process(
method _compute_score_multi_process_worker (line 571) | def _compute_score_multi_process_worker(
method compute_score_single_device (line 593) | def compute_score_single_device(
method _concatenate_results_from_multi_process (line 719) | def _concatenate_results_from_multi_process(
method _concatenate_compute_score_results_from_multi_process (line 753) | def _concatenate_compute_score_results_from_multi_process(
FILE: FlagEmbedding/inference/embedder/model_mapping.py
class EmbedderModelClass (line 10) | class EmbedderModelClass(Enum):
class PoolingMethod (line 25) | class PoolingMethod(Enum):
class EmbedderConfig (line 32) | class EmbedderConfig:
function support_native_bge_model_list (line 268) | def support_native_bge_model_list()->List[str]:
function support_model_list (line 271) | def support_model_list()->List[str]:
FILE: FlagEmbedding/inference/reranker/decoder_only/base.py
function last_logit_pool (line 15) | def last_logit_pool(logits: Tensor,
class DatasetForReranker (line 35) | class DatasetForReranker(Dataset):
method __init__ (line 47) | def __init__(
method __len__ (line 85) | def __len__(self):
method __getitem__ (line 88) | def __getitem__(self, item):
class Collater (line 122) | class Collater:
method __init__ (line 130) | def __init__(self, tokenizer, max_len):
method __call__ (line 138) | def __call__(self, data):
class BaseLLMReranker (line 171) | class BaseLLMReranker(AbsReranker):
method __init__ (line 199) | def __init__(
method compute_score_single_gpu (line 257) | def compute_score_single_gpu(
FILE: FlagEmbedding/inference/reranker/decoder_only/layerwise.py
function last_logit_pool_layerwise (line 18) | def last_logit_pool_layerwise(logits: Tensor,
class LayerWiseLLMReranker (line 38) | class LayerWiseLLMReranker(AbsReranker):
method __init__ (line 67) | def __init__(
method compute_score_single_gpu (line 136) | def compute_score_single_gpu(
FILE: FlagEmbedding/inference/reranker/decoder_only/lightweight.py
function last_logit_pool_lightweight (line 15) | def last_logit_pool_lightweight(logits: Tensor,
class Collater_for_lightweight (line 35) | class Collater_for_lightweight:
method __init__ (line 43) | def __init__(self, tokenizer, max_len):
method __call__ (line 51) | def __call__(self, data):
class LightweightLLMReranker (line 90) | class LightweightLLMReranker(AbsReranker):
method __init__ (line 122) | def __init__(
method compute_score_single_gpu (line 206) | def compute_score_single_gpu(
FILE: FlagEmbedding/inference/reranker/decoder_only/models/configuration_minicpm_reranker.py
class LayerWiseMiniCPMConfig (line 30) | class LayerWiseMiniCPMConfig(PretrainedConfig):
method __init__ (line 116) | def __init__(
method _rope_scaling_validation (line 189) | def _rope_scaling_validation(self):
FILE: FlagEmbedding/inference/reranker/decoder_only/models/gemma_config.py
class CostWiseGemmaConfig (line 26) | class CostWiseGemmaConfig(Gemma2Config):
method __init__ (line 54) | def __init__(
FILE: FlagEmbedding/inference/reranker/decoder_only/models/gemma_model.py
function _get_unpad_data (line 69) | def _get_unpad_data(attention_mask):
class CostWiseGemma2PreTrainedModel (line 84) | class CostWiseGemma2PreTrainedModel(PreTrainedModel):
method _init_weights (line 97) | def _init_weights(self, module):
class CostWiseModelOutputWithPast (line 112) | class CostWiseModelOutputWithPast(ModelOutput):
class CostWiseCausalLMOutputWithPast (line 120) | class CostWiseCausalLMOutputWithPast(ModelOutput):
function token_compress (line 128) | def token_compress(compress_ratio,
class CostWiseGemmaModel (line 237) | class CostWiseGemmaModel(CostWiseGemma2PreTrainedModel):
method __init__ (line 245) | def __init__(self, config: CostWiseGemmaConfig):
method get_input_embeddings (line 260) | def get_input_embeddings(self):
method set_input_embeddings (line 263) | def set_input_embeddings(self, value):
method forward (line 267) | def forward(
method _update_causal_mask (line 434) | def _update_causal_mask(
class CostWiseHead (line 479) | class CostWiseHead(nn.Module):
method __init__ (line 482) | def __init__(self, input_size, output_size):
method forward (line 486) | def forward(self, **kwargs):
class CostWiseGemmaForCausalLM (line 490) | class CostWiseGemmaForCausalLM(CostWiseGemma2PreTrainedModel):
method __init__ (line 493) | def __init__(self, config: CostWiseGemmaConfig):
method get_input_embeddings (line 510) | def get_input_embeddings(self):
method set_input_embeddings (line 513) | def set_input_embeddings(self, value):
method get_output_embeddings (line 516) | def get_output_embeddings(self):
method set_output_embeddings (line 519) | def set_output_embeddings(self, new_embeddings):
method set_decoder (line 522) | def set_decoder(self, decoder):
method get_decoder (line 525) | def get_decoder(self):
method forward (line 530) | def forward(
method prepare_inputs_for_generation (line 664) | def prepare_inputs_for_generation(
method _reorder_cache (line 739) | def _reorder_cache(past_key_values, beam_idx):
FILE: FlagEmbedding/inference/reranker/decoder_only/models/modeling_minicpm_reranker.py
function _get_unpad_data (line 80) | def _get_unpad_data(attention_mask):
function _expand_mask (line 92) | def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Option...
function _make_causal_mask (line 99) | def _make_causal_mask(
function rms_layernorm (line 111) | def rms_layernorm(hidden: torch.Tensor, weight: torch.Tensor, eps: float):
class MiniCPMRMSNorm (line 118) | class MiniCPMRMSNorm(nn.Module):
method __init__ (line 119) | def __init__(self, hidden_size, eps=1e-6):
method forward (line 127) | def forward(self, hidden_states):
class MiniCPMRotaryEmbedding (line 134) | class MiniCPMRotaryEmbedding(nn.Module):
method __init__ (line 135) | def __init__(self, dim, max_position_embeddings=2048, base=10000, devi...
method _set_cos_sin_cache (line 150) | def _set_cos_sin_cache(self, seq_len, device, dtype):
method forward (line 160) | def forward(self, x, seq_len=None):
class MiniCPMLinearScalingRotaryEmbedding (line 171) | class MiniCPMLinearScalingRotaryEmbedding(MiniCPMRotaryEmbedding):
method __init__ (line 174) | def __init__(self, dim, max_position_embeddings=2048, base=10000, devi...
method _set_cos_sin_cache (line 178) | def _set_cos_sin_cache(self, seq_len, device, dtype):
class MiniCPMDynamicNTKScalingRotaryEmbedding (line 190) | class MiniCPMDynamicNTKScalingRotaryEmbedding(MiniCPMRotaryEmbedding):
method __init__ (line 193) | def __init__(self, dim, max_position_embeddings=2048, base=10000, devi...
method _set_cos_sin_cache (line 197) | def _set_cos_sin_cache(self, seq_len, device, dtype):
function rotate_half (line 217) | def rotate_half(x):
function apply_rotary_pos_emb (line 224) | def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
class MiniCPMMLP (line 259) | class MiniCPMMLP(nn.Module):
method __init__ (line 260) | def __init__(self, config):
method forward (line 270) | def forward(self, x):
function repeat_kv (line 293) | def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
class MiniCPMAttention (line 305) | class MiniCPMAttention(nn.Module):
method __init__ (line 308) | def __init__(self, config: LayerWiseMiniCPMConfig, layer_idx: Optional...
method _init_rope (line 341) | def _init_rope(self):
method _shape (line 368) | def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
method forward (line 371) | def forward(
class MiniCPMFlashAttention2 (line 476) | class MiniCPMFlashAttention2(MiniCPMAttention):
method __init__ (line 483) | def __init__(self, *args, **kwargs):
method forward (line 491) | def forward(
method _flash_attention_forward (line 579) | def _flash_attention_forward(
method _upad_input (line 636) | def _upad_input(self, query_layer, key_layer, value_layer, attention_m...
class MiniCPMSdpaAttention (line 675) | class MiniCPMSdpaAttention(MiniCPMAttention):
method forward (line 683) | def forward(
class MiniCPMDecoderLayer (line 769) | class MiniCPMDecoderLayer(nn.Module):
method __init__ (line 770) | def __init__(self, config: LayerWiseMiniCPMConfig, layer_idx: int):
method forward (line 782) | def forward(
class MiniCPMPreTrainedModel (line 865) | class MiniCPMPreTrainedModel(PreTrainedModel):
method _init_weights (line 875) | def _init_weights(self, module):
class LayerWiseMiniCPMModel (line 961) | class LayerWiseMiniCPMModel(MiniCPMPreTrainedModel):
method __init__ (line 969) | def __init__(self, config: LayerWiseMiniCPMConfig):
method get_input_embeddings (line 987) | def get_input_embeddings(self):
method set_input_embeddings (line 990) | def set_input_embeddings(self, value):
method forward (line 994) | def forward(
class LayerWiseHead (line 1138) | class LayerWiseHead(nn.Module):
method __init__ (line 1141) | def __init__(self, input_size, output_size):
method forward (line 1145) | def forward(self, **kwargs):
class LayerWiseMiniCPMForCausalLM (line 1148) | class LayerWiseMiniCPMForCausalLM(MiniCPMPreTrainedModel):
method __init__ (line 1151) | def __init__(self, config):
method get_input_embeddings (line 1194) | def get_input_embeddings(self):
method set_input_embeddings (line 1197) | def set_input_embeddings(self, value):
method get_output_embeddings (line 1200) | def get_output_embeddings(self):
method set_output_embeddings (line 1203) | def set_output_embeddings(self, new_embeddings):
method set_decoder (line 1206) | def set_decoder(self, decoder):
method get_decoder (line 1209) | def get_decoder(self):
method forward (line 1214) | def forward(
method prepare_inputs_for_generation (line 1411) | def prepare_inputs_for_generation(
method _reorder_cache (line 1468) | def _reorder_cache(past_key_values, beam_idx):
method chat (line 1477) | def chat(self, tokenizer, query: str, history: List[Dict] = None, role...
FILE: FlagEmbedding/inference/reranker/encoder_only/base.py
function sigmoid (line 10) | def sigmoid(x):
class BaseReranker (line 14) | class BaseReranker(AbsReranker):
method __init__ (line 34) | def __init__(
method compute_score_single_gpu (line 78) | def compute_score_single_gpu(
FILE: FlagEmbedding/inference/reranker/model_mapping.py
class RerankerModelClass (line 10) | class RerankerModelClass(Enum):
class RerankerConfig (line 26) | class RerankerConfig:
FILE: FlagEmbedding/utils/transformers_compat.py
function is_torch_fx_available (line 10) | def is_torch_fx_available():
function import_from_candidates (line 22) | def import_from_candidates(candidates, default=None):
FILE: Tutorials/4_Evaluation/utils/compute_metrics.py
class SimpleTokenizer (line 10) | class SimpleTokenizer:
method __init__ (line 14) | def __init__(self):
method tokenize (line 24) | def tokenize(self, text, uncased=False):
function _normalize (line 33) | def _normalize(text):
function has_answer (line 37) | def has_answer(answers, text, tokenizer) -> bool:
function check_answer (line 51) | def check_answer(example, tokenizer) -> List[bool]:
function evaluate_qa_recall (line 65) | def evaluate_qa_recall(ctxs, answers, k_values: Union[int, List[int]]=100):
FILE: Tutorials/4_Evaluation/utils/normalize_text.py
function normalize_text (line 133) | def normalize_text(text: str):
FILE: examples/inference/embedder/decoder_only/auto_base_multi_devices.py
function test_base_multi_devices (line 5) | def test_base_multi_devices():
FILE: examples/inference/embedder/decoder_only/auto_base_single_device.py
function test_base_single_device (line 5) | def test_base_single_device():
FILE: examples/inference/embedder/decoder_only/auto_icl_multi_devices.py
function test_icl_multi_devices (line 5) | def test_icl_multi_devices():
FILE: examples/inference/embedder/decoder_only/auto_icl_single_device.py
function test_icl_single_device (line 5) | def test_icl_single_device():
FILE: examples/inference/embedder/decoder_only/base_multi_devices.py
function test_base_multi_devices (line 5) | def test_base_multi_devices():
FILE: examples/inference/embedder/decoder_only/base_single_device.py
function test_base_single_device (line 5) | def test_base_single_device():
FILE: examples/inference/embedder/decoder_only/icl_multi_devices.py
function test_icl_multi_devices (line 5) | def test_icl_multi_devices():
FILE: examples/inference/embedder/decoder_only/icl_single_device.py
function test_icl_single_device (line 5) | def test_icl_single_device():
FILE: examples/inference/embedder/encoder_only/auto_base_multi_devices.py
function test_base_multi_devices (line 5) | def test_base_multi_devices():
FILE: examples/inference/embedder/encoder_only/auto_base_single_device.py
function test_base_single_device (line 5) | def test_base_single_device():
FILE: examples/inference/embedder/encoder_only/auto_m3_multi_devices.py
function test_m3_multi_devices (line 5) | def test_m3_multi_devices():
FILE: examples/inference/embedder/encoder_only/auto_m3_single_device.py
function test_m3_single_device (line 5) | def test_m3_single_device():
FILE: examples/inference/embedder/encoder_only/base_multi_devices.py
function test_base_multi_devices (line 5) | def test_base_multi_devices():
FILE: examples/inference/embedder/encoder_only/base_single_device.py
function test_base_single_device (line 5) | def test_base_single_device():
FILE: examples/inference/embedder/encoder_only/m3_multi_devices.py
function test_m3_multi_devices (line 5) | def test_m3_multi_devices():
FILE: examples/inference/embedder/encoder_only/m3_multi_devices_compute_score.py
function test_m3_multi_devices (line 5) | def test_m3_multi_devices():
FILE: examples/inference/embedder/encoder_only/m3_single_device.py
function test_m3_single_device (line 5) | def test_m3_single_device():
FILE: examples/inference/embedder/encoder_only/m3_single_device_compute_score.py
function test_m3_single_device (line 5) | def test_m3_single_device():
FILE: examples/inference/reranker/decoder_only/auto_base_multi_devices.py
function test_base_multi_devices (line 5) | def test_base_multi_devices():
FILE: examples/inference/reranker/decoder_only/auto_base_single_device.py
function test_base_multi_devices (line 5) | def test_base_multi_devices():
FILE: examples/inference/reranker/decoder_only/auto_layerwise_multi_devices.py
function test_base_multi_devices (line 5) | def test_base_multi_devices():
FILE: examples/inference/reranker/decoder_only/auto_layerwise_single_device.py
function test_base_multi_devices (line 5) | def test_base_multi_devices():
FILE: examples/inference/reranker/decoder_only/auto_lightweight_multi_devices.py
function test_base_multi_devices (line 5) | def test_base_multi_devices():
FILE: examples/inference/reranker/decoder_only/auto_lightweight_single_device.py
function test_base_multi_devices (line 5) | def test_base_multi_devices():
FILE: examples/inference/reranker/decoder_only/base_multi_devices.py
function test_base_multi_devices (line 5) | def test_base_multi_devices():
FILE: examples/inference/reranker/decoder_only/base_single_device.py
function test_base_multi_devices (line 5) | def test_base_multi_devices():
FILE: examples/inference/reranker/decoder_only/layerwise_multi_devices.py
function test_base_multi_devices (line 5) | def test_base_multi_devices():
FILE: examples/inference/reranker/decoder_only/layerwise_single_device.py
function test_base_multi_devices (line 5) | def test_base_multi_devices():
FILE: examples/inference/reranker/decoder_only/lightweight_multi_devices.py
function test_base_multi_devices (line 5) | def test_base_multi_devices():
FILE: examples/inference/reranker/decoder_only/lightweight_single_device.py
function test_base_multi_devices (line 5) | def test_base_multi_devices():
FILE: examples/inference/reranker/encoder_only/auto_base_multi_devices.py
function test_base_multi_devices (line 5) | def test_base_multi_devices():
FILE: examples/inference/reranker/encoder_only/auto_base_single_device.py
function test_base_multi_devices (line 5) | def test_base_multi_devices():
FILE: examples/inference/reranker/encoder_only/base_multi_devices.py
function test_base_multi_devices (line 5) | def test_base_multi_devices():
FILE: examples/inference/reranker/encoder_only/base_single_device.py
function test_base_multi_devices (line 5) | def test_base_multi_devices():
FILE: research/BGE_Coder/data_generation/constant.py
class TaskType (line 6) | class TaskType(Enum):
function get_task_def_by_task_type (line 65) | def get_task_def_by_task_type(task_type: Union[str, TaskType]) -> Tuple[...
class Language (line 206) | class Language(Enum):
class CodeLanguage (line 270) | class CodeLanguage(Enum):
class Task (line 337) | class Task:
function get_task (line 346) | def get_task(
function get_pos_as_input_by_task_type (line 391) | def get_pos_as_input_by_task_type(task_type: TaskType) -> bool:
function get_generation_prompt (line 423) | def get_generation_prompt(
function get_quality_control_prompt (line 759) | def get_quality_control_prompt(
class DocLength (line 1256) | class DocLength(Enum):
function get_gen_hard_neg_prompt (line 1267) | def get_gen_hard_neg_prompt(task: Task, query: str, pos: str) -> str:
FILE: research/BGE_Coder/data_generation/corpus_generator.py
class CorpusGenerator (line 11) | class CorpusGenerator:
method __init__ (line 12) | def __init__(
method _load_corpus (line 18) | def _load_corpus(self, corpus_dir: str, doc_length: List[str], externa...
method run (line 80) | def run(
FILE: research/BGE_Coder/data_generation/format_generated_examples.py
function format_generated_examples (line 7) | def format_generated_examples(
function main (line 56) | def main():
FILE: research/BGE_Coder/data_generation/llm.py
class LLM (line 11) | class LLM:
method __init__ (line 12) | def __init__(
method split_text (line 41) | def split_text(self, text: str, anchor_points: Tuple[float, float] = (...
method chat (line 47) | def chat(
FILE: research/BGE_Coder/data_generation/run_generation.py
function compute_md5 (line 18) | def compute_md5(text: str):
function get_args (line 22) | def get_args():
function gen_triplets (line 161) | def gen_triplets(
function get_save_path (line 196) | def get_save_path(
function save_triplets (line 213) | def save_triplets(
function main (line 250) | def main(args):
FILE: research/BGE_Coder/data_generation/search.py
function create_index (line 8) | def create_index(embeddings: np.ndarray, use_gpu: bool = False):
function search (line 20) | def search(
function get_top1 (line 45) | def get_top1(
FILE: research/BGE_Coder/data_generation/triplet_generator.py
function compute_md5 (line 17) | def compute_md5(text: str):
class TripletGenerator (line 21) | class TripletGenerator(LLM):
method __init__ (line 22) | def __init__(
method _gen_for_code_modification_retrieval (line 34) | def _gen_for_code_modification_retrieval(
method _gen_for_code_comparison_retrieval (line 81) | def _gen_for_code_comparison_retrieval(
method _gen_for_code_context_retrieval (line 126) | def _gen_for_code_context_retrieval(
method _arrange_query_and_pos (line 146) | def _arrange_query_and_pos(task: Task, input_text: str, response: str):
method _gen_for_normal_task (line 168) | def _gen_for_normal_task(
method _gen_for_bug_desc_retrieval (line 208) | def _gen_for_bug_desc_retrieval(
method _gen_for_two_step_not_use_last (line 253) | def _gen_for_two_step_not_use_last(
method _gen_for_two_step_use_last (line 297) | def _gen_for_two_step_use_last(
method generate_triplets (line 341) | def generate_triplets(
method gen_hard_negatives (line 470) | def gen_hard_negatives(self, result: dict, task: Task, num_negatives: ...
method run_single (line 485) | def run_single(
method run (line 552) | def run(
method run_for_gen_neg (line 605) | def run_for_gen_neg(
FILE: research/BGE_Coder/data_generation/utils.py
function clean_content (line 4) | def clean_content(content: str):
function clean_code (line 19) | def clean_code(code: str, lang: str, length_threshold: int = 30) -> str:
FILE: research/BGE_Coder/evaluation/coderag_eval/test/arguments.py
class CodeRAGEvalArgs (line 9) | class CodeRAGEvalArgs:
FILE: research/BGE_Coder/evaluation/coderag_eval/test/create/code_search_net.py
function document2code (line 8) | def document2code(data, split="train"):
function main (line 25) | def main():
FILE: research/BGE_Coder/evaluation/coderag_eval/test/create/ds1000.py
function download_source (line 16) | def download_source(source_dir):
function download_dataset (line 32) | def download_dataset(source_dir):
function get_dataset (line 47) | def get_dataset(source_dir, mode: str = "Completion", key: str = "All"):
function document2code (line 65) | def document2code(data: list):
function main (line 95) | def main():
FILE: research/BGE_Coder/evaluation/coderag_eval/test/create/general_programming.py
function save_file_jsonl (line 25) | def save_file_jsonl(data, path):
function main (line 31) | def main():
FILE: research/BGE_Coder/evaluation/coderag_eval/test/create/humaneval.py
function document2code (line 8) | def document2code(data, split="test"):
function main (line 25) | def main():
FILE: research/BGE_Coder/evaluation/coderag_eval/test/create/live_code_bench.py
function get_queries (line 9) | def get_queries(data, split="test") -> list[dict]:
function get_corpus (line 17) | def get_corpus(hf_name: str, cache_dir: str) -> list[dict]:
function main (line 26) | def main():
FILE: research/BGE_Coder/evaluation/coderag_eval/test/create/mbpp.py
function get_function_name (line 8) | def get_function_name(code: str) -> str:
function document2code (line 19) | def document2code(data, split="test"):
function main (line 36) | def main():
FILE: research/BGE_Coder/evaluation/coderag_eval/test/create/odex.py
function document2code (line 12) | def document2code(data, split="test"):
function main (line 43) | def main():
FILE: research/BGE_Coder/evaluation/coderag_eval/test/create/repoeval.py
function iterate_repository (line 38) | def iterate_repository(base_dir: str, repo: str) -> dict:
function window_overlap (line 61) | def window_overlap(span: tuple, target_span: tuple) -> bool:
class RepoWindowMaker (line 67) | class RepoWindowMaker:
method __init__ (line 68) | def __init__(self, base_dir, repo, tasks, window_size, slice_size):
method _buid_windows_for_a_file (line 77) | def _buid_windows_for_a_file(self, fpath_tuple, code):
method _merge_windows_with_same_context (line 102) | def _merge_windows_with_same_context(self, code_windows):
method build_windows (line 116) | def build_windows(self):
function download_data (line 144) | def download_data(directory: str = "repoeval"):
function repo2code (line 177) | def repo2code(
function main (line 221) | def main():
FILE: research/BGE_Coder/evaluation/coderag_eval/test/create/repoeval_repo.py
function iterate_repository (line 38) | def iterate_repository(base_dir: str, repo: str) -> dict:
function window_overlap (line 61) | def window_overlap(span: tuple, target_span: tuple) -> bool:
class RepoWindowMaker (line 67) | class RepoWindowMaker:
method __init__ (line 68) | def __init__(self, base_dir, repo, tasks, window_size, slice_size):
method _buid_windows_for_a_file (line 77) | def _buid_windows_for_a_file(self, fpath_tuple, code):
method _merge_windows_with_same_context (line 102) | def _merge_windows_with_same_context(self, code_windows):
method build_windows (line 116) | def build_windows(self):
function download_data (line 147) | def download_data(directory: str = "repoeval"):
function repo2code (line 174) | def repo2code(
function main (line 210) | def main():
FILE: research/BGE_Coder/evaluation/coderag_eval/test/create/swebench.py
class ContextManager (line 18) | class ContextManager:
method __init__ (line 19) | def __init__(self, repo_path, base_commit, verbose=False):
method __enter__ (line 25) | def __enter__(self):
method get_environment (line 40) | def get_environment(self):
method get_readme_files (line 43) | def get_readme_files(self):
method __exit__ (line 49) | def __exit__(self, exc_type, exc_val, exc_tb):
class AutoContextManager (line 53) | class AutoContextManager(ContextManager):
method __init__ (line 56) | def __init__(self, instance, root_dir=None, verbose=False, token=None):
method __exit__ (line 77) | def __exit__(self, exc_type, exc_val, exc_tb):
function ingest_files (line 83) | def ingest_files(filenames):
function get_oracle_filenames (line 91) | def get_oracle_filenames(instance):
function is_test (line 106) | def is_test(name, test_phrases=None):
function list_files (line 112) | def list_files(root_dir, include_tests=False):
function detect_encoding (line 120) | def detect_encoding(filename):
function ingest_directory_contents (line 128) | def ingest_directory_contents(root_dir, include_tests=False):
function get_file_contents (line 144) | def get_file_contents(input_instances, verbose: bool = False, tmp_dir: s...
function document2code (line 173) | def document2code(data, split: str = "test"):
function main (line 219) | def main():
FILE: research/BGE_Coder/evaluation/coderag_eval/test/create/swebench_repo.py
class ContextManager (line 18) | class ContextManager:
method __init__ (line 19) | def __init__(self, repo_path, base_commit, verbose=False):
method __enter__ (line 25) | def __enter__(self):
method get_environment (line 40) | def get_environment(self):
method get_readme_files (line 43) | def get_readme_files(self):
method __exit__ (line 49) | def __exit__(self, exc_type, exc_val, exc_tb):
class AutoContextManager (line 53) | class AutoContextManager(ContextManager):
method __init__ (line 56) | def __init__(self, instance, root_dir=None, verbose=False, token=None):
method __exit__ (line 77) | def __exit__(self, exc_type, exc_val, exc_tb):
function ingest_files (line 83) | def ingest_files(filenames):
function get_oracle_filenames (line 91) | def get_oracle_filenames(instance):
function is_test (line 106) | def is_test(name, test_phrases=None):
function list_files (line 112) | def list_files(root_dir, include_tests=False):
function detect_encoding (line 120) | def detect_encoding(filename):
function ingest_directory_contents (line 128) | def ingest_directory_contents(root_dir, include_tests=False):
function get_file_contents (line 144) | def get_file_contents(input_instances, verbose: bool = False, tmp_dir: s...
function process_single_item (line 174) | def process_single_item(item, args):
function main (line 226) | def main():
FILE: research/BGE_Coder/evaluation/coderag_eval/test/create/utils.py
function load_jsonlines (line 5) | def load_jsonlines(file):
function save_file_jsonl (line 10) | def save_file_jsonl(data, fp):
function save_tsv_dict (line 14) | def save_tsv_dict(data, fp, fields):
function cost_esitmate (line 25) | def cost_esitmate(path):
FILE: research/BGE_Coder/evaluation/coderag_eval/test/main.py
function get_model (line 23) | def get_model(model_args: CodeRAGEvalModelArgs):
function get_top_docs (line 102) | def get_top_docs(results: dict, corpus: dict, task_id: str, topk: int = ...
function main (line 111) | def main(
FILE: research/BGE_Coder/evaluation/coderag_eval/test/prompts.py
function get_task_def_by_task_name (line 4) | def get_task_def_by_task_name(task_name: str) -> str:
FILE: research/BGE_Coder/evaluation/coir_eval/arguments.py
function coir_tasks (line 9) | def coir_tasks():
class COIREvalArgs (line 35) | class COIREvalArgs:
FILE: research/BGE_Coder/evaluation/coir_eval/main.py
function get_model (line 11) | def get_model(model_args: COIREvalModelArgs):
function main (line 84) | def main(
FILE: research/BGE_Coder/evaluation/coir_eval/prompts.py
function get_task_def_by_task_name (line 4) | def get_task_def_by_task_name(task_name: str) -> str:
FILE: research/BGE_M3/arguments.py
class ModelArguments (line 9) | class ModelArguments:
class DataArguments (line 29) | class DataArguments:
method __post_init__ (line 82) | def __post_init__(self):
class RetrieverTrainingArguments (line 88) | class RetrieverTrainingArguments(TrainingArguments):
FILE: research/BGE_M3/data.py
class SameDatasetTrainDataset (line 16) | class SameDatasetTrainDataset(Dataset):
method __init__ (line 19) | def __init__(self, args: DataArguments, batch_size: int, seed: int, pr...
method print_batch_size (line 108) | def print_batch_size(self, batch_size: int, train_group_size: int):
method get_file_batch_size (line 122) | def get_file_batch_size(file: str, batch_size: int, train_group_size: ...
method refresh_epoch (line 170) | def refresh_epoch(self):
method __getitem__ (line 188) | def __getitem__(self, _):
method shuffle_text (line 198) | def shuffle_text(self, text):
method create_batch_data (line 209) | def create_batch_data(self, batch_raw_data):
method __len__ (line 248) | def __len__(self):
class EmbedCollator (line 253) | class EmbedCollator(DataCollatorWithPadding):
method __call__ (line 262) | def __call__(self, features):
FILE: research/BGE_M3/modeling.py
class EncoderOutput (line 18) | class EncoderOutput(ModelOutput):
class BGEM3Model (line 25) | class BGEM3Model(nn.Module):
method __init__ (line 27) | def __init__(self,
method load_model (line 69) | def load_model(self, model_name, colbert_dim: int = -1):
method gradient_checkpointing_enable (line 91) | def gradient_checkpointing_enable(self, **kwargs):
method dense_embedding (line 94) | def dense_embedding(self, hidden_state, mask):
method sparse_embedding (line 102) | def sparse_embedding(self, hidden_state, input_ids, return_embedding: ...
method colbert_embedding (line 132) | def colbert_embedding(self, last_hidden_state, mask):
method dense_score (line 137) | def dense_score(self, q_reps, p_reps):
method sparse_score (line 142) | def sparse_score(self, q_reps, p_reps):
method colbert_score (line 147) | def colbert_score(self, q_reps, p_reps, q_mask: torch.Tensor):
method _encode (line 154) | def _encode(self, features):
method encode (line 167) | def encode(self, features, sub_batch_size=None):
method compute_sub_batch_size (line 196) | def compute_sub_batch_size(self, features):
method compute_similarity (line 203) | def compute_similarity(self, q_reps, p_reps):
method distill_loss (line 208) | def distill_loss(self, teacher_targets, student_scores, group_size):
method forward (line 223) | def forward(self, query: Dict[str, Tensor] = None, passage: Dict[str, ...
method compute_loss (line 317) | def compute_loss(self, scores, target):
method _dist_gather_tensor (line 320) | def _dist_gather_tensor(self, t: Optional[torch.Tensor]):
method save (line 333) | def save(self, output_dir: str):
method load_pooler (line 349) | def load_pooler(self, model_dir):
class BGEM3ForInference (line 356) | class BGEM3ForInference(BGEM3Model):
method forward (line 358) | def forward(self,
FILE: research/BGE_M3/run.py
class TrainerCallbackForDataRefresh (line 28) | class TrainerCallbackForDataRefresh(TrainerCallback):
method __init__ (line 29) | def __init__(self, train_dataset):
method on_epoch_end (line 32) | def on_epoch_end(self, args: TrainingArguments, state: TrainerState, c...
function main (line 39) | def main():
FILE: research/BGE_M3/split_data_by_length.py
function get_args (line 24) | def get_args():
class SplitByLengthHandler (line 38) | class SplitByLengthHandler:
method __init__ (line 39) | def __init__(self,
method _get_length_ranges_list (line 76) | def _get_length_ranges_list(length_list: list):
method _process_dir (line 90) | def _process_dir(self, dir_path: str, output_dir: str):
method _process_file (line 104) | def _process_file(self, file_path: str, output_path: str):
method run (line 165) | def run(self, input_path: str, output_dir: str, log_name: str=None):
FILE: research/BGE_M3/trainer.py
function save_ckpt_for_sentence_transformers (line 5) | def save_ckpt_for_sentence_transformers(ckpt_dir, pooling_mode: str = 'c...
class BiTrainer (line 16) | class BiTrainer(Trainer):
method _save (line 17) | def _save(self, output_dir: Optional[str] = None, state_dict=None):
method compute_loss (line 41) | def compute_loss(self, model, inputs, return_outputs=False):
FILE: research/BGE_VL/eval/eval_Circo.py
class Args (line 18) | class Args:
function index (line 70) | def index(model: Flag_mmret, corpus: datasets.Dataset, batch_size: int =...
function search (line 134) | def search(model: Flag_mmret, queries: datasets, faiss_index: faiss.Inde...
function main (line 162) | def main():
FILE: research/BGE_VL/eval/eval_fashioniq.py
class Args (line 23) | class Args:
function index (line 71) | def index(model: Flag_mmret, corpus: datasets.Dataset, batch_size: int =...
function search (line 135) | def search(model: Flag_mmret, queries: datasets, faiss_index: faiss.Inde...
function evaluate (line 162) | def evaluate(preds, labels, cutoffs=[1,5,10,20,50,100]):
function main (line 200) | def main():
FILE: research/BGE_VL/eval/flag_dataset.py
class MMIT_Dataset (line 22) | class MMIT_Dataset(Dataset):
method __init__ (line 23) | def __init__(self, captions, image_ids, image_dir, image_processor) ->...
method __getitem__ (line 34) | def __getitem__(self, item):
method __len__ (line 46) | def __len__(self):
class MMIT_Collator (line 50) | class MMIT_Collator:
method __init__ (line 51) | def __init__(self, tokenizer, caption_max_len):
method __call__ (line 57) | def __call__(self, features):
class Image_Dataset (line 79) | class Image_Dataset(Dataset):
method __init__ (line 80) | def __init__(self, image_ids, image_dir, image_processor) -> None:
method __getitem__ (line 85) | def __getitem__(self, item):
method __len__ (line 91) | def __len__(self):
class Image_Collator (line 94) | class Image_Collator:
method __init__ (line 95) | def __init__(self, tokenizer, caption_max_len):
method __call__ (line 100) | def __call__(self, features):
FILE: research/BGE_VL/eval/flag_mmret.py
class Flag_mmret (line 11) | class Flag_mmret(nn.Module):
method __init__ (line 12) | def __init__(
method encode_queries (line 47) | def encode_queries(self, queries: Union[List[str], str],
method encode_corpus (line 72) | def encode_corpus(self,
method encode_text (line 90) | def encode_text(self, sentences: Union[List[str], str], batch_size: in...
method encode_mm_it (line 123) | def encode_mm_it(self, captions: Union[List[str], str], image_ids: Uni...
method encode_image (line 172) | def encode_image(self, image_ids: Union[List[str], str], batch_size: ...
FILE: research/BGE_VL/modeling_MMRet_CLIP.py
function contrastive_loss (line 59) | def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
function clip_loss (line 63) | def clip_loss(similarity: torch.Tensor) -> torch.Tensor:
function _get_vector_norm (line 69) | def _get_vector_norm(tensor: torch.Tensor) -> torch.Tensor:
class CLIPVisionModelOutput (line 81) | class CLIPVisionModelOutput(ModelOutput):
class CLIPTextModelOutput (line 110) | class CLIPTextModelOutput(ModelOutput):
class CLIPOutput (line 139) | class CLIPOutput(ModelOutput):
method to_tuple (line 168) | def to_tuple(self) -> Tuple[Any]:
class CLIPVisionEmbeddings (line 175) | class CLIPVisionEmbeddings(nn.Module):
method __init__ (line 176) | def __init__(self, config: CLIPVisionConfig):
method forward (line 198) | def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
class CLIPTextEmbeddings (line 210) | class CLIPTextEmbeddings(nn.Module):
method __init__ (line 211) | def __init__(self, config: CLIPTextConfig):
method forward (line 223) | def forward(
class CLIPAttention (line 243) | class CLIPAttention(nn.Module):
method __init__ (line 246) | def __init__(self, config):
method _shape (line 265) | def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
method forward (line 268) | def forward(
class CLIPFlashAttention2 (line 347) | class CLIPFlashAttention2(CLIPAttention):
method __init__ (line 355) | def __init__(self, *args, **kwargs):
method forward (line 364) | def forward(
class CLIPSdpaAttention (line 434) | class CLIPSdpaAttention(CLIPAttention):
method forward (line 442) | def forward(
class CLIPMLP (line 514) | class CLIPMLP(nn.Module):
method __init__ (line 515) | def __init__(self, config):
method forward (line 522) | def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
class CLIPEncoderLayer (line 529) | class CLIPEncoderLayer(nn.Module):
method __init__ (line 530) | def __init__(self, config: CLIPConfig):
method forward (line 538) | def forward(
class CLIPPreTrainedModel (line 579) | class CLIPPreTrainedModel(PreTrainedModel):
method _init_weights (line 591) | def _init_weights(self, module):
class CLIPEncoder (line 748) | class CLIPEncoder(nn.Module):
method __init__ (line 757) | def __init__(self, config: CLIPConfig):
method forward (line 763) | def forward(
class CLIPTextTransformer (line 845) | class CLIPTextTransformer(nn.Module):
method __init__ (line 846) | def __init__(self, config: CLIPTextConfig):
method forward (line 862) | def forward(
class CLIPTextModel (line 949) | class CLIPTextModel(CLIPPreTrainedModel):
method __init__ (line 954) | def __init__(self, config: CLIPTextConfig):
method get_input_embeddings (line 960) | def get_input_embeddings(self) -> nn.Module:
method set_input_embeddings (line 963) | def set_input_embeddings(self, value):
method forward (line 968) | def forward(
class CLIPVisionTransformer (line 1006) | class CLIPVisionTransformer(nn.Module):
method __init__ (line 1007) | def __init__(self, config: CLIPVisionConfig):
method forward (line 1019) | def forward(
class CLIPVisionModel (line 1068) | class CLIPVisionModel(CLIPPreTrainedModel):
method __init__ (line 1073) | def __init__(self, config: CLIPVisionConfig):
method get_input_embeddings (line 1079) | def get_input_embeddings(self) -> nn.Module:
method forward (line 1084) | def forward(
class CLIPModel (line 1124) | class CLIPModel(CLIPPreTrainedModel):
method __init__ (line 1128) | def __init__(self, config: CLIPConfig):
method set_processor (line 1163) | def set_processor(self, model_name):
method get_text_features (line 1167) | def get_text_features(
method get_image_features (line 1214) | def get_image_features(
method encode_image (line 1263) | def encode_image(self, images):
method encode_text (line 1268) | def encode_text(self, text):
method encode_multimodal (line 1273) | def encode_multimodal(self, images, text):
method data_process (line 1282) | def data_process(self, images=None, text=None):
method encode (line 1309) | def encode(self, images=None, text=None):
method forward (line 1321) | def forward(
class CLIPTextModelWithProjection (line 1421) | class CLIPTextModelWithProjection(CLIPPreTrainedModel):
method __init__ (line 1426) | def __init__(self, config: CLIPTextConfig):
method get_input_embeddings (line 1437) | def get_input_embeddings(self) -> nn.Module:
method set_input_embeddings (line 1440) | def set_input_embeddings(self, value):
method forward (line 1445) | def forward(
class CLIPVisionModelWithProjection (line 1503) | class CLIPVisionModelWithProjection(CLIPPreTrainedModel):
method __init__ (line 1507) | def __init__(self, config: CLIPVisionConfig):
method get_input_embeddings (line 1518) | def get_input_embeddings(self) -> nn.Module:
method forward (line 1523) | def forward(
class CLIPForImageClassification (line 1583) | class CLIPForImageClassification(CLIPPreTrainedModel):
method __init__ (line 1586) | def __init__(self, config: CLIPConfig) -> None:
method forward (line 1610) | def forward(
FILE: research/C_MTEB/C_MTEB/tasks/Classification.py
class TNews (line 7) | class TNews(AbsTaskClassification):
method metadata_dict (line 76) | def metadata_dict(self) -> dict[str, str]:
class IFlyTek (line 82) | class IFlyTek(AbsTaskClassification):
method metadata_dict (line 152) | def metadata_dict(self) -> dict[str, str]:
class MultilingualSentiment (line 159) | class MultilingualSentiment(AbsTaskClassification):
method metadata_dict (line 186) | def metadata_dict(self) -> dict[str, str]:
class JDReview (line 192) | class JDReview(AbsTaskClassification):
method metadata_dict (line 224) | def metadata_dict(self) -> dict[str, str]:
class OnlineShopping (line 230) | class OnlineShopping(AbsTaskClassification):
method metadata_dict (line 262) | def metadata_dict(self) -> dict[str, str]:
class Waimai (line 268) | class Waimai(AbsTaskClassification):
method metadata_dict (line 300) | def metadata_dict(self) -> dict[str, str]:
FILE: research/C_MTEB/C_MTEB/tasks/Clustering.py
class CLSClusteringFastS2S (line 17) | class CLSClusteringFastS2S(AbsTaskClusteringFast):
method dataset_transform (line 56) | def dataset_transform(self):
class CLSClusteringFastP2P (line 77) | class CLSClusteringFastP2P(AbsTaskClusteringFast):
method dataset_transform (line 116) | def dataset_transform(self):
class CLSClusteringS2S (line 137) | class CLSClusteringS2S(AbsTaskClustering):
class CLSClusteringP2P (line 173) | class CLSClusteringP2P(AbsTaskClustering):
class ThuNewsClusteringFastS2S (line 207) | class ThuNewsClusteringFastS2S(AbsTaskClusteringFast):
method dataset_transform (line 246) | def dataset_transform(self):
class ThuNewsClusteringFastP2P (line 267) | class ThuNewsClusteringFastP2P(AbsTaskClusteringFast):
method dataset_transform (line 306) | def dataset_transform(self):
class ThuNewsClusteringS2S (line 327) | class ThuNewsClusteringS2S(AbsTaskClustering):
class ThuNewsClusteringP2P (line 370) | class ThuNewsClusteringP2P(AbsTaskClustering):
FILE: research/C_MTEB/C_MTEB/tasks/MultiLongDocRetrieval.py
function load_mldr_data (line 11) | def load_mldr_data(path: str, langs: list, eval_splits: list, cache_dir:...
class MultiLongDocRetrieval (line 31) | class MultiLongDocRetrieval(MultilingualTask, AbsTaskRetrieval):
method description (line 33) | def description(self):
method load_data (line 46) | def load_data(self, **kwargs):
method evaluate (line 58) | def evaluate(
FILE: research/C_MTEB/C_MTEB/tasks/PairClassification.py
class Ocnli (line 7) | class Ocnli(AbsTaskPairClassification):
method dataset_transform (line 40) | def dataset_transform(self):
class Cmnli (line 45) | class Cmnli(AbsTaskPairClassification):
method dataset_transform (line 113) | def dataset_transform(self):
FILE: research/C_MTEB/C_MTEB/tasks/Reranking.py
class T2Reranking (line 7) | class T2Reranking(AbsTaskReranking):
class MMarcoReranking (line 42) | class MMarcoReranking(AbsTaskReranking):
class CMedQAv1 (line 77) | class CMedQAv1(AbsTaskReranking):
class CMedQAv2 (line 116) | class CMedQAv2(AbsTaskReranking):
FILE: research/C_MTEB/C_MTEB/tasks/Retrieval.py
function load_retrieval_data (line 11) | def load_retrieval_data(dataset_path, dataset_revision, qrel_revision, e...
class T2Retrieval (line 28) | class T2Retrieval(AbsTaskRetrieval):
method load_data (line 75) | def load_data(self, **kwargs):
class MMarcoRetrieval (line 88) | class MMarcoRetrieval(AbsTaskRetrieval):
method load_data (line 135) | def load_data(self, **kwargs):
class DuRetrieval (line 148) | class DuRetrieval(AbsTaskRetrieval):
method load_data (line 193) | def load_data(self, **kwargs):
class CovidRetrieval (line 206) | class CovidRetrieval(AbsTaskRetrieval):
method load_data (line 244) | def load_data(self, **kwargs):
class CmedqaRetrieval (line 257) | class CmedqaRetrieval(AbsTaskRetrieval):
method load_data (line 295) | def load_data(self, **kwargs):
class EcomRetrieval (line 308) | class EcomRetrieval(AbsTaskRetrieval):
method load_data (line 348) | def load_data(self, **kwargs):
class MedicalRetrieval (line 361) | class MedicalRetrieval(AbsTaskRetrieval):
method load_data (line 401) | def load_data(self, **kwargs):
class VideoRetrieval (line 414) | class VideoRetrieval(AbsTaskRetrieval):
method load_data (line 454) | def load_data(self, **kwargs):
FILE: research/C_MTEB/C_MTEB/tasks/STS.py
class ATEC (line 8) | class ATEC(AbsTaskSTS):
method metadata_dict (line 54) | def metadata_dict(self) -> dict[str, str]:
class BQ (line 61) | class BQ(AbsTaskSTS):
method metadata_dict (line 96) | def metadata_dict(self) -> dict[str, str]:
class LCQMC (line 103) | class LCQMC(AbsTaskSTS):
method metadata_dict (line 138) | def metadata_dict(self) -> dict[str, str]:
class PAWSX (line 145) | class PAWSX(AbsTaskSTS):
method metadata_dict (line 180) | def metadata_dict(self) -> dict[str, str]:
class STSB (line 187) | class STSB(AbsTaskSTS):
method metadata_dict (line 222) | def metadata_dict(self) -> dict[str, str]:
class AFQMC (line 229) | class AFQMC(AbsTaskSTS):
method metadata_dict (line 275) | def metadata_dict(self) -> dict[str, str]:
class QBQTC (line 282) | class QBQTC(AbsTaskSTS):
method metadata_dict (line 309) | def metadata_dict(self) -> dict[str, str]:
FILE: research/C_MTEB/MKQA/dense_retrieval/step0-generate_embedding.py
class ModelArgs (line 28) | class ModelArgs:
class EvalArgs (line 48) | class EvalArgs:
function get_model (line 67) | def get_model(model_args: ModelArgs):
function parse_corpus (line 77) | def parse_corpus(corpus: datasets.Dataset):
function generate_index (line 89) | def generate_index(model: FlagModel, corpus: datasets.Dataset, max_passa...
function save_result (line 100) | def save_result(index: faiss.Index, docid: list, index_save_dir: str):
function main (line 109) | def main():
FILE: research/C_MTEB/MKQA/dense_retrieval/step1-search_results.py
class ModelArgs (line 28) | class ModelArgs:
class EvalArgs (line 52) | class EvalArgs:
function get_query_encoder (line 88) | def get_query_encoder(model_args: ModelArgs):
function check_languages (line 104) | def check_languages(languages):
function get_queries_and_qids (line 114) | def get_queries_and_qids(qa_data_dir: str, lang: str, add_instruction: b...
function save_result (line 131) | def save_result(search_results, result_save_path: str, qids: list, max_h...
function main (line 142) | def main():
FILE: research/C_MTEB/MKQA/dense_retrieval/step2-eval_dense_mkqa.py
class EvalArgs (line 58) | class EvalArgs:
function check_languages (line 99) | def check_languages(languages):
function compute_average (line 109) | def compute_average(results: dict):
function save_results (line 121) | def save_results(model_name: str, pooling_method: str, normalize_embeddi...
function get_corpus_dict (line 141) | def get_corpus_dict():
function get_qa_dict (line 152) | def get_qa_dict(qa_path: str):
function get_search_result_dict (line 162) | def get_search_result_dict(search_result_path: str, top_k: int=100):
function evaluate (line 181) | def evaluate(corpus_dict: dict, qa_dict: dict, search_result_path: str, ...
function main (line 204) | def main():
FILE: research/C_MTEB/MKQA/hybrid_retrieval/step0-hybrid_search_results.py
class EvalArgs (line 21) | class EvalArgs:
function check_languages (line 61) | def check_languages(languages):
function get_search_result_dict (line 71) | def get_search_result_dict(search_result_path: str, top_k: int=1000):
function get_queries_dict (line 91) | def get_queries_dict(queries_path: str):
function save_hybrid_results (line 100) | def save_hybrid_results(sparse_search_result_path: str, dense_search_res...
function main (line 134) | def main():
FILE: research/C_MTEB/MKQA/hybrid_retrieval/step1-eval_hybrid_mkqa.py
class EvalArgs (line 50) | class EvalArgs:
function check_languages (line 91) | def check_languages(languages):
function compute_average (line 101) | def compute_average(results: dict):
function save_results (line 113) | def save_results(model_name: str, pooling_method: str, normalize_embeddi...
function get_corpus_dict (line 133) | def get_corpus_dict():
function get_qa_dict (line 144) | def get_qa_dict(qa_path: str):
function get_search_result_dict (line 154) | def get_search_result_dict(search_result_path: str, top_k: int=100):
function evaluate (line 173) | def evaluate(corpus_dict: dict, qa_dict: dict, search_result_path: str, ...
function main (line 196) | def main():
FILE: research/C_MTEB/MKQA/multi_vector_rerank/hybrid_all_results.py
class EvalArgs (line 23) | class EvalArgs:
function check_languages (line 75) | def check_languages(languages):
function get_search_result_dict (line 85) | def get_search_result_dict(search_result_path: str, top_k: int=1000):
function get_queries_dict (line 105) | def get_queries_dict(queries_path: str):
function save_hybrid_results (line 114) | def save_hybrid_results(sparse_search_result_dict: dict, dense_search_re...
function main (line 151) | def main():
FILE: research/C_MTEB/MKQA/multi_vector_rerank/step0-rerank_results.py
class ModelArgs (line 33) | class ModelArgs:
class EvalArgs (line 53) | class EvalArgs:
function check_languages (line 113) | def check_languages(languages):
function get_reranker (line 123) | def get_reranker(model_args: ModelArgs, device: str=None):
function get_search_result_dict (line 133) | def get_search_result_dict(search_result_path: str, top_k: int=100):
function get_queries_dict (line 152) | def get_queries_dict(queries_path: str):
function get_corpus_dict (line 162) | def get_corpus_dict(corpus: datasets.Dataset):
function save_rerank_results (line 172) | def save_rerank_results(queries_dict: dict, corpus_dict: dict, reranker:...
function get_shard (line 211) | def get_shard(search_result_dict: dict, num_shards: int, shard_id: int):
function rerank_results (line 225) | def rerank_results(corpus_dict: dict, languages: list, eval_args: EvalAr...
function main (line 277) | def main():
FILE: research/C_MTEB/MKQA/multi_vector_rerank/step1-eval_rerank_mkqa.py
class EvalArgs (line 48) | class EvalArgs:
function check_languages (line 85) | def check_languages(languages):
function compute_average (line 95) | def compute_average(results: dict):
function save_results (line 107) | def save_results(model_name: str, reranker_name: str, results: dict, sav...
function get_corpus_dict (line 126) | def get_corpus_dict():
function get_qa_dict (line 137) | def get_qa_dict(qa_path: str):
function get_search_result_dict (line 147) | def get_search_result_dict(search_result_path: str, top_k: int=100):
function evaluate (line 166) | def evaluate(corpus_dict: dict, qa_dict: dict, search_result_path: str, ...
function main (line 189) | def main():
FILE: research/C_MTEB/MKQA/sparse_retrieval/bm25_baseline.py
function generate_corpus (line 25) | def generate_corpus(corpus_save_path: str):
function generate_queries (line 42) | def generate_queries(qa_data_dir: str, lang: str, queries_save_dir: str):
function index (line 65) | def index(corpus_save_dir: str, index_save_dir: str):
function search (line 77) | def search(index_save_dir: str, queries_save_dir: str, lang: str, result...
function main (line 92) | def main():
FILE: research/C_MTEB/MKQA/sparse_retrieval/bm25_baseline_same_tokenizer.py
function _map_func_corpus (line 32) | def _map_func_corpus(examples):
function _map_func_query (line 51) | def _map_func_query(examples):
function generate_corpus (line 70) | def generate_corpus(corpus_save_path: str):
function generate_queries (line 90) | def generate_queries(qa_data_dir: str, lang: str, queries_save_dir: str):
function index (line 115) | def index(corpus_save_dir: str, index_save_dir: str):
function search (line 127) | def search(index_save_dir: str, queries_save_dir: str, lang: str, result...
function main (line 141) | def main():
FILE: research/C_MTEB/MKQA/sparse_retrieval/step0-encode_query-and-corpus.py
class ModelArgs (line 31) | class ModelArgs:
class EvalArgs (line 51) | class EvalArgs:
function get_model (line 82) | def get_model(model_args: ModelArgs):
function check_languages (line 92) | def check_languages(languages):
function parse_corpus (line 102) | def parse_corpus(corpus: datasets.Dataset):
function get_queries (line 114) | def get_queries(qa_data_dir: str, lang: str):
function encode_and_save_corpus (line 134) | def encode_and_save_corpus(corpus_save_path: str, model: BGEM3FlagModel,...
function encode_and_save_queries (line 161) | def encode_and_save_queries(queries_save_path: str, model: BGEM3FlagMode...
function main (line 191) | def main():
FILE: research/C_MTEB/MKQA/sparse_retrieval/step1-search_results.py
class ModelArgs (line 20) | class ModelArgs:
class EvalArgs (line 28) | class EvalArgs:
function check_languages (line 64) | def check_languages(languages):
function generate_index (line 74) | def generate_index(corpus_embd_dir: str, index_save_dir: str, threads: i...
function search_and_save_results (line 87) | def search_and_save_results(index_save_dir: str, query_embd_path: str, r...
function parse_corpus (line 101) | def parse_corpus(corpus: datasets.Dataset):
function main (line 107) | def main():
FILE: research/C_MTEB/MKQA/sparse_retrieval/step2-eval_sparse_mkqa.py
class EvalArgs (line 56) | class EvalArgs:
function check_languages (line 97) | def check_languages(languages):
function compute_average (line 107) | def compute_average(results: dict):
function save_results (line 119) | def save_results(model_name: str, pooling_method: str, normalize_embeddi...
function get_corpus_dict (line 139) | def get_corpus_dict():
function get_qa_dict (line 150) | def get_qa_dict(qa_path: str):
function get_search_result_dict (line 160) | def get_search_result_dict(search_result_path: str, top_k: int=100):
function evaluate (line 179) | def evaluate(corpus_dict: dict, qa_dict: dict, search_result_path: str, ...
function main (line 202) | def main():
FILE: research/C_MTEB/MKQA/utils/evaluation.py
class SimpleTokenizer (line 8) | class SimpleTokenizer:
method __init__ (line 12) | def __init__(self):
method tokenize (line 22) | def tokenize(self, text, uncased=False):
function _normalize (line 31) | def _normalize(text):
function has_answer (line 35) | def has_answer(answers, text, tokenizer) -> bool:
function check_answer (line 49) | def check_answer(example, tokenizer) -> List[bool]:
function evaluate_recall_qa (line 63) | def evaluate_recall_qa(ctxs, answers, k=100):
FILE: research/C_MTEB/MKQA/utils/normalize_text.py
function normalize (line 133) | def normalize(text):
FILE: research/C_MTEB/MLDR/dense_retrieval/step0-generate_embedding.py
class ModelArgs (line 23) | class ModelArgs:
class EvalArgs (line 43) | class EvalArgs:
function get_model (line 67) | def get_model(model_args: ModelArgs):
function check_languages (line 77) | def check_languages(languages):
function load_corpus (line 87) | def load_corpus(lang: str):
function generate_index (line 95) | def generate_index(model: FlagModel, corpus: datasets.Dataset, max_passa...
function save_result (line 106) | def save_result(index: faiss.Index, docid: list, index_save_dir: str):
function main (line 115) | def main():
FILE: research/C_MTEB/MLDR/dense_retrieval/step1-search_results.py
class ModelArgs (line 24) | class ModelArgs:
class EvalArgs (line 48) | class EvalArgs:
function get_query_encoder (line 76) | def get_query_encoder(model_args: ModelArgs):
function check_languages (line 92) | def check_languages(languages):
function get_queries_and_qids (line 102) | def get_queries_and_qids(lang: str, split: str='test', add_instruction: ...
function save_result (line 115) | def save_result(search_results, result_save_path: str, qids: list, max_h...
function main (line 126) | def main():
FILE: research/C_MTEB/MLDR/dense_retrieval/step2-eval_dense_mldr.py
class EvalArgs (line 49) | class EvalArgs:
function check_languages (line 86) | def check_languages(languages):
function compute_average (line 96) | def compute_average(results: dict):
function save_results (line 108) | def save_results(model_name: str, pooling_method: str, normalize_embeddi...
function map_metric (line 128) | def map_metric(metric: str):
function evaluate (line 138) | def evaluate(script_path, qrels_path, search_result_path, metrics: list):
function main (line 164) | def main():
FILE: research/C_MTEB/MLDR/hybrid_retrieval/step0-hybrid_search_results.py
class EvalArgs (line 19) | class EvalArgs:
function check_languages (line 55) | def check_languages(languages):
function get_search_result_dict (line 65) | def get_search_result_dict(search_result_path: str, top_k: int=1000):
function save_hybrid_results (line 85) | def save_hybrid_results(sparse_search_result_dict: dict, dense_search_re...
function main (line 116) | def main():
FILE: research/C_MTEB/MLDR/hybrid_retrieval/step1-eval_hybrid_mldr.py
class EvalArgs (line 39) | class EvalArgs:
function check_languages (line 76) | def check_languages(languages):
function compute_average (line 86) | def compute_average(results: dict):
function save_results (line 98) | def save_results(model_name: str, pooling_method: str, normalize_embeddi...
function map_metric (line 118) | def map_metric(metric: str):
function evaluate (line 128) | def evaluate(script_path, qrels_path, search_result_path, metrics: list):
function main (line 154) | def main():
FILE: research/C_MTEB/MLDR/mteb_dense_eval/eval_MLDR.py
class EvalArgs (line 26) | class EvalArgs:
class ModelArgs (line 43) | class ModelArgs:
function check_languages (line 86) | def check_languages(languages):
function main (line 98) | def main():
FILE: research/C_MTEB/MLDR/mteb_dense_eval/flag_dres_model.py
function _transform_func (line 13) | def _transform_func(examples: Dict[str, List],
function _transform_func_v2 (line 24) | def _transform_func_v2(examples: Dict[str, List],
class FlagDRESModel (line 39) | class FlagDRESModel(DRESModel):
method __init__ (line 40) | def __init__(
method encode_queries (line 81) | def encode_queries(self, queries: List[str], **kwargs) -> np.ndarray:
method encode_corpus (line 98) | def encode_corpus(self, corpus: List[Union[Dict[str, str], str]], **kw...
method encode (line 116) | def encode(self, sentences: List[str], max_length: int, batch_size: in...
method pooling (line 161) | def pooling(self,
FILE: research/C_MTEB/MLDR/multi_vector_rerank/hybrid_all_results.py
class EvalArgs (line 21) | class EvalArgs:
function check_languages (line 69) | def check_languages(languages):
function get_search_result_dict (line 79) | def get_search_result_dict(search_result_path: str, top_k: int=1000):
function save_hybrid_results (line 99) | def save_hybrid_results(sparse_search_result_dict: dict, dense_search_re...
function main (line 136) | def main():
FILE: research/C_MTEB/MLDR/multi_vector_rerank/step0-rerank_results.py
class ModelArgs (line 28) | class ModelArgs:
class EvalArgs (line 48) | class EvalArgs:
function check_languages (line 108) | def check_languages(languages):
function get_reranker (line 118) | def get_reranker(model_args: ModelArgs, device: str=None):
function get_search_result_dict (line 128) | def get_search_result_dict(search_result_path: str, top_k: int=200):
function get_queries_dict (line 147) | def get_queries_dict(lang: str, split: str='test'):
function get_corpus_dict (line 158) | def get_corpus_dict(lang: str):
function save_rerank_results (line 169) | def save_rerank_results(queries_dict: dict, corpus_dict: dict, reranker:...
function get_shard (line 207) | def get_shard(search_result_dict: dict, num_shards: int, shard_id: int):
function rerank_results (line 221) | def rerank_results(languages: list, eval_args: EvalArgs, model_args: Mod...
function main (line 278) | def main():
FILE: research/C_MTEB/MLDR/multi_vector_rerank/step1-eval_rerank_mldr.py
class EvalArgs (line 41) | class EvalArgs:
function check_languages (line 74) | def check_languages(languages):
function compute_average (line 84) | def compute_average(results: dict):
function save_results (line 96) | def save_results(model_name: str, reranker_name: str, results: dict, sav...
function map_metric (line 115) | def map_metric(metric: str):
function evaluate (line 125) | def evaluate(script_path: str, qrels_path, search_result_path, metrics: ...
function merge_search_result (line 151) | def merge_search_result(search_result_save_dir: str, lang: str):
function main (line 175) | def main():
FILE: research/C_MTEB/MLDR/sparse_retrieval/bm25_baseline.py
function generate_corpus (line 19) | def generate_corpus(lang: str, corpus_save_dir: str):
function generate_queries (line 32) | def generate_queries(lang: str, queries_save_dir: str, split: str='test'):
function index (line 52) | def index(lang: str, corpus_save_dir: str, index_save_dir: str):
function search (line 64) | def search(index_save_dir: str, queries_save_dir: str, lang: str, result...
function main (line 79) | def main():
FILE: research/C_MTEB/MLDR/sparse_retrieval/bm25_baseline_same_tokenizer.py
function _map_func_corpus (line 26) | def _map_func_corpus(examples):
function _map_func_query (line 46) | def _map_func_query(examples):
function generate_corpus (line 67) | def generate_corpus(lang: str, corpus_save_dir: str):
function generate_queries (line 82) | def generate_queries(lang: str, queries_save_dir: str, split: str='test'):
function index (line 104) | def index(corpus_save_dir: str, index_save_dir: str):
function search (line 115) | def search(index_save_dir: str, queries_save_dir: str, lang: str, result...
function main (line 129) | def main():
FILE: research/C_MTEB/MLDR/sparse_retrieval/step0-encode_query-and-corpus.py
class ModelArgs (line 25) | class ModelArgs:
class EvalArgs (line 45) | class EvalArgs:
function get_model (line 77) | def get_model(model_args: ModelArgs):
function check_languages (line 87) | def check_languages(languages):
function load_corpus (line 97) | def load_corpus(lang: str):
function get_queries (line 105) | def get_queries(lang: str, split: str='test'):
function encode_corpus (line 119) | def encode_corpus(model: BGEM3FlagModel, corpus: datasets.Dataset, max_p...
function encode_queries (line 143) | def encode_queries(model: BGEM3FlagModel, queries: datasets.Dataset, max...
function save_result (line 170) | def save_result(encoded_queries_list: list, encoded_corpus_list: list, s...
function main (line 185) | def main():
FILE: research/C_MTEB/MLDR/sparse_retrieval/step1-search_results.py
class ModelArgs (line 16) | class ModelArgs:
class EvalArgs (line 24) | class EvalArgs:
function check_languages (line 56) | def check_languages(languages):
function generate_index (line 66) | def generate_index(lang: str, corpus_embd_dir: str, index_save_dir: str,...
function search_and_save_results (line 79) | def search_and_save_results(index_save_dir: str, query_embd_path: str, r...
function main (line 93) | def main():
FILE: research/C_MTEB/MLDR/sparse_retrieval/step2-eval_sparse_mldr.py
class EvalArgs (line 47) | class EvalArgs:
function check_languages (line 84) | def check_languages(languages):
function compute_average (line 94) | def compute_average(results: dict):
function save_results (line 106) | def save_results(model_name: str, pooling_method: str, normalize_embeddi...
function map_metric (line 130) | def map_metric(metric: str):
function evaluate (line 140) | def evaluate(script_path, qrels_path, search_result_path, metrics: list):
function main (line 166) | def main():
FILE: research/C_MTEB/eval_C-MTEB.py
function get_args (line 18) | def get_args():
FILE: research/C_MTEB/eval_MTEB.py
function get_args (line 16) | def get_args():
FILE: research/C_MTEB/eval_cross_encoder.py
function get_args (line 9) | def get_args():
FILE: research/C_MTEB/flag_dres_model.py
class FlagDRESModel (line 9) | class FlagDRESModel:
method __init__ (line 10) | def __init__(
method encode_queries (line 40) | def encode_queries(self, queries: List[str], **kwargs) -> np.ndarray:
method encode_corpus (line 52) | def encode_corpus(self, corpus: List[Union[Dict[str, str], str]], **kw...
method encode (line 65) | def encode(self, sentences: List[str], **kwargs) -> np.ndarray:
method pooling (line 87) | def pooling(self,
FILE: research/C_MTEB/summarize_results.py
function read_results (line 21) | def read_results(task_types, args):
function output_markdown (line 47) | def output_markdown(tasks_results, model, save_file):
function get_args (line 119) | def get_args():
FILE: research/LLARA/finetune/arguments.py
function default_list (line 8) | def default_list() -> List[int]:
class ModelArguments (line 13) | class ModelArguments:
class DataArguments (line 83) | class DataArguments:
method __post_init__ (line 153) | def __post_init__(self):
class RetrieverTrainingArguments (line 158) | class RetrieverTrainingArguments(TrainingArguments):
FILE: research/LLARA/finetune/data.py
class TrainDatasetForEmbedding (line 19) | class TrainDatasetForEmbedding(Dataset):
method __init__ (line 20) | def __init__(
method __len__ (line 52) | def __len__(self):
method __getitem__ (line 56) | def __getitem__(self, item) -> Tuple[BatchEncoding, List[BatchEncoding]]:
class EmbedCollator (line 96) | class EmbedCollator(DataCollatorForSeq2Seq):
method __call__ (line 106) | def __call__(self, features, return_tensors='pt'):
FILE: research/LLARA/finetune/load_model.py
function get_model (line 8) | def get_model(model_args):
FILE: research/LLARA/finetune/modeling.py
class EncoderOutput (line 17) | class EncoderOutput(ModelOutput):
class BiEncoderModel (line 24) | class BiEncoderModel(nn.Module):
method __init__ (line 27) | def __init__(self,
method gradient_checkpointing_enable (line 59) | def gradient_checkpointing_enable(self, **kwargs):
method enable_input_require_grads (line 62) | def enable_input_require_grads(self, **kwargs):
method encode (line 65) | def encode(self, features):
method compute_similarity (line 107) | def compute_similarity(self, q_reps, p_reps):
method forward (line 112) | def forward(self, query: Union[Dict[str, Tensor], List[Dict[str, Tenso...
method compute_loss (line 144) | def compute_loss(self, scores, target):
method _dist_gather_tensor (line 147) | def _dist_gather_tensor(self, t: Optional[torch.Tensor]):
method save (line 159) | def save(self, output_dir: str):
FILE: research/LLARA/finetune/run.py
function main (line 21) | def main():
FILE: research/LLARA/finetune/trainer.py
class BiTrainer (line 4) | class BiTrainer(Trainer):
method _save (line 5) | def _save(self, output_dir: Optional[str] = None, state_dict=None):
method compute_loss (line 28) | def compute_loss(self, model, inputs, return_outputs=False):
FILE: research/LLARA/pretrain/arguments.py
function default_list (line 8) | def default_list() -> List[int]:
class ModelArguments (line 13) | class ModelArguments:
class DataArguments (line 70) | class DataArguments:
method __post_init__ (line 95) | def __post_init__(self):
class PretrainTrainingArguments (line 100) | class PretrainTrainingArguments(TrainingArguments):
FILE: research/LLARA/pretrain/data.py
class TrainDatasetForEmbedding (line 16) | class TrainDatasetForEmbedding(Dataset):
method __init__ (line 17) | def __init__(
method __len__ (line 52) | def __len__(self):
method __getitem__ (line 55) | def __getitem__(self, item):
class EmbedCollator (line 90) | class EmbedCollator(DataCollatorForSeq2Seq):
method __call__ (line 98) | def __call__(self, features, return_tensors='pt'):
FILE: research/LLARA/pretrain/load_model.py
function get_model (line 6) | def get_model(model_args, use_gradient_checkpointing: bool = False):
FILE: research/LLARA/pretrain/modeling.py
class NewLlamaModel (line 20) | class NewLlamaModel(LlamaModel):
method forward (line 23) | def forward(
method _update_causal_mask (line 156) | def _update_causal_mask(
class PreLlamaModel (line 236) | class PreLlamaModel(LlamaForCausalLM):
method __init__ (line 237) | def __init__(self, config):
method forward (line 266) | def forward(
class PreModel (line 419) | class PreModel(nn.Module):
method __init__ (line 420) | def __init__(self,
method gradient_checkpointing_enable (line 426) | def gradient_checkpointing_enable(self, **kwargs):
method enable_input_require_grads (line 429) | def enable_input_require_grads(self, **kwargs):
method forward (line 432) | def forward(self, *args, **kwargs):
method save (line 435) | def save(self, output_dir: str):
FILE: research/LLARA/pretrain/run.py
function main (line 21) | def main():
FILE: research/LLARA/pretrain/trainer.py
class PreTrainer (line 3) | class PreTrainer(Trainer):
method _save (line 4) | def _save(self, output_dir: Optional[str] = None, state_dict=None):
method compute_loss (line 21) | def compute_loss(self, model, inputs, return_outputs=False):
FILE: research/LM_Cocktail/LM_Cocktail/cocktail.py
function save_ckpt_for_sentence_transformers (line 17) | def save_ckpt_for_sentence_transformers(ckpt_dir, pooling_mode: str = 'c...
function mix_models (line 30) | def mix_models(model_names_or_paths: List[str],
function mix_models_with_data (line 72) | def mix_models_with_data(model_names_or_paths: List[str],
function mix_models_by_layers (line 125) | def mix_models_by_layers(model_names_or_paths: List[str],
FILE: research/LM_Cocktail/LM_Cocktail/utils.py
function load_llm (line 14) | def load_llm(model_name:str, trust_remote_code:bool):
function load_embedder (line 19) | def load_embedder(model_name:str, trust_remote_code:bool):
function load_reranker (line 24) | def load_reranker(model_name:str, trust_remote_code:bool):
function load_seq2seq_model (line 29) | def load_seq2seq_model(model_name:str, trust_remote_code:bool):
function load_model (line 34) | def load_model(model_name:str, model_type:str, trust_remote_code:bool=Tr...
function get_model_param_list (line 48) | def get_model_param_list(model_names: List[str], model_type:str):
function merge_param (line 57) | def merge_param(model_param_list: List[Dict], weights: List[float]):
function get_model_param_dirs (line 70) | def get_model_param_dirs(model_names: List[str], model_type:str):
function merge_param_by_layer (line 95) | def merge_param_by_layer(model_param_dirs: List[str], weights: List[floa...
function compute_weights (line 127) | def compute_weights(base_model, tokenizer, param_list: List[Dict], model...
function preprocess_data_for_seq2seq (line 158) | def preprocess_data_for_seq2seq(example_data, tokenizer, device, batch_s...
function preprocess_data_for_embedder (line 181) | def preprocess_data_for_embedder(example_data, tokenizer, device, batch_...
function seq2seq_loss (line 207) | def seq2seq_loss(base_model, input_data):
function embedder_loss (line 219) | def embedder_loss(base_model, input_data):
function preprocess_data_for_llm (line 239) | def preprocess_data_for_llm(example_data, tokenizer, device, batch_size:...
function llm_loss (line 275) | def llm_loss(base_model, input_data):
FILE: research/Long_LLM/activation_beacon/main/eval_generation.py
class Args (line 18) | class Args(ModelArgs):
function main (line 55) | def main():
FILE: research/Long_LLM/activation_beacon/main/eval_infbench.py
class Args (line 23) | class Args(ModelArgs):
function process_infbench (line 62) | def process_infbench(data, indices, tokenizer, chat_template, task:str, ...
function main (line 97) | def main():
FILE: research/Long_LLM/activation_beacon/main/eval_lm.py
class Args (line 16) | class Args(ModelArgs):
function process_lm_pre (line 50) | def process_lm_pre(tokenizer, tokenize_max_char=None):
function process_lm (line 62) | def process_lm(tokenizer, max_length=4096, stride=1024, min_length=None):
function main (line 124) | def main():
FILE: research/Long_LLM/activation_beacon/main/eval_longbench.py
class Args (line 22) | class Args(ModelArgs):
function process_longbench (line 61) | def process_longbench(data, indices, tokenizer, chat_template, task, max...
function main (line 96) | def main():
FILE: research/Long_LLM/activation_beacon/main/eval_mmlu.py
class Args (line 22) | class Args(ModelArgs):
function remove_eos (line 51) | def remove_eos(inputs: Mapping, eos_token_ids: Union[List,int]):
function process_mmlu (line 64) | def process_mmlu(tokenizer, chat_template, eos_token_id, few_shot=0, tra...
function evaluate_mmlu (line 138) | def evaluate_mmlu(eval_data, save_path, eval_preds):
function main (line 182) | def main():
FILE: research/Long_LLM/activation_beacon/main/eval_msc.py
class Args (line 23) | class Args(ModelArgs):
function process_msc (line 48) | def process_msc(data, tokenizer, max_length, chat_template):
function main (line 66) | def main():
FILE: research/Long_LLM/activation_beacon/main/eval_multiturn.py
class Args (line 21) | class Args(ModelArgs):
function process_multiturn (line 50) | def process_multiturn(data, indices, tokenizer, chat_template, min_lengt...
function main (line 114) | def main():
FILE: research/Long_LLM/activation_beacon/main/eval_needle.py
class Args (line 28) | class Args(ModelArgs):
method __post_init__ (line 98) | def __post_init__(self):
class OpenAIEvaluator (line 103) | class OpenAIEvaluator:
method __init__ (line 113) | def __init__(self,
method evaluate_response (line 147) | def evaluate_response(self, response: str) -> int:
function generate_sample (line 170) | def generate_sample(
function main (line 212) | def main():
FILE: research/Long_LLM/activation_beacon/main/eval_passkey.py
class Args (line 28) | class Args(ModelArgs):
function generate_sample (line 85) | def generate_sample(tokenizer, chat_template, context_length, passkey_de...
function main (line 126) | def main():
FILE: research/Long_LLM/activation_beacon/main/eval_topic.py
class Args (line 25) | class Args(ModelArgs):
function process_topic_retrieval (line 54) | def process_topic_retrieval(data, tokenizer, chat_template, num_topic, t...
function main (line 93) | def main():
FILE: research/Long_LLM/activation_beacon/main/infbench_utils.py
function normalize_answer (line 12) | def normalize_answer(s: str) -> str:
function normalize_zh_answer (line 31) | def normalize_zh_answer(s: str) -> str:
function f1_score (line 48) | def f1_score(prediction, ground_truth) -> tuple[float, float, float]:
function qa_f1_score (line 59) | def qa_f1_score(pred: str, ground_truths) -> float:
function qa_f1_score_zh (line 78) | def qa_f1_score_zh(pred: str, ground_truths: list[str]) -> float:
function load_json (line 100) | def load_json(fname):
function iter_jsonl (line 104) | def iter_jsonl(fname, cnt=None):
function first_int_match (line 117) | def first_int_match(prediction):
function split_retrieval_answer (line 127) | def split_retrieval_answer(pred: str):
function get_score_one_kv_retrieval (line 134) | def get_score_one_kv_retrieval(pred, label, model_name: str) -> bool:
function get_score_one_passkey (line 141) | def get_score_one_passkey(pred, label, model_name: str) -> bool:
function get_score_one_number_string (line 147) | def get_score_one_number_string(pred, label, model_name: str) -> bool:
function get_score_one_code_run (line 153) | def get_score_one_code_run(pred, label, model_name: str) -> bool:
function get_score_one_code_debug (line 172) | def get_score_one_code_debug(pred, label, model_name: str) -> bool:
function get_score_one_math_find (line 208) | def get_score_one_math_find(pred, label, model_name: str) -> bool:
function get_score_one_longdialogue_qa_eng (line 230) | def get_score_one_longdialogue_qa_eng(pred, label, model_name: str) -> b...
function get_score_one_longbook_choice_eng (line 239) | def get_score_one_longbook_choice_eng(pred, label, model_name: str) -> b...
function get_score_one_longbook_qa_eng (line 280) | def get_score_one_longbook_qa_eng(pred, label, model_name: str) -> float:
function get_score_one_longbook_sum_eng (line 284) | def get_score_one_longbook_sum_eng(
function get_score_one_longbook_qa_chn (line 297) | def get_score_one_longbook_qa_chn(pred, label, model_name: str) -> float:
function get_score_one_math_calc (line 301) | def get_score_one_math_calc(pred, label, model_name: str) -> float:
function get_score_one (line 326) | def get_score_one(
function get_labels (line 359) | def get_labels(preds: list) -> list[str]:
function get_preds (line 367) | def get_preds(preds: list, data_name: str) -> list[str]:
function get_score (line 382) | def get_score(
function compute_scores (line 396) | def compute_scores(preds_path, data_name: str, model_name: str):
function create_prompt (line 406) | def create_prompt(eg: dict, data_name: str, prompt_template: str) -> str:
function get_answer (line 516) | def get_answer(eg: dict, data_name: str):
FILE: research/Long_LLM/activation_beacon/main/longbench_utils.py
function normalize_answer (line 12) | def normalize_answer(s):
function normalize_zh_answer (line 31) | def normalize_zh_answer(s):
function count_score (line 47) | def count_score(prediction, ground_truth, **kwargs):
function retrieval_score (line 56) | def retrieval_score(prediction, ground_truth, **kwargs):
function retrieval_zh_score (line 68) | def retrieval_zh_score(prediction, ground_truth, **kwargs):
function code_sim_score (line 80) | def code_sim_score(prediction, ground_truth, **kwargs):
function classification_score (line 89) | def classification_score(prediction, ground_truth, **kwargs):
function rouge_score (line 114) | def rouge_score(prediction, ground_truth, **kwargs):
function rouge_score_zh (line 122) | def rouge_score_zh(prediction, ground_truth, **kwargs):
function f1_score (line 128) | def f1_score(prediction, ground_truth, **kwargs):
function qa_f1_score (line 138) | def qa_f1_score(prediction, ground_truth, **kwargs):
function qa_f1_score_zh (line 147) | def qa_f1_score_zh(prediction, ground_truth, **kwargs):
function scorer (line 156) | def scorer(dataset, predictions, answers, all_classes):
FILE: research/Long_LLM/activation_beacon/main/pretrain_data.py
class Args (line 22) | class Args(ModelArgs):
function prepare_pretrain_data (line 54) | def prepare_pretrain_data(data_files, tokenizer: PreTrainedTokenizer, co...
FILE: research/Long_LLM/activation_beacon/main/train.py
function main (line 20) | def main():
FILE: research/Long_LLM/activation_beacon/src/__init__.py
function get_model_and_tokenizer (line 15) | def get_model_and_tokenizer(model_args, device="cpu", evaluation_mode=Tr...
FILE: research/Long_LLM/activation_beacon/src/args.py
class ModelArgs (line 9) | class ModelArgs:
method resolve_path (line 204) | def resolve_path(self, path):
method get_generation_config (line 218) | def get_generation_config(self):
method to_dict (line 230) | def to_dict(self):
method save (line 233) | def save(self, path):
method __post_init__ (line 237) | def __post_init__(self):
class TrainingArgs (line 259) | class TrainingArgs(TrainingArguments):
method __post_init__ (line 374) | def __post_init__(self):
FILE: research/Long_LLM/activation_beacon/src/chat.py
class ChatTemplateOutput (line 17) | class ChatTemplateOutput:
function mask_nested_lists (line 22) | def mask_nested_lists(lst, mask_target, mask_value=0):
function apply_chat_template (line 31) | def apply_chat_template(template, messages, system_message=None, tokeniz...
class SeparatorStyle (line 223) | class SeparatorStyle(IntEnum):
class Conversation (line 255) | class Conversation:
method get_prompt (line 280) | def get_prompt(self) -> str:
method get_images (line 529) | def get_images(self):
method set_system_message (line 539) | def set_system_message(self, system_message: str):
method get_system_message (line 543) | def get_system_message(self):
method append_message (line 547) | def append_message(self, role: str, message: str):
method update_last_message (line 551) | def update_last_message(self, message: str):
method convert_image_to_base64 (line 559) | def convert_image_to_base64(self, image):
method to_gradio_chatbot (line 594) | def to_gradio_chatbot(self):
method to_openai_api_messages (line 610) | def to_openai_api_messages(self):
method extract_text_from_messages (line 625) | def extract_text_from_messages(self):
method copy (line 631) | def copy(self):
method dict (line 646) | def dict(self):
function register_conv_template (line 660) | def register_conv_template(template: Conversation, override: bool = False):
function get_conv_template (line 670) | def get_conv_template(name: str) -> Conversation:
FILE: research/Long_LLM/activation_beacon/src/data.py
class Data (line 18) | class Data:
method _process_pretrain_data (line 19) | def _process_pretrain_data(data, indices):
method _process_language_modeling (line 28) | def _process_language_modeling(data, indices, tokenizer, min_length, m...
method _process_instruction_tuning (line 54) | def _process_instruction_tuning(data, indices, tokenizer, chat_templat...
method prepare_train_data (line 102) | def prepare_train_data(data_files=None, tokenizer=None, max_length=409...
method prepare_eval_data (line 174) | def prepare_eval_data(data_files=None, tokenizer=None, max_length=4096...
FILE: research/Long_LLM/activation_beacon/src/llama/configuration_llama.py
class LlamaConfig (line 31) | class LlamaConfig(PretrainedConfig):
method __init__ (line 117) | def __init__(
method _rope_scaling_validation (line 196) | def _rope_scaling_validation(self):
FILE: research/Long_LLM/activation_beacon/src/llama/modeling_llama.py
function _get_unpad_data (line 63) | def _get_unpad_data(attention_mask):
class LlamaRMSNorm (line 76) | class LlamaRMSNorm(nn.Module):
method __init__ (line 77) | def __init__(self, hidden_size, eps=1e-6):
method forward (line 85) | def forward(self, hidden_states):
class LlamaMLP (line 94) | class LlamaMLP(nn.Module):
method __init__ (line 95) | def __init__(self, config):
method forward (line 105) | def forward(self, x):
function repeat_kv (line 110) | def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
class LlamaAttention (line 122) | class LlamaAttention(nn.Module):
method __init__ (line 125) | def __init__(self, config: LlamaConfig, layer_idx: Optional[int] = None):
method _init_beacon_proj (line 179) | def _init_beacon_proj(self, missing_keys):
method _shape (line 253) | def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
method qkv_proj_with_beacon (line 329) | def qkv_proj_with_beacon(self, hidden_states, beacon_size, beacon_indi...
method o_proj_with_beacon (line 375) | def o_proj_with_beacon(self, attn_output, beacon_size, beacon_indices):
method forward (line 390) | def forward(
class LlamaSdpaAttention (line 473) | class LlamaSdpaAttention(LlamaAttention):
method forward (line 481) | def forward(
class LlamaFlashAttention2 (line 563) | class LlamaFlashAttention2(LlamaAttention):
method __init__ (line 570) | def __init__(self, *args, **kwargs):
method forward (line 578) | def forward(
method _flash_attention_forward (line 667) | def _flash_attention_forward(
method _upad_input (line 726) | def _upad_input(self, query_layer, key_layer, value_layer, attention_m...
class LlamaDecoderLayer (line 772) | class LlamaDecoderLayer(nn.Module):
method __init__ (line 773) | def __init__(self, config: LlamaConfig, layer_idx: int):
method forward (line 783) | def forward(
class LlamaPreTrainedModel (line 865) | class LlamaPreTrainedModel(PreTrainedModel):
method _init_weights (line 875) | def _init_weights(self, module):
class LlamaModel (line 961) | class LlamaModel(LlamaPreTrainedModel):
method __init__ (line 969) | def __init__(self, config: LlamaConfig):
method _init_beacon_embed (line 990) | def _init_beacon_embed(self, missing_keys):
method get_input_embeddings (line 1021) | def get_input_embeddings(self):
method set_input_embeddings (line 1024) | def set_input_embeddings(self, value):
method forward (line 1028) | def forward(
class LlamaForCausalLM (line 1148) | class LlamaForCausalLM(LlamaPreTrainedModel):
method __init__ (line 1151) | def __init__(self, config):
method get_input_embeddings (line 1159) | def get_input_embeddings(self):
method set_input_embeddings (line 1162) | def set_input_embeddings(self, value):
method get_output_embeddings (line 1165) | def get_output_embeddings(self):
method set_output_embeddings (line 1168) | def set_output_embeddings(self, new_embeddings):
method set_decoder (line 1171) | def set_decoder(self, decoder):
method get_decoder (line 1174) | def get_decoder(self):
method from_pretrained (line 1178) | def from_pretrained(cls, *args, **kwargs):
method _native_forward (line 1201) | def _native_forward(
method _beacon_forward (line 1263) | def _beacon_forward(self,
method forward (line 1335) | def forward(self, **kwargs):
method prepare_inputs_for_generation (line 1346) | def prepare_inputs_for_generation(
method _reorder_cache (line 1377) | def _reorder_cache(past_key_values, beam_idx):
FILE: research/Long_LLM/activation_beacon/src/metrics.py
class Metric (line 14) | class Metric:
method get_metric_fn (line 17) | def get_metric_fn(cls, metrics, **kwds):
method get_save_path (line 40) | def get_save_path(eval_data, output_dir=None, field="result", save_nam...
method save_result (line 57) | def save_result(preds, labels, save_path, indices=None, **kwargs):
method rouge (line 73) | def rouge(preds, labels, **kwargs):
FILE: research/Long_LLM/activation_beacon/src/mistral/configuration_mistral.py
class MistralConfig (line 29) | class MistralConfig(PretrainedConfig):
method __init__ (line 104) | def __init__(
method _rope_scaling_validation (line 181) | def _rope_scaling_validation(self):
FILE: research/Long_LLM/activation_beacon/src/mistral/modeling_mistral.py
function _get_unpad_data (line 63) | def _get_unpad_data(attention_mask):
class MistralRMSNorm (line 76) | class MistralRMSNorm(nn.Module):
method __init__ (line 77) | def __init__(self, hidden_size, eps=1e-6):
method forward (line 85) | def forward(self, hidden_states):
class MistralMLP (line 94) | class MistralMLP(nn.Module):
method __init__ (line 95) | def __init__(self, config):
method forward (line 105) | def forward(self, x):
function repeat_kv (line 110) | def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
class MistralAttention (line 122) | class MistralAttention(nn.Module):
method __init__ (line 125) | def __init__(self, config: MistralConfig, layer_idx: Optional[int] = N...
method _init_beacon_proj (line 178) | def _init_beacon_proj(self, missing_keys):
method _shape (line 252) | def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
method qkv_proj_with_beacon (line 255) | def qkv_proj_with_beacon(self, hidden_states, beacon_size, beacon_indi...
method o_proj_with_beacon (line 301) | def o_proj_with_beacon(self, attn_output, beacon_size, beacon_indices):
method forward (line 316) | def forward(
class MistralSdpaAttention (line 399) | class MistralSdpaAttention(MistralAttention):
method forward (line 407) | def forward(
class MistralFlashAttention2 (line 489) | class MistralFlashAttention2(MistralAttention):
method __init__ (line 496) | def __init__(self, *args, **kwargs):
method forward (line 504) | def forward(
method _flash_attention_forward (line 596) | def _flash_attention_forward(
method _upad_input (line 655) | def _upad_input(self, query_layer, key_layer, value_layer, attention_m...
class MistralDecoderLayer (line 701) | class MistralDecoderLayer(nn.Module):
method __init__ (line 702) | def __init__(self, config: MistralConfig, layer_idx: int):
method forward (line 717) | def forward(
class MistralPreTrainedModel (line 798) | class MistralPreTrainedModel(PreTrainedModel):
method _init_weights (line 808) | def _init_weights(self, module):
class MistralModel (line 894) | class MistralModel(MistralPreTrainedModel):
method __init__ (line 902) | def __init__(self, config: MistralConfig):
method _init_beacon_embed (line 923) | def _init_beacon_embed(self, missing_keys):
method get_input_embeddings (line 954) | def get_input_embeddings(self):
method set_input_embeddings (line 957) | def set_input_embeddings(self, value):
method forward (line 961) | def forward(
class MistralForCausalLM (line 1081) | class MistralForCausalLM(MistralPreTrainedModel):
method __init__ (line 1084) | def __init__(self, config):
method get_input_embeddings (line 1092) | def get_input_embeddings(self):
method set_input_embeddings (line 1095) | def set_input_embeddings(self, value):
method get_output_embeddings (line 1098) | def get_output_embeddings(self):
method set_output_embeddings (line 1101) | def set_output_embeddings(self, new_embeddings):
method set_decoder (line 1104) | def set_decoder(self, decoder):
method get_decoder (line 1107) | def get_decoder(self):
method from_pretrained (line 1111) | def from_pretrained(cls, *args, **kwargs):
method _native_forward (line 1134) | def _native_forward(
method _beacon_forward (line 1196) | def _beacon_forward(self,
method forward (line 1268) | def forward(self, **kwargs):
method prepare_inputs_for_generation (line 1279) | def prepare_inputs_for_generation(
method _reorder_cache (line 1310) | def _reorder_cache(past_key_values, beam_idx):
FILE: research/Long_LLM/activation_beacon/src/modeling_beacon.py
class Memory (line 14) | class Memory(torch.nn.Module):
method __init__ (line 15) | def __init__(
method _post_validation (line 36) | def _post_validation(self, verbose=True):
method set (line 54) | def set(self, verbose=True, **kwargs):
method reset (line 62) | def reset(self):
method all_sequence_length (line 105) | def all_sequence_length(self):
method batch_size (line 112) | def batch_size(self):
method finish (line 119) | def finish(self):
method dtype (line 124) | def dtype(self):
method min_value (line 128) | def min_value(self):
method max_position_embeddings (line 132) | def max_position_embeddings(self):
method get_memory_size (line 139) | def get_memory_size(self):
method prepare (line 154) | def prepare(self, input_ids, attention_mask, labels, skip_first=None, ...
method set_compression_ratio (line 200) | def set_compression_ratio(self, start_idx, end_idx):
method step (line 273) | def step(self):
method _step (line 394) | def _step(self, ignore_memory=False):
method update_memory (line 659) | def update_memory(self, past_key_values):
method update_loss (line 716) | def update_loss(self, batch_loss, valid_token_num):
method output (line 729) | def output(self, model_outputs):
method _make_4d_attention_mask_and_position_ids (line 757) | def _make_4d_attention_mask_and_position_ids(
method _extract_beacon_and_raw_memory (line 890) | def _extract_beacon_and_raw_memory(
function slice_tensor (line 930) | def slice_tensor(x, start=None, end=None, step=None, index=None, dim=2):
function cat_tensor (line 978) | def cat_tensor(list_of_tensors, dim=-1):
function slice_activations (line 988) | def slice_activations(activations, start=None, end=None, k_seq_dim=2, v_...
function cat_activations (line 996) | def cat_activations(list_of_activations, k_seq_dim=2, v_seq_dim=2):
function interleave_activations (line 1009) | def interleave_activations(main_activations, augment_activations, main_s...
function softmax (line 1068) | def softmax(x:np.ndarray, axis=-1, temperature=1):
function l1_norm (line 1076) | def l1_norm(x):
FILE: research/Long_LLM/activation_beacon/src/modeling_utils.py
function optional_grad_ctx (line 12) | def optional_grad_ctx(with_grad=False):
function move_to_device (line 18) | def move_to_device(data, device):
function get_shifted_labels (line 32) | def get_shifted_labels(input_ids):
function compute_loss (line 47) | def compute_loss(logits, labels, shift=False):
function evaluate_perplexity (line 84) | def evaluate_perplexity(model, dataloader, accelerator:Optional[Accelera...
function evaluate_generation (line 141) | def evaluate_generation(model, dataloader, accelerator:Optional[Accelera...
function evaluate_nll (line 190) | def evaluate_nll(model, dataloader, accelerator:Optional[Accelerator]=No...
class ModelOutput (line 236) | class ModelOutput(BaseModelOutputWithPast):
function get_rope (line 249) | def get_rope(head_dim, base, max_position_embeddings, rope_scaling=None):
function rotate_half (line 313) | def rotate_half(x):
class RotaryEmbedding (line 320) | class RotaryEmbedding(torch.nn.Module):
method __init__ (line 321) | def __init__(self, dim, max_position_embeddings=32768, base=10000, dev...
method _set_cos_sin_cache (line 335) | def _set_cos_sin_cache(self, seq_len, device, dtype):
method forward (line 344) | def forward(self, q, k, position_ids):
class LinearScalingRotaryEmbedding (line 363) | class LinearScalingRotaryEmbedding(RotaryEmbedding):
method __init__ (line 366) | def __init__(self, dim, max_position_embeddings=32768, base=10000, dev...
method _set_cos_sin_cache (line 370) | def _set_cos_sin_cache(self, seq_len, device, dtype):
class DynamicNTKScalingRotaryEmbedding (line 382) | class DynamicNTKScalingRotaryEmbedding(RotaryEmbedding):
method __init__ (line 385) | def __init__(self, dim, max_position_embeddings=32768, base=10000, dev...
method _set_cos_sin_cache (line 389) | def _set_cos_sin_cache(self, seq_len, device, dtype):
class YarnRotaryEmbedding (line 408) | class YarnRotaryEmbedding(torch.nn.Module):
method __init__ (line 409) | def __init__(self, dim, max_position_embeddings=2048, base=10000, devi...
method _get_factor (line 423) | def _get_factor(self):
method _get_temperature (line 442) | def _get_temperature(self):
method _set_cos_sin_cache (line 447) | def _set_cos_sin_cache(self, seq_len, device, dtype):
method forward (line 469) | def forward(self, q, k, position_ids):
class YarnDynamicTemperatureRotaryEmbedding (line 488) | class YarnDynamicTemperatureRotaryEmbedding(torch.nn.Module):
method __init__ (line 489) | def __init__(self, dim, max_position_embeddings=2048, base=10000, devi...
method _get_factor (line 503) | def _get_factor(self):
method _set_cos_sin_cache (line 522) | def _set_cos_sin_cache(self, seq_len, device, dtype):
method forward (line 547) | def forward(self, q, k, position_ids):
class YarnDynamicTemperatureLogNRotaryEmbedding (line 572) | class YarnDynamicTemperatureLogNRotaryEmbedding(torch.nn.Module):
method __init__ (line 573) | def __init__(self, dim, max_position_embeddings=2048, base=10000, devi...
method _get_factor (line 587) | def _get_factor(self):
method _set_cos_sin_cache (line 606) | def _set_cos_sin_cache(self, seq_len, device, dtype):
method forward (line 631) | def forward(self, q, k, position_ids):
class Llama3RotaryEmbedding (line 656) | class Llama3RotaryEmbedding(torch.nn.Module):
method __init__ (line 657) | def __init__(self, dim, max_position_embeddings=8192, base=10000, devi...
method _set_cos_sin_cache (line 687) | def _set_cos_sin_cache(self, seq_len, device):
method forward (line 696) | def forward(self, q, k, position_ids):
FILE: research/Long_LLM/activation_beacon/src/qwen2/configuration_qwen2.py
class Qwen2Config (line 28) | class Qwen2Config(PretrainedConfig):
method __init__ (line 98) | def __init__(
FILE: research/Long_LLM/activation_beacon/src/qwen2/modeling_qwen2.py
function _get_unpad_data (line 71) | def _get_unpad_data(attention_mask):
class Qwen2RMSNorm (line 84) | class Qwen2RMSNorm(nn.Module):
method __init__ (line 85) | def __init__(self, hidden_size, eps=1e-6):
method forward (line 93) | def forward(self, hidden_states):
class Qwen2MLP (line 102) | class Qwen2MLP(nn.Module):
method __init__ (line 103) | def __init__(self, config):
method forward (line 113) | def forward(self, x):
function repeat_kv (line 119) | def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
class Qwen2Attention (line 131) | class Qwen2Attention(nn.Module):
method __init__ (line 134) | def __init__(self, config: Qwen2Config, layer_idx: Optional[int] = None):
method _init_beacon_proj (line 187) | def _init_beacon_proj(self, missing_keys):
method _shape (line 261) | def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
method qkv_proj_with_beacon (line 264) | def qkv_proj_with_beacon(self, hidden_states, beacon_size, beacon_indi...
method o_proj_with_beacon (line 310) | def o_proj_with_beacon(self, attn_output, beacon_size, beacon_indices):
method forward (line 325) | def forward(
class Qwen2SdpaAttention (line 408) | class Qwen2SdpaAttention(Qwen2Attention):
method forward (line 416) | def forward(
class Qwen2FlashAttention2 (line 498) | class Qwen2FlashAttention2(Qwen2Attention):
method __init__ (line 505) | def __init__(self, *args, **kwargs):
method forward (line 513) | def forward(
method _flash_attention_forward (line 606) | def _flash_attention_forward(
method _upad_input (line 665) | def _upad_input(self, query_layer, key_layer, value_layer, attention_m...
class Qwen2DecoderLayer (line 711) | class Qwen2DecoderLayer(nn.Module):
method __init__ (line 712) | def __init__(self, config: Qwen2Config, layer_idx: int):
method forward (line 727) | def forward(
class Qwen2PreTrainedModel (line 808) | class Qwen2PreTrainedModel(PreTrainedModel):
method _init_weights (line 818) | def _init_weights(self, module):
class Qwen2Model (line 904) | class Qwen2Model(Qwen2PreTrainedModel):
method __init__ (line 912) | def __init__(self, config: Qwen2Config):
method _init_beacon_embed (line 933) | def _init_beacon_embed(self, missing_keys):
method get_input_embeddings (line 964) | def get_input_embeddings(self):
method set_input_embeddings (line 967) | def set_input_embeddings(self, value):
method forward (line 971) | def forward(
class Qwen2ForCausalLM (line 1091) | class Qwen2ForCausalLM(Qwen2PreTrainedModel):
method __init__ (line 1094) | def __init__(self, config):
method get_input_embeddings (line 1102) | def get_input_embeddings(self):
method set_input_embeddings (line 1105) | def set_input_embeddings(self, value):
method get_output_embeddings (line 1108) | def get_output_embeddings(self):
method set_output_embeddings (line 1111) | def set_output_embeddings(self, new_embeddings):
method set_decoder (line 1114) | def set_decoder(self, decoder):
method get_decoder (line 1117) | def get_decoder(self):
method from_pretrained (line 1121) | def from_pretrained(cls, *args, **kwargs):
method _native_forward (line 1144) | def _native_forward(
method _beacon_forward (line 1206) | def _beacon_forward(self,
method forward (line 1282) | def forward(self, **kwargs):
method prepare_inputs_for_generation (line 1293) | def prepare_inputs_for_generation(
method _reorder_cache (line 1324) | def _reorder_cache(past_key_values, beam_idx):
FILE: research/Long_LLM/activation_beacon/src/trainer.py
class ActivationBeaconTrainer (line 18) | class ActivationBeaconTrainer(Trainer):
method __init__ (line 19) | def __init__(self, *args, model_args, file_logger, **kwargs):
method compute_loss (line 24) | def compute_loss(self, model, inputs, return_outputs=False):
method _get_train_sampler (line 47) | def _get_train_sampler(self) -> Optional[torch.utils.data.Sampler]:
method _save (line 72) | def _save(self, output_dir: Optional[str] = None, state_dict=None):
method evaluate (line 79) | def evaluate(self, eval_dataset: Dataset | None = None, ignore_keys: L...
class StrideGroupedSampler (line 144) | class StrideGroupedSampler(Sampler):
method __init__ (line 147) | def __init__(
method __len__ (line 232) | def __len__(self):
method __iter__ (line 235) | def __iter__(self):
FILE: research/Long_LLM/activation_beacon/src/utils.py
function do_nothing (line 24) | def do_nothing():
function optional_grad_ctx (line 27) | def optional_grad_ctx(with_grad=False):
function makedirs (line 33) | def makedirs(path):
function clear_dir (line 38) | def clear_dir(directory):
function split_file_dir_name_ext (line 51) | def split_file_dir_name_ext(path):
function save_pickle (line 57) | def save_pickle(obj, path:str):
function load_pickle (line 66) | def load_pickle(path):
function save_json (line 70) | def save_json(obj, path:str):
function load_json (line 76) | def load_json(path, lines=False):
function format_numel_str (line 87) | def format_numel_str(numel: int) -> str:
function batched_iter (line 103) | def batched_iter(iterable: Iterable, max_batch_size: int):
function show_time (line 114) | def show_time(times):
function filelock (line 121) | def filelock(path, process_index=0):
function normalize_text (line 133) | def normalize_text(text, ignore_case=True, ignore_punctuation=True, igno...
function wrap_text (line 156) | def wrap_text(s):
function min_max_normalize (line 165) | def min_max_normalize(array):
function softmax (line 168) | def softmax(x:np.ndarray, axis=-1):
function get_max_length_in_nested_lists (line 175) | def get_max_length_in_nested_lists(lst):
function pad_nested_lists (line 186) | def pad_nested_lists(lst, max_length, padding_value, padding_side="right"):
function mask_nested_lists (line 205) | def mask_nested_lists(lst, mask_target, mask_value=0):
function are_elements_of_same_length (line 213) | def are_elements_of_same_length(lst: List):
function add_eos (line 220) | def add_eos(inputs: Mapping, eos_token_id: int):
function remove_eos (line 238) | def remove_eos(inputs: Mapping, eos_token_ids: Union[List,int]):
class FileLogger (line 249) | class FileLogger:
method __init__ (line 250) | def __init__(self, log_file) -> None:
method log (line 253) | def log(self, metrics, **kwargs):
class DefaultDataCollator (line 274) | class DefaultDataCollator:
method __call__ (line 286) | def __call__(self, batch_elem: List) -> Dict[str, Any]:
FILE: research/Long_LLM/activation_beacon/src/vllm_utils.py
class HFStyleVllmModel (line 16) | class HFStyleVllmModel:
method __init__ (line 17) | def __init__(
method device (line 26) | def device(self):
method parse_generation_config (line 29) | def parse_generation_config(self, generation_config:Union[dict,Generat...
method generate (line 47) | def generate(
method __call__ (line 77) | def __call__(self, input_ids, attention_mask, labels, **kwargs):
FILE: research/Long_LLM/longllm_qlora/data_pipeline/_openai.py
function process_api_requests_from_file (line 110) | async def process_api_requests_from_file(
class StatusTracker (line 280) | class StatusTracker:
class APIRequest (line 294) | class APIRequest:
method call_api (line 304) | async def call_api(
function api_endpoint_from_url (line 376) | def api_endpoint_from_url(request_url):
function append_to_jsonl (line 387) | def append_to_jsonl(data, filename: str) -> None:
function num_tokens_consumed_from_request (line 394) | def num_tokens_consumed_from_request(
function task_id_generator_function (line 453) | def task_id_generator_function():
FILE: research/Long_LLM/longllm_qlora/main/eval_generation.py
class Args (line 18) | class Args(ModelArgs):
function main (line 55) | def main():
FILE: research/Long_LLM/longllm_qlora/main/eval_infbench.py
class Args (line 23) | class Args(ModelArgs):
function process_infbench (line 62) | def process_infbench(data, indices, tokenizer, chat_template, task:str, ...
function main (line 97) | def main():
FILE: research/Long_LLM/longllm_qlora/main/eval_lm.py
class Args (line 16) | class Args(ModelArgs):
function process_lm_pre (line 50) | def process_lm_pre(tokenizer, tokenize_max_char=None):
function process_lm (line 62) | def process_lm(tokenizer, max_length=4096, stride=1024, min_length=None):
function main (line 124) | def main():
FILE: research/Long_LLM/longllm_qlora/main/eval_longbench.py
class Args (line 22) | class Args(ModelArgs):
function process_longbench (line 61) | def process_longbench(data, indices, tokenizer, chat_template, task, max...
function main (line 96) | def main():
FILE: research/Long_LLM/longllm_qlora/main/eval_mmlu.py
class Args (line 22) | class Args(ModelArgs):
function remove_eos (line 51) | def remove_eos(inputs: Mapping, eos_token_ids: Union[List,int]):
function process_mmlu (line 64) | def process_mmlu(tokenizer, chat_template, eos_token_id, few_shot=0, tra...
function evaluate_mmlu (line 138) | def evaluate_mmlu(eval_data, save_path, eval_preds):
function main (line 182) | def main():
FILE: research/Long_LLM/longllm_qlora/main/eval_needle.py
class Args (line 28) | class Args(ModelArgs):
method __post_init__ (line 102) | def __post_init__(self):
class OpenAIEvaluator (line 107) | class OpenAIEvaluator:
method __init__ (line 117) | def __init__(self,
method evaluate_response (line 152) | def evaluate_response(self, response: str) -> int:
function generate_sample (line 175) | def generate_sample(
function main (line 217) | def main():
FILE: research/Long_LLM/longllm_qlora/main/eval_passkey.py
class Args (line 28) | class Args(ModelArgs):
function generate_sample (line 84) | def generate_sample(tokenizer, chat_template, context_length, passkey_de...
function main (line 125) | def main():
FILE: research/Long_LLM/longllm_qlora/main/eval_topic.py
class Args (line 23) | class Args(ModelArgs):
function process_topic_retrieval (line 42) | def process_topic_retrieval(tokenizer, chat_template, num_topic):
function main (line 69) | def main():
FILE: research/Long_LLM/longllm_qlora/main/infbench_utils.py
function normalize_answer (line 12) | def normalize_answer(s: str) -> str:
function normalize_zh_answer (line 31) | def normalize_zh_answer(s: str) -> str:
function f1_score (line 48) | def f1_score(prediction, ground_truth) -> tuple[float, float, float]:
function qa_f1_score (line 59) | def qa_f1_score(pred: str, ground_truths) -> float:
function qa_f1_score_zh (line 78) | def qa_f1_score_zh(pred: str, ground_truths: list[str]) -> float:
function load_json (line 100) | def load_json(fname):
function iter_jsonl (line 104) | def iter_jsonl(fname, cnt=None):
function first_int_match (line 117) | def first_int_match(prediction):
function split_retrieval_answer (line 127) | def split_retrieval_answer(pred: str):
function get_score_one_kv_retrieval (line 134) | def get_score_one_kv_retrieval(pred, label, model_name: str) -> bool:
function get_score_one_passkey (line 141) | def get_score_one_passkey(pred, label, model_name: str) -> bool:
function get_score_one_number_string (line 147) | def get_score_one_number_string(pred, label, model_name: str) -> bool:
function get_score_one_code_run (line 153) | def get_score_one_code_run(pred, label, model_name: str) -> bool:
function get_score_one_code_debug (line 172) | def get_score_one_code_debug(pred, label, model_name: str) -> bool:
function get_score_one_math_find (line 208) | def get_score_one_math_find(pred, label, model_name: str) -> bool:
function get_score_one_longdialogue_qa_eng (line 230) | def get_score_one_longdialogue_qa_eng(pred, label, model_name: str) -> b...
function get_score_one_longbook_choice_eng (line 239) | def get_score_one_longbook_choice_eng(pred, label, model_name: str) -> b...
function get_score_one_longbook_qa_eng (line 280) | def get_score_one_longbook_qa_eng(pred, label, model_name: str) -> float:
function get_score_one_longbook_sum_eng (line 284) | def get_score_one_longbook_sum_eng(
function get_score_one_longbook_qa_chn (line 295) | def get_score_one_longbook_qa_chn(pred, label, model_name: str) -> float:
function get_score_one_math_calc (line 299) | def get_score_one_math_calc(pred, label, model_name: str) -> float:
function get_score_one (line 324) | def get_score_one(
function get_labels (line 357) | def get_labels(preds: list) -> list[str]:
function get_preds (line 365) | def get_preds(preds: list, data_name: str) -> list[str]:
function get_score (line 380) | def get_score(
function compute_scores (line 394) | def compute_scores(preds_path, data_name: str, model_name: str):
function create_prompt (line 404) | def create_prompt(eg: dict, data_name: str, prompt_template: str) -> str:
function get_answer (line 514) | def get_answer(eg: dict, data_name: str):
FILE: research/Long_LLM/longllm_qlora/main/longbench_utils.py
function normalize_answer (line 12) | def normalize_answer(s):
function normalize_zh_answer (line 31) | def normalize_zh_answer(s):
function count_score (line 47) | def count_score(prediction, ground_truth, **kwargs):
function retrieval_score (line 56) | def retrieval_score(prediction, ground_truth, **kwargs):
function retrieval_zh_score (line 68) | def retrieval_zh_score(prediction, ground_truth, **kwargs):
function code_sim_score (line 80) | def code_sim_score(prediction, ground_truth, **kwargs):
function classification_score (line 89) | def classification_score(prediction, ground_truth, **kwargs):
function rouge_score (line 114) | def rouge_score(prediction, ground_truth, **kwargs):
function rouge_score_zh (line 122) | def rouge_score_zh(prediction, ground_truth, **kwargs):
function f1_score (line 128) | def f1_score(prediction, ground_truth, **kwargs):
function qa_f1_score (line 138) | def qa_f1_score(prediction, ground_truth, **kwargs):
function qa_f1_score_zh (line 147) | def qa_f1_score_zh(prediction, ground_truth, **kwargs):
function scorer (line 156) | def scorer(dataset, predictions, answers, all_classes):
FILE: research/Long_LLM/longllm_qlora/main/train.py
function main (line 21) | def main():
FILE: research/Long_LLM/longllm_qlora/src/__init__.py
function get_model_and_tokenizer (line 15) | def get_model_and_tokenizer(model_args, device="cpu", evaluation_mode=Tr...
FILE: research/Long_LLM/longllm_qlora/src/args.py
class ModelArgs (line 9) | class ModelArgs:
method resolve_path (line 187) | def resolve_path(self, path):
method get_generation_config (line 201) | def get_generation_config(self):
method to_dict (line 213) | def to_dict(self):
method save (line 216) | def save(self, path):
method __post_init__ (line 220) | def __post_init__(self):
class TrainingArgs (line 242) | class TrainingArgs(TrainingArguments):
method __post_init__ (line 373) | def __post_init__(self):
FILE: research/Long_LLM/longllm_qlora/src/chat.py
class ChatTemplateOutput (line 17) | class ChatTemplateOutput:
function mask_nested_lists (line 22) | def mask_nested_lists(lst, mask_target, mask_value=0):
function apply_chat_template (line 31) | def apply_chat_template(template, messages, system_message=None, tokeniz...
class SeparatorStyle (line 190) | class SeparatorStyle(IntEnum):
class Conversation (line 222) | class Conversation:
method get_prompt (line 247) | def get_prompt(self) -> str:
method get_images (line 496) | def get_images(self):
method set_system_message (line 506) | def set_system_message(self, system_message: str):
method get_system_message (line 510) | def get_system_message(self):
method append_message (line 514) | def append_message(self, role: str, message: str):
method update_last_message (line 518) | def update_last_message(self, message: str):
method convert_image_to_base64 (line 526) | def convert_image_to_base64(self, image):
method to_gradio_chatbot (line 561) | def to_gradio_chatbot(self):
method to_openai_api_messages (line 577) | def to_openai_api_messages(self):
method extract_text_from_messages (line 592) | def extract_text_from_messages(self):
method copy (line 598) | def copy(self):
method dict (line 613) | def dict(self):
function register_conv_template (line 627) | def register_conv_template(template: Conversation, override: bool = False):
function get_conv_template (line 637) | def get_conv_template(name: str) -> Conversation:
FILE: research/Long_LLM/longllm_qlora/src/data.py
class Data (line 21) | class Data:
method _process_language_modeling (line 22) | def _process_language_modeling(data, indices, tokenizer, min_length, m...
method _process_instruction_tuning (line 46) | def _process_instruction_tuning(data, indices, tokenizer, chat_templat...
method prepare_train_data (line 88) | def prepare_train_data(data_files=None, tokenizer=None, max_length=409...
method prepare_eval_data (line 157) | def prepare_eval_data(data_files=None, tokenizer=None, max_length=4096...
FILE: research/Long_LLM/longllm_qlora/src/metrics.py
class Metric (line 14) | class Metric:
method get_metric_fn (line 17) | def get_metric_fn(cls, metrics, **kwds):
method get_save_path (line 40) | def get_save_path(eval_data, output_dir=None, field="result", save_nam...
method save_result (line 57) | def save_result(preds, labels, save_path, indices=None, **kwargs):
method rouge (line 73) | def rouge(preds, labels, **kwargs):
FILE: research/Long_LLM/longllm_qlora/src/modeling_utils.py
function optional_grad_ctx (line 12) | def optional_grad_ctx(with_grad=False):
function move_to_device (line 18) | def move_to_device(data, device):
function compute_loss (line 32) | def compute_loss(logits, labels, shift=False):
function evaluate_perplexity (line 68) | def evaluate_perplexity(model, dataloader, accelerator:Optional[Accelera...
function evaluate_generation (line 119) | def evaluate_generation(model, dataloader, accelerator:Optional[Accelera...
function evaluate_nll (line 161) | def evaluate_nll(model, dataloader, accelerator:Optional[Accelerator]=No...
class BeaconModelOutput (line 207) | class BeaconModelOutput(BaseModelOutputWithPast):
FILE: research/Long_LLM/longllm_qlora/src/trainer.py
class LLMTrainer (line 14) | class LLMTrainer(Trainer):
method __init__ (line 15) | def __init__(self, *args, model_args, file_logger, **kwargs):
method _prepare_inputs (line 20) | def _prepare_inputs(self, inputs: Dict[str, Union[torch.Tensor, Any]])...
method _save (line 34) | def _save(self, output_dir: Optional[str] = None, state_dict=None):
method evaluate (line 41) | def evaluate(self, eval_dataset: Dataset | None = None, ignore_keys: L...
FILE: research/Long_LLM/longllm_qlora/src/utils.py
function do_nothing (line 24) | def do_nothing():
function optional_grad_ctx (line 27) | def optional_grad_ctx(with_grad=False):
function makedirs (line 33) | def makedirs(path):
function clear_dir (line 38) | def clear_dir(directory):
function split_file_dir_name_ext (line 51) | def split_file_dir_name_ext(path):
function save_pickle (line 57) | def save_pickle(obj, path:str):
function load_pickle (line 66) | def load_pickle(path):
function save_json (line 70) | def save_json(obj, path:str):
function load_json (line 76) | def load_json(path, lines=False):
function format_numel_str (line 87) | def format_numel_str(numel: int) -> str:
function batched_iter (line 103) | def batched_iter(iterable: Iterable, max_batch_size: int):
function show_time (line 114) | def show_time(times):
function filelock (line 121) | def filelock(path, process_index=0):
function normalize_text (line 133) | def normalize_text(text, ignore_case=True, ignore_punctuation=True, igno...
function wrap_text (line 156) | def wrap_text(s):
function min_max_normalize (line 165) | def min_max_normalize(array):
function softmax (line 168) | def softmax(x:np.ndarray, axis=-1):
function get_max_length_in_nested_lists (line 175) | def get_max_length_in_nested_lists(lst):
function pad_nested_lists (line 186) | def pad_nested_lists(lst, max_length, padding_value, padding_side="right"):
function mask_nested_lists (line 205) | def mask_nested_lists(lst, mask_target, mask_value=0):
function are_elements_of_same_length (line 213) | def are_elements_of_same_length(lst: List):
function add_eos (line 220) | def add_eos(inputs: Mapping, eos_token_id: int):
function remove_eos (line 238) | def remove_eos(inputs: Mapping, eos_token_ids: Union[List,int]):
function mix_parameters (line 247) | def mix_parameters(models: List[torch.nn.Module], weights: Optional[List...
class FileLogger (line 286) | class FileLogger:
method __init__ (line 287) | def __init__(self, log_file) -> None:
method log (line 290) | def log(self, metrics, **kwargs):
class DefaultDataCollator (line 311) | class DefaultDataCollator:
method __call__ (line 323) | def __call__(self, batch_elem: List) -> Dict[str, Any]:
FILE: research/MLVU/evaluation/generation_evaluation/calculate.py
function extract_scores (line 8) | def extract_scores(text):
FILE: research/MLVU/evaluation/generation_evaluation/calculate_sum.py
function extract_scores (line 7) | def extract_scores(text):
FILE: research/MLVU/evaluation/generation_evaluation/evaluate_ssc.py
function parse_args (line 9) | def parse_args():
function get_scoring_points (line 20) | def get_scoring_points(score_points="MLVU_all/json/8_sub_scene.json"):
function annotate (line 30) | def annotate(prediction_set, caption_files, output_dir):
function main (line 112) | def main():
FILE: research/MLVU/evaluation/generation_evaluation/evaluate_summary.py
function parse_args (line 9) | def parse_args():
function annotate (line 21) | def annotate(prediction_set, caption_files, output_dir):
function main (line 102) | def main():
FILE: research/MLVU/evaluation/generation_evaluation/open_bench.py
function get_prompt2 (line 10) | def get_prompt2(conv):
class MLVU (line 24) | class MLVU(Dataset):
method __init__ (line 25) | def __init__(self, data_dir, data_list):
method __str__ (line 39) | def __str__(self):
method __len__ (line 61) | def __len__(self):
method get_index (line 64) | def get_index(self, bound, fps, max_frame, first_idx=0):
method qa_template (line 79) | def qa_template(self, data):
method __getitem__ (line 85) | def __getitem__(self, idx):
function main (line 98) | def main():
FILE: research/MLVU/evaluation/models/videochat2/choice_bench.py
function get_prompt (line 62) | def get_prompt(conv):
function get_prompt2 (line 72) | def get_prompt2(conv):
function get_context_emb (line 87) | def get_context_emb(conv, model, img_list, answer_prompt=None, print_res...
function ask (line 115) | def ask(text, conv):
class StoppingCriteriaSub (line 119) | class StoppingCriteriaSub(StoppingCriteria):
method __init__ (line 120) | def __init__(self, stops=[], encounters=1):
method __call__ (line 123) | def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTen...
function answer (line 130) | def answer(conv, model, img_list, do_sample=True, max_new_tokens=200, nu...
function get_index (line 163) | def get_index(num_frames, num_segments):
function load_video (line 172) | def load_video(video_path, num_segments=8, return_msg=False, resolution=...
function get_sinusoid_encoding_table (line 205) | def get_sinusoid_encoding_table(n_position=784, d_hid=1024, cur_frame=8,...
class MLVU (line 272) | class MLVU(Dataset):
method __init__ (line 273) | def __init__(self, data_dir, data_list, num_segments=8, resolution=224):
method __str__ (line 305) | def __str__(self):
method __len__ (line 327) | def __len__(self):
method get_index (line 330) | def get_index(self, bound, fps, max_frame, first_idx=0):
method read_video (line 344) | def read_video(self, video_path, bound=None):
method qa_template (line 359) | def qa_template(self, data):
method __getitem__ (line 372) | def __getitem__(self, idx):
function infer_mvbench (line 395) | def infer_mvbench(
function check_ans (line 445) | def check_ans(pred, gt):
FILE: research/MLVU/evaluation/models/videochat2/open_bench.py
function get_prompt (line 62) | def get_prompt(conv):
function get_prompt2 (line 72) | def get_prompt2(conv):
function get_context_emb (line 87) | def get_context_emb(conv, model, img_list, answer_prompt=None, print_res...
function ask (line 115) | def ask(text, conv):
class StoppingCriteriaSub (line 119) | class StoppingCriteriaSub(StoppingCriteria):
method __init__ (line 120) | def __init__(self, stops=[], encounters=1):
method __call__ (line 123) | def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTen...
function answer (line 130) | def answer(conv, model, img_list, do_sample=True, max_new_tokens=200, nu...
function get_index (line 163) | def get_index(num_frames, num_segments):
function load_video (line 172) | def load_video(video_path, num_segments=8, return_msg=False, resolution=...
function get_sinusoid_encoding_table (line 205) | def get_sinusoid_encoding_table(n_position=784, d_hid=1024, cur_frame=8,...
class MLVU (line 264) | class MLVU(Dataset):
method __init__ (line 265) | def __init__(self, data_dir, data_list, num_segments=8, resolution=224):
method __str__ (line 296) | def __str__(self):
method __len__ (line 318) | def __len__(self):
method get_index (line 321) | def get_index(self, bound, fps, max_frame, first_idx=0):
method read_video (line 335) | def read_video(self, video_path, bound=None):
method qa_template (line 349) | def qa_template(self, data):
method __getitem__ (line 354) | def __getitem__(self, idx):
function infer_mvbench (line 377) | def infer_mvbench(
FILE: research/MLVU/evaluation/models/videollava/choice_bench.py
function get_prompt2 (line 22) | def get_prompt2(conv):
class MLVU (line 36) | class MLVU(Dataset):
method __init__ (line 37) | def __init__(self, data_dir, data_list):
method __str__ (line 51) | def __str__(self):
method __len__ (line 73) | def __len__(self):
method get_index (line 76) | def get_index(self, bound, fps, max_frame, first_idx=0):
method qa_template (line 88) | def qa_template(self, data):
method __getitem__ (line 101) | def __getitem__(self, idx):
function check_ans (line 114) | def check_ans(pred, gt):
function main (line 132) | def main():
FILE: research/MLVU/evaluation/models/videollava/open_bench.py
class MLVU (line 24) | class MLVU(Dataset):
method __init__ (line 25) | def __init__(self, data_dir, data_list):
method __str__ (line 39) | def __str__(self):
method __len__ (line 61) | def __len__(self):
method get_index (line 64) | def get_index(self, bound, fps, max_frame, first_idx=0):
method qa_template (line 79) | def qa_template(self, data):
method __getitem__ (line 86) | def __getitem__(self, idx):
function main (line 99) | def main():
FILE: research/MLVU/evaluation/multiple_choice_evaluation/choice_bench.py
function get_prompt2 (line 10) | def get_prompt2(conv):
class MLVU (line 24) | class MLVU(Dataset):
method __init__ (line 25) | def __init__(self, data_dir, data_list):
method __str__ (line 39) | def __str__(self):
method __len__ (line 61) | def __len__(self):
method get_index (line 64) | def get_index(self, bound, fps, max_frame, first_idx=0):
method qa_template (line 79) | def qa_template(self, data):
method __getitem__ (line 92) | def __getitem__(self, idx):
function check_ans (line 105) | def check_ans(pred, gt):
function main (line 121) | def main():
FILE: research/Matroyshka_reranker/finetune/compensation/arguments.py
function default_list (line 8) | def default_list() -> List[str]:
class ModelArguments (line 13) | class ModelArguments:
class DataArguments (line 112) | class DataArguments:
class RetrieverTrainingArguments (line 184) | class RetrieverTrainingArguments(TrainingArguments):
FILE: research/Matroyshka_reranker/finetune/compensation/data.py
function traverse_directory_using_os (line 18) | def traverse_directory_using_os(root_folder):
class TrainDatasetForReranker (line 29) | class TrainDatasetForReranker(Dataset):
method __init__ (line 30) | def __init__(
method __len__ (line 70) | def __len__(self):
method __getitem__ (line 73) | def __getitem__(self, item) -> tuple[List[BatchEncoding], List[int], L...
class RerankCollator (line 168) | class RerankCollator(DataCollatorForSeq2Seq):
method __call__ (line 177) | def __call__(self, features_lengths, return_tensors='pt'):
FILE: research/Matroyshka_reranker/finetune/compensation/load_model.py
function get_model (line 11) | def get_model(model_args, training_args, output_token_id):
FILE: research/Matroyshka_reranker/finetune/compensation/mistral_config.py
class CostWiseMistralConfig (line 23) | class CostWiseMistralConfig(MistralConfig):
method __init__ (line 98) | def __init__(
FILE: research/Matroyshka_reranker/finetune/compensation/mistral_model.py
class CostWiseModelOutputWithPast (line 72) | class CostWiseModelOutputWithPast(ModelOutput):
class CostWiseCausalLMOutputWithPast (line 80) | class CostWiseCausalLMOutputWithPast(ModelOutput):
function token_compress (line 88) | def token_compress(compress_ratio,
class CostWiseMistralModel (line 197) | class CostWiseMistralModel(MistralPreTrainedModel):
method __init__ (line 205) | def __init__(self, config: CostWiseMistralConfig):
method get_input_embeddings (line 221) | def get_input_embeddings(self):
method set_input_embeddings (line 224) | def set_input_embeddings(self, value):
method forward (line 228) | def forward(
class CostWiseHead (line 468) | class CostWiseHead(nn.Module):
method __init__ (line 471) | def __init__(self, input_size, output_size):
method forward (line 475) | def forward(self, **kwargs):
class CostWiseMistralForCausalLM (line 478) | class CostWiseMistralForCausalLM(MistralPreTrainedModel):
method __init__ (line 481) | def __init__(self, config):
method get_input_embeddings (line 497) | def get_input_embeddings(self):
method set_input_embeddings (line 500) | def set_input_embeddings(self, value):
method get_output_embeddings (line 503) | def get_output_embeddings(self):
method set_output_embeddings (line 506) | def set_output_embeddings(self, new_embeddings):
method set_decoder (line 509) | def set_decoder(self, decoder):
method get_decoder (line 512) | def get_decoder(self):
method forward (line 517) | def forward(
method prepare_inputs_for_generation (line 642) | def prepare_inputs_for_generation(
method _reorder_cache (line 700) | def _reorder_cache(past_key_values, beam_idx):
FILE: research/Matroyshka_reranker/finetune/compensation/modeling.py
class RerankerOutput (line 19) | class RerankerOutput(ModelOutput):
function last_logit_pool (line 24) | def last_logit_pool(logits: Tensor,
function set_nested_attr (line 35) | def set_nested_attr(obj, attr, value):
function get_nested_attr (line 42) | def get_nested_attr(obj, attr):
class BiEncoderModel (line 49) | class BiEncoderModel(nn.Module):
method __init__ (line 50) | def __init__(self,
method gradient_checkpointing_enable (line 85) | def gradient_checkpointing_enable(self, **kwargs):
method enable_input_require_grads (line 88) | def enable_input_require_grads(self, **kwargs):
method encode (line 91) | def encode(self, features, query_lengths, prompt_lengths):
method forward (line 118) | def forward(self,
method compute_loss (line 159) | def compute_loss(self, scores, target):
method save (line 162) | def save(self, output_dir: str):
method save_pretrained (line 176) | def save_pretrained(self, **kwargs):
FILE: research/Matroyshka_reranker/finetune/compensation/run.py
function main (line 20) | def main():
FILE: research/Matroyshka_reranker/finetune/compensation/trainer.py
class BiTrainer (line 6) | class BiTrainer(Trainer):
method _save (line 9) | def _save(self, output_dir: Optional[str] = None, state_dict=None):
method compute_loss (line 41) | def compute_loss(self, model, inputs, return_outputs=False):
FILE: research/Matroyshka_reranker/finetune/self_distillation/arguments.py
function default_list (line 8) | def default_list() -> List[str]:
class ModelArguments (line 13) | class ModelArguments:
class DataArguments (line 109) | class DataArguments:
class RetrieverTrainingArguments (line 181) | class RetrieverTrainingArguments(TrainingArguments):
FILE: research/Matroyshka_reranker/finetune/self_distillation/data.py
function traverse_directory_using_os (line 18) | def traverse_directory_using_os(root_folder):
class TrainDatasetForReranker (line 29) | class TrainDatasetForReranker(Dataset):
method __init__ (line 30) | def __init__(
method __len__ (line 70) | def __len__(self):
method __getitem__ (line 73) | def __getitem__(self, item) -> tuple[List[BatchEncoding], List[int], L...
class RerankCollator (line 168) | class RerankCollator(DataCollatorForSeq2Seq):
method __call__ (line 177) | def __call__(self, features_lengths, return_tensors='pt'):
FILE: research/Matroyshka_reranker/finetune/self_distillation/load_model.py
function get_model (line 9) | def get_model(model_args, training_args, output_token_id):
FILE: research/Matroyshka_reranker/finetune/self_distillation/mistral_config.py
class CostWiseMistralConfig (line 23) | class CostWiseMistralConfig(MistralConfig):
method __init__ (line 98) | def __init__(
FILE: research/Matroyshka_reranker/finetune/self_distillation/mistral_model.py
class CostWiseModelOutputWithPast (line 72) | class CostWiseModelOutputWithPast(ModelOutput):
class CostWiseCausalLMOutputWithPast (line 80) | class CostWiseCausalLMOutputWithPast(ModelOutput):
function token_compress (line 88) | def token_compress(compress_ratio,
class CostWiseMistralModel (line 197) | class CostWiseMistralModel(MistralPreTrainedModel):
method __init__ (line 205) | def __init__(self, config: CostWiseMistralConfig):
method get_input_embeddings (line 221) | def get_input_embeddings(self):
method set_input_embeddings (line 224) | def set_input_embeddings(self, value):
method forward (line 228) | def forward(
class CostWiseHead (line 468) | class CostWiseHead(nn.Module):
method __init__ (line 471) | def __init__(self, input_size, output_size):
method forward (line 475) | def forward(self, **kwargs):
class CostWiseMistralForCausalLM (line 478) | class CostWiseMistralForCausalLM(MistralPreTrainedModel):
method __init__ (line 481) | def __init__(self, config):
method get_input_embeddings (line 497) | def get_input_embeddings(self):
method set_input_embeddings (line 500) | def set_input_embeddings(self, value):
method get_output_embeddings (line 503) | def get_output_embeddings(self):
method set_output_embeddings (line 506) | def set_output_embeddings(self, new_embeddings):
method set_decoder (line 509) | def set_decoder(self, decoder):
method get_decoder (line 512) | def get_decoder(self):
method forward (line 517) | def forward(
method prepare_inputs_for_generation (line 642) | def prepare_inputs_for_generation(
method _reorder_cache (line 700) | def _reorder_cache(past_key_values, beam_idx):
FILE: research/Matroyshka_reranker/finetune/self_distillation/modeling.py
class RerankerOutput (line 15) | class RerankerOutput(ModelOutput):
function last_logit_pool (line 20) | def last_logit_pool(logits: Tensor,
class BiEncoderModel (line 31) | class BiEncoderModel(nn.Module):
method __init__ (line 32) | def __init__(self,
method gradient_checkpointing_enable (line 61) | def gradient_checkpointing_enable(self, **kwargs):
method enable_input_require_grads (line 64) | def enable_input_require_grads(self, **kwargs):
method encode (line 67) | def encode(self, features, query_lengths, prompt_lengths):
method encode_full (line 91) | def encode_full(self, features, query_lengths, prompt_lengths):
method forward (line 115) | def forward(self,
method compute_loss (line 203) | def compute_loss(self, scores, target):
method save (line 206) | def save(self, output_dir: str):
method save_pretrained (line 215) | def save_pretrained(self, **kwargs):
FILE: research/Matroyshka_reranker/finetune/self_distillation/run.py
function main (line 20) | def main():
FILE: research/Matroyshka_reranker/finetune/self_distillation/trainer.py
class BiTrainer (line 6) | class BiTrainer(Trainer):
method _save (line 9) | def _save(self, output_dir: Optional[str] = None, state_dict=None):
method compute_loss (line 41) | def compute_loss(self, model, inputs, return_outputs=False):
FILE: research/Matroyshka_reranker/inference/mistral_config.py
class CostWiseMistralConfig (line 23) | class CostWiseMistralConfig(MistralConfig):
method __init__ (line 98) | def __init__(
FILE: research/Matroyshka_reranker/inference/mistral_model.py
class CostWiseModelOutputWithPast (line 72) | class CostWiseModelOutputWithPast(ModelOutput):
class CostWiseCausalLMOutputWithPast (line 80) | class CostWiseCausalLMOutputWithPast(ModelOutput):
function token_compress (line 88) | def token_compress(compress_ratio,
class CostWiseMistralModel (line 197) | class CostWiseMistralModel(MistralPreTrainedModel):
method __init__ (line 205) | def __init__(self, config: CostWiseMistralConfig):
method get_input_embeddings (line 221) | def get_input_embeddings(self):
method set_input_embeddings (line 224) | def set_input_embeddings(self, value):
method forward (line 228) | def forward(
class CostWiseHead (line 468) | class CostWiseHead(nn.Module):
method __init__ (line 471) | def __init__(self, input_size, output_size):
method forward (line 475) | def forward(self, **kwargs):
class CostWiseMistralForCausalLM (line 478) | class CostWiseMistralForCausalLM(MistralPreTrainedModel):
method __init__ (line 481) | def __init__(self, config):
method get_input_embeddings (line 497) | def get_input_embeddings(self):
method set_input_embeddings (line 500) | def set_input_embeddings(self, value):
method get_output_embeddings (line 503) | def get_output_embeddings(self):
method set_output_embeddings (line 506) | def set_output_embeddings(self, new_embeddings):
method set_decoder (line 509) | def set_decoder(self, decoder):
method get_decoder (line 512) | def get_decoder(self):
method forward (line 517) | def forward(
method prepare_inputs_for_generation (line 642) | def prepare_inputs_for_generation(
method _reorder_cache (line 700) | def _reorder_cache(past_key_values, beam_idx):
FILE: research/Matroyshka_reranker/inference/rank_model.py
class MatroyshkaReranker (line 17) | class MatroyshkaReranker(AbsReranker):
method __init__ (line 49) | def __init__(
method compute_score_single_gpu (line 151) | def compute_score_single_gpu(
FILE: research/Reinforced_IR/data_generation/agent/gpt.py
class GPTAgent (line 13) | class GPTAgent():
method __init__ (line 14) | def __init__(
method generate_single (line 31) | def generate_single(
method generate (line 77) | def generate(
method generate_single_direct (line 107) | def generate_single_direct(
method generate_direct (line 127) | def generate_direct(
FILE: research/Reinforced_IR/data_generation/agent/vllm.py
class LLMAgent (line 8) | class LLMAgent():
method __init__ (line 9) | def __init__(
method generate (line 21) | def generate(
FILE: research/Reinforced_IR/data_generation/agent/vllm_instruct.py
class LLMInstructAgent (line 9) | class LLMInstructAgent():
method __init__ (line 10) | def __init__(
method generate (line 23) | def generate(
method generate_direct (line 61) | def generate_direct(
FILE: research/Reinforced_IR/data_generation/generate_generator_data.py
function parse_option (line 15) | def parse_option():
function main (line 47) | def main(opt):
FILE: research/Reinforced_IR/data_generation/generate_retriever_data.py
function parse_option (line 11) | def parse_option():
function main (line 46) | def main(opt):
FILE: research/Reinforced_IR/data_generation/generate_retriever_distill_data.py
function parse_option (line 13) | def parse_option():
function main (line 36) | def main(opt):
FILE: research/Reinforced_IR/data_generation/generate_universal_query.py
function parse_option (line 10) | def parse_option():
function main (line 33) | def main(opt):
FILE: research/Reinforced_IR/data_generation/prompts/get_prompts.py
function get_query_generation_prompt (line 324) | def get_query_generation_prompt(dataset_name: str, passage: str, use_exa...
function get_additional_info_generation_prompt (line 465) | def get_additional_info_generation_prompt(dataset_name: str, query: str)...
function get_additional_info_generation_long_prompt (line 488) | def get_additional_info_generation_long_prompt(dataset_name: str, query:...
function get_additional_info_generation_long_air_prompt (line 511) | def get_additional_info_generation_long_air_prompt(dataset_name: str, qu...
function get_additional_info_generation_train_prompt (line 535) | def get_additional_info_generation_train_prompt(dataset_name: str, query...
function get_quality_control_prompt (line 670) | def get_quality_control_prompt(dataset_name: str, query: str, passage: s...
function get_reranker_prompt (line 718) | def get_reranker_prompt(dataset_name: str, query: str, passage: str) -> ...
FILE: research/Reinforced_IR/data_generation/prompts/hyde_prompts.py
function get_additional_info_generation_prompt (line 42) | def get_additional_info_generation_prompt(dataset_name: str, query: str)...
FILE: research/Reinforced_IR/data_generation/prompts/teacher_prompts.py
function get_yes_prompt (line 95) | def get_yes_prompt(dataset_name: str, query: str, passage: str) -> str:
function get_rank_prompt (line 151) | def get_rank_prompt(dataset_name, num, query, passages):
FILE: research/Reinforced_IR/data_generation/utils.py
function extract_numbers (line 17) | def extract_numbers(s):
function get_distill_data (line 22) | def get_distill_data(
function generate_bge_train_data (line 65) | def generate_bge_train_data(
function generate_llm_dpo_train_data (line 229) | def generate_llm_dpo_train_data(
function evaluate_mrr (line 302) | def evaluate_mrr(qrels: Dict[str, Dict[str, int]],
function search (line 329) | def search(queries_emb, doc_emb, topk: int = 100):
function evaluate (line 358) | def evaluate(metrics: List[str] = ['recall', 'mrr', 'ndcg'],
function evaluate_better (line 422) | def evaluate_better(metrics: List[str] = ['recall', 'mrr', 'ndcg'],
FILE: research/Reinforced_IR/finetune/generator/save_tokenizer.py
function parse_option (line 9) | def parse_option():
function main (line 20) | def main(opt):
FILE: research/Reinforced_IR/finetune/generator/update_file.py
function parse_option (line 7) | def parse_option():
function main (line 20) | def main(opt):
FILE: research/Reinforced_IR/finetune/retriever/arguments.py
class IREmbedderTrainingArguments (line 11) | class IREmbedderTrainingArguments(AbsEmbedderTrainingArguments):
class IREmbedderDataArguments (line 20) | class IREmbedderDataArguments(AbsEmbedderDataArguments):
FILE: research/Reinforced_IR/finetune/retriever/dataset.py
class IREmbedderTrainDataset (line 24) | class IREmbedderTrainDataset(AbsEmbedderTrainDataset):
method __init__ (line 31) | def __init__(
method __getitem__ (line 41) | def __getitem__(self, item):
class IREmbedderCollator (line 91) | class IREmbedderCollator(AbsEmbedderCollator):
method __call__ (line 99) | def __call__(self, features):
class IREmbedderSameDatasetTrainDataset (line 224) | class IREmbedderSameDatasetTrainDataset(AbsEmbedderSameDatasetTrainDatas...
method __init__ (line 235) | def __init__(
method _shuffle_answer (line 253) | def _shuffle_answer(self, text):
method __getitem__ (line 269) | def __getitem__(self, _):
method _create_batch_data (line 277) | def _create_batch_data(self, batch_raw_data):
class IREmbedderSameDatasetCollator (line 401) | class IREmbedderSameDatasetCollator(AbsEmbedderSameDatasetCollator):
method __call__ (line 414) | def __call__(self, features):
FILE: research/Reinforced_IR/finetune/retriever/modeling.py
class BiIREmbedderModel (line 24) | class BiIREmbedderModel(BiEncoderOnlyEmbedderModel):
method __init__ (line 40) | def __init__(
method forward (line 74) | def forward(
method distill_loss (line 159) | def distill_loss(kd_loss_type, teacher_targets, student_scores, group_...
method save (line 204) | def save(self, output_dir: str):
FILE: research/Reinforced_IR/finetune/retriever/runner.py
class IREmbedderRunner (line 19) | class IREmbedderRunner(AbsEmbedderRunner):
method load_train_dataset (line 24) | def load_train_dataset(self):
method load_data_collator (line 44) | def load_data_collator(self):
method load_tokenizer_and_model (line 61) | def load_tokenizer_and_model(self) -> Tuple[PreTrainedTokenizer, AbsEm...
method load_trainer (line 114) | def load_trainer(self) -> IREmbedderTrainer:
FILE: research/Reinforced_IR/finetune/retriever/trainer.py
class IREmbedderTrainer (line 11) | class IREmbedderTrainer(AbsEmbedderTrainer):
method _save (line 15) | def _save(self, output_dir: Optional[str] = None, state_dict=None):
FILE: research/Reinforced_IR/inference/agent/gpt.py
class GPTAgent (line 13) | class GPTAgent():
method __init__ (line 14) | def __init__(
method generate_single (line 31) | def generate_single(
method generate (line 77) | def generate(
method generate_single_direct (line 107) | def generate_single_direct(
method generate_direct (line 127) | def generate_direct(
FILE: research/Reinforced_IR/inference/agent/vllm.py
class LLMAgent (line 8) | class LLMAgent():
method __init__ (line 9) | def __init__(
method generate (line 21) | def generate(
FILE: research/Reinforced_IR/inference/agent/vllm_instruct.py
class LLMInstructAgent (line 9) | class LLMInstructAgent():
method __init__ (line 10) | def __init__(
method generate (line 23) | def generate(
method generate_direct (line 61) | def generate_direct(
FILE: research/Reinforced_IR/inference/ir_model.py
class Reinforced_IR_Model (line 23) | class Reinforced_IR_Model():
method __init__ (line 24) | def __init__(
method load_retriever (line 69) | def load_retriever(self):
method load_generator (line 85) | def load_generator(self):
method offload_retriever (line 102) | def offload_retriever(self):
method offload_generator (line 107) | def offload_generator(self):
method encode_queries (line 112) | def encode_queries(self, task_instruction, answer_type, queries, **kwa...
method encode_corpus (line 129) | def encode_corpus(self, corpus, **kwargs):
method encode (line 133) | def encode(self, corpus, **kwargs):
FILE: research/Reinforced_IR/inference/multi.py
class Args (line 10) | class Args():
function worker_function (line 60) | def worker_function(device):
function merge (line 122) | def merge(args: Args):
FILE: research/baai_general_embedding/finetune/arguments.py
class ModelArguments (line 9) | class ModelArguments:
class DataArguments (line 30) | class DataArguments:
method __post_init__ (line 63) | def __post_init__(self):
class RetrieverTrainingArguments (line 68) | class RetrieverTrainingArguments(TrainingArguments):
FILE: research/baai_general_embedding/finetune/data.py
class TrainDatasetForEmbedding (line 14) | class TrainDatasetForEmbedding(Dataset):
method __init__ (line 15) | def __init__(
method __len__ (line 37) | def __len__(self):
method __getitem__ (line 40) | def __getitem__(self, item) -> Tuple[str, List[str]]:
class EmbedCollator (line 64) | class EmbedCollator(DataCollatorWithPadding):
method padding_score (line 73) | def padding_score(self, teacher_score):
method __call__ (line 91) | def __call__(self, features):
FILE: research/baai_general_embedding/finetune/eval_msmarco.py
class Args (line 16) | class Args:
function index (line 74) | def index(model: FlagModel, corpus: datasets.Dataset, batch_size: int = ...
function search (line 132) | def search(model: FlagModel, queries: datasets, faiss_index: faiss.Index...
function evaluate (line 155) | def evaluate(preds,
function main (line 212) | def main():
FILE: research/baai_general_embedding/finetune/hn_mine.py
function get_args (line 11) | def get_args():
function create_index (line 25) | def create_index(embeddings, use_gpu):
function batch_search (line 37) | def batch_search(index,
function get_corpus (line 50) | def get_corpus(candidate_pool):
function find_knn_neg (line 58) | def find_knn_neg(model, input_file, candidate_pool, output_file, sample_...
FILE: research/baai_general_embedding/finetune/modeling.py
class EncoderOutput (line 15) | class EncoderOutput(ModelOutput):
class BiEncoderModel (line 22) | class BiEncoderModel(nn.Module):
method __init__ (line 25) | def __init__(self,
method gradient_checkpointing_enable (line 60) | def gradient_checkpointing_enable(self, **kwargs):
method sentence_embedding (line 63) | def sentence_embedding(self, hidden_state, mask):
method encode (line 71) | def encode(self, features):
method compute_similarity (line 80) | def compute_similarity(self, q_reps, p_reps):
method forward (line 85) | def forward(self, query: Dict[str, Tensor] = None, passage: Dict[str, ...
method compute_loss (line 119) | def compute_loss(self, scores, target):
method _dist_gather_tensor (line 122) | def _dist_gather_tensor(self, t: Optional[torch.Tensor]):
method save (line 135) | def save(self, output_dir: str):
FILE: research/baai_general_embedding/finetune/run.py
function main (line 20) | def main():
FILE: research/baai_general_embedding/finetune/trainer.py
function save_ckpt_for_sentence_transformers (line 5) | def save_ckpt_for_sentence_transformers(ckpt_dir, pooling_mode: str = 'c...
class BiTrainer (line 16) | class BiTrainer(Trainer):
method _save (line 17) | def _save(self, output_dir: Optional[str] = None, state_dict=None):
method compute_loss (line 40) | def compute_loss(self, model, inputs, return_outputs=False):
FILE: research/baai_general_embedding/retromae_pretrain/arguments.py
class DataTrainingArguments (line 7) | class DataTrainingArguments:
method __post_init__ (line 24) | def __post_init__(self):
class ModelArguments (line 30) | class ModelArguments:
FILE: research/baai_general_embedding/retromae_pretrain/data.py
class DatasetForPretraining (line 13) | class DatasetForPretraining(torch.utils.data.Dataset):
method __init__ (line 14) | def __init__(self, data_dir):
method load_dataset (line 26) | def load_dataset(self, file):
method __getitem__ (line 34) | def __getitem__(self, item):
method __len__ (line 37) | def __len__(self):
class RetroMAECollator (line 42) | class RetroMAECollator(DataCollatorForWholeWordMask):
method __call__ (line 47) | def __call__(self, examples):
FILE: research/baai_general_embedding/retromae_pretrain/enhancedDecoder.py
class BertSelfAttention (line 24) | class BertSelfAttention(nn.Module):
method __init__ (line 25) | def __ini
Copy disabled (too large)
Download .json
Condensed preview — 1068 files, each showing path, character count, and a content snippet. Download the .json file for the full structured content (28,773K chars).
[
{
"path": ".github/workflows/documentation.yml",
"chars": 1010,
"preview": "name: documentation\n\non: [push, pull_request, workflow_dispatch]\n\npermissions:\n contents: write\n\njobs:\n docs:\n runs"
},
{
"path": ".gitignore",
"chars": 1968,
"preview": "*.memmap\n\n# Byte-compiled / optimized / DLL files\n__pycache__/\n*.py[cod]\n*$py.class\n.idea/\n\n# C extensions\n*.so\n\n# Distr"
},
{
"path": "FlagEmbedding/__init__.py",
"chars": 54,
"preview": "from .abc.inference import *\nfrom .inference import *\n"
},
{
"path": "FlagEmbedding/abc/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "FlagEmbedding/abc/evaluation/__init__.py",
"chars": 428,
"preview": "from .arguments import AbsEvalArgs, AbsEvalModelArgs\nfrom .evaluator import AbsEvaluator\nfrom .data_loader import AbsEva"
},
{
"path": "FlagEmbedding/abc/evaluation/arguments.py",
"chars": 9020,
"preview": "\"\"\"\nAdapted from https://github.com/AIR-Bench/AIR-Bench/blob/0.1.0/air_benchmark/evaluation_utils/evaluation_arguments.p"
},
{
"path": "FlagEmbedding/abc/evaluation/data_loader.py",
"chars": 17037,
"preview": "\"\"\"\nAdapted from https://github.com/AIR-Bench/AIR-Bench/blob/0.1.0/air_benchmark/evaluation_utils/data_loader.py\n\"\"\"\nimp"
},
{
"path": "FlagEmbedding/abc/evaluation/evaluator.py",
"chars": 20402,
"preview": "\"\"\"\nAdapted from https://github.com/AIR-Bench/AIR-Bench/blob/0.1.0/air_benchmark/evaluation_utils/evaluator.py\n\"\"\"\nimpor"
},
{
"path": "FlagEmbedding/abc/evaluation/runner.py",
"chars": 10384,
"preview": "import os\nimport json\nimport logging\nfrom typing import List, Union, Tuple\n\nfrom FlagEmbedding import FlagAutoModel, Fla"
},
{
"path": "FlagEmbedding/abc/evaluation/searcher.py",
"chars": 10001,
"preview": "\"\"\"\nAdapted from https://github.com/AIR-Bench/AIR-Bench/blob/0.1.0/air_benchmark/evaluation_utils/searcher.py\n\"\"\"\nimport"
},
{
"path": "FlagEmbedding/abc/evaluation/utils.py",
"chars": 8500,
"preview": "import faiss\nimport torch\nimport logging\nimport numpy as np\nimport pytrec_eval\nfrom tqdm import tqdm\nfrom collections im"
},
{
"path": "FlagEmbedding/abc/finetune/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "FlagEmbedding/abc/finetune/embedder/AbsArguments.py",
"chars": 5652,
"preview": "import os\nfrom typing import Optional\nfrom dataclasses import dataclass, field\n\nfrom transformers import TrainingArgumen"
},
{
"path": "FlagEmbedding/abc/finetune/embedder/AbsDataset.py",
"chars": 25425,
"preview": "import os\nimport math\nimport random\nimport logging\nimport datasets\nimport numpy as np\nimport torch.distributed as dist\nf"
},
{
"path": "FlagEmbedding/abc/finetune/embedder/AbsModeling.py",
"chars": 14747,
"preview": "import torch\nfrom torch import nn, Tensor\nimport torch.nn.functional as F\nimport torch.distributed as dist\nfrom transfor"
},
{
"path": "FlagEmbedding/abc/finetune/embedder/AbsRunner.py",
"chars": 5277,
"preview": "import os\nimport logging\nfrom pathlib import Path\nfrom typing import Tuple\nfrom abc import ABC, abstractmethod\nfrom tran"
},
{
"path": "FlagEmbedding/abc/finetune/embedder/AbsTrainer.py",
"chars": 1307,
"preview": "import logging\nfrom typing import Optional\nfrom abc import ABC, abstractmethod\nfrom transformers.trainer import Trainer\n"
},
{
"path": "FlagEmbedding/abc/finetune/embedder/__init__.py",
"chars": 861,
"preview": "from .AbsArguments import (\n AbsEmbedderDataArguments,\n AbsEmbedderModelArguments,\n AbsEmbedderTrainingArgument"
},
{
"path": "FlagEmbedding/abc/finetune/reranker/AbsArguments.py",
"chars": 4831,
"preview": "import os\nfrom typing import Optional\nfrom dataclasses import dataclass, field\n\nfrom transformers import TrainingArgumen"
},
{
"path": "FlagEmbedding/abc/finetune/reranker/AbsDataset.py",
"chars": 16085,
"preview": "import os\nimport math\nimport random\nimport logging\nimport datasets\nimport numpy as np\nimport torch.distributed as dist\nf"
},
{
"path": "FlagEmbedding/abc/finetune/reranker/AbsModeling.py",
"chars": 4607,
"preview": "import torch\nfrom torch import nn, Tensor\nfrom transformers import PreTrainedTokenizer\nfrom transformers.file_utils impo"
},
{
"path": "FlagEmbedding/abc/finetune/reranker/AbsRunner.py",
"chars": 4787,
"preview": "import os\nimport logging\nfrom pathlib import Path\nfrom typing import Tuple\nfrom abc import ABC, abstractmethod\nfrom tran"
},
{
"path": "FlagEmbedding/abc/finetune/reranker/AbsTrainer.py",
"chars": 1330,
"preview": "import logging\nfrom typing import Optional\nfrom abc import ABC, abstractmethod\nfrom transformers.trainer import Trainer\n"
},
{
"path": "FlagEmbedding/abc/finetune/reranker/__init__.py",
"chars": 718,
"preview": "from .AbsArguments import AbsRerankerDataArguments, AbsRerankerModelArguments, AbsRerankerTrainingArguments\nfrom .AbsDat"
},
{
"path": "FlagEmbedding/abc/inference/AbsEmbedder.py",
"chars": 18183,
"preview": "import logging\nfrom tqdm import tqdm, trange\nfrom abc import ABC, abstractmethod\nfrom typing import Any, Union, List, Di"
},
{
"path": "FlagEmbedding/abc/inference/AbsReranker.py",
"chars": 14508,
"preview": "import logging\nfrom abc import ABC, abstractmethod\nfrom typing import Any, Union, List, Tuple, Dict, Literal, Optional\n\n"
},
{
"path": "FlagEmbedding/abc/inference/__init__.py",
"chars": 126,
"preview": "from .AbsEmbedder import AbsEmbedder\nfrom .AbsReranker import AbsReranker\n\n__all__ = [\n 'AbsEmbedder',\n 'AbsRerank"
},
{
"path": "FlagEmbedding/evaluation/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "FlagEmbedding/evaluation/air_bench/__init__.py",
"chars": 195,
"preview": "from .arguments import AIRBenchEvalModelArgs, AIRBenchEvalArgs\nfrom .runner import AIRBenchEvalRunner\n\n__all__ = [\n \""
},
{
"path": "FlagEmbedding/evaluation/air_bench/__main__.py",
"chars": 848,
"preview": "from transformers import HfArgumentParser\n\nfrom FlagEmbedding.evaluation.air_bench import (\n AIRBenchEvalArgs, AIRBen"
},
{
"path": "FlagEmbedding/evaluation/air_bench/arguments.py",
"chars": 5446,
"preview": "from dataclasses import dataclass, field\nfrom typing import List, Optional\nfrom air_benchmark import EvalArgs as AIRBenc"
},
{
"path": "FlagEmbedding/evaluation/air_bench/examples/long-doc/arxiv-gemini.jsonl",
"chars": 2185,
"preview": "{\"query\": \"So, which AI model did the best on the MMMU benchmark according to Yue and his team back in 2023?\", \"pos\": \"M"
},
{
"path": "FlagEmbedding/evaluation/air_bench/examples/long-doc/arxiv-gpt3.jsonl",
"chars": 2358,
"preview": "{\"query\": \"What gauges the effects of data contamination?\", \"pos\": \"We also undertake a systematic study of “data contam"
},
{
"path": "FlagEmbedding/evaluation/air_bench/examples/long-doc/arxiv-llama2.jsonl",
"chars": 2102,
"preview": "{\"query\": \"Could you elucidate on the values of temperature and top-p that are utilized for pass@1 scores?\", \"pos\": \"8 6"
},
{
"path": "FlagEmbedding/evaluation/air_bench/examples/long-doc/arxiv-llm-survey.jsonl",
"chars": 2374,
"preview": "{\"query\": \"What are the pre-training challenges for large language models?\", \"pos\": \"To make this survey more self-conta"
},
{
"path": "FlagEmbedding/evaluation/air_bench/examples/long-doc/book-a-brief-history-of-time_stephen-hawking.jsonl",
"chars": 2308,
"preview": "{\"query\": \"Why is it believed the universe began at a particular time?\", \"pos\": \"According to a number of earlycosmologi"
},
{
"path": "FlagEmbedding/evaluation/air_bench/examples/long-doc/book-origin-of-species_darwin.jsonl",
"chars": 2266,
"preview": "{\"query\": \"So, like, what's the big deal about the top species from the bigger groups talked about in Chapter 4?\", \"pos\""
},
{
"path": "FlagEmbedding/evaluation/air_bench/examples/long-doc/healthcare-pubmed_100k-200k_1.jsonl",
"chars": 1982,
"preview": "{\"query\": \"What does 'O' represent in peptides?\", \"pos\": \"by changing the peptides to be amphiphilic or completely polar"
},
{
"path": "FlagEmbedding/evaluation/air_bench/examples/long-doc/healthcare-pubmed_100k-200k_2.jsonl",
"chars": 2418,
"preview": "{\"query\": \"Why do we get different parameter sets when looking at how ion and water-oxygen atoms interact?\", \"pos\": \"the"
},
{
"path": "FlagEmbedding/evaluation/air_bench/examples/long-doc/healthcare-pubmed_100k-200k_3.jsonl",
"chars": 1898,
"preview": "{\"query\": \"In what manner does the transference of electric charge impact the process of dimerization?\", \"pos\": \"the nat"
},
{
"path": "FlagEmbedding/evaluation/air_bench/examples/long-doc/healthcare-pubmed_30k-40k_10-merged.jsonl",
"chars": 2430,
"preview": "{\"query\": \"How do intrinsic and extrinsic factors impact A42 aggregation, and why is drug development challenging for th"
},
{
"path": "FlagEmbedding/evaluation/air_bench/examples/long-doc/healthcare-pubmed_40k-50k_5-merged.jsonl",
"chars": 1944,
"preview": "{\"query\": \"What is the primary product of photodimerization?\", \"pos\": \"sensitization is also the preferred way to promot"
},
{
"path": "FlagEmbedding/evaluation/air_bench/examples/long-doc/law-lex_files_300k-400k.jsonl",
"chars": 2691,
"preview": "{\"query\": \"Where can I get the NIST Standard Reference Materials Catalog?\", \"pos\": \"The calibration services, standard r"
},
{
"path": "FlagEmbedding/evaluation/air_bench/examples/long-doc/law-lex_files_400k-500k.jsonl",
"chars": 2037,
"preview": "{\"query\": \"When is the deadline for quarterly returns per § 53.153(a)?\", \"pos\": \"[T.D. ATF-308, 56 FR 303, Jan. 3, 1991,"
},
{
"path": "FlagEmbedding/evaluation/air_bench/examples/long-doc/law-lex_files_500k-600k.jsonl",
"chars": 2401,
"preview": "{\"query\": \"Define \\\"project.\\\"\", \"pos\": \"Private, as applied to an agency, organization, or institution, means that it i"
},
{
"path": "FlagEmbedding/evaluation/air_bench/examples/long-doc/law-lex_files_600k-700k.jsonl",
"chars": 2624,
"preview": "{\"query\": \"What happens to the loss?\", \"pos\": \"This loss shall be the measured loss less the net gain of any voice frequ"
},
{
"path": "FlagEmbedding/evaluation/air_bench/examples/qa/arxiv.jsonl",
"chars": 4160,
"preview": "{\"query\": \"How does the amount of energy affect how stuff scatters?\", \"pos\": \"it has been suggested that one may constru"
},
{
"path": "FlagEmbedding/evaluation/air_bench/examples/qa/finance.jsonl",
"chars": 5331,
"preview": "{\"query\": \"What is the effect of the LME Singapore Contract on trade dynamics?\", \"pos\": \"The London Metal Exchange's, LM"
},
{
"path": "FlagEmbedding/evaluation/air_bench/examples/qa/healthcare.jsonl",
"chars": 2245,
"preview": "{\"query\": \"Which technique was employed to assess the blood pressure in Wistar rats subjected to various sodium intake r"
},
{
"path": "FlagEmbedding/evaluation/air_bench/examples/qa/law.jsonl",
"chars": 14821,
"preview": "{\"query\": \"In accordance with European regulatory standards, what factors are instrumental in establishing the global ma"
},
{
"path": "FlagEmbedding/evaluation/air_bench/examples/qa/msmarco.jsonl",
"chars": 1345,
"preview": "{\"query\": \"Corn on the cob boiling time?\", \"pos\": \"Corn on the Cob - Boiled In a large pot, enough to hold the corn, fil"
},
{
"path": "FlagEmbedding/evaluation/air_bench/examples/qa/news.jsonl",
"chars": 10017,
"preview": "{\"query\": \"Who's Bella got a crush on?\", \"pos\": \"Bella Thorne has crushes on Demi Lovato, Kristen Stewart and Camila Cab"
},
{
"path": "FlagEmbedding/evaluation/air_bench/examples/qa/web.jsonl",
"chars": 2387,
"preview": "{\"query\": \"Identify the principal landmarks that are emblematic of the Baha'i religious tradition.\", \"pos\": \"House of Ba"
},
{
"path": "FlagEmbedding/evaluation/air_bench/examples/qa/wiki.jsonl",
"chars": 10229,
"preview": "{\"query\": \"Muskoka history?\", \"pos\": \"Frank Garfield \\\"Gary\\\" Denniss is a Canadian historian, newspaper columnist, reti"
},
{
"path": "FlagEmbedding/evaluation/air_bench/runner.py",
"chars": 2247,
"preview": "from typing import Union, Tuple\nfrom air_benchmark import AIRBench\n\nfrom FlagEmbedding.abc.evaluation import (\n AbsEv"
},
{
"path": "FlagEmbedding/evaluation/beir/__init__.py",
"chars": 312,
"preview": "from FlagEmbedding.abc.evaluation import (\n AbsEvalModelArgs as BEIREvalModelArgs,\n)\n\nfrom .data_loader import BEIREv"
},
{
"path": "FlagEmbedding/evaluation/beir/__main__.py",
"chars": 527,
"preview": "from transformers import HfArgumentParser\n\nfrom FlagEmbedding.evaluation.beir import (\n BEIREvalArgs, BEIREvalModelAr"
},
{
"path": "FlagEmbedding/evaluation/beir/arguments.py",
"chars": 385,
"preview": "from dataclasses import dataclass, field\n\nfrom FlagEmbedding.abc.evaluation.arguments import AbsEvalArgs\n\n\n@dataclass\ncl"
},
{
"path": "FlagEmbedding/evaluation/beir/data_loader.py",
"chars": 22597,
"preview": "import os\nimport json\nimport logging\nimport datasets\nfrom tqdm import tqdm\nfrom typing import List, Optional\nfrom beir i"
},
{
"path": "FlagEmbedding/evaluation/beir/evaluator.py",
"chars": 20713,
"preview": "import json\nimport logging\nimport os\nimport json\nfrom typing import Dict, Optional, List, Union\n\nfrom FlagEmbedding.abc."
},
{
"path": "FlagEmbedding/evaluation/beir/prompts.py",
"chars": 1517,
"preview": "BEIRInstructions = {\n 'dbpedia-entity': 'Given a query, retrieve relevant entity descriptions from DBPedia.',\n 'ar"
},
{
"path": "FlagEmbedding/evaluation/beir/runner.py",
"chars": 3627,
"preview": "import logging\nfrom FlagEmbedding.abc.evaluation import AbsEvalRunner\n\nfrom .data_loader import BEIREvalDataLoader\nfrom "
},
{
"path": "FlagEmbedding/evaluation/bright/__init__.py",
"chars": 475,
"preview": "from FlagEmbedding.abc.evaluation import (\n AbsEvalModelArgs as BrightEvalModelArgs,\n)\n\nfrom .data_loader import Brig"
},
{
"path": "FlagEmbedding/evaluation/bright/__main__.py",
"chars": 545,
"preview": "from transformers import HfArgumentParser\n\nfrom FlagEmbedding.evaluation.bright import (\n BrightEvalArgs, BrightEvalM"
},
{
"path": "FlagEmbedding/evaluation/bright/arguments.py",
"chars": 580,
"preview": "from dataclasses import dataclass, field\n\nfrom FlagEmbedding.abc.evaluation.arguments import AbsEvalArgs\n\n\n@dataclass\ncl"
},
{
"path": "FlagEmbedding/evaluation/bright/data_loader.py",
"chars": 15734,
"preview": "import os\nimport json\nimport logging\nimport datasets\nfrom tqdm import tqdm\nfrom typing import List, Optional\nfrom collec"
},
{
"path": "FlagEmbedding/evaluation/bright/prompts.py",
"chars": 2195,
"preview": "BrightShortInstructions = {\n # StackExchange\n \"biology\": \"Given a Biology post, retrieve relevant passages that he"
},
{
"path": "FlagEmbedding/evaluation/bright/runner.py",
"chars": 5514,
"preview": "import logging\nfrom typing import Union, Tuple\nfrom FlagEmbedding.abc.evaluation import AbsEvalRunner, EvalReranker, \\\n "
},
{
"path": "FlagEmbedding/evaluation/bright/searcher.py",
"chars": 5227,
"preview": "import os\nimport logging\nimport gc\nimport torch\nimport numpy as np\nfrom typing import Any, Dict, Optional\n\nfrom FlagEmbe"
},
{
"path": "FlagEmbedding/evaluation/custom/__init__.py",
"chars": 325,
"preview": "from FlagEmbedding.abc.evaluation import (\n AbsEvalArgs as CustomEvalArgs,\n AbsEvalModelArgs as CustomEvalModelArg"
},
{
"path": "FlagEmbedding/evaluation/custom/__main__.py",
"chars": 545,
"preview": "from transformers import HfArgumentParser\n\nfrom FlagEmbedding.evaluation.custom import (\n CustomEvalArgs, CustomEvalM"
},
{
"path": "FlagEmbedding/evaluation/custom/data_loader.py",
"chars": 394,
"preview": "import logging\nfrom tqdm import tqdm\nfrom typing import List, Optional\n\nfrom FlagEmbedding.abc.evaluation import AbsEval"
},
{
"path": "FlagEmbedding/evaluation/custom/runner.py",
"chars": 531,
"preview": "from FlagEmbedding.abc.evaluation import AbsEvalRunner\n\nfrom .data_loader import CustomEvalDataLoader\n\n\nclass CustomEval"
},
{
"path": "FlagEmbedding/evaluation/miracl/__init__.py",
"chars": 325,
"preview": "from FlagEmbedding.abc.evaluation import (\n AbsEvalArgs as MIRACLEvalArgs,\n AbsEvalModelArgs as MIRACLEvalModelArg"
},
{
"path": "FlagEmbedding/evaluation/miracl/__main__.py",
"chars": 545,
"preview": "from transformers import HfArgumentParser\n\nfrom FlagEmbedding.evaluation.miracl import (\n MIRACLEvalArgs, MIRACLEvalM"
},
{
"path": "FlagEmbedding/evaluation/miracl/data_loader.py",
"chars": 7325,
"preview": "import os\nimport json\nimport logging\nimport datasets\nfrom tqdm import tqdm\nfrom typing import List, Optional\n\nfrom FlagE"
},
{
"path": "FlagEmbedding/evaluation/miracl/runner.py",
"chars": 727,
"preview": "from FlagEmbedding.abc.evaluation import AbsEvalRunner\n\nfrom .data_loader import MIRACLEvalDataLoader\n\n\nclass MIRACLEval"
},
{
"path": "FlagEmbedding/evaluation/mkqa/__init__.py",
"chars": 366,
"preview": "from FlagEmbedding.abc.evaluation import (\n AbsEvalArgs as MKQAEvalArgs,\n AbsEvalModelArgs as MKQAEvalModelArgs,\n)"
},
{
"path": "FlagEmbedding/evaluation/mkqa/__main__.py",
"chars": 527,
"preview": "from transformers import HfArgumentParser\n\nfrom FlagEmbedding.evaluation.mkqa import (\n MKQAEvalArgs, MKQAEvalModelAr"
},
{
"path": "FlagEmbedding/evaluation/mkqa/data_loader.py",
"chars": 9511,
"preview": "import os\nimport json\nimport logging\nimport datasets\nfrom tqdm import tqdm\nfrom typing import List, Optional\n\nfrom FlagE"
},
{
"path": "FlagEmbedding/evaluation/mkqa/evaluator.py",
"chars": 4343,
"preview": "import os\nfrom tqdm import tqdm\nfrom typing import Dict, List, Optional\n\nfrom FlagEmbedding.abc.evaluation import AbsEva"
},
{
"path": "FlagEmbedding/evaluation/mkqa/runner.py",
"chars": 1140,
"preview": "from FlagEmbedding.abc.evaluation import AbsEvalRunner\n\nfrom .data_loader import MKQAEvalDataLoader\nfrom .evaluator impo"
},
{
"path": "FlagEmbedding/evaluation/mkqa/utils/compute_metrics.py",
"chars": 2845,
"preview": "\"\"\"\nRef: https://github.com/facebookresearch/contriever\n\"\"\"\nimport regex\nimport unicodedata\nfrom functools import partia"
},
{
"path": "FlagEmbedding/evaluation/mkqa/utils/normalize_text.py",
"chars": 4800,
"preview": "\"\"\"\nadapted from chemdataextractor.text.normalize\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\nTools for normalizing text.\nhttps://g"
},
{
"path": "FlagEmbedding/evaluation/mldr/__init__.py",
"chars": 309,
"preview": "from FlagEmbedding.abc.evaluation import (\n AbsEvalArgs as MLDREvalArgs,\n AbsEvalModelArgs as MLDREvalModelArgs,\n)"
},
{
"path": "FlagEmbedding/evaluation/mldr/__main__.py",
"chars": 527,
"preview": "from transformers import HfArgumentParser\n\nfrom FlagEmbedding.evaluation.mldr import (\n MLDREvalArgs, MLDREvalModelAr"
},
{
"path": "FlagEmbedding/evaluation/mldr/data_loader.py",
"chars": 7089,
"preview": "import os\nimport json\nimport logging\nimport datasets\nfrom tqdm import tqdm\nfrom typing import List, Optional\n\nfrom FlagE"
},
{
"path": "FlagEmbedding/evaluation/mldr/runner.py",
"chars": 715,
"preview": "from FlagEmbedding.abc.evaluation import AbsEvalRunner\n\nfrom .data_loader import MLDREvalDataLoader\n\n\nclass MLDREvalRunn"
},
{
"path": "FlagEmbedding/evaluation/msmarco/__init__.py",
"chars": 333,
"preview": "from FlagEmbedding.abc.evaluation import (\n AbsEvalArgs as MSMARCOEvalArgs,\n AbsEvalModelArgs as MSMARCOEvalModelA"
},
{
"path": "FlagEmbedding/evaluation/msmarco/__main__.py",
"chars": 554,
"preview": "from transformers import HfArgumentParser\n\nfrom FlagEmbedding.evaluation.msmarco import (\n MSMARCOEvalArgs, MSMARCOEv"
},
{
"path": "FlagEmbedding/evaluation/msmarco/data_loader.py",
"chars": 12131,
"preview": "import os\nimport json\nimport logging\nimport datasets\nfrom tqdm import tqdm\nfrom typing import List, Optional\n\nfrom FlagE"
},
{
"path": "FlagEmbedding/evaluation/msmarco/runner.py",
"chars": 734,
"preview": "from FlagEmbedding.abc.evaluation import AbsEvalRunner\n\nfrom .data_loader import MSMARCOEvalDataLoader\n\n\nclass MSMARCOEv"
},
{
"path": "FlagEmbedding/evaluation/mteb/__init__.py",
"chars": 242,
"preview": "from FlagEmbedding.abc.evaluation import (\n AbsEvalModelArgs as MTEBEvalModelArgs,\n)\n\nfrom .arguments import MTEBEval"
},
{
"path": "FlagEmbedding/evaluation/mteb/__main__.py",
"chars": 527,
"preview": "from transformers import HfArgumentParser\n\nfrom FlagEmbedding.evaluation.mteb import (\n MTEBEvalArgs, MTEBEvalModelAr"
},
{
"path": "FlagEmbedding/evaluation/mteb/arguments.py",
"chars": 895,
"preview": "from dataclasses import dataclass, field\nfrom typing import List\n\nfrom FlagEmbedding.abc.evaluation.arguments import Abs"
},
{
"path": "FlagEmbedding/evaluation/mteb/examples/AmazonCounterfactualClassification.csv",
"chars": 454,
"preview": "text,label\n\"I wish I could have used this head set but the day I received it it wouldn't even turn on and I really wante"
},
{
"path": "FlagEmbedding/evaluation/mteb/examples/AmazonPolarityClassification.csv",
"chars": 671,
"preview": "text,label\n\"Hunting the Hard Way Thia was a gift for my Husband, who loved the book. It arrived on the date we were told"
},
{
"path": "FlagEmbedding/evaluation/mteb/examples/AmazonReviewsClassification.csv",
"chars": 951,
"preview": "text,label\n\"DO NOT ORDER THIS\\n\\nThis isn't what's described at all. Taking it out of the package lace was cut upon arri"
},
{
"path": "FlagEmbedding/evaluation/mteb/examples/ArguAna.csv",
"chars": 3150,
"preview": "query,pos\r\n\"People will die if we don’t do animal testing Every year, 23 new drugs are introduced in the UK alone.[13] A"
},
{
"path": "FlagEmbedding/evaluation/mteb/examples/ArxivClusteringP2P.csv",
"chars": 4523,
"preview": "text,label\n\"A Novel Approach to Enhancing Cybersecurity in Smart Grids through Deep Reinforcement Learning The integra"
},
{
"path": "FlagEmbedding/evaluation/mteb/examples/ArxivClusteringS2S.csv",
"chars": 417,
"preview": "text,label\n\"A Survey on Graph Neural Networks: Algorithms and Applications\",cs\n\"Hamiltonian Dynamics and KAM Theory for "
},
{
"path": "FlagEmbedding/evaluation/mteb/examples/AskUbuntuDupQuestions.csv",
"chars": 315,
"preview": "query,positive\nangularjs infinite scroll in a container,AngularJS ng-infinite-scroll not working on a specific container"
},
{
"path": "FlagEmbedding/evaluation/mteb/examples/BIOSSES.csv",
"chars": 580,
"preview": "sent1,sent2\n\"Recent studies have highlighted the crucial role of p53 in regulating cell cycle progression.\",\"Recent rese"
},
{
"path": "FlagEmbedding/evaluation/mteb/examples/Banking77Classification.csv",
"chars": 549,
"preview": "text,label\n\"What is my money worth in other countries?\",exchange_rate\n\"What can I do if my card still hasn't arrived aft"
},
{
"path": "FlagEmbedding/evaluation/mteb/examples/BiorxivClusteringP2P.csv",
"chars": 4625,
"preview": "text,label\n\"Neural Mechanisms of Social Cognition: A Study on Mirror Neurons and EmpathySocial cognition is the mental p"
},
{
"path": "FlagEmbedding/evaluation/mteb/examples/BiorxivClusteringS2S.csv",
"chars": 653,
"preview": "text,label\n\"Neural Circuit Dynamics in Decision-Making: A Computational Model of Prefrontal-Striatal Interactions\",neuro"
},
{
"path": "FlagEmbedding/evaluation/mteb/examples/CQADupstack.csv",
"chars": 316,
"preview": "query,positive\nangularjs infinite scroll in a container,AngularJS ng-infinite-scroll not working on a specific container"
},
{
"path": "FlagEmbedding/evaluation/mteb/examples/CQADupstackRetrieval.csv",
"chars": 9,
"preview": "query,pos"
},
{
"path": "FlagEmbedding/evaluation/mteb/examples/ClimateFEVER.csv",
"chars": 2078,
"preview": "query,pos\n\"Global warming is causing more frequent and intense hurricanes.\",\"Hurricanes and Climate Change Hurricanes, "
},
{
"path": "FlagEmbedding/evaluation/mteb/examples/DBPedia.csv",
"chars": 1240,
"preview": "query,pos \n\"Chefs with a show on the Food Network.\",\"Robert Irvine Robert Irvine (born 24 September 1965) is a British "
},
{
"path": "FlagEmbedding/evaluation/mteb/examples/EmotionClassification.csv",
"chars": 667,
"preview": "text,label_text\r\n\"i am bothered is that he might changed his feelings once he get back in us and leave me heartbroken\",s"
},
{
"path": "FlagEmbedding/evaluation/mteb/examples/FEVER.csv",
"chars": 4985,
"preview": "query,pos\n\"Ricky Martin acts.\",\"Ricky Martin Enrique Martín Morales ( born December 24 , 1971 ) , commonly known as Rick"
},
{
"path": "FlagEmbedding/evaluation/mteb/examples/FiQA2018.csv",
"chars": 1495,
"preview": "query,pos\r\nWhat is a negotiable security and how are they related to derivatives?,\"A negotiable security is a financial "
},
{
"path": "FlagEmbedding/evaluation/mteb/examples/HotpotQA.csv",
"chars": 978,
"preview": "query,pos \n\"Which tennis player Anna-Lena Grönefeld or Mats Wilander turned professional first ?\",\"Anna-Lena Grönefeld "
},
{
"path": "FlagEmbedding/evaluation/mteb/examples/ImdbClassification.csv",
"chars": 525,
"preview": "text,label_text\r\n\"Renny Harlin's first American film was one of the best of a slew of prison-set horror films(like 'Deat"
},
{
"path": "FlagEmbedding/evaluation/mteb/examples/MSMARCO.csv",
"chars": 956,
"preview": "query,pos\n\"what is a pms color\",\"PMS is a solid-color matching system, used primarily for specifying second or third col"
},
{
"path": "FlagEmbedding/evaluation/mteb/examples/MTOPDomainClassification.csv",
"chars": 306,
"preview": "text,label\n\"I am no longer available\",calling\n\"Cancel my reminder about my dentist appointment\",reminder\n\"Will it rain t"
},
{
"path": "FlagEmbedding/evaluation/mteb/examples/MTOPIntentClassification.csv",
"chars": 367,
"preview": "text,label\t\n\"When will my next alarm start\",GET_ALARM\n\"I need you to message Zachary Fletcher\",SEND_MESSAGE\n\"show me vid"
},
{
"path": "FlagEmbedding/evaluation/mteb/examples/MassiveIntentClassification.csv",
"chars": 433,
"preview": "text,label\n\"remind me to pay rent every month\",calendar_set\n\"please play yesterday from beatles\",play_music\n\"what will t"
},
{
"path": "FlagEmbedding/evaluation/mteb/examples/MassiveScenarioClassification.csv",
"chars": 318,
"preview": "text,label\n\"can you confirm that my meeting for tomorrow has been canceled\",calendar\n\"please open my music application a"
},
{
"path": "FlagEmbedding/evaluation/mteb/examples/MedrxivClusteringP2P.csv",
"chars": 4861,
"preview": "text,label\n\"Socioeconomic Disparities in COVID-19 Transmission Risk: A Population-Based Study from Norway\\nObjective: Ex"
},
{
"path": "FlagEmbedding/evaluation/mteb/examples/MedrxivClusteringS2S.csv",
"chars": 602,
"preview": "text,label\n\"Evaluating the Efficacy of New Therapeutic Agents in the Management of Hypertension-Induced Kidney Damage\",n"
},
{
"path": "FlagEmbedding/evaluation/mteb/examples/MindSmallReranking.csv",
"chars": 286,
"preview": "query,pos\n\"'Wheel Of Fortune' Guest Delivers Hilarious, Off The Rails Introduction\",\"Charles Rogers, former Michigan Sta"
},
{
"path": "FlagEmbedding/evaluation/mteb/examples/NFCorpus.csv",
"chars": 4692,
"preview": "query,pos\r\n\"lung disease\",\"Hibiscus anthocyanins rich extract-induced apoptotic cell death in human promyelocytic leukem"
},
{
"path": "FlagEmbedding/evaluation/mteb/examples/NQ.csv",
"chars": 1202,
"preview": "query,pos\n\"what is the capital of australia\",\"Canberra Canberra is the capital city of Australia. Founded following th"
},
{
"path": "FlagEmbedding/evaluation/mteb/examples/QuoraRetrieval.csv",
"chars": 366,
"preview": "query,pos\n\"Why do people say Dhanush (South Indian actor) is ugly? I don't think so.?\",\"Why do people say Dhanush (South"
},
{
"path": "FlagEmbedding/evaluation/mteb/examples/RedditClustering.csv",
"chars": 311,
"preview": "text,label\n\"Financial Meltdown: Strategies for Surviving Economic Collapse\",collapse.txt\n\"Exclusive Comic Book Sale: Don"
},
{
"path": "FlagEmbedding/evaluation/mteb/examples/RedditClusteringP2P.csv",
"chars": 2632,
"preview": "text,label\n\"I've been thinking a lot about friendships lately. High school can be such a weird place when it comes to ma"
},
{
"path": "FlagEmbedding/evaluation/mteb/examples/SCIDOCS.csv",
"chars": 2135,
"preview": "query,pos\r\n\"Enhancing Urban Mobility Through Intelligent Transportation Systems\",\"Intelligent Transportation Systems (IT"
},
{
"path": "FlagEmbedding/evaluation/mteb/examples/SICK-R.csv",
"chars": 326,
"preview": "sent1,sent2\n\"The cat is lounging on the sunny windowsill.\",\"The feline is resting on the sunny windowsill.\"\n\"A woman is "
},
{
"path": "FlagEmbedding/evaluation/mteb/examples/STS12.csv",
"chars": 674,
"preview": "sent1,sent2\n\"A man is dancing on the ceiling.\",\"A man is dancing on the ceiling of a room.\"\n\"That is a shameful state of"
},
{
"path": "FlagEmbedding/evaluation/mteb/examples/STS13.csv",
"chars": 302,
"preview": "sent1,sent2\n\"the state of being exposed to danger or harm\",\"the condition of being at risk of injury or loss.\"\n\"a set of"
},
{
"path": "FlagEmbedding/evaluation/mteb/examples/STS14.csv",
"chars": 360,
"preview": "sent1,sent2\n\"president obama vows to work with congress on immigration reform .\",\"obama pledges to collaborate with cong"
},
{
"path": "FlagEmbedding/evaluation/mteb/examples/STS15.csv",
"chars": 441,
"preview": "sent1,sent2\n\"The battery and bulb A are not in the same path\",\"Bulb A and the battery are not in the same circuit.\"\n\"Swi"
},
{
"path": "FlagEmbedding/evaluation/mteb/examples/STS16.csv",
"chars": 286,
"preview": "sent1,sent2\n\"what are the symptoms of a heart attack ?\",\"what are the signs of a heart attack ?\"\n\"how do i change a flat"
},
{
"path": "FlagEmbedding/evaluation/mteb/examples/STS17.csv",
"chars": 263,
"preview": "sent1,sent2\n\"The sun is setting over the mountains.\", \"The sun sets behind the mountains.\"\n\"A child is playing with a re"
},
{
"path": "FlagEmbedding/evaluation/mteb/examples/STS22.csv",
"chars": 4315,
"preview": "sent1,sent2\n\"The court said the ruling has stayed till January 18.\\n\\nThe Prevention of Money Laundering Act (PMLA) cour"
},
{
"path": "FlagEmbedding/evaluation/mteb/examples/STSBenchmark.csv",
"chars": 425,
"preview": "sent1,sent2\n\"Agribusiness: Mad cow disease found in California\",\"USDA Confirms Case of Mad Cow Disease in California\"\n\"s"
},
{
"path": "FlagEmbedding/evaluation/mteb/examples/SciDocsRR.csv",
"chars": 898,
"preview": "query,pos\n\"Intelligent Word-Based Spam Filter Detection Using Multi-Neural Networks\",\"Efficient Harmful Email identifica"
},
{
"path": "FlagEmbedding/evaluation/mteb/examples/SciFact.csv",
"chars": 1305,
"preview": "query,pos\r\n1 in 5 million in UK have abnormal PrP positivity.,\"Research conducted by the UK's National Prion Clinic indi"
},
{
"path": "FlagEmbedding/evaluation/mteb/examples/SprintDuplicateQuestions.csv",
"chars": 732,
"preview": "sent1,sent2\n\"Kyocera duraforce pro international roaming settings\",\"Make a call while roaming internationally - Kyocera "
},
{
"path": "FlagEmbedding/evaluation/mteb/examples/StackExchangeClustering.csv",
"chars": 421,
"preview": "text,label\n\"Recommendations for a lightweight Markdown editor with real-time collaboration features?\",softwarerecs.stack"
},
{
"path": "FlagEmbedding/evaluation/mteb/examples/StackExchangeClusteringP2P.csv",
"chars": 4544,
"preview": "text,label\n\"I'm currently facing an issue with my Unity project involving UI scaling across different resolutions. I've "
},
{
"path": "FlagEmbedding/evaluation/mteb/examples/StackOverflowDupQuestions.csv",
"chars": 376,
"preview": "query,pos\n\"How to handle onChange event in React when state changes programmatically?\",\"React onChange event not firing "
},
{
"path": "FlagEmbedding/evaluation/mteb/examples/SummEval.csv",
"chars": 1204,
"preview": "sum1,sum2\n\"Luis Suárez is reportedly being eyed by Barcelona for a potential return. After a successful spell at Atlétic"
},
{
"path": "FlagEmbedding/evaluation/mteb/examples/TRECCOVID.csv",
"chars": 1516,
"preview": "query,pos\n\"How effective are antiviral drugs like favipiravir and molnupiravir against COVID-19?\",\"The ongoing COVID-19 "
},
{
"path": "FlagEmbedding/evaluation/mteb/examples/Touche2020.csv",
"chars": 7084,
"preview": "query,pos\n\"Should governments invest more in space exploration?\",\"Governments should indeed increase their investments i"
},
{
"path": "FlagEmbedding/evaluation/mteb/examples/ToxicConversationsClassification.csv",
"chars": 525,
"preview": "text,label\n\"Pull your little head out of your big ass areola!\",\"toxic\"\n\"Trudeau will survive this alright as, unfortunat"
},
{
"path": "FlagEmbedding/evaluation/mteb/examples/TweetSentimentExtractionClassification.csv",
"chars": 348,
"preview": "text,label\n\"I`d have responded, if I were going\",neutral\n\"what interview! leave me alone\",negative\n\"2am feedings for the"
},
{
"path": "FlagEmbedding/evaluation/mteb/examples/TwentyNewsgroupsClustering.csv",
"chars": 269,
"preview": "text,label\n\"Major flaw discovered in widely-used encryption protocol\",sci.crypt\n\"Bruins' Unstoppable Winning Streak\",rec"
},
{
"path": "FlagEmbedding/evaluation/mteb/examples/TwitterSemEval2015.csv",
"chars": 520,
"preview": "sent1,sent2\n\"Excited for the new Game of Thrones episode tonight!\",\"Can't wait for tonight's Game of Thrones episode!\"\n\""
},
{
"path": "FlagEmbedding/evaluation/mteb/examples/TwitterURLCorpus.csv",
"chars": 558,
"preview": "sent1,sent2\n\"Elon Musk says Tesla will be profitable next quarter.\",\"Elon Musk claims Tesla will turn a profit next quar"
},
{
"path": "FlagEmbedding/evaluation/mteb/prompts.py",
"chars": 12049,
"preview": "from typing import Dict\n\n\ndef get_task_def_by_task_name_and_type(task_name: str, task_type: str) -> str:\n if task_typ"
},
{
"path": "FlagEmbedding/evaluation/mteb/runner.py",
"chars": 6841,
"preview": "import logging\nimport os\nimport mteb\nimport json\nimport pandas as pd\nfrom typing import Tuple, Union\n\nfrom FlagEmbedding"
},
{
"path": "FlagEmbedding/evaluation/mteb/searcher.py",
"chars": 3571,
"preview": "import numpy as np\n\nfrom typing import List, Dict, Optional\nfrom FlagEmbedding.abc.evaluation import EvalDenseRetriever,"
},
{
"path": "FlagEmbedding/finetune/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "FlagEmbedding/finetune/embedder/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "FlagEmbedding/finetune/embedder/decoder_only/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "FlagEmbedding/finetune/embedder/decoder_only/base/__init__.py",
"chars": 634,
"preview": "from FlagEmbedding.abc.finetune.embedder import (\n AbsEmbedderDataArguments as DecoderOnlyEmbedderDataArguments,\n "
},
{
"path": "FlagEmbedding/finetune/embedder/decoder_only/base/__main__.py",
"chars": 876,
"preview": "from transformers import HfArgumentParser\n\nfrom FlagEmbedding.finetune.embedder.decoder_only.base import (\n DecoderOn"
},
{
"path": "FlagEmbedding/finetune/embedder/decoder_only/base/arguments.py",
"chars": 2576,
"preview": "from typing import Optional, List\nfrom dataclasses import dataclass, field\n\nfrom FlagEmbedding.abc.finetune.embedder imp"
},
{
"path": "FlagEmbedding/finetune/embedder/decoder_only/base/load_model.py",
"chars": 7203,
"preview": "import os\nimport re\nimport torch\nimport logging\nfrom transformers import AutoConfig, AutoModel, AutoTokenizer\nfrom peft "
},
{
"path": "FlagEmbedding/finetune/embedder/decoder_only/base/modeling.py",
"chars": 7977,
"preview": "import logging\n\nimport torch\nfrom transformers import AutoModel, PreTrainedModel, PreTrainedTokenizer\n\nfrom FlagEmbeddin"
},
{
"path": "FlagEmbedding/finetune/embedder/decoder_only/base/runner.py",
"chars": 5676,
"preview": "import logging\nfrom typing import Tuple\nfrom pathlib import Path\nfrom transformers import AutoConfig, AutoTokenizer, Pre"
},
{
"path": "FlagEmbedding/finetune/embedder/decoder_only/base/trainer.py",
"chars": 1719,
"preview": "import os\nimport torch\nimport logging\nfrom typing import Optional\n\nfrom FlagEmbedding.abc.finetune.embedder import AbsEm"
},
{
"path": "FlagEmbedding/finetune/embedder/decoder_only/icl/__init__.py",
"chars": 761,
"preview": "from FlagEmbedding.abc.finetune.embedder import (\n AbsEmbedderTrainingArguments as DecoderOnlyEmbedderICLTrainingArgu"
},
{
"path": "FlagEmbedding/finetune/embedder/decoder_only/icl/__main__.py",
"chars": 908,
"preview": "from transformers import HfArgumentParser\n\nfrom FlagEmbedding.finetune.embedder.decoder_only.icl import (\n DecoderOnl"
},
{
"path": "FlagEmbedding/finetune/embedder/decoder_only/icl/arguments.py",
"chars": 3301,
"preview": "from typing import Optional, List\nfrom dataclasses import dataclass, field\n\nfrom FlagEmbedding.abc.finetune.embedder imp"
},
{
"path": "FlagEmbedding/finetune/embedder/decoder_only/icl/dataset.py",
"chars": 11735,
"preview": "import math\nimport random\nimport logging\nfrom dataclasses import dataclass\nfrom transformers import (\n PreTrainedToke"
},
{
"path": "FlagEmbedding/finetune/embedder/decoder_only/icl/load_model.py",
"chars": 7082,
"preview": "import os\nimport re\nimport torch\nimport logging\nfrom transformers import AutoConfig, AutoModel, AutoTokenizer\nfrom peft "
},
{
"path": "FlagEmbedding/finetune/embedder/decoder_only/icl/modeling.py",
"chars": 7980,
"preview": "import logging\n\nimport torch\nfrom transformers import AutoModel, PreTrainedModel, PreTrainedTokenizer\n\nfrom FlagEmbeddin"
},
{
"path": "FlagEmbedding/finetune/embedder/decoder_only/icl/runner.py",
"chars": 6990,
"preview": "import logging\nfrom typing import Tuple\nfrom pathlib import Path\nfrom transformers import AutoConfig, AutoTokenizer, Pre"
},
{
"path": "FlagEmbedding/finetune/embedder/decoder_only/icl/trainer.py",
"chars": 1722,
"preview": "import os\nimport torch\nimport logging\nfrom typing import Optional\n\nfrom FlagEmbedding.abc.finetune.embedder import AbsEm"
},
{
"path": "FlagEmbedding/finetune/embedder/encoder_only/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "FlagEmbedding/finetune/embedder/encoder_only/base/__init__.py",
"chars": 645,
"preview": "from FlagEmbedding.abc.finetune.embedder import (\n AbsEmbedderModelArguments as EncoderOnlyEmbedderModelArguments,\n "
},
{
"path": "FlagEmbedding/finetune/embedder/encoder_only/base/__main__.py",
"chars": 876,
"preview": "from transformers import HfArgumentParser\n\nfrom FlagEmbedding.finetune.embedder.encoder_only.base import (\n EncoderOn"
},
{
"path": "FlagEmbedding/finetune/embedder/encoder_only/base/modeling.py",
"chars": 7952,
"preview": "import logging\n\nimport torch\nfrom transformers import AutoModel, PreTrainedModel, PreTrainedTokenizer\n\nfrom FlagEmbeddin"
},
{
"path": "FlagEmbedding/finetune/embedder/encoder_only/base/runner.py",
"chars": 3277,
"preview": "import logging\nfrom typing import Tuple\nfrom transformers import (\n AutoModel, AutoConfig,\n AutoTokenizer, PreTrai"
},
{
"path": "FlagEmbedding/finetune/embedder/encoder_only/base/trainer.py",
"chars": 1718,
"preview": "import os\nimport torch\nimport logging\nfrom typing import Optional\n\nfrom FlagEmbedding.abc.finetune.embedder import AbsEm"
},
{
"path": "FlagEmbedding/finetune/embedder/encoder_only/m3/__init__.py",
"chars": 696,
"preview": "from FlagEmbedding.abc.finetune.embedder import AbsEmbedderDataArguments as EncoderOnlyEmbedderM3DataArguments\n\nfrom .ar"
},
{
"path": "FlagEmbedding/finetune/embedder/encoder_only/m3/__main__.py",
"chars": 866,
"preview": "from transformers import HfArgumentParser\n\nfrom FlagEmbedding.finetune.embedder.encoder_only.m3 import (\n EncoderOnly"
},
{
"path": "FlagEmbedding/finetune/embedder/encoder_only/m3/arguments.py",
"chars": 941,
"preview": "from dataclasses import dataclass, field\n\nfrom FlagEmbedding.abc.finetune.embedder import (\n AbsEmbedderTrainingArgum"
},
{
"path": "FlagEmbedding/finetune/embedder/encoder_only/m3/modeling.py",
"chars": 25144,
"preview": "import os\nimport logging\nfrom typing import Dict, List, Union, Any\n\nimport torch\nfrom torch import Tensor\nimport torch.n"
},
{
"path": "FlagEmbedding/finetune/embedder/encoder_only/m3/runner.py",
"chars": 7220,
"preview": "import os\nimport torch\nimport logging\nfrom typing import Tuple\nfrom transformers import (\n AutoModel, AutoConfig,\n "
},
{
"path": "FlagEmbedding/finetune/embedder/encoder_only/m3/trainer.py",
"chars": 1703,
"preview": "import os\nimport torch\nimport logging\nfrom typing import Optional\n\nfrom FlagEmbedding.abc.finetune.embedder import AbsEm"
},
{
"path": "FlagEmbedding/finetune/reranker/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "FlagEmbedding/finetune/reranker/decoder_only/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "FlagEmbedding/finetune/reranker/decoder_only/base/__init__.py",
"chars": 317,
"preview": "from .modeling import CrossDecoderModel\nfrom .runner import DecoderOnlyRerankerRunner\nfrom .arguments import RerankerMod"
},
{
"path": "FlagEmbedding/finetune/reranker/decoder_only/base/__main__.py",
"chars": 816,
"preview": "from transformers import HfArgumentParser\n\nfrom FlagEmbedding.abc.finetune.reranker import (\n AbsRerankerDataArgument"
},
{
"path": "FlagEmbedding/finetune/reranker/decoder_only/base/arguments.py",
"chars": 1853,
"preview": "from typing import List\nfrom dataclasses import dataclass, field\n\nfrom FlagEmbedding.abc.finetune.reranker import AbsRer"
},
{
"path": "FlagEmbedding/finetune/reranker/decoder_only/base/load_model.py",
"chars": 6004,
"preview": "import os\nimport re\nimport logging\nfrom transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer\nfrom peft im"
},
{
"path": "FlagEmbedding/finetune/reranker/decoder_only/base/modeling.py",
"chars": 1841,
"preview": "import torch\nfrom transformers import PreTrainedModel, AutoTokenizer\nimport logging\n\nfrom FlagEmbedding.abc.finetune.rer"
},
{
"path": "FlagEmbedding/finetune/reranker/decoder_only/base/runner.py",
"chars": 3999,
"preview": "import logging\nfrom typing import Tuple\nfrom pathlib import Path\nfrom FlagEmbedding.abc.finetune.reranker.AbsArguments i"
},
{
"path": "FlagEmbedding/finetune/reranker/decoder_only/base/trainer.py",
"chars": 2140,
"preview": "import os\nimport torch\nimport logging\nfrom typing import Optional\n# from transformers.deepspeed import is_deepspeed_zero"
},
{
"path": "FlagEmbedding/finetune/reranker/decoder_only/layerwise/__init__.py",
"chars": 317,
"preview": "from .modeling import CrossDecoderModel\nfrom .runner import DecoderOnlyRerankerRunner\nfrom .arguments import RerankerMod"
},
{
"path": "FlagEmbedding/finetune/reranker/decoder_only/layerwise/__main__.py",
"chars": 821,
"preview": "from transformers import HfArgumentParser\n\nfrom FlagEmbedding.abc.finetune.reranker import (\n AbsRerankerDataArgument"
},
{
"path": "FlagEmbedding/finetune/reranker/decoder_only/layerwise/arguments.py",
"chars": 2465,
"preview": "from typing import List\nfrom dataclasses import dataclass, field\n\nfrom FlagEmbedding.abc.finetune.reranker import AbsRer"
},
{
"path": "FlagEmbedding/finetune/reranker/decoder_only/layerwise/configuration_minicpm_reranker.py",
"chars": 9966,
"preview": "# coding=utf-8\n# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.\n#\n# This code is based on"
},
{
"path": "FlagEmbedding/finetune/reranker/decoder_only/layerwise/load_model.py",
"chars": 9741,
"preview": "import os\nimport re\nimport logging\nfrom torch import nn\nfrom transformers import AutoConfig, AutoModelForCausalLM, AutoT"
},
{
"path": "FlagEmbedding/finetune/reranker/decoder_only/layerwise/modeling.py",
"chars": 3563,
"preview": "import torch\nfrom transformers import PreTrainedModel, AutoTokenizer\nimport logging\nfrom typing import List, Union, Dict"
}
]
// ... and 868 more files (download for full content)
About this extraction
This page contains the full source code of the FlagOpen/FlagEmbedding GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 1068 files (25.8 MB), approximately 6.8M tokens, and a symbol index with 3104 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.