gitextract_i6tp0pjv/

├── .cursor/
│   ├── commands/
│   │   ├── git-pr.md
│   │   └── update-howto-guide.md
│   ├── rules/
│   │   ├── docs-diataxis-guidelines.mdc
│   │   ├── docs-structure.mdc
│   │   ├── project-structure.mdc
│   │   ├── update-guide.mdc
│   │   └── use-uv-cli.mdc
│   └── worktrees.json
├── .dockerignore
├── .gitattributes
├── .github/
│   ├── ISSUE_TEMPLATE/
│   │   ├── bug_report.md
│   │   ├── feature_request.md
│   │   └── question.md
│   ├── pull_request_template.md
│   └── workflows/
│       ├── ci.yaml
│       ├── claude-code-review.yml
│       ├── claude-docs-apply.yml
│       ├── claude-docs-check.yml
│       ├── claude.yml
│       ├── issue-manager.yaml
│       ├── publish-examples.yml
│       └── python-publish.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .readthedocs.yml
├── CLAUDE.md
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── Makefile
├── README.md
├── SECURITY.md
├── docs/
│   ├── INSTALL
│   ├── Makefile
│   ├── _static/
│   │   ├── annotated_data.json
│   │   ├── css/
│   │   │   ├── highlight_ipython3.css
│   │   │   ├── highlight_ipython3_dark.css
│   │   │   ├── highlight_ipython3_light.css
│   │   │   ├── highlight_python.css
│   │   │   ├── highlight_python_dark.css
│   │   │   ├── highlight_python_light.css
│   │   │   └── ragas.css
│   │   ├── edited_chain_runs.json
│   │   ├── js/
│   │   │   ├── commonroom.js
│   │   │   ├── header_border.js
│   │   │   ├── mathjax.js
│   │   │   ├── mendable_chat_bubble.js
│   │   │   └── toggle.js
│   │   └── sample_annotated_summary.json
│   ├── alfred.py
│   ├── community/
│   │   ├── index.md
│   │   └── pdf_export.md
│   ├── concepts/
│   │   ├── components/
│   │   │   ├── eval_dataset.md
│   │   │   ├── eval_sample.md
│   │   │   ├── index.md
│   │   │   └── prompt.md
│   │   ├── datasets.md
│   │   ├── experimentation.md
│   │   ├── feedback/
│   │   │   └── index.md
│   │   ├── index.md
│   │   ├── metrics/
│   │   │   ├── available_metrics/
│   │   │   │   ├── agents.md
│   │   │   │   ├── answer_correctness.md
│   │   │   │   ├── answer_relevance.md
│   │   │   │   ├── aspect_critic.md
│   │   │   │   ├── context_entities_recall.md
│   │   │   │   ├── context_precision.md
│   │   │   │   ├── context_recall.md
│   │   │   │   ├── factual_correctness.md
│   │   │   │   ├── faithfulness.md
│   │   │   │   ├── general_purpose.md
│   │   │   │   ├── index.md
│   │   │   │   ├── multi_modal_faithfulness.md
│   │   │   │   ├── multi_modal_relevance.md
│   │   │   │   ├── noise_sensitivity.md
│   │   │   │   ├── nvidia_metrics.md
│   │   │   │   ├── rubrics_based.md
│   │   │   │   ├── semantic_similarity.md
│   │   │   │   ├── sql.md
│   │   │   │   ├── summarization_score.md
│   │   │   │   └── traditional.md
│   │   │   ├── index.md
│   │   │   └── overview/
│   │   │       └── index.md
│   │   └── test_data_generation/
│   │       ├── agents.md
│   │       ├── index.md
│   │       └── rag.md
│   ├── extra/
│   │   ├── components/
│   │   │   ├── choose_evaluator_llm.md
│   │   │   └── choose_generator_llm.md
│   │   ├── overrides/
│   │   │   └── main.html
│   │   ├── ragas-modern.css
│   │   └── style.css
│   ├── getstarted/
│   │   ├── evals.md
│   │   ├── experiments_quickstart.md
│   │   ├── index.md
│   │   ├── install.md
│   │   ├── quickstart.md
│   │   ├── rag_eval.md
│   │   └── rag_testset_generation.md
│   ├── howtos/
│   │   ├── applications/
│   │   │   ├── _cost.md
│   │   │   ├── add_to_ci.md
│   │   │   ├── align-llm-as-judge.md
│   │   │   ├── benchmark_llm.md
│   │   │   ├── compare_embeddings.md
│   │   │   ├── compare_llms.md
│   │   │   ├── cost.ipynb
│   │   │   ├── evaluate-and-improve-rag.md
│   │   │   ├── evaluating_multi_turn_conversations.md
│   │   │   ├── index.md
│   │   │   ├── iterate_prompt.md
│   │   │   ├── prompt_optimization.md
│   │   │   ├── singlehop_testset_gen.md
│   │   │   ├── text2sql.md
│   │   │   ├── vertexai_alignment.md
│   │   │   ├── vertexai_model_comparision.md
│   │   │   └── vertexai_x_ragas.md
│   │   ├── cli/
│   │   │   ├── agent_evals.md
│   │   │   ├── benchmark_llm.md
│   │   │   ├── improve_rag.md
│   │   │   ├── index.md
│   │   │   ├── judge_alignment.md
│   │   │   ├── llamaIndex_agent_evals.md
│   │   │   ├── prompt_evals.md
│   │   │   ├── rag_eval.md
│   │   │   ├── text2sql.md
│   │   │   └── workflow_eval.md
│   │   ├── customizations/
│   │   │   ├── _caching.md
│   │   │   ├── caching.ipynb
│   │   │   ├── cancellation.md
│   │   │   ├── customize_models.md
│   │   │   ├── index.md
│   │   │   ├── metrics/
│   │   │   │   ├── _cost.md
│   │   │   │   ├── cost.ipynb
│   │   │   │   ├── metrics_language_adaptation.md
│   │   │   │   ├── modifying-prompts-metrics.md
│   │   │   │   └── tracing.md
│   │   │   ├── optimizers/
│   │   │   │   └── index.md
│   │   │   ├── run_config.md
│   │   │   └── testgenerator/
│   │   │       ├── _language_adaptation.md
│   │   │       ├── _persona_generator.md
│   │   │       ├── _testgen-custom-single-hop.md
│   │   │       ├── _testgen-customisation.md
│   │   │       ├── index.md
│   │   │       ├── language_adaptation.ipynb
│   │   │       ├── persona_generator.ipynb
│   │   │       ├── prechunked_data.md
│   │   │       ├── testgen-custom-single-hop.ipynb
│   │   │       └── testgen-customisation.ipynb
│   │   ├── index.md
│   │   ├── integrations/
│   │   │   ├── _ag_ui.md
│   │   │   ├── _arize.md
│   │   │   ├── _athina.md
│   │   │   ├── _haystack.md
│   │   │   ├── _helicone.md
│   │   │   ├── _langchain.md
│   │   │   ├── _langfuse.md
│   │   │   ├── _langgraph_agent_evaluation.md
│   │   │   ├── _langsmith.md
│   │   │   ├── _llamaindex.md
│   │   │   ├── _openlayer.md
│   │   │   ├── _opik.md
│   │   │   ├── _tonic-validate.md
│   │   │   ├── _zeno.md
│   │   │   ├── ag_ui.ipynb
│   │   │   ├── ag_ui.md
│   │   │   ├── amazon_bedrock.md
│   │   │   ├── arize.ipynb
│   │   │   ├── athina.ipynb
│   │   │   ├── gemini.md
│   │   │   ├── griptape.md
│   │   │   ├── haystack.ipynb
│   │   │   ├── haystack.md
│   │   │   ├── helicone.ipynb
│   │   │   ├── index.md
│   │   │   ├── langchain.ipynb
│   │   │   ├── langchain.md
│   │   │   ├── langfuse.ipynb
│   │   │   ├── langgraph_agent_evaluation.ipynb
│   │   │   ├── langsmith.ipynb
│   │   │   ├── langsmith.md
│   │   │   ├── llama_stack.md
│   │   │   ├── llamaindex.ipynb
│   │   │   ├── llamaindex_agents.md
│   │   │   ├── nyc_wikipedia/
│   │   │   │   └── nyc_text.txt
│   │   │   ├── oci_genai.md
│   │   │   ├── openlayer.ipynb
│   │   │   ├── opik.ipynb
│   │   │   ├── r2r.md
│   │   │   ├── swarm_agent_evaluation.md
│   │   │   ├── tonic-validate.ipynb
│   │   │   └── zeno.ipynb
│   │   ├── llm-adapters.md
│   │   ├── migrations/
│   │   │   ├── migrate_from_v01_to_v02.md
│   │   │   └── migrate_from_v03_to_v04.md
│   │   └── observability.md
│   ├── index.md
│   ├── ipynb_to_md.py
│   ├── make.bat
│   ├── quoted_spans_metric.md
│   ├── references/
│   │   ├── aevaluate.md
│   │   ├── cache.md
│   │   ├── embeddings.md
│   │   ├── evaluate.md
│   │   ├── evaluation_schema.md
│   │   ├── executor.md
│   │   ├── generate.md
│   │   ├── graph.md
│   │   ├── index.md
│   │   ├── integrations.md
│   │   ├── llms.md
│   │   ├── metrics.md
│   │   ├── optimizers.md
│   │   ├── prompt.md
│   │   ├── run_config.md
│   │   ├── synthesizers.md
│   │   ├── testset_schema.md
│   │   ├── tokenizers.md
│   │   └── transforms.md
│   └── tutorials/
│       ├── agent.md
│       ├── index.md
│       ├── prompt.md
│       ├── rag.md
│       └── workflow.md
├── examples/
│   ├── LICENSE
│   ├── README.md
│   ├── gdrive_append_example.py
│   ├── gdrive_backend_example.py
│   ├── iterate_prompt/
│   │   ├── __init__.py
│   │   ├── datasets/
│   │   │   └── support_triage.csv
│   │   ├── evals.py
│   │   ├── promptv1.txt
│   │   ├── promptv2_fewshot.txt
│   │   └── run_prompt.py
│   ├── oci_genai_example.py
│   ├── pyproject.toml
│   └── ragas_examples/
│       ├── __init__.py
│       ├── ag_ui_agent_experiments/
│       │   ├── README.md
│       │   ├── __init__.py
│       │   ├── experiments.py
│       │   └── test_data/
│       │       └── datasets/
│       │           ├── scientist_biographies.csv
│       │           └── weather_tool_calls.csv
│       ├── agent_evals/
│       │   ├── __init__.py
│       │   ├── agent.py
│       │   └── evals.py
│       ├── benchmark_llm/
│       │   ├── __init__.py
│       │   ├── datasets/
│       │   │   └── discount_benchmark.csv
│       │   ├── evals.py
│       │   └── prompt.py
│       ├── improve_rag/
│       │   ├── __init__.py
│       │   ├── evals/
│       │   │   └── datasets/
│       │   │       └── hf_doc_qa_eval.csv
│       │   ├── evals.py
│       │   ├── pyproject.toml
│       │   └── rag.py
│       ├── judge_alignment/
│       │   ├── __init__.py
│       │   └── evals.py
│       ├── llamaIndex_agent_evals/
│       │   ├── __init__.py
│       │   ├── contexts/
│       │   │   ├── ambiguous_removal_request.json
│       │   │   ├── duplicate_addition.json
│       │   │   └── repeated_removal.json
│       │   ├── evals.py
│       │   └── llamaindex_agent.py
│       ├── prompt_evals/
│       │   ├── __init__.py
│       │   ├── evals.py
│       │   └── prompt.py
│       ├── rag_eval/
│       │   ├── __init__.py
│       │   ├── evals.py
│       │   ├── pyproject.toml
│       │   └── rag.py
│       ├── text2sql/
│       │   ├── __init__.py
│       │   ├── analyze_errors.py
│       │   ├── data_utils.py
│       │   ├── datasets/
│       │   │   └── booksql_sample.csv
│       │   ├── db_utils.py
│       │   ├── evals.py
│       │   ├── prompt.txt
│       │   ├── prompt_v2.txt
│       │   ├── prompt_v3.txt
│       │   ├── text2sql_agent.py
│       │   └── validate_sql_dataset.py
│       └── workflow_eval/
│           ├── __init__.py
│           ├── evals.py
│           └── workflow.py
├── mkdocs-pdf.yml
├── mkdocs.yml
├── pyproject.toml
├── scripts/
│   └── dev_docs.sh
├── src/
│   └── ragas/
│       ├── __init__.py
│       ├── _analytics.py
│       ├── async_utils.py
│       ├── backends/
│       │   ├── README.md
│       │   ├── __init__.py
│       │   ├── base.py
│       │   ├── gdrive_backend.md
│       │   ├── gdrive_backend.py
│       │   ├── inmemory.py
│       │   ├── local_csv.py
│       │   ├── local_jsonl.py
│       │   ├── registry.py
│       │   └── utils.py
│       ├── cache.py
│       ├── callbacks.py
│       ├── cli.py
│       ├── config.py
│       ├── cost.py
│       ├── dataset.py
│       ├── dataset_schema.py
│       ├── embeddings/
│       │   ├── __init__.py
│       │   ├── base.py
│       │   ├── google_provider.py
│       │   ├── haystack_wrapper.py
│       │   ├── huggingface_provider.py
│       │   ├── litellm_provider.py
│       │   ├── openai_provider.py
│       │   └── utils.py
│       ├── evaluation.py
│       ├── exceptions.py
│       ├── executor.py
│       ├── experiment.py
│       ├── integrations/
│       │   ├── __init__.py
│       │   ├── ag_ui.py
│       │   ├── amazon_bedrock.py
│       │   ├── griptape.py
│       │   ├── helicone.py
│       │   ├── langchain.py
│       │   ├── langgraph.py
│       │   ├── langsmith.py
│       │   ├── llama_index.py
│       │   ├── opik.py
│       │   ├── r2r.py
│       │   ├── swarm.py
│       │   └── tracing/
│       │       ├── __init__.py
│       │       ├── langfuse.py
│       │       └── mlflow.py
│       ├── llms/
│       │   ├── __init__.py
│       │   ├── adapters/
│       │   │   ├── __init__.py
│       │   │   ├── base.py
│       │   │   ├── instructor.py
│       │   │   └── litellm.py
│       │   ├── base.py
│       │   ├── haystack_wrapper.py
│       │   ├── litellm_llm.py
│       │   └── oci_genai_wrapper.py
│       ├── losses.py
│       ├── messages.py
│       ├── metrics/
│       │   ├── __init__.py
│       │   ├── _answer_correctness.py
│       │   ├── _answer_relevance.py
│       │   ├── _answer_similarity.py
│       │   ├── _aspect_critic.py
│       │   ├── _bleu_score.py
│       │   ├── _chrf_score.py
│       │   ├── _context_entities_recall.py
│       │   ├── _context_precision.py
│       │   ├── _context_recall.py
│       │   ├── _datacompy_score.py
│       │   ├── _domain_specific_rubrics.py
│       │   ├── _factual_correctness.py
│       │   ├── _faithfulness.py
│       │   ├── _goal_accuracy.py
│       │   ├── _instance_specific_rubrics.py
│       │   ├── _multi_modal_faithfulness.py
│       │   ├── _multi_modal_relevance.py
│       │   ├── _noise_sensitivity.py
│       │   ├── _nv_metrics.py
│       │   ├── _rouge_score.py
│       │   ├── _simple_criteria.py
│       │   ├── _sql_semantic_equivalence.py
│       │   ├── _string.py
│       │   ├── _summarization.py
│       │   ├── _tool_call_accuracy.py
│       │   ├── _tool_call_f1.py
│       │   ├── _topic_adherence.py
│       │   ├── base.py
│       │   ├── collections/
│       │   │   ├── __init__.py
│       │   │   ├── _bleu_score.py
│       │   │   ├── _rouge_score.py
│       │   │   ├── _semantic_similarity.py
│       │   │   ├── _string.py
│       │   │   ├── agent_goal_accuracy/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── metric.py
│       │   │   │   └── util.py
│       │   │   ├── answer_accuracy/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── metric.py
│       │   │   │   └── util.py
│       │   │   ├── answer_correctness/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── metric.py
│       │   │   │   └── util.py
│       │   │   ├── answer_relevancy/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── metric.py
│       │   │   │   └── util.py
│       │   │   ├── base.py
│       │   │   ├── chrf_score/
│       │   │   │   ├── __init__.py
│       │   │   │   └── metric.py
│       │   │   ├── context_entity_recall/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── metric.py
│       │   │   │   └── util.py
│       │   │   ├── context_precision/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── metric.py
│       │   │   │   └── util.py
│       │   │   ├── context_recall/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── metric.py
│       │   │   │   └── util.py
│       │   │   ├── context_relevance/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── metric.py
│       │   │   │   └── util.py
│       │   │   ├── datacompy_score/
│       │   │   │   ├── __init__.py
│       │   │   │   └── metric.py
│       │   │   ├── domain_specific_rubrics/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── metric.py
│       │   │   │   └── util.py
│       │   │   ├── example_metric.py
│       │   │   ├── factual_correctness/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── metric.py
│       │   │   │   └── util.py
│       │   │   ├── faithfulness/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── metric.py
│       │   │   │   └── util.py
│       │   │   ├── instance_specific_rubrics/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── metric.py
│       │   │   │   └── util.py
│       │   │   ├── multi_modal_faithfulness/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── metric.py
│       │   │   │   └── util.py
│       │   │   ├── multi_modal_relevance/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── metric.py
│       │   │   │   └── util.py
│       │   │   ├── noise_sensitivity/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── metric.py
│       │   │   │   └── util.py
│       │   │   ├── quoted_spans/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── metric.py
│       │   │   │   └── util.py
│       │   │   ├── response_groundedness/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── metric.py
│       │   │   │   └── util.py
│       │   │   ├── sql_semantic_equivalence/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── metric.py
│       │   │   │   └── util.py
│       │   │   ├── summary_score/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── metric.py
│       │   │   │   └── util.py
│       │   │   ├── tool_call_accuracy/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── metric.py
│       │   │   │   └── util.py
│       │   │   ├── tool_call_f1/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── metric.py
│       │   │   │   └── util.py
│       │   │   └── topic_adherence/
│       │   │       ├── __init__.py
│       │   │       ├── metric.py
│       │   │       └── util.py
│       │   ├── decorator.py
│       │   ├── discrete.py
│       │   ├── numeric.py
│       │   ├── quoted_spans.py
│       │   ├── ranking.py
│       │   ├── result.py
│       │   ├── utils.py
│       │   └── validators.py
│       ├── optimizers/
│       │   ├── __init__.py
│       │   ├── base.py
│       │   ├── dspy_adapter.py
│       │   ├── dspy_llm_wrapper.py
│       │   ├── dspy_optimizer.py
│       │   ├── genetic.py
│       │   └── utils.py
│       ├── prompt/
│       │   ├── __init__.py
│       │   ├── base.py
│       │   ├── dynamic_few_shot.py
│       │   ├── few_shot_pydantic_prompt.py
│       │   ├── metrics/
│       │   │   ├── __init__.py
│       │   │   ├── answer_accuracy.py
│       │   │   ├── answer_correctness.py
│       │   │   ├── answer_relevance.py
│       │   │   ├── base_prompt.py
│       │   │   ├── common.py
│       │   │   ├── context_entity_recall.py
│       │   │   ├── context_recall.py
│       │   │   ├── context_relevance.py
│       │   │   ├── factual_correctness.py
│       │   │   ├── noise_sensitivity.py
│       │   │   ├── response_groundedness.py
│       │   │   └── summary_score.py
│       │   ├── mixin.py
│       │   ├── multi_modal_prompt.py
│       │   ├── prompt-formats.md
│       │   ├── pydantic_prompt.py
│       │   ├── simple_prompt.py
│       │   └── utils.py
│       ├── py.typed
│       ├── run_config.py
│       ├── sdk.py
│       ├── testset/
│       │   ├── __init__.py
│       │   ├── graph.py
│       │   ├── graph_queries.py
│       │   ├── persona.py
│       │   ├── synthesizers/
│       │   │   ├── __init__.py
│       │   │   ├── base.py
│       │   │   ├── generate.py
│       │   │   ├── multi_hop/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── abstract.py
│       │   │   │   ├── base.py
│       │   │   │   ├── prompts.py
│       │   │   │   └── specific.py
│       │   │   ├── prompts.py
│       │   │   ├── single_hop/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── base.py
│       │   │   │   ├── prompts.py
│       │   │   │   └── specific.py
│       │   │   ├── testset_schema.py
│       │   │   └── utils.py
│       │   └── transforms/
│       │       ├── __init__.py
│       │       ├── base.py
│       │       ├── default.py
│       │       ├── engine.py
│       │       ├── extractors/
│       │       │   ├── __init__.py
│       │       │   ├── embeddings.py
│       │       │   ├── llm_based.py
│       │       │   └── regex_based.py
│       │       ├── filters.py
│       │       ├── relationship_builders/
│       │       │   ├── __init__.py
│       │       │   ├── cosine.py
│       │       │   └── traditional.py
│       │       └── splitters/
│       │           ├── __init__.py
│       │           └── headline.py
│       ├── tokenizers.py
│       ├── utils.py
│       └── validation.py
└── tests/
    ├── __init__.py
    ├── benchmarks/
    │   ├── Dockerfile
    │   ├── benchmark_eval.py
    │   ├── benchmark_testsetgen.py
    │   └── utils.py
    ├── conftest.py
    ├── docs/
    │   ├── __init__.py
    │   └── test_run_config.py
    ├── e2e/
    │   ├── __init__.py
    │   ├── metrics_migration/
    │   │   ├── __init__.py
    │   │   ├── base_migration_test.py
    │   │   ├── conftest.py
    │   │   ├── metric_score_diff.ipynb
    │   │   ├── plan-for-metrics-migration.md
    │   │   ├── test_answer_accuracy_migration.py
    │   │   ├── test_answer_correctness_migration.py
    │   │   ├── test_answer_relevancy_migration.py
    │   │   ├── test_bleu_migration.py
    │   │   ├── test_context_entity_recall_migration.py
    │   │   ├── test_context_precision_migration.py
    │   │   ├── test_context_recall_migration.py
    │   │   ├── test_context_relevance_migration.py
    │   │   ├── test_factual_correctness_migration.py
    │   │   ├── test_faithfulness_migration.py
    │   │   ├── test_noise_sensitivity_migration.py
    │   │   ├── test_response_groundedness_migration.py
    │   │   ├── test_rouge_migration.py
    │   │   ├── test_semantic_similarity_migration.py
    │   │   ├── test_string_migration.py
    │   │   ├── test_summary_score_migration.py
    │   │   └── test_utils.py
    │   ├── test_adaptation.py
    │   ├── test_amnesty_in_ci.py
    │   ├── test_dataset_utils.py
    │   ├── test_dspy_integration.py
    │   ├── test_fullflow.py
    │   ├── test_langchain_llm_attributes.py
    │   └── test_testset_generation.py
    ├── test_quoted_spans.py
    ├── unit/
    │   ├── backends/
    │   │   ├── test_gdrive_backend.py
    │   │   ├── test_inmemory.py
    │   │   ├── test_local_csv.py
    │   │   └── test_local_jsonl.py
    │   ├── integrations/
    │   │   ├── test_ag_ui.py
    │   │   ├── test_tracing.py
    │   │   └── test_tracing_simple.py
    │   ├── llms/
    │   │   ├── test_adapters.py
    │   │   ├── test_instructor_factory.py
    │   │   ├── test_llm.py
    │   │   └── test_system_prompt.py
    │   ├── prompt/
    │   │   ├── test_base_prompt.py
    │   │   ├── test_dynamic_few_shot_prompt.py
    │   │   ├── test_prompt_mixin.py
    │   │   ├── test_prompt_save_load.py
    │   │   └── test_prompt_utils.py
    │   ├── test_analytics.py
    │   ├── test_async_evaluation.py
    │   ├── test_async_utils.py
    │   ├── test_average_precision_algorithm.py
    │   ├── test_cache.py
    │   ├── test_cancellation.py
    │   ├── test_chrf_score.py
    │   ├── test_chrf_score_collections.py
    │   ├── test_cli.py
    │   ├── test_cosine_relationship_builders.py
    │   ├── test_cost.py
    │   ├── test_datacompy_score_collections.py
    │   ├── test_dataset_schema.py
    │   ├── test_datatable_inheritance.py
    │   ├── test_domain_specific_rubrics_collections.py
    │   ├── test_dspy_adapter.py
    │   ├── test_dspy_optimizer.py
    │   ├── test_embeddings.py
    │   ├── test_embeddings_caching.py
    │   ├── test_engine.py
    │   ├── test_executor.py
    │   ├── test_executor_in_jupyter.ipynb
    │   ├── test_experiment.py
    │   ├── test_graph.py
    │   ├── test_import.py
    │   ├── test_instance_specific_rubrics_collections.py
    │   ├── test_knowledge_graph_clusters.py
    │   ├── test_knowledge_graph_save.py
    │   ├── test_langgraph.py
    │   ├── test_llm_context.py
    │   ├── test_metric.py
    │   ├── test_metric_decorators.py
    │   ├── test_multi_hop_query_synthesizer.py
    │   ├── test_multi_modal_faithfulness_collections.py
    │   ├── test_multi_modal_relevance_collections.py
    │   ├── test_oci_genai_wrapper.py
    │   ├── test_optimizer_config.py
    │   ├── test_prechunked_generation.py
    │   ├── test_prompt.py
    │   ├── test_quoted_spans_collections.py
    │   ├── test_run_config.py
    │   ├── test_simple.py
    │   ├── test_simple_llm_metric_persistence.py
    │   ├── test_single_hop_query_synthesizer.py
    │   ├── test_sql_semantic_equivalence_collections.py
    │   ├── test_testset_schema.py
    │   ├── test_tokenizers.py
    │   ├── test_tool_call_accuracy.py
    │   ├── test_tool_call_accuracy_collections.py
    │   ├── test_tool_call_f1.py
    │   ├── test_tool_call_f1_collections.py
    │   ├── test_traditional_relationship_builders.py
    │   ├── test_utils.py
    │   ├── test_uvloop_compatibility.py
    │   └── test_validation.py
    └── utils/
        ├── __init__.py
        ├── llm_setup.py
        └── metric_comparison.py