gitextract_eh4wghd3/

├── .cursor-plugin/
│   └── plugin.json
├── .github/
│   ├── ISSUE_TEMPLATE/
│   │   ├── bug_report.md
│   │   └── feature_request.md
│   └── workflows/
│       ├── black.yml
│       ├── changelog.yml
│       ├── full_test_core_for_pr.yml
│       ├── test_confident.yml
│       ├── test_core.yml
│       ├── test_integrations.yml
│       └── test_metrics.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .scripts/
│   └── changelog/
│       └── generate.py
├── .vscode/
│   └── settings.json
├── CITATION.cff
├── CONTRIBUTING.md
├── LICENSE.md
├── MAINTAINERS.md
├── MANIFEST.in
├── README.md
├── deepeval/
│   ├── __init__.py
│   ├── _version.py
│   ├── annotation/
│   │   ├── __init__.py
│   │   ├── annotation.py
│   │   └── api.py
│   ├── anthropic/
│   │   ├── __init__.py
│   │   ├── extractors.py
│   │   ├── patch.py
│   │   └── utils.py
│   ├── benchmarks/
│   │   ├── __init__.py
│   │   ├── arc/
│   │   │   ├── __init__.py
│   │   │   ├── arc.py
│   │   │   ├── mode.py
│   │   │   └── template.py
│   │   ├── base_benchmark.py
│   │   ├── bbq/
│   │   │   ├── __init__.py
│   │   │   ├── bbq.py
│   │   │   ├── task.py
│   │   │   └── template.py
│   │   ├── big_bench_hard/
│   │   │   ├── __init__.py
│   │   │   ├── big_bench_hard.py
│   │   │   ├── cot_prompts/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── boolean_expressions.txt
│   │   │   │   ├── causal_judgement.txt
│   │   │   │   ├── date_understanding.txt
│   │   │   │   ├── disambiguation_qa.txt
│   │   │   │   ├── dyck_languages.txt
│   │   │   │   ├── formal_fallacies.txt
│   │   │   │   ├── geometric_shapes.txt
│   │   │   │   ├── hyperbaton.txt
│   │   │   │   ├── logical_deduction_five_objects.txt
│   │   │   │   ├── logical_deduction_seven_objects.txt
│   │   │   │   ├── logical_deduction_three_objects.txt
│   │   │   │   ├── movie_recommendation.txt
│   │   │   │   ├── multistep_arithmetic_two.txt
│   │   │   │   ├── navigate.txt
│   │   │   │   ├── object_counting.txt
│   │   │   │   ├── penguins_in_a_table.txt
│   │   │   │   ├── reasoning_about_colored_objects.txt
│   │   │   │   ├── ruin_names.txt
│   │   │   │   ├── salient_translation_error_detection.txt
│   │   │   │   ├── snarks.txt
│   │   │   │   ├── sports_understanding.txt
│   │   │   │   ├── temporal_sequences.txt
│   │   │   │   ├── tracking_shuffled_objects_five_objects.txt
│   │   │   │   ├── tracking_shuffled_objects_seven_objects.txt
│   │   │   │   ├── tracking_shuffled_objects_three_objects.txt
│   │   │   │   ├── web_of_lies.txt
│   │   │   │   └── word_sorting.txt
│   │   │   ├── shot_prompts/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── boolean_expressions.txt
│   │   │   │   ├── causal_judgement.txt
│   │   │   │   ├── date_understanding.txt
│   │   │   │   ├── disambiguation_qa.txt
│   │   │   │   ├── dyck_languages.txt
│   │   │   │   ├── formal_fallacies.txt
│   │   │   │   ├── geometric_shapes.txt
│   │   │   │   ├── hyperbaton.txt
│   │   │   │   ├── logical_deduction_five_objects.txt
│   │   │   │   ├── logical_deduction_seven_objects.txt
│   │   │   │   ├── logical_deduction_three_objects.txt
│   │   │   │   ├── movie_recommendation.txt
│   │   │   │   ├── multistep_arithmetic_two.txt
│   │   │   │   ├── navigate.txt
│   │   │   │   ├── object_counting.txt
│   │   │   │   ├── penguins_in_a_table.txt
│   │   │   │   ├── reasoning_about_colored_objects.txt
│   │   │   │   ├── ruin_names.txt
│   │   │   │   ├── salient_translation_error_detection.txt
│   │   │   │   ├── snarks.txt
│   │   │   │   ├── sports_understanding.txt
│   │   │   │   ├── temporal_sequences.txt
│   │   │   │   ├── tracking_shuffled_objects_five_objects.txt
│   │   │   │   ├── tracking_shuffled_objects_seven_objects.txt
│   │   │   │   ├── tracking_shuffled_objects_three_objects.txt
│   │   │   │   ├── web_of_lies.txt
│   │   │   │   └── word_sorting.txt
│   │   │   ├── task.py
│   │   │   └── template.py
│   │   ├── bool_q/
│   │   │   ├── __init__.py
│   │   │   ├── bool_q.py
│   │   │   └── template.py
│   │   ├── drop/
│   │   │   ├── __init__.py
│   │   │   ├── drop.py
│   │   │   ├── task.py
│   │   │   └── template.py
│   │   ├── equity_med_qa/
│   │   │   ├── __init__.py
│   │   │   ├── equity_med_qa.py
│   │   │   ├── task.py
│   │   │   └── template.py
│   │   ├── gsm8k/
│   │   │   ├── __init__.py
│   │   │   ├── gsm8k.py
│   │   │   └── template.py
│   │   ├── hellaswag/
│   │   │   ├── __init__.py
│   │   │   ├── hellaswag.py
│   │   │   ├── task.py
│   │   │   └── template.py
│   │   ├── human_eval/
│   │   │   ├── __init__.py
│   │   │   ├── human_eval.py
│   │   │   ├── task.py
│   │   │   └── template.py
│   │   ├── ifeval/
│   │   │   ├── __init__.py
│   │   │   ├── ifeval.py
│   │   │   └── template.py
│   │   ├── lambada/
│   │   │   ├── __init__.py
│   │   │   ├── lambada.py
│   │   │   └── template.py
│   │   ├── logi_qa/
│   │   │   ├── __init__.py
│   │   │   ├── logi_qa.py
│   │   │   ├── task.py
│   │   │   └── template.py
│   │   ├── math_qa/
│   │   │   ├── __init__.py
│   │   │   ├── math_qa.py
│   │   │   ├── task.py
│   │   │   └── template.py
│   │   ├── mmlu/
│   │   │   ├── __init__.py
│   │   │   ├── mmlu.py
│   │   │   ├── task.py
│   │   │   └── template.py
│   │   ├── modes/
│   │   │   └── __init__.py
│   │   ├── results.py
│   │   ├── schema.py
│   │   ├── squad/
│   │   │   ├── __init__.py
│   │   │   ├── squad.py
│   │   │   ├── task.py
│   │   │   └── template.py
│   │   ├── tasks/
│   │   │   └── __init__.py
│   │   ├── truthful_qa/
│   │   │   ├── __init__.py
│   │   │   ├── mode.py
│   │   │   ├── task.py
│   │   │   ├── template.py
│   │   │   └── truthful_qa.py
│   │   ├── utils.py
│   │   └── winogrande/
│   │       ├── __init__.py
│   │       ├── template.py
│   │       └── winogrande.py
│   ├── cli/
│   │   ├── __init__.py
│   │   ├── dotenv_handler.py
│   │   ├── generate/
│   │   │   ├── __init__.py
│   │   │   ├── command.py
│   │   │   └── utils.py
│   │   ├── inspect.py
│   │   ├── main.py
│   │   ├── server.py
│   │   ├── test/
│   │   │   ├── __init__.py
│   │   │   └── command.py
│   │   ├── types.py
│   │   └── utils.py
│   ├── confident/
│   │   ├── __init__.py
│   │   ├── api.py
│   │   └── types.py
│   ├── config/
│   │   ├── __init__.py
│   │   ├── dotenv_handler.py
│   │   ├── logging.py
│   │   ├── settings.py
│   │   ├── settings_manager.py
│   │   └── utils.py
│   ├── constants.py
│   ├── contextvars.py
│   ├── dataset/
│   │   ├── __init__.py
│   │   ├── api.py
│   │   ├── dataset.py
│   │   ├── golden.py
│   │   ├── test_run_tracer.py
│   │   ├── types.py
│   │   └── utils.py
│   ├── errors.py
│   ├── evaluate/
│   │   ├── __init__.py
│   │   ├── api.py
│   │   ├── compare.py
│   │   ├── configs.py
│   │   ├── console_report.py
│   │   ├── evaluate.py
│   │   ├── execute/
│   │   │   ├── __init__.py
│   │   │   ├── _common.py
│   │   │   ├── agentic.py
│   │   │   ├── e2e.py
│   │   │   ├── loop.py
│   │   │   └── trace_scope.py
│   │   ├── local_store.py
│   │   ├── types.py
│   │   └── utils.py
│   ├── inspect/
│   │   ├── __init__.py
│   │   ├── __main__.py
│   │   ├── app.py
│   │   ├── fixtures/
│   │   │   └── test_run_sample.json
│   │   ├── loader.py
│   │   ├── styles.tcss
│   │   ├── types.py
│   │   └── widgets/
│   │       ├── __init__.py
│   │       ├── _styling.py
│   │       ├── details.py
│   │       ├── header_bar.py
│   │       ├── help_modal.py
│   │       ├── search_bar.py
│   │       └── span_tree.py
│   ├── integrations/
│   │   ├── README.md
│   │   ├── __init__.py
│   │   ├── agentcore/
│   │   │   ├── __init__.py
│   │   │   ├── instrumentator.py
│   │   │   └── otel.py
│   │   ├── crewai/
│   │   │   ├── __init__.py
│   │   │   ├── handler.py
│   │   │   ├── subs.py
│   │   │   ├── tool.py
│   │   │   └── wrapper.py
│   │   ├── google_adk/
│   │   │   ├── __init__.py
│   │   │   └── otel.py
│   │   ├── hugging_face/
│   │   │   ├── __init__.py
│   │   │   ├── callback.py
│   │   │   ├── rich_manager.py
│   │   │   ├── tests/
│   │   │   │   └── test_callbacks.py
│   │   │   └── utils.py
│   │   ├── langchain/
│   │   │   ├── __init__.py
│   │   │   ├── callback.py
│   │   │   ├── patch.py
│   │   │   └── utils.py
│   │   ├── llama_index/
│   │   │   ├── __init__.py
│   │   │   ├── handler.py
│   │   │   └── utils.py
│   │   ├── openinference/
│   │   │   ├── __init__.py
│   │   │   ├── instrumentator.py
│   │   │   └── otel.py
│   │   ├── pydantic_ai/
│   │   │   ├── README.md
│   │   │   ├── __init__.py
│   │   │   ├── instrumentator.py
│   │   │   └── otel.py
│   │   └── strands/
│   │       ├── __init__.py
│   │       ├── instrumentator.py
│   │       └── otel.py
│   ├── key_handler.py
│   ├── metrics/
│   │   ├── __init__.py
│   │   ├── answer_relevancy/
│   │   │   ├── __init__.py
│   │   │   ├── answer_relevancy.py
│   │   │   ├── schema.py
│   │   │   └── template.py
│   │   ├── arena_g_eval/
│   │   │   ├── __init__.py
│   │   │   ├── arena_g_eval.py
│   │   │   ├── schema.py
│   │   │   ├── template.py
│   │   │   └── utils.py
│   │   ├── argument_correctness/
│   │   │   ├── __init__.py
│   │   │   ├── argument_correctness.py
│   │   │   ├── schema.py
│   │   │   └── template.py
│   │   ├── base_metric.py
│   │   ├── bias/
│   │   │   ├── __init__.py
│   │   │   ├── bias.py
│   │   │   ├── schema.py
│   │   │   └── template.py
│   │   ├── contextual_precision/
│   │   │   ├── __init__.py
│   │   │   ├── contextual_precision.py
│   │   │   ├── schema.py
│   │   │   └── template.py
│   │   ├── contextual_recall/
│   │   │   ├── __init__.py
│   │   │   ├── contextual_recall.py
│   │   │   ├── schema.py
│   │   │   └── template.py
│   │   ├── contextual_relevancy/
│   │   │   ├── __init__.py
│   │   │   ├── contextual_relevancy.py
│   │   │   ├── schema.py
│   │   │   └── template.py
│   │   ├── conversation_completeness/
│   │   │   ├── __init__.py
│   │   │   ├── conversation_completeness.py
│   │   │   ├── schema.py
│   │   │   └── template.py
│   │   ├── conversational_dag/
│   │   │   ├── __init__.py
│   │   │   ├── conversational_dag.py
│   │   │   ├── nodes.py
│   │   │   └── templates.py
│   │   ├── conversational_g_eval/
│   │   │   ├── __init__.py
│   │   │   ├── conversational_g_eval.py
│   │   │   ├── schema.py
│   │   │   └── template.py
│   │   ├── dag/
│   │   │   ├── __init__.py
│   │   │   ├── dag.py
│   │   │   ├── graph.py
│   │   │   ├── nodes.py
│   │   │   ├── schema.py
│   │   │   ├── serialization/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── registry.py
│   │   │   │   ├── serialization.py
│   │   │   │   └── types.py
│   │   │   ├── templates.py
│   │   │   └── utils.py
│   │   ├── exact_match/
│   │   │   ├── __init__.py
│   │   │   └── exact_match.py
│   │   ├── faithfulness/
│   │   │   ├── __init__.py
│   │   │   ├── faithfulness.py
│   │   │   ├── schema.py
│   │   │   └── template.py
│   │   ├── g_eval/
│   │   │   ├── __init__.py
│   │   │   ├── g_eval.py
│   │   │   ├── schema.py
│   │   │   ├── template.py
│   │   │   └── utils.py
│   │   ├── goal_accuracy/
│   │   │   ├── __init__.py
│   │   │   ├── goal_accuracy.py
│   │   │   ├── schema.py
│   │   │   └── template.py
│   │   ├── hallucination/
│   │   │   ├── __init__.py
│   │   │   ├── hallucination.py
│   │   │   ├── schema.py
│   │   │   └── template.py
│   │   ├── indicator.py
│   │   ├── json_correctness/
│   │   │   ├── __init__.py
│   │   │   ├── json_correctness.py
│   │   │   ├── schema.py
│   │   │   └── template.py
│   │   ├── knowledge_retention/
│   │   │   ├── __init__.py
│   │   │   ├── knowledge_retention.py
│   │   │   ├── schema.py
│   │   │   └── template.py
│   │   ├── mcp/
│   │   │   ├── __init__.py
│   │   │   ├── mcp_task_completion.py
│   │   │   ├── multi_turn_mcp_use_metric.py
│   │   │   ├── schema.py
│   │   │   └── template.py
│   │   ├── mcp_use_metric/
│   │   │   ├── __init__.py
│   │   │   ├── mcp_use_metric.py
│   │   │   ├── schema.py
│   │   │   └── template.py
│   │   ├── misuse/
│   │   │   ├── __init__.py
│   │   │   ├── misuse.py
│   │   │   ├── schema.py
│   │   │   └── template.py
│   │   ├── multimodal_metrics/
│   │   │   ├── __init__.py
│   │   │   ├── image_coherence/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── image_coherence.py
│   │   │   │   ├── schema.py
│   │   │   │   └── template.py
│   │   │   ├── image_editing/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── image_editing.py
│   │   │   │   ├── schema.py
│   │   │   │   └── template.py
│   │   │   ├── image_helpfulness/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── image_helpfulness.py
│   │   │   │   ├── schema.py
│   │   │   │   └── template.py
│   │   │   ├── image_reference/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── image_reference.py
│   │   │   │   ├── schema.py
│   │   │   │   └── template.py
│   │   │   └── text_to_image/
│   │   │       ├── __init__.py
│   │   │       ├── schema.py
│   │   │       ├── template.py
│   │   │       └── text_to_image.py
│   │   ├── non_advice/
│   │   │   ├── __init__.py
│   │   │   ├── non_advice.py
│   │   │   ├── schema.py
│   │   │   └── template.py
│   │   ├── pattern_match/
│   │   │   ├── __init__.py
│   │   │   └── pattern_match.py
│   │   ├── pii_leakage/
│   │   │   ├── __init__.py
│   │   │   ├── pii_leakage.py
│   │   │   ├── schema.py
│   │   │   └── template.py
│   │   ├── plan_adherence/
│   │   │   ├── __init__.py
│   │   │   ├── plan_adherence.py
│   │   │   ├── schema.py
│   │   │   └── template.py
│   │   ├── plan_quality/
│   │   │   ├── __init__.py
│   │   │   ├── plan_quality.py
│   │   │   ├── schema.py
│   │   │   └── template.py
│   │   ├── prompt_alignment/
│   │   │   ├── __init__.py
│   │   │   ├── prompt_alignment.py
│   │   │   ├── schema.py
│   │   │   └── template.py
│   │   ├── ragas.py
│   │   ├── role_adherence/
│   │   │   ├── __init__.py
│   │   │   ├── role_adherence.py
│   │   │   ├── schema.py
│   │   │   └── template.py
│   │   ├── role_violation/
│   │   │   ├── __init__.py
│   │   │   ├── role_violation.py
│   │   │   ├── schema.py
│   │   │   └── template.py
│   │   ├── step_efficiency/
│   │   │   ├── __init__.py
│   │   │   ├── schema.py
│   │   │   ├── step_efficiency.py
│   │   │   └── template.py
│   │   ├── summarization/
│   │   │   ├── __init__.py
│   │   │   ├── schema.py
│   │   │   ├── summarization.py
│   │   │   └── template.py
│   │   ├── task_completion/
│   │   │   ├── __init__.py
│   │   │   ├── schema.py
│   │   │   ├── task_completion.py
│   │   │   └── template.py
│   │   ├── tool_correctness/
│   │   │   ├── __init__.py
│   │   │   ├── schema.py
│   │   │   ├── template.py
│   │   │   └── tool_correctness.py
│   │   ├── tool_use/
│   │   │   ├── __init__.py
│   │   │   ├── schema.py
│   │   │   ├── template.py
│   │   │   └── tool_use.py
│   │   ├── topic_adherence/
│   │   │   ├── __init__.py
│   │   │   ├── schema.py
│   │   │   ├── template.py
│   │   │   └── topic_adherence.py
│   │   ├── toxicity/
│   │   │   ├── __init__.py
│   │   │   ├── schema.py
│   │   │   ├── template.py
│   │   │   └── toxicity.py
│   │   ├── turn_contextual_precision/
│   │   │   ├── __init__.py
│   │   │   ├── schema.py
│   │   │   ├── template.py
│   │   │   └── turn_contextual_precision.py
│   │   ├── turn_contextual_recall/
│   │   │   ├── __init__.py
│   │   │   ├── schema.py
│   │   │   ├── template.py
│   │   │   └── turn_contextual_recall.py
│   │   ├── turn_contextual_relevancy/
│   │   │   ├── __init__.py
│   │   │   ├── schema.py
│   │   │   ├── template.py
│   │   │   └── turn_contextual_relevancy.py
│   │   ├── turn_faithfulness/
│   │   │   ├── __init__.py
│   │   │   ├── schema.py
│   │   │   ├── template.py
│   │   │   └── turn_faithfulness.py
│   │   ├── turn_relevancy/
│   │   │   ├── __init__.py
│   │   │   ├── schema.py
│   │   │   ├── template.py
│   │   │   └── turn_relevancy.py
│   │   └── utils.py
│   ├── model_integrations/
│   │   ├── __init__.py
│   │   ├── types.py
│   │   └── utils.py
│   ├── models/
│   │   ├── __init__.py
│   │   ├── _summac_model.py
│   │   ├── answer_relevancy_model.py
│   │   ├── base_model.py
│   │   ├── detoxify_model.py
│   │   ├── embedding_models/
│   │   │   ├── __init__.py
│   │   │   ├── azure_embedding_model.py
│   │   │   ├── local_embedding_model.py
│   │   │   ├── ollama_embedding_model.py
│   │   │   └── openai_embedding_model.py
│   │   ├── hallucination_model.py
│   │   ├── llms/
│   │   │   ├── __init__.py
│   │   │   ├── amazon_bedrock_model.py
│   │   │   ├── anthropic_model.py
│   │   │   ├── azure_model.py
│   │   │   ├── constants.py
│   │   │   ├── deepseek_model.py
│   │   │   ├── gemini_model.py
│   │   │   ├── grok_model.py
│   │   │   ├── kimi_model.py
│   │   │   ├── litellm_model.py
│   │   │   ├── local_model.py
│   │   │   ├── ollama_model.py
│   │   │   ├── openai_model.py
│   │   │   ├── openrouter_model.py
│   │   │   ├── portkey_model.py
│   │   │   └── utils.py
│   │   ├── retry_policy.py
│   │   ├── summac_model.py
│   │   ├── unbias_model.py
│   │   └── utils.py
│   ├── openai/
│   │   ├── __init__.py
│   │   ├── extractors.py
│   │   ├── patch.py
│   │   └── utils.py
│   ├── openai_agents/
│   │   ├── __init__.py
│   │   ├── agent.py
│   │   ├── callback_handler.py
│   │   ├── extractors.py
│   │   ├── patch.py
│   │   └── runner.py
│   ├── optimizer/
│   │   ├── __init__.py
│   │   ├── algorithms/
│   │   │   ├── __init__.py
│   │   │   ├── base.py
│   │   │   ├── configs.py
│   │   │   ├── copro/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── copro.py
│   │   │   │   ├── proposer.py
│   │   │   │   ├── schema.py
│   │   │   │   └── template.py
│   │   │   ├── gepa/
│   │   │   │   ├── __init__.py
│   │   │   │   └── gepa.py
│   │   │   ├── miprov2/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── bootstrapper.py
│   │   │   │   ├── miprov2.py
│   │   │   │   └── proposer/
│   │   │   │       ├── __init__.py
│   │   │   │       ├── proposer.py
│   │   │   │       ├── schema.py
│   │   │   │       └── template.py
│   │   │   └── simba/
│   │   │       ├── __init__.py
│   │   │       ├── proposer.py
│   │   │       ├── schema.py
│   │   │       ├── simba.py
│   │   │       └── template.py
│   │   ├── configs.py
│   │   ├── policies.py
│   │   ├── prompt_optimizer.py
│   │   ├── rewriter/
│   │   │   ├── __init__.py
│   │   │   ├── rewriter.py
│   │   │   ├── schema.py
│   │   │   └── template.py
│   │   ├── scorer/
│   │   │   ├── __init__.py
│   │   │   ├── base.py
│   │   │   ├── schema.py
│   │   │   ├── scorer.py
│   │   │   ├── template.py
│   │   │   └── utils.py
│   │   ├── types.py
│   │   └── utils.py
│   ├── plugins/
│   │   ├── __init__.py
│   │   └── plugin.py
│   ├── progress_context.py
│   ├── prompt/
│   │   ├── __init__.py
│   │   ├── api.py
│   │   ├── prompt.py
│   │   └── utils.py
│   ├── py.typed
│   ├── red_teaming/
│   │   └── README.md
│   ├── scorer/
│   │   ├── __init__.py
│   │   └── scorer.py
│   ├── simulator/
│   │   ├── __init__.py
│   │   ├── controller/
│   │   │   ├── __init__.py
│   │   │   ├── controller.py
│   │   │   ├── template.py
│   │   │   └── types.py
│   │   ├── conversation_simulator.py
│   │   ├── schema.py
│   │   ├── template.py
│   │   └── utils.py
│   ├── singleton.py
│   ├── synthesizer/
│   │   ├── __init__.py
│   │   ├── base_synthesizer.py
│   │   ├── chunking/
│   │   │   ├── __init__.py
│   │   │   ├── context_generator.py
│   │   │   └── doc_chunker.py
│   │   ├── config.py
│   │   ├── schema.py
│   │   ├── synthesizer.py
│   │   ├── templates/
│   │   │   ├── __init__.py
│   │   │   ├── template.py
│   │   │   ├── template_extraction.py
│   │   │   └── template_prompt.py
│   │   ├── types.py
│   │   └── utils.py
│   ├── telemetry.py
│   ├── test_case/
│   │   ├── __init__.py
│   │   ├── api.py
│   │   ├── arena_test_case.py
│   │   ├── conversational_test_case.py
│   │   ├── llm_test_case.py
│   │   ├── mcp.py
│   │   └── utils.py
│   ├── test_run/
│   │   ├── __init__.py
│   │   ├── api.py
│   │   ├── cache.py
│   │   ├── hooks.py
│   │   ├── hyperparameters.py
│   │   └── test_run.py
│   ├── tracing/
│   │   ├── __init__.py
│   │   ├── api.py
│   │   ├── context.py
│   │   ├── integrations.py
│   │   ├── internal.py
│   │   ├── offline_evals/
│   │   │   ├── __init__.py
│   │   │   ├── api.py
│   │   │   ├── span.py
│   │   │   ├── thread.py
│   │   │   └── trace.py
│   │   ├── otel/
│   │   │   ├── __init__.py
│   │   │   ├── context_aware_processor.py
│   │   │   ├── exporter.py
│   │   │   ├── test_exporter.py
│   │   │   └── utils.py
│   │   ├── patchers.py
│   │   ├── perf_epoch_bridge.py
│   │   ├── trace_context.py
│   │   ├── trace_test_manager.py
│   │   ├── tracing.py
│   │   ├── types.py
│   │   └── utils.py
│   └── utils.py
├── demo_trace_scope/
│   ├── __init__.py
│   └── test_observed_app.py
├── docs/
│   ├── .gitignore
│   ├── README.md
│   ├── app/
│   │   ├── (home)/
│   │   │   ├── layout.tsx
│   │   │   └── page.tsx
│   │   ├── api/
│   │   │   └── search/
│   │   │       └── route.ts
│   │   ├── blog/
│   │   │   ├── [[...slug]]/
│   │   │   │   └── page.tsx
│   │   │   └── layout.tsx
│   │   ├── changelog/
│   │   │   ├── [[...slug]]/
│   │   │   │   └── page.tsx
│   │   │   └── layout.tsx
│   │   ├── docs/
│   │   │   ├── [[...slug]]/
│   │   │   │   └── page.tsx
│   │   │   └── layout.tsx
│   │   ├── enterprise/
│   │   │   └── page.tsx
│   │   ├── global.css
│   │   ├── guides/
│   │   │   ├── [[...slug]]/
│   │   │   │   └── page.tsx
│   │   │   └── layout.tsx
│   │   ├── integrations/
│   │   │   ├── [[...slug]]/
│   │   │   │   └── page.tsx
│   │   │   └── layout.tsx
│   │   ├── layout.tsx
│   │   ├── llms-full.txt/
│   │   │   └── route.ts
│   │   ├── llms.mdx/
│   │   │   ├── blog/
│   │   │   │   └── [[...slug]]/
│   │   │   │       └── route.ts
│   │   │   ├── changelog/
│   │   │   │   └── [[...slug]]/
│   │   │   │       └── route.ts
│   │   │   ├── docs/
│   │   │   │   └── [[...slug]]/
│   │   │   │       └── route.ts
│   │   │   ├── guides/
│   │   │   │   └── [[...slug]]/
│   │   │   │       └── route.ts
│   │   │   ├── integrations/
│   │   │   │   └── [[...slug]]/
│   │   │   │       └── route.ts
│   │   │   └── tutorials/
│   │   │       └── [[...slug]]/
│   │   │           └── route.ts
│   │   ├── llms.txt/
│   │   │   └── route.ts
│   │   ├── og/
│   │   │   └── docs/
│   │   │       └── [...slug]/
│   │   │           └── route.tsx
│   │   ├── robots.ts
│   │   ├── sitemap.ts
│   │   └── tutorials/
│   │       ├── [[...slug]]/
│   │       │   └── page.tsx
│   │       └── layout.tsx
│   ├── components/
│   │   ├── mdx-anchor.tsx
│   │   └── mdx.tsx
│   ├── content/
│   │   ├── blog/
│   │   │   ├── deepeval-alternatives-compared.mdx
│   │   │   ├── deepeval-got-a-new-look.mdx
│   │   │   ├── deepeval-vs-arize.mdx
│   │   │   ├── deepeval-vs-langfuse.mdx
│   │   │   ├── deepeval-vs-ragas.mdx
│   │   │   ├── deepeval-vs-trulens.mdx
│   │   │   ├── index.mdx
│   │   │   ├── medical-chatbot-deepeval-guide.mdx
│   │   │   ├── meta.json
│   │   │   ├── rag-contract-assistant-deepeval-guide.mdx
│   │   │   ├── top-5-geval-use-cases.mdx
│   │   │   └── use-case-cognee-ai-memory.mdx
│   │   ├── changelog/
│   │   │   ├── changelog-2024.mdx
│   │   │   ├── changelog-2025.mdx
│   │   │   ├── changelog-2026.mdx
│   │   │   ├── index.mdx
│   │   │   └── meta.json
│   │   ├── docs/
│   │   │   ├── (agentic)/
│   │   │   │   ├── meta.json
│   │   │   │   ├── metrics-argument-correctness.mdx
│   │   │   │   ├── metrics-plan-adherence.mdx
│   │   │   │   ├── metrics-plan-quality.mdx
│   │   │   │   ├── metrics-step-efficiency.mdx
│   │   │   │   ├── metrics-task-completion.mdx
│   │   │   │   └── metrics-tool-correctness.mdx
│   │   │   ├── (algorithms)/
│   │   │   │   ├── meta.json
│   │   │   │   ├── prompt-optimization-copro.mdx
│   │   │   │   ├── prompt-optimization-gepa.mdx
│   │   │   │   ├── prompt-optimization-miprov2.mdx
│   │   │   │   └── prompt-optimization-simba.mdx
│   │   │   ├── (benchmarks)/
│   │   │   │   ├── benchmarks-arc.mdx
│   │   │   │   ├── benchmarks-bbq.mdx
│   │   │   │   ├── benchmarks-big-bench-hard.mdx
│   │   │   │   ├── benchmarks-bool-q.mdx
│   │   │   │   ├── benchmarks-drop.mdx
│   │   │   │   ├── benchmarks-gsm8k.mdx
│   │   │   │   ├── benchmarks-hellaswag.mdx
│   │   │   │   ├── benchmarks-human-eval.mdx
│   │   │   │   ├── benchmarks-ifeval.mdx
│   │   │   │   ├── benchmarks-lambada.mdx
│   │   │   │   ├── benchmarks-logi-qa.mdx
│   │   │   │   ├── benchmarks-math-qa.mdx
│   │   │   │   ├── benchmarks-mmlu.mdx
│   │   │   │   ├── benchmarks-squad.mdx
│   │   │   │   ├── benchmarks-truthful-qa.mdx
│   │   │   │   ├── benchmarks-winogrande.mdx
│   │   │   │   └── meta.json
│   │   │   ├── (concepts)/
│   │   │   │   ├── (test-cases)/
│   │   │   │   │   ├── evaluation-arena-test-cases.mdx
│   │   │   │   │   ├── evaluation-multiturn-test-cases.mdx
│   │   │   │   │   ├── evaluation-test-cases.mdx
│   │   │   │   │   └── meta.json
│   │   │   │   ├── evaluation-datasets.mdx
│   │   │   │   ├── evaluation-llm-tracing.mdx
│   │   │   │   ├── evaluation-mcp.mdx
│   │   │   │   ├── evaluation-prompts.mdx
│   │   │   │   └── meta.json
│   │   │   ├── (custom)/
│   │   │   │   ├── meta.json
│   │   │   │   ├── metrics-arena-g-eval.mdx
│   │   │   │   ├── metrics-conversational-dag.mdx
│   │   │   │   ├── metrics-conversational-g-eval.mdx
│   │   │   │   ├── metrics-custom.mdx
│   │   │   │   ├── metrics-dag.mdx
│   │   │   │   └── metrics-llm-evals.mdx
│   │   │   ├── (generate-goldens)/
│   │   │   │   ├── meta.json
│   │   │   │   ├── synthesizer-generate-from-contexts.mdx
│   │   │   │   ├── synthesizer-generate-from-docs.mdx
│   │   │   │   ├── synthesizer-generate-from-goldens.mdx
│   │   │   │   └── synthesizer-generate-from-scratch.mdx
│   │   │   ├── (images)/
│   │   │   │   ├── meta.json
│   │   │   │   ├── multimodal-metrics-image-coherence.mdx
│   │   │   │   ├── multimodal-metrics-image-editing.mdx
│   │   │   │   ├── multimodal-metrics-image-helpfulness.mdx
│   │   │   │   ├── multimodal-metrics-image-reference.mdx
│   │   │   │   └── multimodal-metrics-text-to-image.mdx
│   │   │   ├── (mcp)/
│   │   │   │   ├── meta.json
│   │   │   │   ├── metrics-mcp-task-completion.mdx
│   │   │   │   ├── metrics-mcp-use.mdx
│   │   │   │   └── metrics-multi-turn-mcp-use.mdx
│   │   │   ├── (metrics-others)/
│   │   │   │   ├── meta.json
│   │   │   │   ├── metrics-hallucination.mdx
│   │   │   │   ├── metrics-prompt-alignment.mdx
│   │   │   │   ├── metrics-ragas.mdx
│   │   │   │   └── metrics-summarization.mdx
│   │   │   ├── (multi-turn)/
│   │   │   │   ├── meta.json
│   │   │   │   ├── metrics-conversation-completeness.mdx
│   │   │   │   ├── metrics-goal-accuracy.mdx
│   │   │   │   ├── metrics-knowledge-retention.mdx
│   │   │   │   ├── metrics-role-adherence.mdx
│   │   │   │   ├── metrics-tool-use.mdx
│   │   │   │   ├── metrics-topic-adherence.mdx
│   │   │   │   ├── metrics-turn-contextual-precision.mdx
│   │   │   │   ├── metrics-turn-contextual-recall.mdx
│   │   │   │   ├── metrics-turn-contextual-relevancy.mdx
│   │   │   │   ├── metrics-turn-faithfulness.mdx
│   │   │   │   └── metrics-turn-relevancy.mdx
│   │   │   ├── (non-llm)/
│   │   │   │   ├── meta.json
│   │   │   │   ├── metrics-exact-match.mdx
│   │   │   │   ├── metrics-json-correctness.mdx
│   │   │   │   └── metrics-pattern-match.mdx
│   │   │   ├── (rag)/
│   │   │   │   ├── meta.json
│   │   │   │   ├── metrics-answer-relevancy.mdx
│   │   │   │   ├── metrics-contextual-precision.mdx
│   │   │   │   ├── metrics-contextual-recall.mdx
│   │   │   │   ├── metrics-contextual-relevancy.mdx
│   │   │   │   └── metrics-faithfulness.mdx
│   │   │   ├── (safety)/
│   │   │   │   ├── meta.json
│   │   │   │   ├── metrics-bias.mdx
│   │   │   │   ├── metrics-misuse.mdx
│   │   │   │   ├── metrics-non-advice.mdx
│   │   │   │   ├── metrics-pii-leakage.mdx
│   │   │   │   ├── metrics-role-violation.mdx
│   │   │   │   └── metrics-toxicity.mdx
│   │   │   ├── (use-cases)/
│   │   │   │   ├── getting-started-agents.mdx
│   │   │   │   ├── getting-started-chatbots.mdx
│   │   │   │   ├── getting-started-llm-arena.mdx
│   │   │   │   ├── getting-started-mcp.mdx
│   │   │   │   ├── getting-started-rag.mdx
│   │   │   │   └── meta.json
│   │   │   ├── benchmarks-introduction.mdx
│   │   │   ├── command-line-interface.mdx
│   │   │   ├── conversation-simulator/
│   │   │   │   ├── index.mdx
│   │   │   │   └── meta.json
│   │   │   ├── conversation-simulator-custom-templates.mdx
│   │   │   ├── conversation-simulator-lifecycle-hooks.mdx
│   │   │   ├── conversation-simulator-model-callback.mdx
│   │   │   ├── conversation-simulator-stopping-logic.mdx
│   │   │   ├── data-privacy.mdx
│   │   │   ├── environment-variables.mdx
│   │   │   ├── evaluation-component-level-llm-evals.mdx
│   │   │   ├── evaluation-end-to-end-llm-evals/
│   │   │   │   ├── index.mdx
│   │   │   │   └── meta.json
│   │   │   ├── evaluation-end-to-end-multi-turn.mdx
│   │   │   ├── evaluation-end-to-end-single-turn.mdx
│   │   │   ├── evaluation-flags-and-configs.mdx
│   │   │   ├── evaluation-introduction.mdx
│   │   │   ├── evaluation-unit-testing-in-ci-cd.mdx
│   │   │   ├── faq.mdx
│   │   │   ├── getting-started.mdx
│   │   │   ├── golden-synthesizer/
│   │   │   │   ├── index.mdx
│   │   │   │   └── meta.json
│   │   │   ├── introduction-comparisons.mdx
│   │   │   ├── introduction-design-philosophy.mdx
│   │   │   ├── introduction.mdx
│   │   │   ├── meta.json
│   │   │   ├── metrics-introduction.mdx
│   │   │   ├── miscellaneous.mdx
│   │   │   ├── prompt-optimization-introduction.mdx
│   │   │   ├── synthetic-data-generation-introduction.mdx
│   │   │   ├── troubleshooting.mdx
│   │   │   ├── vibe-coder-quickstart.mdx
│   │   │   └── vibe-coding.mdx
│   │   ├── guides/
│   │   │   ├── guides-ai-agent-evaluation-metrics.mdx
│   │   │   ├── guides-ai-agent-evaluation.mdx
│   │   │   ├── guides-answer-correctness-metric.mdx
│   │   │   ├── guides-building-custom-metrics.mdx
│   │   │   ├── guides-llm-as-a-judge.mdx
│   │   │   ├── guides-llm-observability.mdx
│   │   │   ├── guides-multi-turn-evaluation-metrics.mdx
│   │   │   ├── guides-multi-turn-evaluation.mdx
│   │   │   ├── guides-multi-turn-simulation.mdx
│   │   │   ├── guides-optimizing-hyperparameters.mdx
│   │   │   ├── guides-rag-evaluation.mdx
│   │   │   ├── guides-rag-triad.mdx
│   │   │   ├── guides-red-teaming.mdx
│   │   │   ├── guides-regression-testing-in-cicd.mdx
│   │   │   ├── guides-tracing-ai-agents.mdx
│   │   │   ├── guides-tracing-multi-turn.mdx
│   │   │   ├── guides-tracing-rag.mdx
│   │   │   ├── guides-using-custom-embedding-models.mdx
│   │   │   ├── guides-using-custom-llms.mdx
│   │   │   ├── guides-using-synthesizer.mdx
│   │   │   └── meta.json
│   │   ├── integrations/
│   │   │   ├── frameworks/
│   │   │   │   ├── agentcore.mdx
│   │   │   │   ├── anthropic.mdx
│   │   │   │   ├── crewai.mdx
│   │   │   │   ├── google-adk.mdx
│   │   │   │   ├── huggingface.mdx
│   │   │   │   ├── langchain.mdx
│   │   │   │   ├── langgraph.mdx
│   │   │   │   ├── llamaindex.mdx
│   │   │   │   ├── meta.json
│   │   │   │   ├── openai-agents.mdx
│   │   │   │   ├── openai.mdx
│   │   │   │   ├── pydanticai.mdx
│   │   │   │   └── strands.mdx
│   │   │   ├── index.mdx
│   │   │   ├── meta.json
│   │   │   ├── models/
│   │   │   │   ├── amazon-bedrock.mdx
│   │   │   │   ├── anthropic.mdx
│   │   │   │   ├── azure-openai.mdx
│   │   │   │   ├── deepseek.mdx
│   │   │   │   ├── gemini.mdx
│   │   │   │   ├── grok.mdx
│   │   │   │   ├── litellm.mdx
│   │   │   │   ├── lmstudio.mdx
│   │   │   │   ├── meta.json
│   │   │   │   ├── moonshot.mdx
│   │   │   │   ├── ollama.mdx
│   │   │   │   ├── openai.mdx
│   │   │   │   ├── openrouter.mdx
│   │   │   │   ├── portkey.mdx
│   │   │   │   ├── vertex-ai.mdx
│   │   │   │   └── vllm.mdx
│   │   │   ├── others/
│   │   │   │   └── meta.json
│   │   │   └── vector-databases/
│   │   │       ├── chroma.mdx
│   │   │       ├── cognee.mdx
│   │   │       ├── elasticsearch.mdx
│   │   │       ├── meta.json
│   │   │       ├── pgvector.mdx
│   │   │       ├── qdrant.mdx
│   │   │       └── weaviate.mdx
│   │   └── tutorials/
│   │       ├── medical-chatbot/
│   │       │   ├── development.mdx
│   │       │   ├── evals-in-prod.mdx
│   │       │   ├── evaluation.mdx
│   │       │   ├── improvement.mdx
│   │       │   └── introduction.mdx
│   │       ├── meta.json
│   │       ├── rag-qa-agent/
│   │       │   ├── development.mdx
│   │       │   ├── evals-in-prod.mdx
│   │       │   ├── evaluation.mdx
│   │       │   ├── improvement.mdx
│   │       │   └── introduction.mdx
│   │       ├── summarization-agent/
│   │       │   ├── development.mdx
│   │       │   ├── evals-in-prod.mdx
│   │       │   ├── evaluation.mdx
│   │       │   ├── improvement.mdx
│   │       │   └── introduction.mdx
│   │       ├── tutorial-introduction.mdx
│   │       └── tutorial-setup.mdx
│   ├── enterprise/
│   │   └── read-me.mdx
│   ├── home/
│   │   └── read-me.mdx
│   ├── lib/
│   │   ├── authors.ts
│   │   ├── blog-categories.ts
│   │   ├── cn.ts
│   │   ├── contributors.ts
│   │   ├── defaults.ts
│   │   ├── generated/
│   │   │   ├── changelog-contributors.json
│   │   │   ├── contributors.json
│   │   │   └── repo-contributors.json
│   │   ├── layout.shared.tsx
│   │   ├── llms-route.ts
│   │   ├── remark-admonitions.ts
│   │   ├── section.tsx
│   │   ├── sections.tsx
│   │   ├── shared.ts
│   │   └── source.ts
│   ├── next.config.mjs
│   ├── package.json
│   ├── postcss.config.mjs
│   ├── proxy.ts
│   ├── public/
│   │   ├── llms-full.txt
│   │   └── llms.txt
│   ├── scripts/
│   │   ├── build-readme-hero.mjs
│   │   ├── generate-changelog-contributors.mjs
│   │   ├── generate-contributors.mjs
│   │   ├── generate-repo-contributors.mjs
│   │   ├── normalize-admonition-titles.mjs
│   │   ├── replace-img-with-image-displayer.mjs
│   │   ├── strip-redundant-mdx-imports.mjs
│   │   └── timeline-to-steps.mjs
│   ├── source.config.ts
│   ├── src/
│   │   ├── assets.ts
│   │   ├── components/
│   │   │   ├── AgentTraceTerminal/
│   │   │   │   ├── AgentTraceTerminal.module.scss
│   │   │   │   └── index.tsx
│   │   │   ├── AskAIButton/
│   │   │   │   └── index.tsx
│   │   │   ├── BlogPostMeta/
│   │   │   │   ├── BlogPostMeta.module.scss
│   │   │   │   └── index.tsx
│   │   │   ├── BrandMarks/
│   │   │   │   └── index.tsx
│   │   │   ├── Buttons/
│   │   │   │   ├── Buttons.module.scss
│   │   │   │   └── index.tsx
│   │   │   ├── Callout/
│   │   │   │   ├── Callout.module.scss
│   │   │   │   └── index.tsx
│   │   │   ├── ChangelogContributors/
│   │   │   │   ├── ChangelogContributors.module.scss
│   │   │   │   └── index.tsx
│   │   │   ├── CloudPlatformCallout/
│   │   │   │   ├── CloudPlatformCallout.module.scss
│   │   │   │   └── index.tsx
│   │   │   ├── ContributorDisplay/
│   │   │   │   ├── ContributorDisplay.module.scss
│   │   │   │   └── index.tsx
│   │   │   ├── DiscordButton/
│   │   │   │   ├── DiscordButton.module.scss
│   │   │   │   └── index.tsx
│   │   │   ├── Equation/
│   │   │   │   ├── Equation.module.scss
│   │   │   │   └── index.tsx
│   │   │   ├── FAQ/
│   │   │   │   └── index.tsx
│   │   │   ├── FeatureComparisonTable/
│   │   │   │   ├── FeatureComparisonTable.module.scss
│   │   │   │   └── index.tsx
│   │   │   ├── GithubCtaButton/
│   │   │   │   ├── GithubCtaButton.module.scss
│   │   │   │   ├── index.tsx
│   │   │   │   └── useGithubStarCount.ts
│   │   │   ├── HeroAnnouncement/
│   │   │   │   ├── HeroAnnouncement.module.scss
│   │   │   │   └── index.tsx
│   │   │   ├── Hotkey/
│   │   │   │   ├── Hotkey.module.scss
│   │   │   │   └── index.tsx
│   │   │   ├── ImageDisplayer/
│   │   │   │   ├── ImageDisplayer.module.scss
│   │   │   │   └── index.tsx
│   │   │   ├── IntegrationGrid/
│   │   │   │   ├── IntegrationGrid.module.scss
│   │   │   │   └── index.tsx
│   │   │   ├── IntegrationTagsDisplayer/
│   │   │   │   ├── IntegrationTagsDisplayer.module.scss
│   │   │   │   └── index.tsx
│   │   │   ├── LinkCards/
│   │   │   │   ├── LinkCards.module.scss
│   │   │   │   └── index.tsx
│   │   │   ├── Mermaid/
│   │   │   │   └── index.tsx
│   │   │   ├── MetricTagsDisplayer/
│   │   │   │   ├── MetricTagsDisplayer.module.scss
│   │   │   │   └── index.tsx
│   │   │   ├── PageContributors/
│   │   │   │   ├── ContributorsOverflow.tsx
│   │   │   │   ├── PageContributors.module.scss
│   │   │   │   └── index.tsx
│   │   │   ├── PauseOffscreen/
│   │   │   │   └── index.tsx
│   │   │   ├── SchemaInjector/
│   │   │   │   └── SchemaInjector.tsx
│   │   │   ├── SectionLabel/
│   │   │   │   ├── SectionLabel.module.scss
│   │   │   │   └── index.tsx
│   │   │   ├── SiteThemeSwitch/
│   │   │   │   ├── SiteThemeSwitch.module.scss
│   │   │   │   └── index.tsx
│   │   │   ├── TechStackCards/
│   │   │   │   ├── TechStackCards.module.scss
│   │   │   │   └── index.tsx
│   │   │   ├── TocFooter/
│   │   │   │   ├── TocFooter.module.scss
│   │   │   │   └── index.tsx
│   │   │   ├── VideoDisplayer/
│   │   │   │   ├── VideoDisplayer.module.scss
│   │   │   │   └── index.tsx
│   │   │   └── index.ts
│   │   ├── global.d.ts
│   │   ├── layouts/
│   │   │   ├── Footer/
│   │   │   │   ├── Footer.module.scss
│   │   │   │   └── index.tsx
│   │   │   ├── HomeLayout/
│   │   │   │   ├── HomeLayout.module.scss
│   │   │   │   ├── HomeLayout.tsx
│   │   │   │   └── index.ts
│   │   │   ├── HomeOverflowNav/
│   │   │   │   ├── HomeOverflowNav.module.scss
│   │   │   │   └── index.tsx
│   │   │   ├── HomePageShell/
│   │   │   │   ├── HomePageShell.module.scss
│   │   │   │   └── index.tsx
│   │   │   ├── NavHeader/
│   │   │   │   ├── NavHeader.module.scss
│   │   │   │   └── index.tsx
│   │   │   ├── NavLinks/
│   │   │   │   └── index.tsx
│   │   │   ├── NavMenu/
│   │   │   │   ├── NavMenu.module.scss
│   │   │   │   └── index.tsx
│   │   │   ├── NotebookSectionLayout/
│   │   │   │   └── index.tsx
│   │   │   ├── SidebarSearch/
│   │   │   │   └── index.tsx
│   │   │   ├── SiteTopNav/
│   │   │   │   ├── SiteTopNav.module.scss
│   │   │   │   └── index.tsx
│   │   │   └── UtmCapture/
│   │   │       ├── UtmCapture.tsx
│   │   │       └── index.ts
│   │   ├── sections/
│   │   │   ├── enterprise/
│   │   │   │   ├── EnterpriseComparisonTable.module.scss
│   │   │   │   ├── EnterpriseComparisonTable.tsx
│   │   │   │   ├── EnterpriseHeroSection.module.scss
│   │   │   │   ├── EnterpriseHeroSection.tsx
│   │   │   │   ├── EnterprisePlatformMockup.module.scss
│   │   │   │   └── EnterprisePlatformMockup.tsx
│   │   │   └── home/
│   │   │       ├── ClaudeCodeTerminal/
│   │   │       │   ├── ClaudeCodeTerminal.module.scss
│   │   │       │   └── index.tsx
│   │   │       ├── CompanyLogos/
│   │   │       │   ├── Aws.tsx
│   │   │       │   ├── Benz.tsx
│   │   │       │   ├── Bosch.tsx
│   │   │       │   ├── CompanyLogos.module.scss
│   │   │       │   ├── CvsHealth.tsx
│   │   │       │   ├── Ey.tsx
│   │   │       │   ├── Mastercard.tsx
│   │   │       │   ├── Nvidia.tsx
│   │   │       │   ├── OpenAI.tsx
│   │   │       │   ├── Toyota.tsx
│   │   │       │   ├── Uber.tsx
│   │   │       │   ├── index.ts
│   │   │       │   └── types.ts
│   │   │       ├── DatasetDemos/
│   │   │       │   ├── DatasetDemos.module.scss
│   │   │       │   ├── GoldenGenerationDemo.tsx
│   │   │       │   ├── MultiTurnSimulationDemo.tsx
│   │   │       │   └── index.tsx
│   │   │       ├── HomeHeroSection.tsx
│   │   │       ├── HomeIntegrationsSection/
│   │   │       │   └── index.tsx
│   │   │       ├── HomePytestDemo/
│   │   │       │   ├── HomePytestDemo.module.scss
│   │   │       │   └── index.tsx
│   │   │       ├── HomeSection.module.scss
│   │   │       ├── JudgeCards/
│   │   │       │   ├── JudgeCards.module.scss
│   │   │       │   └── index.tsx
│   │   │       ├── RepoContributors/
│   │   │       │   ├── RepoContributors.module.scss
│   │   │       │   └── index.tsx
│   │   │       ├── SOTACards/
│   │   │       │   ├── SOTACards.module.scss
│   │   │       │   └── index.tsx
│   │   │       ├── TraceLoopConnector/
│   │   │       │   ├── TraceLoopConnector.module.scss
│   │   │       │   └── index.tsx
│   │   │       └── VibeCodingLoop/
│   │   │           ├── VibeCodingLoop.module.scss
│   │   │           └── index.tsx
│   │   └── utils/
│   │       ├── html-to-markdown.ts
│   │       ├── outbound-link-rel.ts
│   │       ├── schema-helpers.ts
│   │       ├── utm.ts
│   │       └── visitor-attribution.ts
│   ├── tsconfig.json
│   └── vercel.json
├── examples/
│   ├── create_tests.py
│   ├── dag-examples/
│   │   └── conversational_dag.ipynb
│   ├── getting_started/
│   │   └── test_example.py
│   ├── mcp_evaluation/
│   │   ├── mcp_eval_multi_turn.py
│   │   └── mcp_eval_single_turn.py
│   ├── notebooks/
│   │   ├── crewai.ipynb
│   │   ├── langgraph.ipynb
│   │   ├── openai.ipynb
│   │   ├── pydantic_ai.ipynb
│   │   └── static/
│   │       └── manual.txt
│   ├── rag_evaluation/
│   │   └── rag_evaluation_with_qdrant.py
│   ├── sample.txt
│   └── tracing/
│       ├── crewai_tracing.ipynb
│       └── test_chatbot.py
├── manual_after_evals_iterator.py
├── pyproject.toml
├── scripts/
│   └── check_openai_model_capabilities.py
├── skills/
│   ├── README.md
│   └── deepeval/
│       ├── LICENSE
│       ├── SKILL.md
│       ├── references/
│       │   ├── artifact-contracts.md
│       │   ├── choose-use-case.md
│       │   ├── confident-ai.md
│       │   ├── datasets.md
│       │   ├── intake.md
│       │   ├── iteration-loop.md
│       │   ├── metrics.md
│       │   ├── pytest-component-evals.md
│       │   ├── pytest-e2e-evals.md
│       │   ├── synthetic-data.md
│       │   └── tracing.md
│       └── templates/
│           ├── conftest.py
│           ├── test_multi_turn_e2e.py
│           ├── test_single_turn_component.py
│           └── test_single_turn_e2e.py
├── test_agentcore_agent.py
├── test_pydantic_agent.py
└── tests/
    ├── __init__.py
    ├── test_confident/
    │   ├── goldens.json
    │   ├── goldens_multi_turn.json
    │   ├── simulator/
    │   │   └── example_simulator.py
    │   ├── test_annotation.py
    │   ├── test_compare.py
    │   ├── test_conversational_g_eval_upload.py
    │   ├── test_dataset.py
    │   ├── test_evaluate.py
    │   ├── test_g_eval_upload.py
    │   ├── test_prompt.py
    │   └── test_region_autodetect_request_routing.py
    ├── test_core/
    │   ├── __init__.py
    │   ├── conftest.py
    │   ├── helpers.py
    │   ├── stubs.py
    │   ├── test_cli/
    │   │   ├── __init__.py
    │   │   └── test_cli.py
    │   ├── test_config/
    │   │   ├── __init__.py
    │   │   ├── test_deprecated_computed_aliases.py
    │   │   └── test_settings.py
    │   ├── test_core.py
    │   ├── test_datasets/
    │   │   ├── convo_goldens.csv
    │   │   ├── convo_goldens.json
    │   │   ├── goldens.csv
    │   │   ├── goldens.json
    │   │   └── test_dataset.py
    │   ├── test_drop_trace_and_span.py
    │   ├── test_evaluation/
    │   │   ├── test_async_trace_metric_isolation.py
    │   │   ├── test_console_report.py
    │   │   ├── test_end_to_end/
    │   │   │   ├── __init__.py
    │   │   │   ├── test_configs.py
    │   │   │   └── test_skip_reset.py
    │   │   ├── test_execute/
    │   │   │   ├── test_error_boundary.py
    │   │   │   ├── test_execute_conversational_test_case.py
    │   │   │   ├── test_execute_llm_test_case.py
    │   │   │   ├── test_execute_mllm_test_case.py
    │   │   │   └── test_execute_timeouts.py
    │   │   ├── test_local_store.py
    │   │   ├── test_printing.py
    │   │   ├── test_results_extraction.py
    │   │   ├── test_trace_results_extraction.py
    │   │   └── test_trace_scope_assert_test.py
    │   ├── test_imports.py
    │   ├── test_models/
    │   │   ├── test_amazon_bedrock_model.py
    │   │   ├── test_anthropic_model.py
    │   │   ├── test_azure_model.py
    │   │   ├── test_azure_retry_config.py
    │   │   ├── test_bedrock_retry_config.py
    │   │   ├── test_deepseek_model.py
    │   │   ├── test_embedding_models/
    │   │   │   ├── __init__.py
    │   │   │   ├── test_azure_embedding_model.py
    │   │   │   ├── test_local_embedding_model.py
    │   │   │   ├── test_ollama_embedding_model.py
    │   │   │   └── test_openai_embedding_model.py
    │   │   ├── test_gemini_model.py
    │   │   ├── test_grok_model.py
    │   │   ├── test_kimi_model.py
    │   │   ├── test_litellm_model.py
    │   │   ├── test_local_model.py
    │   │   ├── test_models_utils.py
    │   │   ├── test_ollama_model.py
    │   │   ├── test_openai_extractors.py
    │   │   ├── test_openai_model.py
    │   │   ├── test_openai_retry_policy.py
    │   │   ├── test_openrouter_model.py
    │   │   └── test_portkey_model.py
    │   ├── test_optimization/
    │   │   ├── test_copro/
    │   │   │   ├── __init__.py
    │   │   │   ├── test_configs.py
    │   │   │   └── test_loop.py
    │   │   ├── test_gepa/
    │   │   │   ├── __init__.py
    │   │   │   ├── test_gepa_config.py
    │   │   │   └── test_loop.py
    │   │   ├── test_miprov2/
    │   │   │   └── test_report_contract.py
    │   │   ├── test_mutations/
    │   │   │   ├── __init__.py
    │   │   │   └── test_prompt_rewriter.py
    │   │   ├── test_pareto.py
    │   │   ├── test_policies/
    │   │   │   └── test_tie_breaker.py
    │   │   ├── test_prompt_optimizer.py
    │   │   ├── test_simba/
    │   │   │   ├── __init__.py
    │   │   │   ├── test_configs.py
    │   │   │   └── test_loop.py
    │   │   └── test_utils.py
    │   ├── test_prompts/
    │   │   ├── test_interpolation.py
    │   │   ├── test_load.py
    │   │   └── test_prompt.py
    │   ├── test_retry_policy.py
    │   ├── test_run/
    │   │   ├── __init__.py
    │   │   ├── test_file_sync.py
    │   │   ├── test_run_manager.py
    │   │   └── test_turns_table.py
    │   ├── test_sanitize_nan.py
    │   ├── test_simulator/
    │   │   ├── __init__.py
    │   │   ├── helpers.py
    │   │   ├── test_conversation_simulator.py
    │   │   ├── test_conversation_simulator_controller.py
    │   │   └── test_conversation_simulator_json_mode.py
    │   ├── test_stubs_contract.py
    │   ├── test_synthesizer/
    │   │   ├── __init__.py
    │   │   ├── example_simulator.py
    │   │   ├── synthesizer_data/
    │   │   │   ├── docx_example.docx
    │   │   │   └── txt_example.txt
    │   │   ├── test_context_generator.py
    │   │   ├── test_doc_chunker.py
    │   │   ├── test_generate_from_goldens.py
    │   │   └── test_synthesizer.py
    │   ├── test_synthesizer_bugs.py
    │   ├── test_telemetry.py
    │   ├── test_test_case/
    │   │   ├── __init__.py
    │   │   ├── test_deprecated_params.py
    │   │   ├── test_multi_turn/
    │   │   │   ├── __init__.py
    │   │   │   ├── test_conversational_test_case.py
    │   │   │   ├── test_turn.py
    │   │   │   └── test_utils.py
    │   │   └── test_single_turn.py
    │   ├── test_threadpool_tracing.py
    │   ├── test_trace_memory_leak.py
    │   ├── test_tracing/
    │   │   ├── __init__.py
    │   │   ├── apps/
    │   │   │   ├── __init__.py
    │   │   │   ├── async_app.py
    │   │   │   └── sync_app.py
    │   │   ├── conftest.py
    │   │   ├── example_e2e_trace_evals.py
    │   │   ├── schemas/
    │   │   │   ├── generators/
    │   │   │   │   ├── async_streaming_concurrent_schema.json
    │   │   │   │   ├── async_streaming_llm_schema.json
    │   │   │   │   ├── async_streaming_nested_schema.json
    │   │   │   │   ├── async_streaming_processor_schema.json
    │   │   │   │   ├── async_streaming_updates_schema.json
    │   │   │   │   ├── context_safety_async_full_consumption_schema.json
    │   │   │   │   ├── context_safety_async_gen_yields_inner_schema.json
    │   │   │   │   ├── context_safety_async_nested_consume_schema.json
    │   │   │   │   ├── context_safety_async_three_level_schema.json
    │   │   │   │   ├── context_safety_sync_full_consumption_schema.json
    │   │   │   │   ├── context_safety_sync_gen_observe_between_schema.json
    │   │   │   │   ├── context_safety_sync_gen_yields_inner_schema.json
    │   │   │   │   ├── context_safety_sync_nested_consume_schema.json
    │   │   │   │   ├── context_safety_sync_observe_between_yields_schema.json
    │   │   │   │   ├── context_safety_sync_siblings_schema.json
    │   │   │   │   ├── context_safety_sync_three_level_schema.json
    │   │   │   │   ├── fastapi_basic_threadpool_schema.json
    │   │   │   │   ├── fastapi_child_spans_threadpool_schema.json
    │   │   │   │   ├── fastapi_deep_nesting_threadpool_schema.json
    │   │   │   │   ├── fastapi_rag_pipeline_threadpool_schema.json
    │   │   │   │   ├── fastapi_same_thread_sanity_schema.json
    │   │   │   │   ├── sync_streaming_llm_schema.json
    │   │   │   │   ├── sync_streaming_nested_schema.json
    │   │   │   │   ├── sync_streaming_processor_schema.json
    │   │   │   │   └── sync_streaming_updates_schema.json
    │   │   │   ├── masking/
    │   │   │   │   ├── comprehensive_masked_schema.json
    │   │   │   │   ├── credit_card_masked_schema.json
    │   │   │   │   ├── email_masked_schema.json
    │   │   │   │   └── no_masking_schema.json
    │   │   │   ├── metadata/
    │   │   │   │   ├── agent_with_metadata_schema.json
    │   │   │   │   ├── llm_with_metadata_schema.json
    │   │   │   │   ├── span_basic_metadata_schema.json
    │   │   │   │   ├── span_complex_metadata_schema.json
    │   │   │   │   ├── trace_basic_metadata_schema.json
    │   │   │   │   ├── trace_full_context_schema.json
    │   │   │   │   ├── trace_nested_spans_schema.json
    │   │   │   │   └── trace_user_info_schema.json
    │   │   │   ├── nested_spans/
    │   │   │   │   ├── agent_workflow_schema.json
    │   │   │   │   ├── async_nesting_schema.json
    │   │   │   │   ├── deep_nesting_schema.json
    │   │   │   │   ├── multiple_children_schema.json
    │   │   │   │   └── simple_nesting_schema.json
    │   │   │   ├── span_types/
    │   │   │   │   ├── agent_custom_name_schema.json
    │   │   │   │   ├── agent_full_attributes_schema.json
    │   │   │   │   ├── agent_minimal_schema.json
    │   │   │   │   ├── agent_multiple_handoffs_schema.json
    │   │   │   │   ├── agent_span_schema.json
    │   │   │   │   ├── agent_with_handoffs_schema.json
    │   │   │   │   ├── custom_processor_schema.json
    │   │   │   │   ├── custom_validator_schema.json
    │   │   │   │   ├── custom_with_name_schema.json
    │   │   │   │   ├── default_span_schema.json
    │   │   │   │   ├── llm_custom_name_schema.json
    │   │   │   │   ├── llm_full_attributes_schema.json
    │   │   │   │   ├── llm_minimal_schema.json
    │   │   │   │   ├── llm_runtime_model_schema.json
    │   │   │   │   ├── llm_span_schema.json
    │   │   │   │   ├── llm_with_costs_schema.json
    │   │   │   │   ├── retriever_custom_embedder_schema.json
    │   │   │   │   ├── retriever_custom_name_schema.json
    │   │   │   │   ├── retriever_full_attributes_schema.json
    │   │   │   │   ├── retriever_minimal_schema.json
    │   │   │   │   ├── retriever_override_embedder_schema.json
    │   │   │   │   ├── retriever_span_schema.json
    │   │   │   │   ├── span_with_only_name_schema.json
    │   │   │   │   ├── tool_calculator_schema.json
    │   │   │   │   ├── tool_custom_name_schema.json
    │   │   │   │   ├── tool_description_and_name_schema.json
    │   │   │   │   ├── tool_long_description_schema.json
    │   │   │   │   ├── tool_minimal_schema.json
    │   │   │   │   └── tool_span_schema.json
    │   │   │   ├── tags/
    │   │   │   │   ├── basic_tags_schema.json
    │   │   │   │   ├── env_tags_schema.json
    │   │   │   │   ├── feature_tags_schema.json
    │   │   │   │   └── name_and_tags_schema.json
    │   │   │   └── update_functions/
    │   │   │       ├── span_context_schema.json
    │   │   │       ├── span_expected_output_schema.json
    │   │   │       ├── span_from_test_case_schema.json
    │   │   │       ├── span_input_output_schema.json
    │   │   │       ├── span_name_schema.json
    │   │   │       ├── span_override_test_case_schema.json
    │   │   │       ├── span_tools_schema.json
    │   │   │       ├── trace_context_info_schema.json
    │   │   │       ├── trace_from_test_case_schema.json
    │   │   │       ├── trace_full_context_schema.json
    │   │   │       ├── trace_identifiers_schema.json
    │   │   │       ├── trace_name_schema.json
    │   │   │       ├── trace_nested_updates_schema.json
    │   │   │       └── trace_tools_schema.json
    │   │   ├── test_configuration/
    │   │   │   ├── test_configure_multiple.py
    │   │   │   ├── test_environment.py
    │   │   │   ├── test_masking_config.py
    │   │   │   └── test_sampling_rate.py
    │   │   ├── test_generators/
    │   │   │   ├── test_async_generator.py
    │   │   │   ├── test_fastapi_streaming_repro.py
    │   │   │   ├── test_generator_context_safety.py
    │   │   │   └── test_sync_generator.py
    │   │   ├── test_integration/
    │   │   │   ├── test_current_golden_context.py
    │   │   │   ├── test_dataset_iterator.py
    │   │   │   ├── test_execute_integration.py
    │   │   │   └── test_tools_called.py
    │   │   ├── test_masking/
    │   │   │   └── test_masking.py
    │   │   ├── test_metadata/
    │   │   │   ├── test_span_metadata.py
    │   │   │   └── test_trace_metadata.py
    │   │   ├── test_nested_spans/
    │   │   │   └── test_nested_spans.py
    │   │   ├── test_span_types/
    │   │   │   ├── test_agent_span.py
    │   │   │   ├── test_custom_span.py
    │   │   │   ├── test_llm_span.py
    │   │   │   ├── test_retriever_span.py
    │   │   │   └── test_tool_span.py
    │   │   ├── test_tags/
    │   │   │   └── test_trace_tags.py
    │   │   └── test_update_functions/
    │   │       ├── test_update_current_span.py
    │   │       └── test_update_current_trace.py
    │   └── test_utils.py
    ├── test_docs/
    │   ├── test_confident/
    │   │   ├── test_integrations/
    │   │   │   ├── test_anthropic.py
    │   │   │   ├── test_crewai.py
    │   │   │   ├── test_langchain.py
    │   │   │   ├── test_langgraph.py
    │   │   │   ├── test_litellm.py
    │   │   │   ├── test_openai.py
    │   │   │   ├── test_openai_agents.py
    │   │   │   └── test_opentelemetry.py
    │   │   └── test_tracing_features/
    │   │       ├── test_attributes.py
    │   │       ├── test_environment.py
    │   │       ├── test_input_output.py
    │   │       ├── test_masking.py
    │   │       ├── test_metadata.py
    │   │       ├── test_name.py
    │   │       ├── test_sampling.py
    │   │       ├── test_span_types.py
    │   │       ├── test_tags.py
    │   │       ├── test_test_case.py
    │   │       ├── test_threads.py
    │   │       └── test_users.py
    │   └── test_deepeval/
    │       ├── test_ai_agent_evals/
    │       │   ├── test_crewai/
    │       │   │   └── test_crewai_e2e.py
    │       │   ├── test_e2e_crewai.py
    │       │   ├── test_e2e_langchain.py
    │       │   ├── test_e2e_langraph.py
    │       │   ├── test_langchain/
    │       │   │   ├── test_langchain_e2e_async.py
    │       │   │   ├── test_langchain_e2e_sync.py
    │       │   │   ├── test_langchain_online_evals.py
    │       │   │   └── test_langchain_setup.py
    │       │   ├── test_langgraph/
    │       │   │   ├── test_e2e_langgraph_async.py
    │       │   │   ├── test_e2e_langgraph_sync.py
    │       │   │   ├── test_langgraph_component.py
    │       │   │   ├── test_langgraph_online.py
    │       │   │   └── test_langgraph_setup.py
    │       │   ├── test_llamaindex/
    │       │   │   └── test_llamaindex_e2e_async.py
    │       │   ├── test_openai_agents/
    │       │   │   └── test_agent.py
    │       │   ├── test_overall.py
    │       │   ├── test_pydanticai/
    │       │   │   ├── test_pydanticai_component_level.py
    │       │   │   ├── test_pydanticai_e2e_async.py
    │       │   │   ├── test_pydanticai_online.py
    │       │   │   ├── test_pydanticai_setup.py
    │       │   │   └── test_pydanticai_trace_attr.py
    │       │   ├── test_setup_crewai.py
    │       │   ├── test_setup_end_to_end_python.py
    │       │   ├── test_setup_langchain.py
    │       │   ├── test_setup_langraph.py
    │       │   └── test_setup_llm_tracing.py
    │       └── test_llm_evals/
    │           ├── test_component_level_evals.py
    │           └── test_setup_tracing.py
    ├── test_integrations/
    │   ├── __init__.py
    │   ├── test_agentcore/
    │   │   ├── apps/
    │   │   │   ├── agentcore_eval_app.py
    │   │   │   ├── agentcore_multiple_tools_app.py
    │   │   │   ├── agentcore_simple_app.py
    │   │   │   └── agentcore_tool_app.py
    │   │   ├── conftest.py
    │   │   ├── schemas/
    │   │   │   ├── agentcore_async_parallel_tools_schema.json
    │   │   │   ├── agentcore_async_simple_schema.json
    │   │   │   ├── agentcore_async_tool_schema.json
    │   │   │   ├── agentcore_features_async.json
    │   │   │   ├── agentcore_features_sync.json
    │   │   │   ├── agentcore_multiple_tools_time_schema.json
    │   │   │   ├── agentcore_multiple_tools_weather_schema.json
    │   │   │   ├── agentcore_parallel_tools_schema.json
    │   │   │   ├── agentcore_simple_schema.json
    │   │   │   ├── agentcore_tool_metric_collection_schema.json
    │   │   │   └── agentcore_tool_schema.json
    │   │   ├── test_async.py
    │   │   ├── test_evaluate_agent.py
    │   │   ├── test_span_interceptor.py
    │   │   └── test_sync.py
    │   ├── test_anthropic/
    │   │   ├── conftest.py
    │   │   ├── simple_anthropic.py
    │   │   ├── test_async_anthropic.py
    │   │   ├── test_async_messages_create_with_trace.json
    │   │   ├── test_sync_anthropic.py
    │   │   └── test_sync_messages_create_without_trace.json
    │   ├── test_crewai/
    │   │   ├── apps/
    │   │   │   ├── async_app.py
    │   │   │   ├── evals_app.py
    │   │   │   ├── hierarchical_app.py
    │   │   │   ├── knowledge_retriever_app.py
    │   │   │   ├── multi_agent_app.py
    │   │   │   ├── simple_app.py
    │   │   │   └── tool_usage_app.py
    │   │   ├── conftest.py
    │   │   ├── crewai.json
    │   │   ├── crewai_component.json
    │   │   ├── knowledge_retriever.json
    │   │   ├── schemas/
    │   │   │   ├── crewai_akickoff.json
    │   │   │   ├── crewai_async_kickoff.json
    │   │   │   ├── crewai_async_tool_usage.json
    │   │   │   ├── crewai_features_async.json
    │   │   │   ├── crewai_features_sync.json
    │   │   │   ├── crewai_hierarchical.json
    │   │   │   ├── crewai_kickoff_for_each.json
    │   │   │   ├── crewai_kickoff_for_each_async.json
    │   │   │   ├── crewai_knowledge_retrieval.json
    │   │   │   ├── crewai_multi_agent_sequential.json
    │   │   │   ├── crewai_simple_kickoff.json
    │   │   │   └── crewai_tool_usage.json
    │   │   ├── test_async.py
    │   │   ├── test_crewai.py
    │   │   ├── test_crewai_component.py
    │   │   ├── test_knowledge_retriever.py
    │   │   ├── test_stress.py
    │   │   └── test_sync.py
    │   ├── test_exporter/
    │   │   ├── readable_spans.py
    │   │   └── test_pydantic_ai.py
    │   ├── test_googleadk/
    │   │   ├── __init__.py
    │   │   ├── apps/
    │   │   │   ├── __init__.py
    │   │   │   ├── googleadk_eval_app.py
    │   │   │   ├── googleadk_multiple_tools_app.py
    │   │   │   ├── googleadk_simple_app.py
    │   │   │   └── googleadk_tool_app.py
    │   │   ├── conftest.py
    │   │   ├── schemas/
    │   │   │   ├── README.md
    │   │   │   ├── googleadk_async_parallel_tools_schema.json
    │   │   │   ├── googleadk_async_simple_schema.json
    │   │   │   ├── googleadk_async_tool_schema.json
    │   │   │   ├── googleadk_features_async.json
    │   │   │   ├── googleadk_features_sync.json
    │   │   │   ├── googleadk_multiple_tools_time_schema.json
    │   │   │   ├── googleadk_multiple_tools_weather_schema.json
    │   │   │   ├── googleadk_parallel_tools_schema.json
    │   │   │   ├── googleadk_simple_schema.json
    │   │   │   ├── googleadk_tool_metric_collection_schema.json
    │   │   │   └── googleadk_tool_schema.json
    │   │   ├── test_async.py
    │   │   ├── test_evaluate_agent.py
    │   │   ├── test_span_interceptor.py
    │   │   └── test_sync.py
    │   ├── test_langchain/
    │   │   ├── __init__.py
    │   │   ├── apps/
    │   │   │   ├── __init__.py
    │   │   │   ├── langchain_agent_app.py
    │   │   │   ├── langchain_conditional_app.py
    │   │   │   ├── langchain_metric_collection_app.py
    │   │   │   ├── langchain_multiple_tools_app.py
    │   │   │   ├── langchain_next_span_app.py
    │   │   │   ├── langchain_parallel_tools_app.py
    │   │   │   ├── langchain_retriever_app.py
    │   │   │   ├── langchain_simple_app.py
    │   │   │   ├── langchain_single_tool_app.py
    │   │   │   └── langchain_streaming_app.py
    │   │   ├── conftest.py
    │   │   ├── langchain.json
    │   │   ├── schemas/
    │   │   │   ├── langchain_agent_complex_schema.json
    │   │   │   ├── langchain_agent_multi_step_schema.json
    │   │   │   ├── langchain_agent_simple_schema.json
    │   │   │   ├── langchain_async_agent_complex_schema.json
    │   │   │   ├── langchain_async_agent_multi_step_schema.json
    │   │   │   ├── langchain_async_agent_simple_schema.json
    │   │   │   ├── langchain_async_conditional_fact_check_schema.json
    │   │   │   ├── langchain_async_conditional_general_schema.json
    │   │   │   ├── langchain_async_conditional_research_schema.json
    │   │   │   ├── langchain_async_conditional_summarize_schema.json
    │   │   │   ├── langchain_async_mixed_tools_schema.json
    │   │   │   ├── langchain_async_multiple_tools_schema.json
    │   │   │   ├── langchain_async_next_llm_span_schema.json
    │   │   │   ├── langchain_async_parallel_mixed_schema.json
    │   │   │   ├── langchain_async_parallel_stocks_schema.json
    │   │   │   ├── langchain_async_parallel_weather_schema.json
    │   │   │   ├── langchain_async_retriever_langchain_schema.json
    │   │   │   ├── langchain_async_retriever_python_schema.json
    │   │   │   ├── langchain_async_simple_schema.json
    │   │   │   ├── langchain_async_single_tool_schema.json
    │   │   │   ├── langchain_async_streaming_multi_schema.json
    │   │   │   ├── langchain_async_streaming_schema.json
    │   │   │   ├── langchain_conditional_fact_check_schema.json
    │   │   │   ├── langchain_conditional_general_schema.json
    │   │   │   ├── langchain_conditional_research_schema.json
    │   │   │   ├── langchain_conditional_summarize_schema.json
    │   │   │   ├── langchain_metric_collection_schema.json
    │   │   │   ├── langchain_multiple_tools_mixed_schema.json
    │   │   │   ├── langchain_multiple_tools_schema.json
    │   │   │   ├── langchain_next_llm_span_schema.json
    │   │   │   ├── langchain_parallel_mixed_schema.json
    │   │   │   ├── langchain_parallel_stocks_schema.json
    │   │   │   ├── langchain_parallel_weather_schema.json
    │   │   │   ├── langchain_retriever_langchain_schema.json
    │   │   │   ├── langchain_retriever_metric_collection_schema.json
    │   │   │   ├── langchain_retriever_python_schema.json
    │   │   │   ├── langchain_simple_schema.json
    │   │   │   ├── langchain_single_tool_schema.json
    │   │   │   ├── langchain_streaming_multi_schema.json
    │   │   │   └── langchain_streaming_schema.json
    │   │   ├── test_async.py
    │   │   ├── test_langchain.py
    │   │   ├── test_next_span.py
    │   │   └── test_sync.py
    │   ├── test_langgraph/
    │   │   ├── __init__.py
    │   │   ├── apps/
    │   │   │   ├── __init__.py
    │   │   │   ├── langgraph_async_app.py
    │   │   │   ├── langgraph_conditional_app.py
    │   │   │   ├── langgraph_metric_collection_app.py
    │   │   │   ├── langgraph_multi_turn_app.py
    │   │   │   ├── langgraph_multiple_tools_app.py
    │   │   │   ├── langgraph_next_span_app.py
    │   │   │   ├── langgraph_parallel_tools_app.py
    │   │   │   ├── langgraph_retriever_app.py
    │   │   │   ├── langgraph_simple_app.py
    │   │   │   ├── langgraph_streaming_app.py
    │   │   │   └── main.py
    │   │   ├── conftest.py
    │   │   ├── schemas/
    │   │   │   ├── langgraph_async_conditional_schema.json
    │   │   │   ├── langgraph_async_multi_turn_schema.json
    │   │   │   ├── langgraph_async_multiple_tools_schema.json
    │   │   │   ├── langgraph_async_next_llm_span_schema.json
    │   │   │   ├── langgraph_async_no_tools_schema.json
    │   │   │   ├── langgraph_async_parallel_heavy_schema.json
    │   │   │   ├── langgraph_async_parallel_schema.json
    │   │   │   ├── langgraph_async_single_tool_schema.json
    │   │   │   ├── langgraph_async_streaming_multi_schema.json
    │   │   │   ├── langgraph_async_streaming_schema.json
    │   │   │   ├── langgraph_conditional_fact_check_schema.json
    │   │   │   ├── langgraph_conditional_general_schema.json
    │   │   │   ├── langgraph_conditional_research_schema.json
    │   │   │   ├── langgraph_conditional_summarize_schema.json
    │   │   │   ├── langgraph_full_flow_schema.json
    │   │   │   ├── langgraph_metric_collection_schema.json
    │   │   │   ├── langgraph_multi_turn_schema.json
    │   │   │   ├── langgraph_multiple_tools_mixed_schema.json
    │   │   │   ├── langgraph_multiple_tools_schema.json
    │   │   │   ├── langgraph_next_llm_span_schema.json
    │   │   │   ├── langgraph_parallel_mixed_schema.json
    │   │   │   ├── langgraph_parallel_stocks_schema.json
    │   │   │   ├── langgraph_parallel_weather_schema.json
    │   │   │   ├── langgraph_retriever_langchain_schema.json
    │   │   │   ├── langgraph_retriever_metric_collection_schema.json
    │   │   │   ├── langgraph_retriever_python_schema.json
    │   │   │   ├── langgraph_simple_schema.json
    │   │   │   ├── langgraph_stateless_schema.json
    │   │   │   ├── langgraph_streaming_multi_schema.json
    │   │   │   └── langgraph_streaming_schema.json
    │   │   ├── test_async.py
    │   │   ├── test_create_task.py
    │   │   ├── test_next_span.py
    │   │   └── test_sync.py
    │   ├── test_llamaindex/
    │   │   ├── apps/
    │   │   │   ├── agent_app.py
    │   │   │   ├── eval_app.py
    │   │   │   ├── rag_app.py
    │   │   │   ├── router_app.py
    │   │   │   └── simple_app.py
    │   │   ├── conftest.py
    │   │   ├── schemas/
    │   │   │   ├── llama_index_async_agent_math_schema.json
    │   │   │   ├── llama_index_async_agent_schema.json
    │   │   │   ├── llama_index_async_rag_schema.json
    │   │   │   ├── llama_index_async_router_schema.json
    │   │   │   ├── llama_index_async_simple_schema.json
    │   │   │   ├── llama_index_features_async.json
    │   │   │   ├── llama_index_rag_llama_schema.json
    │   │   │   ├── llama_index_rag_python_schema.json
    │   │   │   ├── llama_index_router_math_schema.json
    │   │   │   └── llama_index_simple_schema.json
    │   │   ├── test_async.py
    │   │   └── test_sync.py
    │   ├── test_openai/
    │   │   ├── conftest.py
    │   │   ├── simple_openai.py
    │   │   ├── test_async_openai.py
    │   │   ├── test_async_openai_without_trace.json
    │   │   ├── test_async_response_create_with_trace.json
    │   │   ├── test_sync_openai.py
    │   │   ├── test_sync_openai_with_trace.json
    │   │   ├── test_sync_response_create_without_trace.json
    │   │   ├── test_tool_call_flow_completion.json
    │   │   ├── test_tool_call_flow_completion.py
    │   │   ├── test_tool_call_flow_response.json
    │   │   └── test_tool_call_flow_response.py
    │   ├── test_openai_agents/
    │   │   ├── apps/
    │   │   │   ├── eval_agent.py
    │   │   │   ├── handoff_agent.py
    │   │   │   ├── session_agent.py
    │   │   │   ├── simple_agent.py
    │   │   │   ├── streaming_agent.py
    │   │   │   └── tool_agent.py
    │   │   ├── conftest.py
    │   │   ├── schemas/
    │   │   │   ├── openai_agents_async_handoff_schema.json
    │   │   │   ├── openai_agents_async_simple_schema.json
    │   │   │   ├── openai_agents_async_tool_schema.json
    │   │   │   ├── openai_agents_eval_schema.json
    │   │   │   ├── openai_agents_handoff_spanish_schema.json
    │   │   │   ├── openai_agents_session_schema.json
    │   │   │   ├── openai_agents_simple_schema.json
    │   │   │   ├── openai_agents_tool_math_schema.json
    │   │   │   └── openai_agents_tool_weather_schema.json
    │   │   ├── test_async.py
    │   │   ├── test_scenerios/
    │   │   │   ├── multi_agents.json
    │   │   │   ├── run.json
    │   │   │   ├── run_streamed.json
    │   │   │   ├── test_multi_agents.py
    │   │   │   ├── test_run.py
    │   │   │   ├── test_run_streamed.py
    │   │   │   ├── test_run_sync.py
    │   │   │   ├── test_weather_agent_patched.py
    │   │   │   ├── test_with_trace.py
    │   │   │   ├── test_with_trace_and_wrapped.py
    │   │   │   ├── weather_agent_patched.json
    │   │   │   ├── with_trace.json
    │   │   │   └── with_trace_and_wrapped.json
    │   │   └── test_sync.py
    │   ├── test_openrouter/
    │   │   └── test_openrouter_generation.py
    │   ├── test_pydanticai/
    │   │   ├── apps/
    │   │   │   ├── __init__.py
    │   │   │   ├── eval_app.py
    │   │   │   ├── pydanticai_isolation_app.py
    │   │   │   ├── pydanticai_metric_collection_app.py
    │   │   │   ├── pydanticai_modes_app.py
    │   │   │   ├── pydanticai_multiple_tools_app.py
    │   │   │   ├── pydanticai_next_span_app.py
    │   │   │   ├── pydanticai_simple_app.py
    │   │   │   ├── pydanticai_streaming_app.py
    │   │   │   └── pydanticai_tool_app.py
    │   │   ├── conftest.py
    │   │   ├── schemas/
    │   │   │   ├── pydanticai_async_parallel_tools_schema.json
    │   │   │   ├── pydanticai_async_simple_schema.json
    │   │   │   ├── pydanticai_async_tool_schema.json
    │   │   │   ├── pydanticai_bare_tool_enrichment_schema.json
    │   │   │   ├── pydanticai_features_async.json
    │   │   │   ├── pydanticai_features_sync.json
    │   │   │   ├── pydanticai_multiple_tools_time_schema.json
    │   │   │   ├── pydanticai_multiple_tools_weather_schema.json
    │   │   │   ├── pydanticai_next_llm_only_schema.json
    │   │   │   ├── pydanticai_next_stacked_schema.json
    │   │   │   ├── pydanticai_observe_mode_schema.json
    │   │   │   ├── pydanticai_parallel_tools_schema.json
    │   │   │   ├── pydanticai_simple_schema.json
    │   │   │   ├── pydanticai_streaming_schema.json
    │   │   │   ├── pydanticai_tool_schema.json
    │   │   │   ├── pydanticai_trace_metric_collection_schema.json
    │   │   │   └── pydanticai_with_trace_mode_schema.json
    │   │   ├── test_async.py
    │   │   ├── test_evaluate_agent.py
    │   │   ├── test_span_interceptor.py
    │   │   └── test_sync.py
    │   ├── test_strands/
    │   │   ├── __init__.py
    │   │   ├── apps/
    │   │   │   ├── __init__.py
    │   │   │   ├── strands_eval_app.py
    │   │   │   ├── strands_multiple_tools_app.py
    │   │   │   ├── strands_simple_app.py
    │   │   │   └── strands_tool_app.py
    │   │   ├── conftest.py
    │   │   ├── schemas/
    │   │   │   ├── README.md
    │   │   │   ├── strands_async_parallel_tools_schema.json
    │   │   │   ├── strands_async_simple_schema.json
    │   │   │   ├── strands_async_tool_schema.json
    │   │   │   ├── strands_features_async.json
    │   │   │   ├── strands_features_sync.json
    │   │   │   ├── strands_multiple_tools_time_schema.json
    │   │   │   ├── strands_multiple_tools_weather_schema.json
    │   │   │   ├── strands_parallel_tools_schema.json
    │   │   │   ├── strands_simple_schema.json
    │   │   │   ├── strands_tool_metric_collection_schema.json
    │   │   │   └── strands_tool_schema.json
    │   │   ├── test_async.py
    │   │   ├── test_evaluate_agent.py
    │   │   ├── test_span_interceptor.py
    │   │   └── test_sync.py
    │   └── utils.py
    └── test_metrics/
        ├── test_answer_relevancy_metric.py
        ├── test_answer_relevancy_metric_empty_output.py
        ├── test_arena_geval_metric.py
        ├── test_bias_metric.py
        ├── test_contextual_precision_metric.py
        ├── test_contextual_recall_metric.py
        ├── test_contextual_relevancy_metric.py
        ├── test_conversation_completeness_metric.py
        ├── test_conversational_dag.py
        ├── test_conversational_g_eval.py
        ├── test_dag.py
        ├── test_dag_serialization.py
        ├── test_exact_match_metric.py
        ├── test_faithfulness_metric.py
        ├── test_g_eval_metric.py
        ├── test_g_eval_utils.py
        ├── test_goal_accuracy_metric.py
        ├── test_hallucination_metric.py
        ├── test_image_coherence_metric.py
        ├── test_image_editing_metric.py
        ├── test_image_helpfulness_metric.py
        ├── test_image_reference_metric.py
        ├── test_json_correctness_metric.py
        ├── test_knowledge_retention_metric.py
        ├── test_mcp_task_completetion_metric.py
        ├── test_mcp_use_metric.py
        ├── test_misuse_metric.py
        ├── test_multi_turn_mcp_use_metric.py
        ├── test_non_advice_metric.py
        ├── test_pattern_match_metric.py
        ├── test_pii_lekage_metric.py
        ├── test_plan_adherence_metric.py
        ├── test_plan_quality_metric.py
        ├── test_prompt_alignment_metric.py
        ├── test_role_adherence_metric.py
        ├── test_role_violation_metric.py
        ├── test_step_efficiency_metric.py
        ├── test_summarization_metric.py
        ├── test_task_completetion_metric.py
        ├── test_text_to_image_metric.py
        ├── test_tool_use_metric.py
        ├── test_topic_adherence_metric.py
        ├── test_toxicity_metric.py
        ├── test_turn_contextual_precision_metric.py
        ├── test_turn_contextual_recall_metric.py
        ├── test_turn_faithfulness_metric.py
        ├── test_turn_relevancy_metric.py
        └── turn_contextual_relevancy_metric.py